xref: /netbsd-src/external/bsd/nsd/dist/server.c (revision 82d56013d7b633d116a93943de88e08335357a7c)
1 /*
2  * server.c -- nsd(8) network input/output
3  *
4  * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
5  *
6  * See LICENSE for the license.
7  *
8  */
9 
10 #include "config.h"
11 
12 #include <sys/types.h>
13 #include <sys/param.h>
14 #include <limits.h>
15 #include <sys/socket.h>
16 #include <sys/uio.h>
17 #include <sys/wait.h>
18 
19 #include <netinet/in.h>
20 #ifdef USE_TCP_FASTOPEN
21   #include <netinet/tcp.h>
22 #endif
23 #include <arpa/inet.h>
24 
25 #include <assert.h>
26 #include <ctype.h>
27 #include <errno.h>
28 #include <fcntl.h>
29 #include <stddef.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <time.h>
34 #include <unistd.h>
35 #include <signal.h>
36 #include <netdb.h>
37 #include <poll.h>
38 #ifdef HAVE_SYS_RANDOM_H
39 #include <sys/random.h>
40 #endif
41 #ifndef SHUT_WR
42 #define SHUT_WR 1
43 #endif
44 #ifdef HAVE_MMAP
45 #include <sys/mman.h>
46 #endif /* HAVE_MMAP */
47 #ifdef HAVE_OPENSSL_RAND_H
48 #include <openssl/rand.h>
49 #endif
50 #ifdef HAVE_OPENSSL_SSL_H
51 #include <openssl/ssl.h>
52 #endif
53 #ifdef HAVE_OPENSSL_ERR_H
54 #include <openssl/err.h>
55 #endif
56 #ifdef HAVE_OPENSSL_OCSP_H
57 #include <openssl/ocsp.h>
58 #endif
59 #ifndef USE_MINI_EVENT
60 #  ifdef HAVE_EVENT_H
61 #    include <event.h>
62 #  else
63 #    include <event2/event.h>
64 #    include "event2/event_struct.h"
65 #    include "event2/event_compat.h"
66 #  endif
67 #else
68 #  include "mini_event.h"
69 #endif
70 
71 #include "axfr.h"
72 #include "namedb.h"
73 #include "netio.h"
74 #include "xfrd.h"
75 #include "xfrd-tcp.h"
76 #include "xfrd-disk.h"
77 #include "difffile.h"
78 #include "nsec3.h"
79 #include "ipc.h"
80 #include "udb.h"
81 #include "remote.h"
82 #include "lookup3.h"
83 #include "rrl.h"
84 #ifdef USE_DNSTAP
85 #include "dnstap/dnstap_collector.h"
86 #endif
87 
88 #define RELOAD_SYNC_TIMEOUT 25 /* seconds */
89 
90 #ifdef USE_TCP_FASTOPEN
91   #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
92   #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
93 #endif
94 
95 /*
96  * Data for the UDP handlers.
97  */
98 struct udp_handler_data
99 {
100 	struct nsd        *nsd;
101 	struct nsd_socket *socket;
102 	struct event       event;
103 };
104 
105 struct tcp_accept_handler_data {
106 	struct nsd        *nsd;
107 	struct nsd_socket *socket;
108 	int                event_added;
109 	struct event       event;
110 #ifdef HAVE_SSL
111 	/* handler accepts TLS connections on the dedicated port */
112 	int                tls_accept;
113 #endif
114 };
115 
116 /*
117  * These globals are used to enable the TCP accept handlers
118  * when the number of TCP connections drops below the maximum
119  * number of TCP connections.
120  */
121 static size_t tcp_accept_handler_count;
122 static struct tcp_accept_handler_data *tcp_accept_handlers;
123 
124 static struct event slowaccept_event;
125 static int slowaccept;
126 
127 #ifdef HAVE_SSL
128 static unsigned char *ocspdata = NULL;
129 static long ocspdata_len = 0;
130 #endif
131 
132 #ifdef NONBLOCKING_IS_BROKEN
133 /* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
134    read multiple times from a socket when reported ready by select. */
135 # define NUM_RECV_PER_SELECT (1)
136 #else /* !NONBLOCKING_IS_BROKEN */
137 # define NUM_RECV_PER_SELECT (100)
138 #endif /* NONBLOCKING_IS_BROKEN */
139 
140 #ifndef HAVE_MMSGHDR
141 struct mmsghdr {
142 	struct msghdr msg_hdr;
143 	unsigned int  msg_len;
144 };
145 #endif
146 
147 static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
148 static struct iovec iovecs[NUM_RECV_PER_SELECT];
149 static struct query *queries[NUM_RECV_PER_SELECT];
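
/*
 * Illustrative sketch (not part of NSD): on Linux, recvmmsg(2) can
 * fill the msgs[] and iovecs[] arrays above in one system call, which
 * is why up to NUM_RECV_PER_SELECT datagrams can be handled per ready
 * event.  A hypothetical receive loop could look like this.
 */
#if 0
static void sketch_udp_batch_recv(int fd)
{
	int i, received = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT,
		MSG_DONTWAIT, NULL);
	if(received == -1)
		return; /* typically EAGAIN: nothing to read right now */
	for(i = 0; i < received; i++) {
		/* msgs[i].msg_len is the size of the datagram that
		 * landed in the buffer belonging to queries[i] */
	}
}
#endif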
150 
151 /*
152  * Data for the TCP connection handlers.
153  *
154  * The TCP handlers use non-blocking I/O.  This is necessary to avoid
155  * blocking the entire server on a slow TCP connection, but does make
156  * reading from and writing to the socket more complicated.
157  *
158  * Basically, whenever a read/write would block (indicated by
159  * EAGAIN in errno) we remember the position we were reading
160  * from/writing to and return from the TCP reading/writing event
161  * handler.  When the socket becomes readable/writable again we
162  * continue from the same position.
163  */
164 struct tcp_handler_data
165 {
166 	/*
167 	 * The region used to allocate all TCP connection related
168 	 * data, including this structure.  This region is destroyed
169 	 * when the connection is closed.
170 	 */
171 	region_type*		region;
172 
173 	/*
174 	 * The global nsd structure.
175 	 */
176 	struct nsd*			nsd;
177 
178 	/*
179 	 * The current query data for this TCP connection.
180 	 */
181 	query_type*			query;
182 
183 	/*
184 	 * The query_state is used to remember if we are performing an
185 	 * AXFR, if we're done processing, or if we should discard the
186 	 * query and connection.
187 	 */
188 	query_state_type	query_state;
189 
190 	/*
191 	 * The event for the file descriptor and tcp timeout
192 	 */
193 	struct event event;
194 
195 	/*
196 	 * The bytes_transmitted field is used to remember the number
197 	 * of bytes transmitted when receiving or sending a DNS
198 	 * packet.  The count includes the two additional bytes used
199 	 * to specify the packet length on a TCP connection.
200 	 */
201 	size_t				bytes_transmitted;
202 
203 	/*
204 	 * The number of queries handled by this specific TCP connection.
205 	 */
206 	int					query_count;
207 
208 	/*
209 	 * The timeout in msec for this tcp connection
210 	 */
211 	int	tcp_timeout;
212 
213 	/*
214 	 * If the connection is allowed to have further queries on it.
215 	 */
216 	int tcp_no_more_queries;
217 #ifdef HAVE_SSL
218 	/*
219 	 * TLS object.
220 	 */
221 	SSL* tls;
222 
223 	/*
224 	 * TLS handshake state.
225 	 */
226 	enum { tls_hs_none, tls_hs_read, tls_hs_write,
227 		tls_hs_read_event, tls_hs_write_event } shake_state;
228 #endif
229 	/* list of connections, for service of remaining tcp channels */
230 	struct tcp_handler_data *prev, *next;
231 };
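
/*
 * Illustrative sketch (not part of NSD): how bytes_transmitted lets a
 * handler resume a partial non-blocking read.  The first two octets on
 * a TCP connection carry the packet length, so total is that length
 * plus two.  Names and buffer layout here are hypothetical.
 */
#if 0
static int
sketch_resume_read(struct tcp_handler_data* data, int fd,
	uint8_t* buf, size_t total)
{
	while(data->bytes_transmitted < total) {
		ssize_t r = read(fd, buf + data->bytes_transmitted,
			total - data->bytes_transmitted);
		if(r == -1) {
			if(errno == EAGAIN || errno == EINTR)
				return 0; /* resume on the next readable event */
			return -1; /* hard error: discard the connection */
		} else if(r == 0) {
			return -1; /* remote end closed the connection */
		}
		data->bytes_transmitted += (size_t)r;
	}
	return 1; /* complete packet received */
}
#endif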
232 /* global that is the list of active tcp channels */
233 static struct tcp_handler_data *tcp_active_list = NULL;
234 
235 /*
236  * Handle incoming queries on the UDP server sockets.
237  */
238 static void handle_udp(int fd, short event, void* arg);
239 
240 /*
241  * Handle incoming connections on the TCP sockets.  These handlers
242  * usually wait for the NETIO_EVENT_READ event (indicating an incoming
243  * connection) but are disabled when the number of current TCP
244  * connections is equal to the maximum number of TCP connections.
245  * Disabling is done by changing the handler to wait for the
246  * NETIO_EVENT_NONE type.  This is done using the function
247  * configure_tcp_accept_handlers.
248  */
249 static void handle_tcp_accept(int fd, short event, void* arg);
250 
251 /*
252  * Handle incoming queries on a TCP connection.  The TCP connections
253  * are configured to be non-blocking and the handler may be called
254  * multiple times before a complete query is received.
255  */
256 static void handle_tcp_reading(int fd, short event, void* arg);
257 
258 /*
259  * Handle outgoing responses on a TCP connection.  The TCP connections
260  * are configured to be non-blocking and the handler may be called
261  * multiple times before a complete response is sent.
262  */
263 static void handle_tcp_writing(int fd, short event, void* arg);
264 
265 #ifdef HAVE_SSL
266 /* Create SSL object and associate fd */
267 static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
268 /*
269  * Handle TLS handshake. May be called multiple times if incomplete.
270  */
271 static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);
272 
273 /*
274  * Handle incoming queries on a TLS over TCP connection.  The TLS
275  * connections are configured to be non-blocking and the handler may
276  * be called multiple times before a complete query is received.
277  */
278 static void handle_tls_reading(int fd, short event, void* arg);
279 
280 /*
281  * Handle outgoing responses on a TLS over TCP connection.  The TLS
282  * connections are configured to be non-blocking and the handler may
283  * be called multiple times before a complete response is sent.
284  */
285 static void handle_tls_writing(int fd, short event, void* arg);
286 #endif
287 
288 /*
289  * Send all children the quit command nonblocking, then close the pipe.
290  */
291 static void send_children_quit(struct nsd* nsd);
292 /* same, but at shutdown time; waits for children to exit to avoid restart issues */
293 static void send_children_quit_and_wait(struct nsd* nsd);
294 
295 /* set children's flags to send NSD_STATS to them */
296 #ifdef BIND8_STATS
297 static void set_children_stats(struct nsd* nsd);
298 #endif /* BIND8_STATS */
299 
300 /*
301  * Change the event types the HANDLERS are interested in to EVENT_TYPES.
302  */
303 static void configure_handler_event_types(short event_types);
304 
305 static uint16_t *compressed_dname_offsets = 0;
306 static uint32_t compression_table_capacity = 0;
307 static uint32_t compression_table_size = 0;
308 static domain_type* compressed_dnames[MAXRRSPP];
309 
310 #ifdef USE_TCP_FASTOPEN
311 /* Checks to see if the kernel value must be manually changed in order for
312    TCP Fast Open to support server mode */
313 static void report_tcp_fastopen_config() {
314 
315 	int tcp_fastopen_fp;
316 	uint8_t tcp_fastopen_value;
317 
318 	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
319 		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
320 	}
321 	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
322 		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
323 		close(tcp_fastopen_fp);
324 	}
325 	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
326 		log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n");
327 		log_msg(LOG_WARNING, "However the kernel paramenters are not configured to support TCP_FASTOPEN in server mode.\n");
328 		log_msg(LOG_WARNING, "To enable TFO use the command:");
329 		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
330 		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
331 		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
332 		close(tcp_fastopen_fp);
333 	}
334 	close(tcp_fastopen_fp);
335 }
336 #endif
337 
338 /*
339  * Remove the specified pid from the list of child pids.  Returns -1 if
340  * the pid is not in the list, the child number otherwise.  The pid field is set to 0.
341  */
342 static int
343 delete_child_pid(struct nsd *nsd, pid_t pid)
344 {
345 	size_t i;
346 	for (i = 0; i < nsd->child_count; ++i) {
347 		if (nsd->children[i].pid == pid) {
348 			nsd->children[i].pid = 0;
349 			if(!nsd->children[i].need_to_exit) {
350 				if(nsd->children[i].child_fd != -1)
351 					close(nsd->children[i].child_fd);
352 				nsd->children[i].child_fd = -1;
353 				if(nsd->children[i].handler)
354 					nsd->children[i].handler->fd = -1;
355 			}
356 			return i;
357 		}
358 	}
359 	return -1;
360 }
361 
362 /*
363  * Restart child servers if necessary.
364  */
365 static int
366 restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
367 	int* xfrd_sock_p)
368 {
369 	struct main_ipc_handler_data *ipc_data;
370 	size_t i;
371 	int sv[2];
372 
373 	/* Fork the child processes... */
374 	for (i = 0; i < nsd->child_count; ++i) {
375 		if (nsd->children[i].pid <= 0) {
376 			if (nsd->children[i].child_fd != -1)
377 				close(nsd->children[i].child_fd);
378 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
379 				log_msg(LOG_ERR, "socketpair: %s",
380 					strerror(errno));
381 				return -1;
382 			}
383 			nsd->children[i].child_fd = sv[0];
384 			nsd->children[i].parent_fd = sv[1];
385 			nsd->children[i].pid = fork();
386 			switch (nsd->children[i].pid) {
387 			default: /* SERVER MAIN */
388 				close(nsd->children[i].parent_fd);
389 				nsd->children[i].parent_fd = -1;
390 				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
391 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
392 				}
393 				if(!nsd->children[i].handler)
394 				{
395 					ipc_data = (struct main_ipc_handler_data*) region_alloc(
396 						region, sizeof(struct main_ipc_handler_data));
397 					ipc_data->nsd = nsd;
398 					ipc_data->child = &nsd->children[i];
399 					ipc_data->child_num = i;
400 					ipc_data->xfrd_sock = xfrd_sock_p;
401 					ipc_data->packet = buffer_create(region, QIOBUFSZ);
402 					ipc_data->forward_mode = 0;
403 					ipc_data->got_bytes = 0;
404 					ipc_data->total_bytes = 0;
405 					ipc_data->acl_num = 0;
406 					nsd->children[i].handler = (struct netio_handler*) region_alloc(
407 						region, sizeof(struct netio_handler));
408 					nsd->children[i].handler->fd = nsd->children[i].child_fd;
409 					nsd->children[i].handler->timeout = NULL;
410 					nsd->children[i].handler->user_data = ipc_data;
411 					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
412 					nsd->children[i].handler->event_handler = parent_handle_child_command;
413 					netio_add_handler(netio, nsd->children[i].handler);
414 				}
415 				/* clear any ongoing ipc */
416 				ipc_data = (struct main_ipc_handler_data*)
417 					nsd->children[i].handler->user_data;
418 				ipc_data->forward_mode = 0;
419 				/* restart - update fd */
420 				nsd->children[i].handler->fd = nsd->children[i].child_fd;
421 				break;
422 			case 0: /* CHILD */
423 				/* the child need not be able to access the
424 				 * nsd.db file */
425 				namedb_close_udb(nsd->db);
426 #ifdef MEMCLEAN /* OS collects memory pages */
427 				region_destroy(region);
428 #endif
429 				nsd->pid = 0;
430 				nsd->child_count = 0;
431 				nsd->server_kind = nsd->children[i].kind;
432 				nsd->this_child = &nsd->children[i];
433 				nsd->this_child->child_num = i;
434 				/* remove signal flags inherited from parent;
435 				   the parent will handle them. */
436 				nsd->signal_hint_reload_hup = 0;
437 				nsd->signal_hint_reload = 0;
438 				nsd->signal_hint_child = 0;
439 				nsd->signal_hint_quit = 0;
440 				nsd->signal_hint_shutdown = 0;
441 				nsd->signal_hint_stats = 0;
442 				nsd->signal_hint_statsusr = 0;
443 				close(*xfrd_sock_p);
444 				close(nsd->this_child->child_fd);
445 				nsd->this_child->child_fd = -1;
446 				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
447 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
448 				}
449 				server_child(nsd);
450 				/* NOTREACHED */
451 				exit(0);
452 			case -1:
453 				log_msg(LOG_ERR, "fork failed: %s",
454 					strerror(errno));
455 				return -1;
456 			}
457 		}
458 	}
459 	return 0;
460 }
461 
462 #ifdef BIND8_STATS
463 static void set_bind8_alarm(struct nsd* nsd)
464 {
465 	/* resync so that the next alarm is on the next whole minute */
466 	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
467 		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
468 }
469 #endif
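
/*
 * Worked example for set_bind8_alarm() above: with a period of 60
 * seconds and time(NULL) % 60 == 25, alarm(35) is requested, so the
 * next SIGALRM fires exactly on the next whole minute.
 */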
470 
471 /* set zone stat ids for zones initially read in */
472 static void
473 zonestatid_tree_set(struct nsd* nsd)
474 {
475 	struct radnode* n;
476 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
477 		zone_type* zone = (zone_type*)n->elem;
478 		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
479 	}
480 }
481 
482 #ifdef USE_ZONE_STATS
483 void
484 server_zonestat_alloc(struct nsd* nsd)
485 {
486 	size_t num = (nsd->options->zonestatnames->count==0?1:
487 			nsd->options->zonestatnames->count);
488 	size_t sz = sizeof(struct nsdst)*num;
489 	char tmpfile[256];
490 	uint8_t z = 0;
491 
492 	/* file names */
493 	nsd->zonestatfname[0] = 0;
494 	nsd->zonestatfname[1] = 0;
495 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
496 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
497 	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
498 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
499 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
500 	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);
501 
502 	/* file descriptors */
503 	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
504 	if(nsd->zonestatfd[0] == -1) {
505 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
506 			strerror(errno));
507 		exit(1);
508 	}
509 	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
510 	if(nsd->zonestatfd[1] == -1) {
511 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
512 			strerror(errno));
513 		close(nsd->zonestatfd[0]);
514 		unlink(nsd->zonestatfname[0]);
515 		exit(1);
516 	}
517 
518 #ifdef HAVE_MMAP
519 	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
520 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
521 			strerror(errno));
522 		exit(1);
523 	}
524 	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
525 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
526 			nsd->zonestatfname[0], strerror(errno));
527 		exit(1);
528 	}
529 	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
530 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
531 			strerror(errno));
532 		exit(1);
533 	}
534 	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
535 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
536 			nsd->zonestatfname[1], strerror(errno));
537 		exit(1);
538 	}
539 	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
540 		MAP_SHARED, nsd->zonestatfd[0], 0);
541 	if(nsd->zonestat[0] == MAP_FAILED) {
542 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
543 		unlink(nsd->zonestatfname[0]);
544 		unlink(nsd->zonestatfname[1]);
545 		exit(1);
546 	}
547 	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
548 		MAP_SHARED, nsd->zonestatfd[1], 0);
549 	if(nsd->zonestat[1] == MAP_FAILED) {
550 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
551 		unlink(nsd->zonestatfname[0]);
552 		unlink(nsd->zonestatfname[1]);
553 		exit(1);
554 	}
555 	memset(nsd->zonestat[0], 0, sz);
556 	memset(nsd->zonestat[1], 0, sz);
557 	nsd->zonestatsize[0] = num;
558 	nsd->zonestatsize[1] = num;
559 	nsd->zonestatdesired = num;
560 	nsd->zonestatsizenow = num;
561 	nsd->zonestatnow = nsd->zonestat[0];
562 #endif /* HAVE_MMAP */
563 }
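
/*
 * Illustrative sketch (not part of NSD): the extend-then-mmap idiom
 * used above.  Writing one byte at offset sz-1 makes the file sz
 * bytes long before it is mapped; mapping a file shorter than the
 * mapping would SIGBUS on access.  Error handling omitted here.
 */
#if 0
static void* sketch_grow_and_map(int fd, size_t sz)
{
	uint8_t z = 0;
	lseek(fd, (off_t)sz-1, SEEK_SET); /* seek to the last byte... */
	write(fd, &z, 1);                 /* ...and materialize it */
	return mmap(NULL, sz, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
}
#endif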
564 
565 void
566 zonestat_remap(struct nsd* nsd, int idx, size_t sz)
567 {
568 #ifdef HAVE_MMAP
569 #ifdef MREMAP_MAYMOVE
570 	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
571 		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
572 		MREMAP_MAYMOVE);
573 	if(nsd->zonestat[idx] == MAP_FAILED) {
574 		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
575 		exit(1);
576 	}
577 #else /* !MREMAP_MAYMOVE */
578 	if(msync(nsd->zonestat[idx],
579 		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
580 		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
581 	if(munmap(nsd->zonestat[idx],
582 		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
583 		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
584 	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
585 		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
586 	if(nsd->zonestat[idx] == MAP_FAILED) {
587 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
588 		exit(1);
589 	}
590 #endif /* MREMAP_MAYMOVE */
591 #endif /* HAVE_MMAP */
592 }
593 
594 /* realloc the zonestat array for the one that is not currently in use,
595  * to match the desired new size of the array (if applicable) */
596 void
597 server_zonestat_realloc(struct nsd* nsd)
598 {
599 #ifdef HAVE_MMAP
600 	uint8_t z = 0;
601 	size_t sz;
602 	int idx = 0; /* index of the zonestat array that is not in use */
603 	if(nsd->zonestatnow == nsd->zonestat[0])
604 		idx = 1;
605 	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
606 		return;
607 	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
608 	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
609 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
610 			strerror(errno));
611 		exit(1);
612 	}
613 	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
614 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
615 			nsd->zonestatfname[idx], strerror(errno));
616 		exit(1);
617 	}
618 	zonestat_remap(nsd, idx, sz);
619 	/* zero the newly allocated region */
620 	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
621 		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
622 			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
623 			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
624 	}
625 	nsd->zonestatsize[idx] = nsd->zonestatdesired;
626 #endif /* HAVE_MMAP */
627 }
628 
629 /* switch over to the other array for the new children, which
630  * briefly coexist with the old children; we want to avoid them
631  * both writing to the same statistics arrays. */
632 void
633 server_zonestat_switch(struct nsd* nsd)
634 {
635 	if(nsd->zonestatnow == nsd->zonestat[0]) {
636 		nsd->zonestatnow = nsd->zonestat[1];
637 		nsd->zonestatsizenow = nsd->zonestatsize[1];
638 	} else {
639 		nsd->zonestatnow = nsd->zonestat[0];
640 		nsd->zonestatsizenow = nsd->zonestatsize[0];
641 	}
642 }
643 #endif /* USE_ZONE_STATS */
644 
645 static void
646 cleanup_dname_compression_tables(void *ptr)
647 {
648 	free(ptr);
649 	compressed_dname_offsets = NULL;
650 	compression_table_capacity = 0;
651 }
652 
653 static void
654 initialize_dname_compression_tables(struct nsd *nsd)
655 {
656 	size_t needed = domain_table_count(nsd->db->domains) + 1;
657 	needed += EXTRA_DOMAIN_NUMBERS;
658 	if(compression_table_capacity < needed) {
659 		if(compressed_dname_offsets) {
660 			region_remove_cleanup(nsd->db->region,
661 				cleanup_dname_compression_tables,
662 				compressed_dname_offsets);
663 			free(compressed_dname_offsets);
664 		}
665 		compressed_dname_offsets = (uint16_t *) xmallocarray(
666 			needed, sizeof(uint16_t));
667 		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
668 			compressed_dname_offsets);
669 		compression_table_capacity = needed;
670 		compression_table_size=domain_table_count(nsd->db->domains)+1;
671 	}
672 	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
673 	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
674 }
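
/*
 * Example: with 100000 domains in the table, the offsets array above
 * is sized for 100001 + EXTRA_DOMAIN_NUMBERS two-byte entries, and
 * entry 0 always holds QHEADERSZ, the offset of the query name.
 */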
675 
676 static int
677 set_cloexec(struct nsd_socket *sock)
678 {
679 	assert(sock != NULL);
680 
681 	if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) {
682 		const char *socktype =
683 			sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
684 		log_msg(LOG_ERR, "fcntl(..., FD_CLOEXEC) failed for %s: %s",
685 			socktype, strerror(errno));
686 		return -1;
687 	}
688 
689 	return 1;
690 }
691 
692 static int
693 set_reuseport(struct nsd_socket *sock)
694 {
695 #ifdef SO_REUSEPORT
696 	int on = 1;
697 #ifdef SO_REUSEPORT_LB
698 	/* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like
699 	 * SO_REUSEPORT on Linux, which is what users want from the config
700 	 * option in nsd.conf; if they actually need local address and port
701 	 * reuse they will also need SO_REUSEPORT set, so assume it is _LB
702 	 * they want.
703 	 */
704 	int opt = SO_REUSEPORT_LB;
705 	static const char optname[] = "SO_REUSEPORT_LB";
706 #else /* !SO_REUSEPORT_LB */
707 	int opt = SO_REUSEPORT;
708 	static const char optname[] = "SO_REUSEPORT";
709 #endif /* SO_REUSEPORT_LB */
710 
711 	if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) {
712 		return 1;
713 	} else if(verbosity >= 3 || errno != ENOPROTOOPT) {
714 		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
715 			optname, strerror(errno));
716 	}
717 	return -1;
718 #else
719 	(void)sock;
720 #endif /* SO_REUSEPORT */
721 
722 	return 0;
723 }
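
/*
 * Illustrative sketch (not part of NSD): with SO_REUSEPORT (or
 * SO_REUSEPORT_LB) every server process binds its own socket to the
 * same address and port, and the kernel load-balances incoming
 * packets across them.  server_init() below achieves this by cloning
 * the configured sockets once per server instance.
 */
#if 0
int on = 1;
for(i = 0; i < nworkers; i++) {
	socks[i] = socket(AF_INET, SOCK_DGRAM, 0);
	setsockopt(socks[i], SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on));
	bind(socks[i], (struct sockaddr*)&addr, sizeof(addr));
	/* each worker then waits for events on its own socks[i] */
}
#endif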
724 
725 static int
726 set_reuseaddr(struct nsd_socket *sock)
727 {
728 #ifdef SO_REUSEADDR
729 	int on = 1;
730 	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
731 		return 1;
732 	}
733 	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
734 		strerror(errno));
735 	return -1;
736 #endif /* SO_REUSEADDR */
737 	return 0;
738 }
739 
740 static int
741 set_rcvbuf(struct nsd_socket *sock, int rcv)
742 {
743 #ifdef SO_RCVBUF
744 #ifdef SO_RCVBUFFORCE
745 	if(0 == setsockopt(
746 		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
747 	{
748 		return 1;
749 	}
750 	if(errno == EPERM || errno == ENOBUFS) {
751 		return 0;
752 	}
753 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
754 		strerror(errno));
755 	return -1;
756 #else /* !SO_RCVBUFFORCE */
757 	if (0 == setsockopt(
758 		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
759 	{
760 		return 1;
761 	}
762 	if(errno == ENOSYS || errno == ENOBUFS) {
763 		return 0;
764 	}
765 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
766 		strerror(errno));
767 	return -1;
768 #endif /* SO_RCVBUFFORCE */
769 #endif /* SO_RCVBUF */
770 
771 	return 0;
772 }
773 
774 static int
775 set_sndbuf(struct nsd_socket *sock, int snd)
776 {
777 #ifdef SO_SNDBUF
778 #ifdef SO_SNDBUFFORCE
779 	if(0 == setsockopt(
780 		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
781 	{
782 		return 1;
783 	}
784 	if(errno == EPERM || errno == ENOBUFS) {
785 		return 0;
786 	}
787 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
788 		strerror(errno));
789 	return -1;
790 #else /* !SO_SNDBUFFORCE */
791 	if(0 == setsockopt(
792 		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
793 	{
794 		return 1;
795 	}
796 	if(errno == ENOSYS || errno == ENOBUFS) {
797 		return 0;
798 	}
799 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
800 		strerror(errno));
801 	return -1;
802 #endif /* SO_SNDBUFFORCE */
803 #endif /* SO_SNDBUF */
804 
805 	return 0;
806 }
807 
808 static int
809 set_nonblock(struct nsd_socket *sock)
810 {
811 	const char *socktype =
812 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
813 
814 	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
815 		log_msg(LOG_ERR, "fctnl(..., O_NONBLOCK) failed for %s: %s",
816 			socktype, strerror(errno));
817 		return -1;
818 	}
819 
820 	return 1;
821 }
822 
823 static int
824 set_ipv6_v6only(struct nsd_socket *sock)
825 {
826 #ifdef INET6
827 #ifdef IPV6_V6ONLY
828 	int on = 1;
829 	const char *socktype =
830 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
831 
832 	if(0 == setsockopt(
833 		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
834 	{
835 		return 1;
836 	}
837 
838 	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
839 		socktype, strerror(errno));
840 	return -1;
841 #endif /* IPV6_V6ONLY */
842 #endif /* INET6 */
843 
844 	return 0;
845 }
846 
847 static int
848 set_ipv6_use_min_mtu(struct nsd_socket *sock)
849 {
850 #if defined(INET6) && (defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU))
851 #if defined(IPV6_USE_MIN_MTU)
852 	/* There is no fragmentation of IPv6 datagrams during forwarding in the
853 	 * network. Therefore we do not send UDP datagrams larger than the
854 	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
855 	 * larger if the network stack supports IPV6_USE_MIN_MTU.
856 	 */
857 	int opt = IPV6_USE_MIN_MTU;
858 	int optval = 1;
859 	static const char optname[] = "IPV6_USE_MIN_MTU";
860 #elif defined(IPV6_MTU)
861 	/* On Linux, PMTUD is disabled by default for datagrams so set the MTU
862 	 * to the MIN MTU to get the same.
863 	 */
864 	int opt = IPV6_MTU;
865 	int optval = IPV6_MIN_MTU;
866 	static const char optname[] = "IPV6_MTU";
867 #endif
868 	if(0 == setsockopt(
869 		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
870 	{
871 		return 1;
872 	}
873 
874 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
875 		optname, strerror(errno));
876 	return -1;
877 #else
878 	(void)sock;
879 #endif /* INET6 */
880 
881 	return 0;
882 }
883 
884 static int
885 set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
886 {
887 	int ret = 0;
888 
889 #if defined(IP_MTU_DISCOVER)
890 	int opt = IP_MTU_DISCOVER;
891 	int optval;
892 # if defined(IP_PMTUDISC_OMIT)
893 	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
894 	 * information and send packets with DF=0. Fragmentation is allowed if
895 	 * and only if the packet size exceeds the outgoing interface MTU or
896 	 * the packet encounters a smaller MTU link in the network. This mitigates
897 	 * DNS fragmentation attacks by preventing forged PMTU information.
898 	 * FreeBSD already has the same semantics without setting the option.
899 	 */
900 	optval = IP_PMTUDISC_OMIT;
901 	if(0 == setsockopt(
902 		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
903 	{
904 		return 1;
905 	}
906 
907 	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
908 		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
909 # endif /* IP_PMTUDISC_OMIT */
910 # if defined(IP_PMTUDISC_DONT)
911 	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */
912 	optval = IP_PMTUDISC_DONT;
913 	if(0 == setsockopt(
914 		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
915 	{
916 		return 1;
917 	}
918 
919 	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
920 		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
921 # endif
922 	ret = -1;
923 #elif defined(IP_DONTFRAG)
924 	int off = 0;
925 	if (0 == setsockopt(
926 		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
927 	{
928 		return 1;
929 	}
930 
931 	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
932 		strerror(errno));
933 	ret = -1;
934 #else
935 	(void)sock;
936 #endif
937 
938 	return ret;
939 }
940 
941 static int
942 set_ip_freebind(struct nsd_socket *sock)
943 {
944 #ifdef IP_FREEBIND
945 	int on = 1;
946 	const char *socktype =
947 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
948 	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
949 	{
950 		return 1;
951 	}
952 	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
953 		socktype, strerror(errno));
954 	return -1;
955 #else
956 	(void)sock;
957 #endif /* IP_FREEBIND */
958 
959 	return 0;
960 }
961 
962 static int
963 set_ip_transparent(struct nsd_socket *sock)
964 {
965 	/*
966 	The scandalous preprocessor blob here calls for some explanation :)
967 	POSIX does not specify an option to bind non-local IPs, so
968 	platforms developed several implementation-specific options,
969 	all set in the same way, but with different names.
970 	For additional complexity, some platforms manage this setting
971 	differently for different address families (IPv4 vs IPv6).
972 	The preprocessor blob below abstracts away such variability
973 	in a way that leaves the C code as lean and clear as possible.
974 	*/
975 
976 #if defined(IP_TRANSPARENT)
977 #	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_TRANSPARENT
978 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
979 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_TRANSPARENT"
980 /* as of 2020-01, Linux does not support this on IPv6 programmatically */
981 #elif defined(SO_BINDANY)
982 #	define NSD_SOCKET_OPTION_TRANSPARENT						SO_BINDANY
983 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		SOL_SOCKET
984 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"SO_BINDANY"
985 #elif defined(IP_BINDANY)
986 #	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_BINDANY
987 #	define NSD_SOCKET_OPTION_TRANSPARENT6						IPV6_BINDANY
988 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
989 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6	IPPROTO_IPV6
990 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_BINDANY"
991 #endif
992 
993 #ifndef NSD_SOCKET_OPTION_TRANSPARENT
994 	(void)sock;
995 #else
996 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT6
997 #		define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
998 #	endif
999 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
1000 #		define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
1001 #	endif
1002 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
1003 #		define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
1004 #	endif
1005 
1006 	int on = 1;
1007 	const char *socktype =
1008 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
1009 	const int is_ip6 = (sock->addr.ai_family == AF_INET6);
1010 
1011 	if(0 == setsockopt(
1012 		sock->s,
1013 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
1014 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
1015 		&on, sizeof(on)))
1016 	{
1017 		return 1;
1018 	}
1019 
1020 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
1021 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno));
1022 	return -1;
1023 #endif
1024 
1025 	return 0;
1026 }
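
/*
 * Note: on Linux, setting IP_TRANSPARENT requires CAP_NET_ADMIN, so
 * the setsockopt above can fail with EPERM when NSD is not started
 * with sufficient privileges; callers deliberately ignore the return
 * value and treat the option as best-effort.
 */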
1027 
1028 static int
1029 set_tcp_maxseg(struct nsd_socket *sock, int mss)
1030 {
1031 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
1032 	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
1033 		return 1;
1034 	}
1035 	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
1036 		strerror(errno));
1037 	return -1;
1038 #else
1039 	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
1040 #endif
1041 	return 0;
1042 }
1043 
1044 #ifdef USE_TCP_FASTOPEN
1045 static int
1046 set_tcp_fastopen(struct nsd_socket *sock)
1047 {
1048 	/* qlen specifies how many outstanding TFO requests to allow. The limit is
1049 	 * a defense against IP spoofing attacks as suggested in RFC7413.
1050 	 */
1051 	int qlen;
1052 
1053 #ifdef __APPLE__
1054 	/* The macOS implementation only supports a qlen of 1 via this call. The
1055 	 * actual value is configured by the net.inet.tcp.fastopen_backlog
1056 	 * kernel parameter.
1057 	 */
1058 	qlen = 1;
1059 #else
1060 	/* 5 is recommended on Linux. */
1061 	qlen = 5;
1062 #endif
1063 	if (0 == setsockopt(
1064 		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
1065 	{
1066 		return 1;
1067 	}
1068 
1069 	if (errno == EPERM) {
1070 		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s "
1071 				 "; this could likely be because sysctl "
1072 				 "net.inet.tcp.fastopen.enabled, "
1073 				 "net.inet.tcp.fastopen.server_enable, or "
1074 				 "net.ipv4.tcp_fastopen is disabled",
1075 			strerror(errno));
1076 	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
1077 	 * disabled, except when verbosity enabled for debugging
1078 	 */
1079 	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
1080 		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
1081 			strerror(errno));
1082 	}
1083 
1084 	return (errno == ENOPROTOOPT ? 0 : -1);
1085 }
1086 #endif /* USE_TCP_FASTOPEN */
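
/*
 * Illustrative sketch (not part of NSD): the client-side counterpart
 * of the TCP_FASTOPEN listener option above.  On Linux a client can
 * carry data in the SYN by calling sendto(2) with MSG_FASTOPEN
 * instead of connect() followed by write().  Names are hypothetical.
 */
#if 0
static ssize_t sketch_tfo_client_send(int fd, const void* qbuf,
	size_t qlen, struct sockaddr* addr, socklen_t addrlen)
{
	return sendto(fd, qbuf, qlen, MSG_FASTOPEN, addr, addrlen);
}
#endif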
1087 
1088 static int
1089 set_bindtodevice(struct nsd_socket *sock)
1090 {
1091 #if defined(SO_BINDTODEVICE)
1092 	if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
1093 		sock->device, strlen(sock->device)) == -1)
1094 	{
1095 		log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
1096 		                 "SO_BINDTODEVICE", sock->device, strerror(errno));
1097 		return -1;
1098 	}
1099 
1100 	return 1;
1101 #else
1102 	(void)sock;
1103 	return 0;
1104 #endif
1105 }
1106 
1107 static int
1108 set_setfib(struct nsd_socket *sock)
1109 {
1110 #if defined(SO_SETFIB)
1111 	if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
1112 	              (const void *)&sock->fib, sizeof(sock->fib)) == -1)
1113 	{
1114 		log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
1115 		                 "SO_SETFIB", sock->fib, strerror(errno));
1116 		return -1;
1117 	}
1118 
1119 	return 1;
1120 #else
1121 	(void)sock;
1122 	return 0;
1123 #endif
1124 }
1125 
1126 static int
1127 open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1128 {
1129 	int rcv = 1*1024*1024, snd = 1*1024*1024;
1130 
1131 	if(-1 == (sock->s = socket(
1132 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1133 	{
1134 #ifdef INET6
1135 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1136 		   (sock->addr.ai_family == AF_INET6) &&
1137 		   (errno == EAFNOSUPPORT))
1138 		{
1139 			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
1140 				"not supported");
1141 			return 0;
1142 		}
1143 #endif
1144 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1145 		return -1;
1146 	}
1147 
1148 	set_cloexec(sock);
1149 
1150 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1151 		*reuseport_works = (set_reuseport(sock) == 1);
1152 
1153 	if(nsd->options->receive_buffer_size > 0)
1154 		rcv = nsd->options->receive_buffer_size;
1155 	if(set_rcvbuf(sock, rcv) == -1)
1156 		return -1;
1157 
1158 	if(nsd->options->send_buffer_size > 0)
1159 		snd = nsd->options->send_buffer_size;
1160 	if(set_sndbuf(sock, snd) == -1)
1161 		return -1;
1162 #ifdef INET6
1163 	if(sock->addr.ai_family == AF_INET6) {
1164 		if(set_ipv6_v6only(sock) == -1 ||
1165 		   set_ipv6_use_min_mtu(sock) == -1)
1166 			return -1;
1167 	} else
1168 #endif /* INET6 */
1169 	if(sock->addr.ai_family == AF_INET) {
1170 		if(set_ipv4_no_pmtu_disc(sock) == -1)
1171 			return -1;
1172 	}
1173 
1174 	/* Set socket to non-blocking. Otherwise, on operating systems
1175 	 * with thundering herd problems, the UDP recv could block
1176 	 * after select returns readable.
1177 	 */
1178 	set_nonblock(sock);
1179 
1180 	if(nsd->options->ip_freebind)
1181 		(void)set_ip_freebind(sock);
1182 	if(nsd->options->ip_transparent)
1183 		(void)set_ip_transparent(sock);
1184 	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
1185 		return -1;
1186 	if(sock->fib != -1 && set_setfib(sock) == -1)
1187 		return -1;
1188 
1189 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1190 		char buf[256];
1191 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1192 		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
1193 			buf, strerror(errno));
1194 		return -1;
1195 	}
1196 
1197 	return 1;
1198 }
1199 
1200 static int
1201 open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1202 {
1203 #ifdef USE_TCP_FASTOPEN
1204 	report_tcp_fastopen_config();
1205 #endif
1206 
1207 	(void)reuseport_works;
1208 
1209 	if(-1 == (sock->s = socket(
1210 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1211 	{
1212 #ifdef INET6
1213 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1214 		   (sock->addr.ai_family == AF_INET6) &&
1215 		   (errno == EAFNOSUPPORT))
1216 		{
1217 			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
1218 			                     "not supported");
1219 			return 0;
1220 		}
1221 #endif /* INET6 */
1222 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1223 		return -1;
1224 	}
1225 
1226 	set_cloexec(sock);
1227 
1228 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1229 		*reuseport_works = (set_reuseport(sock) == 1);
1230 
1231 	(void)set_reuseaddr(sock);
1232 
1233 #ifdef INET6
1234 	if(sock->addr.ai_family == AF_INET6) {
1235 		if (set_ipv6_v6only(sock) == -1 ||
1236 		    set_ipv6_use_min_mtu(sock) == -1)
1237 			return -1;
1238 	}
1239 #endif
1240 
1241 	if(nsd->tcp_mss > 0)
1242 		set_tcp_maxseg(sock, nsd->tcp_mss);
1243 	/* (Stevens UNP p463): if the TCP listening socket is blocking, then
1244 	   it may block in accept, even if select() says readable. */
1245 	(void)set_nonblock(sock);
1246 	if(nsd->options->ip_freebind)
1247 		(void)set_ip_freebind(sock);
1248 	if(nsd->options->ip_transparent)
1249 		(void)set_ip_transparent(sock);
1250 	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
1251 		return -1;
1252 	if(sock->fib != -1 && set_setfib(sock) == -1)
1253 		return -1;
1254 
1255 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1256 		char buf[256];
1257 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1258 		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
1259 			buf, strerror(errno));
1260 		return -1;
1261 	}
1262 
1263 #ifdef USE_TCP_FASTOPEN
1264 	(void)set_tcp_fastopen(sock);
1265 #endif
1266 
1267 	if(listen(sock->s, TCP_BACKLOG) == -1) {
1268 		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
1269 		return -1;
1270 	}
1271 
1272 	return 1;
1273 }
1274 
1275 /*
1276  * Initialize the server, reuseport, create and bind the sockets.
1277  */
1278 int
1279 server_init(struct nsd *nsd)
1280 {
1281 	size_t i;
1282 	int reuseport = 1; /* Determine if REUSEPORT works. */
1283 
1284 	/* open server interface ports */
1285 	for(i = 0; i < nsd->ifs; i++) {
1286 		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
1287 		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
1288 		{
1289 			return -1;
1290 		}
1291 	}
1292 
1293 	if(nsd->reuseport && reuseport) {
1294 		size_t ifs = nsd->ifs * nsd->reuseport;
1295 
1296 		/* increase the size of the interface arrays, there are going
1297 		 * to be separate interface file descriptors for every server
1298 		 * instance */
1299 		region_remove_cleanup(nsd->region, free, nsd->udp);
1300 		region_remove_cleanup(nsd->region, free, nsd->tcp);
1301 
1302 		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
1303 		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
1304 		region_add_cleanup(nsd->region, free, nsd->udp);
1305 		region_add_cleanup(nsd->region, free, nsd->tcp);
1306 		if(ifs > nsd->ifs) {
1307 			memset(&nsd->udp[nsd->ifs], 0,
1308 				(ifs-nsd->ifs)*sizeof(*nsd->udp));
1309 			memset(&nsd->tcp[nsd->ifs], 0,
1310 				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
1311 		}
1312 
1313 		for(i = nsd->ifs; i < ifs; i++) {
1314 			nsd->udp[i] = nsd->udp[i%nsd->ifs];
1315 			nsd->udp[i].s = -1;
1316 			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
1317 				return -1;
1318 			}
1319 			/* Turn off REUSEPORT for TCP by copying the socket
1320 			 * file descriptor.
1321 			 * This means we should not close TCP used by
1322 			 * other servers in reuseport enabled mode, in
1323 			 * server_child().
1324 			 */
1325 			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
1326 		}
1327 
1328 		nsd->ifs = ifs;
1329 	} else {
1330 		nsd->reuseport = 0;
1331 	}
1332 
1333 	return 0;
1334 }
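
/*
 * Example of the resulting layout (hypothetical numbers): with two
 * configured interfaces and nsd->reuseport == 3, nsd->ifs grows from
 * 2 to 6 and nsd->udp[] holds one socket per (instance, interface)
 * pair: udp[0..1] for instance 0, udp[2..3] for instance 1, udp[4..5]
 * for instance 2; socket i reuses the address of interface i % 2.
 * The tcp[] array shares file descriptors instead, as noted above.
 */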
1335 
1336 /*
1337  * Prepare the server for take off.
1338  *
1339  */
1340 int
1341 server_prepare(struct nsd *nsd)
1342 {
1343 #ifdef RATELIMIT
1344 	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
1345 #ifdef HAVE_GETRANDOM
1346 	uint32_t v;
1347 	if(getrandom(&v, sizeof(v), 0) == -1) {
1348 		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
1349 		exit(1);
1350 	}
1351 	hash_set_raninit(v);
1352 #elif defined(HAVE_ARC4RANDOM)
1353 	hash_set_raninit(arc4random());
1354 #else
1355 	uint32_t v = getpid() ^ time(NULL);
1356 	srandom((unsigned long)v);
1357 #  ifdef HAVE_SSL
1358 	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
1359 		hash_set_raninit(v);
1360 	else
1361 #  endif
1362 		hash_set_raninit(random());
1363 #endif
1364 	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
1365 		nsd->options->rrl_ratelimit,
1366 		nsd->options->rrl_whitelist_ratelimit,
1367 		nsd->options->rrl_slip,
1368 		nsd->options->rrl_ipv4_prefix_length,
1369 		nsd->options->rrl_ipv6_prefix_length);
1370 #endif /* RATELIMIT */
1371 
1372 	/* Open the database... */
1373 	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
1374 		log_msg(LOG_ERR, "unable to open the database %s: %s",
1375 			nsd->dbfile, strerror(errno));
1376 		unlink(nsd->task[0]->fname);
1377 		unlink(nsd->task[1]->fname);
1378 #ifdef USE_ZONE_STATS
1379 		unlink(nsd->zonestatfname[0]);
1380 		unlink(nsd->zonestatfname[1]);
1381 #endif
1382 		xfrd_del_tempdir(nsd);
1383 		return -1;
1384 	}
1385 	/* check if zone files have been modified */
1386 	/* NULL for taskudb because we send soainfo in a moment, batched up,
1387 	 * for all zones */
1388 	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
1389 		nsd->options->database[0] == 0))
1390 		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
1391 	zonestatid_tree_set(nsd);
1392 
1393 	compression_table_capacity = 0;
1394 	initialize_dname_compression_tables(nsd);
1395 
1396 #ifdef	BIND8_STATS
1397 	/* Initialize times... */
1398 	time(&nsd->st.boot);
1399 	set_bind8_alarm(nsd);
1400 #endif /* BIND8_STATS */
1401 
1402 	return 0;
1403 }
1404 
1405 /*
1406  * Fork the required number of servers.
1407  */
1408 static int
1409 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
1410 	int* xfrd_sock_p)
1411 {
1412 	size_t i;
1413 
1414 	/* Start all child servers initially.  */
1415 	for (i = 0; i < nsd->child_count; ++i) {
1416 		nsd->children[i].pid = 0;
1417 	}
1418 
1419 	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
1420 }
1421 
1422 static void
1423 server_close_socket(struct nsd_socket *sock)
1424 {
1425 	if(sock->s != -1) {
1426 		close(sock->s);
1427 		sock->s = -1;
1428 	}
1429 }
1430 
1431 void
1432 server_close_all_sockets(struct nsd_socket sockets[], size_t n)
1433 {
1434 	size_t i;
1435 
1436 	/* Close all the sockets... */
1437 	for (i = 0; i < n; ++i) {
1438 		server_close_socket(&sockets[i]);
1439 	}
1440 }
1441 
1442 /*
1443  * Close the sockets, shutdown the server and exit.
1444  * Does not return.
1445  */
1446 void
1447 server_shutdown(struct nsd *nsd)
1448 {
1449 	size_t i;
1450 
1451 	server_close_all_sockets(nsd->udp, nsd->ifs);
1452 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1453 	/* CHILD: close command channel to parent */
1454 	if(nsd->this_child && nsd->this_child->parent_fd != -1)
1455 	{
1456 		close(nsd->this_child->parent_fd);
1457 		nsd->this_child->parent_fd = -1;
1458 	}
1459 	/* SERVER: close command channels to children */
1460 	if(!nsd->this_child)
1461 	{
1462 		for(i=0; i < nsd->child_count; ++i)
1463 			if(nsd->children[i].child_fd != -1)
1464 			{
1465 				close(nsd->children[i].child_fd);
1466 				nsd->children[i].child_fd = -1;
1467 			}
1468 	}
1469 
1470 	tsig_finalize();
1471 #ifdef HAVE_SSL
1472 	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
1473 	if (nsd->tls_ctx)
1474 		SSL_CTX_free(nsd->tls_ctx);
1475 #endif
1476 
1477 #ifdef MEMCLEAN /* OS collects memory pages */
1478 #ifdef RATELIMIT
1479 	rrl_mmap_deinit_keep_mmap();
1480 #endif
1481 #ifdef USE_DNSTAP
1482 	dt_collector_destroy(nsd->dt_collector, nsd);
1483 #endif
1484 	udb_base_free_keep_mmap(nsd->task[0]);
1485 	udb_base_free_keep_mmap(nsd->task[1]);
1486 	namedb_close_udb(nsd->db); /* keeps mmap */
1487 	namedb_close(nsd->db);
1488 	nsd_options_destroy(nsd->options);
1489 	region_destroy(nsd->region);
1490 #endif
1491 	log_finalize();
1492 	exit(0);
1493 }
1494 
1495 void
1496 server_prepare_xfrd(struct nsd* nsd)
1497 {
1498 	char tmpfile[256];
1499 	/* create task mmaps */
1500 	nsd->mytask = 0;
1501 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
1502 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1503 	nsd->task[0] = task_file_create(tmpfile);
1504 	if(!nsd->task[0]) {
1505 #ifdef USE_ZONE_STATS
1506 		unlink(nsd->zonestatfname[0]);
1507 		unlink(nsd->zonestatfname[1]);
1508 #endif
1509 		xfrd_del_tempdir(nsd);
1510 		exit(1);
1511 	}
1512 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
1513 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1514 	nsd->task[1] = task_file_create(tmpfile);
1515 	if(!nsd->task[1]) {
1516 		unlink(nsd->task[0]->fname);
1517 #ifdef USE_ZONE_STATS
1518 		unlink(nsd->zonestatfname[0]);
1519 		unlink(nsd->zonestatfname[1]);
1520 #endif
1521 		xfrd_del_tempdir(nsd);
1522 		exit(1);
1523 	}
1524 	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
1525 	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
1526 	/* create xfrd listener structure */
1527 	nsd->xfrd_listener = region_alloc(nsd->region,
1528 		sizeof(netio_handler_type));
1529 	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
1530 		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
1531 	nsd->xfrd_listener->fd = -1;
1532 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
1533 		nsd;
1534 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
1535 		xfrd_tcp_create(nsd->region, QIOBUFSZ);
1536 }
1537 
1538 
1539 void
1540 server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
1541 {
1542 	pid_t pid;
1543 	int sockets[2] = {0,0};
1544 	struct ipc_handler_conn_data *data;
1545 
1546 	if(nsd->xfrd_listener->fd != -1)
1547 		close(nsd->xfrd_listener->fd);
1548 	if(del_db) {
1549 		/* recreate taskdb that xfrd was using, it may be corrupt */
1550 		/* we (or reload) use nsd->mytask, and xfrd uses the other */
1551 		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
1552 		nsd->task[1-nsd->mytask]->fname = NULL;
1553 		/* free alloc already, so udb does not shrink itself */
1554 		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
1555 		nsd->task[1-nsd->mytask]->alloc = NULL;
1556 		udb_base_free(nsd->task[1-nsd->mytask]);
1557 		/* create new file, overwrite the old one */
1558 		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
1559 		free(tmpfile);
1560 	}
1561 	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
1562 		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
1563 		return;
1564 	}
1565 	pid = fork();
1566 	switch (pid) {
1567 	case -1:
1568 		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
1569 		break;
1570 	default:
1571 		/* PARENT: close first socket, use second one */
1572 		close(sockets[0]);
1573 		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
1574 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1575 		}
1576 		if(del_db) xfrd_free_namedb(nsd);
1577 		/* use other task than I am using, since if xfrd died and is
1578 		 * restarted, the reload is using nsd->mytask */
1579 		nsd->mytask = 1 - nsd->mytask;
1580 
1581 #ifdef HAVE_SETPROCTITLE
1582 		setproctitle("xfrd");
1583 #endif
1584 #ifdef HAVE_CPUSET_T
1585 		if(nsd->use_cpu_affinity) {
1586 			set_cpu_affinity(nsd->xfrd_cpuset);
1587 		}
1588 #endif
1589 
1590 		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
1591 		/* NOTREACHED */
1592 		break;
1593 	case 0:
1594 		/* CHILD: close second socket, use first one */
1595 		close(sockets[1]);
1596 		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
1597 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1598 		}
1599 		nsd->xfrd_listener->fd = sockets[0];
1600 		break;
1601 	}
1602 	/* server-parent only */
1603 	nsd->xfrd_listener->timeout = NULL;
1604 	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
1605 	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
1606 	/* clear ongoing ipc reads */
1607 	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
1608 	data->conn->is_reading = 0;
1609 }
1610 
1611 /** add all soainfo to taskdb */
1612 static void
1613 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
1614 {
1615 	struct radnode* n;
1616 	udb_ptr task_last; /* last task, mytask is empty so NULL */
1617 	/* add all SOA INFO to mytask */
1618 	udb_ptr_init(&task_last, taskudb);
1619 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
1620 		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
1621 	}
1622 	udb_ptr_unlink(&task_last, taskudb);
1623 }
1624 
1625 void
1626 server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
1627 {
1628 	/* normally this exchanges the SOA from nsd to xfrd and the expire back.
1629 	 *   parent fills one taskdb with soas, xfrd fills other with expires.
1630 	 *   then they exchange and process.
1631 	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
1632 	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
1633 	 *   expire notifications can be sent back via a normal reload later
1634 	 *   (xfrd will wait for current running reload to finish if any).
1635 	 */
1636 	sig_atomic_t cmd = 0;
1637 	pid_t mypid;
1638 	int xfrd_sock = nsd->xfrd_listener->fd;
1639 	struct udb_base* taskudb = nsd->task[nsd->mytask];
1640 	udb_ptr t;
1641 	if(!shortsoa) {
1642 		if(nsd->signal_hint_shutdown) {
1643 		shutdown:
1644 			log_msg(LOG_WARNING, "signal received, shutting down...");
1645 			server_close_all_sockets(nsd->udp, nsd->ifs);
1646 			server_close_all_sockets(nsd->tcp, nsd->ifs);
1647 #ifdef HAVE_SSL
1648 			daemon_remote_close(nsd->rc);
1649 #endif
1650 			/* Unlink it if possible... */
1651 			unlinkpid(nsd->pidfile);
1652 			unlink(nsd->task[0]->fname);
1653 			unlink(nsd->task[1]->fname);
1654 #ifdef USE_ZONE_STATS
1655 			unlink(nsd->zonestatfname[0]);
1656 			unlink(nsd->zonestatfname[1]);
1657 #endif
1658 			/* write the nsd.db to disk, wait for it to complete */
1659 			udb_base_sync(nsd->db->udb, 1);
1660 			udb_base_close(nsd->db->udb);
1661 			server_shutdown(nsd);
1662 			/* NOTREACHED */
1663 			exit(0);
1664 		}
1665 	}
1666 	if(shortsoa) {
1667 		/* put SOA in xfrd task because mytask may be in use */
1668 		taskudb = nsd->task[1-nsd->mytask];
1669 	}
1670 
1671 	add_all_soa_to_task(nsd, taskudb);
1672 	if(!shortsoa) {
1673 		/* wait for xfrd to signal task is ready, RELOAD signal */
1674 		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
1675 			cmd != NSD_RELOAD) {
1676 			log_msg(LOG_ERR, "did not get start signal from xfrd");
1677 			exit(1);
1678 		}
1679 		if(nsd->signal_hint_shutdown) {
1680 			goto shutdown;
1681 		}
1682 	}
1683 	/* give xfrd our task, signal it with RELOAD_DONE */
1684 	task_process_sync(taskudb);
1685 	cmd = NSD_RELOAD_DONE;
1686 	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1687 		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1688 			(int)nsd->pid, strerror(errno));
1689 	}
1690 	mypid = getpid();
1691 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1692 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1693 			strerror(errno));
1694 	}
1695 
1696 	if(!shortsoa) {
1697 		/* process the xfrd task work (expiry data) */
1698 		nsd->mytask = 1 - nsd->mytask;
1699 		taskudb = nsd->task[nsd->mytask];
1700 		task_remap(taskudb);
1701 		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
1702 		while(!udb_ptr_is_null(&t)) {
1703 			task_process_expire(nsd->db, TASKLIST(&t));
1704 			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
1705 		}
1706 		udb_ptr_unlink(&t, taskudb);
1707 		task_clear(taskudb);
1708 
1709 		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
1710 		cmd = NSD_RELOAD_DONE;
1711 		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1712 			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1713 				(int)nsd->pid, strerror(errno));
1714 		}
1715 	}
1716 }
1717 
1718 #ifdef HAVE_SSL
1719 static void
1720 log_crypto_from_err(const char* str, unsigned long err)
1721 {
1722 	/* error:[error code]:[library name]:[function name]:[reason string] */
1723 	char buf[128];
1724 	unsigned long e;
1725 	ERR_error_string_n(err, buf, sizeof(buf));
1726 	log_msg(LOG_ERR, "%s crypto %s", str, buf);
1727 	while( (e=ERR_get_error()) ) {
1728 		ERR_error_string_n(e, buf, sizeof(buf));
1729 		log_msg(LOG_ERR, "and additionally crypto %s", buf);
1730 	}
1731 }
1732 
1733 void
1734 log_crypto_err(const char* str)
1735 {
1736 	log_crypto_from_err(str, ERR_get_error());
1737 }
1738 
1739 /** true if the ssl handshake error has to be squelched from the logs */
1740 static int
1741 squelch_err_ssl_handshake(unsigned long err)
1742 {
1743 	if(verbosity >= 3)
1744 		return 0; /* only squelch on low verbosity */
1745 	/* this is very specific, we could filter on ERR_GET_REASON()
1746 	 * (the third element in ERR_PACK) */
1747 	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
1748 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
1749 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
1750 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
1751 #ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
1752 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
1753 #endif
1754 #ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
1755 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
1756 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
1757 #  ifdef SSL_R_VERSION_TOO_LOW
1758 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
1759 #  endif
1760 #endif
1761 		)
1762 		return 1;
1763 	return 0;
1764 }
1765 
1766 void
1767 perform_openssl_init(void)
1768 {
1769 	/* init SSL library */
1770 #ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
1771 	ERR_load_crypto_strings();
1772 #endif
1773 	ERR_load_SSL_strings();
1774 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
1775 	OpenSSL_add_all_algorithms();
1776 #else
1777 	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
1778 		| OPENSSL_INIT_ADD_ALL_DIGESTS
1779 		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
1780 #endif
1781 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
1782 	(void)SSL_library_init();
1783 #else
1784 	OPENSSL_init_ssl(0, NULL);
1785 #endif
1786 
1787 	if(!RAND_status()) {
1788 		/* try to seed it */
1789 		unsigned char buf[256];
1790 		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
1791 		size_t i;
1792 		v = seed;
1793 		for(i=0; i<256/sizeof(v); i++) {
1794 			memmove(buf+i*sizeof(v), &v, sizeof(v));
1795 			v = v*seed + (unsigned int)i;
1796 		}
1797 		RAND_seed(buf, 256);
1798 		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
1799 	}
1800 }
1801 
1802 static int
1803 get_ocsp(char *filename, unsigned char **ocsp)
1804 {
1805 	BIO *bio;
1806 	OCSP_RESPONSE *response;
1807 	int len = -1;
1808 	unsigned char *p, *buf;
1809 	assert(filename);
1810 
1811 	if ((bio = BIO_new_file(filename, "r")) == NULL) {
1812 		log_crypto_err("get_ocsp: BIO_new_file failed");
1813 		return -1;
1814 	}
1815 
1816 	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
1817 		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
1818 		BIO_free(bio);
1819 		return -1;
1820 	}
1821 
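	/* standard OpenSSL i2d idiom: with a NULL output pointer the call
	 * only computes the DER-encoded length; the second
	 * i2d_OCSP_RESPONSE() call further down does the actual encoding
	 * into the malloc'ed buffer. */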
1822 	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
1823 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
1824 		OCSP_RESPONSE_free(response);
1825 		BIO_free(bio);
1826 		return -1;
1827 	}
1828 
1829 	if ((buf = malloc((size_t) len)) == NULL) {
1830 		log_msg(LOG_ERR, "get_ocsp: malloc failed");
1831 		OCSP_RESPONSE_free(response);
1832 		BIO_free(bio);
1833 		return -1;
1834 	}
1835 
1836 	p = buf;
1837 	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
1838 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
1839 		free(buf);
1840 		OCSP_RESPONSE_free(response);
1841 		BIO_free(bio);
1842 		return -1;
1843 	}
1844 
1845 	OCSP_RESPONSE_free(response);
1846 	BIO_free(bio);
1847 
1848 	*ocsp = buf;
1849 	return len;
1850 }
1851 
1852 /* further setup ssl ctx after the keys are loaded */
1853 static void
1854 listen_sslctx_setup_2(void* ctxt)
1855 {
1856 	SSL_CTX* ctx = (SSL_CTX*)ctxt;
1857 	(void)ctx;
1858 #if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
1859 	if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
1860 		/* NOTREACHED */
1861 		log_crypto_err("Error in SSL_CTX_set_ecdh_auto, not enabling ECDHE");
1862 	}
1863 #elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
1864 	if(1) {
1865 		EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
1866 		if (!ecdh) {
1867 			log_crypto_err("could not find p256, not enabling ECDHE");
1868 		} else {
1869 			if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
1870 				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
1871 			}
1872 			EC_KEY_free (ecdh);
1873 		}
1874 	}
1875 #endif
1876 }
1877 
1878 static int
1879 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
1880 {
1881 	if(ocspdata) {
1882 		unsigned char *p;
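		/* SSL_set_tlsext_status_ocsp_resp() takes ownership of the
		 * buffer and frees it later on, hence the fresh copy of the
		 * cached response for every callback invocation. */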
1883 		if ((p=malloc(ocspdata_len)) == NULL) {
1884 			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
1885 			return SSL_TLSEXT_ERR_NOACK;
1886 		}
1887 		memcpy(p, ocspdata, ocspdata_len);
1888 		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
1889 			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
1890 			free(p);
1891 			return SSL_TLSEXT_ERR_NOACK;
1892 		}
1893 		return SSL_TLSEXT_ERR_OK;
1894 	} else {
1895 		return SSL_TLSEXT_ERR_NOACK;
1896 	}
1897 }
1898 
1899 SSL_CTX*
1900 server_tls_ctx_setup(char* key, char* pem, char* verifypem)
1901 {
1902 	SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method());
1903 	if(!ctx) {
1904 		log_crypto_err("could not SSL_CTX_new");
1905 		return NULL;
1906 	}
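	/* SSL_CTX_set_options() returns the updated option bitmask, which
	 * is why each call below checks that the requested bit is present
	 * in the returned value before continuing. */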
1907 	/* no SSLv2, SSLv3 because they have known defects */
1908 #if SSL_OP_NO_SSLv2 != 0
1909 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){
1910 		log_crypto_err("could not set SSL_OP_NO_SSLv2");
1911 		SSL_CTX_free(ctx);
1912 		return NULL;
1913 	}
1914 #endif
1915 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3)
1916 		!= SSL_OP_NO_SSLv3){
1917 		log_crypto_err("could not set SSL_OP_NO_SSLv3");
1918 		SSL_CTX_free(ctx);
1919 		return NULL;
1920 	}
1921 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1)
1922 	/* if we have tls 1.1 disable 1.0 */
1923 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1)
1924 		!= SSL_OP_NO_TLSv1){
1925 		log_crypto_err("could not set SSL_OP_NO_TLSv1");
1926 		SSL_CTX_free(ctx);
1927 		return NULL;
1928 	}
1929 #endif
1930 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2)
1931 	/* if we have tls 1.2 disable 1.1 */
1932 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1)
1933 		!= SSL_OP_NO_TLSv1_1){
1934 		log_crypto_err("could not set SSL_OP_NO_TLSv1_1");
1935 		SSL_CTX_free(ctx);
1936 		return NULL;
1937 	}
1938 #endif
1939 #if defined(SSL_OP_NO_RENEGOTIATION)
1940 	/* disable client renegotiation */
1941 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) &
1942 		SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) {
1943 		log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION");
1944 		SSL_CTX_free(ctx);
1945 		return NULL;
1946 	}
1947 #endif
1948 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20)
1949 	/* if we have sha256 and chacha20, restrict the cipher list to suites with no known vulnerabilities */
1950 	if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20"))
1951 		log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list");
1952 #endif
1953 	if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) &
1954 		SSL_OP_CIPHER_SERVER_PREFERENCE) !=
1955 		SSL_OP_CIPHER_SERVER_PREFERENCE) {
1956 		log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE");
1957 		SSL_CTX_free(ctx);
1958 		return NULL;
1959 	}
1960 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL
1961 	SSL_CTX_set_security_level(ctx, 0);
1962 #endif
1963 	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
1964 		log_msg(LOG_ERR, "error for cert file: %s", pem);
1965 		log_crypto_err("error in SSL_CTX use_certificate_chain_file");
1966 		SSL_CTX_free(ctx);
1967 		return NULL;
1968 	}
1969 	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
1970 		log_msg(LOG_ERR, "error for private key file: %s", key);
1971 		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
1972 		SSL_CTX_free(ctx);
1973 		return NULL;
1974 	}
1975 	if(!SSL_CTX_check_private_key(ctx)) {
1976 		log_msg(LOG_ERR, "error for key file: %s", key);
1977 		log_crypto_err("Error in SSL_CTX check_private_key");
1978 		SSL_CTX_free(ctx);
1979 		return NULL;
1980 	}
1981 	listen_sslctx_setup_2(ctx);
1982 	if(verifypem && verifypem[0]) {
1983 		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
1984 			log_crypto_err("Error in SSL_CTX verify locations");
1985 			SSL_CTX_free(ctx);
1986 			return NULL;
1987 		}
1988 		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
1989 		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
1990 	}
1991 	return ctx;
1992 }
1993 
1994 SSL_CTX*
1995 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
1996 {
1997 	char *key, *pem;
1998 	SSL_CTX *ctx;
1999 
2000 	key = nsd->options->tls_service_key;
2001 	pem = nsd->options->tls_service_pem;
2002 	if(!key || key[0] == 0) {
2003 		log_msg(LOG_ERR, "error: no tls-service-key file specified");
2004 		return NULL;
2005 	}
2006 	if(!pem || pem[0] == 0) {
2007 		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
2008 		return NULL;
2009 	}
2010 
2011 	/* NOTE: This mimics the existing code in Unbound 1.5.1 by supporting SSL, but
2012 	 * draft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2 */
2013 	ctx = server_tls_ctx_setup(key, pem, verifypem);
2014 	if(!ctx) {
2015 		log_msg(LOG_ERR, "could not setup server TLS context");
2016 		return NULL;
2017 	}
2018 	if(ocspfile && ocspfile[0]) {
2019 		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
2020 			log_crypto_err("Error reading OCSPfile");
2021 			SSL_CTX_free(ctx);
2022 			return NULL;
2023 		} else {
2024 			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
2025 			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
2026 				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
2027 				SSL_CTX_free(ctx);
2028 				return NULL;
2029 			}
2030 		}
2031 	}
2032 	return ctx;
2033 }
2034 
2035 /* check whether the tcp_accept_handler_data was created for the dedicated TLS port */
2036 int
2037 using_tls_port(struct sockaddr* addr, const char* tls_port)
2038 {
2039 	in_port_t port = 0;
2040 
2041 	if (addr->sa_family == AF_INET)
2042 		port = ((struct sockaddr_in*)addr)->sin_port;
2043 #ifdef HAVE_STRUCT_SOCKADDR_IN6
2044 	else
2045 		port = ((struct sockaddr_in6*)addr)->sin6_port;
2046 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */
2047 	if (atoi(tls_port) == ntohs(port))
2048 		return 1;
2049 
2050 	return 0;
2051 }
2052 #endif
2053 
2054 /* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
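/* Usage sketch, mirroring the IPC call sites below:
 *   sig_atomic_t cmd;
 *   if(block_read(nsd, fd, &cmd, sizeof(cmd), RELOAD_SYNC_TIMEOUT)
 *           != sizeof(cmd)) { .. handle eof, error or timeout .. }
 * A timeout of 0 amounts to a nonblocking poll for pending data. */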
2055 ssize_t
2056 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
2057 {
2058 	uint8_t* buf = (uint8_t*) p;
2059 	ssize_t total = 0;
2060 	struct pollfd fd;
2061 	memset(&fd, 0, sizeof(fd));
2062 	fd.fd = s;
2063 	fd.events = POLLIN;
2064 
2065 	while( total < sz) {
2066 		ssize_t ret;
2067 		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
2068 		if(ret == -1) {
2069 			if(errno == EAGAIN)
2070 				/* blocking read */
2071 				continue;
2072 			if(errno == EINTR) {
2073 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2074 					return -1;
2075 				/* other signals can be handled later */
2076 				continue;
2077 			}
2078 			/* some error */
2079 			return -1;
2080 		}
2081 		if(ret == 0) {
2082 			/* operation timed out */
2083 			return -2;
2084 		}
2085 		ret = read(s, buf+total, sz-total);
2086 		if(ret == -1) {
2087 			if(errno == EAGAIN)
2088 				/* blocking read */
2089 				continue;
2090 			if(errno == EINTR) {
2091 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2092 					return -1;
2093 				/* other signals can be handled later */
2094 				continue;
2095 			}
2096 			/* some error */
2097 			return -1;
2098 		}
2099 		if(ret == 0) {
2100 			/* closed connection! */
2101 			return 0;
2102 		}
2103 		total += ret;
2104 	}
2105 	return total;
2106 }
2107 
2108 static void
2109 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
2110 {
2111 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2112 	udb_ptr t, next;
2113 	udb_base* u = nsd->task[nsd->mytask];
2114 	udb_ptr_init(&next, u);
2115 	udb_ptr_new(&t, u, udb_base_get_userdata(u));
2116 	udb_base_set_userdata(u, 0);
2117 	while(!udb_ptr_is_null(&t)) {
2118 		/* store next in list so this one can be deleted or reused */
2119 		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
2120 		udb_rptr_zero(&TASKLIST(&t)->next, u);
2121 
2122 		/* process task t */
2123 		/* append results for task t and update last_task */
2124 		task_process_in_reload(nsd, u, last_task, &t);
2125 
2126 		/* go to next */
2127 		udb_ptr_set_ptr(&t, u, &next);
2128 
2129 		/* if the parent has quit, we must quit too, poll the fd for cmds */
2130 		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2131 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2132 			if(cmd == NSD_QUIT) {
2133 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2134 				/* sync to disk (if needed) */
2135 				udb_base_sync(nsd->db->udb, 0);
2136 				/* unlink files of remainder of tasks */
2137 				while(!udb_ptr_is_null(&t)) {
2138 					if(TASKLIST(&t)->task_type == task_apply_xfr) {
2139 						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
2140 					}
2141 					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
2142 				}
2143 				udb_ptr_unlink(&t, u);
2144 				udb_ptr_unlink(&next, u);
2145 				exit(0);
2146 			}
2147 		}
2148 
2149 	}
2150 	udb_ptr_unlink(&t, u);
2151 	udb_ptr_unlink(&next, u);
2152 }
2153 
2154 #ifdef BIND8_STATS
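/* Stats handoff between the old main and the reload process (sketch):
 * the old main writes its struct nsdst followed by one stc_type query
 * counter per child; reload_do_stats() reads them back in that order. */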
2155 static void
2156 parent_send_stats(struct nsd* nsd, int cmdfd)
2157 {
2158 	size_t i;
2159 	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
2160 		log_msg(LOG_ERR, "could not write stats to reload");
2161 		return;
2162 	}
2163 	for(i=0; i<nsd->child_count; i++)
2164 		if(!write_socket(cmdfd, &nsd->children[i].query_count,
2165 			sizeof(stc_type))) {
2166 			log_msg(LOG_ERR, "could not write stats to reload");
2167 			return;
2168 		}
2169 }
2170 
2171 static void
2172 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
2173 {
2174 	struct nsdst s;
2175 	stc_type* p;
2176 	size_t i;
2177 	if(block_read(nsd, cmdfd, &s, sizeof(s),
2178 		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
2179 		log_msg(LOG_ERR, "could not read stats from oldpar");
2180 		return;
2181 	}
2182 	s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
2183 	s.db_mem = region_get_mem(nsd->db->region);
2184 	p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
2185 		nsd->child_count);
2186 	if(!p) return;
2187 	for(i=0; i<nsd->child_count; i++) {
2188 		if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!=
2189 			sizeof(stc_type))
2190 			return;
2191 	}
2192 }
2193 #endif /* BIND8_STATS */
2194 
2195 /*
2196  * Reload the database, stop the old parent, re-fork the children and
2197  * continue as the new server_main.
2198  */
2199 static void
2200 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
2201 	int cmdsocket)
2202 {
2203 	pid_t mypid;
2204 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2205 	int ret;
2206 	udb_ptr last_task;
2207 	struct sigaction old_sigchld, ign_sigchld;
2208 	/* ignore SIGCHLD from the previous server_main that used this pid */
2209 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
2210 	ign_sigchld.sa_handler = SIG_IGN;
2211 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
2212 
2213 #ifdef HAVE_SETPROCTITLE
2214 	setproctitle("main");
2215 #endif
2216 #ifdef HAVE_CPUSET_T
2217 	if(nsd->use_cpu_affinity) {
2218 		set_cpu_affinity(nsd->cpuset);
2219 	}
2220 #endif
2221 
2222 	/* see what tasks we got from xfrd */
2223 	task_remap(nsd->task[nsd->mytask]);
2224 	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
2225 	udb_compact_inhibited(nsd->db->udb, 1);
2226 	reload_process_tasks(nsd, &last_task, cmdsocket);
2227 	udb_compact_inhibited(nsd->db->udb, 0);
2228 	udb_compact(nsd->db->udb);
2229 
2230 #ifndef NDEBUG
2231 	if(nsd_debug_level >= 1)
2232 		region_log_stats(nsd->db->region);
2233 #endif /* NDEBUG */
2234 	/* sync to disk (if needed) */
2235 	udb_base_sync(nsd->db->udb, 0);
2236 
2237 	initialize_dname_compression_tables(nsd);
2238 
2239 #ifdef BIND8_STATS
2240 	/* Restart dumping stats if required.  */
2241 	time(&nsd->st.boot);
2242 	set_bind8_alarm(nsd);
2243 #endif
2244 #ifdef USE_ZONE_STATS
2245 	server_zonestat_realloc(nsd); /* realloc for new children */
2246 	server_zonestat_switch(nsd);
2247 #endif
2248 
2249 	/* listen for the signals of failed children again */
2250 	sigaction(SIGCHLD, &old_sigchld, NULL);
2251 	/* Start new child processes */
2252 	if (server_start_children(nsd, server_region, netio, &nsd->
2253 		xfrd_listener->fd) != 0) {
2254 		send_children_quit(nsd);
2255 		exit(1);
2256 	}
2257 
2258 	/* if the parent has quit, we must quit too, poll the fd for cmds */
2259 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2260 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2261 		if(cmd == NSD_QUIT) {
2262 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2263 			send_children_quit(nsd);
2264 			exit(0);
2265 		}
2266 	}
2267 
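	/* Handshake with the old main process, roughly:
	 *   reload   -> old main: NSD_QUIT_SYNC
	 *   old main -> reload:   NSD_RELOAD  (ack, old main then exits)
	 * A read timeout (-2) resends the request; reading NSD_QUIT back
	 * instead means the whole daemon is shutting down. */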
2268 	/* Send quit command to parent: blocking, wait for receipt. */
2269 	do {
2270 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
2271 		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
2272 		{
2273 			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
2274 				strerror(errno));
2275 		}
2276 		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
2277 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
2278 		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
2279 			RELOAD_SYNC_TIMEOUT);
2280 		if(ret == -2) {
2281 			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
2282 		}
2283 	} while (ret == -2);
2284 	if(ret == -1) {
2285 		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
2286 			strerror(errno));
2287 	}
2288 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
2289 	if(cmd == NSD_QUIT) {
2290 		/* small race condition possible here, parent got quit cmd. */
2291 		send_children_quit(nsd);
2292 		exit(1);
2293 	}
2294 	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
2295 #ifdef BIND8_STATS
2296 	reload_do_stats(cmdsocket, nsd, &last_task);
2297 #endif
2298 	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
2299 	task_process_sync(nsd->task[nsd->mytask]);
2300 #ifdef USE_ZONE_STATS
2301 	server_zonestat_realloc(nsd); /* realloc for next children */
2302 #endif
2303 
2304 	/* send soainfo to the xfrd process, signal it that reload is done,
2305 	 * it picks up the taskudb */
2306 	cmd = NSD_RELOAD_DONE;
2307 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
2308 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
2309 			strerror(errno));
2310 	}
2311 	mypid = getpid();
2312 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2313 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2314 			strerror(errno));
2315 	}
2316 
2317 	/* try to reopen file */
2318 	if (nsd->file_rotation_ok)
2319 		log_reopen(nsd->log_filename, 1);
2320 	/* exit reload, continue as new server_main */
2321 }
2322 
2323 /*
2324  * Get the mode depending on the signal hints that have been received.
2325  * Multiple signal hints can be received and will be handled in turn.
2326  */
2327 static sig_atomic_t
2328 server_signal_mode(struct nsd *nsd)
2329 {
2330 	if(nsd->signal_hint_quit) {
2331 		nsd->signal_hint_quit = 0;
2332 		return NSD_QUIT;
2333 	}
2334 	else if(nsd->signal_hint_shutdown) {
2335 		nsd->signal_hint_shutdown = 0;
2336 		return NSD_SHUTDOWN;
2337 	}
2338 	else if(nsd->signal_hint_child) {
2339 		nsd->signal_hint_child = 0;
2340 		return NSD_REAP_CHILDREN;
2341 	}
2342 	else if(nsd->signal_hint_reload) {
2343 		nsd->signal_hint_reload = 0;
2344 		return NSD_RELOAD;
2345 	}
2346 	else if(nsd->signal_hint_reload_hup) {
2347 		nsd->signal_hint_reload_hup = 0;
2348 		return NSD_RELOAD_REQ;
2349 	}
2350 	else if(nsd->signal_hint_stats) {
2351 		nsd->signal_hint_stats = 0;
2352 #ifdef BIND8_STATS
2353 		set_bind8_alarm(nsd);
2354 #endif
2355 		return NSD_STATS;
2356 	}
2357 	else if(nsd->signal_hint_statsusr) {
2358 		nsd->signal_hint_statsusr = 0;
2359 		return NSD_STATS;
2360 	}
2361 	return NSD_RUN;
2362 }
2363 
2364 /*
2365  * The main server simply waits for signals and child processes to
2366  * terminate.  Child processes are restarted as necessary.
2367  */
2368 void
2369 server_main(struct nsd *nsd)
2370 {
2371 	region_type *server_region = region_create(xalloc, free);
2372 	netio_type *netio = netio_create(server_region);
2373 	netio_handler_type reload_listener;
2374 	int reload_sockets[2] = {-1, -1};
2375 	struct timespec timeout_spec;
2376 	int status;
2377 	pid_t child_pid;
2378 	pid_t reload_pid = -1;
2379 	sig_atomic_t mode;
2380 
2381 	/* Ensure we are the main process */
2382 	assert(nsd->server_kind == NSD_SERVER_MAIN);
2383 
2384 	/* Add listener for the XFRD process */
2385 	netio_add_handler(netio, nsd->xfrd_listener);
2386 
2387 	/* Start the child processes that handle incoming queries */
2388 	if (server_start_children(nsd, server_region, netio,
2389 		&nsd->xfrd_listener->fd) != 0) {
2390 		send_children_quit(nsd);
2391 		exit(1);
2392 	}
2393 	reload_listener.fd = -1;
2394 
2395 	/* This_child MUST be 0, because this is the parent process */
2396 	assert(nsd->this_child == 0);
2397 
2398 	/* Run the server until we get a shutdown signal */
2399 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
2400 		/* Did we receive a signal that changes our mode? */
2401 		if(mode == NSD_RUN) {
2402 			nsd->mode = mode = server_signal_mode(nsd);
2403 		}
2404 
2405 		switch (mode) {
2406 		case NSD_RUN:
2407 			/* see if any child processes terminated */
2408 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
2409 				int is_child = delete_child_pid(nsd, child_pid);
2410 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
2411 					if(nsd->children[is_child].child_fd == -1)
2412 						nsd->children[is_child].has_exited = 1;
2413 					parent_check_all_children_exited(nsd);
2414 				} else if(is_child != -1) {
2415 					log_msg(LOG_WARNING,
2416 					       "server %d died unexpectedly with status %d, restarting",
2417 					       (int) child_pid, status);
2418 					restart_child_servers(nsd, server_region, netio,
2419 						&nsd->xfrd_listener->fd);
2420 				} else if (child_pid == reload_pid) {
2421 					sig_atomic_t cmd = NSD_RELOAD_DONE;
2422 					pid_t mypid;
2423 					log_msg(LOG_WARNING,
2424 					       "Reload process %d failed with status %d, continuing with old database",
2425 					       (int) child_pid, status);
2426 					reload_pid = -1;
2427 					if(reload_listener.fd != -1) close(reload_listener.fd);
2428 					reload_listener.fd = -1;
2429 					reload_listener.event_types = NETIO_EVENT_NONE;
2430 					task_process_sync(nsd->task[nsd->mytask]);
2431 					/* inform xfrd reload attempt ended */
2432 					if(!write_socket(nsd->xfrd_listener->fd,
2433 						&cmd, sizeof(cmd))) {
2434 						log_msg(LOG_ERR, "problems "
2435 						  "sending SOAEND to xfrd: %s",
2436 						  strerror(errno));
2437 					}
2438 					mypid = getpid();
2439 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2440 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2441 							strerror(errno));
2442 					}
2443 				} else if(status != 0) {
2444 					/* check status: waitpid also reaps
2445 					 * the old server_main (reload is its
2446 					 * process parent) and older server
2447 					 * processes exiting after a reload,
2448 					 * so only log nonzero exits */
2449 					log_msg(LOG_WARNING,
2450 					       "process %d terminated with status %d",
2451 					       (int) child_pid, status);
2452 				}
2453 			}
2454 			if (child_pid == -1) {
2455 				if (errno == EINTR) {
2456 					continue;
2457 				}
2458 				if (errno != ECHILD)
2459 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
2460 			}
2461 			if (nsd->mode != NSD_RUN)
2462 				break;
2463 
2464 			/* timeout to collect processes, in case no SIGCHLD arrives */
2465 			timeout_spec.tv_sec = 60;
2466 			timeout_spec.tv_nsec = 0;
2467 
2468 			/* listen on ports, timeout for collecting terminated children */
2469 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
2470 				if (errno != EINTR) {
2471 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
2472 				}
2473 			}
2474 			if(nsd->restart_children) {
2475 				restart_child_servers(nsd, server_region, netio,
2476 					&nsd->xfrd_listener->fd);
2477 				nsd->restart_children = 0;
2478 			}
2479 			if(nsd->reload_failed) {
2480 				sig_atomic_t cmd = NSD_RELOAD_DONE;
2481 				pid_t mypid;
2482 				nsd->reload_failed = 0;
2483 				log_msg(LOG_WARNING,
2484 				       "Reload process %d failed, continuing with old database",
2485 				       (int) reload_pid);
2486 				reload_pid = -1;
2487 				if(reload_listener.fd != -1) close(reload_listener.fd);
2488 				reload_listener.fd = -1;
2489 				reload_listener.event_types = NETIO_EVENT_NONE;
2490 				task_process_sync(nsd->task[nsd->mytask]);
2491 				/* inform xfrd reload attempt ended */
2492 				if(!write_socket(nsd->xfrd_listener->fd,
2493 					&cmd, sizeof(cmd))) {
2494 					log_msg(LOG_ERR, "problems "
2495 					  "sending SOAEND to xfrd: %s",
2496 					  strerror(errno));
2497 				}
2498 				mypid = getpid();
2499 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2500 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2501 						strerror(errno));
2502 				}
2503 			}
2504 
2505 			break;
2506 		case NSD_RELOAD_REQ: {
2507 			sig_atomic_t cmd = NSD_RELOAD_REQ;
2508 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
2509 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
2510 				"main: ipc send reload_req to xfrd"));
2511 			if(!write_socket(nsd->xfrd_listener->fd,
2512 				&cmd, sizeof(cmd))) {
2513 				log_msg(LOG_ERR, "server_main: could not send "
2514 				"reload_req to xfrd: %s", strerror(errno));
2515 			}
2516 			nsd->mode = NSD_RUN;
2517 			} break;
2518 		case NSD_RELOAD:
2519 			/* Continue to run nsd after reload */
2520 			nsd->mode = NSD_RUN;
2521 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
2522 			if (reload_pid != -1) {
2523 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
2524 				       (int) reload_pid);
2525 				break;
2526 			}
2527 
2528 			/* switch mytask to track which side owns the task udb */
2529 			nsd->mytask = 1 - nsd->mytask;
2530 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
2531 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
2532 				reload_pid = -1;
2533 				break;
2534 			}
2535 
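			/* Note the inverted roles below: the parent of the
			 * fork performs the reload and becomes the new main,
			 * while the child keeps serving as the old main until
			 * the reload signals it to quit. */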
2536 			/* Do actual reload */
2537 			reload_pid = fork();
2538 			switch (reload_pid) {
2539 			case -1:
2540 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
2541 				break;
2542 			default:
2543 				/* PARENT */
2544 				close(reload_sockets[0]);
2545 				server_reload(nsd, server_region, netio,
2546 					reload_sockets[1]);
2547 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
2548 				close(reload_sockets[1]);
2549 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
2550 				/* drop stale xfrd ipc data */
2551 				((struct ipc_handler_conn_data*)nsd->
2552 					xfrd_listener->user_data)
2553 					->conn->is_reading = 0;
2554 				reload_pid = -1;
2555 				reload_listener.fd = -1;
2556 				reload_listener.event_types = NETIO_EVENT_NONE;
2557 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
2558 				break;
2559 			case 0:
2560 				/* CHILD */
2561 				/* server_main keeps running until NSD_QUIT_SYNC
2562 				 * is received from the reload process. */
2563 				close(reload_sockets[1]);
2564 				reload_listener.fd = reload_sockets[0];
2565 				reload_listener.timeout = NULL;
2566 				reload_listener.user_data = nsd;
2567 				reload_listener.event_types = NETIO_EVENT_READ;
2568 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
2569 				netio_add_handler(netio, &reload_listener);
2570 				reload_pid = getppid();
2571 				break;
2572 			}
2573 			break;
2574 		case NSD_QUIT_SYNC:
2575 			/* synchronisation of xfrd, parent and reload */
2576 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
2577 				sig_atomic_t cmd = NSD_RELOAD;
2578 				/* stop xfrd ipc writes in progress */
2579 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
2580 					"main: ipc send indication reload"));
2581 				if(!write_socket(nsd->xfrd_listener->fd,
2582 					&cmd, sizeof(cmd))) {
2583 					log_msg(LOG_ERR, "server_main: could not send reload "
2584 					"indication to xfrd: %s", strerror(errno));
2585 				}
2586 				/* wait for ACK from xfrd */
2587 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
2588 				nsd->quit_sync_done = 1;
2589 			}
2590 			nsd->mode = NSD_RUN;
2591 			break;
2592 		case NSD_QUIT:
2593 			/* silent shutdown during reload */
2594 			if(reload_listener.fd != -1) {
2595 				/* acknowledge the quit, to sync reload that we will really quit now */
2596 				sig_atomic_t cmd = NSD_RELOAD;
2597 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
2598 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2599 					log_msg(LOG_ERR, "server_main: "
2600 						"could not ack quit: %s", strerror(errno));
2601 				}
2602 #ifdef BIND8_STATS
2603 				parent_send_stats(nsd, reload_listener.fd);
2604 #endif /* BIND8_STATS */
2605 				close(reload_listener.fd);
2606 			}
2607 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
2608 			/* only quit children after xfrd has acked */
2609 			send_children_quit(nsd);
2610 
2611 #ifdef MEMCLEAN /* OS collects memory pages */
2612 			region_destroy(server_region);
2613 #endif
2614 			server_shutdown(nsd);
2615 
2616 			/* NOTREACHED */
2617 			break;
2618 		case NSD_SHUTDOWN:
2619 			break;
2620 		case NSD_REAP_CHILDREN:
2621 			/* continue; wait for child in run loop */
2622 			nsd->mode = NSD_RUN;
2623 			break;
2624 		case NSD_STATS:
2625 #ifdef BIND8_STATS
2626 			set_children_stats(nsd);
2627 #endif
2628 			nsd->mode = NSD_RUN;
2629 			break;
2630 		default:
2631 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
2632 			nsd->mode = NSD_RUN;
2633 			break;
2634 		}
2635 	}
2636 	log_msg(LOG_WARNING, "signal received, shutting down...");
2637 
2638 	/* close opened ports to avoid race with restart of nsd */
2639 	server_close_all_sockets(nsd->udp, nsd->ifs);
2640 	server_close_all_sockets(nsd->tcp, nsd->ifs);
2641 #ifdef HAVE_SSL
2642 	daemon_remote_close(nsd->rc);
2643 #endif
2644 	send_children_quit_and_wait(nsd);
2645 
2646 	/* Unlink it if possible... */
2647 	unlinkpid(nsd->pidfile);
2648 	unlink(nsd->task[0]->fname);
2649 	unlink(nsd->task[1]->fname);
2650 #ifdef USE_ZONE_STATS
2651 	unlink(nsd->zonestatfname[0]);
2652 	unlink(nsd->zonestatfname[1]);
2653 #endif
2654 #ifdef USE_DNSTAP
2655 	dt_collector_close(nsd->dt_collector, nsd);
2656 #endif
2657 
2658 	if(reload_listener.fd != -1) {
2659 		sig_atomic_t cmd = NSD_QUIT;
2660 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2661 			"main: ipc send quit to reload-process"));
2662 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2663 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
2664 				strerror(errno));
2665 		}
2666 		fsync(reload_listener.fd);
2667 		close(reload_listener.fd);
2668 		/* wait for reload to finish processing */
2669 		while(1) {
2670 			if(waitpid(reload_pid, NULL, 0) == -1) {
2671 				if(errno == EINTR) continue;
2672 				if(errno == ECHILD) break;
2673 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
2674 					(int)reload_pid, strerror(errno));
2675 			}
2676 			break;
2677 		}
2678 	}
2679 	if(nsd->xfrd_listener->fd != -1) {
2680 		/* complete quit, stop xfrd */
2681 		sig_atomic_t cmd = NSD_QUIT;
2682 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2683 			"main: ipc send quit to xfrd"));
2684 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
2685 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
2686 				strerror(errno));
2687 		}
2688 		fsync(nsd->xfrd_listener->fd);
2689 		close(nsd->xfrd_listener->fd);
2690 		(void)kill(nsd->pid, SIGTERM);
2691 	}
2692 
2693 #ifdef MEMCLEAN /* OS collects memory pages */
2694 	region_destroy(server_region);
2695 #endif
2696 	/* write the nsd.db to disk, wait for it to complete */
2697 	udb_base_sync(nsd->db->udb, 1);
2698 	udb_base_close(nsd->db->udb);
2699 	server_shutdown(nsd);
2700 }
2701 
2702 static query_state_type
2703 server_process_query(struct nsd *nsd, struct query *query)
2704 {
2705 	return query_process(query, nsd);
2706 }
2707 
2708 static query_state_type
2709 server_process_query_udp(struct nsd *nsd, struct query *query)
2710 {
2711 #ifdef RATELIMIT
2712 	if(query_process(query, nsd) != QUERY_DISCARDED) {
2713 		if(rrl_process_query(query))
2714 			return rrl_slip(query);
2715 		else	return QUERY_PROCESSED;
2716 	}
2717 	return QUERY_DISCARDED;
2718 #else
2719 	return query_process(query, nsd);
2720 #endif
2721 }
2722 
2723 const char*
2724 nsd_event_vs(void)
2725 {
2726 #ifdef USE_MINI_EVENT
2727 	return "";
2728 #else
2729 	return event_get_version();
2730 #endif
2731 }
2732 
2733 #if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS)
2734 static const char* ub_ev_backend2str(int b)
2735 {
2736 	switch(b) {
2737 	case EVBACKEND_SELECT:	return "select";
2738 	case EVBACKEND_POLL:	return "poll";
2739 	case EVBACKEND_EPOLL:	return "epoll";
2740 	case EVBACKEND_KQUEUE:	return "kqueue";
2741 	case EVBACKEND_DEVPOLL: return "devpoll";
2742 	case EVBACKEND_PORT:	return "evport";
2743 	}
2744 	return "unknown";
2745 }
2746 #endif
2747 
2748 const char*
2749 nsd_event_method(void)
2750 {
2751 #ifdef USE_MINI_EVENT
2752 	return "select";
2753 #else
2754 	struct event_base* b = nsd_child_event_base();
2755 	const char* m = "?";
2756 #  ifdef EV_FEATURE_BACKENDS
2757 	m = ub_ev_backend2str(ev_backend((struct ev_loop*)b));
2758 #  elif defined(HAVE_EVENT_BASE_GET_METHOD)
2759 	m = event_base_get_method(b);
2760 #  endif
2761 #  ifdef MEMCLEAN
2762 	event_base_free(b);
2763 #  endif
2764 	return m;
2765 #endif
2766 }
2767 
2768 struct event_base*
2769 nsd_child_event_base(void)
2770 {
2771 	struct event_base* base;
2772 #ifdef USE_MINI_EVENT
2773 	static time_t secs;
2774 	static struct timeval now;
2775 	base = event_init(&secs, &now);
2776 #else
2777 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
2778 	/* libev */
2779 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
2780 #  else
2781 	/* libevent */
2782 #    ifdef HAVE_EVENT_BASE_NEW
2783 	base = event_base_new();
2784 #    else
2785 	base = event_init();
2786 #    endif
2787 #  endif
2788 #endif
2789 	return base;
2790 }
2791 
2792 static void
2793 add_udp_handler(
2794 	struct nsd *nsd,
2795 	struct nsd_socket *sock,
2796 	struct udp_handler_data *data)
2797 {
2798 	struct event *handler = &data->event;
2799 
2800 	data->nsd = nsd;
2801 	data->socket = sock;
2802 
2803 	memset(handler, 0, sizeof(*handler));
2804 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data);
2805 	if(event_base_set(nsd->event_base, handler) != 0)
2806 		log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2807 	if(event_add(handler, NULL) != 0)
2808 		log_msg(LOG_ERR, "nsd udp: event_add failed");
2809 }
2810 
2811 void
2812 add_tcp_handler(
2813 	struct nsd *nsd,
2814 	struct nsd_socket *sock,
2815 	struct tcp_accept_handler_data *data)
2816 {
2817 	struct event *handler = &data->event;
2818 
2819 	data->nsd = nsd;
2820 	data->socket = sock;
2821 
2822 #ifdef HAVE_SSL
2823 	if (nsd->tls_ctx &&
2824 	    nsd->options->tls_port &&
2825 	    using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port))
2826 	{
2827 		data->tls_accept = 1;
2828 		if(verbosity >= 2) {
2829 			char buf[48];
2830 			addrport2str((struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
2831 			VERBOSITY(2, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf));
2832 		}
2833 	} else {
2834 		data->tls_accept = 0;
2835 	}
2836 #endif
2837 
2838 	memset(handler, 0, sizeof(*handler));
2839 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_tcp_accept, data);
2840 	if(event_base_set(nsd->event_base, handler) != 0)
2841 		log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
2842 	if(event_add(handler, NULL) != 0)
2843 		log_msg(LOG_ERR, "nsd tcp: event_add failed");
2844 	data->event_added = 1;
2845 }
2846 
2847 /*
2848  * Serve DNS requests.
2849  */
2850 void
2851 server_child(struct nsd *nsd)
2852 {
2853 	size_t i, from, numifs;
2854 	region_type *server_region = region_create(xalloc, free);
2855 	struct event_base* event_base = nsd_child_event_base();
2856 	sig_atomic_t mode;
2857 
2858 	if(!event_base) {
2859 		log_msg(LOG_ERR, "nsd server could not create event base");
2860 		exit(1);
2861 	}
2862 	nsd->event_base = event_base;
2863 	nsd->server_region = server_region;
2864 
2865 #ifdef RATELIMIT
2866 	rrl_init(nsd->this_child->child_num);
2867 #endif
2868 
2869 	assert(nsd->server_kind != NSD_SERVER_MAIN);
2870 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
2871 
2872 #ifdef HAVE_SETPROCTITLE
2873 	setproctitle("server %d", nsd->this_child->child_num + 1);
2874 #endif
2875 #ifdef HAVE_CPUSET_T
2876 	if(nsd->use_cpu_affinity) {
2877 		set_cpu_affinity(nsd->this_child->cpuset);
2878 	}
2879 #endif
2880 
2881 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
2882 		server_close_all_sockets(nsd->tcp, nsd->ifs);
2883 	}
2884 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
2885 		server_close_all_sockets(nsd->udp, nsd->ifs);
2886 	}
2887 
2888 	if (nsd->this_child->parent_fd != -1) {
2889 		struct event *handler;
2890 		struct ipc_handler_conn_data* user_data =
2891 			(struct ipc_handler_conn_data*)region_alloc(
2892 			server_region, sizeof(struct ipc_handler_conn_data));
2893 		user_data->nsd = nsd;
2894 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
2895 
2896 		handler = (struct event*) region_alloc(
2897 			server_region, sizeof(*handler));
2898 		memset(handler, 0, sizeof(*handler));
2899 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
2900 			EV_READ, child_handle_parent_command, user_data);
2901 		if(event_base_set(event_base, handler) != 0)
2902 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
2903 		if(event_add(handler, NULL) != 0)
2904 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
2905 	}
2906 
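	/* with reuseport each child serves its own contiguous slice of the
	 * sockets; e.g. 8 sockets and reuseport=2 give numifs=4, so child 0
	 * handles sockets 0-3 and child 1 handles 4-7 (a sketch of the
	 * arithmetic below) */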
2907 	if(nsd->reuseport) {
2908 		numifs = nsd->ifs / nsd->reuseport;
2909 		from = numifs * nsd->this_child->child_num;
2910 		if(from+numifs > nsd->ifs) { /* should not happen */
2911 			from = 0;
2912 			numifs = nsd->ifs;
2913 		}
2914 	} else {
2915 		from = 0;
2916 		numifs = nsd->ifs;
2917 	}
2918 
2919 	if (nsd->server_kind & NSD_SERVER_UDP) {
2920 		int child = nsd->this_child->child_num;
2921 		memset(msgs, 0, sizeof(msgs));
2922 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
2923 			queries[i] = query_create(server_region,
2924 				compressed_dname_offsets,
2925 				compression_table_size, compressed_dnames);
2926 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2927 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
2928 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
2929 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
2930 			msgs[i].msg_hdr.msg_iovlen  = 1;
2931 			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
2932 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2933 		}
2934 
2935 		for (i = 0; i < nsd->ifs; i++) {
2936 			int listen;
2937 			struct udp_handler_data *data;
2938 
2939 			listen = nsd_bitset_isset(nsd->udp[i].servers, child);
2940 
2941 			if(i >= from && i < (from + numifs) && listen) {
2942 				data = region_alloc_zero(
2943 					nsd->server_region, sizeof(*data));
2944 				add_udp_handler(nsd, &nsd->udp[i], data);
2945 			} else {
2946 				/* close sockets intended for other servers */
2947 				server_close_socket(&nsd->udp[i]);
2948 			}
2949 		}
2950 	}
2951 
2952 	/*
2953 	 * Keep track of all the TCP accept handlers so we can enable
2954 	 * and disable them based on the current number of active TCP
2955 	 * connections.
2956 	 */
2957 	if (nsd->server_kind & NSD_SERVER_TCP) {
2958 		int child = nsd->this_child->child_num;
2959 		tcp_accept_handler_count = numifs;
2960 		tcp_accept_handlers = region_alloc_array(server_region,
2961 			numifs, sizeof(*tcp_accept_handlers));
2962 
2963 		for (i = 0; i < nsd->ifs; i++) {
2964 			int listen;
2965 			struct tcp_accept_handler_data *data;
2966 
2967 			listen = nsd_bitset_isset(nsd->tcp[i].servers, child);
2968 
2969 			if(i >= from && i < (from + numifs) && listen) {
2970 				data = &tcp_accept_handlers[i-from];
2971 				memset(data, 0, sizeof(*data));
2972 				add_tcp_handler(nsd, &nsd->tcp[i], data);
2973 			} else {
2974 				/* close sockets intended for other servers */
2975 				/*
2976 				 * uncomment this once tcp servers are no
2977 				 * longer copied in the tcp fd copy line
2978 				 * in server_init().
2979 				server_close_socket(&nsd->tcp[i]);
2980 				*/
2981 				/* close sockets not meant for this server */
2982 				if(!listen)
2983 					server_close_socket(&nsd->tcp[i]);
2984 			}
2985 		}
2986 	} else {
2987 		tcp_accept_handler_count = 0;
2988 	}
2989 
2990 	/* The main loop... */
2991 	while ((mode = nsd->mode) != NSD_QUIT) {
2992 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
2993 
2994 		/* Do we need to do the statistics... */
2995 		if (mode == NSD_STATS) {
2996 #ifdef BIND8_STATS
2997 			int p = nsd->st.period;
2998 			nsd->st.period = 1; /* force stats printout */
2999 			/* Dump the statistics */
3000 			bind8_stats(nsd);
3001 			nsd->st.period = p;
3002 #else /* !BIND8_STATS */
3003 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
3004 #endif /* BIND8_STATS */
3005 
3006 			nsd->mode = NSD_RUN;
3007 		}
3008 		else if (mode == NSD_REAP_CHILDREN) {
3009 			/* got signal, notify parent. parent reaps terminated children. */
3010 			if (nsd->this_child->parent_fd != -1) {
3011 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
3012 				if (write(nsd->this_child->parent_fd,
3013 				    &parent_notify,
3014 				    sizeof(parent_notify)) == -1)
3015 				{
3016 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
3017 						(int) nsd->this_child->pid, strerror(errno));
3018 				}
3019 			} else /* no parent, so reap 'em */
3020 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
3021 			nsd->mode = NSD_RUN;
3022 		}
3023 		else if(mode == NSD_RUN) {
3024 			/* Wait for a query... */
3025 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3026 				if (errno != EINTR) {
3027 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3028 					break;
3029 				}
3030 			}
3031 		} else if(mode == NSD_QUIT) {
3032 			/* ignore here, quit */
3033 		} else {
3034 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
3035 				(int)mode);
3036 			nsd->mode = NSD_RUN;
3037 		}
3038 	}
3039 
3040 	service_remaining_tcp(nsd);
3041 #ifdef	BIND8_STATS
3042 	bind8_stats(nsd);
3043 #endif /* BIND8_STATS */
3044 
3045 #ifdef MEMCLEAN /* OS collects memory pages */
3046 #ifdef RATELIMIT
3047 	rrl_deinit(nsd->this_child->child_num);
3048 #endif
3049 	event_base_free(event_base);
3050 	region_destroy(server_region);
3051 #endif
3052 	server_shutdown(nsd);
3053 }
3054 
3055 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
3056 {
3057 	int* timed_out = (int*)arg;
3058 	assert(event & EV_TIMEOUT); (void)event;
3059 	/* wake up the remaining-tcp service loop; note the event is
3060 	 * no longer registered after this fires */
3061 	*timed_out = 1;
3062 }
3063 
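/* Drain the TCP connections that are still open at shutdown/reload time:
 * each one is re-registered on a private event base with its timeout
 * clamped to 100ms, and the loop runs until all are done, a quit-type
 * signal arrives, or a full second passes without any event (see the
 * ticker below). */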
3064 void
3065 service_remaining_tcp(struct nsd* nsd)
3066 {
3067 	struct tcp_handler_data* p;
3068 	struct event_base* event_base;
3069 	/* check if it is needed */
3070 	if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
3071 		return;
3072 	VERBOSITY(4, (LOG_INFO, "service remaining TCP connections"));
3073 
3074 	/* setup event base */
3075 	event_base = nsd_child_event_base();
3076 	if(!event_base) {
3077 		log_msg(LOG_ERR, "nsd remain tcp could not create event base");
3078 		return;
3079 	}
3080 	/* register tcp connections */
3081 	for(p = tcp_active_list; p != NULL; p = p->next) {
3082 		struct timeval timeout;
3083 		int fd = p->event.ev_fd;
3084 #ifdef USE_MINI_EVENT
3085 		short event = p->event.ev_flags & (EV_READ|EV_WRITE);
3086 #else
3087 		short event = p->event.ev_events & (EV_READ|EV_WRITE);
3088 #endif
3089 		void (*fn)(int, short, void*);
3090 #ifdef HAVE_SSL
3091 		if(p->tls) {
3092 			if((event&EV_READ))
3093 				fn = handle_tls_reading;
3094 			else	fn = handle_tls_writing;
3095 		} else {
3096 #endif
3097 			if((event&EV_READ))
3098 				fn = handle_tcp_reading;
3099 			else	fn = handle_tcp_writing;
3100 #ifdef HAVE_SSL
3101 		}
3102 #endif
3103 
3104 		p->tcp_no_more_queries = 1;
3105 		/* set timeout to 1/10 second */
3106 		if(p->tcp_timeout > 100)
3107 			p->tcp_timeout = 100;
3108 		timeout.tv_sec = p->tcp_timeout / 1000;
3109 		timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
3110 		event_del(&p->event);
3111 		memset(&p->event, 0, sizeof(p->event));
3112 		event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
3113 			fn, p);
3114 		if(event_base_set(event_base, &p->event) != 0)
3115 			log_msg(LOG_ERR, "event base set failed");
3116 		if(event_add(&p->event, &timeout) != 0)
3117 			log_msg(LOG_ERR, "event add failed");
3118 	}
3119 
3120 	/* handle it */
3121 	while(nsd->current_tcp_count > 0) {
3122 		mode_t m = server_signal_mode(nsd);
3123 		struct event timeout;
3124 		struct timeval tv;
3125 		int timed_out = 0;
3126 		if(m == NSD_QUIT || m == NSD_SHUTDOWN ||
3127 			m == NSD_REAP_CHILDREN) {
3128 			/* quit */
3129 			break;
3130 		}
3131 		/* arm a one-second idle timer; if no TCP event fires
3132 		 * within the second the drain loop gives up below */
3133 		tv.tv_sec = 1;
3134 		tv.tv_usec = 0;
3135 		memset(&timeout, 0, sizeof(timeout));
3136 		event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout,
3137 			&timed_out);
3138 		if(event_base_set(event_base, &timeout) != 0)
3139 			log_msg(LOG_ERR, "remaintcp timer: event_base_set failed");
3140 		if(event_add(&timeout, &tv) != 0)
3141 			log_msg(LOG_ERR, "remaintcp timer: event_add failed");
3142 
3143 		/* service loop */
3144 		if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3145 			if (errno != EINTR) {
3146 				log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3147 				break;
3148 			}
3149 		}
3150 		if(!timed_out) {
3151 			event_del(&timeout);
3152 		} else {
3153 			/* timed out, quit */
3154 			VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit"));
3155 			break;
3156 		}
3157 	}
3158 #ifdef MEMCLEAN
3159 	event_base_free(event_base);
3160 #endif
3161 	/* continue to quit after return */
3162 }
3163 
3164 /* Implement recvmmsg and sendmmsg if the platform does not. These functions
3165  * are always used, even if nonblocking operations are broken, in which case
3166  * NUM_RECV_PER_SELECT is defined to 1 (one).
3167  */
3168 #if defined(HAVE_RECVMMSG)
3169 #define nsd_recvmmsg recvmmsg
3170 #else /* !HAVE_RECVMMSG */
3171 
3172 static int
3173 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
3174              int flags, struct timespec *timeout)
3175 {
3176 	unsigned int vpos = 0;
3177 	ssize_t rcvd;
3178 
3179 	/* timeout is ignored, ensure caller does not expect it to work */
3180 	assert(timeout == NULL); (void)timeout;
3181 
3182 	while(vpos < vlen) {
3183 		rcvd = recvfrom(sockfd,
3184 		                msgvec[vpos].msg_hdr.msg_iov->iov_base,
3185 		                msgvec[vpos].msg_hdr.msg_iov->iov_len,
3186 		                flags,
3187 		                msgvec[vpos].msg_hdr.msg_name,
3188 		               &msgvec[vpos].msg_hdr.msg_namelen);
3189 		if(rcvd < 0) {
3190 			break;
3191 		} else {
3192 			assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
3193 			msgvec[vpos].msg_len = (unsigned int)rcvd;
3194 			vpos++;
3195 		}
3196 	}
3197 
3198 	if(vpos) {
3199 		/* error will be picked up next time */
3200 		return (int)vpos;
3201 	} else if(errno == 0) {
3202 		return 0;
3203 	} else if(errno == EAGAIN) {
3204 		return 0;
3205 	}
3206 
3207 	return -1;
3208 }
3209 #endif /* HAVE_RECVMMSG */
3210 
3211 #ifdef HAVE_SENDMMSG
3212 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
3213 #else /* !HAVE_SENDMMSG */
3214 
3215 static int
3216 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
3217 {
3218 	unsigned int vpos = 0;
3219 	ssize_t snd;
3220 
3221 	while(vpos < vlen) {
3222 		assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
3223 		snd = sendto(sockfd,
3224 		             msgvec[vpos].msg_hdr.msg_iov->iov_base,
3225 		             msgvec[vpos].msg_hdr.msg_iov->iov_len,
3226 		             flags,
3227 		             msgvec[vpos].msg_hdr.msg_name,
3228 		             msgvec[vpos].msg_hdr.msg_namelen);
3229 		if(snd < 0) {
3230 			break;
3231 		} else {
3232 			msgvec[vpos].msg_len = (unsigned int)snd;
3233 			vpos++;
3234 		}
3235 	}
3236 
3237 	if(vpos) {
3238 		return (int)vpos;
3239 	} else if(errno == 0) {
3240 		return 0;
3241 	}
3242 
3243 	return -1;
3244 }
3245 #endif /* HAVE_SENDMMSG */
3246 
3247 static void
3248 handle_udp(int fd, short event, void* arg)
3249 {
3250 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
3251 	int received, sent, recvcount, i;
3252 	struct query *q;
3253 
3254 	if (!(event & EV_READ)) {
3255 		return;
3256 	}
3257 	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
3258 	/* this printf strangely gave a performance increase on Linux */
3259 	/* printf("recvcount %d \n", recvcount); */
3260 	if (recvcount == -1) {
3261 		if (errno != EAGAIN && errno != EINTR) {
3262 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
3263 			STATUP(data->nsd, rxerr);
3264 			/* No zone statup */
3265 		}
3266 		/* Simply no data available */
3267 		return;
3268 	}
3269 	for (i = 0; i < recvcount; i++) {
3270 	loopstart:
3271 		received = msgs[i].msg_len;
3272 		queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen;
3273 		q = queries[i];
3274 		if (received == -1) {
3275 			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
3276 #if defined(HAVE_RECVMMSG)
3277 				msgs[i].msg_hdr.msg_flags
3278 #else
3279 				errno
3280 #endif
3281 				));
3282 			STATUP(data->nsd, rxerr);
3283 			/* No zone statup */
3284 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3285 			iovecs[i].iov_len = buffer_remaining(q->packet);
3286 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3287 			goto swap_drop;
3288 		}
3289 
3290 		/* Account... */
3291 #ifdef BIND8_STATS
3292 		if (data->socket->addr.ai_family == AF_INET) {
3293 			STATUP(data->nsd, qudp);
3294 		} else if (data->socket->addr.ai_family == AF_INET6) {
3295 			STATUP(data->nsd, qudp6);
3296 		}
3297 #endif
3298 
3299 		buffer_skip(q->packet, received);
3300 		buffer_flip(q->packet);
3301 #ifdef USE_DNSTAP
3302 		dt_collector_submit_auth_query(data->nsd, &q->addr, q->addrlen,
3303 			q->tcp, q->packet);
3304 #endif /* USE_DNSTAP */
3305 
3306 		/* Process and answer the query... */
3307 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
3308 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
3309 				STATUP(data->nsd, nona);
3310 				ZTATUP(data->nsd, q->zone, nona);
3311 			}
3312 
3313 #ifdef USE_ZONE_STATS
3314 			if (data->socket->addr.ai_family == AF_INET) {
3315 				ZTATUP(data->nsd, q->zone, qudp);
3316 			} else if (data->socket->addr.ai_family == AF_INET6) {
3317 				ZTATUP(data->nsd, q->zone, qudp6);
3318 			}
3319 #endif
3320 
3321 			/* Add EDNS0 and TSIG info if necessary.  */
3322 			query_add_optional(q, data->nsd);
3323 
3324 			buffer_flip(q->packet);
3325 			iovecs[i].iov_len = buffer_remaining(q->packet);
3326 #ifdef BIND8_STATS
3327 			/* Account the rcode & TC... */
3328 			STATUP2(data->nsd, rcode, RCODE(q->packet));
3329 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
3330 			if (TC(q->packet)) {
3331 				STATUP(data->nsd, truncated);
3332 				ZTATUP(data->nsd, q->zone, truncated);
3333 			}
3334 #endif /* BIND8_STATS */
3335 #ifdef USE_DNSTAP
3336 			dt_collector_submit_auth_response(data->nsd,
3337 				&q->addr, q->addrlen, q->tcp, q->packet,
3338 				q->zone);
3339 #endif /* USE_DNSTAP */
3340 		} else {
3341 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3342 			iovecs[i].iov_len = buffer_remaining(q->packet);
3343 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3344 		swap_drop:
3345 			STATUP(data->nsd, dropped);
3346 			ZTATUP(data->nsd, q->zone, dropped);
3347 			if(i != recvcount-1) {
3348 				/* swap with last and decrease recvcount */
3349 				struct mmsghdr mtmp = msgs[i];
3350 				struct iovec iotmp = iovecs[i];
3351 				recvcount--;
3352 				msgs[i] = msgs[recvcount];
3353 				iovecs[i] = iovecs[recvcount];
3354 				queries[i] = queries[recvcount];
3355 				msgs[recvcount] = mtmp;
3356 				iovecs[recvcount] = iotmp;
3357 				queries[recvcount] = q;
3358 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
3359 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
3360 				goto loopstart;
3361 			} else { recvcount --; }
3362 		}
3363 	}
3364 
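	/* at this point msgs[0..recvcount-1] hold only answered queries;
	 * dropped ones were swapped to the tail above, so the whole prefix
	 * can go out in sendmmsg batches */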
3365 	/* send until all are sent */
3366 	i = 0;
3367 	while(i<recvcount) {
3368 		sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3369 		if(sent == -1) {
3370 			if(errno == ENOBUFS ||
3371 #ifdef EWOULDBLOCK
3372 				errno == EWOULDBLOCK ||
3373 #endif
3374 				errno == EAGAIN) {
3375 				/* block to wait until send buffer avail */
3376 				int flag, errstore;
3377 				if((flag = fcntl(fd, F_GETFL)) == -1) {
3378 					log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno));
3379 					flag = 0;
3380 				}
3381 				flag &= ~O_NONBLOCK;
3382 				if(fcntl(fd, F_SETFL, flag) == -1)
3383 					log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno));
3384 				sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3385 				errstore = errno;
3386 				flag |= O_NONBLOCK;
3387 				if(fcntl(fd, F_SETFL, flag) == -1)
3388 					log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno));
3389 				if(sent != -1) {
3390 					i += sent;
3391 					continue;
3392 				}
3393 				errno = errstore;
3394 			}
3395 			/* don't log transient network full errors, unless
3396 			 * on higher verbosity */
3397 			if(!(errno == ENOBUFS && verbosity < 1) &&
3398 #ifdef EWOULDBLOCK
3399 			   errno != EWOULDBLOCK &&
3400 #endif
3401 			   errno != EAGAIN) {
3402 				const char* es = strerror(errno);
3403 				char a[64];
3404 				addrport2str(&queries[i]->addr, a, sizeof(a));
3405 				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3406 			}
3407 #ifdef BIND8_STATS
3408 			data->nsd->st.txerr += recvcount-i;
3409 #endif /* BIND8_STATS */
3410 			break;
3411 		}
3412 		i += sent;
3413 	}
3414 	for(i=0; i<recvcount; i++) {
3415 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3416 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3417 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3418 	}
3419 }
3420 
3421 #ifdef HAVE_SSL
3422 /*
3423  * Setup an event for the tcp handler.
3424  */
3425 static void
3426 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *),
3427        int fd, short event)
3428 {
3429 	struct timeval timeout;
3430 	struct event_base* ev_base;
3431 
3432 	timeout.tv_sec = data->nsd->tcp_timeout;
3433 	timeout.tv_usec = 0L;
3434 
3435 	ev_base = data->event.ev_base;
3436 	event_del(&data->event);
3437 	memset(&data->event, 0, sizeof(data->event));
3438 	event_set(&data->event, fd, event, fn, data);
3439 	if(event_base_set(ev_base, &data->event) != 0)
3440 		log_msg(LOG_ERR, "event base set failed");
3441 	if(event_add(&data->event, &timeout) != 0)
3442 		log_msg(LOG_ERR, "event add failed");
3443 }
3444 #endif /* HAVE_SSL */
3445 
3446 static void
3447 cleanup_tcp_handler(struct tcp_handler_data* data)
3448 {
3449 	event_del(&data->event);
3450 #ifdef HAVE_SSL
3451 	if(data->tls) {
3452 		SSL_shutdown(data->tls);
3453 		SSL_free(data->tls);
3454 		data->tls = NULL;
3455 	}
3456 #endif
3457 	close(data->event.ev_fd);
3458 	if(data->prev)
3459 		data->prev->next = data->next;
3460 	else	tcp_active_list = data->next;
3461 	if(data->next)
3462 		data->next->prev = data->prev;
3463 
3464 	/*
3465 	 * Enable the TCP accept handlers when the current number of
3466 	 * TCP connections is about to drop below the maximum number
3467 	 * of TCP connections.
3468 	 */
3469 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
3470 		configure_handler_event_types(EV_READ|EV_PERSIST);
3471 		if(slowaccept) {
3472 			event_del(&slowaccept_event);
3473 			slowaccept = 0;
3474 		}
3475 	}
3476 	--data->nsd->current_tcp_count;
3477 	assert(data->nsd->current_tcp_count >= 0);
3478 
3479 	region_destroy(data->region);
3480 }
3481 
3482 static void
3483 handle_tcp_reading(int fd, short event, void* arg)
3484 {
3485 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3486 	ssize_t received;
3487 	struct event_base* ev_base;
3488 	struct timeval timeout;
3489 
3490 	if ((event & EV_TIMEOUT)) {
3491 		/* Connection timed out.  */
3492 		cleanup_tcp_handler(data);
3493 		return;
3494 	}
3495 
3496 	if ((data->nsd->tcp_query_count > 0 &&
3497 		data->query_count >= data->nsd->tcp_query_count) ||
3498 		data->tcp_no_more_queries) {
3499 		/* No more queries allowed on this tcp connection. */
3500 		cleanup_tcp_handler(data);
3501 		return;
3502 	}
3503 
3504 	assert((event & EV_READ));
3505 
3506 	if (data->bytes_transmitted == 0) {
3507 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
3508 	}
3509 
3510 	/*
3511 	 * Check if we received the leading packet length bytes yet.
3512 	 */
3513 	if (data->bytes_transmitted < sizeof(uint16_t)) {
3514 		received = read(fd,
3515 				(char *) &data->query->tcplen
3516 				+ data->bytes_transmitted,
3517 				sizeof(uint16_t) - data->bytes_transmitted);
3518 		if (received == -1) {
3519 			if (errno == EAGAIN || errno == EINTR) {
3520 				/*
3521 				 * Read would block, wait until more
3522 				 * data is available.
3523 				 */
3524 				return;
3525 			} else {
3526 				char buf[48];
3527 				addr2str(&data->query->addr, buf, sizeof(buf));
3528 #ifdef ECONNRESET
3529 				if (verbosity >= 2 || errno != ECONNRESET)
3530 #endif /* ECONNRESET */
3531 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3532 				cleanup_tcp_handler(data);
3533 				return;
3534 			}
3535 		} else if (received == 0) {
3536 			/* EOF */
3537 			cleanup_tcp_handler(data);
3538 			return;
3539 		}
3540 
3541 		data->bytes_transmitted += received;
3542 		if (data->bytes_transmitted < sizeof(uint16_t)) {
3543 			/*
3544 			 * Not done with the tcplen yet, wait for more
3545 			 * data to become available.
3546 			 */
3547 			return;
3548 		}
3549 
3550 		assert(data->bytes_transmitted == sizeof(uint16_t));
3551 
3552 		data->query->tcplen = ntohs(data->query->tcplen);
3553 
3554 		/*
3555 		 * Minimum query size is 17 octets:
3556 		 *
3557 		 *     Size of the header (12)
3558 		 *   + Root domain name   (1)
3559 		 *   + Query class        (2)
3560 		 *   + Query type         (2)
3561 		 */
3562 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
3563 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
3564 			cleanup_tcp_handler(data);
3565 			return;
3566 		}
3567 
3568 		if (data->query->tcplen > data->query->maxlen) {
3569 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
3570 			cleanup_tcp_handler(data);
3571 			return;
3572 		}
3573 
3574 		buffer_set_limit(data->query->packet, data->query->tcplen);
3575 	}
3576 
3577 	assert(buffer_remaining(data->query->packet) > 0);
3578 
3579 	/* Read the (remaining) query data.  */
3580 	received = read(fd,
3581 			buffer_current(data->query->packet),
3582 			buffer_remaining(data->query->packet));
3583 	if (received == -1) {
3584 		if (errno == EAGAIN || errno == EINTR) {
3585 			/*
3586 			 * Read would block, wait until more data is
3587 			 * available.
3588 			 */
3589 			return;
3590 		} else {
3591 			char buf[48];
3592 			addr2str(&data->query->addr, buf, sizeof(buf));
3593 #ifdef ECONNRESET
3594 			if (verbosity >= 2 || errno != ECONNRESET)
3595 #endif /* ECONNRESET */
3596 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3597 			cleanup_tcp_handler(data);
3598 			return;
3599 		}
3600 	} else if (received == 0) {
3601 		/* EOF */
3602 		cleanup_tcp_handler(data);
3603 		return;
3604 	}
3605 
3606 	data->bytes_transmitted += received;
3607 	buffer_skip(data->query->packet, received);
3608 	if (buffer_remaining(data->query->packet) > 0) {
3609 		/*
3610 		 * Message not yet complete, wait for more data to
3611 		 * become available.
3612 		 */
3613 		return;
3614 	}
3615 
3616 	assert(buffer_position(data->query->packet) == data->query->tcplen);
3617 
3618 	/* Account... */
3619 #ifdef BIND8_STATS
3620 #ifndef INET6
3621 	STATUP(data->nsd, ctcp);
3622 #else
3623 	if (data->query->addr.ss_family == AF_INET) {
3624 		STATUP(data->nsd, ctcp);
3625 	} else if (data->query->addr.ss_family == AF_INET6) {
3626 		STATUP(data->nsd, ctcp6);
3627 	}
3628 #endif
3629 #endif /* BIND8_STATS */
3630 
3631 	/* We have a complete query, process it.  */
3632 
3633 	/* bump the per-connection query counter (tcp-query-count) */
3634 	data->query_count++;
3635 
3636 	buffer_flip(data->query->packet);
3637 #ifdef USE_DNSTAP
3638 	dt_collector_submit_auth_query(data->nsd, &data->query->addr,
3639 		data->query->addrlen, data->query->tcp, data->query->packet);
3640 #endif /* USE_DNSTAP */
3641 	data->query_state = server_process_query(data->nsd, data->query);
3642 	if (data->query_state == QUERY_DISCARDED) {
3643 		/* Drop the packet and the entire connection... */
3644 		STATUP(data->nsd, dropped);
3645 		ZTATUP(data->nsd, data->query->zone, dropped);
3646 		cleanup_tcp_handler(data);
3647 		return;
3648 	}
3649 
3650 #ifdef BIND8_STATS
3651 	if (RCODE(data->query->packet) == RCODE_OK
3652 	    && !AA(data->query->packet))
3653 	{
3654 		STATUP(data->nsd, nona);
3655 		ZTATUP(data->nsd, data->query->zone, nona);
3656 	}
3657 #endif /* BIND8_STATS */
3658 
3659 #ifdef USE_ZONE_STATS
3660 #ifndef INET6
3661 	ZTATUP(data->nsd, data->query->zone, ctcp);
3662 #else
3663 	if (data->query->addr.ss_family == AF_INET) {
3664 		ZTATUP(data->nsd, data->query->zone, ctcp);
3665 	} else if (data->query->addr.ss_family == AF_INET6) {
3666 		ZTATUP(data->nsd, data->query->zone, ctcp6);
3667 	}
3668 #endif
3669 #endif /* USE_ZONE_STATS */
3670 
3671 	query_add_optional(data->query, data->nsd);
3672 
3673 	/* Switch to the tcp write handler.  */
3674 	buffer_flip(data->query->packet);
3675 	data->query->tcplen = buffer_remaining(data->query->packet);
3676 #ifdef BIND8_STATS
3677 	/* Account the rcode & TC... */
3678 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
3679 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
3680 	if (TC(data->query->packet)) {
3681 		STATUP(data->nsd, truncated);
3682 		ZTATUP(data->nsd, data->query->zone, truncated);
3683 	}
3684 #endif /* BIND8_STATS */
3685 #ifdef USE_DNSTAP
3686 	dt_collector_submit_auth_response(data->nsd, &data->query->addr,
3687 		data->query->addrlen, data->query->tcp, data->query->packet,
3688 		data->query->zone);
3689 #endif /* USE_DNSTAP */
3690 	data->bytes_transmitted = 0;
3691 
3692 	timeout.tv_sec = data->tcp_timeout / 1000;
3693 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3694 
3695 	ev_base = data->event.ev_base;
3696 	event_del(&data->event);
3697 	memset(&data->event, 0, sizeof(data->event));
3698 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
3699 		handle_tcp_reading, data);
3700 	if(event_base_set(ev_base, &data->event) != 0)
3701 		log_msg(LOG_ERR, "event base set tcpr failed");
3702 	if(event_add(&data->event, &timeout) != 0)
3703 		log_msg(LOG_ERR, "event add tcpr failed");
3704 	/* see if we can write the answer right away (usually we can; EAGAIN if not) */
3705 	handle_tcp_writing(fd, EV_WRITE, data);
3706 }
3707 
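/*
 * Write the response to a plain TCP connection: the two-octet length
 * prefix followed by the message.  With writev() both are handed to the
 * kernel in a single call, avoiding a separate tiny segment for the
 * length field.  Once everything is written, AXFR processing continues
 * or the read handler is re-installed for the next query.
 */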
3708 static void
3709 handle_tcp_writing(int fd, short event, void* arg)
3710 {
3711 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3712 	ssize_t sent;
3713 	struct query *q = data->query;
3714 	struct timeval timeout;
3715 	struct event_base* ev_base;
3716 
3717 	if ((event & EV_TIMEOUT)) {
3718 		/* Connection timed out.  */
3719 		cleanup_tcp_handler(data);
3720 		return;
3721 	}
3722 
3723 	assert((event & EV_WRITE));
3724 
3725 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
3726 		/* Writing the response packet length.  */
3727 		uint16_t n_tcplen = htons(q->tcplen);
3728 #ifdef HAVE_WRITEV
3729 		struct iovec iov[2];
3730 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
3731 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
3732 		iov[1].iov_base = buffer_begin(q->packet);
3733 		iov[1].iov_len = buffer_limit(q->packet);
3734 		sent = writev(fd, iov, 2);
3735 #else /* HAVE_WRITEV */
3736 		sent = write(fd,
3737 			     (const char *) &n_tcplen + data->bytes_transmitted,
3738 			     sizeof(n_tcplen) - data->bytes_transmitted);
3739 #endif /* HAVE_WRITEV */
3740 		if (sent == -1) {
3741 			if (errno == EAGAIN || errno == EINTR) {
3742 				/*
3743 				 * Write would block, wait until
3744 				 * socket becomes writable again.
3745 				 */
3746 				return;
3747 			} else {
3748 #ifdef ECONNRESET
3749 				if(verbosity >= 2 || errno != ECONNRESET)
3750 #endif /* ECONNRESET */
3751 #ifdef EPIPE
3752 				  if(verbosity >= 2 || errno != EPIPE)
3753 #endif /* EPIPE 'broken pipe' */
3754 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
3755 				cleanup_tcp_handler(data);
3756 				return;
3757 			}
3758 		}
3759 
3760 		data->bytes_transmitted += sent;
3761 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
3762 			/*
3763 			 * Writing not complete, wait until socket
3764 			 * becomes writable again.
3765 			 */
3766 			return;
3767 		}
3768 
3769 #ifdef HAVE_WRITEV
3770 		sent -= sizeof(n_tcplen);
3771 		/* jump to the shared 'packet done' handling below */
3772 		goto packet_could_be_done;
3773 #endif
3774  	}
3775 
3776 	sent = write(fd,
3777 		     buffer_current(q->packet),
3778 		     buffer_remaining(q->packet));
3779 	if (sent == -1) {
3780 		if (errno == EAGAIN || errno == EINTR) {
3781 			/*
3782 			 * Write would block, wait until
3783 			 * socket becomes writable again.
3784 			 */
3785 			return;
3786 		} else {
3787 #ifdef ECONNRESET
3788 			if(verbosity >= 2 || errno != ECONNRESET)
3789 #endif /* ECONNRESET */
3790 #ifdef EPIPE
3791 			  if(verbosity >= 2 || errno != EPIPE)
3792 #endif /* EPIPE 'broken pipe' */
3793 			    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
3794 			cleanup_tcp_handler(data);
3795 			return;
3796 		}
3797 	}
3798 
3799 	data->bytes_transmitted += sent;
3800 #ifdef HAVE_WRITEV
3801   packet_could_be_done:
3802 #endif
3803 	buffer_skip(q->packet, sent);
3804 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
3805 		/*
3806 		 * Still more data to write when socket becomes
3807 		 * writable again.
3808 		 */
3809 		return;
3810 	}
3811 
3812 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
3813 
3814 	if (data->query_state == QUERY_IN_AXFR) {
3815 		/* Continue processing AXFR and writing back results.  */
3816 		buffer_clear(q->packet);
3817 		data->query_state = query_axfr(data->nsd, q);
3818 		if (data->query_state != QUERY_PROCESSED) {
3819 			query_add_optional(data->query, data->nsd);
3820 
3821 			/* Reset data. */
3822 			buffer_flip(q->packet);
3823 			q->tcplen = buffer_remaining(q->packet);
3824 			data->bytes_transmitted = 0;
3825 			/* Reset timeout.  */
3826 			timeout.tv_sec = data->tcp_timeout / 1000;
3827 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3828 			ev_base = data->event.ev_base;
3829 			event_del(&data->event);
3830 			memset(&data->event, 0, sizeof(data->event));
3831 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
3832 				handle_tcp_writing, data);
3833 			if(event_base_set(ev_base, &data->event) != 0)
3834 				log_msg(LOG_ERR, "event base set tcpw failed");
3835 			if(event_add(&data->event, &timeout) != 0)
3836 				log_msg(LOG_ERR, "event add tcpw failed");
3837 
3838 			/*
3839 			 * Write data if/when the socket is writable
3840 			 * again.
3841 			 */
3842 			return;
3843 		}
3844 	}
3845 
3846 	/*
3847 	 * Done sending, wait for the next request to arrive on the
3848 	 * TCP socket by installing the TCP read handler.
3849 	 */
3850 	if ((data->nsd->tcp_query_count > 0 &&
3851 		data->query_count >= data->nsd->tcp_query_count) ||
3852 		data->tcp_no_more_queries) {
3853 
3854 		(void) shutdown(fd, SHUT_WR);
3855 	}
3856 
3857 	data->bytes_transmitted = 0;
3858 
3859 	timeout.tv_sec = data->tcp_timeout / 1000;
3860 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3861 	ev_base = data->event.ev_base;
3862 	event_del(&data->event);
3863 	memset(&data->event, 0, sizeof(data->event));
3864 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
3865 		handle_tcp_reading, data);
3866 	if(event_base_set(ev_base, &data->event) != 0)
3867 		log_msg(LOG_ERR, "event base set tcpw failed");
3868 	if(event_add(&data->event, &timeout) != 0)
3869 		log_msg(LOG_ERR, "event add tcpw failed");
3870 }
3871 
3872 #ifdef HAVE_SSL
3873 /** create SSL object and associate fd */
3874 static SSL*
3875 incoming_ssl_fd(SSL_CTX* ctx, int fd)
3876 {
3877 	SSL* ssl = SSL_new(ctx);
3878 	if(!ssl) {
3879 		log_crypto_err("could not SSL_new");
3880 		return NULL;
3881 	}
3882 	SSL_set_accept_state(ssl);
3883 	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
3884 	if(!SSL_set_fd(ssl, fd)) {
3885 		log_crypto_err("could not SSL_set_fd");
3886 		SSL_free(ssl);
3887 		return NULL;
3888 	}
3889 	return ssl;
3890 }
3891 
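/*
 * Handshake states, as used below (description inferred from this file):
 * tls_hs_none means no handshake work is pending; tls_hs_read and
 * tls_hs_write mean SSL_do_handshake() is waiting for the socket to
 * become readable or writable; tls_hs_read_event and tls_hs_write_event
 * mean a blocked SSL_write() or SSL_read() is waiting for the opposite
 * socket condition, after which the original handler is restored.
 */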
3892 /** TLS handshake to upgrade TCP connection */
3893 static int
3894 tls_handshake(struct tcp_handler_data* data, int fd, int writing)
3895 {
3896 	int r;
3897 	if(data->shake_state == tls_hs_read_event) {
3898 		/* read condition satisfied, switch back to writing */
3899 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
3900 		data->shake_state = tls_hs_none;
3901 		return 1;
3902 	}
3903 	if(data->shake_state == tls_hs_write_event) {
3904 		/* write condition satisfied, switch back to reading */
3905 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
3906 		data->shake_state = tls_hs_none;
3907 		return 1;
3908 	}
3909 
3910 	/* (continue to) setup the TLS connection */
3911 	ERR_clear_error();
3912 	r = SSL_do_handshake(data->tls);
3913 
3914 	if(r != 1) {
3915 		int want = SSL_get_error(data->tls, r);
3916 		if(want == SSL_ERROR_WANT_READ) {
3917 			if(data->shake_state == tls_hs_read) {
3918 				/* try again later */
3919 				return 1;
3920 			}
3921 			data->shake_state = tls_hs_read;
3922 			/* switch back to reading mode */
3923 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
3924 			return 1;
3925 		} else if(want == SSL_ERROR_WANT_WRITE) {
3926 			if(data->shake_state == tls_hs_write) {
3927 				/* try again later */
3928 				return 1;
3929 			}
3930 			data->shake_state = tls_hs_write;
3931 			/* switch back to writing mode */
3932 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
3933 			return 1;
3934 		} else {
3935 			if(r == 0)
3936 				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
3937 			else {
3938 				unsigned long err = ERR_get_error();
3939 				if(!squelch_err_ssl_handshake(err)) {
3940 					char a[64], s[256];
3941 					addr2str(&data->query->addr, a, sizeof(a));
3942 					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
3943 					log_crypto_from_err(s, err);
3944 				}
3945 			}
3946 			cleanup_tcp_handler(data);
3947 			return 0;
3948 		}
3949 	}
3950 
3951 	/* Log the successful upgrade, useful for testing; could be removed. */
3952 	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
3953 	/* set back to the event we need to have when reading (or writing) */
3954 	if(data->shake_state == tls_hs_read && writing) {
3955 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
3956 	} else if(data->shake_state == tls_hs_write && !writing) {
3957 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
3958 	}
3959 	data->shake_state = tls_hs_none;
3960 	return 1;
3961 }
3962 
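/*
 * Note that SSL_read() may legitimately fail with SSL_ERROR_WANT_WRITE
 * (for instance during renegotiation), so the read path below must be
 * able to switch the event to writing, and SSL_write() conversely with
 * SSL_ERROR_WANT_READ.
 */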
3963 /** handle TLS reading of incoming query */
3964 static void
3965 handle_tls_reading(int fd, short event, void* arg)
3966 {
3967 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3968 	ssize_t received;
3969 
3970 	if ((event & EV_TIMEOUT)) {
3971 		/* Connection timed out.  */
3972 		cleanup_tcp_handler(data);
3973 		return;
3974 	}
3975 
3976 	if ((data->nsd->tcp_query_count > 0 &&
3977 	    data->query_count >= data->nsd->tcp_query_count) ||
3978 	    data->tcp_no_more_queries) {
3979 		/* No more queries allowed on this tcp connection. */
3980 		cleanup_tcp_handler(data);
3981 		return;
3982 	}
3983 
3984 	assert((event & EV_READ));
3985 
3986 	if (data->bytes_transmitted == 0) {
3987 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
3988 	}
3989 
3990 	if(data->shake_state != tls_hs_none) {
3991 		if(!tls_handshake(data, fd, 0))
3992 			return;
3993 		if(data->shake_state != tls_hs_none)
3994 			return;
3995 	}
3996 
3997 	/*
3998 	 * Check if we received the leading packet length bytes yet.
3999 	 */
4000 	if(data->bytes_transmitted < sizeof(uint16_t)) {
4001 		ERR_clear_error();
4002 		if((received=SSL_read(data->tls, (char *) &data->query->tcplen
4003 		    + data->bytes_transmitted,
4004 		    sizeof(uint16_t) - data->bytes_transmitted)) <= 0) {
4005 			int want = SSL_get_error(data->tls, received);
4006 			if(want == SSL_ERROR_ZERO_RETURN) {
4007 				cleanup_tcp_handler(data);
4008 				return; /* shutdown, closed */
4009 			} else if(want == SSL_ERROR_WANT_READ) {
4010 				/* wants to be called again */
4011 				return;
4012 			}
4013 			else if(want == SSL_ERROR_WANT_WRITE) {
4014 				/* switch to writing */
4015 				data->shake_state = tls_hs_write_event;
4016 				tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4017 				return;
4018 			}
4019 			cleanup_tcp_handler(data);
4020 			log_crypto_err("could not SSL_read");
4021 			return;
4022 		}
4023 
4024 		data->bytes_transmitted += received;
4025 		if (data->bytes_transmitted < sizeof(uint16_t)) {
4026 			/*
4027 			 * Not done with the tcplen yet, wait for more
4028 			 * data to become available.
4029 			 */
4030 			return;
4031 		}
4032 
4033 		assert(data->bytes_transmitted == sizeof(uint16_t));
4034 
4035 		data->query->tcplen = ntohs(data->query->tcplen);
4036 
4037 		/*
4038 		 * Minimum query size is 17 octets:
4039 		 *
4040 		 *     Size of the header (12)
4041 		 *   + Root domain name   (1)
4042 		 *   + Query class        (2)
4043 		 *   + Query type         (2)
4044 		 */
4045 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
4046 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
4047 			cleanup_tcp_handler(data);
4048 			return;
4049 		}
4050 
4051 		if (data->query->tcplen > data->query->maxlen) {
4052 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
4053 			cleanup_tcp_handler(data);
4054 			return;
4055 		}
4056 
4057 		buffer_set_limit(data->query->packet, data->query->tcplen);
4058 	}
4059 
4060 	assert(buffer_remaining(data->query->packet) > 0);
4061 
4062 	/* Read the (remaining) query data.  */
4063 	ERR_clear_error();
4064 	received = SSL_read(data->tls, (void*)buffer_current(data->query->packet),
4065 			    (int)buffer_remaining(data->query->packet));
4066 	if(received <= 0) {
4067 		int want = SSL_get_error(data->tls, received);
4068 		if(want == SSL_ERROR_ZERO_RETURN) {
4069 			cleanup_tcp_handler(data);
4070 			return; /* shutdown, closed */
4071 		} else if(want == SSL_ERROR_WANT_READ) {
4072 			/* wants to be called again */
4073 			return;
4074 		}
4075 		else if(want == SSL_ERROR_WANT_WRITE) {
4076 			/* switch back to writing */
4077 			data->shake_state = tls_hs_write_event;
4078 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4079 			return;
4080 		}
4081 		cleanup_tcp_handler(data);
4082 		log_crypto_err("could not SSL_read");
4083 		return;
4084 	}
4085 
4086 	data->bytes_transmitted += received;
4087 	buffer_skip(data->query->packet, received);
4088 	if (buffer_remaining(data->query->packet) > 0) {
4089 		/*
4090 		 * Message not yet complete, wait for more data to
4091 		 * become available.
4092 		 */
4093 		return;
4094 	}
4095 
4096 	assert(buffer_position(data->query->packet) == data->query->tcplen);
4097 
4098 	/* Account... */
4099 #ifndef INET6
4100 	STATUP(data->nsd, ctls);
4101 #else
4102 	if (data->query->addr.ss_family == AF_INET) {
4103 		STATUP(data->nsd, ctls);
4104 	} else if (data->query->addr.ss_family == AF_INET6) {
4105 		STATUP(data->nsd, ctls6);
4106 	}
4107 #endif
4108 
4109 	/* We have a complete query, process it.  */
4110 
4111 	/* bump the per-connection query counter (tcp-query-count) */
4112 	data->query_count++;
4113 
4114 	buffer_flip(data->query->packet);
4115 #ifdef USE_DNSTAP
4116 	dt_collector_submit_auth_query(data->nsd, &data->query->addr,
4117 		data->query->addrlen, data->query->tcp, data->query->packet);
4118 #endif /* USE_DNSTAP */
4119 	data->query_state = server_process_query(data->nsd, data->query);
4120 	if (data->query_state == QUERY_DISCARDED) {
4121 		/* Drop the packet and the entire connection... */
4122 		STATUP(data->nsd, dropped);
4123 		ZTATUP(data->nsd, data->query->zone, dropped);
4124 		cleanup_tcp_handler(data);
4125 		return;
4126 	}
4127 
4128 #ifdef BIND8_STATS
4129 	if (RCODE(data->query->packet) == RCODE_OK
4130 	    && !AA(data->query->packet))
4131 	{
4132 		STATUP(data->nsd, nona);
4133 		ZTATUP(data->nsd, data->query->zone, nona);
4134 	}
4135 #endif /* BIND8_STATS */
4136 
4137 #ifdef USE_ZONE_STATS
4138 #ifndef INET6
4139 	ZTATUP(data->nsd, data->query->zone, ctls);
4140 #else
4141 	if (data->query->addr.ss_family == AF_INET) {
4142 		ZTATUP(data->nsd, data->query->zone, ctls);
4143 	} else if (data->query->addr.ss_family == AF_INET6) {
4144 		ZTATUP(data->nsd, data->query->zone, ctls6);
4145 	}
4146 #endif
4147 #endif /* USE_ZONE_STATS */
4148 
4149 	query_add_optional(data->query, data->nsd);
4150 
4151 	/* Switch to the tcp write handler.  */
4152 	buffer_flip(data->query->packet);
4153 	data->query->tcplen = buffer_remaining(data->query->packet);
4154 #ifdef BIND8_STATS
4155 	/* Account the rcode & TC... */
4156 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
4157 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
4158 	if (TC(data->query->packet)) {
4159 		STATUP(data->nsd, truncated);
4160 		ZTATUP(data->nsd, data->query->zone, truncated);
4161 	}
4162 #endif /* BIND8_STATS */
4163 #ifdef USE_DNSTAP
4164 	dt_collector_submit_auth_response(data->nsd, &data->query->addr,
4165 		data->query->addrlen, data->query->tcp, data->query->packet,
4166 		data->query->zone);
4167 #endif /* USE_DNSTAP */
4168 	data->bytes_transmitted = 0;
4169 
4170 	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4171 
4172 	/* see if we can write the answer right away (usually we can; EAGAIN if not) */
4173 	handle_tls_writing(fd, EV_WRITE, data);
4174 }
4175 
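/*
 * TLS has no writev() equivalent, so the two-octet length and the start
 * of the message are copied into a single temporary buffer and written
 * with one SSL_write().  SSL_MODE_ENABLE_PARTIAL_WRITE makes SSL_write()
 * report partial progress instead of insisting on the whole buffer.
 */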
4176 /** handle TLS writing of outgoing response */
4177 static void
4178 handle_tls_writing(int fd, short event, void* arg)
4179 {
4180 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4181 	ssize_t sent;
4182 	struct query *q = data->query;
4183 	/* static reassembly buffer used to put the two-octet TCP length
4184 	 * in front of the packet, mimicking writev. */
4185 	static buffer_type* global_tls_temp_buffer = NULL;
4186 	buffer_type* write_buffer;
4187 
4188 	if ((event & EV_TIMEOUT)) {
4189 		/* Connection timed out.  */
4190 		cleanup_tcp_handler(data);
4191 		return;
4192 	}
4193 
4194 	assert((event & EV_WRITE));
4195 
4196 	if(data->shake_state != tls_hs_none) {
4197 		if(!tls_handshake(data, fd, 1))
4198 			return;
4199 		if(data->shake_state != tls_hs_none)
4200 			return;
4201 	}
4202 
4203 	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);
4204 
4205 	/* If we are writing the start of a message, we must include the
4206 	 * length; this is done by copying into write_buffer. */
4207 	write_buffer = NULL;
4208 	if (data->bytes_transmitted == 0) {
4209 		if(!global_tls_temp_buffer) {
4210 			/* allocated in nsd.region, so it is deallocated
4211 			 * when nsd shuts down */
4212 			global_tls_temp_buffer = buffer_create(nsd.region,
4213 				QIOBUFSZ + sizeof(q->tcplen));
4214 			if (!global_tls_temp_buffer) {
4215 				return;
4216 			}
4217 		}
4218 		write_buffer = global_tls_temp_buffer;
4219 		buffer_clear(write_buffer);
4220 		buffer_write_u16(write_buffer, q->tcplen);
4221 		buffer_write(write_buffer, buffer_current(q->packet),
4222 			(int)buffer_remaining(q->packet));
4223 		buffer_flip(write_buffer);
4224 	} else {
4225 		write_buffer = q->packet;
4226 	}
4227 
4228 	/* Write the response */
4229 	ERR_clear_error();
4230 	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
4231 	if(sent <= 0) {
4232 		int want = SSL_get_error(data->tls, sent);
4233 		if(want == SSL_ERROR_ZERO_RETURN) {
4234 			cleanup_tcp_handler(data);
4235 			/* closed */
4236 		} else if(want == SSL_ERROR_WANT_READ) {
4237 			/* switch back to reading */
4238 			data->shake_state = tls_hs_read_event;
4239 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4240 		} else if(want != SSL_ERROR_WANT_WRITE) {
4241 			cleanup_tcp_handler(data);
4242 			log_crypto_err("could not SSL_write");
4243 		}
4244 		return;
4245 	}
4246 
4247 	buffer_skip(write_buffer, sent);
4248 	if(buffer_remaining(write_buffer) != 0) {
4249 		/* Not all was sent; if the temporary buffer was in use, advance the real packet buffer past the payload bytes already sent. */
4250 		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
4251 			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
4252 		}
4253 	}
4254 
4255 	data->bytes_transmitted += sent;
4256 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4257 		/*
4258 		 * Still more data to write when socket becomes
4259 		 * writable again.
4260 		 */
4261 		return;
4262 	}
4263 
4264 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4265 
4266 	if (data->query_state == QUERY_IN_AXFR) {
4267 		/* Continue processing AXFR and writing back results.  */
4268 		buffer_clear(q->packet);
4269 		data->query_state = query_axfr(data->nsd, q);
4270 		if (data->query_state != QUERY_PROCESSED) {
4271 			query_add_optional(data->query, data->nsd);
4272 
4273 			/* Reset data. */
4274 			buffer_flip(q->packet);
4275 			q->tcplen = buffer_remaining(q->packet);
4276 			data->bytes_transmitted = 0;
4277 			/* Reset to writing mode.  */
4278 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4279 
4280 			/*
4281 			 * Write data if/when the socket is writable
4282 			 * again.
4283 			 */
4284 			return;
4285 		}
4286 	}
4287 
4288 	/*
4289 	 * Done sending, wait for the next request to arrive on the
4290 	 * TCP socket by installing the TCP read handler.
4291 	 */
4292 	if ((data->nsd->tcp_query_count > 0 &&
4293 		data->query_count >= data->nsd->tcp_query_count) ||
4294 		data->tcp_no_more_queries) {
4295 
4296 		(void) shutdown(fd, SHUT_WR);
4297 	}
4298 
4299 	data->bytes_transmitted = 0;
4300 
4301 	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4302 }
4303 #endif
4304 
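/*
 * Timer callback that ends slow-accept mode: re-enables the TCP accept
 * handlers a while after accept() failed with EMFILE or ENFILE.
 */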
4305 static void
4306 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
4307 	void* ATTR_UNUSED(arg))
4308 {
4309 	if(slowaccept) {
4310 		configure_handler_event_types(EV_PERSIST | EV_READ);
4311 		slowaccept = 0;
4312 	}
4313 }
4314 
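/*
 * Accept a connection and make the new socket non-blocking; accept4()
 * does both atomically where available, otherwise accept() is followed
 * by fcntl().
 */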
4315 static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
4316 {
4317 #ifndef HAVE_ACCEPT4
4318 	int s = accept(fd, addr, addrlen);
4319 	if (s != -1) {
4320 		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
4321 			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
4322 			close(s);
4323 			s = -1;
4324 			errno=EINTR; /* set errno to EINTR so that the
4325 				caller treats this as a transient failure
4326 				and omits the accept error printout */
4327 		}
4328 	}
4329 	return s;
4330 #else
4331 	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
4332 #endif /* HAVE_ACCEPT4 */
4333 }
4334 
4335 /*
4336  * Handle an incoming TCP connection.  The connection is accepted and
4337  * a new TCP reader event handler is added.  The TCP handler
4338  * is responsible for cleanup when the connection is closed.
4339  */
4340 static void
4341 handle_tcp_accept(int fd, short event, void* arg)
4342 {
4343 	struct tcp_accept_handler_data *data
4344 		= (struct tcp_accept_handler_data *) arg;
4345 	int s;
4346 	int reject = 0;
4347 	struct tcp_handler_data *tcp_data;
4348 	region_type *tcp_region;
4349 #ifdef INET6
4350 	struct sockaddr_storage addr;
4351 #else
4352 	struct sockaddr_in addr;
4353 #endif
4354 	socklen_t addrlen;
4355 	struct timeval timeout;
4356 
4357 	if (!(event & EV_READ)) {
4358 		return;
4359 	}
4360 
4361 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
4362 		reject = data->nsd->options->tcp_reject_overflow;
4363 		if (!reject) {
4364 			return;
4365 		}
4366 	}
4367 
4368 	/* Accept it... */
4369 	addrlen = sizeof(addr);
4370 	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
4371 	if (s == -1) {
4372 		/**
4373 		 * EMFILE and ENFILE signal that the limit on open
4374 		 * file descriptors has been reached. Pause accept().
4375 		 * EINTR means the call was interrupted by a signal. The others
4376 		 * are various OS ways of saying that the client closed the connection.
4377 		 */
4378 		if (errno == EMFILE || errno == ENFILE) {
4379 			if (!slowaccept) {
4380 				/* disable accept events */
4381 				struct timeval tv;
4382 				configure_handler_event_types(0);
4383 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
4384 				tv.tv_usec = 0L;
4385 				memset(&slowaccept_event, 0,
4386 					sizeof(slowaccept_event));
4387 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
4388 					handle_slowaccept_timeout, NULL);
4389 				(void)event_base_set(data->event.ev_base,
4390 					&slowaccept_event);
4391 				(void)event_add(&slowaccept_event, &tv);
4392 				slowaccept = 1;
4393 				/* We don't want to spam the logs here */
4394 			}
4395 		} else if (errno != EINTR
4396 			&& errno != EWOULDBLOCK
4397 #ifdef ECONNABORTED
4398 			&& errno != ECONNABORTED
4399 #endif /* ECONNABORTED */
4400 #ifdef EPROTO
4401 			&& errno != EPROTO
4402 #endif /* EPROTO */
4403 			) {
4404 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
4405 		}
4406 		return;
4407 	}
4408 
4409 	if (reject) {
4410 		shutdown(s, SHUT_RDWR);
4411 		close(s);
4412 		return;
4413 	}
4414 
4415 	/*
4416 	 * This region is deallocated when the TCP connection is
4417 	 * closed by the TCP handler.
4418 	 */
4419 	tcp_region = region_create(xalloc, free);
4420 	tcp_data = (struct tcp_handler_data *) region_alloc(
4421 		tcp_region, sizeof(struct tcp_handler_data));
4422 	tcp_data->region = tcp_region;
4423 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
4424 		compression_table_size, compressed_dnames);
4425 	tcp_data->nsd = data->nsd;
4426 	tcp_data->query_count = 0;
4427 #ifdef HAVE_SSL
4428 	tcp_data->shake_state = tls_hs_none;
4429 	tcp_data->tls = NULL;
4430 #endif
4431 	tcp_data->prev = NULL;
4432 	tcp_data->next = NULL;
4433 
4434 	tcp_data->query_state = QUERY_PROCESSED;
4435 	tcp_data->bytes_transmitted = 0;
4436 	memcpy(&tcp_data->query->addr, &addr, addrlen);
4437 	tcp_data->query->addrlen = addrlen;
4438 
4439 	tcp_data->tcp_no_more_queries = 0;
4440 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
4441 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
4442 		/* very busy, give smaller timeout */
4443 		tcp_data->tcp_timeout = 200;
4444 	}
4445 	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4446 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
4447 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
4448 
4449 #ifdef HAVE_SSL
4450 	if (data->tls_accept) {
4451 		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
4452 		if(!tcp_data->tls) {
4453 			close(s);
4454 			return;
4455 		}
4456 		tcp_data->shake_state = tls_hs_read;
4457 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4458 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4459 			  handle_tls_reading, tcp_data);
4460 	} else {
4461 #endif
4462 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4463 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4464 			  handle_tcp_reading, tcp_data);
4465 #ifdef HAVE_SSL
4466 	}
4467 #endif
4468 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
4469 		log_msg(LOG_ERR, "cannot set tcp event base");
4470 		close(s);
4471 		region_destroy(tcp_region);
4472 		return;
4473 	}
4474 	if(event_add(&tcp_data->event, &timeout) != 0) {
4475 		log_msg(LOG_ERR, "cannot add tcp to event base");
4476 		close(s);
4477 		region_destroy(tcp_region);
4478 		return;
4479 	}
4480 	if(tcp_active_list) {
4481 		tcp_active_list->prev = tcp_data;
4482 		tcp_data->next = tcp_active_list;
4483 	}
4484 	tcp_active_list = tcp_data;
4485 
4486 	/*
4487 	 * Keep track of the total number of TCP handlers installed so
4488 	 * we can stop accepting connections when the maximum number
4489 	 * of simultaneous TCP connections is reached.
4490 	 *
4491 	 * If tcp-reject-overflow is enabled, however, then we do not
4492 	 * change the handler event type; we keep it as-is and accept
4493 	 * overflow TCP connections only so that we can forcibly kill
4494 	 * them off.
4495 	 */
4496 	++data->nsd->current_tcp_count;
4497 	if (!data->nsd->options->tcp_reject_overflow &&
4498 	     data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
4499 	{
4500 		configure_handler_event_types(0);
4501 	}
4502 }
4503 
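/*
 * Send a command over the IPC channel to every child server process.
 * When timeout is nonzero, wait (up to that timeout) for the child to
 * acknowledge the command before the channel is closed.
 */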
4504 static void
4505 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
4506 {
4507 	size_t i;
4508 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4509 	for (i = 0; i < nsd->child_count; ++i) {
4510 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
4511 			if (write(nsd->children[i].child_fd,
4512 				&command,
4513 				sizeof(command)) == -1)
4514 			{
4515 				if(errno != EAGAIN && errno != EINTR)
4516 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
4517 					(int) command,
4518 					(int) nsd->children[i].pid,
4519 					strerror(errno));
4520 			} else if (timeout > 0) {
4521 				(void)block_read(NULL,
4522 					nsd->children[i].child_fd,
4523 					&command, sizeof(command), timeout);
4524 			}
4525 			fsync(nsd->children[i].child_fd);
4526 			close(nsd->children[i].child_fd);
4527 			nsd->children[i].child_fd = -1;
4528 		}
4529 	}
4530 }
4531 
4532 static void
4533 send_children_quit(struct nsd* nsd)
4534 {
4535 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
4536 	send_children_command(nsd, NSD_QUIT, 0);
4537 }
4538 
4539 static void
4540 send_children_quit_and_wait(struct nsd* nsd)
4541 {
4542 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
4543 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
4544 }
4545 
4546 #ifdef BIND8_STATS
4547 static void
4548 set_children_stats(struct nsd* nsd)
4549 {
4550 	size_t i;
4551 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4552 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
4553 	for (i = 0; i < nsd->child_count; ++i) {
4554 		nsd->children[i].need_to_send_STATS = 1;
4555 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
4556 	}
4557 }
4558 #endif /* BIND8_STATS */
4559 
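/*
 * (Re)register every TCP accept handler with the given event types, or
 * remove them all when event_types is zero.  This is how accepting new
 * connections is paused when the connection limit (or the descriptor
 * limit) is hit, and resumed once connections are freed again.
 */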
4560 static void
4561 configure_handler_event_types(short event_types)
4562 {
4563 	size_t i;
4564 
4565 	for (i = 0; i < tcp_accept_handler_count; ++i) {
4566 		struct event* handler = &tcp_accept_handlers[i].event;
4567 		if(event_types) {
4568 			/* reassign */
4569 			int fd = handler->ev_fd;
4570 			struct event_base* base = handler->ev_base;
4571 			if(tcp_accept_handlers[i].event_added)
4572 				event_del(handler);
4573 			memset(handler, 0, sizeof(*handler));
4574 			event_set(handler, fd, event_types,
4575 				handle_tcp_accept, &tcp_accept_handlers[i]);
4576 			if(event_base_set(base, handler) != 0)
4577 				log_msg(LOG_ERR, "conhand: cannot event_base");
4578 			if(event_add(handler, NULL) != 0)
4579 				log_msg(LOG_ERR, "conhand: cannot event_add");
4580 			tcp_accept_handlers[i].event_added = 1;
4581 		} else {
4582 			/* remove */
4583 			if(tcp_accept_handlers[i].event_added) {
4584 				event_del(handler);
4585 				tcp_accept_handlers[i].event_added = 0;
4586 			}
4587 		}
4588 	}
4589 }
4590