xref: /openbsd-src/usr.sbin/nsd/server.c (revision 1a8dbaac879b9f3335ad7fb25429ce63ac1d6bac)
/*
 * server.c -- nsd(8) network input/output
 *
 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
 *
 * See LICENSE for the license.
 *
 */

#include "config.h"

#include <sys/types.h>
#include <sys/param.h>
#include <limits.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <netinet/in.h>
#ifdef USE_TCP_FASTOPEN
  #include <netinet/tcp.h>
#endif
#include <arpa/inet.h>

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include <netdb.h>
#include <poll.h>
#ifdef HAVE_SYS_RANDOM_H
#include <sys/random.h>
#endif
#ifndef SHUT_WR
#define SHUT_WR 1
#endif
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif /* HAVE_MMAP */
#ifdef HAVE_OPENSSL_RAND_H
#include <openssl/rand.h>
#endif
#ifdef HAVE_OPENSSL_SSL_H
#include <openssl/ssl.h>
#endif
#ifdef HAVE_OPENSSL_ERR_H
#include <openssl/err.h>
#endif
#ifdef HAVE_OPENSSL_OCSP_H
#include <openssl/ocsp.h>
#endif
#ifndef USE_MINI_EVENT
#  ifdef HAVE_EVENT_H
#    include <event.h>
#  else
#    include <event2/event.h>
#    include "event2/event_struct.h"
#    include "event2/event_compat.h"
#  endif
#else
#  include "mini_event.h"
#endif

#include "axfr.h"
#include "namedb.h"
#include "netio.h"
#include "xfrd.h"
#include "xfrd-tcp.h"
#include "xfrd-disk.h"
#include "difffile.h"
#include "nsec3.h"
#include "ipc.h"
#include "udb.h"
#include "remote.h"
#include "lookup3.h"
#include "rrl.h"
#ifdef USE_DNSTAP
#include "dnstap/dnstap_collector.h"
#endif

#define RELOAD_SYNC_TIMEOUT 25 /* seconds */

#ifdef USE_TCP_FASTOPEN
  #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
  #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
#endif
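
/*
 * The kernel value read from TCP_FASTOPEN_FILE is a bit mask: 0x1 enables
 * client-side TFO and 0x2 (TCP_FASTOPEN_SERVER_BIT_MASK above) enables
 * server-side TFO, so for example net.ipv4.tcp_fastopen=3 enables both.
 */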

/*
 * Data for the UDP handlers.
 */
struct udp_handler_data
{
	struct nsd        *nsd;
	struct nsd_socket *socket;
	struct event       event;
};

struct tcp_accept_handler_data {
	struct nsd        *nsd;
	struct nsd_socket *socket;
	int                event_added;
	struct event       event;
#ifdef HAVE_SSL
	/* handler accepts TLS connections on the dedicated port */
	int                tls_accept;
#endif
};

/*
 * These globals are used to re-enable the TCP accept handlers
 * when the number of TCP connections drops below the maximum
 * number of TCP connections.
 */
static size_t tcp_accept_handler_count;
static struct tcp_accept_handler_data *tcp_accept_handlers;

static struct event slowaccept_event;
static int slowaccept;

#ifdef HAVE_SSL
static unsigned char *ocspdata = NULL;
static long ocspdata_len = 0;
#endif

#ifdef NONBLOCKING_IS_BROKEN
/* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
   read multiple times from a socket when reported ready by select. */
# define NUM_RECV_PER_SELECT (1)
#else /* !NONBLOCKING_IS_BROKEN */
# define NUM_RECV_PER_SELECT (100)
#endif /* NONBLOCKING_IS_BROKEN */

#ifndef HAVE_MMSGHDR
struct mmsghdr {
	struct msghdr msg_hdr;
	unsigned int  msg_len;
};
#endif

static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
static struct iovec iovecs[NUM_RECV_PER_SELECT];
static struct query *queries[NUM_RECV_PER_SELECT];
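
/*
 * A minimal sketch (assumed wiring, not the code that actually runs here)
 * of how the three arrays above combine for one batched recvmmsg() call;
 * the real initialization and receive loop live in the service code.
 *
 *	for(i = 0; i < NUM_RECV_PER_SELECT; i++) {
 *		iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
 *		iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
 *		msgs[i].msg_hdr.msg_iov     = &iovecs[i];
 *		msgs[i].msg_hdr.msg_iovlen  = 1;
 *		msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
 *		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
 *	}
 *	received = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
 */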

/*
 * Data for the TCP connection handlers.
 *
 * The TCP handlers use non-blocking I/O.  This is necessary to avoid
 * blocking the entire server on a slow TCP connection, but does make
 * reading from and writing to the socket more complicated.
 *
 * Basically, whenever a read/write would block (indicated by EAGAIN
 * in errno) we remember the position we were reading from/writing to
 * and return from the TCP reading/writing event handler.  When the
 * socket becomes readable/writable again we continue from the same
 * position; see the sketch after this structure.
 */
struct tcp_handler_data
{
	/*
	 * The region used to allocate all TCP connection related
	 * data, including this structure.  This region is destroyed
	 * when the connection is closed.
	 */
	region_type*		region;

	/*
	 * The global nsd structure.
	 */
	struct nsd*			nsd;

	/*
	 * The current query data for this TCP connection.
	 */
	query_type*			query;

	/*
	 * The query_state is used to remember if we are performing an
	 * AXFR, if we're done processing, or if we should discard the
	 * query and connection.
	 */
	query_state_type	query_state;

	/*
	 * The event for the file descriptor and tcp timeout
	 */
	struct event event;

	/*
	 * The bytes_transmitted field is used to remember the number
	 * of bytes transmitted when receiving or sending a DNS
	 * packet.  The count includes the two additional bytes used
	 * to specify the packet length on a TCP connection.
	 */
	size_t				bytes_transmitted;

	/*
	 * The number of queries handled by this specific TCP connection.
	 */
	int					query_count;

	/*
	 * The timeout in msec for this tcp connection
	 */
	int	tcp_timeout;
#ifdef HAVE_SSL
	/*
	 * TLS object.
	 */
	SSL* tls;

	/*
	 * TLS handshake state.
	 */
	enum { tls_hs_none, tls_hs_read, tls_hs_write,
		tls_hs_read_event, tls_hs_write_event } shake_state;
#endif
	/* list of connections, for service of remaining tcp channels */
	struct tcp_handler_data *prev, *next;
};
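
/*
 * A minimal sketch of the resume-on-EAGAIN pattern described above, with
 * hypothetical variable names; the real logic lives in handle_tcp_reading()
 * and handle_tcp_writing() below.
 *
 *	received = read(fd, buf + data->bytes_transmitted,
 *		total - data->bytes_transmitted);
 *	if(received == -1) {
 *		if(errno == EAGAIN || errno == EINTR)
 *			return;            -- blocked: retry on the next event
 *		...handle a real error, drop the connection...
 *	}
 *	data->bytes_transmitted += received;
 *	if(data->bytes_transmitted < total)
 *		return;                    -- partial: resume here next time
 */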
/* global that is the list of active tcp channels */
static struct tcp_handler_data *tcp_active_list = NULL;

/*
 * Handle incoming queries on the UDP server sockets.
 */
static void handle_udp(int fd, short event, void* arg);

/*
 * Handle incoming connections on the TCP sockets.  These handlers
 * usually wait for the NETIO_EVENT_READ event (indicating an incoming
 * connection) but are disabled when the number of current TCP
 * connections is equal to the maximum number of TCP connections.
 * Disabling is done by changing the handler to wait for the
 * NETIO_EVENT_NONE type.  This is done using the function
 * configure_handler_event_types.
 */
static void handle_tcp_accept(int fd, short event, void* arg);
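
/*
 * A sketch of the corresponding enable/disable calls (assumed usage; the
 * exact event flags are chosen where the accept handlers are serviced):
 *
 *	configure_handler_event_types(0);        -- at the limit, stop accepting
 *	configure_handler_event_types(EV_READ);  -- a slot freed, accept again
 */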

/*
 * Handle incoming queries on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete query is received.
 */
static void handle_tcp_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete response is sent.
 */
static void handle_tcp_writing(int fd, short event, void* arg);

#ifdef HAVE_SSL
/* Create SSL object and associate fd */
static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
/*
 * Handle TLS handshake. May be called multiple times if incomplete.
 */
static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);

/*
 * Handle incoming queries on a TLS over TCP connection.  The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete query is received.
 */
static void handle_tls_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TLS over TCP connection.  The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete response is sent.
 */
static void handle_tls_writing(int fd, short event, void* arg);
#endif

/*
 * Send the quit command to all children non-blocking, then close the pipes.
 */
static void send_children_quit(struct nsd* nsd);
/* same, for shutdown time; waits for the children to exit to avoid restart issues */
static void send_children_quit_and_wait(struct nsd* nsd);

/* set the children's flags to send NSD_STATS to them */
#ifdef BIND8_STATS
static void set_children_stats(struct nsd* nsd);
#endif /* BIND8_STATS */

/*
 * Change the event types the HANDLERS are interested in to EVENT_TYPES.
 */
static void configure_handler_event_types(short event_types);

static uint16_t *compressed_dname_offsets = 0;
static uint32_t compression_table_capacity = 0;
static uint32_t compression_table_size = 0;
static domain_type* compressed_dnames[MAXRRSPP];

#ifdef USE_TCP_FASTOPEN
/* Checks to see if the kernel value must be manually changed in order for
   TCP Fast Open to support server mode */
static void report_tcp_fastopen_config(void) {

	int tcp_fastopen_fd;
	uint8_t tcp_fastopen_value;

	if ( (tcp_fastopen_fd = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s",
			strerror(errno));
		return;
	}
	if (read(tcp_fastopen_fd, &tcp_fastopen_value, 1) == -1 ) {
		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s",
			strerror(errno));
		close(tcp_fastopen_fd);
		return;
	}
	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
		log_msg(LOG_WARNING, "TCP Fast Open support is available and configured in NSD by default.");
		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.");
		log_msg(LOG_WARNING, "To enable TFO use the command:");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode");
		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.");
	}
	close(tcp_fastopen_fd);
}
#endif

/*
 * Remove the specified pid from the list of child pids.  Returns -1 if
 * the pid is not in the list, or the child number otherwise.  The pid
 * field of that child is set to 0.
 */
static int
delete_child_pid(struct nsd *nsd, pid_t pid)
{
	size_t i;
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid == pid) {
			nsd->children[i].pid = 0;
			if(!nsd->children[i].need_to_exit) {
				if(nsd->children[i].child_fd != -1)
					close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
				if(nsd->children[i].handler)
					nsd->children[i].handler->fd = -1;
			}
			return i;
		}
	}
	return -1;
}

/*
 * Restart child servers if necessary.
 */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	struct main_ipc_handler_data *ipc_data;
	size_t i;
	int sv[2];

	/* Fork the child processes... */
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid <= 0) {
			if (nsd->children[i].child_fd != -1)
				close(nsd->children[i].child_fd);
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
				log_msg(LOG_ERR, "socketpair: %s",
					strerror(errno));
				return -1;
			}
			nsd->children[i].child_fd = sv[0];
			nsd->children[i].parent_fd = sv[1];
			nsd->children[i].pid = fork();
			switch (nsd->children[i].pid) {
			default: /* SERVER MAIN */
				close(nsd->children[i].parent_fd);
				nsd->children[i].parent_fd = -1;
				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				if(!nsd->children[i].handler)
				{
					ipc_data = (struct main_ipc_handler_data*) region_alloc(
						region, sizeof(struct main_ipc_handler_data));
					ipc_data->nsd = nsd;
					ipc_data->child = &nsd->children[i];
					ipc_data->child_num = i;
					ipc_data->xfrd_sock = xfrd_sock_p;
					ipc_data->packet = buffer_create(region, QIOBUFSZ);
					ipc_data->forward_mode = 0;
					ipc_data->got_bytes = 0;
					ipc_data->total_bytes = 0;
					ipc_data->acl_num = 0;
					nsd->children[i].handler = (struct netio_handler*) region_alloc(
						region, sizeof(struct netio_handler));
					nsd->children[i].handler->fd = nsd->children[i].child_fd;
					nsd->children[i].handler->timeout = NULL;
					nsd->children[i].handler->user_data = ipc_data;
					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
					nsd->children[i].handler->event_handler = parent_handle_child_command;
					netio_add_handler(netio, nsd->children[i].handler);
				}
				/* clear any ongoing ipc */
				ipc_data = (struct main_ipc_handler_data*)
					nsd->children[i].handler->user_data;
				ipc_data->forward_mode = 0;
				/* restart - update fd */
				nsd->children[i].handler->fd = nsd->children[i].child_fd;
				break;
			case 0: /* CHILD */
				/* the child need not be able to access the
				 * nsd.db file */
				namedb_close_udb(nsd->db);
#ifdef MEMCLEAN /* OS collects memory pages */
				region_destroy(region);
#endif

				if (pledge("stdio rpath inet", NULL) == -1) {
					log_msg(LOG_ERR, "pledge: %s", strerror(errno));
					exit(1);
				}

				nsd->pid = 0;
				nsd->child_count = 0;
				nsd->server_kind = nsd->children[i].kind;
				nsd->this_child = &nsd->children[i];
				nsd->this_child->child_num = i;
				/* remove signal flags inherited from parent;
				   the parent will handle them. */
				nsd->signal_hint_reload_hup = 0;
				nsd->signal_hint_reload = 0;
				nsd->signal_hint_child = 0;
				nsd->signal_hint_quit = 0;
				nsd->signal_hint_shutdown = 0;
				nsd->signal_hint_stats = 0;
				nsd->signal_hint_statsusr = 0;
				close(*xfrd_sock_p);
				close(nsd->this_child->child_fd);
				nsd->this_child->child_fd = -1;
				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				server_child(nsd);
				/* NOTREACHED */
				exit(0);
			case -1:
				log_msg(LOG_ERR, "fork failed: %s",
					strerror(errno));
				return -1;
			}
		}
	}
	return 0;
}

#ifdef BIND8_STATS
static void set_bind8_alarm(struct nsd* nsd)
{
	/* resync so that the next alarm is on the next whole minute */
	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
}
#endif
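
/*
 * Example of the resync arithmetic in set_bind8_alarm(): with a period of
 * 60 seconds and time(NULL) % 60 == 22, alarm(38) is set, so the next
 * alarm fires exactly on the next whole minute.
 */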

/* set zone stat ids for zones initially read in */
static void
zonestatid_tree_set(struct nsd* nsd)
{
	struct radnode* n;
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		zone_type* zone = (zone_type*)n->elem;
		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
	}
}

#ifdef USE_ZONE_STATS
void
server_zonestat_alloc(struct nsd* nsd)
{
	size_t num = (nsd->options->zonestatnames->count==0?1:
			nsd->options->zonestatnames->count);
	size_t sz = sizeof(struct nsdst)*num;
	char tmpfile[256];
	uint8_t z = 0;

	/* file names */
	nsd->zonestatfname[0] = 0;
	nsd->zonestatfname[1] = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);

	/* file descriptors */
	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[0] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		close(nsd->zonestatfd[0]);
		unlink(nsd->zonestatfname[0]);
		exit(1);
	}

#ifdef HAVE_MMAP
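	/* The lseek()+write() pairs below are the usual idiom for growing a
	 * file before mmap()ing it: seek to the last byte of the desired size
	 * and write a single zero byte, so the file is extended and the
	 * MAP_SHARED mapping is fully backed by the file. */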
	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[0], strerror(errno));
		exit(1);
	}
	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[1], strerror(errno));
		exit(1);
	}
	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[0], 0);
	if(nsd->zonestat[0] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[1], 0);
	if(nsd->zonestat[1] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->zonestat[0], 0, sz);
	memset(nsd->zonestat[1], 0, sz);
	nsd->zonestatsize[0] = num;
	nsd->zonestatsize[1] = num;
	nsd->zonestatdesired = num;
	nsd->zonestatsizenow = num;
	nsd->zonestatnow = nsd->zonestat[0];
#endif /* HAVE_MMAP */
}

void
zonestat_remap(struct nsd* nsd, int idx, size_t sz)
{
#ifdef HAVE_MMAP
#ifdef MREMAP_MAYMOVE
	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
		MREMAP_MAYMOVE);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
		exit(1);
	}
#else /* !MREMAP_MAYMOVE */
	if(msync(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
	if(munmap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		exit(1);
	}
#endif /* MREMAP_MAYMOVE */
#endif /* HAVE_MMAP */
}

/* realloc the zonestat array for the one that is not currently in use,
 * to match the desired new size of the array (if applicable) */
void
server_zonestat_realloc(struct nsd* nsd)
{
#ifdef HAVE_MMAP
	uint8_t z = 0;
	size_t sz;
	int idx = 0; /* index of the zonestat array that is not in use */
	if(nsd->zonestatnow == nsd->zonestat[0])
		idx = 1;
	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
		return;
	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[idx], strerror(errno));
		exit(1);
	}
	zonestat_remap(nsd, idx, sz);
	/* zero the newly allocated region */
	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
	}
	nsd->zonestatsize[idx] = nsd->zonestatdesired;
#endif /* HAVE_MMAP */
}

/* switch over to the other array for the new children, which briefly
 * coexist with the old children; this way the two generations do not
 * write to the same statistics array. */
void
server_zonestat_switch(struct nsd* nsd)
{
	if(nsd->zonestatnow == nsd->zonestat[0]) {
		nsd->zonestatnow = nsd->zonestat[1];
		nsd->zonestatsizenow = nsd->zonestatsize[1];
	} else {
		nsd->zonestatnow = nsd->zonestat[0];
		nsd->zonestatsizenow = nsd->zonestatsize[0];
	}
}
#endif /* USE_ZONE_STATS */
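
/*
 * A sketch of the intended reload sequence for the two arrays (assumed
 * usage; the calls are made from the reload logic elsewhere):
 *
 *	server_zonestat_realloc(nsd);  -- resize the idle array to the new count
 *	server_zonestat_switch(nsd);   -- new children write to it from now on
 *
 * Old children keep writing to the previous array until they exit, so the
 * two generations never update the same statistics memory.
 */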

static void
cleanup_dname_compression_tables(void *ptr)
{
	free(ptr);
	compressed_dname_offsets = NULL;
	compression_table_capacity = 0;
}

static void
initialize_dname_compression_tables(struct nsd *nsd)
{
	size_t needed = domain_table_count(nsd->db->domains) + 1;
	needed += EXTRA_DOMAIN_NUMBERS;
	if(compression_table_capacity < needed) {
		if(compressed_dname_offsets) {
			region_remove_cleanup(nsd->db->region,
				cleanup_dname_compression_tables,
				compressed_dname_offsets);
			free(compressed_dname_offsets);
		}
		compressed_dname_offsets = (uint16_t *) xmallocarray(
			needed, sizeof(uint16_t));
		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
			compressed_dname_offsets);
		compression_table_capacity = needed;
		compression_table_size=domain_table_count(nsd->db->domains)+1;
	}
	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
}

static int
set_cloexec(struct nsd_socket *sock)
{
	assert(sock != NULL);

	if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) {
		const char *socktype =
			sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
		log_msg(LOG_ERR, "fcntl(..., FD_CLOEXEC) failed for %s: %s",
			socktype, strerror(errno));
		return -1;
	}

	return 1;
}

static int
set_reuseport(struct nsd_socket *sock)
{
#ifdef SO_REUSEPORT
	int on = 1;
#ifdef SO_REUSEPORT_LB
	/* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like
	 * SO_REUSEPORT on Linux. This is what users of the nsd.conf option
	 * want; if they actually need plain local address and port reuse
	 * they will have to set SO_REUSEPORT themselves, so assume it is
	 * _LB they want here.
	 */
	int opt = SO_REUSEPORT_LB;
	static const char optname[] = "SO_REUSEPORT_LB";
#else /* !SO_REUSEPORT_LB */
	int opt = SO_REUSEPORT;
	static const char optname[] = "SO_REUSEPORT";
#endif /* SO_REUSEPORT_LB */

	if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) {
		return 1;
	} else if(verbosity >= 3 || errno != ENOPROTOOPT) {
		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
			optname, strerror(errno));
	}
	return -1;
#else
	(void)sock;
#endif /* SO_REUSEPORT */

	return 0;
}
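
/*
 * Note: like set_reuseport() above, the set_*() socket helpers here follow
 * a tri-state return convention: 1 means the option was applied, 0 means it
 * is unsupported (or compiled out) and may be ignored, and -1 means an
 * error the caller can treat as fatal.
 */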

static int
set_reuseaddr(struct nsd_socket *sock)
{
#ifdef SO_REUSEADDR
	int on = 1;
	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
		strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* SO_REUSEADDR */
	return 0;
}

static int
set_rcvbuf(struct nsd_socket *sock, int rcv)
{
#ifdef SO_RCVBUF
#ifdef SO_RCVBUFFORCE
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	if(errno == EPERM || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
		strerror(errno));
	return -1;
#else /* !SO_RCVBUFFORCE */
	if (0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	if(errno == ENOSYS || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_RCVBUFFORCE */
#endif /* SO_RCVBUF */

	return 0;
}

static int
set_sndbuf(struct nsd_socket *sock, int snd)
{
#ifdef SO_SNDBUF
#ifdef SO_SNDBUFFORCE
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
	{
		return 1;
	}
	if(errno == EPERM || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
		strerror(errno));
	return -1;
#else /* !SO_SNDBUFFORCE */
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
	{
		return 1;
	}
	if(errno == ENOSYS || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_SNDBUFFORCE */
#endif /* SO_SNDBUF */

	return 0;
}

static int
set_nonblock(struct nsd_socket *sock)
{
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";

	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
		log_msg(LOG_ERR, "fcntl(..., O_NONBLOCK) failed for %s: %s",
			socktype, strerror(errno));
		return -1;
	}

	return 1;
}

static int
set_ipv6_v6only(struct nsd_socket *sock)
{
#ifdef INET6
#ifdef IPV6_V6ONLY
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";

	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#endif /* IPV6_V6ONLY */
#endif /* INET6 */

	return 0;
}

static int
set_ipv6_use_min_mtu(struct nsd_socket *sock)
{
#if defined(INET6) && (defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU))
#if defined(IPV6_USE_MIN_MTU)
	/* There is no fragmentation of IPv6 datagrams during forwarding in the
	 * network. Therefore we do not send UDP datagrams larger than the
	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
	 * larger if the network stack supports IPV6_USE_MIN_MTU.
	 */
	int opt = IPV6_USE_MIN_MTU;
	int optval = 1;
	static const char optname[] = "IPV6_USE_MIN_MTU";
#elif defined(IPV6_MTU)
	/* On Linux, PMTUD is disabled by default for datagrams, so set the
	 * MTU to the minimum MTU to get the same behaviour.
	 */
	int opt = IPV6_MTU;
	int optval = IPV6_MIN_MTU;
	static const char optname[] = "IPV6_MTU";
#endif
	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
		optname, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* INET6 */

	return 0;
}
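
/*
 * With the minimum MTU in effect, the largest UDP payload that fits in one
 * unfragmented IPv6 datagram is 1280 - 40 (IPv6 header) - 8 (UDP header) =
 * 1232 octets, which is why 1232 is commonly advertised as a safe EDNS0
 * buffer size.
 */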

static int
set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
{
	int ret = 0;

#if defined(IP_MTU_DISCOVER)
	int opt = IP_MTU_DISCOVER;
	int optval;
# if defined(IP_PMTUDISC_OMIT)
	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
	 * information and send packets with DF=0. Fragmentation is allowed if
	 * and only if the packet size exceeds the outgoing interface MTU or
	 * the packet encounters a smaller MTU link in the network. This
	 * mitigates DNS fragmentation attacks by preventing forged PMTU
	 * information. FreeBSD already has the same semantics without
	 * setting the option.
	 */
	optval = IP_PMTUDISC_OMIT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
# endif /* IP_PMTUDISC_OMIT */
# if defined(IP_PMTUDISC_DONT)
	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed or is undefined. */
	optval = IP_PMTUDISC_DONT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
# endif
	ret = -1;
#elif defined(IP_DONTFRAG)
	int off = 0;
	if (0 == setsockopt(
		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
		strerror(errno));
	ret = -1;
#else
	(void)sock;
#endif

	return ret;
}

static int
set_ip_freebind(struct nsd_socket *sock)
{
#ifdef IP_FREEBIND
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
	{
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IP_FREEBIND */

	return 0;
}

static int
set_ip_transparent(struct nsd_socket *sock)
{
	/*
	The scandalous preprocessor blob here calls for some explanation :)
	POSIX does not specify an option to bind non-local IPs, so
	platforms developed several implementation-specific options,
	all set in the same way, but with different names.
	For additional complexity, some platforms manage this setting
	differently for different address families (IPv4 vs IPv6).
	The scandalous preprocessor blob below abstracts such variability
	in a way that leaves the C code as lean and clear as possible.
	*/

#if defined(IP_TRANSPARENT)
#	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_TRANSPARENT
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_TRANSPARENT"
	/* as of 2020-01, Linux does not support this on IPv6 programmatically */
#elif defined(SO_BINDANY)
#	define NSD_SOCKET_OPTION_TRANSPARENT						SO_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		SOL_SOCKET
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"SO_BINDANY"
#elif defined(IP_BINDANY)
#	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT6						IPV6_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6	IPPROTO_IPV6
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_BINDANY"
#endif

#ifndef NSD_SOCKET_OPTION_TRANSPARENT
	(void)sock;
#else
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT6
#		define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
#	endif
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
#		define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
#	endif
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
#		define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
#	endif

	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	const int is_ip6 = (sock->addr.ai_family == AF_INET6);

	if(0 == setsockopt(
		sock->s,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
		&on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno));
	return -1;
#endif

	return 0;
}

static int
set_tcp_maxseg(struct nsd_socket *sock, int mss)
{
#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
		strerror(errno));
	return -1;
#else
	(void)sock; (void)mss;
	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
#endif
	return 0;
}

#ifdef USE_TCP_FASTOPEN
static int
set_tcp_fastopen(struct nsd_socket *sock)
{
	/* qlen specifies how many outstanding TFO requests to allow. Limit is
	 * a defense against IP spoofing attacks as suggested in RFC7413.
	 */
	int qlen;

#ifdef __APPLE__
	/* The macOS implementation only supports a qlen of 1 via this call.
	 * The actual value is configured by the net.inet.tcp.fastopen_backlog
	 * kernel parameter.
	 */
	qlen = 1;
#else
	/* 5 is recommended on Linux. */
	qlen = 5;
#endif
	if (0 == setsockopt(
		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
	{
		return 1;
	}

	if (errno == EPERM) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s; "
				 "this is likely because the sysctl "
				 "net.inet.tcp.fastopen.enabled, "
				 "net.inet.tcp.fastopen.server_enable, or "
				 "net.ipv4.tcp_fastopen is disabled",
			strerror(errno));
	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
	 * disabled, except when verbosity is enabled for debugging
	 */
	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
			strerror(errno));
	}

	return (errno == ENOPROTOOPT ? 0 : -1);
}
#endif /* USE_TCP_FASTOPEN */

static int
set_bindtodevice(struct nsd_socket *sock)
{
#if defined(SO_BINDTODEVICE)
	if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
		sock->device, strlen(sock->device)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		                 "SO_BINDTODEVICE", sock->device, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}

static int
set_setfib(struct nsd_socket *sock)
{
#if defined(SO_SETFIB)
	if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
	              (const void *)&sock->fib, sizeof(sock->fib)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
		                 "SO_SETFIB", sock->fib, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}

static int
open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
	int rcv = 1*1024*1024, snd = 1*1024*1024;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
				"not supported");
			return 0;
		}
#endif
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	if(nsd->options->receive_buffer_size > 0)
		rcv = nsd->options->receive_buffer_size;
	if(set_rcvbuf(sock, rcv) == -1)
		return -1;

	if(nsd->options->send_buffer_size > 0)
		snd = nsd->options->send_buffer_size;
	if(set_sndbuf(sock, snd) == -1)
		return -1;
#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if(set_ipv6_v6only(sock) == -1 ||
		   set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	} else
#endif /* INET6 */
	if(sock->addr.ai_family == AF_INET) {
		if(set_ipv4_no_pmtu_disc(sock) == -1)
			return -1;
	}

	/* Set socket to non-blocking. Otherwise, on operating systems
	 * with thundering herd problems, the UDP recv could block
	 * after select returns readable.
	 */
	set_nonblock(sock);

	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

	return 1;
}

static int
open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
#ifdef USE_TCP_FASTOPEN
	report_tcp_fastopen_config();
#endif

	(void)reuseport_works;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
			                     "not supported");
			return 0;
		}
#endif /* INET6 */
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	(void)set_reuseaddr(sock);

#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if (set_ipv6_v6only(sock) == -1 ||
		    set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	}
#endif

	if(nsd->tcp_mss > 0)
		set_tcp_maxseg(sock, nsd->tcp_mss);
	/* (StevensUNP p463): if the TCP listening socket is blocking, then
	   it may block in accept, even if select() says readable. */
	(void)set_nonblock(sock);
	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

#ifdef USE_TCP_FASTOPEN
	(void)set_tcp_fastopen(sock);
#endif

	if(listen(sock->s, TCP_BACKLOG) == -1) {
		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
		return -1;
	}

	return 1;
}

/*
 * Initialize the server, reuseport, create and bind the sockets.
 */
int
server_init(struct nsd *nsd)
{
	size_t i;
	int reuseport = 1; /* Determine if REUSEPORT works. */

	/* open server interface ports */
	for(i = 0; i < nsd->ifs; i++) {
		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
		{
			return -1;
		}
	}

	if(nsd->reuseport && reuseport) {
		size_t ifs = nsd->ifs * nsd->reuseport;

		/* increase the size of the interface arrays, there are going
		 * to be separate interface file descriptors for every server
		 * instance */
		region_remove_cleanup(nsd->region, free, nsd->udp);
		region_remove_cleanup(nsd->region, free, nsd->tcp);

		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
		region_add_cleanup(nsd->region, free, nsd->udp);
		region_add_cleanup(nsd->region, free, nsd->tcp);
		if(ifs > nsd->ifs) {
			memset(&nsd->udp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->udp));
			memset(&nsd->tcp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
		}

		for(i = nsd->ifs; i < ifs; i++) {
			nsd->udp[i] = nsd->udp[i%nsd->ifs];
			nsd->udp[i].s = -1;
			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
				return -1;
			}
			/* Turn off REUSEPORT for TCP by copying the socket
			 * file descriptor.
			 * This means we should not close TCP used by
			 * other servers in reuseport enabled mode, in
			 * server_child().
			 */
			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
		}

		nsd->ifs = ifs;
	} else {
		nsd->reuseport = 0;
	}

	return 0;
}
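
/*
 * Worked example of the growth above: with two listening interfaces
 * (nsd->ifs == 2) and reuseport: 4 server instances, ifs becomes 8.
 * Slots 2..7 copy their configuration from slot i % 2, every UDP slot
 * gets its own SO_REUSEPORT socket, and the TCP slots all share the two
 * original listening descriptors.
 */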

/*
 * Prepare the server for takeoff.
 */
int
server_prepare(struct nsd *nsd)
{
#ifdef RATELIMIT
	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
#ifdef HAVE_GETRANDOM
	uint32_t v;
	if(getrandom(&v, sizeof(v), 0) == -1) {
		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
		exit(1);
	}
	hash_set_raninit(v);
#elif defined(HAVE_ARC4RANDOM)
	hash_set_raninit(arc4random());
#else
	uint32_t v = getpid() ^ time(NULL);
	srandom((unsigned long)v);
#  ifdef HAVE_SSL
	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
		hash_set_raninit(v);
	else
#  endif
		hash_set_raninit(random());
#endif
	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
		nsd->options->rrl_ratelimit,
		nsd->options->rrl_whitelist_ratelimit,
		nsd->options->rrl_slip,
		nsd->options->rrl_ipv4_prefix_length,
		nsd->options->rrl_ipv6_prefix_length);
#endif /* RATELIMIT */

	/* Open the database... */
	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
		log_msg(LOG_ERR, "unable to open the database %s: %s",
			nsd->dbfile, strerror(errno));
		unlink(nsd->task[0]->fname);
		unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		return -1;
	}
	/* check if zone files have been modified */
	/* NULL for taskudb because we send soainfo in a moment, batched up,
	 * for all zones */
	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
		nsd->options->database[0] == 0))
		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
	zonestatid_tree_set(nsd);

	compression_table_capacity = 0;
	initialize_dname_compression_tables(nsd);

#ifdef	BIND8_STATS
	/* Initialize times... */
	time(&nsd->st.boot);
	set_bind8_alarm(nsd);
#endif /* BIND8_STATS */

	return 0;
}

/*
 * Fork the required number of servers.
 */
static int
server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	size_t i;

	/* Start all child servers initially.  */
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].pid = 0;
	}

	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
}

static void
server_close_socket(struct nsd_socket *sock)
{
	if(sock->s != -1) {
		close(sock->s);
		sock->s = -1;
	}
}

void
server_close_all_sockets(struct nsd_socket sockets[], size_t n)
{
	size_t i;

	/* Close all the sockets... */
	for (i = 0; i < n; ++i) {
		server_close_socket(&sockets[i]);
	}
}

/*
 * Close the sockets, shutdown the server and exit.
 * Does not return.
 */
void
server_shutdown(struct nsd *nsd)
{
	size_t i;

	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	/* CHILD: close command channel to parent */
	if(nsd->this_child && nsd->this_child->parent_fd != -1)
	{
		close(nsd->this_child->parent_fd);
		nsd->this_child->parent_fd = -1;
	}
	/* SERVER: close command channels to children */
	if(!nsd->this_child)
	{
		for(i=0; i < nsd->child_count; ++i)
			if(nsd->children[i].child_fd != -1)
			{
				close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
			}
	}

	tsig_finalize();
#ifdef HAVE_SSL
	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
	if (nsd->tls_ctx)
		SSL_CTX_free(nsd->tls_ctx);
#endif

#ifdef MEMCLEAN /* OS collects memory pages */
#ifdef RATELIMIT
	rrl_mmap_deinit_keep_mmap();
#endif
#ifdef USE_DNSTAP
	dt_collector_destroy(nsd->dt_collector, nsd);
#endif
	udb_base_free_keep_mmap(nsd->task[0]);
	udb_base_free_keep_mmap(nsd->task[1]);
	namedb_close_udb(nsd->db); /* keeps mmap */
	namedb_close(nsd->db);
	nsd_options_destroy(nsd->options);
	region_destroy(nsd->region);
#endif
	log_finalize();
	exit(0);
}

void
server_prepare_xfrd(struct nsd* nsd)
{
	char tmpfile[256];
	/* create task mmaps */
	nsd->mytask = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[0] = task_file_create(tmpfile);
	if(!nsd->task[0]) {
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[1] = task_file_create(tmpfile);
	if(!nsd->task[1]) {
		unlink(nsd->task[0]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
	/* create xfrd listener structure */
	nsd->xfrd_listener = region_alloc(nsd->region,
		sizeof(netio_handler_type));
	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
	nsd->xfrd_listener->fd = -1;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
		nsd;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
		xfrd_tcp_create(nsd->region, QIOBUFSZ);
}


void
server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
{
	pid_t pid;
	int sockets[2] = {0,0};
	struct ipc_handler_conn_data *data;

	if(nsd->xfrd_listener->fd != -1)
		close(nsd->xfrd_listener->fd);
	if(del_db) {
		/* recreate taskdb that xfrd was using, it may be corrupt */
		/* we (or reload) use nsd->mytask, and xfrd uses the other */
		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
		nsd->task[1-nsd->mytask]->fname = NULL;
		/* free alloc already, so udb does not shrink itself */
		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
		nsd->task[1-nsd->mytask]->alloc = NULL;
		udb_base_free(nsd->task[1-nsd->mytask]);
		/* create new file, overwrite the old one */
		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
		free(tmpfile);
	}
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
		return;
	}
	pid = fork();
	switch (pid) {
	case -1:
		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
		break;
	default:
		/* PARENT: close first socket, use second one */
		close(sockets[0]);
		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		if(del_db) xfrd_free_namedb(nsd);
		/* use other task than I am using, since if xfrd died and is
		 * restarted, the reload is using nsd->mytask */
		nsd->mytask = 1 - nsd->mytask;

#ifdef HAVE_SETPROCTITLE
		setproctitle("xfrd");
#endif
#ifdef HAVE_CPUSET_T
		if(nsd->use_cpu_affinity) {
			set_cpu_affinity(nsd->xfrd_cpuset);
		}
#endif

		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
		/* NOTREACHED */
		break;
	case 0:
		/* CHILD: close second socket, use first one */
		close(sockets[1]);
		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		nsd->xfrd_listener->fd = sockets[0];
		break;
	}
	/* server-parent only */
	nsd->xfrd_listener->timeout = NULL;
	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
	/* clear ongoing ipc reads */
	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
	data->conn->is_reading = 0;
}

/** add all soainfo to taskdb */
static void
add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
{
	struct radnode* n;
	udb_ptr task_last; /* last task, mytask is empty so NULL */
	/* add all SOA INFO to mytask */
	udb_ptr_init(&task_last, taskudb);
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
	}
	udb_ptr_unlink(&task_last, taskudb);
}

void
server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
{
	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
	 *   parent fills one taskdb with soas, xfrd fills other with expires.
	 *   then they exchange and process.
	 * shortsoa: xfrd crashed and was restarted, and one taskdb may be
	 *   in use by reload.  Fill SOA in taskdb and give to xfrd.
	 *   expire notifications can be sent back via a normal reload later
	 *   (xfrd will wait for current running reload to finish if any).
	 */
	sig_atomic_t cmd = 0;
	pid_t mypid;
	int xfrd_sock = nsd->xfrd_listener->fd;
	struct udb_base* taskudb = nsd->task[nsd->mytask];
	udb_ptr t;
	if(!shortsoa) {
		if(nsd->signal_hint_shutdown) {
		shutdown:
			log_msg(LOG_WARNING, "signal received, shutting down...");
			server_close_all_sockets(nsd->udp, nsd->ifs);
			server_close_all_sockets(nsd->tcp, nsd->ifs);
#ifdef HAVE_SSL
			daemon_remote_close(nsd->rc);
#endif
			/* Unlink it if possible... */
			unlinkpid(nsd->pidfile);
			unlink(nsd->task[0]->fname);
			unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
			unlink(nsd->zonestatfname[0]);
			unlink(nsd->zonestatfname[1]);
#endif
			/* write the nsd.db to disk, wait for it to complete */
			udb_base_sync(nsd->db->udb, 1);
			udb_base_close(nsd->db->udb);
			server_shutdown(nsd);
			/* NOTREACHED */
			exit(0);
		}
	}
	if(shortsoa) {
		/* put SOA in xfrd task because mytask may be in use */
		taskudb = nsd->task[1-nsd->mytask];
	}

	add_all_soa_to_task(nsd, taskudb);
	if(!shortsoa) {
		/* wait for xfrd to signal task is ready, RELOAD signal */
		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
			cmd != NSD_RELOAD) {
			log_msg(LOG_ERR, "did not get start signal from xfrd");
			exit(1);
		}
		if(nsd->signal_hint_shutdown) {
			goto shutdown;
		}
	}
	/* give xfrd our task, signal it with RELOAD_DONE */
	task_process_sync(taskudb);
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
			(int)nsd->pid, strerror(errno));
	}
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	if(!shortsoa) {
		/* process the xfrd task work items (expiry data) */
		nsd->mytask = 1 - nsd->mytask;
		taskudb = nsd->task[nsd->mytask];
		task_remap(taskudb);
		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
		while(!udb_ptr_is_null(&t)) {
			task_process_expire(nsd->db, TASKLIST(&t));
			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
		}
		udb_ptr_unlink(&t, taskudb);
		task_clear(taskudb);

		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
		cmd = NSD_RELOAD_DONE;
		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
				(int)nsd->pid, strerror(errno));
		}
	}
}

#ifdef HAVE_SSL
static void
log_crypto_from_err(const char* str, unsigned long err)
{
	/* error:[error code]:[library name]:[function name]:[reason string] */
	char buf[128];
	unsigned long e;
	ERR_error_string_n(err, buf, sizeof(buf));
	log_msg(LOG_ERR, "%s crypto %s", str, buf);
	while( (e=ERR_get_error()) ) {
		ERR_error_string_n(e, buf, sizeof(buf));
		log_msg(LOG_ERR, "and additionally crypto %s", buf);
	}
}

void
log_crypto_err(const char* str)
{
	log_crypto_from_err(str, ERR_get_error());
}

/** true if the ssl handshake error has to be squelched from the logs */
static int
squelch_err_ssl_handshake(unsigned long err)
{
	if(verbosity >= 3)
		return 0; /* only squelch on low verbosity */
	/* this is very specific, we could filter on ERR_GET_REASON()
	 * (the third element in ERR_PACK) */
	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
#ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
#endif
#ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
#  ifdef SSL_R_VERSION_TOO_LOW
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
#  endif
#endif
		)
		return 1;
	return 0;
}

void
perform_openssl_init(void)
{
	/* init SSL library */
#ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
	ERR_load_crypto_strings();
#endif
	ERR_load_SSL_strings();
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
	OpenSSL_add_all_algorithms();
#else
	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
		| OPENSSL_INIT_ADD_ALL_DIGESTS
		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
#endif
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
	(void)SSL_library_init();
#else
	OPENSSL_init_ssl(0, NULL);
#endif

	if(!RAND_status()) {
		/* try to seed it */
		unsigned char buf[256];
		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
		size_t i;
		v = seed;
		for(i=0; i<256/sizeof(v); i++) {
			memmove(buf+i*sizeof(v), &v, sizeof(v));
			v = v*seed + (unsigned int)i;
		}
		RAND_seed(buf, 256);
		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
	}
}
1802 
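/*
 * Read a DER-encoded OCSP response from filename into a freshly
 * malloc()ed buffer.  On success *ocsp points at the buffer and the
 * encoded length is returned; on failure -1 is returned and *ocsp is
 * left untouched.  The caller owns the buffer.
 */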
1803 static int
1804 get_ocsp(char *filename, unsigned char **ocsp)
1805 {
1806 	BIO *bio;
1807 	OCSP_RESPONSE *response;
1808 	int len = -1;
1809 	unsigned char *p, *buf;
1810 	assert(filename);
1811 
1812 	if ((bio = BIO_new_file(filename, "r")) == NULL) {
1813 		log_crypto_err("get_ocsp: BIO_new_file failed");
1814 		return -1;
1815 	}
1816 
1817 	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
1818 		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
1819 		BIO_free(bio);
1820 		return -1;
1821 	}
1822 
1823 	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
1824 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
1825 		OCSP_RESPONSE_free(response);
1826 		BIO_free(bio);
1827 		return -1;
1828 	}
1829 
1830 	if ((buf = malloc((size_t) len)) == NULL) {
1831 		log_msg(LOG_ERR, "get_ocsp: malloc failed");
1832 		OCSP_RESPONSE_free(response);
1833 		BIO_free(bio);
1834 		return -1;
1835 	}
1836 
1837 	p = buf;
1838 	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
1839 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
1840 		free(buf);
1841 		OCSP_RESPONSE_free(response);
1842 		BIO_free(bio);
1843 		return -1;
1844 	}
1845 
1846 	OCSP_RESPONSE_free(response);
1847 	BIO_free(bio);
1848 
1849 	*ocsp = buf;
1850 	return len;
1851 }
1852 
1853 /* further setup ssl ctx after the keys are loaded */
1854 static void
1855 listen_sslctx_setup_2(void* ctxt)
1856 {
1857 	SSL_CTX* ctx = (SSL_CTX*)ctxt;
1858 	(void)ctx;
1859 #if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
1860 	if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
1861 		/* NOTREACHED */
1862 		log_crypto_err("Error in SSL_CTX_set_ecdh_auto, not enabling ECDHE");
1863 	}
1864 #elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
1865 	if(1) {
1866 		EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
1867 		if (!ecdh) {
1868 			log_crypto_err("could not find p256, not enabling ECDHE");
1869 		} else {
1870 			if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
1871 				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
1872 			}
1873 			EC_KEY_free (ecdh);
1874 		}
1875 	}
1876 #endif
1877 }
1878 
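/*
 * OCSP stapling callback: hand OpenSSL a copy of the loaded response.
 * The copy is required because OpenSSL takes ownership of (and later
 * frees) the buffer passed to SSL_set_tlsext_status_ocsp_resp().
 */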
1879 static int
1880 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
1881 {
1882 	if(ocspdata) {
1883 		unsigned char *p;
1884 		if ((p=malloc(ocspdata_len)) == NULL) {
1885 			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
1886 			return SSL_TLSEXT_ERR_NOACK;
1887 		}
1888 		memcpy(p, ocspdata, ocspdata_len);
1889 		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
1890 			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
1891 			free(p);
1892 			return SSL_TLSEXT_ERR_NOACK;
1893 		}
1894 		return SSL_TLSEXT_ERR_OK;
1895 	} else {
1896 		return SSL_TLSEXT_ERR_NOACK;
1897 	}
1898 }
1899 
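/*
 * Create and harden an SSL_CTX for the TLS listener: disable the
 * legacy protocol versions and client renegotiation, prefer server
 * ciphers, load the key/certificate pair, and optionally require
 * client certificates signed by verifypem.  Returns NULL on failure.
 */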
1900 SSL_CTX*
1901 server_tls_ctx_setup(char* key, char* pem, char* verifypem)
1902 {
1903 	SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method());
1904 	if(!ctx) {
1905 		log_crypto_err("could not SSL_CTX_new");
1906 		return NULL;
1907 	}
1908 	/* no SSLv2, SSLv3 because they have defects */
1909 #if SSL_OP_NO_SSLv2 != 0
1910 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){
1911 		log_crypto_err("could not set SSL_OP_NO_SSLv2");
1912 		SSL_CTX_free(ctx);
1913 		return NULL;
1914 	}
1915 #endif
1916 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3)
1917 		!= SSL_OP_NO_SSLv3){
1918 		log_crypto_err("could not set SSL_OP_NO_SSLv3");
1919 		SSL_CTX_free(ctx);
1920 		return NULL;
1921 	}
1922 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1)
1923 	/* if we have tls 1.1 disable 1.0 */
1924 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1)
1925 		!= SSL_OP_NO_TLSv1){
1926 		log_crypto_err("could not set SSL_OP_NO_TLSv1");
1927 		SSL_CTX_free(ctx);
1928 		return NULL;
1929 	}
1930 #endif
1931 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2)
1932 	/* if we have tls 1.2 disable 1.1 */
1933 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1)
1934 		!= SSL_OP_NO_TLSv1_1){
1935 		log_crypto_err("could not set SSL_OP_NO_TLSv1_1");
1936 		SSL_CTX_free(ctx);
1937 		return 0;
1938 		return NULL;
1939 #endif
1940 #if defined(SSL_OP_NO_RENEGOTIATION)
1941 	/* disable client renegotiation */
1942 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) &
1943 		SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) {
1944 		log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION");
1945 		SSL_CTX_free(ctx);
1946 		return NULL;
1947 	}
1948 #endif
1949 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20)
1950 	/* if we have sha256, set the cipher list to have no known vulns */
1951 	if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20"))
1952 		log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list");
1953 #endif
1954 	if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) &
1955 		SSL_OP_CIPHER_SERVER_PREFERENCE) !=
1956 		SSL_OP_CIPHER_SERVER_PREFERENCE) {
1957 		log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE");
1958 		SSL_CTX_free(ctx);
1959 		return NULL;
1960 	}
1961 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL
1962 	SSL_CTX_set_security_level(ctx, 0);
1963 #endif
1964 	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
1965 		log_msg(LOG_ERR, "error for cert file: %s", pem);
1966 		log_crypto_err("error in SSL_CTX use_certificate_chain_file");
1967 		SSL_CTX_free(ctx);
1968 		return NULL;
1969 	}
1970 	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
1971 		log_msg(LOG_ERR, "error for private key file: %s", key);
1972 		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
1973 		SSL_CTX_free(ctx);
1974 		return NULL;
1975 	}
1976 	if(!SSL_CTX_check_private_key(ctx)) {
1977 		log_msg(LOG_ERR, "error for key file: %s", key);
1978 		log_crypto_err("Error in SSL_CTX check_private_key");
1979 		SSL_CTX_free(ctx);
1980 		return NULL;
1981 	}
1982 	listen_sslctx_setup_2(ctx);
1983 	if(verifypem && verifypem[0]) {
1984 		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
1985 			log_crypto_err("Error in SSL_CTX verify locations");
1986 			SSL_CTX_free(ctx);
1987 			return NULL;
1988 		}
1989 		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
1990 		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
1991 	}
1992 	return ctx;
1993 }
1994 
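/*
 * Build the server TLS context from the configured tls-service-key
 * and tls-service-pem files, optionally attaching an OCSP response
 * for stapling.  Returns NULL (with the error logged) on failure.
 */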
1995 SSL_CTX*
1996 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
1997 {
1998 	char *key, *pem;
1999 	SSL_CTX *ctx;
2000 
2001 	key = nsd->options->tls_service_key;
2002 	pem = nsd->options->tls_service_pem;
2003 	if(!key || key[0] == 0) {
2004 		log_msg(LOG_ERR, "error: no tls-service-key file specified");
2005 		return NULL;
2006 	}
2007 	if(!pem || pem[0] == 0) {
2008 		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
2009 		return NULL;
2010 	}
2011 
2012 	/* NOTE: This mimics the existing code in Unbound 1.5.1 by supporting SSL,
2013 	 * but draft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2. */
2014 	ctx = server_tls_ctx_setup(key, pem, verifypem);
2015 	if(!ctx) {
2016 		log_msg(LOG_ERR, "could not setup server TLS context");
2017 		return NULL;
2018 	}
2019 	if(ocspfile && ocspfile[0]) {
2020 		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
2021 			log_crypto_err("Error reading OCSPfile");
2022 			SSL_CTX_free(ctx);
2023 			return NULL;
2024 		} else {
2025 			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
2026 			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
2027 				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
2028 				SSL_CTX_free(ctx);
2029 				return NULL;
2030 			}
2031 		}
2032 	}
2033 	return ctx;
2034 }
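#if 0
/* Illustrative sketch only (never compiled): how a caller might set up
 * the TLS context at startup.  The verifypem and ocspfile variables
 * are hypothetical stand-ins for the configured option values. */
nsd->tls_ctx = server_tls_ctx_create(nsd, verifypem, ocspfile);
if(!nsd->tls_ctx) {
	log_msg(LOG_ERR, "cannot set up TLS context");
	exit(1);
}
#endif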
2035 
2036 /* check if the tcp_accept_handler_data is created for the dedicated TLS port */
2037 int
2038 using_tls_port(struct sockaddr* addr, const char* tls_port)
2039 {
2040 	in_port_t port = 0;
2041 
2042 	if (addr->sa_family == AF_INET)
2043 		port = ((struct sockaddr_in*)addr)->sin_port;
2044 #ifdef HAVE_STRUCT_SOCKADDR_IN6
2045 	else
2046 		port = ((struct sockaddr_in6*)addr)->sin6_port;
2047 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */
2048 	if (atoi(tls_port) == ntohs(port))
2049 		return 1;
2050 
2051 	return 0;
2052 }
2053 #endif
2054 
2055 /* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
2056 ssize_t
2057 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
2058 {
2059 	uint8_t* buf = (uint8_t*) p;
2060 	ssize_t total = 0;
2061 	struct pollfd fd;
2062 	memset(&fd, 0, sizeof(fd));
2063 	fd.fd = s;
2064 	fd.events = POLLIN;
2065 
2066 	while( total < sz) {
2067 		ssize_t ret;
2068 		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
2069 		if(ret == -1) {
2070 			if(errno == EAGAIN)
2071 				/* blocking read */
2072 				continue;
2073 			if(errno == EINTR) {
2074 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2075 					return -1;
2076 				/* other signals can be handled later */
2077 				continue;
2078 			}
2079 			/* some error */
2080 			return -1;
2081 		}
2082 		if(ret == 0) {
2083 			/* operation timed out */
2084 			return -2;
2085 		}
2086 		ret = read(s, buf+total, sz-total);
2087 		if(ret == -1) {
2088 			if(errno == EAGAIN)
2089 				/* blocking read */
2090 				continue;
2091 			if(errno == EINTR) {
2092 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2093 					return -1;
2094 				/* other signals can be handled later */
2095 				continue;
2096 			}
2097 			/* some error */
2098 			return -1;
2099 		}
2100 		if(ret == 0) {
2101 			/* closed connection! */
2102 			return 0;
2103 		}
2104 		total += ret;
2105 	}
2106 	return total;
2107 }
2108 
2109 static void
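/*
 * Process the tasks xfrd queued in the task udb: walk the list,
 * handle each task and append the results after last_task.  Between
 * tasks the cmdsocket is polled so that an NSD_QUIT from the parent
 * aborts the reload promptly and cleans up the remaining xfr files.
 */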
2110 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
2111 {
2112 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2113 	udb_ptr t, next;
2114 	udb_base* u = nsd->task[nsd->mytask];
2115 	udb_ptr_init(&next, u);
2116 	udb_ptr_new(&t, u, udb_base_get_userdata(u));
2117 	udb_base_set_userdata(u, 0);
2118 	while(!udb_ptr_is_null(&t)) {
2119 		/* store next in list so this one can be deleted or reused */
2120 		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
2121 		udb_rptr_zero(&TASKLIST(&t)->next, u);
2122 
2123 		/* process task t */
2124 		/* append results for task t and update last_task */
2125 		task_process_in_reload(nsd, u, last_task, &t);
2126 
2127 		/* go to next */
2128 		udb_ptr_set_ptr(&t, u, &next);
2129 
2130 		/* if the parent has quit, we must quit too, poll the fd for cmds */
2131 		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2132 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2133 			if(cmd == NSD_QUIT) {
2134 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2135 				/* sync to disk (if needed) */
2136 				udb_base_sync(nsd->db->udb, 0);
2137 				/* unlink files of remainder of tasks */
2138 				while(!udb_ptr_is_null(&t)) {
2139 					if(TASKLIST(&t)->task_type == task_apply_xfr) {
2140 						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
2141 					}
2142 					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
2143 				}
2144 				udb_ptr_unlink(&t, u);
2145 				udb_ptr_unlink(&next, u);
2146 				exit(0);
2147 			}
2148 		}
2149 
2150 	}
2151 	udb_ptr_unlink(&t, u);
2152 	udb_ptr_unlink(&next, u);
2153 }
2154 
2155 #ifdef BIND8_STATS
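/* send the old parent's statistics counters to the reload process */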
2156 static void
2157 parent_send_stats(struct nsd* nsd, int cmdfd)
2158 {
2159 	size_t i;
2160 	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
2161 		log_msg(LOG_ERR, "could not write stats to reload");
2162 		return;
2163 	}
2164 	for(i=0; i<nsd->child_count; i++)
2165 		if(!write_socket(cmdfd, &nsd->children[i].query_count,
2166 			sizeof(stc_type))) {
2167 			log_msg(LOG_ERR, "could not write stats to reload");
2168 			return;
2169 		}
2170 }
2171 
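/* receive the statistics from the old parent and store them in the
 * task udb, where xfrd picks them up */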
2172 static void
2173 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
2174 {
2175 	struct nsdst s;
2176 	stc_type* p;
2177 	size_t i;
2178 	if(block_read(nsd, cmdfd, &s, sizeof(s),
2179 		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
2180 		log_msg(LOG_ERR, "could not read stats from old parent");
2181 		return;
2182 	}
2183 	s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
2184 	s.db_mem = region_get_mem(nsd->db->region);
2185 	p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
2186 		nsd->child_count);
2187 	if(!p) return;
2188 	for(i=0; i<nsd->child_count; i++) {
2189 		if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!=
2190 			sizeof(stc_type))
2191 			return;
2192 	}
2193 }
2194 #endif /* BIND8_STATS */
2195 
2196 /*
2197  * Reload the database, stop the parent, re-fork the children and
2198  * continue as server_main.
2199  */
2200 static void
2201 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
2202 	int cmdsocket)
2203 {
2204 	pid_t mypid;
2205 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2206 	int ret;
2207 	udb_ptr last_task;
2208 	struct sigaction old_sigchld, ign_sigchld;
2209 	/* ignore SIGCHLD from the previous server_main that used this pid */
2210 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
2211 	ign_sigchld.sa_handler = SIG_IGN;
2212 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
2213 
2214 #ifdef HAVE_SETPROCTITLE
2215 	setproctitle("main");
2216 #endif
2217 #ifdef HAVE_CPUSET_T
2218 	if(nsd->use_cpu_affinity) {
2219 		set_cpu_affinity(nsd->cpuset);
2220 	}
2221 #endif
2222 
2223 	/* see what tasks we got from xfrd */
2224 	task_remap(nsd->task[nsd->mytask]);
2225 	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
2226 	udb_compact_inhibited(nsd->db->udb, 1);
2227 	reload_process_tasks(nsd, &last_task, cmdsocket);
2228 	udb_compact_inhibited(nsd->db->udb, 0);
2229 	udb_compact(nsd->db->udb);
2230 
2231 #ifndef NDEBUG
2232 	if(nsd_debug_level >= 1)
2233 		region_log_stats(nsd->db->region);
2234 #endif /* NDEBUG */
2235 	/* sync to disk (if needed) */
2236 	udb_base_sync(nsd->db->udb, 0);
2237 
2238 	initialize_dname_compression_tables(nsd);
2239 
2240 #ifdef BIND8_STATS
2241 	/* Restart dumping stats if required.  */
2242 	time(&nsd->st.boot);
2243 	set_bind8_alarm(nsd);
2244 #endif
2245 #ifdef USE_ZONE_STATS
2246 	server_zonestat_realloc(nsd); /* realloc for new children */
2247 	server_zonestat_switch(nsd);
2248 #endif
2249 
2250 	/* listen for the signals of failed children again */
2251 	sigaction(SIGCHLD, &old_sigchld, NULL);
2252 	/* Start new child processes */
2253 	if (server_start_children(nsd, server_region, netio, &nsd->
2254 		xfrd_listener->fd) != 0) {
2255 		send_children_quit(nsd);
2256 		exit(1);
2257 	}
2258 
2259 	/* if the parent has quit, we must quit too, poll the fd for cmds */
2260 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2261 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2262 		if(cmd == NSD_QUIT) {
2263 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2264 			send_children_quit(nsd);
2265 			exit(0);
2266 		}
2267 	}
2268 
2269 	/* Send quit command to parent: blocking, wait for receipt. */
2270 	do {
2271 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
2272 		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
2273 		{
2274 			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
2275 				strerror(errno));
2276 		}
2277 		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
2278 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
2279 		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
2280 			RELOAD_SYNC_TIMEOUT);
2281 		if(ret == -2) {
2282 			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
2283 		}
2284 	} while (ret == -2);
2285 	if(ret == -1) {
2286 		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
2287 			strerror(errno));
2288 	}
2289 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
2290 	if(cmd == NSD_QUIT) {
2291 		/* small race condition possible here, parent got quit cmd. */
2292 		send_children_quit(nsd);
2293 		exit(1);
2294 	}
2295 	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
2296 #ifdef BIND8_STATS
2297 	reload_do_stats(cmdsocket, nsd, &last_task);
2298 #endif
2299 	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
2300 	task_process_sync(nsd->task[nsd->mytask]);
2301 #ifdef USE_ZONE_STATS
2302 	server_zonestat_realloc(nsd); /* realloc for next children */
2303 #endif
2304 
2305 	/* send soainfo to the xfrd process, signal it that reload is done,
2306 	 * it picks up the taskudb */
2307 	cmd = NSD_RELOAD_DONE;
2308 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
2309 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
2310 			strerror(errno));
2311 	}
2312 	mypid = getpid();
2313 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2314 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2315 			strerror(errno));
2316 	}
2317 
2318 	/* try to reopen file */
2319 	if (nsd->file_rotation_ok)
2320 		log_reopen(nsd->log_filename, 1);
2321 	/* exit reload, continue as new server_main */
2322 }
2323 
2324 /*
2325  * Get the mode depending on the signal hints that have been received.
2326  * Multiple signal hints can be received and will be handled in turn.
2327  */
2328 static sig_atomic_t
2329 server_signal_mode(struct nsd *nsd)
2330 {
2331 	if(nsd->signal_hint_quit) {
2332 		nsd->signal_hint_quit = 0;
2333 		return NSD_QUIT;
2334 	}
2335 	else if(nsd->signal_hint_shutdown) {
2336 		nsd->signal_hint_shutdown = 0;
2337 		return NSD_SHUTDOWN;
2338 	}
2339 	else if(nsd->signal_hint_child) {
2340 		nsd->signal_hint_child = 0;
2341 		return NSD_REAP_CHILDREN;
2342 	}
2343 	else if(nsd->signal_hint_reload) {
2344 		nsd->signal_hint_reload = 0;
2345 		return NSD_RELOAD;
2346 	}
2347 	else if(nsd->signal_hint_reload_hup) {
2348 		nsd->signal_hint_reload_hup = 0;
2349 		return NSD_RELOAD_REQ;
2350 	}
2351 	else if(nsd->signal_hint_stats) {
2352 		nsd->signal_hint_stats = 0;
2353 #ifdef BIND8_STATS
2354 		set_bind8_alarm(nsd);
2355 #endif
2356 		return NSD_STATS;
2357 	}
2358 	else if(nsd->signal_hint_statsusr) {
2359 		nsd->signal_hint_statsusr = 0;
2360 		return NSD_STATS;
2361 	}
2362 	return NSD_RUN;
2363 }
2364 
2365 /*
2366  * The main server simply waits for signals and child processes to
2367  * terminate.  Child processes are restarted as necessary.
2368  */
2369 void
2370 server_main(struct nsd *nsd)
2371 {
2372 	region_type *server_region = region_create(xalloc, free);
2373 	netio_type *netio = netio_create(server_region);
2374 	netio_handler_type reload_listener;
2375 	int reload_sockets[2] = {-1, -1};
2376 	struct timespec timeout_spec;
2377 	int status;
2378 	pid_t child_pid;
2379 	pid_t reload_pid = -1;
2380 	sig_atomic_t mode;
2381 
2382 	/* Ensure we are the main process */
2383 	assert(nsd->server_kind == NSD_SERVER_MAIN);
2384 
2385 	/* Add listener for the XFRD process */
2386 	netio_add_handler(netio, nsd->xfrd_listener);
2387 
2388 	/* Start the child processes that handle incoming queries */
2389 	if (server_start_children(nsd, server_region, netio,
2390 		&nsd->xfrd_listener->fd) != 0) {
2391 		send_children_quit(nsd);
2392 		exit(1);
2393 	}
2394 	reload_listener.fd = -1;
2395 
2396 	/* This_child MUST be 0, because this is the parent process */
2397 	assert(nsd->this_child == 0);
2398 
2399 	/* Run the server until we get a shutdown signal */
2400 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
2401 		/* Did we receive a signal that changes our mode? */
2402 		if(mode == NSD_RUN) {
2403 			nsd->mode = mode = server_signal_mode(nsd);
2404 		}
2405 
2406 		switch (mode) {
2407 		case NSD_RUN:
2408 			/* see if any child processes terminated */
2409 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
2410 				int is_child = delete_child_pid(nsd, child_pid);
2411 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
2412 					if(nsd->children[is_child].child_fd == -1)
2413 						nsd->children[is_child].has_exited = 1;
2414 					parent_check_all_children_exited(nsd);
2415 				} else if(is_child != -1) {
2416 					log_msg(LOG_WARNING,
2417 					       "server %d died unexpectedly with status %d, restarting",
2418 					       (int) child_pid, status);
2419 					restart_child_servers(nsd, server_region, netio,
2420 						&nsd->xfrd_listener->fd);
2421 				} else if (child_pid == reload_pid) {
2422 					sig_atomic_t cmd = NSD_RELOAD_DONE;
2423 					pid_t mypid;
2424 					log_msg(LOG_WARNING,
2425 					       "Reload process %d failed with status %d, continuing with old database",
2426 					       (int) child_pid, status);
2427 					reload_pid = -1;
2428 					if(reload_listener.fd != -1) close(reload_listener.fd);
2429 					reload_listener.fd = -1;
2430 					reload_listener.event_types = NETIO_EVENT_NONE;
2431 					task_process_sync(nsd->task[nsd->mytask]);
2432 					/* inform xfrd reload attempt ended */
2433 					if(!write_socket(nsd->xfrd_listener->fd,
2434 						&cmd, sizeof(cmd))) {
2435 						log_msg(LOG_ERR, "problems "
2436 						  "sending SOAEND to xfrd: %s",
2437 						  strerror(errno));
2438 					}
2439 					mypid = getpid();
2440 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2441 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2442 							strerror(errno));
2443 					}
2444 				} else if(status != 0) {
2445 					/* check the status, because we also
2446 					 * see the old server_main here (reload
2447 					 * is the process-parent of the old
2448 					 * main) and older server processes
2449 					 * that are exiting after a reload */
2450 					log_msg(LOG_WARNING,
2451 					       "process %d terminated with status %d",
2452 					       (int) child_pid, status);
2453 				}
2454 			}
2455 			if (child_pid == -1) {
2456 				if (errno == EINTR) {
2457 					continue;
2458 				}
2459 				if (errno != ECHILD)
2460 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
2461 			}
2462 			if (nsd->mode != NSD_RUN)
2463 				break;
2464 
2465 			/* timeout to collect processes, in case no SIGCHLD happens */
2466 			timeout_spec.tv_sec = 60;
2467 			timeout_spec.tv_nsec = 0;
2468 
2469 			/* listen on ports, timeout for collecting terminated children */
2470 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
2471 				if (errno != EINTR) {
2472 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
2473 				}
2474 			}
2475 			if(nsd->restart_children) {
2476 				restart_child_servers(nsd, server_region, netio,
2477 					&nsd->xfrd_listener->fd);
2478 				nsd->restart_children = 0;
2479 			}
2480 			if(nsd->reload_failed) {
2481 				sig_atomic_t cmd = NSD_RELOAD_DONE;
2482 				pid_t mypid;
2483 				nsd->reload_failed = 0;
2484 				log_msg(LOG_WARNING,
2485 				       "Reload process %d failed, continuing with old database",
2486 				       (int) reload_pid);
2487 				reload_pid = -1;
2488 				if(reload_listener.fd != -1) close(reload_listener.fd);
2489 				reload_listener.fd = -1;
2490 				reload_listener.event_types = NETIO_EVENT_NONE;
2491 				task_process_sync(nsd->task[nsd->mytask]);
2492 				/* inform xfrd reload attempt ended */
2493 				if(!write_socket(nsd->xfrd_listener->fd,
2494 					&cmd, sizeof(cmd))) {
2495 					log_msg(LOG_ERR, "problems "
2496 					  "sending SOAEND to xfrd: %s",
2497 					  strerror(errno));
2498 				}
2499 				mypid = getpid();
2500 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2501 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2502 						strerror(errno));
2503 				}
2504 			}
2505 
2506 			break;
2507 		case NSD_RELOAD_REQ: {
2508 			sig_atomic_t cmd = NSD_RELOAD_REQ;
2509 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
2510 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
2511 				"main: ipc send reload_req to xfrd"));
2512 			if(!write_socket(nsd->xfrd_listener->fd,
2513 				&cmd, sizeof(cmd))) {
2514 				log_msg(LOG_ERR, "server_main: could not send "
2515 				"reload_req to xfrd: %s", strerror(errno));
2516 			}
2517 			nsd->mode = NSD_RUN;
2518 			} break;
2519 		case NSD_RELOAD:
2520 			/* Continue to run nsd after reload */
2521 			nsd->mode = NSD_RUN;
2522 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
2523 			if (reload_pid != -1) {
2524 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
2525 				       (int) reload_pid);
2526 				break;
2527 			}
2528 
2529 			/* switch mytask to keep track of which side owns the task udb */
2530 			nsd->mytask = 1 - nsd->mytask;
2531 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
2532 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
2533 				reload_pid = -1;
2534 				break;
2535 			}
2536 
2537 			/* Do actual reload */
2538 			reload_pid = fork();
2539 			switch (reload_pid) {
2540 			case -1:
2541 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
2542 				break;
2543 			default:
2544 				/* PARENT */
2545 				close(reload_sockets[0]);
2546 				server_reload(nsd, server_region, netio,
2547 					reload_sockets[1]);
2548 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
2549 				close(reload_sockets[1]);
2550 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
2551 				/* drop stale xfrd ipc data */
2552 				((struct ipc_handler_conn_data*)nsd->
2553 					xfrd_listener->user_data)
2554 					->conn->is_reading = 0;
2555 				reload_pid = -1;
2556 				reload_listener.fd = -1;
2557 				reload_listener.event_types = NETIO_EVENT_NONE;
2558 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
2559 				break;
2560 			case 0:
2561 				/* CHILD */
2562 				/* server_main keeps running until NSD_QUIT_SYNC
2563 				 * is received from reload. */
2564 				close(reload_sockets[1]);
2565 				reload_listener.fd = reload_sockets[0];
2566 				reload_listener.timeout = NULL;
2567 				reload_listener.user_data = nsd;
2568 				reload_listener.event_types = NETIO_EVENT_READ;
2569 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
2570 				netio_add_handler(netio, &reload_listener);
2571 				reload_pid = getppid();
2572 				break;
2573 			}
2574 			break;
2575 		case NSD_QUIT_SYNC:
2576 			/* synchronisation of xfrd, parent and reload */
2577 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
2578 				sig_atomic_t cmd = NSD_RELOAD;
2579 				/* stop xfrd ipc writes in progress */
2580 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
2581 					"main: ipc send indication reload"));
2582 				if(!write_socket(nsd->xfrd_listener->fd,
2583 					&cmd, sizeof(cmd))) {
2584 					log_msg(LOG_ERR, "server_main: could not send reload "
2585 					"indication to xfrd: %s", strerror(errno));
2586 				}
2587 				/* wait for ACK from xfrd */
2588 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
2589 				nsd->quit_sync_done = 1;
2590 			}
2591 			nsd->mode = NSD_RUN;
2592 			break;
2593 		case NSD_QUIT:
2594 			/* silent shutdown during reload */
2595 			if(reload_listener.fd != -1) {
2596 				/* acknowledge the quit, to sync reload that we will really quit now */
2597 				sig_atomic_t cmd = NSD_RELOAD;
2598 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
2599 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2600 					log_msg(LOG_ERR, "server_main: "
2601 						"could not ack quit: %s", strerror(errno));
2602 				}
2603 #ifdef BIND8_STATS
2604 				parent_send_stats(nsd, reload_listener.fd);
2605 #endif /* BIND8_STATS */
2606 				close(reload_listener.fd);
2607 			}
2608 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
2609 			/* only quit children after xfrd has acked */
2610 			send_children_quit(nsd);
2611 
2612 #ifdef MEMCLEAN /* OS collects memory pages */
2613 			region_destroy(server_region);
2614 #endif
2615 			server_shutdown(nsd);
2616 
2617 			/* ENOTREACH */
2618 			/* NOTREACHED */
2619 		case NSD_SHUTDOWN:
2620 			break;
2621 		case NSD_REAP_CHILDREN:
2622 			/* continue; wait for child in run loop */
2623 			nsd->mode = NSD_RUN;
2624 			break;
2625 		case NSD_STATS:
2626 #ifdef BIND8_STATS
2627 			set_children_stats(nsd);
2628 #endif
2629 			nsd->mode = NSD_RUN;
2630 			break;
2631 		default:
2632 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
2633 			nsd->mode = NSD_RUN;
2634 			break;
2635 		}
2636 	}
2637 	log_msg(LOG_WARNING, "signal received, shutting down...");
2638 
2639 	/* close opened ports to avoid race with restart of nsd */
2640 	server_close_all_sockets(nsd->udp, nsd->ifs);
2641 	server_close_all_sockets(nsd->tcp, nsd->ifs);
2642 #ifdef HAVE_SSL
2643 	daemon_remote_close(nsd->rc);
2644 #endif
2645 	send_children_quit_and_wait(nsd);
2646 
2647 	/* Unlink it if possible... */
2648 	unlinkpid(nsd->pidfile);
2649 	unlink(nsd->task[0]->fname);
2650 	unlink(nsd->task[1]->fname);
2651 #ifdef USE_ZONE_STATS
2652 	unlink(nsd->zonestatfname[0]);
2653 	unlink(nsd->zonestatfname[1]);
2654 #endif
2655 #ifdef USE_DNSTAP
2656 	dt_collector_close(nsd->dt_collector, nsd);
2657 #endif
2658 
2659 	if(reload_listener.fd != -1) {
2660 		sig_atomic_t cmd = NSD_QUIT;
2661 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2662 			"main: ipc send quit to reload-process"));
2663 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2664 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
2665 				strerror(errno));
2666 		}
2667 		fsync(reload_listener.fd);
2668 		close(reload_listener.fd);
2669 		/* wait for reload to finish processing */
2670 		while(1) {
2671 			if(waitpid(reload_pid, NULL, 0) == -1) {
2672 				if(errno == EINTR) continue;
2673 				if(errno == ECHILD) break;
2674 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
2675 					(int)reload_pid, strerror(errno));
2676 			}
2677 			break;
2678 		}
2679 	}
2680 	if(nsd->xfrd_listener->fd != -1) {
2681 		/* complete quit, stop xfrd */
2682 		sig_atomic_t cmd = NSD_QUIT;
2683 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2684 			"main: ipc send quit to xfrd"));
2685 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
2686 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
2687 				strerror(errno));
2688 		}
2689 		fsync(nsd->xfrd_listener->fd);
2690 		close(nsd->xfrd_listener->fd);
2691 		(void)kill(nsd->pid, SIGTERM);
2692 	}
2693 
2694 #ifdef MEMCLEAN /* OS collects memory pages */
2695 	region_destroy(server_region);
2696 #endif
2697 	/* write the nsd.db to disk, wait for it to complete */
2698 	udb_base_sync(nsd->db->udb, 1);
2699 	udb_base_close(nsd->db->udb);
2700 	server_shutdown(nsd);
2701 }
2702 
2703 static query_state_type
2704 server_process_query(struct nsd *nsd, struct query *query)
2705 {
2706 	return query_process(query, nsd);
2707 }
2708 
2709 static query_state_type
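/* process a UDP query; with RATELIMIT compiled in, apply response
 * rate limiting and possibly slip (truncate or drop) the answer */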
2710 server_process_query_udp(struct nsd *nsd, struct query *query)
2711 {
2712 #ifdef RATELIMIT
2713 	if(query_process(query, nsd) != QUERY_DISCARDED) {
2714 		if(rrl_process_query(query))
2715 			return rrl_slip(query);
2716 		else	return QUERY_PROCESSED;
2717 	}
2718 	return QUERY_DISCARDED;
2719 #else
2720 	return query_process(query, nsd);
2721 #endif
2722 }
2723 
2724 const char*
2725 nsd_event_vs(void)
2726 {
2727 #ifdef USE_MINI_EVENT
2728 	return "";
2729 #else
2730 	return event_get_version();
2731 #endif
2732 }
2733 
2734 #if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS)
2735 static const char* ub_ev_backend2str(int b)
2736 {
2737 	switch(b) {
2738 	case EVBACKEND_SELECT:	return "select";
2739 	case EVBACKEND_POLL:	return "poll";
2740 	case EVBACKEND_EPOLL:	return "epoll";
2741 	case EVBACKEND_KQUEUE:	return "kqueue";
2742 	case EVBACKEND_DEVPOLL: return "devpoll";
2743 	case EVBACKEND_PORT:	return "evport";
2744 	}
2745 	return "unknown";
2746 }
2747 #endif
2748 
2749 const char*
2750 nsd_event_method(void)
2751 {
2752 #ifdef USE_MINI_EVENT
2753 	return "select";
2754 #else
2755 	struct event_base* b = nsd_child_event_base();
2756 	const char* m = "?";
2757 #  ifdef EV_FEATURE_BACKENDS
2758 	m = ub_ev_backend2str(ev_backend((struct ev_loop*)b));
2759 #  elif defined(HAVE_EVENT_BASE_GET_METHOD)
2760 	m = event_base_get_method(b);
2761 #  endif
2762 #  ifdef MEMCLEAN
2763 	event_base_free(b);
2764 #  endif
2765 	return m;
2766 #endif
2767 }
2768 
2769 struct event_base*
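/* create the event base for a child server, using mini_event, libev
 * or libevent, whichever was selected at configure time */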
2770 nsd_child_event_base(void)
2771 {
2772 	struct event_base* base;
2773 #ifdef USE_MINI_EVENT
2774 	static time_t secs;
2775 	static struct timeval now;
2776 	base = event_init(&secs, &now);
2777 #else
2778 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
2779 	/* libev */
2780 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
2781 #  else
2782 	/* libevent */
2783 #    ifdef HAVE_EVENT_BASE_NEW
2784 	base = event_base_new();
2785 #    else
2786 	base = event_init();
2787 #    endif
2788 #  endif
2789 #endif
2790 	return base;
2791 }
2792 
2793 static void
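/* register a persistent EV_READ handler (handle_udp) for a UDP
 * socket on the child's event base */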
2794 add_udp_handler(
2795 	struct nsd *nsd,
2796 	struct nsd_socket *sock,
2797 	struct udp_handler_data *data)
2798 {
2799 	struct event *handler = &data->event;
2800 
2801 	data->nsd = nsd;
2802 	data->socket = sock;
2803 
2804 	memset(handler, 0, sizeof(*handler));
2805 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data);
2806 	if(event_base_set(nsd->event_base, handler) != 0)
2807 		log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2808 	if(event_add(handler, NULL) != 0)
2809 		log_msg(LOG_ERR, "nsd udp: event_add failed");
2810 }
2811 
2812 void
2813 add_tcp_handler(
2814 	struct nsd *nsd,
2815 	struct nsd_socket *sock,
2816 	struct tcp_accept_handler_data *data)
2817 {
2818 	struct event *handler = &data->event;
2819 
2820 	data->nsd = nsd;
2821 	data->socket = sock;
2822 
2823 #ifdef HAVE_SSL
2824 	if (nsd->tls_ctx &&
2825 	    nsd->options->tls_port &&
2826 	    using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port))
2827 	{
2828 		data->tls_accept = 1;
2829 		if(verbosity >= 2) {
2830 			char buf[48];
2831 			addrport2str((struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
2832 			VERBOSITY(2, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf));
2833 		}
2834 	} else {
2835 		data->tls_accept = 0;
2836 	}
2837 #endif
2838 
2839 	memset(handler, 0, sizeof(*handler));
2840 	event_set(handler, sock->s, EV_PERSIST|EV_READ,	handle_tcp_accept, data);
2841 	if(event_base_set(nsd->event_base, handler) != 0)
2842 		log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
2843 	if(event_add(handler, NULL) != 0)
2844 		log_msg(LOG_ERR, "nsd tcp: event_add failed");
2845 	data->event_added = 1;
2846 }
2847 
2848 /*
2849  * Serve DNS requests.
2850  */
2851 void
2852 server_child(struct nsd *nsd)
2853 {
2854 	size_t i, from, numifs;
2855 	region_type *server_region = region_create(xalloc, free);
2856 	struct event_base* event_base = nsd_child_event_base();
2857 	sig_atomic_t mode;
2858 
2859 	if(!event_base) {
2860 		log_msg(LOG_ERR, "nsd server could not create event base");
2861 		exit(1);
2862 	}
2863 	nsd->event_base = event_base;
2864 	nsd->server_region = server_region;
2865 
2866 #ifdef RATELIMIT
2867 	rrl_init(nsd->this_child->child_num);
2868 #endif
2869 
2870 	assert(nsd->server_kind != NSD_SERVER_MAIN);
2871 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
2872 
2873 #ifdef HAVE_SETPROCTITLE
2874 	setproctitle("server %d", nsd->this_child->child_num + 1);
2875 #endif
2876 #ifdef HAVE_CPUSET_T
2877 	if(nsd->use_cpu_affinity) {
2878 		set_cpu_affinity(nsd->this_child->cpuset);
2879 	}
2880 #endif
2881 
2882 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
2883 		server_close_all_sockets(nsd->tcp, nsd->ifs);
2884 	}
2885 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
2886 		server_close_all_sockets(nsd->udp, nsd->ifs);
2887 	}
2888 
2889 	if (nsd->this_child->parent_fd != -1) {
2890 		struct event *handler;
2891 		struct ipc_handler_conn_data* user_data =
2892 			(struct ipc_handler_conn_data*)region_alloc(
2893 			server_region, sizeof(struct ipc_handler_conn_data));
2894 		user_data->nsd = nsd;
2895 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
2896 
2897 		handler = (struct event*) region_alloc(
2898 			server_region, sizeof(*handler));
2899 		memset(handler, 0, sizeof(*handler));
2900 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
2901 			EV_READ, child_handle_parent_command, user_data);
2902 		if(event_base_set(event_base, handler) != 0)
2903 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
2904 		if(event_add(handler, NULL) != 0)
2905 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
2906 	}
2907 
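	/* with reuseport active, each child serves its own slice of the
	 * socket array: numifs sockets starting at index from */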
2908 	if(nsd->reuseport) {
2909 		numifs = nsd->ifs / nsd->reuseport;
2910 		from = numifs * nsd->this_child->child_num;
2911 		if(from+numifs > nsd->ifs) { /* should not happen */
2912 			from = 0;
2913 			numifs = nsd->ifs;
2914 		}
2915 	} else {
2916 		from = 0;
2917 		numifs = nsd->ifs;
2918 	}
2919 
2920 	if (nsd->server_kind & NSD_SERVER_UDP) {
2921 		int child = nsd->this_child->child_num;
2922 		memset(msgs, 0, sizeof(msgs));
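		/* pre-create one query, iovec and msghdr per batch slot so
		 * recvmmsg() can fill an entire batch without per-packet
		 * allocation */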
2923 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
2924 			queries[i] = query_create(server_region,
2925 				compressed_dname_offsets,
2926 				compression_table_size, compressed_dnames);
2927 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2928 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
2929 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
2930 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
2931 			msgs[i].msg_hdr.msg_iovlen  = 1;
2932 			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
2933 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2934 		}
2935 
2936 		for (i = 0; i < nsd->ifs; i++) {
2937 			int listen;
2938 			struct udp_handler_data *data;
2939 
2940 			listen = nsd_bitset_isset(nsd->udp[i].servers, child);
2941 
2942 			if(i >= from && i < (from + numifs) && listen) {
2943 				data = region_alloc_zero(
2944 					nsd->server_region, sizeof(*data));
2945 				add_udp_handler(nsd, &nsd->udp[i], data);
2946 			} else {
2947 				/* close sockets intended for other servers */
2948 				server_close_socket(&nsd->udp[i]);
2949 			}
2950 		}
2951 	}
2952 
2953 	/*
2954 	 * Keep track of all the TCP accept handlers so we can enable
2955 	 * and disable them based on the current number of active TCP
2956 	 * connections.
2957 	 */
2958 	if (nsd->server_kind & NSD_SERVER_TCP) {
2959 		int child = nsd->this_child->child_num;
2960 		tcp_accept_handler_count = numifs;
2961 		tcp_accept_handlers = region_alloc_array(server_region,
2962 			numifs, sizeof(*tcp_accept_handlers));
2963 
2964 		for (i = 0; i < nsd->ifs; i++) {
2965 			int listen;
2966 			struct tcp_accept_handler_data *data;
2967 
2968 			listen = nsd_bitset_isset(nsd->tcp[i].servers, child);
2969 
2970 			if(i >= from && i < (from + numifs) && listen) {
2971 				data = &tcp_accept_handlers[i-from];
2972 				memset(data, 0, sizeof(*data));
2973 				add_tcp_handler(nsd, &nsd->tcp[i], data);
2974 			} else {
2975 				/* close sockets intended for other servers */
2976 				/*
2977 				 * uncomment this once tcp servers are no
2978 				 * longer copied in the tcp fd copy line
2979 				 * in server_init().
2980 				server_close_socket(&nsd->tcp[i]);
2981 				*/
2982 				/* close sockets not meant for this server */
2983 				if(!listen)
2984 					server_close_socket(&nsd->tcp[i]);
2985 			}
2986 		}
2987 	} else {
2988 		tcp_accept_handler_count = 0;
2989 	}
2990 
2991 	/* The main loop... */
2992 	while ((mode = nsd->mode) != NSD_QUIT) {
2993 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
2994 
2995 		/* Do we need to do the statistics... */
2996 		if (mode == NSD_STATS) {
2997 #ifdef BIND8_STATS
2998 			int p = nsd->st.period;
2999 			nsd->st.period = 1; /* force stats printout */
3000 			/* Dump the statistics */
3001 			bind8_stats(nsd);
3002 			nsd->st.period = p;
3003 #else /* !BIND8_STATS */
3004 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
3005 #endif /* BIND8_STATS */
3006 
3007 			nsd->mode = NSD_RUN;
3008 		}
3009 		else if (mode == NSD_REAP_CHILDREN) {
3010 			/* got signal, notify parent. parent reaps terminated children. */
3011 			if (nsd->this_child->parent_fd != -1) {
3012 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
3013 				if (write(nsd->this_child->parent_fd,
3014 				    &parent_notify,
3015 				    sizeof(parent_notify)) == -1)
3016 				{
3017 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
3018 						(int) nsd->this_child->pid, strerror(errno));
3019 				}
3020 			} else /* no parent, so reap 'em */
3021 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
3022 			nsd->mode = NSD_RUN;
3023 		}
3024 		else if(mode == NSD_RUN) {
3025 			/* Wait for a query... */
3026 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3027 				if (errno != EINTR) {
3028 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3029 					break;
3030 				}
3031 			}
3032 		} else if(mode == NSD_QUIT) {
3033 			/* ignore here, quit */
3034 		} else {
3035 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
3036 				(int)mode);
3037 			nsd->mode = NSD_RUN;
3038 		}
3039 	}
3040 
3041 	service_remaining_tcp(nsd);
3042 #ifdef	BIND8_STATS
3043 	bind8_stats(nsd);
3044 #endif /* BIND8_STATS */
3045 
3046 #ifdef MEMCLEAN /* OS collects memory pages */
3047 #ifdef RATELIMIT
3048 	rrl_deinit(nsd->this_child->child_num);
3049 #endif
3050 	event_base_free(event_base);
3051 	region_destroy(server_region);
3052 #endif
3053 	server_shutdown(nsd);
3054 }
3055 
3056 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
3057 {
3058 	int* timed_out = (int*)arg;
3059 	assert(event & EV_TIMEOUT); (void)event;
3060 	/* wake up the remaining-tcp service loop; note the event is no
3061 	 * longer registered */
3062 	*timed_out = 1;
3063 }
3064 
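/*
 * Drain the TCP connections that are still open when the child shuts
 * down: re-register each one on a private event base with its timeout
 * capped to a tenth of a second, and service them until they finish,
 * a quit signal arrives, or a full second passes without activity.
 */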
3065 void
3066 service_remaining_tcp(struct nsd* nsd)
3067 {
3068 	struct tcp_handler_data* p;
3069 	struct event_base* event_base;
3070 	/* check if it is needed */
3071 	if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
3072 		return;
3073 	VERBOSITY(4, (LOG_INFO, "service remaining TCP connections"));
3074 
3075 	/* setup event base */
3076 	event_base = nsd_child_event_base();
3077 	if(!event_base) {
3078 		log_msg(LOG_ERR, "nsd remain tcp could not create event base");
3079 		return;
3080 	}
3081 	/* register tcp connections */
3082 	for(p = tcp_active_list; p != NULL; p = p->next) {
3083 		struct timeval timeout;
3084 		int fd = p->event.ev_fd;
3085 #ifdef USE_MINI_EVENT
3086 		short event = p->event.ev_flags & (EV_READ|EV_WRITE);
3087 #else
3088 		short event = p->event.ev_events & (EV_READ|EV_WRITE);
3089 #endif
3090 		void (*fn)(int, short, void*);
3091 #ifdef HAVE_SSL
3092 		if(p->tls) {
3093 			if((event&EV_READ))
3094 				fn = handle_tls_reading;
3095 			else	fn = handle_tls_writing;
3096 		} else {
3097 #endif
3098 			if((event&EV_READ))
3099 				fn = handle_tcp_reading;
3100 			else	fn = handle_tcp_writing;
3101 #ifdef HAVE_SSL
3102 		}
3103 #endif
3104 
3105 		/* set timeout to 1/10 second */
3106 		if(p->tcp_timeout > 100)
3107 			p->tcp_timeout = 100;
3108 		timeout.tv_sec = p->tcp_timeout / 1000;
3109 		timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
3110 		event_del(&p->event);
3111 		memset(&p->event, 0, sizeof(p->event));
3112 		event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
3113 			fn, p);
3114 		if(event_base_set(event_base, &p->event) != 0)
3115 			log_msg(LOG_ERR, "event base set failed");
3116 		if(event_add(&p->event, &timeout) != 0)
3117 			log_msg(LOG_ERR, "event add failed");
3118 	}
3119 
3120 	/* handle it */
3121 	while(nsd->current_tcp_count > 0) {
3122 		sig_atomic_t m = server_signal_mode(nsd);
3123 		struct event timeout;
3124 		struct timeval tv;
3125 		int timed_out = 0;
3126 		if(m == NSD_QUIT || m == NSD_SHUTDOWN ||
3127 			m == NSD_REAP_CHILDREN) {
3128 			/* quit */
3129 			break;
3130 		}
3131 		/* timer */
3132 		/* have to do something every second */
3133 		tv.tv_sec = 1;
3134 		tv.tv_usec = 0;
3135 		memset(&timeout, 0, sizeof(timeout));
3136 		event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout,
3137 			&timed_out);
3138 		if(event_base_set(event_base, &timeout) != 0)
3139 			log_msg(LOG_ERR, "remaintcp timer: event_base_set failed");
3140 		if(event_add(&timeout, &tv) != 0)
3141 			log_msg(LOG_ERR, "remaintcp timer: event_add failed");
3142 
3143 		/* service loop */
3144 		if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3145 			if (errno != EINTR) {
3146 				log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3147 				break;
3148 			}
3149 		}
3150 		if(!timed_out) {
3151 			event_del(&timeout);
3152 		} else {
3153 			/* timed out, quit */
3154 			VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit"));
3155 			break;
3156 		}
3157 	}
3158 #ifdef MEMCLEAN
3159 	event_base_free(event_base);
3160 #endif
3161 	/* continue to quit after return */
3162 }
3163 
3164 /* Implement recvmmsg and sendmmsg if the platform does not provide them.
3165  * These functions are always used, even if nonblocking operations are
3166  * broken, in which case NUM_RECV_PER_SELECT is defined to 1 (one).
3167  */
3168 #if defined(HAVE_RECVMMSG)
3169 #define nsd_recvmmsg recvmmsg
3170 #else /* !HAVE_RECVMMSG */
3171 
3172 static int
3173 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
3174              int flags, struct timespec *timeout)
3175 {
3176 	unsigned int vpos = 0;
3177 	ssize_t rcvd;
3178 
3179 	/* timeout is ignored, ensure caller does not expect it to work */
3180 	assert(timeout == NULL); (void)timeout;
3181 
3182 	while(vpos < vlen) {
3183 		rcvd = recvfrom(sockfd,
3184 		                msgvec[vpos].msg_hdr.msg_iov->iov_base,
3185 		                msgvec[vpos].msg_hdr.msg_iov->iov_len,
3186 		                flags,
3187 		                msgvec[vpos].msg_hdr.msg_name,
3188 		               &msgvec[vpos].msg_hdr.msg_namelen);
3189 		if(rcvd < 0) {
3190 			break;
3191 		} else {
3192 			assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
3193 			msgvec[vpos].msg_len = (unsigned int)rcvd;
3194 			vpos++;
3195 		}
3196 	}
3197 
3198 	if(vpos) {
3199 		/* error will be picked up next time */
3200 		return (int)vpos;
3201 	} else if(errno == 0) {
3202 		return 0;
3203 	} else if(errno == EAGAIN) {
3204 		return 0;
3205 	}
3206 
3207 	return -1;
3208 }
3209 #endif /* HAVE_RECVMMSG */
3210 
3211 #ifdef HAVE_SENDMMSG
3212 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
3213 #else /* !HAVE_SENDMMSG */
3214 
3215 static int
3216 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
3217 {
3218 	unsigned int vpos = 0;
3219 	ssize_t snd;
3220 
3221 	while(vpos < vlen) {
3222 		assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
3223 		snd = sendto(sockfd,
3224 		             msgvec[vpos].msg_hdr.msg_iov->iov_base,
3225 		             msgvec[vpos].msg_hdr.msg_iov->iov_len,
3226 		             flags,
3227 		             msgvec[vpos].msg_hdr.msg_name,
3228 		             msgvec[vpos].msg_hdr.msg_namelen);
3229 		if(snd < 0) {
3230 			break;
3231 		} else {
3232 			msgvec[vpos].msg_len = (unsigned int)snd;
3233 			vpos++;
3234 		}
3235 	}
3236 
3237 	if(vpos) {
3238 		return (int)vpos;
3239 	} else if(errno == 0) {
3240 		return 0;
3241 	}
3242 
3243 	return -1;
3244 }
3245 #endif /* HAVE_SENDMMSG */
3246 
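/*
 * Handle readiness on a UDP socket: receive a batch of packets with
 * nsd_recvmmsg(), process each query, compact dropped queries out of
 * the batch, and send all the answers with nsd_sendmmsg().
 */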
3247 static void
3248 handle_udp(int fd, short event, void* arg)
3249 {
3250 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
3251 	int received, sent, recvcount, i;
3252 	struct query *q;
3253 
3254 	if (!(event & EV_READ)) {
3255 		return;
3256 	}
3257 	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
3258 	/* this printf strangely gave a performance increase on Linux */
3259 	/* printf("recvcount %d \n", recvcount); */
3260 	if (recvcount == -1) {
3261 		if (errno != EAGAIN && errno != EINTR) {
3262 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
3263 			STATUP(data->nsd, rxerr);
3264 			/* No zone statup */
3265 		}
3266 		/* Simply no data available */
3267 		return;
3268 	}
3269 	for (i = 0; i < recvcount; i++) {
3270 	loopstart:
3271 		received = msgs[i].msg_len;
3272 		queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen;
3273 		q = queries[i];
3274 		if (received == -1) {
3275 			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
3276 #if defined(HAVE_RECVMMSG)
3277 				msgs[i].msg_hdr.msg_flags
3278 #else
3279 				errno
3280 #endif
3281 				));
3282 			STATUP(data->nsd, rxerr);
3283 			/* No zone statup */
3284 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3285 			iovecs[i].iov_len = buffer_remaining(q->packet);
3286 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3287 			goto swap_drop;
3288 		}
3289 
3290 		/* Account... */
3291 #ifdef BIND8_STATS
3292 		if (data->socket->addr.ai_family == AF_INET) {
3293 			STATUP(data->nsd, qudp);
3294 		} else if (data->socket->addr.ai_family == AF_INET6) {
3295 			STATUP(data->nsd, qudp6);
3296 		}
3297 #endif
3298 
3299 		buffer_skip(q->packet, received);
3300 		buffer_flip(q->packet);
3301 #ifdef USE_DNSTAP
3302 		dt_collector_submit_auth_query(data->nsd, &q->addr, q->addrlen,
3303 			q->tcp, q->packet);
3304 #endif /* USE_DNSTAP */
3305 
3306 		/* Process and answer the query... */
3307 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
3308 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
3309 				STATUP(data->nsd, nona);
3310 				ZTATUP(data->nsd, q->zone, nona);
3311 			}
3312 
3313 #ifdef USE_ZONE_STATS
3314 			if (data->socket->addr.ai_family == AF_INET) {
3315 				ZTATUP(data->nsd, q->zone, qudp);
3316 			} else if (data->socket->addr.ai_family == AF_INET6) {
3317 				ZTATUP(data->nsd, q->zone, qudp6);
3318 			}
3319 #endif
3320 
3321 			/* Add EDNS0 and TSIG info if necessary.  */
3322 			query_add_optional(q, data->nsd);
3323 
3324 			buffer_flip(q->packet);
3325 			iovecs[i].iov_len = buffer_remaining(q->packet);
3326 #ifdef BIND8_STATS
3327 			/* Account the rcode & TC... */
3328 			STATUP2(data->nsd, rcode, RCODE(q->packet));
3329 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
3330 			if (TC(q->packet)) {
3331 				STATUP(data->nsd, truncated);
3332 				ZTATUP(data->nsd, q->zone, truncated);
3333 			}
3334 #endif /* BIND8_STATS */
3335 #ifdef USE_DNSTAP
3336 			dt_collector_submit_auth_response(data->nsd,
3337 				&q->addr, q->addrlen, q->tcp, q->packet,
3338 				q->zone);
3339 #endif /* USE_DNSTAP */
3340 		} else {
3341 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3342 			iovecs[i].iov_len = buffer_remaining(q->packet);
3343 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
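			/* a dropped query is swapped with the last packet in
			 * the batch so the answers to send stay contiguous;
			 * the swapped-in entry is reprocessed via loopstart */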
3344 		swap_drop:
3345 			STATUP(data->nsd, dropped);
3346 			ZTATUP(data->nsd, q->zone, dropped);
3347 			if(i != recvcount-1) {
3348 				/* swap with last and decrease recvcount */
3349 				struct mmsghdr mtmp = msgs[i];
3350 				struct iovec iotmp = iovecs[i];
3351 				recvcount--;
3352 				msgs[i] = msgs[recvcount];
3353 				iovecs[i] = iovecs[recvcount];
3354 				queries[i] = queries[recvcount];
3355 				msgs[recvcount] = mtmp;
3356 				iovecs[recvcount] = iotmp;
3357 				queries[recvcount] = q;
3358 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
3359 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
3360 				goto loopstart;
3361 			} else { recvcount --; }
3362 		}
3363 	}
3364 
3365 	/* send until all are sent */
3366 	i = 0;
3367 	while(i<recvcount) {
3368 		sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3369 		if(sent == -1) {
3370 			if(errno == ENOBUFS ||
3371 #ifdef EWOULDBLOCK
3372 				errno == EWOULDBLOCK ||
3373 #endif
3374 				errno == EAGAIN) {
3375 				/* block to wait until send buffer avail */
3376 				int flag;
3377 				if((flag = fcntl(fd, F_GETFL)) == -1) {
3378 					log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno));
3379 					flag = 0;
3380 				}
3381 				flag &= ~O_NONBLOCK;
3382 				if(fcntl(fd, F_SETFL, flag) == -1)
3383 					log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno));
3384 				sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3385 				flag |= O_NONBLOCK;
3386 				if(fcntl(fd, F_SETFL, flag) == -1)
3387 					log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno));
3388 				if(sent != -1) {
3389 					i += sent;
3390 					continue;
3391 				}
3392 			}
3393 			/* don't log transient network full errors, unless
3394 			 * on higher verbosity */
3395 			if(!(errno == ENOBUFS && verbosity < 1) &&
3396 #ifdef EWOULDBLOCK
3397 			   errno != EWOULDBLOCK &&
3398 #endif
3399 			   errno != EAGAIN) {
3400 				const char* es = strerror(errno);
3401 				char a[48];
3402 				addr2str(&queries[i]->addr, a, sizeof(a));
3403 				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3404 			}
3405 #ifdef BIND8_STATS
3406 			data->nsd->st.txerr += recvcount-i;
3407 #endif /* BIND8_STATS */
3408 			break;
3409 		}
3410 		i += sent;
3411 	}
3412 	for(i=0; i<recvcount; i++) {
3413 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3414 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3415 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3416 	}
3417 }
3418 
3419 #ifdef HAVE_SSL
3420 /*
3421  * Setup an event for the tcp handler.
3422  */
3423 static void
3424 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *),
3425        int fd, short event)
3426 {
3427 	struct timeval timeout;
3428 	struct event_base* ev_base;
3429 
3430 	timeout.tv_sec = data->nsd->tcp_timeout;
3431 	timeout.tv_usec = 0L;
3432 
3433 	ev_base = data->event.ev_base;
3434 	event_del(&data->event);
3435 	memset(&data->event, 0, sizeof(data->event));
3436 	event_set(&data->event, fd, event, fn, data);
3437 	if(event_base_set(ev_base, &data->event) != 0)
3438 		log_msg(LOG_ERR, "event base set failed");
3439 	if(event_add(&data->event, &timeout) != 0)
3440 		log_msg(LOG_ERR, "event add failed");
3441 }
3442 #endif /* HAVE_SSL */
3443 
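/*
 * Tear down a TCP connection: shut down and free any TLS state, close
 * the socket, unlink the handler from the active list and re-enable
 * the accept handlers if we were at the connection limit.
 */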
3444 static void
3445 cleanup_tcp_handler(struct tcp_handler_data* data)
3446 {
3447 	event_del(&data->event);
3448 #ifdef HAVE_SSL
3449 	if(data->tls) {
3450 		SSL_shutdown(data->tls);
3451 		SSL_free(data->tls);
3452 		data->tls = NULL;
3453 	}
3454 #endif
3455 	close(data->event.ev_fd);
3456 	if(data->prev)
3457 		data->prev->next = data->next;
3458 	else	tcp_active_list = data->next;
3459 	if(data->next)
3460 		data->next->prev = data->prev;
3461 
3462 	/*
3463 	 * Enable the TCP accept handlers when the current number of
3464 	 * TCP connections is about to drop below the maximum number
3465 	 * of TCP connections.
3466 	 */
3467 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
3468 		configure_handler_event_types(EV_READ|EV_PERSIST);
3469 		if(slowaccept) {
3470 			event_del(&slowaccept_event);
3471 			slowaccept = 0;
3472 		}
3473 	}
3474 	--data->nsd->current_tcp_count;
3475 	assert(data->nsd->current_tcp_count >= 0);
3476 
3477 	region_destroy(data->region);
3478 }
3479 
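/*
 * Read handler for an accepted TCP connection: first collect the
 * two-byte length prefix, then the query itself, answering once the
 * message is complete.  Short reads simply wait for more data.
 */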
3480 static void
3481 handle_tcp_reading(int fd, short event, void* arg)
3482 {
3483 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3484 	ssize_t received;
3485 	struct event_base* ev_base;
3486 	struct timeval timeout;
3487 
3488 	if ((event & EV_TIMEOUT)) {
3489 		/* Connection timed out.  */
3490 		cleanup_tcp_handler(data);
3491 		return;
3492 	}
3493 
3494 	if (data->nsd->tcp_query_count > 0 &&
3495 		data->query_count >= data->nsd->tcp_query_count) {
3496 		/* No more queries allowed on this tcp connection. */
3497 		cleanup_tcp_handler(data);
3498 		return;
3499 	}
3500 
3501 	assert((event & EV_READ));
3502 
3503 	if (data->bytes_transmitted == 0) {
3504 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
3505 	}
3506 
3507 	/*
3508 	 * Check if we received the leading packet length bytes yet.
3509 	 */
3510 	if (data->bytes_transmitted < sizeof(uint16_t)) {
3511 		received = read(fd,
3512 				(char *) &data->query->tcplen
3513 				+ data->bytes_transmitted,
3514 				sizeof(uint16_t) - data->bytes_transmitted);
3515 		if (received == -1) {
3516 			if (errno == EAGAIN || errno == EINTR) {
3517 				/*
3518 				 * Read would block, wait until more
3519 				 * data is available.
3520 				 */
3521 				return;
3522 			} else {
3523 				char buf[48];
3524 				addr2str(&data->query->addr, buf, sizeof(buf));
3525 #ifdef ECONNRESET
3526 				if (verbosity >= 2 || errno != ECONNRESET)
3527 #endif /* ECONNRESET */
3528 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3529 				cleanup_tcp_handler(data);
3530 				return;
3531 			}
3532 		} else if (received == 0) {
3533 			/* EOF */
3534 			cleanup_tcp_handler(data);
3535 			return;
3536 		}
3537 
3538 		data->bytes_transmitted += received;
3539 		if (data->bytes_transmitted < sizeof(uint16_t)) {
3540 			/*
3541 			 * Not done with the tcplen yet, wait for more
3542 			 * data to become available.
3543 			 */
3544 			return;
3545 		}
3546 
3547 		assert(data->bytes_transmitted == sizeof(uint16_t));
3548 
3549 		data->query->tcplen = ntohs(data->query->tcplen);
3550 
3551 		/*
3552 		 * Minimum query size is:
3553 		 *
3554 		 *     Size of the header (12)
3555 		 *   + Root domain name   (1)
3556 		 *   + Query class        (2)
3557 		 *   + Query type         (2)
3558 		 */
3559 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
3560 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
3561 			cleanup_tcp_handler(data);
3562 			return;
3563 		}
3564 
3565 		if (data->query->tcplen > data->query->maxlen) {
3566 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
3567 			cleanup_tcp_handler(data);
3568 			return;
3569 		}
3570 
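		/* limit the buffer to the announced message length, so
		 * the reads below stop exactly at the end of this query */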
3571 		buffer_set_limit(data->query->packet, data->query->tcplen);
3572 	}
3573 
3574 	assert(buffer_remaining(data->query->packet) > 0);
3575 
3576 	/* Read the (remaining) query data.  */
3577 	received = read(fd,
3578 			buffer_current(data->query->packet),
3579 			buffer_remaining(data->query->packet));
3580 	if (received == -1) {
3581 		if (errno == EAGAIN || errno == EINTR) {
3582 			/*
3583 			 * Read would block, wait until more data is
3584 			 * available.
3585 			 */
3586 			return;
3587 		} else {
3588 			char buf[48];
3589 			addr2str(&data->query->addr, buf, sizeof(buf));
3590 #ifdef ECONNRESET
3591 			if (verbosity >= 2 || errno != ECONNRESET)
3592 #endif /* ECONNRESET */
3593 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3594 			cleanup_tcp_handler(data);
3595 			return;
3596 		}
3597 	} else if (received == 0) {
3598 		/* EOF */
3599 		cleanup_tcp_handler(data);
3600 		return;
3601 	}
3602 
3603 	data->bytes_transmitted += received;
3604 	buffer_skip(data->query->packet, received);
3605 	if (buffer_remaining(data->query->packet) > 0) {
3606 		/*
3607 		 * Message not yet complete, wait for more data to
3608 		 * become available.
3609 		 */
3610 		return;
3611 	}
3612 
3613 	assert(buffer_position(data->query->packet) == data->query->tcplen);
3614 
3615 	/* Account... */
3616 #ifdef BIND8_STATS
3617 #ifndef INET6
3618 	STATUP(data->nsd, ctcp);
3619 #else
3620 	if (data->query->addr.ss_family == AF_INET) {
3621 		STATUP(data->nsd, ctcp);
3622 	} else if (data->query->addr.ss_family == AF_INET6) {
3623 		STATUP(data->nsd, ctcp6);
3624 	}
3625 #endif
3626 #endif /* BIND8_STATS */
3627 
3628 	/* We have a complete query, process it.  */
3629 
3630 	/* tcp-query-count: handle query counter ++ */
3631 	data->query_count++;
3632 
3633 	buffer_flip(data->query->packet);
3634 #ifdef USE_DNSTAP
3635 	dt_collector_submit_auth_query(data->nsd, &data->query->addr,
3636 		data->query->addrlen, data->query->tcp, data->query->packet);
3637 #endif /* USE_DNSTAP */
3638 	data->query_state = server_process_query(data->nsd, data->query);
3639 	if (data->query_state == QUERY_DISCARDED) {
3640 		/* Drop the packet and the entire connection... */
3641 		STATUP(data->nsd, dropped);
3642 		ZTATUP(data->nsd, data->query->zone, dropped);
3643 		cleanup_tcp_handler(data);
3644 		return;
3645 	}
3646 
3647 #ifdef BIND8_STATS
3648 	if (RCODE(data->query->packet) == RCODE_OK
3649 	    && !AA(data->query->packet))
3650 	{
3651 		STATUP(data->nsd, nona);
3652 		ZTATUP(data->nsd, data->query->zone, nona);
3653 	}
3654 #endif /* BIND8_STATS */
3655 
3656 #ifdef USE_ZONE_STATS
3657 #ifndef INET6
3658 	ZTATUP(data->nsd, data->query->zone, ctcp);
3659 #else
3660 	if (data->query->addr.ss_family == AF_INET) {
3661 		ZTATUP(data->nsd, data->query->zone, ctcp);
3662 	} else if (data->query->addr.ss_family == AF_INET6) {
3663 		ZTATUP(data->nsd, data->query->zone, ctcp6);
3664 	}
3665 #endif
3666 #endif /* USE_ZONE_STATS */
3667 
3668 	query_add_optional(data->query, data->nsd);
3669 
3670 	/* Switch to the tcp write handler.  */
3671 	buffer_flip(data->query->packet);
3672 	data->query->tcplen = buffer_remaining(data->query->packet);
3673 #ifdef BIND8_STATS
3674 	/* Account the rcode & TC... */
3675 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
3676 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
3677 	if (TC(data->query->packet)) {
3678 		STATUP(data->nsd, truncated);
3679 		ZTATUP(data->nsd, data->query->zone, truncated);
3680 	}
3681 #endif /* BIND8_STATS */
3682 #ifdef USE_DNSTAP
3683 	dt_collector_submit_auth_response(data->nsd, &data->query->addr,
3684 		data->query->addrlen, data->query->tcp, data->query->packet,
3685 		data->query->zone);
3686 #endif /* USE_DNSTAP */
3687 	data->bytes_transmitted = 0;
3688 
3689 	timeout.tv_sec = data->tcp_timeout / 1000;
3690 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3691 
3692 	ev_base = data->event.ev_base;
3693 	event_del(&data->event);
3694 	memset(&data->event, 0, sizeof(data->event));
3695 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
3696 		handle_tcp_reading, data);
3697 	if(event_base_set(ev_base, &data->event) != 0)
3698 		log_msg(LOG_ERR, "event base set tcpr failed");
3699 	if(event_add(&data->event, &timeout) != 0)
3700 		log_msg(LOG_ERR, "event add tcpr failed");
3701 	/* see if we can write the answer right away (usually so, EAGAIN if not) */
3702 	handle_tcp_writing(fd, EV_WRITE, data);
3703 }
3704 
3705 static void
3706 handle_tcp_writing(int fd, short event, void* arg)
3707 {
3708 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3709 	ssize_t sent;
3710 	struct query *q = data->query;
3711 	struct timeval timeout;
3712 	struct event_base* ev_base;
3713 
3714 	if ((event & EV_TIMEOUT)) {
3715 		/* Connection timed out.  */
3716 		cleanup_tcp_handler(data);
3717 		return;
3718 	}
3719 
3720 	assert((event & EV_WRITE));
3721 
3722 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
3723 		/* Writing the response packet length.  */
3724 		uint16_t n_tcplen = htons(q->tcplen);
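		/* with writev the 2-byte length prefix and the packet
		 * payload go out in a single system call, avoiding a
		 * separate small TCP segment for the length */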
3725 #ifdef HAVE_WRITEV
3726 		struct iovec iov[2];
3727 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
3728 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
3729 		iov[1].iov_base = buffer_begin(q->packet);
3730 		iov[1].iov_len = buffer_limit(q->packet);
3731 		sent = writev(fd, iov, 2);
3732 #else /* HAVE_WRITEV */
3733 		sent = write(fd,
3734 			     (const char *) &n_tcplen + data->bytes_transmitted,
3735 			     sizeof(n_tcplen) - data->bytes_transmitted);
3736 #endif /* HAVE_WRITEV */
3737 		if (sent == -1) {
3738 			if (errno == EAGAIN || errno == EINTR) {
3739 				/*
3740 				 * Write would block, wait until
3741 				 * socket becomes writable again.
3742 				 */
3743 				return;
3744 			} else {
3745 #ifdef ECONNRESET
3746 				if(verbosity >= 2 || errno != ECONNRESET)
3747 #endif /* ECONNRESET */
3748 #ifdef EPIPE
3749 				  if(verbosity >= 2 || errno != EPIPE)
3750 #endif /* EPIPE 'broken pipe' */
3751 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
3752 				cleanup_tcp_handler(data);
3753 				return;
3754 			}
3755 		}
3756 
3757 		data->bytes_transmitted += sent;
3758 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
3759 			/*
3760 			 * Writing not complete, wait until socket
3761 			 * becomes writable again.
3762 			 */
3763 			return;
3764 		}
3765 
3766 #ifdef HAVE_WRITEV
3767 		sent -= sizeof(n_tcplen);
3768 		/* writev may also have sent part of the packet; fall through to the 'packet done' check */
3769 		goto packet_could_be_done;
3770 #endif
3771 	}
3772 
3773 	sent = write(fd,
3774 		     buffer_current(q->packet),
3775 		     buffer_remaining(q->packet));
3776 	if (sent == -1) {
3777 		if (errno == EAGAIN || errno == EINTR) {
3778 			/*
3779 			 * Write would block, wait until
3780 			 * socket becomes writable again.
3781 			 */
3782 			return;
3783 		} else {
3784 #ifdef ECONNRESET
3785 			if(verbosity >= 2 || errno != ECONNRESET)
3786 #endif /* ECONNRESET */
3787 #ifdef EPIPE
3788 				  if(verbosity >= 2 || errno != EPIPE)
3789 #endif /* EPIPE 'broken pipe' */
3790 			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
3791 			cleanup_tcp_handler(data);
3792 			return;
3793 		}
3794 	}
3795 
3796 	data->bytes_transmitted += sent;
3797 #ifdef HAVE_WRITEV
3798   packet_could_be_done:
3799 #endif
3800 	buffer_skip(q->packet, sent);
3801 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
3802 		/*
3803 		 * Still more data to write when socket becomes
3804 		 * writable again.
3805 		 */
3806 		return;
3807 	}
3808 
3809 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
3810 
3811 	if (data->query_state == QUERY_IN_AXFR) {
3812 		/* Continue processing AXFR and writing back results.  */
3813 		buffer_clear(q->packet);
3814 		data->query_state = query_axfr(data->nsd, q);
3815 		if (data->query_state != QUERY_PROCESSED) {
3816 			query_add_optional(data->query, data->nsd);
3817 
3818 			/* Reset data. */
3819 			buffer_flip(q->packet);
3820 			q->tcplen = buffer_remaining(q->packet);
3821 			data->bytes_transmitted = 0;
3822 			/* Reset timeout.  */
3823 			timeout.tv_sec = data->tcp_timeout / 1000;
3824 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3825 			ev_base = data->event.ev_base;
3826 			event_del(&data->event);
3827 			memset(&data->event, 0, sizeof(data->event));
3828 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
3829 				handle_tcp_writing, data);
3830 			if(event_base_set(ev_base, &data->event) != 0)
3831 				log_msg(LOG_ERR, "event base set tcpw failed");
3832 			if(event_add(&data->event, &timeout) != 0)
3833 				log_msg(LOG_ERR, "event add tcpw failed");
3834 
3835 			/*
3836 			 * Write data if/when the socket is writable
3837 			 * again.
3838 			 */
3839 			return;
3840 		}
3841 	}
3842 
3843 	/*
3844 	 * Done sending, wait for the next request to arrive on the
3845 	 * TCP socket by installing the TCP read handler.
3846 	 */
3847 	if (data->nsd->tcp_query_count > 0 &&
3848 		data->query_count >= data->nsd->tcp_query_count) {
3849 
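		/* half-close the connection: the client sees EOF after
		 * the final answer while remaining input can still drain */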
3850 		(void) shutdown(fd, SHUT_WR);
3851 	}
3852 
3853 	data->bytes_transmitted = 0;
3854 
3855 	timeout.tv_sec = data->tcp_timeout / 1000;
3856 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3857 	ev_base = data->event.ev_base;
3858 	event_del(&data->event);
3859 	memset(&data->event, 0, sizeof(data->event));
3860 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
3861 		handle_tcp_reading, data);
3862 	if(event_base_set(ev_base, &data->event) != 0)
3863 		log_msg(LOG_ERR, "event base set tcpw failed");
3864 	if(event_add(&data->event, &timeout) != 0)
3865 		log_msg(LOG_ERR, "event add tcpw failed");
3866 }
3867 
3868 #ifdef HAVE_SSL
3869 /** create SSL object and associate fd */
3870 static SSL*
3871 incoming_ssl_fd(SSL_CTX* ctx, int fd)
3872 {
3873 	SSL* ssl = SSL_new(ctx);
3874 	if(!ssl) {
3875 		log_crypto_err("could not SSL_new");
3876 		return NULL;
3877 	}
3878 	SSL_set_accept_state(ssl);
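	/* AUTO_RETRY: do not fail SSL_read/SSL_write merely because a
	 * non-application record, such as handshake data, was processed */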
3879 	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
3880 	if(!SSL_set_fd(ssl, fd)) {
3881 		log_crypto_err("could not SSL_set_fd");
3882 		SSL_free(ssl);
3883 		return NULL;
3884 	}
3885 	return ssl;
3886 }
3887 
3888 /** TLS handshake to upgrade TCP connection */
3889 static int
3890 tls_handshake(struct tcp_handler_data* data, int fd, int writing)
3891 {
3892 	int r;
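	/* shake_state records why we are here: tls_hs_read/tls_hs_write
	 * mean SSL_do_handshake wants that I/O direction;
	 * tls_hs_read_event/tls_hs_write_event mean an SSL_read or
	 * SSL_write was waiting for the opposite direction, and the
	 * normal handler is restored below */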
3893 	if(data->shake_state == tls_hs_read_event) {
3894 		/* read condition satisfied, switch back to writing */
3895 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
3896 		data->shake_state = tls_hs_none;
3897 		return 1;
3898 	}
3899 	if(data->shake_state == tls_hs_write_event) {
3900 		/* write condition satisfied back to reading */
3901 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
3902 		data->shake_state = tls_hs_none;
3903 		return 1;
3904 	}
3905 
3906 	/* (continue to) setup the TLS connection */
3907 	ERR_clear_error();
3908 	r = SSL_do_handshake(data->tls);
3909 
3910 	if(r != 1) {
3911 		int want = SSL_get_error(data->tls, r);
3912 		if(want == SSL_ERROR_WANT_READ) {
3913 			if(data->shake_state == tls_hs_read) {
3914 				/* try again later */
3915 				return 1;
3916 			}
3917 			data->shake_state = tls_hs_read;
3918 			/* switch back to reading mode */
3919 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
3920 			return 1;
3921 		} else if(want == SSL_ERROR_WANT_WRITE) {
3922 			if(data->shake_state == tls_hs_write) {
3923 				/* try again later */
3924 				return 1;
3925 			}
3926 			data->shake_state = tls_hs_write;
3927 			/* switch back to writing mode */
3928 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
3929 			return 1;
3930 		} else {
3931 			if(r == 0)
3932 				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
3933 			else {
3934 				unsigned long err = ERR_get_error();
3935 				if(!squelch_err_ssl_handshake(err)) {
3936 					char a[64], s[256];
3937 					addr2str(&data->query->addr, a, sizeof(a));
3938 					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
3939 					log_crypto_from_err(s, err);
3940 				}
3941 			}
3942 			cleanup_tcp_handler(data);
3943 			return 0;
3944 		}
3945 	}
3946 
3947 	/* Log the successful upgrade; useful for testing, could be removed. */
3948 	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
3949 	/* set back to the event we need to have when reading (or writing) */
3950 	if(data->shake_state == tls_hs_read && writing) {
3951 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
3952 	} else if(data->shake_state == tls_hs_write && !writing) {
3953 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
3954 	}
3955 	data->shake_state = tls_hs_none;
3956 	return 1;
3957 }
3958 
3959 /** handle TLS reading of incoming query */
3960 static void
3961 handle_tls_reading(int fd, short event, void* arg)
3962 {
3963 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3964 	ssize_t received;
3965 
3966 	if ((event & EV_TIMEOUT)) {
3967 		/* Connection timed out.  */
3968 		cleanup_tcp_handler(data);
3969 		return;
3970 	}
3971 
3972 	if (data->nsd->tcp_query_count > 0 &&
3973 	    data->query_count >= data->nsd->tcp_query_count) {
3974 		/* No more queries allowed on this tcp connection. */
3975 		cleanup_tcp_handler(data);
3976 		return;
3977 	}
3978 
3979 	assert((event & EV_READ));
3980 
3981 	if (data->bytes_transmitted == 0) {
3982 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
3983 	}
3984 
3985 	if(data->shake_state != tls_hs_none) {
3986 		if(!tls_handshake(data, fd, 0))
3987 			return;
3988 		if(data->shake_state != tls_hs_none)
3989 			return;
3990 	}
3991 
3992 	/*
3993 	 * Check if we received the leading packet length bytes yet.
3994 	 */
3995 	if(data->bytes_transmitted < sizeof(uint16_t)) {
3996 		ERR_clear_error();
3997 		if((received=SSL_read(data->tls, (char *) &data->query->tcplen
3998 		    + data->bytes_transmitted,
3999 		    sizeof(uint16_t) - data->bytes_transmitted)) <= 0) {
4000 			int want = SSL_get_error(data->tls, received);
4001 			if(want == SSL_ERROR_ZERO_RETURN) {
4002 				cleanup_tcp_handler(data);
4003 				return; /* shutdown, closed */
4004 			} else if(want == SSL_ERROR_WANT_READ) {
4005 				/* wants to be called again */
4006 				return;
4007 			}
4008 			else if(want == SSL_ERROR_WANT_WRITE) {
4009 				/* switch to writing */
4010 				data->shake_state = tls_hs_write_event;
4011 				tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4012 				return;
4013 			}
4014 			cleanup_tcp_handler(data);
4015 			log_crypto_err("could not SSL_read");
4016 			return;
4017 		}
4018 
4019 		data->bytes_transmitted += received;
4020 		if (data->bytes_transmitted < sizeof(uint16_t)) {
4021 			/*
4022 			 * Not done with the tcplen yet, wait for more
4023 			 * data to become available.
4024 			 */
4025 			return;
4026 		}
4027 
4028 		assert(data->bytes_transmitted == sizeof(uint16_t));
4029 
4030 		data->query->tcplen = ntohs(data->query->tcplen);
4031 
4032 		/*
4033 		 * Minimum query size is:
4034 		 *
4035 		 *     Size of the header (12)
4036 		 *   + Root domain name   (1)
4037 		 *   + Query class        (2)
4038 		 *   + Query type         (2)
4039 		 */
4040 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
4041 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
4042 			cleanup_tcp_handler(data);
4043 			return;
4044 		}
4045 
4046 		if (data->query->tcplen > data->query->maxlen) {
4047 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
4048 			cleanup_tcp_handler(data);
4049 			return;
4050 		}
4051 
4052 		buffer_set_limit(data->query->packet, data->query->tcplen);
4053 	}
4054 
4055 	assert(buffer_remaining(data->query->packet) > 0);
4056 
4057 	/* Read the (remaining) query data.  */
4058 	ERR_clear_error();
4059 	received = SSL_read(data->tls, (void*)buffer_current(data->query->packet),
4060 			    (int)buffer_remaining(data->query->packet));
4061 	if(received <= 0) {
4062 		int want = SSL_get_error(data->tls, received);
4063 		if(want == SSL_ERROR_ZERO_RETURN) {
4064 			cleanup_tcp_handler(data);
4065 			return; /* shutdown, closed */
4066 		} else if(want == SSL_ERROR_WANT_READ) {
4067 			/* wants to be called again */
4068 			return;
4069 		}
4070 		else if(want == SSL_ERROR_WANT_WRITE) {
4071 			/* switch back to writing */
4072 			data->shake_state = tls_hs_write_event;
4073 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4074 			return;
4075 		}
4076 		cleanup_tcp_handler(data);
4077 		log_crypto_err("could not SSL_read");
4078 		return;
4079 	}
4080 
4081 	data->bytes_transmitted += received;
4082 	buffer_skip(data->query->packet, received);
4083 	if (buffer_remaining(data->query->packet) > 0) {
4084 		/*
4085 		 * Message not yet complete, wait for more data to
4086 		 * become available.
4087 		 */
4088 		return;
4089 	}
4090 
4091 	assert(buffer_position(data->query->packet) == data->query->tcplen);
4092 
4093 	/* Account... */
4094 #ifndef INET6
4095 	STATUP(data->nsd, ctls);
4096 #else
4097 	if (data->query->addr.ss_family == AF_INET) {
4098 		STATUP(data->nsd, ctls);
4099 	} else if (data->query->addr.ss_family == AF_INET6) {
4100 		STATUP(data->nsd, ctls6);
4101 	}
4102 #endif
4103 
4104 	/* We have a complete query, process it.  */
4105 
4106 	/* tcp-query-count: handle query counter ++ */
4107 	data->query_count++;
4108 
4109 	buffer_flip(data->query->packet);
4110 #ifdef USE_DNSTAP
4111 	dt_collector_submit_auth_query(data->nsd, &data->query->addr,
4112 		data->query->addrlen, data->query->tcp, data->query->packet);
4113 #endif /* USE_DNSTAP */
4114 	data->query_state = server_process_query(data->nsd, data->query);
4115 	if (data->query_state == QUERY_DISCARDED) {
4116 		/* Drop the packet and the entire connection... */
4117 		STATUP(data->nsd, dropped);
4118 		ZTATUP(data->nsd, data->query->zone, dropped);
4119 		cleanup_tcp_handler(data);
4120 		return;
4121 	}
4122 
4123 #ifdef BIND8_STATS
4124 	if (RCODE(data->query->packet) == RCODE_OK
4125 	    && !AA(data->query->packet))
4126 	{
4127 		STATUP(data->nsd, nona);
4128 		ZTATUP(data->nsd, data->query->zone, nona);
4129 	}
4130 #endif /* BIND8_STATS */
4131 
4132 #ifdef USE_ZONE_STATS
4133 #ifndef INET6
4134 	ZTATUP(data->nsd, data->query->zone, ctls);
4135 #else
4136 	if (data->query->addr.ss_family == AF_INET) {
4137 		ZTATUP(data->nsd, data->query->zone, ctls);
4138 	} else if (data->query->addr.ss_family == AF_INET6) {
4139 		ZTATUP(data->nsd, data->query->zone, ctls6);
4140 	}
4141 #endif
4142 #endif /* USE_ZONE_STATS */
4143 
4144 	query_add_optional(data->query, data->nsd);
4145 
4146 	/* Switch to the tcp write handler.  */
4147 	buffer_flip(data->query->packet);
4148 	data->query->tcplen = buffer_remaining(data->query->packet);
4149 #ifdef BIND8_STATS
4150 	/* Account the rcode & TC... */
4151 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
4152 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
4153 	if (TC(data->query->packet)) {
4154 		STATUP(data->nsd, truncated);
4155 		ZTATUP(data->nsd, data->query->zone, truncated);
4156 	}
4157 #endif /* BIND8_STATS */
4158 #ifdef USE_DNSTAP
4159 	dt_collector_submit_auth_response(data->nsd, &data->query->addr,
4160 		data->query->addrlen, data->query->tcp, data->query->packet,
4161 		data->query->zone);
4162 #endif /* USE_DNSTAP */
4163 	data->bytes_transmitted = 0;
4164 
4165 	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4166 
4167 	/* see if we can write the answer right away (usually so, EAGAIN if not) */
4168 	handle_tls_writing(fd, EV_WRITE, data);
4169 }
4170 
4171 /** handle TLS writing of outgoing response */
4172 static void
4173 handle_tls_writing(int fd, short event, void* arg)
4174 {
4175 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4176 	ssize_t sent;
4177 	struct query *q = data->query;
4178 	/* static reassembly buffer that puts the TCP length in front
4179 	 * of the packet, the TLS analogue of writev */
4180 	static buffer_type* global_tls_temp_buffer = NULL;
4181 	buffer_type* write_buffer;
4182 
4183 	if ((event & EV_TIMEOUT)) {
4184 		/* Connection timed out.  */
4185 		cleanup_tcp_handler(data);
4186 		return;
4187 	}
4188 
4189 	assert((event & EV_WRITE));
4190 
4191 	if(data->shake_state != tls_hs_none) {
4192 		if(!tls_handshake(data, fd, 1))
4193 			return;
4194 		if(data->shake_state != tls_hs_none)
4195 			return;
4196 	}
4197 
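	/* allow SSL_write to return after a partial write, so that
	 * progress is tracked in bytes_transmitted, just as for plain TCP */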
4198 	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);
4199 
4200 	/* If we are writing the start of a message, we must include the
4201 	 * length; this is done with a copy into write_buffer. */
4202 	write_buffer = NULL;
4203 	if (data->bytes_transmitted == 0) {
4204 		if(!global_tls_temp_buffer) {
4205 			/* allocated from nsd.region; deallocated when
4206 			 * nsd shuts down */
4207 			global_tls_temp_buffer = buffer_create(nsd.region,
4208 				QIOBUFSZ + sizeof(q->tcplen));
4209 			if (!global_tls_temp_buffer) {
4210 				return;
4211 			}
4212 		}
4213 		write_buffer = global_tls_temp_buffer;
4214 		buffer_clear(write_buffer);
4215 		buffer_write_u16(write_buffer, q->tcplen);
4216 		buffer_write(write_buffer, buffer_current(q->packet),
4217 			(int)buffer_remaining(q->packet));
4218 		buffer_flip(write_buffer);
4219 	} else {
4220 		write_buffer = q->packet;
4221 	}
4222 
4223 	/* Write the response */
4224 	ERR_clear_error();
4225 	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
4226 	if(sent <= 0) {
4227 		int want = SSL_get_error(data->tls, sent);
4228 		if(want == SSL_ERROR_ZERO_RETURN) {
4229 			cleanup_tcp_handler(data);
4230 			/* closed */
4231 		} else if(want == SSL_ERROR_WANT_READ) {
4232 			/* switch back to reading */
4233 			data->shake_state = tls_hs_read_event;
4234 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4235 		} else if(want != SSL_ERROR_WANT_WRITE) {
4236 			cleanup_tcp_handler(data);
4237 			log_crypto_err("could not SSL_write");
4238 		}
4239 		return;
4240 	}
4241 
4242 	buffer_skip(write_buffer, sent);
4243 	if(buffer_remaining(write_buffer) != 0) {
4244 		/* Not everything was sent; if the temp buffer was in use, sync q->packet past the payload bytes that went out. */
4245 		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
4246 			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
4247 		}
4248 	}
4249 
4250 	data->bytes_transmitted += sent;
4251 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4252 		/*
4253 		 * Still more data to write when socket becomes
4254 		 * writable again.
4255 		 */
4256 		return;
4257 	}
4258 
4259 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4260 
4261 	if (data->query_state == QUERY_IN_AXFR) {
4262 		/* Continue processing AXFR and writing back results.  */
4263 		buffer_clear(q->packet);
4264 		data->query_state = query_axfr(data->nsd, q);
4265 		if (data->query_state != QUERY_PROCESSED) {
4266 			query_add_optional(data->query, data->nsd);
4267 
4268 			/* Reset data. */
4269 			buffer_flip(q->packet);
4270 			q->tcplen = buffer_remaining(q->packet);
4271 			data->bytes_transmitted = 0;
4272 			/* Reset to writing mode.  */
4273 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4274 
4275 			/*
4276 			 * Write data if/when the socket is writable
4277 			 * again.
4278 			 */
4279 			return;
4280 		}
4281 	}
4282 
4283 	/*
4284 	 * Done sending, wait for the next request to arrive on the
4285 	 * TCP socket by installing the TCP read handler.
4286 	 */
4287 	if (data->nsd->tcp_query_count > 0 &&
4288 		data->query_count >= data->nsd->tcp_query_count) {
4289 
4290 		(void) shutdown(fd, SHUT_WR);
4291 	}
4292 
4293 	data->bytes_transmitted = 0;
4294 
4295 	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4296 }
4297 #endif
4298 
4299 static void
4300 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
4301 	void* ATTR_UNUSED(arg))
4302 {
4303 	if(slowaccept) {
4304 		configure_handler_event_types(EV_PERSIST | EV_READ);
4305 		slowaccept = 0;
4306 	}
4307 }
4308 
4309 static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
4310 {
4311 #ifndef HAVE_ACCEPT4
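	/* no accept4(); emulate SOCK_NONBLOCK by setting O_NONBLOCK
	 * on the accepted socket with fcntl */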
4312 	int s = accept(fd, addr, addrlen);
4313 	if (s != -1) {
4314 		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
4315 			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
4316 			close(s);
4317 			s = -1;
4318 			errno=EINTR; /* setting errno to EINTR makes the
4319 				later accept error handling skip the
4320 				printout, as for an interrupted accept4 */
4321 		}
4322 	}
4323 	return s;
4324 #else
4325 	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
4326 #endif /* HAVE_ACCEPT4 */
4327 }
4328 
4329 /*
4330  * Handle an incoming TCP connection.  The connection is accepted and
4331  * a new TCP reader event handler is added.  The TCP handler
4332  * is responsible for cleanup when the connection is closed.
4333  */
4334 static void
4335 handle_tcp_accept(int fd, short event, void* arg)
4336 {
4337 	struct tcp_accept_handler_data *data
4338 		= (struct tcp_accept_handler_data *) arg;
4339 	int s;
4340 	int reject = 0;
4341 	struct tcp_handler_data *tcp_data;
4342 	region_type *tcp_region;
4343 #ifdef INET6
4344 	struct sockaddr_storage addr;
4345 #else
4346 	struct sockaddr_in addr;
4347 #endif
4348 	socklen_t addrlen;
4349 	struct timeval timeout;
4350 
4351 	if (!(event & EV_READ)) {
4352 		return;
4353 	}
4354 
4355 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
4356 		reject = data->nsd->options->tcp_reject_overflow;
4357 		if (!reject) {
4358 			return;
4359 		}
4360 	}
4361 
4362 	/* Accept it... */
4363 	addrlen = sizeof(addr);
4364 	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
4365 	if (s == -1) {
4366 		/**
4367 		 * EMFILE and ENFILE signal that the limit of open
4368 		 * file descriptors has been reached. Pause accept().
4369 		 * EINTR is a signal interrupt. The others are various OS ways
4370 		 * of saying that the client has closed the connection.
4371 		 */
4372 		if (errno == EMFILE || errno == ENFILE) {
4373 			if (!slowaccept) {
4374 				/* disable accept events */
4375 				struct timeval tv;
4376 				configure_handler_event_types(0);
4377 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
4378 				tv.tv_usec = 0L;
4379 				memset(&slowaccept_event, 0,
4380 					sizeof(slowaccept_event));
4381 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
4382 					handle_slowaccept_timeout, NULL);
4383 				(void)event_base_set(data->event.ev_base,
4384 					&slowaccept_event);
4385 				(void)event_add(&slowaccept_event, &tv);
4386 				slowaccept = 1;
4387 				/* We don't want to spam the logs here */
4388 			}
4389 		} else if (errno != EINTR
4390 			&& errno != EWOULDBLOCK
4391 #ifdef ECONNABORTED
4392 			&& errno != ECONNABORTED
4393 #endif /* ECONNABORTED */
4394 #ifdef EPROTO
4395 			&& errno != EPROTO
4396 #endif /* EPROTO */
4397 			) {
4398 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
4399 		}
4400 		return;
4401 	}
4402 
4403 	if (reject) {
4404 		shutdown(s, SHUT_RDWR);
4405 		close(s);
4406 		return;
4407 	}
4408 
4409 	/*
4410 	 * This region is deallocated when the TCP connection is
4411 	 * closed by the TCP handler.
4412 	 */
4413 	tcp_region = region_create(xalloc, free);
4414 	tcp_data = (struct tcp_handler_data *) region_alloc(
4415 		tcp_region, sizeof(struct tcp_handler_data));
4416 	tcp_data->region = tcp_region;
4417 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
4418 		compression_table_size, compressed_dnames);
4419 	tcp_data->nsd = data->nsd;
4420 	tcp_data->query_count = 0;
4421 #ifdef HAVE_SSL
4422 	tcp_data->shake_state = tls_hs_none;
4423 	tcp_data->tls = NULL;
4424 #endif
4425 	tcp_data->prev = NULL;
4426 	tcp_data->next = NULL;
4427 
4428 	tcp_data->query_state = QUERY_PROCESSED;
4429 	tcp_data->bytes_transmitted = 0;
4430 	memcpy(&tcp_data->query->addr, &addr, addrlen);
4431 	tcp_data->query->addrlen = addrlen;
4432 
4433 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
4434 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
4435 		/* very busy, give smaller timeout */
4436 		tcp_data->tcp_timeout = 200;
4437 	}
4438 	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4439 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
4440 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
4441 
4442 #ifdef HAVE_SSL
4443 	if (data->tls_accept) {
4444 		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
4445 		if(!tcp_data->tls) {
4446 			close(s);
4447 			return;
4448 		}
4449 		tcp_data->shake_state = tls_hs_read;
4450 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4451 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4452 			  handle_tls_reading, tcp_data);
4453 	} else {
4454 #endif
4455 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4456 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4457 			  handle_tcp_reading, tcp_data);
4458 #ifdef HAVE_SSL
4459 	}
4460 #endif
4461 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
4462 		log_msg(LOG_ERR, "cannot set tcp event base");
4463 		close(s);
4464 		region_destroy(tcp_region);
4465 		return;
4466 	}
4467 	if(event_add(&tcp_data->event, &timeout) != 0) {
4468 		log_msg(LOG_ERR, "cannot add tcp to event base");
4469 		close(s);
4470 		region_destroy(tcp_region);
4471 		return;
4472 	}
4473 	if(tcp_active_list) {
4474 		tcp_active_list->prev = tcp_data;
4475 		tcp_data->next = tcp_active_list;
4476 	}
4477 	tcp_active_list = tcp_data;
4478 
4479 	/*
4480 	 * Keep track of the total number of TCP handlers installed so
4481 	 * we can stop accepting connections when the maximum number
4482 	 * of simultaneous TCP connections is reached.
4483 	 *
4484 	 * If tcp-reject-overflow is enabled, however, then we do not
4485 	 * change the handler event type; we keep it as-is and accept
4486 	 * overflow TCP connections only so that we can forcibly kill
4487 	 * them off.
4488 	 */
4489 	++data->nsd->current_tcp_count;
4490 	if (!data->nsd->options->tcp_reject_overflow &&
4491 	     data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
4492 	{
4493 		configure_handler_event_types(0);
4494 	}
4495 }
4496 
4497 static void
4498 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
4499 {
4500 	size_t i;
4501 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
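	/* write the command over each child's socketpair; with a
	 * timeout, wait up to that long for the child's reply before
	 * closing the channel */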
4502 	for (i = 0; i < nsd->child_count; ++i) {
4503 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
4504 			if (write(nsd->children[i].child_fd,
4505 				&command,
4506 				sizeof(command)) == -1)
4507 			{
4508 				if(errno != EAGAIN && errno != EINTR)
4509 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
4510 					(int) command,
4511 					(int) nsd->children[i].pid,
4512 					strerror(errno));
4513 			} else if (timeout > 0) {
4514 				(void)block_read(NULL,
4515 					nsd->children[i].child_fd,
4516 					&command, sizeof(command), timeout);
4517 			}
4518 			fsync(nsd->children[i].child_fd);
4519 			close(nsd->children[i].child_fd);
4520 			nsd->children[i].child_fd = -1;
4521 		}
4522 	}
4523 }
4524 
4525 static void
4526 send_children_quit(struct nsd* nsd)
4527 {
4528 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
4529 	send_children_command(nsd, NSD_QUIT, 0);
4530 }
4531 
4532 static void
4533 send_children_quit_and_wait(struct nsd* nsd)
4534 {
4535 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
4536 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
4537 }
4538 
4539 #ifdef BIND8_STATS
4540 static void
4541 set_children_stats(struct nsd* nsd)
4542 {
4543 	size_t i;
4544 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4545 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
4546 	for (i = 0; i < nsd->child_count; ++i) {
4547 		nsd->children[i].need_to_send_STATS = 1;
4548 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
4549 	}
4550 }
4551 #endif /* BIND8_STATS */
4552 
4553 static void
4554 configure_handler_event_types(short event_types)
4555 {
4556 	size_t i;
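	/* a pending libevent event must not be re-set in place, so each
	 * accept handler is deleted and re-created with the new event
	 * types (or just deleted when event_types is 0) */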
4557 
4558 	for (i = 0; i < tcp_accept_handler_count; ++i) {
4559 		struct event* handler = &tcp_accept_handlers[i].event;
4560 		if(event_types) {
4561 			/* reassign */
4562 			int fd = handler->ev_fd;
4563 			struct event_base* base = handler->ev_base;
4564 			if(tcp_accept_handlers[i].event_added)
4565 				event_del(handler);
4566 			memset(handler, 0, sizeof(*handler));
4567 			event_set(handler, fd, event_types,
4568 				handle_tcp_accept, &tcp_accept_handlers[i]);
4569 			if(event_base_set(base, handler) != 0)
4570 				log_msg(LOG_ERR, "conhand: cannot event_base");
4571 			if(event_add(handler, NULL) != 0)
4572 				log_msg(LOG_ERR, "conhand: cannot event_add");
4573 			tcp_accept_handlers[i].event_added = 1;
4574 		} else {
4575 			/* remove */
4576 			if(tcp_accept_handlers[i].event_added) {
4577 				event_del(handler);
4578 				tcp_accept_handlers[i].event_added = 0;
4579 			}
4580 		}
4581 	}
4582 }
4583