xref: /openbsd-src/usr.sbin/nsd/server.c (revision 99fd087599a8791921855f21bd7e36130f39aadc)
1 /*
2  * server.c -- nsd(8) network input/output
3  *
4  * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
5  *
6  * See LICENSE for the license.
7  *
8  */
9 
10 #include "config.h"
11 
12 #include <sys/types.h>
13 #include <sys/param.h>
14 #include <limits.h>
15 #include <sys/socket.h>
16 #include <sys/uio.h>
17 #include <sys/wait.h>
18 
19 #include <netinet/in.h>
20 #ifdef USE_TCP_FASTOPEN
21   #include <netinet/tcp.h>
22 #endif
23 #include <arpa/inet.h>
24 
25 #include <assert.h>
26 #include <ctype.h>
27 #include <errno.h>
28 #include <fcntl.h>
29 #include <stddef.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <time.h>
34 #include <unistd.h>
35 #include <signal.h>
36 #include <netdb.h>
37 #include <poll.h>
38 #ifndef SHUT_WR
39 #define SHUT_WR 1
40 #endif
41 #ifdef HAVE_MMAP
42 #include <sys/mman.h>
43 #endif /* HAVE_MMAP */
44 #ifdef HAVE_OPENSSL_RAND_H
45 #include <openssl/rand.h>
46 #endif
47 #ifdef HAVE_OPENSSL_SSL_H
48 #include <openssl/ssl.h>
49 #endif
50 #ifdef HAVE_OPENSSL_ERR_H
51 #include <openssl/err.h>
52 #endif
53 #ifdef HAVE_OPENSSL_OCSP_H
54 #include <openssl/ocsp.h>
55 #endif
56 #ifndef USE_MINI_EVENT
57 #  ifdef HAVE_EVENT_H
58 #    include <event.h>
59 #  else
60 #    include <event2/event.h>
61 #    include "event2/event_struct.h"
62 #    include "event2/event_compat.h"
63 #  endif
64 #else
65 #  include "mini_event.h"
66 #endif
67 
68 #include "axfr.h"
69 #include "namedb.h"
70 #include "netio.h"
71 #include "xfrd.h"
72 #include "xfrd-tcp.h"
73 #include "xfrd-disk.h"
74 #include "difffile.h"
75 #include "nsec3.h"
76 #include "ipc.h"
77 #include "udb.h"
78 #include "remote.h"
79 #include "lookup3.h"
80 #include "rrl.h"
81 #ifdef USE_DNSTAP
82 #include "dnstap/dnstap_collector.h"
83 #endif
84 
85 #define RELOAD_SYNC_TIMEOUT 25 /* seconds */
86 
87 #ifdef USE_TCP_FASTOPEN
88   #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
89   #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
90 #endif
91 
92 /*
93  * Data for the UDP handlers.
94  */
95 struct udp_handler_data
96 {
97 	struct nsd        *nsd;
98 	struct nsd_socket *socket;
99 	struct event       event;
100 };
101 
102 struct tcp_accept_handler_data {
103 	struct nsd        *nsd;
104 	struct nsd_socket *socket;
105 	int                event_added;
106 	struct event       event;
107 #ifdef HAVE_SSL
108 	/* handler accepts TLS connections on the dedicated port */
109 	int                tls_accept;
110 #endif
111 };
112 
113 /*
114  * These globals are used to enable the TCP accept handlers
 * when the number of TCP connections drops below the maximum
116  * number of TCP connections.
117  */
118 static size_t tcp_accept_handler_count;
119 static struct tcp_accept_handler_data *tcp_accept_handlers;
120 
121 static struct event slowaccept_event;
122 static int slowaccept;
123 
124 #ifdef HAVE_SSL
125 static unsigned char *ocspdata = NULL;
126 static long ocspdata_len = 0;
127 #endif
128 
129 #ifdef NONBLOCKING_IS_BROKEN
130 /* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
131    read multiple times from a socket when reported ready by select. */
132 # define NUM_RECV_PER_SELECT (1)
133 #else /* !NONBLOCKING_IS_BROKEN */
134 # define NUM_RECV_PER_SELECT (100)
135 #endif /* NONBLOCKING_IS_BROKEN */
136 
137 #ifndef HAVE_MMSGHDR
138 struct mmsghdr {
139 	struct msghdr msg_hdr;
140 	unsigned int  msg_len;
141 };
142 #endif
143 
144 static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
145 static struct iovec iovecs[NUM_RECV_PER_SELECT];
146 static struct query *queries[NUM_RECV_PER_SELECT];
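/*
 * Illustrative sketch (not compiled in): on platforms that provide
 * recvmmsg(2), an mmsghdr array like the one above lets a single system
 * call drain up to NUM_RECV_PER_SELECT datagrams from a ready socket.
 * The hypothetical recv_batch() below only demonstrates the pattern;
 * NSD's actual receive loop lives in handle_udp().
 */
#if 0
static int
recv_batch(int fd)
{
	int i, nrecv;

	/* each msgs[i].msg_hdr must point at a fresh iovec before the call */
	nrecv = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
	if(nrecv == -1)
		return (errno == EAGAIN || errno == EINTR) ? 0 : -1;
	for(i = 0; i < nrecv; i++) {
		/* msgs[i].msg_len holds the size of the i-th datagram;
		 * process queries[i] here */
	}
	return nrecv;
}
#endif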
147 
148 /*
149  * Data for the TCP connection handlers.
150  *
151  * The TCP handlers use non-blocking I/O.  This is necessary to avoid
152  * blocking the entire server on a slow TCP connection, but does make
153  * reading from and writing to the socket more complicated.
154  *
155  * Basically, whenever a read/write would block (indicated by the
156  * EAGAIN errno variable) we remember the position we were reading
157  * from/writing to and return from the TCP reading/writing event
158  * handler.  When the socket becomes readable/writable again we
159  * continue from the same position.
160  */
161 struct tcp_handler_data
162 {
163 	/*
164 	 * The region used to allocate all TCP connection related
165 	 * data, including this structure.  This region is destroyed
166 	 * when the connection is closed.
167 	 */
168 	region_type*		region;
169 
170 	/*
171 	 * The global nsd structure.
172 	 */
173 	struct nsd*			nsd;
174 
175 	/*
176 	 * The current query data for this TCP connection.
177 	 */
178 	query_type*			query;
179 
180 	/*
181 	 * The query_state is used to remember if we are performing an
182 	 * AXFR, if we're done processing, or if we should discard the
183 	 * query and connection.
184 	 */
185 	query_state_type	query_state;
186 
187 	/*
188 	 * The event for the file descriptor and tcp timeout
189 	 */
190 	struct event event;
191 
192 	/*
193 	 * The bytes_transmitted field is used to remember the number
194 	 * of bytes transmitted when receiving or sending a DNS
195 	 * packet.  The count includes the two additional bytes used
196 	 * to specify the packet length on a TCP connection.
197 	 */
198 	size_t				bytes_transmitted;
199 
200 	/*
201 	 * The number of queries handled by this specific TCP connection.
202 	 */
203 	int					query_count;
204 
205 	/*
206 	 * The timeout in msec for this tcp connection
207 	 */
208 	int	tcp_timeout;
209 #ifdef HAVE_SSL
210 	/*
211 	 * TLS object.
212 	 */
213 	SSL* tls;
214 
215 	/*
216 	 * TLS handshake state.
217 	 */
218 	enum { tls_hs_none, tls_hs_read, tls_hs_write,
219 		tls_hs_read_event, tls_hs_write_event } shake_state;
220 #endif
221 	/* list of connections, for service of remaining tcp channels */
222 	struct tcp_handler_data *prev, *next;
223 };
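/*
 * Illustrative sketch (not compiled in) of the resume-on-EAGAIN pattern
 * described above: bytes_transmitted remembers how far we got, so a
 * later invocation of the handler continues from the same position.
 * The hypothetical try_write() only demonstrates the idea; the real
 * logic is in handle_tcp_writing() below.
 */
#if 0
static int
try_write(struct tcp_handler_data *data, int fd,
	const uint8_t *buf, size_t total)
{
	ssize_t sent;

	while(data->bytes_transmitted < total) {
		sent = write(fd, buf + data->bytes_transmitted,
			total - data->bytes_transmitted);
		if(sent == -1) {
			if(errno == EAGAIN || errno == EINTR)
				return 0; /* wait until writable again */
			return -1; /* real error, discard connection */
		}
		data->bytes_transmitted += (size_t)sent;
	}
	return 1; /* complete */
}
#endif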
224 /* global that is the list of active tcp channels */
225 static struct tcp_handler_data *tcp_active_list = NULL;
226 
227 /*
228  * Handle incoming queries on the UDP server sockets.
229  */
230 static void handle_udp(int fd, short event, void* arg);
231 
232 /*
233  * Handle incoming connections on the TCP sockets.  These handlers
234  * usually wait for the NETIO_EVENT_READ event (indicating an incoming
235  * connection) but are disabled when the number of current TCP
236  * connections is equal to the maximum number of TCP connections.
237  * Disabling is done by changing the handler to wait for the
238  * NETIO_EVENT_NONE type.  This is done using the function
239  * configure_tcp_accept_handlers.
240  */
241 static void handle_tcp_accept(int fd, short event, void* arg);
242 
243 /*
244  * Handle incoming queries on a TCP connection.  The TCP connections
245  * are configured to be non-blocking and the handler may be called
246  * multiple times before a complete query is received.
247  */
248 static void handle_tcp_reading(int fd, short event, void* arg);
249 
250 /*
251  * Handle outgoing responses on a TCP connection.  The TCP connections
252  * are configured to be non-blocking and the handler may be called
253  * multiple times before a complete response is sent.
254  */
255 static void handle_tcp_writing(int fd, short event, void* arg);
256 
257 #ifdef HAVE_SSL
258 /* Create SSL object and associate fd */
259 static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
260 /*
261  * Handle TLS handshake. May be called multiple times if incomplete.
262  */
263 static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);
264 
265 /*
266  * Handle incoming queries on a TLS over TCP connection.  The TLS
267  * connections are configured to be non-blocking and the handler may
268  * be called multiple times before a complete query is received.
269  */
270 static void handle_tls_reading(int fd, short event, void* arg);
271 
272 /*
273  * Handle outgoing responses on a TLS over TCP connection.  The TLS
274  * connections are configured to be non-blocking and the handler may
275  * be called multiple times before a complete response is sent.
276  */
277 static void handle_tls_writing(int fd, short event, void* arg);
278 #endif
279 
280 /*
 * Send all children the quit command without blocking, then close the pipes.
282  */
283 static void send_children_quit(struct nsd* nsd);
284 /* same, for shutdown time, waits for child to exit to avoid restart issues */
285 static void send_children_quit_and_wait(struct nsd* nsd);
286 
/* set children's flags to send NSD_STATS to them */
288 #ifdef BIND8_STATS
289 static void set_children_stats(struct nsd* nsd);
290 #endif /* BIND8_STATS */
291 
292 /*
293  * Change the event types the HANDLERS are interested in to EVENT_TYPES.
294  */
295 static void configure_handler_event_types(short event_types);
296 
297 static uint16_t *compressed_dname_offsets = 0;
298 static uint32_t compression_table_capacity = 0;
299 static uint32_t compression_table_size = 0;
300 static domain_type* compressed_dnames[MAXRRSPP];
301 
302 #ifdef USE_TCP_FASTOPEN
/* Checks to see if the kernel value must be manually changed in order for
   TCP Fast Open to support server mode */
static void report_tcp_fastopen_config(void) {

	int tcp_fastopen_fp;
	uint8_t tcp_fastopen_value;

	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		return;
	}
	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		close(tcp_fastopen_fp);
		return;
	}
	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
		log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n");
		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
		log_msg(LOG_WARNING, "To enable TFO use the command:");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
		close(tcp_fastopen_fp);
		return;
	}
	close(tcp_fastopen_fp);
}
328 #endif
329 
330 /*
331  * Remove the specified pid from the list of child pids.  Returns -1 if
332  * the pid is not in the list, child_num otherwise.  The field is set to 0.
333  */
334 static int
335 delete_child_pid(struct nsd *nsd, pid_t pid)
336 {
337 	size_t i;
338 	for (i = 0; i < nsd->child_count; ++i) {
339 		if (nsd->children[i].pid == pid) {
340 			nsd->children[i].pid = 0;
341 			if(!nsd->children[i].need_to_exit) {
342 				if(nsd->children[i].child_fd != -1)
343 					close(nsd->children[i].child_fd);
344 				nsd->children[i].child_fd = -1;
345 				if(nsd->children[i].handler)
346 					nsd->children[i].handler->fd = -1;
347 			}
348 			return i;
349 		}
350 	}
351 	return -1;
352 }
353 
354 /*
355  * Restart child servers if necessary.
356  */
357 static int
358 restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
359 	int* xfrd_sock_p)
360 {
361 	struct main_ipc_handler_data *ipc_data;
362 	size_t i;
363 	int sv[2];
364 
365 	/* Fork the child processes... */
366 	for (i = 0; i < nsd->child_count; ++i) {
367 		if (nsd->children[i].pid <= 0) {
368 			if (nsd->children[i].child_fd != -1)
369 				close(nsd->children[i].child_fd);
370 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
371 				log_msg(LOG_ERR, "socketpair: %s",
372 					strerror(errno));
373 				return -1;
374 			}
375 			nsd->children[i].child_fd = sv[0];
376 			nsd->children[i].parent_fd = sv[1];
377 			nsd->children[i].pid = fork();
378 			switch (nsd->children[i].pid) {
379 			default: /* SERVER MAIN */
380 				close(nsd->children[i].parent_fd);
381 				nsd->children[i].parent_fd = -1;
382 				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
383 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
384 				}
385 				if(!nsd->children[i].handler)
386 				{
387 					ipc_data = (struct main_ipc_handler_data*) region_alloc(
388 						region, sizeof(struct main_ipc_handler_data));
389 					ipc_data->nsd = nsd;
390 					ipc_data->child = &nsd->children[i];
391 					ipc_data->child_num = i;
392 					ipc_data->xfrd_sock = xfrd_sock_p;
393 					ipc_data->packet = buffer_create(region, QIOBUFSZ);
394 					ipc_data->forward_mode = 0;
395 					ipc_data->got_bytes = 0;
396 					ipc_data->total_bytes = 0;
397 					ipc_data->acl_num = 0;
398 					nsd->children[i].handler = (struct netio_handler*) region_alloc(
399 						region, sizeof(struct netio_handler));
400 					nsd->children[i].handler->fd = nsd->children[i].child_fd;
401 					nsd->children[i].handler->timeout = NULL;
402 					nsd->children[i].handler->user_data = ipc_data;
403 					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
404 					nsd->children[i].handler->event_handler = parent_handle_child_command;
405 					netio_add_handler(netio, nsd->children[i].handler);
406 				}
407 				/* clear any ongoing ipc */
408 				ipc_data = (struct main_ipc_handler_data*)
409 					nsd->children[i].handler->user_data;
410 				ipc_data->forward_mode = 0;
411 				/* restart - update fd */
412 				nsd->children[i].handler->fd = nsd->children[i].child_fd;
413 				break;
414 			case 0: /* CHILD */
415 				/* the child need not be able to access the
416 				 * nsd.db file */
417 				namedb_close_udb(nsd->db);
418 #ifdef MEMCLEAN /* OS collects memory pages */
419 				region_destroy(region);
420 #endif
421 
422 				if (pledge("stdio rpath inet", NULL) == -1) {
423 					log_msg(LOG_ERR, "pledge");
424 					exit(1);
425 				}
426 
427 				nsd->pid = 0;
428 				nsd->child_count = 0;
429 				nsd->server_kind = nsd->children[i].kind;
430 				nsd->this_child = &nsd->children[i];
431 				nsd->this_child->child_num = i;
				/* remove signal flags inherited from parent;
				   the parent will handle them. */
434 				nsd->signal_hint_reload_hup = 0;
435 				nsd->signal_hint_reload = 0;
436 				nsd->signal_hint_child = 0;
437 				nsd->signal_hint_quit = 0;
438 				nsd->signal_hint_shutdown = 0;
439 				nsd->signal_hint_stats = 0;
440 				nsd->signal_hint_statsusr = 0;
441 				close(*xfrd_sock_p);
442 				close(nsd->this_child->child_fd);
443 				nsd->this_child->child_fd = -1;
444 				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
445 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
446 				}
447 				server_child(nsd);
448 				/* NOTREACH */
449 				exit(0);
450 			case -1:
451 				log_msg(LOG_ERR, "fork failed: %s",
452 					strerror(errno));
453 				return -1;
454 			}
455 		}
456 	}
457 	return 0;
458 }
459 
460 #ifdef BIND8_STATS
461 static void set_bind8_alarm(struct nsd* nsd)
462 {
463 	/* resync so that the next alarm is on the next whole minute */
464 	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
465 		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
466 }
467 #endif
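/*
 * Worked example for the resync above: with st.period == 60 and
 * time(NULL) % 60 == 42, alarm(60 - 42) fires after 18 seconds, i.e.
 * exactly on the next whole-minute boundary.
 */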
468 
469 /* set zone stat ids for zones initially read in */
470 static void
471 zonestatid_tree_set(struct nsd* nsd)
472 {
473 	struct radnode* n;
474 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
475 		zone_type* zone = (zone_type*)n->elem;
476 		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
477 	}
478 }
479 
480 #ifdef USE_ZONE_STATS
481 void
482 server_zonestat_alloc(struct nsd* nsd)
483 {
484 	size_t num = (nsd->options->zonestatnames->count==0?1:
485 			nsd->options->zonestatnames->count);
486 	size_t sz = sizeof(struct nsdst)*num;
487 	char tmpfile[256];
488 	uint8_t z = 0;
489 
490 	/* file names */
491 	nsd->zonestatfname[0] = 0;
492 	nsd->zonestatfname[1] = 0;
493 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
494 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
495 	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
496 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
497 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
498 	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);
499 
500 	/* file descriptors */
501 	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
502 	if(nsd->zonestatfd[0] == -1) {
503 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
504 			strerror(errno));
505 		exit(1);
506 	}
507 	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
509 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
510 			strerror(errno));
511 		close(nsd->zonestatfd[0]);
512 		unlink(nsd->zonestatfname[0]);
513 		exit(1);
514 	}
515 
516 #ifdef HAVE_MMAP
517 	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
518 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
519 			strerror(errno));
520 		exit(1);
521 	}
522 	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
523 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
524 			nsd->zonestatfname[0], strerror(errno));
525 		exit(1);
526 	}
527 	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
528 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
529 			strerror(errno));
530 		exit(1);
531 	}
532 	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
533 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
534 			nsd->zonestatfname[1], strerror(errno));
535 		exit(1);
536 	}
537 	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
538 		MAP_SHARED, nsd->zonestatfd[0], 0);
539 	if(nsd->zonestat[0] == MAP_FAILED) {
540 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
541 		unlink(nsd->zonestatfname[0]);
542 		unlink(nsd->zonestatfname[1]);
543 		exit(1);
544 	}
545 	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
546 		MAP_SHARED, nsd->zonestatfd[1], 0);
547 	if(nsd->zonestat[1] == MAP_FAILED) {
548 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
549 		unlink(nsd->zonestatfname[0]);
550 		unlink(nsd->zonestatfname[1]);
551 		exit(1);
552 	}
553 	memset(nsd->zonestat[0], 0, sz);
554 	memset(nsd->zonestat[1], 0, sz);
555 	nsd->zonestatsize[0] = num;
556 	nsd->zonestatsize[1] = num;
557 	nsd->zonestatdesired = num;
558 	nsd->zonestatsizenow = num;
559 	nsd->zonestatnow = nsd->zonestat[0];
560 #endif /* HAVE_MMAP */
561 }
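/*
 * Illustrative sketch (not compiled in) of the lseek+write idiom used
 * above: accessing an mmap'd regular file beyond end-of-file faults, so
 * the file is first grown to the mapping size by seeking to the last
 * byte and writing a single zero byte.  A hypothetical helper:
 */
#if 0
static int
extend_file_to(int fd, off_t sz)
{
	uint8_t z = 0;
	if(lseek(fd, sz-1, SEEK_SET) == -1)
		return -1;
	if(write(fd, &z, 1) == -1)
		return -1;
	return 0; /* file is now sz bytes long, safe to mmap(sz) */
}
#endif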
562 
563 void
564 zonestat_remap(struct nsd* nsd, int idx, size_t sz)
565 {
566 #ifdef HAVE_MMAP
567 #ifdef MREMAP_MAYMOVE
568 	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
569 		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
570 		MREMAP_MAYMOVE);
571 	if(nsd->zonestat[idx] == MAP_FAILED) {
572 		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
573 		exit(1);
574 	}
#else /* !MREMAP_MAYMOVE */
576 	if(msync(nsd->zonestat[idx],
577 		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
578 		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
579 	if(munmap(nsd->zonestat[idx],
580 		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
581 		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
582 	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
583 		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
584 	if(nsd->zonestat[idx] == MAP_FAILED) {
585 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
586 		exit(1);
587 	}
#endif /* MREMAP_MAYMOVE */
589 #endif /* HAVE_MMAP */
590 }
591 
/* realloc the zonestat array that is not currently in use, to match
 * the desired new size of the array (if applicable) */
594 void
595 server_zonestat_realloc(struct nsd* nsd)
596 {
597 #ifdef HAVE_MMAP
598 	uint8_t z = 0;
599 	size_t sz;
600 	int idx = 0; /* index of the zonestat array that is not in use */
601 	if(nsd->zonestatnow == nsd->zonestat[0])
602 		idx = 1;
603 	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
604 		return;
605 	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
606 	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
607 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
608 			strerror(errno));
609 		exit(1);
610 	}
611 	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
612 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
613 			nsd->zonestatfname[idx], strerror(errno));
614 		exit(1);
615 	}
616 	zonestat_remap(nsd, idx, sz);
617 	/* zero the newly allocated region */
618 	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
619 		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
620 			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
621 			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
622 	}
623 	nsd->zonestatsize[idx] = nsd->zonestatdesired;
624 #endif /* HAVE_MMAP */
625 }
626 
/* switch over to the other array for the new children, which briefly
 * coexist with the old children; we want to avoid them both writing
 * to the same statistics arrays. */
630 void
631 server_zonestat_switch(struct nsd* nsd)
632 {
633 	if(nsd->zonestatnow == nsd->zonestat[0]) {
634 		nsd->zonestatnow = nsd->zonestat[1];
635 		nsd->zonestatsizenow = nsd->zonestatsize[1];
636 	} else {
637 		nsd->zonestatnow = nsd->zonestat[0];
638 		nsd->zonestatsizenow = nsd->zonestatsize[0];
639 	}
640 }
641 #endif /* USE_ZONE_STATS */
642 
643 static void
644 cleanup_dname_compression_tables(void *ptr)
645 {
646 	free(ptr);
647 	compressed_dname_offsets = NULL;
648 	compression_table_capacity = 0;
649 }
650 
651 static void
652 initialize_dname_compression_tables(struct nsd *nsd)
653 {
654 	size_t needed = domain_table_count(nsd->db->domains) + 1;
655 	needed += EXTRA_DOMAIN_NUMBERS;
656 	if(compression_table_capacity < needed) {
657 		if(compressed_dname_offsets) {
658 			region_remove_cleanup(nsd->db->region,
659 				cleanup_dname_compression_tables,
660 				compressed_dname_offsets);
661 			free(compressed_dname_offsets);
662 		}
663 		compressed_dname_offsets = (uint16_t *) xmallocarray(
664 			needed, sizeof(uint16_t));
665 		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
666 			compressed_dname_offsets);
667 		compression_table_capacity = needed;
668 		compression_table_size=domain_table_count(nsd->db->domains)+1;
669 	}
670 	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
671 	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
672 }
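/*
 * Worked example: entry 0 of the table is the query name itself, which
 * on the wire starts right after the 12-octet DNS header (QHEADERSZ),
 * so a compression pointer to it encodes as 0xC00C.
 */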
673 
674 static int
675 set_reuseport(struct nsd_socket *sock)
676 {
677 #ifdef SO_REUSEPORT
678 	int on = 1;
679 #ifdef SO_REUSEPORT_LB
	/* FreeBSD 12 has SO_REUSEPORT_LB, which does load balancing like
	 * SO_REUSEPORT on Linux. This is what users want from the config
	 * option in nsd.conf; if they actually need local address and port
	 * reuse they will also need SO_REUSEPORT set separately, so assume
	 * it is _LB they want.
	 */
686 	int opt = SO_REUSEPORT_LB;
687 	static const char optname[] = "SO_REUSEPORT_LB";
688 #else /* !SO_REUSEPORT_LB */
689 	int opt = SO_REUSEPORT;
690 	static const char optname[] = "SO_REUSEPORT";
691 #endif /* SO_REUSEPORT_LB */
692 
693 	if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) {
694 		return 1;
695 	} else if(verbosity >= 3 || errno != ENOPROTOOPT) {
696 		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
697 			optname, strerror(errno));
698 	}
699 	return -1;
700 #else
701 	(void)sock;
702 #endif /* SO_REUSEPORT */
703 
704 	return 0;
705 }
706 
707 static int
708 set_reuseaddr(struct nsd_socket *sock)
709 {
710 #ifdef SO_REUSEADDR
711 	int on = 1;
712 	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
713 		return 1;
714 	}
715 	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
716 		strerror(errno));
717 	return -1;
718 #endif /* SO_REUSEADDR */
719 	return 0;
720 }
721 
722 static int
723 set_rcvbuf(struct nsd_socket *sock, int rcv)
724 {
725 #ifdef SO_RCVBUF
726 #ifdef SO_RCVBUFFORCE
727 	if(0 == setsockopt(
728 		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
729 	{
730 		return 1;
731 	}
732 	if(errno == EPERM || errno == ENOBUFS) {
733 		return 0;
734 	}
735 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
736 		strerror(errno));
737 	return -1;
738 #else /* !SO_RCVBUFFORCE */
739 	if (0 == setsockopt(
740 		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
741 	{
742 		return 1;
743 	}
744 	if(errno == ENOSYS || errno == ENOBUFS) {
745 		return 0;
746 	}
747 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
748 		strerror(errno));
749 	return -1;
750 #endif /* SO_RCVBUFFORCE */
751 #endif /* SO_RCVBUF */
752 
753 	return 0;
754 }
755 
756 static int
757 set_sndbuf(struct nsd_socket *sock, int snd)
758 {
759 #ifdef SO_SNDBUF
760 #ifdef SO_SNDBUFFORCE
761 	if(0 == setsockopt(
762 		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
763 	{
764 		return 1;
765 	}
766 	if(errno == EPERM || errno == ENOBUFS) {
767 		return 0;
768 	}
769 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
770 		strerror(errno));
771 	return -1;
772 #else /* !SO_SNDBUFFORCE */
773 	if(0 == setsockopt(
774 		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
775 	{
776 		return 1;
777 	}
778 	if(errno == ENOSYS || errno == ENOBUFS) {
779 		return 0;
780 	}
781 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
782 		strerror(errno));
783 	return -1;
784 #endif /* SO_SNDBUFFORCE */
785 #endif /* SO_SNDBUF */
786 
787 	return 0;
788 }
789 
790 static int
791 set_nonblock(struct nsd_socket *sock)
792 {
793 	const char *socktype =
794 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
795 
796 	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
		log_msg(LOG_ERR, "fcntl(..., O_NONBLOCK) failed for %s: %s",
798 			socktype, strerror(errno));
799 		return -1;
800 	}
801 
802 	return 1;
803 }
804 
805 static int
806 set_ipv6_v6only(struct nsd_socket *sock)
807 {
808 #ifdef INET6
809 #ifdef IPV6_V6ONLY
810 	int on = 1;
811 	const char *socktype =
812 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
813 
814 	if(0 == setsockopt(
815 		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
816 	{
817 		return 1;
818 	}
819 
820 	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
821 		socktype, strerror(errno));
822 	return -1;
823 #endif /* IPV6_V6ONLY */
824 #endif /* INET6 */
825 
826 	return 0;
827 }
828 
829 static int
830 set_ipv6_use_min_mtu(struct nsd_socket *sock)
831 {
832 #if defined(INET6) && (defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU))
833 #if defined(IPV6_USE_MIN_MTU)
834 	/* There is no fragmentation of IPv6 datagrams during forwarding in the
835 	 * network. Therefore we do not send UDP datagrams larger than the
836 	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
837 	 * larger if the network stack supports IPV6_USE_MIN_MTU.
838 	 */
839 	int opt = IPV6_USE_MIN_MTU;
840 	int optval = 1;
841 	static const char optname[] = "IPV6_USE_MIN_MTU";
842 #elif defined(IPV6_MTU)
	/* On Linux, PMTUD is disabled by default for datagrams, so set the
	 * MTU to the IPv6 minimum MTU to get the same behavior.
	 */
846 	int opt = IPV6_MTU;
847 	int optval = IPV6_MIN_MTU;
848 	static const char optname[] = "IPV6_MTU";
849 #endif
850 	if(0 == setsockopt(
851 		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
852 	{
853 		return 1;
854 	}
855 
856 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
857 		optname, strerror(errno));
858 	return -1;
859 #else
860 	(void)sock;
861 #endif /* INET6 */
862 
863 	return 0;
864 }
865 
866 static int
867 set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
868 {
869 	int ret = 0;
870 
871 #if defined(IP_MTU_DISCOVER)
872 	int opt = IP_MTU_DISCOVER;
873 	int optval;
874 # if defined(IP_PMTUDISC_OMIT)
	/* Linux 3.15 has IP_PMTUDISC_OMIT, which makes sockets ignore PMTU
	 * information and send packets with DF=0. Fragmentation is allowed
	 * if and only if the packet size exceeds the outgoing interface MTU
	 * or the packet encounters a smaller MTU link in the network. This
	 * mitigates DNS fragmentation attacks by preventing forged PMTU
	 * information. FreeBSD already has the same semantics without
	 * setting the option.
	 */
882 	optval = IP_PMTUDISC_OMIT;
883 	if(0 == setsockopt(
884 		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
885 	{
886 		return 1;
887 	}
888 
889 	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
890 		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
891 # endif /* IP_PMTUDISC_OMIT */
892 # if defined(IP_PMTUDISC_DONT)
893 	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */
894 	optval = IP_PMTUDISC_DONT;
895 	if(0 == setsockopt(
896 		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
897 	{
898 		return 1;
899 	}
900 
901 	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
902 		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
903 # endif
904 	ret = -1;
905 #elif defined(IP_DONTFRAG)
906 	int off = 0;
907 	if (0 == setsockopt(
908 		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
909 	{
910 		return 1;
911 	}
912 
913 	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
914 		strerror(errno));
915 	ret = -1;
916 #else
917 	(void)sock;
918 #endif
919 
920 	return ret;
921 }
922 
923 static int
924 set_ip_freebind(struct nsd_socket *sock)
925 {
926 #ifdef IP_FREEBIND
927 	int on = 1;
928 	const char *socktype =
929 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
930 	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
931 	{
932 		return 1;
933 	}
934 	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
935 		socktype, strerror(errno));
936 	return -1;
937 #else
938 	(void)sock;
939 #endif /* IP_FREEBIND */
940 
941 	return 0;
942 }
943 
944 static int
945 set_ip_transparent(struct nsd_socket *sock)
946 {
947 #if defined(IP_TRANSPARENT)
948 	int on = 1;
949 	const char *socktype =
950 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
951 	if(0 == setsockopt(
952 		sock->s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)))
953 	{
954 		return 1;
955 	}
956 
957 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
958 		"IP_TRANSPARENT", socktype, strerror(errno));
959 	return -1;
960 #elif defined(SO_BINDANY)
961 	int on = 1;
962 	const char *socktype =
963 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
964 	if(0 == setsockopt(
965 		sock->s, SOL_SOCKET, SO_BINDANY, &on, sizeof(on)))
966 	{
967 		return 1;
968 	}
969 
970 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
971 		"SO_BINDANY", socktype, strerror(errno));
972 	return -1;
973 #else
974 	(void)sock;
975 #endif
976 
977 	return 0;
978 }
979 
980 static int
981 set_tcp_maxseg(struct nsd_socket *sock, int mss)
982 {
983 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
984 	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
985 		return 1;
986 	}
987 	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
988 		strerror(errno));
989 	return -1;
990 #else
991 	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
992 #endif
993 	return 0;
994 }
995 
996 #ifdef USE_TCP_FASTOPEN
997 static int
998 set_tcp_fastopen(struct nsd_socket *sock)
999 {
1000 	/* qlen specifies how many outstanding TFO requests to allow. Limit is
1001 	 * a defense against IP spoofing attacks as suggested in RFC7413.
1002 	 */
1003 	int qlen;
1004 
1005 #ifdef __APPLE__
	/* The macOS implementation only supports a qlen of 1 via this call.
	 * The actual value is configured by the net.inet.tcp.fastopen_backlog
	 * kernel parameter.
	 */
1010 	qlen = 1;
1011 #else
1012 	/* 5 is recommended on Linux. */
1013 	qlen = 5;
1014 #endif
1015 	if (0 == setsockopt(
1016 		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
1017 	{
1018 		return 1;
1019 	}
1020 
1021 	if (errno == EPERM) {
1022 		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s "
1023 				 "; this could likely be because sysctl "
1024 				 "net.inet.tcp.fastopen.enabled, "
1025 				 "net.inet.tcp.fastopen.server_enable, or "
1026 				 "net.ipv4.tcp_fastopen is disabled",
1027 			strerror(errno));
1028 	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
1029 	 * disabled, except when verbosity enabled for debugging
1030 	 */
1031 	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
1032 		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
1033 			strerror(errno));
1034 	}
1035 
1036 	return (errno == ENOPROTOOPT ? 0 : -1);
1037 }
1038 #endif /* USE_TCP_FASTOPEN */
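/*
 * Illustrative sketch (not compiled in) of the client-side counterpart
 * to the TCP_FASTOPEN server option set above: on Linux a client can
 * carry data in the SYN via sendto(..., MSG_FASTOPEN, ...) instead of
 * connect() followed by write().  NSD is only the server side here;
 * this merely shows what the option enables.
 */
#if 0
static ssize_t
tfo_client_send(int fd, const void *buf, size_t len,
	const struct sockaddr *addr, socklen_t addrlen)
{
	/* sends a SYN carrying data; the kernel falls back to a regular
	 * handshake if no valid TFO cookie is cached for this server */
	return sendto(fd, buf, len, MSG_FASTOPEN, addr, addrlen);
}
#endif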
1039 
1040 static int
1041 open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1042 {
1043 	int rcv = 1*1024*1024, snd = 1*1024*1024;
1044 
1045 	if(-1 == (sock->s = socket(
1046 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1047 	{
1048 #ifdef INET6
1049 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1050 		   (sock->addr.ai_family == AF_INET6) &&
1051 		   (errno == EAFNOSUPPORT))
1052 		{
1053 			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
1054 				"not supported");
1055 			return 0;
1056 		}
1057 #endif
1058 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1059 		return -1;
1060 	}
1061 
1062 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1063 		*reuseport_works = (set_reuseport(sock) == 1);
1064 
1065 	if(nsd->options->receive_buffer_size > 0)
1066 		rcv = nsd->options->receive_buffer_size;
1067 	if(set_rcvbuf(sock, rcv) == -1)
1068 		return -1;
1069 
1070 	if(nsd->options->send_buffer_size > 0)
1071 		snd = nsd->options->send_buffer_size;
1072 	if(set_sndbuf(sock, snd) == -1)
1073 		return -1;
1074 #ifdef INET6
1075 	if(sock->addr.ai_family == AF_INET6) {
1076 		if(set_ipv6_v6only(sock) == -1 ||
1077 		   set_ipv6_use_min_mtu(sock) == -1)
1078 			return -1;
1079 	} else
1080 #endif /* INET6 */
1081 	if(sock->addr.ai_family == AF_INET) {
1082 		if(set_ipv4_no_pmtu_disc(sock) == -1)
1083 			return -1;
1084 	}
1085 
1086 	/* Set socket to non-blocking. Otherwise, on operating systems
1087 	 * with thundering herd problems, the UDP recv could block
1088 	 * after select returns readable.
1089 	 */
1090 	set_nonblock(sock);
1091 
1092 	if(nsd->options->ip_freebind)
1093 		(void)set_ip_freebind(sock);
1094 	if(nsd->options->ip_transparent)
1095 		(void)set_ip_transparent(sock);
1096 
1097 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1098 		char buf[256];
1099 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1100 		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
1101 			buf, strerror(errno));
1102 		return -1;
1103 	}
1104 
1105 	return 1;
1106 }
1107 
1108 static int
1109 open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1110 {
1111 #ifdef USE_TCP_FASTOPEN
1112 	report_tcp_fastopen_config();
1113 #endif
1114 
1115 	(void)reuseport_works;
1116 
1117 	if(-1 == (sock->s = socket(
1118 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1119 	{
1120 #ifdef INET6
1121 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1122 		   (sock->addr.ai_family == AF_INET6) &&
1123 		   (errno == EAFNOSUPPORT))
1124 		{
1125 			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
1126 			                     "not supported");
1127 			return 0;
1128 		}
1129 #endif /* INET6 */
1130 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1131 		return -1;
1132 	}
1133 
1134 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1135 		*reuseport_works = (set_reuseport(sock) == 1);
1136 
1137 	(void)set_reuseaddr(sock);
1138 
1139 #ifdef INET6
1140 	if(sock->addr.ai_family == AF_INET6) {
1141 		if (set_ipv6_v6only(sock) == -1 ||
1142 		    set_ipv6_use_min_mtu(sock) == -1)
1143 			return -1;
1144 	}
1145 #endif
1146 
1147 	if(nsd->tcp_mss > 0)
1148 		set_tcp_maxseg(sock, nsd->tcp_mss);
	/* (StevensUNP p463), if the TCP listening socket is blocking, then
	   it may block in accept, even if select() says readable. */
1151 	(void)set_nonblock(sock);
1152 	if(nsd->options->ip_freebind)
1153 		(void)set_ip_freebind(sock);
1154 	if(nsd->options->ip_transparent)
1155 		(void)set_ip_transparent(sock);
1156 
1157 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1158 		char buf[256];
1159 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1160 		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
1161 			buf, strerror(errno));
1162 		return -1;
1163 	}
1164 
1165 #ifdef USE_TCP_FASTOPEN
1166 	(void)set_tcp_fastopen(sock);
1167 #endif
1168 
1169 	if(listen(sock->s, TCP_BACKLOG) == -1) {
1170 		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
1171 		return -1;
1172 	}
1173 
1174 	return 1;
1175 }
1176 
1177 /*
1178  * Initialize the server, reuseport, create and bind the sockets.
1179  */
1180 int
1181 server_init(struct nsd *nsd)
1182 {
1183 	size_t i;
1184 	int reuseport = 1; /* Determine if REUSEPORT works. */
1185 
1186 	/* open server interface ports */
1187 	for(i = 0; i < nsd->ifs; i++) {
1188 		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
1189 		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
1190 		{
1191 			return -1;
1192 		}
1193 	}
1194 
1195 	if(nsd->reuseport && reuseport) {
1196 		size_t ifs = nsd->ifs * nsd->reuseport;
1197 
1198 		/* increase the size of the interface arrays, there are going
1199 		 * to be separate interface file descriptors for every server
1200 		 * instance */
1201 		region_remove_cleanup(nsd->region, free, nsd->udp);
1202 		region_remove_cleanup(nsd->region, free, nsd->tcp);
1203 		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
1204 		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
1205 		region_add_cleanup(nsd->region, free, nsd->udp);
1206 		region_add_cleanup(nsd->region, free, nsd->tcp);
1207 
1208 		for(i = nsd->ifs; i < ifs; i++) {
1209 			nsd->udp[i].addr = nsd->udp[i%nsd->ifs].addr;
1210 			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
1211 				return -1;
1212 			}
1213 			/* Turn off REUSEPORT for TCP by copying the socket
1214 			 * file descriptor.
1215 			 */
1216 			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
1217 		}
1218 
1219 		nsd->ifs = ifs;
1220 	} else {
1221 		nsd->reuseport = 0;
1222 	}
1223 
1224 	return 0;
1225 }
1226 
1227 /*
1228  * Prepare the server for take off.
1229  *
1230  */
1231 int
1232 server_prepare(struct nsd *nsd)
1233 {
1234 #ifdef RATELIMIT
1235 	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
1236 #ifdef HAVE_ARC4RANDOM
1237 	hash_set_raninit(arc4random());
1238 #else
1239 	uint32_t v = getpid() ^ time(NULL);
1240 	srandom((unsigned long)v);
1241 #  ifdef HAVE_SSL
1242 	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
1243 		hash_set_raninit(v);
1244 	else
1245 #  endif
1246 		hash_set_raninit(random());
1247 #endif
1248 	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
1249 		nsd->options->rrl_ratelimit,
1250 		nsd->options->rrl_whitelist_ratelimit,
1251 		nsd->options->rrl_slip,
1252 		nsd->options->rrl_ipv4_prefix_length,
1253 		nsd->options->rrl_ipv6_prefix_length);
1254 #endif /* RATELIMIT */
1255 
1256 	/* Open the database... */
1257 	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
1258 		log_msg(LOG_ERR, "unable to open the database %s: %s",
1259 			nsd->dbfile, strerror(errno));
1260 		unlink(nsd->task[0]->fname);
1261 		unlink(nsd->task[1]->fname);
1262 #ifdef USE_ZONE_STATS
1263 		unlink(nsd->zonestatfname[0]);
1264 		unlink(nsd->zonestatfname[1]);
1265 #endif
1266 		xfrd_del_tempdir(nsd);
1267 		return -1;
1268 	}
1269 	/* check if zone files have been modified */
1270 	/* NULL for taskudb because we send soainfo in a moment, batched up,
1271 	 * for all zones */
1272 	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
1273 		nsd->options->database[0] == 0))
1274 		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
1275 	zonestatid_tree_set(nsd);
1276 
1277 	compression_table_capacity = 0;
1278 	initialize_dname_compression_tables(nsd);
1279 
1280 #ifdef	BIND8_STATS
1281 	/* Initialize times... */
1282 	time(&nsd->st.boot);
1283 	set_bind8_alarm(nsd);
1284 #endif /* BIND8_STATS */
1285 
1286 	return 0;
1287 }
1288 
1289 /*
1290  * Fork the required number of servers.
1291  */
1292 static int
1293 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
1294 	int* xfrd_sock_p)
1295 {
1296 	size_t i;
1297 
1298 	/* Start all child servers initially.  */
1299 	for (i = 0; i < nsd->child_count; ++i) {
1300 		nsd->children[i].pid = 0;
1301 	}
1302 
1303 	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
1304 }
1305 
1306 void
1307 server_close_all_sockets(struct nsd_socket sockets[], size_t n)
1308 {
1309 	size_t i;
1310 
1311 	/* Close all the sockets... */
1312 	for (i = 0; i < n; ++i) {
1313 		if (sockets[i].s != -1) {
1314 			close(sockets[i].s);
1315 			sockets[i].s = -1;
1316 		}
1317 	}
1318 }
1319 
1320 /*
1321  * Close the sockets, shutdown the server and exit.
1322  * Does not return.
1323  */
1324 void
1325 server_shutdown(struct nsd *nsd)
1326 {
1327 	size_t i;
1328 
1329 	server_close_all_sockets(nsd->udp, nsd->ifs);
1330 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1331 	/* CHILD: close command channel to parent */
1332 	if(nsd->this_child && nsd->this_child->parent_fd != -1)
1333 	{
1334 		close(nsd->this_child->parent_fd);
1335 		nsd->this_child->parent_fd = -1;
1336 	}
1337 	/* SERVER: close command channels to children */
1338 	if(!nsd->this_child)
1339 	{
1340 		for(i=0; i < nsd->child_count; ++i)
1341 			if(nsd->children[i].child_fd != -1)
1342 			{
1343 				close(nsd->children[i].child_fd);
1344 				nsd->children[i].child_fd = -1;
1345 			}
1346 	}
1347 
1348 	tsig_finalize();
1349 #ifdef HAVE_SSL
1350 	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
1351 	if (nsd->tls_ctx)
1352 		SSL_CTX_free(nsd->tls_ctx);
1353 #endif
1354 
1355 #ifdef MEMCLEAN /* OS collects memory pages */
1356 #ifdef RATELIMIT
1357 	rrl_mmap_deinit_keep_mmap();
1358 #endif
1359 #ifdef USE_DNSTAP
1360 	dt_collector_destroy(nsd->dt_collector, nsd);
1361 #endif
1362 	udb_base_free_keep_mmap(nsd->task[0]);
1363 	udb_base_free_keep_mmap(nsd->task[1]);
1364 	namedb_close_udb(nsd->db); /* keeps mmap */
1365 	namedb_close(nsd->db);
1366 	nsd_options_destroy(nsd->options);
1367 	region_destroy(nsd->region);
1368 #endif
1369 	log_finalize();
1370 	exit(0);
1371 }
1372 
1373 void
1374 server_prepare_xfrd(struct nsd* nsd)
1375 {
1376 	char tmpfile[256];
1377 	/* create task mmaps */
1378 	nsd->mytask = 0;
1379 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
1380 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1381 	nsd->task[0] = task_file_create(tmpfile);
1382 	if(!nsd->task[0]) {
1383 #ifdef USE_ZONE_STATS
1384 		unlink(nsd->zonestatfname[0]);
1385 		unlink(nsd->zonestatfname[1]);
1386 #endif
1387 		xfrd_del_tempdir(nsd);
1388 		exit(1);
1389 	}
1390 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
1391 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1392 	nsd->task[1] = task_file_create(tmpfile);
1393 	if(!nsd->task[1]) {
1394 		unlink(nsd->task[0]->fname);
1395 #ifdef USE_ZONE_STATS
1396 		unlink(nsd->zonestatfname[0]);
1397 		unlink(nsd->zonestatfname[1]);
1398 #endif
1399 		xfrd_del_tempdir(nsd);
1400 		exit(1);
1401 	}
1402 	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
1403 	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
1404 	/* create xfrd listener structure */
1405 	nsd->xfrd_listener = region_alloc(nsd->region,
1406 		sizeof(netio_handler_type));
1407 	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
1408 		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
1409 	nsd->xfrd_listener->fd = -1;
1410 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
1411 		nsd;
1412 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
1413 		xfrd_tcp_create(nsd->region, QIOBUFSZ);
1414 }
1415 
1416 
1417 void
1418 server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
1419 {
1420 	pid_t pid;
1421 	int sockets[2] = {0,0};
1422 	struct ipc_handler_conn_data *data;
1423 
1424 	if(nsd->xfrd_listener->fd != -1)
1425 		close(nsd->xfrd_listener->fd);
1426 	if(del_db) {
1427 		/* recreate taskdb that xfrd was using, it may be corrupt */
1428 		/* we (or reload) use nsd->mytask, and xfrd uses the other */
1429 		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
1430 		nsd->task[1-nsd->mytask]->fname = NULL;
1431 		/* free alloc already, so udb does not shrink itself */
1432 		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
1433 		nsd->task[1-nsd->mytask]->alloc = NULL;
1434 		udb_base_free(nsd->task[1-nsd->mytask]);
1435 		/* create new file, overwrite the old one */
1436 		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
1437 		free(tmpfile);
1438 	}
1439 	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
1440 		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
1441 		return;
1442 	}
1443 	pid = fork();
1444 	switch (pid) {
1445 	case -1:
1446 		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
1447 		break;
1448 	default:
1449 		/* PARENT: close first socket, use second one */
1450 		close(sockets[0]);
1451 		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
1452 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1453 		}
1454 		if(del_db) xfrd_free_namedb(nsd);
1455 		/* use other task than I am using, since if xfrd died and is
1456 		 * restarted, the reload is using nsd->mytask */
1457 		nsd->mytask = 1 - nsd->mytask;
1458 		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
1459 		/* ENOTREACH */
1460 		break;
1461 	case 0:
1462 		/* CHILD: close second socket, use first one */
1463 		close(sockets[1]);
1464 		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
1465 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1466 		}
1467 		nsd->xfrd_listener->fd = sockets[0];
1468 		break;
1469 	}
1470 	/* server-parent only */
1471 	nsd->xfrd_listener->timeout = NULL;
1472 	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
1473 	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
1474 	/* clear ongoing ipc reads */
1475 	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
1476 	data->conn->is_reading = 0;
1477 }
1478 
1479 /** add all soainfo to taskdb */
1480 static void
1481 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
1482 {
1483 	struct radnode* n;
1484 	udb_ptr task_last; /* last task, mytask is empty so NULL */
1485 	/* add all SOA INFO to mytask */
1486 	udb_ptr_init(&task_last, taskudb);
1487 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
1488 		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
1489 	}
1490 	udb_ptr_unlink(&task_last, taskudb);
1491 }
1492 
1493 void
1494 server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
1495 {
1496 	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
1497 	 *   parent fills one taskdb with soas, xfrd fills other with expires.
1498 	 *   then they exchange and process.
1499 	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
1500 	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
1501 	 *   expire notifications can be sent back via a normal reload later
1502 	 *   (xfrd will wait for current running reload to finish if any).
1503 	 */
1504 	sig_atomic_t cmd = 0;
1505 	pid_t mypid;
1506 	int xfrd_sock = nsd->xfrd_listener->fd;
1507 	struct udb_base* taskudb = nsd->task[nsd->mytask];
1508 	udb_ptr t;
1509 	if(!shortsoa) {
1510 		if(nsd->signal_hint_shutdown) {
1511 		shutdown:
1512 			log_msg(LOG_WARNING, "signal received, shutting down...");
1513 			server_close_all_sockets(nsd->udp, nsd->ifs);
1514 			server_close_all_sockets(nsd->tcp, nsd->ifs);
1515 #ifdef HAVE_SSL
1516 			daemon_remote_close(nsd->rc);
1517 #endif
1518 			/* Unlink it if possible... */
1519 			unlinkpid(nsd->pidfile);
1520 			unlink(nsd->task[0]->fname);
1521 			unlink(nsd->task[1]->fname);
1522 #ifdef USE_ZONE_STATS
1523 			unlink(nsd->zonestatfname[0]);
1524 			unlink(nsd->zonestatfname[1]);
1525 #endif
1526 			/* write the nsd.db to disk, wait for it to complete */
1527 			udb_base_sync(nsd->db->udb, 1);
1528 			udb_base_close(nsd->db->udb);
1529 			server_shutdown(nsd);
1530 			exit(0);
1531 		}
1532 	}
1533 	if(shortsoa) {
1534 		/* put SOA in xfrd task because mytask may be in use */
1535 		taskudb = nsd->task[1-nsd->mytask];
1536 	}
1537 
1538 	add_all_soa_to_task(nsd, taskudb);
1539 	if(!shortsoa) {
1540 		/* wait for xfrd to signal task is ready, RELOAD signal */
1541 		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
1542 			cmd != NSD_RELOAD) {
1543 			log_msg(LOG_ERR, "did not get start signal from xfrd");
1544 			exit(1);
1545 		}
1546 		if(nsd->signal_hint_shutdown) {
1547 			goto shutdown;
1548 		}
1549 	}
1550 	/* give xfrd our task, signal it with RELOAD_DONE */
1551 	task_process_sync(taskudb);
1552 	cmd = NSD_RELOAD_DONE;
1553 	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1554 		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1555 			(int)nsd->pid, strerror(errno));
1556 	}
1557 	mypid = getpid();
1558 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1559 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1560 			strerror(errno));
1561 	}
1562 
1563 	if(!shortsoa) {
1564 		/* process the xfrd task works (expiry data) */
1565 		nsd->mytask = 1 - nsd->mytask;
1566 		taskudb = nsd->task[nsd->mytask];
1567 		task_remap(taskudb);
1568 		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
1569 		while(!udb_ptr_is_null(&t)) {
1570 			task_process_expire(nsd->db, TASKLIST(&t));
1571 			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
1572 		}
1573 		udb_ptr_unlink(&t, taskudb);
1574 		task_clear(taskudb);
1575 
1576 		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
1577 		cmd = NSD_RELOAD_DONE;
1578 		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1579 			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1580 				(int)nsd->pid, strerror(errno));
1581 		}
1582 	}
1583 }
1584 
1585 #ifdef HAVE_SSL
1586 static void
1587 log_crypto_from_err(const char* str, unsigned long err)
1588 {
1589 	/* error:[error code]:[library name]:[function name]:[reason string] */
1590 	char buf[128];
1591 	unsigned long e;
1592 	ERR_error_string_n(err, buf, sizeof(buf));
1593 	log_msg(LOG_ERR, "%s crypto %s", str, buf);
1594 	while( (e=ERR_get_error()) ) {
1595 		ERR_error_string_n(e, buf, sizeof(buf));
1596 		log_msg(LOG_ERR, "and additionally crypto %s", buf);
1597 	}
1598 }
1599 
1600 void
1601 log_crypto_err(const char* str)
1602 {
1603 	log_crypto_from_err(str, ERR_get_error());
1604 }
1605 
1606 /** true if the ssl handshake error has to be squelched from the logs */
1607 static int
1608 squelch_err_ssl_handshake(unsigned long err)
1609 {
1610 	if(verbosity >= 3)
1611 		return 0; /* only squelch on low verbosity */
1612 	/* this is very specific, we could filter on ERR_GET_REASON()
1613 	 * (the third element in ERR_PACK) */
1614 	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
1615 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
1616 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
1617 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
1618 #ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
1619 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
1620 #endif
1621 #ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
1622 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
1623 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
1624 #  ifdef SSL_R_VERSION_TOO_LOW
1625 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
1626 #  endif
1627 #endif
1628 		)
1629 		return 1;
1630 	return 0;
1631 }
1632 
1633 void
1634 perform_openssl_init(void)
1635 {
1636 	/* init SSL library */
1637 #ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
1638 	ERR_load_crypto_strings();
1639 #endif
1640 	ERR_load_SSL_strings();
1641 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
1642 	OpenSSL_add_all_algorithms();
1643 #else
1644 	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
1645 		| OPENSSL_INIT_ADD_ALL_DIGESTS
1646 		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
1647 #endif
1648 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
1649 	(void)SSL_library_init();
1650 #else
1651 	OPENSSL_init_ssl(0, NULL);
1652 #endif
1653 
1654 	if(!RAND_status()) {
1655 		/* try to seed it */
1656 		unsigned char buf[256];
1657 		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
1658 		size_t i;
1659 		v = seed;
1660 		for(i=0; i<256/sizeof(v); i++) {
1661 			memmove(buf+i*sizeof(v), &v, sizeof(v));
1662 			v = v*seed + (unsigned int)i;
1663 		}
1664 		RAND_seed(buf, 256);
1665 		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
1666 	}
1667 }
1668 
1669 static int
1670 get_ocsp(char *filename, unsigned char **ocsp)
1671 {
1672 	BIO *bio;
1673 	OCSP_RESPONSE *response;
1674 	int len = -1;
1675 	unsigned char *p, *buf;
1676 	assert(filename);
1677 
1678 	if ((bio = BIO_new_file(filename, "r")) == NULL) {
1679 		log_crypto_err("get_ocsp: BIO_new_file failed");
1680 		return -1;
1681 	}
1682 
1683 	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
1684 		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
1685 		BIO_free(bio);
1686 		return -1;
1687 	}
1688 
1689 	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
1690 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
1691 		OCSP_RESPONSE_free(response);
1692 		BIO_free(bio);
1693 		return -1;
1694 	}
1695 
1696 	if ((buf = malloc((size_t) len)) == NULL) {
1697 		log_msg(LOG_ERR, "get_ocsp: malloc failed");
1698 		OCSP_RESPONSE_free(response);
1699 		BIO_free(bio);
1700 		return -1;
1701 	}
1702 
1703 	p = buf;
1704 	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
1705 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
1706 		free(buf);
1707 		OCSP_RESPONSE_free(response);
1708 		BIO_free(bio);
1709 		return -1;
1710 	}
1711 
1712 	OCSP_RESPONSE_free(response);
1713 	BIO_free(bio);
1714 
1715 	*ocsp = buf;
1716 	return len;
1717 }
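/*
 * Note on the idiom above: i2d_OCSP_RESPONSE() is called twice, first
 * with a NULL output pointer so it only returns the encoding length,
 * then with a pointer into the allocated buffer to do the actual DER
 * encoding.  The second call advances the pointer it is given, which is
 * why the copy p is passed rather than buf itself.
 */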
1718 
1719 /* further setup ssl ctx after the keys are loaded */
1720 static void
1721 listen_sslctx_setup_2(void* ctxt)
1722 {
1723 	SSL_CTX* ctx = (SSL_CTX*)ctxt;
1724 	(void)ctx;
1725 #if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
1726 	if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
1727 		log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE");
1728 	}
1729 #elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
1730 	if(1) {
1731 		EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
1732 		if (!ecdh) {
1733 			log_crypto_err("could not find p256, not enabling ECDHE");
1734 		} else {
1735 			if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
1736 				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
1737 			}
1738 			EC_KEY_free (ecdh);
1739 		}
1740 	}
1741 #endif
1742 }
1743 
1744 static int
1745 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
1746 {
1747 	if(ocspdata) {
1748 		unsigned char *p;
1749 		if ((p=malloc(ocspdata_len)) == NULL) {
1750 			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
1751 			return SSL_TLSEXT_ERR_NOACK;
1752 		}
1753 		memcpy(p, ocspdata, ocspdata_len);
1754 		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
1755 			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
1756 			free(p);
1757 			return SSL_TLSEXT_ERR_NOACK;
1758 		}
1759 		return SSL_TLSEXT_ERR_OK;
1760 	} else {
1761 		return SSL_TLSEXT_ERR_NOACK;
1762 	}
1763 }
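/*
 * Illustrative sketch (not compiled in): a status callback like the one
 * above is registered on a server SSL_CTX with
 * SSL_CTX_set_tlsext_status_cb(), e.g.:
 */
#if 0
	SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb);
	SSL_CTX_set_tlsext_status_arg(ctx, NULL);
#endif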
1764 
1765 SSL_CTX*
1766 server_tls_ctx_setup(char* key, char* pem, char* verifypem)
1767 {
1768 	SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method());
1769 	if(!ctx) {
1770 		log_crypto_err("could not SSL_CTX_new");
1771 		return NULL;
1772 	}
	/* no SSLv2, SSLv3 because they have defects */
1774 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){
1775 		log_crypto_err("could not set SSL_OP_NO_SSLv2");
1776 		SSL_CTX_free(ctx);
1777 		return NULL;
1778 	}
1779 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3)
1780 		!= SSL_OP_NO_SSLv3){
1781 		log_crypto_err("could not set SSL_OP_NO_SSLv3");
1782 		SSL_CTX_free(ctx);
1783 		return NULL;
1784 	}
1785 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1)
1786 	/* if we have tls 1.1 disable 1.0 */
1787 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1)
1788 		!= SSL_OP_NO_TLSv1){
1789 		log_crypto_err("could not set SSL_OP_NO_TLSv1");
1790 		SSL_CTX_free(ctx);
1791 		return NULL;
1792 	}
1793 #endif
1794 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2)
1795 	/* if we have tls 1.2 disable 1.1 */
1796 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1)
1797 		!= SSL_OP_NO_TLSv1_1){
1798 		log_crypto_err("could not set SSL_OP_NO_TLSv1_1");
1799 		SSL_CTX_free(ctx);
1800 		return NULL;
1801 	}
1802 #endif
1803 #if defined(SSL_OP_NO_RENEGOTIATION)
1804 	/* disable client renegotiation */
1805 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) &
1806 		SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) {
1807 		log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION");
1808 		SSL_CTX_free(ctx);
1809 		return NULL;
1810 	}
1811 #endif
1812 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20)
1813 	/* if we have sha256 and chacha20, set the cipher list to ciphers with no known vulnerabilities */
1814 	if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20"))
1815 		log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list");
1816 #endif
1817 	if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) &
1818 		SSL_OP_CIPHER_SERVER_PREFERENCE) !=
1819 		SSL_OP_CIPHER_SERVER_PREFERENCE) {
1820 		log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE");
1821 		SSL_CTX_free(ctx);
1822 		return NULL;
1823 	}
1824 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL
1825 	SSL_CTX_set_security_level(ctx, 0);
1826 #endif
1827 	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
1828 		log_msg(LOG_ERR, "error for cert file: %s", pem);
1829 		log_crypto_err("error in SSL_CTX use_certificate_chain_file");
1830 		SSL_CTX_free(ctx);
1831 		return NULL;
1832 	}
1833 	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
1834 		log_msg(LOG_ERR, "error for private key file: %s", key);
1835 		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
1836 		SSL_CTX_free(ctx);
1837 		return NULL;
1838 	}
1839 	if(!SSL_CTX_check_private_key(ctx)) {
1840 		log_msg(LOG_ERR, "error for key file: %s", key);
1841 		log_crypto_err("Error in SSL_CTX check_private_key");
1842 		SSL_CTX_free(ctx);
1843 		return NULL;
1844 	}
1845 	listen_sslctx_setup_2(ctx);
1846 	if(verifypem && verifypem[0]) {
1847 		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
1848 			log_crypto_err("Error in SSL_CTX verify locations");
1849 			SSL_CTX_free(ctx);
1850 			return NULL;
1851 		}
1852 		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
1853 		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
1854 	}
1855 	return ctx;
1856 }
1857 
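/*
 * Create the server TLS context from the configured tls-service-key and
 * tls-service-pem files; if ocspfile is given, also load the OCSP
 * staple and install the status callback.  For illustration, options
 * such as the following (paths are examples only) exercise this path:
 *
 *   server:
 *     tls-service-key: "/etc/nsd/tls.key"
 *     tls-service-pem: "/etc/nsd/tls.pem"
 *     tls-port: 853
 */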
1858 SSL_CTX*
1859 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
1860 {
1861 	char *key, *pem;
1862 	SSL_CTX *ctx;
1863 
1864 	key = nsd->options->tls_service_key;
1865 	pem = nsd->options->tls_service_pem;
1866 	if(!key || key[0] == 0) {
1867 		log_msg(LOG_ERR, "error: no tls-service-key file specified");
1868 		return NULL;
1869 	}
1870 	if(!pem || pem[0] == 0) {
1871 		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
1872 		return NULL;
1873 	}
1874 
1875 	/* NOTE: this mimics the existing code in Unbound 1.5.1 by supporting
1876 	 * SSL, but draft-ietf-uta-tls-bcp-08 recommends using only TLSv1.2 */
1877 	ctx = server_tls_ctx_setup(key, pem, verifypem);
1878 	if(!ctx) {
1879 		log_msg(LOG_ERR, "could not setup server TLS context");
1880 		return NULL;
1881 	}
1882 	if(ocspfile && ocspfile[0]) {
1883 		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
1884 			log_crypto_err("Error reading OCSPfile");
1885 			SSL_CTX_free(ctx);
1886 			return NULL;
1887 		} else {
1888 			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
1889 			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
1890 				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
1891 				SSL_CTX_free(ctx);
1892 				return NULL;
1893 			}
1894 		}
1895 	}
1896 	return ctx;
1897 }
1898 
1899 /* check whether an address uses the dedicated TLS port, so its tcp accept handler data is set up for TLS */
1900 int
1901 using_tls_port(struct sockaddr* addr, const char* tls_port)
1902 {
1903 	in_port_t port = 0;
1904 
1905 	if (addr->sa_family == AF_INET)
1906 		port = ((struct sockaddr_in*)addr)->sin_port;
1907 #ifdef HAVE_STRUCT_SOCKADDR_IN6
1908 	else
1909 		port = ((struct sockaddr_in6*)addr)->sin6_port;
1910 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */
1911 	if (atoi(tls_port) == ntohs(port))
1912 		return 1;
1913 
1914 	return 0;
1915 }
1916 #endif
1917 
1918 /* pass timeout=-1 for blocking. Returns size, 0 (EOF), -1 (error), or -2 (timeout) */
1919 ssize_t
1920 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
1921 {
1922 	uint8_t* buf = (uint8_t*) p;
1923 	ssize_t total = 0;
1924 	struct pollfd fd;
1925 	memset(&fd, 0, sizeof(fd));
1926 	fd.fd = s;
1927 	fd.events = POLLIN;
1928 
1929 	while(total < sz) {
1930 		ssize_t ret;
1931 		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
1932 		if(ret == -1) {
1933 			if(errno == EAGAIN)
1934 				/* blocking read */
1935 				continue;
1936 			if(errno == EINTR) {
1937 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
1938 					return -1;
1939 				/* other signals can be handled later */
1940 				continue;
1941 			}
1942 			/* some error */
1943 			return -1;
1944 		}
1945 		if(ret == 0) {
1946 			/* operation timed out */
1947 			return -2;
1948 		}
1949 		ret = read(s, buf+total, sz-total);
1950 		if(ret == -1) {
1951 			if(errno == EAGAIN)
1952 				/* blocking read */
1953 				continue;
1954 			if(errno == EINTR) {
1955 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
1956 					return -1;
1957 				/* other signals can be handled later */
1958 				continue;
1959 			}
1960 			/* some error */
1961 			return -1;
1962 		}
1963 		if(ret == 0) {
1964 			/* closed connection! */
1965 			return 0;
1966 		}
1967 		total += ret;
1968 	}
1969 	return total;
1970 }
1971 
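/*
 * Walk the task list that xfrd handed over and process each task,
 * appending results after last_task.  Between tasks the command socket
 * is polled, so an NSD_QUIT from the parent aborts the reload and
 * cleans up the transfer files of the remaining tasks.
 */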
1972 static void
1973 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
1974 {
1975 	sig_atomic_t cmd = NSD_QUIT_SYNC;
1976 	udb_ptr t, next;
1977 	udb_base* u = nsd->task[nsd->mytask];
1978 	udb_ptr_init(&next, u);
1979 	udb_ptr_new(&t, u, udb_base_get_userdata(u));
1980 	udb_base_set_userdata(u, 0);
1981 	while(!udb_ptr_is_null(&t)) {
1982 		/* store next in list so this one can be deleted or reused */
1983 		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
1984 		udb_rptr_zero(&TASKLIST(&t)->next, u);
1985 
1986 		/* process task t */
1987 		/* append results for task t and update last_task */
1988 		task_process_in_reload(nsd, u, last_task, &t);
1989 
1990 		/* go to next */
1991 		udb_ptr_set_ptr(&t, u, &next);
1992 
1993 		/* if the parent has quit, we must quit too, poll the fd for cmds */
1994 		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
1995 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
1996 			if(cmd == NSD_QUIT) {
1997 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
1998 				/* sync to disk (if needed) */
1999 				udb_base_sync(nsd->db->udb, 0);
2000 				/* unlink files of remainder of tasks */
2001 				while(!udb_ptr_is_null(&t)) {
2002 					if(TASKLIST(&t)->task_type == task_apply_xfr) {
2003 						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
2004 					}
2005 					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
2006 				}
2007 				udb_ptr_unlink(&t, u);
2008 				udb_ptr_unlink(&next, u);
2009 				exit(0);
2010 			}
2011 		}
2012 
2013 	}
2014 	udb_ptr_unlink(&t, u);
2015 	udb_ptr_unlink(&next, u);
2016 }
2017 
2018 #ifdef BIND8_STATS
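/* write our statistics, then the per-child query counters, over the
 * command socket to the reload process */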
2019 static void
2020 parent_send_stats(struct nsd* nsd, int cmdfd)
2021 {
2022 	size_t i;
2023 	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
2024 		log_msg(LOG_ERR, "could not write stats to reload");
2025 		return;
2026 	}
2027 	for(i=0; i<nsd->child_count; i++)
2028 		if(!write_socket(cmdfd, &nsd->children[i].query_count,
2029 			sizeof(stc_type))) {
2030 			log_msg(LOG_ERR, "could not write stats to reload");
2031 			return;
2032 		}
2033 }
2034 
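/* read the statistics sent by the old parent, add the database sizes,
 * and store the result in the task udb for xfrd to pick up */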
2035 static void
2036 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
2037 {
2038 	struct nsdst s;
2039 	stc_type* p;
2040 	size_t i;
2041 	if(block_read(nsd, cmdfd, &s, sizeof(s),
2042 		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
2043 		log_msg(LOG_ERR, "could not read stats from oldpar");
2044 		return;
2045 	}
2046 	s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
2047 	s.db_mem = region_get_mem(nsd->db->region);
2048 	p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
2049 		nsd->child_count);
2050 	if(!p) return;
2051 	for(i=0; i<nsd->child_count; i++) {
2052 		if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!=
2053 			sizeof(stc_type))
2054 			return;
2055 	}
2056 }
2057 #endif /* BIND8_STATS */
2058 
2059 /*
2060  * Reload the database, stop the parent, re-fork the children and
2061  * continue as server_main.
2062  */
2063 static void
2064 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
2065 	int cmdsocket)
2066 {
2067 	pid_t mypid;
2068 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2069 	int ret;
2070 	udb_ptr last_task;
2071 	struct sigaction old_sigchld, ign_sigchld;
2072 	/* ignore SIGCHLD from the previous server_main that used this pid */
2073 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
2074 	ign_sigchld.sa_handler = SIG_IGN;
2075 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
2076 
2077 	/* see what tasks we got from xfrd */
2078 	task_remap(nsd->task[nsd->mytask]);
2079 	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
2080 	udb_compact_inhibited(nsd->db->udb, 1);
2081 	reload_process_tasks(nsd, &last_task, cmdsocket);
2082 	udb_compact_inhibited(nsd->db->udb, 0);
2083 	udb_compact(nsd->db->udb);
2084 
2085 #ifndef NDEBUG
2086 	if(nsd_debug_level >= 1)
2087 		region_log_stats(nsd->db->region);
2088 #endif /* NDEBUG */
2089 	/* sync to disk (if needed) */
2090 	udb_base_sync(nsd->db->udb, 0);
2091 
2092 	initialize_dname_compression_tables(nsd);
2093 
2094 #ifdef BIND8_STATS
2095 	/* Restart dumping stats if required.  */
2096 	time(&nsd->st.boot);
2097 	set_bind8_alarm(nsd);
2098 #endif
2099 #ifdef USE_ZONE_STATS
2100 	server_zonestat_realloc(nsd); /* realloc for new children */
2101 	server_zonestat_switch(nsd);
2102 #endif
2103 
2104 	/* listen for the signals of failed children again */
2105 	sigaction(SIGCHLD, &old_sigchld, NULL);
2106 	/* Start new child processes */
2107 	if (server_start_children(nsd, server_region, netio, &nsd->
2108 		xfrd_listener->fd) != 0) {
2109 		send_children_quit(nsd);
2110 		exit(1);
2111 	}
2112 
2113 	/* if the parent has quit, we must quit too, poll the fd for cmds */
2114 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2115 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2116 		if(cmd == NSD_QUIT) {
2117 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2118 			send_children_quit(nsd);
2119 			exit(0);
2120 		}
2121 	}
2122 
2123 	/* Send quit command to parent: blocking, wait for receipt. */
2124 	do {
2125 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
2126 		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
2127 		{
2128 			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
2129 				strerror(errno));
2130 		}
2131 		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
2132 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
2133 		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
2134 			RELOAD_SYNC_TIMEOUT);
2135 		if(ret == -2) {
2136 			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
2137 		}
2138 	} while (ret == -2);
2139 	if(ret == -1) {
2140 		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
2141 			strerror(errno));
2142 	}
2143 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
2144 	if(cmd == NSD_QUIT) {
2145 		/* small race condition possible here, parent got quit cmd. */
2146 		send_children_quit(nsd);
2147 		exit(1);
2148 	}
2149 	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
2150 #ifdef BIND8_STATS
2151 	reload_do_stats(cmdsocket, nsd, &last_task);
2152 #endif
2153 	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
2154 	task_process_sync(nsd->task[nsd->mytask]);
2155 #ifdef USE_ZONE_STATS
2156 	server_zonestat_realloc(nsd); /* realloc for next children */
2157 #endif
2158 
2159 	/* send soainfo to the xfrd process, signal it that reload is done,
2160 	 * it picks up the taskudb */
2161 	cmd = NSD_RELOAD_DONE;
2162 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
2163 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
2164 			strerror(errno));
2165 	}
2166 	mypid = getpid();
2167 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2168 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2169 			strerror(errno));
2170 	}
2171 
2172 	/* try to reopen file */
2173 	if (nsd->file_rotation_ok)
2174 		log_reopen(nsd->log_filename, 1);
2175 	/* exit reload, continue as new server_main */
2176 }
2177 
2178 /*
2179  * Get the mode depending on the signal hints that have been received.
2180  * Multiple signal hints can be received and will be handled in turn.
2181  */
2182 static sig_atomic_t
2183 server_signal_mode(struct nsd *nsd)
2184 {
2185 	if(nsd->signal_hint_quit) {
2186 		nsd->signal_hint_quit = 0;
2187 		return NSD_QUIT;
2188 	}
2189 	else if(nsd->signal_hint_shutdown) {
2190 		nsd->signal_hint_shutdown = 0;
2191 		return NSD_SHUTDOWN;
2192 	}
2193 	else if(nsd->signal_hint_child) {
2194 		nsd->signal_hint_child = 0;
2195 		return NSD_REAP_CHILDREN;
2196 	}
2197 	else if(nsd->signal_hint_reload) {
2198 		nsd->signal_hint_reload = 0;
2199 		return NSD_RELOAD;
2200 	}
2201 	else if(nsd->signal_hint_reload_hup) {
2202 		nsd->signal_hint_reload_hup = 0;
2203 		return NSD_RELOAD_REQ;
2204 	}
2205 	else if(nsd->signal_hint_stats) {
2206 		nsd->signal_hint_stats = 0;
2207 #ifdef BIND8_STATS
2208 		set_bind8_alarm(nsd);
2209 #endif
2210 		return NSD_STATS;
2211 	}
2212 	else if(nsd->signal_hint_statsusr) {
2213 		nsd->signal_hint_statsusr = 0;
2214 		return NSD_STATS;
2215 	}
2216 	return NSD_RUN;
2217 }
2218 
2219 /*
2220  * The main server simply waits for signals and child processes to
2221  * terminate.  Child processes are restarted as necessary.
2222  */
2223 void
2224 server_main(struct nsd *nsd)
2225 {
2226 	region_type *server_region = region_create(xalloc, free);
2227 	netio_type *netio = netio_create(server_region);
2228 	netio_handler_type reload_listener;
2229 	int reload_sockets[2] = {-1, -1};
2230 	struct timespec timeout_spec;
2231 	int status;
2232 	pid_t child_pid;
2233 	pid_t reload_pid = -1;
2234 	sig_atomic_t mode;
2235 
2236 	/* Ensure we are the main process */
2237 	assert(nsd->server_kind == NSD_SERVER_MAIN);
2238 
2239 	/* Add listener for the XFRD process */
2240 	netio_add_handler(netio, nsd->xfrd_listener);
2241 
2242 	/* Start the child processes that handle incoming queries */
2243 	if (server_start_children(nsd, server_region, netio,
2244 		&nsd->xfrd_listener->fd) != 0) {
2245 		send_children_quit(nsd);
2246 		exit(1);
2247 	}
2248 	reload_listener.fd = -1;
2249 
2250 	/* This_child MUST be 0, because this is the parent process */
2251 	assert(nsd->this_child == 0);
2252 
2253 	/* Run the server until we get a shutdown signal */
2254 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
2255 		/* Did we receive a signal that changes our mode? */
2256 		if(mode == NSD_RUN) {
2257 			nsd->mode = mode = server_signal_mode(nsd);
2258 		}
2259 
2260 		switch (mode) {
2261 		case NSD_RUN:
2262 			/* see if any child processes terminated */
2263 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
2264 				int is_child = delete_child_pid(nsd, child_pid);
2265 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
2266 					if(nsd->children[is_child].child_fd == -1)
2267 						nsd->children[is_child].has_exited = 1;
2268 					parent_check_all_children_exited(nsd);
2269 				} else if(is_child != -1) {
2270 					log_msg(LOG_WARNING,
2271 					       "server %d died unexpectedly with status %d, restarting",
2272 					       (int) child_pid, status);
2273 					restart_child_servers(nsd, server_region, netio,
2274 						&nsd->xfrd_listener->fd);
2275 				} else if (child_pid == reload_pid) {
2276 					sig_atomic_t cmd = NSD_RELOAD_DONE;
2277 					pid_t mypid;
2278 					log_msg(LOG_WARNING,
2279 					       "Reload process %d failed with status %d, continuing with old database",
2280 					       (int) child_pid, status);
2281 					reload_pid = -1;
2282 					if(reload_listener.fd != -1) close(reload_listener.fd);
2283 					reload_listener.fd = -1;
2284 					reload_listener.event_types = NETIO_EVENT_NONE;
2285 					task_process_sync(nsd->task[nsd->mytask]);
2286 					/* inform xfrd reload attempt ended */
2287 					if(!write_socket(nsd->xfrd_listener->fd,
2288 						&cmd, sizeof(cmd))) {
2289 						log_msg(LOG_ERR, "problems "
2290 						  "sending SOAEND to xfrd: %s",
2291 						  strerror(errno));
2292 					}
2293 					mypid = getpid();
2294 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2295 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2296 							strerror(errno));
2297 					}
2298 				} else if(status != 0) {
2299 					/* check the status, because we also
2300 					 * reap the old server_main here
2301 					 * (reload is its process parent)
2302 					 * and older server processes that
2303 					 * exit after a reload */
2304 					log_msg(LOG_WARNING,
2305 					       "process %d terminated with status %d",
2306 					       (int) child_pid, status);
2307 				}
2308 			}
2309 			if (child_pid == -1) {
2310 				if (errno == EINTR) {
2311 					continue;
2312 				}
2313 				if (errno != ECHILD)
2314 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
2315 			}
2316 			if (nsd->mode != NSD_RUN)
2317 				break;
2318 
2319 			/* timeout to collect processes, in case no SIGCHLD happens */
2320 			timeout_spec.tv_sec = 60;
2321 			timeout_spec.tv_nsec = 0;
2322 
2323 			/* listen on ports, timeout for collecting terminated children */
2324 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
2325 				if (errno != EINTR) {
2326 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
2327 				}
2328 			}
2329 			if(nsd->restart_children) {
2330 				restart_child_servers(nsd, server_region, netio,
2331 					&nsd->xfrd_listener->fd);
2332 				nsd->restart_children = 0;
2333 			}
2334 			if(nsd->reload_failed) {
2335 				sig_atomic_t cmd = NSD_RELOAD_DONE;
2336 				pid_t mypid;
2337 				nsd->reload_failed = 0;
2338 				log_msg(LOG_WARNING,
2339 				       "Reload process %d failed, continuing with old database",
2340 				       (int) reload_pid);
2341 				reload_pid = -1;
2342 				if(reload_listener.fd != -1) close(reload_listener.fd);
2343 				reload_listener.fd = -1;
2344 				reload_listener.event_types = NETIO_EVENT_NONE;
2345 				task_process_sync(nsd->task[nsd->mytask]);
2346 				/* inform xfrd reload attempt ended */
2347 				if(!write_socket(nsd->xfrd_listener->fd,
2348 					&cmd, sizeof(cmd))) {
2349 					log_msg(LOG_ERR, "problems "
2350 					  "sending SOAEND to xfrd: %s",
2351 					  strerror(errno));
2352 				}
2353 				mypid = getpid();
2354 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2355 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2356 						strerror(errno));
2357 				}
2358 			}
2359 
2360 			break;
2361 		case NSD_RELOAD_REQ: {
2362 			sig_atomic_t cmd = NSD_RELOAD_REQ;
2363 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
2364 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
2365 				"main: ipc send reload_req to xfrd"));
2366 			if(!write_socket(nsd->xfrd_listener->fd,
2367 				&cmd, sizeof(cmd))) {
2368 				log_msg(LOG_ERR, "server_main: could not send "
2369 				"reload_req to xfrd: %s", strerror(errno));
2370 			}
2371 			nsd->mode = NSD_RUN;
2372 			} break;
2373 		case NSD_RELOAD:
2374 			/* Continue to run nsd after reload */
2375 			nsd->mode = NSD_RUN;
2376 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
2377 			if (reload_pid != -1) {
2378 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
2379 				       (int) reload_pid);
2380 				break;
2381 			}
2382 
2383 			/* switch mytask to keep track of who owns the taskudb */
2384 			nsd->mytask = 1 - nsd->mytask;
2385 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
2386 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
2387 				reload_pid = -1;
2388 				break;
2389 			}
2390 
2391 			/* Do actual reload */
2392 			reload_pid = fork();
2393 			switch (reload_pid) {
2394 			case -1:
2395 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
2396 				break;
2397 			default:
2398 				/* PARENT */
2399 				close(reload_sockets[0]);
2400 				server_reload(nsd, server_region, netio,
2401 					reload_sockets[1]);
2402 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
2403 				close(reload_sockets[1]);
2404 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
2405 				/* drop stale xfrd ipc data */
2406 				((struct ipc_handler_conn_data*)nsd->
2407 					xfrd_listener->user_data)
2408 					->conn->is_reading = 0;
2409 				reload_pid = -1;
2410 				reload_listener.fd = -1;
2411 				reload_listener.event_types = NETIO_EVENT_NONE;
2412 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
2413 				break;
2414 			case 0:
2415 				/* CHILD */
2416 				/* server_main keeps running until NSD_QUIT_SYNC
2417 				 * is received from reload. */
2418 				close(reload_sockets[1]);
2419 				reload_listener.fd = reload_sockets[0];
2420 				reload_listener.timeout = NULL;
2421 				reload_listener.user_data = nsd;
2422 				reload_listener.event_types = NETIO_EVENT_READ;
2423 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
2424 				netio_add_handler(netio, &reload_listener);
2425 				reload_pid = getppid();
2426 				break;
2427 			}
2428 			break;
2429 		case NSD_QUIT_SYNC:
2430 			/* synchronisation of xfrd, parent and reload */
2431 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
2432 				sig_atomic_t cmd = NSD_RELOAD;
2433 				/* stop xfrd ipc writes in progress */
2434 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
2435 					"main: ipc send indication reload"));
2436 				if(!write_socket(nsd->xfrd_listener->fd,
2437 					&cmd, sizeof(cmd))) {
2438 					log_msg(LOG_ERR, "server_main: could not send reload "
2439 					"indication to xfrd: %s", strerror(errno));
2440 				}
2441 				/* wait for ACK from xfrd */
2442 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
2443 				nsd->quit_sync_done = 1;
2444 			}
2445 			nsd->mode = NSD_RUN;
2446 			break;
2447 		case NSD_QUIT:
2448 			/* silent shutdown during reload */
2449 			if(reload_listener.fd != -1) {
2450 				/* acknowledge the quit, to sync reload that we will really quit now */
2451 				sig_atomic_t cmd = NSD_RELOAD;
2452 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
2453 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2454 					log_msg(LOG_ERR, "server_main: "
2455 						"could not ack quit: %s", strerror(errno));
2456 				}
2457 #ifdef BIND8_STATS
2458 				parent_send_stats(nsd, reload_listener.fd);
2459 #endif /* BIND8_STATS */
2460 				close(reload_listener.fd);
2461 			}
2462 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
2463 			/* only quit children after xfrd has acked */
2464 			send_children_quit(nsd);
2465 
2466 #ifdef MEMCLEAN /* OS collects memory pages */
2467 			region_destroy(server_region);
2468 #endif
2469 			server_shutdown(nsd);
2470 
2471 			/* NOTREACHED */
2472 			break;
2473 		case NSD_SHUTDOWN:
2474 			break;
2475 		case NSD_REAP_CHILDREN:
2476 			/* continue; wait for child in run loop */
2477 			nsd->mode = NSD_RUN;
2478 			break;
2479 		case NSD_STATS:
2480 #ifdef BIND8_STATS
2481 			set_children_stats(nsd);
2482 #endif
2483 			nsd->mode = NSD_RUN;
2484 			break;
2485 		default:
2486 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
2487 			nsd->mode = NSD_RUN;
2488 			break;
2489 		}
2490 	}
2491 	log_msg(LOG_WARNING, "signal received, shutting down...");
2492 
2493 	/* close opened ports to avoid race with restart of nsd */
2494 	server_close_all_sockets(nsd->udp, nsd->ifs);
2495 	server_close_all_sockets(nsd->tcp, nsd->ifs);
2496 #ifdef HAVE_SSL
2497 	daemon_remote_close(nsd->rc);
2498 #endif
2499 	send_children_quit_and_wait(nsd);
2500 
2501 	/* Unlink it if possible... */
2502 	unlinkpid(nsd->pidfile);
2503 	unlink(nsd->task[0]->fname);
2504 	unlink(nsd->task[1]->fname);
2505 #ifdef USE_ZONE_STATS
2506 	unlink(nsd->zonestatfname[0]);
2507 	unlink(nsd->zonestatfname[1]);
2508 #endif
2509 #ifdef USE_DNSTAP
2510 	dt_collector_close(nsd->dt_collector, nsd);
2511 #endif
2512 
2513 	if(reload_listener.fd != -1) {
2514 		sig_atomic_t cmd = NSD_QUIT;
2515 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2516 			"main: ipc send quit to reload-process"));
2517 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2518 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
2519 				strerror(errno));
2520 		}
2521 		fsync(reload_listener.fd);
2522 		close(reload_listener.fd);
2523 		/* wait for reload to finish processing */
2524 		while(1) {
2525 			if(waitpid(reload_pid, NULL, 0) == -1) {
2526 				if(errno == EINTR) continue;
2527 				if(errno == ECHILD) break;
2528 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
2529 					(int)reload_pid, strerror(errno));
2530 			}
2531 			break;
2532 		}
2533 	}
2534 	if(nsd->xfrd_listener->fd != -1) {
2535 		/* complete quit, stop xfrd */
2536 		sig_atomic_t cmd = NSD_QUIT;
2537 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2538 			"main: ipc send quit to xfrd"));
2539 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
2540 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
2541 				strerror(errno));
2542 		}
2543 		fsync(nsd->xfrd_listener->fd);
2544 		close(nsd->xfrd_listener->fd);
2545 		(void)kill(nsd->pid, SIGTERM);
2546 	}
2547 
2548 #ifdef MEMCLEAN /* OS collects memory pages */
2549 	region_destroy(server_region);
2550 #endif
2551 	/* write the nsd.db to disk, wait for it to complete */
2552 	udb_base_sync(nsd->db->udb, 1);
2553 	udb_base_close(nsd->db->udb);
2554 	server_shutdown(nsd);
2555 }
2556 
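/* thin wrapper around query_process, used for TCP queries */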
2557 static query_state_type
2558 server_process_query(struct nsd *nsd, struct query *query)
2559 {
2560 	return query_process(query, nsd);
2561 }
2562 
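/* process a UDP query; when compiled with RATELIMIT, apply response
 * rate limiting, which may drop or truncate (slip) the answer */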
2563 static query_state_type
2564 server_process_query_udp(struct nsd *nsd, struct query *query)
2565 {
2566 #ifdef RATELIMIT
2567 	if(query_process(query, nsd) != QUERY_DISCARDED) {
2568 		if(rrl_process_query(query))
2569 			return rrl_slip(query);
2570 		else	return QUERY_PROCESSED;
2571 	}
2572 	return QUERY_DISCARDED;
2573 #else
2574 	return query_process(query, nsd);
2575 #endif
2576 }
2577 
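/* create an event base for a child server, hiding the differences
 * between mini_event, libev and the libevent versions */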
2578 struct event_base*
2579 nsd_child_event_base(void)
2580 {
2581 	struct event_base* base;
2582 #ifdef USE_MINI_EVENT
2583 	static time_t secs;
2584 	static struct timeval now;
2585 	base = event_init(&secs, &now);
2586 #else
2587 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
2588 	/* libev */
2589 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
2590 #  else
2591 	/* libevent */
2592 #    ifdef HAVE_EVENT_BASE_NEW
2593 	base = event_base_new();
2594 #    else
2595 	base = event_init();
2596 #    endif
2597 #  endif
2598 #endif
2599 	return base;
2600 }
2601 
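/* register a persistent read event for a UDP socket on the child's
 * event base */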
2602 static void
2603 add_udp_handler(
2604 	struct nsd *nsd,
2605 	struct nsd_socket *sock,
2606 	struct udp_handler_data *data)
2607 {
2608 	struct event *handler = &data->event;
2609 
2610 	data->nsd = nsd;
2611 	data->socket = sock;
2612 
2613 	memset(handler, 0, sizeof(*handler));
2614 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data);
2615 	if(event_base_set(nsd->event_base, handler) != 0)
2616 		log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2617 	if(event_add(handler, NULL) != 0)
2618 		log_msg(LOG_ERR, "nsd udp: event_add failed");
2619 }
2620 
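/*
 * Register a persistent accept event for a TCP socket.  If the socket
 * is bound to the configured tls-port, the handler is marked so that
 * accepted connections perform a TLS handshake.
 */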
2621 void
2622 add_tcp_handler(
2623 	struct nsd *nsd,
2624 	struct nsd_socket *sock,
2625 	struct tcp_accept_handler_data *data)
2626 {
2627 	struct event *handler = &data->event;
2628 
2629 	data->nsd = nsd;
2630 	data->socket = sock;
2631 
2632 #ifdef HAVE_SSL
2633 	if (nsd->tls_ctx &&
2634 	    nsd->options->tls_port &&
2635 	    using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port))
2636 	{
2637 		data->tls_accept = 1;
2638 		if(verbosity >= 2) {
2639 			char buf[48];
2640 			addrport2str((struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
2641 			VERBOSITY(2, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf));
2642 		}
2643 	} else {
2644 		data->tls_accept = 0;
2645 	}
2646 #endif
2647 
2648 	memset(handler, 0, sizeof(*handler));
2649 	event_set(handler, sock->s, EV_PERSIST|EV_READ,	handle_tcp_accept, data);
2650 	if(event_base_set(nsd->event_base, handler) != 0)
2651 		log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
2652 	if(event_add(handler, NULL) != 0)
2653 		log_msg(LOG_ERR, "nsd tcp: event_add failed");
2654 	data->event_added = 1;
2655 }
2656 
2657 /*
2658  * Serve DNS requests.
2659  */
2660 void
2661 server_child(struct nsd *nsd)
2662 {
2663 	size_t i, from, numifs;
2664 	region_type *server_region = region_create(xalloc, free);
2665 	struct event_base* event_base = nsd_child_event_base();
2666 	sig_atomic_t mode;
2667 
2668 	if(!event_base) {
2669 		log_msg(LOG_ERR, "nsd server could not create event base");
2670 		exit(1);
2671 	}
2672 	nsd->event_base = event_base;
2673 	nsd->server_region = server_region;
2674 
2675 #ifdef RATELIMIT
2676 	rrl_init(nsd->this_child->child_num);
2677 #endif
2678 
2679 	assert(nsd->server_kind != NSD_SERVER_MAIN);
2680 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
2681 
2682 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
2683 		server_close_all_sockets(nsd->tcp, nsd->ifs);
2684 	}
2685 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
2686 		server_close_all_sockets(nsd->udp, nsd->ifs);
2687 	}
2688 
2689 	if (nsd->this_child->parent_fd != -1) {
2690 		struct event *handler;
2691 		struct ipc_handler_conn_data* user_data =
2692 			(struct ipc_handler_conn_data*)region_alloc(
2693 			server_region, sizeof(struct ipc_handler_conn_data));
2694 		user_data->nsd = nsd;
2695 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
2696 
2697 		handler = (struct event*) region_alloc(
2698 			server_region, sizeof(*handler));
2699 		memset(handler, 0, sizeof(*handler));
2700 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
2701 			EV_READ, child_handle_parent_command, user_data);
2702 		if(event_base_set(event_base, handler) != 0)
2703 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
2704 		if(event_add(handler, NULL) != 0)
2705 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
2706 	}
2707 
2708 	if(nsd->reuseport) {
2709 		numifs = nsd->ifs / nsd->reuseport;
2710 		from = numifs * nsd->this_child->child_num;
2711 		if(from+numifs > nsd->ifs) { /* should not happen */
2712 			from = 0;
2713 			numifs = nsd->ifs;
2714 		}
2715 	} else {
2716 		from = 0;
2717 		numifs = nsd->ifs;
2718 	}
2719 
2720 	if (nsd->server_kind & NSD_SERVER_UDP) {
2721 		memset(msgs, 0, sizeof(msgs));
2722 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
2723 			queries[i] = query_create(server_region,
2724 				compressed_dname_offsets,
2725 				compression_table_size, compressed_dnames);
2726 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2727 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
2728 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
2729 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
2730 			msgs[i].msg_hdr.msg_iovlen  = 1;
2731 			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
2732 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2733 		}
2734 
2735 		for (i = from; i < from+numifs; ++i) {
2736 			struct udp_handler_data *data =	region_alloc_zero(
2737 				nsd->server_region, sizeof(*data));
2738 			add_udp_handler(nsd, &nsd->udp[i], data);
2739 		}
2740 	}
2741 
2742 	/*
2743 	 * Keep track of all the TCP accept handlers so we can enable
2744 	 * and disable them based on the current number of active TCP
2745 	 * connections.
2746 	 */
2747 	if (nsd->server_kind & NSD_SERVER_TCP) {
2748 		tcp_accept_handler_count = numifs;
2749 		tcp_accept_handlers = region_alloc_array(server_region,
2750 			numifs, sizeof(*tcp_accept_handlers));
2751 
2752 		for (i = from; i < from+numifs; i++) {
2753 			struct tcp_accept_handler_data *data =
2754 				&tcp_accept_handlers[i-from];
2755 			memset(data, 0, sizeof(*data));
2756 			add_tcp_handler(nsd, &nsd->tcp[i], data);
2757 		}
2758 	} else {
2759 		tcp_accept_handler_count = 0;
2760 	}
2761 
2762 	/* The main loop... */
2763 	while ((mode = nsd->mode) != NSD_QUIT) {
2764 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
2765 
2766 		/* Do we need to do the statistics... */
2767 		if (mode == NSD_STATS) {
2768 #ifdef BIND8_STATS
2769 			int p = nsd->st.period;
2770 			nsd->st.period = 1; /* force stats printout */
2771 			/* Dump the statistics */
2772 			bind8_stats(nsd);
2773 			nsd->st.period = p;
2774 #else /* !BIND8_STATS */
2775 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
2776 #endif /* BIND8_STATS */
2777 
2778 			nsd->mode = NSD_RUN;
2779 		}
2780 		else if (mode == NSD_REAP_CHILDREN) {
2781 			/* got signal, notify parent. parent reaps terminated children. */
2782 			if (nsd->this_child->parent_fd != -1) {
2783 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
2784 				if (write(nsd->this_child->parent_fd,
2785 				    &parent_notify,
2786 				    sizeof(parent_notify)) == -1)
2787 				{
2788 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
2789 						(int) nsd->this_child->pid, strerror(errno));
2790 				}
2791 			} else /* no parent, so reap 'em */
2792 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
2793 			nsd->mode = NSD_RUN;
2794 		}
2795 		else if(mode == NSD_RUN) {
2796 			/* Wait for a query... */
2797 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
2798 				if (errno != EINTR) {
2799 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
2800 					break;
2801 				}
2802 			}
2803 		} else if(mode == NSD_QUIT) {
2804 			/* ignore here, quit */
2805 		} else {
2806 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
2807 				(int)mode);
2808 			nsd->mode = NSD_RUN;
2809 		}
2810 	}
2811 
2812 	service_remaining_tcp(nsd);
2813 #ifdef	BIND8_STATS
2814 	bind8_stats(nsd);
2815 #endif /* BIND8_STATS */
2816 
2817 #ifdef MEMCLEAN /* OS collects memory pages */
2818 #ifdef RATELIMIT
2819 	rrl_deinit(nsd->this_child->child_num);
2820 #endif
2821 	event_base_free(event_base);
2822 	region_destroy(server_region);
2823 #endif
2824 	server_shutdown(nsd);
2825 }
2826 
2827 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
2828 {
2829 	int* timed_out = (int*)arg;
2830 	assert(event & EV_TIMEOUT);
2831 	/* wake up the service tcp thread, note event is no longer
2832 	 * registered */
2833 	*timed_out = 1;
2834 }
2835 
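/*
 * After the child is told to quit, keep servicing the TCP connections
 * that are still open, with timeouts clamped to 1/10 second, and give
 * up when the one second timer expires or a quit signal arrives.
 */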
2836 void
2837 service_remaining_tcp(struct nsd* nsd)
2838 {
2839 	struct tcp_handler_data* p;
2840 	struct event_base* event_base;
2841 	/* check if it is needed */
2842 	if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
2843 		return;
2844 	VERBOSITY(4, (LOG_INFO, "service remaining TCP connections"));
2845 
2846 	/* setup event base */
2847 	event_base = nsd_child_event_base();
2848 	if(!event_base) {
2849 		log_msg(LOG_ERR, "nsd remain tcp could not create event base");
2850 		return;
2851 	}
2852 	/* register tcp connections */
2853 	for(p = tcp_active_list; p != NULL; p = p->next) {
2854 		struct timeval timeout;
2855 		int fd = p->event.ev_fd;
2856 #ifdef USE_MINI_EVENT
2857 		short event = p->event.ev_flags & (EV_READ|EV_WRITE);
2858 #else
2859 		short event = p->event.ev_events & (EV_READ|EV_WRITE);
2860 #endif
2861 		void (*fn)(int, short, void*);
2862 #ifdef HAVE_SSL
2863 		if(p->tls) {
2864 			if((event&EV_READ))
2865 				fn = handle_tls_reading;
2866 			else	fn = handle_tls_writing;
2867 		} else {
2868 #endif
2869 			if((event&EV_READ))
2870 				fn = handle_tcp_reading;
2871 			else	fn = handle_tcp_writing;
2872 #ifdef HAVE_SSL
2873 		}
2874 #endif
2875 
2876 		/* set timeout to 1/10 second */
2877 		if(p->tcp_timeout > 100)
2878 			p->tcp_timeout = 100;
2879 		timeout.tv_sec = p->tcp_timeout / 1000;
2880 		timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
2881 		event_del(&p->event);
2882 		memset(&p->event, 0, sizeof(p->event));
2883 		event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
2884 			fn, p);
2885 		if(event_base_set(event_base, &p->event) != 0)
2886 			log_msg(LOG_ERR, "event base set failed");
2887 		if(event_add(&p->event, &timeout) != 0)
2888 			log_msg(LOG_ERR, "event add failed");
2889 	}
2890 
2891 	/* handle it */
2892 		sig_atomic_t m = server_signal_mode(nsd);
2893 		mode_t m = server_signal_mode(nsd);
2894 		struct event timeout;
2895 		struct timeval tv;
2896 		int timed_out = 0;
2897 		if(m == NSD_QUIT || m == NSD_SHUTDOWN ||
2898 			m == NSD_REAP_CHILDREN) {
2899 			/* quit */
2900 			break;
2901 		}
2902 		/* timer */
2903 		/* have to do something every second */
2904 		tv.tv_sec = 1;
2905 		tv.tv_usec = 0;
2906 		memset(&timeout, 0, sizeof(timeout));
2907 		event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout,
2908 			&timed_out);
2909 		if(event_base_set(event_base, &timeout) != 0)
2910 			log_msg(LOG_ERR, "remaintcp timer: event_base_set failed");
2911 		if(event_add(&timeout, &tv) != 0)
2912 			log_msg(LOG_ERR, "remaintcp timer: event_add failed");
2913 
2914 		/* service loop */
2915 		if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
2916 			if (errno != EINTR) {
2917 				log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
2918 				break;
2919 			}
2920 		}
2921 		if(!timed_out) {
2922 			event_del(&timeout);
2923 		} else {
2924 			/* timed out, quit */
2925 			VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit"));
2926 			break;
2927 		}
2928 	}
2929 #ifdef MEMCLEAN
2930 	event_base_free(event_base);
2931 #endif
2932 	/* continue to quit after return */
2933 }
2934 
2935 /* Implement recvmmsg and sendmmsg if the platform does not provide them.
2936  * These functions are always used, even if nonblocking operations are
2937  * broken, in which case NUM_RECV_PER_SELECT is defined to 1 (one).
2938  */
2939 #if defined(HAVE_RECVMMSG)
2940 #define nsd_recvmmsg recvmmsg
2941 #else /* !HAVE_RECVMMSG */
2942 
2943 static int
2944 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
2945              int flags, struct timespec *timeout)
2946 {
2947 	int orig_errno;
2948 	unsigned int vpos = 0;
2949 	ssize_t rcvd;
2950 
2951 	/* timeout is ignored, ensure caller does not expect it to work */
2952 	assert(timeout == NULL);
2953 
2954 	orig_errno = errno;
2955 	errno = 0;
2956 	while(vpos < vlen) {
2957 		rcvd = recvfrom(sockfd,
2958 		                msgvec[vpos].msg_hdr.msg_iov->iov_base,
2959 		                msgvec[vpos].msg_hdr.msg_iov->iov_len,
2960 		                flags,
2961 		                msgvec[vpos].msg_hdr.msg_name,
2962 		               &msgvec[vpos].msg_hdr.msg_namelen);
2963 		if(rcvd < 0) {
2964 			break;
2965 		} else {
2966 			assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
2967 			msgvec[vpos].msg_len = (unsigned int)rcvd;
2968 			vpos++;
2969 		}
2970 	}
2971 
2972 	if(vpos) {
2973 		/* error will be picked up next time */
2974 		return (int)vpos;
2975 	} else if(errno == 0) {
2976 		errno = orig_errno;
2977 		return 0;
2978 	} else if(errno == EAGAIN) {
2979 		return 0;
2980 	}
2981 
2982 	return -1;
2983 }
2984 #endif /* HAVE_RECVMMSG */
2985 
2986 #ifdef HAVE_SENDMMSG
2987 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
2988 #else /* !HAVE_SENDMMSG */
2989 
2990 static int
2991 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
2992 {
2993 	int orig_errno;
2994 	unsigned int vpos = 0;
2995 	ssize_t snd;
2996 
2997 	orig_errno = errno;
2998 	errno = 0;
2999 	while(vpos < vlen) {
3000 		assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
3001 		snd = sendto(sockfd,
3002 		             msgvec[vpos].msg_hdr.msg_iov->iov_base,
3003 		             msgvec[vpos].msg_hdr.msg_iov->iov_len,
3004 		             flags,
3005 		             msgvec[vpos].msg_hdr.msg_name,
3006 		             msgvec[vpos].msg_hdr.msg_namelen);
3007 		if(snd < 0) {
3008 			break;
3009 		} else {
3010 			msgvec[vpos].msg_len = (unsigned int)snd;
3011 			vpos++;
3012 		}
3013 	}
3014 
3015 	if(vpos) {
3016 		return (int)vpos;
3017 	} else if(errno == 0) {
3018 		errno = orig_errno;
3019 		return 0;
3020 	}
3021 
3022 	return -1;
3023 }
3024 #endif /* HAVE_SENDMMSG */
3025 
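/*
 * Handle a readable UDP socket: receive a batch of packets with
 * (nsd_)recvmmsg, process each query, compact dropped queries to the
 * tail of the arrays by swapping, then send the answers in batches
 * with (nsd_)sendmmsg.
 */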
3026 static void
3027 handle_udp(int fd, short event, void* arg)
3028 {
3029 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
3030 	int received, sent, recvcount, i;
3031 	struct query *q;
3032 
3033 	if (!(event & EV_READ)) {
3034 		return;
3035 	}
3036 	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
3037 	/* this printf strangely gave a performance increase on Linux */
3038 	/* printf("recvcount %d \n", recvcount); */
3039 	if (recvcount == -1) {
3040 		if (errno != EAGAIN && errno != EINTR) {
3041 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
3042 			STATUP(data->nsd, rxerr);
3043 			/* No zone statup */
3044 		}
3045 		/* Simply no data available */
3046 		return;
3047 	}
3048 	for (i = 0; i < recvcount; i++) {
3049 	loopstart:
3050 		received = msgs[i].msg_len;
3051 		queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen;
3052 		q = queries[i];
3053 		if (received == -1) {
3054 			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
3055 #if defined(HAVE_RECVMMSG)
3056 				msgs[i].msg_hdr.msg_flags
3057 #else
3058 				errno
3059 #endif
3060 				));
3061 			STATUP(data->nsd, rxerr);
3062 			/* No zone statup */
3063 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3064 			iovecs[i].iov_len = buffer_remaining(q->packet);
3065 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3066 			goto swap_drop;
3067 		}
3068 
3069 		/* Account... */
3070 #ifdef BIND8_STATS
3071 		if (data->socket->addr.ai_family == AF_INET) {
3072 			STATUP(data->nsd, qudp);
3073 		} else if (data->socket->addr.ai_family == AF_INET6) {
3074 			STATUP(data->nsd, qudp6);
3075 		}
3076 #endif
3077 
3078 		buffer_skip(q->packet, received);
3079 		buffer_flip(q->packet);
3080 #ifdef USE_DNSTAP
3081 		dt_collector_submit_auth_query(data->nsd, &q->addr, q->addrlen,
3082 			q->tcp, q->packet);
3083 #endif /* USE_DNSTAP */
3084 
3085 		/* Process and answer the query... */
3086 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
3087 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
3088 				STATUP(data->nsd, nona);
3089 				ZTATUP(data->nsd, q->zone, nona);
3090 			}
3091 
3092 #ifdef USE_ZONE_STATS
3093 			if (data->socket->addr.ai_family == AF_INET) {
3094 				ZTATUP(data->nsd, q->zone, qudp);
3095 			} else if (data->socket->addr.ai_family == AF_INET6) {
3096 				ZTATUP(data->nsd, q->zone, qudp6);
3097 			}
3098 #endif
3099 
3100 			/* Add EDNS0 and TSIG info if necessary.  */
3101 			query_add_optional(q, data->nsd);
3102 
3103 			buffer_flip(q->packet);
3104 			iovecs[i].iov_len = buffer_remaining(q->packet);
3105 #ifdef BIND8_STATS
3106 			/* Account the rcode & TC... */
3107 			STATUP2(data->nsd, rcode, RCODE(q->packet));
3108 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
3109 			if (TC(q->packet)) {
3110 				STATUP(data->nsd, truncated);
3111 				ZTATUP(data->nsd, q->zone, truncated);
3112 			}
3113 #endif /* BIND8_STATS */
3114 #ifdef USE_DNSTAP
3115 			dt_collector_submit_auth_response(data->nsd,
3116 				&q->addr, q->addrlen, q->tcp, q->packet,
3117 				q->zone);
3118 #endif /* USE_DNSTAP */
3119 		} else {
3120 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3121 			iovecs[i].iov_len = buffer_remaining(q->packet);
3122 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3123 		swap_drop:
3124 			STATUP(data->nsd, dropped);
3125 			ZTATUP(data->nsd, q->zone, dropped);
3126 			if(i != recvcount-1) {
3127 				/* swap with last and decrease recvcount */
3128 				struct mmsghdr mtmp = msgs[i];
3129 				struct iovec iotmp = iovecs[i];
3130 				recvcount--;
3131 				msgs[i] = msgs[recvcount];
3132 				iovecs[i] = iovecs[recvcount];
3133 				queries[i] = queries[recvcount];
3134 				msgs[recvcount] = mtmp;
3135 				iovecs[recvcount] = iotmp;
3136 				queries[recvcount] = q;
3137 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
3138 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
3139 				goto loopstart;
3140 			} else { recvcount--; }
3141 		}
3142 	}
3143 
3144 	/* send until all are sent */
3145 	i = 0;
3146 	while(i<recvcount) {
3147 		sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3148 		if(sent == -1) {
3149 			/* don't log transient network full errors, unless
3150 			 * on higher verbosity */
3151 			if(!(errno == ENOBUFS && verbosity < 1) &&
3152 #ifdef EWOULDBLOCK
3153 			   !(errno == EWOULDBLOCK && verbosity < 1) &&
3154 #endif
3155 			   !(errno == EAGAIN && verbosity < 1)) {
3156 				const char* es = strerror(errno);
3157 				char a[48];
3158 				addr2str(&queries[i]->addr, a, sizeof(a));
3159 				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3160 			}
3161 #ifdef BIND8_STATS
3162 			data->nsd->st.txerr += recvcount-i;
3163 #endif /* BIND8_STATS */
3164 			break;
3165 		}
3166 		i += sent;
3167 	}
3168 	for(i=0; i<recvcount; i++) {
3169 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3170 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3171 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
3172 	}
3173 }
3174 
3175 #ifdef HAVE_SSL
3176 /*
3177  * Setup an event for the tcp handler.
3178  */
3179 static void
3180 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *),
3181        int fd, short event)
3182 {
3183 	struct timeval timeout;
3184 	struct event_base* ev_base;
3185 
3186 	timeout.tv_sec = data->nsd->tcp_timeout;
3187 	timeout.tv_usec = 0L;
3188 
3189 	ev_base = data->event.ev_base;
3190 	event_del(&data->event);
3191 	memset(&data->event, 0, sizeof(data->event));
3192 	event_set(&data->event, fd, event, fn, data);
3193 	if(event_base_set(ev_base, &data->event) != 0)
3194 		log_msg(LOG_ERR, "event base set failed");
3195 	if(event_add(&data->event, &timeout) != 0)
3196 		log_msg(LOG_ERR, "event add failed");
3197 }
3198 #endif /* HAVE_SSL */
3199 
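/*
 * Tear down a TCP connection: remove its event, shut down TLS if
 * active, close the socket, unlink the handler from the active list,
 * and re-enable the accept handlers if we were at the connection
 * limit.
 */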
3200 static void
3201 cleanup_tcp_handler(struct tcp_handler_data* data)
3202 {
3203 	event_del(&data->event);
3204 #ifdef HAVE_SSL
3205 	if(data->tls) {
3206 		SSL_shutdown(data->tls);
3207 		SSL_free(data->tls);
3208 		data->tls = NULL;
3209 	}
3210 #endif
3211 	close(data->event.ev_fd);
3212 	if(data->prev)
3213 		data->prev->next = data->next;
3214 	else	tcp_active_list = data->next;
3215 	if(data->next)
3216 		data->next->prev = data->prev;
3217 
3218 	/*
3219 	 * Enable the TCP accept handlers when the current number of
3220 	 * TCP connections is about to drop below the maximum number
3221 	 * of TCP connections.
3222 	 */
3223 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
3224 		configure_handler_event_types(EV_READ|EV_PERSIST);
3225 		if(slowaccept) {
3226 			event_del(&slowaccept_event);
3227 			slowaccept = 0;
3228 		}
3229 	}
3230 	--data->nsd->current_tcp_count;
3231 	assert(data->nsd->current_tcp_count >= 0);
3232 
3233 	region_destroy(data->region);
3234 }
3235 
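/*
 * Read a query from a TCP connection: first the two octet length
 * prefix, then the query itself.  Once the query is complete it is
 * processed and the handler switches to writing the response.
 */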
3236 static void
3237 handle_tcp_reading(int fd, short event, void* arg)
3238 {
3239 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3240 	ssize_t received;
3241 	struct event_base* ev_base;
3242 	struct timeval timeout;
3243 
3244 	if ((event & EV_TIMEOUT)) {
3245 		/* Connection timed out.  */
3246 		cleanup_tcp_handler(data);
3247 		return;
3248 	}
3249 
3250 	if (data->nsd->tcp_query_count > 0 &&
3251 		data->query_count >= data->nsd->tcp_query_count) {
3252 		/* No more queries allowed on this tcp connection. */
3253 		cleanup_tcp_handler(data);
3254 		return;
3255 	}
3256 
3257 	assert((event & EV_READ));
3258 
3259 	if (data->bytes_transmitted == 0) {
3260 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
3261 	}
3262 
3263 	/*
3264 	 * Check if we received the leading packet length bytes yet.
3265 	 */
3266 	if (data->bytes_transmitted < sizeof(uint16_t)) {
3267 		received = read(fd,
3268 				(char *) &data->query->tcplen
3269 				+ data->bytes_transmitted,
3270 				sizeof(uint16_t) - data->bytes_transmitted);
3271 		if (received == -1) {
3272 			if (errno == EAGAIN || errno == EINTR) {
3273 				/*
3274 				 * Read would block, wait until more
3275 				 * data is available.
3276 				 */
3277 				return;
3278 			} else {
3279 				char buf[48];
3280 				addr2str(&data->query->addr, buf, sizeof(buf));
3281 #ifdef ECONNRESET
3282 				if (verbosity >= 2 || errno != ECONNRESET)
3283 #endif /* ECONNRESET */
3284 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3285 				cleanup_tcp_handler(data);
3286 				return;
3287 			}
3288 		} else if (received == 0) {
3289 			/* EOF */
3290 			cleanup_tcp_handler(data);
3291 			return;
3292 		}
3293 
3294 		data->bytes_transmitted += received;
3295 		if (data->bytes_transmitted < sizeof(uint16_t)) {
3296 			/*
3297 			 * Not done with the tcplen yet, wait for more
3298 			 * data to become available.
3299 			 */
3300 			return;
3301 		}
3302 
3303 		assert(data->bytes_transmitted == sizeof(uint16_t));
3304 
3305 		data->query->tcplen = ntohs(data->query->tcplen);
3306 
3307 		/*
3308 		 * Minimum query size is:
3309 		 *
3310 		 *     Size of the header (12)
3311 		 *   + Root domain name   (1)
3312 		 *   + Query class        (2)
3313 		 *   + Query type         (2)
3314 		 */
3315 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
3316 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
3317 			cleanup_tcp_handler(data);
3318 			return;
3319 		}
3320 
3321 		if (data->query->tcplen > data->query->maxlen) {
3322 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
3323 			cleanup_tcp_handler(data);
3324 			return;
3325 		}
3326 
3327 		buffer_set_limit(data->query->packet, data->query->tcplen);
3328 	}
3329 
3330 	assert(buffer_remaining(data->query->packet) > 0);
3331 
3332 	/* Read the (remaining) query data.  */
3333 	received = read(fd,
3334 			buffer_current(data->query->packet),
3335 			buffer_remaining(data->query->packet));
3336 	if (received == -1) {
3337 		if (errno == EAGAIN || errno == EINTR) {
3338 			/*
3339 			 * Read would block, wait until more data is
3340 			 * available.
3341 			 */
3342 			return;
3343 		} else {
3344 			char buf[48];
3345 			addr2str(&data->query->addr, buf, sizeof(buf));
3346 #ifdef ECONNRESET
3347 			if (verbosity >= 2 || errno != ECONNRESET)
3348 #endif /* ECONNRESET */
3349 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
3350 			cleanup_tcp_handler(data);
3351 			return;
3352 		}
3353 	} else if (received == 0) {
3354 		/* EOF */
3355 		cleanup_tcp_handler(data);
3356 		return;
3357 	}
3358 
3359 	data->bytes_transmitted += received;
3360 	buffer_skip(data->query->packet, received);
3361 	if (buffer_remaining(data->query->packet) > 0) {
3362 		/*
3363 		 * Message not yet complete, wait for more data to
3364 		 * become available.
3365 		 */
3366 		return;
3367 	}
3368 
3369 	assert(buffer_position(data->query->packet) == data->query->tcplen);
3370 
3371 	/* Account... */
3372 #ifdef BIND8_STATS
3373 #ifndef INET6
3374 	STATUP(data->nsd, ctcp);
3375 #else
3376 	if (data->query->addr.ss_family == AF_INET) {
3377 		STATUP(data->nsd, ctcp);
3378 	} else if (data->query->addr.ss_family == AF_INET6) {
3379 		STATUP(data->nsd, ctcp6);
3380 	}
3381 #endif
3382 #endif /* BIND8_STATS */
3383 
3384 	/* We have a complete query, process it.  */
3385 
3386 	/* tcp-query-count: handle query counter ++ */
3387 	data->query_count++;
3388 
3389 	buffer_flip(data->query->packet);
3390 #ifdef USE_DNSTAP
3391 	dt_collector_submit_auth_query(data->nsd, &data->query->addr,
3392 		data->query->addrlen, data->query->tcp, data->query->packet);
3393 #endif /* USE_DNSTAP */
3394 	data->query_state = server_process_query(data->nsd, data->query);
3395 	if (data->query_state == QUERY_DISCARDED) {
3396 		/* Drop the packet and the entire connection... */
3397 		STATUP(data->nsd, dropped);
3398 		ZTATUP(data->nsd, data->query->zone, dropped);
3399 		cleanup_tcp_handler(data);
3400 		return;
3401 	}
3402 
3403 #ifdef BIND8_STATS
3404 	if (RCODE(data->query->packet) == RCODE_OK
3405 	    && !AA(data->query->packet))
3406 	{
3407 		STATUP(data->nsd, nona);
3408 		ZTATUP(data->nsd, data->query->zone, nona);
3409 	}
3410 #endif /* BIND8_STATS */
3411 
3412 #ifdef USE_ZONE_STATS
3413 #ifndef INET6
3414 	ZTATUP(data->nsd, data->query->zone, ctcp);
3415 #else
3416 	if (data->query->addr.ss_family == AF_INET) {
3417 		ZTATUP(data->nsd, data->query->zone, ctcp);
3418 	} else if (data->query->addr.ss_family == AF_INET6) {
3419 		ZTATUP(data->nsd, data->query->zone, ctcp6);
3420 	}
3421 #endif
3422 #endif /* USE_ZONE_STATS */
3423 
3424 	query_add_optional(data->query, data->nsd);
3425 
3426 	/* Switch to the tcp write handler.  */
3427 	buffer_flip(data->query->packet);
3428 	data->query->tcplen = buffer_remaining(data->query->packet);
3429 #ifdef BIND8_STATS
3430 	/* Account the rcode & TC... */
3431 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
3432 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
3433 	if (TC(data->query->packet)) {
3434 		STATUP(data->nsd, truncated);
3435 		ZTATUP(data->nsd, data->query->zone, truncated);
3436 	}
3437 #endif /* BIND8_STATS */
3438 #ifdef USE_DNSTAP
3439 	dt_collector_submit_auth_response(data->nsd, &data->query->addr,
3440 		data->query->addrlen, data->query->tcp, data->query->packet,
3441 		data->query->zone);
3442 #endif /* USE_DNSTAP */
3443 	data->bytes_transmitted = 0;
3444 
3445 	timeout.tv_sec = data->tcp_timeout / 1000;
3446 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3447 
3448 	ev_base = data->event.ev_base;
3449 	event_del(&data->event);
3450 	memset(&data->event, 0, sizeof(data->event));
3451 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
3452 		handle_tcp_reading, data);
3453 	if(event_base_set(ev_base, &data->event) != 0)
3454 		log_msg(LOG_ERR, "event base set tcpr failed");
3455 	if(event_add(&data->event, &timeout) != 0)
3456 		log_msg(LOG_ERR, "event add tcpr failed");
3457 	/* see if we can write the answer right away (usually so, EAGAIN if not) */
3458 	handle_tcp_writing(fd, EV_WRITE, data);
3459 }
3460 
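/*
 * Write a response to a TCP connection: the two octet length prefix,
 * combined with the packet in one writev where available, then the
 * packet itself.  For AXFR the next packet is generated and written
 * until the transfer is done; afterwards the read handler is
 * reinstalled for the next query.
 */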
3461 static void
3462 handle_tcp_writing(int fd, short event, void* arg)
3463 {
3464 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3465 	ssize_t sent;
3466 	struct query *q = data->query;
3467 	struct timeval timeout;
3468 	struct event_base* ev_base;
3469 
3470 	if ((event & EV_TIMEOUT)) {
3471 		/* Connection timed out.  */
3472 		cleanup_tcp_handler(data);
3473 		return;
3474 	}
3475 
3476 	assert((event & EV_WRITE));
3477 
3478 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
3479 		/* Writing the response packet length.  */
3480 		uint16_t n_tcplen = htons(q->tcplen);
3481 #ifdef HAVE_WRITEV
3482 		struct iovec iov[2];
3483 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
3484 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
3485 		iov[1].iov_base = buffer_begin(q->packet);
3486 		iov[1].iov_len = buffer_limit(q->packet);
3487 		sent = writev(fd, iov, 2);
3488 #else /* HAVE_WRITEV */
3489 		sent = write(fd,
3490 			     (const char *) &n_tcplen + data->bytes_transmitted,
3491 			     sizeof(n_tcplen) - data->bytes_transmitted);
3492 #endif /* HAVE_WRITEV */
3493 		if (sent == -1) {
3494 			if (errno == EAGAIN || errno == EINTR) {
3495 				/*
3496 				 * Write would block, wait until
3497 				 * socket becomes writable again.
3498 				 */
3499 				return;
3500 			} else {
3501 #ifdef ECONNRESET
3502 				if(verbosity >= 2 || errno != ECONNRESET)
3503 #endif /* ECONNRESET */
3504 #ifdef EPIPE
3505 				  if(verbosity >= 2 || errno != EPIPE)
3506 #endif /* EPIPE 'broken pipe' */
3507 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
3508 				cleanup_tcp_handler(data);
3509 				return;
3510 			}
3511 		}
3512 
3513 		data->bytes_transmitted += sent;
3514 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
3515 			/*
3516 			 * Writing not complete, wait until socket
3517 			 * becomes writable again.
3518 			 */
3519 			return;
3520 		}
3521 
3522 #ifdef HAVE_WRITEV
3523 		sent -= sizeof(n_tcplen);
3524 		/* writev may also have sent packet data; run the completion check below */
3525 		goto packet_could_be_done;
3526 #endif
3527 	}
3528 
3529 	sent = write(fd,
3530 		     buffer_current(q->packet),
3531 		     buffer_remaining(q->packet));
3532 	if (sent == -1) {
3533 		if (errno == EAGAIN || errno == EINTR) {
3534 			/*
3535 			 * Write would block, wait until
3536 			 * socket becomes writable again.
3537 			 */
3538 			return;
3539 		} else {
3540 #ifdef ECONNRESET
3541 			if(verbosity >= 2 || errno != ECONNRESET)
3542 #endif /* ECONNRESET */
3543 #ifdef EPIPE
3544 			  if(verbosity >= 2 || errno != EPIPE)
3545 #endif /* EPIPE 'broken pipe' */
3546 			    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
3547 			cleanup_tcp_handler(data);
3548 			return;
3549 		}
3550 	}
3551 
3552 	data->bytes_transmitted += sent;
3553 #ifdef HAVE_WRITEV
3554   packet_could_be_done:
3555 #endif
3556 	buffer_skip(q->packet, sent);
3557 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
3558 		/*
3559 		 * Still more data to write when socket becomes
3560 		 * writable again.
3561 		 */
3562 		return;
3563 	}
3564 
3565 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
3566 
3567 	if (data->query_state == QUERY_IN_AXFR) {
3568 		/* Continue processing AXFR and writing back results.  */
3569 		buffer_clear(q->packet);
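		/* query_axfr() refills the packet with the next batch of
		 * zone data; QUERY_PROCESSED signals the transfer is done */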
3570 		data->query_state = query_axfr(data->nsd, q);
3571 		if (data->query_state != QUERY_PROCESSED) {
3572 			query_add_optional(data->query, data->nsd);
3573 
3574 			/* Reset data. */
3575 			buffer_flip(q->packet);
3576 			q->tcplen = buffer_remaining(q->packet);
3577 			data->bytes_transmitted = 0;
3578 			/* Reset timeout.  */
3579 			timeout.tv_sec = data->tcp_timeout / 1000;
3580 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3581 			ev_base = data->event.ev_base;
3582 			event_del(&data->event);
3583 			memset(&data->event, 0, sizeof(data->event));
3584 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
3585 				handle_tcp_writing, data);
3586 			if(event_base_set(ev_base, &data->event) != 0)
3587 				log_msg(LOG_ERR, "event base set tcpw failed");
3588 			if(event_add(&data->event, &timeout) != 0)
3589 				log_msg(LOG_ERR, "event add tcpw failed");
3590 
3591 			/*
3592 			 * Write data if/when the socket is writable
3593 			 * again.
3594 			 */
3595 			return;
3596 		}
3597 	}
3598 
3599 	/*
3600 	 * Done sending, wait for the next request to arrive on the
3601 	 * TCP socket by installing the TCP read handler.
3602 	 */
3603 	if (data->nsd->tcp_query_count > 0 &&
3604 		data->query_count >= data->nsd->tcp_query_count) {
3605 
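		/* half-close: the client sees EOF after the final response;
		 * the read handler then notices the limit and cleans up */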
3606 		(void) shutdown(fd, SHUT_WR);
3607 	}
3608 
3609 	data->bytes_transmitted = 0;
3610 
3611 	timeout.tv_sec = data->tcp_timeout / 1000;
3612 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
3613 	ev_base = data->event.ev_base;
3614 	event_del(&data->event);
3615 	memset(&data->event, 0, sizeof(data->event));
3616 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
3617 		handle_tcp_reading, data);
3618 	if(event_base_set(ev_base, &data->event) != 0)
3619 		log_msg(LOG_ERR, "event base set tcpw failed");
3620 	if(event_add(&data->event, &timeout) != 0)
3621 		log_msg(LOG_ERR, "event add tcpw failed");
3622 }
3623 
3624 #ifdef HAVE_SSL
3625 /** create SSL object and associate fd */
3626 static SSL*
3627 incoming_ssl_fd(SSL_CTX* ctx, int fd)
3628 {
3629 	SSL* ssl = SSL_new(ctx);
3630 	if(!ssl) {
3631 		log_crypto_err("could not SSL_new");
3632 		return NULL;
3633 	}
3634 	SSL_set_accept_state(ssl);
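	/* SSL_MODE_AUTO_RETRY: retry internally over non-application
	 * records instead of failing the SSL_read/SSL_write call */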
3635 	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
3636 	if(!SSL_set_fd(ssl, fd)) {
3637 		log_crypto_err("could not SSL_set_fd");
3638 		SSL_free(ssl);
3639 		return NULL;
3640 	}
3641 	return ssl;
3642 }
3643 
3644 /** TLS handshake to upgrade TCP connection */
3645 static int
3646 tls_handshake(struct tcp_handler_data* data, int fd, int writing)
3647 {
3648 	int r;
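	/*
	 * shake_state tracks what the TLS layer is waiting for:
	 * tls_hs_read/tls_hs_write mean SSL_do_handshake returned
	 * WANT_READ/WANT_WRITE; tls_hs_read_event/tls_hs_write_event mean
	 * SSL_read/SSL_write needed the opposite socket direction and that
	 * condition is now met, so only the event handler is switched back.
	 */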
3649 	if(data->shake_state == tls_hs_read_event) {
3650 		/* read condition satisfied; switch back to writing */
3651 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
3652 		data->shake_state = tls_hs_none;
3653 		return 1;
3654 	}
3655 	if(data->shake_state == tls_hs_write_event) {
3656 		/* write condition satisfied; switch back to reading */
3657 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
3658 		data->shake_state = tls_hs_none;
3659 		return 1;
3660 	}
3661 
3662 	/* (continue to) set up the TLS connection */
3663 	ERR_clear_error();
3664 	r = SSL_do_handshake(data->tls);
3665 
3666 	if(r != 1) {
3667 		int want = SSL_get_error(data->tls, r);
3668 		if(want == SSL_ERROR_WANT_READ) {
3669 			if(data->shake_state == tls_hs_read) {
3670 				/* try again later */
3671 				return 1;
3672 			}
3673 			data->shake_state = tls_hs_read;
3674 			/* switch back to reading mode */
3675 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
3676 			return 1;
3677 		} else if(want == SSL_ERROR_WANT_WRITE) {
3678 			if(data->shake_state == tls_hs_write) {
3679 				/* try again later */
3680 				return 1;
3681 			}
3682 			data->shake_state = tls_hs_write;
3683 			/* switch back to writing mode */
3684 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
3685 			return 1;
3686 		} else {
3687 			if(r == 0)
3688 				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
3689 			else {
3690 				unsigned long err = ERR_get_error();
3691 				if(!squelch_err_ssl_handshake(err)) {
3692 					char a[64], s[256];
3693 					addr2str(&data->query->addr, a, sizeof(a));
3694 					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
3695 					log_crypto_from_err(s, err);
3696 				}
3697 			}
3698 			cleanup_tcp_handler(data);
3699 			return 0;
3700 		}
3701 	}
3702 
3703 	/* Log the successful upgrade, for testing; could be removed. */
3704 	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
3705 	/* set back to the event we need to have when reading (or writing) */
3706 	if(data->shake_state == tls_hs_read && writing) {
3707 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
3708 	} else if(data->shake_state == tls_hs_write && !writing) {
3709 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
3710 	}
3711 	data->shake_state = tls_hs_none;
3712 	return 1;
3713 }
3714 
3715 /** handle TLS reading of incoming query */
3716 static void
3717 handle_tls_reading(int fd, short event, void* arg)
3718 {
3719 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3720 	ssize_t received;
3721 
3722 	if ((event & EV_TIMEOUT)) {
3723 		/* Connection timed out.  */
3724 		cleanup_tcp_handler(data);
3725 		return;
3726 	}
3727 
3728 	if (data->nsd->tcp_query_count > 0 &&
3729 	    data->query_count >= data->nsd->tcp_query_count) {
3730 		/* No more queries allowed on this tcp connection. */
3731 		cleanup_tcp_handler(data);
3732 		return;
3733 	}
3734 
3735 	assert((event & EV_READ));
3736 
3737 	if (data->bytes_transmitted == 0) {
3738 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
3739 	}
3740 
3741 	if(data->shake_state != tls_hs_none) {
3742 		if(!tls_handshake(data, fd, 0))
3743 			return;
3744 		if(data->shake_state != tls_hs_none)
3745 			return;
3746 	}
3747 
3748 	/*
3749 	 * Check if we received the leading packet length bytes yet.
3750 	 */
3751 	if(data->bytes_transmitted < sizeof(uint16_t)) {
3752 		ERR_clear_error();
3753 		if((received=SSL_read(data->tls, (char *) &data->query->tcplen
3754 		    + data->bytes_transmitted,
3755 		    sizeof(uint16_t) - data->bytes_transmitted)) <= 0) {
3756 			int want = SSL_get_error(data->tls, received);
3757 			if(want == SSL_ERROR_ZERO_RETURN) {
3758 				cleanup_tcp_handler(data);
3759 				return; /* shutdown, closed */
3760 			} else if(want == SSL_ERROR_WANT_READ) {
3761 				/* wants to be called again */
3762 				return;
3763 			}
3764 			else if(want == SSL_ERROR_WANT_WRITE) {
3765 				/* switch to writing */
3766 				data->shake_state = tls_hs_write_event;
3767 				tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
3768 				return;
3769 			}
3770 			cleanup_tcp_handler(data);
3771 			log_crypto_err("could not SSL_read");
3772 			return;
3773 		}
3774 
3775 		data->bytes_transmitted += received;
3776 		if (data->bytes_transmitted < sizeof(uint16_t)) {
3777 			/*
3778 			 * Not done with the tcplen yet, wait for more
3779 			 * data to become available.
3780 			 */
3781 			return;
3782 		}
3783 
3784 		assert(data->bytes_transmitted == sizeof(uint16_t));
3785 
3786 		data->query->tcplen = ntohs(data->query->tcplen);
3787 
3788 		/*
3789 		 * Minimum query size is:
3790 		 *
3791 		 *     Size of the header (12)
3792 		 *   + Root domain name   (1)
3793 		 *   + Query class        (2)
3794 		 *   + Query type         (2)
3795 		 */
3796 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
3797 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
3798 			cleanup_tcp_handler(data);
3799 			return;
3800 		}
3801 
3802 		if (data->query->tcplen > data->query->maxlen) {
3803 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
3804 			cleanup_tcp_handler(data);
3805 			return;
3806 		}
3807 
3808 		buffer_set_limit(data->query->packet, data->query->tcplen);
3809 	}
3810 
3811 	assert(buffer_remaining(data->query->packet) > 0);
3812 
3813 	/* Read the (remaining) query data.  */
3814 	ERR_clear_error();
3815 	received = SSL_read(data->tls, (void*)buffer_current(data->query->packet),
3816 			    (int)buffer_remaining(data->query->packet));
3817 	if(received <= 0) {
3818 		int want = SSL_get_error(data->tls, received);
3819 		if(want == SSL_ERROR_ZERO_RETURN) {
3820 			cleanup_tcp_handler(data);
3821 			return; /* shutdown, closed */
3822 		} else if(want == SSL_ERROR_WANT_READ) {
3823 			/* wants to be called again */
3824 			return;
3825 		}
3826 		else if(want == SSL_ERROR_WANT_WRITE) {
3827 			/* switch to writing */
3828 			data->shake_state = tls_hs_write_event;
3829 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
3830 			return;
3831 		}
3832 		cleanup_tcp_handler(data);
3833 		log_crypto_err("could not SSL_read");
3834 		return;
3835 	}
3836 
3837 	data->bytes_transmitted += received;
3838 	buffer_skip(data->query->packet, received);
3839 	if (buffer_remaining(data->query->packet) > 0) {
3840 		/*
3841 		 * Message not yet complete, wait for more data to
3842 		 * become available.
3843 		 */
3844 		return;
3845 	}
3846 
3847 	assert(buffer_position(data->query->packet) == data->query->tcplen);
3848 
3849 	/* Account... */
3850 #ifndef INET6
3851 	STATUP(data->nsd, ctls);
3852 #else
3853 	if (data->query->addr.ss_family == AF_INET) {
3854 		STATUP(data->nsd, ctls);
3855 	} else if (data->query->addr.ss_family == AF_INET6) {
3856 		STATUP(data->nsd, ctls6);
3857 	}
3858 #endif
3859 
3860 	/* We have a complete query, process it.  */
3861 
3862 	/* tcp-query-count: count this query toward the per-connection limit */
3863 	data->query_count++;
3864 
3865 	buffer_flip(data->query->packet);
3866 #ifdef USE_DNSTAP
3867 	dt_collector_submit_auth_query(data->nsd, &data->query->addr,
3868 		data->query->addrlen, data->query->tcp, data->query->packet);
3869 #endif /* USE_DNSTAP */
3870 	data->query_state = server_process_query(data->nsd, data->query);
3871 	if (data->query_state == QUERY_DISCARDED) {
3872 		/* Drop the packet and the entire connection... */
3873 		STATUP(data->nsd, dropped);
3874 		ZTATUP(data->nsd, data->query->zone, dropped);
3875 		cleanup_tcp_handler(data);
3876 		return;
3877 	}
3878 
3879 #ifdef BIND8_STATS
3880 	if (RCODE(data->query->packet) == RCODE_OK
3881 	    && !AA(data->query->packet))
3882 	{
3883 		STATUP(data->nsd, nona);
3884 		ZTATUP(data->nsd, data->query->zone, nona);
3885 	}
3886 #endif /* BIND8_STATS */
3887 
3888 #ifdef USE_ZONE_STATS
3889 #ifndef INET6
3890 	ZTATUP(data->nsd, data->query->zone, ctls);
3891 #else
3892 	if (data->query->addr.ss_family == AF_INET) {
3893 		ZTATUP(data->nsd, data->query->zone, ctls);
3894 	} else if (data->query->addr.ss_family == AF_INET6) {
3895 		ZTATUP(data->nsd, data->query->zone, ctls6);
3896 	}
3897 #endif
3898 #endif /* USE_ZONE_STATS */
3899 
3900 	query_add_optional(data->query, data->nsd);
3901 
3902 	/* Switch to the tcp write handler.  */
3903 	buffer_flip(data->query->packet);
3904 	data->query->tcplen = buffer_remaining(data->query->packet);
3905 #ifdef BIND8_STATS
3906 	/* Account the rcode & TC... */
3907 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
3908 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
3909 	if (TC(data->query->packet)) {
3910 		STATUP(data->nsd, truncated);
3911 		ZTATUP(data->nsd, data->query->zone, truncated);
3912 	}
3913 #endif /* BIND8_STATS */
3914 #ifdef USE_DNSTAP
3915 	dt_collector_submit_auth_response(data->nsd, &data->query->addr,
3916 		data->query->addrlen, data->query->tcp, data->query->packet,
3917 		data->query->zone);
3918 #endif /* USE_DNSTAP */
3919 	data->bytes_transmitted = 0;
3920 
3921 	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
3922 
3923 	/* see if we can write the answer right away (usually we can; EAGAIN if not) */
3924 	handle_tls_writing(fd, EV_WRITE, data);
3925 }
3926 
3927 /** handle TLS writing of outgoing response */
3928 static void
3929 handle_tls_writing(int fd, short event, void* arg)
3930 {
3931 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
3932 	ssize_t sent;
3933 	struct query *q = data->query;
3934 	/* static reassembly buffer used to put the TCP length in front of
3935 	 * the packet, to mimic writev; SSL_write takes a single buffer. */
3936 	static buffer_type* global_tls_temp_buffer = NULL;
3937 	buffer_type* write_buffer;
3938 
3939 	if ((event & EV_TIMEOUT)) {
3940 		/* Connection timed out.  */
3941 		cleanup_tcp_handler(data);
3942 		return;
3943 	}
3944 
3945 	assert((event & EV_WRITE));
3946 
3947 	if(data->shake_state != tls_hs_none) {
3948 		if(!tls_handshake(data, fd, 1))
3949 			return;
3950 		if(data->shake_state != tls_hs_none)
3951 			return;
3952 	}
3953 
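	/* partial-write mode lets SSL_write report success as soon as part
	 * of the buffer is written, like a short write(2) */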
3954 	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);
3955 
3956 	/* If we are writing the start of a message, we must include the
3957 	 * length; this is done by copying both into write_buffer. */
3958 	write_buffer = NULL;
3959 	if (data->bytes_transmitted == 0) {
3960 		if(!global_tls_temp_buffer) {
3961 			/* allocated in nsd.region, so it is deallocated
3962 			 * when nsd shuts down */
3963 			global_tls_temp_buffer = buffer_create(nsd.region,
3964 				QIOBUFSZ + sizeof(q->tcplen));
3965 			if (!global_tls_temp_buffer) {
3966 				return;
3967 			}
3968 		}
3969 		write_buffer = global_tls_temp_buffer;
3970 		buffer_clear(write_buffer);
3971 		buffer_write_u16(write_buffer, q->tcplen);
3972 		buffer_write(write_buffer, buffer_current(q->packet),
3973 			(int)buffer_remaining(q->packet));
3974 		buffer_flip(write_buffer);
3975 	} else {
3976 		write_buffer = q->packet;
3977 	}
3978 
3979 	/* Write the response */
3980 	ERR_clear_error();
3981 	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
3982 	if(sent <= 0) {
3983 		int want = SSL_get_error(data->tls, sent);
3984 		if(want == SSL_ERROR_ZERO_RETURN) {
3985 			cleanup_tcp_handler(data);
3986 			/* closed */
3987 		} else if(want == SSL_ERROR_WANT_READ) {
3988 			/* switch back to reading */
3989 			data->shake_state = tls_hs_read_event;
3990 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
3991 		} else if(want != SSL_ERROR_WANT_WRITE) {
3992 			cleanup_tcp_handler(data);
3993 			log_crypto_err("could not SSL_write");
3994 		}
3995 		return;
3996 	}
3997 
3998 	buffer_skip(write_buffer, sent);
3999 	if(buffer_remaining(write_buffer) != 0) {
4000 		/* not all was sent; if the temp buffer was used, advance the real packet buffer past the payload bytes that went out */
4001 		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
4002 			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
4003 		}
4004 	}
4005 
4006 	data->bytes_transmitted += sent;
4007 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4008 		/*
4009 		 * Still more data to write when socket becomes
4010 		 * writable again.
4011 		 */
4012 		return;
4013 	}
4014 
4015 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4016 
4017 	if (data->query_state == QUERY_IN_AXFR) {
4018 		/* Continue processing AXFR and writing back results.  */
4019 		buffer_clear(q->packet);
4020 		data->query_state = query_axfr(data->nsd, q);
4021 		if (data->query_state != QUERY_PROCESSED) {
4022 			query_add_optional(data->query, data->nsd);
4023 
4024 			/* Reset data. */
4025 			buffer_flip(q->packet);
4026 			q->tcplen = buffer_remaining(q->packet);
4027 			data->bytes_transmitted = 0;
4028 			/* Reset to writing mode.  */
4029 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4030 
4031 			/*
4032 			 * Write data if/when the socket is writable
4033 			 * again.
4034 			 */
4035 			return;
4036 		}
4037 	}
4038 
4039 	/*
4040 	 * Done sending, wait for the next request to arrive on the
4041 	 * TCP socket by installing the TCP read handler.
4042 	 */
4043 	if (data->nsd->tcp_query_count > 0 &&
4044 		data->query_count >= data->nsd->tcp_query_count) {
4045 
4046 		(void) shutdown(fd, SHUT_WR);
4047 	}
4048 
4049 	data->bytes_transmitted = 0;
4050 
4051 	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4052 }
4053 #endif
4054 
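/*
 * Re-enable the TCP accept handlers once the slow-accept backoff
 * expires; the backoff is entered when accept() fails with EMFILE or
 * ENFILE.
 */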
4055 static void
4056 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
4057 	void* ATTR_UNUSED(arg))
4058 {
4059 	if(slowaccept) {
4060 		configure_handler_event_types(EV_PERSIST | EV_READ);
4061 		slowaccept = 0;
4062 	}
4063 }
4064 
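/*
 * Accept a connection and make the new socket non-blocking.  accept4()
 * sets the flag atomically where available; otherwise fall back to
 * accept() followed by fcntl().
 */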
4065 static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
4066 {
4067 #ifndef HAVE_ACCEPT4
4068 	int s = accept(fd, addr, addrlen);
4069 	if (s != -1) {
4070 		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
4071 			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
4072 			close(s);
4073 			s = -1;
4074 			errno=EINTR; /* set errno to EINTR so the caller
4075 				treats this as a transient interrupt and
4076 				omits the accept error printout */
4077 		}
4078 	}
4079 	return s;
4080 #else
4081 	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
4082 #endif /* HAVE_ACCEPT4 */
4083 }
4084 
4085 /*
4086  * Handle an incoming TCP connection.  The connection is accepted and
4087  * a new TCP reader event handler is added.  The TCP handler
4088  * is responsible for cleanup when the connection is closed.
4089  */
4090 static void
4091 handle_tcp_accept(int fd, short event, void* arg)
4092 {
4093 	struct tcp_accept_handler_data *data
4094 		= (struct tcp_accept_handler_data *) arg;
4095 	int s;
4096 	int reject = 0;
4097 	struct tcp_handler_data *tcp_data;
4098 	region_type *tcp_region;
4099 #ifdef INET6
4100 	struct sockaddr_storage addr;
4101 #else
4102 	struct sockaddr_in addr;
4103 #endif
4104 	socklen_t addrlen;
4105 	struct timeval timeout;
4106 
4107 	if (!(event & EV_READ)) {
4108 		return;
4109 	}
4110 
4111 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
4112 		reject = data->nsd->options->tcp_reject_overflow;
4113 		if (!reject) {
4114 			return;
4115 		}
4116 	}
4117 
4118 	/* Accept it... */
4119 	addrlen = sizeof(addr);
4120 	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
4121 	if (s == -1) {
4122 		/**
4123 		 * EMFILE and ENFILE signal that the limit of open file
4124 		 * descriptors has been reached; pause accept().
4125 		 * EINTR is a signal interrupt. The others are various OS ways
4126 		 * of saying that the client has closed the connection.
4127 		 */
4128 		if (errno == EMFILE || errno == ENFILE) {
4129 			if (!slowaccept) {
4130 				/* disable accept events */
4131 				struct timeval tv;
4132 				configure_handler_event_types(0);
4133 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
4134 				tv.tv_usec = 0L;
4135 				memset(&slowaccept_event, 0,
4136 					sizeof(slowaccept_event));
4137 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
4138 					handle_slowaccept_timeout, NULL);
4139 				(void)event_base_set(data->event.ev_base,
4140 					&slowaccept_event);
4141 				(void)event_add(&slowaccept_event, &tv);
4142 				slowaccept = 1;
4143 				/* We don't want to spam the logs here */
4144 			}
4145 		} else if (errno != EINTR
4146 			&& errno != EWOULDBLOCK
4147 #ifdef ECONNABORTED
4148 			&& errno != ECONNABORTED
4149 #endif /* ECONNABORTED */
4150 #ifdef EPROTO
4151 			&& errno != EPROTO
4152 #endif /* EPROTO */
4153 			) {
4154 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
4155 		}
4156 		return;
4157 	}
4158 
4159 	if (reject) {
4160 		shutdown(s, SHUT_RDWR);
4161 		close(s);
4162 		return;
4163 	}
4164 
4165 	/*
4166 	 * This region is deallocated when the TCP connection is
4167 	 * closed by the TCP handler.
4168 	 */
4169 	tcp_region = region_create(xalloc, free);
4170 	tcp_data = (struct tcp_handler_data *) region_alloc(
4171 		tcp_region, sizeof(struct tcp_handler_data));
4172 	tcp_data->region = tcp_region;
4173 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
4174 		compression_table_size, compressed_dnames);
4175 	tcp_data->nsd = data->nsd;
4176 	tcp_data->query_count = 0;
4177 #ifdef HAVE_SSL
4178 	tcp_data->shake_state = tls_hs_none;
4179 	tcp_data->tls = NULL;
4180 #endif
4181 	tcp_data->prev = NULL;
4182 	tcp_data->next = NULL;
4183 
4184 	tcp_data->query_state = QUERY_PROCESSED;
4185 	tcp_data->bytes_transmitted = 0;
4186 	memcpy(&tcp_data->query->addr, &addr, addrlen);
4187 	tcp_data->query->addrlen = addrlen;
4188 
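	/* tcp_timeout is kept in milliseconds; nsd->tcp_timeout is in
	 * seconds */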
4189 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
4190 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
4191 		/* very busy, so use a smaller timeout */
4192 		tcp_data->tcp_timeout = 200;
4193 	}
4194 	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4195 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
4196 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
4197 
4198 #ifdef HAVE_SSL
4199 	if (data->tls_accept) {
4200 		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
4201 		if(!tcp_data->tls) {
4202 			close(s);
4203 			return;
4204 		}
4205 		tcp_data->shake_state = tls_hs_read;
4206 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4207 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4208 			  handle_tls_reading, tcp_data);
4209 	} else {
4210 #endif
4211 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
4212 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
4213 			  handle_tcp_reading, tcp_data);
4214 #ifdef HAVE_SSL
4215 	}
4216 #endif
4217 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
4218 		log_msg(LOG_ERR, "cannot set tcp event base");
4219 		close(s);
4220 		region_destroy(tcp_region);
4221 		return;
4222 	}
4223 	if(event_add(&tcp_data->event, &timeout) != 0) {
4224 		log_msg(LOG_ERR, "cannot add tcp to event base");
4225 		close(s);
4226 		region_destroy(tcp_region);
4227 		return;
4228 	}
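	/* link the handler into the doubly linked list of active tcp
	 * connections */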
4229 	if(tcp_active_list) {
4230 		tcp_active_list->prev = tcp_data;
4231 		tcp_data->next = tcp_active_list;
4232 	}
4233 	tcp_active_list = tcp_data;
4234 
4235 	/*
4236 	 * Keep track of the total number of TCP handlers installed so
4237 	 * we can stop accepting connections when the maximum number
4238 	 * of simultaneous TCP connections is reached.
4239 	 *
4240 	 * If tcp-reject-overflow is enabled, however, then we do not
4241 	 * change the handler event type; we keep it as-is and accept
4242 	 * overflow TCP connections only so that we can forcibly kill
4243 	 * them off.
4244 	 */
4245 	++data->nsd->current_tcp_count;
4246 	if (!data->nsd->options->tcp_reject_overflow &&
4247 	     data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
4248 	{
4249 		configure_handler_event_types(0);
4250 	}
4251 }
4252 
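/*
 * Send the command to every child server over its IPC pipe.  When
 * timeout is nonzero, wait up to that many seconds for the child's
 * reply before the pipe is closed.
 */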
4253 static void
4254 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
4255 {
4256 	size_t i;
4257 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4258 	for (i = 0; i < nsd->child_count; ++i) {
4259 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
4260 			if (write(nsd->children[i].child_fd,
4261 				&command,
4262 				sizeof(command)) == -1)
4263 			{
4264 				if(errno != EAGAIN && errno != EINTR)
4265 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
4266 					(int) command,
4267 					(int) nsd->children[i].pid,
4268 					strerror(errno));
4269 			} else if (timeout > 0) {
4270 				(void)block_read(NULL,
4271 					nsd->children[i].child_fd,
4272 					&command, sizeof(command), timeout);
4273 			}
4274 			fsync(nsd->children[i].child_fd);
4275 			close(nsd->children[i].child_fd);
4276 			nsd->children[i].child_fd = -1;
4277 		}
4278 	}
4279 }
4280 
4281 static void
4282 send_children_quit(struct nsd* nsd)
4283 {
4284 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
4285 	send_children_command(nsd, NSD_QUIT, 0);
4286 }
4287 
4288 static void
4289 send_children_quit_and_wait(struct nsd* nsd)
4290 {
4291 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
4292 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
4293 }
4294 
4295 #ifdef BIND8_STATS
4296 static void
4297 set_children_stats(struct nsd* nsd)
4298 {
4299 	size_t i;
4300 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
4301 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
4302 	for (i = 0; i < nsd->child_count; ++i) {
4303 		nsd->children[i].need_to_send_STATS = 1;
4304 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
4305 	}
4306 }
4307 #endif /* BIND8_STATS */
4308 
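/*
 * Enable (EV_PERSIST|EV_READ) or disable (0) all TCP accept handlers.
 * Used to stop accepting connections when the TCP connection limit or
 * the file descriptor limit is reached, and to resume afterwards.
 */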
4309 static void
4310 configure_handler_event_types(short event_types)
4311 {
4312 	size_t i;
4313 
4314 	for (i = 0; i < tcp_accept_handler_count; ++i) {
4315 		struct event* handler = &tcp_accept_handlers[i].event;
4316 		if(event_types) {
4317 			/* reassign */
4318 			int fd = handler->ev_fd;
4319 			struct event_base* base = handler->ev_base;
4320 			if(tcp_accept_handlers[i].event_added)
4321 				event_del(handler);
4322 			memset(handler, 0, sizeof(*handler));
4323 			event_set(handler, fd, event_types,
4324 				handle_tcp_accept, &tcp_accept_handlers[i]);
4325 			if(event_base_set(base, handler) != 0)
4326 				log_msg(LOG_ERR, "conhand: cannot event_base");
4327 			if(event_add(handler, NULL) != 0)
4328 				log_msg(LOG_ERR, "conhand: cannot event_add");
4329 			tcp_accept_handlers[i].event_added = 1;
4330 		} else {
4331 			/* remove */
4332 			if(tcp_accept_handlers[i].event_added) {
4333 				event_del(handler);
4334 				tcp_accept_handlers[i].event_added = 0;
4335 			}
4336 		}
4337 	}
4338 }
4339