xref: /netbsd-src/external/bsd/nsd/dist/server.c (revision 36f29c42dc045ef9455baf105305a0d7958f2a71)
1 /*
2  * server.c -- nsd(8) network input/output
3  *
4  * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
5  *
6  * See LICENSE for the license.
7  *
8  */
9 
10 #include "config.h"
11 
12 #include <sys/types.h>
13 #include <sys/param.h>
14 #include <limits.h>
15 #include <sys/socket.h>
16 #include <sys/uio.h>
17 #include <sys/wait.h>
18 
19 #include <netinet/in.h>
20 #ifdef USE_TCP_FASTOPEN
21   #include <netinet/tcp.h>
22 #endif
23 #include <arpa/inet.h>
24 
25 #include <assert.h>
26 #include <ctype.h>
27 #include <errno.h>
28 #include <fcntl.h>
29 #include <stddef.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <time.h>
34 #include <unistd.h>
35 #include <signal.h>
36 #include <netdb.h>
37 #include <poll.h>
38 #ifdef HAVE_SYS_RANDOM_H
39 #include <sys/random.h>
40 #endif
41 #ifndef SHUT_WR
42 #define SHUT_WR 1
43 #endif
44 #ifdef HAVE_MMAP
45 #include <sys/mman.h>
46 #endif /* HAVE_MMAP */
47 #ifdef HAVE_OPENSSL_RAND_H
48 #include <openssl/rand.h>
49 #endif
50 #ifdef HAVE_OPENSSL_SSL_H
51 #include <openssl/ssl.h>
52 #endif
53 #ifdef HAVE_OPENSSL_ERR_H
54 #include <openssl/err.h>
55 #endif
56 #ifdef HAVE_OPENSSL_OCSP_H
57 #include <openssl/ocsp.h>
58 #endif
59 #ifndef USE_MINI_EVENT
60 #  ifdef HAVE_EVENT_H
61 #    include <event.h>
62 #  else
63 #    include <event2/event.h>
64 #    include "event2/event_struct.h"
65 #    include "event2/event_compat.h"
66 #  endif
67 #else
68 #  include "mini_event.h"
69 #endif
70 
71 #include "axfr.h"
72 #include "namedb.h"
73 #include "netio.h"
74 #include "xfrd.h"
75 #include "xfrd-tcp.h"
76 #include "xfrd-disk.h"
77 #include "difffile.h"
78 #include "nsec3.h"
79 #include "ipc.h"
80 #include "udb.h"
81 #include "remote.h"
82 #include "lookup3.h"
83 #include "rrl.h"
84 #include "ixfr.h"
85 #ifdef USE_DNSTAP
86 #include "dnstap/dnstap_collector.h"
87 #endif
88 #include "verify.h"
89 #include "util/proxy_protocol.h"
90 
91 #define RELOAD_SYNC_TIMEOUT 25 /* seconds */
92 
93 #ifdef USE_DNSTAP
94 /*
95  * log_addr() - print the content of a sockaddr_in/sockaddr_in6 structure,
96  * just like it's done in Unbound via the same log_addr(VERB_LEVEL, const char*, sockaddr_storage*)
97  */
98 static void
99 log_addr(const char* descr,
100 #ifdef INET6
101 	struct sockaddr_storage* addr
102 #else
103 	struct sockaddr_in* addr
104 #endif
105 	)
106 {
107 	char str_buf[64];
108 	if(verbosity < 6)
109 		return;
110 	if(
111 #ifdef INET6
112 		addr->ss_family == AF_INET
113 #else
114 		addr->sin_family == AF_INET
115 #endif
116 		) {
117 		struct sockaddr_in* s = (struct sockaddr_in*)addr;
118 		inet_ntop(AF_INET, &s->sin_addr.s_addr, str_buf, sizeof(str_buf));
119 		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s->sin_port)));
120 #ifdef INET6
121 	} else {
122 		struct sockaddr_in6* s6 = (struct sockaddr_in6*)addr;
123 		inet_ntop(AF_INET6, &s6->sin6_addr.s6_addr, str_buf, sizeof(str_buf));
124 		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s6->sin6_port)));
125 #endif
126 	}
127 }
128 #endif /* USE_DNSTAP */
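/*
 * Usage sketch (hypothetical caller, not from this file): at verbosity 6
 * and up, the function above logs e.g.
 *   "query from client: address is: 192.0.2.1, port is: 5300"
 * for a call like log_addr("query from client", &ss).
 */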
129 
130 #ifdef USE_TCP_FASTOPEN
131   #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
132   #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
133 #endif
134 
135 /* header state for the PROXYv2 header (for TCP) */
136 enum pp2_header_state {
137 	/* no header encountered yet */
138 	pp2_header_none = 0,
139 	/* read the static part of the header */
140 	pp2_header_init,
141 	/* read the full header */
142 	pp2_header_done
143 };
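/*
 * Illustrative sketch (an assumption, not NSD's actual parser in
 * util/proxy_protocol.h): how a reader can step through the states above.
 * The PROXYv2 wire format begins with a fixed 16-octet part (12-octet
 * signature, version/command, family/protocol, and a 16-bit network-order
 * length of the variable part that follows).  The pp2_next_state name is
 * hypothetical.
 */
#if 0
static enum pp2_header_state
pp2_next_state(const uint8_t* buf, size_t have)
{
	size_t var_len;
	if(have < 16)
		return pp2_header_none;	/* still reading the static part */
	/* octets 14-15 carry the length of the variable part */
	var_len = ((size_t)buf[14] << 8) | (size_t)buf[15];
	if(have < 16 + var_len)
		return pp2_header_init;	/* static part read, remainder pending */
	return pp2_header_done;	/* full header available */
}
#endif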
144 
145 /*
146  * Data for the UDP handlers.
147  */
148 struct udp_handler_data
149 {
150 	struct nsd        *nsd;
151 	struct nsd_socket *socket;
152 	struct event       event;
153 	/* if set, PROXYv2 is expected on this connection */
154 	int pp2_enabled;
155 };
156 
157 struct tcp_accept_handler_data {
158 	struct nsd        *nsd;
159 	struct nsd_socket *socket;
160 	int                event_added;
161 	struct event       event;
162 #ifdef HAVE_SSL
163 	/* handler accepts TLS connections on the dedicated port */
164 	int                tls_accept;
165 #endif
166 	/* if set, PROXYv2 is expected on this connection */
167 	int pp2_enabled;
168 };
169 
170 /*
171  * These globals are used to enable the TCP accept handlers
172  * when the number of TCP connections drops below the maximum
173  * number of TCP connections.
174  */
175 static size_t tcp_accept_handler_count;
176 static struct tcp_accept_handler_data *tcp_accept_handlers;
177 
178 static struct event slowaccept_event;
179 static int slowaccept;
180 
181 #ifdef HAVE_SSL
182 static unsigned char *ocspdata = NULL;
183 static long ocspdata_len = 0;
184 #endif
185 
186 #ifdef NONBLOCKING_IS_BROKEN
187 /* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
188    read multiple times from a socket when reported ready by select. */
189 # define NUM_RECV_PER_SELECT (1)
190 #else /* !NONBLOCKING_IS_BROKEN */
191 # define NUM_RECV_PER_SELECT (100)
192 #endif /* NONBLOCKING_IS_BROKEN */
193 
194 #ifndef HAVE_MMSGHDR
195 struct mmsghdr {
196 	struct msghdr msg_hdr;
197 	unsigned int  msg_len;
198 };
199 #endif
200 
201 static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
202 static struct iovec iovecs[NUM_RECV_PER_SELECT];
203 static struct query *queries[NUM_RECV_PER_SELECT];
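/*
 * Illustrative sketch (an assumption, not the actual setup code): the
 * msgs/iovecs/queries arrays above let one ready UDP socket be drained in
 * batches of up to NUM_RECV_PER_SELECT datagrams, e.g. with recvmmsg(2)
 * where available.  The prime_batch/pktbufs names are hypothetical.
 */
#if 0
static void
prime_batch(uint8_t* pktbufs[], size_t bufsz)
{
	int i;
	for(i = 0; i < NUM_RECV_PER_SELECT; i++) {
		iovecs[i].iov_base = pktbufs[i];
		iovecs[i].iov_len = bufsz;
		memset(&msgs[i].msg_hdr, 0, sizeof(msgs[i].msg_hdr));
		msgs[i].msg_hdr.msg_iov = &iovecs[i];
		msgs[i].msg_hdr.msg_iovlen = 1;
	}
	/* then: n = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
	 * and msgs[i].msg_len holds the size of each received datagram */
}
#endif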
204 
205 /*
206  * Data for the TCP connection handlers.
207  *
208  * The TCP handlers use non-blocking I/O.  This is necessary to avoid
209  * blocking the entire server on a slow TCP connection, but does make
210  * reading from and writing to the socket more complicated.
211  *
212  * Basically, whenever a read/write would block (indicated by the
213  * EAGAIN errno variable) we remember the position we were reading
214  * from/writing to and return from the TCP reading/writing event
215  * handler.  When the socket becomes readable/writable again we
216  * continue from the same position.
217  */
218 struct tcp_handler_data
219 {
220 	/*
221 	 * The region used to allocate all TCP connection related
222 	 * data, including this structure.  This region is destroyed
223 	 * when the connection is closed.
224 	 */
225 	region_type*		region;
226 
227 	/*
228 	 * The global nsd structure.
229 	 */
230 	struct nsd*			nsd;
231 
232 	/*
233 	 * The current query data for this TCP connection.
234 	 */
235 	query_type*			query;
236 
237 	/*
238 	 * The query_state is used to remember if we are performing an
239 	 * AXFR, if we're done processing, or if we should discard the
240 	 * query and connection.
241 	 */
242 	query_state_type	query_state;
243 
244 	/*
245 	 * The event for the file descriptor and tcp timeout
246 	 */
247 	struct event event;
248 
249 	/*
250 	 * The bytes_transmitted field is used to remember the number
251 	 * of bytes transmitted when receiving or sending a DNS
252 	 * packet.  The count includes the two additional bytes used
253 	 * to specify the packet length on a TCP connection.
254 	 */
255 	size_t				bytes_transmitted;
256 
257 	/* If the query is restarted and needs a reset */
258 	int query_needs_reset;
259 
260 	/*
261 	 * The number of queries handled by this specific TCP connection.
262 	 */
263 	int					query_count;
264 
265 	/*
266 	 * The timeout in msec for this tcp connection
267 	 */
268 	int	tcp_timeout;
269 
270 	/*
271 	 * If the connection is allowed to have further queries on it.
272 	 */
273 	int tcp_no_more_queries;
274 
275 #ifdef USE_DNSTAP
276 	/* the accept socket, to find the proper service (local) address the socket is bound to. */
277 	struct nsd_socket *socket;
278 #endif /* USE_DNSTAP */
279 
280 	/* if set, PROXYv2 is expected on this connection */
281 	int pp2_enabled;
282 
283 	/* header state for the PROXYv2 header (for TCP) */
284 	enum pp2_header_state pp2_header_state;
285 
286 #ifdef HAVE_SSL
287 	/*
288 	 * TLS object.
289 	 */
290 	SSL* tls;
291 
292 	/*
293 	 * TLS handshake state.
294 	 */
295 	enum { tls_hs_none, tls_hs_read, tls_hs_write,
296 		tls_hs_read_event, tls_hs_write_event } shake_state;
297 #endif
298 	/* list of connections, for service of remaining tcp channels */
299 	struct tcp_handler_data *prev, *next;
300 };
301 /* global that is the list of active tcp channels */
302 static struct tcp_handler_data *tcp_active_list = NULL;
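/*
 * Illustrative sketch (an assumption, not the real handler code): the
 * resume-on-EAGAIN pattern described above.  bytes_transmitted remembers
 * how far the write got, so a later invocation of the event handler
 * continues at the same offset.  The resume_write name is hypothetical.
 */
#if 0
static void
resume_write(struct tcp_handler_data* data, int fd,
	const uint8_t* pkt, size_t pktlen)
{
	while(data->bytes_transmitted < pktlen) {
		ssize_t n = write(fd, pkt + data->bytes_transmitted,
			pktlen - data->bytes_transmitted);
		if(n == -1) {
			if(errno == EAGAIN || errno == EWOULDBLOCK)
				return;	/* wait for the next writable event */
			if(errno == EINTR)
				continue;	/* interrupted, just retry */
			return;	/* real error; the caller closes the connection */
		}
		data->bytes_transmitted += (size_t)n;
	}
	/* response complete; reset and wait for the next query */
}
#endif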
303 
304 /*
305  * Handle incoming queries on the UDP server sockets.
306  */
307 static void handle_udp(int fd, short event, void* arg);
308 
309 /*
310  * Handle incoming connections on the TCP sockets.  These handlers
311  * usually wait for the NETIO_EVENT_READ event (indicating an incoming
312  * connection) but are disabled when the number of current TCP
313  * connections is equal to the maximum number of TCP connections.
314  * Disabling is done by changing the handler to wait for the
315  * NETIO_EVENT_NONE type.  This is done using the function
316  * configure_handler_event_types.
317  */
318 static void handle_tcp_accept(int fd, short event, void* arg);
319 
320 /*
321  * Handle incoming queries on a TCP connection.  The TCP connections
322  * are configured to be non-blocking and the handler may be called
323  * multiple times before a complete query is received.
324  */
325 static void handle_tcp_reading(int fd, short event, void* arg);
326 
327 /*
328  * Handle outgoing responses on a TCP connection.  The TCP connections
329  * are configured to be non-blocking and the handler may be called
330  * multiple times before a complete response is sent.
331  */
332 static void handle_tcp_writing(int fd, short event, void* arg);
333 
334 #ifdef HAVE_SSL
335 /* Create SSL object and associate fd */
336 static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
337 /*
338  * Handle TLS handshake. May be called multiple times if incomplete.
339  */
340 static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);
341 
342 /*
343  * Handle incoming queries on a TLS over TCP connection.  The TLS
344  * connections are configured to be non-blocking and the handler may
345  * be called multiple times before a complete query is received.
346  */
347 static void handle_tls_reading(int fd, short event, void* arg);
348 
349 /*
350  * Handle outgoing responses on a TLS over TCP connection.  The TLS
351  * connections are configured to be non-blocking and the handler may
352  * be called multiple times before a complete response is sent.
353  */
354 static void handle_tls_writing(int fd, short event, void* arg);
355 #endif
356 
357 /*
358  * Send all children the quit command nonblocking, then close the pipe.
359  */
360 static void send_children_quit(struct nsd* nsd);
361 /* same, for shutdown time, waits for child to exit to avoid restart issues */
362 static void send_children_quit_and_wait(struct nsd* nsd);
363 
364 /* set children's flags to send NSD_STATS to them */
365 #ifdef BIND8_STATS
366 static void set_children_stats(struct nsd* nsd);
367 #endif /* BIND8_STATS */
368 
369 /*
370  * Change the event types the HANDLERS are interested in to EVENT_TYPES.
371  */
372 static void configure_handler_event_types(short event_types);
373 
374 static uint16_t *compressed_dname_offsets = 0;
375 static uint32_t compression_table_capacity = 0;
376 static uint32_t compression_table_size = 0;
377 static domain_type* compressed_dnames[MAXRRSPP];
378 
379 #ifdef USE_TCP_FASTOPEN
380 /* Checks to see if the kernel value must be manually changed in order for
381    TCP Fast Open to support server mode */
382 static void report_tcp_fastopen_config() {
383 
384 	int tcp_fastopen_fp;
385 	uint8_t tcp_fastopen_value;
386 
387 	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
388 		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
		return; /* cannot inspect the kernel setting */
389 	}
390 	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
391 		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno));
392 		close(tcp_fastopen_fp);
		return; /* cannot inspect the kernel setting */
393 	}
394 	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
395 		log_msg(LOG_WARNING, "TCP Fast Open support is available and enabled in NSD by default.\n");
396 		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
397 		log_msg(LOG_WARNING, "To enable TFO use the command:\n");
398 		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
399 		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
400 		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
401 		close(tcp_fastopen_fp);
		return;
402 	}
403 	close(tcp_fastopen_fp);
404 }
405 #endif
406 
407 /*
408  * Remove the specified pid from the list of child pids.  Returns -1 if
409  * the pid is not in the list, child_num otherwise.  The pid field is set to 0.
410  */
411 static int
412 delete_child_pid(struct nsd *nsd, pid_t pid)
413 {
414 	size_t i;
415 	for (i = 0; i < nsd->child_count; ++i) {
416 		if (nsd->children[i].pid == pid) {
417 			nsd->children[i].pid = 0;
418 			if(!nsd->children[i].need_to_exit) {
419 				if(nsd->children[i].child_fd != -1)
420 					close(nsd->children[i].child_fd);
421 				nsd->children[i].child_fd = -1;
422 				if(nsd->children[i].handler)
423 					nsd->children[i].handler->fd = -1;
424 			}
425 			return i;
426 		}
427 	}
428 	return -1;
429 }
430 
431 /*
432  * Restart child servers if necessary.
433  */
434 static int
435 restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
436 	int* xfrd_sock_p)
437 {
438 	struct main_ipc_handler_data *ipc_data;
439 	size_t i;
440 	int sv[2];
441 
442 	/* Fork the child processes... */
443 	for (i = 0; i < nsd->child_count; ++i) {
444 		if (nsd->children[i].pid <= 0) {
445 			if (nsd->children[i].child_fd != -1)
446 				close(nsd->children[i].child_fd);
447 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
448 				log_msg(LOG_ERR, "socketpair: %s",
449 					strerror(errno));
450 				return -1;
451 			}
452 			nsd->children[i].child_fd = sv[0];
453 			nsd->children[i].parent_fd = sv[1];
454 			nsd->children[i].pid = fork();
455 			switch (nsd->children[i].pid) {
456 			default: /* SERVER MAIN */
457 				close(nsd->children[i].parent_fd);
458 				nsd->children[i].parent_fd = -1;
459 				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
460 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
461 				}
462 				if(!nsd->children[i].handler)
463 				{
464 					ipc_data = (struct main_ipc_handler_data*) region_alloc(
465 						region, sizeof(struct main_ipc_handler_data));
466 					ipc_data->nsd = nsd;
467 					ipc_data->child = &nsd->children[i];
468 					ipc_data->child_num = i;
469 					ipc_data->xfrd_sock = xfrd_sock_p;
470 					ipc_data->packet = buffer_create(region, QIOBUFSZ);
471 					ipc_data->forward_mode = 0;
472 					ipc_data->got_bytes = 0;
473 					ipc_data->total_bytes = 0;
474 					ipc_data->acl_num = 0;
475 					nsd->children[i].handler = (struct netio_handler*) region_alloc(
476 						region, sizeof(struct netio_handler));
477 					nsd->children[i].handler->fd = nsd->children[i].child_fd;
478 					nsd->children[i].handler->timeout = NULL;
479 					nsd->children[i].handler->user_data = ipc_data;
480 					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
481 					nsd->children[i].handler->event_handler = parent_handle_child_command;
482 					netio_add_handler(netio, nsd->children[i].handler);
483 				}
484 				/* clear any ongoing ipc */
485 				ipc_data = (struct main_ipc_handler_data*)
486 					nsd->children[i].handler->user_data;
487 				ipc_data->forward_mode = 0;
488 				/* restart - update fd */
489 				nsd->children[i].handler->fd = nsd->children[i].child_fd;
490 				break;
491 			case 0: /* CHILD */
492 #ifdef MEMCLEAN /* OS collects memory pages */
493 				region_destroy(region);
494 #endif
495 				nsd->pid = 0;
496 				nsd->child_count = 0;
497 				nsd->server_kind = nsd->children[i].kind;
498 				nsd->this_child = &nsd->children[i];
499 				nsd->this_child->child_num = i;
500 				/* remove signal flags inherited from parent;
501 				   the parent will handle them. */
502 				nsd->signal_hint_reload_hup = 0;
503 				nsd->signal_hint_reload = 0;
504 				nsd->signal_hint_child = 0;
505 				nsd->signal_hint_quit = 0;
506 				nsd->signal_hint_shutdown = 0;
507 				nsd->signal_hint_stats = 0;
508 				nsd->signal_hint_statsusr = 0;
509 				close(*xfrd_sock_p);
510 				close(nsd->this_child->child_fd);
511 				nsd->this_child->child_fd = -1;
512 				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
513 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
514 				}
515 				server_child(nsd);
516 				/* NOTREACHED */
517 				exit(0);
518 			case -1:
519 				log_msg(LOG_ERR, "fork failed: %s",
520 					strerror(errno));
521 				return -1;
522 			}
523 		}
524 	}
525 	return 0;
526 }
527 
528 #ifdef BIND8_STATS
529 static void set_bind8_alarm(struct nsd* nsd)
530 {
531 	/* resync so that the next alarm is on the next whole minute */
532 	if(nsd->st_period > 0) /* % by 0 gives divbyzero error */
533 		alarm(nsd->st_period - (time(NULL) % nsd->st_period));
534 }
535 #endif
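/*
 * Worked example for the resync above: with st_period == 60 and
 * time(NULL) == 1000000007, 1000000007 % 60 == 47, so alarm(13) is set
 * and fires at 1000000020, again a whole multiple of the period.
 */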
536 
537 /* set zone stat ids for zones initially read in */
538 static void
539 zonestatid_tree_set(struct nsd* nsd)
540 {
541 	struct radnode* n;
542 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
543 		zone_type* zone = (zone_type*)n->elem;
544 		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
545 	}
546 }
547 
548 #ifdef USE_ZONE_STATS
549 void
550 server_zonestat_alloc(struct nsd* nsd)
551 {
552 	size_t num = (nsd->options->zonestatnames->count==0?1:
553 			nsd->options->zonestatnames->count);
554 	size_t sz = sizeof(struct nsdst)*num;
555 	char tmpfile[256];
556 	uint8_t z = 0;
557 
558 	/* file names */
559 	nsd->zonestatfname[0] = 0;
560 	nsd->zonestatfname[1] = 0;
561 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
562 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
563 	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
564 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
565 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
566 	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);
567 
568 	/* file descriptors */
569 	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
570 	if(nsd->zonestatfd[0] == -1) {
571 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
572 			strerror(errno));
573 		exit(1);
574 	}
575 	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
576 	if(nsd->zonestatfd[1] == -1) {
577 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
578 			strerror(errno));
579 		close(nsd->zonestatfd[0]);
580 		unlink(nsd->zonestatfname[0]);
581 		exit(1);
582 	}
583 
584 #ifdef HAVE_MMAP
585 	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
586 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
587 			strerror(errno));
588 		exit(1);
589 	}
590 	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
591 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
592 			nsd->zonestatfname[0], strerror(errno));
593 		exit(1);
594 	}
595 	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
596 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
597 			strerror(errno));
598 		exit(1);
599 	}
600 	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
601 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
602 			nsd->zonestatfname[1], strerror(errno));
603 		exit(1);
604 	}
605 	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
606 		MAP_SHARED, nsd->zonestatfd[0], 0);
607 	if(nsd->zonestat[0] == MAP_FAILED) {
608 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
609 		unlink(nsd->zonestatfname[0]);
610 		unlink(nsd->zonestatfname[1]);
611 		exit(1);
612 	}
613 	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
614 		MAP_SHARED, nsd->zonestatfd[1], 0);
615 	if(nsd->zonestat[1] == MAP_FAILED) {
616 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
617 		unlink(nsd->zonestatfname[0]);
618 		unlink(nsd->zonestatfname[1]);
619 		exit(1);
620 	}
621 	memset(nsd->zonestat[0], 0, sz);
622 	memset(nsd->zonestat[1], 0, sz);
623 	nsd->zonestatsize[0] = num;
624 	nsd->zonestatsize[1] = num;
625 	nsd->zonestatdesired = num;
626 	nsd->zonestatsizenow = num;
627 	nsd->zonestatnow = nsd->zonestat[0];
628 #endif /* HAVE_MMAP */
629 }
630 
631 void
632 zonestat_remap(struct nsd* nsd, int idx, size_t sz)
633 {
634 #ifdef HAVE_MMAP
635 #ifdef MREMAP_MAYMOVE
636 	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
637 		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
638 		MREMAP_MAYMOVE);
639 	if(nsd->zonestat[idx] == MAP_FAILED) {
640 		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
641 		exit(1);
642 	}
643 #else /* !MREMAP_MAYMOVE */
644 	if(msync(nsd->zonestat[idx],
645 		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
646 		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
647 	if(munmap(nsd->zonestat[idx],
648 		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
649 		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
650 	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
651 		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
652 	if(nsd->zonestat[idx] == MAP_FAILED) {
653 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
654 		exit(1);
655 	}
656 #endif /* MREMAP_MAYMOVE */
657 #endif /* HAVE_MMAP */
658 }
659 
660 /* realloc the zonestat array for the one that is not currently in use,
661  * to match the desired new size of the array (if applicable) */
662 void
663 server_zonestat_realloc(struct nsd* nsd)
664 {
665 #ifdef HAVE_MMAP
666 	uint8_t z = 0;
667 	size_t sz;
668 	int idx = 0; /* index of the zonestat array that is not in use */
669 	if(nsd->zonestatnow == nsd->zonestat[0])
670 		idx = 1;
671 	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
672 		return;
673 	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
674 	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
675 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
676 			strerror(errno));
677 		exit(1);
678 	}
679 	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
680 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
681 			nsd->zonestatfname[idx], strerror(errno));
682 		exit(1);
683 	}
684 	zonestat_remap(nsd, idx, sz);
685 	/* zero the newly allocated region */
686 	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
687 		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
688 			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
689 			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
690 	}
691 	nsd->zonestatsize[idx] = nsd->zonestatdesired;
692 #endif /* HAVE_MMAP */
693 }
694 
695 /* switch over to use the other array for the new children, which
696  * briefly coexist with the old children; we want to avoid them
697  * both writing to the same statistics arrays. */
698 void
699 server_zonestat_switch(struct nsd* nsd)
700 {
701 	if(nsd->zonestatnow == nsd->zonestat[0]) {
702 		nsd->zonestatnow = nsd->zonestat[1];
703 		nsd->zonestatsizenow = nsd->zonestatsize[1];
704 	} else {
705 		nsd->zonestatnow = nsd->zonestat[0];
706 		nsd->zonestatsizenow = nsd->zonestatsize[0];
707 	}
708 }
709 #endif /* USE_ZONE_STATS */
710 
711 #ifdef BIND8_STATS
712 void
713 server_stat_alloc(struct nsd* nsd)
714 {
715 	char tmpfile[256];
716 	size_t sz = sizeof(struct nsdst) * nsd->child_count * 2;
717 	uint8_t z = 0;
718 
719 	/* file name */
720 	nsd->statfname = 0;
721 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.stat",
722 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
723 	nsd->statfname = region_strdup(nsd->region, tmpfile);
724 
725 	/* file descriptor */
726 	nsd->statfd = open(nsd->statfname, O_CREAT|O_RDWR, 0600);
727 	if(nsd->statfd == -1) {
728 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->statfname,
729 			strerror(errno));
730 		unlink(nsd->zonestatfname[0]);
731 		unlink(nsd->zonestatfname[1]);
732 		exit(1);
733 	}
734 
735 #ifdef HAVE_MMAP
736 	if(lseek(nsd->statfd, (off_t)sz-1, SEEK_SET) == -1) {
737 		log_msg(LOG_ERR, "lseek %s: %s", nsd->statfname,
738 			strerror(errno));
739 		goto fail_exit;
740 	}
741 	if(write(nsd->statfd, &z, 1) == -1) {
742 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
743 			nsd->statfname, strerror(errno));
744 		goto fail_exit;
745 	}
746 	nsd->stat_map = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
747 		MAP_SHARED, nsd->statfd, 0);
748 	if(nsd->stat_map == MAP_FAILED) {
749 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
750 fail_exit:
751 		close(nsd->statfd);
752 		unlink(nsd->statfname);
753 		unlink(nsd->zonestatfname[0]);
754 		unlink(nsd->zonestatfname[1]);
755 		exit(1);
756 	}
757 	memset(nsd->stat_map, 0, sz);
758 	nsd->stats_per_child[0] = nsd->stat_map;
759 	nsd->stats_per_child[1] = &nsd->stat_map[nsd->child_count];
760 	nsd->stat_current = 0;
761 	nsd->st = &nsd->stats_per_child[nsd->stat_current][0];
762 #endif /* HAVE_MMAP */
763 }
764 #endif /* BIND8_STATS */
765 
766 #ifdef BIND8_STATS
767 void
768 server_stat_free(struct nsd* nsd)
769 {
770 	unlink(nsd->statfname);
771 }
772 #endif /* BIND8_STATS */
773 
774 static void
775 cleanup_dname_compression_tables(void *ptr)
776 {
777 	free(ptr);
778 	compressed_dname_offsets = NULL;
779 	compression_table_capacity = 0;
780 }
781 
782 static void
783 initialize_dname_compression_tables(struct nsd *nsd)
784 {
785 	size_t needed = domain_table_count(nsd->db->domains) + 1;
786 	needed += EXTRA_DOMAIN_NUMBERS;
787 	if(compression_table_capacity < needed) {
788 		if(compressed_dname_offsets) {
789 			region_remove_cleanup(nsd->db->region,
790 				cleanup_dname_compression_tables,
791 				compressed_dname_offsets);
792 			free(compressed_dname_offsets);
793 		}
794 		compressed_dname_offsets = (uint16_t *) xmallocarray(
795 			needed, sizeof(uint16_t));
796 		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
797 			compressed_dname_offsets);
798 		compression_table_capacity = needed;
799 		compression_table_size=domain_table_count(nsd->db->domains)+1;
800 	}
801 	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
802 	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
803 }
804 
805 static int
806 set_cloexec(struct nsd_socket *sock)
807 {
808 	assert(sock != NULL);
809 
810 	if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) {
811 		const char *socktype =
812 			sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
813 		log_msg(LOG_ERR, "fcntl(..., FD_CLOEXEC) failed for %s: %s",
814 			socktype, strerror(errno));
815 		return -1;
816 	}
817 
818 	return 1;
819 }
820 
821 static int
822 set_reuseport(struct nsd_socket *sock)
823 {
824 #ifdef SO_REUSEPORT
825 	int on = 1;
826 #ifdef SO_REUSEPORT_LB
827 	/* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like
828 	 * SO_REUSEPORT on Linux. This is what the users want with the config
829 	 * option in nsd.conf; if we actually need local address and port reuse,
830 	 * those sockets will also need SO_REUSEPORT set; assume it is
831 	 * _LB they want.
832 	 */
833 	int opt = SO_REUSEPORT_LB;
834 	static const char optname[] = "SO_REUSEPORT_LB";
835 #else /* !SO_REUSEPORT_LB */
836 	int opt = SO_REUSEPORT;
837 	static const char optname[] = "SO_REUSEPORT";
838 #endif /* SO_REUSEPORT_LB */
839 
840 	if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) {
841 		return 1;
842 	} else if(verbosity >= 3 || errno != ENOPROTOOPT) {
843 		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
844 			optname, strerror(errno));
845 	}
846 	return -1;
847 #else
848 	(void)sock;
849 #endif /* SO_REUSEPORT */
850 
851 	return 0;
852 }
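/*
 * Note (inferred from the code): these set_* helpers return 1 on success,
 * 0 when the option is not supported on this platform, and -1 on error,
 * so callers can distinguish "absent" from "failed".
 */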
853 
854 static int
855 set_reuseaddr(struct nsd_socket *sock)
856 {
857 #ifdef SO_REUSEADDR
858 	int on = 1;
859 	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
860 		return 1;
861 	}
862 	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
863 		strerror(errno));
864 	return -1;
865 #endif /* SO_REUSEADDR */
866 	return 0;
867 }
868 
869 static int
870 set_rcvbuf(struct nsd_socket *sock, int rcv)
871 {
872 #ifdef SO_RCVBUF
873 #ifdef SO_RCVBUFFORCE
874 	if(0 == setsockopt(
875 		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
876 	{
877 		return 1;
878 	}
879 	if(errno == EPERM || errno == ENOBUFS) {
880 		return 0;
881 	}
882 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
883 		strerror(errno));
884 	return -1;
885 #else /* !SO_RCVBUFFORCE */
886 	if (0 == setsockopt(
887 		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
888 	{
889 		return 1;
890 	}
891 	if(errno == ENOSYS || errno == ENOBUFS) {
892 		return 0;
893 	}
894 	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
895 		strerror(errno));
896 	return -1;
897 #endif /* SO_RCVBUFFORCE */
898 #endif /* SO_RCVBUF */
899 
900 	return 0;
901 }
902 
903 static int
904 set_sndbuf(struct nsd_socket *sock, int snd)
905 {
906 #ifdef SO_SNDBUF
907 #ifdef SO_SNDBUFFORCE
908 	if(0 == setsockopt(
909 		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
910 	{
911 		return 1;
912 	}
913 	if(errno == EPERM || errno == ENOBUFS) {
914 		return 0;
915 	}
916 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
917 		strerror(errno));
918 	return -1;
919 #else /* !SO_SNDBUFFORCE */
920 	if(0 == setsockopt(
921 		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
922 	{
923 		return 1;
924 	}
925 	if(errno == ENOSYS || errno == ENOBUFS) {
926 		return 0;
927 	}
928 	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
929 		strerror(errno));
930 	return -1;
931 #endif /* SO_SNDBUFFORCE */
932 #endif /* SO_SNDBUF */
933 
934 	return 0;
935 }
936 
937 static int
938 set_nonblock(struct nsd_socket *sock)
939 {
940 	const char *socktype =
941 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
942 
943 	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
944 		log_msg(LOG_ERR, "fctnl(..., O_NONBLOCK) failed for %s: %s",
945 			socktype, strerror(errno));
946 		return -1;
947 	}
948 
949 	return 1;
950 }
951 
952 #ifdef INET6
953 static int
954 set_ipv6_v6only(struct nsd_socket *sock)
955 {
956 #ifdef IPV6_V6ONLY
957 	int on = 1;
958 	const char *socktype =
959 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
960 
961 	if(0 == setsockopt(
962 		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
963 	{
964 		return 1;
965 	}
966 
967 	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
968 		socktype, strerror(errno));
969 	return -1;
970 #else
971 	(void)sock;
972 #endif /* IPV6_V6ONLY */
973 
974 	return 0;
975 }
976 #endif /* INET6 */
977 
978 #ifdef INET6
979 static int
980 set_ipv6_use_min_mtu(struct nsd_socket *sock)
981 {
982 #if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU)
983 #if defined(IPV6_USE_MIN_MTU)
984 	/* There is no fragmentation of IPv6 datagrams during forwarding in the
985 	 * network. Therefore we do not send UDP datagrams larger than the
986 	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
987 	 * larger if the network stack supports IPV6_USE_MIN_MTU.
988 	 */
989 	int opt = IPV6_USE_MIN_MTU;
990 	int optval = 1;
991 	static const char optname[] = "IPV6_USE_MIN_MTU";
992 #elif defined(IPV6_MTU)
993 	/* On Linux, PMTUD is disabled by default for datagrams so set the MTU
994 	 * to the MIN MTU to get the same.
995 	 */
996 	int opt = IPV6_MTU;
997 	int optval = IPV6_MIN_MTU;
998 	static const char optname[] = "IPV6_MTU";
999 #endif
1000 	if(0 == setsockopt(
1001 		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
1002 	{
1003 		return 1;
1004 	}
1005 
1006 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
1007 		optname, strerror(errno));
1008 	return -1;
1009 #else
1010 	(void)sock;
1011 #endif /* INET6 */
1012 
1013 	return 0;
1014 }
1015 #endif /* INET6 */
1016 
1017 static int
1018 set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
1019 {
1020 	int ret = 0;
1021 
1022 #if defined(IP_MTU_DISCOVER)
1023 	int opt = IP_MTU_DISCOVER;
1024 	int optval;
1025 # if defined(IP_PMTUDISC_OMIT)
1026 	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
1027 	 * information and send packets with DF=0. Fragmentation is allowed if
1028 	 * and only if the packet size exceeds the outgoing interface MTU or
1029 	 * the packet encounters smaller MTU link in network. This mitigates
1030 	 * DNS fragmentation attacks by preventing forged PMTU information.
1031 	 * FreeBSD already has the same semantics without setting the option.
1032 	 */
1033 	optval = IP_PMTUDISC_OMIT;
1034 	if(0 == setsockopt(
1035 		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
1036 	{
1037 		return 1;
1038 	}
1039 
1040 	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
1041 		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
1042 # endif /* IP_PMTUDISC_OMIT */
1043 # if defined(IP_PMTUDISC_DONT)
1044 	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */
1045 	optval = IP_PMTUDISC_DONT;
1046 	if(0 == setsockopt(
1047 		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
1048 	{
1049 		return 1;
1050 	}
1051 
1052 	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
1053 		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
1054 # endif
1055 	ret = -1;
1056 #elif defined(IP_DONTFRAG)
1057 	int off = 0;
1058 	if (0 == setsockopt(
1059 		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
1060 	{
1061 		return 1;
1062 	}
1063 
1064 	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
1065 		strerror(errno));
1066 	ret = -1;
1067 #else
1068 	(void)sock;
1069 #endif
1070 
1071 	return ret;
1072 }
1073 
1074 static int
1075 set_ip_freebind(struct nsd_socket *sock)
1076 {
1077 #ifdef IP_FREEBIND
1078 	int on = 1;
1079 	const char *socktype =
1080 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
1081 	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
1082 	{
1083 		return 1;
1084 	}
1085 	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
1086 		socktype, strerror(errno));
1087 	return -1;
1088 #else
1089 	(void)sock;
1090 #endif /* IP_FREEBIND */
1091 
1092 	return 0;
1093 }
1094 
1095 static int
1096 set_ip_transparent(struct nsd_socket *sock)
1097 {
1098 	/*
1099 	The scandalous preprocessor blob here calls for some explanation :)
1100 	POSIX does not specify an option to bind non-local IPs, so
1101 	platforms developed several implementation-specific options,
1102 	all set in the same way, but with different names.
1103 	For additional complexity, some platforms manage this setting
1104 	differently for different address families (IPv4 vs IPv6).
1105 	This scandalous preprocessor blob below abstracts such variability
1106 	in a way that leaves the C code as lean and clear as possible.
1107 	*/
1108 
1109 #if defined(IP_TRANSPARENT)
1110 #	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_TRANSPARENT
1111 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
1112 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_TRANSPARENT"
1113 /* as of 2020-01, Linux does not support this on IPv6 programmatically */
1114 #elif defined(SO_BINDANY)
1115 #	define NSD_SOCKET_OPTION_TRANSPARENT						SO_BINDANY
1116 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		SOL_SOCKET
1117 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"SO_BINDANY"
1118 #elif defined(IP_BINDANY)
1119 #	define NSD_SOCKET_OPTION_TRANSPARENT 						IP_BINDANY
1120 #	define NSD_SOCKET_OPTION_TRANSPARENT6						IPV6_BINDANY
1121 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL		IPPROTO_IP
1122 #	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6	IPPROTO_IPV6
1123 #	define NSD_SOCKET_OPTION_TRANSPARENT_NAME 			"IP_BINDANY"
1124 #endif
1125 
1126 #ifndef NSD_SOCKET_OPTION_TRANSPARENT
1127 	(void)sock;
1128 #else
1129 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT6
1130 #		define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
1131 #	endif
1132 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
1133 #		define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
1134 #	endif
1135 #	ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
1136 #		define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
1137 #	endif
1138 
1139 	int on = 1;
1140 	const char *socktype =
1141 		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
1142 	const int is_ip6 = (sock->addr.ai_family == AF_INET6);
1143 
1144 	if(0 == setsockopt(
1145 		sock->s,
1146 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
1147 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
1148 		&on, sizeof(on)))
1149 	{
1150 		return 1;
1151 	}
1152 
1153 	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
1154 		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno));
1155 	return -1;
1156 #endif
1157 
1158 	return 0;
1159 }
1160 
1161 static int
1162 set_tcp_maxseg(struct nsd_socket *sock, int mss)
1163 {
1164 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
1165 	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
1166 		return 1;
1167 	}
1168 	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
1169 		strerror(errno));
1170 	return -1;
1171 #else
	(void)sock; (void)mss;
1172 	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
1173 #endif
1174 	return 0;
1175 }
1176 
1177 #ifdef USE_TCP_FASTOPEN
1178 static int
1179 set_tcp_fastopen(struct nsd_socket *sock)
1180 {
1181 	/* qlen specifies how many outstanding TFO requests to allow. The limit is
1182 	 * a defense against IP spoofing attacks as suggested in RFC7413.
1183 	 */
1184 	int qlen;
1185 
1186 #ifdef __APPLE__
1187 	/* the macOS implementation only supports qlen of 1 via this call. The
1188 	 * actual value is configured by the net.inet.tcp.fastopen_backlog
1189 	 * kernel parameter.
1190 	 */
1191 	qlen = 1;
1192 #else
1193 	/* 5 is recommended on Linux. */
1194 	qlen = 5;
1195 #endif
1196 	if (0 == setsockopt(
1197 		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
1198 	{
1199 		return 1;
1200 	}
1201 
1202 	if (errno == EPERM) {
1203 		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s "
1204 				 "; this could likely be because sysctl "
1205 				 "net.inet.tcp.fastopen.enabled, "
1206 				 "net.inet.tcp.fastopen.server_enable, or "
1207 				 "net.ipv4.tcp_fastopen is disabled",
1208 			strerror(errno));
1209 	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
1210 	 * disabled, except when verbosity enabled for debugging
1211 	 */
1212 	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
1213 		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
1214 			strerror(errno));
1215 	}
1216 
1217 	return (errno == ENOPROTOOPT ? 0 : -1);
1218 }
1219 #endif /* USE_TCP_FASTOPEN */
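/*
 * Illustrative sketch (Linux-specific assumption, not code used by NSD):
 * what a TFO client does against the listener configured above.
 * MSG_FASTOPEN sends the query in the SYN, saving one RTT for hosts that
 * already hold a valid TFO cookie.  The tfo_query name is hypothetical.
 */
#if 0
static int
tfo_query(const struct sockaddr* addr, socklen_t addrlen,
	const uint8_t* query, size_t querylen)
{
	int fd = socket(addr->sa_family, SOCK_STREAM, 0);
	if(fd == -1)
		return -1;
	/* connect+send in one call; falls back to a normal handshake
	 * when no TFO cookie is cached for this server yet */
	if(sendto(fd, query, querylen, MSG_FASTOPEN, addr, addrlen) == -1) {
		close(fd);
		return -1;
	}
	return fd;	/* caller reads the response from fd, then closes it */
}
#endif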
1220 
1221 static int
1222 set_bindtodevice(struct nsd_socket *sock)
1223 {
1224 #if defined(SO_BINDTODEVICE)
1225 	if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
1226 		sock->device, strlen(sock->device)) == -1)
1227 	{
1228 		log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
1229 		                 "SO_BINDTODEVICE", sock->device, strerror(errno));
1230 		return -1;
1231 	}
1232 
1233 	return 1;
1234 #else
1235 	(void)sock;
1236 	return 0;
1237 #endif
1238 }
1239 
1240 static int
1241 set_setfib(struct nsd_socket *sock)
1242 {
1243 #if defined(SO_SETFIB)
1244 	if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
1245 	              (const void *)&sock->fib, sizeof(sock->fib)) == -1)
1246 	{
1247 		log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
1248 		                 "SO_SETFIB", sock->fib, strerror(errno));
1249 		return -1;
1250 	}
1251 
1252 	return 1;
1253 #else
1254 	(void)sock;
1255 	return 0;
1256 #endif
1257 }
1258 
1259 static int
1260 open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1261 {
1262 	int rcv = 1*1024*1024, snd = 1*1024*1024;
1263 
1264 	if(-1 == (sock->s = socket(
1265 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1266 	{
1267 #ifdef INET6
1268 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1269 		   (sock->addr.ai_family == AF_INET6) &&
1270 		   (errno == EAFNOSUPPORT))
1271 		{
1272 			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
1273 				"not supported");
1274 			return 0;
1275 		}
1276 #endif
1277 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1278 		return -1;
1279 	}
1280 
1281 	set_cloexec(sock);
1282 
1283 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1284 		*reuseport_works = (set_reuseport(sock) == 1);
1285 
1286 	if(nsd->options->receive_buffer_size > 0)
1287 		rcv = nsd->options->receive_buffer_size;
1288 	if(set_rcvbuf(sock, rcv) == -1)
1289 		return -1;
1290 
1291 	if(nsd->options->send_buffer_size > 0)
1292 		snd = nsd->options->send_buffer_size;
1293 	if(set_sndbuf(sock, snd) == -1)
1294 		return -1;
1295 #ifdef INET6
1296 	if(sock->addr.ai_family == AF_INET6) {
1297 		if(set_ipv6_v6only(sock) == -1 ||
1298 		   set_ipv6_use_min_mtu(sock) == -1)
1299 			return -1;
1300 	} else
1301 #endif /* INET6 */
1302 	if(sock->addr.ai_family == AF_INET) {
1303 		if(set_ipv4_no_pmtu_disc(sock) == -1)
1304 			return -1;
1305 	}
1306 
1307 	/* Set socket to non-blocking. Otherwise, on operating systems
1308 	 * with thundering herd problems, the UDP recv could block
1309 	 * after select returns readable.
1310 	 */
1311 	set_nonblock(sock);
1312 
1313 	if(nsd->options->ip_freebind)
1314 		(void)set_ip_freebind(sock);
1315 	if(nsd->options->ip_transparent)
1316 		(void)set_ip_transparent(sock);
1317 	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
1318 		return -1;
1319 	if(sock->fib != -1 && set_setfib(sock) == -1)
1320 		return -1;
1321 
1322 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1323 		char buf[256];
1324 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1325 		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
1326 			buf, strerror(errno));
1327 		return -1;
1328 	}
1329 
1330 	return 1;
1331 }
1332 
1333 static int
1334 open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
1335 {
1336 #ifdef USE_TCP_FASTOPEN
1337 	report_tcp_fastopen_config();
1338 #endif
1339 
1340 	(void)reuseport_works;
1341 
1342 	if(-1 == (sock->s = socket(
1343 		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
1344 	{
1345 #ifdef INET6
1346 		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
1347 		   (sock->addr.ai_family == AF_INET6) &&
1348 		   (errno == EAFNOSUPPORT))
1349 		{
1350 			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
1351 			                     "not supported");
1352 			return 0;
1353 		}
1354 #endif /* INET6 */
1355 		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
1356 		return -1;
1357 	}
1358 
1359 	set_cloexec(sock);
1360 
1361 	if(nsd->reuseport && reuseport_works && *reuseport_works)
1362 		*reuseport_works = (set_reuseport(sock) == 1);
1363 
1364 	(void)set_reuseaddr(sock);
1365 
1366 #ifdef INET6
1367 	if(sock->addr.ai_family == AF_INET6) {
1368 		if (set_ipv6_v6only(sock) == -1 ||
1369 		    set_ipv6_use_min_mtu(sock) == -1)
1370 			return -1;
1371 	}
1372 #endif
1373 
1374 	if(nsd->tcp_mss > 0)
1375 		set_tcp_maxseg(sock, nsd->tcp_mss);
1376 	/* (Stevens UNP p. 463): if the TCP listening socket is blocking, then
1377 	   it may block in accept, even if select() says readable. */
1378 	(void)set_nonblock(sock);
1379 	if(nsd->options->ip_freebind)
1380 		(void)set_ip_freebind(sock);
1381 	if(nsd->options->ip_transparent)
1382 		(void)set_ip_transparent(sock);
1383 	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
1384 		return -1;
1385 	if(sock->fib != -1 && set_setfib(sock) == -1)
1386 		return -1;
1387 
1388 	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
1389 		char buf[256];
1390 		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
1391 		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
1392 			buf, strerror(errno));
1393 		return -1;
1394 	}
1395 
1396 #ifdef USE_TCP_FASTOPEN
1397 	(void)set_tcp_fastopen(sock);
1398 #endif
1399 
1400 	if(listen(sock->s, TCP_BACKLOG) == -1) {
1401 		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
1402 		return -1;
1403 	}
1404 
1405 	return 1;
1406 }
1407 
1408 /*
1409  * Initialize the server, reuseport, create and bind the sockets.
1410  */
1411 int
1412 server_init(struct nsd *nsd)
1413 {
1414 	size_t i;
1415 	int reuseport = 1; /* Determine if REUSEPORT works. */
1416 
1417 	/* open server interface ports */
1418 	for(i = 0; i < nsd->ifs; i++) {
1419 		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
1420 		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
1421 		{
1422 			return -1;
1423 		}
1424 	}
1425 
1426 	if(nsd->reuseport && reuseport) {
1427 		size_t ifs = nsd->ifs * nsd->reuseport;
1428 
1429 		/* increase the size of the interface arrays, there are going
1430 		 * to be separate interface file descriptors for every server
1431 		 * instance */
1432 		region_remove_cleanup(nsd->region, free, nsd->udp);
1433 		region_remove_cleanup(nsd->region, free, nsd->tcp);
1434 
1435 		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
1436 		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
1437 		region_add_cleanup(nsd->region, free, nsd->udp);
1438 		region_add_cleanup(nsd->region, free, nsd->tcp);
1439 		if(ifs > nsd->ifs) {
1440 			memset(&nsd->udp[nsd->ifs], 0,
1441 				(ifs-nsd->ifs)*sizeof(*nsd->udp));
1442 			memset(&nsd->tcp[nsd->ifs], 0,
1443 				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
1444 		}
1445 
1446 		for(i = nsd->ifs; i < ifs; i++) {
1447 			nsd->udp[i] = nsd->udp[i%nsd->ifs];
1448 			nsd->udp[i].s = -1;
1449 			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
1450 				return -1;
1451 			}
1452 			/* Turn off REUSEPORT for TCP by copying the socket
1453 			 * file descriptor.
1454 			 * This means we should not close TCP sockets used by
1455 			 * other servers in reuseport enabled mode, in
1456 			 * server_child().
1457 			 */
1458 			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
1459 		}
1460 
1461 		nsd->ifs = ifs;
1462 	} else {
1463 		nsd->reuseport = 0;
1464 	}
1465 
1466 	/* open server interface ports for verifiers */
1467 	for(i = 0; i < nsd->verify_ifs; i++) {
1468 		if(open_udp_socket(nsd, &nsd->verify_udp[i], NULL) == -1 ||
1469 		   open_tcp_socket(nsd, &nsd->verify_tcp[i], NULL) == -1)
1470 		{
1471 			return -1;
1472 		}
1473 	}
1474 
1475 	return 0;
1476 }
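/*
 * Worked example of the reuseport expansion above: with 2 configured
 * interfaces and reuseport: 3 in nsd.conf, ifs becomes 6.  udp[2] and
 * udp[4] clone the address of udp[0] and get their own SO_REUSEPORT
 * sockets, while tcp[2] and tcp[4] copy the already-open file descriptor
 * of tcp[0], so TCP itself is not load balanced this way.
 */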
1477 
1478 /*
1479  * Prepare the server for take off.
1480  *
1481  */
1482 int
1483 server_prepare(struct nsd *nsd)
1484 {
1485 #ifdef RATELIMIT
1486 	/* set secret modifier for hashing (rate limits) */
1487 #ifdef HAVE_GETRANDOM
1488 	uint32_t v;
1489 	if(getrandom(&v, sizeof(v), 0) == -1) {
1490 		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
1491 		exit(1);
1492 	}
1493 	hash_set_raninit(v);
1494 #elif defined(HAVE_ARC4RANDOM)
1495 	hash_set_raninit(arc4random());
1496 #else
1497 	uint32_t v = getpid() ^ time(NULL);
1498 	srandom((unsigned long)v);
1499 #  ifdef HAVE_SSL
1500 	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
1501 		hash_set_raninit(v);
1502 	else
1503 #  endif
1504 		hash_set_raninit(random());
1505 #endif
1506 	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
1507 		nsd->options->rrl_ratelimit,
1508 		nsd->options->rrl_whitelist_ratelimit,
1509 		nsd->options->rrl_slip,
1510 		nsd->options->rrl_ipv4_prefix_length,
1511 		nsd->options->rrl_ipv6_prefix_length);
1512 #endif /* RATELIMIT */
1513 
1514 	/* Open the database... */
1515 	if ((nsd->db = namedb_open(nsd->options)) == NULL) {
1516 		log_msg(LOG_ERR, "unable to open the database: %s", strerror(errno));
1517 		unlink(nsd->task[0]->fname);
1518 		unlink(nsd->task[1]->fname);
1519 #ifdef USE_ZONE_STATS
1520 		unlink(nsd->zonestatfname[0]);
1521 		unlink(nsd->zonestatfname[1]);
1522 #endif
1523 #ifdef BIND8_STATS
1524 		server_stat_free(nsd);
1525 #endif
1526 		xfrd_del_tempdir(nsd);
1527 		return -1;
1528 	}
1529 	/* check if zone files can be read */
1530 	/* NULL for taskudb because we send soainfo in a moment, batched up,
1531 	 * for all zones */
1532 	namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
1533 	zonestatid_tree_set(nsd);
1534 
1535 	compression_table_capacity = 0;
1536 	initialize_dname_compression_tables(nsd);
1537 
1538 #ifdef	BIND8_STATS
1539 	/* Initialize times... */
1540 	time(&nsd->st->boot);
1541 	set_bind8_alarm(nsd);
1542 #endif /* BIND8_STATS */
1543 
1544 	return 0;
1545 }
1546 
1547 /*
1548  * Fork the required number of servers.
1549  */
1550 static int
1551 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
1552 	int* xfrd_sock_p)
1553 {
1554 	size_t i;
1555 
1556 	/* Start all child servers initially.  */
1557 	for (i = 0; i < nsd->child_count; ++i) {
1558 		nsd->children[i].pid = 0;
1559 	}
1560 
1561 	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
1562 }
1563 
1564 static void
1565 server_close_socket(struct nsd_socket *sock)
1566 {
1567 	if(sock->s != -1) {
1568 		close(sock->s);
1569 		sock->s = -1;
1570 	}
1571 }
1572 
1573 void
1574 server_close_all_sockets(struct nsd_socket sockets[], size_t n)
1575 {
1576 	size_t i;
1577 
1578 	/* Close all the sockets... */
1579 	for (i = 0; i < n; ++i) {
1580 		server_close_socket(&sockets[i]);
1581 	}
1582 }
1583 
1584 /*
1585  * Close the sockets, shutdown the server and exit.
1586  * Does not return.
1587  */
1588 void
1589 server_shutdown(struct nsd *nsd)
1590 {
1591 	size_t i;
1592 
1593 	server_close_all_sockets(nsd->udp, nsd->ifs);
1594 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1595 	/* CHILD: close command channel to parent */
1596 	if(nsd->this_child && nsd->this_child->parent_fd != -1)
1597 	{
1598 		close(nsd->this_child->parent_fd);
1599 		nsd->this_child->parent_fd = -1;
1600 	}
1601 	/* SERVER: close command channels to children */
1602 	if(!nsd->this_child)
1603 	{
1604 		for(i=0; i < nsd->child_count; ++i)
1605 			if(nsd->children[i].child_fd != -1)
1606 			{
1607 				close(nsd->children[i].child_fd);
1608 				nsd->children[i].child_fd = -1;
1609 			}
1610 	}
1611 
1612 	tsig_finalize();
1613 	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
1614 #ifdef HAVE_SSL
1615 	if (nsd->tls_ctx)
1616 		SSL_CTX_free(nsd->tls_ctx);
1617 #endif
1618 
1619 #ifdef MEMCLEAN /* OS collects memory pages */
1620 #ifdef RATELIMIT
1621 	rrl_mmap_deinit_keep_mmap();
1622 #endif
1623 #ifdef USE_DNSTAP
1624 	dt_collector_destroy(nsd->dt_collector, nsd);
1625 #endif
1626 	udb_base_free_keep_mmap(nsd->task[0]);
1627 	udb_base_free_keep_mmap(nsd->task[1]);
1628 	namedb_free_ixfr(nsd->db);
1629 	namedb_close(nsd->db);
1630 	nsd_options_destroy(nsd->options);
1631 	region_destroy(nsd->region);
1632 #endif
1633 	log_finalize();
1634 	exit(0);
1635 }
1636 
1637 void
1638 server_prepare_xfrd(struct nsd* nsd)
1639 {
1640 	char tmpfile[256];
1641 	/* create task mmaps */
1642 	nsd->mytask = 0;
1643 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
1644 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1645 	nsd->task[0] = task_file_create(tmpfile);
1646 	if(!nsd->task[0]) {
1647 #ifdef USE_ZONE_STATS
1648 		unlink(nsd->zonestatfname[0]);
1649 		unlink(nsd->zonestatfname[1]);
1650 #endif
1651 #ifdef BIND8_STATS
1652 		server_stat_free(nsd);
1653 #endif
1654 		xfrd_del_tempdir(nsd);
1655 		exit(1);
1656 	}
1657 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
1658 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1659 	nsd->task[1] = task_file_create(tmpfile);
1660 	if(!nsd->task[1]) {
1661 		unlink(nsd->task[0]->fname);
1662 #ifdef USE_ZONE_STATS
1663 		unlink(nsd->zonestatfname[0]);
1664 		unlink(nsd->zonestatfname[1]);
1665 #endif
1666 #ifdef BIND8_STATS
1667 		server_stat_free(nsd);
1668 #endif
1669 		xfrd_del_tempdir(nsd);
1670 		exit(1);
1671 	}
1672 	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
1673 	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
1674 	/* create xfrd listener structure */
1675 	nsd->xfrd_listener = region_alloc(nsd->region,
1676 		sizeof(netio_handler_type));
1677 	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
1678 		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
1679 	nsd->xfrd_listener->fd = -1;
1680 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
1681 		nsd;
1682 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
1683 		xfrd_tcp_create(nsd->region, QIOBUFSZ);
1684 }
1685 
1686 
1687 void
1688 server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
1689 {
1690 	pid_t pid;
1691 	int sockets[2] = {0,0};
1692 	struct ipc_handler_conn_data *data;
1693 
1694 	if(nsd->xfrd_listener->fd != -1)
1695 		close(nsd->xfrd_listener->fd);
1696 	if(del_db) {
1697 		/* recreate taskdb that xfrd was using, it may be corrupt */
1698 		/* we (or reload) use nsd->mytask, and xfrd uses the other */
1699 		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
1700 		nsd->task[1-nsd->mytask]->fname = NULL;
1701 		/* free alloc already, so udb does not shrink itself */
1702 		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
1703 		nsd->task[1-nsd->mytask]->alloc = NULL;
1704 		udb_base_free(nsd->task[1-nsd->mytask]);
1705 		/* create new file, overwrite the old one */
1706 		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
1707 		free(tmpfile);
1708 	}
1709 	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
1710 		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
1711 		return;
1712 	}
1713 	pid = fork();
1714 	switch (pid) {
1715 	case -1:
1716 		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
1717 		break;
1718 	default:
1719 		/* PARENT: close first socket, use second one */
1720 		close(sockets[0]);
1721 		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
1722 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1723 		}
1724 		if(del_db) xfrd_free_namedb(nsd);
1725 		/* use other task than I am using, since if xfrd died and is
1726 		 * restarted, the reload is using nsd->mytask */
1727 		nsd->mytask = 1 - nsd->mytask;
1728 
1729 #ifdef HAVE_SETPROCTITLE
1730 		setproctitle("xfrd");
1731 #endif
1732 #ifdef HAVE_CPUSET_T
1733 		if(nsd->use_cpu_affinity) {
1734 			set_cpu_affinity(nsd->xfrd_cpuset);
1735 		}
1736 #endif
1737 
1738 		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
1739 		/* NOTREACHED */
1740 		break;
1741 	case 0:
1742 		/* CHILD: close second socket, use first one */
1743 		close(sockets[1]);
1744 		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
1745 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1746 		}
1747 		nsd->xfrd_listener->fd = sockets[0];
1748 		break;
1749 	}
1750 	/* server-parent only */
1751 	nsd->xfrd_listener->timeout = NULL;
1752 	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
1753 	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
1754 	/* clear ongoing ipc reads */
1755 	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
1756 	data->conn->is_reading = 0;
1757 }
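
/*
 * The function above is nsd's instance of a common skeleton: create a
 * socketpair(), fork(), have each process close the end it does not use,
 * and mark its own end non-blocking.  A minimal sketch of just that
 * skeleton (illustrative names, not nsd code; it uses only headers
 * already included at the top of this file):
 */
static int
sketch_spawn_with_channel(void (*child_main)(int))
{
	int fds[2];
	pid_t pid;

	if(socketpair(AF_UNIX, SOCK_STREAM, 0, fds) == -1)
		return -1;
	pid = fork();
	if(pid == -1) {
		close(fds[0]);
		close(fds[1]);
		return -1;
	}
	if(pid == 0) {
		/* child: keep fds[0], close the other end */
		close(fds[1]);
		if(fcntl(fds[0], F_SETFL, O_NONBLOCK) == -1)
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		child_main(fds[0]);
		_exit(0);
	}
	/* parent: keep fds[1], close the other end */
	close(fds[0]);
	if(fcntl(fds[1], F_SETFL, O_NONBLOCK) == -1)
		log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
	return fds[1];
}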
1758 
1759 /** add all soainfo to taskdb */
1760 static void
1761 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
1762 {
1763 	struct radnode* n;
1764 	udb_ptr task_last; /* last task, mytask is empty so NULL */
1765 	/* add all SOA INFO to mytask */
1766 	udb_ptr_init(&task_last, taskudb);
1767 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
1768 		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
1769 	}
1770 	udb_ptr_unlink(&task_last, taskudb);
1771 }
1772 
1773 void
1774 server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
1775 {
1776 	/* normally this sends the SOAs to xfrd and gets the expire info back:
1777 	 *   parent fills one taskdb with soas, xfrd fills other with expires.
1778 	 *   then they exchange and process.
1779 	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
1780 	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
1781 	 *   expire notifications can be sent back via a normal reload later
1782 	 *   (xfrd will wait for current running reload to finish if any).
1783 	 */
1784 	sig_atomic_t cmd = 0;
1785 	pid_t mypid;
1786 	int xfrd_sock = nsd->xfrd_listener->fd;
1787 	struct udb_base* taskudb = nsd->task[nsd->mytask];
1788 	udb_ptr t;
1789 	if(!shortsoa) {
1790 		if(nsd->signal_hint_shutdown) {
1791 		shutdown:
1792 			log_msg(LOG_WARNING, "signal received, shutting down...");
1793 			server_close_all_sockets(nsd->udp, nsd->ifs);
1794 			server_close_all_sockets(nsd->tcp, nsd->ifs);
1795 			daemon_remote_close(nsd->rc);
1796 			/* Unlink it if possible... */
1797 			unlinkpid(nsd->pidfile);
1798 			unlink(nsd->task[0]->fname);
1799 			unlink(nsd->task[1]->fname);
1800 #ifdef USE_ZONE_STATS
1801 			unlink(nsd->zonestatfname[0]);
1802 			unlink(nsd->zonestatfname[1]);
1803 #endif
1804 #ifdef BIND8_STATS
1805 			server_stat_free(nsd);
1806 #endif
1807 			server_shutdown(nsd);
1808 			/* ENOTREACH */
1809 			exit(0);
1810 		}
1811 	}
1812 	if(shortsoa) {
1813 		/* put SOA in xfrd task because mytask may be in use */
1814 		taskudb = nsd->task[1-nsd->mytask];
1815 	}
1816 
1817 	add_all_soa_to_task(nsd, taskudb);
1818 	if(!shortsoa) {
1819 		/* wait for xfrd to signal task is ready, RELOAD signal */
1820 		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
1821 			cmd != NSD_RELOAD) {
1822 			log_msg(LOG_ERR, "did not get start signal from xfrd");
1823 			exit(1);
1824 		}
1825 		if(nsd->signal_hint_shutdown) {
1826 			goto shutdown;
1827 		}
1828 	}
1829 	/* give xfrd our task, signal it with RELOAD_DONE */
1830 	task_process_sync(taskudb);
1831 	cmd = NSD_RELOAD_DONE;
1832 	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1833 		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1834 			(int)nsd->pid, strerror(errno));
1835 	}
1836 	mypid = getpid();
1837 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1838 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1839 			strerror(errno));
1840 	}
1841 
1842 	if(!shortsoa) {
1843 		/* process the task work from xfrd (expiry data) */
1844 		nsd->mytask = 1 - nsd->mytask;
1845 		taskudb = nsd->task[nsd->mytask];
1846 		task_remap(taskudb);
1847 		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
1848 		while(!udb_ptr_is_null(&t)) {
1849 			task_process_expire(nsd->db, TASKLIST(&t));
1850 			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
1851 		}
1852 		udb_ptr_unlink(&t, taskudb);
1853 		task_clear(taskudb);
1854 
1855 		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
1856 		cmd = NSD_RELOAD_DONE;
1857 		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1858 			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1859 				(int)nsd->pid, strerror(errno));
1860 		}
1861 	}
1862 }
1863 
1864 #ifdef HAVE_SSL
1865 static void
1866 log_crypto_from_err(const char* str, unsigned long err)
1867 {
1868 	/* error:[error code]:[library name]:[function name]:[reason string] */
1869 	char buf[128];
1870 	unsigned long e;
1871 	ERR_error_string_n(err, buf, sizeof(buf));
1872 	log_msg(LOG_ERR, "%s crypto %s", str, buf);
1873 	while( (e=ERR_get_error()) ) {
1874 		ERR_error_string_n(e, buf, sizeof(buf));
1875 		log_msg(LOG_ERR, "and additionally crypto %s", buf);
1876 	}
1877 }
1878 
1879 void
1880 log_crypto_err(const char* str)
1881 {
1882 	log_crypto_from_err(str, ERR_get_error());
1883 }
1884 
1885 /** true if the ssl handshake error has to be squelched from the logs */
1886 static int
1887 squelch_err_ssl_handshake(unsigned long err)
1888 {
1889 	if(verbosity >= 3)
1890 		return 0; /* only squelch on low verbosity */
1891 	/* this is very specific, we could filter on ERR_GET_REASON()
1892 	 * (the third element in ERR_PACK) */
1893 	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
1894 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
1895 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
1896 		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
1897 #ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
1898 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
1899 #endif
1900 #ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
1901 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
1902 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
1903 #  ifdef SSL_R_VERSION_TOO_LOW
1904 		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
1905 #  endif
1906 #endif
1907 		)
1908 		return 1;
1909 	return 0;
1910 }
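
/*
 * The exact-match list above is deliberately narrow.  As its comment
 * notes, one could instead filter on ERR_GET_REASON() alone; a sketch of
 * that looser variant (illustrative, not what nsd ships):
 */
static int
sketch_squelch_by_reason(unsigned long err)
{
	switch(ERR_GET_REASON(err)) {
	case SSL_R_HTTPS_PROXY_REQUEST:		/* HTTP(S) sent to the DNS port */
	case SSL_R_HTTP_REQUEST:
	case SSL_R_WRONG_VERSION_NUMBER:	/* plain DNS on the TLS port */
		return 1;	/* common stray connections, keep logs quiet */
	default:
		return 0;
	}
}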
1911 
1912 void
1913 perform_openssl_init(void)
1914 {
1915 	/* init SSL library */
1916 #ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
1917 	ERR_load_crypto_strings();
1918 #endif
1919 #if defined(HAVE_ERR_LOAD_SSL_STRINGS) && !defined(DEPRECATED_ERR_LOAD_SSL_STRINGS)
1920 	ERR_load_SSL_strings();
1921 #endif
1922 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
1923 	OpenSSL_add_all_algorithms();
1924 #else
1925 	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
1926 		| OPENSSL_INIT_ADD_ALL_DIGESTS
1927 		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
1928 #endif
1929 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
1930 	(void)SSL_library_init();
1931 #else
1932 	OPENSSL_init_ssl(0, NULL);
1933 #endif
1934 
1935 	if(!RAND_status()) {
1936 		/* try to seed it */
1937 		unsigned char buf[256];
1938 		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
1939 		size_t i;
1940 		v = seed;
1941 		for(i=0; i<256/sizeof(v); i++) {
1942 			memmove(buf+i*sizeof(v), &v, sizeof(v));
1943 			v = v*seed + (unsigned int)i;
1944 		}
1945 		RAND_seed(buf, 256);
1946 		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
1947 	}
1948 }
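
/*
 * The time/pid fallback above is a last resort and is logged as a
 * warning, because such a seed is weak.  A sketch of how a caller would
 * normally obtain randomness, refusing to continue when the PRNG is
 * unseeded (illustrative, not nsd code):
 */
static int
sketch_random_bytes(unsigned char* out, int len)
{
	if(!RAND_status())
		return 0;	/* PRNG not (properly) seeded */
	return RAND_bytes(out, len) == 1;
}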
1949 
1950 static int
1951 get_ocsp(char *filename, unsigned char **ocsp)
1952 {
1953 	BIO *bio;
1954 	OCSP_RESPONSE *response;
1955 	int len = -1;
1956 	unsigned char *p, *buf;
1957 	assert(filename);
1958 
1959 	if ((bio = BIO_new_file(filename, "r")) == NULL) {
1960 		log_crypto_err("get_ocsp: BIO_new_file failed");
1961 		return -1;
1962 	}
1963 
1964 	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
1965 		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
1966 		BIO_free(bio);
1967 		return -1;
1968 	}
1969 
1970 	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
1971 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
1972 		OCSP_RESPONSE_free(response);
1973 		BIO_free(bio);
1974 		return -1;
1975 	}
1976 
1977 	if ((buf = malloc((size_t) len)) == NULL) {
1978 		log_msg(LOG_ERR, "get_ocsp: malloc failed");
1979 		OCSP_RESPONSE_free(response);
1980 		BIO_free(bio);
1981 		return -1;
1982 	}
1983 
1984 	p = buf;
1985 	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
1986 		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
1987 		free(buf);
1988 		OCSP_RESPONSE_free(response);
1989 		BIO_free(bio);
1990 		return -1;
1991 	}
1992 
1993 	OCSP_RESPONSE_free(response);
1994 	BIO_free(bio);
1995 
1996 	*ocsp = buf;
1997 	return len;
1998 }
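
/*
 * get_ocsp() is a textbook use of OpenSSL's two-pass i2d idiom: a first
 * i2d_OCSP_RESPONSE() call with a NULL buffer yields the DER length,
 * then a second call fills the allocated buffer.  A condensed caller
 * sketch (illustrative; the name is made up):
 */
static void
sketch_load_ocsp(char* path)
{
	unsigned char* der = NULL;
	int der_len = get_ocsp(path, &der);
	if(der_len < 0)
		return;	/* get_ocsp already logged the error */
	/* der[0..der_len) holds the DER-encoded OCSP response */
	free(der);
}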
1999 
2000 /* further setup ssl ctx after the keys are loaded */
2001 static void
2002 listen_sslctx_setup_2(void* ctxt)
2003 {
2004 	SSL_CTX* ctx = (SSL_CTX*)ctxt;
2005 	(void)ctx;
2006 #if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
2007 	if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
2008 		/* ENOTREACH */
2009 		log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE");
2010 	}
2011 #elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
2012 	if(1) {
2013 		EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
2014 		if (!ecdh) {
2015 			log_crypto_err("could not find p256, not enabling ECDHE");
2016 		} else {
2017 			if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
2018 				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
2019 			}
2020 			EC_KEY_free (ecdh);
2021 		}
2022 	}
2023 #endif
2024 }
2025 
2026 static int
2027 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
2028 {
2029 	if(ocspdata) {
2030 		unsigned char *p;
2031 		if ((p=malloc(ocspdata_len)) == NULL) {
2032 			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
2033 			return SSL_TLSEXT_ERR_NOACK;
2034 		}
2035 		memcpy(p, ocspdata, ocspdata_len);
2036 		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
2037 			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
2038 			free(p);
2039 			return SSL_TLSEXT_ERR_NOACK;
2040 		}
2041 		return SSL_TLSEXT_ERR_OK;
2042 	} else {
2043 		return SSL_TLSEXT_ERR_NOACK;
2044 	}
2045 }
2046 
2047 SSL_CTX*
2048 server_tls_ctx_setup(char* key, char* pem, char* verifypem)
2049 {
2050 	SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method());
2051 	if(!ctx) {
2052 		log_crypto_err("could not SSL_CTX_new");
2053 		return NULL;
2054 	}
2055 	/* no SSLv2 or SSLv3, because they have protocol defects */
2056 #if SSL_OP_NO_SSLv2 != 0
2057 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){
2058 		log_crypto_err("could not set SSL_OP_NO_SSLv2");
2059 		SSL_CTX_free(ctx);
2060 		return NULL;
2061 	}
2062 #endif
2063 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3)
2064 		!= SSL_OP_NO_SSLv3){
2065 		log_crypto_err("could not set SSL_OP_NO_SSLv3");
2066 		SSL_CTX_free(ctx);
2067 		return 0;
2068 	}
2069 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1)
2070 	/* if we have tls 1.1 disable 1.0 */
2071 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1)
2072 		!= SSL_OP_NO_TLSv1){
2073 		log_crypto_err("could not set SSL_OP_NO_TLSv1");
2074 		SSL_CTX_free(ctx);
2075 		return 0;
2076 	}
2077 #endif
2078 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2)
2079 	/* if we have tls 1.2 disable 1.1 */
2080 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1)
2081 		!= SSL_OP_NO_TLSv1_1){
2082 		log_crypto_err("could not set SSL_OP_NO_TLSv1_1");
2083 		SSL_CTX_free(ctx);
2084 		return 0;
2085 	}
2086 #endif
2087 #if defined(SSL_OP_NO_RENEGOTIATION)
2088 	/* disable client renegotiation */
2089 	if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) &
2090 		SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) {
2091 		log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION");
2092 		SSL_CTX_free(ctx);
2093 		return 0;
2094 	}
2095 #endif
2096 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20)
2097 	/* if we detect system-wide crypto policies, use those */
2098 	if (access( "/etc/crypto-policies/config", F_OK ) != 0 ) {
2099 		/* if we have sha256, set the cipher list to have no known vulns */
2100 		if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20"))
2101 			log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list");
2102 	}
2103 #endif
2104 	if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) &
2105 		SSL_OP_CIPHER_SERVER_PREFERENCE) !=
2106 		SSL_OP_CIPHER_SERVER_PREFERENCE) {
2107 		log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE");
2108 		SSL_CTX_free(ctx);
2109 		return 0;
2110 	}
2111 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL
2112 	SSL_CTX_set_security_level(ctx, 0);
2113 #endif
2114 	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
2115 		log_msg(LOG_ERR, "error for cert file: %s", pem);
2116 		log_crypto_err("error in SSL_CTX use_certificate_chain_file");
2117 		SSL_CTX_free(ctx);
2118 		return NULL;
2119 	}
2120 	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
2121 		log_msg(LOG_ERR, "error for private key file: %s", key);
2122 		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
2123 		SSL_CTX_free(ctx);
2124 		return NULL;
2125 	}
2126 	if(!SSL_CTX_check_private_key(ctx)) {
2127 		log_msg(LOG_ERR, "error for key file: %s", key);
2128 		log_crypto_err("Error in SSL_CTX check_private_key");
2129 		SSL_CTX_free(ctx);
2130 		return NULL;
2131 	}
2132 	listen_sslctx_setup_2(ctx);
2133 	if(verifypem && verifypem[0]) {
2134 		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
2135 			log_crypto_err("Error in SSL_CTX verify locations");
2136 			SSL_CTX_free(ctx);
2137 			return NULL;
2138 		}
2139 		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
2140 		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
2141 	}
2142 	return ctx;
2143 }
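
/*
 * With the SSL_CTX from above, serving one connection follows the stock
 * OpenSSL server sequence.  A minimal blocking sketch for an
 * already-accept()ed socket (illustrative only; nsd's real TLS handling
 * is non-blocking and event driven):
 */
static void
sketch_tls_serve_one(SSL_CTX* ctx, int connfd)
{
	SSL* ssl = SSL_new(ctx);
	if(!ssl) {
		log_crypto_err("SSL_new failed");
		return;
	}
	if(!SSL_set_fd(ssl, connfd)) {
		log_crypto_err("SSL_set_fd failed");
		SSL_free(ssl);
		return;
	}
	if(SSL_accept(ssl) <= 0) {
		log_crypto_err("TLS handshake failed");
		SSL_free(ssl);
		return;
	}
	/* ... exchange length-prefixed DNS messages with
	 * SSL_read()/SSL_write() here ... */
	SSL_shutdown(ssl);
	SSL_free(ssl);
}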
2144 
2145 SSL_CTX*
2146 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
2147 {
2148 	char *key, *pem;
2149 	SSL_CTX *ctx;
2150 
2151 	key = nsd->options->tls_service_key;
2152 	pem = nsd->options->tls_service_pem;
2153 	if(!key || key[0] == 0) {
2154 		log_msg(LOG_ERR, "error: no tls-service-key file specified");
2155 		return NULL;
2156 	}
2157 	if(!pem || pem[0] == 0) {
2158 		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
2159 		return NULL;
2160 	}
2161 
2162 	/* NOTE: This mimics the existing code in Unbound 1.5.1 by supporting SSL,
2163 	 * but draft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2. */
2164 	ctx = server_tls_ctx_setup(key, pem, verifypem);
2165 	if(!ctx) {
2166 		log_msg(LOG_ERR, "could not setup server TLS context");
2167 		return NULL;
2168 	}
2169 	if(ocspfile && ocspfile[0]) {
2170 		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
2171 			log_crypto_err("Error reading OCSPfile");
2172 			SSL_CTX_free(ctx);
2173 			return NULL;
2174 		} else {
2175 			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
2176 			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
2177 				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
2178 				SSL_CTX_free(ctx);
2179 				return NULL;
2180 			}
2181 		}
2182 	}
2183 	return ctx;
2184 }
2185 
2186 /* check if tcp_handler_accept_data created for TLS dedicated port */
2187 int
2188 using_tls_port(struct sockaddr* addr, const char* tls_port)
2189 {
2190 	in_port_t port = 0;
2191 
2192 	if (addr->sa_family == AF_INET)
2193 		port = ((struct sockaddr_in*)addr)->sin_port;
2194 #ifdef HAVE_STRUCT_SOCKADDR_IN6
2195 	else
2196 		port = ((struct sockaddr_in6*)addr)->sin6_port;
2197 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */
2198 	if (atoi(tls_port) == ntohs(port))
2199 		return 1;
2200 
2201 	return 0;
2202 }
2203 #endif
2204 
2205 /* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
2206 ssize_t
2207 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
2208 {
2209 	uint8_t* buf = (uint8_t*) p;
2210 	ssize_t total = 0;
2211 	struct pollfd fd;
2212 	memset(&fd, 0, sizeof(fd));
2213 	fd.fd = s;
2214 	fd.events = POLLIN;
2215 
2216 	while( total < sz) {
2217 		ssize_t ret;
2218 		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
2219 		if(ret == -1) {
2220 			if(errno == EAGAIN)
2221 				/* blocking read */
2222 				continue;
2223 			if(errno == EINTR) {
2224 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2225 					return -1;
2226 				/* other signals can be handled later */
2227 				continue;
2228 			}
2229 			/* some error */
2230 			return -1;
2231 		}
2232 		if(ret == 0) {
2233 			/* operation timed out */
2234 			return -2;
2235 		}
2236 		ret = read(s, buf+total, sz-total);
2237 		if(ret == -1) {
2238 			if(errno == EAGAIN)
2239 				/* blocking read */
2240 				continue;
2241 			if(errno == EINTR) {
2242 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
2243 					return -1;
2244 				/* other signals can be handled later */
2245 				continue;
2246 			}
2247 			/* some error */
2248 			return -1;
2249 		}
2250 		if(ret == 0) {
2251 			/* closed connection! */
2252 			return 0;
2253 		}
2254 		total += ret;
2255 	}
2256 	return total;
2257 }
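
/*
 * A typical block_read() call site reads one fixed-size command and
 * retries on -2 (timeout), exactly as the reload synchronisation below
 * does.  A condensed sketch (illustrative; the helper name is made up):
 */
static int
sketch_read_cmd(struct nsd* nsd, int fd, sig_atomic_t* cmd)
{
	ssize_t ret;
	do {
		ret = block_read(nsd, fd, cmd, sizeof(*cmd),
			RELOAD_SYNC_TIMEOUT);
	} while(ret == -2);		/* timed out, try again */
	if(ret != sizeof(*cmd))
		return -1;		/* error, shutdown hint or EOF */
	return 0;			/* *cmd holds the command */
}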
2258 
2259 static void
2260 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
2261 {
2262 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2263 	udb_ptr t, next;
2264 	udb_base* u = nsd->task[nsd->mytask];
2265 	udb_ptr_init(&next, u);
2266 	udb_ptr_new(&t, u, udb_base_get_userdata(u));
2267 	udb_base_set_userdata(u, 0);
2268 	while(!udb_ptr_is_null(&t)) {
2269 		/* store next in list so this one can be deleted or reused */
2270 		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
2271 		udb_rptr_zero(&TASKLIST(&t)->next, u);
2272 
2273 		/* process task t */
2274 		/* append results for task t and update last_task */
2275 		task_process_in_reload(nsd, u, last_task, &t);
2276 
2277 		/* go to next */
2278 		udb_ptr_set_ptr(&t, u, &next);
2279 
2280 		/* if the parent has quit, we must quit too, poll the fd for cmds */
2281 		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2282 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2283 			if(cmd == NSD_QUIT) {
2284 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2285 				/* unlink files of remainder of tasks */
2286 				while(!udb_ptr_is_null(&t)) {
2287 					if(TASKLIST(&t)->task_type == task_apply_xfr) {
2288 						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
2289 					}
2290 					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
2291 				}
2292 				udb_ptr_unlink(&t, u);
2293 				udb_ptr_unlink(&next, u);
2294 				exit(0);
2295 			}
2296 		}
2297 
2298 	}
2299 	udb_ptr_unlink(&t, u);
2300 	udb_ptr_unlink(&next, u);
2301 }
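
/*
 * Note the traversal pattern above: the next pointer is saved before the
 * current node is processed, because task_process_in_reload() may delete
 * or reuse the node.  The same pattern on a plain C list, as a sketch
 * (illustrative types, not nsd code):
 */
struct sketch_node {
	struct sketch_node* next;
};

static void
sketch_walk_destructive(struct sketch_node* head,
	void (*process)(struct sketch_node*))
{
	while(head) {
		/* save next first: process() may free or reuse head */
		struct sketch_node* next = head->next;
		process(head);
		head = next;
	}
}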
2302 
2303 void server_verify(struct nsd *nsd, int cmdsocket);
2304 
2305 /*
2306  * Reload the database, stop parent, re-fork children and continue
2307  * as server_main.
2308  */
2309 static void
2310 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
2311 	int cmdsocket)
2312 {
2313 	pid_t mypid;
2314 	sig_atomic_t cmd = NSD_QUIT_SYNC;
2315 	int ret;
2316 	udb_ptr last_task;
2317 	struct sigaction old_sigchld, ign_sigchld;
2318 	struct radnode* node;
2319 	zone_type* zone;
2320 	enum soainfo_hint hint;
2321 	/* ignore SIGCHLD from the previous server_main that used this pid */
2322 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
2323 	ign_sigchld.sa_handler = SIG_IGN;
2324 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
2325 
2326 #ifdef HAVE_SETPROCTITLE
2327 	setproctitle("main");
2328 #endif
2329 #ifdef HAVE_CPUSET_T
2330 	if(nsd->use_cpu_affinity) {
2331 		set_cpu_affinity(nsd->cpuset);
2332 	}
2333 #endif
2334 
2335 	/* see what tasks we got from xfrd */
2336 	task_remap(nsd->task[nsd->mytask]);
2337 	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
2338 	reload_process_tasks(nsd, &last_task, cmdsocket);
2339 
2340 #ifndef NDEBUG
2341 	if(nsd_debug_level >= 1)
2342 		region_log_stats(nsd->db->region);
2343 #endif /* NDEBUG */
2344 	initialize_dname_compression_tables(nsd);
2345 
2346 #ifdef BIND8_STATS
2347 	/* Restart dumping stats if required.  */
2348 	time(&nsd->st->boot);
2349 	set_bind8_alarm(nsd);
2350 	/* Switch to a different set of stat arrays for new server processes,
2351 	 * because they can briefly coexist with the old processes. They
2352 	 * have their own stat structure. */
2353 	nsd->stat_current = (nsd->stat_current==0?1:0);
2354 #endif
2355 #ifdef USE_ZONE_STATS
2356 	server_zonestat_realloc(nsd); /* realloc for new children */
2357 	server_zonestat_switch(nsd);
2358 #endif
2359 
2360 	if(nsd->options->verify_enable) {
2361 #ifdef RATELIMIT
2362 		/* allocate resources for rate limiting. use a slot that is guaranteed
2363 		   not to be mapped to a file, so no persistent data is overwritten */
2364 		rrl_init(nsd->child_count + 1);
2365 #endif
2366 
2367 		/* spin-up server and execute verifiers for each zone */
2368 		server_verify(nsd, cmdsocket);
2369 #ifdef RATELIMIT
2370 		/* deallocate rate limiting resources */
2371 		rrl_deinit(nsd->child_count + 1);
2372 #endif
2373 	}
2374 
2375 	for(node = radix_first(nsd->db->zonetree);
2376 	    node != NULL;
2377 	    node = radix_next(node))
2378 	{
2379 		zone = (zone_type *)node->elem;
2380 		if(zone->is_updated) {
2381 			if(zone->is_bad) {
2382 				nsd->mode = NSD_RELOAD_FAILED;
2383 				hint = soainfo_bad;
2384 			} else {
2385 				hint = soainfo_ok;
2386 			}
2387 			/* update(s), verified or not, possibly with subsequent
2388 			   skipped update(s). skipped update(s) are picked up
2389 			   by failed update check in xfrd */
2390 			task_new_soainfo(nsd->task[nsd->mytask], &last_task,
2391 			                 zone, hint);
2392 		} else if(zone->is_skipped) {
2393 			/* corrupt or inconsistent update without preceding
2394 			   update(s), communicate soainfo_gone */
2395 			task_new_soainfo(nsd->task[nsd->mytask], &last_task,
2396 			                 zone, soainfo_gone);
2397 		}
2398 		zone->is_updated = 0;
2399 		zone->is_skipped = 0;
2400 	}
2401 
2402 	if(nsd->mode == NSD_RELOAD_FAILED) {
2403 		exit(NSD_RELOAD_FAILED);
2404 	}
2405 
2406 	/* listen for the signals of failed children again */
2407 	sigaction(SIGCHLD, &old_sigchld, NULL);
2408 #ifdef USE_DNSTAP
2409 	if (nsd->dt_collector) {
2410 		int *swap_fd_send;
2411 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: swap dnstap collector pipes"));
2412 		/* Swap fd_send with fd_swap so the old and new server children
2413 		 * will not write to the same pipe ends simultaneously */
2414 		swap_fd_send = nsd->dt_collector_fd_send;
2415 		nsd->dt_collector_fd_send = nsd->dt_collector_fd_swap;
2416 		nsd->dt_collector_fd_swap = swap_fd_send;
2417 
2418 	}
2419 #endif
2420 	/* Start new child processes */
2421 	if (server_start_children(nsd, server_region, netio,
2422 		&nsd->xfrd_listener->fd) != 0) {
2423 		send_children_quit(nsd);
2424 		exit(1);
2425 	}
2426 
2427 	/* if the parent has quit, we must quit too, poll the fd for cmds */
2428 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
2429 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
2430 		if(cmd == NSD_QUIT) {
2431 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
2432 			send_children_quit(nsd);
2433 			exit(0);
2434 		}
2435 	}
2436 
2437 	/* Send quit command to parent: blocking, wait for receipt. */
2438 	do {
2439 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
2440 		cmd = NSD_QUIT_SYNC;
2441 		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
2442 		{
2443 			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
2444 				strerror(errno));
2445 		}
2446 		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
2447 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
2448 		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
2449 			RELOAD_SYNC_TIMEOUT);
2450 		if(ret == -2) {
2451 			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
2452 		}
2453 	} while (ret == -2);
2454 	if(ret == -1) {
2455 		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
2456 			strerror(errno));
2457 	}
2458 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
2459 	if(cmd == NSD_QUIT) {
2460 		/* small race condition possible here, parent got quit cmd. */
2461 		send_children_quit(nsd);
2462 		exit(1);
2463 	}
2464 	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
2465 	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
2466 	task_process_sync(nsd->task[nsd->mytask]);
2467 #ifdef USE_ZONE_STATS
2468 	server_zonestat_realloc(nsd); /* realloc for next children */
2469 #endif
2470 
2471 	/* send soainfo to the xfrd process, signal it that reload is done,
2472 	 * it picks up the taskudb */
2473 	cmd = NSD_RELOAD_DONE;
2474 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
2475 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
2476 			strerror(errno));
2477 	}
2478 	mypid = getpid();
2479 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2480 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2481 			strerror(errno));
2482 	}
2483 
2484 	/* try to reopen file */
2485 	if (nsd->file_rotation_ok)
2486 		log_reopen(nsd->log_filename, 1);
2487 	/* exit reload, continue as new server_main */
2488 }
2489 
2490 /*
2491  * Get the mode depending on the signal hints that have been received.
2492  * Multiple signal hints can be received and will be handled in turn.
2493  */
2494 static sig_atomic_t
2495 server_signal_mode(struct nsd *nsd)
2496 {
2497 	if(nsd->signal_hint_quit) {
2498 		nsd->signal_hint_quit = 0;
2499 		return NSD_QUIT;
2500 	}
2501 	else if(nsd->signal_hint_shutdown) {
2502 		nsd->signal_hint_shutdown = 0;
2503 		return NSD_SHUTDOWN;
2504 	}
2505 	else if(nsd->signal_hint_child) {
2506 		nsd->signal_hint_child = 0;
2507 		return NSD_REAP_CHILDREN;
2508 	}
2509 	else if(nsd->signal_hint_reload) {
2510 		nsd->signal_hint_reload = 0;
2511 		return NSD_RELOAD;
2512 	}
2513 	else if(nsd->signal_hint_reload_hup) {
2514 		nsd->signal_hint_reload_hup = 0;
2515 		return NSD_RELOAD_REQ;
2516 	}
2517 	else if(nsd->signal_hint_stats) {
2518 		nsd->signal_hint_stats = 0;
2519 #ifdef BIND8_STATS
2520 		set_bind8_alarm(nsd);
2521 #endif
2522 		return NSD_STATS;
2523 	}
2524 	else if(nsd->signal_hint_statsusr) {
2525 		nsd->signal_hint_statsusr = 0;
2526 		return NSD_STATS;
2527 	}
2528 	return NSD_RUN;
2529 }
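
/*
 * server_signal_mode() is the consumer half of the async-signal-safe
 * pattern used throughout nsd: signal handlers only set sig_atomic_t
 * hint flags, and the main loop translates them into modes.  A sketch of
 * the producer half, under the assumption of a single SIGTERM flag
 * (illustrative, not nsd's actual handler):
 */
static volatile sig_atomic_t sketch_hint_shutdown;

static void
sketch_sigterm_handler(int ATTR_UNUSED(sig))
{
	/* only async-signal-safe work here: set the flag and return */
	sketch_hint_shutdown = 1;
}

static void
sketch_install_sigterm(void)
{
	struct sigaction sa;
	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = sketch_sigterm_handler;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGTERM, &sa, NULL);
}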
2530 
2531 /*
2532  * The main server simply waits for signals and child processes to
2533  * terminate.  Child processes are restarted as necessary.
2534  */
2535 void
2536 server_main(struct nsd *nsd)
2537 {
2538 	region_type *server_region = region_create(xalloc, free);
2539 	netio_type *netio = netio_create(server_region);
2540 	netio_handler_type reload_listener;
2541 	int reload_sockets[2] = {-1, -1};
2542 	struct timespec timeout_spec;
2543 	int status;
2544 	pid_t child_pid;
2545 	pid_t reload_pid = -1;
2546 	sig_atomic_t mode;
2547 
2548 	/* Ensure we are the main process */
2549 	assert(nsd->server_kind == NSD_SERVER_MAIN);
2550 
2551 	/* Add listener for the XFRD process */
2552 	netio_add_handler(netio, nsd->xfrd_listener);
2553 
2554 #ifdef BIND8_STATS
2555 	nsd->st = &nsd->stat_map[0];
2556 	nsd->st->db_disk = 0;
2557 	nsd->st->db_mem = region_get_mem(nsd->db->region);
2558 #endif
2559 
2560 	/* Start the child processes that handle incoming queries */
2561 	if (server_start_children(nsd, server_region, netio,
2562 		&nsd->xfrd_listener->fd) != 0) {
2563 		send_children_quit(nsd);
2564 		exit(1);
2565 	}
2566 	reload_listener.fd = -1;
2567 
2568 	/* This_child MUST be 0, because this is the parent process */
2569 	assert(nsd->this_child == 0);
2570 
2571 	/* Run the server until we get a shutdown signal */
2572 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
2573 		/* Did we receive a signal that changes our mode? */
2574 		if(mode == NSD_RUN) {
2575 			nsd->mode = mode = server_signal_mode(nsd);
2576 		}
2577 
2578 		switch (mode) {
2579 		case NSD_RUN:
2580 			/* see if any child processes terminated */
2581 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
2582 				int is_child = delete_child_pid(nsd, child_pid);
2583 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
2584 					if(nsd->children[is_child].child_fd == -1)
2585 						nsd->children[is_child].has_exited = 1;
2586 					parent_check_all_children_exited(nsd);
2587 				} else if(is_child != -1) {
2588 					log_msg(LOG_WARNING,
2589 					       "server %d died unexpectedly with status %d, restarting",
2590 					       (int) child_pid, status);
2591 					restart_child_servers(nsd, server_region, netio,
2592 						&nsd->xfrd_listener->fd);
2593 				} else if (child_pid == reload_pid) {
2594 					sig_atomic_t cmd = NSD_RELOAD_FAILED;
2595 					pid_t mypid;
2596 					log_msg(LOG_WARNING,
2597 					       "Reload process %d failed with status %d, continuing with old database",
2598 					       (int) child_pid, status);
2599 					reload_pid = -1;
2600 					if(reload_listener.fd != -1) close(reload_listener.fd);
2601 					netio_remove_handler(netio, &reload_listener);
2602 					reload_listener.fd = -1;
2603 					reload_listener.event_types = NETIO_EVENT_NONE;
2604 					task_process_sync(nsd->task[nsd->mytask]);
2605 					/* inform xfrd reload attempt ended */
2606 					if(!write_socket(nsd->xfrd_listener->fd,
2607 						&cmd, sizeof(cmd))) {
2608 						log_msg(LOG_ERR, "problems "
2609 						  "sending SOAEND to xfrd: %s",
2610 						  strerror(errno));
2611 					}
2612 					mypid = getpid();
2613 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2614 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2615 							strerror(errno));
2616 					}
2617 #ifdef USE_DNSTAP
2618 				} else if(nsd->dt_collector && child_pid == nsd->dt_collector->dt_pid) {
2619 					log_msg(LOG_WARNING,
2620 					       "dnstap-collector %d terminated with status %d",
2621 					       (int) child_pid, status);
2622 					if(nsd->dt_collector) {
2623 						dt_collector_close(nsd->dt_collector, nsd);
2624 						dt_collector_destroy(nsd->dt_collector, nsd);
2625 						nsd->dt_collector = NULL;
2626 					}
2627 					/* Only respawn a crashed (or exited)
2628 					 * dnstap-collector when not reloading,
2629 					 * to not induce a reload during a
2630 					 * reload (which would seriously
2631 					 * disrupt nsd procedures and lead to
2632 					 * unpredictable results)!
2633 					 *
2634 					 * This will *leave* a dnstap-collector
2635 					 * process terminated, but because
2636 					 * signalling of the reload process to
2637 					 * the main process to respawn in this
2638 					 * situation would be cumbersome, and
2639 					 * because this situation is so
2640 					 * specific (and therefore hopefully
2641 					 * extremely rare or non-existing at
2642 					 * all), plus the fact that we are left
2643 					 * with a perfectly functioning NSD
2644 					 * (besides not logging dnstap
2645 					 * messages), I consider it acceptable
2646 					 * to leave this unresolved.
2647 					 */
2648 					if(reload_pid == -1 && nsd->options->dnstap_enable) {
2649 						nsd->dt_collector = dt_collector_create(nsd);
2650 						dt_collector_start(nsd->dt_collector, nsd);
2651 						nsd->mode = NSD_RELOAD_REQ;
2652 					}
2653 #endif
2654 				} else if(status != 0) {
2655 					/* check for status, because we get
2656 					 * the old-servermain because reload
2657 					 * is the process-parent of old-main,
2658 					 * and we get older server-processes
2659 					 * that are exiting after a reload */
2660 					log_msg(LOG_WARNING,
2661 					       "process %d terminated with status %d",
2662 					       (int) child_pid, status);
2663 				}
2664 			}
2665 			if (child_pid == -1) {
2666 				if (errno == EINTR) {
2667 					continue;
2668 				}
2669 				if (errno != ECHILD)
2670 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
2671 			}
2672 			if (nsd->mode != NSD_RUN)
2673 				break;
2674 
2675 			/* timeout to collect processes. In case no sigchild happens. */
2676 			timeout_spec.tv_sec = 60;
2677 			timeout_spec.tv_nsec = 0;
2678 
2679 			/* listen on ports, timeout for collecting terminated children */
2680 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
2681 				if (errno != EINTR) {
2682 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
2683 				}
2684 			}
2685 			if(nsd->restart_children) {
2686 				restart_child_servers(nsd, server_region, netio,
2687 					&nsd->xfrd_listener->fd);
2688 				nsd->restart_children = 0;
2689 			}
2690 			if(nsd->reload_failed) {
2691 				sig_atomic_t cmd = NSD_RELOAD_FAILED;
2692 				pid_t mypid;
2693 				nsd->reload_failed = 0;
2694 				log_msg(LOG_WARNING,
2695 				       "Reload process %d failed, continuing with old database",
2696 				       (int) reload_pid);
2697 				reload_pid = -1;
2698 				if(reload_listener.fd != -1) close(reload_listener.fd);
2699 				netio_remove_handler(netio, &reload_listener);
2700 				reload_listener.fd = -1;
2701 				reload_listener.event_types = NETIO_EVENT_NONE;
2702 				task_process_sync(nsd->task[nsd->mytask]);
2703 				/* inform xfrd reload attempt ended */
2704 				if(!write_socket(nsd->xfrd_listener->fd,
2705 					&cmd, sizeof(cmd))) {
2706 					log_msg(LOG_ERR, "problems "
2707 					  "sending SOAEND to xfrd: %s",
2708 					  strerror(errno));
2709 				}
2710 				mypid = getpid();
2711 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
2712 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
2713 						strerror(errno));
2714 				}
2715 			}
2716 
2717 			break;
2718 		case NSD_RELOAD_REQ: {
2719 			sig_atomic_t cmd = NSD_RELOAD_REQ;
2720 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
2721 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
2722 				"main: ipc send reload_req to xfrd"));
2723 			if(!write_socket(nsd->xfrd_listener->fd,
2724 				&cmd, sizeof(cmd))) {
2725 				log_msg(LOG_ERR, "server_main: could not send "
2726 				"reload_req to xfrd: %s", strerror(errno));
2727 			}
2728 			nsd->mode = NSD_RUN;
2729 			} break;
2730 		case NSD_RELOAD:
2731 			/* Continue to run nsd after reload */
2732 			nsd->mode = NSD_RUN;
2733 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
2734 			if (reload_pid != -1) {
2735 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
2736 				       (int) reload_pid);
2737 				break;
2738 			}
2739 
2740 			/* switch mytask to keep track of who owns the taskdb */
2741 			nsd->mytask = 1 - nsd->mytask;
2742 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
2743 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
2744 				reload_pid = -1;
2745 				break;
2746 			}
2747 
2748 			/* Do actual reload */
2749 			reload_pid = fork();
2750 			switch (reload_pid) {
2751 			case -1:
2752 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
2753 				break;
2754 			default:
2755 				/* PARENT */
2756 				close(reload_sockets[0]);
2757 				server_reload(nsd, server_region, netio,
2758 					reload_sockets[1]);
2759 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
2760 				close(reload_sockets[1]);
2761 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
2762 				/* drop stale xfrd ipc data */
2763 				((struct ipc_handler_conn_data*)nsd->
2764 					xfrd_listener->user_data)
2765 					->conn->is_reading = 0;
2766 				reload_pid = -1;
2767 				reload_listener.fd = -1;
2768 				reload_listener.event_types = NETIO_EVENT_NONE;
2769 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
2770 				break;
2771 			case 0:
2772 				/* CHILD */
2773 				/* server_main keeps running until NSD_QUIT_SYNC
2774 				 * is received from reload. */
2775 				close(reload_sockets[1]);
2776 				reload_listener.fd = reload_sockets[0];
2777 				reload_listener.timeout = NULL;
2778 				reload_listener.user_data = nsd;
2779 				reload_listener.event_types = NETIO_EVENT_READ;
2780 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
2781 				netio_add_handler(netio, &reload_listener);
2782 				reload_pid = getppid();
2783 				break;
2784 			}
2785 			break;
2786 		case NSD_QUIT_SYNC:
2787 			/* synchronisation of xfrd, parent and reload */
2788 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
2789 				sig_atomic_t cmd = NSD_RELOAD;
2790 				/* stop xfrd ipc writes in progress */
2791 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
2792 					"main: ipc send indication reload"));
2793 				if(!write_socket(nsd->xfrd_listener->fd,
2794 					&cmd, sizeof(cmd))) {
2795 					log_msg(LOG_ERR, "server_main: could not send reload "
2796 					"indication to xfrd: %s", strerror(errno));
2797 				}
2798 				/* wait for ACK from xfrd */
2799 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
2800 				nsd->quit_sync_done = 1;
2801 			}
2802 			nsd->mode = NSD_RUN;
2803 			break;
2804 		case NSD_QUIT:
2805 			/* silent shutdown during reload */
2806 			if(reload_listener.fd != -1) {
2807 				/* acknowledge the quit, to sync reload that we will really quit now */
2808 				sig_atomic_t cmd = NSD_RELOAD;
2809 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
2810 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2811 					log_msg(LOG_ERR, "server_main: "
2812 						"could not ack quit: %s", strerror(errno));
2813 				}
2814 				close(reload_listener.fd);
2815 			}
2816 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
2817 			/* only quit children after xfrd has acked */
2818 			send_children_quit(nsd);
2819 
2820 #ifdef MEMCLEAN /* OS collects memory pages */
2821 			region_destroy(server_region);
2822 #endif
2823 			server_shutdown(nsd);
2824 
2825 			/* ENOTREACH */
2826 			break;
2827 		case NSD_SHUTDOWN:
2828 			break;
2829 		case NSD_REAP_CHILDREN:
2830 			/* continue; wait for child in run loop */
2831 			nsd->mode = NSD_RUN;
2832 			break;
2833 		case NSD_STATS:
2834 #ifdef BIND8_STATS
2835 			set_children_stats(nsd);
2836 #endif
2837 			nsd->mode = NSD_RUN;
2838 			break;
2839 		default:
2840 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
2841 			nsd->mode = NSD_RUN;
2842 			break;
2843 		}
2844 	}
2845 	log_msg(LOG_WARNING, "signal received, shutting down...");
2846 
2847 	/* close opened ports to avoid race with restart of nsd */
2848 	server_close_all_sockets(nsd->udp, nsd->ifs);
2849 	server_close_all_sockets(nsd->tcp, nsd->ifs);
2850 	daemon_remote_close(nsd->rc);
2851 	send_children_quit_and_wait(nsd);
2852 
2853 	/* Unlink it if possible... */
2854 	unlinkpid(nsd->pidfile);
2855 	unlink(nsd->task[0]->fname);
2856 	unlink(nsd->task[1]->fname);
2857 #ifdef USE_ZONE_STATS
2858 	unlink(nsd->zonestatfname[0]);
2859 	unlink(nsd->zonestatfname[1]);
2860 #endif
2861 #ifdef BIND8_STATS
2862 	server_stat_free(nsd);
2863 #endif
2864 #ifdef USE_DNSTAP
2865 	dt_collector_close(nsd->dt_collector, nsd);
2866 #endif
2867 
2868 	if(reload_listener.fd != -1) {
2869 		sig_atomic_t cmd = NSD_QUIT;
2870 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2871 			"main: ipc send quit to reload-process"));
2872 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
2873 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
2874 				strerror(errno));
2875 		}
2876 		fsync(reload_listener.fd);
2877 		close(reload_listener.fd);
2878 		/* wait for reload to finish processing */
2879 		while(1) {
2880 			if(waitpid(reload_pid, NULL, 0) == -1) {
2881 				if(errno == EINTR) continue;
2882 				if(errno == ECHILD) break;
2883 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
2884 					(int)reload_pid, strerror(errno));
2885 			}
2886 			break;
2887 		}
2888 	}
2889 	if(nsd->xfrd_listener->fd != -1) {
2890 		/* complete quit, stop xfrd */
2891 		sig_atomic_t cmd = NSD_QUIT;
2892 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2893 			"main: ipc send quit to xfrd"));
2894 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
2895 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
2896 				strerror(errno));
2897 		}
2898 		fsync(nsd->xfrd_listener->fd);
2899 		close(nsd->xfrd_listener->fd);
2900 		(void)kill(nsd->pid, SIGTERM);
2901 	}
2902 
2903 #ifdef MEMCLEAN /* OS collects memory pages */
2904 	region_destroy(server_region);
2905 #endif
2906 	server_shutdown(nsd);
2907 }
2908 
2909 static query_state_type
2910 server_process_query(struct nsd *nsd, struct query *query, uint32_t *now_p)
2911 {
2912 	return query_process(query, nsd, now_p);
2913 }
2914 
2915 static query_state_type
2916 server_process_query_udp(struct nsd *nsd, struct query *query, uint32_t *now_p)
2917 {
2918 #ifdef RATELIMIT
2919 	if(query_process(query, nsd, now_p) != QUERY_DISCARDED) {
2920 		if(query->edns.cookie_status != COOKIE_VALID
2921 		&& query->edns.cookie_status != COOKIE_VALID_REUSE
2922 		&& rrl_process_query(query))
2923 			return rrl_slip(query);
2924 		else	return QUERY_PROCESSED;
2925 	}
2926 	return QUERY_DISCARDED;
2927 #else
2928 	return query_process(query, nsd, now_p);
2929 #endif
2930 }
2931 
2932 const char*
2933 nsd_event_vs(void)
2934 {
2935 #ifdef USE_MINI_EVENT
2936 	return "";
2937 #else
2938 	return event_get_version();
2939 #endif
2940 }
2941 
2942 #if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS)
2943 static const char* ub_ev_backend2str(int b)
2944 {
2945 	switch(b) {
2946 	case EVBACKEND_SELECT:	return "select";
2947 	case EVBACKEND_POLL:	return "poll";
2948 	case EVBACKEND_EPOLL:	return "epoll";
2949 	case EVBACKEND_KQUEUE:	return "kqueue";
2950 	case EVBACKEND_DEVPOLL: return "devpoll";
2951 	case EVBACKEND_PORT:	return "evport";
2952 	}
2953 	return "unknown";
2954 }
2955 #endif
2956 
2957 const char*
2958 nsd_event_method(void)
2959 {
2960 #ifdef USE_MINI_EVENT
2961 	return "select";
2962 #else
2963 	struct event_base* b = nsd_child_event_base();
2964 	const char* m;
2965 #  ifdef EV_FEATURE_BACKENDS
2966 	m = ub_ev_backend2str(ev_backend((struct ev_loop*)b));
2967 #  elif defined(HAVE_EVENT_BASE_GET_METHOD)
2968 	m = event_base_get_method(b);
2969 #  else
2970 	m = "?";
2971 #  endif
2972 #  ifdef MEMCLEAN
2973 	event_base_free(b);
2974 #  endif
2975 	return m;
2976 #endif
2977 }
2978 
2979 struct event_base*
2980 nsd_child_event_base(void)
2981 {
2982 	struct event_base* base;
2983 #ifdef USE_MINI_EVENT
2984 	static time_t secs;
2985 	static struct timeval now;
2986 	base = event_init(&secs, &now);
2987 #else
2988 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
2989 	/* libev */
2990 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
2991 #  else
2992 	/* libevent */
2993 #    ifdef HAVE_EVENT_BASE_NEW
2994 	base = event_base_new();
2995 #    else
2996 	base = event_init();
2997 #    endif
2998 #  endif
2999 #endif
3000 	return base;
3001 }
3002 
3003 static void
3004 add_udp_handler(
3005 	struct nsd *nsd,
3006 	struct nsd_socket *sock,
3007 	struct udp_handler_data *data)
3008 {
3009 	struct event *handler = &data->event;
3010 
3011 	data->nsd = nsd;
3012 	data->socket = sock;
3013 
3014 	if(nsd->options->proxy_protocol_port &&
3015 		sockaddr_uses_proxy_protocol_port(nsd->options,
3016 		(struct sockaddr *)&sock->addr.ai_addr)) {
3017 		data->pp2_enabled = 1;
3018 	}
3019 
3020 	memset(handler, 0, sizeof(*handler));
3021 	event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data);
3022 	if(event_base_set(nsd->event_base, handler) != 0)
3023 		log_msg(LOG_ERR, "nsd udp: event_base_set failed");
3024 	if(event_add(handler, NULL) != 0)
3025 		log_msg(LOG_ERR, "nsd udp: event_add failed");
3026 }
3027 
3028 void
3029 add_tcp_handler(
3030 	struct nsd *nsd,
3031 	struct nsd_socket *sock,
3032 	struct tcp_accept_handler_data *data)
3033 {
3034 	struct event *handler = &data->event;
3035 
3036 	data->nsd = nsd;
3037 	data->socket = sock;
3038 
3039 	if(nsd->options->proxy_protocol_port &&
3040 		sockaddr_uses_proxy_protocol_port(nsd->options,
3041 		(struct sockaddr *)&sock->addr.ai_addr)) {
3042 		data->pp2_enabled = 1;
3043 	}
3044 
3045 #ifdef HAVE_SSL
3046 	if (nsd->tls_ctx &&
3047 	    nsd->options->tls_port &&
3048 	    using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port))
3049 	{
3050 		data->tls_accept = 1;
3051 		if(verbosity >= 2) {
3052 			char buf[48];
3053 			addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf));
3054 			VERBOSITY(4, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf));
3055 		}
3056 	} else {
3057 		data->tls_accept = 0;
3058 	}
3059 #endif
3060 
3061 	memset(handler, 0, sizeof(*handler));
3062 	event_set(handler, sock->s, EV_PERSIST|EV_READ,	handle_tcp_accept, data);
3063 	if(event_base_set(nsd->event_base, handler) != 0)
3064 		log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
3065 	if(event_add(handler, NULL) != 0)
3066 		log_msg(LOG_ERR, "nsd tcp: event_add failed");
3067 	data->event_added = 1;
3068 }
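
/*
 * Both registration helpers above follow the same libevent recipe:
 * event_set() with EV_PERSIST|EV_READ, attach the event to the base with
 * event_base_set(), then arm it with event_add().  Stripped to its
 * essentials (illustrative callback and names, not nsd code):
 */
static void
sketch_on_readable(int fd, short event, void* arg)
{
	(void)fd; (void)event; (void)arg;
	/* read from fd and dispatch the data here */
}

static void
sketch_register_read(struct event_base* base, struct event* ev, int fd)
{
	memset(ev, 0, sizeof(*ev));
	event_set(ev, fd, EV_PERSIST|EV_READ, sketch_on_readable, NULL);
	if(event_base_set(base, ev) != 0)
		log_msg(LOG_ERR, "sketch: event_base_set failed");
	if(event_add(ev, NULL) != 0)
		log_msg(LOG_ERR, "sketch: event_add failed");
}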
3069 
3070 /*
3071  * Serve DNS request to verifiers (short-lived)
3072  */
3073 void server_verify(struct nsd *nsd, int cmdsocket)
3074 {
3075 	size_t size = 0;
3076 	struct event cmd_event, signal_event, exit_event;
3077 	struct zone *zone;
3078 
3079 	assert(nsd != NULL);
3080 
3081 	zone = verify_next_zone(nsd, NULL);
3082 	if(zone == NULL)
3083 		return;
3084 
3085 	nsd->server_region = region_create(xalloc, free);
3086 	nsd->event_base = nsd_child_event_base();
3087 
3088 	nsd->next_zone_to_verify = zone;
3089 	nsd->verifier_count = 0;
3090 	nsd->verifier_limit = nsd->options->verifier_count;
3091 	size = sizeof(struct verifier) * nsd->verifier_limit;
3092 	if(pipe(nsd->verifier_pipe) == -1) {
3093 		log_msg(LOG_ERR, "verify: could not create pipe: %s",
3094 				strerror(errno));
3095 		goto fail_pipe;
3096 	}
3097 	fcntl(nsd->verifier_pipe[0], F_SETFD, FD_CLOEXEC);
3098 	fcntl(nsd->verifier_pipe[1], F_SETFD, FD_CLOEXEC);
3099 	nsd->verifiers = region_alloc_zero(nsd->server_region, size);
3100 
3101 	for(size_t i = 0; i < nsd->verifier_limit; i++) {
3102 		nsd->verifiers[i].nsd = nsd;
3103 		nsd->verifiers[i].zone = NULL;
3104 		nsd->verifiers[i].pid = -1;
3105 		nsd->verifiers[i].output_stream.fd = -1;
3106 		nsd->verifiers[i].output_stream.priority = LOG_INFO;
3107 		nsd->verifiers[i].error_stream.fd = -1;
3108 		nsd->verifiers[i].error_stream.priority = LOG_ERR;
3109 	}
3110 
3111 	event_set(&cmd_event, cmdsocket, EV_READ|EV_PERSIST, verify_handle_command, nsd);
3112 	if(event_base_set(nsd->event_base, &cmd_event) != 0 ||
3113 	   event_add(&cmd_event, NULL) != 0)
3114 	{
3115 		log_msg(LOG_ERR, "verify: could not add command event");
3116 		goto fail;
3117 	}
3118 
3119 	event_set(&signal_event, SIGCHLD, EV_SIGNAL|EV_PERSIST, verify_handle_signal, nsd);
3120 	if(event_base_set(nsd->event_base, &signal_event) != 0 ||
3121 	   signal_add(&signal_event, NULL) != 0)
3122 	{
3123 		log_msg(LOG_ERR, "verify: could not add signal event");
3124 		goto fail;
3125 	}
3126 
3127 	event_set(&exit_event, nsd->verifier_pipe[0], EV_READ|EV_PERSIST, verify_handle_exit, nsd);
3128 	if(event_base_set(nsd->event_base, &exit_event) != 0 ||
3129 	   event_add(&exit_event, NULL) != 0)
3130 	{
3131 		log_msg(LOG_ERR, "verify: could not add exit event");
3132 		goto fail;
3133 	}
3134 
3135 	memset(msgs, 0, sizeof(msgs));
3136 	for (int i = 0; i < NUM_RECV_PER_SELECT; i++) {
3137 		queries[i] = query_create(nsd->server_region,
3138 			compressed_dname_offsets,
3139 			compression_table_size, compressed_dnames);
3140 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3141 		iovecs[i].iov_base = buffer_begin(queries[i]->packet);
3142 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3143 		msgs[i].msg_hdr.msg_iov = &iovecs[i];
3144 		msgs[i].msg_hdr.msg_iovlen = 1;
3145 		msgs[i].msg_hdr.msg_name = &queries[i]->remote_addr;
3146 		msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3147 	}
3148 
3149 	for (size_t i = 0; i < nsd->verify_ifs; i++) {
3150 		struct udp_handler_data *data;
3151 		data = region_alloc_zero(
3152 			nsd->server_region, sizeof(*data));
3153 		add_udp_handler(nsd, &nsd->verify_udp[i], data);
3154 	}
3155 
3156 	tcp_accept_handler_count = nsd->verify_ifs;
3157 	tcp_accept_handlers = region_alloc_array(nsd->server_region,
3158 		nsd->verify_ifs, sizeof(*tcp_accept_handlers));
3159 
3160 	for (size_t i = 0; i < nsd->verify_ifs; i++) {
3161 		struct tcp_accept_handler_data *data;
3162 		data = &tcp_accept_handlers[i];
3163 		memset(data, 0, sizeof(*data));
3164 		add_tcp_handler(nsd, &nsd->verify_tcp[i], data);
3165 	}
3166 
3167 	while(nsd->next_zone_to_verify != NULL &&
3168 	      nsd->verifier_count < nsd->verifier_limit)
3169 	{
3170 		verify_zone(nsd, nsd->next_zone_to_verify);
3171 		nsd->next_zone_to_verify
3172 			= verify_next_zone(nsd, nsd->next_zone_to_verify);
3173 	}
3174 
3175 	/* short-lived main loop */
3176 	event_base_dispatch(nsd->event_base);
3177 
3178 	/* remove command and exit event handlers */
3179 	event_del(&exit_event);
3180 	event_del(&signal_event);
3181 	event_del(&cmd_event);
3182 
3183 	assert(nsd->next_zone_to_verify == NULL || nsd->mode == NSD_QUIT);
3184 	assert(nsd->verifier_count == 0 || nsd->mode == NSD_QUIT);
3185 fail:
3186 	close(nsd->verifier_pipe[0]);
3187 	close(nsd->verifier_pipe[1]);
3188 fail_pipe:
3189 	event_base_free(nsd->event_base);
3190 	region_destroy(nsd->server_region);
3191 
3192 	nsd->event_base = NULL;
3193 	nsd->server_region = NULL;
3194 	nsd->verifier_limit = 0;
3195 	nsd->verifier_pipe[0] = -1;
3196 	nsd->verifier_pipe[1] = -1;
3197 	nsd->verifiers = NULL;
3198 }
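
/*
 * server_verify() marks both verifier pipe ends FD_CLOEXEC so that the
 * pipe does not leak into the verifier commands it executes.  The
 * generic pipe-with-close-on-exec pattern, as a sketch (illustrative,
 * not nsd code):
 */
static int
sketch_cloexec_pipe(int fds[2])
{
	if(pipe(fds) == -1)
		return -1;
	if(fcntl(fds[0], F_SETFD, FD_CLOEXEC) == -1 ||
	   fcntl(fds[1], F_SETFD, FD_CLOEXEC) == -1) {
		close(fds[0]);
		close(fds[1]);
		return -1;
	}
	return 0;
}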
3199 
3200 /*
3201  * Serve DNS requests.
3202  */
3203 void
3204 server_child(struct nsd *nsd)
3205 {
3206 	size_t i, from, numifs;
3207 	region_type *server_region = region_create(xalloc, free);
3208 	struct event_base* event_base = nsd_child_event_base();
3209 	sig_atomic_t mode;
3210 
3211 	if(!event_base) {
3212 		log_msg(LOG_ERR, "nsd server could not create event base");
3213 		exit(1);
3214 	}
3215 	nsd->event_base = event_base;
3216 	nsd->server_region = server_region;
3217 
3218 #ifdef RATELIMIT
3219 	rrl_init(nsd->this_child->child_num);
3220 #endif
3221 
3222 	assert(nsd->server_kind != NSD_SERVER_MAIN);
3223 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
3224 
3225 #ifdef HAVE_SETPROCTITLE
3226 	setproctitle("server %d", nsd->this_child->child_num + 1);
3227 #endif
3228 #ifdef HAVE_CPUSET_T
3229 	if(nsd->use_cpu_affinity) {
3230 		set_cpu_affinity(nsd->this_child->cpuset);
3231 	}
3232 #endif
3233 #ifdef BIND8_STATS
3234 	nsd->st = &nsd->stats_per_child[nsd->stat_current]
3235 		[nsd->this_child->child_num];
3236 	nsd->st->boot = nsd->stat_map[0].boot;
3237 	memcpy(&nsd->stat_proc, nsd->st, sizeof(nsd->stat_proc));
3238 #endif
3239 
3240 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
3241 		server_close_all_sockets(nsd->tcp, nsd->ifs);
3242 	}
3243 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
3244 		server_close_all_sockets(nsd->udp, nsd->ifs);
3245 	}
3246 
3247 	if (nsd->this_child->parent_fd != -1) {
3248 		struct event *handler;
3249 		struct ipc_handler_conn_data* user_data =
3250 			(struct ipc_handler_conn_data*)region_alloc(
3251 			server_region, sizeof(struct ipc_handler_conn_data));
3252 		user_data->nsd = nsd;
3253 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
3254 
3255 		handler = (struct event*) region_alloc(
3256 			server_region, sizeof(*handler));
3257 		memset(handler, 0, sizeof(*handler));
3258 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
3259 			EV_READ, child_handle_parent_command, user_data);
3260 		if(event_base_set(event_base, handler) != 0)
3261 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
3262 		if(event_add(handler, NULL) != 0)
3263 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
3264 	}
3265 
3266 	if(nsd->reuseport) {
3267 		numifs = nsd->ifs / nsd->reuseport;
3268 		from = numifs * nsd->this_child->child_num;
3269 		if(from+numifs > nsd->ifs) { /* should not happen */
3270 			from = 0;
3271 			numifs = nsd->ifs;
3272 		}
3273 	} else {
3274 		from = 0;
3275 		numifs = nsd->ifs;
3276 	}
3277 
3278 	if (nsd->server_kind & NSD_SERVER_UDP) {
3279 		int child = nsd->this_child->child_num;
3280 		memset(msgs, 0, sizeof(msgs));
3281 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
3282 			queries[i] = query_create(server_region,
3283 				compressed_dname_offsets,
3284 				compression_table_size, compressed_dnames);
3285 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3286 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
3287 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
3288 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
3289 			msgs[i].msg_hdr.msg_iovlen  = 1;
3290 			msgs[i].msg_hdr.msg_name    = &queries[i]->remote_addr;
3291 			msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3292 		}
3293 
3294 		for (i = 0; i < nsd->ifs; i++) {
3295 			int listen;
3296 			struct udp_handler_data *data;
3297 
3298 			listen = nsd_bitset_isset(nsd->udp[i].servers, child);
3299 
3300 			if(i >= from && i < (from + numifs) && listen) {
3301 				data = region_alloc_zero(
3302 					nsd->server_region, sizeof(*data));
3303 				add_udp_handler(nsd, &nsd->udp[i], data);
3304 			} else {
3305 				/* close sockets intended for other servers */
3306 				server_close_socket(&nsd->udp[i]);
3307 			}
3308 		}
3309 	}
3310 
3311 	/*
3312 	 * Keep track of all the TCP accept handlers so we can enable
3313 	 * and disable them based on the current number of active TCP
3314 	 * connections.
3315 	 */
3316 	if (nsd->server_kind & NSD_SERVER_TCP) {
3317 		int child = nsd->this_child->child_num;
3318 		tcp_accept_handler_count = numifs;
3319 		tcp_accept_handlers = region_alloc_array(server_region,
3320 			numifs, sizeof(*tcp_accept_handlers));
3321 
3322 		for (i = 0; i < nsd->ifs; i++) {
3323 			int listen;
3324 			struct tcp_accept_handler_data *data;
3325 
3326 			listen = nsd_bitset_isset(nsd->tcp[i].servers, child);
3327 
3328 			if(i >= from && i < (from + numifs) && listen) {
3329 				data = &tcp_accept_handlers[i-from];
3330 				memset(data, 0, sizeof(*data));
3331 				add_tcp_handler(nsd, &nsd->tcp[i], data);
3332 			} else {
3333 				/* close sockets intended for other servers */
3334 				/*
3335 				 * uncomment this once tcp servers are no
3336 				 * longer copied in the tcp fd copy line
3337 				 * in server_init().
3338 				server_close_socket(&nsd->tcp[i]);
3339 				*/
3340 				/* close sockets not meant for this server */
3341 				if(!listen)
3342 					server_close_socket(&nsd->tcp[i]);
3343 			}
3344 		}
3345 	} else {
3346 		tcp_accept_handler_count = 0;
3347 	}
3348 
3349 	/* The main loop... */
3350 	while ((mode = nsd->mode) != NSD_QUIT) {
3351 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
3352 
3353 		/* Do we need to do the statistics... */
3354 		if (mode == NSD_STATS) {
3355 #ifdef BIND8_STATS
3356 			int p = nsd->st_period;
3357 			nsd->st_period = 1; /* force stats printout */
3358 			/* Dump the statistics */
3359 			bind8_stats(nsd);
3360 			nsd->st_period = p;
3361 #else /* !BIND8_STATS */
3362 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
3363 #endif /* BIND8_STATS */
3364 
3365 			nsd->mode = NSD_RUN;
3366 		}
3367 		else if (mode == NSD_REAP_CHILDREN) {
3368 			/* got signal, notify parent. parent reaps terminated children. */
3369 			if (nsd->this_child->parent_fd != -1) {
3370 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
3371 				if (write(nsd->this_child->parent_fd,
3372 				    &parent_notify,
3373 				    sizeof(parent_notify)) == -1)
3374 				{
3375 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
3376 						(int) nsd->this_child->pid, strerror(errno));
3377 				}
3378 			} else /* no parent, so reap 'em */
3379 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
3380 			nsd->mode = NSD_RUN;
3381 		}
3382 		else if(mode == NSD_RUN) {
3383 			/* Wait for a query... */
3384 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3385 				if (errno != EINTR) {
3386 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3387 					break;
3388 				}
3389 			}
3390 		} else if(mode == NSD_QUIT) {
3391 			/* ignore here, quit */
3392 		} else {
3393 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
3394 				(int)mode);
3395 			nsd->mode = NSD_RUN;
3396 		}
3397 	}
3398 
3399 	service_remaining_tcp(nsd);
3400 #ifdef	BIND8_STATS
3401 	bind8_stats(nsd);
3402 #endif /* BIND8_STATS */
3403 
3404 #ifdef MEMCLEAN /* OS collects memory pages */
3405 #ifdef RATELIMIT
3406 	rrl_deinit(nsd->this_child->child_num);
3407 #endif
3408 	event_base_free(event_base);
3409 	region_destroy(server_region);
3410 #endif
3411 	server_shutdown(nsd);
3412 }
3413 
3414 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
3415 {
3416 	int* timed_out = (int*)arg;
3417 	assert(event & EV_TIMEOUT); (void)event;
3418 	/* wake up the service tcp thread, note event is no longer
3419 	 * registered */
3420 	*timed_out = 1;
3421 }
3422 
3423 void
3424 service_remaining_tcp(struct nsd* nsd)
3425 {
3426 	struct tcp_handler_data* p;
3427 	struct event_base* event_base;
3428 	/* check if it is needed */
3429 	if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
3430 		return;
3431 	VERBOSITY(4, (LOG_INFO, "service remaining TCP connections"));
3432 #ifdef USE_DNSTAP
3433 	/* remove dnstap collector, we cannot write there because the new
3434 	 * child process is using the file descriptor, or the child
3435 	 * process after that. */
3436 	dt_collector_destroy(nsd->dt_collector, nsd);
3437 	nsd->dt_collector = NULL;
3438 #endif
3439 	/* setup event base */
3440 	event_base = nsd_child_event_base();
3441 	if(!event_base) {
3442 		log_msg(LOG_ERR, "nsd remain tcp could not create event base");
3443 		return;
3444 	}
3445 	/* register tcp connections */
3446 	for(p = tcp_active_list; p != NULL; p = p->next) {
3447 		struct timeval timeout;
3448 		int fd = p->event.ev_fd;
3449 #ifdef USE_MINI_EVENT
3450 		short event = p->event.ev_flags & (EV_READ|EV_WRITE);
3451 #else
3452 		short event = p->event.ev_events & (EV_READ|EV_WRITE);
3453 #endif
3454 		void (*fn)(int, short, void*);
3455 #ifdef HAVE_SSL
3456 		if(p->tls) {
3457 			if((event&EV_READ))
3458 				fn = handle_tls_reading;
3459 			else	fn = handle_tls_writing;
3460 		} else {
3461 #endif
3462 			if((event&EV_READ))
3463 				fn = handle_tcp_reading;
3464 			else	fn = handle_tcp_writing;
3465 #ifdef HAVE_SSL
3466 		}
3467 #endif
3468 
3469 		p->tcp_no_more_queries = 1;
3470 		/* set timeout to 3 seconds (previously 1/10 second) */
3471 		if(p->tcp_timeout > 3000)
3472 			p->tcp_timeout = 3000;
3473 		timeout.tv_sec = p->tcp_timeout / 1000;
3474 		timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
3475 		event_del(&p->event);
3476 		memset(&p->event, 0, sizeof(p->event));
3477 		event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
3478 			fn, p);
3479 		if(event_base_set(event_base, &p->event) != 0)
3480 			log_msg(LOG_ERR, "event base set failed");
3481 		if(event_add(&p->event, &timeout) != 0)
3482 			log_msg(LOG_ERR, "event add failed");
3483 	}
3484 
3485 	/* handle it */
3486 	while(nsd->current_tcp_count > 0) {
3487 		mode_t m = server_signal_mode(nsd);
3488 		struct event timeout;
3489 		struct timeval tv;
3490 		int timed_out = 0;
3491 		if(m == NSD_QUIT || m == NSD_SHUTDOWN ||
3492 			m == NSD_REAP_CHILDREN) {
3493 			/* quit */
3494 			break;
3495 		}
3496 		/* timer */
3497 		/* have to do something every 3 seconds */
3498 		tv.tv_sec = 3;
3499 		tv.tv_usec = 0;
3500 		memset(&timeout, 0, sizeof(timeout));
3501 		event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout,
3502 			&timed_out);
3503 		if(event_base_set(event_base, &timeout) != 0)
3504 			log_msg(LOG_ERR, "remaintcp timer: event_base_set failed");
3505 		if(event_add(&timeout, &tv) != 0)
3506 			log_msg(LOG_ERR, "remaintcp timer: event_add failed");
3507 
3508 		/* service loop */
3509 		if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
3510 			if (errno != EINTR) {
3511 				log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
3512 				break;
3513 			}
3514 		}
3515 		if(!timed_out) {
3516 			event_del(&timeout);
3517 		} else {
3518 			/* timed out, quit */
3519 			VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit"));
3520 			break;
3521 		}
3522 	}
3523 #ifdef MEMCLEAN
3524 	event_base_free(event_base);
3525 #endif
3526 	/* continue to quit after return */
3527 }
3528 
3529 /* Implement recvmmsg and sendmmsg if the platform does not provide them.
3530  * These wrappers are always used, even if nonblocking operations are broken,
3531  * in which case NUM_RECV_PER_SELECT is defined to 1 (one).
3532  */
3533 #if defined(HAVE_RECVMMSG)
3534 #define nsd_recvmmsg recvmmsg
3535 #else /* !HAVE_RECVMMSG */
3536 
3537 static int
3538 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
3539              int flags, struct timespec *timeout)
3540 {
3541 	unsigned int vpos = 0;
3542 	ssize_t rcvd;
3543 
3544 	/* timeout is ignored, ensure caller does not expect it to work */
3545 	assert(timeout == NULL); (void)timeout;
3546 
3547 	while(vpos < vlen) {
3548 		rcvd = recvfrom(sockfd,
3549 		                msgvec[vpos].msg_hdr.msg_iov->iov_base,
3550 		                msgvec[vpos].msg_hdr.msg_iov->iov_len,
3551 		                flags,
3552 		                msgvec[vpos].msg_hdr.msg_name,
3553 		               &msgvec[vpos].msg_hdr.msg_namelen);
3554 		if(rcvd < 0) {
3555 			break;
3556 		} else {
3557 			assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
3558 			msgvec[vpos].msg_len = (unsigned int)rcvd;
3559 			vpos++;
3560 		}
3561 	}
3562 
3563 	if(vpos) {
3564 		/* error will be picked up next time */
3565 		return (int)vpos;
3566 	} else if(errno == 0) {
3567 		return 0;
3568 	} else if(errno == EAGAIN) {
3569 		return 0;
3570 	}
3571 
3572 	return -1;
3573 }
3574 #endif /* HAVE_RECVMMSG */
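
#if 0
/*
 * A minimal caller sketch of nsd_recvmmsg(), guarded out of the build;
 * the identifiers example_fd, example_msgs and example_vlen are
 * hypothetical. The real caller is handle_udp() below, which uses the
 * file-scope msgs, iovecs and queries arrays prepared in the child
 * server loop.
 */
static void
example_recv_batch(int example_fd, struct mmsghdr* example_msgs,
	unsigned int example_vlen)
{
	int got = nsd_recvmmsg(example_fd, example_msgs, example_vlen, 0, NULL);
	if(got == -1) {
		if(errno != EAGAIN && errno != EINTR)
			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
		return;
	}
	/* example_msgs[0..got-1].msg_len now hold the datagram lengths;
	 * without HAVE_RECVMMSG the fallback above filled them one
	 * recvfrom() call at a time. */
}
#endif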
3575 
3576 #ifdef HAVE_SENDMMSG
3577 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
3578 #else /* !HAVE_SENDMMSG */
3579 
3580 static int
3581 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
3582 {
3583 	unsigned int vpos = 0;
3584 	ssize_t snd;
3585 
3586 	while(vpos < vlen) {
3587 		assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
3588 		snd = sendto(sockfd,
3589 		             msgvec[vpos].msg_hdr.msg_iov->iov_base,
3590 		             msgvec[vpos].msg_hdr.msg_iov->iov_len,
3591 		             flags,
3592 		             msgvec[vpos].msg_hdr.msg_name,
3593 		             msgvec[vpos].msg_hdr.msg_namelen);
3594 		if(snd < 0) {
3595 			break;
3596 		} else {
3597 			msgvec[vpos].msg_len = (unsigned int)snd;
3598 			vpos++;
3599 		}
3600 	}
3601 
3602 	if(vpos) {
3603 		return (int)vpos;
3604 	} else if(errno == 0) {
3605 		return 0;
3606 	}
3607 
3608 	return -1;
3609 }
3610 #endif /* HAVE_SENDMMSG */
3611 
3612 static int
3613 port_is_zero(
3614 #ifdef INET6
3615         struct sockaddr_storage *addr
3616 #else
3617         struct sockaddr_in *addr
3618 #endif
3619 	)
3620 {
3621 #ifdef INET6
3622 	if(addr->ss_family == AF_INET6) {
3623 		return (((struct sockaddr_in6 *)addr)->sin6_port) == 0;
3624 	} else if(addr->ss_family == AF_INET) {
3625 		return (((struct sockaddr_in *)addr)->sin_port) == 0;
3626 	}
3627 	return 0;
3628 #else
3629 	if(addr->sin_family == AF_INET) {
3630 		return addr->sin_port == 0;
3631 	}
3632 	return 0;
3633 #endif
3634 }
3635 
3636 /* Parses the PROXYv2 header from buf and updates the struct.
3637  * Returns 1 on success, 0 on failure. */
3638 static int
3639 consume_pp2_header(struct buffer* buf, struct query* q, int stream)
3640 {
3641 	size_t size;
3642 	struct pp2_header* header;
3643 	int err = pp2_read_header(buffer_begin(buf), buffer_remaining(buf));
3644 	if(err) {
3645 		VERBOSITY(4, (LOG_ERR, "proxy-protocol: could not parse "
3646 			"PROXYv2 header: %s", pp_lookup_error(err)));
3647 		return 0;
3648 	}
3649 	header = (struct pp2_header*)buffer_begin(buf);
3650 	size = PP2_HEADER_SIZE + read_uint16(&header->len);
3651 	if(size > buffer_limit(buf)) {
3652 		VERBOSITY(4, (LOG_ERR, "proxy-protocol: not enough buffer "
3653 			"size to read PROXYv2 header"));
3654 		return 0;
3655 	}
3656 	if((header->ver_cmd & 0xF) == PP2_CMD_LOCAL) {
3657 		/* A connection from the proxy itself.
3658 		 * No need to do anything with addresses. */
3659 		goto done;
3660 	}
3661 	if(header->fam_prot == PP2_UNSPEC_UNSPEC) {
3662 		/* Unspecified family and protocol. This could be used for
3663 		 * health checks by proxies.
3664 		 * No need to do anything with addresses. */
3665 		goto done;
3666 	}
3667 	/* Read the proxied address */
3668 	switch(header->fam_prot) {
3669 		case PP2_INET_STREAM:
3670 		case PP2_INET_DGRAM:
3671 			{
3672 			struct sockaddr_in* addr =
3673 				(struct sockaddr_in*)&q->client_addr;
3674 			addr->sin_family = AF_INET;
3675 			memmove(&addr->sin_addr.s_addr,
3676 				&header->addr.addr4.src_addr, 4);
3677 			memmove(&addr->sin_port, &header->addr.addr4.src_port,
3678 				2);
3679 			q->client_addrlen = (socklen_t)sizeof(struct sockaddr_in);
3680 			}
3681 			/* Ignore the destination address; it should be us. */
3682 			break;
3683 #ifdef INET6
3684 		case PP2_INET6_STREAM:
3685 		case PP2_INET6_DGRAM:
3686 			{
3687 			struct sockaddr_in6* addr =
3688 				(struct sockaddr_in6*)&q->client_addr;
3689 			memset(addr, 0, sizeof(*addr));
3690 			addr->sin6_family = AF_INET6;
3691 			memmove(&addr->sin6_addr,
3692 				header->addr.addr6.src_addr, 16);
3693 			memmove(&addr->sin6_port, &header->addr.addr6.src_port,
3694 				2);
3695 			q->client_addrlen = (socklen_t)sizeof(struct sockaddr_in6);
3696 			}
3697 			/* Ignore the destination address; it should be us. */
3698 			break;
3699 #endif /* INET6 */
3700 		default:
3701 			VERBOSITY(2, (LOG_ERR, "proxy-protocol: unsupported "
3702 				"family and protocol 0x%x",
3703 				(int)header->fam_prot));
3704 			return 0;
3705 	}
3706 	q->is_proxied = 1;
3707 done:
3708 	if(!stream) {
3709 		/* We are reading a whole packet;
3710 		 * move the rest of the data to overwrite the PROXYv2 header */
3711 		/* XXX can we do better to avoid memmove? */
3712 		memmove(header, ((char*)header)+size, buffer_limit(buf)-size);
3713 		buffer_set_limit(buf, buffer_limit(buf)-size);
3714 	}
3715 	return 1;
3716 }
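
#if 0
/*
 * A hand-built example of the PROXYv2 wire image consume_pp2_header()
 * accepts, for the IPv4/TCP case, guarded out of the build. The
 * addresses 192.0.2.1:4242 -> 192.0.2.2:53 are hypothetical; the DNS
 * payload follows immediately after these bytes.
 */
static const uint8_t example_pp2_v4_tcp[] = {
	0x0d,0x0a,0x0d,0x0a,0x00,0x0d,0x0a,0x51,0x55,0x49,0x54,0x0a,
	            /* 12-byte signature "\r\n\r\n\0\r\nQUIT\n" */
	0x21,       /* ver_cmd: version 2, PROXY command */
	0x11,       /* fam_prot: AF_INET over TCP (PP2_INET_STREAM) */
	0x00,0x0c,  /* len: 12 address bytes follow */
	192,0,2,1,  /* src_addr */
	192,0,2,2,  /* dst_addr */
	0x10,0x92,  /* src_port 4242 */
	0x00,0x35   /* dst_port 53 */
};
#endif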
3717 
3718 static void
3719 handle_udp(int fd, short event, void* arg)
3720 {
3721 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
3722 	int received, sent, recvcount, i;
3723 	struct query *q;
3724 	uint32_t now = 0;
3725 
3726 	if (!(event & EV_READ)) {
3727 		return;
3728 	}
3729 	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
3730 	/* this printf strangely gave a performance increase on Linux */
3731 	/* printf("recvcount %d \n", recvcount); */
3732 	if (recvcount == -1) {
3733 		if (errno != EAGAIN && errno != EINTR) {
3734 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
3735 			STATUP(data->nsd, rxerr);
3736 			/* No zone statup */
3737 		}
3738 		/* Simply no data available */
3739 		return;
3740 	}
3741 	for (i = 0; i < recvcount; i++) {
3742 	loopstart:
3743 		received = msgs[i].msg_len;
3744 		queries[i]->remote_addrlen = msgs[i].msg_hdr.msg_namelen;
3745 		queries[i]->client_addrlen = (socklen_t)sizeof(queries[i]->client_addr);
3746 		queries[i]->is_proxied = 0;
3747 		q = queries[i];
3748 		if (received == -1) {
3749 			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
3750 #if defined(HAVE_RECVMMSG)
3751 				msgs[i].msg_hdr.msg_flags
3752 #else
3753 				errno
3754 #endif
3755 				));
3756 			STATUP(data->nsd, rxerr);
3757 			/* No zone statup */
3758 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3759 			iovecs[i].iov_len = buffer_remaining(q->packet);
3760 			msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3761 			goto swap_drop;
3762 		}
3763 
3764 		/* Account... */
3765 #ifdef BIND8_STATS
3766 		if (data->socket->addr.ai_family == AF_INET) {
3767 			STATUP(data->nsd, qudp);
3768 		} else if (data->socket->addr.ai_family == AF_INET6) {
3769 			STATUP(data->nsd, qudp6);
3770 		}
3771 #endif
3772 
3773 		buffer_skip(q->packet, received);
3774 		buffer_flip(q->packet);
3775 		if(data->pp2_enabled && !consume_pp2_header(q->packet, q, 0)) {
3776 			VERBOSITY(2, (LOG_ERR, "proxy-protocol: could not "
3777 				"consume PROXYv2 header"));
3778 			goto swap_drop;
3779 		}
3780 		if(!q->is_proxied) {
3781 			q->client_addrlen = q->remote_addrlen;
3782 			memmove(&q->client_addr, &q->remote_addr,
3783 				q->remote_addrlen);
3784 		}
3785 #ifdef USE_DNSTAP
3786 		/*
3787 		 * send the UDP query, with the server (local) and client addresses, to the dnstap process
3788 		 */
3789 		log_addr("query from client", &q->client_addr);
3790 		log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
3791 		if(verbosity >= 6 && q->is_proxied)
3792 			log_addr("query via proxy", &q->remote_addr);
3793 		dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &q->client_addr, q->client_addrlen,
3794 			q->tcp, q->packet);
3795 #endif /* USE_DNSTAP */
3796 
3797 		/* Process and answer the query... */
3798 		if (server_process_query_udp(data->nsd, q, &now) != QUERY_DISCARDED) {
3799 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
3800 				STATUP(data->nsd, nona);
3801 				ZTATUP(data->nsd, q->zone, nona);
3802 			}
3803 
3804 #ifdef USE_ZONE_STATS
3805 			if (data->socket->addr.ai_family == AF_INET) {
3806 				ZTATUP(data->nsd, q->zone, qudp);
3807 			} else if (data->socket->addr.ai_family == AF_INET6) {
3808 				ZTATUP(data->nsd, q->zone, qudp6);
3809 			}
3810 #endif
3811 
3812 			/* Add EDNS0 and TSIG info if necessary.  */
3813 			query_add_optional(q, data->nsd, &now);
3814 
3815 			buffer_flip(q->packet);
3816 			iovecs[i].iov_len = buffer_remaining(q->packet);
3817 #ifdef BIND8_STATS
3818 			/* Account the rcode & TC... */
3819 			STATUP2(data->nsd, rcode, RCODE(q->packet));
3820 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
3821 			if (TC(q->packet)) {
3822 				STATUP(data->nsd, truncated);
3823 				ZTATUP(data->nsd, q->zone, truncated);
3824 			}
3825 #endif /* BIND8_STATS */
3826 #ifdef USE_DNSTAP
3827 			/*
3828 			 * send the UDP response, with the server (local) and client addresses, to the dnstap process
3829 			 */
3830 			log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
3831 			log_addr("response to client", &q->client_addr);
3832 			if(verbosity >= 6 && q->is_proxied)
3833 				log_addr("response via proxy", &q->remote_addr);
3834 			dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr,
3835 				&q->client_addr, q->client_addrlen, q->tcp, q->packet,
3836 				q->zone);
3837 #endif /* USE_DNSTAP */
3838 		} else {
3839 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3840 			iovecs[i].iov_len = buffer_remaining(q->packet);
3841 			msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3842 		swap_drop:
3843 			STATUP(data->nsd, dropped);
3844 			ZTATUP(data->nsd, q->zone, dropped);
3845 			if(i != recvcount-1) {
3846 				/* swap with last and decrease recvcount */
3847 				struct mmsghdr mtmp = msgs[i];
3848 				struct iovec iotmp = iovecs[i];
3849 				recvcount--;
3850 				msgs[i] = msgs[recvcount];
3851 				iovecs[i] = iovecs[recvcount];
3852 				queries[i] = queries[recvcount];
3853 				msgs[recvcount] = mtmp;
3854 				iovecs[recvcount] = iotmp;
3855 				queries[recvcount] = q;
3856 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
3857 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
3858 				goto loopstart;
3859 			} else { recvcount --; }
3860 		}
3861 	}
3862 
3863 	/* send until all are sent */
3864 	i = 0;
3865 	while(i<recvcount) {
3866 		sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3867 		if(sent == -1) {
3868 			if(errno == ENOBUFS ||
3869 #ifdef EWOULDBLOCK
3870 				errno == EWOULDBLOCK ||
3871 #endif
3872 				errno == EAGAIN) {
3873 				/* block to wait until send buffer avail */
3874 				int flag, errstore;
3875 				if((flag = fcntl(fd, F_GETFL)) == -1) {
3876 					log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno));
3877 					flag = 0;
3878 				}
3879 				flag &= ~O_NONBLOCK;
3880 				if(fcntl(fd, F_SETFL, flag) == -1)
3881 					log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno));
3882 				sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0);
3883 				errstore = errno;
3884 				flag |= O_NONBLOCK;
3885 				if(fcntl(fd, F_SETFL, flag) == -1)
3886 					log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno));
3887 				if(sent != -1) {
3888 					i += sent;
3889 					continue;
3890 				}
3891 				errno = errstore;
3892 			}
3893 			if(errno == EINVAL) {
3894 				/* skip the invalid argument entry,
3895 				 * send the remaining packets in the list */
3896 				if(!(port_is_zero((void*)&queries[i]->remote_addr) &&
3897 					verbosity < 3)) {
3898 					const char* es = strerror(errno);
3899 					char a[64];
3900 					addrport2str((void*)&queries[i]->remote_addr, a, sizeof(a));
3901 					log_msg(LOG_ERR, "sendmmsg skip invalid argument [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3902 				}
3903 				i += 1;
3904 				continue;
3905 			}
3906 			/* don't log transient network full errors, unless
3907 			 * on higher verbosity */
3908 			if(!(errno == ENOBUFS && verbosity < 1) &&
3909 #ifdef EWOULDBLOCK
3910 			   errno != EWOULDBLOCK &&
3911 #endif
3912 			   errno != EAGAIN) {
3913 				const char* es = strerror(errno);
3914 				char a[64];
3915 				addrport2str((void*)&queries[i]->remote_addr, a, sizeof(a));
3916 				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
3917 			}
3918 #ifdef BIND8_STATS
3919 			data->nsd->st->txerr += recvcount-i;
3920 #endif /* BIND8_STATS */
3921 			break;
3922 		}
3923 		i += sent;
3924 	}
3925 	for(i=0; i<recvcount; i++) {
3926 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
3927 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
3928 		msgs[i].msg_hdr.msg_namelen = queries[i]->remote_addrlen;
3929 	}
3930 }
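
#if 0
/*
 * A minimal sketch, guarded out of the build, of the swap_drop
 * compaction used in handle_udp() above: a dropped query is swapped
 * with the last batch entry so msgs[0..recvcount-1] stay contiguous
 * and a single nsd_sendmmsg() call can flush every answer. The names
 * are hypothetical; the real code also swaps the parallel iovecs[]
 * and queries[] arrays and re-links msg_hdr.msg_iov.
 */
static int
example_drop_entry(struct mmsghdr* v, int n, int i)
{
	struct mmsghdr tmp = v[i];
	n--;            /* shrink the batch */
	v[i] = v[n];    /* pull the tail entry into the hole */
	v[n] = tmp;     /* park the dropped entry past the end */
	return n;       /* caller re-examines index i, now a fresh entry */
}
#endif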
3931 
3932 #ifdef HAVE_SSL
3933 /*
3934  * Setup an event for the tcp handler.
3935  */
3936 static void
3937 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *),
3938        int fd, short event)
3939 {
3940 	struct timeval timeout;
3941 	struct event_base* ev_base;
3942 
3943 	timeout.tv_sec = data->nsd->tcp_timeout;
3944 	timeout.tv_usec = 0L;
3945 
3946 	ev_base = data->event.ev_base;
3947 	event_del(&data->event);
3948 	memset(&data->event, 0, sizeof(data->event));
3949 	event_set(&data->event, fd, event, fn, data);
3950 	if(event_base_set(ev_base, &data->event) != 0)
3951 		log_msg(LOG_ERR, "event base set failed");
3952 	if(event_add(&data->event, &timeout) != 0)
3953 		log_msg(LOG_ERR, "event add failed");
3954 }
3955 #endif /* HAVE_SSL */
3956 
3957 static void
3958 cleanup_tcp_handler(struct tcp_handler_data* data)
3959 {
3960 	event_del(&data->event);
3961 #ifdef HAVE_SSL
3962 	if(data->tls) {
3963 		SSL_shutdown(data->tls);
3964 		SSL_free(data->tls);
3965 		data->tls = NULL;
3966 	}
3967 #endif
3968 	data->pp2_header_state = pp2_header_none;
3969 	close(data->event.ev_fd);
3970 	if(data->prev)
3971 		data->prev->next = data->next;
3972 	else	tcp_active_list = data->next;
3973 	if(data->next)
3974 		data->next->prev = data->prev;
3975 
3976 	/*
3977 	 * Enable the TCP accept handlers when the current number of
3978 	 * TCP connections is about to drop below the maximum number
3979 	 * of TCP connections.
3980 	 */
3981 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
3982 		configure_handler_event_types(EV_READ|EV_PERSIST);
3983 		if(slowaccept) {
3984 			event_del(&slowaccept_event);
3985 			slowaccept = 0;
3986 		}
3987 	}
3988 	--data->nsd->current_tcp_count;
3989 	assert(data->nsd->current_tcp_count >= 0);
3990 
3991 	region_destroy(data->region);
3992 }
3993 
3994 /* Read more data into the buffer for tcp read. Pass the amount of additional
3995  * data required. Returns false if nothing needs to be done this event, or
3996  * true if the additional data is in the buffer. */
3997 static int
3998 more_read_buf_tcp(int fd, struct tcp_handler_data* data, void* bufpos,
3999 	size_t add_amount, ssize_t* received)
4000 {
4001 	*received = read(fd, bufpos, add_amount);
4002 	if (*received == -1) {
4003 		if (errno == EAGAIN || errno == EINTR) {
4004 			/*
4005 			 * Read would block, wait until more
4006 			 * data is available.
4007 			 */
4008 			return 0;
4009 		} else {
4010 			char buf[48];
4011 			addr2str(&data->query->remote_addr, buf, sizeof(buf));
4012 #ifdef ECONNRESET
4013 			if (verbosity >= 2 || errno != ECONNRESET)
4014 #endif /* ECONNRESET */
4015 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
4016 			cleanup_tcp_handler(data);
4017 			return 0;
4018 		}
4019 	} else if (*received == 0) {
4020 		/* EOF */
4021 		cleanup_tcp_handler(data);
4022 		return 0;
4023 	}
4024 	return 1;
4025 }
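
#if 0
/*
 * A sketch, guarded out of the build, of the accumulation pattern the
 * read handlers below build on more_read_buf_tcp(): keep reading
 * across EV_READ events until bytes_transmitted reaches the wanted
 * total. 'need' is a hypothetical total field size.
 */
	if(data->bytes_transmitted < need) {
		if(!more_read_buf_tcp(fd, data,
			buffer_at(data->query->packet, data->bytes_transmitted),
			need - data->bytes_transmitted, &received))
			return; /* would block, or the handler cleaned up */
		data->bytes_transmitted += received;
		if(data->bytes_transmitted < need)
			return; /* partial read; wait for the next event */
	}
	/* the field is complete at this point */
#endif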
4026 
4027 static void
4028 handle_tcp_reading(int fd, short event, void* arg)
4029 {
4030 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4031 	ssize_t received;
4032 	struct event_base* ev_base;
4033 	struct timeval timeout;
4034 	uint32_t now = 0;
4035 
4036 	if ((event & EV_TIMEOUT)) {
4037 		/* Connection timed out.  */
4038 		cleanup_tcp_handler(data);
4039 		return;
4040 	}
4041 
4042 	if ((data->nsd->tcp_query_count > 0 &&
4043 	     data->query_count >= data->nsd->tcp_query_count) ||
4044 	    (data->query_count > 0 && data->tcp_no_more_queries))
4045 	{
4046 		/* No more queries allowed on this tcp connection. */
4047 		cleanup_tcp_handler(data);
4048 		return;
4049 	}
4050 
4051 	assert((event & EV_READ));
4052 
4053 	if (data->bytes_transmitted == 0 && data->query_needs_reset) {
4054 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
4055 		data->query_needs_reset = 0;
4056 	}
4057 
4058 	if(data->pp2_enabled && data->pp2_header_state != pp2_header_done) {
4059 		struct pp2_header* header = NULL;
4060 		size_t want_read_size = 0;
4061 		size_t current_read_size = 0;
4062 		if(data->pp2_header_state == pp2_header_none) {
4063 			want_read_size = PP2_HEADER_SIZE;
4064 			if(buffer_remaining(data->query->packet) <
4065 				want_read_size) {
4066 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
4067 				cleanup_tcp_handler(data);
4068 				return;
4069 			}
4070 			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading fixed part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
4071 			current_read_size = want_read_size;
4072 			if(data->bytes_transmitted < current_read_size) {
4073 				if(!more_read_buf_tcp(fd, data,
4074 					(void*)buffer_at(data->query->packet,
4075 						data->bytes_transmitted),
4076 					current_read_size - data->bytes_transmitted,
4077 					&received))
4078 					return;
4079 				data->bytes_transmitted += received;
4080 				buffer_skip(data->query->packet, received);
4081 				if(data->bytes_transmitted != current_read_size)
4082 					return;
4083 				data->pp2_header_state = pp2_header_init;
4084 			}
4085 		}
4086 		if(data->pp2_header_state == pp2_header_init) {
4087 			int err;
4088 			err = pp2_read_header(buffer_begin(data->query->packet),
4089 				buffer_limit(data->query->packet));
4090 			if(err) {
4091 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not parse PROXYv2 header: %s", pp_lookup_error(err)));
4092 				cleanup_tcp_handler(data);
4093 				return;
4094 			}
4095 			header = (struct pp2_header*)buffer_begin(data->query->packet);
4096 			want_read_size = ntohs(header->len);
4097 			if(buffer_limit(data->query->packet) <
4098 				PP2_HEADER_SIZE + want_read_size) {
4099 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
4100 				cleanup_tcp_handler(data);
4101 				return;
4102 			}
4103 			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading variable part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
4104 			current_read_size = PP2_HEADER_SIZE + want_read_size;
4105 			if(want_read_size == 0) {
4106 				/* nothing more to read; header is complete */
4107 				data->pp2_header_state = pp2_header_done;
4108 			} else if(data->bytes_transmitted < current_read_size) {
4109 				if(!more_read_buf_tcp(fd, data,
4110 					(void*)buffer_at(data->query->packet,
4111 						data->bytes_transmitted),
4112 					current_read_size - data->bytes_transmitted,
4113 					&received))
4114 					return;
4115 				data->bytes_transmitted += received;
4116 				buffer_skip(data->query->packet, received);
4117 				if(data->bytes_transmitted != current_read_size)
4118 					return;
4119 				data->pp2_header_state = pp2_header_done;
4120 			}
4121 		}
4122 		if(data->pp2_header_state != pp2_header_done || !header) {
4123 			VERBOSITY(6, (LOG_ERR, "proxy-protocol: wrong state for the PROXYv2 header"));
4124 
4125 			cleanup_tcp_handler(data);
4126 			return;
4127 		}
4128 		buffer_flip(data->query->packet);
4129 		if(!consume_pp2_header(data->query->packet, data->query, 1)) {
4130 			VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not consume PROXYv2 header"));
4131 
4132 			cleanup_tcp_handler(data);
4133 			return;
4134 		}
4135 		/* Clear and reset the buffer to read the following
4136 		 * DNS packet(s). */
4137 		buffer_clear(data->query->packet);
4138 		data->bytes_transmitted = 0;
4139 	}
4140 
4141 	/*
4142 	 * Check if we received the leading packet length bytes yet.
4143 	 */
4144 	if (data->bytes_transmitted < sizeof(uint16_t)) {
4145 		if(!more_read_buf_tcp(fd, data,
4146 			(char*) &data->query->tcplen + data->bytes_transmitted,
4147 			sizeof(uint16_t) - data->bytes_transmitted, &received))
4148 			return;
4149 		data->bytes_transmitted += received;
4150 		if (data->bytes_transmitted < sizeof(uint16_t)) {
4151 			/*
4152 			 * Not done with the tcplen yet, wait for more
4153 			 * data to become available.
4154 			 */
4155 			return;
4156 		}
4157 		assert(data->bytes_transmitted == sizeof(uint16_t));
4158 
4159 		data->query->tcplen = ntohs(data->query->tcplen);
4160 
4161 		/*
4162 		 * Minimum query size is:
4163 		 * Minimum query size is 17 octets:
4164 		 *     Size of the header (12)
4165 		 *   + Root domain name   (1)
4166 		 *   + Query class        (2)
4167 		 *   + Query type         (2)
4168 		 */
4169 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
4170 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
4171 			cleanup_tcp_handler(data);
4172 			return;
4173 		}
4174 
4175 		if (data->query->tcplen > data->query->maxlen) {
4176 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
4177 			cleanup_tcp_handler(data);
4178 			return;
4179 		}
4180 
4181 		buffer_set_limit(data->query->packet, data->query->tcplen);
4182 	}
4183 
4184 	assert(buffer_remaining(data->query->packet) > 0);
4185 
4186 	/* Read the (remaining) query data.  */
4187 	if(!more_read_buf_tcp(fd, data, buffer_current(data->query->packet),
4188 		buffer_remaining(data->query->packet), &received))
4189 		return;
4190 	data->bytes_transmitted += received;
4191 	buffer_skip(data->query->packet, received);
4192 	if (buffer_remaining(data->query->packet) > 0) {
4193 		/*
4194 		 * Message not yet complete, wait for more data to
4195 		 * become available.
4196 		 */
4197 		return;
4198 	}
4199 
4200 	assert(buffer_position(data->query->packet) == data->query->tcplen);
4201 
4202 	/* Account... */
4203 #ifdef BIND8_STATS
4204 #ifndef INET6
4205 	STATUP(data->nsd, ctcp);
4206 #else
4207 	if (data->query->remote_addr.ss_family == AF_INET) {
4208 		STATUP(data->nsd, ctcp);
4209 	} else if (data->query->remote_addr.ss_family == AF_INET6) {
4210 		STATUP(data->nsd, ctcp6);
4211 	}
4212 #endif
4213 #endif /* BIND8_STATS */
4214 
4215 	/* We have a complete query, process it.  */
4216 
4217 	/* tcp-query-count: handle query counter ++ */
4218 	data->query_count++;
4219 
4220 	buffer_flip(data->query->packet);
4221 #ifdef USE_DNSTAP
4222 	/*
4223 	 * send the TCP query, with the server (local) and client addresses, to the dnstap process
4224 	 */
4225 	log_addr("query from client", &data->query->client_addr);
4226 	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
4227 	if(verbosity >= 6 && data->query->is_proxied)
4228 		log_addr("query via proxy", &data->query->remote_addr);
4229 	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
4230 		data->query->client_addrlen, data->query->tcp, data->query->packet);
4231 #endif /* USE_DNSTAP */
4232 	data->query_state = server_process_query(data->nsd, data->query, &now);
4233 	if (data->query_state == QUERY_DISCARDED) {
4234 		/* Drop the packet and the entire connection... */
4235 		STATUP(data->nsd, dropped);
4236 		ZTATUP(data->nsd, data->query->zone, dropped);
4237 		cleanup_tcp_handler(data);
4238 		return;
4239 	}
4240 
4241 #ifdef BIND8_STATS
4242 	if (RCODE(data->query->packet) == RCODE_OK
4243 	    && !AA(data->query->packet))
4244 	{
4245 		STATUP(data->nsd, nona);
4246 		ZTATUP(data->nsd, data->query->zone, nona);
4247 	}
4248 #endif /* BIND8_STATS */
4249 
4250 #ifdef USE_ZONE_STATS
4251 #ifndef INET6
4252 	ZTATUP(data->nsd, data->query->zone, ctcp);
4253 #else
4254 	if (data->query->remote_addr.ss_family == AF_INET) {
4255 		ZTATUP(data->nsd, data->query->zone, ctcp);
4256 	} else if (data->query->remote_addr.ss_family == AF_INET6) {
4257 		ZTATUP(data->nsd, data->query->zone, ctcp6);
4258 	}
4259 #endif
4260 #endif /* USE_ZONE_STATS */
4261 
4262 	query_add_optional(data->query, data->nsd, &now);
4263 
4264 	/* Switch to the tcp write handler.  */
4265 	buffer_flip(data->query->packet);
4266 	data->query->tcplen = buffer_remaining(data->query->packet);
4267 #ifdef BIND8_STATS
4268 	/* Account the rcode & TC... */
4269 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
4270 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
4271 	if (TC(data->query->packet)) {
4272 		STATUP(data->nsd, truncated);
4273 		ZTATUP(data->nsd, data->query->zone, truncated);
4274 	}
4275 #endif /* BIND8_STATS */
4276 #ifdef USE_DNSTAP
4277 	/*
4278 	 * send the TCP response, with the server address (local, found earlier) and client address, to the dnstap process
4279 	 */
4280 	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
4281 	log_addr("response to client", &data->query->client_addr);
4282 	if(verbosity >= 6 && data->query->is_proxied)
4283 		log_addr("response via proxy", &data->query->remote_addr);
4284 	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
4285 		data->query->client_addrlen, data->query->tcp, data->query->packet,
4286 		data->query->zone);
4287 #endif /* USE_DNSTAP */
4288 	data->bytes_transmitted = 0;
4289 
4290 	timeout.tv_sec = data->tcp_timeout / 1000;
4291 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4292 
4293 	ev_base = data->event.ev_base;
4294 	event_del(&data->event);
4295 	memset(&data->event, 0, sizeof(data->event));
4296 	event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
4297 		handle_tcp_writing, data);
4298 	if(event_base_set(ev_base, &data->event) != 0)
4299 		log_msg(LOG_ERR, "event base set tcpr failed");
4300 	if(event_add(&data->event, &timeout) != 0)
4301 		log_msg(LOG_ERR, "event add tcpr failed");
4302 	/* see if we can write the answer right away (usually so, EAGAIN if not) */
4303 	handle_tcp_writing(fd, EV_WRITE, data);
4304 }
4305 
4306 static void
4307 handle_tcp_writing(int fd, short event, void* arg)
4308 {
4309 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4310 	ssize_t sent;
4311 	struct query *q = data->query;
4312 	struct timeval timeout;
4313 	struct event_base* ev_base;
4314 	uint32_t now = 0;
4315 
4316 	if ((event & EV_TIMEOUT)) {
4317 		/* Connection timed out.  */
4318 		cleanup_tcp_handler(data);
4319 		return;
4320 	}
4321 
4322 	assert((event & EV_WRITE));
4323 
4324 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
4325 		/* Writing the response packet length.  */
4326 		uint16_t n_tcplen = htons(q->tcplen);
4327 #ifdef HAVE_WRITEV
4328 		struct iovec iov[2];
4329 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
4330 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
4331 		iov[1].iov_base = buffer_begin(q->packet);
4332 		iov[1].iov_len = buffer_limit(q->packet);
4333 		sent = writev(fd, iov, 2);
4334 #else /* HAVE_WRITEV */
4335 		sent = write(fd,
4336 			     (const char *) &n_tcplen + data->bytes_transmitted,
4337 			     sizeof(n_tcplen) - data->bytes_transmitted);
4338 #endif /* HAVE_WRITEV */
4339 		if (sent == -1) {
4340 			if (errno == EAGAIN || errno == EINTR) {
4341 				/*
4342 				 * Write would block, wait until
4343 				 * socket becomes writable again.
4344 				 */
4345 				return;
4346 			} else {
4347 #ifdef ECONNRESET
4348 				if(verbosity >= 2 || errno != ECONNRESET)
4349 #endif /* ECONNRESET */
4350 #ifdef EPIPE
4351 				  if(verbosity >= 2 || errno != EPIPE)
4352 #endif /* EPIPE 'broken pipe' */
4353 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
4354 				cleanup_tcp_handler(data);
4355 				return;
4356 			}
4357 		}
4358 
4359 		data->bytes_transmitted += sent;
4360 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
4361 			/*
4362 			 * Writing not complete, wait until socket
4363 			 * becomes writable again.
4364 			 */
4365 			return;
4366 		}
4367 
4368 #ifdef HAVE_WRITEV
4369 		sent -= sizeof(n_tcplen);
4370 		/* handle potential 'packet done' code */
4371 		goto packet_could_be_done;
4372 #endif
4373  	}
4374 
4375 	sent = write(fd,
4376 		     buffer_current(q->packet),
4377 		     buffer_remaining(q->packet));
4378 	if (sent == -1) {
4379 		if (errno == EAGAIN || errno == EINTR) {
4380 			/*
4381 			 * Write would block, wait until
4382 			 * socket becomes writable again.
4383 			 */
4384 			return;
4385 		} else {
4386 #ifdef ECONNRESET
4387 			if(verbosity >= 2 || errno != ECONNRESET)
4388 #endif /* ECONNRESET */
4389 #ifdef EPIPE
4390 				  if(verbosity >= 2 || errno != EPIPE)
4391 #endif /* EPIPE 'broken pipe' */
4392 			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
4393 			cleanup_tcp_handler(data);
4394 			return;
4395 		}
4396 	}
4397 
4398 	data->bytes_transmitted += sent;
4399 #ifdef HAVE_WRITEV
4400   packet_could_be_done:
4401 #endif
4402 	buffer_skip(q->packet, sent);
4403 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4404 		/*
4405 		 * Still more data to write when socket becomes
4406 		 * writable again.
4407 		 */
4408 		return;
4409 	}
4410 
4411 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4412 
4413 	if (data->query_state == QUERY_IN_AXFR ||
4414 		data->query_state == QUERY_IN_IXFR) {
4415 		/* Continue processing AXFR and writing back results.  */
4416 		buffer_clear(q->packet);
4417 		if(data->query_state == QUERY_IN_AXFR)
4418 			data->query_state = query_axfr(data->nsd, q, 0);
4419 		else data->query_state = query_ixfr(data->nsd, q);
4420 		if (data->query_state != QUERY_PROCESSED) {
4421 			query_add_optional(data->query, data->nsd, &now);
4422 
4423 			/* Reset data. */
4424 			buffer_flip(q->packet);
4425 			q->tcplen = buffer_remaining(q->packet);
4426 			data->bytes_transmitted = 0;
4427 			/* Reset timeout.  */
4428 			timeout.tv_sec = data->tcp_timeout / 1000;
4429 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4430 			ev_base = data->event.ev_base;
4431 			event_del(&data->event);
4432 			memset(&data->event, 0, sizeof(data->event));
4433 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
4434 				handle_tcp_writing, data);
4435 			if(event_base_set(ev_base, &data->event) != 0)
4436 				log_msg(LOG_ERR, "event base set tcpw failed");
4437 			if(event_add(&data->event, &timeout) != 0)
4438 				log_msg(LOG_ERR, "event add tcpw failed");
4439 
4440 			/*
4441 			 * Write data if/when the socket is writable
4442 			 * again.
4443 			 */
4444 			return;
4445 		}
4446 	}
4447 
4448 	/*
4449 	 * Done sending, wait for the next request to arrive on the
4450 	 * TCP socket by installing the TCP read handler.
4451 	 */
4452 	if ((data->nsd->tcp_query_count > 0 &&
4453 		data->query_count >= data->nsd->tcp_query_count) ||
4454 		data->tcp_no_more_queries) {
4455 
4456 		(void) shutdown(fd, SHUT_WR);
4457 	}
4458 
4459 	data->bytes_transmitted = 0;
4460 	data->query_needs_reset = 1;
4461 
4462 	timeout.tv_sec = data->tcp_timeout / 1000;
4463 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
4464 	ev_base = data->event.ev_base;
4465 	event_del(&data->event);
4466 	memset(&data->event, 0, sizeof(data->event));
4467 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
4468 		handle_tcp_reading, data);
4469 	if(event_base_set(ev_base, &data->event) != 0)
4470 		log_msg(LOG_ERR, "event base set tcpw failed");
4471 	if(event_add(&data->event, &timeout) != 0)
4472 		log_msg(LOG_ERR, "event add tcpw failed");
4473 }
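
#if 0
/*
 * A minimal sketch, guarded out of the build, of the length-prefix
 * technique used in handle_tcp_writing() above: with HAVE_WRITEV the
 * two-byte TCP length and the DNS message go out in one writev() call
 * instead of two separate write()s. The names are hypothetical, and a
 * real caller must handle short writes, as the code above does.
 */
static ssize_t
example_write_prefixed(int s, uint16_t msglen, void* msg)
{
	uint16_t n = htons(msglen);
	struct iovec iov[2];
	iov[0].iov_base = &n;
	iov[0].iov_len = sizeof(n);
	iov[1].iov_base = msg;
	iov[1].iov_len = msglen;
	return writev(s, iov, 2); /* may write fewer bytes; caller retries */
}
#endif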
4474 
4475 #ifdef HAVE_SSL
4476 /** create SSL object and associate fd */
4477 static SSL*
4478 incoming_ssl_fd(SSL_CTX* ctx, int fd)
4479 {
4480 	SSL* ssl = SSL_new((SSL_CTX*)ctx);
4481 	if(!ssl) {
4482 		log_crypto_err("could not SSL_new");
4483 		return NULL;
4484 	}
4485 	SSL_set_accept_state(ssl);
4486 	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
4487 	if(!SSL_set_fd(ssl, fd)) {
4488 		log_crypto_err("could not SSL_set_fd");
4489 		SSL_free(ssl);
4490 		return NULL;
4491 	}
4492 	return ssl;
4493 }
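
#if 0
/*
 * A sketch, guarded out of the build, of how incoming_ssl_fd() is
 * typically used on a freshly accepted socket; example_ctx is a
 * hypothetical server SSL_CTX configured elsewhere.
 */
static SSL*
example_tls_upgrade(SSL_CTX* example_ctx, int accepted_fd)
{
	SSL* ssl = incoming_ssl_fd(example_ctx, accepted_fd);
	if(!ssl)
		close(accepted_fd);
	/* on success, drive SSL_do_handshake() from the event loop,
	 * as tls_handshake() below does */
	return ssl;
}
#endif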
4494 
4495 /** TLS handshake to upgrade TCP connection */
4496 static int
4497 tls_handshake(struct tcp_handler_data* data, int fd, int writing)
4498 {
4499 	int r;
4500 	if(data->shake_state == tls_hs_read_event) {
4501 		/* read condition satisfied back to writing */
4502 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4503 		data->shake_state = tls_hs_none;
4504 		return 1;
4505 	}
4506 	if(data->shake_state == tls_hs_write_event) {
4507 		/* write condition satisfied back to reading */
4508 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4509 		data->shake_state = tls_hs_none;
4510 		return 1;
4511 	}
4512 
4513 	/* (continue to) setup the TLS connection */
4514 	ERR_clear_error();
4515 	r = SSL_do_handshake(data->tls);
4516 
4517 	if(r != 1) {
4518 		int want = SSL_get_error(data->tls, r);
4519 		if(want == SSL_ERROR_WANT_READ) {
4520 			if(data->shake_state == tls_hs_read) {
4521 				/* try again later */
4522 				return 1;
4523 			}
4524 			data->shake_state = tls_hs_read;
4525 			/* switch back to reading mode */
4526 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4527 			return 1;
4528 		} else if(want == SSL_ERROR_WANT_WRITE) {
4529 			if(data->shake_state == tls_hs_write) {
4530 				/* try again later */
4531 				return 1;
4532 			}
4533 			data->shake_state = tls_hs_write;
4534 			/* switch back to writing mode */
4535 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4536 			return 1;
4537 		} else {
4538 			if(r == 0)
4539 				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
4540 			else {
4541 				unsigned long err = ERR_get_error();
4542 				if(!squelch_err_ssl_handshake(err)) {
4543 					char a[64], s[256];
4544 					addr2str(&data->query->remote_addr, a, sizeof(a));
4545 					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
4546 					log_crypto_from_err(s, err);
4547 				}
4548 			}
4549 			cleanup_tcp_handler(data);
4550 			return 0;
4551 		}
4552 	}
4553 
4554 	/* Log the successful upgrade for testing; could be removed */
4555 	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
4556 	/* set back to the event we need to have when reading (or writing) */
4557 	if(data->shake_state == tls_hs_read && writing) {
4558 		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
4559 	} else if(data->shake_state == tls_hs_write && !writing) {
4560 		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
4561 	}
4562 	data->shake_state = tls_hs_none;
4563 	return 1;
4564 }
4565 
4566 /* Read more data into the buffer for tls read. Pass the amount of additional
4567  * data required. Returns false if nothing needs to be done this event, or
4568  * true if the additional data is in the buffer. */
4569 static int
4570 more_read_buf_tls(int fd, struct tcp_handler_data* data, void* bufpos,
4571 	size_t add_amount, ssize_t* received)
4572 {
4573 	ERR_clear_error();
4574 	if((*received=SSL_read(data->tls, bufpos, add_amount)) <= 0) {
4575 		int want = SSL_get_error(data->tls, *received);
4576 		if(want == SSL_ERROR_ZERO_RETURN) {
4577 			cleanup_tcp_handler(data);
4578 			return 0; /* shutdown, closed */
4579 		} else if(want == SSL_ERROR_WANT_READ) {
4580 			/* wants to be called again */
4581 			return 0;
4582 		}
4583 		else if(want == SSL_ERROR_WANT_WRITE) {
4584 			/* switch to writing */
4585 			data->shake_state = tls_hs_write_event;
4586 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4587 			return 0;
4588 		}
4589 		cleanup_tcp_handler(data);
4590 		log_crypto_err("could not SSL_read");
4591 		return 0;
4592 	}
4593 	return 1;
4594 }
4595 
4596 /** handle TLS reading of incoming query */
4597 static void
4598 handle_tls_reading(int fd, short event, void* arg)
4599 {
4600 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4601 	ssize_t received;
4602 	uint32_t now = 0;
4603 
4604 	if ((event & EV_TIMEOUT)) {
4605 		/* Connection timed out.  */
4606 		cleanup_tcp_handler(data);
4607 		return;
4608 	}
4609 
4610 	if ((data->nsd->tcp_query_count > 0 &&
4611 	     data->query_count >= data->nsd->tcp_query_count) ||
4612 	    (data->query_count > 0 && data->tcp_no_more_queries))
4613 	{
4614 		/* No more queries allowed on this tcp connection. */
4615 		cleanup_tcp_handler(data);
4616 		return;
4617 	}
4618 
4619 	assert((event & EV_READ));
4620 
4621 	if (data->bytes_transmitted == 0 && data->query_needs_reset) {
4622 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
4623 		data->query_needs_reset = 0;
4624 	}
4625 
4626 	if(data->shake_state != tls_hs_none) {
4627 		if(!tls_handshake(data, fd, 0))
4628 			return;
4629 		if(data->shake_state != tls_hs_none)
4630 			return;
4631 	}
4632 
4633 	if(data->pp2_enabled && data->pp2_header_state != pp2_header_done) {
4634 		struct pp2_header* header = NULL;
4635 		size_t want_read_size = 0;
4636 		size_t current_read_size = 0;
4637 		if(data->pp2_header_state == pp2_header_none) {
4638 			want_read_size = PP2_HEADER_SIZE;
4639 			if(buffer_remaining(data->query->packet) <
4640 				want_read_size) {
4641 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
4642 				cleanup_tcp_handler(data);
4643 				return;
4644 			}
4645 			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading fixed part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
4646 			current_read_size = want_read_size;
4647 			if(data->bytes_transmitted < current_read_size) {
4648 				if(!more_read_buf_tls(fd, data,
4649 					buffer_at(data->query->packet,
4650 						data->bytes_transmitted),
4651 					current_read_size - data->bytes_transmitted,
4652 					&received))
4653 					return;
4654 				data->bytes_transmitted += received;
4655 				buffer_skip(data->query->packet, received);
4656 				if(data->bytes_transmitted != current_read_size)
4657 					return;
4658 				data->pp2_header_state = pp2_header_init;
4659 			}
4660 		}
4661 		if(data->pp2_header_state == pp2_header_init) {
4662 			int err;
4663 			err = pp2_read_header(buffer_begin(data->query->packet),
4664 				buffer_limit(data->query->packet));
4665 			if(err) {
4666 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not parse PROXYv2 header: %s", pp_lookup_error(err)));
4667 				cleanup_tcp_handler(data);
4668 				return;
4669 			}
4670 			header = (struct pp2_header*)buffer_begin(data->query->packet);
4671 			want_read_size = ntohs(header->len);
4672 			if(buffer_limit(data->query->packet) <
4673 				PP2_HEADER_SIZE + want_read_size) {
4674 				VERBOSITY(6, (LOG_ERR, "proxy-protocol: not enough buffer size to read PROXYv2 header"));
4675 				cleanup_tcp_handler(data);
4676 				return;
4677 			}
4678 			VERBOSITY(6, (LOG_INFO, "proxy-protocol: reading variable part of PROXYv2 header (len %lu)", (unsigned long)want_read_size));
4679 			current_read_size = PP2_HEADER_SIZE + want_read_size;
4680 			if(want_read_size == 0) {
4681 				/* nothing more to read; header is complete */
4682 				data->pp2_header_state = pp2_header_done;
4683 			} else if(data->bytes_transmitted < current_read_size) {
4684 				if(!more_read_buf_tls(fd, data,
4685 					buffer_at(data->query->packet,
4686 						data->bytes_transmitted),
4687 					current_read_size - data->bytes_transmitted,
4688 					&received))
4689 					return;
4690 				data->bytes_transmitted += received;
4691 				buffer_skip(data->query->packet, received);
4692 				if(data->bytes_transmitted != current_read_size)
4693 					return;
4694 				data->pp2_header_state = pp2_header_done;
4695 			}
4696 		}
4697 		if(data->pp2_header_state != pp2_header_done || !header) {
4698 			VERBOSITY(6, (LOG_ERR, "proxy-protocol: wrong state for the PROXYv2 header"));
4699 			cleanup_tcp_handler(data);
4700 			return;
4701 		}
4702 		buffer_flip(data->query->packet);
4703 		if(!consume_pp2_header(data->query->packet, data->query, 1)) {
4704 			VERBOSITY(6, (LOG_ERR, "proxy-protocol: could not consume PROXYv2 header"));
4705 			cleanup_tcp_handler(data);
4706 			return;
4707 		}
4708 		/* Clear and reset the buffer to read the following
4709 		 * DNS packet(s). */
4710 		buffer_clear(data->query->packet);
4711 		data->bytes_transmitted = 0;
4712 	}
4713 	/*
4714 	 * Check if we received the leading packet length bytes yet.
4715 	 */
4716 	if(data->bytes_transmitted < sizeof(uint16_t)) {
4717 		if(!more_read_buf_tls(fd, data,
4718 		    (char *) &data->query->tcplen + data->bytes_transmitted,
4719 		    sizeof(uint16_t) - data->bytes_transmitted, &received))
4720 			return;
4721 		data->bytes_transmitted += received;
4722 		if (data->bytes_transmitted < sizeof(uint16_t)) {
4723 			/*
4724 			 * Not done with the tcplen yet, wait for more
4725 			 * data to become available.
4726 			 */
4727 			return;
4728 		}
4729 
4730 		assert(data->bytes_transmitted == sizeof(uint16_t));
4731 
4732 		data->query->tcplen = ntohs(data->query->tcplen);
4733 
4734 		/*
4735 		 * Minimum query size is:
4736 		 *
4737 		 * Minimum query size is 17 octets:
4738 		 *   + Root domain name   (1)
4739 		 *   + Query class        (2)
4740 		 *   + Query type         (2)
4741 		 */
4742 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
4743 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
4744 			cleanup_tcp_handler(data);
4745 			return;
4746 		}
4747 
4748 		if (data->query->tcplen > data->query->maxlen) {
4749 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
4750 			cleanup_tcp_handler(data);
4751 			return;
4752 		}
4753 
4754 		buffer_set_limit(data->query->packet, data->query->tcplen);
4755 	}
4756 
4757 	assert(buffer_remaining(data->query->packet) > 0);
4758 
4759 	/* Read the (remaining) query data.  */
4760 	if(!more_read_buf_tls(fd, data, buffer_current(data->query->packet),
4761 		buffer_remaining(data->query->packet), &received))
4762 		return;
4763 	data->bytes_transmitted += received;
4764 	buffer_skip(data->query->packet, received);
4765 	if (buffer_remaining(data->query->packet) > 0) {
4766 		/*
4767 		 * Message not yet complete, wait for more data to
4768 		 * become available.
4769 		 */
4770 		return;
4771 	}
4772 
4773 	assert(buffer_position(data->query->packet) == data->query->tcplen);
4774 
4775 	/* Account... */
4776 #ifndef INET6
4777 	STATUP(data->nsd, ctls);
4778 #else
4779 	if (data->query->remote_addr.ss_family == AF_INET) {
4780 		STATUP(data->nsd, ctls);
4781 	} else if (data->query->remote_addr.ss_family == AF_INET6) {
4782 		STATUP(data->nsd, ctls6);
4783 	}
4784 #endif
4785 
4786 	/* We have a complete query, process it.  */
4787 
4788 	/* tcp-query-count: handle query counter ++ */
4789 	data->query_count++;
4790 
4791 	buffer_flip(data->query->packet);
4792 #ifdef USE_DNSTAP
4793 	/*
4794 	 * send the TCP (TLS) query, with the server (local) and client addresses, to the dnstap process
4795 	 */
4796 	log_addr("query from client", &data->query->client_addr);
4797 	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
4798 	if(verbosity >= 6 && data->query->is_proxied)
4799 		log_addr("query via proxy", &data->query->remote_addr);
4800 	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
4801 		data->query->client_addrlen, data->query->tcp, data->query->packet);
4802 #endif /* USE_DNSTAP */
4803 	data->query_state = server_process_query(data->nsd, data->query, &now);
4804 	if (data->query_state == QUERY_DISCARDED) {
4805 		/* Drop the packet and the entire connection... */
4806 		STATUP(data->nsd, dropped);
4807 		ZTATUP(data->nsd, data->query->zone, dropped);
4808 		cleanup_tcp_handler(data);
4809 		return;
4810 	}
4811 
4812 #ifdef BIND8_STATS
4813 	if (RCODE(data->query->packet) == RCODE_OK
4814 	    && !AA(data->query->packet))
4815 	{
4816 		STATUP(data->nsd, nona);
4817 		ZTATUP(data->nsd, data->query->zone, nona);
4818 	}
4819 #endif /* BIND8_STATS */
4820 
4821 #ifdef USE_ZONE_STATS
4822 #ifndef INET6
4823 	ZTATUP(data->nsd, data->query->zone, ctls);
4824 #else
4825 	if (data->query->remote_addr.ss_family == AF_INET) {
4826 		ZTATUP(data->nsd, data->query->zone, ctls);
4827 	} else if (data->query->remote_addr.ss_family == AF_INET6) {
4828 		ZTATUP(data->nsd, data->query->zone, ctls6);
4829 	}
4830 #endif
4831 #endif /* USE_ZONE_STATS */
4832 
4833 	query_add_optional(data->query, data->nsd, &now);
4834 
4835 	/* Switch to the tcp write handler.  */
4836 	buffer_flip(data->query->packet);
4837 	data->query->tcplen = buffer_remaining(data->query->packet);
4838 #ifdef BIND8_STATS
4839 	/* Account the rcode & TC... */
4840 	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
4841 	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
4842 	if (TC(data->query->packet)) {
4843 		STATUP(data->nsd, truncated);
4844 		ZTATUP(data->nsd, data->query->zone, truncated);
4845 	}
4846 #endif /* BIND8_STATS */
4847 #ifdef USE_DNSTAP
4848 	/*
4849 	 * send the TCP (TLS) response, with the server address (local, found earlier) and client address, to the dnstap process
4850 	 */
4851 	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
4852 	log_addr("response to client", &data->query->client_addr);
4853 	if(verbosity >= 6 && data->query->is_proxied)
4854 		log_addr("response via proxy", &data->query->remote_addr);
4855 	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->client_addr,
4856 		data->query->client_addrlen, data->query->tcp, data->query->packet,
4857 		data->query->zone);
4858 #endif /* USE_DNSTAP */
4859 	data->bytes_transmitted = 0;
4860 
4861 	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4862 
4863 	/* see if we can write the answer right away (usually so, EAGAIN if not) */
4864 	handle_tls_writing(fd, EV_WRITE, data);
4865 }
4866 
4867 /** handle TLS writing of outgoing response */
4868 static void
4869 handle_tls_writing(int fd, short event, void* arg)
4870 {
4871 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
4872 	ssize_t sent;
4873 	struct query *q = data->query;
4874 	/* static variable that holds reassembly buffer used to put the
4875 	 * TCP length in front of the packet, like writev. */
4876 	static buffer_type* global_tls_temp_buffer = NULL;
4877 	buffer_type* write_buffer;
4878 	uint32_t now = 0;
4879 
4880 	if ((event & EV_TIMEOUT)) {
4881 		/* Connection timed out.  */
4882 		cleanup_tcp_handler(data);
4883 		return;
4884 	}
4885 
4886 	assert((event & EV_WRITE));
4887 
4888 	if(data->shake_state != tls_hs_none) {
4889 		if(!tls_handshake(data, fd, 1))
4890 			return;
4891 		if(data->shake_state != tls_hs_none)
4892 			return;
4893 	}
4894 
4895 	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);
4896 
4897 	/* If we are writing the start of a message, we must include the length;
4898 	 * this is done with a copy into write_buffer. */
4899 	write_buffer = NULL;
4900 	if (data->bytes_transmitted == 0) {
4901 		if(!global_tls_temp_buffer) {
4902 			/* gets deallocated when nsd shuts down from
4903 			 * nsd.region */
4904 			global_tls_temp_buffer = buffer_create(nsd.region,
4905 				QIOBUFSZ + sizeof(q->tcplen));
4906 			if (!global_tls_temp_buffer) {
4907 				return;
4908 			}
4909 		}
4910 		write_buffer = global_tls_temp_buffer;
4911 		buffer_clear(write_buffer);
4912 		buffer_write_u16(write_buffer, q->tcplen);
4913 		buffer_write(write_buffer, buffer_current(q->packet),
4914 			(int)buffer_remaining(q->packet));
4915 		buffer_flip(write_buffer);
4916 	} else {
4917 		write_buffer = q->packet;
4918 	}
4919 
4920 	/* Write the response */
4921 	ERR_clear_error();
4922 	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
4923 	if(sent <= 0) {
4924 		int want = SSL_get_error(data->tls, sent);
4925 		if(want == SSL_ERROR_ZERO_RETURN) {
4926 			cleanup_tcp_handler(data);
4927 			/* closed */
4928 		} else if(want == SSL_ERROR_WANT_READ) {
4929 			/* switch back to reading */
4930 			data->shake_state = tls_hs_read_event;
4931 			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4932 		} else if(want != SSL_ERROR_WANT_WRITE) {
4933 			cleanup_tcp_handler(data);
4934 			log_crypto_err("could not SSL_write");
4935 		}
4936 		return;
4937 	}
4938 
4939 	buffer_skip(write_buffer, sent);
4940 	if(buffer_remaining(write_buffer) != 0) {
4941 		/* If not all sent, sync up the real buffer if it wasn't used.*/
4942 		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
4943 			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
4944 		}
4945 	}
4946 
4947 	data->bytes_transmitted += sent;
4948 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
4949 		/*
4950 		 * Still more data to write when socket becomes
4951 		 * writable again.
4952 		 */
4953 		return;
4954 	}
4955 
4956 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
4957 
4958 	if (data->query_state == QUERY_IN_AXFR ||
4959 		data->query_state == QUERY_IN_IXFR) {
4960 		/* Continue processing AXFR and writing back results.  */
4961 		buffer_clear(q->packet);
4962 		if(data->query_state == QUERY_IN_AXFR)
4963 			data->query_state = query_axfr(data->nsd, q, 0);
4964 		else data->query_state = query_ixfr(data->nsd, q);
4965 		if (data->query_state != QUERY_PROCESSED) {
4966 			query_add_optional(data->query, data->nsd, &now);
4967 
4968 			/* Reset data. */
4969 			buffer_flip(q->packet);
4970 			q->tcplen = buffer_remaining(q->packet);
4971 			data->bytes_transmitted = 0;
4972 			/* Reset to writing mode.  */
4973 			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
4974 
4975 			/*
4976 			 * Write data if/when the socket is writable
4977 			 * again.
4978 			 */
4979 			return;
4980 		}
4981 	}
4982 
4983 	/*
4984 	 * Done sending, wait for the next request to arrive on the
4985 	 * TCP socket by installing the TCP read handler.
4986 	 */
4987 	if ((data->nsd->tcp_query_count > 0 &&
4988 		data->query_count >= data->nsd->tcp_query_count) ||
4989 		data->tcp_no_more_queries) {
4990 
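		/* Half-close the connection: the client sees EOF on its
		 * read side while already-queued response bytes can still
		 * drain from the kernel buffers. */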
4991 		(void) shutdown(fd, SHUT_WR);
4992 	}
4993 
4994 	data->bytes_transmitted = 0;
4995 	data->query_needs_reset = 1;
4996 
4997 	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
4998 }
4999 #endif
5000 
5001 static void
5002 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
5003 	void* ATTR_UNUSED(arg))
5004 {
5005 	if(slowaccept) {
5006 		configure_handler_event_types(EV_PERSIST | EV_READ);
5007 		slowaccept = 0;
5008 	}
5009 }
5010 
5011 static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
5012 {
5013 #ifndef HAVE_ACCEPT4
5014 	int s = accept(fd, addr, addrlen);
5015 	if (s != -1) {
5016 		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
5017 			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
5018 			close(s);
5019 			s = -1;
5020 			errno = EINTR; /* Set errno to EINTR so that the caller,
5021 				which treats EINTR as benign, does not print a
5022 				second error message for this failure. */
5023 		}
5024 	}
5025 	return s;
5026 #else
5027 	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
5028 #endif /* HAVE_ACCEPT4 */
5029 }
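
/*
 * Note that accept4() with SOCK_NONBLOCK makes the accepted socket
 * non-blocking atomically with the accept; the fallback above needs a
 * separate fcntl() call on platforms without accept4().
 */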
5030 
5031 /*
5032  * Handle an incoming TCP connection.  The connection is accepted and
5033  * a new TCP reader event handler is added.  The TCP handler
5034  * is responsible for cleanup when the connection is closed.
5035  */
5036 static void
5037 handle_tcp_accept(int fd, short event, void* arg)
5038 {
5039 	struct tcp_accept_handler_data *data
5040 		= (struct tcp_accept_handler_data *) arg;
5041 	int s;
5042 	int reject = 0;
5043 	struct tcp_handler_data *tcp_data;
5044 	region_type *tcp_region;
5045 #ifdef INET6
5046 	struct sockaddr_storage addr;
5047 #else
5048 	struct sockaddr_in addr;
5049 #endif
5050 	socklen_t addrlen;
5051 	struct timeval timeout;
5052 
5053 	if (!(event & EV_READ)) {
5054 		return;
5055 	}
5056 
5057 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
5058 		reject = data->nsd->options->tcp_reject_overflow;
5059 		if (!reject) {
5060 			return;
5061 		}
5062 	}
5063 
5064 	/* Accept it... */
5065 	addrlen = sizeof(addr);
5066 	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
5067 	if (s == -1) {
5068 		/**
5069 		 * EMFILE and ENFILE signal that the limit of open
5070 		 * file descriptors has been reached. Pause accept().
5071 		 * EINTR is a signal interrupt. The others are various OS ways
5072 		 * of saying that the client has closed the connection.
5073 		 */
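		/* On EMFILE/ENFILE, accept events are disabled below for
		 * SLOW_ACCEPT_TIMEOUT seconds and re-enabled afterwards by
		 * handle_slowaccept_timeout(). */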
5074 		if (errno == EMFILE || errno == ENFILE) {
5075 			if (!slowaccept) {
5076 				/* disable accept events */
5077 				struct timeval tv;
5078 				configure_handler_event_types(0);
5079 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
5080 				tv.tv_usec = 0L;
5081 				memset(&slowaccept_event, 0,
5082 					sizeof(slowaccept_event));
5083 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
5084 					handle_slowaccept_timeout, NULL);
5085 				(void)event_base_set(data->event.ev_base,
5086 					&slowaccept_event);
5087 				(void)event_add(&slowaccept_event, &tv);
5088 				slowaccept = 1;
5089 				/* We don't want to spam the logs here */
5090 			}
5091 		} else if (errno != EINTR
5092 			&& errno != EWOULDBLOCK
5093 #ifdef ECONNABORTED
5094 			&& errno != ECONNABORTED
5095 #endif /* ECONNABORTED */
5096 #ifdef EPROTO
5097 			&& errno != EPROTO
5098 #endif /* EPROTO */
5099 			) {
5100 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
5101 		}
5102 		return;
5103 	}
5104 
5105 	if (reject) {
5106 		shutdown(s, SHUT_RDWR);
5107 		close(s);
5108 		return;
5109 	}
5110 
5111 	/*
5112 	 * This region is deallocated when the TCP connection is
5113 	 * closed by the TCP handler.
5114 	 */
5115 	tcp_region = region_create(xalloc, free);
5116 	tcp_data = (struct tcp_handler_data *) region_alloc(
5117 		tcp_region, sizeof(struct tcp_handler_data));
5118 	tcp_data->region = tcp_region;
5119 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
5120 		compression_table_size, compressed_dnames);
5121 	tcp_data->nsd = data->nsd;
5122 	tcp_data->query_count = 0;
5123 #ifdef HAVE_SSL
5124 	tcp_data->shake_state = tls_hs_none;
5125 	tcp_data->tls = NULL;
5126 #endif
5127 	tcp_data->query_needs_reset = 1;
5128 	tcp_data->pp2_enabled = data->pp2_enabled;
5129 	tcp_data->pp2_header_state = pp2_header_none;
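	/* When PROXYv2 is enabled, the read handler is expected to parse
	 * the PROXYv2 header ahead of the first DNS message and replace
	 * client_addr with the client address carried in that header. */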
5130 	tcp_data->prev = NULL;
5131 	tcp_data->next = NULL;
5132 
5133 	tcp_data->query_state = QUERY_PROCESSED;
5134 	tcp_data->bytes_transmitted = 0;
5135 	memcpy(&tcp_data->query->remote_addr, &addr, addrlen);
5136 	tcp_data->query->remote_addrlen = addrlen;
5137 	/* Copy the remote address to the client address; for streams
5138 	 * this is the simplest place and time to do that. */
5139 	memcpy(&tcp_data->query->client_addr, &addr, addrlen);
5140 	tcp_data->query->client_addrlen = addrlen;
5141 	tcp_data->query->is_proxied = 0;
5142 
5143 	tcp_data->tcp_no_more_queries = 0;
5144 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
5145 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
5146 		/* very busy, so use a shorter timeout */
5147 		tcp_data->tcp_timeout = 200;
5148 	}
5149 	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
5150 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
5151 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
5152 
5153 #ifdef USE_DNSTAP
5154 	/* save the address of the connection */
5155 	tcp_data->socket = data->socket;
5156 #endif /* USE_DNSTAP */
5157 
5158 #ifdef HAVE_SSL
5159 	if (data->tls_accept) {
5160 		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
5161 		if(!tcp_data->tls) {
5162 			close(s);
5163 			return;
5164 		}
5165 		tcp_data->shake_state = tls_hs_read;
5166 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
5167 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
5168 			  handle_tls_reading, tcp_data);
5169 	} else {
5170 #endif
5171 		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
5172 		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
5173 			  handle_tcp_reading, tcp_data);
5174 #ifdef HAVE_SSL
5175 	}
5176 #endif
5177 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
5178 		log_msg(LOG_ERR, "cannot set tcp event base");
5179 		close(s);
5180 		region_destroy(tcp_region);
5181 		return;
5182 	}
5183 	if(event_add(&tcp_data->event, &timeout) != 0) {
5184 		log_msg(LOG_ERR, "cannot add tcp to event base");
5185 		close(s);
5186 		region_destroy(tcp_region);
5187 		return;
5188 	}
5189 	if(tcp_active_list) {
5190 		tcp_active_list->prev = tcp_data;
5191 		tcp_data->next = tcp_active_list;
5192 	}
5193 	tcp_active_list = tcp_data;
5194 
5195 	/*
5196 	 * Keep track of the total number of TCP handlers installed so
5197 	 * we can stop accepting connections when the maximum number
5198 	 * of simultaneous TCP connections is reached.
5199 	 *
5200 	 * If tcp-reject-overflow is enabled, however, then we do not
5201 	 * change the handler event type; we keep it as-is and accept
5202 	 * overflow TCP connections only so that we can forcibly kill
5203 	 * them off.
5204 	 */
5205 	++data->nsd->current_tcp_count;
5206 	if (!data->nsd->options->tcp_reject_overflow &&
5207 	     data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
5208 	{
5209 		configure_handler_event_types(0);
5210 	}
5211 }
5212 
5213 static void
5214 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
5215 {
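	/* Write the command over each child's IPC socket; with a nonzero
	 * timeout, block_read() waits up to that long for the child to
	 * acknowledge before the descriptor is closed. */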
5216 	size_t i;
5217 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
5218 	for (i = 0; i < nsd->child_count; ++i) {
5219 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
5220 			if (write(nsd->children[i].child_fd,
5221 				&command,
5222 				sizeof(command)) == -1)
5223 			{
5224 				if(errno != EAGAIN && errno != EINTR)
5225 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
5226 					(int) command,
5227 					(int) nsd->children[i].pid,
5228 					strerror(errno));
5229 			} else if (timeout > 0) {
5230 				(void)block_read(NULL,
5231 					nsd->children[i].child_fd,
5232 					&command, sizeof(command), timeout);
5233 			}
5234 			fsync(nsd->children[i].child_fd);
5235 			close(nsd->children[i].child_fd);
5236 			nsd->children[i].child_fd = -1;
5237 		}
5238 	}
5239 }
5240 
5241 static void
5242 send_children_quit(struct nsd* nsd)
5243 {
5244 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
5245 	send_children_command(nsd, NSD_QUIT, 0);
5246 }
5247 
5248 static void
5249 send_children_quit_and_wait(struct nsd* nsd)
5250 {
5251 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
5252 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
5253 }
5254 
5255 #ifdef BIND8_STATS
5256 static void
5257 set_children_stats(struct nsd* nsd)
5258 {
5259 	size_t i;
5260 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
5261 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
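	/* Mark each child as needing a STATS message and enable write
	 * events on its IPC handler so the event loop sends it. */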
5262 	for (i = 0; i < nsd->child_count; ++i) {
5263 		nsd->children[i].need_to_send_STATS = 1;
5264 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
5265 	}
5266 }
5267 #endif /* BIND8_STATS */
5268 
5269 static void
5270 configure_handler_event_types(short event_types)
5271 {
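	/* This function acts as an on/off switch for the TCP accept
	 * handlers: a nonzero event_types (EV_PERSIST | EV_READ) re-arms
	 * them, while 0 removes the events, as is done while at the
	 * maximum TCP count or in slow-accept mode. */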
5272 	size_t i;
5273 
5274 	for (i = 0; i < tcp_accept_handler_count; ++i) {
5275 		struct event* handler = &tcp_accept_handlers[i].event;
5276 		if(event_types) {
5277 			/* reassign */
5278 			int fd = handler->ev_fd;
5279 			struct event_base* base = handler->ev_base;
5280 			if(tcp_accept_handlers[i].event_added)
5281 				event_del(handler);
5282 			memset(handler, 0, sizeof(*handler));
5283 			event_set(handler, fd, event_types,
5284 				handle_tcp_accept, &tcp_accept_handlers[i]);
5285 			if(event_base_set(base, handler) != 0)
5286 				log_msg(LOG_ERR, "conhand: cannot event_base");
5287 			if(event_add(handler, NULL) != 0)
5288 				log_msg(LOG_ERR, "conhand: cannot event_add");
5289 			tcp_accept_handlers[i].event_added = 1;
5290 		} else {
5291 			/* remove */
5292 			if(tcp_accept_handlers[i].event_added) {
5293 				event_del(handler);
5294 				tcp_accept_handlers[i].event_added = 0;
5295 			}
5296 		}
5297 	}
5298 }
5299