/*
 * server.c -- nsd(8) network input/output
 *
 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
 *
 * See LICENSE for the license.
 *
 */

#include "config.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <netinet/in.h>
#include <arpa/inet.h>

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <netdb.h>
#ifndef SHUT_WR
#define SHUT_WR 1
#endif
#include <openssl/rand.h>
#ifndef USE_MINI_EVENT
#  ifdef HAVE_EVENT_H
#    include <event.h>
#  else
#    include <event2/event.h>
#    include "event2/event_struct.h"
#    include "event2/event_compat.h"
#  endif
#else
#  include "mini_event.h"
#endif

#include "axfr.h"
#include "namedb.h"
#include "netio.h"
#include "xfrd.h"
#include "xfrd-tcp.h"
#include "xfrd-disk.h"
#include "difffile.h"
#include "nsec3.h"
#include "ipc.h"
#include "udb.h"
#include "remote.h"
#include "lookup3.h"
#include "rrl.h"

#define RELOAD_SYNC_TIMEOUT 25 /* seconds */

/*
 * Data for the UDP handlers.
 */
struct udp_handler_data
{
	struct nsd        *nsd;
	struct nsd_socket *socket;
	query_type        *query;
};

struct tcp_accept_handler_data {
	struct nsd         *nsd;
	struct nsd_socket  *socket;
	int event_added;
	struct event       event;
};

/*
 * These globals are used to re-enable the TCP accept handlers
 * when the number of TCP connections drops below the maximum
 * number of TCP connections.
 */
static size_t		tcp_accept_handler_count;
static struct tcp_accept_handler_data*	tcp_accept_handlers;

static struct event slowaccept_event;
static int slowaccept;

#ifndef NONBLOCKING_IS_BROKEN
#  define NUM_RECV_PER_SELECT 100
#endif

#if (!defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG))
struct mmsghdr msgs[NUM_RECV_PER_SELECT];
struct iovec iovecs[NUM_RECV_PER_SELECT];
struct query *queries[NUM_RECV_PER_SELECT];
#endif
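
/*
 * With recvmmsg(2) available, a single UDP read event drains up to
 * NUM_RECV_PER_SELECT datagrams in one system call.  The arrays are
 * parallel: msgs[i] describes datagram i and iovecs[i] points into
 * queries[i]->packet.  A minimal sketch of the pattern, with
 * process_datagram as a stand-in (handle_udp below has the real loop,
 * including error and drop handling):
 *
 *	int n = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
 *	for(i = 0; i < n; i++)
 *		process_datagram(queries[i], msgs[i].msg_len);
 */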

/*
 * Data for the TCP connection handlers.
 *
 * The TCP handlers use non-blocking I/O.  This is necessary to avoid
 * blocking the entire server on a slow TCP connection, but does make
 * reading from and writing to the socket more complicated.
 *
 * Basically, whenever a read/write would block (indicated by the
 * EAGAIN errno value) we remember the position we were reading
 * from/writing to and return from the TCP reading/writing event
 * handler.  When the socket becomes readable/writable again we
 * continue from the same position.
 */
struct tcp_handler_data
{
	/*
	 * The region used to allocate all TCP connection related
	 * data, including this structure.  This region is destroyed
	 * when the connection is closed.
	 */
	region_type*		region;

	/*
	 * The global nsd structure.
	 */
	struct nsd*			nsd;

	/*
	 * The current query data for this TCP connection.
	 */
	query_type*			query;

	/*
	 * The query_state is used to remember if we are performing an
	 * AXFR, if we're done processing, or if we should discard the
	 * query and connection.
	 */
	query_state_type	query_state;

	/*
	 * The event for the file descriptor and tcp timeout
	 */
	struct event event;

	/*
	 * The bytes_transmitted field is used to remember the number
	 * of bytes transmitted when receiving or sending a DNS
	 * packet.  The count includes the two additional bytes used
	 * to specify the packet length on a TCP connection.
	 */
	size_t				bytes_transmitted;

	/*
	 * The number of queries handled by this specific TCP connection.
	 */
	int					query_count;
};
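
/*
 * Sketch of the resume-after-EAGAIN pattern (illustrative only; buf
 * and total are stand-ins for the real query buffer bookkeeping):
 *
 *	n = write(fd, buf + data->bytes_transmitted,
 *		total - data->bytes_transmitted);
 *	if(n == -1 && errno == EAGAIN)
 *		return;  (wait for the next writable event)
 *	data->bytes_transmitted += n;
 *
 * handle_tcp_reading and handle_tcp_writing below implement this for
 * DNS messages, including the two-byte length prefix.
 */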

/*
 * Handle incoming queries on the UDP server sockets.
 */
static void handle_udp(int fd, short event, void* arg);

/*
 * Handle incoming connections on the TCP sockets.  These handlers
 * usually wait for the EV_READ event (indicating an incoming
 * connection) but are disabled when the number of current TCP
 * connections is equal to the maximum number of TCP connections.
 * Disabling is done by changing the event types the handlers wait
 * for, using the function configure_handler_event_types.
 */
static void handle_tcp_accept(int fd, short event, void* arg);

/*
 * Handle incoming queries on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete query is received.
 */
static void handle_tcp_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete response is sent.
 */
static void handle_tcp_writing(int fd, short event, void* arg);

/*
 * Send the quit command to all children (nonblocking), then close the
 * pipes.
 */
static void send_children_quit(struct nsd* nsd);
/* same, for shutdown time; waits for the children to exit to avoid restart issues */
static void send_children_quit_and_wait(struct nsd* nsd);

/* set children's flags to send NSD_STATS to them */
#ifdef BIND8_STATS
static void set_children_stats(struct nsd* nsd);
#endif /* BIND8_STATS */

/*
 * Change the event types the HANDLERS are interested in to EVENT_TYPES.
 */
static void configure_handler_event_types(short event_types);

static uint16_t *compressed_dname_offsets = 0;
static uint32_t compression_table_capacity = 0;
static uint32_t compression_table_size = 0;

/*
 * Remove the specified pid from the list of child pids.  Returns -1 if
 * the pid is not in the list, otherwise returns the child number.  The
 * stored pid is set to 0.
 */
static int
delete_child_pid(struct nsd *nsd, pid_t pid)
{
	size_t i;
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid == pid) {
			nsd->children[i].pid = 0;
			if(!nsd->children[i].need_to_exit) {
				if(nsd->children[i].child_fd != -1)
					close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
				if(nsd->children[i].handler)
					nsd->children[i].handler->fd = -1;
			}
			return i;
		}
	}
	return -1;
}

/*
 * Restart child servers if necessary.
 */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	struct main_ipc_handler_data *ipc_data;
	size_t i;
	int sv[2];

	/* Fork the child processes... */
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid <= 0) {
			if (nsd->children[i].child_fd != -1)
				close(nsd->children[i].child_fd);
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
				log_msg(LOG_ERR, "socketpair: %s",
					strerror(errno));
				return -1;
			}
			nsd->children[i].child_fd = sv[0];
			nsd->children[i].parent_fd = sv[1];
			nsd->children[i].pid = fork();
			switch (nsd->children[i].pid) {
			default: /* SERVER MAIN */
				close(nsd->children[i].parent_fd);
				nsd->children[i].parent_fd = -1;
				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				if(!nsd->children[i].handler)
				{
					ipc_data = (struct main_ipc_handler_data*) region_alloc(
						region, sizeof(struct main_ipc_handler_data));
					ipc_data->nsd = nsd;
					ipc_data->child = &nsd->children[i];
					ipc_data->child_num = i;
					ipc_data->xfrd_sock = xfrd_sock_p;
					ipc_data->packet = buffer_create(region, QIOBUFSZ);
					ipc_data->forward_mode = 0;
					ipc_data->got_bytes = 0;
					ipc_data->total_bytes = 0;
					ipc_data->acl_num = 0;
					nsd->children[i].handler = (struct netio_handler*) region_alloc(
						region, sizeof(struct netio_handler));
					nsd->children[i].handler->fd = nsd->children[i].child_fd;
					nsd->children[i].handler->timeout = NULL;
					nsd->children[i].handler->user_data = ipc_data;
					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
					nsd->children[i].handler->event_handler = parent_handle_child_command;
					netio_add_handler(netio, nsd->children[i].handler);
				}
				/* clear any ongoing ipc */
				ipc_data = (struct main_ipc_handler_data*)
					nsd->children[i].handler->user_data;
				ipc_data->forward_mode = 0;
				/* restart - update fd */
				nsd->children[i].handler->fd = nsd->children[i].child_fd;
				break;
			case 0: /* CHILD */
				/* the child need not be able to access the
				 * nsd.db file */
				namedb_close_udb(nsd->db);
				nsd->pid = 0;
				nsd->child_count = 0;
				nsd->server_kind = nsd->children[i].kind;
				nsd->this_child = &nsd->children[i];
				/* remove signal flags inherited from the parent;
				   the parent will handle them. */
				nsd->signal_hint_reload_hup = 0;
				nsd->signal_hint_reload = 0;
				nsd->signal_hint_child = 0;
				nsd->signal_hint_quit = 0;
				nsd->signal_hint_shutdown = 0;
				nsd->signal_hint_stats = 0;
				nsd->signal_hint_statsusr = 0;
				close(*xfrd_sock_p);
				close(nsd->this_child->child_fd);
				nsd->this_child->child_fd = -1;
				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				server_child(nsd);
				/* NOTREACHED */
				exit(0);
			case -1:
				log_msg(LOG_ERR, "fork failed: %s",
					strerror(errno));
				return -1;
			}
		}
	}
	return 0;
}

#ifdef BIND8_STATS
static void set_bind8_alarm(struct nsd* nsd)
{
	/* resync so that the next alarm fires on a whole period boundary */
	if(nsd->st.period > 0) /* modulo by 0 would give a division error */
		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
}
#endif
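
/*
 * Worked example (illustrative): with st.period = 60 and time(NULL)
 * at 42 seconds past the minute, the alarm is set for 60 - 42 = 18
 * seconds, keeping statistics dumps aligned to whole period boundaries
 * instead of drifting by however long each dump takes.
 */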

static void
cleanup_dname_compression_tables(void *ptr)
{
	free(ptr);
	compressed_dname_offsets = NULL;
	compression_table_capacity = 0;
}

static void
initialize_dname_compression_tables(struct nsd *nsd)
{
	size_t needed = domain_table_count(nsd->db->domains) + 1;
	needed += EXTRA_DOMAIN_NUMBERS;
	if(compression_table_capacity < needed) {
		if(compressed_dname_offsets) {
			region_remove_cleanup(nsd->db->region,
				cleanup_dname_compression_tables,
				compressed_dname_offsets);
			free(compressed_dname_offsets);
		}
		compressed_dname_offsets = (uint16_t *) xalloc(
			needed * sizeof(uint16_t));
		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
			compressed_dname_offsets);
		compression_table_capacity = needed;
		compression_table_size = domain_table_count(nsd->db->domains) + 1;
	}
	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
}
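
/*
 * The offsets table above is indexed by domain number: for each domain
 * in the database (plus EXTRA_DOMAIN_NUMBERS scratch slots) it records
 * the packet offset where that name was last written, so that later
 * occurrences can be emitted as DNS compression pointers.  Slot 0 is
 * the query name itself, which always starts at offset QHEADERSZ,
 * directly after the fixed DNS header.
 */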

/*
 * Initialize the server, create and bind the sockets.
 */
int
server_init(struct nsd *nsd)
{
	size_t i;
#if defined(SO_REUSEADDR) || (defined(INET6) && (defined(IPV6_V6ONLY) || defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU) || defined(IP_TRANSPARENT)))
	int on = 1;
#endif

	/* UDP */

	/* Make a socket... */
	for (i = 0; i < nsd->ifs; i++) {
		if (!nsd->udp[i].addr) {
			nsd->udp[i].s = -1;
			continue;
		}
		if ((nsd->udp[i].s = socket(nsd->udp[i].addr->ai_family, nsd->udp[i].addr->ai_socktype, 0)) == -1) {
#if defined(INET6)
			if (nsd->udp[i].addr->ai_family == AF_INET6 &&
				errno == EAFNOSUPPORT && nsd->grab_ip6_optional) {
				log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: not supported");
				continue;
			}
#endif /* INET6 */
			log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
			return -1;
		}

#if defined(SO_RCVBUF) || defined(SO_SNDBUF)
	if(1) {
	int rcv = 1*1024*1024;
	int snd = 1*1024*1024;

#ifdef SO_RCVBUF
#  ifdef SO_RCVBUFFORCE
	if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv,
		(socklen_t)sizeof(rcv)) < 0) {
		if(errno != EPERM && errno != ENOBUFS) {
			log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, "
				"...) failed: %s", strerror(errno));
			return -1;
		}
#  else
	if(1) {
#  endif /* SO_RCVBUFFORCE */
		if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv,
			(socklen_t)sizeof(rcv)) < 0) {
			if(errno != ENOBUFS) {
				log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, "
					"...) failed: %s", strerror(errno));
				return -1;
			}
		}
	}
#endif /* SO_RCVBUF */

#ifdef SO_SNDBUF
#  ifdef SO_SNDBUFFORCE
	if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd,
		(socklen_t)sizeof(snd)) < 0) {
		if(errno != EPERM && errno != ENOBUFS) {
			log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, "
				"...) failed: %s", strerror(errno));
			return -1;
		}
#  else
	if(1) {
#  endif /* SO_SNDBUFFORCE */
		if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_SNDBUF, (void*)&snd,
			(socklen_t)sizeof(snd)) < 0) {
			if(errno != ENOBUFS) {
				log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, "
					"...) failed: %s", strerror(errno));
				return -1;
			}
		}
	}
#endif /* SO_SNDBUF */

	}
#endif /* defined(SO_RCVBUF) || defined(SO_SNDBUF) */
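
		/*
		 * On Linux, SO_RCVBUFFORCE/SO_SNDBUFFORCE (used above when
		 * available) can raise the buffers past the net.core.rmem_max
		 * and wmem_max sysctls but require CAP_NET_ADMIN; on EPERM the
		 * code falls through to plain SO_RCVBUF/SO_SNDBUF, which clamp
		 * the 1 MiB request to the system limit instead of failing.
		 */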

#if defined(INET6)
		if (nsd->udp[i].addr->ai_family == AF_INET6) {
# if defined(IPV6_V6ONLY)
			if (setsockopt(nsd->udp[i].s,
				       IPPROTO_IPV6, IPV6_V6ONLY,
				       &on, sizeof(on)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
					strerror(errno));
				return -1;
			}
# endif
# if defined(IPV6_USE_MIN_MTU)
			/*
			 * There is no fragmentation of IPv6 datagrams
			 * during forwarding in the network.  Therefore
			 * we do not send UDP datagrams larger than
			 * the minimum IPv6 MTU of 1280 octets.  The
			 * EDNS0 message length can be larger if the
			 * network stack supports IPV6_USE_MIN_MTU.
			 */
			if (setsockopt(nsd->udp[i].s,
				       IPPROTO_IPV6, IPV6_USE_MIN_MTU,
				       &on, sizeof(on)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IPV6_USE_MIN_MTU, ...) failed: %s",
					strerror(errno));
				return -1;
			}
# elif defined(IPV6_MTU)
			/*
			 * On Linux, PMTUD is disabled by default for datagrams,
			 * so set the MTU to the minimum MTU to get the same
			 * effect.
			 */
			on = IPV6_MIN_MTU;
			if (setsockopt(nsd->udp[i].s, IPPROTO_IPV6, IPV6_MTU,
				&on, sizeof(on)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IPV6_MTU, ...) failed: %s",
					strerror(errno));
				return -1;
			}
			on = 1;
# endif
		}
#endif
#if defined(AF_INET)
		if (nsd->udp[i].addr->ai_family == AF_INET) {
#  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
			int action = IP_PMTUDISC_DONT;
			if (setsockopt(nsd->udp[i].s, IPPROTO_IP,
				IP_MTU_DISCOVER, &action, sizeof(action)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT, ...) failed: %s",
					strerror(errno));
				return -1;
			}
#  elif defined(IP_DONTFRAG)
			int off = 0;
			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_DONTFRAG,
				&off, sizeof(off)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
					strerror(errno));
				return -1;
			}
#  endif
		}
#endif
		/* Set it nonblocking; otherwise, on OSes with thundering
		   herd problems, the UDP recv could block NSD after select
		   returns readable. */
		if (fcntl(nsd->udp[i].s, F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl udp: %s", strerror(errno));
		}

		/* Bind it... */
		if (nsd->options->ip_transparent) {
#ifdef IP_TRANSPARENT
			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(..., IP_TRANSPARENT, ...) failed for udp: %s",
					strerror(errno));
			}
#endif /* IP_TRANSPARENT */
		}

		if (bind(nsd->udp[i].s, (struct sockaddr *) nsd->udp[i].addr->ai_addr, nsd->udp[i].addr->ai_addrlen) != 0) {
			log_msg(LOG_ERR, "can't bind udp socket: %s", strerror(errno));
			return -1;
		}
	}

	/* TCP */

	/* Make a socket... */
	for (i = 0; i < nsd->ifs; i++) {
		if (!nsd->tcp[i].addr) {
			nsd->tcp[i].s = -1;
			continue;
		}
		if ((nsd->tcp[i].s = socket(nsd->tcp[i].addr->ai_family, nsd->tcp[i].addr->ai_socktype, 0)) == -1) {
#if defined(INET6)
			if (nsd->tcp[i].addr->ai_family == AF_INET6 &&
				errno == EAFNOSUPPORT && nsd->grab_ip6_optional) {
				log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: not supported");
				continue;
			}
#endif /* INET6 */
			log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
			return -1;
		}

#ifdef	SO_REUSEADDR
		if (setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0) {
			log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s", strerror(errno));
		}
#endif /* SO_REUSEADDR */

#if defined(INET6)
		if (nsd->tcp[i].addr->ai_family == AF_INET6) {
# if defined(IPV6_V6ONLY)
			if (setsockopt(nsd->tcp[i].s, IPPROTO_IPV6, IPV6_V6ONLY,
				&on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed: %s", strerror(errno));
				return -1;
			}
# endif
# if defined(IPV6_USE_MIN_MTU)
			/*
			 * Use minimum MTU to minimize delays learning working
			 * PMTU when communicating through a tunnel.
			 */
			if (setsockopt(nsd->tcp[i].s,
				       IPPROTO_IPV6, IPV6_USE_MIN_MTU,
				       &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(..., IPV6_USE_MIN_MTU, ...) failed: %s", strerror(errno));
				return -1;
			}
# elif defined(IPV6_MTU)
			/*
			 * On Linux, PMTUD is disabled by default for datagrams,
			 * so set the MTU to the minimum MTU to get the same
			 * effect.
			 */
			on = IPV6_MIN_MTU;
			if (setsockopt(nsd->tcp[i].s, IPPROTO_IPV6, IPV6_MTU,
				&on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(..., IPV6_MTU, ...) failed: %s", strerror(errno));
				return -1;
			}
			on = 1;
# endif
		}
#endif
		/* Set it nonblocking; if the TCP listening socket were
		   blocking, it could block in accept, even if select()
		   says it is readable (Stevens UNP p463). */
		if (fcntl(nsd->tcp[i].s, F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl tcp: %s", strerror(errno));
		}

		/* Bind it... */
		if (nsd->options->ip_transparent) {
#ifdef IP_TRANSPARENT
			if (setsockopt(nsd->tcp[i].s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(..., IP_TRANSPARENT, ...) failed for tcp: %s",
					strerror(errno));
			}
#endif /* IP_TRANSPARENT */
		}

		if (bind(nsd->tcp[i].s, (struct sockaddr *) nsd->tcp[i].addr->ai_addr, nsd->tcp[i].addr->ai_addrlen) != 0) {
			log_msg(LOG_ERR, "can't bind tcp socket: %s", strerror(errno));
			return -1;
		}

		/* Listen to it... */
		if (listen(nsd->tcp[i].s, TCP_BACKLOG) == -1) {
			log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
			return -1;
		}
	}

	return 0;
}

/*
 * Prepare the server for take off.
 */
int
server_prepare(struct nsd *nsd)
{
#ifdef RATELIMIT
	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
#ifdef HAVE_ARC4RANDOM
	hash_set_raninit(arc4random());
#else
	uint32_t v = getpid() ^ time(NULL);
	srandom((unsigned long)v);
	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
		hash_set_raninit(v);
	else	hash_set_raninit(random());
#endif
	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
		nsd->options->rrl_ratelimit,
		nsd->options->rrl_whitelist_ratelimit,
		nsd->options->rrl_slip,
		nsd->options->rrl_ipv4_prefix_length,
		nsd->options->rrl_ipv6_prefix_length);
#endif /* RATELIMIT */

	/* Open the database... */
	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
		log_msg(LOG_ERR, "unable to open the database %s: %s",
			nsd->dbfile, strerror(errno));
		unlink(nsd->task[0]->fname);
		unlink(nsd->task[1]->fname);
		xfrd_del_tempdir(nsd);
		return -1;
	}
	/* check if zone files have been modified */
	/* NULL for taskudb because we send soainfo in a moment, batched up,
	 * for all zones */
	if(nsd->options->zonefiles_check)
		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);

	compression_table_capacity = 0;
	initialize_dname_compression_tables(nsd);

#ifdef	BIND8_STATS
	/* Initialize times... */
	time(&nsd->st.boot);
	set_bind8_alarm(nsd);
#endif /* BIND8_STATS */

	return 0;
}

/*
 * Fork the required number of servers.
 */
static int
server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	size_t i;

	/* Start all child servers initially.  */
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].pid = 0;
	}

	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
}

void
server_close_all_sockets(struct nsd_socket sockets[], size_t n)
{
	size_t i;

	/* Close all the sockets... */
	for (i = 0; i < n; ++i) {
		if (sockets[i].s != -1) {
			close(sockets[i].s);
			freeaddrinfo(sockets[i].addr);
			sockets[i].s = -1;
		}
	}
}

/*
 * Close the sockets, shutdown the server and exit.
 * Does not return.
 */
void
server_shutdown(struct nsd *nsd)
{
	size_t i;

	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	/* CHILD: close command channel to parent */
	if(nsd->this_child && nsd->this_child->parent_fd != -1)
	{
		close(nsd->this_child->parent_fd);
		nsd->this_child->parent_fd = -1;
	}
	/* SERVER: close command channels to children */
	if(!nsd->this_child)
	{
		for(i=0; i < nsd->child_count; ++i)
			if(nsd->children[i].child_fd != -1)
			{
				close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
			}
	}

	tsig_finalize();
#ifdef HAVE_SSL
	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
#endif

#if 0 /* OS collects memory pages */
	nsd_options_destroy(nsd->options);
	region_destroy(nsd->region);
#endif
	log_finalize();
	exit(0);
}

void
server_prepare_xfrd(struct nsd* nsd)
{
	char tmpfile[256];
	/* create task mmaps */
	nsd->mytask = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd.%u.task.0",
		nsd->options->xfrdir, (unsigned)getpid());
	nsd->task[0] = task_file_create(tmpfile);
	if(!nsd->task[0])
		exit(1);
	snprintf(tmpfile, sizeof(tmpfile), "%snsd.%u.task.1",
		nsd->options->xfrdir, (unsigned)getpid());
	nsd->task[1] = task_file_create(tmpfile);
	if(!nsd->task[1]) {
		unlink(nsd->task[0]->fname);
		exit(1);
	}
	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
	/* create xfrd listener structure */
	nsd->xfrd_listener = region_alloc(nsd->region,
		sizeof(netio_handler_type));
	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
	nsd->xfrd_listener->fd = -1;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
		nsd;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
		xfrd_tcp_create(nsd->region, QIOBUFSZ);
}

void
server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
{
	pid_t pid;
	int sockets[2] = {0,0};
	struct ipc_handler_conn_data *data;

	if(nsd->xfrd_listener->fd != -1)
		close(nsd->xfrd_listener->fd);
	if(del_db) {
		/* recreate taskdb that xfrd was using, it may be corrupt */
		/* we (or reload) use nsd->mytask, and xfrd uses the other */
		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
		nsd->task[1-nsd->mytask]->fname = NULL;
		/* free the alloc already, so udb does not shrink itself */
		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
		nsd->task[1-nsd->mytask]->alloc = NULL;
		udb_base_free(nsd->task[1-nsd->mytask]);
		/* create new file, overwrite the old one */
		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
		free(tmpfile);
	}
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
		return;
	}
	pid = fork();
	switch (pid) {
	case -1:
		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
		break;
	default:
		/* PARENT: close first socket, use second one */
		close(sockets[0]);
		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		if(del_db) xfrd_free_namedb(nsd);
		/* use the other task than I am using; if xfrd died and is
		 * restarted, the reload is using nsd->mytask */
		nsd->mytask = 1 - nsd->mytask;
		xfrd_init(sockets[1], nsd, del_db, reload_active);
		/* NOTREACHED */
		break;
	case 0:
		/* CHILD: close second socket, use first one */
		close(sockets[1]);
		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		nsd->xfrd_listener->fd = sockets[0];
		break;
	}
	/* server-parent only */
	nsd->xfrd_listener->timeout = NULL;
	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
	/* clear ongoing ipc reads */
	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
	data->conn->is_reading = 0;
}

/** add all soainfo to taskdb */
static void
add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
{
	struct radnode* n;
	udb_ptr task_last; /* last task; mytask is empty, so NULL */
	/* add all SOA INFO to mytask */
	udb_ptr_init(&task_last, taskudb);
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
	}
	udb_ptr_unlink(&task_last, taskudb);
}

void
server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
{
	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
	 *   parent fills one taskdb with soas, xfrd fills the other with expires.
	 *   then they exchange and process.
	 * shortsoa: xfrd crashed and needs to be restarted, and one taskdb
	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
	 *   expire notifications can be sent back via a normal reload later
	 *   (xfrd will wait for the currently running reload to finish, if any).
	 */
	sig_atomic_t cmd = 0;
#ifdef BIND8_STATS
	pid_t mypid;
#endif
	int xfrd_sock = nsd->xfrd_listener->fd;
	struct udb_base* taskudb = nsd->task[nsd->mytask];
	udb_ptr t;
	if(shortsoa) {
		/* put SOA in xfrd task because mytask may be in use */
		taskudb = nsd->task[1-nsd->mytask];
	}

	add_all_soa_to_task(nsd, taskudb);
	if(!shortsoa) {
		/* wait for xfrd to signal task is ready, RELOAD signal */
		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
			cmd != NSD_RELOAD) {
			log_msg(LOG_ERR, "did not get start signal from xfrd");
			exit(1);
		}
	}
	/* give xfrd our task, signal it with RELOAD_DONE */
	task_process_sync(taskudb);
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
			(int)nsd->pid, strerror(errno));
	}
#ifdef BIND8_STATS
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}
#endif

	if(!shortsoa) {
		/* process the xfrd task list (expiry data) */
		nsd->mytask = 1 - nsd->mytask;
		taskudb = nsd->task[nsd->mytask];
		task_remap(taskudb);
		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
		while(!udb_ptr_is_null(&t)) {
			task_process_expire(nsd->db, TASKLIST(&t));
			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
		}
		udb_ptr_unlink(&t, taskudb);
		task_clear(taskudb);

		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
		cmd = NSD_RELOAD_DONE;
		if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) {
			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
				(int)nsd->pid, strerror(errno));
		}
	}
}

/* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
ssize_t
block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
{
	uint8_t* buf = (uint8_t*) p;
	ssize_t total = 0;
	fd_set rfds;
	struct timeval tv;
	FD_ZERO(&rfds);

	while(total < sz) {
		ssize_t ret;
		FD_SET(s, &rfds);
		tv.tv_sec = timeout;
		tv.tv_usec = 0;
		ret = select(s+1, &rfds, NULL, NULL, timeout==-1?NULL:&tv);
		if(ret == -1) {
			if(errno == EAGAIN)
				/* blocking read */
				continue;
			if(errno == EINTR) {
				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
					return -1;
				/* other signals can be handled later */
				continue;
			}
			/* some error */
			return -1;
		}
		if(ret == 0) {
			/* operation timed out */
			return -2;
		}
		ret = read(s, buf+total, sz-total);
		if(ret == -1) {
			if(errno == EAGAIN)
				/* blocking read */
				continue;
			if(errno == EINTR) {
				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
					return -1;
				/* other signals can be handled later */
				continue;
			}
			/* some error */
			return -1;
		}
		if(ret == 0) {
			/* closed connection! */
			return 0;
		}
		total += ret;
	}
	return total;
}
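
/*
 * Usage sketch (illustrative): reading one IPC command with a bounded
 * wait.
 *
 *	sig_atomic_t cmd;
 *	ssize_t r = block_read(nsd, fd, &cmd, sizeof(cmd),
 *		RELOAD_SYNC_TIMEOUT);
 *	r == sizeof(cmd): got a complete command
 *	r == -2: timed out
 *	r == -1 or 0: error, or closed descriptor
 *
 * Note that the timeout applies to each select() call, not to the
 * whole read: with partial reads the total wait can exceed it.
 */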

static void
reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
{
	sig_atomic_t cmd = NSD_QUIT_SYNC;
	udb_ptr t, next;
	udb_base* u = nsd->task[nsd->mytask];
	udb_ptr_init(&next, u);
	udb_ptr_new(&t, u, udb_base_get_userdata(u));
	udb_base_set_userdata(u, 0);
	while(!udb_ptr_is_null(&t)) {
		/* store next in list so this one can be deleted or reused */
		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
		udb_rptr_zero(&TASKLIST(&t)->next, u);

		/* process task t */
		/* append results for task t and update last_task */
		task_process_in_reload(nsd, u, last_task, &t);

		/* go to next */
		udb_ptr_set_ptr(&t, u, &next);

		/* if the parent has quit, we must quit too, poll the fd for cmds */
		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
			if(cmd == NSD_QUIT) {
				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
				/* sync to disk (if needed) */
				udb_base_sync(nsd->db->udb, 0);
				/* unlink files of remainder of tasks */
				while(!udb_ptr_is_null(&t)) {
					if(TASKLIST(&t)->task_type == task_apply_xfr) {
						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
					}
					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
				}
				udb_ptr_unlink(&t, u);
				udb_ptr_unlink(&next, u);
				exit(0);
			}
		}
	}
	udb_ptr_unlink(&t, u);
	udb_ptr_unlink(&next, u);
}

#ifdef BIND8_STATS
static void
parent_send_stats(struct nsd* nsd, int cmdfd)
{
	size_t i;
	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
		log_msg(LOG_ERR, "could not write stats to reload");
		return;
	}
	for(i=0; i<nsd->child_count; i++)
		if(!write_socket(cmdfd, &nsd->children[i].query_count,
			sizeof(stc_t))) {
			log_msg(LOG_ERR, "could not write stats to reload");
			return;
		}
}

static void
reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
{
	struct nsdst s;
	stc_t* p;
	size_t i;
	if(block_read(nsd, cmdfd, &s, sizeof(s),
		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
		log_msg(LOG_ERR, "could not read stats from oldpar");
		return;
	}
	s.db_disk = nsd->db->udb->base_size;
	s.db_mem = region_get_mem(nsd->db->region);
	p = (stc_t*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
		nsd->child_count);
	if(!p) return;
	for(i=0; i<nsd->child_count; i++) {
		if(block_read(nsd, cmdfd, p++, sizeof(stc_t), 1)!=sizeof(stc_t))
			return;
	}
}
#endif /* BIND8_STATS */

/*
 * Reload the database, stop the parent, re-fork the children and
 * continue as server_main.
 */
static void
server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
	int cmdsocket)
{
#ifdef BIND8_STATS
	pid_t mypid;
#endif
	sig_atomic_t cmd = NSD_QUIT_SYNC;
	int ret;
	udb_ptr last_task;

	/* see what tasks we got from xfrd */
	task_remap(nsd->task[nsd->mytask]);
	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
	reload_process_tasks(nsd, &last_task, cmdsocket);

#ifndef NDEBUG
	if(nsd_debug_level >= 1)
		region_log_stats(nsd->db->region);
#endif /* NDEBUG */
	/* sync to disk (if needed) */
	udb_base_sync(nsd->db->udb, 0);

	initialize_dname_compression_tables(nsd);

#ifdef BIND8_STATS
	/* Restart dumping stats if required.  */
	time(&nsd->st.boot);
	set_bind8_alarm(nsd);
#endif

	/* Start new child processes */
	if (server_start_children(nsd, server_region, netio, &nsd->
		xfrd_listener->fd) != 0) {
		send_children_quit(nsd);
		exit(1);
	}

	/* if the parent has quit, we must quit too, poll the fd for cmds */
	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
		if(cmd == NSD_QUIT) {
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
			send_children_quit(nsd);
			exit(0);
		}
	}

	/* Send quit command to parent: blocking, wait for receipt. */
	do {
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
		{
			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
				strerror(errno));
		}
		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
			RELOAD_SYNC_TIMEOUT);
		if(ret == -2) {
			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
		}
	} while (ret == -2);
	if(ret == -1) {
		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
			strerror(errno));
	}
	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
	if(cmd == NSD_QUIT) {
		/* small race condition possible here, parent got quit cmd. */
		send_children_quit(nsd);
		exit(1);
	}
	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
#ifdef BIND8_STATS
	reload_do_stats(cmdsocket, nsd, &last_task);
#endif
	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
	task_process_sync(nsd->task[nsd->mytask]);

	/* send soainfo to the xfrd process, signal it that reload is done,
	 * it picks up the taskudb */
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
			strerror(errno));
	}
#ifdef BIND8_STATS
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}
#endif

	/* try to reopen the log file */
	if (nsd->file_rotation_ok)
		log_reopen(nsd->log_filename, 1);
	/* exit reload, continue as new server_main */
}

/*
 * Get the mode depending on the signal hints that have been received.
 * Multiple signal hints can be received and will be handled in turn.
 */
static sig_atomic_t
server_signal_mode(struct nsd *nsd)
{
	if(nsd->signal_hint_quit) {
		nsd->signal_hint_quit = 0;
		return NSD_QUIT;
	}
	else if(nsd->signal_hint_shutdown) {
		nsd->signal_hint_shutdown = 0;
		return NSD_SHUTDOWN;
	}
	else if(nsd->signal_hint_child) {
		nsd->signal_hint_child = 0;
		return NSD_REAP_CHILDREN;
	}
	else if(nsd->signal_hint_reload) {
		nsd->signal_hint_reload = 0;
		return NSD_RELOAD;
	}
	else if(nsd->signal_hint_reload_hup) {
		nsd->signal_hint_reload_hup = 0;
		return NSD_RELOAD_REQ;
	}
	else if(nsd->signal_hint_stats) {
		nsd->signal_hint_stats = 0;
#ifdef BIND8_STATS
		set_bind8_alarm(nsd);
#endif
		return NSD_STATS;
	}
	else if(nsd->signal_hint_statsusr) {
		nsd->signal_hint_statsusr = 0;
		return NSD_STATS;
	}
	return NSD_RUN;
}

/*
 * The main server simply waits for signals and child processes to
 * terminate.  Child processes are restarted as necessary.
 */
void
server_main(struct nsd *nsd)
{
	region_type *server_region = region_create(xalloc, free);
	netio_type *netio = netio_create(server_region);
	netio_handler_type reload_listener;
	int reload_sockets[2] = {-1, -1};
	struct timespec timeout_spec;
	int status;
	pid_t child_pid;
	pid_t reload_pid = -1;
	sig_atomic_t mode;

	/* Ensure we are the main process */
	assert(nsd->server_kind == NSD_SERVER_MAIN);

	/* Add listener for the XFRD process */
	netio_add_handler(netio, nsd->xfrd_listener);

	/* Start the child processes that handle incoming queries */
	if (server_start_children(nsd, server_region, netio,
		&nsd->xfrd_listener->fd) != 0) {
		send_children_quit(nsd);
		exit(1);
	}
	reload_listener.fd = -1;

	/* this_child MUST be 0, because this is the parent process */
	assert(nsd->this_child == 0);

	/* Run the server until we get a shutdown signal */
	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
		/* Did we receive a signal that changes our mode? */
		if(mode == NSD_RUN) {
			nsd->mode = mode = server_signal_mode(nsd);
		}

		switch (mode) {
		case NSD_RUN:
			/* see if any child processes terminated */
			while((child_pid = waitpid(0, &status, WNOHANG)) != -1 && child_pid != 0) {
				int is_child = delete_child_pid(nsd, child_pid);
				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
					if(nsd->children[is_child].child_fd == -1)
						nsd->children[is_child].has_exited = 1;
					parent_check_all_children_exited(nsd);
				} else if(is_child != -1) {
					log_msg(LOG_WARNING,
					       "server %d died unexpectedly with status %d, restarting",
					       (int) child_pid, status);
					restart_child_servers(nsd, server_region, netio,
						&nsd->xfrd_listener->fd);
				} else if (child_pid == reload_pid) {
					sig_atomic_t cmd = NSD_RELOAD_DONE;
#ifdef BIND8_STATS
					pid_t mypid;
#endif
					log_msg(LOG_WARNING,
					       "Reload process %d failed with status %d, continuing with old database",
					       (int) child_pid, status);
					reload_pid = -1;
					if(reload_listener.fd != -1) close(reload_listener.fd);
					reload_listener.fd = -1;
					reload_listener.event_types = NETIO_EVENT_NONE;
					task_process_sync(nsd->task[nsd->mytask]);
					/* inform xfrd reload attempt ended */
					if(!write_socket(nsd->xfrd_listener->fd,
						&cmd, sizeof(cmd))) {
						log_msg(LOG_ERR, "problems "
						  "sending SOAEND to xfrd: %s",
						  strerror(errno));
					}
#ifdef BIND8_STATS
					mypid = getpid();
					if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
							strerror(errno));
					}
#endif
				} else {
					log_msg(LOG_WARNING,
					       "Unknown child %d terminated with status %d",
					       (int) child_pid, status);
				}
			}
			if (child_pid == -1) {
				if (errno == EINTR) {
					continue;
				}
				log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
			}
			if (nsd->mode != NSD_RUN)
				break;

			/* timeout to collect processes, in case no SIGCHLD arrives */
			timeout_spec.tv_sec = 60;
			timeout_spec.tv_nsec = 0;

			/* listen on ports, timeout for collecting terminated children */
			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
				if (errno != EINTR) {
					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
				}
			}

			break;
		case NSD_RELOAD_REQ: {
			sig_atomic_t cmd = NSD_RELOAD_REQ;
			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
			DEBUG(DEBUG_IPC,1, (LOG_INFO,
				"main: ipc send reload_req to xfrd"));
			if(!write_socket(nsd->xfrd_listener->fd,
				&cmd, sizeof(cmd))) {
				log_msg(LOG_ERR, "server_main: could not send "
				"reload_req to xfrd: %s", strerror(errno));
			}
			nsd->mode = NSD_RUN;
			} break;
		case NSD_RELOAD:
			/* Continue to run nsd after reload */
			nsd->mode = NSD_RUN;
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
			if (reload_pid != -1) {
				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
				       (int) reload_pid);
				break;
			}

			/* switch mytask to keep track of who owns the task */
			nsd->mytask = 1 - nsd->mytask;
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
				reload_pid = -1;
				break;
			}

			/* Do actual reload */
			reload_pid = fork();
			switch (reload_pid) {
			case -1:
				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
				break;
			case 0:
				/* CHILD */
				close(reload_sockets[0]);
				server_reload(nsd, server_region, netio,
					reload_sockets[1]);
				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
				close(reload_sockets[1]);
				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
				/* drop stale xfrd ipc data */
				((struct ipc_handler_conn_data*)nsd->
					xfrd_listener->user_data)
					->conn->is_reading = 0;
				reload_pid = -1;
				reload_listener.fd = -1;
				reload_listener.event_types = NETIO_EVENT_NONE;
				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
				break;
			default:
				/* PARENT, keep running until NSD_QUIT_SYNC
				 * received from CHILD.
				 */
				close(reload_sockets[1]);
				reload_listener.fd = reload_sockets[0];
				reload_listener.timeout = NULL;
				reload_listener.user_data = nsd;
				reload_listener.event_types = NETIO_EVENT_READ;
				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
				netio_add_handler(netio, &reload_listener);
				break;
			}
			break;
		case NSD_QUIT_SYNC:
			/* synchronisation of xfrd, parent and reload */
			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
				sig_atomic_t cmd = NSD_RELOAD;
				/* stop xfrd ipc writes in progress */
				DEBUG(DEBUG_IPC,1, (LOG_INFO,
					"main: ipc send indication reload"));
				if(!write_socket(nsd->xfrd_listener->fd,
					&cmd, sizeof(cmd))) {
					log_msg(LOG_ERR, "server_main: could not send reload "
					"indication to xfrd: %s", strerror(errno));
				}
				/* wait for ACK from xfrd */
				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
				nsd->quit_sync_done = 1;
			}
			nsd->mode = NSD_RUN;
			break;
		case NSD_QUIT:
			/* silent shutdown during reload */
			if(reload_listener.fd != -1) {
				/* acknowledge the quit, to sync reload that we will really quit now */
				sig_atomic_t cmd = NSD_RELOAD;
				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
					log_msg(LOG_ERR, "server_main: "
						"could not ack quit: %s", strerror(errno));
				}
#ifdef BIND8_STATS
				parent_send_stats(nsd, reload_listener.fd);
#endif /* BIND8_STATS */
				close(reload_listener.fd);
			}
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
			/* only quit children after xfrd has acked */
			send_children_quit(nsd);

#if 0 /* OS collects memory pages */
			region_destroy(server_region);
#endif
			server_shutdown(nsd);

			/* NOTREACHED */
			break;
		case NSD_SHUTDOWN:
			log_msg(LOG_WARNING, "signal received, shutting down...");
			break;
		case NSD_REAP_CHILDREN:
			/* continue; wait for child in run loop */
			nsd->mode = NSD_RUN;
			break;
		case NSD_STATS:
#ifdef BIND8_STATS
			set_children_stats(nsd);
#endif
			nsd->mode = NSD_RUN;
			break;
		default:
			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
			nsd->mode = NSD_RUN;
			break;
		}
	}

	/* close opened ports to avoid race with restart of nsd */
	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
#ifdef HAVE_SSL
	daemon_remote_close(nsd->rc);
#endif
	send_children_quit_and_wait(nsd);

	/* Unlink the pidfile if possible... */
	unlinkpid(nsd->pidfile);
	unlink(nsd->task[0]->fname);
	unlink(nsd->task[1]->fname);

	if(reload_listener.fd != -1) {
		sig_atomic_t cmd = NSD_QUIT;
		DEBUG(DEBUG_IPC,1, (LOG_INFO,
			"main: ipc send quit to reload-process"));
		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
				strerror(errno));
		}
		fsync(reload_listener.fd);
		close(reload_listener.fd);
		/* wait for reload to finish processing */
		while(1) {
			if(waitpid(reload_pid, NULL, 0) == -1) {
				if(errno == EINTR) continue;
				if(errno == ECHILD) break;
				log_msg(LOG_ERR, "waitpid(reload %d): %s",
					(int)reload_pid, strerror(errno));
			}
			break;
		}
	}
	if(nsd->xfrd_listener->fd != -1) {
		/* complete quit, stop xfrd */
		sig_atomic_t cmd = NSD_QUIT;
		DEBUG(DEBUG_IPC,1, (LOG_INFO,
			"main: ipc send quit to xfrd"));
		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
				strerror(errno));
		}
		fsync(nsd->xfrd_listener->fd);
		close(nsd->xfrd_listener->fd);
		(void)kill(nsd->pid, SIGTERM);
	}

#if 0 /* OS collects memory pages */
	region_destroy(server_region);
#endif
	/* write the nsd.db to disk, wait for it to complete */
	udb_base_sync(nsd->db->udb, 1);
	udb_base_close(nsd->db->udb);
	server_shutdown(nsd);
}

static query_state_type
server_process_query(struct nsd *nsd, struct query *query)
{
	return query_process(query, nsd);
}

static query_state_type
server_process_query_udp(struct nsd *nsd, struct query *query)
{
#ifdef RATELIMIT
	if(query_process(query, nsd) != QUERY_DISCARDED) {
		if(rrl_process_query(query))
			return rrl_slip(query);
		else	return QUERY_PROCESSED;
	}
	return QUERY_DISCARDED;
#else
	return query_process(query, nsd);
#endif
}
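
/*
 * In the RATELIMIT build, rrl_process_query() checks the processed
 * query against the configured rate limits, and rrl_slip() chooses the
 * penalty for an over-limit query: usually the answer is dropped, but
 * a configurable fraction of replies "slips" through truncated (TC)
 * so that legitimate clients behind a spoofed source can retry over
 * TCP.  See rrl.c for the slip/drop policy details.
 */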
1555 
1556 struct event_base*
1557 nsd_child_event_base(void)
1558 {
1559 	struct event_base* base;
1560 #ifdef USE_MINI_EVENT
1561 	static time_t secs;
1562 	static struct timeval now;
1563 	base = event_init(&secs, &now);
1564 #else
1565 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
1566 	/* libev */
1567 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
1568 #  else
1569 	/* libevent */
1570 #    ifdef HAVE_EVENT_BASE_NEW
1571 	base = event_base_new();
1572 #    else
1573 	base = event_init();
1574 #    endif
1575 #  endif
1576 #endif
1577 	return base;
1578 }
1579 
1580 /*
1581  * Serve DNS requests.
1582  */
1583 void
1584 server_child(struct nsd *nsd)
1585 {
1586 	size_t i;
1587 	region_type *server_region = region_create(xalloc, free);
1588 	struct event_base* event_base = nsd_child_event_base();
1589 	query_type *udp_query;
1590 	sig_atomic_t mode;
1591 
1592 	if(!event_base) {
1593 		log_msg(LOG_ERR, "nsd server could not create event base");
1594 		exit(1);
1595 	}
1596 
1597 #ifdef RATELIMIT
1598 	rrl_init((nsd->this_child - nsd->children)/sizeof(nsd->children[0]));
1599 #endif
1600 
1601 	assert(nsd->server_kind != NSD_SERVER_MAIN);
1602 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
1603 
1604 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
1605 		server_close_all_sockets(nsd->tcp, nsd->ifs);
1606 	}
1607 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
1608 		server_close_all_sockets(nsd->udp, nsd->ifs);
1609 	}
1610 
1611 	if (nsd->this_child && nsd->this_child->parent_fd != -1) {
1612 		struct event *handler;
1613 		struct ipc_handler_conn_data* user_data =
1614 			(struct ipc_handler_conn_data*)region_alloc(
1615 			server_region, sizeof(struct ipc_handler_conn_data));
1616 		user_data->nsd = nsd;
1617 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
1618 
1619 		handler = (struct event*) region_alloc(
1620 			server_region, sizeof(*handler));
1621 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
1622 			EV_READ, child_handle_parent_command, user_data);
1623 		if(event_base_set(event_base, handler) != 0)
1624 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
1625 		if(event_add(handler, NULL) != 0)
1626 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
1627 	}
1628 
1629 	if (nsd->server_kind & NSD_SERVER_UDP) {
1630 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
1631 		udp_query = query_create(server_region,
1632 			compressed_dname_offsets, compression_table_size);
1633 #else
1634 		udp_query = NULL;
1635 		memset(msgs, 0, sizeof(msgs));
1636 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
1637 			queries[i] = query_create(server_region,
1638 				compressed_dname_offsets, compression_table_size);
1639 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
1640 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
1641 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);;
1642 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
1643 			msgs[i].msg_hdr.msg_iovlen  = 1;
1644 			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
1645 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
1646 		}
1647 #endif
1648 		for (i = 0; i < nsd->ifs; ++i) {
1649 			struct udp_handler_data *data;
1650 			struct event *handler;
1651 
1652 			data = (struct udp_handler_data *) region_alloc(
1653 				server_region,
1654 				sizeof(struct udp_handler_data));
1655 			data->query = udp_query;
1656 			data->nsd = nsd;
1657 			data->socket = &nsd->udp[i];
1658 
1659 			handler = (struct event*) region_alloc(
1660 				server_region, sizeof(*handler));
1661 			event_set(handler, nsd->udp[i].s, EV_PERSIST|EV_READ,
1662 				handle_udp, data);
1663 			if(event_base_set(event_base, handler) != 0)
1664 				log_msg(LOG_ERR, "nsd udp: event_base_set failed");
1665 			if(event_add(handler, NULL) != 0)
1666 				log_msg(LOG_ERR, "nsd udp: event_add failed");
1667 		}
1668 	}
1669 
1670 	/*
1671 	 * Keep track of all the TCP accept handlers so we can enable
1672 	 * and disable them based on the current number of active TCP
1673 	 * connections.
1674 	 */
1675 	tcp_accept_handler_count = nsd->ifs;
1676 	tcp_accept_handlers = (struct tcp_accept_handler_data*) region_alloc(
1677 		server_region, nsd->ifs * sizeof(*tcp_accept_handlers));
1678 	if (nsd->server_kind & NSD_SERVER_TCP) {
1679 		for (i = 0; i < nsd->ifs; ++i) {
1680 			struct event *handler = &tcp_accept_handlers[i].event;
1681 			struct tcp_accept_handler_data* data =
1682 				&tcp_accept_handlers[i];
1683 			data->nsd = nsd;
1684 			data->socket = &nsd->tcp[i];
1685 			event_set(handler, nsd->tcp[i].s, EV_PERSIST|EV_READ,
1686 				handle_tcp_accept, data);
1687 			if(event_base_set(event_base, handler) != 0)
1688 				log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
1689 			if(event_add(handler, NULL) != 0)
1690 				log_msg(LOG_ERR, "nsd tcp: event_add failed");
1691 			data->event_added = 1;
1692 		}
1693 	} else tcp_accept_handler_count = 0;
1694 
1695 	/* The main loop... */
1696 	while ((mode = nsd->mode) != NSD_QUIT) {
1697 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
1698 
1699 		/* Do we need to do the statistics... */
1700 		if (mode == NSD_STATS) {
1701 #ifdef BIND8_STATS
1702 			int p = nsd->st.period;
1703 			nsd->st.period = 1; /* force stats printout */
1704 			/* Dump the statistics */
1705 			bind8_stats(nsd);
1706 			nsd->st.period = p;
1707 #else /* !BIND8_STATS */
1708 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
1709 #endif /* BIND8_STATS */
1710 
1711 			nsd->mode = NSD_RUN;
1712 		}
1713 		else if (mode == NSD_REAP_CHILDREN) {
1714 			/* got signal, notify parent. parent reaps terminated children. */
1715 			if (nsd->this_child->parent_fd != -1) {
1716 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
1717 				if (write(nsd->this_child->parent_fd,
1718 				    &parent_notify,
1719 				    sizeof(parent_notify)) == -1)
1720 				{
1721 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
1722 						(int) nsd->this_child->pid, strerror(errno));
1723 				}
1724 			} else /* no parent, so reap 'em */
1725 				while (waitpid(0, NULL, WNOHANG) > 0) ;
1726 			nsd->mode = NSD_RUN;
1727 		}
1728 		else if(mode == NSD_RUN) {
1729 			/* Wait for a query... */
1730 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
1731 				if (errno != EINTR) {
1732 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
1733 					break;
1734 				}
1735 			}
1736 		} else if(mode == NSD_QUIT) {
1737 			/* ignore here, quit */
1738 		} else {
1739 			log_msg(LOG_ERR, "bad mode value %d, returning to service.",
1740 				(int)mode);
1741 			nsd->mode = NSD_RUN;
1742 		}
1743 	}
1744 
1745 #ifdef	BIND8_STATS
1746 	bind8_stats(nsd);
1747 #endif /* BIND8_STATS */
1748 
1749 #if 0 /* OS collects memory pages */
1750 	event_base_free(event_base);
1751 	region_destroy(server_region);
1752 #endif
1753 	server_shutdown(nsd);
1754 }
1755 
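/*
 * Editorial sketch: the handle_udp() variants below batch datagrams with
 * recvmmsg(2)/sendmmsg(2), filling an array of mmsghdr/iovec slots once and
 * reusing it on every event callback.  A minimal version of the pattern,
 * without NSD's query bookkeeping, is kept under #if 0 below; BATCH and the
 * 512-byte buffers are illustrative values, not NSD constants.
 */
#if 0
#define BATCH 8
static void
udp_batch_echo_sketch(int fd)
{
	static char bufs[BATCH][512];
	static struct iovec iov[BATCH];
	static struct mmsghdr mm[BATCH];
	static struct sockaddr_storage from[BATCH];
	int i, n;

	memset(mm, 0, sizeof(mm));
	for (i = 0; i < BATCH; i++) {
		iov[i].iov_base = bufs[i];
		iov[i].iov_len = sizeof(bufs[i]);
		mm[i].msg_hdr.msg_iov = &iov[i];
		mm[i].msg_hdr.msg_iovlen = 1;
		mm[i].msg_hdr.msg_name = &from[i];
		mm[i].msg_hdr.msg_namelen = sizeof(from[i]);
	}
	/* one syscall receives up to BATCH datagrams; msg_len is set per slot */
	n = recvmmsg(fd, mm, BATCH, 0, NULL);
	if (n <= 0)
		return;
	/* echo each datagram back to its sender with a single syscall */
	for (i = 0; i < n; i++)
		iov[i].iov_len = mm[i].msg_len;
	(void)sendmmsg(fd, mm, n, 0);
}
#endif
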
1756 #if defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG)
1757 static void
1758 handle_udp(int fd, short event, void* arg)
1759 {
1760 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
1761 	int received, sent, recvcount, i;
1762 	struct query *q;
1763 
1764 	if (!(event & EV_READ)) {
1765 		return;
1766 	}
1767 	recvcount = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
1768 	/* this printf strangely gave a performance increase on Linux */
1769 	/* printf("recvcount %d \n", recvcount); */
1770 	if (recvcount == -1) {
1771 		if (errno != EAGAIN && errno != EINTR) {
1772 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
1773 			STATUP(data->nsd, rxerr);
1774 		}
1775 		/* Simply no data available */
1776 		return;
1777 	}
1778 	for (i = 0; i < recvcount; i++) {
1779 	loopstart:
1780 		received = msgs[i].msg_len;
1781 		q = queries[i];
1782 		if (received == -1) {
1783 			log_msg(LOG_ERR, "recvmmsg %d failed: msg_flags 0x%x",
1784 				i, (unsigned)msgs[i].msg_hdr.msg_flags);
1785 			STATUP(data->nsd, rxerr);
1786 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
1787 			iovecs[i].iov_len = buffer_remaining(q->packet);
1788 			goto swap_drop;
1789 		}
1790 
1791 		/* Account... */
1792 		if (data->socket->addr->ai_family == AF_INET) {
1793 			STATUP(data->nsd, qudp);
1794 		} else if (data->socket->addr->ai_family == AF_INET6) {
1795 			STATUP(data->nsd, qudp6);
1796 		}
1797 
1798 		buffer_skip(q->packet, received);
1799 		buffer_flip(q->packet);
1800 
1801 		/* Process and answer the query... */
1802 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
1803 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
1804 				STATUP(data->nsd, nona);
1805 			}
1806 
1807 			/* Add EDNS0 and TSIG info if necessary.  */
1808 			query_add_optional(q, data->nsd);
1809 
1810 			buffer_flip(q->packet);
1811 			iovecs[i].iov_len = buffer_remaining(q->packet);
1812 #ifdef BIND8_STATS
1813 			/* Account the rcode & TC... */
1814 			STATUP2(data->nsd, rcode, RCODE(q->packet));
1815 			if (TC(q->packet))
1816 				STATUP(data->nsd, truncated);
1817 #endif /* BIND8_STATS */
1818 		} else {
1819 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
1820 			iovecs[i].iov_len = buffer_remaining(q->packet);
1821 		swap_drop:
1822 			STATUP(data->nsd, dropped);
1823 			if(i != recvcount-1) {
1824 				/* swap with last and decrease recvcount */
1825 				struct mmsghdr mtmp = msgs[i];
1826 				struct iovec iotmp = iovecs[i];
1827 				recvcount--;
1828 				msgs[i] = msgs[recvcount];
1829 				iovecs[i] = iovecs[recvcount];
1830 				queries[i] = queries[recvcount];
1831 				msgs[recvcount] = mtmp;
1832 				iovecs[recvcount] = iotmp;
1833 				queries[recvcount] = q;
1834 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
1835 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
1836 				goto loopstart;
1837 			} else { recvcount--; }
1838 		}
1839 	}
1840 
1841 	/* send until all are sent */
1842 	i = 0;
1843 	while(i<recvcount) {
1844 		sent = sendmmsg(fd, &msgs[i], recvcount-i, 0);
1845 		if(sent == -1) {
1846 			const char* es = strerror(errno);
1847 			char a[48];
1848 			addr2str(&queries[i]->addr, a, sizeof(a));
1849 			log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
1850 #ifdef BIND8_STATS
1851 			data->nsd->st.txerr += recvcount-i;
1852 #endif /* BIND8_STATS */
1853 			break;
1854 		}
1855 		i += sent;
1856 	}
1857 	for(i=0; i<recvcount; i++) {
1858 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
1859 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
1860 	}
1861 }
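
/*
 * Editorial note: the swap_drop path above compacts dropped queries out of
 * msgs[]/iovecs[]/queries[] by swapping the dropped slot with the last
 * received one and shrinking recvcount, so the sendmmsg() loop sees a dense
 * array of answers.  A minimal sketch of the same remove-by-swap technique
 * on a plain int array (function name is illustrative only):
 */
#if 0
static int
remove_by_swap_sketch(int *arr, int count, int i)
{
	/* overwrite slot i with the last element; order is not preserved */
	arr[i] = arr[--count];
	/* caller must re-examine slot i, cf. the goto loopstart above */
	return count;
}
#endif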
1862 
1863 #else /* defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG) */
1864 
1865 static void
1866 handle_udp(int fd, short event, void* arg)
1867 {
1868 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
1869 	int received, sent;
1870 #ifndef NONBLOCKING_IS_BROKEN
1871 #ifdef HAVE_RECVMMSG
1872 	int recvcount;
1873 #endif /* HAVE_RECVMMSG */
1874 	int i;
1875 #endif /* NONBLOCKING_IS_BROKEN */
1876 	struct query *q;
1877 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
1878 	q = data->query;
1879 #endif
1880 
1881 	if (!(event & EV_READ)) {
1882 		return;
1883 	}
1884 #ifndef NONBLOCKING_IS_BROKEN
1885 #ifdef HAVE_RECVMMSG
1886 	recvcount = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
1887 	/* this printf strangely gave a performance increase on Linux */
1888 	/* printf("recvcount %d \n", recvcount); */
1889 	if (recvcount == -1) {
1890 		if (errno != EAGAIN && errno != EINTR) {
1891 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
1892 			STATUP(data->nsd, rxerr);
1893 		}
1894 		/* Simply no data available */
1895 		return;
1896 	}
1897 	for (i = 0; i < recvcount; i++) {
1898 		received = msgs[i].msg_len;
1899 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
1900 		if (received == -1) {
1901 			log_msg(LOG_ERR, "recvmmsg %d failed", i);
1902 			STATUP(data->nsd, rxerr);
1903 			/* the error can be found in msgs[i].msg_hdr.msg_flags */
1904 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
1905 			continue;
1906 		}
1907 		q = queries[i];
1908 #else
1909 	for(i=0; i<NUM_RECV_PER_SELECT; i++) {
1910 #endif /* HAVE_RECVMMSG */
1911 #endif /* NONBLOCKING_IS_BROKEN */
1912 
1913 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
1914 		/* Initialize the query... */
1915 		query_reset(q, UDP_MAX_MESSAGE_LEN, 0);
1916 
1917 		received = recvfrom(fd,
1918 				    buffer_begin(q->packet),
1919 				    buffer_remaining(q->packet),
1920 				    0,
1921 				    (struct sockaddr *)&q->addr,
1922 				    &q->addrlen);
1923 		if (received == -1) {
1924 			if (errno != EAGAIN && errno != EINTR) {
1925 				log_msg(LOG_ERR, "recvfrom failed: %s", strerror(errno));
1926 				STATUP(data->nsd, rxerr);
1927 			}
1928 			return;
1929 		}
1930 #endif /* NONBLOCKING_IS_BROKEN || !HAVE_RECVMMSG */
1931 
1932 		/* Account... */
1933 		if (data->socket->addr->ai_family == AF_INET) {
1934 			STATUP(data->nsd, qudp);
1935 		} else if (data->socket->addr->ai_family == AF_INET6) {
1936 			STATUP(data->nsd, qudp6);
1937 		}
1938 
1939 		buffer_skip(q->packet, received);
1940 		buffer_flip(q->packet);
1941 
1942 		/* Process and answer the query... */
1943 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
1944 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
1945 				STATUP(data->nsd, nona);
1946 			}
1947 
1948 			/* Add EDNS0 and TSIG info if necessary.  */
1949 			query_add_optional(q, data->nsd);
1950 
1951 			buffer_flip(q->packet);
1952 
1953 			sent = sendto(fd,
1954 				      buffer_begin(q->packet),
1955 				      buffer_remaining(q->packet),
1956 				      0,
1957 				      (struct sockaddr *) &q->addr,
1958 				      q->addrlen);
1959 			if (sent == -1) {
1960 				const char* es = strerror(errno);
1961 				char a[48];
1962 				addr2str(&q->addr, a, sizeof(a));
1963 				log_msg(LOG_ERR, "sendto %s failed: %s", a, es);
1964 				STATUP(data->nsd, txerr);
1965 			} else if ((size_t) sent != buffer_remaining(q->packet)) {
1966 				log_msg(LOG_ERR, "sent %d in place of %d bytes", sent, (int) buffer_remaining(q->packet));
1967 			} else {
1968 #ifdef BIND8_STATS
1969 				/* Account the rcode & TC... */
1970 				STATUP2(data->nsd, rcode, RCODE(q->packet));
1971 				if (TC(q->packet))
1972 					STATUP(data->nsd, truncated);
1973 #endif /* BIND8_STATS */
1974 			}
1975 		} else {
1976 			STATUP(data->nsd, dropped);
1977 		}
1978 #ifndef NONBLOCKING_IS_BROKEN
1979 #ifdef HAVE_RECVMMSG
1980 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
1981 #endif
1982 	}
1983 #endif
1984 }
1985 #endif /* defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG) */
1986 
1987 
1988 static void
1989 cleanup_tcp_handler(struct tcp_handler_data* data)
1990 {
1991 	event_del(&data->event);
1992 	close(data->event.ev_fd);
1993 
1994 	/*
1995 	 * Enable the TCP accept handlers when the current number of
1996 	 * TCP connections is about to drop below the maximum number
1997 	 * of TCP connections.
1998 	 */
1999 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
2000 		configure_handler_event_types(EV_READ|EV_PERSIST);
2001 		slowaccept = 0;
2002 	}
2003 	--data->nsd->current_tcp_count;
2004 	assert(data->nsd->current_tcp_count >= 0);
2005 
2006 	region_destroy(data->region);
2007 }
2008 
2009 static void
2010 handle_tcp_reading(int fd, short event, void* arg)
2011 {
2012 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
2013 	ssize_t received;
2014 	struct event_base* ev_base;
2015 	struct timeval timeout;
2016 
2017 	if ((event & EV_TIMEOUT)) {
2018 		/* Connection timed out.  */
2019 		cleanup_tcp_handler(data);
2020 		return;
2021 	}
2022 
2023 	if (data->nsd->tcp_query_count > 0 &&
2024 		data->query_count >= data->nsd->tcp_query_count) {
2025 		/* No more queries allowed on this tcp connection.  */
2026 		cleanup_tcp_handler(data);
2027 		return;
2028 	}
2029 
2030 	assert((event & EV_READ));
2031 
2032 	if (data->bytes_transmitted == 0) {
2033 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
2034 	}
2035 
2036 	/*
2037 	 * Check if we received the leading packet length bytes yet.
2038 	 */
2039 	if (data->bytes_transmitted < sizeof(uint16_t)) {
2040 		received = read(fd,
2041 				(char *) &data->query->tcplen
2042 				+ data->bytes_transmitted,
2043 				sizeof(uint16_t) - data->bytes_transmitted);
2044 		if (received == -1) {
2045 			if (errno == EAGAIN || errno == EINTR) {
2046 				/*
2047 				 * Read would block, wait until more
2048 				 * data is available.
2049 				 */
2050 				return;
2051 			} else {
2052 				char buf[48];
2053 				addr2str(&data->query->addr, buf, sizeof(buf));
2054 #ifdef ECONNRESET
2055 				if (verbosity >= 2 || errno != ECONNRESET)
2056 #endif /* ECONNRESET */
2057 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
2058 				cleanup_tcp_handler(data);
2059 				return;
2060 			}
2061 		} else if (received == 0) {
2062 			/* EOF */
2063 			cleanup_tcp_handler(data);
2064 			return;
2065 		}
2066 
2067 		data->bytes_transmitted += received;
2068 		if (data->bytes_transmitted < sizeof(uint16_t)) {
2069 			/*
2070 			 * Not done with the tcplen yet, wait for more
2071 			 * data to become available.
2072 			 */
2073 			return;
2074 		}
2075 
2076 		assert(data->bytes_transmitted == sizeof(uint16_t));
2077 
2078 		data->query->tcplen = ntohs(data->query->tcplen);
2079 
2080 		/*
2081 		 * Minimum query size is:
2082 		 *
2083 		 *     Size of the header (12)
2084 		 *   + Root domain name   (1)
2085 		 *   + Query class        (2)
2086 		 *   + Query type         (2)
2087 		 */
2088 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
2089 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
2090 			cleanup_tcp_handler(data);
2091 			return;
2092 		}
2093 
2094 		if (data->query->tcplen > data->query->maxlen) {
2095 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
2096 			cleanup_tcp_handler(data);
2097 			return;
2098 		}
2099 
2100 		buffer_set_limit(data->query->packet, data->query->tcplen);
2101 	}
2102 
2103 	assert(buffer_remaining(data->query->packet) > 0);
2104 
2105 	/* Read the (remaining) query data.  */
2106 	received = read(fd,
2107 			buffer_current(data->query->packet),
2108 			buffer_remaining(data->query->packet));
2109 	if (received == -1) {
2110 		if (errno == EAGAIN || errno == EINTR) {
2111 			/*
2112 			 * Read would block, wait until more data is
2113 			 * available.
2114 			 */
2115 			return;
2116 		} else {
2117 			char buf[48];
2118 			addr2str(&data->query->addr, buf, sizeof(buf));
2119 #ifdef ECONNRESET
2120 			if (verbosity >= 2 || errno != ECONNRESET)
2121 #endif /* ECONNRESET */
2122 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
2123 			cleanup_tcp_handler(data);
2124 			return;
2125 		}
2126 	} else if (received == 0) {
2127 		/* EOF */
2128 		cleanup_tcp_handler(data);
2129 		return;
2130 	}
2131 
2132 	data->bytes_transmitted += received;
2133 	buffer_skip(data->query->packet, received);
2134 	if (buffer_remaining(data->query->packet) > 0) {
2135 		/*
2136 		 * Message not yet complete, wait for more data to
2137 		 * become available.
2138 		 */
2139 		return;
2140 	}
2141 
2142 	assert(buffer_position(data->query->packet) == data->query->tcplen);
2143 
2144 	/* Account... */
2145 #ifndef INET6
2146 	STATUP(data->nsd, ctcp);
2147 #else
2148 	if (data->query->addr.ss_family == AF_INET) {
2149 		STATUP(data->nsd, ctcp);
2150 	} else if (data->query->addr.ss_family == AF_INET6) {
2151 		STATUP(data->nsd, ctcp6);
2152 	}
2153 #endif
2154 
2155 	/* We have a complete query, process it.  */
2156 
2157 	/* tcp-query-count: handle query counter ++ */
2158 	data->query_count++;
2159 
2160 	buffer_flip(data->query->packet);
2161 	data->query_state = server_process_query(data->nsd, data->query);
2162 	if (data->query_state == QUERY_DISCARDED) {
2163 		/* Drop the packet and the entire connection... */
2164 		STATUP(data->nsd, dropped);
2165 		cleanup_tcp_handler(data);
2166 		return;
2167 	}
2168 
2169 	if (RCODE(data->query->packet) == RCODE_OK
2170 	    && !AA(data->query->packet))
2171 	{
2172 		STATUP(data->nsd, nona);
2173 	}
2174 
2175 	query_add_optional(data->query, data->nsd);
2176 
2177 	/* Switch to the tcp write handler.  */
2178 	buffer_flip(data->query->packet);
2179 	data->query->tcplen = buffer_remaining(data->query->packet);
2180 	data->bytes_transmitted = 0;
2181 
2182 	timeout.tv_sec = data->nsd->tcp_timeout;
2183 	timeout.tv_usec = 0L;
2184 
2185 	ev_base = data->event.ev_base;
2186 	event_del(&data->event);
2187 	event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
2188 		handle_tcp_writing, data);
2189 	if(event_base_set(ev_base, &data->event) != 0)
2190 		log_msg(LOG_ERR, "event base set tcpr failed");
2191 	if(event_add(&data->event, &timeout) != 0)
2192 		log_msg(LOG_ERR, "event add tcpr failed");
2193 	/* see if we can write the answer right away (usually we can; EAGAIN if not) */
2194 	handle_tcp_writing(fd, EV_WRITE, data);
2195 }
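
/*
 * Editorial sketch: handle_tcp_reading() above implements the RFC 1035
 * section 4.2.2 framing, where each DNS message on TCP is preceded by a
 * two-byte network-order length.  The same framing with blocking I/O, for
 * illustration only; read_full() is a hypothetical helper that loops until
 * len bytes have been read or an error occurs:
 */
#if 0
static ssize_t
read_tcp_dns_message_sketch(int fd, uint8_t *buf, size_t bufsize)
{
	uint16_t len;

	if (read_full(fd, &len, sizeof(len)) != sizeof(len))
		return -1;
	len = ntohs(len);	/* wire format is big-endian */
	if (len > bufsize)
		return -1;	/* cf. the tcplen > maxlen check above */
	if (read_full(fd, buf, len) != (ssize_t)len)
		return -1;
	return (ssize_t)len;
}
#endif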
2196 
2197 static void
2198 handle_tcp_writing(int fd, short event, void* arg)
2199 {
2200 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
2201 	ssize_t sent;
2202 	struct query *q = data->query;
2203 	struct timeval timeout;
2204 	struct event_base* ev_base;
2205 
2206 	if ((event & EV_TIMEOUT)) {
2207 		/* Connection timed out.  */
2208 		cleanup_tcp_handler(data);
2209 		return;
2210 	}
2211 
2212 	assert((event & EV_WRITE));
2213 
2214 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
2215 		/* Writing the response packet length.  */
2216 		uint16_t n_tcplen = htons(q->tcplen);
2217 #ifdef HAVE_WRITEV
2218 		struct iovec iov[2];
2219 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
2220 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
2221 		iov[1].iov_base = buffer_begin(q->packet);
2222 		iov[1].iov_len = buffer_limit(q->packet);
2223 		sent = writev(fd, iov, 2);
2224 #else /* HAVE_WRITEV */
2225 		sent = write(fd,
2226 			     (const char *) &n_tcplen + data->bytes_transmitted,
2227 			     sizeof(n_tcplen) - data->bytes_transmitted);
2228 #endif /* HAVE_WRITEV */
2229 		if (sent == -1) {
2230 			if (errno == EAGAIN || errno == EINTR) {
2231 				/*
2232 				 * Write would block, wait until
2233 				 * socket becomes writable again.
2234 				 */
2235 				return;
2236 			} else {
2237 #ifdef ECONNRESET
2238 				if(verbosity >= 2 || errno != ECONNRESET)
2239 #endif /* ECONNRESET */
2240 #ifdef EPIPE
2241 				  if(verbosity >= 2 || errno != EPIPE)
2242 #endif /* EPIPE 'broken pipe' */
2243 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
2244 				cleanup_tcp_handler(data);
2245 				return;
2246 			}
2247 		}
2248 
2249 		data->bytes_transmitted += sent;
2250 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
2251 			/*
2252 			 * Writing not complete, wait until socket
2253 			 * becomes writable again.
2254 			 */
2255 			return;
2256 		}
2257 
2258 #ifdef HAVE_WRITEV
2259 		sent -= sizeof(n_tcplen);
2260 		/* the writev may have sent (part of) the packet body too; check below whether the whole packet is done */
2261 		goto packet_could_be_done;
2262 #endif
2263 	}
2264 
2265 	sent = write(fd,
2266 		     buffer_current(q->packet),
2267 		     buffer_remaining(q->packet));
2268 	if (sent == -1) {
2269 		if (errno == EAGAIN || errno == EINTR) {
2270 			/*
2271 			 * Write would block, wait until
2272 			 * socket becomes writable again.
2273 			 */
2274 			return;
2275 		} else {
2276 #ifdef ECONNRESET
2277 			if(verbosity >= 2 || errno != ECONNRESET)
2278 #endif /* ECONNRESET */
2279 #ifdef EPIPE
2280 			  if(verbosity >= 2 || errno != EPIPE)
2281 #endif /* EPIPE 'broken pipe' */
2282 			    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
2283 			cleanup_tcp_handler(data);
2284 			return;
2285 		}
2286 	}
2287 
2288 	data->bytes_transmitted += sent;
2289 #ifdef HAVE_WRITEV
2290   packet_could_be_done:
2291 #endif
2292 	buffer_skip(q->packet, sent);
2293 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
2294 		/*
2295 		 * Still more data to write when socket becomes
2296 		 * writable again.
2297 		 */
2298 		return;
2299 	}
2300 
2301 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
2302 
2303 	if (data->query_state == QUERY_IN_AXFR) {
2304 		/* Continue processing AXFR and writing back results.  */
2305 		buffer_clear(q->packet);
2306 		data->query_state = query_axfr(data->nsd, q);
2307 		if (data->query_state != QUERY_PROCESSED) {
2308 			query_add_optional(data->query, data->nsd);
2309 
2310 			/* Reset data. */
2311 			buffer_flip(q->packet);
2312 			q->tcplen = buffer_remaining(q->packet);
2313 			data->bytes_transmitted = 0;
2314 			/* Reset timeout.  */
2315 			timeout.tv_sec = data->nsd->tcp_timeout;
2316 			timeout.tv_usec = 0L;
2317 			ev_base = data->event.ev_base;
2318 			event_del(&data->event);
2319 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
2320 				handle_tcp_writing, data);
2321 			if(event_base_set(ev_base, &data->event) != 0)
2322 				log_msg(LOG_ERR, "event base set tcpw failed");
2323 			if(event_add(&data->event, &timeout) != 0)
2324 				log_msg(LOG_ERR, "event add tcpw failed");
2325 
2326 			/*
2327 			 * Write data if/when the socket is writable
2328 			 * again.
2329 			 */
2330 			return;
2331 		}
2332 	}
2333 
2334 	/*
2335 	 * Done sending, wait for the next request to arrive on the
2336 	 * TCP socket by installing the TCP read handler.
2337 	 */
2338 	if (data->nsd->tcp_query_count > 0 &&
2339 		data->query_count >= data->nsd->tcp_query_count) {
2340 
2341 		(void) shutdown(fd, SHUT_WR);
2342 	}
2343 
2344 	data->bytes_transmitted = 0;
2345 
2346 	timeout.tv_sec = data->nsd->tcp_timeout;
2347 	timeout.tv_usec = 0L;
2348 	ev_base = data->event.ev_base;
2349 	event_del(&data->event);
2350 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
2351 		handle_tcp_reading, data);
2352 	if(event_base_set(ev_base, &data->event) != 0)
2353 		log_msg(LOG_ERR, "event base set tcpw failed");
2354 	if(event_add(&data->event, &timeout) != 0)
2355 		log_msg(LOG_ERR, "event add tcpw failed");
2356 }
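
/*
 * Editorial sketch: the HAVE_WRITEV branch above coalesces the two-byte
 * length prefix and the packet body into one writev(2), avoiding a separate
 * small write() and a possible extra TCP segment.  The core pattern, without
 * the partial-write resume logic (function name is illustrative only):
 */
#if 0
static ssize_t
write_tcp_dns_message_sketch(int fd, const uint8_t *msg, uint16_t msglen)
{
	uint16_t n_len = htons(msglen);
	struct iovec iov[2];

	iov[0].iov_base = &n_len;			/* length prefix */
	iov[0].iov_len = sizeof(n_len);
	iov[1].iov_base = (void *)(uintptr_t)msg;	/* message body */
	iov[1].iov_len = msglen;
	/* a short write is possible; a real caller must resume, as above */
	return writev(fd, iov, 2);
}
#endif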
2357 
2358 
2359 static void
2360 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
2361 	void* ATTR_UNUSED(arg))
2362 {
2363 	if(slowaccept) {
2364 		configure_handler_event_types(EV_PERSIST | EV_READ);
2365 		slowaccept = 0;
2366 	}
2367 }
2368 
2369 /*
2370  * Handle an incoming TCP connection.  The connection is accepted and
2371  * a new TCP reader event handler is added.  The TCP handler
2372  * is responsible for cleanup when the connection is closed.
2373  */
2374 static void
2375 handle_tcp_accept(int fd, short event, void* arg)
2376 {
2377 	struct tcp_accept_handler_data *data
2378 		= (struct tcp_accept_handler_data *) arg;
2379 	int s;
2380 	struct tcp_handler_data *tcp_data;
2381 	region_type *tcp_region;
2382 #ifdef INET6
2383 	struct sockaddr_storage addr;
2384 #else
2385 	struct sockaddr_in addr;
2386 #endif
2387 	socklen_t addrlen;
2388 	struct timeval timeout;
2389 
2390 	if (!(event & EV_READ)) {
2391 		return;
2392 	}
2393 
2394 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
2395 		return;
2396 	}
2397 
2398 	/* Accept it... */
2399 	addrlen = sizeof(addr);
2400 	s = accept(fd, (struct sockaddr *) &addr, &addrlen);
2401 	if (s == -1) {
2402 		/**
2403 		 * EMFILE and ENFILE signal that the limit of open file
2404 		 * descriptors has been reached; pause accept() for a while.
2405 		 * EINTR is a signal interrupt.  The other errno values are
2406 		 * various OS ways of saying that the client closed the connection.
2407 		 */
2408 		if (errno == EMFILE || errno == ENFILE) {
2409 			if (!slowaccept) {
2410 				/* disable accept events */
2411 				struct timeval tv;
2412 				configure_handler_event_types(0);
2413 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
2414 				tv.tv_usec = 0L;
2415 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
2416 					handle_slowaccept_timeout, NULL);
2417 				(void)event_base_set(data->event.ev_base,
2418 					&slowaccept_event);
2419 				(void)event_add(&slowaccept_event, &tv);
2420 				slowaccept = 1;
2421 				/* We don't want to spam the logs here */
2422 			}
2423 		} else if (errno != EINTR
2424 			&& errno != EWOULDBLOCK
2425 #ifdef ECONNABORTED
2426 			&& errno != ECONNABORTED
2427 #endif /* ECONNABORTED */
2428 #ifdef EPROTO
2429 			&& errno != EPROTO
2430 #endif /* EPROTO */
2431 			) {
2432 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
2433 		}
2434 		return;
2435 	}
2436 
2437 	if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
2438 		log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
2439 		close(s);
2440 		return;
2441 	}
2442 
2443 	/*
2444 	 * This region is deallocated when the TCP connection is
2445 	 * closed by the TCP handler.
2446 	 */
2447 	tcp_region = region_create(xalloc, free);
2448 	tcp_data = (struct tcp_handler_data *) region_alloc(
2449 		tcp_region, sizeof(struct tcp_handler_data));
2450 	tcp_data->region = tcp_region;
2451 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
2452 		compression_table_size);
2453 	tcp_data->nsd = data->nsd;
2454 	tcp_data->query_count = 0;
2455 
2456 	tcp_data->query_state = QUERY_PROCESSED;
2457 	tcp_data->bytes_transmitted = 0;
2458 	memcpy(&tcp_data->query->addr, &addr, addrlen);
2459 	tcp_data->query->addrlen = addrlen;
2460 
2461 	timeout.tv_sec = data->nsd->tcp_timeout;
2462 	timeout.tv_usec = 0;
2463 
2464 	event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
2465 		handle_tcp_reading, tcp_data);
2466 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0)
2467 		log_msg(LOG_ERR, "cannot set tcp event base");
2468 	if(event_add(&tcp_data->event, &timeout) != 0)
2469 		log_msg(LOG_ERR, "cannot add tcp event");
2470 
2471 	/*
2472 	 * Keep track of the total number of TCP handlers installed so
2473 	 * we can stop accepting connections when the maximum number
2474 	 * of simultaneous TCP connections is reached.
2475 	 */
2476 	++data->nsd->current_tcp_count;
2477 	if (data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
2478 		configure_handler_event_types(0);
2479 	}
2480 }
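
/*
 * Editorial sketch: the EMFILE/ENFILE handling above is a "slow accept"
 * backoff: when descriptors run out, all accept events are removed and a
 * one-shot timer re-arms them after SLOW_ACCEPT_TIMEOUT seconds, instead of
 * spinning on a still-readable listening socket.  Skeleton of the pattern
 * using the same libevent calls; the *_sketch names are illustrative:
 */
#if 0
static struct event pause_timer_sketch;

static void
resume_accept_sketch(int fd, short ev, void *arg)
{
	(void)fd; (void)ev; (void)arg;
	configure_handler_event_types(EV_READ|EV_PERSIST);	/* re-arm accepts */
}

static void
pause_accept_sketch(struct event_base *base)
{
	struct timeval tv;
	tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
	tv.tv_usec = 0L;
	configure_handler_event_types(0);	/* drop all accept events */
	event_set(&pause_timer_sketch, -1, EV_TIMEOUT,
		resume_accept_sketch, NULL);
	(void)event_base_set(base, &pause_timer_sketch);
	(void)event_add(&pause_timer_sketch, &tv);
}
#endif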
2481 
2482 static void
2483 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
2484 {
2485 	size_t i;
2486 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
2487 	for (i = 0; i < nsd->child_count; ++i) {
2488 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
2489 			if (write(nsd->children[i].child_fd,
2490 				&command,
2491 				sizeof(command)) == -1)
2492 			{
2493 				if(errno != EAGAIN && errno != EINTR)
2494 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
2495 					(int) command,
2496 					(int) nsd->children[i].pid,
2497 					strerror(errno));
2498 			} else if (timeout > 0) {
2499 				(void)block_read(NULL,
2500 					nsd->children[i].child_fd,
2501 					&command, sizeof(command), timeout);
2502 			}
2503 			fsync(nsd->children[i].child_fd);
2504 			close(nsd->children[i].child_fd);
2505 			nsd->children[i].child_fd = -1;
2506 		}
2507 	}
2508 }
2509 
2510 static void
2511 send_children_quit(struct nsd* nsd)
2512 {
2513 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
2514 	send_children_command(nsd, NSD_QUIT, 0);
2515 }
2516 
2517 static void
2518 send_children_quit_and_wait(struct nsd* nsd)
2519 {
2520 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
2521 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
2522 }
2523 
2524 #ifdef BIND8_STATS
2525 static void
2526 set_children_stats(struct nsd* nsd)
2527 {
2528 	size_t i;
2529 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
2530 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
2531 	for (i = 0; i < nsd->child_count; ++i) {
2532 		nsd->children[i].need_to_send_STATS = 1;
2533 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
2534 	}
2535 }
2536 #endif /* BIND8_STATS */
2537 
2538 static void
2539 configure_handler_event_types(short event_types)
2540 {
2541 	size_t i;
2542 
2543 	for (i = 0; i < tcp_accept_handler_count; ++i) {
2544 		struct event* handler = &tcp_accept_handlers[i].event;
2545 		if(event_types) {
2546 			/* reassign */
2547 			int fd = handler->ev_fd;
2548 			struct event_base* base = handler->ev_base;
2549 			if(tcp_accept_handlers[i].event_added)
2550 				event_del(handler);
2551 			event_set(handler, fd, event_types,
2552 				handle_tcp_accept, &tcp_accept_handlers[i]);
2553 			if(event_base_set(base, handler) != 0)
2554 				log_msg(LOG_ERR, "conhand: cannot event_base");
2555 			if(event_add(handler, NULL) != 0)
2556 				log_msg(LOG_ERR, "conhand: cannot event_add");
2557 			tcp_accept_handlers[i].event_added = 1;
2558 		} else {
2559 			/* remove */
2560 			if(tcp_accept_handlers[i].event_added) {
2561 				event_del(handler);
2562 				tcp_accept_handlers[i].event_added = 0;
2563 			}
2564 		}
2565 	}
2566 }
2567