/*
 * server.c -- nsd(8) network input/output
 *
 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
 *
 * See LICENSE for the license.
 *
 */

#include "config.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <netinet/in.h>
#include <arpa/inet.h>

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include <netdb.h>
#include <poll.h>
#ifndef SHUT_WR
#define SHUT_WR 1
#endif
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif /* HAVE_MMAP */
#ifdef HAVE_OPENSSL_RAND_H
#include <openssl/rand.h>
#endif
#ifndef USE_MINI_EVENT
#  ifdef HAVE_EVENT_H
#    include <event.h>
#  else
#    include <event2/event.h>
#    include "event2/event_struct.h"
#    include "event2/event_compat.h"
#  endif
#else
#  include "mini_event.h"
#endif

#include "axfr.h"
#include "namedb.h"
#include "netio.h"
#include "xfrd.h"
#include "xfrd-tcp.h"
#include "xfrd-disk.h"
#include "difffile.h"
#include "nsec3.h"
#include "ipc.h"
#include "udb.h"
#include "remote.h"
#include "lookup3.h"
#include "rrl.h"

#define RELOAD_SYNC_TIMEOUT 25 /* seconds */

/*
 * Data for the UDP handlers.
 */
struct udp_handler_data
{
	struct nsd        *nsd;
	struct nsd_socket *socket;
	query_type        *query;
};

struct tcp_accept_handler_data {
	struct nsd         *nsd;
	struct nsd_socket  *socket;
	int event_added;
	struct event       event;
};

/*
 * These globals are used to enable the TCP accept handlers
 * when the number of TCP connections drops below the maximum
 * number of TCP connections.
 */
static size_t		tcp_accept_handler_count;
static struct tcp_accept_handler_data*	tcp_accept_handlers;

static struct event slowaccept_event;
static int slowaccept;

#ifndef NONBLOCKING_IS_BROKEN
#  define NUM_RECV_PER_SELECT 100
#endif

#if (!defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG))
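/* Preallocated arrays for batched UDP receives with recvmmsg(): one
 * message header, one iovec and one query structure per slot, so up
 * to NUM_RECV_PER_SELECT datagrams can be picked up in one call. */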
struct mmsghdr msgs[NUM_RECV_PER_SELECT];
struct iovec iovecs[NUM_RECV_PER_SELECT];
struct query *queries[NUM_RECV_PER_SELECT];
#endif

/*
 * Data for the TCP connection handlers.
 *
 * The TCP handlers use non-blocking I/O.  This is necessary to avoid
 * blocking the entire server on a slow TCP connection, but does make
 * reading from and writing to the socket more complicated.
 *
 * Basically, whenever a read/write would block (indicated by errno
 * being set to EAGAIN) we remember the position we were reading
 * from/writing to and return from the TCP reading/writing event
 * handler.  When the socket becomes readable/writable again we
 * continue from the same position.
 */
struct tcp_handler_data
{
	/*
	 * The region used to allocate all TCP connection related
	 * data, including this structure.  This region is destroyed
	 * when the connection is closed.
	 */
	region_type*		region;

	/*
	 * The global nsd structure.
	 */
	struct nsd*			nsd;

	/*
	 * The current query data for this TCP connection.
	 */
	query_type*			query;

	/*
	 * The query_state is used to remember if we are performing an
	 * AXFR, if we're done processing, or if we should discard the
	 * query and connection.
	 */
	query_state_type	query_state;

	/*
	 * The event for the file descriptor and tcp timeout
	 */
	struct event event;

	/*
	 * The bytes_transmitted field is used to remember the number
	 * of bytes transmitted when receiving or sending a DNS
	 * packet.  The count includes the two additional bytes used
	 * to specify the packet length on a TCP connection.
	 */
	size_t				bytes_transmitted;

	/*
	 * The number of queries handled by this specific TCP connection.
	 */
	int					query_count;

	/*
	 * The timeout in msec for this tcp connection
	 */
	int	tcp_timeout;
};
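
/* Note: DNS over TCP prefixes each message with a two-octet length
 * field (RFC 1035, section 4.2.2).  For example, a 300 (0x012c) byte
 * response is sent as the bytes 0x01 0x2c followed by the message,
 * and bytes_transmitted above counts all 302 bytes. */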

/*
 * Handle incoming queries on the UDP server sockets.
 */
static void handle_udp(int fd, short event, void* arg);

/*
 * Handle incoming connections on the TCP sockets.  These handlers
 * usually wait for the NETIO_EVENT_READ event (indicating an incoming
 * connection) but are disabled when the number of current TCP
 * connections is equal to the maximum number of TCP connections.
 * Disabling is done by changing the handler to wait for the
 * NETIO_EVENT_NONE type.  This is done using the function
 * configure_tcp_accept_handlers.
 */
static void handle_tcp_accept(int fd, short event, void* arg);

/*
 * Handle incoming queries on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete query is received.
 */
static void handle_tcp_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete response is sent.
 */
static void handle_tcp_writing(int fd, short event, void* arg);

/*
 * Send all children the quit command (nonblocking), then close the pipes.
 */
static void send_children_quit(struct nsd* nsd);
/* same, for shutdown time, waits for children to exit to avoid restart issues */
static void send_children_quit_and_wait(struct nsd* nsd);

/* set children's flags to send NSD_STATS to them */
#ifdef BIND8_STATS
static void set_children_stats(struct nsd* nsd);
#endif /* BIND8_STATS */

/*
 * Change the event types the HANDLERS are interested in to EVENT_TYPES.
 */
static void configure_handler_event_types(short event_types);

static uint16_t *compressed_dname_offsets = 0;
static uint32_t compression_table_capacity = 0;
static uint32_t compression_table_size = 0;

/*
 * Remove the specified pid from the list of child pids.  Returns the
 * child number and zeroes the stored pid, or -1 if the pid is not in
 * the list.
 */
static int
delete_child_pid(struct nsd *nsd, pid_t pid)
{
	size_t i;
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid == pid) {
			nsd->children[i].pid = 0;
			if(!nsd->children[i].need_to_exit) {
				if(nsd->children[i].child_fd != -1)
					close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
				if(nsd->children[i].handler)
					nsd->children[i].handler->fd = -1;
			}
			return i;
		}
	}
	return -1;
}

/*
 * Restart child servers if necessary.
 */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	struct main_ipc_handler_data *ipc_data;
	size_t i;
	int sv[2];

	/* Fork the child processes... */
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid <= 0) {
			if (nsd->children[i].child_fd != -1)
				close(nsd->children[i].child_fd);
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
				log_msg(LOG_ERR, "socketpair: %s",
					strerror(errno));
				return -1;
			}
			nsd->children[i].child_fd = sv[0];
			nsd->children[i].parent_fd = sv[1];
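			/* sv[0] stays in the server main process as
			 * child_fd, sv[1] goes to the child process as
			 * parent_fd; IPC commands travel over this pair */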
			nsd->children[i].pid = fork();
			switch (nsd->children[i].pid) {
			default: /* SERVER MAIN */
				close(nsd->children[i].parent_fd);
				nsd->children[i].parent_fd = -1;
				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				if(!nsd->children[i].handler)
				{
					ipc_data = (struct main_ipc_handler_data*) region_alloc(
						region, sizeof(struct main_ipc_handler_data));
					ipc_data->nsd = nsd;
					ipc_data->child = &nsd->children[i];
					ipc_data->child_num = i;
					ipc_data->xfrd_sock = xfrd_sock_p;
					ipc_data->packet = buffer_create(region, QIOBUFSZ);
					ipc_data->forward_mode = 0;
					ipc_data->got_bytes = 0;
					ipc_data->total_bytes = 0;
					ipc_data->acl_num = 0;
					nsd->children[i].handler = (struct netio_handler*) region_alloc(
						region, sizeof(struct netio_handler));
					nsd->children[i].handler->fd = nsd->children[i].child_fd;
					nsd->children[i].handler->timeout = NULL;
					nsd->children[i].handler->user_data = ipc_data;
					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
					nsd->children[i].handler->event_handler = parent_handle_child_command;
					netio_add_handler(netio, nsd->children[i].handler);
				}
				/* clear any ongoing ipc */
				ipc_data = (struct main_ipc_handler_data*)
					nsd->children[i].handler->user_data;
				ipc_data->forward_mode = 0;
				/* restart - update fd */
				nsd->children[i].handler->fd = nsd->children[i].child_fd;
				break;
			case 0: /* CHILD */
				/* the child need not be able to access the
				 * nsd.db file */
				namedb_close_udb(nsd->db);

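				/* OpenBSD: restrict this query-serving
				 * child to stdio, file reads and
				 * network I/O */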
				if (pledge("stdio rpath inet", NULL) == -1) {
					log_msg(LOG_ERR, "pledge");
					exit(1);
				}

				nsd->pid = 0;
				nsd->child_count = 0;
				nsd->server_kind = nsd->children[i].kind;
				nsd->this_child = &nsd->children[i];
				nsd->this_child->child_num = i;
				/* remove signal flags inherited from parent;
				   the parent will handle them. */
				nsd->signal_hint_reload_hup = 0;
				nsd->signal_hint_reload = 0;
				nsd->signal_hint_child = 0;
				nsd->signal_hint_quit = 0;
				nsd->signal_hint_shutdown = 0;
				nsd->signal_hint_stats = 0;
				nsd->signal_hint_statsusr = 0;
				close(*xfrd_sock_p);
				close(nsd->this_child->child_fd);
				nsd->this_child->child_fd = -1;
				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				server_child(nsd);
				/* NOTREACH */
				exit(0);
			case -1:
				log_msg(LOG_ERR, "fork failed: %s",
					strerror(errno));
				return -1;
			}
		}
	}
	return 0;
}

#ifdef BIND8_STATS
static void set_bind8_alarm(struct nsd* nsd)
{
	/* resync so that the next alarm is on the next whole minute */
	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
}
#endif

/* set zone stat ids for zones initially read in */
static void
zonestatid_tree_set(struct nsd* nsd)
{
	struct radnode* n;
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		zone_type* zone = (zone_type*)n->elem;
		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
	}
}

#ifdef USE_ZONE_STATS
void
server_zonestat_alloc(struct nsd* nsd)
{
	size_t num = (nsd->options->zonestatnames->count==0?1:
			nsd->options->zonestatnames->count);
	size_t sz = sizeof(struct nsdst)*num;
	char tmpfile[256];
	uint8_t z = 0;

	/* file names */
	nsd->zonestatfname[0] = 0;
	nsd->zonestatfname[1] = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);

	/* file descriptors */
	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[0] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		close(nsd->zonestatfd[0]);
		unlink(nsd->zonestatfname[0]);
		exit(1);
	}

#ifdef HAVE_MMAP
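	/* Extend each file to sz bytes by seeking to the last byte and
	 * writing a single zero, so the mmap()s below have backing store */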
	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[0], strerror(errno));
		exit(1);
	}
	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[1], strerror(errno));
		exit(1);
	}
	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[0], 0);
	if(nsd->zonestat[0] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[1], 0);
	if(nsd->zonestat[1] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->zonestat[0], 0, sz);
	memset(nsd->zonestat[1], 0, sz);
	nsd->zonestatsize[0] = num;
	nsd->zonestatsize[1] = num;
	nsd->zonestatdesired = num;
	nsd->zonestatsizenow = num;
	nsd->zonestatnow = nsd->zonestat[0];
#endif /* HAVE_MMAP */
}

void
zonestat_remap(struct nsd* nsd, int idx, size_t sz)
{
#ifdef HAVE_MMAP
#ifdef MREMAP_MAYMOVE
	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
		MREMAP_MAYMOVE);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
		exit(1);
	}
#else /* !MREMAP_MAYMOVE */
	if(msync(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
	if(munmap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		exit(1);
	}
#endif /* MREMAP_MAYMOVE */
#endif /* HAVE_MMAP */
}

/* realloc the zonestat array for the one that is not currently in use,
 * to match the desired new size of the array (if applicable) */
void
server_zonestat_realloc(struct nsd* nsd)
{
#ifdef HAVE_MMAP
	uint8_t z = 0;
	size_t sz;
	int idx = 0; /* index of the zonestat array that is not in use */
	if(nsd->zonestatnow == nsd->zonestat[0])
		idx = 1;
	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
		return;
	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[idx], strerror(errno));
		exit(1);
	}
	zonestat_remap(nsd, idx, sz);
	/* zero the newly allocated region */
	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
	}
	nsd->zonestatsize[idx] = nsd->zonestatdesired;
#endif /* HAVE_MMAP */
}

/* switch over to the other array for the new children, which briefly
 * coexist with the old children; this avoids both sets of children
 * writing to the same statistics array. */
void
server_zonestat_switch(struct nsd* nsd)
{
	if(nsd->zonestatnow == nsd->zonestat[0]) {
		nsd->zonestatnow = nsd->zonestat[1];
		nsd->zonestatsizenow = nsd->zonestatsize[1];
	} else {
		nsd->zonestatnow = nsd->zonestat[0];
		nsd->zonestatsizenow = nsd->zonestatsize[0];
	}
}
#endif /* USE_ZONE_STATS */

static void
cleanup_dname_compression_tables(void *ptr)
{
	free(ptr);
	compressed_dname_offsets = NULL;
	compression_table_capacity = 0;
}

static void
initialize_dname_compression_tables(struct nsd *nsd)
{
	size_t needed = domain_table_count(nsd->db->domains) + 1;
	needed += EXTRA_DOMAIN_NUMBERS;
	if(compression_table_capacity < needed) {
		if(compressed_dname_offsets) {
			region_remove_cleanup(nsd->db->region,
				cleanup_dname_compression_tables,
				compressed_dname_offsets);
			free(compressed_dname_offsets);
		}
		compressed_dname_offsets = (uint16_t *) xmallocarray(
			needed, sizeof(uint16_t));
		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
			compressed_dname_offsets);
		compression_table_capacity = needed;
		compression_table_size=domain_table_count(nsd->db->domains)+1;
	}
	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
}

/* create and bind sockets.  */
static int
server_init_ifs(struct nsd *nsd, size_t from, size_t to, int* reuseport_works)
{
	struct addrinfo* addr;
	size_t i;
#if defined(SO_REUSEPORT) || defined(SO_REUSEADDR) || (defined(INET6) && (defined(IPV6_V6ONLY) || defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU) || defined(IP_TRANSPARENT)) || defined(IP_FREEBIND) || defined(SO_BINDANY))
	int on = 1;
#endif

	/* UDP */

	/* Make a socket... */
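	/* [from, to) covers one reuseport copy of the interfaces;
	 * entries at index >= nsd->ifs reuse the address specs of the
	 * first nsd->ifs entries (see server_init) */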
	for (i = from; i < to; i++) {
		/* for reuseports copy socket specs of first entries */
		addr = nsd->udp[i%nsd->ifs].addr;
		if (!addr) {
			nsd->udp[i].s = -1;
			continue;
		}
		nsd->udp[i].fam = (int)addr->ai_family;
		if ((nsd->udp[i].s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
#if defined(INET6)
			if (addr->ai_family == AF_INET6 &&
				errno == EAFNOSUPPORT && nsd->grab_ip6_optional) {
				log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: not supported");
				continue;
			}
#endif /* INET6 */
			log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
			return -1;
		}

#ifdef SO_REUSEPORT
		if(nsd->reuseport && *reuseport_works &&
			setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_REUSEPORT,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			if(verbosity >= 3
#ifdef ENOPROTOOPT
				|| errno != ENOPROTOOPT
#endif
				)
			    log_msg(LOG_ERR, "setsockopt(..., SO_REUSEPORT, "
				"...) failed: %s", strerror(errno));
			*reuseport_works = 0;
		}
#else
		(void)reuseport_works;
#endif /* SO_REUSEPORT */
#if defined(SO_RCVBUF) || defined(SO_SNDBUF)
	if(1) {
	int rcv = 1*1024*1024;
	int snd = 1*1024*1024;

#ifdef SO_RCVBUF
#  ifdef SO_RCVBUFFORCE
	if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv,
		(socklen_t)sizeof(rcv)) < 0) {
		if(errno != EPERM && errno != ENOBUFS) {
			log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, "
                                        "...) failed: %s", strerror(errno));
			return -1;
		}
#  else
	if(1) {
#  endif /* SO_RCVBUFFORCE */
		if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv,
			 (socklen_t)sizeof(rcv)) < 0) {
			if(errno != ENOBUFS && errno != ENOSYS) {
				log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, "
                                        "...) failed: %s", strerror(errno));
				return -1;
			}
		}
	}
#endif /* SO_RCVBUF */

#ifdef SO_SNDBUF
#  ifdef SO_SNDBUFFORCE
	if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd,
		(socklen_t)sizeof(snd)) < 0) {
		if(errno != EPERM && errno != ENOBUFS) {
			log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, "
                                        "...) failed: %s", strerror(errno));
			return -1;
		}
#  else
	if(1) {
#  endif /* SO_SNDBUFFORCE */
		if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_SNDBUF, (void*)&snd,
			 (socklen_t)sizeof(snd)) < 0) {
			if(errno != ENOBUFS && errno != ENOSYS) {
				log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, "
                                        "...) failed: %s", strerror(errno));
				return -1;
			}
		}
	}
#endif /* SO_SNDBUF */

	}
#endif /* defined(SO_RCVBUF) || defined(SO_SNDBUF) */

#if defined(INET6)
		if (addr->ai_family == AF_INET6) {
# if defined(IPV6_V6ONLY)
			if (setsockopt(nsd->udp[i].s,
				       IPPROTO_IPV6, IPV6_V6ONLY,
				       &on, sizeof(on)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
					strerror(errno));
				return -1;
			}
# endif
# if defined(IPV6_USE_MIN_MTU)
			/*
			 * There is no fragmentation of IPv6 datagrams
			 * during forwarding in the network. Therefore
			 * we do not send UDP datagrams larger than
			 * the minimum IPv6 MTU of 1280 octets. The
			 * EDNS0 message length can be larger if the
			 * network stack supports IPV6_USE_MIN_MTU.
			 */
			if (setsockopt(nsd->udp[i].s,
				       IPPROTO_IPV6, IPV6_USE_MIN_MTU,
				       &on, sizeof(on)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IPV6_USE_MIN_MTU, ...) failed: %s",
					strerror(errno));
				return -1;
			}
# elif defined(IPV6_MTU)
			/*
			 * On Linux, PMTUD is disabled by default for datagrams
			 * so set the MTU equal to the MIN MTU to get the same.
			 */
			on = IPV6_MIN_MTU;
			if (setsockopt(nsd->udp[i].s, IPPROTO_IPV6, IPV6_MTU,
				&on, sizeof(on)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IPV6_MTU, ...) failed: %s",
					strerror(errno));
				return -1;
			}
			on = 1;
# endif
		}
#endif
#if defined(AF_INET)
		if (addr->ai_family == AF_INET) {
#  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
			int action = IP_PMTUDISC_DONT;
			if (setsockopt(nsd->udp[i].s, IPPROTO_IP,
				IP_MTU_DISCOVER, &action, sizeof(action)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
					strerror(errno));
				return -1;
			}
#  elif defined(IP_DONTFRAG)
			int off = 0;
			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_DONTFRAG,
				&off, sizeof(off)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
					strerror(errno));
				return -1;
			}
#  endif
		}
#endif
		/* set it nonblocking */
		/* otherwise, on OSes with thundering herd problems, the
		   UDP recv could block NSD after select returns readable. */
		if (fcntl(nsd->udp[i].s, F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl udp: %s", strerror(errno));
		}

		/* Bind it... */
		if (nsd->options->ip_freebind) {
#ifdef IP_FREEBIND
			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(...,IP_FREEBIND, ...) failed for udp: %s",
					strerror(errno));
			}
#endif /* IP_FREEBIND */
		}

		if (nsd->options->ip_transparent) {
#ifdef IP_TRANSPARENT
			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(...,IP_TRANSPARENT, ...) failed for udp: %s",
					strerror(errno));
			}
#endif /* IP_TRANSPARENT */
#ifdef SO_BINDANY
			if (setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_BINDANY, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(...,SO_BINDANY, ...) failed for udp: %s",
					strerror(errno));
			}
#endif /* SO_BINDANY */
		}

		if (bind(nsd->udp[i].s, (struct sockaddr *) addr->ai_addr, addr->ai_addrlen) != 0) {
			log_msg(LOG_ERR, "can't bind udp socket: %s", strerror(errno));
			return -1;
		}
	}

	/* TCP */

	/* Make a socket... */
	for (i = from; i < to; i++) {
		/* for reuseports copy socket specs of first entries */
		addr = nsd->tcp[i%nsd->ifs].addr;
		if (!addr) {
			nsd->tcp[i].s = -1;
			continue;
		}
		nsd->tcp[i].fam = (int)addr->ai_family;
		/* turn off REUSEPORT for TCP by copying the socket fd */
		if(i >= nsd->ifs) {
			nsd->tcp[i].s = nsd->tcp[i%nsd->ifs].s;
			continue;
		}
		if ((nsd->tcp[i].s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
#if defined(INET6)
			if (addr->ai_family == AF_INET6 &&
				errno == EAFNOSUPPORT && nsd->grab_ip6_optional) {
				log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: not supported");
				continue;
			}
#endif /* INET6 */
			log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
			return -1;
		}

#ifdef SO_REUSEPORT
		if(nsd->reuseport && *reuseport_works &&
			setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_REUSEPORT,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			if(verbosity >= 3
#ifdef ENOPROTOOPT
				|| errno != ENOPROTOOPT
#endif
				)
			    log_msg(LOG_ERR, "setsockopt(..., SO_REUSEPORT, "
				"...) failed: %s", strerror(errno));
			*reuseport_works = 0;
		}
#endif /* SO_REUSEPORT */
#ifdef	SO_REUSEADDR
		if (setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0) {
			log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s", strerror(errno));
		}
#endif /* SO_REUSEADDR */

#if defined(INET6)
		if (addr->ai_family == AF_INET6) {
# if defined(IPV6_V6ONLY)
			if (setsockopt(nsd->tcp[i].s, IPPROTO_IPV6, IPV6_V6ONLY,
				&on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed: %s", strerror(errno));
				return -1;
			}
# endif
# if defined(IPV6_USE_MIN_MTU)
			/*
			 * Use minimum MTU to minimize delays learning working
			 * PMTU when communicating through a tunnel.
			 */
			if (setsockopt(nsd->tcp[i].s,
				       IPPROTO_IPV6, IPV6_USE_MIN_MTU,
				       &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(..., IPV6_USE_MIN_MTU, ...) failed: %s", strerror(errno));
				return -1;
			}
# elif defined(IPV6_MTU)
			/*
			 * On Linux, PMTUD is disabled by default for datagrams
			 * so set the MTU equal to the MIN MTU to get the same.
			 */
			on = IPV6_MIN_MTU;
			if (setsockopt(nsd->tcp[i].s, IPPROTO_IPV6, IPV6_MTU,
				&on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(..., IPV6_MTU, ...) failed: %s", strerror(errno));
				return -1;
			}
			on = 1;
# endif
		}
#endif
		/* set maximum segment size to tcp socket */
		if(nsd->tcp_mss > 0) {
#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
			if(setsockopt(nsd->tcp[i].s, IPPROTO_TCP, TCP_MAXSEG,
					(void*)&nsd->tcp_mss,
					sizeof(nsd->tcp_mss)) < 0) {
				log_msg(LOG_ERR,
					"setsockopt(...,TCP_MAXSEG,...)"
					" failed for tcp: %s", strerror(errno));
			}
#else
			log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
#endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
		}

		/* set it nonblocking */
		/* (StevensUNP p463), if tcp listening socket is blocking, then
		   it may block in accept, even if select() says readable. */
		if (fcntl(nsd->tcp[i].s, F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl tcp: %s", strerror(errno));
		}

		/* Bind it... */
		if (nsd->options->ip_freebind) {
#ifdef IP_FREEBIND
			if (setsockopt(nsd->tcp[i].s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(...,IP_FREEBIND, ...) failed for tcp: %s",
					strerror(errno));
			}
#endif /* IP_FREEBIND */
		}

		if (nsd->options->ip_transparent) {
#ifdef IP_TRANSPARENT
			if (setsockopt(nsd->tcp[i].s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(...,IP_TRANSPARENT, ...) failed for tcp: %s",
					strerror(errno));
			}
#endif /* IP_TRANSPARENT */
#ifdef SO_BINDANY
			if (setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_BINDANY, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(...,SO_BINDANY, ...) failed for tcp: %s",
					strerror(errno));
			}
#endif /* SO_BINDANY */
		}

		if (bind(nsd->tcp[i].s, (struct sockaddr *) addr->ai_addr, addr->ai_addrlen) != 0) {
			log_msg(LOG_ERR, "can't bind tcp socket: %s", strerror(errno));
			return -1;
		}

		/* Listen to it... */
		if (listen(nsd->tcp[i].s, TCP_BACKLOG) == -1) {
			log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
			return -1;
		}
	}

	return 0;
}

/*
 * Initialize the server: set up reuseport if configured, then create
 * and bind the sockets.
 */
int
server_init(struct nsd *nsd)
{
	int reuseport_successful = 1; /* see if reuseport works in OS */
	if(nsd->reuseport) {
		/* increase the size of the udp and tcp interface arrays,
		 * there are going to be separate interface file descriptors
		 * for every server instance */
		nsd->udp = xrealloc(nsd->udp, (nsd->ifs*nsd->reuseport)*
			sizeof(*nsd->udp));
		nsd->tcp = xrealloc(nsd->tcp, (nsd->ifs*nsd->reuseport)*
			sizeof(*nsd->tcp));
		memset(&nsd->udp[nsd->ifs], 0, sizeof(*nsd->udp)*
			(nsd->ifs*(nsd->reuseport-1)));
		memset(&nsd->tcp[nsd->ifs], 0, sizeof(*nsd->tcp)*
			(nsd->ifs*(nsd->reuseport-1)));
	}

	/* open the server interface ports */
	if(server_init_ifs(nsd, 0, nsd->ifs, &reuseport_successful) == -1)
		return -1;

	/* continue to open the remaining reuseport ports */
	if(nsd->reuseport && reuseport_successful) {
		if(server_init_ifs(nsd, nsd->ifs, nsd->ifs*nsd->reuseport,
			&reuseport_successful) == -1)
			return -1;
		nsd->ifs *= nsd->reuseport;
	} else {
		nsd->reuseport = 0;
	}
	return 0;
}

/*
 * Prepare the server for take off.
 *
 */
int
server_prepare(struct nsd *nsd)
{
#ifdef RATELIMIT
	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
#ifdef HAVE_ARC4RANDOM
	hash_set_raninit(arc4random());
#else
	uint32_t v = getpid() ^ time(NULL);
	srandom((unsigned long)v);
	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
		hash_set_raninit(v);
	else	hash_set_raninit(random());
#endif
	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
		nsd->options->rrl_ratelimit,
		nsd->options->rrl_whitelist_ratelimit,
		nsd->options->rrl_slip,
		nsd->options->rrl_ipv4_prefix_length,
		nsd->options->rrl_ipv6_prefix_length);
#endif /* RATELIMIT */

	/* Open the database... */
	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
		log_msg(LOG_ERR, "unable to open the database %s: %s",
			nsd->dbfile, strerror(errno));
		unlink(nsd->task[0]->fname);
		unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		return -1;
	}
	/* check if zone files have been modified */
	/* NULL for taskudb because we send soainfo in a moment, batched up,
	 * for all zones */
	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
		nsd->options->database[0] == 0))
		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
	zonestatid_tree_set(nsd);

	compression_table_capacity = 0;
	initialize_dname_compression_tables(nsd);

#ifdef	BIND8_STATS
	/* Initialize times... */
	time(&nsd->st.boot);
	set_bind8_alarm(nsd);
#endif /* BIND8_STATS */

	return 0;
}

/*
 * Fork the required number of servers.
 */
static int
server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	size_t i;

	/* Start all child servers initially.  */
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].pid = 0;
	}

	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
}

void
server_close_all_sockets(struct nsd_socket sockets[], size_t n)
{
	size_t i;

	/* Close all the sockets... */
	for (i = 0; i < n; ++i) {
		if (sockets[i].s != -1) {
			close(sockets[i].s);
			if(sockets[i].addr)
				freeaddrinfo(sockets[i].addr);
			sockets[i].s = -1;
		}
	}
}

/*
 * Close the sockets, shutdown the server and exit.
 * Does not return.
 *
 */
void
server_shutdown(struct nsd *nsd)
{
	size_t i;

	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	/* CHILD: close command channel to parent */
	if(nsd->this_child && nsd->this_child->parent_fd != -1)
	{
		close(nsd->this_child->parent_fd);
		nsd->this_child->parent_fd = -1;
	}
	/* SERVER: close command channels to children */
	if(!nsd->this_child)
	{
		for(i=0; i < nsd->child_count; ++i)
			if(nsd->children[i].child_fd != -1)
			{
				close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
			}
	}

	tsig_finalize();
#ifdef HAVE_SSL
	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
#endif

#if 0 /* OS collects memory pages */
	nsd_options_destroy(nsd->options);
	region_destroy(nsd->region);
#endif
	log_finalize();
	exit(0);
}

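/*
 * Create the two task udb files used to exchange work with the xfrd
 * process, and allocate the xfrd IPC listener structure.
 */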
void
server_prepare_xfrd(struct nsd* nsd)
{
	char tmpfile[256];
	/* create task mmaps */
	nsd->mytask = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[0] = task_file_create(tmpfile);
	if(!nsd->task[0]) {
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[1] = task_file_create(tmpfile);
	if(!nsd->task[1]) {
		unlink(nsd->task[0]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
	/* create xfrd listener structure */
	nsd->xfrd_listener = region_alloc(nsd->region,
		sizeof(netio_handler_type));
	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
	nsd->xfrd_listener->fd = -1;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
		nsd;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
		xfrd_tcp_create(nsd->region, QIOBUFSZ);
}

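/*
 * Fork the xfrd process and connect it to this process with a
 * socketpair; with del_db the task udb that xfrd was using is
 * recreated, because it may be corrupt after a crash.
 */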
void
server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
{
	pid_t pid;
	int sockets[2] = {0,0};
	struct ipc_handler_conn_data *data;

	if(nsd->xfrd_listener->fd != -1)
		close(nsd->xfrd_listener->fd);
	if(del_db) {
		/* recreate taskdb that xfrd was using, it may be corrupt */
		/* we (or reload) use nsd->mytask, and xfrd uses the other */
		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
		nsd->task[1-nsd->mytask]->fname = NULL;
		/* free alloc already, so udb does not shrink itself */
		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
		nsd->task[1-nsd->mytask]->alloc = NULL;
		udb_base_free(nsd->task[1-nsd->mytask]);
		/* create new file, overwrite the old one */
		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
		free(tmpfile);
	}
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
		return;
	}
	pid = fork();
	switch (pid) {
	case -1:
		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
		break;
	default:
		/* PARENT: close first socket, use second one */
		close(sockets[0]);
		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		if(del_db) xfrd_free_namedb(nsd);
		/* use other task than I am using, since if xfrd died and is
		 * restarted, the reload is using nsd->mytask */
		nsd->mytask = 1 - nsd->mytask;
		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
		/* ENOTREACH */
		break;
	case 0:
		/* CHILD: close second socket, use first one */
		close(sockets[1]);
		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		nsd->xfrd_listener->fd = sockets[0];
		break;
	}
	/* server-parent only */
	nsd->xfrd_listener->timeout = NULL;
	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
	/* clear ongoing ipc reads */
	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
	data->conn->is_reading = 0;
}

/** add all soainfo to taskdb */
static void
add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
{
	struct radnode* n;
	udb_ptr task_last; /* last task, mytask is empty so NULL */
	/* add all SOA INFO to mytask */
	udb_ptr_init(&task_last, taskudb);
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
	}
	udb_ptr_unlink(&task_last, taskudb);
}

void
server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
{
	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
	 *   parent fills one taskdb with soas, xfrd fills other with expires.
	 *   then they exchange and process.
	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
	 *   expire notifications can be sent back via a normal reload later
	 *   (xfrd will wait for current running reload to finish if any).
	 */
	sig_atomic_t cmd = 0;
	pid_t mypid;
	int xfrd_sock = nsd->xfrd_listener->fd;
	struct udb_base* taskudb = nsd->task[nsd->mytask];
	udb_ptr t;
	if(!shortsoa) {
		if(nsd->signal_hint_shutdown) {
		shutdown:
			log_msg(LOG_WARNING, "signal received, shutting down...");
			server_close_all_sockets(nsd->udp, nsd->ifs);
			server_close_all_sockets(nsd->tcp, nsd->ifs);
#ifdef HAVE_SSL
			daemon_remote_close(nsd->rc);
#endif
			/* Unlink it if possible... */
			unlinkpid(nsd->pidfile);
			unlink(nsd->task[0]->fname);
			unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
			unlink(nsd->zonestatfname[0]);
			unlink(nsd->zonestatfname[1]);
#endif
			/* write the nsd.db to disk, wait for it to complete */
			udb_base_sync(nsd->db->udb, 1);
			udb_base_close(nsd->db->udb);
			server_shutdown(nsd);
			exit(0);
		}
	}
	if(shortsoa) {
		/* put SOA in xfrd task because mytask may be in use */
		taskudb = nsd->task[1-nsd->mytask];
	}

	add_all_soa_to_task(nsd, taskudb);
	if(!shortsoa) {
		/* wait for xfrd to signal task is ready, RELOAD signal */
		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
			cmd != NSD_RELOAD) {
			log_msg(LOG_ERR, "did not get start signal from xfrd");
			exit(1);
		}
		if(nsd->signal_hint_shutdown) {
			goto shutdown;
		}
	}
	/* give xfrd our task, signal it with RELOAD_DONE */
	task_process_sync(taskudb);
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
			(int)nsd->pid, strerror(errno));
	}
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	if(!shortsoa) {
		/* process the tasks that xfrd wrote (expiry data) */
		nsd->mytask = 1 - nsd->mytask;
		taskudb = nsd->task[nsd->mytask];
		task_remap(taskudb);
		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
		while(!udb_ptr_is_null(&t)) {
			task_process_expire(nsd->db, TASKLIST(&t));
			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
		}
		udb_ptr_unlink(&t, taskudb);
		task_clear(taskudb);

		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
		cmd = NSD_RELOAD_DONE;
		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
				(int)nsd->pid, strerror(errno));
		}
	}
}

/* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
ssize_t
block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
{
	uint8_t* buf = (uint8_t*) p;
	ssize_t total = 0;
	struct pollfd fd;
	memset(&fd, 0, sizeof(fd));
	fd.fd = s;
	fd.events = POLLIN;

	while( total < sz) {
		ssize_t ret;
		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
		if(ret == -1) {
			if(errno == EAGAIN)
				/* blocking read */
				continue;
			if(errno == EINTR) {
				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
					return -1;
				/* other signals can be handled later */
				continue;
			}
			/* some error */
			return -1;
		}
		if(ret == 0) {
			/* operation timed out */
			return -2;
		}
		ret = read(s, buf+total, sz-total);
		if(ret == -1) {
			if(errno == EAGAIN)
				/* blocking read */
				continue;
			if(errno == EINTR) {
				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
					return -1;
				/* other signals can be handled later */
				continue;
			}
			/* some error */
			return -1;
		}
		if(ret == 0) {
			/* closed connection! */
			return 0;
		}
		total += ret;
	}
	return total;
}
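
/*
 * Example (sketch) of the block_read return convention, as used by
 * the callers below:
 *
 *	sig_atomic_t cmd;
 *	if(block_read(nsd, fd, &cmd, sizeof(cmd), RELOAD_SYNC_TIMEOUT)
 *		!= sizeof(cmd))
 *		... handle timeout (-2), close (0) or error (-1) ...
 */

/*
 * Process the tasks that xfrd put in the task udb: apply each task to
 * the database and append results after last_task.  The command socket
 * is polled so that an NSD_QUIT from the main process ends the reload.
 */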
static void
reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
{
	sig_atomic_t cmd = NSD_QUIT_SYNC;
	udb_ptr t, next;
	udb_base* u = nsd->task[nsd->mytask];
	udb_ptr_init(&next, u);
	udb_ptr_new(&t, u, udb_base_get_userdata(u));
	udb_base_set_userdata(u, 0);
	while(!udb_ptr_is_null(&t)) {
		/* store next in list so this one can be deleted or reused */
		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
		udb_rptr_zero(&TASKLIST(&t)->next, u);

		/* process task t */
		/* append results for task t and update last_task */
		task_process_in_reload(nsd, u, last_task, &t);

		/* go to next */
		udb_ptr_set_ptr(&t, u, &next);

		/* if the parent has quit, we must quit too, poll the fd for cmds */
		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
			if(cmd == NSD_QUIT) {
				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
				/* sync to disk (if needed) */
				udb_base_sync(nsd->db->udb, 0);
				/* unlink files of remainder of tasks */
				while(!udb_ptr_is_null(&t)) {
					if(TASKLIST(&t)->task_type == task_apply_xfr) {
						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
					}
					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
				}
				udb_ptr_unlink(&t, u);
				udb_ptr_unlink(&next, u);
				exit(0);
			}
		}
	}
	udb_ptr_unlink(&t, u);
	udb_ptr_unlink(&next, u);
}

#ifdef BIND8_STATS
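/* Send the global stats block, then each child's query counter, over
 * the command socket to the reload process. */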
static void
parent_send_stats(struct nsd* nsd, int cmdfd)
{
	size_t i;
	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
		log_msg(LOG_ERR, "could not write stats to reload");
		return;
	}
	for(i=0; i<nsd->child_count; i++)
		if(!write_socket(cmdfd, &nsd->children[i].query_count,
			sizeof(stc_type))) {
			log_msg(LOG_ERR, "could not write stats to reload");
			return;
		}
}

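/* Read the statistics sent by the old server main, fill in the
 * database sizes, and append them to the task list for xfrd. */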
static void
reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
{
	struct nsdst s;
	stc_type* p;
	size_t i;
	if(block_read(nsd, cmdfd, &s, sizeof(s),
		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
		log_msg(LOG_ERR, "could not read stats from oldpar");
		return;
	}
	s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
	s.db_mem = region_get_mem(nsd->db->region);
	p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
		nsd->child_count);
	if(!p) return;
	for(i=0; i<nsd->child_count; i++) {
		if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!=
			sizeof(stc_type))
			return;
	}
}
#endif /* BIND8_STATS */

/*
 * Reload the database, stop the parent, re-fork the children and
 * continue as server_main.
 */
static void
server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
	int cmdsocket)
{
	pid_t mypid;
	sig_atomic_t cmd = NSD_QUIT_SYNC;
	int ret;
	udb_ptr last_task;
	struct sigaction old_sigchld, ign_sigchld;
	/* ignore SIGCHLD from the previous server_main that used this pid */
	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
	ign_sigchld.sa_handler = SIG_IGN;
	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);

	/* see what tasks we got from xfrd */
	task_remap(nsd->task[nsd->mytask]);
	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
	udb_compact_inhibited(nsd->db->udb, 1);
	reload_process_tasks(nsd, &last_task, cmdsocket);
	udb_compact_inhibited(nsd->db->udb, 0);
	udb_compact(nsd->db->udb);

#ifndef NDEBUG
	if(nsd_debug_level >= 1)
		region_log_stats(nsd->db->region);
#endif /* NDEBUG */
	/* sync to disk (if needed) */
	udb_base_sync(nsd->db->udb, 0);

	initialize_dname_compression_tables(nsd);

#ifdef BIND8_STATS
	/* Restart dumping stats if required.  */
	time(&nsd->st.boot);
	set_bind8_alarm(nsd);
#endif
#ifdef USE_ZONE_STATS
	server_zonestat_realloc(nsd); /* realloc for new children */
	server_zonestat_switch(nsd);
#endif

	/* listen for the signals of failed children again */
	sigaction(SIGCHLD, &old_sigchld, NULL);
	/* Start new child processes */
	if (server_start_children(nsd, server_region, netio, &nsd->
		xfrd_listener->fd) != 0) {
		send_children_quit(nsd);
		exit(1);
	}

	/* if the parent has quit, we must quit too, poll the fd for cmds */
	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
		if(cmd == NSD_QUIT) {
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
			send_children_quit(nsd);
			exit(0);
		}
	}

	/* Send quit command to parent: blocking, wait for receipt. */
	do {
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
		{
			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
				strerror(errno));
		}
		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
			RELOAD_SYNC_TIMEOUT);
		if(ret == -2) {
			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
		}
	} while (ret == -2);
	if(ret == -1) {
		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
			strerror(errno));
	}
	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
	if(cmd == NSD_QUIT) {
		/* small race condition possible here, parent got quit cmd. */
		send_children_quit(nsd);
		exit(1);
	}
	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
#ifdef BIND8_STATS
	reload_do_stats(cmdsocket, nsd, &last_task);
#endif
	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
	task_process_sync(nsd->task[nsd->mytask]);
#ifdef USE_ZONE_STATS
	server_zonestat_realloc(nsd); /* realloc for next children */
#endif

	/* send soainfo to the xfrd process, signal it that reload is done,
	 * it picks up the taskudb */
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
			strerror(errno));
	}
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	/* try to reopen file */
	if (nsd->file_rotation_ok)
		log_reopen(nsd->log_filename, 1);
	/* exit reload, continue as new server_main */
}

/*
 * Get the mode depending on the signal hints that have been received.
 * Multiple signal hints can be received and will be handled in turn.
 */
static sig_atomic_t
server_signal_mode(struct nsd *nsd)
{
	if(nsd->signal_hint_quit) {
		nsd->signal_hint_quit = 0;
		return NSD_QUIT;
	}
	else if(nsd->signal_hint_shutdown) {
		nsd->signal_hint_shutdown = 0;
		return NSD_SHUTDOWN;
	}
	else if(nsd->signal_hint_child) {
		nsd->signal_hint_child = 0;
		return NSD_REAP_CHILDREN;
	}
	else if(nsd->signal_hint_reload) {
		nsd->signal_hint_reload = 0;
		return NSD_RELOAD;
	}
	else if(nsd->signal_hint_reload_hup) {
		nsd->signal_hint_reload_hup = 0;
		return NSD_RELOAD_REQ;
	}
	else if(nsd->signal_hint_stats) {
		nsd->signal_hint_stats = 0;
#ifdef BIND8_STATS
		set_bind8_alarm(nsd);
#endif
		return NSD_STATS;
	}
	else if(nsd->signal_hint_statsusr) {
		nsd->signal_hint_statsusr = 0;
		return NSD_STATS;
	}
	return NSD_RUN;
}

/*
 * The main server simply waits for signals and child processes to
 * terminate.  Child processes are restarted as necessary.
 */
void
server_main(struct nsd *nsd)
{
	region_type *server_region = region_create(xalloc, free);
	netio_type *netio = netio_create(server_region);
	netio_handler_type reload_listener;
	int reload_sockets[2] = {-1, -1};
	struct timespec timeout_spec;
	int status;
	pid_t child_pid;
	pid_t reload_pid = -1;
	sig_atomic_t mode;

	/* Ensure we are the main process */
	assert(nsd->server_kind == NSD_SERVER_MAIN);

	/* Add listener for the XFRD process */
	netio_add_handler(netio, nsd->xfrd_listener);

	/* Start the child processes that handle incoming queries */
	if (server_start_children(nsd, server_region, netio,
		&nsd->xfrd_listener->fd) != 0) {
		send_children_quit(nsd);
		exit(1);
	}
	reload_listener.fd = -1;

	/* This_child MUST be 0, because this is the parent process */
	assert(nsd->this_child == 0);

	/* Run the server until we get a shutdown signal */
	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
		/* Did we receive a signal that changes our mode? */
		if(mode == NSD_RUN) {
			nsd->mode = mode = server_signal_mode(nsd);
		}

		switch (mode) {
		case NSD_RUN:
			/* see if any child processes terminated */
			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
				int is_child = delete_child_pid(nsd, child_pid);
				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
					if(nsd->children[is_child].child_fd == -1)
						nsd->children[is_child].has_exited = 1;
					parent_check_all_children_exited(nsd);
				} else if(is_child != -1) {
					log_msg(LOG_WARNING,
					       "server %d died unexpectedly with status %d, restarting",
					       (int) child_pid, status);
					restart_child_servers(nsd, server_region, netio,
						&nsd->xfrd_listener->fd);
				} else if (child_pid == reload_pid) {
					sig_atomic_t cmd = NSD_RELOAD_DONE;
					pid_t mypid;
					log_msg(LOG_WARNING,
					       "Reload process %d failed with status %d, continuing with old database",
					       (int) child_pid, status);
					reload_pid = -1;
					if(reload_listener.fd != -1) close(reload_listener.fd);
					reload_listener.fd = -1;
					reload_listener.event_types = NETIO_EVENT_NONE;
					task_process_sync(nsd->task[nsd->mytask]);
					/* inform xfrd reload attempt ended */
					if(!write_socket(nsd->xfrd_listener->fd,
						&cmd, sizeof(cmd))) {
						log_msg(LOG_ERR, "problems "
						  "sending SOAEND to xfrd: %s",
						  strerror(errno));
					}
					mypid = getpid();
					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
							strerror(errno));
					}
				} else if(status != 0) {
					/* check the status: we may reap the
					 * old server main here (reload is
					 * the process parent of old main),
					 * and older server processes that
					 * exit after a reload */
					log_msg(LOG_WARNING,
					       "process %d terminated with status %d",
					       (int) child_pid, status);
				}
			}
			if (child_pid == -1) {
				if (errno == EINTR) {
					continue;
				}
				if (errno != ECHILD)
					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
			}
			if (nsd->mode != NSD_RUN)
				break;

1703 			/* timeout to collect processes, in case no SIGCHLD arrives. */
1704 			timeout_spec.tv_sec = 60;
1705 			timeout_spec.tv_nsec = 0;
1706 
1707 			/* listen on ports, timeout for collecting terminated children */
1708 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
1709 				if (errno != EINTR) {
1710 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
1711 				}
1712 			}
1713 			if(nsd->restart_children) {
1714 				restart_child_servers(nsd, server_region, netio,
1715 					&nsd->xfrd_listener->fd);
1716 				nsd->restart_children = 0;
1717 			}
1718 			if(nsd->reload_failed) {
1719 				sig_atomic_t cmd = NSD_RELOAD_DONE;
1720 				pid_t mypid;
1721 				nsd->reload_failed = 0;
1722 				log_msg(LOG_WARNING,
1723 				       "Reload process %d failed, continuing with old database",
1724 				       (int) reload_pid);
1725 				reload_pid = -1;
1726 				if(reload_listener.fd != -1) close(reload_listener.fd);
1727 				reload_listener.fd = -1;
1728 				reload_listener.event_types = NETIO_EVENT_NONE;
1729 				task_process_sync(nsd->task[nsd->mytask]);
1730 				/* inform xfrd reload attempt ended */
1731 				if(!write_socket(nsd->xfrd_listener->fd,
1732 					&cmd, sizeof(cmd))) {
1733 					log_msg(LOG_ERR, "problems "
1734 					  "sending SOAEND to xfrd: %s",
1735 					  strerror(errno));
1736 				}
1737 				mypid = getpid();
1738 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1739 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1740 						strerror(errno));
1741 				}
1742 			}
1743 
1744 			break;
1745 		case NSD_RELOAD_REQ: {
1746 			sig_atomic_t cmd = NSD_RELOAD_REQ;
1747 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
1748 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
1749 				"main: ipc send reload_req to xfrd"));
1750 			if(!write_socket(nsd->xfrd_listener->fd,
1751 				&cmd, sizeof(cmd))) {
1752 				log_msg(LOG_ERR, "server_main: could not send "
1753 				"reload_req to xfrd: %s", strerror(errno));
1754 			}
1755 			nsd->mode = NSD_RUN;
1756 			} break;
1757 		case NSD_RELOAD:
1758 			/* Continue to run nsd after reload */
1759 			nsd->mode = NSD_RUN;
1760 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
1761 			if (reload_pid != -1) {
1762 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
1763 				       (int) reload_pid);
1764 				break;
1765 			}
1766 
1767 			/* switch mytask to keep track of which task file we own */
1768 			nsd->mytask = 1 - nsd->mytask;
1769 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
1770 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
1771 				reload_pid = -1;
1772 				break;
1773 			}
1774 
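			/*
			 * Note the inverted fork roles below: the
			 * parent calls server_reload() and eventually
			 * becomes the new main process, while the
			 * child keeps serving as the old main until
			 * the reload signals it over the socketpair.
			 */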
1775 			/* Do actual reload */
1776 			reload_pid = fork();
1777 			switch (reload_pid) {
1778 			case -1:
1779 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
1780 				break;
1781 			default:
1782 				/* PARENT */
1783 				close(reload_sockets[0]);
1784 				server_reload(nsd, server_region, netio,
1785 					reload_sockets[1]);
1786 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
1787 				close(reload_sockets[1]);
1788 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
1789 				/* drop stale xfrd ipc data */
1790 				((struct ipc_handler_conn_data*)nsd->
1791 					xfrd_listener->user_data)
1792 					->conn->is_reading = 0;
1793 				reload_pid = -1;
1794 				reload_listener.fd = -1;
1795 				reload_listener.event_types = NETIO_EVENT_NONE;
1796 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
1797 				break;
1798 			case 0:
1799 				/* CHILD */
1800 				/* server_main keeps running until NSD_QUIT_SYNC
1801 				 * is received from the reload process. */
1802 				close(reload_sockets[1]);
1803 				reload_listener.fd = reload_sockets[0];
1804 				reload_listener.timeout = NULL;
1805 				reload_listener.user_data = nsd;
1806 				reload_listener.event_types = NETIO_EVENT_READ;
1807 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
1808 				netio_add_handler(netio, &reload_listener);
1809 				reload_pid = getppid();
1810 				break;
1811 			}
1812 			break;
1813 		case NSD_QUIT_SYNC:
1814 			/* synchronisation of xfrd, parent and reload */
1815 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
1816 				sig_atomic_t cmd = NSD_RELOAD;
1817 				/* stop xfrd ipc writes in progress */
1818 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
1819 					"main: ipc send indication reload"));
1820 				if(!write_socket(nsd->xfrd_listener->fd,
1821 					&cmd, sizeof(cmd))) {
1822 					log_msg(LOG_ERR, "server_main: could not send reload "
1823 					"indication to xfrd: %s", strerror(errno));
1824 				}
1825 				/* wait for ACK from xfrd */
1826 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
1827 				nsd->quit_sync_done = 1;
1828 			}
1829 			nsd->mode = NSD_RUN;
1830 			break;
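		/*
		 * The reload that sent NSD_QUIT_SYNC is only answered
		 * once xfrd has acked: the ipc code switches the mode
		 * to NSD_QUIT when that ack arrives, and the case
		 * below then acknowledges the reload (see "only quit
		 * children after xfrd has acked").
		 */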
1831 		case NSD_QUIT:
1832 			/* silent shutdown during reload */
1833 			if(reload_listener.fd != -1) {
1834 				/* acknowledge the quit, to sync reload that we will really quit now */
1835 				sig_atomic_t cmd = NSD_RELOAD;
1836 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
1837 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
1838 					log_msg(LOG_ERR, "server_main: "
1839 						"could not ack quit: %s", strerror(errno));
1840 				}
1841 #ifdef BIND8_STATS
1842 				parent_send_stats(nsd, reload_listener.fd);
1843 #endif /* BIND8_STATS */
1844 				close(reload_listener.fd);
1845 			}
1846 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
1847 			/* only quit children after xfrd has acked */
1848 			send_children_quit(nsd);
1849 
1850 #if 0 /* OS collects memory pages */
1851 			region_destroy(server_region);
1852 #endif
1853 			server_shutdown(nsd);
1854 
1855 			/* NOTREACHED */
1856 			break;
1857 		case NSD_SHUTDOWN:
1858 			break;
1859 		case NSD_REAP_CHILDREN:
1860 			/* continue; wait for child in run loop */
1861 			nsd->mode = NSD_RUN;
1862 			break;
1863 		case NSD_STATS:
1864 #ifdef BIND8_STATS
1865 			set_children_stats(nsd);
1866 #endif
1867 			nsd->mode = NSD_RUN;
1868 			break;
1869 		default:
1870 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
1871 			nsd->mode = NSD_RUN;
1872 			break;
1873 		}
1874 	}
1875 	log_msg(LOG_WARNING, "signal received, shutting down...");
1876 
1877 	/* close opened ports to avoid race with restart of nsd */
1878 	server_close_all_sockets(nsd->udp, nsd->ifs);
1879 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1880 #ifdef HAVE_SSL
1881 	daemon_remote_close(nsd->rc);
1882 #endif
1883 	send_children_quit_and_wait(nsd);
1884 
1885 	/* Unlink it if possible... */
1886 	unlinkpid(nsd->pidfile);
1887 	unlink(nsd->task[0]->fname);
1888 	unlink(nsd->task[1]->fname);
1889 #ifdef USE_ZONE_STATS
1890 	unlink(nsd->zonestatfname[0]);
1891 	unlink(nsd->zonestatfname[1]);
1892 #endif
1893 
1894 	if(reload_listener.fd != -1) {
1895 		sig_atomic_t cmd = NSD_QUIT;
1896 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
1897 			"main: ipc send quit to reload-process"));
1898 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
1899 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
1900 				strerror(errno));
1901 		}
1902 		fsync(reload_listener.fd);
1903 		close(reload_listener.fd);
1904 		/* wait for reload to finish processing */
1905 		while(1) {
1906 			if(waitpid(reload_pid, NULL, 0) == -1) {
1907 				if(errno == EINTR) continue;
1908 				if(errno == ECHILD) break;
1909 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
1910 					(int)reload_pid, strerror(errno));
1911 			}
1912 			break;
1913 		}
1914 	}
1915 	if(nsd->xfrd_listener->fd != -1) {
1916 		/* complete quit, stop xfrd */
1917 		sig_atomic_t cmd = NSD_QUIT;
1918 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
1919 			"main: ipc send quit to xfrd"));
1920 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
1921 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
1922 				strerror(errno));
1923 		}
1924 		fsync(nsd->xfrd_listener->fd);
1925 		close(nsd->xfrd_listener->fd);
1926 		(void)kill(nsd->pid, SIGTERM);
1927 	}
1928 
1929 #if 0 /* OS collects memory pages */
1930 	region_destroy(server_region);
1931 #endif
1932 	/* write the nsd.db to disk, wait for it to complete */
1933 	udb_base_sync(nsd->db->udb, 1);
1934 	udb_base_close(nsd->db->udb);
1935 	server_shutdown(nsd);
1936 }
1937 
1938 static query_state_type
1939 server_process_query(struct nsd *nsd, struct query *query)
1940 {
1941 	return query_process(query, nsd);
1942 }
1943 
1944 static query_state_type
1945 server_process_query_udp(struct nsd *nsd, struct query *query)
1946 {
1947 #ifdef RATELIMIT
1948 	if(query_process(query, nsd) != QUERY_DISCARDED) {
1949 		if(rrl_process_query(query))
1950 			return rrl_slip(query);
1951 		else	return QUERY_PROCESSED;
1952 	}
1953 	return QUERY_DISCARDED;
1954 #else
1955 	return query_process(query, nsd);
1956 #endif
1957 }
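/*
 * With RATELIMIT compiled in, the flow above is: answer the query as
 * usual, then let rrl_process_query() decide whether this response
 * exceeds its rate limit; if it does, rrl_slip() either discards the
 * reply or "slips" out a truncated one so that legitimate clients can
 * retry over TCP.
 */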
1958 
1959 struct event_base*
1960 nsd_child_event_base(void)
1961 {
1962 	struct event_base* base;
1963 #ifdef USE_MINI_EVENT
1964 	static time_t secs;
1965 	static struct timeval now;
1966 	base = event_init(&secs, &now);
1967 #else
1968 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
1969 	/* libev */
1970 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
1971 #  else
1972 	/* libevent */
1973 #    ifdef HAVE_EVENT_BASE_NEW
1974 	base = event_base_new();
1975 #    else
1976 	base = event_init();
1977 #    endif
1978 #  endif
1979 #endif
1980 	return base;
1981 }
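/*
 * nsd_child_event_base() above picks whichever event backend was
 * selected at configure time: the builtin mini_event, libev's default
 * loop, or libevent (event_base_new() where available, the older
 * event_init() otherwise).
 */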
1982 
1983 /*
1984  * Serve DNS requests.
1985  */
1986 void
1987 server_child(struct nsd *nsd)
1988 {
1989 	size_t i, from, numifs;
1990 	region_type *server_region = region_create(xalloc, free);
1991 	struct event_base* event_base = nsd_child_event_base();
1992 	query_type *udp_query;
1993 	sig_atomic_t mode;
1994 
1995 	if(!event_base) {
1996 		log_msg(LOG_ERR, "nsd server could not create event base");
1997 		exit(1);
1998 	}
1999 
2000 #ifdef RATELIMIT
2001 	rrl_init(nsd->this_child->child_num);
2002 #endif
2003 
2004 	assert(nsd->server_kind != NSD_SERVER_MAIN);
2005 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
2006 
2007 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
2008 		server_close_all_sockets(nsd->tcp, nsd->ifs);
2009 	}
2010 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
2011 		server_close_all_sockets(nsd->udp, nsd->ifs);
2012 	}
2013 
2014 	if (nsd->this_child && nsd->this_child->parent_fd != -1) {
2015 		struct event *handler;
2016 		struct ipc_handler_conn_data* user_data =
2017 			(struct ipc_handler_conn_data*)region_alloc(
2018 			server_region, sizeof(struct ipc_handler_conn_data));
2019 		user_data->nsd = nsd;
2020 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
2021 
2022 		handler = (struct event*) region_alloc(
2023 			server_region, sizeof(*handler));
2024 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
2025 			EV_READ, child_handle_parent_command, user_data);
2026 		if(event_base_set(event_base, handler) != 0)
2027 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
2028 		if(event_add(handler, NULL) != 0)
2029 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
2030 	}
2031 
2032 	if(nsd->reuseport) {
2033 		numifs = nsd->ifs / nsd->reuseport;
2034 		from = numifs * nsd->this_child->child_num;
2035 		if(from+numifs > nsd->ifs) { /* should not happen */
2036 			from = 0;
2037 			numifs = nsd->ifs;
2038 		}
2039 	} else {
2040 		from = 0;
2041 		numifs = nsd->ifs;
2042 	}
2043 
2044 	if (nsd->server_kind & NSD_SERVER_UDP) {
2045 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2046 		udp_query = query_create(server_region,
2047 			compressed_dname_offsets, compression_table_size);
2048 #else
2049 		udp_query = NULL;
2050 		memset(msgs, 0, sizeof(msgs));
2051 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
2052 			queries[i] = query_create(server_region,
2053 				compressed_dname_offsets, compression_table_size);
2054 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2055 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
2056 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
2057 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
2058 			msgs[i].msg_hdr.msg_iovlen  = 1;
2059 			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
2060 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2061 		}
2062 #endif
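		/*
		 * With recvmmsg() available, slot i ties msgs[i] to
		 * iovecs[i], which points into the packet buffer of
		 * queries[i], so one call can fill a whole batch,
		 * e.g. in handle_udp below:
		 *
		 *     n = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
		 *     for(i = 0; i < n; i++)
		 *             ... msgs[i].msg_len bytes in queries[i]->packet ...
		 */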
2063 		for (i = from; i < from+numifs; ++i) {
2064 			struct udp_handler_data *data;
2065 			struct event *handler;
2066 
2067 			data = (struct udp_handler_data *) region_alloc(
2068 				server_region,
2069 				sizeof(struct udp_handler_data));
2070 			data->query = udp_query;
2071 			data->nsd = nsd;
2072 			data->socket = &nsd->udp[i];
2073 
2074 			handler = (struct event*) region_alloc(
2075 				server_region, sizeof(*handler));
2076 			event_set(handler, nsd->udp[i].s, EV_PERSIST|EV_READ,
2077 				handle_udp, data);
2078 			if(event_base_set(event_base, handler) != 0)
2079 				log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2080 			if(event_add(handler, NULL) != 0)
2081 				log_msg(LOG_ERR, "nsd udp: event_add failed");
2082 		}
2083 	}
2084 
2085 	/*
2086 	 * Keep track of all the TCP accept handlers so we can enable
2087 	 * and disable them based on the current number of active TCP
2088 	 * connections.
2089 	 */
2090 	tcp_accept_handler_count = numifs;
2091 	tcp_accept_handlers = (struct tcp_accept_handler_data*)
2092 		region_alloc_array(server_region,
2093 		numifs, sizeof(*tcp_accept_handlers));
2094 	if (nsd->server_kind & NSD_SERVER_TCP) {
2095 		for (i = from; i < numifs; ++i) {
2096 			struct event *handler = &tcp_accept_handlers[i-from].event;
2097 			struct tcp_accept_handler_data* data =
2098 				&tcp_accept_handlers[i-from];
2099 			data->nsd = nsd;
2100 			data->socket = &nsd->tcp[i];
2101 			event_set(handler, nsd->tcp[i].s, EV_PERSIST|EV_READ,
2102 				handle_tcp_accept, data);
2103 			if(event_base_set(event_base, handler) != 0)
2104 				log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
2105 			if(event_add(handler, NULL) != 0)
2106 				log_msg(LOG_ERR, "nsd tcp: event_add failed");
2107 			data->event_added = 1;
2108 		}
2109 	} else tcp_accept_handler_count = 0;
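	/*
	 * The accept handlers start out enabled; handle_tcp_accept()
	 * disables them (configure_handler_event_types(0)) once
	 * current_tcp_count reaches maximum_tcp_count, and
	 * cleanup_tcp_handler() re-enables them as soon as a
	 * connection slot frees up again.
	 */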
2110 
2111 	/* The main loop... */
2112 	while ((mode = nsd->mode) != NSD_QUIT) {
2113 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
2114 
2115 		/* Do we need to do the statistics... */
2116 		if (mode == NSD_STATS) {
2117 #ifdef BIND8_STATS
2118 			int p = nsd->st.period;
2119 			nsd->st.period = 1; /* force stats printout */
2120 			/* Dump the statistics */
2121 			bind8_stats(nsd);
2122 			nsd->st.period = p;
2123 #else /* !BIND8_STATS */
2124 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
2125 #endif /* BIND8_STATS */
2126 
2127 			nsd->mode = NSD_RUN;
2128 		}
2129 		else if (mode == NSD_REAP_CHILDREN) {
2130 			/* got signal, notify parent. parent reaps terminated children. */
2131 			if (nsd->this_child->parent_fd != -1) {
2132 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
2133 				if (write(nsd->this_child->parent_fd,
2134 				    &parent_notify,
2135 				    sizeof(parent_notify)) == -1)
2136 				{
2137 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
2138 						(int) nsd->this_child->pid, strerror(errno));
2139 				}
2140 			} else /* no parent, so reap 'em */
2141 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
2142 			nsd->mode = NSD_RUN;
2143 		}
2144 		else if(mode == NSD_RUN) {
2145 			/* Wait for a query... */
2146 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
2147 				if (errno != EINTR) {
2148 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
2149 					break;
2150 				}
2151 			}
2152 		} else if(mode == NSD_QUIT) {
2153 			/* ignore here, quit */
2154 		} else {
2155 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
2156 				(int)mode);
2157 			nsd->mode = NSD_RUN;
2158 		}
2159 	}
2160 
2161 #ifdef	BIND8_STATS
2162 	bind8_stats(nsd);
2163 #endif /* BIND8_STATS */
2164 
2165 #if 0 /* OS collects memory pages */
2166 	event_base_free(event_base);
2167 	region_destroy(server_region);
2168 #endif
2169 	server_shutdown(nsd);
2170 }
2171 
2172 #if defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG)
2173 static void
2174 handle_udp(int fd, short event, void* arg)
2175 {
2176 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
2177 	int received, sent, recvcount, i;
2178 	struct query *q;
2179 
2180 	if (!(event & EV_READ)) {
2181 		return;
2182 	}
2183 	recvcount = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
2184 	/* this printf strangely gave a performance increase on Linux */
2185 	/* printf("recvcount %d \n", recvcount); */
2186 	if (recvcount == -1) {
2187 		if (errno != EAGAIN && errno != EINTR) {
2188 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
2189 			STATUP(data->nsd, rxerr);
2190 			/* No zone statup */
2191 		}
2192 		/* Simply no data available */
2193 		return;
2194 	}
2195 	for (i = 0; i < recvcount; i++) {
2196 	loopstart:
2197 		received = msgs[i].msg_len;
2198 		q = queries[i];
2199 		if (received == -1) {
2200 			log_msg(LOG_ERR, "recvmmsg %d failed (msg_flags 0x%x)",
2201 				i, (unsigned)msgs[i].msg_hdr.msg_flags);
2202 			STATUP(data->nsd, rxerr);
2203 			/* No zone statup */
2204 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2205 			iovecs[i].iov_len = buffer_remaining(q->packet);
2206 			goto swap_drop;
2207 		}
2208 
2209 		/* Account... */
2210 #ifdef BIND8_STATS
2211 		if (data->socket->fam == AF_INET) {
2212 			STATUP(data->nsd, qudp);
2213 		} else if (data->socket->fam == AF_INET6) {
2214 			STATUP(data->nsd, qudp6);
2215 		}
2216 #endif
2217 
2218 		buffer_skip(q->packet, received);
2219 		buffer_flip(q->packet);
2220 
2221 		/* Process and answer the query... */
2222 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
2223 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
2224 				STATUP(data->nsd, nona);
2225 				ZTATUP(data->nsd, q->zone, nona);
2226 			}
2227 
2228 #ifdef USE_ZONE_STATS
2229 			if (data->socket->fam == AF_INET) {
2230 				ZTATUP(data->nsd, q->zone, qudp);
2231 			} else if (data->socket->fam == AF_INET6) {
2232 				ZTATUP(data->nsd, q->zone, qudp6);
2233 			}
2234 #endif
2235 
2236 			/* Add EDNS0 and TSIG info if necessary.  */
2237 			query_add_optional(q, data->nsd);
2238 
2239 			buffer_flip(q->packet);
2240 			iovecs[i].iov_len = buffer_remaining(q->packet);
2241 #ifdef BIND8_STATS
2242 			/* Account the rcode & TC... */
2243 			STATUP2(data->nsd, rcode, RCODE(q->packet));
2244 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
2245 			if (TC(q->packet)) {
2246 				STATUP(data->nsd, truncated);
2247 				ZTATUP(data->nsd, q->zone, truncated);
2248 			}
2249 #endif /* BIND8_STATS */
2250 		} else {
2251 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2252 			iovecs[i].iov_len = buffer_remaining(q->packet);
2253 		swap_drop:
2254 			STATUP(data->nsd, dropped);
2255 			ZTATUP(data->nsd, q->zone, dropped);
2256 			if(i != recvcount-1) {
2257 				/* swap with last and decrease recvcount */
2258 				struct mmsghdr mtmp = msgs[i];
2259 				struct iovec iotmp = iovecs[i];
2260 				recvcount--;
2261 				msgs[i] = msgs[recvcount];
2262 				iovecs[i] = iovecs[recvcount];
2263 				queries[i] = queries[recvcount];
2264 				msgs[recvcount] = mtmp;
2265 				iovecs[recvcount] = iotmp;
2266 				queries[recvcount] = q;
2267 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
2268 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
2269 				goto loopstart;
2270 			} else { recvcount--; }
2271 		}
2272 	}
2273 
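	/*
	 * Dropped queries were swapped to the tail of the arrays above,
	 * so msgs[0..recvcount-1] is now a contiguous run of answers
	 * that can be handed to sendmmsg() in batches.
	 */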
2274 	/* send until all are sent */
2275 	i = 0;
2276 	while(i<recvcount) {
2277 		sent = sendmmsg(fd, &msgs[i], recvcount-i, 0);
2278 		if(sent == -1) {
2279 			const char* es = strerror(errno);
2280 			char a[48];
2281 			addr2str(&queries[i]->addr, a, sizeof(a));
2282 			log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
2283 #ifdef BIND8_STATS
2284 			data->nsd->st.txerr += recvcount-i;
2285 #endif /* BIND8_STATS */
2286 			break;
2287 		}
2288 		i += sent;
2289 	}
2290 	for(i=0; i<recvcount; i++) {
2291 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2292 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
2293 	}
2294 }
2295 
2296 #else /* defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG) */
2297 
2298 static void
2299 handle_udp(int fd, short event, void* arg)
2300 {
2301 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
2302 	int received, sent;
2303 #ifndef NONBLOCKING_IS_BROKEN
2304 #ifdef HAVE_RECVMMSG
2305 	int recvcount;
2306 #endif /* HAVE_RECVMMSG */
2307 	int i;
2308 #endif /* NONBLOCKING_IS_BROKEN */
2309 	struct query *q;
2310 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2311 	q = data->query;
2312 #endif
2313 
2314 	if (!(event & EV_READ)) {
2315 		return;
2316 	}
2317 #ifndef NONBLOCKING_IS_BROKEN
2318 #ifdef HAVE_RECVMMSG
2319 	recvcount = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
2320 	/* this printf strangely gave a performance increase on Linux */
2321 	/* printf("recvcount %d \n", recvcount); */
2322 	if (recvcount == -1) {
2323 		if (errno != EAGAIN && errno != EINTR) {
2324 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
2325 			STATUP(data->nsd, rxerr);
2326 			/* No zone statup */
2327 		}
2328 		/* Simply no data available */
2329 		return;
2330 	}
2331 	for (i = 0; i < recvcount; i++) {
2332 		received = msgs[i].msg_len;
2333 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2334 		if (received == -1) {
2335 			log_msg(LOG_ERR, "recvmmsg failed");
2336 			STATUP(data->nsd, rxerr);
2337 			/* No zone statup */
2338 			/* the error can be found in msgs[i].msg_hdr.msg_flags */
2339 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2340 			continue;
2341 		}
2342 		q = queries[i];
2343 #else
2344 	for(i=0; i<NUM_RECV_PER_SELECT; i++) {
2345 #endif /* HAVE_RECVMMSG */
2346 #endif /* NONBLOCKING_IS_BROKEN */
2347 
2348 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2349 		/* Initialize the query... */
2350 		query_reset(q, UDP_MAX_MESSAGE_LEN, 0);
2351 
2352 		received = recvfrom(fd,
2353 				    buffer_begin(q->packet),
2354 				    buffer_remaining(q->packet),
2355 				    0,
2356 				    (struct sockaddr *)&q->addr,
2357 				    &q->addrlen);
2358 		if (received == -1) {
2359 			if (errno != EAGAIN && errno != EINTR) {
2360 				log_msg(LOG_ERR, "recvfrom failed: %s", strerror(errno));
2361 				STATUP(data->nsd, rxerr);
2362 				/* No zone statup */
2363 			}
2364 			return;
2365 		}
2366 #endif /* NONBLOCKING_IS_BROKEN || !HAVE_RECVMMSG */
2367 
2368 		/* Account... */
2369 		if (data->socket->fam == AF_INET) {
2370 			STATUP(data->nsd, qudp);
2371 		} else if (data->socket->fam == AF_INET6) {
2372 			STATUP(data->nsd, qudp6);
2373 		}
2374 
2375 		buffer_skip(q->packet, received);
2376 		buffer_flip(q->packet);
2377 
2378 		/* Process and answer the query... */
2379 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
2380 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
2381 				STATUP(data->nsd, nona);
2382 				ZTATUP(data->nsd, q->zone, nona);
2383 			}
2384 
2385 #ifdef USE_ZONE_STATS
2386 			if (data->socket->fam == AF_INET) {
2387 				ZTATUP(data->nsd, q->zone, qudp);
2388 			} else if (data->socket->fam == AF_INET6) {
2389 				ZTATUP(data->nsd, q->zone, qudp6);
2390 			}
2391 #endif
2392 
2393 			/* Add EDNS0 and TSIG info if necessary.  */
2394 			query_add_optional(q, data->nsd);
2395 
2396 			buffer_flip(q->packet);
2397 
2398 			sent = sendto(fd,
2399 				      buffer_begin(q->packet),
2400 				      buffer_remaining(q->packet),
2401 				      0,
2402 				      (struct sockaddr *) &q->addr,
2403 				      q->addrlen);
2404 			if (sent == -1) {
2405 				const char* es = strerror(errno);
2406 				char a[48];
2407 				addr2str(&q->addr, a, sizeof(a));
2408 				log_msg(LOG_ERR, "sendto %s failed: %s", a, es);
2409 				STATUP(data->nsd, txerr);
2410 				ZTATUP(data->nsd, q->zone, txerr);
2411 			} else if ((size_t) sent != buffer_remaining(q->packet)) {
2412 				log_msg(LOG_ERR, "sent %d in place of %d bytes", sent, (int) buffer_remaining(q->packet));
2413 			} else {
2414 #ifdef BIND8_STATS
2415 				/* Account the rcode & TC... */
2416 				STATUP2(data->nsd, rcode, RCODE(q->packet));
2417 				ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
2418 				if (TC(q->packet)) {
2419 					STATUP(data->nsd, truncated);
2420 					ZTATUP(data->nsd, q->zone, truncated);
2421 				}
2422 #endif /* BIND8_STATS */
2423 			}
2424 		} else {
2425 			STATUP(data->nsd, dropped);
2426 			ZTATUP(data->nsd, q->zone, dropped);
2427 		}
2428 #ifndef NONBLOCKING_IS_BROKEN
2429 #ifdef HAVE_RECVMMSG
2430 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2431 #endif
2432 	}
2433 #endif
2434 }
2435 #endif /* defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG) */
2436 
2437 
2438 static void
2439 cleanup_tcp_handler(struct tcp_handler_data* data)
2440 {
2441 	event_del(&data->event);
2442 	close(data->event.ev_fd);
2443 
2444 	/*
2445 	 * Enable the TCP accept handlers when the current number of
2446 	 * TCP connections is about to drop below the maximum number
2447 	 * of TCP connections.
2448 	 */
2449 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
2450 		configure_handler_event_types(EV_READ|EV_PERSIST);
2451 		if(slowaccept) {
2452 			event_del(&slowaccept_event);
2453 			slowaccept = 0;
2454 		}
2455 	}
2456 	--data->nsd->current_tcp_count;
2457 	assert(data->nsd->current_tcp_count >= 0);
2458 
2459 	region_destroy(data->region);
2460 }
2461 
2462 static void
2463 handle_tcp_reading(int fd, short event, void* arg)
2464 {
2465 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
2466 	ssize_t received;
2467 	struct event_base* ev_base;
2468 	struct timeval timeout;
2469 
2470 	if ((event & EV_TIMEOUT)) {
2471 		/* Connection timed out.  */
2472 		cleanup_tcp_handler(data);
2473 		return;
2474 	}
2475 
2476 	if (data->nsd->tcp_query_count > 0 &&
2477 		data->query_count >= data->nsd->tcp_query_count) {
2478 		/* No more queries allowed on this tcp connection.  */
2479 		cleanup_tcp_handler(data);
2480 		return;
2481 	}
2482 
2483 	assert((event & EV_READ));
2484 
2485 	if (data->bytes_transmitted == 0) {
2486 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
2487 	}
2488 
2489 	/*
2490 	 * Check if we received the leading packet length bytes yet.
2491 	 */
2492 	if (data->bytes_transmitted < sizeof(uint16_t)) {
2493 		received = read(fd,
2494 				(char *) &data->query->tcplen
2495 				+ data->bytes_transmitted,
2496 				sizeof(uint16_t) - data->bytes_transmitted);
2497 		if (received == -1) {
2498 			if (errno == EAGAIN || errno == EINTR) {
2499 				/*
2500 				 * Read would block, wait until more
2501 				 * data is available.
2502 				 */
2503 				return;
2504 			} else {
2505 				char buf[48];
2506 				addr2str(&data->query->addr, buf, sizeof(buf));
2507 #ifdef ECONNRESET
2508 				if (verbosity >= 2 || errno != ECONNRESET)
2509 #endif /* ECONNRESET */
2510 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
2511 				cleanup_tcp_handler(data);
2512 				return;
2513 			}
2514 		} else if (received == 0) {
2515 			/* EOF */
2516 			cleanup_tcp_handler(data);
2517 			return;
2518 		}
2519 
2520 		data->bytes_transmitted += received;
2521 		if (data->bytes_transmitted < sizeof(uint16_t)) {
2522 			/*
2523 			 * Not done with the tcplen yet, wait for more
2524 			 * data to become available.
2525 			 */
2526 			return;
2527 		}
2528 
2529 		assert(data->bytes_transmitted == sizeof(uint16_t));
2530 
2531 		data->query->tcplen = ntohs(data->query->tcplen);
2532 
2533 		/*
2534 		 * Minimum query size is:
2535 		 *
2536 		 *     Size of the header (12)
2537 		 *   + Root domain name   (1)
2538 		 *   + Query class        (2)
2539 		 *   + Query type         (2)
2540 		 */
2541 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
2542 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
2543 			cleanup_tcp_handler(data);
2544 			return;
2545 		}
2546 
2547 		if (data->query->tcplen > data->query->maxlen) {
2548 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
2549 			cleanup_tcp_handler(data);
2550 			return;
2551 		}
2552 
2553 		buffer_set_limit(data->query->packet, data->query->tcplen);
2554 	}
2555 
2556 	assert(buffer_remaining(data->query->packet) > 0);
2557 
2558 	/* Read the (remaining) query data.  */
2559 	received = read(fd,
2560 			buffer_current(data->query->packet),
2561 			buffer_remaining(data->query->packet));
2562 	if (received == -1) {
2563 		if (errno == EAGAIN || errno == EINTR) {
2564 			/*
2565 			 * Read would block, wait until more data is
2566 			 * available.
2567 			 */
2568 			return;
2569 		} else {
2570 			char buf[48];
2571 			addr2str(&data->query->addr, buf, sizeof(buf));
2572 #ifdef ECONNRESET
2573 			if (verbosity >= 2 || errno != ECONNRESET)
2574 #endif /* ECONNRESET */
2575 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
2576 			cleanup_tcp_handler(data);
2577 			return;
2578 		}
2579 	} else if (received == 0) {
2580 		/* EOF */
2581 		cleanup_tcp_handler(data);
2582 		return;
2583 	}
2584 
2585 	data->bytes_transmitted += received;
2586 	buffer_skip(data->query->packet, received);
2587 	if (buffer_remaining(data->query->packet) > 0) {
2588 		/*
2589 		 * Message not yet complete, wait for more data to
2590 		 * become available.
2591 		 */
2592 		return;
2593 	}
2594 
2595 	assert(buffer_position(data->query->packet) == data->query->tcplen);
2596 
2597 	/* Account... */
2598 #ifdef BIND8_STATS
2599 #ifndef INET6
2600 	STATUP(data->nsd, ctcp);
2601 #else
2602 	if (data->query->addr.ss_family == AF_INET) {
2603 		STATUP(data->nsd, ctcp);
2604 	} else if (data->query->addr.ss_family == AF_INET6) {
2605 		STATUP(data->nsd, ctcp6);
2606 	}
2607 #endif
2608 #endif /* BIND8_STATS */
2609 
2610 	/* We have a complete query, process it.  */
2611 
2612 	/* tcp-query-count: handle query counter ++ */
2613 	data->query_count++;
2614 
2615 	buffer_flip(data->query->packet);
2616 	data->query_state = server_process_query(data->nsd, data->query);
2617 	if (data->query_state == QUERY_DISCARDED) {
2618 		/* Drop the packet and the entire connection... */
2619 		STATUP(data->nsd, dropped);
2620 		ZTATUP(data->nsd, data->query->zone, dropped);
2621 		cleanup_tcp_handler(data);
2622 		return;
2623 	}
2624 
2625 #ifdef BIND8_STATS
2626 	if (RCODE(data->query->packet) == RCODE_OK
2627 	    && !AA(data->query->packet))
2628 	{
2629 		STATUP(data->nsd, nona);
2630 		ZTATUP(data->nsd, data->query->zone, nona);
2631 	}
2632 #endif /* BIND8_STATS */
2633 
2634 #ifdef USE_ZONE_STATS
2635 #ifndef INET6
2636 	ZTATUP(data->nsd, data->query->zone, ctcp);
2637 #else
2638 	if (data->query->addr.ss_family == AF_INET) {
2639 		ZTATUP(data->nsd, data->query->zone, ctcp);
2640 	} else if (data->query->addr.ss_family == AF_INET6) {
2641 		ZTATUP(data->nsd, data->query->zone, ctcp6);
2642 	}
2643 #endif
2644 #endif /* USE_ZONE_STATS */
2645 
2646 	query_add_optional(data->query, data->nsd);
2647 
2648 	/* Switch to the tcp write handler.  */
2649 	buffer_flip(data->query->packet);
2650 	data->query->tcplen = buffer_remaining(data->query->packet);
2651 	data->bytes_transmitted = 0;
2652 
2653 	timeout.tv_sec = data->tcp_timeout / 1000;
2654 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2655 
2656 	ev_base = data->event.ev_base;
2657 	event_del(&data->event);
2658 	event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
2659 		handle_tcp_writing, data);
2660 	if(event_base_set(ev_base, &data->event) != 0)
2661 		log_msg(LOG_ERR, "event base set tcpr failed");
2662 	if(event_add(&data->event, &timeout) != 0)
2663 		log_msg(LOG_ERR, "event add tcpr failed");
2664 	/* see if we can write the answer right away (usually we can; EAGAIN if not) */
2665 	handle_tcp_writing(fd, EV_WRITE, data);
2666 }
2667 
2668 static void
2669 handle_tcp_writing(int fd, short event, void* arg)
2670 {
2671 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
2672 	ssize_t sent;
2673 	struct query *q = data->query;
2674 	struct timeval timeout;
2675 	struct event_base* ev_base;
2676 
2677 	if ((event & EV_TIMEOUT)) {
2678 		/* Connection timed out.  */
2679 		cleanup_tcp_handler(data);
2680 		return;
2681 	}
2682 
2683 	assert((event & EV_WRITE));
2684 
2685 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
2686 		/* Writing the response packet length.  */
2687 		uint16_t n_tcplen = htons(q->tcplen);
2688 #ifdef HAVE_WRITEV
2689 		struct iovec iov[2];
2690 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
2691 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
2692 		iov[1].iov_base = buffer_begin(q->packet);
2693 		iov[1].iov_len = buffer_limit(q->packet);
2694 		sent = writev(fd, iov, 2);
2695 #else /* HAVE_WRITEV */
2696 		sent = write(fd,
2697 			     (const char *) &n_tcplen + data->bytes_transmitted,
2698 			     sizeof(n_tcplen) - data->bytes_transmitted);
2699 #endif /* HAVE_WRITEV */
2700 		if (sent == -1) {
2701 			if (errno == EAGAIN || errno == EINTR) {
2702 				/*
2703 				 * Write would block, wait until
2704 				 * socket becomes writable again.
2705 				 */
2706 				return;
2707 			} else {
2708 #ifdef ECONNRESET
2709 				if(verbosity >= 2 || errno != ECONNRESET)
2710 #endif /* ECONNRESET */
2711 #ifdef EPIPE
2712 				  if(verbosity >= 2 || errno != EPIPE)
2713 #endif /* EPIPE 'broken pipe' */
2714 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
2715 				cleanup_tcp_handler(data);
2716 				return;
2717 			}
2718 		}
2719 
2720 		data->bytes_transmitted += sent;
2721 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
2722 			/*
2723 			 * Writing not complete, wait until socket
2724 			 * becomes writable again.
2725 			 */
2726 			return;
2727 		}
2728 
2729 #ifdef HAVE_WRITEV
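		/*
		 * In the normal case writev() wrote the whole length
		 * prefix plus part of the payload in one call;
		 * subtracting the prefix leaves 'sent' counting
		 * payload bytes only, so the shared 'packet done'
		 * check below works for both write paths.
		 */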
2730 		sent -= sizeof(n_tcplen);
2731 		/* handle potential 'packet done' code */
2732 		goto packet_could_be_done;
2733 #endif
2734  	}
2735 
2736 	sent = write(fd,
2737 		     buffer_current(q->packet),
2738 		     buffer_remaining(q->packet));
2739 	if (sent == -1) {
2740 		if (errno == EAGAIN || errno == EINTR) {
2741 			/*
2742 			 * Write would block, wait until
2743 			 * socket becomes writable again.
2744 			 */
2745 			return;
2746 		} else {
2747 #ifdef ECONNRESET
2748 			if(verbosity >= 2 || errno != ECONNRESET)
2749 #endif /* ECONNRESET */
2750 #ifdef EPIPE
2751 				  if(verbosity >= 2 || errno != EPIPE)
2752 #endif /* EPIPE 'broken pipe' */
2753 			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
2754 			cleanup_tcp_handler(data);
2755 			return;
2756 		}
2757 	}
2758 
2759 	data->bytes_transmitted += sent;
2760 #ifdef HAVE_WRITEV
2761   packet_could_be_done:
2762 #endif
2763 	buffer_skip(q->packet, sent);
2764 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
2765 		/*
2766 		 * Still more data to write when socket becomes
2767 		 * writable again.
2768 		 */
2769 		return;
2770 	}
2771 
2772 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
2773 
2774 	if (data->query_state == QUERY_IN_AXFR) {
2775 		/* Continue processing AXFR and writing back results.  */
2776 		buffer_clear(q->packet);
2777 		data->query_state = query_axfr(data->nsd, q);
2778 		if (data->query_state != QUERY_PROCESSED) {
2779 			query_add_optional(data->query, data->nsd);
2780 
2781 			/* Reset data. */
2782 			buffer_flip(q->packet);
2783 			q->tcplen = buffer_remaining(q->packet);
2784 			data->bytes_transmitted = 0;
2785 			/* Reset timeout.  */
2786 			timeout.tv_sec = data->tcp_timeout / 1000;
2787 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2788 			ev_base = data->event.ev_base;
2789 			event_del(&data->event);
2790 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
2791 				handle_tcp_writing, data);
2792 			if(event_base_set(ev_base, &data->event) != 0)
2793 				log_msg(LOG_ERR, "event base set tcpw failed");
2794 			if(event_add(&data->event, &timeout) != 0)
2795 				log_msg(LOG_ERR, "event add tcpw failed");
2796 
2797 			/*
2798 			 * Write data if/when the socket is writable
2799 			 * again.
2800 			 */
2801 			return;
2802 		}
2803 	}
2804 
2805 	/*
2806 	 * Done sending, wait for the next request to arrive on the
2807 	 * TCP socket by installing the TCP read handler.
2808 	 */
2809 	if (data->nsd->tcp_query_count > 0 &&
2810 		data->query_count >= data->nsd->tcp_query_count) {
2811 
2812 		(void) shutdown(fd, SHUT_WR);
2813 	}
2814 
2815 	data->bytes_transmitted = 0;
2816 
2817 	timeout.tv_sec = data->tcp_timeout / 1000;
2818 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2819 	ev_base = data->event.ev_base;
2820 	event_del(&data->event);
2821 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
2822 		handle_tcp_reading, data);
2823 	if(event_base_set(ev_base, &data->event) != 0)
2824 		log_msg(LOG_ERR, "event base set tcpw failed");
2825 	if(event_add(&data->event, &timeout) != 0)
2826 		log_msg(LOG_ERR, "event add tcpw failed");
2827 }
2828 
2829 
2830 static void
2831 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
2832 	void* ATTR_UNUSED(arg))
2833 {
2834 	if(slowaccept) {
2835 		configure_handler_event_types(EV_PERSIST | EV_READ);
2836 		slowaccept = 0;
2837 	}
2838 }
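/*
 * Together with handle_tcp_accept() below, this implements a backoff:
 * on EMFILE/ENFILE the accept events are disabled and a one-shot timer
 * re-enables them after SLOW_ACCEPT_TIMEOUT seconds, instead of
 * spinning on a full descriptor table.
 */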
2839 
2840 /*
2841  * Handle an incoming TCP connection.  The connection is accepted and
2842  * a new TCP reader event handler is added.  The TCP handler
2843  * is responsible for cleanup when the connection is closed.
2844  */
2845 static void
2846 handle_tcp_accept(int fd, short event, void* arg)
2847 {
2848 	struct tcp_accept_handler_data *data
2849 		= (struct tcp_accept_handler_data *) arg;
2850 	int s;
2851 	struct tcp_handler_data *tcp_data;
2852 	region_type *tcp_region;
2853 #ifdef INET6
2854 	struct sockaddr_storage addr;
2855 #else
2856 	struct sockaddr_in addr;
2857 #endif
2858 	socklen_t addrlen;
2859 	struct timeval timeout;
2860 
2861 	if (!(event & EV_READ)) {
2862 		return;
2863 	}
2864 
2865 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
2866 		return;
2867 	}
2868 
2869 	/* Accept it... */
2870 	addrlen = sizeof(addr);
2871 	s = accept(fd, (struct sockaddr *) &addr, &addrlen);
2872 	if (s == -1) {
2873 		/**
2874 		 * EMFILE and ENFILE signal that the limit of open file
2875 		 * descriptors has been reached; pause accept().
2876 		 * EINTR means a signal interrupted the call. The others are
2877 		 * various OS ways of saying the client closed the connection.
2878 		 */
2879 		if (errno == EMFILE || errno == ENFILE) {
2880 			if (!slowaccept) {
2881 				/* disable accept events */
2882 				struct timeval tv;
2883 				configure_handler_event_types(0);
2884 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
2885 				tv.tv_usec = 0L;
2886 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
2887 					handle_slowaccept_timeout, NULL);
2888 				(void)event_base_set(data->event.ev_base,
2889 					&slowaccept_event);
2890 				(void)event_add(&slowaccept_event, &tv);
2891 				slowaccept = 1;
2892 				/* We don't want to spam the logs here */
2893 			}
2894 		} else if (errno != EINTR
2895 			&& errno != EWOULDBLOCK
2896 #ifdef ECONNABORTED
2897 			&& errno != ECONNABORTED
2898 #endif /* ECONNABORTED */
2899 #ifdef EPROTO
2900 			&& errno != EPROTO
2901 #endif /* EPROTO */
2902 			) {
2903 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
2904 		}
2905 		return;
2906 	}
2907 
2908 	if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
2909 		log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
2910 		close(s);
2911 		return;
2912 	}
2913 
2914 	/*
2915 	 * This region is deallocated when the TCP connection is
2916 	 * closed by the TCP handler.
2917 	 */
2918 	tcp_region = region_create(xalloc, free);
2919 	tcp_data = (struct tcp_handler_data *) region_alloc(
2920 		tcp_region, sizeof(struct tcp_handler_data));
2921 	tcp_data->region = tcp_region;
2922 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
2923 		compression_table_size);
2924 	tcp_data->nsd = data->nsd;
2925 	tcp_data->query_count = 0;
2926 
2927 	tcp_data->query_state = QUERY_PROCESSED;
2928 	tcp_data->bytes_transmitted = 0;
2929 	memcpy(&tcp_data->query->addr, &addr, addrlen);
2930 	tcp_data->query->addrlen = addrlen;
2931 
2932 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
2933 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
2934 		/* very busy, give smaller timeout */
2935 		tcp_data->tcp_timeout = 200;
2936 	}
2937 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
2938 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
2939 
2940 	event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
2941 		handle_tcp_reading, tcp_data);
2942 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
2943 		log_msg(LOG_ERR, "cannot set tcp event base");
2944 		close(s);
2945 		region_destroy(tcp_region);
2946 		return;
2947 	}
2948 	if(event_add(&tcp_data->event, &timeout) != 0) {
2949 		log_msg(LOG_ERR, "cannot add tcp to event base");
2950 		close(s);
2951 		region_destroy(tcp_region);
2952 		return;
2953 	}
2954 
2955 	/*
2956 	 * Keep track of the total number of TCP handlers installed so
2957 	 * we can stop accepting connections when the maximum number
2958 	 * of simultaneous TCP connections is reached.
2959 	 */
2960 	++data->nsd->current_tcp_count;
2961 	if (data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
2962 		configure_handler_event_types(0);
2963 	}
2964 }
2965 
2966 static void
2967 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
2968 {
2969 	size_t i;
2970 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
2971 	for (i = 0; i < nsd->child_count; ++i) {
2972 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
2973 			if (write(nsd->children[i].child_fd,
2974 				&command,
2975 				sizeof(command)) == -1)
2976 			{
2977 				if(errno != EAGAIN && errno != EINTR)
2978 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
2979 					(int) command,
2980 					(int) nsd->children[i].pid,
2981 					strerror(errno));
2982 			} else if (timeout > 0) {
2983 				(void)block_read(NULL,
2984 					nsd->children[i].child_fd,
2985 					&command, sizeof(command), timeout);
2986 			}
2987 			fsync(nsd->children[i].child_fd);
2988 			close(nsd->children[i].child_fd);
2989 			nsd->children[i].child_fd = -1;
2990 		}
2991 	}
2992 }
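/*
 * With a nonzero timeout, the parent waits (via block_read) for each
 * child to acknowledge the command before closing the socket;
 * send_children_quit_and_wait() below relies on this to make
 * NSD_QUIT_CHILD roughly synchronous.
 */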
2993 
2994 static void
2995 send_children_quit(struct nsd* nsd)
2996 {
2997 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
2998 	send_children_command(nsd, NSD_QUIT, 0);
2999 }
3000 
3001 static void
3002 send_children_quit_and_wait(struct nsd* nsd)
3003 {
3004 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
3005 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
3006 }
3007 
3008 #ifdef BIND8_STATS
3009 static void
3010 set_children_stats(struct nsd* nsd)
3011 {
3012 	size_t i;
3013 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
3014 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
3015 	for (i = 0; i < nsd->child_count; ++i) {
3016 		nsd->children[i].need_to_send_STATS = 1;
3017 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
3018 	}
3019 }
3020 #endif /* BIND8_STATS */
3021 
3022 static void
3023 configure_handler_event_types(short event_types)
3024 {
3025 	size_t i;
3026 
3027 	for (i = 0; i < tcp_accept_handler_count; ++i) {
3028 		struct event* handler = &tcp_accept_handlers[i].event;
3029 		if(event_types) {
3030 			/* reassign */
3031 			int fd = handler->ev_fd;
3032 			struct event_base* base = handler->ev_base;
3033 			if(tcp_accept_handlers[i].event_added)
3034 				event_del(handler);
3035 			event_set(handler, fd, event_types,
3036 				handle_tcp_accept, &tcp_accept_handlers[i]);
3037 			if(event_base_set(base, handler) != 0)
3038 				log_msg(LOG_ERR, "conhand: cannot event_base");
3039 			if(event_add(handler, NULL) != 0)
3040 				log_msg(LOG_ERR, "conhand: cannot event_add");
3041 			tcp_accept_handlers[i].event_added = 1;
3042 		} else {
3043 			/* remove */
3044 			if(tcp_accept_handlers[i].event_added) {
3045 				event_del(handler);
3046 				tcp_accept_handlers[i].event_added = 0;
3047 			}
3048 		}
3049 	}
3050 }
3051