xref: /netbsd-src/external/bsd/nsd/dist/server.c (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
1 /*
2  * server.c -- nsd(8) network input/output
3  *
4  * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
5  *
6  * See LICENSE for the license.
7  *
8  */
9 
10 #include "config.h"
11 
12 #include <sys/types.h>
13 #include <sys/param.h>
14 #include <sys/socket.h>
15 #include <sys/uio.h>
16 #include <sys/wait.h>
17 
18 #include <netinet/in.h>
19 #include <arpa/inet.h>
20 
21 #include <assert.h>
22 #include <ctype.h>
23 #include <errno.h>
24 #include <fcntl.h>
25 #include <stddef.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <time.h>
30 #include <unistd.h>
31 #include <signal.h>
32 #include <netdb.h>
33 #include <poll.h>
34 #ifndef SHUT_WR
35 #define SHUT_WR 1
36 #endif
37 #ifdef HAVE_MMAP
38 #include <sys/mman.h>
39 #endif /* HAVE_MMAP */
40 #ifdef HAVE_OPENSSL_RAND_H
41 #include <openssl/rand.h>
42 #endif
43 #ifndef USE_MINI_EVENT
44 #  ifdef HAVE_EVENT_H
45 #    include <event.h>
46 #  else
47 #    include <event2/event.h>
48 #    include "event2/event_struct.h"
49 #    include "event2/event_compat.h"
50 #  endif
51 #else
52 #  include "mini_event.h"
53 #endif
54 
55 #include "axfr.h"
56 #include "namedb.h"
57 #include "netio.h"
58 #include "xfrd.h"
59 #include "xfrd-tcp.h"
60 #include "xfrd-disk.h"
61 #include "difffile.h"
62 #include "nsec3.h"
63 #include "ipc.h"
64 #include "udb.h"
65 #include "remote.h"
66 #include "lookup3.h"
67 #include "rrl.h"
68 
69 #define RELOAD_SYNC_TIMEOUT 25 /* seconds */
70 
71 /*
72  * Data for the UDP handlers.
73  */
74 struct udp_handler_data
75 {
76 	struct nsd        *nsd;
77 	struct nsd_socket *socket;
78 	query_type        *query;
79 };
80 
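/*
 * Data for the TCP accept handlers.
 */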
81 struct tcp_accept_handler_data {
82 	struct nsd         *nsd;
83 	struct nsd_socket  *socket;
84 	int event_added;
85 	struct event       event;
86 };
87 
88 /*
89  * These globals are used to enable the TCP accept handlers
 90  * when the number of TCP connections drops below the maximum
91  * number of TCP connections.
92  */
93 static size_t		tcp_accept_handler_count;
94 static struct tcp_accept_handler_data*	tcp_accept_handlers;
95 
96 static struct event slowaccept_event;
97 static int slowaccept;
98 
99 #ifndef NONBLOCKING_IS_BROKEN
100 #  define NUM_RECV_PER_SELECT 100
101 #endif
102 
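/*
 * With recvmmsg(2) available, these arrays are used to batch up to
 * NUM_RECV_PER_SELECT datagrams per wakeup instead of one receive per
 * event, reducing system call overhead.
 */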
103 #if (!defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG))
104 struct mmsghdr msgs[NUM_RECV_PER_SELECT];
105 struct iovec iovecs[NUM_RECV_PER_SELECT];
106 struct query *queries[NUM_RECV_PER_SELECT];
107 #endif
108 
109 /*
110  * Data for the TCP connection handlers.
111  *
112  * The TCP handlers use non-blocking I/O.  This is necessary to avoid
113  * blocking the entire server on a slow TCP connection, but does make
114  * reading from and writing to the socket more complicated.
115  *
 116  * Basically, whenever a read/write would block (indicated by
 117  * errno being set to EAGAIN) we remember the position we were reading
118  * from/writing to and return from the TCP reading/writing event
119  * handler.  When the socket becomes readable/writable again we
120  * continue from the same position.
121  */
122 struct tcp_handler_data
123 {
124 	/*
125 	 * The region used to allocate all TCP connection related
126 	 * data, including this structure.  This region is destroyed
127 	 * when the connection is closed.
128 	 */
129 	region_type*		region;
130 
131 	/*
132 	 * The global nsd structure.
133 	 */
134 	struct nsd*			nsd;
135 
136 	/*
137 	 * The current query data for this TCP connection.
138 	 */
139 	query_type*			query;
140 
141 	/*
142 	 * The query_state is used to remember if we are performing an
143 	 * AXFR, if we're done processing, or if we should discard the
144 	 * query and connection.
145 	 */
146 	query_state_type	query_state;
147 
148 	/*
149 	 * The event for the file descriptor and tcp timeout
150 	 */
151 	struct event event;
152 
153 	/*
154 	 * The bytes_transmitted field is used to remember the number
155 	 * of bytes transmitted when receiving or sending a DNS
156 	 * packet.  The count includes the two additional bytes used
157 	 * to specify the packet length on a TCP connection.
158 	 */
159 	size_t				bytes_transmitted;
160 
161 	/*
162 	 * The number of queries handled by this specific TCP connection.
163 	 */
164 	int					query_count;
165 
166 	/*
167 	 * The timeout in msec for this tcp connection
168 	 */
169 	int	tcp_timeout;
170 };
171 
172 /*
173  * Handle incoming queries on the UDP server sockets.
174  */
175 static void handle_udp(int fd, short event, void* arg);
176 
177 /*
178  * Handle incoming connections on the TCP sockets.  These handlers
179  * usually wait for the NETIO_EVENT_READ event (indicating an incoming
180  * connection) but are disabled when the number of current TCP
181  * connections is equal to the maximum number of TCP connections.
182  * Disabling is done by changing the handler to wait for the
183  * NETIO_EVENT_NONE type.  This is done using the function
184  * configure_tcp_accept_handlers.
185  */
186 static void handle_tcp_accept(int fd, short event, void* arg);
187 
188 /*
189  * Handle incoming queries on a TCP connection.  The TCP connections
190  * are configured to be non-blocking and the handler may be called
191  * multiple times before a complete query is received.
192  */
193 static void handle_tcp_reading(int fd, short event, void* arg);
194 
195 /*
196  * Handle outgoing responses on a TCP connection.  The TCP connections
197  * are configured to be non-blocking and the handler may be called
198  * multiple times before a complete response is sent.
199  */
200 static void handle_tcp_writing(int fd, short event, void* arg);
201 
202 /*
 203  * Send all children the quit command (nonblocking), then close the pipes.
204  */
205 static void send_children_quit(struct nsd* nsd);
 206 /* same, for shutdown time; waits for the children to exit to avoid restart issues */
207 static void send_children_quit_and_wait(struct nsd* nsd);
208 
 209 /* set children's flags to send NSD_STATS to them */
210 #ifdef BIND8_STATS
211 static void set_children_stats(struct nsd* nsd);
212 #endif /* BIND8_STATS */
213 
214 /*
215  * Change the event types the HANDLERS are interested in to EVENT_TYPES.
216  */
217 static void configure_handler_event_types(short event_types);
218 
219 static uint16_t *compressed_dname_offsets = 0;
220 static uint32_t compression_table_capacity = 0;
221 static uint32_t compression_table_size = 0;
222 
223 /*
224  * Remove the specified pid from the list of child pids.  Returns -1 if
 225  * the pid is not in the list, the child number otherwise.  The pid field is set to 0.
226  */
227 static int
228 delete_child_pid(struct nsd *nsd, pid_t pid)
229 {
230 	size_t i;
231 	for (i = 0; i < nsd->child_count; ++i) {
232 		if (nsd->children[i].pid == pid) {
233 			nsd->children[i].pid = 0;
234 			if(!nsd->children[i].need_to_exit) {
235 				if(nsd->children[i].child_fd != -1)
236 					close(nsd->children[i].child_fd);
237 				nsd->children[i].child_fd = -1;
238 				if(nsd->children[i].handler)
239 					nsd->children[i].handler->fd = -1;
240 			}
241 			return i;
242 		}
243 	}
244 	return -1;
245 }
246 
247 /*
248  * Restart child servers if necessary.
249  */
250 static int
251 restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
252 	int* xfrd_sock_p)
253 {
254 	struct main_ipc_handler_data *ipc_data;
255 	size_t i;
256 	int sv[2];
257 
258 	/* Fork the child processes... */
259 	for (i = 0; i < nsd->child_count; ++i) {
260 		if (nsd->children[i].pid <= 0) {
261 			if (nsd->children[i].child_fd != -1)
262 				close(nsd->children[i].child_fd);
263 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
264 				log_msg(LOG_ERR, "socketpair: %s",
265 					strerror(errno));
266 				return -1;
267 			}
268 			nsd->children[i].child_fd = sv[0];
269 			nsd->children[i].parent_fd = sv[1];
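			/* child_fd is the parent's end of the socketpair
			 * (to talk to the child); parent_fd is the child's
			 * end (to talk to the parent) */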
270 			nsd->children[i].pid = fork();
271 			switch (nsd->children[i].pid) {
272 			default: /* SERVER MAIN */
273 				close(nsd->children[i].parent_fd);
274 				nsd->children[i].parent_fd = -1;
275 				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
276 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
277 				}
278 				if(!nsd->children[i].handler)
279 				{
280 					ipc_data = (struct main_ipc_handler_data*) region_alloc(
281 						region, sizeof(struct main_ipc_handler_data));
282 					ipc_data->nsd = nsd;
283 					ipc_data->child = &nsd->children[i];
284 					ipc_data->child_num = i;
285 					ipc_data->xfrd_sock = xfrd_sock_p;
286 					ipc_data->packet = buffer_create(region, QIOBUFSZ);
287 					ipc_data->forward_mode = 0;
288 					ipc_data->got_bytes = 0;
289 					ipc_data->total_bytes = 0;
290 					ipc_data->acl_num = 0;
291 					nsd->children[i].handler = (struct netio_handler*) region_alloc(
292 						region, sizeof(struct netio_handler));
293 					nsd->children[i].handler->fd = nsd->children[i].child_fd;
294 					nsd->children[i].handler->timeout = NULL;
295 					nsd->children[i].handler->user_data = ipc_data;
296 					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
297 					nsd->children[i].handler->event_handler = parent_handle_child_command;
298 					netio_add_handler(netio, nsd->children[i].handler);
299 				}
300 				/* clear any ongoing ipc */
301 				ipc_data = (struct main_ipc_handler_data*)
302 					nsd->children[i].handler->user_data;
303 				ipc_data->forward_mode = 0;
304 				/* restart - update fd */
305 				nsd->children[i].handler->fd = nsd->children[i].child_fd;
306 				break;
307 			case 0: /* CHILD */
308 				/* the child need not be able to access the
309 				 * nsd.db file */
310 				namedb_close_udb(nsd->db);
311 				nsd->pid = 0;
312 				nsd->child_count = 0;
313 				nsd->server_kind = nsd->children[i].kind;
314 				nsd->this_child = &nsd->children[i];
315 				nsd->this_child->child_num = i;
316 				/* remove signal flags inherited from parent
317 				   the parent will handle them. */
318 				nsd->signal_hint_reload_hup = 0;
319 				nsd->signal_hint_reload = 0;
320 				nsd->signal_hint_child = 0;
321 				nsd->signal_hint_quit = 0;
322 				nsd->signal_hint_shutdown = 0;
323 				nsd->signal_hint_stats = 0;
324 				nsd->signal_hint_statsusr = 0;
325 				close(*xfrd_sock_p);
326 				close(nsd->this_child->child_fd);
327 				nsd->this_child->child_fd = -1;
328 				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
329 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
330 				}
331 				server_child(nsd);
 332 				/* NOTREACHED */
333 				exit(0);
334 			case -1:
335 				log_msg(LOG_ERR, "fork failed: %s",
336 					strerror(errno));
337 				return -1;
338 			}
339 		}
340 	}
341 	return 0;
342 }
343 
344 #ifdef BIND8_STATS
345 static void set_bind8_alarm(struct nsd* nsd)
346 {
 347 	/* resync so that the next alarm fires on the next whole multiple of the stats period */
348 	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
349 		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
350 }
351 #endif
352 
353 /* set zone stat ids for zones initially read in */
354 static void
355 zonestatid_tree_set(struct nsd* nsd)
356 {
357 	struct radnode* n;
358 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
359 		zone_type* zone = (zone_type*)n->elem;
360 		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
361 	}
362 }
363 
364 #ifdef USE_ZONE_STATS
365 void
366 server_zonestat_alloc(struct nsd* nsd)
367 {
368 	size_t num = (nsd->options->zonestatnames->count==0?1:
369 			nsd->options->zonestatnames->count);
370 	size_t sz = sizeof(struct nsdst)*num;
371 	char tmpfile[256];
372 	uint8_t z = 0;
373 
374 	/* file names */
375 	nsd->zonestatfname[0] = 0;
376 	nsd->zonestatfname[1] = 0;
377 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
378 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
379 	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
380 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
381 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
382 	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);
383 
384 	/* file descriptors */
385 	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
386 	if(nsd->zonestatfd[0] == -1) {
387 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
388 			strerror(errno));
389 		exit(1);
390 	}
391 	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
 392 	if(nsd->zonestatfd[1] == -1) {
393 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
394 			strerror(errno));
395 		close(nsd->zonestatfd[0]);
396 		unlink(nsd->zonestatfname[0]);
397 		exit(1);
398 	}
399 
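	/*
	 * Extend both files to the mapped size: seek to the last byte and
	 * write a single zero byte, since accessing an mmap'd region beyond
	 * the end of the underlying file would fault.
	 */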
400 #ifdef HAVE_MMAP
401 	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
402 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
403 			strerror(errno));
404 		exit(1);
405 	}
406 	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
407 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
408 			nsd->zonestatfname[0], strerror(errno));
409 		exit(1);
410 	}
411 	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
412 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
413 			strerror(errno));
414 		exit(1);
415 	}
416 	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
417 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
418 			nsd->zonestatfname[1], strerror(errno));
419 		exit(1);
420 	}
421 	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
422 		MAP_SHARED, nsd->zonestatfd[0], 0);
423 	if(nsd->zonestat[0] == MAP_FAILED) {
424 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
425 		unlink(nsd->zonestatfname[0]);
426 		unlink(nsd->zonestatfname[1]);
427 		exit(1);
428 	}
429 	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
430 		MAP_SHARED, nsd->zonestatfd[1], 0);
431 	if(nsd->zonestat[1] == MAP_FAILED) {
432 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
433 		unlink(nsd->zonestatfname[0]);
434 		unlink(nsd->zonestatfname[1]);
435 		exit(1);
436 	}
437 	memset(nsd->zonestat[0], 0, sz);
438 	memset(nsd->zonestat[1], 0, sz);
439 	nsd->zonestatsize[0] = num;
440 	nsd->zonestatsize[1] = num;
441 	nsd->zonestatdesired = num;
442 	nsd->zonestatsizenow = num;
443 	nsd->zonestatnow = nsd->zonestat[0];
444 #endif /* HAVE_MMAP */
445 }
446 
447 void
448 zonestat_remap(struct nsd* nsd, int idx, size_t sz)
449 {
450 #ifdef HAVE_MMAP
451 #ifdef MREMAP_MAYMOVE
452 	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
453 		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
454 		MREMAP_MAYMOVE);
455 	if(nsd->zonestat[idx] == MAP_FAILED) {
456 		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
457 		exit(1);
458 	}
 459 #else /* !MREMAP_MAYMOVE */
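	/* without mremap: flush and unmap the old mapping, then map the
	 * file again at the new size */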
460 	if(msync(nsd->zonestat[idx],
461 		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
462 		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
463 	if(munmap(nsd->zonestat[idx],
464 		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
465 		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
466 	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
467 		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
468 	if(nsd->zonestat[idx] == MAP_FAILED) {
469 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
470 		exit(1);
471 	}
 472 #endif /* MREMAP_MAYMOVE */
473 #endif /* HAVE_MMAP */
474 }
475 
476 /* realloc the zonestat array for the one that is not currently in use,
477  * to match the desired new size of the array (if applicable) */
478 void
479 server_zonestat_realloc(struct nsd* nsd)
480 {
481 #ifdef HAVE_MMAP
482 	uint8_t z = 0;
483 	size_t sz;
484 	int idx = 0; /* index of the zonestat array that is not in use */
485 	if(nsd->zonestatnow == nsd->zonestat[0])
486 		idx = 1;
487 	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
488 		return;
489 	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
490 	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
491 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
492 			strerror(errno));
493 		exit(1);
494 	}
495 	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
496 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
497 			nsd->zonestatfname[idx], strerror(errno));
498 		exit(1);
499 	}
500 	zonestat_remap(nsd, idx, sz);
501 	/* zero the newly allocated region */
502 	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
503 		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
504 			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
505 			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
506 	}
507 	nsd->zonestatsize[idx] = nsd->zonestatdesired;
508 #endif /* HAVE_MMAP */
509 }
510 
 511 /* switch over to the other array for the new children, which briefly
 512  * coexist with the old children; this avoids both sets of children
 513  * writing to the same statistics array. */
514 void
515 server_zonestat_switch(struct nsd* nsd)
516 {
517 	if(nsd->zonestatnow == nsd->zonestat[0]) {
518 		nsd->zonestatnow = nsd->zonestat[1];
519 		nsd->zonestatsizenow = nsd->zonestatsize[1];
520 	} else {
521 		nsd->zonestatnow = nsd->zonestat[0];
522 		nsd->zonestatsizenow = nsd->zonestatsize[0];
523 	}
524 }
525 #endif /* USE_ZONE_STATS */
526 
527 static void
528 cleanup_dname_compression_tables(void *ptr)
529 {
530 	free(ptr);
531 	compressed_dname_offsets = NULL;
532 	compression_table_capacity = 0;
533 }
534 
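/*
 * (Re)size the dname compression offset table so it can hold an offset
 * for every domain in the database (plus the extra domain numbers), and
 * clear it for a fresh packet.
 */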
535 static void
536 initialize_dname_compression_tables(struct nsd *nsd)
537 {
538 	size_t needed = domain_table_count(nsd->db->domains) + 1;
539 	needed += EXTRA_DOMAIN_NUMBERS;
540 	if(compression_table_capacity < needed) {
541 		if(compressed_dname_offsets) {
542 			region_remove_cleanup(nsd->db->region,
543 				cleanup_dname_compression_tables,
544 				compressed_dname_offsets);
545 			free(compressed_dname_offsets);
546 		}
547 		compressed_dname_offsets = (uint16_t *) xmallocarray(
548 			needed, sizeof(uint16_t));
549 		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
550 			compressed_dname_offsets);
551 		compression_table_capacity = needed;
552 		compression_table_size=domain_table_count(nsd->db->domains)+1;
553 	}
554 	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
555 	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
556 }
557 
558 /* create and bind sockets.  */
559 static int
560 server_init_ifs(struct nsd *nsd, size_t from, size_t to, int* reuseport_works)
561 {
562 	struct addrinfo* addr;
563 	size_t i;
564 #if defined(SO_REUSEPORT) || defined(SO_REUSEADDR) || (defined(INET6) && (defined(IPV6_V6ONLY) || defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU) || defined(IP_TRANSPARENT)) || defined(IP_FREEBIND) || defined(SO_BINDANY))
565 	int on = 1;
566 #endif
567 
568 	/* UDP */
569 
570 	/* Make a socket... */
571 	for (i = from; i < to; i++) {
 572 		/* for reuseport, copy the socket specs of the first entries */
573 		addr = nsd->udp[i%nsd->ifs].addr;
574 		if (!addr) {
575 			nsd->udp[i].s = -1;
576 			continue;
577 		}
578 		nsd->udp[i].fam = (int)addr->ai_family;
579 		if ((nsd->udp[i].s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
580 #if defined(INET6)
581 			if (addr->ai_family == AF_INET6 &&
582 				errno == EAFNOSUPPORT && nsd->grab_ip6_optional) {
583 				log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: not supported");
584 				continue;
585 			}
586 #endif /* INET6 */
587 			log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
588 			return -1;
589 		}
590 
591 #ifdef SO_REUSEPORT
592 		if(nsd->reuseport && *reuseport_works &&
593 			setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_REUSEPORT,
594 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
595 			if(verbosity >= 3
596 #ifdef ENOPROTOOPT
597 				|| errno != ENOPROTOOPT
598 #endif
599 				)
600 			    log_msg(LOG_ERR, "setsockopt(..., SO_REUSEPORT, "
601 				"...) failed: %s", strerror(errno));
602 			*reuseport_works = 0;
603 		}
604 #else
605 		(void)reuseport_works;
606 #endif /* SO_REUSEPORT */
607 #if defined(SO_RCVBUF) || defined(SO_SNDBUF)
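	/* enlarge the kernel socket buffers to 1MB; on Linux the *BUFFORCE
	 * variants (which can exceed the rmem/wmem limits, but need
	 * privileges) are tried first, with plain SO_RCVBUF/SO_SNDBUF as
	 * the fallback via the if(1) scaffolding below */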
608 	if(1) {
609 	int rcv = 1*1024*1024;
610 	int snd = 1*1024*1024;
611 
612 #ifdef SO_RCVBUF
613 #  ifdef SO_RCVBUFFORCE
614 	if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv,
615 		(socklen_t)sizeof(rcv)) < 0) {
616 		if(errno != EPERM && errno != ENOBUFS) {
617 			log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, "
618                                         "...) failed: %s", strerror(errno));
619 			return -1;
620 		}
621 #  else
622 	if(1) {
623 #  endif /* SO_RCVBUFFORCE */
624 		if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv,
625 			 (socklen_t)sizeof(rcv)) < 0) {
626 			if(errno != ENOBUFS && errno != ENOSYS) {
627 				log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, "
628                                         "...) failed: %s", strerror(errno));
629 				return -1;
630 			}
631 		}
632 	}
633 #endif /* SO_RCVBUF */
634 
635 #ifdef SO_SNDBUF
636 #  ifdef SO_SNDBUFFORCE
637 	if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd,
638 		(socklen_t)sizeof(snd)) < 0) {
639 		if(errno != EPERM && errno != ENOBUFS) {
640 			log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, "
641                                         "...) failed: %s", strerror(errno));
642 			return -1;
643 		}
644 #  else
645 	if(1) {
646 #  endif /* SO_SNDBUFFORCE */
647 		if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_SNDBUF, (void*)&snd,
648 			 (socklen_t)sizeof(snd)) < 0) {
649 			if(errno != ENOBUFS && errno != ENOSYS) {
650 				log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, "
651                                         "...) failed: %s", strerror(errno));
652 				return -1;
653 			}
654 		}
655 	}
656 #endif /* SO_SNDBUF */
657 
658 	}
659 #endif /* defined(SO_RCVBUF) || defined(SO_SNDBUF) */
660 
661 #if defined(INET6)
662 		if (addr->ai_family == AF_INET6) {
663 # if defined(IPV6_V6ONLY)
664 			if (setsockopt(nsd->udp[i].s,
665 				       IPPROTO_IPV6, IPV6_V6ONLY,
666 				       &on, sizeof(on)) < 0)
667 			{
668 				log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
669 					strerror(errno));
670 				return -1;
671 			}
672 # endif
673 # if defined(IPV6_USE_MIN_MTU)
674 			/*
675 			 * There is no fragmentation of IPv6 datagrams
676 			 * during forwarding in the network. Therefore
677 			 * we do not send UDP datagrams larger than
678 			 * the minimum IPv6 MTU of 1280 octets. The
679 			 * EDNS0 message length can be larger if the
680 			 * network stack supports IPV6_USE_MIN_MTU.
681 			 */
682 			if (setsockopt(nsd->udp[i].s,
683 				       IPPROTO_IPV6, IPV6_USE_MIN_MTU,
684 				       &on, sizeof(on)) < 0)
685 			{
686 				log_msg(LOG_ERR, "setsockopt(..., IPV6_USE_MIN_MTU, ...) failed: %s",
687 					strerror(errno));
688 				return -1;
689 			}
690 # elif defined(IPV6_MTU)
691 			/*
692 			 * On Linux, PMTUD is disabled by default for datagrams
693 			 * so set the MTU equal to the MIN MTU to get the same.
694 			 */
695 			on = IPV6_MIN_MTU;
696 			if (setsockopt(nsd->udp[i].s, IPPROTO_IPV6, IPV6_MTU,
697 				&on, sizeof(on)) < 0)
698 			{
699 				log_msg(LOG_ERR, "setsockopt(..., IPV6_MTU, ...) failed: %s",
700 					strerror(errno));
701 				return -1;
702 			}
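			/* reset on to 1 for the setsockopt calls below */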
703 			on = 1;
704 # endif
705 		}
706 #endif
707 #if defined(AF_INET)
708 		if (addr->ai_family == AF_INET) {
709 #  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
710 			int action = IP_PMTUDISC_DONT;
711 			if (setsockopt(nsd->udp[i].s, IPPROTO_IP,
712 				IP_MTU_DISCOVER, &action, sizeof(action)) < 0)
713 			{
714 				log_msg(LOG_ERR, "setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
715 					strerror(errno));
716 				return -1;
717 			}
718 #  elif defined(IP_DONTFRAG)
719 			int off = 0;
720 			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_DONTFRAG,
721 				&off, sizeof(off)) < 0)
722 			{
723 				log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
724 					strerror(errno));
725 				return -1;
726 			}
727 #  endif
728 		}
729 #endif
730 		/* set it nonblocking */
731 		/* otherwise, on OSes with thundering herd problems, the
732 		   UDP recv could block NSD after select returns readable. */
733 		if (fcntl(nsd->udp[i].s, F_SETFL, O_NONBLOCK) == -1) {
734 			log_msg(LOG_ERR, "cannot fcntl udp: %s", strerror(errno));
735 		}
736 
737 		/* Bind it... */
738 		if (nsd->options->ip_freebind) {
739 #ifdef IP_FREEBIND
740 			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) < 0) {
741 				log_msg(LOG_ERR, "setsockopt(...,IP_FREEBIND, ...) failed for udp: %s",
742 					strerror(errno));
743 			}
744 #endif /* IP_FREEBIND */
745 		}
746 
747 		if (nsd->options->ip_transparent) {
748 #ifdef IP_TRANSPARENT
749 			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)) < 0) {
750 				log_msg(LOG_ERR, "setsockopt(...,IP_TRANSPARENT, ...) failed for udp: %s",
751 					strerror(errno));
752 			}
753 #endif /* IP_TRANSPARENT */
754 #ifdef SO_BINDANY
755 			if (setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_BINDANY, &on, sizeof(on)) < 0) {
756 				log_msg(LOG_ERR, "setsockopt(...,SO_BINDANY, ...) failed for udp: %s",
757 					strerror(errno));
758 			}
759 #endif /* SO_BINDANY */
760 		}
761 
762 		if (bind(nsd->udp[i].s, (struct sockaddr *) addr->ai_addr, addr->ai_addrlen) != 0) {
763 			log_msg(LOG_ERR, "can't bind udp socket: %s", strerror(errno));
764 			return -1;
765 		}
766 	}
767 
768 	/* TCP */
769 
770 	/* Make a socket... */
771 	for (i = from; i < to; i++) {
 772 		/* for reuseport, copy the socket specs of the first entries */
773 		addr = nsd->tcp[i%nsd->ifs].addr;
774 		if (!addr) {
775 			nsd->tcp[i].s = -1;
776 			continue;
777 		}
778 		nsd->tcp[i].fam = (int)addr->ai_family;
 779 		/* turn off REUSEPORT for TCP: extra entries share the listening socket fd of the first entries */
780 		if(i >= nsd->ifs) {
781 			nsd->tcp[i].s = nsd->tcp[i%nsd->ifs].s;
782 			continue;
783 		}
784 		if ((nsd->tcp[i].s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
785 #if defined(INET6)
786 			if (addr->ai_family == AF_INET6 &&
787 				errno == EAFNOSUPPORT && nsd->grab_ip6_optional) {
788 				log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: not supported");
789 				continue;
790 			}
791 #endif /* INET6 */
792 			log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
793 			return -1;
794 		}
795 
796 #ifdef SO_REUSEPORT
797 		if(nsd->reuseport && *reuseport_works &&
798 			setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_REUSEPORT,
799 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
800 			if(verbosity >= 3
801 #ifdef ENOPROTOOPT
802 				|| errno != ENOPROTOOPT
803 #endif
804 				)
805 			    log_msg(LOG_ERR, "setsockopt(..., SO_REUSEPORT, "
806 				"...) failed: %s", strerror(errno));
807 			*reuseport_works = 0;
808 		}
809 #endif /* SO_REUSEPORT */
810 #ifdef	SO_REUSEADDR
811 		if (setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0) {
812 			log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s", strerror(errno));
813 		}
814 #endif /* SO_REUSEADDR */
815 
816 #if defined(INET6)
817 		if (addr->ai_family == AF_INET6) {
818 # if defined(IPV6_V6ONLY)
819 			if (setsockopt(nsd->tcp[i].s, IPPROTO_IPV6, IPV6_V6ONLY,
820 				&on, sizeof(on)) < 0) {
821 				log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed: %s", strerror(errno));
822 				return -1;
823 			}
824 # endif
825 # if defined(IPV6_USE_MIN_MTU)
826 			/*
827 			 * Use minimum MTU to minimize delays learning working
828 			 * PMTU when communicating through a tunnel.
829 			 */
830 			if (setsockopt(nsd->tcp[i].s,
831 				       IPPROTO_IPV6, IPV6_USE_MIN_MTU,
832 				       &on, sizeof(on)) < 0) {
833 				log_msg(LOG_ERR, "setsockopt(..., IPV6_USE_MIN_MTU, ...) failed: %s", strerror(errno));
834 				return -1;
835 			}
836 # elif defined(IPV6_MTU)
837 			/*
838 			 * On Linux, PMTUD is disabled by default for datagrams
839 			 * so set the MTU equal to the MIN MTU to get the same.
840 			 */
841 			on = IPV6_MIN_MTU;
842 			if (setsockopt(nsd->tcp[i].s, IPPROTO_IPV6, IPV6_MTU,
843 				&on, sizeof(on)) < 0) {
844 				log_msg(LOG_ERR, "setsockopt(..., IPV6_MTU, ...) failed: %s", strerror(errno));
845 				return -1;
846 			}
847 			on = 1;
848 # endif
849 		}
850 #endif
851 		/* set maximum segment size to tcp socket */
852 		if(nsd->tcp_mss > 0) {
853 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
854 			if(setsockopt(nsd->tcp[i].s, IPPROTO_TCP, TCP_MAXSEG,
855 					(void*)&nsd->tcp_mss,
856 					sizeof(nsd->tcp_mss)) < 0) {
857 				log_msg(LOG_ERR,
858 					"setsockopt(...,TCP_MAXSEG,...)"
859 					" failed for tcp: %s", strerror(errno));
860 			}
861 #else
862 			log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
863 #endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
864 		}
865 
866 		/* set it nonblocking */
 867 		/* (Stevens UNP p463): if the tcp listening socket is blocking,
 868 		   accept may block even if select() says the socket is readable. */
869 		if (fcntl(nsd->tcp[i].s, F_SETFL, O_NONBLOCK) == -1) {
870 			log_msg(LOG_ERR, "cannot fcntl tcp: %s", strerror(errno));
871 		}
872 
873 		/* Bind it... */
874 		if (nsd->options->ip_freebind) {
875 #ifdef IP_FREEBIND
876 			if (setsockopt(nsd->tcp[i].s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) < 0) {
877 				log_msg(LOG_ERR, "setsockopt(...,IP_FREEBIND, ...) failed for tcp: %s",
878 					strerror(errno));
879 			}
880 #endif /* IP_FREEBIND */
881 		}
882 
883 		if (nsd->options->ip_transparent) {
884 #ifdef IP_TRANSPARENT
885 			if (setsockopt(nsd->tcp[i].s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)) < 0) {
886 				log_msg(LOG_ERR, "setsockopt(...,IP_TRANSPARENT, ...) failed for tcp: %s",
887 					strerror(errno));
888 			}
889 #endif /* IP_TRANSPARENT */
890 #ifdef SO_BINDANY
891 			if (setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_BINDANY, &on, sizeof(on)) < 0) {
892 				log_msg(LOG_ERR, "setsockopt(...,SO_BINDANY, ...) failed for tcp: %s",
893 					strerror(errno));
894 			}
895 #endif /* SO_BINDANY */
896 		}
897 
898 		if (bind(nsd->tcp[i].s, (struct sockaddr *) addr->ai_addr, addr->ai_addrlen) != 0) {
899 			log_msg(LOG_ERR, "can't bind tcp socket: %s", strerror(errno));
900 			return -1;
901 		}
902 
903 		/* Listen to it... */
904 		if (listen(nsd->tcp[i].s, TCP_BACKLOG) == -1) {
905 			log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
906 			return -1;
907 		}
908 	}
909 
910 	return 0;
911 }
912 
913 /*
914  * Initialize the server, reuseport, create and bind the sockets.
915  */
916 int
917 server_init(struct nsd *nsd)
918 {
919 	int reuseport_successful = 1; /* see if reuseport works in OS */
920 	if(nsd->reuseport) {
921 		/* increase the size of the udp and tcp interface arrays,
922 		 * there are going to be separate interface file descriptors
923 		 * for every server instance */
924 		nsd->udp = xrealloc(nsd->udp, (nsd->ifs*nsd->reuseport)*
925 			sizeof(*nsd->udp));
926 		nsd->tcp = xrealloc(nsd->tcp, (nsd->ifs*nsd->reuseport)*
927 			sizeof(*nsd->tcp));
928 		memset(&nsd->udp[nsd->ifs], 0, sizeof(*nsd->udp)*
929 			(nsd->ifs*(nsd->reuseport-1)));
930 		memset(&nsd->tcp[nsd->ifs], 0, sizeof(*nsd->tcp)*
931 			(nsd->ifs*(nsd->reuseport-1)));
932 	}
933 
934 	/* open the server interface ports */
935 	if(server_init_ifs(nsd, 0, nsd->ifs, &reuseport_successful) == -1)
936 		return -1;
937 
938 	/* continue to open the remaining reuseport ports */
939 	if(nsd->reuseport && reuseport_successful) {
940 		if(server_init_ifs(nsd, nsd->ifs, nsd->ifs*nsd->reuseport,
941 			&reuseport_successful) == -1)
942 			return -1;
943 		nsd->ifs *= nsd->reuseport;
944 	} else {
945 		nsd->reuseport = 0;
946 	}
947 	return 0;
948 }
949 
950 /*
 951  * Prepare the server for takeoff.
952  *
953  */
954 int
955 server_prepare(struct nsd *nsd)
956 {
957 #ifdef RATELIMIT
958 	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
959 #ifdef HAVE_ARC4RANDOM
960 	hash_set_raninit(arc4random());
961 #else
962 	uint32_t v = getpid() ^ time(NULL);
963 	srandom((unsigned long)v);
964 	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
965 		hash_set_raninit(v);
966 	else	hash_set_raninit(random());
967 #endif
968 	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
969 		nsd->options->rrl_ratelimit,
970 		nsd->options->rrl_whitelist_ratelimit,
971 		nsd->options->rrl_slip,
972 		nsd->options->rrl_ipv4_prefix_length,
973 		nsd->options->rrl_ipv6_prefix_length);
974 #endif /* RATELIMIT */
975 
976 	/* Open the database... */
977 	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
978 		log_msg(LOG_ERR, "unable to open the database %s: %s",
979 			nsd->dbfile, strerror(errno));
980 		unlink(nsd->task[0]->fname);
981 		unlink(nsd->task[1]->fname);
982 #ifdef USE_ZONE_STATS
983 		unlink(nsd->zonestatfname[0]);
984 		unlink(nsd->zonestatfname[1]);
985 #endif
986 		xfrd_del_tempdir(nsd);
987 		return -1;
988 	}
989 	/* check if zone files have been modified */
990 	/* NULL for taskudb because we send soainfo in a moment, batched up,
991 	 * for all zones */
992 	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
993 		nsd->options->database[0] == 0))
994 		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
995 	zonestatid_tree_set(nsd);
996 
997 	compression_table_capacity = 0;
998 	initialize_dname_compression_tables(nsd);
999 
1000 #ifdef	BIND8_STATS
1001 	/* Initialize times... */
1002 	time(&nsd->st.boot);
1003 	set_bind8_alarm(nsd);
1004 #endif /* BIND8_STATS */
1005 
1006 	return 0;
1007 }
1008 
1009 /*
1010  * Fork the required number of servers.
1011  */
1012 static int
1013 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
1014 	int* xfrd_sock_p)
1015 {
1016 	size_t i;
1017 
1018 	/* Start all child servers initially.  */
1019 	for (i = 0; i < nsd->child_count; ++i) {
1020 		nsd->children[i].pid = 0;
1021 	}
1022 
1023 	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
1024 }
1025 
1026 void
1027 server_close_all_sockets(struct nsd_socket sockets[], size_t n)
1028 {
1029 	size_t i;
1030 
1031 	/* Close all the sockets... */
1032 	for (i = 0; i < n; ++i) {
1033 		if (sockets[i].s != -1) {
1034 			close(sockets[i].s);
1035 			if(sockets[i].addr)
1036 				freeaddrinfo(sockets[i].addr);
1037 			sockets[i].s = -1;
1038 		}
1039 	}
1040 }
1041 
1042 /*
 1043  * Close the sockets, shut down the server and exit.
1044  * Does not return.
1045  *
1046  */
1047 void
1048 server_shutdown(struct nsd *nsd)
1049 {
1050 	size_t i;
1051 
1052 	server_close_all_sockets(nsd->udp, nsd->ifs);
1053 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1054 	/* CHILD: close command channel to parent */
1055 	if(nsd->this_child && nsd->this_child->parent_fd != -1)
1056 	{
1057 		close(nsd->this_child->parent_fd);
1058 		nsd->this_child->parent_fd = -1;
1059 	}
1060 	/* SERVER: close command channels to children */
1061 	if(!nsd->this_child)
1062 	{
1063 		for(i=0; i < nsd->child_count; ++i)
1064 			if(nsd->children[i].child_fd != -1)
1065 			{
1066 				close(nsd->children[i].child_fd);
1067 				nsd->children[i].child_fd = -1;
1068 			}
1069 	}
1070 
1071 	tsig_finalize();
1072 #ifdef HAVE_SSL
1073 	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
1074 #endif
1075 
1076 #if 0 /* OS collects memory pages */
1077 	nsd_options_destroy(nsd->options);
1078 	region_destroy(nsd->region);
1079 #endif
1080 	log_finalize();
1081 	exit(0);
1082 }
1083 
1084 void
1085 server_prepare_xfrd(struct nsd* nsd)
1086 {
1087 	char tmpfile[256];
1088 	/* create task mmaps */
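	/* the two task files form a double buffer: the reload process uses
	 * nsd->task[nsd->mytask] and xfrd uses the other; the roles swap
	 * on reload */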
1089 	nsd->mytask = 0;
1090 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
1091 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1092 	nsd->task[0] = task_file_create(tmpfile);
1093 	if(!nsd->task[0]) {
1094 #ifdef USE_ZONE_STATS
1095 		unlink(nsd->zonestatfname[0]);
1096 		unlink(nsd->zonestatfname[1]);
1097 #endif
1098 		xfrd_del_tempdir(nsd);
1099 		exit(1);
1100 	}
1101 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
1102 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1103 	nsd->task[1] = task_file_create(tmpfile);
1104 	if(!nsd->task[1]) {
1105 		unlink(nsd->task[0]->fname);
1106 #ifdef USE_ZONE_STATS
1107 		unlink(nsd->zonestatfname[0]);
1108 		unlink(nsd->zonestatfname[1]);
1109 #endif
1110 		xfrd_del_tempdir(nsd);
1111 		exit(1);
1112 	}
1113 	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
1114 	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
1115 	/* create xfrd listener structure */
1116 	nsd->xfrd_listener = region_alloc(nsd->region,
1117 		sizeof(netio_handler_type));
1118 	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
1119 		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
1120 	nsd->xfrd_listener->fd = -1;
1121 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
1122 		nsd;
1123 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
1124 		xfrd_tcp_create(nsd->region, QIOBUFSZ);
1125 }
1126 
1127 
1128 void
1129 server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
1130 {
1131 	pid_t pid;
1132 	int sockets[2] = {0,0};
1133 	struct ipc_handler_conn_data *data;
1134 
1135 	if(nsd->xfrd_listener->fd != -1)
1136 		close(nsd->xfrd_listener->fd);
1137 	if(del_db) {
1138 		/* recreate taskdb that xfrd was using, it may be corrupt */
1139 		/* we (or reload) use nsd->mytask, and xfrd uses the other */
1140 		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
1141 		nsd->task[1-nsd->mytask]->fname = NULL;
1142 		/* free alloc already, so udb does not shrink itself */
1143 		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
1144 		nsd->task[1-nsd->mytask]->alloc = NULL;
1145 		udb_base_free(nsd->task[1-nsd->mytask]);
1146 		/* create new file, overwrite the old one */
1147 		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
1148 		free(tmpfile);
1149 	}
1150 	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
1151 		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
1152 		return;
1153 	}
1154 	pid = fork();
1155 	switch (pid) {
1156 	case -1:
1157 		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
1158 		break;
1159 	default:
1160 		/* PARENT: close first socket, use second one */
1161 		close(sockets[0]);
1162 		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
1163 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1164 		}
1165 		if(del_db) xfrd_free_namedb(nsd);
1166 		/* use other task than I am using, since if xfrd died and is
1167 		 * restarted, the reload is using nsd->mytask */
1168 		nsd->mytask = 1 - nsd->mytask;
1169 		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
 1170 		/* NOTREACHED */
1171 		break;
1172 	case 0:
1173 		/* CHILD: close second socket, use first one */
1174 		close(sockets[1]);
1175 		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
1176 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1177 		}
1178 		nsd->xfrd_listener->fd = sockets[0];
1179 		break;
1180 	}
1181 	/* server-parent only */
1182 	nsd->xfrd_listener->timeout = NULL;
1183 	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
1184 	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
1185 	/* clear ongoing ipc reads */
1186 	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
1187 	data->conn->is_reading = 0;
1188 }
1189 
1190 /** add all soainfo to taskdb */
1191 static void
1192 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
1193 {
1194 	struct radnode* n;
1195 	udb_ptr task_last; /* last task, mytask is empty so NULL */
1196 	/* add all SOA INFO to mytask */
1197 	udb_ptr_init(&task_last, taskudb);
1198 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
1199 		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
1200 	}
1201 	udb_ptr_unlink(&task_last, taskudb);
1202 }
1203 
1204 void
1205 server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
1206 {
1207 	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
1208 	 *   parent fills one taskdb with soas, xfrd fills other with expires.
1209 	 *   then they exchange and process.
1210 	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
1211 	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
1212 	 *   expire notifications can be sent back via a normal reload later
1213 	 *   (xfrd will wait for current running reload to finish if any).
1214 	 */
1215 	sig_atomic_t cmd = 0;
1216 	pid_t mypid;
1217 	int xfrd_sock = nsd->xfrd_listener->fd;
1218 	struct udb_base* taskudb = nsd->task[nsd->mytask];
1219 	udb_ptr t;
1220 	if(!shortsoa) {
1221 		if(nsd->signal_hint_shutdown) {
1222 		shutdown:
1223 			log_msg(LOG_WARNING, "signal received, shutting down...");
1224 			server_close_all_sockets(nsd->udp, nsd->ifs);
1225 			server_close_all_sockets(nsd->tcp, nsd->ifs);
1226 #ifdef HAVE_SSL
1227 			daemon_remote_close(nsd->rc);
1228 #endif
1229 			/* Unlink it if possible... */
1230 			unlinkpid(nsd->pidfile);
1231 			unlink(nsd->task[0]->fname);
1232 			unlink(nsd->task[1]->fname);
1233 #ifdef USE_ZONE_STATS
1234 			unlink(nsd->zonestatfname[0]);
1235 			unlink(nsd->zonestatfname[1]);
1236 #endif
1237 			/* write the nsd.db to disk, wait for it to complete */
1238 			udb_base_sync(nsd->db->udb, 1);
1239 			udb_base_close(nsd->db->udb);
1240 			server_shutdown(nsd);
1241 			exit(0);
1242 		}
1243 	}
1244 	if(shortsoa) {
1245 		/* put SOA in xfrd task because mytask may be in use */
1246 		taskudb = nsd->task[1-nsd->mytask];
1247 	}
1248 
1249 	add_all_soa_to_task(nsd, taskudb);
1250 	if(!shortsoa) {
1251 		/* wait for xfrd to signal task is ready, RELOAD signal */
1252 		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
1253 			cmd != NSD_RELOAD) {
1254 			log_msg(LOG_ERR, "did not get start signal from xfrd");
1255 			exit(1);
1256 		}
1257 		if(nsd->signal_hint_shutdown) {
1258 			goto shutdown;
1259 		}
1260 	}
1261 	/* give xfrd our task, signal it with RELOAD_DONE */
1262 	task_process_sync(taskudb);
1263 	cmd = NSD_RELOAD_DONE;
1264 	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1265 		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1266 			(int)nsd->pid, strerror(errno));
1267 	}
1268 	mypid = getpid();
1269 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1270 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1271 			strerror(errno));
1272 	}
1273 
1274 	if(!shortsoa) {
1275 		/* process the xfrd task works (expiry data) */
1276 		nsd->mytask = 1 - nsd->mytask;
1277 		taskudb = nsd->task[nsd->mytask];
1278 		task_remap(taskudb);
1279 		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
1280 		while(!udb_ptr_is_null(&t)) {
1281 			task_process_expire(nsd->db, TASKLIST(&t));
1282 			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
1283 		}
1284 		udb_ptr_unlink(&t, taskudb);
1285 		task_clear(taskudb);
1286 
1287 		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
1288 		cmd = NSD_RELOAD_DONE;
1289 		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1290 			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1291 				(int)nsd->pid, strerror(errno));
1292 		}
1293 	}
1294 }
1295 
1296 /* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
1297 ssize_t
1298 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
1299 {
1300 	uint8_t* buf = (uint8_t*) p;
1301 	ssize_t total = 0;
1302 	struct pollfd fd;
1303 	memset(&fd, 0, sizeof(fd));
1304 	fd.fd = s;
1305 	fd.events = POLLIN;
1306 
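	/* the descriptor may be nonblocking; poll for readability before
	 * each read so this behaves like a blocking read, with an optional
	 * timeout */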
1307 	while( total < sz) {
1308 		ssize_t ret;
1309 		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
1310 		if(ret == -1) {
1311 			if(errno == EAGAIN)
1312 				/* blocking read */
1313 				continue;
1314 			if(errno == EINTR) {
1315 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
1316 					return -1;
1317 				/* other signals can be handled later */
1318 				continue;
1319 			}
1320 			/* some error */
1321 			return -1;
1322 		}
1323 		if(ret == 0) {
1324 			/* operation timed out */
1325 			return -2;
1326 		}
1327 		ret = read(s, buf+total, sz-total);
1328 		if(ret == -1) {
1329 			if(errno == EAGAIN)
1330 				/* blocking read */
1331 				continue;
1332 			if(errno == EINTR) {
1333 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
1334 					return -1;
1335 				/* other signals can be handled later */
1336 				continue;
1337 			}
1338 			/* some error */
1339 			return -1;
1340 		}
1341 		if(ret == 0) {
1342 			/* closed connection! */
1343 			return 0;
1344 		}
1345 		total += ret;
1346 	}
1347 	return total;
1348 }
1349 
1350 static void
1351 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
1352 {
1353 	sig_atomic_t cmd = NSD_QUIT_SYNC;
1354 	udb_ptr t, next;
1355 	udb_base* u = nsd->task[nsd->mytask];
1356 	udb_ptr_init(&next, u);
1357 	udb_ptr_new(&t, u, udb_base_get_userdata(u));
1358 	udb_base_set_userdata(u, 0);
1359 	while(!udb_ptr_is_null(&t)) {
1360 		/* store next in list so this one can be deleted or reused */
1361 		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
1362 		udb_rptr_zero(&TASKLIST(&t)->next, u);
1363 
1364 		/* process task t */
1365 		/* append results for task t and update last_task */
1366 		task_process_in_reload(nsd, u, last_task, &t);
1367 
1368 		/* go to next */
1369 		udb_ptr_set_ptr(&t, u, &next);
1370 
1371 		/* if the parent has quit, we must quit too, poll the fd for cmds */
1372 		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
1373 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
1374 			if(cmd == NSD_QUIT) {
1375 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
1376 				/* sync to disk (if needed) */
1377 				udb_base_sync(nsd->db->udb, 0);
1378 				/* unlink files of remainder of tasks */
1379 				while(!udb_ptr_is_null(&t)) {
1380 					if(TASKLIST(&t)->task_type == task_apply_xfr) {
1381 						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
1382 					}
1383 					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
1384 				}
1385 				udb_ptr_unlink(&t, u);
1386 				udb_ptr_unlink(&next, u);
1387 				exit(0);
1388 			}
1389 		}
1390 
1391 	}
1392 	udb_ptr_unlink(&t, u);
1393 	udb_ptr_unlink(&next, u);
1394 }
1395 
1396 #ifdef BIND8_STATS
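/* stats IPC format on the command socket: one struct nsdst, followed by
 * one stc_type query count per child */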
1397 static void
1398 parent_send_stats(struct nsd* nsd, int cmdfd)
1399 {
1400 	size_t i;
1401 	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
1402 		log_msg(LOG_ERR, "could not write stats to reload");
1403 		return;
1404 	}
1405 	for(i=0; i<nsd->child_count; i++)
1406 		if(!write_socket(cmdfd, &nsd->children[i].query_count,
1407 			sizeof(stc_type))) {
1408 			log_msg(LOG_ERR, "could not write stats to reload");
1409 			return;
1410 		}
1411 }
1412 
1413 static void
1414 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
1415 {
1416 	struct nsdst s;
1417 	stc_type* p;
1418 	size_t i;
1419 	if(block_read(nsd, cmdfd, &s, sizeof(s),
1420 		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
1421 		log_msg(LOG_ERR, "could not read stats from oldpar");
1422 		return;
1423 	}
1424 	s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
1425 	s.db_mem = region_get_mem(nsd->db->region);
1426 	p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
1427 		nsd->child_count);
1428 	if(!p) return;
1429 	for(i=0; i<nsd->child_count; i++) {
1430 		if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!=
1431 			sizeof(stc_type))
1432 			return;
1433 	}
1434 }
1435 #endif /* BIND8_STATS */
1436 
1437 /*
 1438  * Reload the database, stop the parent, re-fork the children and
 1439  * continue as server_main.
1440  */
1441 static void
1442 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
1443 	int cmdsocket)
1444 {
1445 	pid_t mypid;
1446 	sig_atomic_t cmd = NSD_QUIT_SYNC;
1447 	int ret;
1448 	udb_ptr last_task;
1449 	struct sigaction old_sigchld, ign_sigchld;
1450 	/* ignore SIGCHLD from the previous server_main that used this pid */
1451 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
1452 	ign_sigchld.sa_handler = SIG_IGN;
1453 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
1454 
1455 	/* see what tasks we got from xfrd */
1456 	task_remap(nsd->task[nsd->mytask]);
1457 	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
1458 	udb_compact_inhibited(nsd->db->udb, 1);
1459 	reload_process_tasks(nsd, &last_task, cmdsocket);
1460 	udb_compact_inhibited(nsd->db->udb, 0);
1461 	udb_compact(nsd->db->udb);
1462 
1463 #ifndef NDEBUG
1464 	if(nsd_debug_level >= 1)
1465 		region_log_stats(nsd->db->region);
1466 #endif /* NDEBUG */
1467 	/* sync to disk (if needed) */
1468 	udb_base_sync(nsd->db->udb, 0);
1469 
1470 	initialize_dname_compression_tables(nsd);
1471 
1472 #ifdef BIND8_STATS
1473 	/* Restart dumping stats if required.  */
1474 	time(&nsd->st.boot);
1475 	set_bind8_alarm(nsd);
1476 #endif
1477 #ifdef USE_ZONE_STATS
1478 	server_zonestat_realloc(nsd); /* realloc for new children */
1479 	server_zonestat_switch(nsd);
1480 #endif
1481 
1482 	/* listen for the signals of failed children again */
1483 	sigaction(SIGCHLD, &old_sigchld, NULL);
1484 	/* Start new child processes */
1485 	if (server_start_children(nsd, server_region, netio, &nsd->
1486 		xfrd_listener->fd) != 0) {
1487 		send_children_quit(nsd);
1488 		exit(1);
1489 	}
1490 
1491 	/* if the parent has quit, we must quit too, poll the fd for cmds */
1492 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
1493 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
1494 		if(cmd == NSD_QUIT) {
1495 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
1496 			send_children_quit(nsd);
1497 			exit(0);
1498 		}
1499 	}
1500 
1501 	/* Send quit command to parent: blocking, wait for receipt. */
1502 	do {
1503 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
1504 		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
1505 		{
1506 			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
1507 				strerror(errno));
1508 		}
1509 		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
1510 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
1511 		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
1512 			RELOAD_SYNC_TIMEOUT);
1513 		if(ret == -2) {
1514 			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
1515 		}
1516 	} while (ret == -2);
1517 	if(ret == -1) {
1518 		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
1519 			strerror(errno));
1520 	}
1521 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
1522 	if(cmd == NSD_QUIT) {
1523 		/* small race condition possible here, parent got quit cmd. */
1524 		send_children_quit(nsd);
1525 		exit(1);
1526 	}
1527 	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
1528 #ifdef BIND8_STATS
1529 	reload_do_stats(cmdsocket, nsd, &last_task);
1530 #endif
1531 	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
1532 	task_process_sync(nsd->task[nsd->mytask]);
1533 #ifdef USE_ZONE_STATS
1534 	server_zonestat_realloc(nsd); /* realloc for next children */
1535 #endif
1536 
1537 	/* send soainfo to the xfrd process, signal it that reload is done,
1538 	 * it picks up the taskudb */
1539 	cmd = NSD_RELOAD_DONE;
1540 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
1541 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
1542 			strerror(errno));
1543 	}
1544 	mypid = getpid();
1545 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1546 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1547 			strerror(errno));
1548 	}
1549 
1550 	/* try to reopen file */
1551 	if (nsd->file_rotation_ok)
1552 		log_reopen(nsd->log_filename, 1);
1553 	/* exit reload, continue as new server_main */
1554 }
1555 
1556 /*
1557  * Get the mode depending on the signal hints that have been received.
1558  * Multiple signal hints can be received and will be handled in turn.
1559  */
1560 static sig_atomic_t
1561 server_signal_mode(struct nsd *nsd)
1562 {
1563 	if(nsd->signal_hint_quit) {
1564 		nsd->signal_hint_quit = 0;
1565 		return NSD_QUIT;
1566 	}
1567 	else if(nsd->signal_hint_shutdown) {
1568 		nsd->signal_hint_shutdown = 0;
1569 		return NSD_SHUTDOWN;
1570 	}
1571 	else if(nsd->signal_hint_child) {
1572 		nsd->signal_hint_child = 0;
1573 		return NSD_REAP_CHILDREN;
1574 	}
1575 	else if(nsd->signal_hint_reload) {
1576 		nsd->signal_hint_reload = 0;
1577 		return NSD_RELOAD;
1578 	}
1579 	else if(nsd->signal_hint_reload_hup) {
1580 		nsd->signal_hint_reload_hup = 0;
1581 		return NSD_RELOAD_REQ;
1582 	}
1583 	else if(nsd->signal_hint_stats) {
1584 		nsd->signal_hint_stats = 0;
1585 #ifdef BIND8_STATS
1586 		set_bind8_alarm(nsd);
1587 #endif
1588 		return NSD_STATS;
1589 	}
1590 	else if(nsd->signal_hint_statsusr) {
1591 		nsd->signal_hint_statsusr = 0;
1592 		return NSD_STATS;
1593 	}
1594 	return NSD_RUN;
1595 }
1596 
1597 /*
1598  * The main server simply waits for signals and child processes to
1599  * terminate.  Child processes are restarted as necessary.
1600  */
1601 void
1602 server_main(struct nsd *nsd)
1603 {
1604 	region_type *server_region = region_create(xalloc, free);
1605 	netio_type *netio = netio_create(server_region);
1606 	netio_handler_type reload_listener;
1607 	int reload_sockets[2] = {-1, -1};
1608 	struct timespec timeout_spec;
1609 	int status;
1610 	pid_t child_pid;
1611 	pid_t reload_pid = -1;
1612 	sig_atomic_t mode;
1613 
1614 	/* Ensure we are the main process */
1615 	assert(nsd->server_kind == NSD_SERVER_MAIN);
1616 
1617 	/* Add listener for the XFRD process */
1618 	netio_add_handler(netio, nsd->xfrd_listener);
1619 
1620 	/* Start the child processes that handle incoming queries */
1621 	if (server_start_children(nsd, server_region, netio,
1622 		&nsd->xfrd_listener->fd) != 0) {
1623 		send_children_quit(nsd);
1624 		exit(1);
1625 	}
1626 	reload_listener.fd = -1;
1627 
1628 	/* This_child MUST be 0, because this is the parent process */
1629 	assert(nsd->this_child == 0);
1630 
1631 	/* Run the server until we get a shutdown signal */
1632 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
1633 		/* Did we receive a signal that changes our mode? */
1634 		if(mode == NSD_RUN) {
1635 			nsd->mode = mode = server_signal_mode(nsd);
1636 		}
1637 
1638 		switch (mode) {
1639 		case NSD_RUN:
1640 			/* see if any child processes terminated */
1641 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
1642 				int is_child = delete_child_pid(nsd, child_pid);
1643 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
1644 					if(nsd->children[is_child].child_fd == -1)
1645 						nsd->children[is_child].has_exited = 1;
1646 					parent_check_all_children_exited(nsd);
1647 				} else if(is_child != -1) {
1648 					log_msg(LOG_WARNING,
1649 					       "server %d died unexpectedly with status %d, restarting",
1650 					       (int) child_pid, status);
1651 					restart_child_servers(nsd, server_region, netio,
1652 						&nsd->xfrd_listener->fd);
1653 				} else if (child_pid == reload_pid) {
1654 					sig_atomic_t cmd = NSD_RELOAD_DONE;
1655 					pid_t mypid;
1656 					log_msg(LOG_WARNING,
1657 					       "Reload process %d failed with status %d, continuing with old database",
1658 					       (int) child_pid, status);
1659 					reload_pid = -1;
1660 					if(reload_listener.fd != -1) close(reload_listener.fd);
1661 					reload_listener.fd = -1;
1662 					reload_listener.event_types = NETIO_EVENT_NONE;
1663 					task_process_sync(nsd->task[nsd->mytask]);
1664 					/* inform xfrd reload attempt ended */
1665 					if(!write_socket(nsd->xfrd_listener->fd,
1666 						&cmd, sizeof(cmd))) {
1667 						log_msg(LOG_ERR, "problems "
1668 						  "sending SOAEND to xfrd: %s",
1669 						  strerror(errno));
1670 					}
1671 					mypid = getpid();
1672 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1673 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1674 							strerror(errno));
1675 					}
1676 				} else if(status != 0) {
 1677 					/* check status: we may also reap the
 1678 					 * old server main (reload is the
 1679 					 * process parent of old main) and
 1680 					 * older server processes that exit
 1681 					 * after a reload */
1682 					log_msg(LOG_WARNING,
1683 					       "process %d terminated with status %d",
1684 					       (int) child_pid, status);
1685 				}
1686 			}
1687 			if (child_pid == -1) {
1688 				if (errno == EINTR) {
1689 					continue;
1690 				}
1691 				if (errno != ECHILD)
1692 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
1693 			}
1694 			if (nsd->mode != NSD_RUN)
1695 				break;
1696 
 1697 			/* timeout to collect terminated processes, in case no SIGCHLD arrives */
1698 			timeout_spec.tv_sec = 60;
1699 			timeout_spec.tv_nsec = 0;
1700 
1701 			/* listen on ports, timeout for collecting terminated children */
1702 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
1703 				if (errno != EINTR) {
1704 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
1705 				}
1706 			}
1707 			if(nsd->restart_children) {
1708 				restart_child_servers(nsd, server_region, netio,
1709 					&nsd->xfrd_listener->fd);
1710 				nsd->restart_children = 0;
1711 			}
1712 			if(nsd->reload_failed) {
1713 				sig_atomic_t cmd = NSD_RELOAD_DONE;
1714 				pid_t mypid;
1715 				nsd->reload_failed = 0;
1716 				log_msg(LOG_WARNING,
1717 				       "Reload process %d failed, continuing with old database",
1718 				       (int) reload_pid);
1719 				reload_pid = -1;
1720 				if(reload_listener.fd != -1) close(reload_listener.fd);
1721 				reload_listener.fd = -1;
1722 				reload_listener.event_types = NETIO_EVENT_NONE;
1723 				task_process_sync(nsd->task[nsd->mytask]);
1724 				/* inform xfrd reload attempt ended */
1725 				if(!write_socket(nsd->xfrd_listener->fd,
1726 					&cmd, sizeof(cmd))) {
1727 					log_msg(LOG_ERR, "problems "
1728 					  "sending SOAEND to xfrd: %s",
1729 					  strerror(errno));
1730 				}
1731 				mypid = getpid();
1732 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1733 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1734 						strerror(errno));
1735 				}
1736 			}
1737 
1738 			break;
1739 		case NSD_RELOAD_REQ: {
1740 			sig_atomic_t cmd = NSD_RELOAD_REQ;
1741 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
1742 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
1743 				"main: ipc send reload_req to xfrd"));
1744 			if(!write_socket(nsd->xfrd_listener->fd,
1745 				&cmd, sizeof(cmd))) {
1746 				log_msg(LOG_ERR, "server_main: could not send "
1747 				"reload_req to xfrd: %s", strerror(errno));
1748 			}
1749 			nsd->mode = NSD_RUN;
1750 			} break;
1751 		case NSD_RELOAD:
1752 			/* Continue to run nsd after reload */
1753 			nsd->mode = NSD_RUN;
1754 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
1755 			if (reload_pid != -1) {
1756 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
1757 				       (int) reload_pid);
1758 				break;
1759 			}
1760 
1761 			/* switch mytask to keep track of which task file we own */
1762 			nsd->mytask = 1 - nsd->mytask;
1763 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
1764 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
1765 				reload_pid = -1;
1766 				break;
1767 			}
1768 
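			/*
			 * Note the inverted roles below: the fork() parent
			 * calls server_reload() and becomes the new main
			 * process, while the child stays behind as the old
			 * main, listening on reload_sockets[0] until the
			 * reload process tells it to quit.
			 */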
1769 			/* Do actual reload */
1770 			reload_pid = fork();
1771 			switch (reload_pid) {
1772 			case -1:
1773 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
1774 				break;
1775 			default:
1776 				/* PARENT */
1777 				close(reload_sockets[0]);
1778 				server_reload(nsd, server_region, netio,
1779 					reload_sockets[1]);
1780 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
1781 				close(reload_sockets[1]);
1782 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
1783 				/* drop stale xfrd ipc data */
1784 				((struct ipc_handler_conn_data*)nsd->
1785 					xfrd_listener->user_data)
1786 					->conn->is_reading = 0;
1787 				reload_pid = -1;
1788 				reload_listener.fd = -1;
1789 				reload_listener.event_types = NETIO_EVENT_NONE;
1790 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
1791 				break;
1792 			case 0:
1793 				/* CHILD */
1794 				/* server_main keeps running until NSD_QUIT_SYNC
1795 				 * is received from the reload process. */
1796 				close(reload_sockets[1]);
1797 				reload_listener.fd = reload_sockets[0];
1798 				reload_listener.timeout = NULL;
1799 				reload_listener.user_data = nsd;
1800 				reload_listener.event_types = NETIO_EVENT_READ;
1801 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
1802 				netio_add_handler(netio, &reload_listener);
1803 				reload_pid = getppid();
1804 				break;
1805 			}
1806 			break;
1807 		case NSD_QUIT_SYNC:
1808 			/* synchronisation of xfrd, parent and reload */
1809 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
1810 				sig_atomic_t cmd = NSD_RELOAD;
1811 				/* stop xfrd ipc writes in progress */
1812 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
1813 					"main: ipc send indication reload"));
1814 				if(!write_socket(nsd->xfrd_listener->fd,
1815 					&cmd, sizeof(cmd))) {
1816 					log_msg(LOG_ERR, "server_main: could not send reload "
1817 					"indication to xfrd: %s", strerror(errno));
1818 				}
1819 				/* wait for ACK from xfrd */
1820 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
1821 				nsd->quit_sync_done = 1;
1822 			}
1823 			nsd->mode = NSD_RUN;
1824 			break;
1825 		case NSD_QUIT:
1826 			/* silent shutdown during reload */
1827 			if(reload_listener.fd != -1) {
1828 				/* acknowledge the quit, to signal the reload process that we will really quit now */
1829 				sig_atomic_t cmd = NSD_RELOAD;
1830 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
1831 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
1832 					log_msg(LOG_ERR, "server_main: "
1833 						"could not ack quit: %s", strerror(errno));
1834 				}
1835 #ifdef BIND8_STATS
1836 				parent_send_stats(nsd, reload_listener.fd);
1837 #endif /* BIND8_STATS */
1838 				close(reload_listener.fd);
1839 			}
1840 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
1841 			/* only quit children after xfrd has acked */
1842 			send_children_quit(nsd);
1843 
1844 #if 0 /* OS collects memory pages */
1845 			region_destroy(server_region);
1846 #endif
1847 			server_shutdown(nsd);
1848 
1849 			/* NOTREACHED */
1850 			break;
1851 		case NSD_SHUTDOWN:
1852 			break;
1853 		case NSD_REAP_CHILDREN:
1854 			/* continue; wait for child in run loop */
1855 			nsd->mode = NSD_RUN;
1856 			break;
1857 		case NSD_STATS:
1858 #ifdef BIND8_STATS
1859 			set_children_stats(nsd);
1860 #endif
1861 			nsd->mode = NSD_RUN;
1862 			break;
1863 		default:
1864 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
1865 			nsd->mode = NSD_RUN;
1866 			break;
1867 		}
1868 	}
1869 	log_msg(LOG_WARNING, "signal received, shutting down...");
1870 
1871 	/* close opened ports to avoid race with restart of nsd */
1872 	server_close_all_sockets(nsd->udp, nsd->ifs);
1873 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1874 #ifdef HAVE_SSL
1875 	daemon_remote_close(nsd->rc);
1876 #endif
1877 	send_children_quit_and_wait(nsd);
1878 
1879 	/* Unlink it if possible... */
1880 	unlinkpid(nsd->pidfile);
1881 	unlink(nsd->task[0]->fname);
1882 	unlink(nsd->task[1]->fname);
1883 #ifdef USE_ZONE_STATS
1884 	unlink(nsd->zonestatfname[0]);
1885 	unlink(nsd->zonestatfname[1]);
1886 #endif
1887 
1888 	if(reload_listener.fd != -1) {
1889 		sig_atomic_t cmd = NSD_QUIT;
1890 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
1891 			"main: ipc send quit to reload-process"));
1892 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
1893 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
1894 				strerror(errno));
1895 		}
1896 		fsync(reload_listener.fd);
1897 		close(reload_listener.fd);
1898 		/* wait for reload to finish processing */
1899 		while(1) {
1900 			if(waitpid(reload_pid, NULL, 0) == -1) {
1901 				if(errno == EINTR) continue;
1902 				if(errno == ECHILD) break;
1903 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
1904 					(int)reload_pid, strerror(errno));
1905 			}
1906 			break;
1907 		}
1908 	}
1909 	if(nsd->xfrd_listener->fd != -1) {
1910 		/* complete quit, stop xfrd */
1911 		sig_atomic_t cmd = NSD_QUIT;
1912 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
1913 			"main: ipc send quit to xfrd"));
1914 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
1915 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
1916 				strerror(errno));
1917 		}
1918 		fsync(nsd->xfrd_listener->fd);
1919 		close(nsd->xfrd_listener->fd);
1920 		(void)kill(nsd->pid, SIGTERM);
1921 	}
1922 
1923 #if 0 /* OS collects memory pages */
1924 	region_destroy(server_region);
1925 #endif
1926 	/* write the nsd.db to disk, wait for it to complete */
1927 	udb_base_sync(nsd->db->udb, 1);
1928 	udb_base_close(nsd->db->udb);
1929 	server_shutdown(nsd);
1930 }
1931 
1932 static query_state_type
1933 server_process_query(struct nsd *nsd, struct query *query)
1934 {
1935 	return query_process(query, nsd);
1936 }
1937 
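/*
 * UDP query processing with optional response rate limiting: when
 * RATELIMIT is compiled in, rrl_process_query() decides whether this
 * answer must be limited and rrl_slip() then either drops it or lets
 * a truncated (TC) reply through so legitimate clients can retry the
 * query over TCP (a sketch of the intent; see rrl.c for the details).
 */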
1938 static query_state_type
1939 server_process_query_udp(struct nsd *nsd, struct query *query)
1940 {
1941 #ifdef RATELIMIT
1942 	if(query_process(query, nsd) != QUERY_DISCARDED) {
1943 		if(rrl_process_query(query))
1944 			return rrl_slip(query);
1945 		else	return QUERY_PROCESSED;
1946 	}
1947 	return QUERY_DISCARDED;
1948 #else
1949 	return query_process(query, nsd);
1950 #endif
1951 }
1952 
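/*
 * Create the event base for a child server process.  Which backend is
 * used depends on compile-time options: the built-in mini_event, libev
 * (via its event compatibility layer), or libevent.
 */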
1953 struct event_base*
1954 nsd_child_event_base(void)
1955 {
1956 	struct event_base* base;
1957 #ifdef USE_MINI_EVENT
1958 	static time_t secs;
1959 	static struct timeval now;
1960 	base = event_init(&secs, &now);
1961 #else
1962 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
1963 	/* libev */
1964 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
1965 #  else
1966 	/* libevent */
1967 #    ifdef HAVE_EVENT_BASE_NEW
1968 	base = event_base_new();
1969 #    else
1970 	base = event_init();
1971 #    endif
1972 #  endif
1973 #endif
1974 	return base;
1975 }
1976 
1977 /*
1978  * Serve DNS requests.
1979  */
1980 void
1981 server_child(struct nsd *nsd)
1982 {
1983 	size_t i, from, numifs;
1984 	region_type *server_region = region_create(xalloc, free);
1985 	struct event_base* event_base = nsd_child_event_base();
1986 	query_type *udp_query;
1987 	sig_atomic_t mode;
1988 
1989 	if(!event_base) {
1990 		log_msg(LOG_ERR, "nsd server could not create event base");
1991 		exit(1);
1992 	}
1993 
1994 #ifdef RATELIMIT
1995 	rrl_init(nsd->this_child->child_num);
1996 #endif
1997 
1998 	assert(nsd->server_kind != NSD_SERVER_MAIN);
1999 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
2000 
2001 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
2002 		server_close_all_sockets(nsd->tcp, nsd->ifs);
2003 	}
2004 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
2005 		server_close_all_sockets(nsd->udp, nsd->ifs);
2006 	}
2007 
2008 	if (nsd->this_child && nsd->this_child->parent_fd != -1) {
2009 		struct event *handler;
2010 		struct ipc_handler_conn_data* user_data =
2011 			(struct ipc_handler_conn_data*)region_alloc(
2012 			server_region, sizeof(struct ipc_handler_conn_data));
2013 		user_data->nsd = nsd;
2014 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
2015 
2016 		handler = (struct event*) region_alloc(
2017 			server_region, sizeof(*handler));
2018 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
2019 			EV_READ, child_handle_parent_command, user_data);
2020 		if(event_base_set(event_base, handler) != 0)
2021 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
2022 		if(event_add(handler, NULL) != 0)
2023 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
2024 	}
2025 
2026 	if(nsd->reuseport) {
2027 		numifs = nsd->ifs / nsd->reuseport;
2028 		from = numifs * nsd->this_child->child_num;
2029 		if(from+numifs > nsd->ifs) { /* should not happen */
2030 			from = 0;
2031 			numifs = nsd->ifs;
2032 		}
2033 	} else {
2034 		from = 0;
2035 		numifs = nsd->ifs;
2036 	}
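	/*
	 * With reuseport, nsd->ifs counts nsd->reuseport sockets per
	 * interface and each child serves its own slice of sockets,
	 * [from, from+numifs).  A hypothetical example: ifs=4 with
	 * reuseport=2 gives numifs=2, so child 0 serves sockets 0-1
	 * and child 1 serves sockets 2-3.
	 */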
2037 
2038 	if (nsd->server_kind & NSD_SERVER_UDP) {
2039 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2040 		udp_query = query_create(server_region,
2041 			compressed_dname_offsets, compression_table_size);
2042 #else
2043 		udp_query = NULL;
2044 		memset(msgs, 0, sizeof(msgs));
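		/*
		 * Pre-wire one query buffer per receive slot so a single
		 * recvmmsg() can fill up to NUM_RECV_PER_SELECT packets:
		 * each msg_hdr points at the iovec for the query's packet
		 * buffer and at the query's own address storage.
		 */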
2045 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
2046 			queries[i] = query_create(server_region,
2047 				compressed_dname_offsets, compression_table_size);
2048 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2049 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
2050 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
2051 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
2052 			msgs[i].msg_hdr.msg_iovlen  = 1;
2053 			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
2054 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2055 		}
2056 #endif
2057 		for (i = from; i < from+numifs; ++i) {
2058 			struct udp_handler_data *data;
2059 			struct event *handler;
2060 
2061 			data = (struct udp_handler_data *) region_alloc(
2062 				server_region,
2063 				sizeof(struct udp_handler_data));
2064 			data->query = udp_query;
2065 			data->nsd = nsd;
2066 			data->socket = &nsd->udp[i];
2067 
2068 			handler = (struct event*) region_alloc(
2069 				server_region, sizeof(*handler));
2070 			event_set(handler, nsd->udp[i].s, EV_PERSIST|EV_READ,
2071 				handle_udp, data);
2072 			if(event_base_set(event_base, handler) != 0)
2073 				log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2074 			if(event_add(handler, NULL) != 0)
2075 				log_msg(LOG_ERR, "nsd udp: event_add failed");
2076 		}
2077 	}
2078 
2079 	/*
2080 	 * Keep track of all the TCP accept handlers so we can enable
2081 	 * and disable them based on the current number of active TCP
2082 	 * connections.
2083 	 */
2084 	tcp_accept_handler_count = numifs;
2085 	tcp_accept_handlers = (struct tcp_accept_handler_data*)
2086 		region_alloc_array(server_region,
2087 		numifs, sizeof(*tcp_accept_handlers));
2088 	if (nsd->server_kind & NSD_SERVER_TCP) {
2089 		for (i = from; i < from+numifs; ++i) { /* same socket slice as UDP */
2090 			struct event *handler = &tcp_accept_handlers[i-from].event;
2091 			struct tcp_accept_handler_data* data =
2092 				&tcp_accept_handlers[i-from];
2093 			data->nsd = nsd;
2094 			data->socket = &nsd->tcp[i];
2095 			event_set(handler, nsd->tcp[i].s, EV_PERSIST|EV_READ,
2096 				handle_tcp_accept, data);
2097 			if(event_base_set(event_base, handler) != 0)
2098 				log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
2099 			if(event_add(handler, NULL) != 0)
2100 				log_msg(LOG_ERR, "nsd tcp: event_add failed");
2101 			data->event_added = 1;
2102 		}
2103 	} else tcp_accept_handler_count = 0;
2104 
2105 	/* The main loop... */
2106 	while ((mode = nsd->mode) != NSD_QUIT) {
2107 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
2108 
2109 		/* Do we need to do the statistics... */
2110 		if (mode == NSD_STATS) {
2111 #ifdef BIND8_STATS
2112 			int p = nsd->st.period;
2113 			nsd->st.period = 1; /* force stats printout */
2114 			/* Dump the statistics */
2115 			bind8_stats(nsd);
2116 			nsd->st.period = p;
2117 #else /* !BIND8_STATS */
2118 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
2119 #endif /* BIND8_STATS */
2120 
2121 			nsd->mode = NSD_RUN;
2122 		}
2123 		else if (mode == NSD_REAP_CHILDREN) {
2124 			/* got signal, notify parent. parent reaps terminated children. */
2125 			if (nsd->this_child->parent_fd != -1) {
2126 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
2127 				if (write(nsd->this_child->parent_fd,
2128 				    &parent_notify,
2129 				    sizeof(parent_notify)) == -1)
2130 				{
2131 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
2132 						(int) nsd->this_child->pid, strerror(errno));
2133 				}
2134 			} else /* no parent, so reap 'em */
2135 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
2136 			nsd->mode = NSD_RUN;
2137 		}
2138 		else if(mode == NSD_RUN) {
2139 			/* Wait for a query... */
2140 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
2141 				if (errno != EINTR) {
2142 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
2143 					break;
2144 				}
2145 			}
2146 		} else if(mode == NSD_QUIT) {
2147 			/* ignore here, quit */
2148 		} else {
2149 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
2150 				(int)mode);
2151 			nsd->mode = NSD_RUN;
2152 		}
2153 	}
2154 
2155 #ifdef	BIND8_STATS
2156 	bind8_stats(nsd);
2157 #endif /* BIND8_STATS */
2158 
2159 #if 0 /* OS collects memory pages */
2160 	event_base_free(event_base);
2161 	region_destroy(server_region);
2162 #endif
2163 	server_shutdown(nsd);
2164 }
2165 
2166 #if defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG)
2167 static void
2168 handle_udp(int fd, short event, void* arg)
2169 {
2170 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
2171 	int received, sent, recvcount, i;
2172 	struct query *q;
2173 
2174 	if (!(event & EV_READ)) {
2175 		return;
2176 	}
2177 	recvcount = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
2178 	/* this printf strangely gave a performance increase on Linux */
2179 	/* printf("recvcount %d \n", recvcount); */
2180 	if (recvcount == -1) {
2181 		if (errno != EAGAIN && errno != EINTR) {
2182 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
2183 			STATUP(data->nsd, rxerr);
2184 			/* No zone statup */
2185 		}
2186 		/* Simply no data available */
2187 		return;
2188 	}
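	/*
	 * Process the batch in place: prepared answers stay in
	 * msgs[0..recvcount), while dropped or failed queries are swapped
	 * to the tail and recvcount is decreased, so the sendmmsg() loop
	 * below transmits exactly the answers and nothing else.
	 */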
2189 	for (i = 0; i < recvcount; i++) {
2190 	loopstart:
2191 		received = msgs[i].msg_len;
2192 		q = queries[i];
2193 		if (received == -1) {
2194 			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
2195 				msgs[i].msg_hdr.msg_flags));
2196 			STATUP(data->nsd, rxerr);
2197 			/* No zone statup */
2198 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2199 			iovecs[i].iov_len = buffer_remaining(q->packet);
2200 			goto swap_drop;
2201 		}
2202 
2203 		/* Account... */
2204 #ifdef BIND8_STATS
2205 		if (data->socket->fam == AF_INET) {
2206 			STATUP(data->nsd, qudp);
2207 		} else if (data->socket->fam == AF_INET6) {
2208 			STATUP(data->nsd, qudp6);
2209 		}
2210 #endif
2211 
2212 		buffer_skip(q->packet, received);
2213 		buffer_flip(q->packet);
2214 
2215 		/* Process and answer the query... */
2216 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
2217 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
2218 				STATUP(data->nsd, nona);
2219 				ZTATUP(data->nsd, q->zone, nona);
2220 			}
2221 
2222 #ifdef USE_ZONE_STATS
2223 			if (data->socket->fam == AF_INET) {
2224 				ZTATUP(data->nsd, q->zone, qudp);
2225 			} else if (data->socket->fam == AF_INET6) {
2226 				ZTATUP(data->nsd, q->zone, qudp6);
2227 			}
2228 #endif
2229 
2230 			/* Add EDNS0 and TSIG info if necessary.  */
2231 			query_add_optional(q, data->nsd);
2232 
2233 			buffer_flip(q->packet);
2234 			iovecs[i].iov_len = buffer_remaining(q->packet);
2235 #ifdef BIND8_STATS
2236 			/* Account the rcode & TC... */
2237 			STATUP2(data->nsd, rcode, RCODE(q->packet));
2238 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
2239 			if (TC(q->packet)) {
2240 				STATUP(data->nsd, truncated);
2241 				ZTATUP(data->nsd, q->zone, truncated);
2242 			}
2243 #endif /* BIND8_STATS */
2244 		} else {
2245 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2246 			iovecs[i].iov_len = buffer_remaining(q->packet);
2247 		swap_drop:
2248 			STATUP(data->nsd, dropped);
2249 			ZTATUP(data->nsd, q->zone, dropped);
2250 			if(i != recvcount-1) {
2251 				/* swap with last and decrease recvcount */
2252 				struct mmsghdr mtmp = msgs[i];
2253 				struct iovec iotmp = iovecs[i];
2254 				recvcount--;
2255 				msgs[i] = msgs[recvcount];
2256 				iovecs[i] = iovecs[recvcount];
2257 				queries[i] = queries[recvcount];
2258 				msgs[recvcount] = mtmp;
2259 				iovecs[recvcount] = iotmp;
2260 				queries[recvcount] = q;
2261 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
2262 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
2263 				goto loopstart;
2264 			} else { recvcount--; }
2265 		}
2266 	}
2267 
2268 	/* send until all are sent */
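	/* sendmmsg() may transmit fewer messages than requested; advance
	 * i by the amount sent and retry with the remainder */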
2269 	i = 0;
2270 	while(i<recvcount) {
2271 		sent = sendmmsg(fd, &msgs[i], recvcount-i, 0);
2272 		if(sent == -1) {
2273 			const char* es = strerror(errno);
2274 			char a[48];
2275 			addr2str(&queries[i]->addr, a, sizeof(a));
2276 			log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
2277 #ifdef BIND8_STATS
2278 			data->nsd->st.txerr += recvcount-i;
2279 #endif /* BIND8_STATS */
2280 			break;
2281 		}
2282 		i += sent;
2283 	}
2284 	for(i=0; i<recvcount; i++) {
2285 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2286 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
2287 	}
2288 }
2289 
2290 #else /* defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG) */
2291 
2292 static void
2293 handle_udp(int fd, short event, void* arg)
2294 {
2295 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
2296 	int received, sent;
2297 #ifndef NONBLOCKING_IS_BROKEN
2298 #ifdef HAVE_RECVMMSG
2299 	int recvcount;
2300 #endif /* HAVE_RECVMMSG */
2301 	int i;
2302 #endif /* NONBLOCKING_IS_BROKEN */
2303 	struct query *q;
2304 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2305 	q = data->query;
2306 #endif
2307 
2308 	if (!(event & EV_READ)) {
2309 		return;
2310 	}
2311 #ifndef NONBLOCKING_IS_BROKEN
2312 #ifdef HAVE_RECVMMSG
2313 	recvcount = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
2314 	/* this printf strangely gave a performance increase on Linux */
2315 	/* printf("recvcount %d \n", recvcount); */
2316 	if (recvcount == -1) {
2317 		if (errno != EAGAIN && errno != EINTR) {
2318 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
2319 			STATUP(data->nsd, rxerr);
2320 			/* No zone statup */
2321 		}
2322 		/* Simply no data available */
2323 		return;
2324 	}
2325 	for (i = 0; i < recvcount; i++) {
2326 		received = msgs[i].msg_len;
2327 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2328 		if (received == -1) {
2329 			log_msg(LOG_ERR, "recvmmsg failed");
2330 			STATUP(data->nsd, rxerr);
2331 			/* No zone statup */
2332 			/* the error can be found in msgs[i].msg_hdr.msg_flags */
2333 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2334 			continue;
2335 		}
2336 		q = queries[i];
2337 #else
2338 	for(i=0; i<NUM_RECV_PER_SELECT; i++) {
2339 #endif /* HAVE_RECVMMSG */
2340 #endif /* NONBLOCKING_IS_BROKEN */
2341 
2342 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2343 		/* Initialize the query... */
2344 		query_reset(q, UDP_MAX_MESSAGE_LEN, 0);
2345 
2346 		received = recvfrom(fd,
2347 				    buffer_begin(q->packet),
2348 				    buffer_remaining(q->packet),
2349 				    0,
2350 				    (struct sockaddr *)&q->addr,
2351 				    &q->addrlen);
2352 		if (received == -1) {
2353 			if (errno != EAGAIN && errno != EINTR) {
2354 				log_msg(LOG_ERR, "recvfrom failed: %s", strerror(errno));
2355 				STATUP(data->nsd, rxerr);
2356 				/* No zone statup */
2357 			}
2358 			return;
2359 		}
2360 #endif /* NONBLOCKING_IS_BROKEN || !HAVE_RECVMMSG */
2361 
2362 		/* Account... */
2363 		if (data->socket->fam == AF_INET) {
2364 			STATUP(data->nsd, qudp);
2365 		} else if (data->socket->fam == AF_INET6) {
2366 			STATUP(data->nsd, qudp6);
2367 		}
2368 
2369 		buffer_skip(q->packet, received);
2370 		buffer_flip(q->packet);
2371 
2372 		/* Process and answer the query... */
2373 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
2374 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
2375 				STATUP(data->nsd, nona);
2376 				ZTATUP(data->nsd, q->zone, nona);
2377 			}
2378 
2379 #ifdef USE_ZONE_STATS
2380 			if (data->socket->fam == AF_INET) {
2381 				ZTATUP(data->nsd, q->zone, qudp);
2382 			} else if (data->socket->fam == AF_INET6) {
2383 				ZTATUP(data->nsd, q->zone, qudp6);
2384 			}
2385 #endif
2386 
2387 			/* Add EDNS0 and TSIG info if necessary.  */
2388 			query_add_optional(q, data->nsd);
2389 
2390 			buffer_flip(q->packet);
2391 
2392 			sent = sendto(fd,
2393 				      buffer_begin(q->packet),
2394 				      buffer_remaining(q->packet),
2395 				      0,
2396 				      (struct sockaddr *) &q->addr,
2397 				      q->addrlen);
2398 			if (sent == -1) {
2399 				const char* es = strerror(errno);
2400 				char a[48];
2401 				addr2str(&q->addr, a, sizeof(a));
2402 				log_msg(LOG_ERR, "sendto %s failed: %s", a, es);
2403 				STATUP(data->nsd, txerr);
2404 				ZTATUP(data->nsd, q->zone, txerr);
2405 			} else if ((size_t) sent != buffer_remaining(q->packet)) {
2406 				log_msg(LOG_ERR, "sent %d in place of %d bytes", sent, (int) buffer_remaining(q->packet));
2407 			} else {
2408 #ifdef BIND8_STATS
2409 				/* Account the rcode & TC... */
2410 				STATUP2(data->nsd, rcode, RCODE(q->packet));
2411 				ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
2412 				if (TC(q->packet)) {
2413 					STATUP(data->nsd, truncated);
2414 					ZTATUP(data->nsd, q->zone, truncated);
2415 				}
2416 #endif /* BIND8_STATS */
2417 			}
2418 		} else {
2419 			STATUP(data->nsd, dropped);
2420 			ZTATUP(data->nsd, q->zone, dropped);
2421 		}
2422 #ifndef NONBLOCKING_IS_BROKEN
2423 #ifdef HAVE_RECVMMSG
2424 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2425 #endif
2426 	}
2427 #endif
2428 }
2429 #endif /* defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG) */
2430 
2431 
2432 static void
2433 cleanup_tcp_handler(struct tcp_handler_data* data)
2434 {
2435 	event_del(&data->event);
2436 	close(data->event.ev_fd);
2437 
2438 	/*
2439 	 * Enable the TCP accept handlers when the current number of
2440 	 * TCP connections is about to drop below the maximum number
2441 	 * of TCP connections.
2442 	 */
2443 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
2444 		configure_handler_event_types(EV_READ|EV_PERSIST);
2445 		if(slowaccept) {
2446 			event_del(&slowaccept_event);
2447 			slowaccept = 0;
2448 		}
2449 	}
2450 	--data->nsd->current_tcp_count;
2451 	assert(data->nsd->current_tcp_count >= 0);
2452 
2453 	region_destroy(data->region);
2454 }
2455 
2456 static void
2457 handle_tcp_reading(int fd, short event, void* arg)
2458 {
2459 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
2460 	ssize_t received;
2461 	struct event_base* ev_base;
2462 	struct timeval timeout;
2463 
2464 	if ((event & EV_TIMEOUT)) {
2465 		/* Connection timed out.  */
2466 		cleanup_tcp_handler(data);
2467 		return;
2468 	}
2469 
2470 	if (data->nsd->tcp_query_count > 0 &&
2471 		data->query_count >= data->nsd->tcp_query_count) {
2472 		/* No more queries allowed on this tcp connection.  */
2473 		cleanup_tcp_handler(data);
2474 		return;
2475 	}
2476 
2477 	assert((event & EV_READ));
2478 
2479 	if (data->bytes_transmitted == 0) {
2480 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
2481 	}
2482 
2483 	/*
2484 	 * Check if we received the leading packet length bytes yet.
2485 	 */
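	/* DNS over TCP frames each message with a two-octet network-order
	 * length prefix (RFC 1035, section 4.2.2); read that first. */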
2486 	if (data->bytes_transmitted < sizeof(uint16_t)) {
2487 		received = read(fd,
2488 				(char *) &data->query->tcplen
2489 				+ data->bytes_transmitted,
2490 				sizeof(uint16_t) - data->bytes_transmitted);
2491 		if (received == -1) {
2492 			if (errno == EAGAIN || errno == EINTR) {
2493 				/*
2494 				 * Read would block, wait until more
2495 				 * data is available.
2496 				 */
2497 				return;
2498 			} else {
2499 				char buf[48];
2500 				addr2str(&data->query->addr, buf, sizeof(buf));
2501 #ifdef ECONNRESET
2502 				if (verbosity >= 2 || errno != ECONNRESET)
2503 #endif /* ECONNRESET */
2504 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
2505 				cleanup_tcp_handler(data);
2506 				return;
2507 			}
2508 		} else if (received == 0) {
2509 			/* EOF */
2510 			cleanup_tcp_handler(data);
2511 			return;
2512 		}
2513 
2514 		data->bytes_transmitted += received;
2515 		if (data->bytes_transmitted < sizeof(uint16_t)) {
2516 			/*
2517 			 * Not done with the tcplen yet, wait for more
2518 			 * data to become available.
2519 			 */
2520 			return;
2521 		}
2522 
2523 		assert(data->bytes_transmitted == sizeof(uint16_t));
2524 
2525 		data->query->tcplen = ntohs(data->query->tcplen);
2526 
2527 		/*
2528 		 * Minimum query size is:
2529 		 *
2530 		 *     Size of the header (12)
2531 		 *   + Root domain name   (1)
2532 		 *   + Query class        (2)
2533 		 *   + Query type         (2)
2534 		 */
2535 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
2536 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
2537 			cleanup_tcp_handler(data);
2538 			return;
2539 		}
2540 
2541 		if (data->query->tcplen > data->query->maxlen) {
2542 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
2543 			cleanup_tcp_handler(data);
2544 			return;
2545 		}
2546 
2547 		buffer_set_limit(data->query->packet, data->query->tcplen);
2548 	}
2549 
2550 	assert(buffer_remaining(data->query->packet) > 0);
2551 
2552 	/* Read the (remaining) query data.  */
2553 	received = read(fd,
2554 			buffer_current(data->query->packet),
2555 			buffer_remaining(data->query->packet));
2556 	if (received == -1) {
2557 		if (errno == EAGAIN || errno == EINTR) {
2558 			/*
2559 			 * Read would block, wait until more data is
2560 			 * available.
2561 			 */
2562 			return;
2563 		} else {
2564 			char buf[48];
2565 			addr2str(&data->query->addr, buf, sizeof(buf));
2566 #ifdef ECONNRESET
2567 			if (verbosity >= 2 || errno != ECONNRESET)
2568 #endif /* ECONNRESET */
2569 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
2570 			cleanup_tcp_handler(data);
2571 			return;
2572 		}
2573 	} else if (received == 0) {
2574 		/* EOF */
2575 		cleanup_tcp_handler(data);
2576 		return;
2577 	}
2578 
2579 	data->bytes_transmitted += received;
2580 	buffer_skip(data->query->packet, received);
2581 	if (buffer_remaining(data->query->packet) > 0) {
2582 		/*
2583 		 * Message not yet complete, wait for more data to
2584 		 * become available.
2585 		 */
2586 		return;
2587 	}
2588 
2589 	assert(buffer_position(data->query->packet) == data->query->tcplen);
2590 
2591 	/* Account... */
2592 #ifdef BIND8_STATS
2593 #ifndef INET6
2594 	STATUP(data->nsd, ctcp);
2595 #else
2596 	if (data->query->addr.ss_family == AF_INET) {
2597 		STATUP(data->nsd, ctcp);
2598 	} else if (data->query->addr.ss_family == AF_INET6) {
2599 		STATUP(data->nsd, ctcp6);
2600 	}
2601 #endif
2602 #endif /* BIND8_STATS */
2603 
2604 	/* We have a complete query, process it.  */
2605 
2606 	/* tcp-query-count: handle query counter ++ */
2607 	data->query_count++;
2608 
2609 	buffer_flip(data->query->packet);
2610 	data->query_state = server_process_query(data->nsd, data->query);
2611 	if (data->query_state == QUERY_DISCARDED) {
2612 		/* Drop the packet and the entire connection... */
2613 		STATUP(data->nsd, dropped);
2614 		ZTATUP(data->nsd, data->query->zone, dropped);
2615 		cleanup_tcp_handler(data);
2616 		return;
2617 	}
2618 
2619 #ifdef BIND8_STATS
2620 	if (RCODE(data->query->packet) == RCODE_OK
2621 	    && !AA(data->query->packet))
2622 	{
2623 		STATUP(data->nsd, nona);
2624 		ZTATUP(data->nsd, data->query->zone, nona);
2625 	}
2626 #endif /* BIND8_STATS */
2627 
2628 #ifdef USE_ZONE_STATS
2629 #ifndef INET6
2630 	ZTATUP(data->nsd, data->query->zone, ctcp);
2631 #else
2632 	if (data->query->addr.ss_family == AF_INET) {
2633 		ZTATUP(data->nsd, data->query->zone, ctcp);
2634 	} else if (data->query->addr.ss_family == AF_INET6) {
2635 		ZTATUP(data->nsd, data->query->zone, ctcp6);
2636 	}
2637 #endif
2638 #endif /* USE_ZONE_STATS */
2639 
2640 	query_add_optional(data->query, data->nsd);
2641 
2642 	/* Switch to the tcp write handler.  */
2643 	buffer_flip(data->query->packet);
2644 	data->query->tcplen = buffer_remaining(data->query->packet);
2645 	data->bytes_transmitted = 0;
2646 
2647 	timeout.tv_sec = data->tcp_timeout / 1000;
2648 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2649 
2650 	ev_base = data->event.ev_base;
2651 	event_del(&data->event);
2652 	event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
2653 		handle_tcp_writing, data);
2654 	if(event_base_set(ev_base, &data->event) != 0)
2655 		log_msg(LOG_ERR, "event base set tcpr failed");
2656 	if(event_add(&data->event, &timeout) != 0)
2657 		log_msg(LOG_ERR, "event add tcpr failed");
2658 	/* see if we can write the answer right away (usually so, EAGAIN if not) */
2659 	handle_tcp_writing(fd, EV_WRITE, data);
2660 }
2661 
2662 static void
2663 handle_tcp_writing(int fd, short event, void* arg)
2664 {
2665 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
2666 	ssize_t sent;
2667 	struct query *q = data->query;
2668 	struct timeval timeout;
2669 	struct event_base* ev_base;
2670 
2671 	if ((event & EV_TIMEOUT)) {
2672 		/* Connection timed out.  */
2673 		cleanup_tcp_handler(data);
2674 		return;
2675 	}
2676 
2677 	assert((event & EV_WRITE));
2678 
2679 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
2680 		/* Writing the response packet length.  */
2681 		uint16_t n_tcplen = htons(q->tcplen);
2682 #ifdef HAVE_WRITEV
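		/* with writev() the length prefix and the payload go out in
		 * a single call, avoiding a separate small TCP segment for
		 * just the two length octets */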
2683 		struct iovec iov[2];
2684 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
2685 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
2686 		iov[1].iov_base = buffer_begin(q->packet);
2687 		iov[1].iov_len = buffer_limit(q->packet);
2688 		sent = writev(fd, iov, 2);
2689 #else /* HAVE_WRITEV */
2690 		sent = write(fd,
2691 			     (const char *) &n_tcplen + data->bytes_transmitted,
2692 			     sizeof(n_tcplen) - data->bytes_transmitted);
2693 #endif /* HAVE_WRITEV */
2694 		if (sent == -1) {
2695 			if (errno == EAGAIN || errno == EINTR) {
2696 				/*
2697 				 * Write would block, wait until
2698 				 * socket becomes writable again.
2699 				 */
2700 				return;
2701 			} else {
2702 #ifdef ECONNRESET
2703 				if(verbosity >= 2 || errno != ECONNRESET)
2704 #endif /* ECONNRESET */
2705 #ifdef EPIPE
2706 				  if(verbosity >= 2 || errno != EPIPE)
2707 #endif /* EPIPE 'broken pipe' */
2708 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
2709 				cleanup_tcp_handler(data);
2710 				return;
2711 			}
2712 		}
2713 
2714 		data->bytes_transmitted += sent;
2715 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
2716 			/*
2717 			 * Writing not complete, wait until socket
2718 			 * becomes writable again.
2719 			 */
2720 			return;
2721 		}
2722 
2723 #ifdef HAVE_WRITEV
2724 		sent -= sizeof(n_tcplen);
2725 		/* 'sent' now counts payload bytes only; see if the packet is done */
2726 		goto packet_could_be_done;
2727 #endif
2728  	}
2729 
2730 	sent = write(fd,
2731 		     buffer_current(q->packet),
2732 		     buffer_remaining(q->packet));
2733 	if (sent == -1) {
2734 		if (errno == EAGAIN || errno == EINTR) {
2735 			/*
2736 			 * Write would block, wait until
2737 			 * socket becomes writable again.
2738 			 */
2739 			return;
2740 		} else {
2741 #ifdef ECONNRESET
2742 			if(verbosity >= 2 || errno != ECONNRESET)
2743 #endif /* ECONNRESET */
2744 #ifdef EPIPE
2745 				  if(verbosity >= 2 || errno != EPIPE)
2746 #endif /* EPIPE 'broken pipe' */
2747 			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
2748 			cleanup_tcp_handler(data);
2749 			return;
2750 		}
2751 	}
2752 
2753 	data->bytes_transmitted += sent;
2754 #ifdef HAVE_WRITEV
2755   packet_could_be_done:
2756 #endif
2757 	buffer_skip(q->packet, sent);
2758 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
2759 		/*
2760 		 * Still more data to write when socket becomes
2761 		 * writable again.
2762 		 */
2763 		return;
2764 	}
2765 
2766 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
2767 
2768 	if (data->query_state == QUERY_IN_AXFR) {
2769 		/* Continue processing AXFR and writing back results.  */
2770 		buffer_clear(q->packet);
2771 		data->query_state = query_axfr(data->nsd, q);
2772 		if (data->query_state != QUERY_PROCESSED) {
2773 			query_add_optional(data->query, data->nsd);
2774 
2775 			/* Reset data. */
2776 			buffer_flip(q->packet);
2777 			q->tcplen = buffer_remaining(q->packet);
2778 			data->bytes_transmitted = 0;
2779 			/* Reset timeout.  */
2780 			timeout.tv_sec = data->tcp_timeout / 1000;
2781 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2782 			ev_base = data->event.ev_base;
2783 			event_del(&data->event);
2784 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
2785 				handle_tcp_writing, data);
2786 			if(event_base_set(ev_base, &data->event) != 0)
2787 				log_msg(LOG_ERR, "event base set tcpw failed");
2788 			if(event_add(&data->event, &timeout) != 0)
2789 				log_msg(LOG_ERR, "event add tcpw failed");
2790 
2791 			/*
2792 			 * Write data if/when the socket is writable
2793 			 * again.
2794 			 */
2795 			return;
2796 		}
2797 	}
2798 
2799 	/*
2800 	 * Done sending, wait for the next request to arrive on the
2801 	 * TCP socket by installing the TCP read handler.
2802 	 */
2803 	if (data->nsd->tcp_query_count > 0 &&
2804 		data->query_count >= data->nsd->tcp_query_count) {
2805 
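		/* no more answers will be written on this connection;
		 * half-close the write side so the client sees EOF after
		 * the last answer */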
2806 		(void) shutdown(fd, SHUT_WR);
2807 	}
2808 
2809 	data->bytes_transmitted = 0;
2810 
2811 	timeout.tv_sec = data->tcp_timeout / 1000;
2812 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2813 	ev_base = data->event.ev_base;
2814 	event_del(&data->event);
2815 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
2816 		handle_tcp_reading, data);
2817 	if(event_base_set(ev_base, &data->event) != 0)
2818 		log_msg(LOG_ERR, "event base set tcpw failed");
2819 	if(event_add(&data->event, &timeout) != 0)
2820 		log_msg(LOG_ERR, "event add tcpw failed");
2821 }
2822 
2823 
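/*
 * Re-enable the TCP accept handlers once the slow-accept backoff
 * period has passed; see the EMFILE/ENFILE handling in
 * handle_tcp_accept() below.
 */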
2824 static void
2825 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
2826 	void* ATTR_UNUSED(arg))
2827 {
2828 	if(slowaccept) {
2829 		configure_handler_event_types(EV_PERSIST | EV_READ);
2830 		slowaccept = 0;
2831 	}
2832 }
2833 
2834 /*
2835  * Handle an incoming TCP connection.  The connection is accepted and
2836  * a new TCP reader event handler is added.  The TCP handler
2837  * is responsible for cleanup when the connection is closed.
2838  */
2839 static void
2840 handle_tcp_accept(int fd, short event, void* arg)
2841 {
2842 	struct tcp_accept_handler_data *data
2843 		= (struct tcp_accept_handler_data *) arg;
2844 	int s;
2845 	struct tcp_handler_data *tcp_data;
2846 	region_type *tcp_region;
2847 #ifdef INET6
2848 	struct sockaddr_storage addr;
2849 #else
2850 	struct sockaddr_in addr;
2851 #endif
2852 	socklen_t addrlen;
2853 	struct timeval timeout;
2854 
2855 	if (!(event & EV_READ)) {
2856 		return;
2857 	}
2858 
2859 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
2860 		return;
2861 	}
2862 
2863 	/* Accept it... */
2864 	addrlen = sizeof(addr);
2865 	s = accept(fd, (struct sockaddr *) &addr, &addrlen);
2866 	if (s == -1) {
2867 		/**
2868 		 * EMFILE and ENFILE signal that the limit of open
2869 		 * file descriptors has been reached; pause accept().
2870 		 * EINTR is a signal interrupt. The others are various OS ways
2871 		 * of saying that the client has closed the connection.
2872 		 */
2873 		if (errno == EMFILE || errno == ENFILE) {
2874 			if (!slowaccept) {
2875 				/* disable accept events */
2876 				struct timeval tv;
2877 				configure_handler_event_types(0);
2878 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
2879 				tv.tv_usec = 0L;
2880 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
2881 					handle_slowaccept_timeout, NULL);
2882 				(void)event_base_set(data->event.ev_base,
2883 					&slowaccept_event);
2884 				(void)event_add(&slowaccept_event, &tv);
2885 				slowaccept = 1;
2886 				/* We don't want to spam the logs here */
2887 			}
2888 		} else if (errno != EINTR
2889 			&& errno != EWOULDBLOCK
2890 #ifdef ECONNABORTED
2891 			&& errno != ECONNABORTED
2892 #endif /* ECONNABORTED */
2893 #ifdef EPROTO
2894 			&& errno != EPROTO
2895 #endif /* EPROTO */
2896 			) {
2897 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
2898 		}
2899 		return;
2900 	}
2901 
2902 	if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
2903 		log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
2904 		close(s);
2905 		return;
2906 	}
2907 
2908 	/*
2909 	 * This region is deallocated when the TCP connection is
2910 	 * closed by the TCP handler.
2911 	 */
2912 	tcp_region = region_create(xalloc, free);
2913 	tcp_data = (struct tcp_handler_data *) region_alloc(
2914 		tcp_region, sizeof(struct tcp_handler_data));
2915 	tcp_data->region = tcp_region;
2916 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
2917 		compression_table_size);
2918 	tcp_data->nsd = data->nsd;
2919 	tcp_data->query_count = 0;
2920 
2921 	tcp_data->query_state = QUERY_PROCESSED;
2922 	tcp_data->bytes_transmitted = 0;
2923 	memcpy(&tcp_data->query->addr, &addr, addrlen);
2924 	tcp_data->query->addrlen = addrlen;
2925 
2926 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
2927 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
2928 		/* very busy, give smaller timeout */
2929 		tcp_data->tcp_timeout = 200;
2930 	}
2931 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
2932 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
2933 
2934 	event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
2935 		handle_tcp_reading, tcp_data);
2936 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
2937 		log_msg(LOG_ERR, "cannot set tcp event base");
2938 		close(s);
2939 		region_destroy(tcp_region);
2940 		return;
2941 	}
2942 	if(event_add(&tcp_data->event, &timeout) != 0) {
2943 		log_msg(LOG_ERR, "cannot add tcp to event base");
2944 		close(s);
2945 		region_destroy(tcp_region);
2946 		return;
2947 	}
2948 
2949 	/*
2950 	 * Keep track of the total number of TCP handlers installed so
2951 	 * we can stop accepting connections when the maximum number
2952 	 * of simultaneous TCP connections is reached.
2953 	 */
2954 	++data->nsd->current_tcp_count;
2955 	if (data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
2956 		configure_handler_event_types(0);
2957 	}
2958 }
2959 
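/*
 * Send a command to every child server over its IPC socketpair.  With
 * a nonzero timeout, block_read() waits up to that many seconds for
 * the child to echo a reply before the channel is closed.
 */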
2960 static void
2961 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
2962 {
2963 	size_t i;
2964 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
2965 	for (i = 0; i < nsd->child_count; ++i) {
2966 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
2967 			if (write(nsd->children[i].child_fd,
2968 				&command,
2969 				sizeof(command)) == -1)
2970 			{
2971 				if(errno != EAGAIN && errno != EINTR)
2972 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
2973 					(int) command,
2974 					(int) nsd->children[i].pid,
2975 					strerror(errno));
2976 			} else if (timeout > 0) {
2977 				(void)block_read(NULL,
2978 					nsd->children[i].child_fd,
2979 					&command, sizeof(command), timeout);
2980 			}
2981 			fsync(nsd->children[i].child_fd);
2982 			close(nsd->children[i].child_fd);
2983 			nsd->children[i].child_fd = -1;
2984 		}
2985 	}
2986 }
2987 
2988 static void
2989 send_children_quit(struct nsd* nsd)
2990 {
2991 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
2992 	send_children_command(nsd, NSD_QUIT, 0);
2993 }
2994 
2995 static void
2996 send_children_quit_and_wait(struct nsd* nsd)
2997 {
2998 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
2999 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
3000 }
3001 
3002 #ifdef BIND8_STATS
3003 static void
3004 set_children_stats(struct nsd* nsd)
3005 {
3006 	size_t i;
3007 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
3008 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
3009 	for (i = 0; i < nsd->child_count; ++i) {
3010 		nsd->children[i].need_to_send_STATS = 1;
3011 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
3012 	}
3013 }
3014 #endif /* BIND8_STATS */
3015 
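/*
 * Point all TCP accept handlers at the given event types; passing 0
 * removes them.  An added libevent event cannot change its event
 * types in place, hence the event_del()/event_set()/event_add()
 * cycle.
 */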
3016 static void
3017 configure_handler_event_types(short event_types)
3018 {
3019 	size_t i;
3020 
3021 	for (i = 0; i < tcp_accept_handler_count; ++i) {
3022 		struct event* handler = &tcp_accept_handlers[i].event;
3023 		if(event_types) {
3024 			/* reassign */
3025 			int fd = handler->ev_fd;
3026 			struct event_base* base = handler->ev_base;
3027 			if(tcp_accept_handlers[i].event_added)
3028 				event_del(handler);
3029 			event_set(handler, fd, event_types,
3030 				handle_tcp_accept, &tcp_accept_handlers[i]);
3031 			if(event_base_set(base, handler) != 0)
3032 				log_msg(LOG_ERR, "conhand: cannot event_base");
3033 			if(event_add(handler, NULL) != 0)
3034 				log_msg(LOG_ERR, "conhand: cannot event_add");
3035 			tcp_accept_handlers[i].event_added = 1;
3036 		} else {
3037 			/* remove */
3038 			if(tcp_accept_handlers[i].event_added) {
3039 				event_del(handler);
3040 				tcp_accept_handlers[i].event_added = 0;
3041 			}
3042 		}
3043 	}
3044 }
3045