xref: /openbsd-src/usr.sbin/nsd/server.c (revision d1df930ffab53da22f3324c32bed7ac5709915e6)
1 /*
2  * server.c -- nsd(8) network input/output
3  *
4  * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
5  *
6  * See LICENSE for the license.
7  *
8  */
9 
10 #include "config.h"
11 
12 #include <sys/types.h>
13 #include <sys/param.h>
14 #include <sys/socket.h>
15 #include <sys/uio.h>
16 #include <sys/wait.h>
17 
18 #include <netinet/in.h>
19 #include <arpa/inet.h>
20 
21 #include <assert.h>
22 #include <ctype.h>
23 #include <errno.h>
24 #include <fcntl.h>
25 #include <stddef.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <time.h>
30 #include <unistd.h>
31 #include <signal.h>
32 #include <netdb.h>
33 #include <poll.h>
34 #ifndef SHUT_WR
35 #define SHUT_WR 1
36 #endif
37 #ifdef HAVE_MMAP
38 #include <sys/mman.h>
39 #endif /* HAVE_MMAP */
40 #ifdef HAVE_OPENSSL_RAND_H
41 #include <openssl/rand.h>
42 #endif
43 #ifndef USE_MINI_EVENT
44 #  ifdef HAVE_EVENT_H
45 #    include <event.h>
46 #  else
47 #    include <event2/event.h>
48 #    include "event2/event_struct.h"
49 #    include "event2/event_compat.h"
50 #  endif
51 #else
52 #  include "mini_event.h"
53 #endif
54 
55 #include "axfr.h"
56 #include "namedb.h"
57 #include "netio.h"
58 #include "xfrd.h"
59 #include "xfrd-tcp.h"
60 #include "xfrd-disk.h"
61 #include "difffile.h"
62 #include "nsec3.h"
63 #include "ipc.h"
64 #include "udb.h"
65 #include "remote.h"
66 #include "lookup3.h"
67 #include "rrl.h"
68 
69 #define RELOAD_SYNC_TIMEOUT 25 /* seconds */
70 
71 /*
72  * Data for the UDP handlers.
73  */
74 struct udp_handler_data
75 {
76 	struct nsd        *nsd;
77 	struct nsd_socket *socket;
78 	query_type        *query;
79 };
80 
81 struct tcp_accept_handler_data {
82 	struct nsd         *nsd;
83 	struct nsd_socket  *socket;
84 	int event_added;
85 	struct event       event;
86 };
87 
88 /*
89  * These globals are used to enable the TCP accept handlers
90  * when the number of TCP connections drops below the maximum
91  * number of TCP connections.
92  */
93 static size_t		tcp_accept_handler_count;
94 static struct tcp_accept_handler_data*	tcp_accept_handlers;
95 
96 static struct event slowaccept_event;
97 static int slowaccept;
98 
99 #ifndef NONBLOCKING_IS_BROKEN
100 #  define NUM_RECV_PER_SELECT 100
101 #endif
102 
103 #if (!defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG))
104 struct mmsghdr msgs[NUM_RECV_PER_SELECT];
105 struct iovec iovecs[NUM_RECV_PER_SELECT];
106 struct query *queries[NUM_RECV_PER_SELECT];
107 #endif
108 
109 /*
110  * Data for the TCP connection handlers.
111  *
112  * The TCP handlers use non-blocking I/O.  This is necessary to avoid
113  * blocking the entire server on a slow TCP connection, but does make
114  * reading from and writing to the socket more complicated.
115  *
116  * Basically, whenever a read/write would block (indicated by the
117  * EAGAIN errno variable) we remember the position we were reading
118  * from/writing to and return from the TCP reading/writing event
119  * handler.  When the socket becomes readable/writable again we
120  * continue from the same position.
121  */
122 struct tcp_handler_data
123 {
124 	/*
125 	 * The region used to allocate all TCP connection related
126 	 * data, including this structure.  This region is destroyed
127 	 * when the connection is closed.
128 	 */
129 	region_type*		region;
130 
131 	/*
132 	 * The global nsd structure.
133 	 */
134 	struct nsd*			nsd;
135 
136 	/*
137 	 * The current query data for this TCP connection.
138 	 */
139 	query_type*			query;
140 
141 	/*
142 	 * The query_state is used to remember if we are performing an
143 	 * AXFR, if we're done processing, or if we should discard the
144 	 * query and connection.
145 	 */
146 	query_state_type	query_state;
147 
148 	/*
149 	 * The event for the file descriptor and tcp timeout
150 	 */
151 	struct event event;
152 
153 	/*
154 	 * The bytes_transmitted field is used to remember the number
155 	 * of bytes transmitted when receiving or sending a DNS
156 	 * packet.  The count includes the two additional bytes used
157 	 * to specify the packet length on a TCP connection.
158 	 */
159 	size_t				bytes_transmitted;
160 
161 	/*
162 	 * The number of queries handled by this specific TCP connection.
163 	 */
164 	int					query_count;
165 
166 	/*
167 	 * The timeout in msec for this tcp connection
168 	 */
169 	int	tcp_timeout;
170 };
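/*
 * Editor's illustration (not part of NSD): a minimal sketch of the
 * resume-after-EAGAIN pattern described above.  A saved offset, in the
 * spirit of the bytes_transmitted field, records how far we got; on
 * EAGAIN the handler returns and resumes from that offset when the
 * socket becomes writable again.  The function name and parameters
 * are hypothetical.
 */
#if 0
static int
sketch_write_resume(int fd, const uint8_t* buf, size_t len, size_t* off)
{
	while (*off < len) {
		ssize_t n = write(fd, buf + *off, len - *off);
		if (n == -1) {
			if (errno == EAGAIN || errno == EINTR)
				return 0;	/* retry when writable again */
			return -1;	/* hard error */
		}
		*off += (size_t)n;
	}
	return 1;	/* transmission complete */
}
#endif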
171 
172 /*
173  * Handle incoming queries on the UDP server sockets.
174  */
175 static void handle_udp(int fd, short event, void* arg);
176 
177 /*
178  * Handle incoming connections on the TCP sockets.  These handlers
179  * usually wait for the NETIO_EVENT_READ event (indicating an incoming
180  * connection) but are disabled when the number of current TCP
181  * connections is equal to the maximum number of TCP connections.
182  * Disabling is done by changing the handler to wait for the
183  * NETIO_EVENT_NONE type.  This is done using the function
184  * configure_handler_event_types.
185  */
186 static void handle_tcp_accept(int fd, short event, void* arg);
187 
188 /*
189  * Handle incoming queries on a TCP connection.  The TCP connections
190  * are configured to be non-blocking and the handler may be called
191  * multiple times before a complete query is received.
192  */
193 static void handle_tcp_reading(int fd, short event, void* arg);
194 
195 /*
196  * Handle outgoing responses on a TCP connection.  The TCP connections
197  * are configured to be non-blocking and the handler may be called
198  * multiple times before a complete response is sent.
199  */
200 static void handle_tcp_writing(int fd, short event, void* arg);
201 
202 /*
203  * Send all children the quit command (non-blocking), then close the pipe.
204  */
205 static void send_children_quit(struct nsd* nsd);
206 /* same, for shutdown time, waits for child to exit to avoid restart issues */
207 static void send_children_quit_and_wait(struct nsd* nsd);
208 
209 /* set children's flags to send NSD_STATS to them */
210 #ifdef BIND8_STATS
211 static void set_children_stats(struct nsd* nsd);
212 #endif /* BIND8_STATS */
213 
214 /*
215  * Change the event types the HANDLERS are interested in to EVENT_TYPES.
216  */
217 static void configure_handler_event_types(short event_types);
218 
219 static uint16_t *compressed_dname_offsets = 0;
220 static uint32_t compression_table_capacity = 0;
221 static uint32_t compression_table_size = 0;
222 static domain_type* compressed_dnames[MAXRRSPP];
223 
224 /*
225  * Remove the specified pid from the list of child pids.  Returns -1 if
226  * the pid is not in the list, child_num otherwise.  The stored pid is set to 0.
227  */
228 static int
229 delete_child_pid(struct nsd *nsd, pid_t pid)
230 {
231 	size_t i;
232 	for (i = 0; i < nsd->child_count; ++i) {
233 		if (nsd->children[i].pid == pid) {
234 			nsd->children[i].pid = 0;
235 			if(!nsd->children[i].need_to_exit) {
236 				if(nsd->children[i].child_fd != -1)
237 					close(nsd->children[i].child_fd);
238 				nsd->children[i].child_fd = -1;
239 				if(nsd->children[i].handler)
240 					nsd->children[i].handler->fd = -1;
241 			}
242 			return i;
243 		}
244 	}
245 	return -1;
246 }
247 
248 /*
249  * Restart child servers if necessary.
250  */
251 static int
252 restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
253 	int* xfrd_sock_p)
254 {
255 	struct main_ipc_handler_data *ipc_data;
256 	size_t i;
257 	int sv[2];
258 
259 	/* Fork the child processes... */
260 	for (i = 0; i < nsd->child_count; ++i) {
261 		if (nsd->children[i].pid <= 0) {
262 			if (nsd->children[i].child_fd != -1)
263 				close(nsd->children[i].child_fd);
264 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
265 				log_msg(LOG_ERR, "socketpair: %s",
266 					strerror(errno));
267 				return -1;
268 			}
269 			nsd->children[i].child_fd = sv[0];
270 			nsd->children[i].parent_fd = sv[1];
271 			nsd->children[i].pid = fork();
272 			switch (nsd->children[i].pid) {
273 			default: /* SERVER MAIN */
274 				close(nsd->children[i].parent_fd);
275 				nsd->children[i].parent_fd = -1;
276 				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
277 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
278 				}
279 				if(!nsd->children[i].handler)
280 				{
281 					ipc_data = (struct main_ipc_handler_data*) region_alloc(
282 						region, sizeof(struct main_ipc_handler_data));
283 					ipc_data->nsd = nsd;
284 					ipc_data->child = &nsd->children[i];
285 					ipc_data->child_num = i;
286 					ipc_data->xfrd_sock = xfrd_sock_p;
287 					ipc_data->packet = buffer_create(region, QIOBUFSZ);
288 					ipc_data->forward_mode = 0;
289 					ipc_data->got_bytes = 0;
290 					ipc_data->total_bytes = 0;
291 					ipc_data->acl_num = 0;
292 					nsd->children[i].handler = (struct netio_handler*) region_alloc(
293 						region, sizeof(struct netio_handler));
294 					nsd->children[i].handler->fd = nsd->children[i].child_fd;
295 					nsd->children[i].handler->timeout = NULL;
296 					nsd->children[i].handler->user_data = ipc_data;
297 					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
298 					nsd->children[i].handler->event_handler = parent_handle_child_command;
299 					netio_add_handler(netio, nsd->children[i].handler);
300 				}
301 				/* clear any ongoing ipc */
302 				ipc_data = (struct main_ipc_handler_data*)
303 					nsd->children[i].handler->user_data;
304 				ipc_data->forward_mode = 0;
305 				/* restart - update fd */
306 				nsd->children[i].handler->fd = nsd->children[i].child_fd;
307 				break;
308 			case 0: /* CHILD */
309 				/* the child need not be able to access the
310 				 * nsd.db file */
311 				namedb_close_udb(nsd->db);
312 #ifdef MEMCLEAN /* OS collects memory pages */
313 				region_destroy(region);
314 #endif
315 
316 				if (pledge("stdio rpath inet", NULL) == -1) {
317 					log_msg(LOG_ERR, "pledge");
318 					exit(1);
319 				}
320 
321 				nsd->pid = 0;
322 				nsd->child_count = 0;
323 				nsd->server_kind = nsd->children[i].kind;
324 				nsd->this_child = &nsd->children[i];
325 				nsd->this_child->child_num = i;
326 				/* remove signal flags inherited from parent;
327 				   the parent will handle them. */
328 				nsd->signal_hint_reload_hup = 0;
329 				nsd->signal_hint_reload = 0;
330 				nsd->signal_hint_child = 0;
331 				nsd->signal_hint_quit = 0;
332 				nsd->signal_hint_shutdown = 0;
333 				nsd->signal_hint_stats = 0;
334 				nsd->signal_hint_statsusr = 0;
335 				close(*xfrd_sock_p);
336 				close(nsd->this_child->child_fd);
337 				nsd->this_child->child_fd = -1;
338 				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
339 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
340 				}
341 				server_child(nsd);
342 				/* NOTREACH */
343 				exit(0);
344 			case -1:
345 				log_msg(LOG_ERR, "fork failed: %s",
346 					strerror(errno));
347 				return -1;
348 			}
349 		}
350 	}
351 	return 0;
352 }
353 
354 #ifdef BIND8_STATS
355 static void set_bind8_alarm(struct nsd* nsd)
356 {
357 	/* resync so that the next alarm is on the next whole minute */
358 	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
359 		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
360 }
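/*
 * Worked example (editor's): with st.period == 60 and
 * time(NULL) % 60 == 42, the call above is alarm(18), so SIGALRM
 * fires exactly on the next whole minute boundary.
 */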
361 #endif
362 
363 /* set zone stat ids for zones initially read in */
364 static void
365 zonestatid_tree_set(struct nsd* nsd)
366 {
367 	struct radnode* n;
368 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
369 		zone_type* zone = (zone_type*)n->elem;
370 		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
371 	}
372 }
373 
374 #ifdef USE_ZONE_STATS
375 void
376 server_zonestat_alloc(struct nsd* nsd)
377 {
378 	size_t num = (nsd->options->zonestatnames->count==0?1:
379 			nsd->options->zonestatnames->count);
380 	size_t sz = sizeof(struct nsdst)*num;
381 	char tmpfile[256];
382 	uint8_t z = 0;
383 
384 	/* file names */
385 	nsd->zonestatfname[0] = 0;
386 	nsd->zonestatfname[1] = 0;
387 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
388 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
389 	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
390 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
391 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
392 	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);
393 
394 	/* file descriptors */
395 	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
396 	if(nsd->zonestatfd[0] == -1) {
397 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
398 			strerror(errno));
399 		exit(1);
400 	}
401 	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
402 	if(nsd->zonestatfd[1] == -1) {
403 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
404 			strerror(errno));
405 		close(nsd->zonestatfd[0]);
406 		unlink(nsd->zonestatfname[0]);
407 		exit(1);
408 	}
409 
410 #ifdef HAVE_MMAP
411 	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
412 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
413 			strerror(errno));
414 		exit(1);
415 	}
416 	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
417 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
418 			nsd->zonestatfname[0], strerror(errno));
419 		exit(1);
420 	}
421 	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
422 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
423 			strerror(errno));
424 		exit(1);
425 	}
426 	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
427 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
428 			nsd->zonestatfname[1], strerror(errno));
429 		exit(1);
430 	}
431 	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
432 		MAP_SHARED, nsd->zonestatfd[0], 0);
433 	if(nsd->zonestat[0] == MAP_FAILED) {
434 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
435 		unlink(nsd->zonestatfname[0]);
436 		unlink(nsd->zonestatfname[1]);
437 		exit(1);
438 	}
439 	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
440 		MAP_SHARED, nsd->zonestatfd[1], 0);
441 	if(nsd->zonestat[1] == MAP_FAILED) {
442 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
443 		unlink(nsd->zonestatfname[0]);
444 		unlink(nsd->zonestatfname[1]);
445 		exit(1);
446 	}
447 	memset(nsd->zonestat[0], 0, sz);
448 	memset(nsd->zonestat[1], 0, sz);
449 	nsd->zonestatsize[0] = num;
450 	nsd->zonestatsize[1] = num;
451 	nsd->zonestatdesired = num;
452 	nsd->zonestatsizenow = num;
453 	nsd->zonestatnow = nsd->zonestat[0];
454 #endif /* HAVE_MMAP */
455 }
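/*
 * Editor's illustration (not part of NSD): the file-backed shared
 * memory technique used above, reduced to its essentials.  The file is
 * grown to the mapping size by writing one byte at offset sz-1, then
 * mapped MAP_SHARED so that parent and children update the same
 * statistics counters.  The helper name is hypothetical.
 */
#if 0
static void*
sketch_shared_stats(int fd, size_t sz)
{
	uint8_t z = 0;
	if (lseek(fd, (off_t)sz - 1, SEEK_SET) == -1 ||
	    write(fd, &z, 1) != 1)
		return MAP_FAILED;	/* could not extend the file */
	/* caller must compare the result against MAP_FAILED */
	return mmap(NULL, sz, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
}
#endif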
456 
457 void
458 zonestat_remap(struct nsd* nsd, int idx, size_t sz)
459 {
460 #ifdef HAVE_MMAP
461 #ifdef MREMAP_MAYMOVE
462 	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
463 		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
464 		MREMAP_MAYMOVE);
465 	if(nsd->zonestat[idx] == MAP_FAILED) {
466 		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
467 		exit(1);
468 	}
469 #else /* !HAVE MREMAP */
470 	if(msync(nsd->zonestat[idx],
471 		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
472 		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
473 	if(munmap(nsd->zonestat[idx],
474 		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
475 		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
476 	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
477 		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
478 	if(nsd->zonestat[idx] == MAP_FAILED) {
479 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
480 		exit(1);
481 	}
482 #endif /* MREMAP */
483 #endif /* HAVE_MMAP */
484 }
485 
486 /* realloc the zonestat array for the one that is not currently in use,
487  * to match the desired new size of the array (if applicable) */
488 void
489 server_zonestat_realloc(struct nsd* nsd)
490 {
491 #ifdef HAVE_MMAP
492 	uint8_t z = 0;
493 	size_t sz;
494 	int idx = 0; /* index of the zonestat array that is not in use */
495 	if(nsd->zonestatnow == nsd->zonestat[0])
496 		idx = 1;
497 	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
498 		return;
499 	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
500 	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
501 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
502 			strerror(errno));
503 		exit(1);
504 	}
505 	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
506 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
507 			nsd->zonestatfname[idx], strerror(errno));
508 		exit(1);
509 	}
510 	zonestat_remap(nsd, idx, sz);
511 	/* zero the newly allocated region */
512 	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
513 		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
514 			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
515 			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
516 	}
517 	nsd->zonestatsize[idx] = nsd->zonestatdesired;
518 #endif /* HAVE_MMAP */
519 }
520 
521 /* switch over to use the other array for the new children, which
522  * briefly coexist with the old children; this avoids both sets of
523  * children writing to the same statistics array. */
524 void
525 server_zonestat_switch(struct nsd* nsd)
526 {
527 	if(nsd->zonestatnow == nsd->zonestat[0]) {
528 		nsd->zonestatnow = nsd->zonestat[1];
529 		nsd->zonestatsizenow = nsd->zonestatsize[1];
530 	} else {
531 		nsd->zonestatnow = nsd->zonestat[0];
532 		nsd->zonestatsizenow = nsd->zonestatsize[0];
533 	}
534 }
535 #endif /* USE_ZONE_STATS */
536 
537 static void
538 cleanup_dname_compression_tables(void *ptr)
539 {
540 	free(ptr);
541 	compressed_dname_offsets = NULL;
542 	compression_table_capacity = 0;
543 }
544 
545 static void
546 initialize_dname_compression_tables(struct nsd *nsd)
547 {
548 	size_t needed = domain_table_count(nsd->db->domains) + 1;
549 	needed += EXTRA_DOMAIN_NUMBERS;
550 	if(compression_table_capacity < needed) {
551 		if(compressed_dname_offsets) {
552 			region_remove_cleanup(nsd->db->region,
553 				cleanup_dname_compression_tables,
554 				compressed_dname_offsets);
555 			free(compressed_dname_offsets);
556 		}
557 		compressed_dname_offsets = (uint16_t *) xmallocarray(
558 			needed, sizeof(uint16_t));
559 		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
560 			compressed_dname_offsets);
561 		compression_table_capacity = needed;
562 		compression_table_size=domain_table_count(nsd->db->domains)+1;
563 	}
564 	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
565 	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
566 }
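/*
 * Example of the table's role (editor's note): compressed_dname_offsets
 * maps a domain's number to the packet offset where that owner name was
 * last written, so a later occurrence can be encoded as a two-byte DNS
 * compression pointer instead of the full name.  Entry 0 is QHEADERSZ
 * because the original query name always starts right after the header.
 */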
567 
568 /* create and bind sockets.  */
569 static int
570 server_init_ifs(struct nsd *nsd, size_t from, size_t to, int* reuseport_works)
571 {
572 	struct addrinfo* addr;
573 	size_t i;
574 #if defined(SO_REUSEPORT) || defined(SO_REUSEADDR) || (defined(INET6) && (defined(IPV6_V6ONLY) || defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU) || defined(IP_TRANSPARENT)) || defined(IP_FREEBIND) || defined(SO_BINDANY))
575 	int on = 1;
576 #endif
577 
578 	/* UDP */
579 
580 	/* Make a socket... */
581 	for (i = from; i < to; i++) {
582 		/* for reuseports copy socket specs of first entries */
583 		addr = nsd->udp[i%nsd->ifs].addr;
584 		if (!addr) {
585 			nsd->udp[i].s = -1;
586 			continue;
587 		}
588 		nsd->udp[i].fam = (int)addr->ai_family;
589 		if ((nsd->udp[i].s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
590 #if defined(INET6)
591 			if (addr->ai_family == AF_INET6 &&
592 				errno == EAFNOSUPPORT && nsd->grab_ip6_optional) {
593 				log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: not supported");
594 				continue;
595 			}
596 #endif /* INET6 */
597 			log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
598 			return -1;
599 		}
600 
601 #ifdef SO_REUSEPORT
602 		if(nsd->reuseport && *reuseport_works &&
603 			setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_REUSEPORT,
604 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
605 			if(verbosity >= 3
606 #ifdef ENOPROTOOPT
607 				|| errno != ENOPROTOOPT
608 #endif
609 				)
610 			    log_msg(LOG_ERR, "setsockopt(..., SO_REUSEPORT, "
611 				"...) failed: %s", strerror(errno));
612 			*reuseport_works = 0;
613 		}
614 #else
615 		(void)reuseport_works;
616 #endif /* SO_REUSEPORT */
617 #if defined(SO_RCVBUF) || defined(SO_SNDBUF)
618 	if(1) {
619 	int rcv = 1*1024*1024;
620 	int snd = 1*1024*1024;
621 
622 #ifdef SO_RCVBUF
623 #  ifdef SO_RCVBUFFORCE
624 	if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv,
625 		(socklen_t)sizeof(rcv)) < 0) {
626 		if(errno != EPERM && errno != ENOBUFS) {
627 			log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, "
628                                         "...) failed: %s", strerror(errno));
629 			return -1;
630 		}
631 #  else
632 	if(1) {
633 #  endif /* SO_RCVBUFFORCE */
634 		if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv,
635 			 (socklen_t)sizeof(rcv)) < 0) {
636 			if(errno != ENOBUFS && errno != ENOSYS) {
637 				log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, "
638                                         "...) failed: %s", strerror(errno));
639 				return -1;
640 			}
641 		}
642 	}
643 #endif /* SO_RCVBUF */
644 
645 #ifdef SO_SNDBUF
646 #  ifdef SO_SNDBUFFORCE
647 	if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd,
648 		(socklen_t)sizeof(snd)) < 0) {
649 		if(errno != EPERM && errno != ENOBUFS) {
650 			log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, "
651                                         "...) failed: %s", strerror(errno));
652 			return -1;
653 		}
654 #  else
655 	if(1) {
656 #  endif /* SO_SNDBUFFORCE */
657 		if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_SNDBUF, (void*)&snd,
658 			 (socklen_t)sizeof(snd)) < 0) {
659 			if(errno != ENOBUFS && errno != ENOSYS) {
660 				log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, "
661                                         "...) failed: %s", strerror(errno));
662 				return -1;
663 			}
664 		}
665 	}
666 #endif /* SO_SNDBUF */
667 
668 	}
669 #endif /* defined(SO_RCVBUF) || defined(SO_SNDBUF) */
670 
671 #if defined(INET6)
672 		if (addr->ai_family == AF_INET6) {
673 # if defined(IPV6_V6ONLY)
674 			if (setsockopt(nsd->udp[i].s,
675 				       IPPROTO_IPV6, IPV6_V6ONLY,
676 				       &on, sizeof(on)) < 0)
677 			{
678 				log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
679 					strerror(errno));
680 				return -1;
681 			}
682 # endif
683 # if defined(IPV6_USE_MIN_MTU)
684 			/*
685 			 * There is no fragmentation of IPv6 datagrams
686 			 * during forwarding in the network. Therefore
687 			 * we do not send UDP datagrams larger than
688 			 * the minimum IPv6 MTU of 1280 octets. The
689 			 * EDNS0 message length can be larger if the
690 			 * network stack supports IPV6_USE_MIN_MTU.
691 			 */
692 			if (setsockopt(nsd->udp[i].s,
693 				       IPPROTO_IPV6, IPV6_USE_MIN_MTU,
694 				       &on, sizeof(on)) < 0)
695 			{
696 				log_msg(LOG_ERR, "setsockopt(..., IPV6_USE_MIN_MTU, ...) failed: %s",
697 					strerror(errno));
698 				return -1;
699 			}
700 # elif defined(IPV6_MTU)
701 			/*
702 			 * On Linux, PMTUD is disabled by default for datagrams
703 			 * so set the MTU equal to the MIN MTU to get the same.
704 			 */
705 			on = IPV6_MIN_MTU;
706 			if (setsockopt(nsd->udp[i].s, IPPROTO_IPV6, IPV6_MTU,
707 				&on, sizeof(on)) < 0)
708 			{
709 				log_msg(LOG_ERR, "setsockopt(..., IPV6_MTU, ...) failed: %s",
710 					strerror(errno));
711 				return -1;
712 			}
713 			on = 1;
714 # endif
715 		}
716 #endif
717 #if defined(AF_INET)
718 		if (addr->ai_family == AF_INET) {
719 #  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
720 			int action = IP_PMTUDISC_DONT;
721 			if (setsockopt(nsd->udp[i].s, IPPROTO_IP,
722 				IP_MTU_DISCOVER, &action, sizeof(action)) < 0)
723 			{
724 				log_msg(LOG_ERR, "setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
725 					strerror(errno));
726 				return -1;
727 			}
728 #  elif defined(IP_DONTFRAG)
729 			int off = 0;
730 			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_DONTFRAG,
731 				&off, sizeof(off)) < 0)
732 			{
733 				log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
734 					strerror(errno));
735 				return -1;
736 			}
737 #  endif
738 		}
739 #endif
740 		/* set it nonblocking */
741 		/* otherwise, on OSes with thundering herd problems, the
742 		   UDP recv could block NSD after select returns readable. */
743 		if (fcntl(nsd->udp[i].s, F_SETFL, O_NONBLOCK) == -1) {
744 			log_msg(LOG_ERR, "cannot fcntl udp: %s", strerror(errno));
745 		}
746 
747 		/* Bind it... */
748 		if (nsd->options->ip_freebind) {
749 #ifdef IP_FREEBIND
750 			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) < 0) {
751 				log_msg(LOG_ERR, "setsockopt(...,IP_FREEBIND, ...) failed for udp: %s",
752 					strerror(errno));
753 			}
754 #endif /* IP_FREEBIND */
755 		}
756 
757 		if (nsd->options->ip_transparent) {
758 #ifdef IP_TRANSPARENT
759 			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)) < 0) {
760 				log_msg(LOG_ERR, "setsockopt(...,IP_TRANSPARENT, ...) failed for udp: %s",
761 					strerror(errno));
762 			}
763 #endif /* IP_TRANSPARENT */
764 #ifdef SO_BINDANY
765 			if (setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_BINDANY, &on, sizeof(on)) < 0) {
766 				log_msg(LOG_ERR, "setsockopt(...,SO_BINDANY, ...) failed for udp: %s",
767 					strerror(errno));
768 			}
769 #endif /* SO_BINDANY */
770 		}
771 
772 		if (
773 			bind(nsd->udp[i].s, (struct sockaddr *) addr->ai_addr, addr->ai_addrlen) != 0) {
774 			log_msg(LOG_ERR, "can't bind udp socket: %s", strerror(errno));
775 			return -1;
776 		}
777 	}
778 
779 	/* TCP */
780 
781 	/* Make a socket... */
782 	for (i = from; i < to; i++) {
783 		/* for reuseports copy socket specs of first entries */
784 		addr = nsd->tcp[i%nsd->ifs].addr;
785 		if (!addr) {
786 			nsd->tcp[i].s = -1;
787 			continue;
788 		}
789 		nsd->tcp[i].fam = (int)addr->ai_family;
790 		/* turn off REUSEPORT for TCP by copying the socket fd */
791 		if(i >= nsd->ifs) {
792 			nsd->tcp[i].s = nsd->tcp[i%nsd->ifs].s;
793 			continue;
794 		}
795 		if ((nsd->tcp[i].s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
796 #if defined(INET6)
797 			if (addr->ai_family == AF_INET6 &&
798 				errno == EAFNOSUPPORT && nsd->grab_ip6_optional) {
799 				log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: not supported");
800 				continue;
801 			}
802 #endif /* INET6 */
803 			log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
804 			return -1;
805 		}
806 
807 #ifdef SO_REUSEPORT
808 		if(nsd->reuseport && *reuseport_works &&
809 			setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_REUSEPORT,
810 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
811 			if(verbosity >= 3
812 #ifdef ENOPROTOOPT
813 				|| errno != ENOPROTOOPT
814 #endif
815 				)
816 			    log_msg(LOG_ERR, "setsockopt(..., SO_REUSEPORT, "
817 				"...) failed: %s", strerror(errno));
818 			*reuseport_works = 0;
819 		}
820 #endif /* SO_REUSEPORT */
821 #ifdef	SO_REUSEADDR
822 		if (setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0) {
823 			log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s", strerror(errno));
824 		}
825 #endif /* SO_REUSEADDR */
826 
827 #if defined(INET6)
828 		if (addr->ai_family == AF_INET6) {
829 # if defined(IPV6_V6ONLY)
830 			if (setsockopt(nsd->tcp[i].s, IPPROTO_IPV6, IPV6_V6ONLY,
831 				&on, sizeof(on)) < 0) {
832 				log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed: %s", strerror(errno));
833 				return -1;
834 			}
835 # endif
836 # if defined(IPV6_USE_MIN_MTU)
837 			/*
838 			 * Use minimum MTU to minimize delays learning working
839 			 * PMTU when communicating through a tunnel.
840 			 */
841 			if (setsockopt(nsd->tcp[i].s,
842 				       IPPROTO_IPV6, IPV6_USE_MIN_MTU,
843 				       &on, sizeof(on)) < 0) {
844 				log_msg(LOG_ERR, "setsockopt(..., IPV6_USE_MIN_MTU, ...) failed: %s", strerror(errno));
845 				return -1;
846 			}
847 # elif defined(IPV6_MTU)
848 			/*
849 			 * On Linux, PMTUD is disabled by default for datagrams
850 			 * so set the MTU equal to the MIN MTU to get the same.
851 			 */
852 			on = IPV6_MIN_MTU;
853 			if (setsockopt(nsd->tcp[i].s, IPPROTO_IPV6, IPV6_MTU,
854 				&on, sizeof(on)) < 0) {
855 				log_msg(LOG_ERR, "setsockopt(..., IPV6_MTU, ...) failed: %s", strerror(errno));
856 				return -1;
857 			}
858 			on = 1;
859 # endif
860 		}
861 #endif
862 		/* set maximum segment size to tcp socket */
863 		if(nsd->tcp_mss > 0) {
864 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
865 			if(setsockopt(nsd->tcp[i].s, IPPROTO_TCP, TCP_MAXSEG,
866 					(void*)&nsd->tcp_mss,
867 					sizeof(nsd->tcp_mss)) < 0) {
868 				log_msg(LOG_ERR,
869 					"setsockopt(...,TCP_MAXSEG,...)"
870 					" failed for tcp: %s", strerror(errno));
871 			}
872 #else
873 			log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
874 #endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
875 		}
876 
877 		/* set it nonblocking */
878 		/* (StevensUNP p463), if tcp listening socket is blocking, then
879 		   it may block in accept, even if select() says readable. */
880 		if (fcntl(nsd->tcp[i].s, F_SETFL, O_NONBLOCK) == -1) {
881 			log_msg(LOG_ERR, "cannot fcntl tcp: %s", strerror(errno));
882 		}
883 
884 		/* Bind it... */
885 		if (nsd->options->ip_freebind) {
886 #ifdef IP_FREEBIND
887 			if (setsockopt(nsd->tcp[i].s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) < 0) {
888 				log_msg(LOG_ERR, "setsockopt(...,IP_FREEBIND, ...) failed for tcp: %s",
889 					strerror(errno));
890 			}
891 #endif /* IP_FREEBIND */
892 		}
893 
894 		if (nsd->options->ip_transparent) {
895 #ifdef IP_TRANSPARENT
896 			if (setsockopt(nsd->tcp[i].s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)) < 0) {
897 				log_msg(LOG_ERR, "setsockopt(...,IP_TRANSPARENT, ...) failed for tcp: %s",
898 					strerror(errno));
899 			}
900 #endif /* IP_TRANSPARENT */
901 #ifdef SO_BINDANY
902 			if (setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_BINDANY, &on, sizeof(on)) < 0) {
903 				log_msg(LOG_ERR, "setsockopt(...,SO_BINDANY, ...) failed for tcp: %s",
904 					strerror(errno));
905 			}
906 #endif /* SO_BINDANY */
907 		}
908 
909 		if(
910 			bind(nsd->tcp[i].s, (struct sockaddr *) addr->ai_addr, addr->ai_addrlen) != 0) {
911 			log_msg(LOG_ERR, "can't bind tcp socket: %s", strerror(errno));
912 			return -1;
913 		}
914 
915 		/* Listen to it... */
916 		if (listen(nsd->tcp[i].s, TCP_BACKLOG) == -1) {
917 			log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
918 			return -1;
919 		}
920 	}
921 
922 	return 0;
923 }
924 
925 /*
926  * Initialize the server, reuseport, create and bind the sockets.
927  */
928 int
929 server_init(struct nsd *nsd)
930 {
931 	int reuseport_successful = 1; /* see if reuseport works in OS */
932 	if(nsd->reuseport) {
933 		/* increase the size of the udp and tcp interface arrays,
934 		 * there are going to be separate interface file descriptors
935 		 * for every server instance */
936 		nsd->udp = xrealloc(nsd->udp, (nsd->ifs*nsd->reuseport)*
937 			sizeof(*nsd->udp));
938 		nsd->tcp = xrealloc(nsd->tcp, (nsd->ifs*nsd->reuseport)*
939 			sizeof(*nsd->tcp));
940 		memset(&nsd->udp[nsd->ifs], 0, sizeof(*nsd->udp)*
941 			(nsd->ifs*(nsd->reuseport-1)));
942 		memset(&nsd->tcp[nsd->ifs], 0, sizeof(*nsd->tcp)*
943 			(nsd->ifs*(nsd->reuseport-1)));
944 	}
945 
946 	/* open the server interface ports */
947 	if(server_init_ifs(nsd, 0, nsd->ifs, &reuseport_successful) == -1)
948 		return -1;
949 
950 	/* continue to open the remaining reuseport ports */
951 	if(nsd->reuseport && reuseport_successful) {
952 		if(server_init_ifs(nsd, nsd->ifs, nsd->ifs*nsd->reuseport,
953 			&reuseport_successful) == -1)
954 			return -1;
955 		nsd->ifs *= nsd->reuseport;
956 	} else {
957 		nsd->reuseport = 0;
958 	}
959 	return 0;
960 }
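/*
 * Worked example (editor's): with 2 listening interfaces (nsd->ifs == 2)
 * and reuse-port set to 4 server processes, server_init opens 2*4 = 8
 * UDP sockets; entries 2..7 copy the address specs of entries 0..1 via
 * the i % nsd->ifs indexing in server_init_ifs, and nsd->ifs becomes 8.
 */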
961 
962 /*
963  * Prepare the server for take off.
964  *
965  */
966 int
967 server_prepare(struct nsd *nsd)
968 {
969 #ifdef RATELIMIT
970 	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
971 #ifdef HAVE_ARC4RANDOM
972 	hash_set_raninit(arc4random());
973 #else
974 	uint32_t v = getpid() ^ time(NULL);
975 	srandom((unsigned long)v);
976 	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
977 		hash_set_raninit(v);
978 	else	hash_set_raninit(random());
979 #endif
980 	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
981 		nsd->options->rrl_ratelimit,
982 		nsd->options->rrl_whitelist_ratelimit,
983 		nsd->options->rrl_slip,
984 		nsd->options->rrl_ipv4_prefix_length,
985 		nsd->options->rrl_ipv6_prefix_length);
986 #endif /* RATELIMIT */
987 
988 	/* Open the database... */
989 	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
990 		log_msg(LOG_ERR, "unable to open the database %s: %s",
991 			nsd->dbfile, strerror(errno));
992 		unlink(nsd->task[0]->fname);
993 		unlink(nsd->task[1]->fname);
994 #ifdef USE_ZONE_STATS
995 		unlink(nsd->zonestatfname[0]);
996 		unlink(nsd->zonestatfname[1]);
997 #endif
998 		xfrd_del_tempdir(nsd);
999 		return -1;
1000 	}
1001 	/* check if zone files have been modified */
1002 	/* NULL for taskudb because we send soainfo in a moment, batched up,
1003 	 * for all zones */
1004 	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
1005 		nsd->options->database[0] == 0))
1006 		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
1007 	zonestatid_tree_set(nsd);
1008 
1009 	compression_table_capacity = 0;
1010 	initialize_dname_compression_tables(nsd);
1011 
1012 #ifdef	BIND8_STATS
1013 	/* Initialize times... */
1014 	time(&nsd->st.boot);
1015 	set_bind8_alarm(nsd);
1016 #endif /* BIND8_STATS */
1017 
1018 	return 0;
1019 }
1020 
1021 /*
1022  * Fork the required number of servers.
1023  */
1024 static int
1025 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
1026 	int* xfrd_sock_p)
1027 {
1028 	size_t i;
1029 
1030 	/* Start all child servers initially.  */
1031 	for (i = 0; i < nsd->child_count; ++i) {
1032 		nsd->children[i].pid = 0;
1033 	}
1034 
1035 	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
1036 }
1037 
1038 void
1039 server_close_all_sockets(struct nsd_socket sockets[], size_t n)
1040 {
1041 	size_t i;
1042 
1043 	/* Close all the sockets... */
1044 	for (i = 0; i < n; ++i) {
1045 		if (sockets[i].s != -1) {
1046 			close(sockets[i].s);
1047 			if(sockets[i].addr)
1048 				freeaddrinfo(sockets[i].addr);
1049 			sockets[i].s = -1;
1050 		}
1051 	}
1052 }
1053 
1054 /*
1055  * Close the sockets, shutdown the server and exit.
1056  * Does not return.
1057  *
1058  */
1059 void
1060 server_shutdown(struct nsd *nsd)
1061 {
1062 	size_t i;
1063 
1064 	server_close_all_sockets(nsd->udp, nsd->ifs);
1065 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1066 	/* CHILD: close command channel to parent */
1067 	if(nsd->this_child && nsd->this_child->parent_fd != -1)
1068 	{
1069 		close(nsd->this_child->parent_fd);
1070 		nsd->this_child->parent_fd = -1;
1071 	}
1072 	/* SERVER: close command channels to children */
1073 	if(!nsd->this_child)
1074 	{
1075 		for(i=0; i < nsd->child_count; ++i)
1076 			if(nsd->children[i].child_fd != -1)
1077 			{
1078 				close(nsd->children[i].child_fd);
1079 				nsd->children[i].child_fd = -1;
1080 			}
1081 	}
1082 
1083 	tsig_finalize();
1084 #ifdef HAVE_SSL
1085 	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
1086 #endif
1087 
1088 #ifdef MEMCLEAN /* OS collects memory pages */
1089 #ifdef RATELIMIT
1090 	rrl_mmap_deinit_keep_mmap();
1091 #endif
1092 	udb_base_free_keep_mmap(nsd->task[0]);
1093 	udb_base_free_keep_mmap(nsd->task[1]);
1094 	namedb_close_udb(nsd->db); /* keeps mmap */
1095 	namedb_close(nsd->db);
1096 	nsd_options_destroy(nsd->options);
1097 	region_destroy(nsd->region);
1098 #endif
1099 	log_finalize();
1100 	exit(0);
1101 }
1102 
1103 void
1104 server_prepare_xfrd(struct nsd* nsd)
1105 {
1106 	char tmpfile[256];
1107 	/* create task mmaps */
1108 	nsd->mytask = 0;
1109 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
1110 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1111 	nsd->task[0] = task_file_create(tmpfile);
1112 	if(!nsd->task[0]) {
1113 #ifdef USE_ZONE_STATS
1114 		unlink(nsd->zonestatfname[0]);
1115 		unlink(nsd->zonestatfname[1]);
1116 #endif
1117 		xfrd_del_tempdir(nsd);
1118 		exit(1);
1119 	}
1120 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
1121 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1122 	nsd->task[1] = task_file_create(tmpfile);
1123 	if(!nsd->task[1]) {
1124 		unlink(nsd->task[0]->fname);
1125 #ifdef USE_ZONE_STATS
1126 		unlink(nsd->zonestatfname[0]);
1127 		unlink(nsd->zonestatfname[1]);
1128 #endif
1129 		xfrd_del_tempdir(nsd);
1130 		exit(1);
1131 	}
1132 	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
1133 	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
1134 	/* create xfrd listener structure */
1135 	nsd->xfrd_listener = region_alloc(nsd->region,
1136 		sizeof(netio_handler_type));
1137 	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
1138 		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
1139 	nsd->xfrd_listener->fd = -1;
1140 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
1141 		nsd;
1142 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
1143 		xfrd_tcp_create(nsd->region, QIOBUFSZ);
1144 }
1145 
1146 
1147 void
1148 server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
1149 {
1150 	pid_t pid;
1151 	int sockets[2] = {0,0};
1152 	struct ipc_handler_conn_data *data;
1153 
1154 	if(nsd->xfrd_listener->fd != -1)
1155 		close(nsd->xfrd_listener->fd);
1156 	if(del_db) {
1157 		/* recreate the taskdb that xfrd was using; it may be corrupt */
1158 		/* we (or reload) use nsd->mytask, and xfrd uses the other */
1159 		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
1160 		nsd->task[1-nsd->mytask]->fname = NULL;
1161 		/* free alloc already, so udb does not shrink itself */
1162 		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
1163 		nsd->task[1-nsd->mytask]->alloc = NULL;
1164 		udb_base_free(nsd->task[1-nsd->mytask]);
1165 		/* create new file, overwrite the old one */
1166 		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
1167 		free(tmpfile);
1168 	}
1169 	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
1170 		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
1171 		return;
1172 	}
1173 	pid = fork();
1174 	switch (pid) {
1175 	case -1:
1176 		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
1177 		break;
1178 	default:
1179 		/* PARENT: close first socket, use second one */
1180 		close(sockets[0]);
1181 		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
1182 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1183 		}
1184 		if(del_db) xfrd_free_namedb(nsd);
1185 		/* use other task than I am using, since if xfrd died and is
1186 		 * restarted, the reload is using nsd->mytask */
1187 		nsd->mytask = 1 - nsd->mytask;
1188 		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
1189 		/* ENOTREACH */
1190 		break;
1191 	case 0:
1192 		/* CHILD: close second socket, use first one */
1193 		close(sockets[1]);
1194 		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
1195 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1196 		}
1197 		nsd->xfrd_listener->fd = sockets[0];
1198 		break;
1199 	}
1200 	/* server-parent only */
1201 	nsd->xfrd_listener->timeout = NULL;
1202 	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
1203 	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
1204 	/* clear ongoing ipc reads */
1205 	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
1206 	data->conn->is_reading = 0;
1207 }
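/*
 * Editor's illustration (not part of NSD): the socketpair-then-fork IPC
 * pattern used by server_start_xfrd and restart_child_servers, reduced
 * to its essentials.  Each process closes the end it does not use and
 * makes its own end non-blocking.  The helper name is hypothetical.
 */
#if 0
static pid_t
sketch_ipc_fork(int* my_end)
{
	int sv[2];
	pid_t pid;
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1)
		return -1;
	if ((pid = fork()) == -1) {
		close(sv[0]);
		close(sv[1]);
		return -1;
	}
	if (pid == 0) {			/* child keeps sv[0] */
		close(sv[1]);
		(void)fcntl(sv[0], F_SETFL, O_NONBLOCK);
		*my_end = sv[0];
	} else {			/* parent keeps sv[1] */
		close(sv[0]);
		(void)fcntl(sv[1], F_SETFL, O_NONBLOCK);
		*my_end = sv[1];
	}
	return pid;
}
#endif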
1208 
1209 /** add all soainfo to taskdb */
1210 static void
1211 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
1212 {
1213 	struct radnode* n;
1214 	udb_ptr task_last; /* last task, mytask is empty so NULL */
1215 	/* add all SOA INFO to mytask */
1216 	udb_ptr_init(&task_last, taskudb);
1217 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
1218 		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
1219 	}
1220 	udb_ptr_unlink(&task_last, taskudb);
1221 }
1222 
1223 void
1224 server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
1225 {
1226 	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
1227 	 *   parent fills one taskdb with soas, xfrd fills other with expires.
1228 	 *   then they exchange and process.
1229 	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
1230 	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
1231 	 *   expire notifications can be sent back via a normal reload later
1232 	 *   (xfrd will wait for current running reload to finish if any).
1233 	 */
1234 	sig_atomic_t cmd = 0;
1235 	pid_t mypid;
1236 	int xfrd_sock = nsd->xfrd_listener->fd;
1237 	struct udb_base* taskudb = nsd->task[nsd->mytask];
1238 	udb_ptr t;
1239 	if(!shortsoa) {
1240 		if(nsd->signal_hint_shutdown) {
1241 		shutdown:
1242 			log_msg(LOG_WARNING, "signal received, shutting down...");
1243 			server_close_all_sockets(nsd->udp, nsd->ifs);
1244 			server_close_all_sockets(nsd->tcp, nsd->ifs);
1245 #ifdef HAVE_SSL
1246 			daemon_remote_close(nsd->rc);
1247 #endif
1248 			/* Unlink it if possible... */
1249 			unlinkpid(nsd->pidfile);
1250 			unlink(nsd->task[0]->fname);
1251 			unlink(nsd->task[1]->fname);
1252 #ifdef USE_ZONE_STATS
1253 			unlink(nsd->zonestatfname[0]);
1254 			unlink(nsd->zonestatfname[1]);
1255 #endif
1256 			/* write the nsd.db to disk, wait for it to complete */
1257 			udb_base_sync(nsd->db->udb, 1);
1258 			udb_base_close(nsd->db->udb);
1259 			server_shutdown(nsd);
1260 			exit(0);
1261 		}
1262 	}
1263 	if(shortsoa) {
1264 		/* put SOA in xfrd task because mytask may be in use */
1265 		taskudb = nsd->task[1-nsd->mytask];
1266 	}
1267 
1268 	add_all_soa_to_task(nsd, taskudb);
1269 	if(!shortsoa) {
1270 		/* wait for xfrd to signal task is ready, RELOAD signal */
1271 		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
1272 			cmd != NSD_RELOAD) {
1273 			log_msg(LOG_ERR, "did not get start signal from xfrd");
1274 			exit(1);
1275 		}
1276 		if(nsd->signal_hint_shutdown) {
1277 			goto shutdown;
1278 		}
1279 	}
1280 	/* give xfrd our task, signal it with RELOAD_DONE */
1281 	task_process_sync(taskudb);
1282 	cmd = NSD_RELOAD_DONE;
1283 	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1284 		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1285 			(int)nsd->pid, strerror(errno));
1286 	}
1287 	mypid = getpid();
1288 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1289 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1290 			strerror(errno));
1291 	}
1292 
1293 	if(!shortsoa) {
1294 		/* process the xfrd task works (expiry data) */
1295 		nsd->mytask = 1 - nsd->mytask;
1296 		taskudb = nsd->task[nsd->mytask];
1297 		task_remap(taskudb);
1298 		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
1299 		while(!udb_ptr_is_null(&t)) {
1300 			task_process_expire(nsd->db, TASKLIST(&t));
1301 			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
1302 		}
1303 		udb_ptr_unlink(&t, taskudb);
1304 		task_clear(taskudb);
1305 
1306 		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
1307 		cmd = NSD_RELOAD_DONE;
1308 		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1309 			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1310 				(int)nsd->pid, strerror(errno));
1311 		}
1312 	}
1313 }
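/*
 * Editor's sketch of the exchange above for the normal (!shortsoa) case:
 *
 *   server main                             xfrd
 *     fill mytask with SOA info               |
 *     |<---------- NSD_RELOAD ----------------|  task ready
 *     task_process_sync(taskudb)              |
 *     |---- NSD_RELOAD_DONE + reload pid ---->|  picks up taskdb
 *     swap mytask, process expire tasks       |
 *     |---------- NSD_RELOAD_DONE ----------->|  task emptied
 *
 * In the shortsoa case the SOA info goes into xfrd's task file directly
 * and only the RELOAD_DONE/pid messages are sent.
 */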
1314 
1315 /* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
1316 ssize_t
1317 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
1318 {
1319 	uint8_t* buf = (uint8_t*) p;
1320 	ssize_t total = 0;
1321 	struct pollfd fd;
1322 	memset(&fd, 0, sizeof(fd));
1323 	fd.fd = s;
1324 	fd.events = POLLIN;
1325 
1326 	while(total < sz) {
1327 		ssize_t ret;
1328 		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
1329 		if(ret == -1) {
1330 			if(errno == EAGAIN)
1331 				/* blocking read */
1332 				continue;
1333 			if(errno == EINTR) {
1334 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
1335 					return -1;
1336 				/* other signals can be handled later */
1337 				continue;
1338 			}
1339 			/* some error */
1340 			return -1;
1341 		}
1342 		if(ret == 0) {
1343 			/* operation timed out */
1344 			return -2;
1345 		}
1346 		ret = read(s, buf+total, sz-total);
1347 		if(ret == -1) {
1348 			if(errno == EAGAIN)
1349 				/* blocking read */
1350 				continue;
1351 			if(errno == EINTR) {
1352 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
1353 					return -1;
1354 				/* other signals can be handled later */
1355 				continue;
1356 			}
1357 			/* some error */
1358 			return -1;
1359 		}
1360 		if(ret == 0) {
1361 			/* closed connection! */
1362 			return 0;
1363 		}
1364 		total += ret;
1365 	}
1366 	return total;
1367 }
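/*
 * Usage note (editor's): callers below pass timeout 0 to poll the
 * descriptor without blocking (reload_process_tasks), -1 to block
 * indefinitely (server_send_soa_xfrd), or a positive value such as
 * RELOAD_SYNC_TIMEOUT to wait that many seconds.  The distinct return
 * values let them tell a timeout (-2) apart from an error (-1) and a
 * closed connection (0).
 */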
1368 
1369 static void
1370 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
1371 {
1372 	sig_atomic_t cmd = NSD_QUIT_SYNC;
1373 	udb_ptr t, next;
1374 	udb_base* u = nsd->task[nsd->mytask];
1375 	udb_ptr_init(&next, u);
1376 	udb_ptr_new(&t, u, udb_base_get_userdata(u));
1377 	udb_base_set_userdata(u, 0);
1378 	while(!udb_ptr_is_null(&t)) {
1379 		/* store next in list so this one can be deleted or reused */
1380 		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
1381 		udb_rptr_zero(&TASKLIST(&t)->next, u);
1382 
1383 		/* process task t */
1384 		/* append results for task t and update last_task */
1385 		task_process_in_reload(nsd, u, last_task, &t);
1386 
1387 		/* go to next */
1388 		udb_ptr_set_ptr(&t, u, &next);
1389 
1390 		/* if the parent has quit, we must quit too, poll the fd for cmds */
1391 		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
1392 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
1393 			if(cmd == NSD_QUIT) {
1394 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
1395 				/* sync to disk (if needed) */
1396 				udb_base_sync(nsd->db->udb, 0);
1397 				/* unlink files of remainder of tasks */
1398 				while(!udb_ptr_is_null(&t)) {
1399 					if(TASKLIST(&t)->task_type == task_apply_xfr) {
1400 						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
1401 					}
1402 					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
1403 				}
1404 				udb_ptr_unlink(&t, u);
1405 				udb_ptr_unlink(&next, u);
1406 				exit(0);
1407 			}
1408 		}
1409 
1410 	}
1411 	udb_ptr_unlink(&t, u);
1412 	udb_ptr_unlink(&next, u);
1413 }
1414 
1415 #ifdef BIND8_STATS
1416 static void
1417 parent_send_stats(struct nsd* nsd, int cmdfd)
1418 {
1419 	size_t i;
1420 	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
1421 		log_msg(LOG_ERR, "could not write stats to reload");
1422 		return;
1423 	}
1424 	for(i=0; i<nsd->child_count; i++)
1425 		if(!write_socket(cmdfd, &nsd->children[i].query_count,
1426 			sizeof(stc_type))) {
1427 			log_msg(LOG_ERR, "could not write stats to reload");
1428 			return;
1429 		}
1430 }
1431 
1432 static void
1433 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
1434 {
1435 	struct nsdst s;
1436 	stc_type* p;
1437 	size_t i;
1438 	if(block_read(nsd, cmdfd, &s, sizeof(s),
1439 		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
1440 		log_msg(LOG_ERR, "could not read stats from oldpar");
1441 		return;
1442 	}
1443 	s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
1444 	s.db_mem = region_get_mem(nsd->db->region);
1445 	p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
1446 		nsd->child_count);
1447 	if(!p) return;
1448 	for(i=0; i<nsd->child_count; i++) {
1449 		if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!=
1450 			sizeof(stc_type))
1451 			return;
1452 	}
1453 }
1454 #endif /* BIND8_STATS */
1455 
1456 /*
1457  * Reload the database, stop parent, re-fork children and continue.
1458  * as server_main.
1459  */
1460 static void
1461 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
1462 	int cmdsocket)
1463 {
1464 	pid_t mypid;
1465 	sig_atomic_t cmd = NSD_QUIT_SYNC;
1466 	int ret;
1467 	udb_ptr last_task;
1468 	struct sigaction old_sigchld, ign_sigchld;
1469 	/* ignore SIGCHLD from the previous server_main that used this pid */
1470 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
1471 	ign_sigchld.sa_handler = SIG_IGN;
1472 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
1473 
1474 	/* see what tasks we got from xfrd */
1475 	task_remap(nsd->task[nsd->mytask]);
1476 	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
1477 	udb_compact_inhibited(nsd->db->udb, 1);
1478 	reload_process_tasks(nsd, &last_task, cmdsocket);
1479 	udb_compact_inhibited(nsd->db->udb, 0);
1480 	udb_compact(nsd->db->udb);
1481 
1482 #ifndef NDEBUG
1483 	if(nsd_debug_level >= 1)
1484 		region_log_stats(nsd->db->region);
1485 #endif /* NDEBUG */
1486 	/* sync to disk (if needed) */
1487 	udb_base_sync(nsd->db->udb, 0);
1488 
1489 	initialize_dname_compression_tables(nsd);
1490 
1491 #ifdef BIND8_STATS
1492 	/* Restart dumping stats if required.  */
1493 	time(&nsd->st.boot);
1494 	set_bind8_alarm(nsd);
1495 #endif
1496 #ifdef USE_ZONE_STATS
1497 	server_zonestat_realloc(nsd); /* realloc for new children */
1498 	server_zonestat_switch(nsd);
1499 #endif
1500 
1501 	/* listen for the signals of failed children again */
1502 	sigaction(SIGCHLD, &old_sigchld, NULL);
1503 	/* Start new child processes */
1504 	if (server_start_children(nsd, server_region, netio, &nsd->
1505 		xfrd_listener->fd) != 0) {
1506 		send_children_quit(nsd);
1507 		exit(1);
1508 	}
1509 
1510 	/* if the parent has quit, we must quit too, poll the fd for cmds */
1511 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
1512 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
1513 		if(cmd == NSD_QUIT) {
1514 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
1515 			send_children_quit(nsd);
1516 			exit(0);
1517 		}
1518 	}
1519 
1520 	/* Send quit command to parent: blocking, wait for receipt. */
1521 	do {
1522 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
1523 		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
1524 		{
1525 			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
1526 				strerror(errno));
1527 		}
1528 		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
1529 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
1530 		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
1531 			RELOAD_SYNC_TIMEOUT);
1532 		if(ret == -2) {
1533 			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
1534 		}
1535 	} while (ret == -2);
1536 	if(ret == -1) {
1537 		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
1538 			strerror(errno));
1539 	}
1540 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
1541 	if(cmd == NSD_QUIT) {
1542 		/* small race condition possible here, parent got quit cmd. */
1543 		send_children_quit(nsd);
1544 		exit(1);
1545 	}
1546 	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
1547 #ifdef BIND8_STATS
1548 	reload_do_stats(cmdsocket, nsd, &last_task);
1549 #endif
1550 	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
1551 	task_process_sync(nsd->task[nsd->mytask]);
1552 #ifdef USE_ZONE_STATS
1553 	server_zonestat_realloc(nsd); /* realloc for next children */
1554 #endif
1555 
1556 	/* send soainfo to the xfrd process, signal it that reload is done,
1557 	 * it picks up the taskudb */
1558 	cmd = NSD_RELOAD_DONE;
1559 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
1560 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
1561 			strerror(errno));
1562 	}
1563 	mypid = getpid();
1564 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1565 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1566 			strerror(errno));
1567 	}
1568 
1569 	/* try to reopen file */
1570 	if (nsd->file_rotation_ok)
1571 		log_reopen(nsd->log_filename, 1);
1572 	/* exit reload, continue as new server_main */
1573 }
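/*
 * Editor's sketch of the handover above: the reload process repeatedly
 * writes NSD_QUIT_SYNC to the old main over cmdsocket and block_read()s
 * for the acknowledgement, retrying while the read times out (-2).  An
 * NSD_RELOAD reply means the old main really quit and the reload can
 * continue as the new server main; an NSD_QUIT reply means the whole
 * daemon is shutting down instead.
 */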
1574 
1575 /*
1576  * Get the mode depending on the signal hints that have been received.
1577  * Multiple signal hints can be received and will be handled in turn.
1578  */
1579 static sig_atomic_t
1580 server_signal_mode(struct nsd *nsd)
1581 {
1582 	if(nsd->signal_hint_quit) {
1583 		nsd->signal_hint_quit = 0;
1584 		return NSD_QUIT;
1585 	}
1586 	else if(nsd->signal_hint_shutdown) {
1587 		nsd->signal_hint_shutdown = 0;
1588 		return NSD_SHUTDOWN;
1589 	}
1590 	else if(nsd->signal_hint_child) {
1591 		nsd->signal_hint_child = 0;
1592 		return NSD_REAP_CHILDREN;
1593 	}
1594 	else if(nsd->signal_hint_reload) {
1595 		nsd->signal_hint_reload = 0;
1596 		return NSD_RELOAD;
1597 	}
1598 	else if(nsd->signal_hint_reload_hup) {
1599 		nsd->signal_hint_reload_hup = 0;
1600 		return NSD_RELOAD_REQ;
1601 	}
1602 	else if(nsd->signal_hint_stats) {
1603 		nsd->signal_hint_stats = 0;
1604 #ifdef BIND8_STATS
1605 		set_bind8_alarm(nsd);
1606 #endif
1607 		return NSD_STATS;
1608 	}
1609 	else if(nsd->signal_hint_statsusr) {
1610 		nsd->signal_hint_statsusr = 0;
1611 		return NSD_STATS;
1612 	}
1613 	return NSD_RUN;
1614 }
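/*
 * Example (editor's): if both signal_hint_shutdown and
 * signal_hint_reload are set, the first call returns NSD_SHUTDOWN and
 * clears only that hint; NSD_RELOAD is returned on a later call, so
 * multiple pending signals are handled one per call, in the fixed
 * priority order above.
 */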
1615 
1616 /*
1617  * The main server simply waits for signals and child processes to
1618  * terminate.  Child processes are restarted as necessary.
1619  */
1620 void
1621 server_main(struct nsd *nsd)
1622 {
1623 	region_type *server_region = region_create(xalloc, free);
1624 	netio_type *netio = netio_create(server_region);
1625 	netio_handler_type reload_listener;
1626 	int reload_sockets[2] = {-1, -1};
1627 	struct timespec timeout_spec;
1628 	int status;
1629 	pid_t child_pid;
1630 	pid_t reload_pid = -1;
1631 	sig_atomic_t mode;
1632 
1633 	/* Ensure we are the main process */
1634 	assert(nsd->server_kind == NSD_SERVER_MAIN);
1635 
1636 	/* Add listener for the XFRD process */
1637 	netio_add_handler(netio, nsd->xfrd_listener);
1638 
1639 	/* Start the child processes that handle incoming queries */
1640 	if (server_start_children(nsd, server_region, netio,
1641 		&nsd->xfrd_listener->fd) != 0) {
1642 		send_children_quit(nsd);
1643 		exit(1);
1644 	}
1645 	reload_listener.fd = -1;
1646 
1647 	/* This_child MUST be 0, because this is the parent process */
1648 	assert(nsd->this_child == 0);
1649 
1650 	/* Run the server until we get a shutdown signal */
1651 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
1652 		/* Did we receive a signal that changes our mode? */
1653 		if(mode == NSD_RUN) {
1654 			nsd->mode = mode = server_signal_mode(nsd);
1655 		}
1656 
1657 		switch (mode) {
1658 		case NSD_RUN:
1659 			/* see if any child processes terminated */
1660 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
1661 				int is_child = delete_child_pid(nsd, child_pid);
1662 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
1663 					if(nsd->children[is_child].child_fd == -1)
1664 						nsd->children[is_child].has_exited = 1;
1665 					parent_check_all_children_exited(nsd);
1666 				} else if(is_child != -1) {
1667 					log_msg(LOG_WARNING,
1668 					       "server %d died unexpectedly with status %d, restarting",
1669 					       (int) child_pid, status);
1670 					restart_child_servers(nsd, server_region, netio,
1671 						&nsd->xfrd_listener->fd);
1672 				} else if (child_pid == reload_pid) {
1673 					sig_atomic_t cmd = NSD_RELOAD_DONE;
1674 					pid_t mypid;
1675 					log_msg(LOG_WARNING,
1676 					       "Reload process %d failed with status %d, continuing with old database",
1677 					       (int) child_pid, status);
1678 					reload_pid = -1;
1679 					if(reload_listener.fd != -1) close(reload_listener.fd);
1680 					reload_listener.fd = -1;
1681 					reload_listener.event_types = NETIO_EVENT_NONE;
1682 					task_process_sync(nsd->task[nsd->mytask]);
1683 					/* inform xfrd reload attempt ended */
1684 					if(!write_socket(nsd->xfrd_listener->fd,
1685 						&cmd, sizeof(cmd))) {
1686 						log_msg(LOG_ERR, "problems "
1687 						  "sending SOAEND to xfrd: %s",
1688 						  strerror(errno));
1689 					}
1690 					mypid = getpid();
1691 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1692 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1693 							strerror(errno));
1694 					}
1695 				} else if(status != 0) {
1696 					/* check the status, because we can
1697 					 * also reap the old server main here
1698 					 * (reload is the process-parent of
1699 					 * old-main) and older server processes
1700 					 * that exit after a reload */
1701 					log_msg(LOG_WARNING,
1702 					       "process %d terminated with status %d",
1703 					       (int) child_pid, status);
1704 				}
1705 			}
1706 			if (child_pid == -1) {
1707 				if (errno == EINTR) {
1708 					continue;
1709 				}
1710 				if (errno != ECHILD)
1711 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
1712 			}
1713 			if (nsd->mode != NSD_RUN)
1714 				break;
1715 
1716 			/* timeout to collect child processes, in case no SIGCHLD arrives */
1717 			timeout_spec.tv_sec = 60;
1718 			timeout_spec.tv_nsec = 0;
1719 
1720 			/* listen on ports, timeout for collecting terminated children */
1721 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
1722 				if (errno != EINTR) {
1723 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
1724 				}
1725 			}
1726 			if(nsd->restart_children) {
1727 				restart_child_servers(nsd, server_region, netio,
1728 					&nsd->xfrd_listener->fd);
1729 				nsd->restart_children = 0;
1730 			}
1731 			if(nsd->reload_failed) {
1732 				sig_atomic_t cmd = NSD_RELOAD_DONE;
1733 				pid_t mypid;
1734 				nsd->reload_failed = 0;
1735 				log_msg(LOG_WARNING,
1736 				       "Reload process %d failed, continuing with old database",
1737 				       (int) reload_pid);
1738 				reload_pid = -1;
1739 				if(reload_listener.fd != -1) close(reload_listener.fd);
1740 				reload_listener.fd = -1;
1741 				reload_listener.event_types = NETIO_EVENT_NONE;
1742 				task_process_sync(nsd->task[nsd->mytask]);
1743 				/* inform xfrd reload attempt ended */
1744 				if(!write_socket(nsd->xfrd_listener->fd,
1745 					&cmd, sizeof(cmd))) {
1746 					log_msg(LOG_ERR, "problems "
1747 					  "sending SOAEND to xfrd: %s",
1748 					  strerror(errno));
1749 				}
1750 				mypid = getpid();
1751 				if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
1752 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1753 						strerror(errno));
1754 				}
1755 			}
1756 
1757 			break;
1758 		case NSD_RELOAD_REQ: {
1759 			sig_atomic_t cmd = NSD_RELOAD_REQ;
1760 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
1761 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
1762 				"main: ipc send reload_req to xfrd"));
1763 			if(!write_socket(nsd->xfrd_listener->fd,
1764 				&cmd, sizeof(cmd))) {
1765 				log_msg(LOG_ERR, "server_main: could not send "
1766 				"reload_req to xfrd: %s", strerror(errno));
1767 			}
1768 			nsd->mode = NSD_RUN;
1769 			} break;
1770 		case NSD_RELOAD:
1771 			/* Continue to run nsd after reload */
1772 			nsd->mode = NSD_RUN;
1773 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
1774 			if (reload_pid != -1) {
1775 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
1776 				       (int) reload_pid);
1777 				break;
1778 			}
1779 
1780 			/* switch mytask to keep track of which task file this process owns */
1781 			nsd->mytask = 1 - nsd->mytask;
1782 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
1783 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
1784 				reload_pid = -1;
1785 				break;
1786 			}
1787 
1788 			/* Do actual reload */
1789 			reload_pid = fork();
1790 			switch (reload_pid) {
1791 			case -1:
1792 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
1793 				break;
1794 			default:
1795 				/* PARENT */
1796 				close(reload_sockets[0]);
1797 				server_reload(nsd, server_region, netio,
1798 					reload_sockets[1]);
1799 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
1800 				close(reload_sockets[1]);
1801 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
1802 				/* drop stale xfrd ipc data */
1803 				((struct ipc_handler_conn_data*)nsd->
1804 					xfrd_listener->user_data)
1805 					->conn->is_reading = 0;
1806 				reload_pid = -1;
1807 				reload_listener.fd = -1;
1808 				reload_listener.event_types = NETIO_EVENT_NONE;
1809 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
1810 				break;
1811 			case 0:
1812 				/* CHILD */
1813 				/* server_main keeps running until NSD_QUIT_SYNC
1814 				 * is received from the reload process. */
1815 				close(reload_sockets[1]);
1816 				reload_listener.fd = reload_sockets[0];
1817 				reload_listener.timeout = NULL;
1818 				reload_listener.user_data = nsd;
1819 				reload_listener.event_types = NETIO_EVENT_READ;
1820 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
1821 				netio_add_handler(netio, &reload_listener);
1822 				reload_pid = getppid();
1823 				break;
1824 			}
1825 			break;
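     			/*
     			 * Sketch of the reload setup above: the fork parent runs
     			 * server_reload() and later takes over as the new main
     			 * process, while the fork child stays behind as the old
     			 * main, serving queries until the reload process sends
     			 * NSD_QUIT_SYNC over the socketpair.
     			 */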
1826 		case NSD_QUIT_SYNC:
1827 			/* synchronisation of xfrd, parent and reload */
1828 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
1829 				sig_atomic_t cmd = NSD_RELOAD;
1830 				/* stop xfrd ipc writes in progress */
1831 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
1832 					"main: ipc send indication reload"));
1833 				if(!write_socket(nsd->xfrd_listener->fd,
1834 					&cmd, sizeof(cmd))) {
1835 					log_msg(LOG_ERR, "server_main: could not send reload "
1836 					"indication to xfrd: %s", strerror(errno));
1837 				}
1838 				/* wait for ACK from xfrd */
1839 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
1840 				nsd->quit_sync_done = 1;
1841 			}
1842 			nsd->mode = NSD_RUN;
1843 			break;
1844 		case NSD_QUIT:
1845 			/* silent shutdown during reload */
1846 			if(reload_listener.fd != -1) {
1847 				/* acknowledge the quit, to sync reload that we will really quit now */
1848 				sig_atomic_t cmd = NSD_RELOAD;
1849 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
1850 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
1851 					log_msg(LOG_ERR, "server_main: "
1852 						"could not ack quit: %s", strerror(errno));
1853 				}
1854 #ifdef BIND8_STATS
1855 				parent_send_stats(nsd, reload_listener.fd);
1856 #endif /* BIND8_STATS */
1857 				close(reload_listener.fd);
1858 			}
1859 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
1860 			/* only quit children after xfrd has acked */
1861 			send_children_quit(nsd);
1862 
1863 #ifdef MEMCLEAN /* OS collects memory pages */
1864 			region_destroy(server_region);
1865 #endif
1866 			server_shutdown(nsd);
1867 
1868 			/* NOTREACHED */
1869 			break;
1870 		case NSD_SHUTDOWN:
1871 			break;
1872 		case NSD_REAP_CHILDREN:
1873 			/* continue; wait for child in run loop */
1874 			nsd->mode = NSD_RUN;
1875 			break;
1876 		case NSD_STATS:
1877 #ifdef BIND8_STATS
1878 			set_children_stats(nsd);
1879 #endif
1880 			nsd->mode = NSD_RUN;
1881 			break;
1882 		default:
1883 			log_msg(LOG_WARNING, "invalid NSD main server mode: %d", (int)nsd->mode);
1884 			nsd->mode = NSD_RUN;
1885 			break;
1886 		}
1887 	}
1888 	log_msg(LOG_WARNING, "signal received, shutting down...");
1889 
1890 	/* close opened ports to avoid race with restart of nsd */
1891 	server_close_all_sockets(nsd->udp, nsd->ifs);
1892 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1893 #ifdef HAVE_SSL
1894 	daemon_remote_close(nsd->rc);
1895 #endif
1896 	send_children_quit_and_wait(nsd);
1897 
1898 	/* Unlink the pidfile and task files, if possible... */
1899 	unlinkpid(nsd->pidfile);
1900 	unlink(nsd->task[0]->fname);
1901 	unlink(nsd->task[1]->fname);
1902 #ifdef USE_ZONE_STATS
1903 	unlink(nsd->zonestatfname[0]);
1904 	unlink(nsd->zonestatfname[1]);
1905 #endif
1906 
1907 	if(reload_listener.fd != -1) {
1908 		sig_atomic_t cmd = NSD_QUIT;
1909 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
1910 			"main: ipc send quit to reload-process"));
1911 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
1912 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
1913 				strerror(errno));
1914 		}
1915 		fsync(reload_listener.fd);
1916 		close(reload_listener.fd);
1917 		/* wait for reload to finish processing */
1918 		while(1) {
1919 			if(waitpid(reload_pid, NULL, 0) == -1) {
1920 				if(errno == EINTR) continue;
1921 				if(errno == ECHILD) break;
1922 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
1923 					(int)reload_pid, strerror(errno));
1924 			}
1925 			break;
1926 		}
1927 	}
1928 	if(nsd->xfrd_listener->fd != -1) {
1929 		/* complete quit, stop xfrd */
1930 		sig_atomic_t cmd = NSD_QUIT;
1931 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
1932 			"main: ipc send quit to xfrd"));
1933 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
1934 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
1935 				strerror(errno));
1936 		}
1937 		fsync(nsd->xfrd_listener->fd);
1938 		close(nsd->xfrd_listener->fd);
1939 		(void)kill(nsd->pid, SIGTERM);
1940 	}
1941 
1942 #ifdef MEMCLEAN /* OS collects memory pages */
1943 	region_destroy(server_region);
1944 #endif
1945 	/* write the nsd.db to disk, wait for it to complete */
1946 	udb_base_sync(nsd->db->udb, 1);
1947 	udb_base_close(nsd->db->udb);
1948 	server_shutdown(nsd);
1949 }
1950 
1951 static query_state_type
1952 server_process_query(struct nsd *nsd, struct query *query)
1953 {
1954 	return query_process(query, nsd);
1955 }
1956 
1957 static query_state_type
1958 server_process_query_udp(struct nsd *nsd, struct query *query)
1959 {
1960 #ifdef RATELIMIT
1961 	if(query_process(query, nsd) != QUERY_DISCARDED) {
1962 		if(rrl_process_query(query))
1963 			return rrl_slip(query);
1964 		else	return QUERY_PROCESSED;
1965 	}
1966 	return QUERY_DISCARDED;
1967 #else
1968 	return query_process(query, nsd);
1969 #endif
1970 }
1971 
1972 struct event_base*
1973 nsd_child_event_base(void)
1974 {
1975 	struct event_base* base;
1976 #ifdef USE_MINI_EVENT
1977 	static time_t secs;
1978 	static struct timeval now;
1979 	base = event_init(&secs, &now);
1980 #else
1981 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
1982 	/* libev */
1983 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
1984 #  else
1985 	/* libevent */
1986 #    ifdef HAVE_EVENT_BASE_NEW
1987 	base = event_base_new();
1988 #    else
1989 	base = event_init();
1990 #    endif
1991 #  endif
1992 #endif
1993 	return base;
1994 }
1995 
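     /*
      * Whichever backend is compiled in (libevent, libev or mini_event),
      * the rest of this file drives it through the libevent-style calls.
      * Minimal usage sketch (illustrative; fd, callback and arg are
      * assumed):
      *
      *	struct event ev;
      *	struct event_base* base = nsd_child_event_base();
      *	event_set(&ev, fd, EV_PERSIST|EV_READ, callback, arg);
      *	if(event_base_set(base, &ev) != 0 || event_add(&ev, NULL) != 0)
      *		log_msg(LOG_ERR, "cannot add event");
      *	event_base_loop(base, EVLOOP_ONCE);
      */
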
1996 /*
1997  * Serve DNS requests.
1998  */
1999 void
2000 server_child(struct nsd *nsd)
2001 {
2002 	size_t i, from, numifs;
2003 	region_type *server_region = region_create(xalloc, free);
2004 	struct event_base* event_base = nsd_child_event_base();
2005 	query_type *udp_query;
2006 	sig_atomic_t mode;
2007 
2008 	if(!event_base) {
2009 		log_msg(LOG_ERR, "nsd server could not create event base");
2010 		exit(1);
2011 	}
2012 	nsd->event_base = event_base;
2013 	nsd->server_region = server_region;
2014 
2015 #ifdef RATELIMIT
2016 	rrl_init(nsd->this_child->child_num);
2017 #endif
2018 
2019 	assert(nsd->server_kind != NSD_SERVER_MAIN);
2020 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
2021 
2022 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
2023 		server_close_all_sockets(nsd->tcp, nsd->ifs);
2024 	}
2025 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
2026 		server_close_all_sockets(nsd->udp, nsd->ifs);
2027 	}
2028 
2029 	if (nsd->this_child->parent_fd != -1) {
2030 		struct event *handler;
2031 		struct ipc_handler_conn_data* user_data =
2032 			(struct ipc_handler_conn_data*)region_alloc(
2033 			server_region, sizeof(struct ipc_handler_conn_data));
2034 		user_data->nsd = nsd;
2035 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
2036 
2037 		handler = (struct event*) region_alloc(
2038 			server_region, sizeof(*handler));
2039 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
2040 			EV_READ, child_handle_parent_command, user_data);
2041 		if(event_base_set(event_base, handler) != 0)
2042 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
2043 		if(event_add(handler, NULL) != 0)
2044 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
2045 	}
2046 
2047 	if(nsd->reuseport) {
2048 		numifs = nsd->ifs / nsd->reuseport;
2049 		from = numifs * nsd->this_child->child_num;
2050 		if(from+numifs > nsd->ifs) { /* should not happen */
2051 			from = 0;
2052 			numifs = nsd->ifs;
2053 		}
2054 	} else {
2055 		from = 0;
2056 		numifs = nsd->ifs;
2057 	}
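     	/* Worked example: with nsd->ifs == 4 and reuseport == 2, numifs
     	 * is 2; child 0 serves sockets [0,2) and child 1 sockets [2,4). */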
2058 
2059 	if (nsd->server_kind & NSD_SERVER_UDP) {
2060 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2061 		udp_query = query_create(server_region,
2062 			compressed_dname_offsets, compression_table_size,
2063 			compressed_dnames);
2064 #else
2065 		udp_query = NULL;
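     		/* Pre-arm one query buffer and iovec per batch slot, so a
     		 * single recvmmsg() call can scatter a whole batch of
     		 * datagrams into ready-to-use query structures. */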
2066 		memset(msgs, 0, sizeof(msgs));
2067 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
2068 			queries[i] = query_create(server_region,
2069 				compressed_dname_offsets,
2070 				compression_table_size, compressed_dnames);
2071 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2072 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
2073 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
2074 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
2075 			msgs[i].msg_hdr.msg_iovlen  = 1;
2076 			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
2077 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2078 		}
2079 #endif
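     		/* Install a persistent read event for every UDP socket
     		 * assigned to this child. */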
2080 		for (i = from; i < from+numifs; ++i) {
2081 			struct udp_handler_data *data;
2082 			struct event *handler;
2083 
2084 			data = (struct udp_handler_data *) region_alloc(
2085 				server_region,
2086 				sizeof(struct udp_handler_data));
2087 			data->query = udp_query;
2088 			data->nsd = nsd;
2089 			data->socket = &nsd->udp[i];
2090 
2091 			handler = (struct event*) region_alloc(
2092 				server_region, sizeof(*handler));
2093 			event_set(handler, nsd->udp[i].s, EV_PERSIST|EV_READ,
2094 				handle_udp, data);
2095 			if(event_base_set(event_base, handler) != 0)
2096 				log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2097 			if(event_add(handler, NULL) != 0)
2098 				log_msg(LOG_ERR, "nsd udp: event_add failed");
2099 		}
2100 	}
2101 
2102 	/*
2103 	 * Keep track of all the TCP accept handlers so we can enable
2104 	 * and disable them based on the current number of active TCP
2105 	 * connections.
2106 	 */
2107 	tcp_accept_handler_count = numifs;
2108 	tcp_accept_handlers = (struct tcp_accept_handler_data*)
2109 		region_alloc_array(server_region,
2110 		numifs, sizeof(*tcp_accept_handlers));
2111 	if (nsd->server_kind & NSD_SERVER_TCP) {
2112 		for (i = from; i < from+numifs; ++i) {
2113 			struct event *handler = &tcp_accept_handlers[i-from].event;
2114 			struct tcp_accept_handler_data* data =
2115 				&tcp_accept_handlers[i-from];
2116 			data->nsd = nsd;
2117 			data->socket = &nsd->tcp[i];
2118 			event_set(handler, nsd->tcp[i].s, EV_PERSIST|EV_READ,
2119 				handle_tcp_accept, data);
2120 			if(event_base_set(event_base, handler) != 0)
2121 				log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
2122 			if(event_add(handler, NULL) != 0)
2123 				log_msg(LOG_ERR, "nsd tcp: event_add failed");
2124 			data->event_added = 1;
2125 		}
2126 	} else tcp_accept_handler_count = 0;
2127 
2128 	/* The main loop... */
2129 	while ((mode = nsd->mode) != NSD_QUIT) {
2130 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
2131 
2132 		/* Do we need to do the statistics... */
2133 		if (mode == NSD_STATS) {
2134 #ifdef BIND8_STATS
2135 			int p = nsd->st.period;
2136 			nsd->st.period = 1; /* force stats printout */
2137 			/* Dump the statistics */
2138 			bind8_stats(nsd);
2139 			nsd->st.period = p;
2140 #else /* !BIND8_STATS */
2141 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
2142 #endif /* BIND8_STATS */
2143 
2144 			nsd->mode = NSD_RUN;
2145 		}
2146 		else if (mode == NSD_REAP_CHILDREN) {
2147 			/* got signal, notify parent. parent reaps terminated children. */
2148 			if (nsd->this_child->parent_fd != -1) {
2149 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
2150 				if (write(nsd->this_child->parent_fd,
2151 				    &parent_notify,
2152 				    sizeof(parent_notify)) == -1)
2153 				{
2154 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
2155 						(int) nsd->this_child->pid, strerror(errno));
2156 				}
2157 			} else /* no parent, so reap 'em */
2158 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
2159 			nsd->mode = NSD_RUN;
2160 		}
2161 		else if(mode == NSD_RUN) {
2162 			/* Wait for a query... */
2163 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
2164 				if (errno != EINTR) {
2165 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
2166 					break;
2167 				}
2168 			}
2169 		} else if(mode == NSD_QUIT) {
2170 			/* ignore here, quit */
2171 		} else {
2172 			log_msg(LOG_ERR, "bad mode value %d, back to service.",
2173 				(int)mode);
2174 			nsd->mode = NSD_RUN;
2175 		}
2176 	}
2177 
2178 #ifdef	BIND8_STATS
2179 	bind8_stats(nsd);
2180 #endif /* BIND8_STATS */
2181 
2182 #ifdef MEMCLEAN /* OS collects memory pages */
2183 #ifdef RATELIMIT
2184 	rrl_deinit(nsd->this_child->child_num);
2185 #endif
2186 	event_base_free(event_base);
2187 	region_destroy(server_region);
2188 #endif
2189 	server_shutdown(nsd);
2190 }
2191 
2192 #if defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG)
2193 static void
2194 handle_udp(int fd, short event, void* arg)
2195 {
2196 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
2197 	int received, sent, recvcount, i;
2198 	struct query *q;
2199 
2200 	if (!(event & EV_READ)) {
2201 		return;
2202 	}
2203 	recvcount = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
2204 	/* this printf strangely gave a performance increase on Linux */
2205 	/* printf("recvcount %d \n", recvcount); */
2206 	if (recvcount == -1) {
2207 		if (errno != EAGAIN && errno != EINTR) {
2208 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
2209 			STATUP(data->nsd, rxerr);
2210 			/* No zone statup */
2211 		}
2212 		/* Simply no data available */
2213 		return;
2214 	}
2215 	for (i = 0; i < recvcount; i++) {
2216 	loopstart:
2217 		received = msgs[i].msg_len;
2218 		q = queries[i];
2219 		if (received == -1) {
2220 			log_msg(LOG_ERR, "recvmmsg %d failed: msg_flags 0x%x",
2221 				i, (unsigned)msgs[i].msg_hdr.msg_flags);
2222 			STATUP(data->nsd, rxerr);
2223 			/* No zone statup */
2224 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2225 			iovecs[i].iov_len = buffer_remaining(q->packet);
2226 			goto swap_drop;
2227 		}
2228 
2229 		/* Account... */
2230 #ifdef BIND8_STATS
2231 		if (data->socket->fam == AF_INET) {
2232 			STATUP(data->nsd, qudp);
2233 		} else if (data->socket->fam == AF_INET6) {
2234 			STATUP(data->nsd, qudp6);
2235 		}
2236 #endif
2237 
2238 		buffer_skip(q->packet, received);
2239 		buffer_flip(q->packet);
2240 
2241 		/* Process and answer the query... */
2242 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
2243 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
2244 				STATUP(data->nsd, nona);
2245 				ZTATUP(data->nsd, q->zone, nona);
2246 			}
2247 
2248 #ifdef USE_ZONE_STATS
2249 			if (data->socket->fam == AF_INET) {
2250 				ZTATUP(data->nsd, q->zone, qudp);
2251 			} else if (data->socket->fam == AF_INET6) {
2252 				ZTATUP(data->nsd, q->zone, qudp6);
2253 			}
2254 #endif
2255 
2256 			/* Add EDNS0 and TSIG info if necessary.  */
2257 			query_add_optional(q, data->nsd);
2258 
2259 			buffer_flip(q->packet);
2260 			iovecs[i].iov_len = buffer_remaining(q->packet);
2261 #ifdef BIND8_STATS
2262 			/* Account the rcode & TC... */
2263 			STATUP2(data->nsd, rcode, RCODE(q->packet));
2264 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
2265 			if (TC(q->packet)) {
2266 				STATUP(data->nsd, truncated);
2267 				ZTATUP(data->nsd, q->zone, truncated);
2268 			}
2269 #endif /* BIND8_STATS */
2270 		} else {
2271 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2272 			iovecs[i].iov_len = buffer_remaining(q->packet);
2273 		swap_drop:
2274 			STATUP(data->nsd, dropped);
2275 			ZTATUP(data->nsd, q->zone, dropped);
2276 			if(i != recvcount-1) {
2277 				/* swap with last and decrease recvcount */
2278 				struct mmsghdr mtmp = msgs[i];
2279 				struct iovec iotmp = iovecs[i];
2280 				recvcount--;
2281 				msgs[i] = msgs[recvcount];
2282 				iovecs[i] = iovecs[recvcount];
2283 				queries[i] = queries[recvcount];
2284 				msgs[recvcount] = mtmp;
2285 				iovecs[recvcount] = iotmp;
2286 				queries[recvcount] = q;
2287 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
2288 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
2289 				goto loopstart;
2290 			} else { recvcount --; }
2291 		}
2292 	}
2293 
2294 	/* send until all are sent */
2295 	i = 0;
2296 	while(i<recvcount) {
2297 		sent = sendmmsg(fd, &msgs[i], recvcount-i, 0);
2298 		if(sent == -1) {
2299 			const char* es = strerror(errno);
2300 			char a[48];
2301 			addr2str(&queries[i]->addr, a, sizeof(a));
2302 			log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
2303 #ifdef BIND8_STATS
2304 			data->nsd->st.txerr += recvcount-i;
2305 #endif /* BIND8_STATS */
2306 			break;
2307 		}
2308 		i += sent;
2309 	}
2310 	for(i=0; i<recvcount; i++) {
2311 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2312 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
2313 	}
2314 }
2315 
2316 #else /* defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG) */
2317 
2318 static void
2319 handle_udp(int fd, short event, void* arg)
2320 {
2321 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
2322 	int received, sent;
2323 #ifndef NONBLOCKING_IS_BROKEN
2324 #ifdef HAVE_RECVMMSG
2325 	int recvcount;
2326 #endif /* HAVE_RECVMMSG */
2327 	int i;
2328 #endif /* NONBLOCKING_IS_BROKEN */
2329 	struct query *q;
2330 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2331 	q = data->query;
2332 #endif
2333 
2334 	if (!(event & EV_READ)) {
2335 		return;
2336 	}
2337 #ifndef NONBLOCKING_IS_BROKEN
2338 #ifdef HAVE_RECVMMSG
2339 	recvcount = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
2340 	/* this printf strangely gave a performance increase on Linux */
2341 	/* printf("recvcount %d \n", recvcount); */
2342 	if (recvcount == -1) {
2343 		if (errno != EAGAIN && errno != EINTR) {
2344 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
2345 			STATUP(data->nsd, rxerr);
2346 			/* No zone statup */
2347 		}
2348 		/* Simply no data available */
2349 		return;
2350 	}
2351 	for (i = 0; i < recvcount; i++) {
2352 		received = msgs[i].msg_len;
2353 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2354 		if (received == -1) {
2355 			log_msg(LOG_ERR, "recvmmsg %d failed: msg_flags 0x%x", i, (unsigned)msgs[i].msg_hdr.msg_flags);
2356 			STATUP(data->nsd, rxerr);
2357 			/* No zone statup */
2358 			/* the error can be found in msgs[i].msg_hdr.msg_flags */
2359 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2360 			continue;
2361 		}
2362 		q = queries[i];
2363 #else
2364 	for(i=0; i<NUM_RECV_PER_SELECT; i++) {
2365 #endif /* HAVE_RECVMMSG */
2366 #endif /* NONBLOCKING_IS_BROKEN */
2367 
2368 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2369 		/* Initialize the query... */
2370 		query_reset(q, UDP_MAX_MESSAGE_LEN, 0);
2371 
2372 		received = recvfrom(fd,
2373 				    buffer_begin(q->packet),
2374 				    buffer_remaining(q->packet),
2375 				    0,
2376 				    (struct sockaddr *)&q->addr,
2377 				    &q->addrlen);
2378 		if (received == -1) {
2379 			if (errno != EAGAIN && errno != EINTR) {
2380 				log_msg(LOG_ERR, "recvfrom failed: %s", strerror(errno));
2381 				STATUP(data->nsd, rxerr);
2382 				/* No zone statup */
2383 			}
2384 			return;
2385 		}
2386 #endif /* NONBLOCKING_IS_BROKEN || !HAVE_RECVMMSG */
2387 
2388 		/* Account... */
2389 		if (data->socket->fam == AF_INET) {
2390 			STATUP(data->nsd, qudp);
2391 		} else if (data->socket->fam == AF_INET6) {
2392 			STATUP(data->nsd, qudp6);
2393 		}
2394 
2395 		buffer_skip(q->packet, received);
2396 		buffer_flip(q->packet);
2397 
2398 		/* Process and answer the query... */
2399 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
2400 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
2401 				STATUP(data->nsd, nona);
2402 				ZTATUP(data->nsd, q->zone, nona);
2403 			}
2404 
2405 #ifdef USE_ZONE_STATS
2406 			if (data->socket->fam == AF_INET) {
2407 				ZTATUP(data->nsd, q->zone, qudp);
2408 			} else if (data->socket->fam == AF_INET6) {
2409 				ZTATUP(data->nsd, q->zone, qudp6);
2410 			}
2411 #endif
2412 
2413 			/* Add EDNS0 and TSIG info if necessary.  */
2414 			query_add_optional(q, data->nsd);
2415 
2416 			buffer_flip(q->packet);
2417 
2418 			sent = sendto(fd,
2419 				      buffer_begin(q->packet),
2420 				      buffer_remaining(q->packet),
2421 				      0,
2422 				      (struct sockaddr *) &q->addr,
2423 				      q->addrlen);
2424 			if (sent == -1) {
2425 				const char* es = strerror(errno);
2426 				char a[48];
2427 				addr2str(&q->addr, a, sizeof(a));
2428 				log_msg(LOG_ERR, "sendto %s failed: %s", a, es);
2429 				STATUP(data->nsd, txerr);
2430 				ZTATUP(data->nsd, q->zone, txerr);
2431 			} else if ((size_t) sent != buffer_remaining(q->packet)) {
2432 				log_msg(LOG_ERR, "sent %d instead of %d bytes", (int) sent, (int) buffer_remaining(q->packet));
2433 			} else {
2434 #ifdef BIND8_STATS
2435 				/* Account the rcode & TC... */
2436 				STATUP2(data->nsd, rcode, RCODE(q->packet));
2437 				ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
2438 				if (TC(q->packet)) {
2439 					STATUP(data->nsd, truncated);
2440 					ZTATUP(data->nsd, q->zone, truncated);
2441 				}
2442 #endif /* BIND8_STATS */
2443 			}
2444 		} else {
2445 			STATUP(data->nsd, dropped);
2446 			ZTATUP(data->nsd, q->zone, dropped);
2447 		}
2448 #ifndef NONBLOCKING_IS_BROKEN
2449 #ifdef HAVE_RECVMMSG
2450 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2451 #endif
2452 	}
2453 #endif
2454 }
2455 #endif /* defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG) */
2456 
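     /*
      * Illustrative sketch (not compiled) of the batched I/O used by the
      * recvmmsg/sendmmsg variant of handle_udp() above, error handling
      * omitted.  sendmmsg() may send fewer messages than requested,
      * hence the resend loop:
      *
      *	int i = 0, n = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
      *	... process msgs[0..n-1]; answers are written into the iovecs ...
      *	while(i < n) {
      *		int sent = sendmmsg(fd, &msgs[i], n - i, 0);
      *		if(sent == -1) break;
      *		i += sent;
      *	}
      *
      * Discarded queries are first swapped to the end of the batch so
      * the send loop only sees messages that need an answer.
      */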
2457 
2458 static void
2459 cleanup_tcp_handler(struct tcp_handler_data* data)
2460 {
2461 	event_del(&data->event);
2462 	close(data->event.ev_fd);
2463 
2464 	/*
2465 	 * Enable the TCP accept handlers when the current number of
2466 	 * TCP connections is about to drop below the maximum number
2467 	 * of TCP connections.
2468 	 */
2469 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
2470 		configure_handler_event_types(EV_READ|EV_PERSIST);
2471 		if(slowaccept) {
2472 			event_del(&slowaccept_event);
2473 			slowaccept = 0;
2474 		}
2475 	}
2476 	--data->nsd->current_tcp_count;
2477 	assert(data->nsd->current_tcp_count >= 0);
2478 
2479 	region_destroy(data->region);
2480 }
2481 
2482 static void
2483 handle_tcp_reading(int fd, short event, void* arg)
2484 {
2485 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
2486 	ssize_t received;
2487 	struct event_base* ev_base;
2488 	struct timeval timeout;
2489 
2490 	if ((event & EV_TIMEOUT)) {
2491 		/* Connection timed out.  */
2492 		cleanup_tcp_handler(data);
2493 		return;
2494 	}
2495 
2496 	if (data->nsd->tcp_query_count > 0 &&
2497 		data->query_count >= data->nsd->tcp_query_count) {
2498 		/* No more queries allowed on this tcp connection.  */
2499 		cleanup_tcp_handler(data);
2500 		return;
2501 	}
2502 
2503 	assert((event & EV_READ));
2504 
2505 	if (data->bytes_transmitted == 0) {
2506 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
2507 	}
2508 
2509 	/*
2510 	 * Check if we received the leading packet length bytes yet.
2511 	 */
2512 	if (data->bytes_transmitted < sizeof(uint16_t)) {
2513 		received = read(fd,
2514 				(char *) &data->query->tcplen
2515 				+ data->bytes_transmitted,
2516 				sizeof(uint16_t) - data->bytes_transmitted);
2517 		if (received == -1) {
2518 			if (errno == EAGAIN || errno == EINTR) {
2519 				/*
2520 				 * Read would block, wait until more
2521 				 * data is available.
2522 				 */
2523 				return;
2524 			} else {
2525 				char buf[48];
2526 				addr2str(&data->query->addr, buf, sizeof(buf));
2527 #ifdef ECONNRESET
2528 				if (verbosity >= 2 || errno != ECONNRESET)
2529 #endif /* ECONNRESET */
2530 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
2531 				cleanup_tcp_handler(data);
2532 				return;
2533 			}
2534 		} else if (received == 0) {
2535 			/* EOF */
2536 			cleanup_tcp_handler(data);
2537 			return;
2538 		}
2539 
2540 		data->bytes_transmitted += received;
2541 		if (data->bytes_transmitted < sizeof(uint16_t)) {
2542 			/*
2543 			 * Not done with the tcplen yet, wait for more
2544 			 * data to become available.
2545 			 */
2546 			return;
2547 		}
2548 
2549 		assert(data->bytes_transmitted == sizeof(uint16_t));
2550 
2551 		data->query->tcplen = ntohs(data->query->tcplen);
2552 
2553 		/*
2554 		 * Minimum query size is:
2555 		 *
2556 		 *     Size of the header (12)
2557 		 *   + Root domain name   (1)
2558 		 *   + Query class        (2)
2559 		 *   + Query type         (2)
2560 		 */
2561 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
2562 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
2563 			cleanup_tcp_handler(data);
2564 			return;
2565 		}
2566 
2567 		if (data->query->tcplen > data->query->maxlen) {
2568 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
2569 			cleanup_tcp_handler(data);
2570 			return;
2571 		}
2572 
2573 		buffer_set_limit(data->query->packet, data->query->tcplen);
2574 	}
2575 
2576 	assert(buffer_remaining(data->query->packet) > 0);
2577 
2578 	/* Read the (remaining) query data.  */
2579 	received = read(fd,
2580 			buffer_current(data->query->packet),
2581 			buffer_remaining(data->query->packet));
2582 	if (received == -1) {
2583 		if (errno == EAGAIN || errno == EINTR) {
2584 			/*
2585 			 * Read would block, wait until more data is
2586 			 * available.
2587 			 */
2588 			return;
2589 		} else {
2590 			char buf[48];
2591 			addr2str(&data->query->addr, buf, sizeof(buf));
2592 #ifdef ECONNRESET
2593 			if (verbosity >= 2 || errno != ECONNRESET)
2594 #endif /* ECONNRESET */
2595 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
2596 			cleanup_tcp_handler(data);
2597 			return;
2598 		}
2599 	} else if (received == 0) {
2600 		/* EOF */
2601 		cleanup_tcp_handler(data);
2602 		return;
2603 	}
2604 
2605 	data->bytes_transmitted += received;
2606 	buffer_skip(data->query->packet, received);
2607 	if (buffer_remaining(data->query->packet) > 0) {
2608 		/*
2609 		 * Message not yet complete, wait for more data to
2610 		 * become available.
2611 		 */
2612 		return;
2613 	}
2614 
2615 	assert(buffer_position(data->query->packet) == data->query->tcplen);
2616 
2617 	/* Account... */
2618 #ifdef BIND8_STATS
2619 #ifndef INET6
2620 	STATUP(data->nsd, ctcp);
2621 #else
2622 	if (data->query->addr.ss_family == AF_INET) {
2623 		STATUP(data->nsd, ctcp);
2624 	} else if (data->query->addr.ss_family == AF_INET6) {
2625 		STATUP(data->nsd, ctcp6);
2626 	}
2627 #endif
2628 #endif /* BIND8_STATS */
2629 
2630 	/* We have a complete query, process it.  */
2631 
2632 	/* tcp-query-count: increment the per-connection query counter */
2633 	data->query_count++;
2634 
2635 	buffer_flip(data->query->packet);
2636 	data->query_state = server_process_query(data->nsd, data->query);
2637 	if (data->query_state == QUERY_DISCARDED) {
2638 		/* Drop the packet and the entire connection... */
2639 		STATUP(data->nsd, dropped);
2640 		ZTATUP(data->nsd, data->query->zone, dropped);
2641 		cleanup_tcp_handler(data);
2642 		return;
2643 	}
2644 
2645 #ifdef BIND8_STATS
2646 	if (RCODE(data->query->packet) == RCODE_OK
2647 	    && !AA(data->query->packet))
2648 	{
2649 		STATUP(data->nsd, nona);
2650 		ZTATUP(data->nsd, data->query->zone, nona);
2651 	}
2652 #endif /* BIND8_STATS */
2653 
2654 #ifdef USE_ZONE_STATS
2655 #ifndef INET6
2656 	ZTATUP(data->nsd, data->query->zone, ctcp);
2657 #else
2658 	if (data->query->addr.ss_family == AF_INET) {
2659 		ZTATUP(data->nsd, data->query->zone, ctcp);
2660 	} else if (data->query->addr.ss_family == AF_INET6) {
2661 		ZTATUP(data->nsd, data->query->zone, ctcp6);
2662 	}
2663 #endif
2664 #endif /* USE_ZONE_STATS */
2665 
2666 	query_add_optional(data->query, data->nsd);
2667 
2668 	/* Switch to the tcp write handler.  */
2669 	buffer_flip(data->query->packet);
2670 	data->query->tcplen = buffer_remaining(data->query->packet);
2671 	data->bytes_transmitted = 0;
2672 
2673 	timeout.tv_sec = data->tcp_timeout / 1000;
2674 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2675 
2676 	ev_base = data->event.ev_base;
2677 	event_del(&data->event);
2678 	event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
2679 		handle_tcp_writing, data);
2680 	if(event_base_set(ev_base, &data->event) != 0)
2681 		log_msg(LOG_ERR, "event base set tcpr failed");
2682 	if(event_add(&data->event, &timeout) != 0)
2683 		log_msg(LOG_ERR, "event add tcpr failed");
2684 	/* see if we can write the answer right away (usually so, EAGAIN if not) */
2685 	handle_tcp_writing(fd, EV_WRITE, data);
2686 }
2687 
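     /*
      * DNS over TCP (RFC 1035, section 4.2.2) prefixes every message
      * with a two-octet length in network byte order; that is what the
      * state machine above parses.  Sketch of the wire format:
      *
      *	+--------+--------+----------------------------+
      *	| len hi | len lo | DNS message (len octets)   |
      *	+--------+--------+----------------------------+
      *
      * Partial reads resume at data->bytes_transmitted, completing the
      * length first and then the message body.
      */
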
2688 static void
2689 handle_tcp_writing(int fd, short event, void* arg)
2690 {
2691 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
2692 	ssize_t sent;
2693 	struct query *q = data->query;
2694 	struct timeval timeout;
2695 	struct event_base* ev_base;
2696 
2697 	if ((event & EV_TIMEOUT)) {
2698 		/* Connection timed out.  */
2699 		cleanup_tcp_handler(data);
2700 		return;
2701 	}
2702 
2703 	assert((event & EV_WRITE));
2704 
2705 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
2706 		/* Writing the response packet length.  */
2707 		uint16_t n_tcplen = htons(q->tcplen);
2708 #ifdef HAVE_WRITEV
2709 		struct iovec iov[2];
2710 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
2711 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
2712 		iov[1].iov_base = buffer_begin(q->packet);
2713 		iov[1].iov_len = buffer_limit(q->packet);
2714 		sent = writev(fd, iov, 2);
2715 #else /* HAVE_WRITEV */
2716 		sent = write(fd,
2717 			     (const char *) &n_tcplen + data->bytes_transmitted,
2718 			     sizeof(n_tcplen) - data->bytes_transmitted);
2719 #endif /* HAVE_WRITEV */
2720 		if (sent == -1) {
2721 			if (errno == EAGAIN || errno == EINTR) {
2722 				/*
2723 				 * Write would block, wait until
2724 				 * socket becomes writable again.
2725 				 */
2726 				return;
2727 			} else {
2728 #ifdef ECONNRESET
2729 				if(verbosity >= 2 || errno != ECONNRESET)
2730 #endif /* ECONNRESET */
2731 #ifdef EPIPE
2732 				  if(verbosity >= 2 || errno != EPIPE)
2733 #endif /* EPIPE 'broken pipe' */
2734 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
2735 				cleanup_tcp_handler(data);
2736 				return;
2737 			}
2738 		}
2739 
2740 		data->bytes_transmitted += sent;
2741 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
2742 			/*
2743 			 * Writing not complete, wait until socket
2744 			 * becomes writable again.
2745 			 */
2746 			return;
2747 		}
2748 
2749 #ifdef HAVE_WRITEV
2750 		/* the writev may also have sent (part of) the packet data */
2751 		sent = data->bytes_transmitted - sizeof(n_tcplen);
2752 		goto packet_could_be_done;
2753 #endif
2754 	}
2755 
2756 	sent = write(fd,
2757 		     buffer_current(q->packet),
2758 		     buffer_remaining(q->packet));
2759 	if (sent == -1) {
2760 		if (errno == EAGAIN || errno == EINTR) {
2761 			/*
2762 			 * Write would block, wait until
2763 			 * socket becomes writable again.
2764 			 */
2765 			return;
2766 		} else {
2767 #ifdef ECONNRESET
2768 			if(verbosity >= 2 || errno != ECONNRESET)
2769 #endif /* ECONNRESET */
2770 #ifdef EPIPE
2771 				  if(verbosity >= 2 || errno != EPIPE)
2772 #endif /* EPIPE 'broken pipe' */
2773 			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
2774 			cleanup_tcp_handler(data);
2775 			return;
2776 		}
2777 	}
2778 
2779 	data->bytes_transmitted += sent;
2780 #ifdef HAVE_WRITEV
2781   packet_could_be_done:
2782 #endif
2783 	buffer_skip(q->packet, sent);
2784 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
2785 		/*
2786 		 * Still more data to write when socket becomes
2787 		 * writable again.
2788 		 */
2789 		return;
2790 	}
2791 
2792 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
2793 
2794 	if (data->query_state == QUERY_IN_AXFR) {
2795 		/* Continue processing AXFR and writing back results.  */
2796 		buffer_clear(q->packet);
2797 		data->query_state = query_axfr(data->nsd, q);
2798 		if (data->query_state != QUERY_PROCESSED) {
2799 			query_add_optional(data->query, data->nsd);
2800 
2801 			/* Reset data. */
2802 			buffer_flip(q->packet);
2803 			q->tcplen = buffer_remaining(q->packet);
2804 			data->bytes_transmitted = 0;
2805 			/* Reset timeout.  */
2806 			timeout.tv_sec = data->tcp_timeout / 1000;
2807 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2808 			ev_base = data->event.ev_base;
2809 			event_del(&data->event);
2810 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
2811 				handle_tcp_writing, data);
2812 			if(event_base_set(ev_base, &data->event) != 0)
2813 				log_msg(LOG_ERR, "event base set tcpw failed");
2814 			if(event_add(&data->event, &timeout) != 0)
2815 				log_msg(LOG_ERR, "event add tcpw failed");
2816 
2817 			/*
2818 			 * Write data if/when the socket is writable
2819 			 * again.
2820 			 */
2821 			return;
2822 		}
2823 	}
2824 
2825 	/*
2826 	 * Done sending, wait for the next request to arrive on the
2827 	 * TCP socket by installing the TCP read handler.
2828 	 */
2829 	if (data->nsd->tcp_query_count > 0 &&
2830 		data->query_count >= data->nsd->tcp_query_count) {
2831 
2832 		(void) shutdown(fd, SHUT_WR);
2833 	}
2834 
2835 	data->bytes_transmitted = 0;
2836 
2837 	timeout.tv_sec = data->tcp_timeout / 1000;
2838 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2839 	ev_base = data->event.ev_base;
2840 	event_del(&data->event);
2841 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
2842 		handle_tcp_reading, data);
2843 	if(event_base_set(ev_base, &data->event) != 0)
2844 		log_msg(LOG_ERR, "event base set tcpw failed");
2845 	if(event_add(&data->event, &timeout) != 0)
2846 		log_msg(LOG_ERR, "event add tcpw failed");
2847 }
2848 
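     /*
      * The writev() above coalesces the two length octets and the packet
      * data into one system call (and usually one TCP segment).  The
      * non-writev sketch of the same answer would be:
      *
      *	write(fd, &n_tcplen, sizeof(n_tcplen));
      *	write(fd, buffer_begin(q->packet), buffer_limit(q->packet));
      *
      * which risks sending the length bytes in their own small segment.
      */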
2849 
2850 static void
2851 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
2852 	void* ATTR_UNUSED(arg))
2853 {
2854 	if(slowaccept) {
2855 		configure_handler_event_types(EV_PERSIST | EV_READ);
2856 		slowaccept = 0;
2857 	}
2858 }
2859 
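     /*
      * Descriptor-exhaustion backoff: when accept() fails with EMFILE or
      * ENFILE, handle_tcp_accept() below removes all accept events and
      * arms this one-shot timer, roughly:
      *
      *	configure_handler_event_types(0);
      *	event_set(&slowaccept_event, -1, EV_TIMEOUT,
      *		handle_slowaccept_timeout, NULL);
      *	event_add(&slowaccept_event, &tv);
      *
      * so the process does not busy-loop on a full descriptor table.
      */
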
2860 /*
2861  * Handle an incoming TCP connection.  The connection is accepted and
2862  * a new TCP reader event handler is added.  The TCP handler
2863  * is responsible for cleanup when the connection is closed.
2864  */
2865 static void
2866 handle_tcp_accept(int fd, short event, void* arg)
2867 {
2868 	struct tcp_accept_handler_data *data
2869 		= (struct tcp_accept_handler_data *) arg;
2870 	int s;
2871 	struct tcp_handler_data *tcp_data;
2872 	region_type *tcp_region;
2873 #ifdef INET6
2874 	struct sockaddr_storage addr;
2875 #else
2876 	struct sockaddr_in addr;
2877 #endif
2878 	socklen_t addrlen;
2879 	struct timeval timeout;
2880 
2881 	if (!(event & EV_READ)) {
2882 		return;
2883 	}
2884 
2885 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
2886 		return;
2887 	}
2888 
2889 	/* Accept it... */
2890 	addrlen = sizeof(addr);
2891 #ifndef HAVE_ACCEPT4
2892 	s = accept(fd, (struct sockaddr *) &addr, &addrlen);
2893 #else
2894 	s = accept4(fd, (struct sockaddr *) &addr, &addrlen, SOCK_NONBLOCK);
2895 #endif
2896 	if (s == -1) {
2897 		/*
2898 		 * EMFILE and ENFILE signal that the limit on open file
2899 		 * descriptors has been reached; pause accept().
2900 		 * EINTR is a signal interrupt. The others are various OS
2901 		 * ways of saying that the client has closed the connection.
2902 		 */
2903 		if (errno == EMFILE || errno == ENFILE) {
2904 			if (!slowaccept) {
2905 				/* disable accept events */
2906 				struct timeval tv;
2907 				configure_handler_event_types(0);
2908 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
2909 				tv.tv_usec = 0L;
2910 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
2911 					handle_slowaccept_timeout, NULL);
2912 				(void)event_base_set(data->event.ev_base,
2913 					&slowaccept_event);
2914 				(void)event_add(&slowaccept_event, &tv);
2915 				slowaccept = 1;
2916 				/* We don't want to spam the logs here */
2917 			}
2918 		} else if (errno != EINTR
2919 			&& errno != EWOULDBLOCK
2920 #ifdef ECONNABORTED
2921 			&& errno != ECONNABORTED
2922 #endif /* ECONNABORTED */
2923 #ifdef EPROTO
2924 			&& errno != EPROTO
2925 #endif /* EPROTO */
2926 			) {
2927 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
2928 		}
2929 		return;
2930 	}
2931 
2932 #ifndef HAVE_ACCEPT4
2933 	if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
2934 		log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
2935 		close(s);
2936 		return;
2937 	}
2938 #endif
2939 
2940 	/*
2941 	 * This region is deallocated when the TCP connection is
2942 	 * closed by the TCP handler.
2943 	 */
2944 	tcp_region = region_create(xalloc, free);
2945 	tcp_data = (struct tcp_handler_data *) region_alloc(
2946 		tcp_region, sizeof(struct tcp_handler_data));
2947 	tcp_data->region = tcp_region;
2948 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
2949 		compression_table_size, compressed_dnames);
2950 	tcp_data->nsd = data->nsd;
2951 	tcp_data->query_count = 0;
2952 
2953 	tcp_data->query_state = QUERY_PROCESSED;
2954 	tcp_data->bytes_transmitted = 0;
2955 	memcpy(&tcp_data->query->addr, &addr, addrlen);
2956 	tcp_data->query->addrlen = addrlen;
2957 
2958 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
2959 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
2960 		/* very busy, give smaller timeout */
2961 		tcp_data->tcp_timeout = 200;
2962 	}
2963 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
2964 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
2965 
2966 	event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
2967 		handle_tcp_reading, tcp_data);
2968 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
2969 		log_msg(LOG_ERR, "cannot set tcp event base");
2970 		close(s);
2971 		region_destroy(tcp_region);
2972 		return;
2973 	}
2974 	if(event_add(&tcp_data->event, &timeout) != 0) {
2975 		log_msg(LOG_ERR, "cannot add tcp to event base");
2976 		close(s);
2977 		region_destroy(tcp_region);
2978 		return;
2979 	}
2980 
2981 	/*
2982 	 * Keep track of the total number of TCP handlers installed so
2983 	 * we can stop accepting connections when the maximum number
2984 	 * of simultaneous TCP connections is reached.
2985 	 */
2986 	++data->nsd->current_tcp_count;
2987 	if (data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
2988 		configure_handler_event_types(0);
2989 	}
2990 }
2991 
2992 static void
2993 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
2994 {
2995 	size_t i;
2996 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
2997 	for (i = 0; i < nsd->child_count; ++i) {
2998 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
2999 			if (write(nsd->children[i].child_fd,
3000 				&command,
3001 				sizeof(command)) == -1)
3002 			{
3003 				if(errno != EAGAIN && errno != EINTR)
3004 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
3005 					(int) command,
3006 					(int) nsd->children[i].pid,
3007 					strerror(errno));
3008 			} else if (timeout > 0) {
3009 				(void)block_read(NULL,
3010 					nsd->children[i].child_fd,
3011 					&command, sizeof(command), timeout);
3012 			}
3013 			fsync(nsd->children[i].child_fd);
3014 			close(nsd->children[i].child_fd);
3015 			nsd->children[i].child_fd = -1;
3016 		}
3017 	}
3018 }
3019 
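     /*
      * With a nonzero timeout the parent block_read()s the child's reply,
      * waiting up to that many seconds per child for an acknowledgement
      * before closing the socket (used by send_children_quit_and_wait()).
      */
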
3020 static void
3021 send_children_quit(struct nsd* nsd)
3022 {
3023 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
3024 	send_children_command(nsd, NSD_QUIT, 0);
3025 }
3026 
3027 static void
3028 send_children_quit_and_wait(struct nsd* nsd)
3029 {
3030 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
3031 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
3032 }
3033 
3034 #ifdef BIND8_STATS
3035 static void
3036 set_children_stats(struct nsd* nsd)
3037 {
3038 	size_t i;
3039 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
3040 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
3041 	for (i = 0; i < nsd->child_count; ++i) {
3042 		nsd->children[i].need_to_send_STATS = 1;
3043 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
3044 	}
3045 }
3046 #endif /* BIND8_STATS */
3047 
3048 static void
3049 configure_handler_event_types(short event_types)
3050 {
3051 	size_t i;
3052 
3053 	for (i = 0; i < tcp_accept_handler_count; ++i) {
3054 		struct event* handler = &tcp_accept_handlers[i].event;
3055 		if(event_types) {
3056 			/* reassign */
3057 			int fd = handler->ev_fd;
3058 			struct event_base* base = handler->ev_base;
3059 			if(tcp_accept_handlers[i].event_added)
3060 				event_del(handler);
3061 			event_set(handler, fd, event_types,
3062 				handle_tcp_accept, &tcp_accept_handlers[i]);
3063 			if(event_base_set(base, handler) != 0)
3064 				log_msg(LOG_ERR, "conhand: cannot set event base");
3065 			if(event_add(handler, NULL) != 0)
3066 				log_msg(LOG_ERR, "conhand: cannot add event");
3067 			tcp_accept_handlers[i].event_added = 1;
3068 		} else {
3069 			/* remove */
3070 			if(tcp_accept_handlers[i].event_added) {
3071 				event_del(handler);
3072 				tcp_accept_handlers[i].event_added = 0;
3073 			}
3074 		}
3075 	}
3076 }
3077