xref: /openbsd-src/usr.sbin/nsd/server.c (revision c90a81c56dcebd6a1b73fe4aff9b03385b8e63b3)
1 /*
2  * server.c -- nsd(8) network input/output
3  *
4  * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
5  *
6  * See LICENSE for the license.
7  *
8  */
9 
10 #include "config.h"
11 
12 #include <sys/types.h>
13 #include <sys/param.h>
14 #include <sys/socket.h>
15 #include <sys/uio.h>
16 #include <sys/wait.h>
17 
18 #include <netinet/in.h>
19 #include <arpa/inet.h>
20 
21 #include <assert.h>
22 #include <ctype.h>
23 #include <errno.h>
24 #include <fcntl.h>
25 #include <stddef.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <time.h>
30 #include <unistd.h>
31 #include <signal.h>
32 #include <netdb.h>
33 #include <poll.h>
34 #ifndef SHUT_WR
35 #define SHUT_WR 1
36 #endif
37 #ifdef HAVE_MMAP
38 #include <sys/mman.h>
39 #endif /* HAVE_MMAP */
40 #ifdef HAVE_OPENSSL_RAND_H
41 #include <openssl/rand.h>
42 #endif
43 #ifndef USE_MINI_EVENT
44 #  ifdef HAVE_EVENT_H
45 #    include <event.h>
46 #  else
47 #    include <event2/event.h>
48 #    include "event2/event_struct.h"
49 #    include "event2/event_compat.h"
50 #  endif
51 #else
52 #  include "mini_event.h"
53 #endif
54 
55 #include "axfr.h"
56 #include "namedb.h"
57 #include "netio.h"
58 #include "xfrd.h"
59 #include "xfrd-tcp.h"
60 #include "xfrd-disk.h"
61 #include "difffile.h"
62 #include "nsec3.h"
63 #include "ipc.h"
64 #include "udb.h"
65 #include "remote.h"
66 #include "lookup3.h"
67 #include "rrl.h"
68 #ifdef USE_DNSTAP
69 #include "dnstap/dnstap_collector.h"
70 #endif
71 
72 #define RELOAD_SYNC_TIMEOUT 25 /* seconds */
73 
74 /*
75  * Data for the UDP handlers.
76  */
77 struct udp_handler_data
78 {
79 	struct nsd        *nsd;
80 	struct nsd_socket *socket;
81 	query_type        *query;
82 };
83 
84 struct tcp_accept_handler_data {
85 	struct nsd         *nsd;
86 	struct nsd_socket  *socket;
87 	int event_added;
88 	struct event       event;
89 };
90 
91 /*
92  * These globals are used to enable the TCP accept handlers
93  * when the number of TCP connections drops below the maximum
94  * number of TCP connections.
95  */
96 static size_t		tcp_accept_handler_count;
97 static struct tcp_accept_handler_data*	tcp_accept_handlers;
98 
99 static struct event slowaccept_event;
100 static int slowaccept;
101 
102 #ifndef NONBLOCKING_IS_BROKEN
103 #  define NUM_RECV_PER_SELECT 100
104 #endif
105 
106 #if (!defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG))
107 struct mmsghdr msgs[NUM_RECV_PER_SELECT];
108 struct iovec iovecs[NUM_RECV_PER_SELECT];
109 struct query *queries[NUM_RECV_PER_SELECT];
110 #endif
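#if 0
/*
 * Illustrative sketch only (the real setup code runs elsewhere, in the
 * child server, after the queries are created): each mmsghdr is paired
 * with one iovec pointing into one query's packet buffer, so a single
 * recvmmsg() call can fill up to NUM_RECV_PER_SELECT queries at once.
 */
static void
sketch_setup_recvmmsg_arrays(void)
{
	int i;
	for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
		iovecs[i].iov_base = buffer_begin(queries[i]->packet);
		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
		msgs[i].msg_hdr.msg_iov = &iovecs[i];
		msgs[i].msg_hdr.msg_iovlen = 1;
		msgs[i].msg_hdr.msg_name = &queries[i]->addr;
		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
	}
}
#endif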
111 
112 /*
113  * Data for the TCP connection handlers.
114  *
115  * The TCP handlers use non-blocking I/O.  This is necessary to avoid
116  * blocking the entire server on a slow TCP connection, but does make
117  * reading from and writing to the socket more complicated.
118  *
119  * Basically, whenever a read/write would block (indicated by errno
120  * being set to EAGAIN) we remember the position we were reading
121  * from/writing to and return from the TCP reading/writing event
122  * handler.  When the socket becomes readable/writable again we
123  * continue from the same position.
124  */
125 struct tcp_handler_data
126 {
127 	/*
128 	 * The region used to allocate all TCP connection related
129 	 * data, including this structure.  This region is destroyed
130 	 * when the connection is closed.
131 	 */
132 	region_type*		region;
133 
134 	/*
135 	 * The global nsd structure.
136 	 */
137 	struct nsd*			nsd;
138 
139 	/*
140 	 * The current query data for this TCP connection.
141 	 */
142 	query_type*			query;
143 
144 	/*
145 	 * The query_state is used to remember if we are performing an
146 	 * AXFR, if we're done processing, or if we should discard the
147 	 * query and connection.
148 	 */
149 	query_state_type	query_state;
150 
151 	/*
152 	 * The event for the file descriptor and tcp timeout
153 	 */
154 	struct event event;
155 
156 	/*
157 	 * The bytes_transmitted field is used to remember the number
158 	 * of bytes transmitted when receiving or sending a DNS
159 	 * packet.  The count includes the two additional bytes used
160 	 * to specify the packet length on a TCP connection.
161 	 */
162 	size_t				bytes_transmitted;
163 
164 	/*
165 	 * The number of queries handled by this specific TCP connection.
166 	 */
167 	int					query_count;
168 
169 	/*
170 	 * The timeout in msec for this tcp connection
171 	 */
172 	int	tcp_timeout;
173 };
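#if 0
/*
 * Illustrative sketch of the resume-on-EAGAIN pattern described above
 * (the sketch_* name is hypothetical, not part of nsd): bytes_transmitted
 * records how far we got, so the handler can pick up where it left off
 * the next time the socket becomes writable.
 */
static int
sketch_write_resume(int fd, struct tcp_handler_data* data,
	const uint8_t* buf, size_t len)
{
	while (data->bytes_transmitted < len) {
		ssize_t n = write(fd, buf + data->bytes_transmitted,
			len - data->bytes_transmitted);
		if (n == -1) {
			if (errno == EAGAIN)
				return 0; /* resume when writable again */
			if (errno == EINTR)
				continue;
			return -1; /* real error: drop the connection */
		}
		data->bytes_transmitted += (size_t)n;
	}
	return 1; /* complete */
}
#endif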
174 
175 /*
176  * Handle incoming queries on the UDP server sockets.
177  */
178 static void handle_udp(int fd, short event, void* arg);
179 
180 /*
181  * Handle incoming connections on the TCP sockets.  These handlers
182  * usually wait for the NETIO_EVENT_READ event (indicating an incoming
183  * connection) but are disabled when the number of current TCP
184  * connections is equal to the maximum number of TCP connections.
185  * Disabling is done by changing the handler to wait for the
186  * NETIO_EVENT_NONE type.  This is done using the function
187  * configure_tcp_accept_handlers.
188  */
189 static void handle_tcp_accept(int fd, short event, void* arg);
190 
191 /*
192  * Handle incoming queries on a TCP connection.  The TCP connections
193  * are configured to be non-blocking and the handler may be called
194  * multiple times before a complete query is received.
195  */
196 static void handle_tcp_reading(int fd, short event, void* arg);
197 
198 /*
199  * Handle outgoing responses on a TCP connection.  The TCP connections
200  * are configured to be non-blocking and the handler may be called
201  * multiple times before a complete response is sent.
202  */
203 static void handle_tcp_writing(int fd, short event, void* arg);
204 
205 /*
206  * Send the quit command to all children (nonblocking), then close the pipe.
207  */
208 static void send_children_quit(struct nsd* nsd);
209 /* same, for shutdown time, waits for child to exit to avoid restart issues */
210 static void send_children_quit_and_wait(struct nsd* nsd);
211 
212 /* set children's flags to send NSD_STATS to them */
213 #ifdef BIND8_STATS
214 static void set_children_stats(struct nsd* nsd);
215 #endif /* BIND8_STATS */
216 
217 /*
218  * Change the event types the HANDLERS are interested in to EVENT_TYPES.
219  */
220 static void configure_handler_event_types(short event_types);
221 
222 static uint16_t *compressed_dname_offsets = 0;
223 static uint32_t compression_table_capacity = 0;
224 static uint32_t compression_table_size = 0;
225 static domain_type* compressed_dnames[MAXRRSPP];
226 
227 /*
228  * Remove the specified pid from the list of child pids.  Returns the child
229  * number, or -1 if the pid is not in the list.  The pid field is set to 0.
230  */
231 static int
232 delete_child_pid(struct nsd *nsd, pid_t pid)
233 {
234 	size_t i;
235 	for (i = 0; i < nsd->child_count; ++i) {
236 		if (nsd->children[i].pid == pid) {
237 			nsd->children[i].pid = 0;
238 			if(!nsd->children[i].need_to_exit) {
239 				if(nsd->children[i].child_fd != -1)
240 					close(nsd->children[i].child_fd);
241 				nsd->children[i].child_fd = -1;
242 				if(nsd->children[i].handler)
243 					nsd->children[i].handler->fd = -1;
244 			}
245 			return i;
246 		}
247 	}
248 	return -1;
249 }
250 
251 /*
252  * Restart child servers if necessary.
253  */
254 static int
255 restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
256 	int* xfrd_sock_p)
257 {
258 	struct main_ipc_handler_data *ipc_data;
259 	size_t i;
260 	int sv[2];
261 
262 	/* Fork the child processes... */
263 	for (i = 0; i < nsd->child_count; ++i) {
264 		if (nsd->children[i].pid <= 0) {
265 			if (nsd->children[i].child_fd != -1)
266 				close(nsd->children[i].child_fd);
267 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
268 				log_msg(LOG_ERR, "socketpair: %s",
269 					strerror(errno));
270 				return -1;
271 			}
272 			nsd->children[i].child_fd = sv[0];
273 			nsd->children[i].parent_fd = sv[1];
274 			nsd->children[i].pid = fork();
275 			switch (nsd->children[i].pid) {
276 			default: /* SERVER MAIN */
277 				close(nsd->children[i].parent_fd);
278 				nsd->children[i].parent_fd = -1;
279 				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
280 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
281 				}
282 				if(!nsd->children[i].handler)
283 				{
284 					ipc_data = (struct main_ipc_handler_data*) region_alloc(
285 						region, sizeof(struct main_ipc_handler_data));
286 					ipc_data->nsd = nsd;
287 					ipc_data->child = &nsd->children[i];
288 					ipc_data->child_num = i;
289 					ipc_data->xfrd_sock = xfrd_sock_p;
290 					ipc_data->packet = buffer_create(region, QIOBUFSZ);
291 					ipc_data->forward_mode = 0;
292 					ipc_data->got_bytes = 0;
293 					ipc_data->total_bytes = 0;
294 					ipc_data->acl_num = 0;
295 					nsd->children[i].handler = (struct netio_handler*) region_alloc(
296 						region, sizeof(struct netio_handler));
297 					nsd->children[i].handler->fd = nsd->children[i].child_fd;
298 					nsd->children[i].handler->timeout = NULL;
299 					nsd->children[i].handler->user_data = ipc_data;
300 					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
301 					nsd->children[i].handler->event_handler = parent_handle_child_command;
302 					netio_add_handler(netio, nsd->children[i].handler);
303 				}
304 				/* clear any ongoing ipc */
305 				ipc_data = (struct main_ipc_handler_data*)
306 					nsd->children[i].handler->user_data;
307 				ipc_data->forward_mode = 0;
308 				/* restart - update fd */
309 				nsd->children[i].handler->fd = nsd->children[i].child_fd;
310 				break;
311 			case 0: /* CHILD */
312 				/* the child need not be able to access the
313 				 * nsd.db file */
314 				namedb_close_udb(nsd->db);
315 #ifdef MEMCLEAN /* OS collects memory pages */
316 				region_destroy(region);
317 #endif
318 
319 				if (pledge("stdio rpath inet", NULL) == -1) {
320 					log_msg(LOG_ERR, "pledge");
321 					exit(1);
322 				}
323 
324 				nsd->pid = 0;
325 				nsd->child_count = 0;
326 				nsd->server_kind = nsd->children[i].kind;
327 				nsd->this_child = &nsd->children[i];
328 				nsd->this_child->child_num = i;
329 				/* remove signal flags inherited from parent;
330 				   the parent will handle them. */
331 				nsd->signal_hint_reload_hup = 0;
332 				nsd->signal_hint_reload = 0;
333 				nsd->signal_hint_child = 0;
334 				nsd->signal_hint_quit = 0;
335 				nsd->signal_hint_shutdown = 0;
336 				nsd->signal_hint_stats = 0;
337 				nsd->signal_hint_statsusr = 0;
338 				close(*xfrd_sock_p);
339 				close(nsd->this_child->child_fd);
340 				nsd->this_child->child_fd = -1;
341 				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
342 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
343 				}
344 				server_child(nsd);
345 				/* NOTREACHED */
346 				exit(0);
347 			case -1:
348 				log_msg(LOG_ERR, "fork failed: %s",
349 					strerror(errno));
350 				return -1;
351 			}
352 		}
353 	}
354 	return 0;
355 }
356 
357 #ifdef BIND8_STATS
358 static void set_bind8_alarm(struct nsd* nsd)
359 {
360 	/* resync so that the next alarm is on the next whole period boundary */
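	/* e.g. with period 3600 and time(NULL) 10000: 10000 % 3600 = 2800,
	 * so alarm(800) fires at t = 10800, the next period boundary */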
361 	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
362 		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
363 }
364 #endif
365 
366 /* set zone stat ids for zones initially read in */
367 static void
368 zonestatid_tree_set(struct nsd* nsd)
369 {
370 	struct radnode* n;
371 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
372 		zone_type* zone = (zone_type*)n->elem;
373 		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
374 	}
375 }
376 
377 #ifdef USE_ZONE_STATS
378 void
379 server_zonestat_alloc(struct nsd* nsd)
380 {
381 	size_t num = (nsd->options->zonestatnames->count==0?1:
382 			nsd->options->zonestatnames->count);
383 	size_t sz = sizeof(struct nsdst)*num;
384 	char tmpfile[256];
385 	uint8_t z = 0;
386 
387 	/* file names */
388 	nsd->zonestatfname[0] = 0;
389 	nsd->zonestatfname[1] = 0;
390 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
391 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
392 	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
393 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
394 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
395 	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);
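	/* e.g. with xfrdir "/tmp/" and pid 1234 this yields
	 * "/tmp/nsd-xfr-1234/nsd.1234.zstat.0" and the matching ".1" file */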
396 
397 	/* file descriptors */
398 	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
399 	if(nsd->zonestatfd[0] == -1) {
400 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
401 			strerror(errno));
402 		exit(1);
403 	}
404 	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
405 	if(nsd->zonestatfd[1] == -1) {
406 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
407 			strerror(errno));
408 		close(nsd->zonestatfd[0]);
409 		unlink(nsd->zonestatfname[0]);
410 		exit(1);
411 	}
412 
413 #ifdef HAVE_MMAP
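	/* Extend each (still empty) file to sz bytes by seeking to the
	 * last byte and writing a single zero byte; touching a mapping
	 * beyond the end of the underlying file would fault (SIGBUS). */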
414 	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
415 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
416 			strerror(errno));
417 		exit(1);
418 	}
419 	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
420 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
421 			nsd->zonestatfname[0], strerror(errno));
422 		exit(1);
423 	}
424 	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
425 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
426 			strerror(errno));
427 		exit(1);
428 	}
429 	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
430 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
431 			nsd->zonestatfname[1], strerror(errno));
432 		exit(1);
433 	}
434 	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
435 		MAP_SHARED, nsd->zonestatfd[0], 0);
436 	if(nsd->zonestat[0] == MAP_FAILED) {
437 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
438 		unlink(nsd->zonestatfname[0]);
439 		unlink(nsd->zonestatfname[1]);
440 		exit(1);
441 	}
442 	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
443 		MAP_SHARED, nsd->zonestatfd[1], 0);
444 	if(nsd->zonestat[1] == MAP_FAILED) {
445 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
446 		unlink(nsd->zonestatfname[0]);
447 		unlink(nsd->zonestatfname[1]);
448 		exit(1);
449 	}
450 	memset(nsd->zonestat[0], 0, sz);
451 	memset(nsd->zonestat[1], 0, sz);
452 	nsd->zonestatsize[0] = num;
453 	nsd->zonestatsize[1] = num;
454 	nsd->zonestatdesired = num;
455 	nsd->zonestatsizenow = num;
456 	nsd->zonestatnow = nsd->zonestat[0];
457 #endif /* HAVE_MMAP */
458 }
459 
460 void
461 zonestat_remap(struct nsd* nsd, int idx, size_t sz)
462 {
463 #ifdef HAVE_MMAP
464 #ifdef MREMAP_MAYMOVE
465 	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
466 		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
467 		MREMAP_MAYMOVE);
468 	if(nsd->zonestat[idx] == MAP_FAILED) {
469 		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
470 		exit(1);
471 	}
472 #else /* !MREMAP_MAYMOVE */
473 	if(msync(nsd->zonestat[idx],
474 		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
475 		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
476 	if(munmap(nsd->zonestat[idx],
477 		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
478 		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
479 	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
480 		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
481 	if(nsd->zonestat[idx] == MAP_FAILED) {
482 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
483 		exit(1);
484 	}
485 #endif /* MREMAP_MAYMOVE */
486 #endif /* HAVE_MMAP */
487 }
488 
489 /* realloc the zonestat array for the one that is not currently in use,
490  * to match the desired new size of the array (if applicable) */
491 void
492 server_zonestat_realloc(struct nsd* nsd)
493 {
494 #ifdef HAVE_MMAP
495 	uint8_t z = 0;
496 	size_t sz;
497 	int idx = 0; /* index of the zonestat array that is not in use */
498 	if(nsd->zonestatnow == nsd->zonestat[0])
499 		idx = 1;
500 	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
501 		return;
502 	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
503 	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
504 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
505 			strerror(errno));
506 		exit(1);
507 	}
508 	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
509 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
510 			nsd->zonestatfname[idx], strerror(errno));
511 		exit(1);
512 	}
513 	zonestat_remap(nsd, idx, sz);
514 	/* zero the newly allocated region */
515 	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
516 		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
517 			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
518 			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
519 	}
520 	nsd->zonestatsize[idx] = nsd->zonestatdesired;
521 #endif /* HAVE_MMAP */
522 }
523 
524 /* switch over to the other array for the new children, which briefly
525  * coexist with the old children; this avoids both generations writing
526  * to the same statistics array. */
527 void
528 server_zonestat_switch(struct nsd* nsd)
529 {
530 	if(nsd->zonestatnow == nsd->zonestat[0]) {
531 		nsd->zonestatnow = nsd->zonestat[1];
532 		nsd->zonestatsizenow = nsd->zonestatsize[1];
533 	} else {
534 		nsd->zonestatnow = nsd->zonestat[0];
535 		nsd->zonestatsizenow = nsd->zonestatsize[0];
536 	}
537 }
538 #endif /* USE_ZONE_STATS */
539 
540 static void
541 cleanup_dname_compression_tables(void *ptr)
542 {
543 	free(ptr);
544 	compressed_dname_offsets = NULL;
545 	compression_table_capacity = 0;
546 }
547 
548 static void
549 initialize_dname_compression_tables(struct nsd *nsd)
550 {
551 	size_t needed = domain_table_count(nsd->db->domains) + 1;
552 	needed += EXTRA_DOMAIN_NUMBERS;
553 	if(compression_table_capacity < needed) {
554 		if(compressed_dname_offsets) {
555 			region_remove_cleanup(nsd->db->region,
556 				cleanup_dname_compression_tables,
557 				compressed_dname_offsets);
558 			free(compressed_dname_offsets);
559 		}
560 		compressed_dname_offsets = (uint16_t *) xmallocarray(
561 			needed, sizeof(uint16_t));
562 		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
563 			compressed_dname_offsets);
564 		compression_table_capacity = needed;
565 		compression_table_size=domain_table_count(nsd->db->domains)+1;
566 	}
567 	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
568 	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
569 }
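/*
 * Usage sketch (an assumption about how the query-processing code indexes
 * this table, based on the domain numbering used elsewhere in nsd):
 * compressed_dname_offsets[domain->number] caches the packet offset at
 * which that domain name was written, so a later occurrence of the same
 * name can be replaced by a two-byte compression pointer.
 */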
570 
571 /* create and bind sockets.  */
572 static int
573 server_init_ifs(struct nsd *nsd, size_t from, size_t to, int* reuseport_works)
574 {
575 	struct addrinfo* addr;
576 	size_t i;
577 #if defined(SO_REUSEPORT) || defined(SO_REUSEADDR) || (defined(INET6) && (defined(IPV6_V6ONLY) || defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU) || defined(IP_TRANSPARENT)) || defined(IP_FREEBIND) || defined(SO_BINDANY))
578 	int on = 1;
579 #endif
580 
581 	/* UDP */
582 
583 	/* Make a socket... */
584 	for (i = from; i < to; i++) {
585 		/* for reuseports copy socket specs of first entries */
586 		addr = nsd->udp[i%nsd->ifs].addr;
587 		if (!addr) {
588 			nsd->udp[i].s = -1;
589 			continue;
590 		}
591 		nsd->udp[i].fam = (int)addr->ai_family;
592 		if ((nsd->udp[i].s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
593 #if defined(INET6)
594 			if (addr->ai_family == AF_INET6 &&
595 				errno == EAFNOSUPPORT && nsd->grab_ip6_optional) {
596 				log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: not supported");
597 				continue;
598 			}
599 #endif /* INET6 */
600 			log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
601 			return -1;
602 		}
603 
604 #ifdef SO_REUSEPORT
605 #  ifdef SO_REUSEPORT_LB
606 		/* FreeBSD 12 has SO_REUSEPORT_LB, which load-balances the
607 		 * way SO_REUSEPORT does on Linux.  That is what users want
608 		 * from the config option in nsd.conf; if they actually need
609 		 * plain local address and port reuse they will have to set
610 		 * SO_REUSEPORT themselves, so assume _LB is what they want.
611 		 */
612 		if(nsd->reuseport && *reuseport_works &&
613 			setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_REUSEPORT_LB,
614 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
615 			if(verbosity >= 3
616 #ifdef ENOPROTOOPT
617 				|| errno != ENOPROTOOPT
618 #endif
619 				)
620 			    log_msg(LOG_ERR, "setsockopt(..., SO_REUSEPORT_LB, "
621 				"...) failed: %s", strerror(errno));
622 			*reuseport_works = 0;
623 		}
624 #  else /* SO_REUSEPORT_LB */
625 		if(nsd->reuseport && *reuseport_works &&
626 			setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_REUSEPORT,
627 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
628 			if(verbosity >= 3
629 #ifdef ENOPROTOOPT
630 				|| errno != ENOPROTOOPT
631 #endif
632 				)
633 			    log_msg(LOG_ERR, "setsockopt(..., SO_REUSEPORT, "
634 				"...) failed: %s", strerror(errno));
635 			*reuseport_works = 0;
636 		}
637 #  endif /* SO_REUSEPORT_LB */
638 #else
639 		(void)reuseport_works;
640 #endif /* SO_REUSEPORT */
641 #if defined(SO_RCVBUF) || defined(SO_SNDBUF)
642 	if(1) {
643 	int rcv = 1*1024*1024;
644 	int snd = 1*1024*1024;
645 
646 #ifdef SO_RCVBUF
647 #  ifdef SO_RCVBUFFORCE
648 	if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv,
649 		(socklen_t)sizeof(rcv)) < 0) {
650 		if(errno != EPERM && errno != ENOBUFS) {
651 			log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, "
652                                         "...) failed: %s", strerror(errno));
653 			return -1;
654 		}
655 #  else
656 	if(1) {
657 #  endif /* SO_RCVBUFFORCE */
658 		if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv,
659 			 (socklen_t)sizeof(rcv)) < 0) {
660 			if(errno != ENOBUFS && errno != ENOSYS) {
661 				log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, "
662                                         "...) failed: %s", strerror(errno));
663 				return -1;
664 			}
665 		}
666 	}
667 #endif /* SO_RCVBUF */
668 
669 #ifdef SO_SNDBUF
670 #  ifdef SO_SNDBUFFORCE
671 	if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd,
672 		(socklen_t)sizeof(snd)) < 0) {
673 		if(errno != EPERM && errno != ENOBUFS) {
674 			log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, "
675                                         "...) failed: %s", strerror(errno));
676 			return -1;
677 		}
678 #  else
679 	if(1) {
680 #  endif /* SO_SNDBUFFORCE */
681 		if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_SNDBUF, (void*)&snd,
682 			 (socklen_t)sizeof(snd)) < 0) {
683 			if(errno != ENOBUFS && errno != ENOSYS) {
684 				log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, "
685                                         "...) failed: %s", strerror(errno));
686 				return -1;
687 			}
688 		}
689 	}
690 #endif /* SO_SNDBUF */
691 
692 	}
693 #endif /* defined(SO_RCVBUF) || defined(SO_SNDBUF) */
694 
695 #if defined(INET6)
696 		if (addr->ai_family == AF_INET6) {
697 # if defined(IPV6_V6ONLY)
698 			if (setsockopt(nsd->udp[i].s,
699 				       IPPROTO_IPV6, IPV6_V6ONLY,
700 				       &on, sizeof(on)) < 0)
701 			{
702 				log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
703 					strerror(errno));
704 				return -1;
705 			}
706 # endif
707 # if defined(IPV6_USE_MIN_MTU)
708 			/*
709 			 * There is no fragmentation of IPv6 datagrams
710 			 * during forwarding in the network. Therefore
711 			 * we do not send UDP datagrams larger than
712 			 * the minimum IPv6 MTU of 1280 octets. The
713 			 * EDNS0 message length can be larger if the
714 			 * network stack supports IPV6_USE_MIN_MTU.
715 			 */
716 			if (setsockopt(nsd->udp[i].s,
717 				       IPPROTO_IPV6, IPV6_USE_MIN_MTU,
718 				       &on, sizeof(on)) < 0)
719 			{
720 				log_msg(LOG_ERR, "setsockopt(..., IPV6_USE_MIN_MTU, ...) failed: %s",
721 					strerror(errno));
722 				return -1;
723 			}
724 # elif defined(IPV6_MTU)
725 			/*
726 			 * On Linux, PMTUD is disabled by default for datagrams,
727 			 * so set the MTU to the minimum MTU for the same effect.
728 			 */
729 			on = IPV6_MIN_MTU;
730 			if (setsockopt(nsd->udp[i].s, IPPROTO_IPV6, IPV6_MTU,
731 				&on, sizeof(on)) < 0)
732 			{
733 				log_msg(LOG_ERR, "setsockopt(..., IPV6_MTU, ...) failed: %s",
734 					strerror(errno));
735 				return -1;
736 			}
737 			on = 1;
738 # endif
739 		}
740 #endif
741 #if defined(AF_INET)
742 		if (addr->ai_family == AF_INET) {
743 #  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
744 			int action = IP_PMTUDISC_DONT;
745 			if (setsockopt(nsd->udp[i].s, IPPROTO_IP,
746 				IP_MTU_DISCOVER, &action, sizeof(action)) < 0)
747 			{
748 				log_msg(LOG_ERR, "setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
749 					strerror(errno));
750 				return -1;
751 			}
752 #  elif defined(IP_DONTFRAG)
753 			int off = 0;
754 			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_DONTFRAG,
755 				&off, sizeof(off)) < 0)
756 			{
757 				log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
758 					strerror(errno));
759 				return -1;
760 			}
761 #  endif
762 		}
763 #endif
764 		/* set it nonblocking */
765 		/* otherwise, on OSes with thundering herd problems, the
766 		   UDP recv could block NSD after select returns readable. */
767 		if (fcntl(nsd->udp[i].s, F_SETFL, O_NONBLOCK) == -1) {
768 			log_msg(LOG_ERR, "cannot fcntl udp: %s", strerror(errno));
769 		}
770 
771 		/* Bind it... */
772 		if (nsd->options->ip_freebind) {
773 #ifdef IP_FREEBIND
774 			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) < 0) {
775 				log_msg(LOG_ERR, "setsockopt(...,IP_FREEBIND, ...) failed for udp: %s",
776 					strerror(errno));
777 			}
778 #endif /* IP_FREEBIND */
779 		}
780 
781 		if (nsd->options->ip_transparent) {
782 #ifdef IP_TRANSPARENT
783 			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)) < 0) {
784 				log_msg(LOG_ERR, "setsockopt(...,IP_TRANSPARENT, ...) failed for udp: %s",
785 					strerror(errno));
786 			}
787 #endif /* IP_TRANSPARENT */
788 #ifdef SO_BINDANY
789 			if (setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_BINDANY, &on, sizeof(on)) < 0) {
790 				log_msg(LOG_ERR, "setsockopt(...,SO_BINDANY, ...) failed for udp: %s",
791 					strerror(errno));
792 			}
793 #endif /* SO_BINDANY */
794 		}
795 
796 		if (
797 			bind(nsd->udp[i].s, (struct sockaddr *) addr->ai_addr, addr->ai_addrlen) != 0) {
798 			log_msg(LOG_ERR, "can't bind udp socket: %s", strerror(errno));
799 			return -1;
800 		}
801 	}
802 
803 	/* TCP */
804 
805 	/* Make a socket... */
806 	for (i = from; i < to; i++) {
807 		/* for reuseports copy socket specs of first entries */
808 		addr = nsd->tcp[i%nsd->ifs].addr;
809 		if (!addr) {
810 			nsd->tcp[i].s = -1;
811 			continue;
812 		}
813 		nsd->tcp[i].fam = (int)addr->ai_family;
814 		/* no new TCP sockets for extra reuseport rounds: share the fd of the first-round socket */
815 		if(i >= nsd->ifs) {
816 			nsd->tcp[i].s = nsd->tcp[i%nsd->ifs].s;
817 			continue;
818 		}
819 		if ((nsd->tcp[i].s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
820 #if defined(INET6)
821 			if (addr->ai_family == AF_INET6 &&
822 				errno == EAFNOSUPPORT && nsd->grab_ip6_optional) {
823 				log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: not supported");
824 				continue;
825 			}
826 #endif /* INET6 */
827 			log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
828 			return -1;
829 		}
830 
831 #ifdef SO_REUSEPORT
832 		if(nsd->reuseport && *reuseport_works &&
833 			setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_REUSEPORT,
834 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
835 			if(verbosity >= 3
836 #ifdef ENOPROTOOPT
837 				|| errno != ENOPROTOOPT
838 #endif
839 				)
840 			    log_msg(LOG_ERR, "setsockopt(..., SO_REUSEPORT, "
841 				"...) failed: %s", strerror(errno));
842 			*reuseport_works = 0;
843 		}
844 #endif /* SO_REUSEPORT */
845 #ifdef	SO_REUSEADDR
846 		if (setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0) {
847 			log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s", strerror(errno));
848 		}
849 #endif /* SO_REUSEADDR */
850 
851 #if defined(INET6)
852 		if (addr->ai_family == AF_INET6) {
853 # if defined(IPV6_V6ONLY)
854 			if (setsockopt(nsd->tcp[i].s, IPPROTO_IPV6, IPV6_V6ONLY,
855 				&on, sizeof(on)) < 0) {
856 				log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed: %s", strerror(errno));
857 				return -1;
858 			}
859 # endif
860 # if defined(IPV6_USE_MIN_MTU)
861 			/*
862 			 * Use minimum MTU to minimize delays learning working
863 			 * PMTU when communicating through a tunnel.
864 			 */
865 			if (setsockopt(nsd->tcp[i].s,
866 				       IPPROTO_IPV6, IPV6_USE_MIN_MTU,
867 				       &on, sizeof(on)) < 0) {
868 				log_msg(LOG_ERR, "setsockopt(..., IPV6_USE_MIN_MTU, ...) failed: %s", strerror(errno));
869 				return -1;
870 			}
871 # elif defined(IPV6_MTU)
872 			/*
873 			 * On Linux, PMTUD is disabled by default for datagrams,
874 			 * so set the MTU to the minimum MTU for the same effect.
875 			 */
876 			on = IPV6_MIN_MTU;
877 			if (setsockopt(nsd->tcp[i].s, IPPROTO_IPV6, IPV6_MTU,
878 				&on, sizeof(on)) < 0) {
879 				log_msg(LOG_ERR, "setsockopt(..., IPV6_MTU, ...) failed: %s", strerror(errno));
880 				return -1;
881 			}
882 			on = 1;
883 # endif
884 		}
885 #endif
886 		/* set maximum segment size to tcp socket */
887 		if(nsd->tcp_mss > 0) {
888 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
889 			if(setsockopt(nsd->tcp[i].s, IPPROTO_TCP, TCP_MAXSEG,
890 					(void*)&nsd->tcp_mss,
891 					sizeof(nsd->tcp_mss)) < 0) {
892 				log_msg(LOG_ERR,
893 					"setsockopt(...,TCP_MAXSEG,...)"
894 					" failed for tcp: %s", strerror(errno));
895 			}
896 #else
897 			log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
898 #endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
899 		}
900 
901 		/* set it nonblocking */
902 		/* (StevensUNP p463), if tcp listening socket is blocking, then
903 		   it may block in accept, even if select() says readable. */
904 		if (fcntl(nsd->tcp[i].s, F_SETFL, O_NONBLOCK) == -1) {
905 			log_msg(LOG_ERR, "cannot fcntl tcp: %s", strerror(errno));
906 		}
907 
908 		/* Bind it... */
909 		if (nsd->options->ip_freebind) {
910 #ifdef IP_FREEBIND
911 			if (setsockopt(nsd->tcp[i].s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) < 0) {
912 				log_msg(LOG_ERR, "setsockopt(...,IP_FREEBIND, ...) failed for tcp: %s",
913 					strerror(errno));
914 			}
915 #endif /* IP_FREEBIND */
916 		}
917 
918 		if (nsd->options->ip_transparent) {
919 #ifdef IP_TRANSPARENT
920 			if (setsockopt(nsd->tcp[i].s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)) < 0) {
921 				log_msg(LOG_ERR, "setsockopt(...,IP_TRANSPARENT, ...) failed for tcp: %s",
922 					strerror(errno));
923 			}
924 #endif /* IP_TRANSPARENT */
925 #ifdef SO_BINDANY
926 			if (setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_BINDANY, &on, sizeof(on)) < 0) {
927 				log_msg(LOG_ERR, "setsockopt(...,SO_BINDANY, ...) failed for tcp: %s",
928 					strerror(errno));
929 			}
930 #endif /* SO_BINDANY */
931 		}
932 
933 		if(
934 			bind(nsd->tcp[i].s, (struct sockaddr *) addr->ai_addr, addr->ai_addrlen) != 0) {
935 			log_msg(LOG_ERR, "can't bind tcp socket: %s", strerror(errno));
936 			return -1;
937 		}
938 
939 		/* Listen to it... */
940 		if (listen(nsd->tcp[i].s, TCP_BACKLOG) == -1) {
941 			log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
942 			return -1;
943 		}
944 	}
945 
946 	return 0;
947 }
948 
949 /*
950  * Initialize the server, reuseport, create and bind the sockets.
951  */
952 int
953 server_init(struct nsd *nsd)
954 {
955 	int reuseport_successful = 1; /* see if reuseport works in OS */
956 	if(nsd->reuseport) {
957 		/* increase the size of the udp and tcp interface arrays,
958 		 * there are going to be separate interface file descriptors
959 		 * for every server instance */
960 		nsd->udp = xrealloc(nsd->udp, (nsd->ifs*nsd->reuseport)*
961 			sizeof(*nsd->udp));
962 		nsd->tcp = xrealloc(nsd->tcp, (nsd->ifs*nsd->reuseport)*
963 			sizeof(*nsd->tcp));
964 		memset(&nsd->udp[nsd->ifs], 0, sizeof(*nsd->udp)*
965 			(nsd->ifs*(nsd->reuseport-1)));
966 		memset(&nsd->tcp[nsd->ifs], 0, sizeof(*nsd->tcp)*
967 			(nsd->ifs*(nsd->reuseport-1)));
968 	}
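	/* e.g. with 2 configured interfaces and reuseport 4, the arrays
	 * grow to 8 sockets; server_init_ifs() gives entry i the address
	 * of entry i % nsd->ifs, so every server instance gets its own
	 * descriptor bound to the same set of addresses */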
969 
970 	/* open the server interface ports */
971 	if(server_init_ifs(nsd, 0, nsd->ifs, &reuseport_successful) == -1)
972 		return -1;
973 
974 	/* continue to open the remaining reuseport ports */
975 	if(nsd->reuseport && reuseport_successful) {
976 		if(server_init_ifs(nsd, nsd->ifs, nsd->ifs*nsd->reuseport,
977 			&reuseport_successful) == -1)
978 			return -1;
979 		nsd->ifs *= nsd->reuseport;
980 	} else {
981 		nsd->reuseport = 0;
982 	}
983 	return 0;
984 }
985 
986 /*
987  * Prepare the server for take off.
988  *
989  */
990 int
991 server_prepare(struct nsd *nsd)
992 {
993 #ifdef RATELIMIT
994 	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
995 #ifdef HAVE_ARC4RANDOM
996 	hash_set_raninit(arc4random());
997 #else
998 	uint32_t v = getpid() ^ time(NULL);
999 	srandom((unsigned long)v);
1000 	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
1001 		hash_set_raninit(v);
1002 	else	hash_set_raninit(random());
1003 #endif
1004 	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
1005 		nsd->options->rrl_ratelimit,
1006 		nsd->options->rrl_whitelist_ratelimit,
1007 		nsd->options->rrl_slip,
1008 		nsd->options->rrl_ipv4_prefix_length,
1009 		nsd->options->rrl_ipv6_prefix_length);
1010 #endif /* RATELIMIT */
1011 
1012 	/* Open the database... */
1013 	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
1014 		log_msg(LOG_ERR, "unable to open the database %s: %s",
1015 			nsd->dbfile, strerror(errno));
1016 		unlink(nsd->task[0]->fname);
1017 		unlink(nsd->task[1]->fname);
1018 #ifdef USE_ZONE_STATS
1019 		unlink(nsd->zonestatfname[0]);
1020 		unlink(nsd->zonestatfname[1]);
1021 #endif
1022 		xfrd_del_tempdir(nsd);
1023 		return -1;
1024 	}
1025 	/* check if zone files have been modified */
1026 	/* NULL for taskudb because we send soainfo in a moment, batched up,
1027 	 * for all zones */
1028 	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
1029 		nsd->options->database[0] == 0))
1030 		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
1031 	zonestatid_tree_set(nsd);
1032 
1033 	compression_table_capacity = 0;
1034 	initialize_dname_compression_tables(nsd);
1035 
1036 #ifdef	BIND8_STATS
1037 	/* Initialize times... */
1038 	time(&nsd->st.boot);
1039 	set_bind8_alarm(nsd);
1040 #endif /* BIND8_STATS */
1041 
1042 	return 0;
1043 }
1044 
1045 /*
1046  * Fork the required number of servers.
1047  */
1048 static int
1049 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
1050 	int* xfrd_sock_p)
1051 {
1052 	size_t i;
1053 
1054 	/* Start all child servers initially.  */
1055 	for (i = 0; i < nsd->child_count; ++i) {
1056 		nsd->children[i].pid = 0;
1057 	}
1058 
1059 	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
1060 }
1061 
1062 void
1063 server_close_all_sockets(struct nsd_socket sockets[], size_t n)
1064 {
1065 	size_t i;
1066 
1067 	/* Close all the sockets... */
1068 	for (i = 0; i < n; ++i) {
1069 		if (sockets[i].s != -1) {
1070 			close(sockets[i].s);
1071 			if(sockets[i].addr)
1072 				freeaddrinfo(sockets[i].addr);
1073 			sockets[i].s = -1;
1074 		}
1075 	}
1076 }
1077 
1078 /*
1079  * Close the sockets, shutdown the server and exit.
1080  * Does not return.
1081  *
1082  */
1083 void
1084 server_shutdown(struct nsd *nsd)
1085 {
1086 	size_t i;
1087 
1088 	server_close_all_sockets(nsd->udp, nsd->ifs);
1089 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1090 	/* CHILD: close command channel to parent */
1091 	if(nsd->this_child && nsd->this_child->parent_fd != -1)
1092 	{
1093 		close(nsd->this_child->parent_fd);
1094 		nsd->this_child->parent_fd = -1;
1095 	}
1096 	/* SERVER: close command channels to children */
1097 	if(!nsd->this_child)
1098 	{
1099 		for(i=0; i < nsd->child_count; ++i)
1100 			if(nsd->children[i].child_fd != -1)
1101 			{
1102 				close(nsd->children[i].child_fd);
1103 				nsd->children[i].child_fd = -1;
1104 			}
1105 	}
1106 
1107 	tsig_finalize();
1108 #ifdef HAVE_SSL
1109 	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
1110 #endif
1111 
1112 #ifdef MEMCLEAN /* OS collects memory pages */
1113 #ifdef RATELIMIT
1114 	rrl_mmap_deinit_keep_mmap();
1115 #endif
1116 #ifdef USE_DNSTAP
1117 	dt_collector_destroy(nsd->dt_collector, nsd);
1118 #endif
1119 	udb_base_free_keep_mmap(nsd->task[0]);
1120 	udb_base_free_keep_mmap(nsd->task[1]);
1121 	namedb_close_udb(nsd->db); /* keeps mmap */
1122 	namedb_close(nsd->db);
1123 	nsd_options_destroy(nsd->options);
1124 	region_destroy(nsd->region);
1125 #endif
1126 	log_finalize();
1127 	exit(0);
1128 }
1129 
1130 void
1131 server_prepare_xfrd(struct nsd* nsd)
1132 {
1133 	char tmpfile[256];
1134 	/* create task mmaps */
1135 	nsd->mytask = 0;
1136 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
1137 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1138 	nsd->task[0] = task_file_create(tmpfile);
1139 	if(!nsd->task[0]) {
1140 #ifdef USE_ZONE_STATS
1141 		unlink(nsd->zonestatfname[0]);
1142 		unlink(nsd->zonestatfname[1]);
1143 #endif
1144 		xfrd_del_tempdir(nsd);
1145 		exit(1);
1146 	}
1147 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
1148 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1149 	nsd->task[1] = task_file_create(tmpfile);
1150 	if(!nsd->task[1]) {
1151 		unlink(nsd->task[0]->fname);
1152 #ifdef USE_ZONE_STATS
1153 		unlink(nsd->zonestatfname[0]);
1154 		unlink(nsd->zonestatfname[1]);
1155 #endif
1156 		xfrd_del_tempdir(nsd);
1157 		exit(1);
1158 	}
1159 	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
1160 	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
1161 	/* create xfrd listener structure */
1162 	nsd->xfrd_listener = region_alloc(nsd->region,
1163 		sizeof(netio_handler_type));
1164 	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
1165 		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
1166 	nsd->xfrd_listener->fd = -1;
1167 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
1168 		nsd;
1169 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
1170 		xfrd_tcp_create(nsd->region, QIOBUFSZ);
1171 }
1172 
1173 
1174 void
1175 server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
1176 {
1177 	pid_t pid;
1178 	int sockets[2] = {0,0};
1179 	struct ipc_handler_conn_data *data;
1180 
1181 	if(nsd->xfrd_listener->fd != -1)
1182 		close(nsd->xfrd_listener->fd);
1183 	if(del_db) {
1184 		/* recreate taskdb that xfrd was using, it may be corrupt */
1185 		/* we (or reload) use nsd->mytask, and xfrd uses the other */
1186 		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
1187 		nsd->task[1-nsd->mytask]->fname = NULL;
1188 		/* free alloc already, so udb does not shrink itself */
1189 		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
1190 		nsd->task[1-nsd->mytask]->alloc = NULL;
1191 		udb_base_free(nsd->task[1-nsd->mytask]);
1192 		/* create new file, overwrite the old one */
1193 		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
1194 		free(tmpfile);
1195 	}
1196 	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
1197 		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
1198 		return;
1199 	}
1200 	pid = fork();
1201 	switch (pid) {
1202 	case -1:
1203 		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
1204 		break;
1205 	default:
1206 		/* PARENT: close first socket, use second one */
1207 		close(sockets[0]);
1208 		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
1209 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1210 		}
1211 		if(del_db) xfrd_free_namedb(nsd);
1212 		/* use the other task, not the one I am using: if xfrd died and
1213 		 * was restarted, the reload is using nsd->mytask */
1214 		nsd->mytask = 1 - nsd->mytask;
1215 		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
1216 		/* NOTREACHED */
1217 		break;
1218 	case 0:
1219 		/* CHILD: close second socket, use first one */
1220 		close(sockets[1]);
1221 		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
1222 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1223 		}
1224 		nsd->xfrd_listener->fd = sockets[0];
1225 		break;
1226 	}
1227 	/* server-parent only */
1228 	nsd->xfrd_listener->timeout = NULL;
1229 	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
1230 	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
1231 	/* clear ongoing ipc reads */
1232 	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
1233 	data->conn->is_reading = 0;
1234 }
1235 
1236 /** add all soainfo to taskdb */
1237 static void
1238 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
1239 {
1240 	struct radnode* n;
1241 	udb_ptr task_last; /* last task, mytask is empty so NULL */
1242 	/* add all SOA INFO to mytask */
1243 	udb_ptr_init(&task_last, taskudb);
1244 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
1245 		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
1246 	}
1247 	udb_ptr_unlink(&task_last, taskudb);
1248 }
1249 
1250 void
1251 server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
1252 {
1253 	/* normally this exchanges SOA data with xfrd and gets expire data back:
1254 	 *   the parent fills one taskdb with SOAs, xfrd fills the other with
1255 	 *   expires, then they swap and process.
1256 	 * shortsoa: xfrd crashed and was restarted, and one taskdb may be in
1257 	 *   use by a reload.  Fill the SOAs into xfrd's taskdb and hand it over.
1258 	 *   expire notifications can be sent back via a normal reload later
1259 	 *   (xfrd will wait for current running reload to finish if any).
1260 	 */
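	/*
	 * Sequence sketch for the normal (!shortsoa) case, as implemented
	 * below:
	 *   1. parent fills mytask with SOA INFO for all zones
	 *   2. xfrd -> parent: NSD_RELOAD      (xfrd's task is ready)
	 *   3. parent -> xfrd: NSD_RELOAD_DONE (plus the reload pid)
	 *   4. parent swaps mytask and processes the expire entries
	 *   5. parent -> xfrd: NSD_RELOAD_DONE (task emptied)
	 */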
1261 	sig_atomic_t cmd = 0;
1262 	pid_t mypid;
1263 	int xfrd_sock = nsd->xfrd_listener->fd;
1264 	struct udb_base* taskudb = nsd->task[nsd->mytask];
1265 	udb_ptr t;
1266 	if(!shortsoa) {
1267 		if(nsd->signal_hint_shutdown) {
1268 		shutdown:
1269 			log_msg(LOG_WARNING, "signal received, shutting down...");
1270 			server_close_all_sockets(nsd->udp, nsd->ifs);
1271 			server_close_all_sockets(nsd->tcp, nsd->ifs);
1272 #ifdef HAVE_SSL
1273 			daemon_remote_close(nsd->rc);
1274 #endif
1275 			/* Unlink it if possible... */
1276 			unlinkpid(nsd->pidfile);
1277 			unlink(nsd->task[0]->fname);
1278 			unlink(nsd->task[1]->fname);
1279 #ifdef USE_ZONE_STATS
1280 			unlink(nsd->zonestatfname[0]);
1281 			unlink(nsd->zonestatfname[1]);
1282 #endif
1283 			/* write the nsd.db to disk, wait for it to complete */
1284 			udb_base_sync(nsd->db->udb, 1);
1285 			udb_base_close(nsd->db->udb);
1286 			server_shutdown(nsd);
1287 			exit(0);
1288 		}
1289 	}
1290 	if(shortsoa) {
1291 		/* put SOA in xfrd task because mytask may be in use */
1292 		taskudb = nsd->task[1-nsd->mytask];
1293 	}
1294 
1295 	add_all_soa_to_task(nsd, taskudb);
1296 	if(!shortsoa) {
1297 		/* wait for xfrd to signal task is ready, RELOAD signal */
1298 		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
1299 			cmd != NSD_RELOAD) {
1300 			log_msg(LOG_ERR, "did not get start signal from xfrd");
1301 			exit(1);
1302 		}
1303 		if(nsd->signal_hint_shutdown) {
1304 			goto shutdown;
1305 		}
1306 	}
1307 	/* give xfrd our task, signal it with RELOAD_DONE */
1308 	task_process_sync(taskudb);
1309 	cmd = NSD_RELOAD_DONE;
1310 	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1311 		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1312 			(int)nsd->pid, strerror(errno));
1313 	}
1314 	mypid = getpid();
1315 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1316 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1317 			strerror(errno));
1318 	}
1319 
1320 	if(!shortsoa) {
1321 		/* process the xfrd task works (expiry data) */
1322 		nsd->mytask = 1 - nsd->mytask;
1323 		taskudb = nsd->task[nsd->mytask];
1324 		task_remap(taskudb);
1325 		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
1326 		while(!udb_ptr_is_null(&t)) {
1327 			task_process_expire(nsd->db, TASKLIST(&t));
1328 			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
1329 		}
1330 		udb_ptr_unlink(&t, taskudb);
1331 		task_clear(taskudb);
1332 
1333 		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
1334 		cmd = NSD_RELOAD_DONE;
1335 		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1336 			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1337 				(int)nsd->pid, strerror(errno));
1338 		}
1339 	}
1340 }
1341 
1342 /* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
1343 ssize_t
1344 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
1345 {
1346 	uint8_t* buf = (uint8_t*) p;
1347 	ssize_t total = 0;
1348 	struct pollfd fd;
1349 	memset(&fd, 0, sizeof(fd));
1350 	fd.fd = s;
1351 	fd.events = POLLIN;
1352 
1353 	while( total < sz) {
1354 		ssize_t ret;
1355 		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
1356 		if(ret == -1) {
1357 			if(errno == EAGAIN)
1358 				/* blocking read */
1359 				continue;
1360 			if(errno == EINTR) {
1361 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
1362 					return -1;
1363 				/* other signals can be handled later */
1364 				continue;
1365 			}
1366 			/* some error */
1367 			return -1;
1368 		}
1369 		if(ret == 0) {
1370 			/* operation timed out */
1371 			return -2;
1372 		}
1373 		ret = read(s, buf+total, sz-total);
1374 		if(ret == -1) {
1375 			if(errno == EAGAIN)
1376 				/* blocking read */
1377 				continue;
1378 			if(errno == EINTR) {
1379 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
1380 					return -1;
1381 				/* other signals can be handled later */
1382 				continue;
1383 			}
1384 			/* some error */
1385 			return -1;
1386 		}
1387 		if(ret == 0) {
1388 			/* closed connection! */
1389 			return 0;
1390 		}
1391 		total += ret;
1392 	}
1393 	return total;
1394 }
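/*
 * Usage sketch: read one command word, waiting at most
 * RELOAD_SYNC_TIMEOUT seconds, as server_reload() does below:
 *
 *	sig_atomic_t cmd;
 *	ssize_t ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
 *		RELOAD_SYNC_TIMEOUT);
 *	if(ret == -2)
 *		... timed out, retry ...
 */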
1395 
1396 static void
1397 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
1398 {
1399 	sig_atomic_t cmd = NSD_QUIT_SYNC;
1400 	udb_ptr t, next;
1401 	udb_base* u = nsd->task[nsd->mytask];
1402 	udb_ptr_init(&next, u);
1403 	udb_ptr_new(&t, u, udb_base_get_userdata(u));
1404 	udb_base_set_userdata(u, 0);
1405 	while(!udb_ptr_is_null(&t)) {
1406 		/* store next in list so this one can be deleted or reused */
1407 		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
1408 		udb_rptr_zero(&TASKLIST(&t)->next, u);
1409 
1410 		/* process task t */
1411 		/* append results for task t and update last_task */
1412 		task_process_in_reload(nsd, u, last_task, &t);
1413 
1414 		/* go to next */
1415 		udb_ptr_set_ptr(&t, u, &next);
1416 
1417 		/* if the parent has quit, we must quit too, poll the fd for cmds */
1418 		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
1419 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
1420 			if(cmd == NSD_QUIT) {
1421 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
1422 				/* sync to disk (if needed) */
1423 				udb_base_sync(nsd->db->udb, 0);
1424 				/* unlink files of remainder of tasks */
1425 				while(!udb_ptr_is_null(&t)) {
1426 					if(TASKLIST(&t)->task_type == task_apply_xfr) {
1427 						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
1428 					}
1429 					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
1430 				}
1431 				udb_ptr_unlink(&t, u);
1432 				udb_ptr_unlink(&next, u);
1433 				exit(0);
1434 			}
1435 		}
1436 
1437 	}
1438 	udb_ptr_unlink(&t, u);
1439 	udb_ptr_unlink(&next, u);
1440 }
1441 
1442 #ifdef BIND8_STATS
1443 static void
1444 parent_send_stats(struct nsd* nsd, int cmdfd)
1445 {
1446 	size_t i;
1447 	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
1448 		log_msg(LOG_ERR, "could not write stats to reload");
1449 		return;
1450 	}
1451 	for(i=0; i<nsd->child_count; i++)
1452 		if(!write_socket(cmdfd, &nsd->children[i].query_count,
1453 			sizeof(stc_type))) {
1454 			log_msg(LOG_ERR, "could not write stats to reload");
1455 			return;
1456 		}
1457 }
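/*
 * Handoff layout sketch (writer and reader must agree):
 *   struct nsdst            aggregate statistics
 *   stc_type[child_count]   per-child query counts
 * parent_send_stats() writes this; reload_do_stats() reads it back and
 * appends it to the task list for xfrd.
 */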
1458 
1459 static void
1460 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
1461 {
1462 	struct nsdst s;
1463 	stc_type* p;
1464 	size_t i;
1465 	if(block_read(nsd, cmdfd, &s, sizeof(s),
1466 		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
1467 		log_msg(LOG_ERR, "could not read stats from oldpar");
1468 		return;
1469 	}
1470 	s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
1471 	s.db_mem = region_get_mem(nsd->db->region);
1472 	p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
1473 		nsd->child_count);
1474 	if(!p) return;
1475 	for(i=0; i<nsd->child_count; i++) {
1476 		if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!=
1477 			sizeof(stc_type))
1478 			return;
1479 	}
1480 }
1481 #endif /* BIND8_STATS */
1482 
1483 /*
1484  * Reload the database, stop the parent, re-fork the children, and
1485  * continue as server_main.
1486  */
1487 static void
1488 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
1489 	int cmdsocket)
1490 {
1491 	pid_t mypid;
1492 	sig_atomic_t cmd = NSD_QUIT_SYNC;
1493 	int ret;
1494 	udb_ptr last_task;
1495 	struct sigaction old_sigchld, ign_sigchld;
1496 	/* ignore SIGCHLD from the previous server_main that used this pid */
1497 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
1498 	ign_sigchld.sa_handler = SIG_IGN;
1499 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
1500 
1501 	/* see what tasks we got from xfrd */
1502 	task_remap(nsd->task[nsd->mytask]);
1503 	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
1504 	udb_compact_inhibited(nsd->db->udb, 1);
1505 	reload_process_tasks(nsd, &last_task, cmdsocket);
1506 	udb_compact_inhibited(nsd->db->udb, 0);
1507 	udb_compact(nsd->db->udb);
1508 
1509 #ifndef NDEBUG
1510 	if(nsd_debug_level >= 1)
1511 		region_log_stats(nsd->db->region);
1512 #endif /* NDEBUG */
1513 	/* sync to disk (if needed) */
1514 	udb_base_sync(nsd->db->udb, 0);
1515 
1516 	initialize_dname_compression_tables(nsd);
1517 
1518 #ifdef BIND8_STATS
1519 	/* Restart dumping stats if required.  */
1520 	time(&nsd->st.boot);
1521 	set_bind8_alarm(nsd);
1522 #endif
1523 #ifdef USE_ZONE_STATS
1524 	server_zonestat_realloc(nsd); /* realloc for new children */
1525 	server_zonestat_switch(nsd);
1526 #endif
1527 
1528 	/* listen for the signals of failed children again */
1529 	sigaction(SIGCHLD, &old_sigchld, NULL);
1530 	/* Start new child processes */
1531 	if (server_start_children(nsd, server_region, netio, &nsd->
1532 		xfrd_listener->fd) != 0) {
1533 		send_children_quit(nsd);
1534 		exit(1);
1535 	}
1536 
1537 	/* if the parent has quit, we must quit too, poll the fd for cmds */
1538 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
1539 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
1540 		if(cmd == NSD_QUIT) {
1541 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
1542 			send_children_quit(nsd);
1543 			exit(0);
1544 		}
1545 	}
1546 
1547 	/* Send quit command to parent: blocking, wait for receipt. */
1548 	do {
1549 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
1550 		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
1551 		{
1552 			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
1553 				strerror(errno));
1554 		}
1555 		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
1556 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
1557 		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
1558 			RELOAD_SYNC_TIMEOUT);
1559 		if(ret == -2) {
1560 			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
1561 		}
1562 	} while (ret == -2);
1563 	if(ret == -1) {
1564 		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
1565 			strerror(errno));
1566 	}
1567 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
1568 	if(cmd == NSD_QUIT) {
1569 		/* small race condition possible here, parent got quit cmd. */
1570 		send_children_quit(nsd);
1571 		exit(1);
1572 	}
1573 	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
1574 #ifdef BIND8_STATS
1575 	reload_do_stats(cmdsocket, nsd, &last_task);
1576 #endif
1577 	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
1578 	task_process_sync(nsd->task[nsd->mytask]);
1579 #ifdef USE_ZONE_STATS
1580 	server_zonestat_realloc(nsd); /* realloc for next children */
1581 #endif
1582 
1583 	/* send soainfo to the xfrd process, signal it that reload is done,
1584 	 * it picks up the taskudb */
1585 	cmd = NSD_RELOAD_DONE;
1586 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
1587 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
1588 			strerror(errno));
1589 	}
1590 	mypid = getpid();
1591 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1592 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1593 			strerror(errno));
1594 	}
1595 
1596 	/* try to reopen file */
1597 	if (nsd->file_rotation_ok)
1598 		log_reopen(nsd->log_filename, 1);
1599 	/* exit reload, continue as new server_main */
1600 }
1601 
1602 /*
1603  * Get the mode depending on the signal hints that have been received.
1604  * Multiple signal hints can be received and will be handled in turn.
1605  */
1606 static sig_atomic_t
1607 server_signal_mode(struct nsd *nsd)
1608 {
1609 	if(nsd->signal_hint_quit) {
1610 		nsd->signal_hint_quit = 0;
1611 		return NSD_QUIT;
1612 	}
1613 	else if(nsd->signal_hint_shutdown) {
1614 		nsd->signal_hint_shutdown = 0;
1615 		return NSD_SHUTDOWN;
1616 	}
1617 	else if(nsd->signal_hint_child) {
1618 		nsd->signal_hint_child = 0;
1619 		return NSD_REAP_CHILDREN;
1620 	}
1621 	else if(nsd->signal_hint_reload) {
1622 		nsd->signal_hint_reload = 0;
1623 		return NSD_RELOAD;
1624 	}
1625 	else if(nsd->signal_hint_reload_hup) {
1626 		nsd->signal_hint_reload_hup = 0;
1627 		return NSD_RELOAD_REQ;
1628 	}
1629 	else if(nsd->signal_hint_stats) {
1630 		nsd->signal_hint_stats = 0;
1631 #ifdef BIND8_STATS
1632 		set_bind8_alarm(nsd);
1633 #endif
1634 		return NSD_STATS;
1635 	}
1636 	else if(nsd->signal_hint_statsusr) {
1637 		nsd->signal_hint_statsusr = 0;
1638 		return NSD_STATS;
1639 	}
1640 	return NSD_RUN;
1641 }
1642 
1643 /*
1644  * The main server simply waits for signals and child processes to
1645  * terminate.  Child processes are restarted as necessary.
1646  */
1647 void
1648 server_main(struct nsd *nsd)
1649 {
1650 	region_type *server_region = region_create(xalloc, free);
1651 	netio_type *netio = netio_create(server_region);
1652 	netio_handler_type reload_listener;
1653 	int reload_sockets[2] = {-1, -1};
1654 	struct timespec timeout_spec;
1655 	int status;
1656 	pid_t child_pid;
1657 	pid_t reload_pid = -1;
1658 	sig_atomic_t mode;
1659 
1660 	/* Ensure we are the main process */
1661 	assert(nsd->server_kind == NSD_SERVER_MAIN);
1662 
1663 	/* Add listener for the XFRD process */
1664 	netio_add_handler(netio, nsd->xfrd_listener);
1665 
1666 	/* Start the child processes that handle incoming queries */
1667 	if (server_start_children(nsd, server_region, netio,
1668 		&nsd->xfrd_listener->fd) != 0) {
1669 		send_children_quit(nsd);
1670 		exit(1);
1671 	}
1672 	reload_listener.fd = -1;
1673 
1674 	/* This_child MUST be 0, because this is the parent process */
1675 	assert(nsd->this_child == 0);
1676 
1677 	/* Run the server until we get a shutdown signal */
1678 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
1679 		/* Did we receive a signal that changes our mode? */
1680 		if(mode == NSD_RUN) {
1681 			nsd->mode = mode = server_signal_mode(nsd);
1682 		}
1683 
1684 		switch (mode) {
1685 		case NSD_RUN:
1686 			/* see if any child processes terminated */
1687 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
1688 				int is_child = delete_child_pid(nsd, child_pid);
1689 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
1690 					if(nsd->children[is_child].child_fd == -1)
1691 						nsd->children[is_child].has_exited = 1;
1692 					parent_check_all_children_exited(nsd);
1693 				} else if(is_child != -1) {
1694 					log_msg(LOG_WARNING,
1695 					       "server %d died unexpectedly with status %d, restarting",
1696 					       (int) child_pid, status);
1697 					restart_child_servers(nsd, server_region, netio,
1698 						&nsd->xfrd_listener->fd);
1699 				} else if (child_pid == reload_pid) {
1700 					sig_atomic_t cmd = NSD_RELOAD_DONE;
1701 					pid_t mypid;
1702 					log_msg(LOG_WARNING,
1703 					       "Reload process %d failed with status %d, continuing with old database",
1704 					       (int) child_pid, status);
1705 					reload_pid = -1;
1706 					if(reload_listener.fd != -1) close(reload_listener.fd);
1707 					reload_listener.fd = -1;
1708 					reload_listener.event_types = NETIO_EVENT_NONE;
1709 					task_process_sync(nsd->task[nsd->mytask]);
1710 					/* inform xfrd reload attempt ended */
1711 					if(!write_socket(nsd->xfrd_listener->fd,
1712 						&cmd, sizeof(cmd))) {
1713 						log_msg(LOG_ERR, "problems "
1714 						  "sending SOAEND to xfrd: %s",
1715 						  strerror(errno));
1716 					}
1717 					mypid = getpid();
1718 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1719 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1720 							strerror(errno));
1721 					}
1722 				} else if(status != 0) {
1723 					/* check the status, because we also
1724 					 * reap the old server main here
1725 					 * (reload is its process parent)
1726 					 * and older server processes that
1727 					 * exit after a reload */
1728 					log_msg(LOG_WARNING,
1729 					       "process %d terminated with status %d",
1730 					       (int) child_pid, status);
1731 				}
1732 			}
1733 			if (child_pid == -1) {
1734 				if (errno == EINTR) {
1735 					continue;
1736 				}
1737 				if (errno != ECHILD)
1738 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
1739 			}
1740 			if (nsd->mode != NSD_RUN)
1741 				break;
1742 
1743 			/* timeout to collect child processes, in case no SIGCHLD arrives */
1744 			timeout_spec.tv_sec = 60;
1745 			timeout_spec.tv_nsec = 0;
1746 
1747 			/* listen on ports, timeout for collecting terminated children */
1748 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
1749 				if (errno != EINTR) {
1750 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
1751 				}
1752 			}
1753 			if(nsd->restart_children) {
1754 				restart_child_servers(nsd, server_region, netio,
1755 					&nsd->xfrd_listener->fd);
1756 				nsd->restart_children = 0;
1757 			}
1758 			if(nsd->reload_failed) {
1759 				sig_atomic_t cmd = NSD_RELOAD_DONE;
1760 				pid_t mypid;
1761 				nsd->reload_failed = 0;
1762 				log_msg(LOG_WARNING,
1763 				       "Reload process %d failed, continuing with old database",
1764 				       (int) reload_pid);
1765 				reload_pid = -1;
1766 				if(reload_listener.fd != -1) close(reload_listener.fd);
1767 				reload_listener.fd = -1;
1768 				reload_listener.event_types = NETIO_EVENT_NONE;
1769 				task_process_sync(nsd->task[nsd->mytask]);
1770 				/* inform xfrd reload attempt ended */
1771 				if(!write_socket(nsd->xfrd_listener->fd,
1772 					&cmd, sizeof(cmd))) {
1773 					log_msg(LOG_ERR, "problems "
1774 					  "sending SOAEND to xfrd: %s",
1775 					  strerror(errno));
1776 				}
1777 				mypid = getpid();
1778 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1779 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1780 						strerror(errno));
1781 				}
1782 			}
1783 
1784 			break;
1785 		case NSD_RELOAD_REQ: {
1786 			sig_atomic_t cmd = NSD_RELOAD_REQ;
1787 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
1788 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
1789 				"main: ipc send reload_req to xfrd"));
1790 			if(!write_socket(nsd->xfrd_listener->fd,
1791 				&cmd, sizeof(cmd))) {
1792 				log_msg(LOG_ERR, "server_main: could not send "
1793 				"reload_req to xfrd: %s", strerror(errno));
1794 			}
1795 			nsd->mode = NSD_RUN;
1796 			} break;
1797 		case NSD_RELOAD:
1798 			/* Continue to run nsd after reload */
1799 			nsd->mode = NSD_RUN;
1800 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
1801 			if (reload_pid != -1) {
1802 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
1803 				       (int) reload_pid);
1804 				break;
1805 			}
1806 
1807 			/* switch mytask to keep track of which process owns the task file */
1808 			nsd->mytask = 1 - nsd->mytask;
1809 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
1810 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
1811 				reload_pid = -1;
1812 				break;
1813 			}
1814 
1815 			/* Do actual reload */
1816 			reload_pid = fork();
1817 			switch (reload_pid) {
1818 			case -1:
1819 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
1820 				break;
1821 			default:
1822 				/* PARENT */
1823 				close(reload_sockets[0]);
1824 				server_reload(nsd, server_region, netio,
1825 					reload_sockets[1]);
1826 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
1827 				close(reload_sockets[1]);
1828 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
1829 				/* drop stale xfrd ipc data */
1830 				((struct ipc_handler_conn_data*)nsd->
1831 					xfrd_listener->user_data)
1832 					->conn->is_reading = 0;
1833 				reload_pid = -1;
1834 				reload_listener.fd = -1;
1835 				reload_listener.event_types = NETIO_EVENT_NONE;
1836 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
1837 				break;
1838 			case 0:
1839 				/* CHILD */
1840 				/* server_main keeps running until NSD_QUIT_SYNC
1841 				 * is received from the reload process. */
1842 				close(reload_sockets[1]);
1843 				reload_listener.fd = reload_sockets[0];
1844 				reload_listener.timeout = NULL;
1845 				reload_listener.user_data = nsd;
1846 				reload_listener.event_types = NETIO_EVENT_READ;
1847 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
1848 				netio_add_handler(netio, &reload_listener);
1849 				reload_pid = getppid();
1850 				break;
1851 			}
1852 			break;
1853 		case NSD_QUIT_SYNC:
1854 			/* synchronisation of xfrd, parent and reload */
1855 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
1856 				sig_atomic_t cmd = NSD_RELOAD;
1857 				/* stop xfrd ipc writes in progress */
1858 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
1859 					"main: ipc send indication reload"));
1860 				if(!write_socket(nsd->xfrd_listener->fd,
1861 					&cmd, sizeof(cmd))) {
1862 					log_msg(LOG_ERR, "server_main: could not send reload "
1863 					"indication to xfrd: %s", strerror(errno));
1864 				}
1865 				/* wait for ACK from xfrd */
1866 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
1867 				nsd->quit_sync_done = 1;
1868 			}
1869 			nsd->mode = NSD_RUN;
1870 			break;
1871 		case NSD_QUIT:
1872 			/* silent shutdown during reload */
1873 			if(reload_listener.fd != -1) {
1874 				/* acknowledge the quit, to sync reload that we will really quit now */
1875 				sig_atomic_t cmd = NSD_RELOAD;
1876 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
1877 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
1878 					log_msg(LOG_ERR, "server_main: "
1879 						"could not ack quit: %s", strerror(errno));
1880 				}
1881 #ifdef BIND8_STATS
1882 				parent_send_stats(nsd, reload_listener.fd);
1883 #endif /* BIND8_STATS */
1884 				close(reload_listener.fd);
1885 			}
1886 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
1887 			/* only quit children after xfrd has acked */
1888 			send_children_quit(nsd);
1889 
1890 #ifdef MEMCLEAN /* OS collects memory pages */
1891 			region_destroy(server_region);
1892 #endif
1893 			server_shutdown(nsd);
1894 
1895 			/* NOTREACHED */
1896 			break;
1897 		case NSD_SHUTDOWN:
1898 			break;
1899 		case NSD_REAP_CHILDREN:
1900 			/* continue; wait for child in run loop */
1901 			nsd->mode = NSD_RUN;
1902 			break;
1903 		case NSD_STATS:
1904 #ifdef BIND8_STATS
1905 			set_children_stats(nsd);
1906 #endif
1907 			nsd->mode = NSD_RUN;
1908 			break;
1909 		default:
1910 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
1911 			nsd->mode = NSD_RUN;
1912 			break;
1913 		}
1914 	}
1915 	log_msg(LOG_WARNING, "signal received, shutting down...");
1916 
1917 	/* close opened ports to avoid race with restart of nsd */
1918 	server_close_all_sockets(nsd->udp, nsd->ifs);
1919 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1920 #ifdef HAVE_SSL
1921 	daemon_remote_close(nsd->rc);
1922 #endif
1923 	send_children_quit_and_wait(nsd);
1924 
1925 	/* Unlink the pid file and task files, if possible... */
1926 	unlinkpid(nsd->pidfile);
1927 	unlink(nsd->task[0]->fname);
1928 	unlink(nsd->task[1]->fname);
1929 #ifdef USE_ZONE_STATS
1930 	unlink(nsd->zonestatfname[0]);
1931 	unlink(nsd->zonestatfname[1]);
1932 #endif
1933 #ifdef USE_DNSTAP
1934 	dt_collector_close(nsd->dt_collector, nsd);
1935 #endif
1936 
1937 	if(reload_listener.fd != -1) {
1938 		sig_atomic_t cmd = NSD_QUIT;
1939 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
1940 			"main: ipc send quit to reload-process"));
1941 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
1942 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
1943 				strerror(errno));
1944 		}
1945 		fsync(reload_listener.fd);
1946 		close(reload_listener.fd);
1947 		/* wait for reload to finish processing */
1948 		while(1) {
1949 			if(waitpid(reload_pid, NULL, 0) == -1) {
1950 				if(errno == EINTR) continue;
1951 				if(errno == ECHILD) break;
1952 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
1953 					(int)reload_pid, strerror(errno));
1954 			}
1955 			break;
1956 		}
1957 	}
1958 	if(nsd->xfrd_listener->fd != -1) {
1959 		/* complete quit, stop xfrd */
1960 		sig_atomic_t cmd = NSD_QUIT;
1961 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
1962 			"main: ipc send quit to xfrd"));
1963 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
1964 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
1965 				strerror(errno));
1966 		}
1967 		fsync(nsd->xfrd_listener->fd);
1968 		close(nsd->xfrd_listener->fd);
1969 		(void)kill(nsd->pid, SIGTERM);
1970 	}
1971 
1972 #ifdef MEMCLEAN /* OS collects memory pages */
1973 	region_destroy(server_region);
1974 #endif
1975 	/* write the nsd.db to disk, wait for it to complete */
1976 	udb_base_sync(nsd->db->udb, 1);
1977 	udb_base_close(nsd->db->udb);
1978 	server_shutdown(nsd);
1979 }
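
/*
 * Editor's illustrative sketch (disabled; not part of nsd): the
 * socketpair()+fork() reload handshake used in server_main() above.
 * Note the deliberately swapped roles: the parent performs the reload
 * and becomes the new main process, while the child continues as the
 * old main until it is told to quit.  All names are hypothetical.
 */
#if 0
static void
example_reload_fork(void)
{
	int socks[2];
	pid_t pid;

	if(socketpair(AF_UNIX, SOCK_STREAM, 0, socks) == -1)
		return;	/* real code logs and keeps the old database */
	pid = fork();
	if(pid == -1) {
		close(socks[0]);
		close(socks[1]);
	} else if(pid != 0) {
		/* PARENT: load the new database, talk to the old main
		 * over socks[1], then take over as the new main. */
		close(socks[0]);
	} else {
		/* CHILD: the old main keeps serving queries and
		 * listens for the quit handshake on socks[0]. */
		close(socks[1]);
	}
}
#endif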
1980 
1981 static query_state_type
1982 server_process_query(struct nsd *nsd, struct query *query)
1983 {
1984 	return query_process(query, nsd);
1985 }
1986 
1987 static query_state_type
1988 server_process_query_udp(struct nsd *nsd, struct query *query)
1989 {
1990 #ifdef RATELIMIT
1991 	if(query_process(query, nsd) != QUERY_DISCARDED) {
1992 		if(rrl_process_query(query))
1993 			return rrl_slip(query);
1994 		else	return QUERY_PROCESSED;
1995 	}
1996 	return QUERY_DISCARDED;
1997 #else
1998 	return query_process(query, nsd);
1999 #endif
2000 }
2001 
2002 struct event_base*
2003 nsd_child_event_base(void)
2004 {
2005 	struct event_base* base;
2006 #ifdef USE_MINI_EVENT
2007 	static time_t secs;
2008 	static struct timeval now;
2009 	base = event_init(&secs, &now);
2010 #else
2011 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
2012 	/* libev */
2013 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
2014 #  else
2015 	/* libevent */
2016 #    ifdef HAVE_EVENT_BASE_NEW
2017 	base = event_base_new();
2018 #    else
2019 	base = event_init();
2020 #    endif
2021 #  endif
2022 #endif
2023 	return base;
2024 }
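
/*
 * Editor's illustrative sketch (disabled; not part of nsd): minimal
 * use of the event base returned above: register a persistent read
 * event and run one loop iteration.  The callback is hypothetical.
 */
#if 0
static void
example_cb(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
	void* ATTR_UNUSED(arg))
{
	/* handle readiness on fd here */
}

static void
example_event_loop(struct event_base* base, int fd)
{
	struct event ev;
	event_set(&ev, fd, EV_PERSIST|EV_READ, example_cb, NULL);
	if(event_base_set(base, &ev) != 0)
		return;
	if(event_add(&ev, NULL) != 0)
		return;
	(void)event_base_loop(base, EVLOOP_ONCE);	/* one iteration */
}
#endif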
2025 
2026 /*
2027  * Serve DNS requests.
2028  */
2029 void
2030 server_child(struct nsd *nsd)
2031 {
2032 	size_t i, from, numifs;
2033 	region_type *server_region = region_create(xalloc, free);
2034 	struct event_base* event_base = nsd_child_event_base();
2035 	query_type *udp_query;
2036 	sig_atomic_t mode;
2037 
2038 	if(!event_base) {
2039 		log_msg(LOG_ERR, "nsd server could not create event base");
2040 		exit(1);
2041 	}
2042 	nsd->event_base = event_base;
2043 	nsd->server_region = server_region;
2044 
2045 #ifdef RATELIMIT
2046 	rrl_init(nsd->this_child->child_num);
2047 #endif
2048 
2049 	assert(nsd->server_kind != NSD_SERVER_MAIN);
2050 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
2051 
2052 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
2053 		server_close_all_sockets(nsd->tcp, nsd->ifs);
2054 	}
2055 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
2056 		server_close_all_sockets(nsd->udp, nsd->ifs);
2057 	}
2058 
2059 	if (nsd->this_child->parent_fd != -1) {
2060 		struct event *handler;
2061 		struct ipc_handler_conn_data* user_data =
2062 			(struct ipc_handler_conn_data*)region_alloc(
2063 			server_region, sizeof(struct ipc_handler_conn_data));
2064 		user_data->nsd = nsd;
2065 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
2066 
2067 		handler = (struct event*) region_alloc(
2068 			server_region, sizeof(*handler));
2069 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
2070 			EV_READ, child_handle_parent_command, user_data);
2071 		if(event_base_set(event_base, handler) != 0)
2072 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
2073 		if(event_add(handler, NULL) != 0)
2074 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
2075 	}
2076 
2077 	if(nsd->reuseport) {
2078 		numifs = nsd->ifs / nsd->reuseport;
2079 		from = numifs * nsd->this_child->child_num;
2080 		if(from+numifs > nsd->ifs) { /* should not happen */
2081 			from = 0;
2082 			numifs = nsd->ifs;
2083 		}
2084 	} else {
2085 		from = 0;
2086 		numifs = nsd->ifs;
2087 	}
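	/*
	 * For example, with nsd->ifs == 8 and reuseport == 4: numifs == 2,
	 * so child 0 serves sockets 0-1, child 1 serves 2-3, child 2
	 * serves 4-5 and child 3 serves 6-7.
	 */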
2088 
2089 	if (nsd->server_kind & NSD_SERVER_UDP) {
2090 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2091 		udp_query = query_create(server_region,
2092 			compressed_dname_offsets, compression_table_size,
2093 			compressed_dnames);
2094 #else
2095 		udp_query = NULL;
2096 		memset(msgs, 0, sizeof(msgs));
2097 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
2098 			queries[i] = query_create(server_region,
2099 				compressed_dname_offsets,
2100 				compression_table_size, compressed_dnames);
2101 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2102 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
2103 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
2104 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
2105 			msgs[i].msg_hdr.msg_iovlen  = 1;
2106 			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
2107 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2108 		}
2109 #endif
2110 		for (i = from; i < from+numifs; ++i) {
2111 			struct udp_handler_data *data;
2112 			struct event *handler;
2113 
2114 			data = (struct udp_handler_data *) region_alloc(
2115 				server_region,
2116 				sizeof(struct udp_handler_data));
2117 			data->query = udp_query;
2118 			data->nsd = nsd;
2119 			data->socket = &nsd->udp[i];
2120 
2121 			handler = (struct event*) region_alloc(
2122 				server_region, sizeof(*handler));
2123 			event_set(handler, nsd->udp[i].s, EV_PERSIST|EV_READ,
2124 				handle_udp, data);
2125 			if(event_base_set(event_base, handler) != 0)
2126 				log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2127 			if(event_add(handler, NULL) != 0)
2128 				log_msg(LOG_ERR, "nsd udp: event_add failed");
2129 		}
2130 	}
2131 
2132 	/*
2133 	 * Keep track of all the TCP accept handlers so we can enable
2134 	 * and disable them based on the current number of active TCP
2135 	 * connections.
2136 	 */
2137 	tcp_accept_handler_count = numifs;
2138 	tcp_accept_handlers = (struct tcp_accept_handler_data*)
2139 		region_alloc_array(server_region,
2140 		numifs, sizeof(*tcp_accept_handlers));
2141 	if (nsd->server_kind & NSD_SERVER_TCP) {
2142 		for (i = from; i < from+numifs; ++i) {
2143 			struct event *handler = &tcp_accept_handlers[i-from].event;
2144 			struct tcp_accept_handler_data* data =
2145 				&tcp_accept_handlers[i-from];
2146 			data->nsd = nsd;
2147 			data->socket = &nsd->tcp[i];
2148 			event_set(handler, nsd->tcp[i].s, EV_PERSIST|EV_READ,
2149 				handle_tcp_accept, data);
2150 			if(event_base_set(event_base, handler) != 0)
2151 				log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
2152 			if(event_add(handler, NULL) != 0)
2153 				log_msg(LOG_ERR, "nsd tcp: event_add failed");
2154 			data->event_added = 1;
2155 		}
2156 	} else tcp_accept_handler_count = 0;
2157 
2158 	/* The main loop... */
2159 	while ((mode = nsd->mode) != NSD_QUIT) {
2160 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
2161 
2162 		/* Do we need to do the statistics... */
2163 		if (mode == NSD_STATS) {
2164 #ifdef BIND8_STATS
2165 			int p = nsd->st.period;
2166 			nsd->st.period = 1; /* force stats printout */
2167 			/* Dump the statistics */
2168 			bind8_stats(nsd);
2169 			nsd->st.period = p;
2170 #else /* !BIND8_STATS */
2171 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
2172 #endif /* BIND8_STATS */
2173 
2174 			nsd->mode = NSD_RUN;
2175 		}
2176 		else if (mode == NSD_REAP_CHILDREN) {
2177 			/* got signal, notify parent. parent reaps terminated children. */
2178 			if (nsd->this_child->parent_fd != -1) {
2179 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
2180 				if (write(nsd->this_child->parent_fd,
2181 				    &parent_notify,
2182 				    sizeof(parent_notify)) == -1)
2183 				{
2184 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
2185 						(int) nsd->this_child->pid, strerror(errno));
2186 				}
2187 			} else /* no parent, so reap 'em */
2188 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
2189 			nsd->mode = NSD_RUN;
2190 		}
2191 		else if(mode == NSD_RUN) {
2192 			/* Wait for a query... */
2193 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
2194 				if (errno != EINTR) {
2195 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
2196 					break;
2197 				}
2198 			}
2199 		} else if(mode == NSD_QUIT) {
2200 			/* ignore here, quit */
2201 		} else {
2202 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
2203 				(int)mode);
2204 			nsd->mode = NSD_RUN;
2205 		}
2206 	}
2207 
2208 #ifdef	BIND8_STATS
2209 	bind8_stats(nsd);
2210 #endif /* BIND8_STATS */
2211 
2212 #ifdef MEMCLEAN /* OS collects memory pages */
2213 #ifdef RATELIMIT
2214 	rrl_deinit(nsd->this_child->child_num);
2215 #endif
2216 	event_base_free(event_base);
2217 	region_destroy(server_region);
2218 #endif
2219 	server_shutdown(nsd);
2220 }
2221 
2222 #if defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG)
2223 static void
2224 handle_udp(int fd, short event, void* arg)
2225 {
2226 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
2227 	int received, sent, recvcount, i;
2228 	struct query *q;
2229 
2230 	if (!(event & EV_READ)) {
2231 		return;
2232 	}
2233 	recvcount = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
2234 	/* this printf strangely gave a performance increase on Linux */
2235 	/* printf("recvcount %d \n", recvcount); */
2236 	if (recvcount == -1) {
2237 		if (errno != EAGAIN && errno != EINTR) {
2238 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
2239 			STATUP(data->nsd, rxerr);
2240 			/* No zone statup */
2241 		}
2242 		/* Simply no data available */
2243 		return;
2244 	}
2245 	for (i = 0; i < recvcount; i++) {
2246 	loopstart:
2247 		received = msgs[i].msg_len;
2248 		queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen;
2249 		q = queries[i];
2250 		if (received == -1) {
2251 			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
2252 				msgs[i].msg_hdr.msg_flags));
2253 			STATUP(data->nsd, rxerr);
2254 			/* No zone statup */
2255 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2256 			iovecs[i].iov_len = buffer_remaining(q->packet);
2257 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2258 			goto swap_drop;
2259 		}
2260 
2261 		/* Account... */
2262 #ifdef BIND8_STATS
2263 		if (data->socket->fam == AF_INET) {
2264 			STATUP(data->nsd, qudp);
2265 		} else if (data->socket->fam == AF_INET6) {
2266 			STATUP(data->nsd, qudp6);
2267 		}
2268 #endif
2269 
2270 		buffer_skip(q->packet, received);
2271 		buffer_flip(q->packet);
2272 #ifdef USE_DNSTAP
2273 		dt_collector_submit_auth_query(data->nsd, &q->addr, q->addrlen,
2274 			q->tcp, q->packet);
2275 #endif /* USE_DNSTAP */
2276 
2277 		/* Process and answer the query... */
2278 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
2279 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
2280 				STATUP(data->nsd, nona);
2281 				ZTATUP(data->nsd, q->zone, nona);
2282 			}
2283 
2284 #ifdef USE_ZONE_STATS
2285 			if (data->socket->fam == AF_INET) {
2286 				ZTATUP(data->nsd, q->zone, qudp);
2287 			} else if (data->socket->fam == AF_INET6) {
2288 				ZTATUP(data->nsd, q->zone, qudp6);
2289 			}
2290 #endif
2291 
2292 			/* Add EDNS0 and TSIG info if necessary.  */
2293 			query_add_optional(q, data->nsd);
2294 
2295 			buffer_flip(q->packet);
2296 			iovecs[i].iov_len = buffer_remaining(q->packet);
2297 #ifdef BIND8_STATS
2298 			/* Account the rcode & TC... */
2299 			STATUP2(data->nsd, rcode, RCODE(q->packet));
2300 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
2301 			if (TC(q->packet)) {
2302 				STATUP(data->nsd, truncated);
2303 				ZTATUP(data->nsd, q->zone, truncated);
2304 			}
2305 #endif /* BIND8_STATS */
2306 #ifdef USE_DNSTAP
2307 			dt_collector_submit_auth_response(data->nsd,
2308 				&q->addr, q->addrlen, q->tcp, q->packet,
2309 				q->zone);
2310 #endif /* USE_DNSTAP */
2311 		} else {
2312 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2313 			iovecs[i].iov_len = buffer_remaining(q->packet);
2314 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2315 		swap_drop:
2316 			STATUP(data->nsd, dropped);
2317 			ZTATUP(data->nsd, q->zone, dropped);
2318 			if(i != recvcount-1) {
2319 				/* swap with last and decrease recvcount */
2320 				struct mmsghdr mtmp = msgs[i];
2321 				struct iovec iotmp = iovecs[i];
2322 				recvcount--;
2323 				msgs[i] = msgs[recvcount];
2324 				iovecs[i] = iovecs[recvcount];
2325 				queries[i] = queries[recvcount];
2326 				msgs[recvcount] = mtmp;
2327 				iovecs[recvcount] = iotmp;
2328 				queries[recvcount] = q;
2329 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
2330 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
2331 				goto loopstart;
2332 			} else { recvcount --; }
2333 		}
2334 	}
2335 
2336 	/* send until all are sent */
2337 	i = 0;
2338 	while(i<recvcount) {
2339 		sent = sendmmsg(fd, &msgs[i], recvcount-i, 0);
2340 		if(sent == -1) {
2341 			const char* es = strerror(errno);
2342 			char a[48];
2343 			addr2str(&queries[i]->addr, a, sizeof(a));
2344 			log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
2345 #ifdef BIND8_STATS
2346 			data->nsd->st.txerr += recvcount-i;
2347 #endif /* BIND8_STATS */
2348 			break;
2349 		}
2350 		i += sent;
2351 	}
2352 	for(i=0; i<recvcount; i++) {
2353 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2354 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
2355 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2356 	}
2357 }
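
/*
 * Editor's illustrative sketch (disabled; not part of nsd): the
 * recvmmsg()/sendmmsg() batching pattern used above, reduced to a
 * UDP echo.  Per-message error handling is omitted; all names are
 * hypothetical.
 */
#if 0
#define EXAMPLE_BATCH 8

static void
example_udp_echo_batch(int fd)
{
	struct mmsghdr m[EXAMPLE_BATCH];
	struct iovec io[EXAMPLE_BATCH];
	struct sockaddr_storage addr[EXAMPLE_BATCH];
	char buf[EXAMPLE_BATCH][512];
	int n, i, sent;

	memset(m, 0, sizeof(m));
	for(i = 0; i < EXAMPLE_BATCH; i++) {
		io[i].iov_base = buf[i];
		io[i].iov_len = sizeof(buf[i]);
		m[i].msg_hdr.msg_iov = &io[i];
		m[i].msg_hdr.msg_iovlen = 1;
		m[i].msg_hdr.msg_name = &addr[i];
		m[i].msg_hdr.msg_namelen = sizeof(addr[i]);
	}
	/* one syscall receives up to EXAMPLE_BATCH datagrams */
	n = recvmmsg(fd, m, EXAMPLE_BATCH, 0, NULL);
	if(n <= 0)
		return;
	/* echo back exactly what was received from each sender */
	for(i = 0; i < n; i++)
		io[i].iov_len = m[i].msg_len;
	i = 0;
	while(i < n) {
		sent = sendmmsg(fd, &m[i], n - i, 0);
		if(sent == -1)
			break;	/* real code logs the error */
		i += sent;	/* resume after a partial batch */
	}
}
#endif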
2358 
2359 #else /* defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG) */
2360 
2361 static void
2362 handle_udp(int fd, short event, void* arg)
2363 {
2364 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
2365 	int received, sent;
2366 #ifndef NONBLOCKING_IS_BROKEN
2367 #ifdef HAVE_RECVMMSG
2368 	int recvcount;
2369 #endif /* HAVE_RECVMMSG */
2370 	int i;
2371 #endif /* NONBLOCKING_IS_BROKEN */
2372 	struct query *q;
2373 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2374 	q = data->query;
2375 #endif
2376 
2377 	if (!(event & EV_READ)) {
2378 		return;
2379 	}
2380 #ifndef NONBLOCKING_IS_BROKEN
2381 #ifdef HAVE_RECVMMSG
2382 	recvcount = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
2383 	/* this printf strangely gave a performance increase on Linux */
2384 	/* printf("recvcount %d \n", recvcount); */
2385 	if (recvcount == -1) {
2386 		if (errno != EAGAIN && errno != EINTR) {
2387 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
2388 			STATUP(data->nsd, rxerr);
2389 			/* No zone statup */
2390 		}
2391 		/* Simply no data available */
2392 		return;
2393 	}
2394 	for (i = 0; i < recvcount; i++) {
2395 		received = msgs[i].msg_len;
2396 		queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen;
2397 		if (received == -1) {
2398 			log_msg(LOG_ERR, "recvmmsg failed");
2399 			STATUP(data->nsd, rxerr);
2400 			/* No zone statup */
2401 			/* the error can be found in msgs[i].msg_hdr.msg_flags */
2402 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2403 			iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
2404 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2405 			continue;
2406 		}
2407 		q = queries[i];
2408 #else
2409 	for(i=0; i<NUM_RECV_PER_SELECT; i++) {
2410 #endif /* HAVE_RECVMMSG */
2411 #endif /* NONBLOCKING_IS_BROKEN */
2412 
2413 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2414 		/* Initialize the query... */
2415 		query_reset(q, UDP_MAX_MESSAGE_LEN, 0);
2416 
2417 		received = recvfrom(fd,
2418 				    buffer_begin(q->packet),
2419 				    buffer_remaining(q->packet),
2420 				    0,
2421 				    (struct sockaddr *)&q->addr,
2422 				    &q->addrlen);
2423 		if (received == -1) {
2424 			if (errno != EAGAIN && errno != EINTR) {
2425 				log_msg(LOG_ERR, "recvfrom failed: %s", strerror(errno));
2426 				STATUP(data->nsd, rxerr);
2427 				/* No zone statup */
2428 			}
2429 			return;
2430 		}
2431 #endif /* NONBLOCKING_IS_BROKEN || !HAVE_RECVMMSG */
2432 
2433 		/* Account... */
2434 		if (data->socket->fam == AF_INET) {
2435 			STATUP(data->nsd, qudp);
2436 		} else if (data->socket->fam == AF_INET6) {
2437 			STATUP(data->nsd, qudp6);
2438 		}
2439 
2440 		buffer_skip(q->packet, received);
2441 		buffer_flip(q->packet);
2442 #ifdef USE_DNSTAP
2443 		dt_collector_submit_auth_query(data->nsd, &q->addr, q->addrlen,
2444 			q->tcp, q->packet);
2445 #endif /* USE_DNSTAP */
2446 
2447 		/* Process and answer the query... */
2448 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
2449 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
2450 				STATUP(data->nsd, nona);
2451 				ZTATUP(data->nsd, q->zone, nona);
2452 			}
2453 
2454 #ifdef USE_ZONE_STATS
2455 			if (data->socket->fam == AF_INET) {
2456 				ZTATUP(data->nsd, q->zone, qudp);
2457 			} else if (data->socket->fam == AF_INET6) {
2458 				ZTATUP(data->nsd, q->zone, qudp6);
2459 			}
2460 #endif
2461 
2462 			/* Add EDNS0 and TSIG info if necessary.  */
2463 			query_add_optional(q, data->nsd);
2464 
2465 			buffer_flip(q->packet);
2466 
2467 			sent = sendto(fd,
2468 				      buffer_begin(q->packet),
2469 				      buffer_remaining(q->packet),
2470 				      0,
2471 				      (struct sockaddr *) &q->addr,
2472 				      q->addrlen);
2473 			if (sent == -1) {
2474 				const char* es = strerror(errno);
2475 				char a[48];
2476 				addr2str(&q->addr, a, sizeof(a));
2477 				log_msg(LOG_ERR, "sendto %s failed: %s", a, es);
2478 				STATUP(data->nsd, txerr);
2479 				ZTATUP(data->nsd, q->zone, txerr);
2480 			} else if ((size_t) sent != buffer_remaining(q->packet)) {
2481 				log_msg(LOG_ERR, "sent %d in place of %d bytes", sent, (int) buffer_remaining(q->packet));
2482 			} else {
2483 #ifdef BIND8_STATS
2484 				/* Account the rcode & TC... */
2485 				STATUP2(data->nsd, rcode, RCODE(q->packet));
2486 				ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
2487 				if (TC(q->packet)) {
2488 					STATUP(data->nsd, truncated);
2489 					ZTATUP(data->nsd, q->zone, truncated);
2490 				}
2491 #endif /* BIND8_STATS */
2492 #ifdef USE_DNSTAP
2493 				dt_collector_submit_auth_response(data->nsd,
2494 					&q->addr, q->addrlen, q->tcp,
2495 					q->packet, q->zone);
2496 #endif /* USE_DNSTAP */
2497 			}
2498 		} else {
2499 			STATUP(data->nsd, dropped);
2500 			ZTATUP(data->nsd, q->zone, dropped);
2501 		}
2502 #ifndef NONBLOCKING_IS_BROKEN
2503 #ifdef HAVE_RECVMMSG
2504 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2505 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
2506 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2507 #endif
2508 	}
2509 #endif
2510 }
2511 #endif /* defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG) */
2512 
2513 
2514 static void
2515 cleanup_tcp_handler(struct tcp_handler_data* data)
2516 {
2517 	event_del(&data->event);
2518 	close(data->event.ev_fd);
2519 
2520 	/*
2521 	 * Enable the TCP accept handlers when the current number of
2522 	 * TCP connections is about to drop below the maximum number
2523 	 * of TCP connections.
2524 	 */
2525 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
2526 		configure_handler_event_types(EV_READ|EV_PERSIST);
2527 		if(slowaccept) {
2528 			event_del(&slowaccept_event);
2529 			slowaccept = 0;
2530 		}
2531 	}
2532 	--data->nsd->current_tcp_count;
2533 	assert(data->nsd->current_tcp_count >= 0);
2534 
2535 	region_destroy(data->region);
2536 }
2537 
2538 static void
2539 handle_tcp_reading(int fd, short event, void* arg)
2540 {
2541 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
2542 	ssize_t received;
2543 	struct event_base* ev_base;
2544 	struct timeval timeout;
2545 
2546 	if ((event & EV_TIMEOUT)) {
2547 		/* Connection timed out.  */
2548 		cleanup_tcp_handler(data);
2549 		return;
2550 	}
2551 
2552 	if (data->nsd->tcp_query_count > 0 &&
2553 		data->query_count >= data->nsd->tcp_query_count) {
2554 		/* No more queries allowed on this tcp connection.  */
2555 		cleanup_tcp_handler(data);
2556 		return;
2557 	}
2558 
2559 	assert((event & EV_READ));
2560 
2561 	if (data->bytes_transmitted == 0) {
2562 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
2563 	}
2564 
2565 	/*
2566 	 * Check whether we have received the leading packet length bytes yet.
2567 	 */
2568 	if (data->bytes_transmitted < sizeof(uint16_t)) {
2569 		received = read(fd,
2570 				(char *) &data->query->tcplen
2571 				+ data->bytes_transmitted,
2572 				sizeof(uint16_t) - data->bytes_transmitted);
2573 		if (received == -1) {
2574 			if (errno == EAGAIN || errno == EINTR) {
2575 				/*
2576 				 * Read would block, wait until more
2577 				 * data is available.
2578 				 */
2579 				return;
2580 			} else {
2581 				char buf[48];
2582 				addr2str(&data->query->addr, buf, sizeof(buf));
2583 #ifdef ECONNRESET
2584 				if (verbosity >= 2 || errno != ECONNRESET)
2585 #endif /* ECONNRESET */
2586 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
2587 				cleanup_tcp_handler(data);
2588 				return;
2589 			}
2590 		} else if (received == 0) {
2591 			/* EOF */
2592 			cleanup_tcp_handler(data);
2593 			return;
2594 		}
2595 
2596 		data->bytes_transmitted += received;
2597 		if (data->bytes_transmitted < sizeof(uint16_t)) {
2598 			/*
2599 			 * Not done with the tcplen yet, wait for more
2600 			 * data to become available.
2601 			 */
2602 			return;
2603 		}
2604 
2605 		assert(data->bytes_transmitted == sizeof(uint16_t));
2606 
2607 		data->query->tcplen = ntohs(data->query->tcplen);
2608 
2609 		/*
2610 		 * Minimum query size is:
2611 		 *
2612 		 *     Size of the header (12)
2613 		 *   + Root domain name   (1)
2614 		 *   + Query class        (2)
2615 		 *   + Query type         (2)
2616 		 */
2617 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
2618 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
2619 			cleanup_tcp_handler(data);
2620 			return;
2621 		}
2622 
2623 		if (data->query->tcplen > data->query->maxlen) {
2624 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
2625 			cleanup_tcp_handler(data);
2626 			return;
2627 		}
2628 
2629 		buffer_set_limit(data->query->packet, data->query->tcplen);
2630 	}
2631 
2632 	assert(buffer_remaining(data->query->packet) > 0);
2633 
2634 	/* Read the (remaining) query data.  */
2635 	received = read(fd,
2636 			buffer_current(data->query->packet),
2637 			buffer_remaining(data->query->packet));
2638 	if (received == -1) {
2639 		if (errno == EAGAIN || errno == EINTR) {
2640 			/*
2641 			 * Read would block, wait until more data is
2642 			 * available.
2643 			 */
2644 			return;
2645 		} else {
2646 			char buf[48];
2647 			addr2str(&data->query->addr, buf, sizeof(buf));
2648 #ifdef ECONNRESET
2649 			if (verbosity >= 2 || errno != ECONNRESET)
2650 #endif /* ECONNRESET */
2651 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
2652 			cleanup_tcp_handler(data);
2653 			return;
2654 		}
2655 	} else if (received == 0) {
2656 		/* EOF */
2657 		cleanup_tcp_handler(data);
2658 		return;
2659 	}
2660 
2661 	data->bytes_transmitted += received;
2662 	buffer_skip(data->query->packet, received);
2663 	if (buffer_remaining(data->query->packet) > 0) {
2664 		/*
2665 		 * Message not yet complete, wait for more data to
2666 		 * become available.
2667 		 */
2668 		return;
2669 	}
2670 
2671 	assert(buffer_position(data->query->packet) == data->query->tcplen);
2672 
2673 	/* Account... */
2674 #ifdef BIND8_STATS
2675 #ifndef INET6
2676 	STATUP(data->nsd, ctcp);
2677 #else
2678 	if (data->query->addr.ss_family == AF_INET) {
2679 		STATUP(data->nsd, ctcp);
2680 	} else if (data->query->addr.ss_family == AF_INET6) {
2681 		STATUP(data->nsd, ctcp6);
2682 	}
2683 #endif
2684 #endif /* BIND8_STATS */
2685 
2686 	/* We have a complete query, process it.  */
2687 
2688 	/* tcp-query-count: handle query counter ++ */
2689 	data->query_count++;
2690 
2691 	buffer_flip(data->query->packet);
2692 #ifdef USE_DNSTAP
2693 	dt_collector_submit_auth_query(data->nsd, &data->query->addr,
2694 		data->query->addrlen, data->query->tcp, data->query->packet);
2695 #endif /* USE_DNSTAP */
2696 	data->query_state = server_process_query(data->nsd, data->query);
2697 	if (data->query_state == QUERY_DISCARDED) {
2698 		/* Drop the packet and the entire connection... */
2699 		STATUP(data->nsd, dropped);
2700 		ZTATUP(data->nsd, data->query->zone, dropped);
2701 		cleanup_tcp_handler(data);
2702 		return;
2703 	}
2704 
2705 #ifdef BIND8_STATS
2706 	if (RCODE(data->query->packet) == RCODE_OK
2707 	    && !AA(data->query->packet))
2708 	{
2709 		STATUP(data->nsd, nona);
2710 		ZTATUP(data->nsd, data->query->zone, nona);
2711 	}
2712 #endif /* BIND8_STATS */
2713 
2714 #ifdef USE_ZONE_STATS
2715 #ifndef INET6
2716 	ZTATUP(data->nsd, data->query->zone, ctcp);
2717 #else
2718 	if (data->query->addr.ss_family == AF_INET) {
2719 		ZTATUP(data->nsd, data->query->zone, ctcp);
2720 	} else if (data->query->addr.ss_family == AF_INET6) {
2721 		ZTATUP(data->nsd, data->query->zone, ctcp6);
2722 	}
2723 #endif
2724 #endif /* USE_ZONE_STATS */
2725 
2726 	query_add_optional(data->query, data->nsd);
2727 
2728 	/* Switch to the tcp write handler.  */
2729 	buffer_flip(data->query->packet);
2730 	data->query->tcplen = buffer_remaining(data->query->packet);
2731 #ifdef USE_DNSTAP
2732 	dt_collector_submit_auth_response(data->nsd, &data->query->addr,
2733 		data->query->addrlen, data->query->tcp, data->query->packet,
2734 		data->query->zone);
2735 #endif /* USE_DNSTAP */
2736 	data->bytes_transmitted = 0;
2737 
2738 	timeout.tv_sec = data->tcp_timeout / 1000;
2739 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2740 
2741 	ev_base = data->event.ev_base;
2742 	event_del(&data->event);
2743 	event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
2744 		handle_tcp_writing, data);
2745 	if(event_base_set(ev_base, &data->event) != 0)
2746 		log_msg(LOG_ERR, "event base set tcpr failed");
2747 	if(event_add(&data->event, &timeout) != 0)
2748 		log_msg(LOG_ERR, "event add tcpr failed");
2749 	/* see if we can write the answer right away (usually yes; EAGAIN if not) */
2750 	handle_tcp_writing(fd, EV_WRITE, data);
2751 }
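
/*
 * Editor's illustrative sketch (disabled; not part of nsd): the
 * two-byte network-order length prefix that frames DNS messages over
 * TCP, as parsed above.  A client sends the prefix and then the
 * message; partial-write handling is omitted.  Hypothetical helper.
 */
#if 0
static int
example_write_tcp_query(int fd, uint8_t* msg, uint16_t msglen)
{
	uint16_t n = htons(msglen);	/* length prefix, network order */
	if(write(fd, &n, sizeof(n)) != (ssize_t)sizeof(n))
		return -1;
	if(write(fd, msg, msglen) != (ssize_t)msglen)
		return -1;
	return 0;
}
#endif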
2752 
2753 static void
2754 handle_tcp_writing(int fd, short event, void* arg)
2755 {
2756 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
2757 	ssize_t sent;
2758 	struct query *q = data->query;
2759 	struct timeval timeout;
2760 	struct event_base* ev_base;
2761 
2762 	if ((event & EV_TIMEOUT)) {
2763 		/* Connection timed out.  */
2764 		cleanup_tcp_handler(data);
2765 		return;
2766 	}
2767 
2768 	assert((event & EV_WRITE));
2769 
2770 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
2771 		/* Writing the response packet length.  */
2772 		uint16_t n_tcplen = htons(q->tcplen);
2773 #ifdef HAVE_WRITEV
2774 		struct iovec iov[2];
2775 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
2776 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
2777 		iov[1].iov_base = buffer_begin(q->packet);
2778 		iov[1].iov_len = buffer_limit(q->packet);
2779 		sent = writev(fd, iov, 2);
2780 #else /* HAVE_WRITEV */
2781 		sent = write(fd,
2782 			     (const char *) &n_tcplen + data->bytes_transmitted,
2783 			     sizeof(n_tcplen) - data->bytes_transmitted);
2784 #endif /* HAVE_WRITEV */
2785 		if (sent == -1) {
2786 			if (errno == EAGAIN || errno == EINTR) {
2787 				/*
2788 				 * Write would block, wait until
2789 				 * socket becomes writable again.
2790 				 */
2791 				return;
2792 			} else {
2793 #ifdef ECONNRESET
2794 				if(verbosity >= 2 || errno != ECONNRESET)
2795 #endif /* ECONNRESET */
2796 #ifdef EPIPE
2797 				  if(verbosity >= 2 || errno != EPIPE)
2798 #endif /* EPIPE 'broken pipe' */
2799 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
2800 				cleanup_tcp_handler(data);
2801 				return;
2802 			}
2803 		}
2804 
2805 		data->bytes_transmitted += sent;
2806 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
2807 			/*
2808 			 * Writing not complete, wait until socket
2809 			 * becomes writable again.
2810 			 */
2811 			return;
2812 		}
2813 
2814 #ifdef HAVE_WRITEV
2815 		sent -= sizeof(n_tcplen);
2816 		/* handle potential 'packet done' code */
2817 		/* fall through to check whether the whole packet was written */
2818 #endif
2819 	}
2820 
2821 	sent = write(fd,
2822 		     buffer_current(q->packet),
2823 		     buffer_remaining(q->packet));
2824 	if (sent == -1) {
2825 		if (errno == EAGAIN || errno == EINTR) {
2826 			/*
2827 			 * Write would block, wait until
2828 			 * socket becomes writable again.
2829 			 */
2830 			return;
2831 		} else {
2832 #ifdef ECONNRESET
2833 			if(verbosity >= 2 || errno != ECONNRESET)
2834 #endif /* ECONNRESET */
2835 #ifdef EPIPE
2836 				  if(verbosity >= 2 || errno != EPIPE)
2837 #endif /* EPIPE 'broken pipe' */
2838 			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
2839 			cleanup_tcp_handler(data);
2840 			return;
2841 		}
2842 	}
2843 
2844 	data->bytes_transmitted += sent;
2845 #ifdef HAVE_WRITEV
2846   packet_could_be_done:
2847 #endif
2848 	buffer_skip(q->packet, sent);
2849 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
2850 		/*
2851 		 * Still more data to write when socket becomes
2852 		 * writable again.
2853 		 */
2854 		return;
2855 	}
2856 
2857 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
2858 
2859 	if (data->query_state == QUERY_IN_AXFR) {
2860 		/* Continue processing AXFR and writing back results.  */
2861 		buffer_clear(q->packet);
2862 		data->query_state = query_axfr(data->nsd, q);
2863 		if (data->query_state != QUERY_PROCESSED) {
2864 			query_add_optional(data->query, data->nsd);
2865 
2866 			/* Reset data. */
2867 			buffer_flip(q->packet);
2868 			q->tcplen = buffer_remaining(q->packet);
2869 			data->bytes_transmitted = 0;
2870 			/* Reset timeout.  */
2871 			timeout.tv_sec = data->tcp_timeout / 1000;
2872 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2873 			ev_base = data->event.ev_base;
2874 			event_del(&data->event);
2875 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
2876 				handle_tcp_writing, data);
2877 			if(event_base_set(ev_base, &data->event) != 0)
2878 				log_msg(LOG_ERR, "event base set tcpw failed");
2879 			if(event_add(&data->event, &timeout) != 0)
2880 				log_msg(LOG_ERR, "event add tcpw failed");
2881 
2882 			/*
2883 			 * Write data if/when the socket is writable
2884 			 * again.
2885 			 */
2886 			return;
2887 		}
2888 	}
2889 
2890 	/*
2891 	 * Done sending, wait for the next request to arrive on the
2892 	 * TCP socket by installing the TCP read handler.
2893 	 */
2894 	if (data->nsd->tcp_query_count > 0 &&
2895 		data->query_count >= data->nsd->tcp_query_count) {
2896 
2897 		(void) shutdown(fd, SHUT_WR);
2898 	}
2899 
2900 	data->bytes_transmitted = 0;
2901 
2902 	timeout.tv_sec = data->tcp_timeout / 1000;
2903 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2904 	ev_base = data->event.ev_base;
2905 	event_del(&data->event);
2906 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
2907 		handle_tcp_reading, data);
2908 	if(event_base_set(ev_base, &data->event) != 0)
2909 		log_msg(LOG_ERR, "event base set tcpw failed");
2910 	if(event_add(&data->event, &timeout) != 0)
2911 		log_msg(LOG_ERR, "event add tcpw failed");
2912 }
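
/*
 * Editor's illustrative sketch (disabled; not part of nsd): sending
 * the length prefix and the payload with a single writev(), as done
 * above.  The call may write fewer bytes than requested; the caller
 * must track progress and resume, as the handler does with
 * bytes_transmitted.  Hypothetical helper.
 */
#if 0
static ssize_t
example_writev_prefixed(int fd, uint8_t* msg, uint16_t msglen)
{
	uint16_t n = htons(msglen);
	struct iovec iov[2];
	iov[0].iov_base = &n;
	iov[0].iov_len = sizeof(n);
	iov[1].iov_base = msg;
	iov[1].iov_len = msglen;
	/* may return less than sizeof(n)+msglen; caller resumes */
	return writev(fd, iov, 2);
}
#endif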
2913 
2914 
2915 static void
2916 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
2917 	void* ATTR_UNUSED(arg))
2918 {
2919 	if(slowaccept) {
2920 		configure_handler_event_types(EV_PERSIST | EV_READ);
2921 		slowaccept = 0;
2922 	}
2923 }
2924 
2925 /*
2926  * Handle an incoming TCP connection.  The connection is accepted and
2927  * a new TCP reader event handler is added.  The TCP handler
2928  * is responsible for cleanup when the connection is closed.
2929  */
2930 static void
2931 handle_tcp_accept(int fd, short event, void* arg)
2932 {
2933 	struct tcp_accept_handler_data *data
2934 		= (struct tcp_accept_handler_data *) arg;
2935 	int s;
2936 	struct tcp_handler_data *tcp_data;
2937 	region_type *tcp_region;
2938 #ifdef INET6
2939 	struct sockaddr_storage addr;
2940 #else
2941 	struct sockaddr_in addr;
2942 #endif
2943 	socklen_t addrlen;
2944 	struct timeval timeout;
2945 
2946 	if (!(event & EV_READ)) {
2947 		return;
2948 	}
2949 
2950 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
2951 		return;
2952 	}
2953 
2954 	/* Accept it... */
2955 	addrlen = sizeof(addr);
2956 #ifndef HAVE_ACCEPT4
2957 	s = accept(fd, (struct sockaddr *) &addr, &addrlen);
2958 #else
2959 	s = accept4(fd, (struct sockaddr *) &addr, &addrlen, SOCK_NONBLOCK);
2960 #endif
2961 	if (s == -1) {
2962 		/*
2963 		 * EMFILE and ENFILE signal that the limit of open
2964 		 * file descriptors has been reached; pause accept().
2965 		 * EINTR means a signal interrupted the call.  The others
2966 		 * are various OS ways of saying that the client has
2967 		 * closed the connection.
2968 		 */
2968 		if (errno == EMFILE || errno == ENFILE) {
2969 			if (!slowaccept) {
2970 				/* disable accept events */
2971 				struct timeval tv;
2972 				configure_handler_event_types(0);
2973 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
2974 				tv.tv_usec = 0L;
2975 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
2976 					handle_slowaccept_timeout, NULL);
2977 				(void)event_base_set(data->event.ev_base,
2978 					&slowaccept_event);
2979 				(void)event_add(&slowaccept_event, &tv);
2980 				slowaccept = 1;
2981 				/* We don't want to spam the logs here */
2982 			}
2983 		} else if (errno != EINTR
2984 			&& errno != EWOULDBLOCK
2985 #ifdef ECONNABORTED
2986 			&& errno != ECONNABORTED
2987 #endif /* ECONNABORTED */
2988 #ifdef EPROTO
2989 			&& errno != EPROTO
2990 #endif /* EPROTO */
2991 			) {
2992 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
2993 		}
2994 		return;
2995 	}
2996 
2997 #ifndef HAVE_ACCEPT4
2998 	if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
2999 		log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
3000 		close(s);
3001 		return;
3002 	}
3003 #endif
3004 
3005 	/*
3006 	 * This region is deallocated when the TCP connection is
3007 	 * closed by the TCP handler.
3008 	 */
3009 	tcp_region = region_create(xalloc, free);
3010 	tcp_data = (struct tcp_handler_data *) region_alloc(
3011 		tcp_region, sizeof(struct tcp_handler_data));
3012 	tcp_data->region = tcp_region;
3013 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
3014 		compression_table_size, compressed_dnames);
3015 	tcp_data->nsd = data->nsd;
3016 	tcp_data->query_count = 0;
3017 
3018 	tcp_data->query_state = QUERY_PROCESSED;
3019 	tcp_data->bytes_transmitted = 0;
3020 	memcpy(&tcp_data->query->addr, &addr, addrlen);
3021 	tcp_data->query->addrlen = addrlen;
3022 
3023 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
3024 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
3025 		/* very busy, give smaller timeout */
3026 		tcp_data->tcp_timeout = 200;
3027 	}
3028 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
3029 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
3030 
3031 	event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
3032 		handle_tcp_reading, tcp_data);
3033 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
3034 		log_msg(LOG_ERR, "cannot set tcp event base");
3035 		close(s);
3036 		region_destroy(tcp_region);
3037 		return;
3038 	}
3039 	if(event_add(&tcp_data->event, &timeout) != 0) {
3040 		log_msg(LOG_ERR, "cannot add tcp to event base");
3041 		close(s);
3042 		region_destroy(tcp_region);
3043 		return;
3044 	}
3045 
3046 	/*
3047 	 * Keep track of the total number of TCP handlers installed so
3048 	 * we can stop accepting connections when the maximum number
3049 	 * of simultaneous TCP connections is reached.
3050 	 */
3051 	++data->nsd->current_tcp_count;
3052 	if (data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
3053 		configure_handler_event_types(0);
3054 	}
3055 }
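
/*
 * Editor's illustrative sketch (disabled; not part of nsd): pausing
 * accept() on descriptor exhaustion, as done above.  On EMFILE/ENFILE
 * the accept events are removed and a one-shot timer re-enables them
 * later.  The callback parameter is hypothetical.
 */
#if 0
static void
example_accept_backoff(struct event_base* base, struct event* timer,
	struct timeval* tv, void (*timeout_cb)(int, short, void*))
{
	configure_handler_event_types(0);	/* stop accepting */
	event_set(timer, -1, EV_TIMEOUT, timeout_cb, NULL);
	(void)event_base_set(base, timer);
	(void)event_add(timer, tv);	/* timeout_cb re-enables accept */
}
#endif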
3056 
3057 static void
3058 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
3059 {
3060 	size_t i;
3061 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
3062 	for (i = 0; i < nsd->child_count; ++i) {
3063 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
3064 			if (write(nsd->children[i].child_fd,
3065 				&command,
3066 				sizeof(command)) == -1)
3067 			{
3068 				if(errno != EAGAIN && errno != EINTR)
3069 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
3070 					(int) command,
3071 					(int) nsd->children[i].pid,
3072 					strerror(errno));
3073 			} else if (timeout > 0) {
3074 				(void)block_read(NULL,
3075 					nsd->children[i].child_fd,
3076 					&command, sizeof(command), timeout);
3077 			}
3078 			fsync(nsd->children[i].child_fd);
3079 			close(nsd->children[i].child_fd);
3080 			nsd->children[i].child_fd = -1;
3081 		}
3082 	}
3083 }
3084 
3085 static void
3086 send_children_quit(struct nsd* nsd)
3087 {
3088 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
3089 	send_children_command(nsd, NSD_QUIT, 0);
3090 }
3091 
3092 static void
3093 send_children_quit_and_wait(struct nsd* nsd)
3094 {
3095 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
3096 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
3097 }
3098 
3099 #ifdef BIND8_STATS
3100 static void
3101 set_children_stats(struct nsd* nsd)
3102 {
3103 	size_t i;
3104 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
3105 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
3106 	for (i = 0; i < nsd->child_count; ++i) {
3107 		nsd->children[i].need_to_send_STATS = 1;
3108 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
3109 	}
3110 }
3111 #endif /* BIND8_STATS */
3112 
3113 static void
3114 configure_handler_event_types(short event_types)
3115 {
3116 	size_t i;
3117 
3118 	for (i = 0; i < tcp_accept_handler_count; ++i) {
3119 		struct event* handler = &tcp_accept_handlers[i].event;
3120 		if(event_types) {
3121 			/* reassign */
3122 			int fd = handler->ev_fd;
3123 			struct event_base* base = handler->ev_base;
3124 			if(tcp_accept_handlers[i].event_added)
3125 				event_del(handler);
3126 			event_set(handler, fd, event_types,
3127 				handle_tcp_accept, &tcp_accept_handlers[i]);
3128 			if(event_base_set(base, handler) != 0)
3129 				log_msg(LOG_ERR, "conhand: cannot event_base");
3130 			if(event_add(handler, NULL) != 0)
3131 				log_msg(LOG_ERR, "conhand: cannot event_add");
3132 			tcp_accept_handlers[i].event_added = 1;
3133 		} else {
3134 			/* remove */
3135 			if(tcp_accept_handlers[i].event_added) {
3136 				event_del(handler);
3137 				tcp_accept_handlers[i].event_added = 0;
3138 			}
3139 		}
3140 	}
3141 }
3142