/*
 * server.c -- nsd(8) network input/output
 *
 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
 *
 * See LICENSE for the license.
 *
 */

#include "config.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <netinet/in.h>
#include <arpa/inet.h>

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include <netdb.h>
#include <poll.h>
#ifndef SHUT_WR
#define SHUT_WR 1
#endif
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif /* HAVE_MMAP */
#include <openssl/rand.h>
#ifndef USE_MINI_EVENT
#  ifdef HAVE_EVENT_H
#    include <event.h>
#  else
#    include <event2/event.h>
#    include "event2/event_struct.h"
#    include "event2/event_compat.h"
#  endif
#else
#  include "mini_event.h"
#endif

#include "axfr.h"
#include "namedb.h"
#include "netio.h"
#include "xfrd.h"
#include "xfrd-tcp.h"
#include "xfrd-disk.h"
#include "difffile.h"
#include "nsec3.h"
#include "ipc.h"
#include "udb.h"
#include "remote.h"
#include "lookup3.h"
#include "rrl.h"

#define RELOAD_SYNC_TIMEOUT 25 /* seconds */

/*
 * Data for the UDP handlers.
 */
struct udp_handler_data
{
	struct nsd        *nsd;
	struct nsd_socket *socket;
	query_type        *query;
};

struct tcp_accept_handler_data {
	struct nsd         *nsd;
	struct nsd_socket  *socket;
	int event_added;
	struct event       event;
};

/*
 * These globals are used to enable the TCP accept handlers
 * when the number of TCP connections drops below the maximum
 * number of TCP connections.
 */
static size_t		tcp_accept_handler_count;
static struct tcp_accept_handler_data*	tcp_accept_handlers;

static struct event slowaccept_event;
static int slowaccept;

#ifndef NONBLOCKING_IS_BROKEN
#  define NUM_RECV_PER_SELECT 100
#endif

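/*
 * Batched UDP receive buffers.  When recvmmsg(2) is available (and
 * non-blocking I/O is not broken), the UDP handler reads up to
 * NUM_RECV_PER_SELECT datagrams per wakeup instead of one at a time.
 */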
#if (!defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG))
struct mmsghdr msgs[NUM_RECV_PER_SELECT];
struct iovec iovecs[NUM_RECV_PER_SELECT];
struct query *queries[NUM_RECV_PER_SELECT];
#endif

/*
 * Data for the TCP connection handlers.
 *
 * The TCP handlers use non-blocking I/O.  This is necessary to avoid
 * blocking the entire server on a slow TCP connection, but does make
 * reading from and writing to the socket more complicated.
 *
 * Basically, whenever a read/write would block (indicated by the
 * EAGAIN errno variable) we remember the position we were reading
 * from/writing to and return from the TCP reading/writing event
 * handler.  When the socket becomes readable/writable again we
 * continue from the same position.
 */
struct tcp_handler_data
{
	/*
	 * The region used to allocate all TCP connection related
	 * data, including this structure.  This region is destroyed
	 * when the connection is closed.
	 */
	region_type*		region;

	/*
	 * The global nsd structure.
	 */
	struct nsd*			nsd;

	/*
	 * The current query data for this TCP connection.
	 */
	query_type*			query;

	/*
	 * The query_state is used to remember if we are performing an
	 * AXFR, if we're done processing, or if we should discard the
	 * query and connection.
	 */
	query_state_type	query_state;

	/*
	 * The event for the file descriptor and tcp timeout
	 */
	struct event event;

	/*
	 * The bytes_transmitted field is used to remember the number
	 * of bytes transmitted when receiving or sending a DNS
	 * packet.  The count includes the two additional bytes used
	 * to specify the packet length on a TCP connection.
	 */
	size_t				bytes_transmitted;

	/*
	 * The number of queries handled by this specific TCP connection.
	 */
	int					query_count;

	/*
	 * The timeout in msec for this tcp connection
	 */
	int	tcp_timeout;
};
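
/*
 * Illustrative sketch (not part of the server) of the resume-on-EAGAIN
 * pattern described above; bytes_transmitted records the progress so
 * the event handler can pick up where it left off.  The names buf and
 * total are placeholders here:
 *
 *	ssize_t n = write(fd, buf + data->bytes_transmitted,
 *		total - data->bytes_transmitted);
 *	if (n == -1) {
 *		if (errno == EAGAIN || errno == EINTR)
 *			return;	(wait for the next writable event)
 *		(otherwise handle the error and close)
 *	}
 *	data->bytes_transmitted += n;
 *	if (data->bytes_transmitted < total)
 *		return;	(more to send; the handler fires again)
 */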

/*
 * Handle incoming queries on the UDP server sockets.
 */
static void handle_udp(int fd, short event, void* arg);

/*
 * Handle incoming connections on the TCP sockets.  These handlers
 * usually wait for the NETIO_EVENT_READ event (indicating an incoming
 * connection) but are disabled when the number of current TCP
 * connections is equal to the maximum number of TCP connections.
 * Disabling is done by changing the handler to wait for the
 * NETIO_EVENT_NONE type.  This is done using the function
 * configure_handler_event_types.
 */
static void handle_tcp_accept(int fd, short event, void* arg);

/*
 * Handle incoming queries on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete query is received.
 */
static void handle_tcp_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete response is sent.
 */
static void handle_tcp_writing(int fd, short event, void* arg);

/*
 * Send the quit command to all children without blocking, then close
 * the pipes.
 */
static void send_children_quit(struct nsd* nsd);
/* same, for shutdown time; waits for the children to exit to avoid restart issues */
static void send_children_quit_and_wait(struct nsd* nsd);

/* set the children's flags to send NSD_STATS to them */
#ifdef BIND8_STATS
static void set_children_stats(struct nsd* nsd);
#endif /* BIND8_STATS */

/*
 * Change the event types the HANDLERS are interested in to EVENT_TYPES.
 */
static void configure_handler_event_types(short event_types);

static uint16_t *compressed_dname_offsets = 0;
static uint32_t compression_table_capacity = 0;
static uint32_t compression_table_size = 0;

/*
 * Remove the specified pid from the list of child pids.  Returns -1 if
 * the pid is not in the list, the index of the child otherwise.  The
 * stored pid is set to 0.
 */
static int
delete_child_pid(struct nsd *nsd, pid_t pid)
{
	size_t i;
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid == pid) {
			nsd->children[i].pid = 0;
			if(!nsd->children[i].need_to_exit) {
				if(nsd->children[i].child_fd != -1)
					close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
				if(nsd->children[i].handler)
					nsd->children[i].handler->fd = -1;
			}
			return i;
		}
	}
	return -1;
}

/*
 * Restart child servers if necessary.
 */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	struct main_ipc_handler_data *ipc_data;
	size_t i;
	int sv[2];

	/* Fork the child processes... */
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid <= 0) {
			if (nsd->children[i].child_fd != -1)
				close(nsd->children[i].child_fd);
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
				log_msg(LOG_ERR, "socketpair: %s",
					strerror(errno));
				return -1;
			}
			nsd->children[i].child_fd = sv[0];
			nsd->children[i].parent_fd = sv[1];
			nsd->children[i].pid = fork();
			switch (nsd->children[i].pid) {
			default: /* SERVER MAIN */
				close(nsd->children[i].parent_fd);
				nsd->children[i].parent_fd = -1;
				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				if(!nsd->children[i].handler)
				{
					ipc_data = (struct main_ipc_handler_data*) region_alloc(
						region, sizeof(struct main_ipc_handler_data));
					ipc_data->nsd = nsd;
					ipc_data->child = &nsd->children[i];
					ipc_data->child_num = i;
					ipc_data->xfrd_sock = xfrd_sock_p;
					ipc_data->packet = buffer_create(region, QIOBUFSZ);
					ipc_data->forward_mode = 0;
					ipc_data->got_bytes = 0;
					ipc_data->total_bytes = 0;
					ipc_data->acl_num = 0;
					nsd->children[i].handler = (struct netio_handler*) region_alloc(
						region, sizeof(struct netio_handler));
					nsd->children[i].handler->fd = nsd->children[i].child_fd;
					nsd->children[i].handler->timeout = NULL;
					nsd->children[i].handler->user_data = ipc_data;
					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
					nsd->children[i].handler->event_handler = parent_handle_child_command;
					netio_add_handler(netio, nsd->children[i].handler);
				}
				/* clear any ongoing ipc */
				ipc_data = (struct main_ipc_handler_data*)
					nsd->children[i].handler->user_data;
				ipc_data->forward_mode = 0;
				/* restart - update fd */
				nsd->children[i].handler->fd = nsd->children[i].child_fd;
				break;
			case 0: /* CHILD */
				/* the child need not be able to access the
				 * nsd.db file */
				namedb_close_udb(nsd->db);

				if (pledge("stdio rpath inet", NULL) == -1) {
					log_msg(LOG_ERR, "pledge");
					exit(1);
				}

				nsd->pid = 0;
				nsd->child_count = 0;
				nsd->server_kind = nsd->children[i].kind;
				nsd->this_child = &nsd->children[i];
				nsd->this_child->child_num = i;
				/* remove signal flags inherited from parent;
				   the parent will handle them. */
				nsd->signal_hint_reload_hup = 0;
				nsd->signal_hint_reload = 0;
				nsd->signal_hint_child = 0;
				nsd->signal_hint_quit = 0;
				nsd->signal_hint_shutdown = 0;
				nsd->signal_hint_stats = 0;
				nsd->signal_hint_statsusr = 0;
				close(*xfrd_sock_p);
				close(nsd->this_child->child_fd);
				nsd->this_child->child_fd = -1;
				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				server_child(nsd);
				/* NOTREACHED */
				exit(0);
			case -1:
				log_msg(LOG_ERR, "fork failed: %s",
					strerror(errno));
				return -1;
			}
		}
	}
	return 0;
}

#ifdef BIND8_STATS
static void set_bind8_alarm(struct nsd* nsd)
{
	/* resync so that the next alarm is on the next whole minute */
	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
}
#endif

/* set zone stat ids for zones initially read in */
static void
zonestatid_tree_set(struct nsd* nsd)
{
	struct radnode* n;
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		zone_type* zone = (zone_type*)n->elem;
		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
	}
}

#ifdef USE_ZONE_STATS
void
server_zonestat_alloc(struct nsd* nsd)
{
	size_t num = (nsd->options->zonestatnames->count==0?1:
			nsd->options->zonestatnames->count);
	size_t sz = sizeof(struct nsdst)*num;
	char tmpfile[256];
	uint8_t z = 0;

	/* file names */
	nsd->zonestatfname[0] = 0;
	nsd->zonestatfname[1] = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);

	/* file descriptors */
	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[0] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		close(nsd->zonestatfd[0]);
		unlink(nsd->zonestatfname[0]);
		exit(1);
	}

#ifdef HAVE_MMAP
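	/*
	 * Give each stat file a backing store of sz bytes before mapping
	 * it: seek to the last byte and write a zero, then mmap the whole
	 * range.  Touching mapped pages beyond the end of the file would
	 * otherwise fault.
	 */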
	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[0], strerror(errno));
		exit(1);
	}
	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[1], strerror(errno));
		exit(1);
	}
	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[0], 0);
	if(nsd->zonestat[0] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[1], 0);
	if(nsd->zonestat[1] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->zonestat[0], 0, sz);
	memset(nsd->zonestat[1], 0, sz);
	nsd->zonestatsize[0] = num;
	nsd->zonestatsize[1] = num;
	nsd->zonestatdesired = num;
	nsd->zonestatsizenow = num;
	nsd->zonestatnow = nsd->zonestat[0];
#endif /* HAVE_MMAP */
}

void
zonestat_remap(struct nsd* nsd, int idx, size_t sz)
{
#ifdef HAVE_MMAP
#ifdef MREMAP_MAYMOVE
	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
		MREMAP_MAYMOVE);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
		exit(1);
	}
#else /* !MREMAP_MAYMOVE */
	if(msync(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
	if(munmap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		exit(1);
	}
#endif /* MREMAP_MAYMOVE */
#endif /* HAVE_MMAP */
}

/* realloc the zonestat array for the one that is not currently in use,
 * to match the desired new size of the array (if applicable) */
void
server_zonestat_realloc(struct nsd* nsd)
{
#ifdef HAVE_MMAP
	uint8_t z = 0;
	size_t sz;
	int idx = 0; /* index of the zonestat array that is not in use */
	if(nsd->zonestatnow == nsd->zonestat[0])
		idx = 1;
	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
		return;
	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[idx], strerror(errno));
		exit(1);
	}
	zonestat_remap(nsd, idx, sz);
	/* zero the newly allocated region */
	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
	}
	nsd->zonestatsize[idx] = nsd->zonestatdesired;
#endif /* HAVE_MMAP */
}

/* switch over to the other array for the new children, which briefly
 * coexist with the old children; we want to avoid both generations
 * writing to the same statistics array. */
void
server_zonestat_switch(struct nsd* nsd)
{
	if(nsd->zonestatnow == nsd->zonestat[0]) {
		nsd->zonestatnow = nsd->zonestat[1];
		nsd->zonestatsizenow = nsd->zonestatsize[1];
	} else {
		nsd->zonestatnow = nsd->zonestat[0];
		nsd->zonestatsizenow = nsd->zonestatsize[0];
	}
}
#endif /* USE_ZONE_STATS */

static void
cleanup_dname_compression_tables(void *ptr)
{
	free(ptr);
	compressed_dname_offsets = NULL;
	compression_table_capacity = 0;
}

static void
initialize_dname_compression_tables(struct nsd *nsd)
{
	size_t needed = domain_table_count(nsd->db->domains) + 1;
	needed += EXTRA_DOMAIN_NUMBERS;
	if(compression_table_capacity < needed) {
		if(compressed_dname_offsets) {
			region_remove_cleanup(nsd->db->region,
				cleanup_dname_compression_tables,
				compressed_dname_offsets);
			free(compressed_dname_offsets);
		}
		compressed_dname_offsets = (uint16_t *) xmallocarray(
			needed, sizeof(uint16_t));
		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
			compressed_dname_offsets);
		compression_table_capacity = needed;
		compression_table_size=domain_table_count(nsd->db->domains)+1;
	}
	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
}

/* create and bind sockets.  */
static int
server_init_ifs(struct nsd *nsd, size_t from, size_t to, int* reuseport_works)
{
	struct addrinfo* addr;
	size_t i;
#if defined(SO_REUSEPORT) || defined(SO_REUSEADDR) || (defined(INET6) && (defined(IPV6_V6ONLY) || defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU) || defined(IP_TRANSPARENT)) || defined(IP_FREEBIND))
	int on = 1;
#endif

	/* UDP */

	/* Make a socket... */
	for (i = from; i < to; i++) {
		/* for reuseports copy socket specs of first entries */
		addr = nsd->udp[i%nsd->ifs].addr;
		if (!addr) {
			nsd->udp[i].s = -1;
			continue;
		}
		nsd->udp[i].fam = (int)addr->ai_family;
		if ((nsd->udp[i].s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
#if defined(INET6)
			if (addr->ai_family == AF_INET6 &&
				errno == EAFNOSUPPORT && nsd->grab_ip6_optional) {
				log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: not supported");
				continue;
			}
#endif /* INET6 */
			log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
			return -1;
		}

#ifdef SO_REUSEPORT
		if(nsd->reuseport && *reuseport_works &&
			setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_REUSEPORT,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			if(verbosity >= 3
#ifdef ENOPROTOOPT
				|| errno != ENOPROTOOPT
#endif
				)
			    log_msg(LOG_ERR, "setsockopt(..., SO_REUSEPORT, "
				"...) failed: %s", strerror(errno));
			*reuseport_works = 0;
		}
#else
		(void)reuseport_works;
#endif /* SO_REUSEPORT */
#if defined(SO_RCVBUF) || defined(SO_SNDBUF)
	if(1) {
	int rcv = 1*1024*1024;
	int snd = 1*1024*1024;

#ifdef SO_RCVBUF
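	/*
	 * On Linux, SO_RCVBUFFORCE (privileged) can raise the buffer past
	 * the rmem_max sysctl limit; when it is unavailable, or refused
	 * with EPERM, fall back to a plain SO_RCVBUF request below.
	 */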
#  ifdef SO_RCVBUFFORCE
	if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv,
		(socklen_t)sizeof(rcv)) < 0) {
		if(errno != EPERM && errno != ENOBUFS) {
			log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, "
				"...) failed: %s", strerror(errno));
			return -1;
		}
#  else
	if(1) {
#  endif /* SO_RCVBUFFORCE */
		if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv,
			 (socklen_t)sizeof(rcv)) < 0) {
			if(errno != ENOBUFS && errno != ENOSYS) {
				log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, "
					"...) failed: %s", strerror(errno));
				return -1;
			}
		}
	}
#endif /* SO_RCVBUF */

#ifdef SO_SNDBUF
#  ifdef SO_SNDBUFFORCE
	if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd,
		(socklen_t)sizeof(snd)) < 0) {
		if(errno != EPERM && errno != ENOBUFS) {
			log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, "
				"...) failed: %s", strerror(errno));
			return -1;
		}
#  else
	if(1) {
#  endif /* SO_SNDBUFFORCE */
		if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_SNDBUF, (void*)&snd,
			 (socklen_t)sizeof(snd)) < 0) {
			if(errno != ENOBUFS && errno != ENOSYS) {
				log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, "
					"...) failed: %s", strerror(errno));
				return -1;
			}
		}
	}
#endif /* SO_SNDBUF */

	}
#endif /* defined(SO_RCVBUF) || defined(SO_SNDBUF) */

#if defined(INET6)
		if (addr->ai_family == AF_INET6) {
# if defined(IPV6_V6ONLY)
			if (setsockopt(nsd->udp[i].s,
				       IPPROTO_IPV6, IPV6_V6ONLY,
				       &on, sizeof(on)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
					strerror(errno));
				return -1;
			}
# endif
# if defined(IPV6_USE_MIN_MTU)
			/*
			 * There is no fragmentation of IPv6 datagrams
			 * during forwarding in the network. Therefore
			 * we do not send UDP datagrams larger than
			 * the minimum IPv6 MTU of 1280 octets. The
			 * EDNS0 message length can be larger if the
			 * network stack supports IPV6_USE_MIN_MTU.
			 */
			if (setsockopt(nsd->udp[i].s,
				       IPPROTO_IPV6, IPV6_USE_MIN_MTU,
				       &on, sizeof(on)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IPV6_USE_MIN_MTU, ...) failed: %s",
					strerror(errno));
				return -1;
			}
# elif defined(IPV6_MTU)
			/*
			 * On Linux, PMTUD is disabled by default for datagrams
			 * so set the MTU equal to the MIN MTU to get the same.
			 */
			on = IPV6_MIN_MTU;
			if (setsockopt(nsd->udp[i].s, IPPROTO_IPV6, IPV6_MTU,
				&on, sizeof(on)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IPV6_MTU, ...) failed: %s",
					strerror(errno));
				return -1;
			}
			on = 1;
# endif
		}
#endif
#if defined(AF_INET)
		if (addr->ai_family == AF_INET) {
#  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
			int action = IP_PMTUDISC_DONT;
			if (setsockopt(nsd->udp[i].s, IPPROTO_IP,
				IP_MTU_DISCOVER, &action, sizeof(action)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
					strerror(errno));
				return -1;
			}
#  elif defined(IP_DONTFRAG)
			int off = 0;
			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_DONTFRAG,
				&off, sizeof(off)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
					strerror(errno));
				return -1;
			}
#  endif
		}
#endif
		/* set it nonblocking */
		/* otherwise, on OSes with thundering herd problems, the
		   UDP recv could block NSD after select returns readable. */
		if (fcntl(nsd->udp[i].s, F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl udp: %s", strerror(errno));
		}

		/* Bind it... */
		if (nsd->options->ip_freebind) {
#ifdef IP_FREEBIND
			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(...,IP_FREEBIND, ...) failed for udp: %s",
					strerror(errno));
			}
#endif /* IP_FREEBIND */
		}

		if (nsd->options->ip_transparent) {
#ifdef IP_TRANSPARENT
			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(...,IP_TRANSPARENT, ...) failed for udp: %s",
					strerror(errno));
			}
#endif /* IP_TRANSPARENT */
		}

		if (bind(nsd->udp[i].s, (struct sockaddr *) addr->ai_addr, addr->ai_addrlen) != 0) {
			log_msg(LOG_ERR, "can't bind udp socket: %s", strerror(errno));
			return -1;
		}
	}

	/* TCP */

	/* Make a socket... */
	for (i = from; i < to; i++) {
		/* for reuseports copy socket specs of first entries */
		addr = nsd->tcp[i%nsd->ifs].addr;
		if (!addr) {
			nsd->tcp[i].s = -1;
			continue;
		}
		nsd->tcp[i].fam = (int)addr->ai_family;
		/* turn off REUSEPORT for TCP by copying the socket fd */
		if(i >= nsd->ifs) {
			nsd->tcp[i].s = nsd->tcp[i%nsd->ifs].s;
			continue;
		}
		if ((nsd->tcp[i].s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
#if defined(INET6)
			if (addr->ai_family == AF_INET6 &&
				errno == EAFNOSUPPORT && nsd->grab_ip6_optional) {
				log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: not supported");
				continue;
			}
#endif /* INET6 */
			log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
			return -1;
		}

#ifdef SO_REUSEPORT
		if(nsd->reuseport && *reuseport_works &&
			setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_REUSEPORT,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			if(verbosity >= 3
#ifdef ENOPROTOOPT
				|| errno != ENOPROTOOPT
#endif
				)
			    log_msg(LOG_ERR, "setsockopt(..., SO_REUSEPORT, "
				"...) failed: %s", strerror(errno));
			*reuseport_works = 0;
		}
#endif /* SO_REUSEPORT */
#ifdef	SO_REUSEADDR
		if (setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0) {
			log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s", strerror(errno));
		}
#endif /* SO_REUSEADDR */

#if defined(INET6)
		if (addr->ai_family == AF_INET6) {
# if defined(IPV6_V6ONLY)
			if (setsockopt(nsd->tcp[i].s, IPPROTO_IPV6, IPV6_V6ONLY,
				&on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed: %s", strerror(errno));
				return -1;
			}
# endif
# if defined(IPV6_USE_MIN_MTU)
			/*
			 * Use minimum MTU to minimize delays learning working
			 * PMTU when communicating through a tunnel.
			 */
			if (setsockopt(nsd->tcp[i].s,
				       IPPROTO_IPV6, IPV6_USE_MIN_MTU,
				       &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(..., IPV6_USE_MIN_MTU, ...) failed: %s", strerror(errno));
				return -1;
			}
# elif defined(IPV6_MTU)
			/*
			 * On Linux, PMTUD is disabled by default for datagrams
			 * so set the MTU equal to the MIN MTU to get the same.
			 */
			on = IPV6_MIN_MTU;
			if (setsockopt(nsd->tcp[i].s, IPPROTO_IPV6, IPV6_MTU,
				&on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(..., IPV6_MTU, ...) failed: %s", strerror(errno));
				return -1;
			}
			on = 1;
# endif
		}
#endif
		/* set maximum segment size to tcp socket */
		if(nsd->tcp_mss > 0) {
#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
			if(setsockopt(nsd->tcp[i].s, IPPROTO_TCP, TCP_MAXSEG,
					(void*)&nsd->tcp_mss,
					sizeof(nsd->tcp_mss)) < 0) {
				log_msg(LOG_ERR,
					"setsockopt(...,TCP_MAXSEG,...)"
					" failed for tcp: %s", strerror(errno));
			}
#else
			log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
#endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
		}

		/* set it nonblocking */
		/* (Stevens UNP p. 463): if the tcp listening socket is blocking,
		   it may block in accept, even if select() says readable. */
		if (fcntl(nsd->tcp[i].s, F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl tcp: %s", strerror(errno));
		}

		/* Bind it... */
		if (nsd->options->ip_freebind) {
#ifdef IP_FREEBIND
			if (setsockopt(nsd->tcp[i].s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(...,IP_FREEBIND, ...) failed for tcp: %s",
					strerror(errno));
			}
#endif /* IP_FREEBIND */
		}

		if (nsd->options->ip_transparent) {
#ifdef IP_TRANSPARENT
			if (setsockopt(nsd->tcp[i].s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(...,IP_TRANSPARENT, ...) failed for tcp: %s",
					strerror(errno));
			}
#endif /* IP_TRANSPARENT */
		}

		if (bind(nsd->tcp[i].s, (struct sockaddr *) addr->ai_addr, addr->ai_addrlen) != 0) {
			log_msg(LOG_ERR, "can't bind tcp socket: %s", strerror(errno));
			return -1;
		}

		/* Listen to it... */
		if (listen(nsd->tcp[i].s, TCP_BACKLOG) == -1) {
			log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
			return -1;
		}
	}

	return 0;
}

/*
 * Initialize the server, reuseport, create and bind the sockets.
 */
int
server_init(struct nsd *nsd)
{
	int reuseport_successful = 1; /* see if reuseport works in OS */
	if(nsd->reuseport) {
		/* increase the size of the udp and tcp interface arrays,
		 * there are going to be separate interface file descriptors
		 * for every server instance */
		nsd->udp = xrealloc(nsd->udp, (nsd->ifs*nsd->reuseport)*
			sizeof(*nsd->udp));
		nsd->tcp = xrealloc(nsd->tcp, (nsd->ifs*nsd->reuseport)*
			sizeof(*nsd->tcp));
		memset(&nsd->udp[nsd->ifs], 0, sizeof(*nsd->udp)*
			(nsd->ifs*(nsd->reuseport-1)));
		memset(&nsd->tcp[nsd->ifs], 0, sizeof(*nsd->tcp)*
			(nsd->ifs*(nsd->reuseport-1)));
	}

	/* open the server interface ports */
	if(server_init_ifs(nsd, 0, nsd->ifs, &reuseport_successful) == -1)
		return -1;

	/* continue to open the remaining reuseport ports */
	if(nsd->reuseport && reuseport_successful) {
		if(server_init_ifs(nsd, nsd->ifs, nsd->ifs*nsd->reuseport,
			&reuseport_successful) == -1)
			return -1;
		nsd->ifs *= nsd->reuseport;
	} else {
		nsd->reuseport = 0;
	}
	return 0;
}

/*
 * Prepare the server for take off.
 *
 */
int
server_prepare(struct nsd *nsd)
{
#ifdef RATELIMIT
	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
#ifdef HAVE_ARC4RANDOM
	hash_set_raninit(arc4random());
#else
	uint32_t v = getpid() ^ time(NULL);
	srandom((unsigned long)v);
	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
		hash_set_raninit(v);
	else	hash_set_raninit(random());
#endif
	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
		nsd->options->rrl_ratelimit,
		nsd->options->rrl_whitelist_ratelimit,
		nsd->options->rrl_slip,
		nsd->options->rrl_ipv4_prefix_length,
		nsd->options->rrl_ipv6_prefix_length);
#endif /* RATELIMIT */

	/* Open the database... */
	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
		log_msg(LOG_ERR, "unable to open the database %s: %s",
			nsd->dbfile, strerror(errno));
		unlink(nsd->task[0]->fname);
		unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		return -1;
	}
	/* check if zone files have been modified */
	/* NULL for taskudb because we send soainfo in a moment, batched up,
	 * for all zones */
	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
		nsd->options->database[0] == 0))
		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
	zonestatid_tree_set(nsd);

	compression_table_capacity = 0;
	initialize_dname_compression_tables(nsd);

#ifdef	BIND8_STATS
	/* Initialize times... */
	time(&nsd->st.boot);
	set_bind8_alarm(nsd);
#endif /* BIND8_STATS */

	return 0;
}

/*
 * Fork the required number of servers.
 */
static int
server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	size_t i;

	/* Start all child servers initially.  */
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].pid = 0;
	}

	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
}

void
server_close_all_sockets(struct nsd_socket sockets[], size_t n)
{
	size_t i;

	/* Close all the sockets... */
	for (i = 0; i < n; ++i) {
		if (sockets[i].s != -1) {
			close(sockets[i].s);
			if(sockets[i].addr)
				freeaddrinfo(sockets[i].addr);
			sockets[i].s = -1;
		}
	}
}

/*
 * Close the sockets, shutdown the server and exit.
 * Does not return.
 *
 */
void
server_shutdown(struct nsd *nsd)
{
	size_t i;

	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	/* CHILD: close command channel to parent */
	if(nsd->this_child && nsd->this_child->parent_fd != -1)
	{
		close(nsd->this_child->parent_fd);
		nsd->this_child->parent_fd = -1;
	}
	/* SERVER: close command channels to children */
	if(!nsd->this_child)
	{
		for(i=0; i < nsd->child_count; ++i)
			if(nsd->children[i].child_fd != -1)
			{
				close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
			}
	}

	tsig_finalize();
#ifdef HAVE_SSL
	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
#endif

#if 0 /* OS collects memory pages */
	nsd_options_destroy(nsd->options);
	region_destroy(nsd->region);
#endif
	log_finalize();
	exit(0);
}

void
server_prepare_xfrd(struct nsd* nsd)
{
	char tmpfile[256];
	/* create task mmaps */
	nsd->mytask = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[0] = task_file_create(tmpfile);
	if(!nsd->task[0]) {
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[1] = task_file_create(tmpfile);
	if(!nsd->task[1]) {
		unlink(nsd->task[0]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
	/* create xfrd listener structure */
	nsd->xfrd_listener = region_alloc(nsd->region,
		sizeof(netio_handler_type));
	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
	nsd->xfrd_listener->fd = -1;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
		nsd;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
		xfrd_tcp_create(nsd->region, QIOBUFSZ);
}


void
server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
{
	pid_t pid;
	int sockets[2] = {0,0};
	struct ipc_handler_conn_data *data;

	if(nsd->xfrd_listener->fd != -1)
		close(nsd->xfrd_listener->fd);
	if(del_db) {
		/* recreate taskdb that xfrd was using, it may be corrupt */
		/* we (or reload) use nsd->mytask, and xfrd uses the other */
		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
		nsd->task[1-nsd->mytask]->fname = NULL;
		/* free alloc already, so udb does not shrink itself */
		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
		nsd->task[1-nsd->mytask]->alloc = NULL;
		udb_base_free(nsd->task[1-nsd->mytask]);
		/* create new file, overwrite the old one */
		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
		free(tmpfile);
	}
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
		return;
	}
	pid = fork();
	switch (pid) {
	case -1:
		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
		break;
	default:
		/* PARENT: close first socket, use second one */
		close(sockets[0]);
		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		if(del_db) xfrd_free_namedb(nsd);
		/* use other task than I am using, since if xfrd died and is
		 * restarted, the reload is using nsd->mytask */
		nsd->mytask = 1 - nsd->mytask;
		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
		/* NOTREACHED */
		break;
	case 0:
		/* CHILD: close second socket, use first one */
		close(sockets[1]);
		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		nsd->xfrd_listener->fd = sockets[0];
		break;
	}
	/* server-parent only */
	nsd->xfrd_listener->timeout = NULL;
	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
	/* clear ongoing ipc reads */
	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
	data->conn->is_reading = 0;
}

/** add all soainfo to taskdb */
static void
add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
{
	struct radnode* n;
	udb_ptr task_last; /* last task, mytask is empty so NULL */
	/* add all SOA INFO to mytask */
	udb_ptr_init(&task_last, taskudb);
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
	}
	udb_ptr_unlink(&task_last, taskudb);
}

void
server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
{
	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
	 *   parent fills one taskdb with soas, xfrd fills other with expires.
	 *   then they exchange and process.
	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
	 *   expire notifications can be sent back via a normal reload later
	 *   (xfrd will wait for current running reload to finish if any).
	 */
	sig_atomic_t cmd = 0;
	pid_t mypid;
	int xfrd_sock = nsd->xfrd_listener->fd;
	struct udb_base* taskudb = nsd->task[nsd->mytask];
	udb_ptr t;
	if(!shortsoa) {
		if(nsd->signal_hint_shutdown) {
		shutdown:
			log_msg(LOG_WARNING, "signal received, shutting down...");
			server_close_all_sockets(nsd->udp, nsd->ifs);
			server_close_all_sockets(nsd->tcp, nsd->ifs);
#ifdef HAVE_SSL
			daemon_remote_close(nsd->rc);
#endif
			/* Unlink it if possible... */
			unlinkpid(nsd->pidfile);
			unlink(nsd->task[0]->fname);
			unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
			unlink(nsd->zonestatfname[0]);
			unlink(nsd->zonestatfname[1]);
#endif
			/* write the nsd.db to disk, wait for it to complete */
			udb_base_sync(nsd->db->udb, 1);
			udb_base_close(nsd->db->udb);
			server_shutdown(nsd);
			exit(0);
		}
	}
	if(shortsoa) {
		/* put SOA in xfrd task because mytask may be in use */
		taskudb = nsd->task[1-nsd->mytask];
	}

	add_all_soa_to_task(nsd, taskudb);
	if(!shortsoa) {
		/* wait for xfrd to signal task is ready, RELOAD signal */
		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
			cmd != NSD_RELOAD) {
			log_msg(LOG_ERR, "did not get start signal from xfrd");
			exit(1);
		}
		if(nsd->signal_hint_shutdown) {
			goto shutdown;
		}
	}
	/* give xfrd our task, signal it with RELOAD_DONE */
	task_process_sync(taskudb);
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
			(int)nsd->pid, strerror(errno));
	}
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	if(!shortsoa) {
		/* process the xfrd task work items (expiry data) */
		nsd->mytask = 1 - nsd->mytask;
		taskudb = nsd->task[nsd->mytask];
		task_remap(taskudb);
		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
		while(!udb_ptr_is_null(&t)) {
			task_process_expire(nsd->db, TASKLIST(&t));
			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
		}
		udb_ptr_unlink(&t, taskudb);
		task_clear(taskudb);

		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
		cmd = NSD_RELOAD_DONE;
		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
				(int)nsd->pid, strerror(errno));
		}
	}
}

/* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
ssize_t
block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
{
	uint8_t* buf = (uint8_t*) p;
	ssize_t total = 0;
	struct pollfd fd;
	memset(&fd, 0, sizeof(fd));
	fd.fd = s;
	fd.events = POLLIN;

	while( total < sz) {
		ssize_t ret;
		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
		if(ret == -1) {
			if(errno == EAGAIN)
				/* blocking read */
				continue;
			if(errno == EINTR) {
				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
					return -1;
				/* other signals can be handled later */
				continue;
			}
			/* some error */
			return -1;
		}
		if(ret == 0) {
			/* operation timed out */
			return -2;
		}
		ret = read(s, buf+total, sz-total);
		if(ret == -1) {
			if(errno == EAGAIN)
				/* blocking read */
				continue;
			if(errno == EINTR) {
				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
					return -1;
				/* other signals can be handled later */
				continue;
			}
			/* some error */
			return -1;
		}
		if(ret == 0) {
			/* closed connection! */
			return 0;
		}
		total += ret;
	}
	return total;
}
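
/*
 * Illustrative use of block_read (a sketch, not part of the server):
 * read one IPC command completely, retrying on the -2 timeout result,
 * as server_reload does when waiting for NSD_QUIT_SYNC:
 *
 *	sig_atomic_t cmd;
 *	ssize_t r;
 *	do {
 *		r = block_read(nsd, fd, &cmd, sizeof(cmd),
 *			RELOAD_SYNC_TIMEOUT);
 *	} while(r == -2);
 *	if(r != sizeof(cmd))
 *		(error, or the peer closed the connection)
 */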

static void
reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
{
	sig_atomic_t cmd = NSD_QUIT_SYNC;
	udb_ptr t, next;
	udb_base* u = nsd->task[nsd->mytask];
	udb_ptr_init(&next, u);
	udb_ptr_new(&t, u, udb_base_get_userdata(u));
	udb_base_set_userdata(u, 0);
	while(!udb_ptr_is_null(&t)) {
		/* store next in list so this one can be deleted or reused */
		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
		udb_rptr_zero(&TASKLIST(&t)->next, u);

		/* process task t */
		/* append results for task t and update last_task */
		task_process_in_reload(nsd, u, last_task, &t);

		/* go to next */
		udb_ptr_set_ptr(&t, u, &next);

		/* if the parent has quit, we must quit too, poll the fd for cmds */
		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
			if(cmd == NSD_QUIT) {
				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
				/* sync to disk (if needed) */
				udb_base_sync(nsd->db->udb, 0);
				/* unlink files of remainder of tasks */
				while(!udb_ptr_is_null(&t)) {
					if(TASKLIST(&t)->task_type == task_apply_xfr) {
						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
					}
					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
				}
				udb_ptr_unlink(&t, u);
				udb_ptr_unlink(&next, u);
				exit(0);
			}
		}

	}
	udb_ptr_unlink(&t, u);
	udb_ptr_unlink(&next, u);
}

#ifdef BIND8_STATS
static void
parent_send_stats(struct nsd* nsd, int cmdfd)
{
	size_t i;
	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
		log_msg(LOG_ERR, "could not write stats to reload");
		return;
	}
	for(i=0; i<nsd->child_count; i++)
		if(!write_socket(cmdfd, &nsd->children[i].query_count,
			sizeof(stc_t))) {
			log_msg(LOG_ERR, "could not write stats to reload");
			return;
		}
}

static void
reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
{
	struct nsdst s;
	stc_t* p;
	size_t i;
	if(block_read(nsd, cmdfd, &s, sizeof(s),
		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
		log_msg(LOG_ERR, "could not read stats from oldpar");
		return;
	}
	s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
	s.db_mem = region_get_mem(nsd->db->region);
	p = (stc_t*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
		nsd->child_count);
	if(!p) return;
	for(i=0; i<nsd->child_count; i++) {
		if(block_read(nsd, cmdfd, p++, sizeof(stc_t), 1)!=sizeof(stc_t))
			return;
	}
}
#endif /* BIND8_STATS */

/*
 * Reload the database, stop the old parent, re-fork the children,
 * and continue as server_main.
 */
static void
server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
	int cmdsocket)
{
	pid_t mypid;
	sig_atomic_t cmd = NSD_QUIT_SYNC;
	int ret;
	udb_ptr last_task;
	struct sigaction old_sigchld, ign_sigchld;
	/* ignore SIGCHLD from the previous server_main that used this pid */
	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
	ign_sigchld.sa_handler = SIG_IGN;
	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);

	/* see what tasks we got from xfrd */
	task_remap(nsd->task[nsd->mytask]);
	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
	udb_compact_inhibited(nsd->db->udb, 1);
	reload_process_tasks(nsd, &last_task, cmdsocket);
	udb_compact_inhibited(nsd->db->udb, 0);
	udb_compact(nsd->db->udb);

#ifndef NDEBUG
	if(nsd_debug_level >= 1)
		region_log_stats(nsd->db->region);
#endif /* NDEBUG */
	/* sync to disk (if needed) */
	udb_base_sync(nsd->db->udb, 0);

	initialize_dname_compression_tables(nsd);

#ifdef BIND8_STATS
	/* Restart dumping stats if required.  */
	time(&nsd->st.boot);
	set_bind8_alarm(nsd);
#endif
#ifdef USE_ZONE_STATS
	server_zonestat_realloc(nsd); /* realloc for new children */
	server_zonestat_switch(nsd);
#endif

	/* listen for the signals of failed children again */
	sigaction(SIGCHLD, &old_sigchld, NULL);
	/* Start new child processes */
	if (server_start_children(nsd, server_region, netio, &nsd->
		xfrd_listener->fd) != 0) {
		send_children_quit(nsd);
		exit(1);
	}

	/* if the parent has quit, we must quit too, poll the fd for cmds */
	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
		if(cmd == NSD_QUIT) {
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
			send_children_quit(nsd);
			exit(0);
		}
	}

	/* Send quit command to parent: blocking, wait for receipt. */
	do {
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
		{
			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
				strerror(errno));
		}
		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
			RELOAD_SYNC_TIMEOUT);
		if(ret == -2) {
			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
		}
	} while (ret == -2);
	if(ret == -1) {
		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
			strerror(errno));
	}
	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
	if(cmd == NSD_QUIT) {
		/* small race condition possible here, parent got quit cmd. */
		send_children_quit(nsd);
		exit(1);
	}
	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
#ifdef BIND8_STATS
	reload_do_stats(cmdsocket, nsd, &last_task);
#endif
	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
	task_process_sync(nsd->task[nsd->mytask]);
#ifdef USE_ZONE_STATS
	server_zonestat_realloc(nsd); /* realloc for next children */
#endif

	/* send soainfo to the xfrd process, signal it that reload is done,
	 * it picks up the taskudb */
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
			strerror(errno));
	}
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	/* try to reopen file */
	if (nsd->file_rotation_ok)
		log_reopen(nsd->log_filename, 1);
	/* exit reload, continue as new server_main */
}

/*
 * Get the mode depending on the signal hints that have been received.
 * Multiple signal hints can be received and will be handled in turn.
 */
static sig_atomic_t
server_signal_mode(struct nsd *nsd)
{
	if(nsd->signal_hint_quit) {
		nsd->signal_hint_quit = 0;
		return NSD_QUIT;
	}
	else if(nsd->signal_hint_shutdown) {
		nsd->signal_hint_shutdown = 0;
		return NSD_SHUTDOWN;
	}
	else if(nsd->signal_hint_child) {
		nsd->signal_hint_child = 0;
		return NSD_REAP_CHILDREN;
	}
	else if(nsd->signal_hint_reload) {
		nsd->signal_hint_reload = 0;
		return NSD_RELOAD;
	}
	else if(nsd->signal_hint_reload_hup) {
		nsd->signal_hint_reload_hup = 0;
		return NSD_RELOAD_REQ;
	}
	else if(nsd->signal_hint_stats) {
		nsd->signal_hint_stats = 0;
#ifdef BIND8_STATS
		set_bind8_alarm(nsd);
#endif
		return NSD_STATS;
	}
	else if(nsd->signal_hint_statsusr) {
		nsd->signal_hint_statsusr = 0;
		return NSD_STATS;
	}
	return NSD_RUN;
}

/*
 * The main server simply waits for signals and child processes to
 * terminate.  Child processes are restarted as necessary.
 */
void
server_main(struct nsd *nsd)
{
	region_type *server_region = region_create(xalloc, free);
	netio_type *netio = netio_create(server_region);
	netio_handler_type reload_listener;
	int reload_sockets[2] = {-1, -1};
	struct timespec timeout_spec;
	int status;
	pid_t child_pid;
	pid_t reload_pid = -1;
	sig_atomic_t mode;

	/* Ensure we are the main process */
	assert(nsd->server_kind == NSD_SERVER_MAIN);

	/* Add listener for the XFRD process */
	netio_add_handler(netio, nsd->xfrd_listener);

	/* Start the child processes that handle incoming queries */
	if (server_start_children(nsd, server_region, netio,
		&nsd->xfrd_listener->fd) != 0) {
		send_children_quit(nsd);
		exit(1);
	}
	reload_listener.fd = -1;

	/* This_child MUST be 0, because this is the parent process */
	assert(nsd->this_child == 0);

	/* Run the server until we get a shutdown signal */
	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
		/* Did we receive a signal that changes our mode? */
		if(mode == NSD_RUN) {
			nsd->mode = mode = server_signal_mode(nsd);
		}

		switch (mode) {
		case NSD_RUN:
			/* see if any child processes terminated */
			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
				int is_child = delete_child_pid(nsd, child_pid);
				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
					if(nsd->children[is_child].child_fd == -1)
						nsd->children[is_child].has_exited = 1;
					parent_check_all_children_exited(nsd);
				} else if(is_child != -1) {
					log_msg(LOG_WARNING,
					       "server %d died unexpectedly with status %d, restarting",
					       (int) child_pid, status);
					restart_child_servers(nsd, server_region, netio,
						&nsd->xfrd_listener->fd);
				} else if (child_pid == reload_pid) {
					sig_atomic_t cmd = NSD_RELOAD_DONE;
					pid_t mypid;
					log_msg(LOG_WARNING,
					       "Reload process %d failed with status %d, continuing with old database",
					       (int) child_pid, status);
					reload_pid = -1;
					if(reload_listener.fd != -1) close(reload_listener.fd);
					reload_listener.fd = -1;
					reload_listener.event_types = NETIO_EVENT_NONE;
					task_process_sync(nsd->task[nsd->mytask]);
					/* inform xfrd reload attempt ended */
					if(!write_socket(nsd->xfrd_listener->fd,
						&cmd, sizeof(cmd))) {
						log_msg(LOG_ERR, "problems "
						  "sending SOAEND to xfrd: %s",
						  strerror(errno));
					}
					mypid = getpid();
					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
							strerror(errno));
					}
				} else if(status != 0) {
					/* check the exit status, because we
					 * may also reap the old server main
					 * (reload is the process parent of
					 * the old main) and older server
					 * processes exiting after a reload */
1673 					log_msg(LOG_WARNING,
1674 					       "process %d terminated with status %d",
1675 					       (int) child_pid, status);
1676 				}
1677 			}
1678 			if (child_pid == -1) {
1679 				if (errno == EINTR) {
1680 					continue;
1681 				}
1682 				if (errno != ECHILD)
1683 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
1684 			}
1685 			if (nsd->mode != NSD_RUN)
1686 				break;
1687 
1688 			/* timeout to collect processes. In case no sigchild happens. */
1689 			timeout_spec.tv_sec = 60;
1690 			timeout_spec.tv_nsec = 0;
1691 
1692 			/* listen on ports, timeout for collecting terminated children */
1693 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
1694 				if (errno != EINTR) {
1695 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
1696 				}
1697 			}
1698 			if(nsd->restart_children) {
1699 				restart_child_servers(nsd, server_region, netio,
1700 					&nsd->xfrd_listener->fd);
1701 				nsd->restart_children = 0;
1702 			}
1703 			if(nsd->reload_failed) {
1704 				sig_atomic_t cmd = NSD_RELOAD_DONE;
1705 				pid_t mypid;
1706 				nsd->reload_failed = 0;
1707 				log_msg(LOG_WARNING,
1708 				       "Reload process %d failed, continuing with old database",
1709 				       (int) reload_pid);
1710 				reload_pid = -1;
1711 				if(reload_listener.fd != -1) close(reload_listener.fd);
1712 				reload_listener.fd = -1;
1713 				reload_listener.event_types = NETIO_EVENT_NONE;
1714 				task_process_sync(nsd->task[nsd->mytask]);
1715 				/* inform xfrd reload attempt ended */
1716 				if(!write_socket(nsd->xfrd_listener->fd,
1717 					&cmd, sizeof(cmd))) {
1718 					log_msg(LOG_ERR, "problems "
1719 					  "sending SOAEND to xfrd: %s",
1720 					  strerror(errno));
1721 				}
1722 				mypid = getpid();
1723 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1724 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1725 						strerror(errno));
1726 				}
1727 			}
1728 
1729 			break;
1730 		case NSD_RELOAD_REQ: {
1731 			sig_atomic_t cmd = NSD_RELOAD_REQ;
1732 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
1733 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
1734 				"main: ipc send reload_req to xfrd"));
1735 			if(!write_socket(nsd->xfrd_listener->fd,
1736 				&cmd, sizeof(cmd))) {
1737 				log_msg(LOG_ERR, "server_main: could not send "
1738 				"reload_req to xfrd: %s", strerror(errno));
1739 			}
1740 			nsd->mode = NSD_RUN;
1741 			} break;
1742 		case NSD_RELOAD:
1743 			/* Continue to run nsd after reload */
1744 			nsd->mode = NSD_RUN;
1745 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
1746 			if (reload_pid != -1) {
1747 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
1748 				       (int) reload_pid);
1749 				break;
1750 			}
1751 
1752 			/* switch mytask to keep track of which process owns which task file */
1753 			nsd->mytask = 1 - nsd->mytask;
1754 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
1755 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
1756 				reload_pid = -1;
1757 				break;
1758 			}
1759 
1760 			/* Do actual reload */
1761 			reload_pid = fork();
1762 			switch (reload_pid) {
1763 			case -1:
1764 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
1765 				break;
1766 			default:
1767 				/* PARENT */
1768 				close(reload_sockets[0]);
1769 				server_reload(nsd, server_region, netio,
1770 					reload_sockets[1]);
1771 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
1772 				close(reload_sockets[1]);
1773 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
1774 				/* drop stale xfrd ipc data */
1775 				((struct ipc_handler_conn_data*)nsd->
1776 					xfrd_listener->user_data)
1777 					->conn->is_reading = 0;
1778 				reload_pid = -1;
1779 				reload_listener.fd = -1;
1780 				reload_listener.event_types = NETIO_EVENT_NONE;
1781 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
1782 				break;
1783 			case 0:
1784 				/* CHILD */
1785 				/* server_main keeps running until NSD_QUIT_SYNC
1786 				 * is received from the reload process. */
1787 				close(reload_sockets[1]);
1788 				reload_listener.fd = reload_sockets[0];
1789 				reload_listener.timeout = NULL;
1790 				reload_listener.user_data = nsd;
1791 				reload_listener.event_types = NETIO_EVENT_READ;
1792 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
1793 				netio_add_handler(netio, &reload_listener);
1794 				reload_pid = getppid();
1795 				break;
1796 			}
1797 			break;
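		/*
		 * The reload above is the classic socketpair(2)+fork(2)
		 * handshake.  A minimal sketch of the pattern (illustrative
		 * only, not nsd code; do_reload_work is a hypothetical
		 * helper, error handling omitted):
		 *
		 *	int sv[2];
		 *	if(socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1)
		 *		return;            // no IPC channel, give up
		 *	switch(fork()) {
		 *	case -1:
		 *		break;             // fork failed, keep old state
		 *	default:                   // parent: performs the reload
		 *		close(sv[0]);      // and later becomes new main
		 *		do_reload_work(sv[1]);
		 *		break;
		 *	case 0:                    // child: keeps serving and
		 *		close(sv[1]);      // listens for quit on sv[0]
		 *		break;
		 *	}
		 */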
1798 		case NSD_QUIT_SYNC:
1799 			/* synchronisation of xfrd, parent and reload */
1800 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
1801 				sig_atomic_t cmd = NSD_RELOAD;
1802 				/* stop xfrd ipc writes in progress */
1803 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
1804 					"main: ipc send indication reload"));
1805 				if(!write_socket(nsd->xfrd_listener->fd,
1806 					&cmd, sizeof(cmd))) {
1807 					log_msg(LOG_ERR, "server_main: could not send reload "
1808 					"indication to xfrd: %s", strerror(errno));
1809 				}
1810 				/* wait for ACK from xfrd */
1811 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
1812 				nsd->quit_sync_done = 1;
1813 			}
1814 			nsd->mode = NSD_RUN;
1815 			break;
1816 		case NSD_QUIT:
1817 			/* silent shutdown during reload */
1818 			if(reload_listener.fd != -1) {
1819 				/* acknowledge the quit, to sync reload that we will really quit now */
1820 				sig_atomic_t cmd = NSD_RELOAD;
1821 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
1822 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
1823 					log_msg(LOG_ERR, "server_main: "
1824 						"could not ack quit: %s", strerror(errno));
1825 				}
1826 #ifdef BIND8_STATS
1827 				parent_send_stats(nsd, reload_listener.fd);
1828 #endif /* BIND8_STATS */
1829 				close(reload_listener.fd);
1830 			}
1831 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
1832 			/* only quit children after xfrd has acked */
1833 			send_children_quit(nsd);
1834 
1835 #if 0 /* OS collects memory pages */
1836 			region_destroy(server_region);
1837 #endif
1838 			server_shutdown(nsd);
1839 
1840 			/* NOTREACHED */
1841 			break;
1842 		case NSD_SHUTDOWN:
1843 			break;
1844 		case NSD_REAP_CHILDREN:
1845 			/* continue; wait for child in run loop */
1846 			nsd->mode = NSD_RUN;
1847 			break;
1848 		case NSD_STATS:
1849 #ifdef BIND8_STATS
1850 			set_children_stats(nsd);
1851 #endif
1852 			nsd->mode = NSD_RUN;
1853 			break;
1854 		default:
1855 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
1856 			nsd->mode = NSD_RUN;
1857 			break;
1858 		}
1859 	}
1860 	log_msg(LOG_WARNING, "signal received, shutting down...");
1861 
1862 	/* close opened ports to avoid race with restart of nsd */
1863 	server_close_all_sockets(nsd->udp, nsd->ifs);
1864 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1865 #ifdef HAVE_SSL
1866 	daemon_remote_close(nsd->rc);
1867 #endif
1868 	send_children_quit_and_wait(nsd);
1869 
1870 	/* Unlink the pidfile and task files if possible... */
1871 	unlinkpid(nsd->pidfile);
1872 	unlink(nsd->task[0]->fname);
1873 	unlink(nsd->task[1]->fname);
1874 #ifdef USE_ZONE_STATS
1875 	unlink(nsd->zonestatfname[0]);
1876 	unlink(nsd->zonestatfname[1]);
1877 #endif
1878 
1879 	if(reload_listener.fd != -1) {
1880 		sig_atomic_t cmd = NSD_QUIT;
1881 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
1882 			"main: ipc send quit to reload-process"));
1883 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
1884 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
1885 				strerror(errno));
1886 		}
1887 		fsync(reload_listener.fd);
1888 		close(reload_listener.fd);
1889 		/* wait for reload to finish processing */
1890 		while(1) {
1891 			if(waitpid(reload_pid, NULL, 0) == -1) {
1892 				if(errno == EINTR) continue;
1893 				if(errno == ECHILD) break;
1894 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
1895 					(int)reload_pid, strerror(errno));
1896 			}
1897 			break;
1898 		}
1899 	}
1900 	if(nsd->xfrd_listener->fd != -1) {
1901 		/* complete quit, stop xfrd */
1902 		sig_atomic_t cmd = NSD_QUIT;
1903 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
1904 			"main: ipc send quit to xfrd"));
1905 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
1906 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
1907 				strerror(errno));
1908 		}
1909 		fsync(nsd->xfrd_listener->fd);
1910 		close(nsd->xfrd_listener->fd);
1911 		(void)kill(nsd->pid, SIGTERM);
1912 	}
1913 
1914 #if 0 /* OS collects memory pages */
1915 	region_destroy(server_region);
1916 #endif
1917 	/* write the nsd.db to disk, wait for it to complete */
1918 	udb_base_sync(nsd->db->udb, 1);
1919 	udb_base_close(nsd->db->udb);
1920 	server_shutdown(nsd);
1921 }
1922 
1923 static query_state_type
1924 server_process_query(struct nsd *nsd, struct query *query)
1925 {
1926 	return query_process(query, nsd);
1927 }
1928 
1929 static query_state_type
1930 server_process_query_udp(struct nsd *nsd, struct query *query)
1931 {
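	/*
	 * With RATELIMIT compiled in, rrl_process_query() decides whether
	 * this query falls under response rate limiting; if it does,
	 * rrl_slip() either discards the answer or lets a truncated (TC)
	 * reply slip through, so legitimate clients can retry over TCP.
	 */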
1932 #ifdef RATELIMIT
1933 	if(query_process(query, nsd) != QUERY_DISCARDED) {
1934 		if(rrl_process_query(query))
1935 			return rrl_slip(query);
1936 		else	return QUERY_PROCESSED;
1937 	}
1938 	return QUERY_DISCARDED;
1939 #else
1940 	return query_process(query, nsd);
1941 #endif
1942 }
1943 
1944 struct event_base*
1945 nsd_child_event_base(void)
1946 {
1947 	struct event_base* base;
1948 #ifdef USE_MINI_EVENT
1949 	static time_t secs;
1950 	static struct timeval now;
1951 	base = event_init(&secs, &now);
1952 #else
1953 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
1954 	/* libev */
1955 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
1956 #  else
1957 	/* libevent */
1958 #    ifdef HAVE_EVENT_BASE_NEW
1959 	base = event_base_new();
1960 #    else
1961 	base = event_init();
1962 #    endif
1963 #  endif
1964 #endif
1965 	return base;
1966 }
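/*
 * Whichever backend supplied the base (mini_event, libev or libevent),
 * callers treat it the same way: check for a NULL return and drive the
 * base with event_base_loop(), as server_child() below does.
 */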
1967 
1968 /*
1969  * Serve DNS requests.
1970  */
1971 void
1972 server_child(struct nsd *nsd)
1973 {
1974 	size_t i, from, numifs;
1975 	region_type *server_region = region_create(xalloc, free);
1976 	struct event_base* event_base = nsd_child_event_base();
1977 	query_type *udp_query;
1978 	sig_atomic_t mode;
1979 
1980 	if(!event_base) {
1981 		log_msg(LOG_ERR, "nsd server could not create event base");
1982 		exit(1);
1983 	}
1984 
1985 #ifdef RATELIMIT
1986 	rrl_init(nsd->this_child->child_num);
1987 #endif
1988 
1989 	assert(nsd->server_kind != NSD_SERVER_MAIN);
1990 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
1991 
1992 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
1993 		server_close_all_sockets(nsd->tcp, nsd->ifs);
1994 	}
1995 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
1996 		server_close_all_sockets(nsd->udp, nsd->ifs);
1997 	}
1998 
1999 	if (nsd->this_child && nsd->this_child->parent_fd != -1) {
2000 		struct event *handler;
2001 		struct ipc_handler_conn_data* user_data =
2002 			(struct ipc_handler_conn_data*)region_alloc(
2003 			server_region, sizeof(struct ipc_handler_conn_data));
2004 		user_data->nsd = nsd;
2005 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
2006 
2007 		handler = (struct event*) region_alloc(
2008 			server_region, sizeof(*handler));
2009 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
2010 			EV_READ, child_handle_parent_command, user_data);
2011 		if(event_base_set(event_base, handler) != 0)
2012 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
2013 		if(event_add(handler, NULL) != 0)
2014 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
2015 	}
2016 
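	/*
	 * With reuse-port, nsd->ifs holds nsd->reuseport copies of every
	 * interface socket; each child serves its own contiguous slice
	 * [from, from+numifs) of the socket arrays.
	 */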
2017 	if(nsd->reuseport) {
2018 		numifs = nsd->ifs / nsd->reuseport;
2019 		from = numifs * nsd->this_child->child_num;
2020 		if(from+numifs > nsd->ifs) { /* should not happen */
2021 			from = 0;
2022 			numifs = nsd->ifs;
2023 		}
2024 	} else {
2025 		from = 0;
2026 		numifs = nsd->ifs;
2027 	}
2028 
2029 	if (nsd->server_kind & NSD_SERVER_UDP) {
2030 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2031 		udp_query = query_create(server_region,
2032 			compressed_dname_offsets, compression_table_size);
2033 #else
2034 		udp_query = NULL;
2035 		memset(msgs, 0, sizeof(msgs));
2036 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
2037 			queries[i] = query_create(server_region,
2038 				compressed_dname_offsets, compression_table_size);
2039 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2040 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
2041 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
2042 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
2043 			msgs[i].msg_hdr.msg_iovlen  = 1;
2044 			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
2045 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2046 		}
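		/*
		 * Each slot now has its own query buffer wired into an
		 * iovec and mmsghdr; recvmmsg(2) fills in msg_len and the
		 * msg_name sender address per datagram, so one system call
		 * can pick up as many as NUM_RECV_PER_SELECT queries.
		 */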
2047 #endif
2048 		for (i = from; i < from+numifs; ++i) {
2049 			struct udp_handler_data *data;
2050 			struct event *handler;
2051 
2052 			data = (struct udp_handler_data *) region_alloc(
2053 				server_region,
2054 				sizeof(struct udp_handler_data));
2055 			data->query = udp_query;
2056 			data->nsd = nsd;
2057 			data->socket = &nsd->udp[i];
2058 
2059 			handler = (struct event*) region_alloc(
2060 				server_region, sizeof(*handler));
2061 			event_set(handler, nsd->udp[i].s, EV_PERSIST|EV_READ,
2062 				handle_udp, data);
2063 			if(event_base_set(event_base, handler) != 0)
2064 				log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2065 			if(event_add(handler, NULL) != 0)
2066 				log_msg(LOG_ERR, "nsd udp: event_add failed");
2067 		}
2068 	}
2069 
2070 	/*
2071 	 * Keep track of all the TCP accept handlers so we can enable
2072 	 * and disable them based on the current number of active TCP
2073 	 * connections.
2074 	 */
2075 	tcp_accept_handler_count = numifs;
2076 	tcp_accept_handlers = (struct tcp_accept_handler_data*)
2077 		region_alloc_array(server_region,
2078 		numifs, sizeof(*tcp_accept_handlers));
2079 	if (nsd->server_kind & NSD_SERVER_TCP) {
2080 		for (i = from; i < from+numifs; ++i) {
2081 			struct event *handler = &tcp_accept_handlers[i-from].event;
2082 			struct tcp_accept_handler_data* data =
2083 				&tcp_accept_handlers[i-from];
2084 			data->nsd = nsd;
2085 			data->socket = &nsd->tcp[i];
2086 			event_set(handler, nsd->tcp[i].s, EV_PERSIST|EV_READ,
2087 				handle_tcp_accept, data);
2088 			if(event_base_set(event_base, handler) != 0)
2089 				log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
2090 			if(event_add(handler, NULL) != 0)
2091 				log_msg(LOG_ERR, "nsd tcp: event_add failed");
2092 			data->event_added = 1;
2093 		}
2094 	} else tcp_accept_handler_count = 0;
2095 
2096 	/* The main loop... */
2097 	while ((mode = nsd->mode) != NSD_QUIT) {
2098 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
2099 
2100 		/* Do we need to do the statistics... */
2101 		if (mode == NSD_STATS) {
2102 #ifdef BIND8_STATS
2103 			int p = nsd->st.period;
2104 			nsd->st.period = 1; /* force stats printout */
2105 			/* Dump the statistics */
2106 			bind8_stats(nsd);
2107 			nsd->st.period = p;
2108 #else /* !BIND8_STATS */
2109 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
2110 #endif /* BIND8_STATS */
2111 
2112 			nsd->mode = NSD_RUN;
2113 		}
2114 		else if (mode == NSD_REAP_CHILDREN) {
2115 			/* got signal, notify parent. parent reaps terminated children. */
2116 			if (nsd->this_child->parent_fd != -1) {
2117 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
2118 				if (write(nsd->this_child->parent_fd,
2119 				    &parent_notify,
2120 				    sizeof(parent_notify)) == -1)
2121 				{
2122 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
2123 						(int) nsd->this_child->pid, strerror(errno));
2124 				}
2125 			} else /* no parent, so reap 'em */
2126 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
2127 			nsd->mode = NSD_RUN;
2128 		}
2129 		else if(mode == NSD_RUN) {
2130 			/* Wait for a query... */
2131 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
2132 				if (errno != EINTR) {
2133 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
2134 					break;
2135 				}
2136 			}
2137 		} else if(mode == NSD_QUIT) {
2138 			/* ignore here, quit */
2139 		} else {
2140 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
2141 				(int)mode);
2142 			nsd->mode = NSD_RUN;
2143 		}
2144 	}
2145 
2146 #ifdef	BIND8_STATS
2147 	bind8_stats(nsd);
2148 #endif /* BIND8_STATS */
2149 
2150 #if 0 /* OS collects memory pages */
2151 	event_base_free(event_base);
2152 	region_destroy(server_region);
2153 #endif
2154 	server_shutdown(nsd);
2155 }
2156 
2157 #if defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG)
2158 static void
2159 handle_udp(int fd, short event, void* arg)
2160 {
2161 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
2162 	int received, sent, recvcount, i;
2163 	struct query *q;
2164 
2165 	if (!(event & EV_READ)) {
2166 		return;
2167 	}
2168 	recvcount = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
2169 	/* this printf strangely gave a performance increase on Linux */
2170 	/* printf("recvcount %d \n", recvcount); */
2171 	if (recvcount == -1) {
2172 		if (errno != EAGAIN && errno != EINTR) {
2173 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
2174 			STATUP(data->nsd, rxerr);
2175 			/* No zone statup */
2176 		}
2177 		/* Simply no data available */
2178 		return;
2179 	}
2180 	for (i = 0; i < recvcount; i++) {
2181 	loopstart:
2182 		received = msgs[i].msg_len;
2183 		q = queries[i];
2184 		if (received == -1) {
2185 			log_msg(LOG_ERR, "recvmmsg %d failed: %s", i, strerror(
2186 				msgs[i].msg_hdr.msg_flags));
2187 			STATUP(data->nsd, rxerr);
2188 			/* No zone statup */
2189 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2190 			iovecs[i].iov_len = buffer_remaining(q->packet);
2191 			goto swap_drop;
2192 		}
2193 
2194 		/* Account... */
2195 #ifdef BIND8_STATS
2196 		if (data->socket->fam == AF_INET) {
2197 			STATUP(data->nsd, qudp);
2198 		} else if (data->socket->fam == AF_INET6) {
2199 			STATUP(data->nsd, qudp6);
2200 		}
2201 #endif
2202 
2203 		buffer_skip(q->packet, received);
2204 		buffer_flip(q->packet);
2205 
2206 		/* Process and answer the query... */
2207 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
2208 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
2209 				STATUP(data->nsd, nona);
2210 				ZTATUP(data->nsd, q->zone, nona);
2211 			}
2212 
2213 #ifdef USE_ZONE_STATS
2214 			if (data->socket->fam == AF_INET) {
2215 				ZTATUP(data->nsd, q->zone, qudp);
2216 			} else if (data->socket->fam == AF_INET6) {
2217 				ZTATUP(data->nsd, q->zone, qudp6);
2218 			}
2219 #endif
2220 
2221 			/* Add EDNS0 and TSIG info if necessary.  */
2222 			query_add_optional(q, data->nsd);
2223 
2224 			buffer_flip(q->packet);
2225 			iovecs[i].iov_len = buffer_remaining(q->packet);
2226 #ifdef BIND8_STATS
2227 			/* Account the rcode & TC... */
2228 			STATUP2(data->nsd, rcode, RCODE(q->packet));
2229 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
2230 			if (TC(q->packet)) {
2231 				STATUP(data->nsd, truncated);
2232 				ZTATUP(data->nsd, q->zone, truncated);
2233 			}
2234 #endif /* BIND8_STATS */
2235 		} else {
2236 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2237 			iovecs[i].iov_len = buffer_remaining(q->packet);
2238 		swap_drop:
2239 			STATUP(data->nsd, dropped);
2240 			ZTATUP(data->nsd, q->zone, dropped);
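			/* keep messages that still need an answer in a
			 * contiguous prefix of msgs[], so sendmmsg(2) below
			 * can send msgs[0..recvcount-1] in one batch: swap
			 * this dropped slot to the tail, shrink recvcount,
			 * and re-examine the entry swapped into slot i */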
2241 			if(i != recvcount-1) {
2242 				/* swap with last and decrease recvcount */
2243 				struct mmsghdr mtmp = msgs[i];
2244 				struct iovec iotmp = iovecs[i];
2245 				recvcount--;
2246 				msgs[i] = msgs[recvcount];
2247 				iovecs[i] = iovecs[recvcount];
2248 				queries[i] = queries[recvcount];
2249 				msgs[recvcount] = mtmp;
2250 				iovecs[recvcount] = iotmp;
2251 				queries[recvcount] = q;
2252 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
2253 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
2254 				goto loopstart;
2255 			} else { recvcount --; }
2256 		}
2257 	}
2258 
2259 	/* send until all are sent */
2260 	i = 0;
2261 	while(i<recvcount) {
2262 		sent = sendmmsg(fd, &msgs[i], recvcount-i, 0);
2263 		if(sent == -1) {
2264 			const char* es = strerror(errno);
2265 			char a[48];
2266 			addr2str(&queries[i]->addr, a, sizeof(a));
2267 			log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
2268 #ifdef BIND8_STATS
2269 			data->nsd->st.txerr += recvcount-i;
2270 #endif /* BIND8_STATS */
2271 			break;
2272 		}
2273 		i += sent;
2274 	}
2275 	for(i=0; i<recvcount; i++) {
2276 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2277 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
2278 	}
2279 }
2280 
2281 #else /* defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG) */
2282 
2283 static void
2284 handle_udp(int fd, short event, void* arg)
2285 {
2286 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
2287 	int received, sent;
2288 #ifndef NONBLOCKING_IS_BROKEN
2289 #ifdef HAVE_RECVMMSG
2290 	int recvcount;
2291 #endif /* HAVE_RECVMMSG */
2292 	int i;
2293 #endif /* NONBLOCKING_IS_BROKEN */
2294 	struct query *q;
2295 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2296 	q = data->query;
2297 #endif
2298 
2299 	if (!(event & EV_READ)) {
2300 		return;
2301 	}
2302 #ifndef NONBLOCKING_IS_BROKEN
2303 #ifdef HAVE_RECVMMSG
2304 	recvcount = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
2305 	/* this printf strangely gave a performance increase on Linux */
2306 	/* printf("recvcount %d \n", recvcount); */
2307 	if (recvcount == -1) {
2308 		if (errno != EAGAIN && errno != EINTR) {
2309 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
2310 			STATUP(data->nsd, rxerr);
2311 			/* No zone statup */
2312 		}
2313 		/* Simply no data available */
2314 		return;
2315 	}
2316 	for (i = 0; i < recvcount; i++) {
2317 		received = msgs[i].msg_len;
2318 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2319 		if (received == -1) {
2320 			log_msg(LOG_ERR, "recvmmsg failed");
2321 			STATUP(data->nsd, rxerr);
2322 			/* No zone statup */
2323 			/* the error can be found in msgs[i].msg_hdr.msg_flags */
2324 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2325 			continue;
2326 		}
2327 		q = queries[i];
2328 #else
2329 	for(i=0; i<NUM_RECV_PER_SELECT; i++) {
2330 #endif /* HAVE_RECVMMSG */
2331 #endif /* NONBLOCKING_IS_BROKEN */
2332 
2333 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2334 		/* Initialize the query... */
2335 		query_reset(q, UDP_MAX_MESSAGE_LEN, 0);
2336 
2337 		received = recvfrom(fd,
2338 				    buffer_begin(q->packet),
2339 				    buffer_remaining(q->packet),
2340 				    0,
2341 				    (struct sockaddr *)&q->addr,
2342 				    &q->addrlen);
2343 		if (received == -1) {
2344 			if (errno != EAGAIN && errno != EINTR) {
2345 				log_msg(LOG_ERR, "recvfrom failed: %s", strerror(errno));
2346 				STATUP(data->nsd, rxerr);
2347 				/* No zone statup */
2348 			}
2349 			return;
2350 		}
2351 #endif /* NONBLOCKING_IS_BROKEN || !HAVE_RECVMMSG */
2352 
2353 		/* Account... */
2354 		if (data->socket->fam == AF_INET) {
2355 			STATUP(data->nsd, qudp);
2356 		} else if (data->socket->fam == AF_INET6) {
2357 			STATUP(data->nsd, qudp6);
2358 		}
2359 
2360 		buffer_skip(q->packet, received);
2361 		buffer_flip(q->packet);
2362 
2363 		/* Process and answer the query... */
2364 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
2365 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
2366 				STATUP(data->nsd, nona);
2367 				ZTATUP(data->nsd, q->zone, nona);
2368 			}
2369 
2370 #ifdef USE_ZONE_STATS
2371 			if (data->socket->fam == AF_INET) {
2372 				ZTATUP(data->nsd, q->zone, qudp);
2373 			} else if (data->socket->fam == AF_INET6) {
2374 				ZTATUP(data->nsd, q->zone, qudp6);
2375 			}
2376 #endif
2377 
2378 			/* Add EDNS0 and TSIG info if necessary.  */
2379 			query_add_optional(q, data->nsd);
2380 
2381 			buffer_flip(q->packet);
2382 
2383 			sent = sendto(fd,
2384 				      buffer_begin(q->packet),
2385 				      buffer_remaining(q->packet),
2386 				      0,
2387 				      (struct sockaddr *) &q->addr,
2388 				      q->addrlen);
2389 			if (sent == -1) {
2390 				const char* es = strerror(errno);
2391 				char a[48];
2392 				addr2str(&q->addr, a, sizeof(a));
2393 				log_msg(LOG_ERR, "sendto %s failed: %s", a, es);
2394 				STATUP(data->nsd, txerr);
2395 				ZTATUP(data->nsd, q->zone, txerr);
2396 			} else if ((size_t) sent != buffer_remaining(q->packet)) {
2397 				log_msg(LOG_ERR, "sent %d in place of %d bytes", sent, (int) buffer_remaining(q->packet));
2398 			} else {
2399 #ifdef BIND8_STATS
2400 				/* Account the rcode & TC... */
2401 				STATUP2(data->nsd, rcode, RCODE(q->packet));
2402 				ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
2403 				if (TC(q->packet)) {
2404 					STATUP(data->nsd, truncated);
2405 					ZTATUP(data->nsd, q->zone, truncated);
2406 				}
2407 #endif /* BIND8_STATS */
2408 			}
2409 		} else {
2410 			STATUP(data->nsd, dropped);
2411 			ZTATUP(data->nsd, q->zone, dropped);
2412 		}
2413 #ifndef NONBLOCKING_IS_BROKEN
2414 #ifdef HAVE_RECVMMSG
2415 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2416 #endif
2417 	}
2418 #endif
2419 }
2420 #endif /* defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG) */
2421 
2422 
2423 static void
2424 cleanup_tcp_handler(struct tcp_handler_data* data)
2425 {
2426 	event_del(&data->event);
2427 	close(data->event.ev_fd);
2428 
2429 	/*
2430 	 * Enable the TCP accept handlers when the current number of
2431 	 * TCP connections is about to drop below the maximum number
2432 	 * of TCP connections.
2433 	 */
2434 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
2435 		configure_handler_event_types(EV_READ|EV_PERSIST);
2436 		if(slowaccept) {
2437 			event_del(&slowaccept_event);
2438 			slowaccept = 0;
2439 		}
2440 	}
2441 	--data->nsd->current_tcp_count;
2442 	assert(data->nsd->current_tcp_count >= 0);
2443 
2444 	region_destroy(data->region);
2445 }
2446 
2447 static void
2448 handle_tcp_reading(int fd, short event, void* arg)
2449 {
2450 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
2451 	ssize_t received;
2452 	struct event_base* ev_base;
2453 	struct timeval timeout;
2454 
2455 	if ((event & EV_TIMEOUT)) {
2456 		/* Connection timed out.  */
2457 		cleanup_tcp_handler(data);
2458 		return;
2459 	}
2460 
2461 	if (data->nsd->tcp_query_count > 0 &&
2462 		data->query_count >= data->nsd->tcp_query_count) {
2463 		/* No more queries allowed on this tcp connection.  */
2464 		cleanup_tcp_handler(data);
2465 		return;
2466 	}
2467 
2468 	assert((event & EV_READ));
2469 
2470 	if (data->bytes_transmitted == 0) {
2471 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
2472 	}
2473 
2474 	/*
2475 	 * Check if we received the leading packet length bytes yet.
2476 	 */
2477 	if (data->bytes_transmitted < sizeof(uint16_t)) {
2478 		received = read(fd,
2479 				(char *) &data->query->tcplen
2480 				+ data->bytes_transmitted,
2481 				sizeof(uint16_t) - data->bytes_transmitted);
2482 		if (received == -1) {
2483 			if (errno == EAGAIN || errno == EINTR) {
2484 				/*
2485 				 * Read would block, wait until more
2486 				 * data is available.
2487 				 */
2488 				return;
2489 			} else {
2490 				char buf[48];
2491 				addr2str(&data->query->addr, buf, sizeof(buf));
2492 #ifdef ECONNRESET
2493 				if (verbosity >= 2 || errno != ECONNRESET)
2494 #endif /* ECONNRESET */
2495 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
2496 				cleanup_tcp_handler(data);
2497 				return;
2498 			}
2499 		} else if (received == 0) {
2500 			/* EOF */
2501 			cleanup_tcp_handler(data);
2502 			return;
2503 		}
2504 
2505 		data->bytes_transmitted += received;
2506 		if (data->bytes_transmitted < sizeof(uint16_t)) {
2507 			/*
2508 			 * Not done with the tcplen yet, wait for more
2509 			 * data to become available.
2510 			 */
2511 			return;
2512 		}
2513 
2514 		assert(data->bytes_transmitted == sizeof(uint16_t));
2515 
2516 		data->query->tcplen = ntohs(data->query->tcplen);
2517 
2518 		/*
2519 		 * Minimum query size is 17 octets:
2520 		 *
2521 		 *     Size of the header (12)
2522 		 *   + Root domain name   (1)
2523 		 *   + Query class        (2)
2524 		 *   + Query type         (2)
2525 		 */
2526 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
2527 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
2528 			cleanup_tcp_handler(data);
2529 			return;
2530 		}
2531 
2532 		if (data->query->tcplen > data->query->maxlen) {
2533 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
2534 			cleanup_tcp_handler(data);
2535 			return;
2536 		}
2537 
2538 		buffer_set_limit(data->query->packet, data->query->tcplen);
2539 	}
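	/*
	 * The two leading octets parsed above are the standard DNS-over-TCP
	 * framing (RFC 1035, section 4.2.2): each message is preceded by
	 * its length in network byte order.  A client frames its query the
	 * same way; an illustrative sketch (not nsd code, error handling
	 * omitted):
	 *
	 *	uint16_t len = htons(qlen);    // qlen: query length in octets
	 *	write(fd, &len, sizeof(len));  // two-octet length prefix
	 *	write(fd, qbuf, qlen);         // then the message itself
	 */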
2540 
2541 	assert(buffer_remaining(data->query->packet) > 0);
2542 
2543 	/* Read the (remaining) query data.  */
2544 	received = read(fd,
2545 			buffer_current(data->query->packet),
2546 			buffer_remaining(data->query->packet));
2547 	if (received == -1) {
2548 		if (errno == EAGAIN || errno == EINTR) {
2549 			/*
2550 			 * Read would block, wait until more data is
2551 			 * available.
2552 			 */
2553 			return;
2554 		} else {
2555 			char buf[48];
2556 			addr2str(&data->query->addr, buf, sizeof(buf));
2557 #ifdef ECONNRESET
2558 			if (verbosity >= 2 || errno != ECONNRESET)
2559 #endif /* ECONNRESET */
2560 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
2561 			cleanup_tcp_handler(data);
2562 			return;
2563 		}
2564 	} else if (received == 0) {
2565 		/* EOF */
2566 		cleanup_tcp_handler(data);
2567 		return;
2568 	}
2569 
2570 	data->bytes_transmitted += received;
2571 	buffer_skip(data->query->packet, received);
2572 	if (buffer_remaining(data->query->packet) > 0) {
2573 		/*
2574 		 * Message not yet complete, wait for more data to
2575 		 * become available.
2576 		 */
2577 		return;
2578 	}
2579 
2580 	assert(buffer_position(data->query->packet) == data->query->tcplen);
2581 
2582 	/* Account... */
2583 #ifdef BIND8_STATS
2584 #ifndef INET6
2585 	STATUP(data->nsd, ctcp);
2586 #else
2587 	if (data->query->addr.ss_family == AF_INET) {
2588 		STATUP(data->nsd, ctcp);
2589 	} else if (data->query->addr.ss_family == AF_INET6) {
2590 		STATUP(data->nsd, ctcp6);
2591 	}
2592 #endif
2593 #endif /* BIND8_STATS */
2594 
2595 	/* We have a complete query, process it.  */
2596 
2597 	/* tcp-query-count: increment the per-connection query counter */
2598 	data->query_count++;
2599 
2600 	buffer_flip(data->query->packet);
2601 	data->query_state = server_process_query(data->nsd, data->query);
2602 	if (data->query_state == QUERY_DISCARDED) {
2603 		/* Drop the packet and the entire connection... */
2604 		STATUP(data->nsd, dropped);
2605 		ZTATUP(data->nsd, data->query->zone, dropped);
2606 		cleanup_tcp_handler(data);
2607 		return;
2608 	}
2609 
2610 #ifdef BIND8_STATS
2611 	if (RCODE(data->query->packet) == RCODE_OK
2612 	    && !AA(data->query->packet))
2613 	{
2614 		STATUP(data->nsd, nona);
2615 		ZTATUP(data->nsd, data->query->zone, nona);
2616 	}
2617 #endif /* BIND8_STATS */
2618 
2619 #ifdef USE_ZONE_STATS
2620 #ifndef INET6
2621 	ZTATUP(data->nsd, data->query->zone, ctcp);
2622 #else
2623 	if (data->query->addr.ss_family == AF_INET) {
2624 		ZTATUP(data->nsd, data->query->zone, ctcp);
2625 	} else if (data->query->addr.ss_family == AF_INET6) {
2626 		ZTATUP(data->nsd, data->query->zone, ctcp6);
2627 	}
2628 #endif
2629 #endif /* USE_ZONE_STATS */
2630 
2631 	query_add_optional(data->query, data->nsd);
2632 
2633 	/* Switch to the tcp write handler.  */
2634 	buffer_flip(data->query->packet);
2635 	data->query->tcplen = buffer_remaining(data->query->packet);
2636 	data->bytes_transmitted = 0;
2637 
2638 	timeout.tv_sec = data->tcp_timeout / 1000;
2639 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2640 
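	/* note: an added struct event must be event_del()ed before
	 * event_set() may be called on it again; hence the re-arm dance
	 * below when switching this connection to the write handler */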
2641 	ev_base = data->event.ev_base;
2642 	event_del(&data->event);
2643 	event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
2644 		handle_tcp_writing, data);
2645 	if(event_base_set(ev_base, &data->event) != 0)
2646 		log_msg(LOG_ERR, "event base set tcpr failed");
2647 	if(event_add(&data->event, &timeout) != 0)
2648 		log_msg(LOG_ERR, "event add tcpr failed");
2649 	/* see if we can write the answer right away (usually we can; EAGAIN if not) */
2650 	handle_tcp_writing(fd, EV_WRITE, data);
2651 }
2652 
2653 static void
2654 handle_tcp_writing(int fd, short event, void* arg)
2655 {
2656 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
2657 	ssize_t sent;
2658 	struct query *q = data->query;
2659 	struct timeval timeout;
2660 	struct event_base* ev_base;
2661 
2662 	if ((event & EV_TIMEOUT)) {
2663 		/* Connection timed out.  */
2664 		cleanup_tcp_handler(data);
2665 		return;
2666 	}
2667 
2668 	assert((event & EV_WRITE));
2669 
2670 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
2671 		/* Writing the response packet length.  */
2672 		uint16_t n_tcplen = htons(q->tcplen);
2673 #ifdef HAVE_WRITEV
2674 		struct iovec iov[2];
2675 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
2676 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
2677 		iov[1].iov_base = buffer_begin(q->packet);
2678 		iov[1].iov_len = buffer_limit(q->packet);
2679 		sent = writev(fd, iov, 2);
2680 #else /* HAVE_WRITEV */
2681 		sent = write(fd,
2682 			     (const char *) &n_tcplen + data->bytes_transmitted,
2683 			     sizeof(n_tcplen) - data->bytes_transmitted);
2684 #endif /* HAVE_WRITEV */
2685 		if (sent == -1) {
2686 			if (errno == EAGAIN || errno == EINTR) {
2687 				/*
2688 				 * Write would block, wait until
2689 				 * socket becomes writable again.
2690 				 */
2691 				return;
2692 			} else {
2693 #ifdef ECONNRESET
2694 				if(verbosity >= 2 || errno != ECONNRESET)
2695 #endif /* ECONNRESET */
2696 #ifdef EPIPE
2697 				  if(verbosity >= 2 || errno != EPIPE)
2698 #endif /* EPIPE 'broken pipe' */
2699 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
2700 				cleanup_tcp_handler(data);
2701 				return;
2702 			}
2703 		}
2704 
2705 		data->bytes_transmitted += sent;
2706 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
2707 			/*
2708 			 * Writing not complete, wait until socket
2709 			 * becomes writable again.
2710 			 */
2711 			return;
2712 		}
2713 
2714 #ifdef HAVE_WRITEV
2715 		sent -= sizeof(n_tcplen);
2716 		/* writev may also have written packet data; check if the packet is done */
2717 		goto packet_could_be_done;
2718 #endif
2719  	}
2720 
2721 	sent = write(fd,
2722 		     buffer_current(q->packet),
2723 		     buffer_remaining(q->packet));
2724 	if (sent == -1) {
2725 		if (errno == EAGAIN || errno == EINTR) {
2726 			/*
2727 			 * Write would block, wait until
2728 			 * socket becomes writable again.
2729 			 */
2730 			return;
2731 		} else {
2732 #ifdef ECONNRESET
2733 			if(verbosity >= 2 || errno != ECONNRESET)
2734 #endif /* ECONNRESET */
2735 #ifdef EPIPE
2736 				  if(verbosity >= 2 || errno != EPIPE)
2737 #endif /* EPIPE 'broken pipe' */
2738 			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
2739 			cleanup_tcp_handler(data);
2740 			return;
2741 		}
2742 	}
2743 
2744 	data->bytes_transmitted += sent;
2745 #ifdef HAVE_WRITEV
2746   packet_could_be_done:
2747 #endif
2748 	buffer_skip(q->packet, sent);
2749 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
2750 		/*
2751 		 * Still more data to write when socket becomes
2752 		 * writable again.
2753 		 */
2754 		return;
2755 	}
2756 
2757 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
2758 
2759 	if (data->query_state == QUERY_IN_AXFR) {
2760 		/* Continue processing AXFR and writing back results.  */
2761 		buffer_clear(q->packet);
2762 		data->query_state = query_axfr(data->nsd, q);
2763 		if (data->query_state != QUERY_PROCESSED) {
2764 			query_add_optional(data->query, data->nsd);
2765 
2766 			/* Reset data. */
2767 			buffer_flip(q->packet);
2768 			q->tcplen = buffer_remaining(q->packet);
2769 			data->bytes_transmitted = 0;
2770 			/* Reset timeout.  */
2771 			timeout.tv_sec = data->tcp_timeout / 1000;
2772 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2773 			ev_base = data->event.ev_base;
2774 			event_del(&data->event);
2775 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
2776 				handle_tcp_writing, data);
2777 			if(event_base_set(ev_base, &data->event) != 0)
2778 				log_msg(LOG_ERR, "event base set tcpw failed");
2779 			if(event_add(&data->event, &timeout) != 0)
2780 				log_msg(LOG_ERR, "event add tcpw failed");
2781 
2782 			/*
2783 			 * Write data if/when the socket is writable
2784 			 * again.
2785 			 */
2786 			return;
2787 		}
2788 	}
2789 
2790 	/*
2791 	 * Done sending, wait for the next request to arrive on the
2792 	 * TCP socket by installing the TCP read handler.
2793 	 */
2794 	if (data->nsd->tcp_query_count > 0 &&
2795 		data->query_count >= data->nsd->tcp_query_count) {
2796 
2797 		(void) shutdown(fd, SHUT_WR);
2798 	}
2799 
2800 	data->bytes_transmitted = 0;
2801 
2802 	timeout.tv_sec = data->tcp_timeout / 1000;
2803 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2804 	ev_base = data->event.ev_base;
2805 	event_del(&data->event);
2806 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
2807 		handle_tcp_reading, data);
2808 	if(event_base_set(ev_base, &data->event) != 0)
2809 		log_msg(LOG_ERR, "event base set tcpw failed");
2810 	if(event_add(&data->event, &timeout) != 0)
2811 		log_msg(LOG_ERR, "event add tcpw failed");
2812 }
2813 
2814 
2815 static void
2816 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
2817 	void* ATTR_UNUSED(arg))
2818 {
2819 	if(slowaccept) {
2820 		configure_handler_event_types(EV_PERSIST | EV_READ);
2821 		slowaccept = 0;
2822 	}
2823 }
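/*
 * Together with handle_tcp_accept() below this implements a backoff for
 * descriptor exhaustion: when accept(2) fails with EMFILE/ENFILE, all
 * accept events are disabled and a one-shot SLOW_ACCEPT_TIMEOUT timer is
 * armed; this callback re-enables accepting when the timer fires.
 * Retrying immediately would just busy-loop while no descriptors are free.
 */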
2824 
2825 /*
2826  * Handle an incoming TCP connection.  The connection is accepted and
2827  * a new TCP reader event handler is added.  The TCP handler
2828  * is responsible for cleanup when the connection is closed.
2829  */
2830 static void
2831 handle_tcp_accept(int fd, short event, void* arg)
2832 {
2833 	struct tcp_accept_handler_data *data
2834 		= (struct tcp_accept_handler_data *) arg;
2835 	int s;
2836 	struct tcp_handler_data *tcp_data;
2837 	region_type *tcp_region;
2838 #ifdef INET6
2839 	struct sockaddr_storage addr;
2840 #else
2841 	struct sockaddr_in addr;
2842 #endif
2843 	socklen_t addrlen;
2844 	struct timeval timeout;
2845 
2846 	if (!(event & EV_READ)) {
2847 		return;
2848 	}
2849 
2850 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
2851 		return;
2852 	}
2853 
2854 	/* Accept it... */
2855 	addrlen = sizeof(addr);
2856 	s = accept(fd, (struct sockaddr *) &addr, &addrlen);
2857 	if (s == -1) {
2858 		/**
2859 		 * EMFILE and ENFILE signal that the limit on open
2860 		 * file descriptors has been reached; pause accept().
2861 		 * EINTR is a signal interrupt. The others are various OS ways
2862 		 * of saying that the client has closed the connection.
2863 		 */
2864 		if (errno == EMFILE || errno == ENFILE) {
2865 			if (!slowaccept) {
2866 				/* disable accept events */
2867 				struct timeval tv;
2868 				configure_handler_event_types(0);
2869 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
2870 				tv.tv_usec = 0L;
2871 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
2872 					handle_slowaccept_timeout, NULL);
2873 				(void)event_base_set(data->event.ev_base,
2874 					&slowaccept_event);
2875 				(void)event_add(&slowaccept_event, &tv);
2876 				slowaccept = 1;
2877 				/* We don't want to spam the logs here */
2878 			}
2879 		} else if (errno != EINTR
2880 			&& errno != EWOULDBLOCK
2881 #ifdef ECONNABORTED
2882 			&& errno != ECONNABORTED
2883 #endif /* ECONNABORTED */
2884 #ifdef EPROTO
2885 			&& errno != EPROTO
2886 #endif /* EPROTO */
2887 			) {
2888 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
2889 		}
2890 		return;
2891 	}
2892 
2893 	if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
2894 		log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
2895 		close(s);
2896 		return;
2897 	}
2898 
2899 	/*
2900 	 * This region is deallocated when the TCP connection is
2901 	 * closed by the TCP handler.
2902 	 */
2903 	tcp_region = region_create(xalloc, free);
2904 	tcp_data = (struct tcp_handler_data *) region_alloc(
2905 		tcp_region, sizeof(struct tcp_handler_data));
2906 	tcp_data->region = tcp_region;
2907 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
2908 		compression_table_size);
2909 	tcp_data->nsd = data->nsd;
2910 	tcp_data->query_count = 0;
2911 
2912 	tcp_data->query_state = QUERY_PROCESSED;
2913 	tcp_data->bytes_transmitted = 0;
2914 	memcpy(&tcp_data->query->addr, &addr, addrlen);
2915 	tcp_data->query->addrlen = addrlen;
2916 
2917 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
2918 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
2919 		/* very busy, give smaller timeout */
2920 		tcp_data->tcp_timeout = 200;
2921 	}
2922 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
2923 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
2924 
2925 	event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
2926 		handle_tcp_reading, tcp_data);
2927 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
2928 		log_msg(LOG_ERR, "cannot set tcp event base");
2929 		close(s);
2930 		region_destroy(tcp_region);
2931 		return;
2932 	}
2933 	if(event_add(&tcp_data->event, &timeout) != 0) {
2934 		log_msg(LOG_ERR, "cannot add tcp to event base");
2935 		close(s);
2936 		region_destroy(tcp_region);
2937 		return;
2938 	}
2939 
2940 	/*
2941 	 * Keep track of the total number of TCP handlers installed so
2942 	 * we can stop accepting connections when the maximum number
2943 	 * of simultaneous TCP connections is reached.
2944 	 */
2945 	++data->nsd->current_tcp_count;
2946 	if (data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
2947 		configure_handler_event_types(0);
2948 	}
2949 }
2950 
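/*
 * Write a command to every child server over its IPC channel.  If
 * timeout is nonzero, also wait (up to timeout seconds) for each child
 * to write back an acknowledgement before closing the channel.
 */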
2951 static void
2952 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
2953 {
2954 	size_t i;
2955 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
2956 	for (i = 0; i < nsd->child_count; ++i) {
2957 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
2958 			if (write(nsd->children[i].child_fd,
2959 				&command,
2960 				sizeof(command)) == -1)
2961 			{
2962 				if(errno != EAGAIN && errno != EINTR)
2963 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
2964 					(int) command,
2965 					(int) nsd->children[i].pid,
2966 					strerror(errno));
2967 			} else if (timeout > 0) {
2968 				(void)block_read(NULL,
2969 					nsd->children[i].child_fd,
2970 					&command, sizeof(command), timeout);
2971 			}
2972 			fsync(nsd->children[i].child_fd);
2973 			close(nsd->children[i].child_fd);
2974 			nsd->children[i].child_fd = -1;
2975 		}
2976 	}
2977 }
2978 
2979 static void
2980 send_children_quit(struct nsd* nsd)
2981 {
2982 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
2983 	send_children_command(nsd, NSD_QUIT, 0);
2984 }
2985 
2986 static void
2987 send_children_quit_and_wait(struct nsd* nsd)
2988 {
2989 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
2990 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
2991 }
2992 
2993 #ifdef BIND8_STATS
2994 static void
2995 set_children_stats(struct nsd* nsd)
2996 {
2997 	size_t i;
2998 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
2999 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
3000 	for (i = 0; i < nsd->child_count; ++i) {
3001 		nsd->children[i].need_to_send_STATS = 1;
3002 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
3003 	}
3004 }
3005 #endif /* BIND8_STATS */
3006 
3007 static void
3008 configure_handler_event_types(short event_types)
3009 {
3010 	size_t i;
3011 
3012 	for (i = 0; i < tcp_accept_handler_count; ++i) {
3013 		struct event* handler = &tcp_accept_handlers[i].event;
3014 		if(event_types) {
3015 			/* reassign */
3016 			int fd = handler->ev_fd;
3017 			struct event_base* base = handler->ev_base;
3018 			if(tcp_accept_handlers[i].event_added)
3019 				event_del(handler);
3020 			event_set(handler, fd, event_types,
3021 				handle_tcp_accept, &tcp_accept_handlers[i]);
3022 			if(event_base_set(base, handler) != 0)
3023 				log_msg(LOG_ERR, "conhand: cannot event_base");
3024 			if(event_add(handler, NULL) != 0)
3025 				log_msg(LOG_ERR, "conhand: cannot event_add");
3026 			tcp_accept_handlers[i].event_added = 1;
3027 		} else {
3028 			/* remove */
3029 			if(tcp_accept_handlers[i].event_added) {
3030 				event_del(handler);
3031 				tcp_accept_handlers[i].event_added = 0;
3032 			}
3033 		}
3034 	}
3035 }
3036