xref: /openbsd-src/usr.sbin/nsd/server.c (revision 0b7734b3d77bb9b21afec6f4621cae6c805dbd45)
1 /*
2  * server.c -- nsd(8) network input/output
3  *
4  * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
5  *
6  * See LICENSE for the license.
7  *
8  */
9 
10 #include "config.h"
11 
12 #include <sys/types.h>
13 #include <sys/param.h>
14 #include <sys/socket.h>
15 #include <sys/uio.h>
16 #include <sys/wait.h>
17 
18 #include <netinet/in.h>
19 #include <arpa/inet.h>
20 
21 #include <assert.h>
22 #include <ctype.h>
23 #include <errno.h>
24 #include <fcntl.h>
25 #include <stddef.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <time.h>
30 #include <unistd.h>
31 #include <signal.h>
32 #include <netdb.h>
33 #include <poll.h>
34 #ifndef SHUT_WR
35 #define SHUT_WR 1
36 #endif
37 #ifdef HAVE_MMAP
38 #include <sys/mman.h>
39 #endif /* HAVE_MMAP */
40 #include <openssl/rand.h>
41 #ifndef USE_MINI_EVENT
42 #  ifdef HAVE_EVENT_H
43 #    include <event.h>
44 #  else
45 #    include <event2/event.h>
46 #    include "event2/event_struct.h"
47 #    include "event2/event_compat.h"
48 #  endif
49 #else
50 #  include "mini_event.h"
51 #endif
52 
53 #include "axfr.h"
54 #include "namedb.h"
55 #include "netio.h"
56 #include "xfrd.h"
57 #include "xfrd-tcp.h"
58 #include "xfrd-disk.h"
59 #include "difffile.h"
60 #include "nsec3.h"
61 #include "ipc.h"
62 #include "udb.h"
63 #include "remote.h"
64 #include "lookup3.h"
65 #include "rrl.h"
66 
67 #define RELOAD_SYNC_TIMEOUT 25 /* seconds */
68 
69 /*
70  * Data for the UDP handlers.
71  */
72 struct udp_handler_data
73 {
74 	struct nsd        *nsd;
75 	struct nsd_socket *socket;
76 	query_type        *query;
77 };
78 
79 struct tcp_accept_handler_data {
80 	struct nsd         *nsd;
81 	struct nsd_socket  *socket;
82 	int event_added;
83 	struct event       event;
84 };
85 
86 /*
87  * These globals are used to enable the TCP accept handlers
88  * when the number of TCP connections drops below the maximum
89  * number of TCP connections.
90  */
91 static size_t		tcp_accept_handler_count;
92 static struct tcp_accept_handler_data*	tcp_accept_handlers;
93 
94 static struct event slowaccept_event;
95 static int slowaccept;
96 
97 #ifndef NONBLOCKING_IS_BROKEN
98 #  define NUM_RECV_PER_SELECT 100
99 #endif
100 
101 #if (!defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG))
102 struct mmsghdr msgs[NUM_RECV_PER_SELECT];
103 struct iovec iovecs[NUM_RECV_PER_SELECT];
104 struct query *queries[NUM_RECV_PER_SELECT];
105 #endif
106 
107 /*
108  * Data for the TCP connection handlers.
109  *
110  * The TCP handlers use non-blocking I/O.  This is necessary to avoid
111  * blocking the entire server on a slow TCP connection, but does make
112  * reading from and writing to the socket more complicated.
113  *
114  * Basically, whenever a read/write would block (indicated by
115  * EAGAIN in errno) we remember the position we were reading
116  * from/writing to and return from the TCP reading/writing event
117  * handler.  When the socket becomes readable/writable again we
118  * continue from the same position.
119  */
120 struct tcp_handler_data
121 {
122 	/*
123 	 * The region used to allocate all TCP connection related
124 	 * data, including this structure.  This region is destroyed
125 	 * when the connection is closed.
126 	 */
127 	region_type*		region;
128 
129 	/*
130 	 * The global nsd structure.
131 	 */
132 	struct nsd*			nsd;
133 
134 	/*
135 	 * The current query data for this TCP connection.
136 	 */
137 	query_type*			query;
138 
139 	/*
140 	 * The query_state is used to remember if we are performing an
141 	 * AXFR, if we're done processing, or if we should discard the
142 	 * query and connection.
143 	 */
144 	query_state_type	query_state;
145 
146 	/*
147 	 * The event for the file descriptor and tcp timeout
148 	 */
149 	struct event event;
150 
151 	/*
152 	 * The bytes_transmitted field is used to remember the number
153 	 * of bytes transmitted when receiving or sending a DNS
154 	 * packet.  The count includes the two additional bytes used
155 	 * to specify the packet length on a TCP connection.
156 	 */
157 	size_t				bytes_transmitted;
158 
159 	/*
160 	 * The number of queries handled by this specific TCP connection.
161 	 */
162 	int					query_count;
163 };
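/*
 * Illustrative sketch (not part of the original file) of the resume
 * pattern described above; `buf` and `total_len` are hypothetical
 * names for the message buffer and its full length:
 *
 *   ssize_t n = write(fd, buf + data->bytes_transmitted,
 *                     total_len - data->bytes_transmitted);
 *   if (n == -1 && errno == EAGAIN)
 *           return;                  /+ wait until fd is writable again +/
 *   data->bytes_transmitted += n;    /+ continue here on the next event +/
 *
 * (nested comment markers shown as /+ +/ to keep this block valid C)
 */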
164 
165 /*
166  * Handle incoming queries on the UDP server sockets.
167  */
168 static void handle_udp(int fd, short event, void* arg);
169 
170 /*
171  * Handle incoming connections on the TCP sockets.  These handlers
172  * usually wait for the NETIO_EVENT_READ event (indicating an incoming
173  * connection) but are disabled when the number of current TCP
174  * connections is equal to the maximum number of TCP connections.
175  * Disabling is done by changing the handler to wait for the
176  * NETIO_EVENT_NONE type.  This is done using the function
177  * configure_tcp_accept_handlers.
178  */
179 static void handle_tcp_accept(int fd, short event, void* arg);
180 
181 /*
182  * Handle incoming queries on a TCP connection.  The TCP connections
183  * are configured to be non-blocking and the handler may be called
184  * multiple times before a complete query is received.
185  */
186 static void handle_tcp_reading(int fd, short event, void* arg);
187 
188 /*
189  * Handle outgoing responses on a TCP connection.  The TCP connections
190  * are configured to be non-blocking and the handler may be called
191  * multiple times before a complete response is sent.
192  */
193 static void handle_tcp_writing(int fd, short event, void* arg);
194 
195 /*
196  * Send the quit command to all children without blocking, then close the pipes.
197  */
198 static void send_children_quit(struct nsd* nsd);
199 /* same, but at shutdown time; waits for the children to exit to avoid restart issues */
200 static void send_children_quit_and_wait(struct nsd* nsd);
201 
202 /* set children's flags to send NSD_STATS to them */
203 #ifdef BIND8_STATS
204 static void set_children_stats(struct nsd* nsd);
205 #endif /* BIND8_STATS */
206 
207 /*
208  * Change the event types the HANDLERS are interested in to EVENT_TYPES.
209  */
210 static void configure_handler_event_types(short event_types);
211 
212 static uint16_t *compressed_dname_offsets = 0;
213 static uint32_t compression_table_capacity = 0;
214 static uint32_t compression_table_size = 0;
215 
216 /*
217  * Remove the specified pid from the list of child pids.  Returns -1 if
218  * the pid is not in the list, the child number otherwise.  The child's pid field is set to 0.
219  */
220 static int
221 delete_child_pid(struct nsd *nsd, pid_t pid)
222 {
223 	size_t i;
224 	for (i = 0; i < nsd->child_count; ++i) {
225 		if (nsd->children[i].pid == pid) {
226 			nsd->children[i].pid = 0;
227 			if(!nsd->children[i].need_to_exit) {
228 				if(nsd->children[i].child_fd != -1)
229 					close(nsd->children[i].child_fd);
230 				nsd->children[i].child_fd = -1;
231 				if(nsd->children[i].handler)
232 					nsd->children[i].handler->fd = -1;
233 			}
234 			return i;
235 		}
236 	}
237 	return -1;
238 }
239 
240 /*
241  * Restart child servers if necessary.
242  */
243 static int
244 restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
245 	int* xfrd_sock_p)
246 {
247 	struct main_ipc_handler_data *ipc_data;
248 	size_t i;
249 	int sv[2];
250 
251 	/* Fork the child processes... */
252 	for (i = 0; i < nsd->child_count; ++i) {
253 		if (nsd->children[i].pid <= 0) {
254 			if (nsd->children[i].child_fd != -1)
255 				close(nsd->children[i].child_fd);
256 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
257 				log_msg(LOG_ERR, "socketpair: %s",
258 					strerror(errno));
259 				return -1;
260 			}
261 			nsd->children[i].child_fd = sv[0];
262 			nsd->children[i].parent_fd = sv[1];
263 			nsd->children[i].pid = fork();
264 			switch (nsd->children[i].pid) {
265 			default: /* SERVER MAIN */
266 				close(nsd->children[i].parent_fd);
267 				nsd->children[i].parent_fd = -1;
268 				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
269 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
270 				}
271 				if(!nsd->children[i].handler)
272 				{
273 					ipc_data = (struct main_ipc_handler_data*) region_alloc(
274 						region, sizeof(struct main_ipc_handler_data));
275 					ipc_data->nsd = nsd;
276 					ipc_data->child = &nsd->children[i];
277 					ipc_data->child_num = i;
278 					ipc_data->xfrd_sock = xfrd_sock_p;
279 					ipc_data->packet = buffer_create(region, QIOBUFSZ);
280 					ipc_data->forward_mode = 0;
281 					ipc_data->got_bytes = 0;
282 					ipc_data->total_bytes = 0;
283 					ipc_data->acl_num = 0;
284 					nsd->children[i].handler = (struct netio_handler*) region_alloc(
285 						region, sizeof(struct netio_handler));
286 					nsd->children[i].handler->fd = nsd->children[i].child_fd;
287 					nsd->children[i].handler->timeout = NULL;
288 					nsd->children[i].handler->user_data = ipc_data;
289 					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
290 					nsd->children[i].handler->event_handler = parent_handle_child_command;
291 					netio_add_handler(netio, nsd->children[i].handler);
292 				}
293 				/* clear any ongoing ipc */
294 				ipc_data = (struct main_ipc_handler_data*)
295 					nsd->children[i].handler->user_data;
296 				ipc_data->forward_mode = 0;
297 				/* restart - update fd */
298 				nsd->children[i].handler->fd = nsd->children[i].child_fd;
299 				break;
300 			case 0: /* CHILD */
301 				/* the child need not be able to access the
302 				 * nsd.db file */
303 				namedb_close_udb(nsd->db);
304 
305 				if (pledge("stdio rpath inet", NULL) == -1) {
306 					log_msg(LOG_ERR, "pledge");
307 					exit(1);
308 				}
309 
310 				nsd->pid = 0;
311 				nsd->child_count = 0;
312 				nsd->server_kind = nsd->children[i].kind;
313 				nsd->this_child = &nsd->children[i];
314 				nsd->this_child->child_num = i;
315 				/* remove signal flags inherited from parent
316 				   the parent will handle them. */
317 				nsd->signal_hint_reload_hup = 0;
318 				nsd->signal_hint_reload = 0;
319 				nsd->signal_hint_child = 0;
320 				nsd->signal_hint_quit = 0;
321 				nsd->signal_hint_shutdown = 0;
322 				nsd->signal_hint_stats = 0;
323 				nsd->signal_hint_statsusr = 0;
324 				close(*xfrd_sock_p);
325 				close(nsd->this_child->child_fd);
326 				nsd->this_child->child_fd = -1;
327 				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
328 					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
329 				}
330 				server_child(nsd);
331 				/* NOTREACHED */
332 				exit(0);
333 			case -1:
334 				log_msg(LOG_ERR, "fork failed: %s",
335 					strerror(errno));
336 				return -1;
337 			}
338 		}
339 	}
340 	return 0;
341 }
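/*
 * A minimal sketch of the socketpair+fork IPC pattern used above
 * (error handling omitted).  The parent keeps sv[0] as the child_fd,
 * the child keeps sv[1] as the parent_fd, and each side closes the
 * end it does not use:
 *
 *   int sv[2];
 *   socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *   switch (fork()) {
 *   case 0:  close(sv[0]); ... child talks on sv[1] ...;  break;
 *   default: close(sv[1]); ... parent talks on sv[0] ...; break;
 *   }
 */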
342 
343 #ifdef BIND8_STATS
344 static void set_bind8_alarm(struct nsd* nsd)
345 {
346 	/* resync so that the next alarm fires on the next whole period boundary */
347 	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
348 		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
349 }
350 #endif
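/*
 * Worked example of the resync above: with st.period == 60 and
 * time(NULL) % 60 == 45, alarm(15) is set, so the next SIGALRM fires
 * exactly on the next whole minute.
 */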
351 
352 /* set zone stat ids for zones initially read in */
353 static void
354 zonestatid_tree_set(struct nsd* nsd)
355 {
356 	struct radnode* n;
357 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
358 		zone_type* zone = (zone_type*)n->elem;
359 		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
360 	}
361 }
362 
363 #ifdef USE_ZONE_STATS
364 void
365 server_zonestat_alloc(struct nsd* nsd)
366 {
367 	size_t num = (nsd->options->zonestatnames->count==0?1:
368 			nsd->options->zonestatnames->count);
369 	size_t sz = sizeof(struct nsdst)*num;
370 	char tmpfile[256];
371 	uint8_t z = 0;
372 
373 	/* file names */
374 	nsd->zonestatfname[0] = 0;
375 	nsd->zonestatfname[1] = 0;
376 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
377 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
378 	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
379 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
380 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
381 	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);
382 
383 	/* file descriptors */
384 	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
385 	if(nsd->zonestatfd[0] == -1) {
386 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
387 			strerror(errno));
388 		exit(1);
389 	}
390 	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
391 	if(nsd->zonestatfd[1] == -1) {
392 		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
393 			strerror(errno));
394 		close(nsd->zonestatfd[0]);
395 		unlink(nsd->zonestatfname[0]);
396 		exit(1);
397 	}
398 
399 #ifdef HAVE_MMAP
400 	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
401 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
402 			strerror(errno));
403 		exit(1);
404 	}
405 	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
406 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
407 			nsd->zonestatfname[0], strerror(errno));
408 		exit(1);
409 	}
410 	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
411 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
412 			strerror(errno));
413 		exit(1);
414 	}
415 	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
416 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
417 			nsd->zonestatfname[1], strerror(errno));
418 		exit(1);
419 	}
420 	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
421 		MAP_SHARED, nsd->zonestatfd[0], 0);
422 	if(nsd->zonestat[0] == MAP_FAILED) {
423 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
424 		unlink(nsd->zonestatfname[0]);
425 		unlink(nsd->zonestatfname[1]);
426 		exit(1);
427 	}
428 	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
429 		MAP_SHARED, nsd->zonestatfd[1], 0);
430 	if(nsd->zonestat[1] == MAP_FAILED) {
431 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
432 		unlink(nsd->zonestatfname[0]);
433 		unlink(nsd->zonestatfname[1]);
434 		exit(1);
435 	}
436 	memset(nsd->zonestat[0], 0, sz);
437 	memset(nsd->zonestat[1], 0, sz);
438 	nsd->zonestatsize[0] = num;
439 	nsd->zonestatsize[1] = num;
440 	nsd->zonestatdesired = num;
441 	nsd->zonestatsizenow = num;
442 	nsd->zonestatnow = nsd->zonestat[0];
443 #endif /* HAVE_MMAP */
444 }
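/*
 * A short sketch of the extend-then-map idiom used above: the file is
 * grown to sz bytes by seeking to sz-1 and writing a single byte, and
 * only then mmap'ed; touching pages beyond end-of-file in a shared
 * mapping would otherwise fault.
 *
 *   lseek(fd, (off_t)sz - 1, SEEK_SET);
 *   write(fd, "", 1);
 *   p = mmap(NULL, sz, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
 */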
445 
446 void
447 zonestat_remap(struct nsd* nsd, int idx, size_t sz)
448 {
449 #ifdef HAVE_MMAP
450 #ifdef MREMAP_MAYMOVE
451 	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
452 		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
453 		MREMAP_MAYMOVE);
454 	if(nsd->zonestat[idx] == MAP_FAILED) {
455 		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
456 		exit(1);
457 	}
458 #else /* !MREMAP_MAYMOVE */
459 	if(msync(nsd->zonestat[idx],
460 		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
461 		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
462 	if(munmap(nsd->zonestat[idx],
463 		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
464 		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
465 	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
466 		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
467 	if(nsd->zonestat[idx] == MAP_FAILED) {
468 		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
469 		exit(1);
470 	}
471 #endif /* MREMAP_MAYMOVE */
472 #endif /* HAVE_MMAP */
473 }
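/*
 * Note on the fallback branch above: without MREMAP_MAYMOVE the old
 * mapping is msync'ed and munmap'ed, then the (already extended) file
 * is mapped again at the new size.  The counters survive because the
 * pages are MAP_SHARED and backed by the stat file.
 */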
474 
475 /* realloc the zonestat array that is not currently in use, to match
476  * the desired new size of the array (if applicable) */
477 void
478 server_zonestat_realloc(struct nsd* nsd)
479 {
480 #ifdef HAVE_MMAP
481 	uint8_t z = 0;
482 	size_t sz;
483 	int idx = 0; /* index of the zonestat array that is not in use */
484 	if(nsd->zonestatnow == nsd->zonestat[0])
485 		idx = 1;
486 	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
487 		return;
488 	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
489 	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
490 		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
491 			strerror(errno));
492 		exit(1);
493 	}
494 	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
495 		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
496 			nsd->zonestatfname[idx], strerror(errno));
497 		exit(1);
498 	}
499 	zonestat_remap(nsd, idx, sz);
500 	/* zero the newly allocated region */
501 	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
502 		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
503 			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
504 			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
505 	}
506 	nsd->zonestatsize[idx] = nsd->zonestatdesired;
507 #endif /* HAVE_MMAP */
508 }
509 
510 /* switch over to the other array for the new children, which briefly
511  * coexist with the old children; this avoids both generations writing
512  * to the same statistics array. */
513 void
514 server_zonestat_switch(struct nsd* nsd)
515 {
516 	if(nsd->zonestatnow == nsd->zonestat[0]) {
517 		nsd->zonestatnow = nsd->zonestat[1];
518 		nsd->zonestatsizenow = nsd->zonestatsize[1];
519 	} else {
520 		nsd->zonestatnow = nsd->zonestat[0];
521 		nsd->zonestatsizenow = nsd->zonestatsize[0];
522 	}
523 }
524 #endif /* USE_ZONE_STATS */
525 
526 static void
527 cleanup_dname_compression_tables(void *ptr)
528 {
529 	free(ptr);
530 	compressed_dname_offsets = NULL;
531 	compression_table_capacity = 0;
532 }
533 
534 static void
535 initialize_dname_compression_tables(struct nsd *nsd)
536 {
537 	size_t needed = domain_table_count(nsd->db->domains) + 1;
538 	needed += EXTRA_DOMAIN_NUMBERS;
539 	if(compression_table_capacity < needed) {
540 		if(compressed_dname_offsets) {
541 			region_remove_cleanup(nsd->db->region,
542 				cleanup_dname_compression_tables,
543 				compressed_dname_offsets);
544 			free(compressed_dname_offsets);
545 		}
546 		compressed_dname_offsets = (uint16_t *) xmallocarray(
547 			needed, sizeof(uint16_t));
548 		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
549 			compressed_dname_offsets);
550 		compression_table_capacity = needed;
551 		compression_table_size=domain_table_count(nsd->db->domains)+1;
552 	}
553 	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
554 	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
555 }
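/*
 * Note on the sentinel above: QHEADERSZ is the 12-octet DNS header
 * size, so entry 0 makes compression pointers target the query name,
 * which always starts directly after the header in the packet.
 */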
556 
557 /* create and bind sockets.  */
558 static int
559 server_init_ifs(struct nsd *nsd, size_t from, size_t to, int* reuseport_works)
560 {
561 	struct addrinfo* addr;
562 	size_t i;
563 #if defined(SO_REUSEPORT) || defined(SO_REUSEADDR) || (defined(INET6) && (defined(IPV6_V6ONLY) || defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU) || defined(IP_TRANSPARENT)) || defined(IP_FREEBIND))
564 	int on = 1;
565 #endif
566 
567 	/* UDP */
568 
569 	/* Make a socket... */
570 	for (i = from; i < to; i++) {
571 		/* for reuseport, copy the socket specs of the first entries */
572 		addr = nsd->udp[i%nsd->ifs].addr;
573 		if (!addr) {
574 			nsd->udp[i].s = -1;
575 			continue;
576 		}
577 		nsd->udp[i].fam = (int)addr->ai_family;
578 		if ((nsd->udp[i].s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
579 #if defined(INET6)
580 			if (addr->ai_family == AF_INET6 &&
581 				errno == EAFNOSUPPORT && nsd->grab_ip6_optional) {
582 				log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: not supported");
583 				continue;
584 			}
585 #endif /* INET6 */
586 			log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
587 			return -1;
588 		}
589 
590 #ifdef SO_REUSEPORT
591 		if(nsd->reuseport && *reuseport_works &&
592 			setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_REUSEPORT,
593 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
594 			if(verbosity >= 3
595 #ifdef ENOPROTOOPT
596 				|| errno != ENOPROTOOPT
597 #endif
598 				)
599 			    log_msg(LOG_ERR, "setsockopt(..., SO_REUSEPORT, "
600 				"...) failed: %s", strerror(errno));
601 			*reuseport_works = 0;
602 		}
603 #else
604 		(void)reuseport_works;
605 #endif /* SO_REUSEPORT */
606 #if defined(SO_RCVBUF) || defined(SO_SNDBUF)
607 	if(1) {
608 	int rcv = 1*1024*1024;
609 	int snd = 1*1024*1024;
610 
611 #ifdef SO_RCVBUF
612 #  ifdef SO_RCVBUFFORCE
613 	if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv,
614 		(socklen_t)sizeof(rcv)) < 0) {
615 		if(errno != EPERM && errno != ENOBUFS) {
616 			log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, "
617                                         "...) failed: %s", strerror(errno));
618 			return -1;
619 		}
620 #  else
621 	if(1) {
622 #  endif /* SO_RCVBUFFORCE */
623 		if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv,
624 			 (socklen_t)sizeof(rcv)) < 0) {
625 			if(errno != ENOBUFS && errno != ENOSYS) {
626 				log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, "
627                                         "...) failed: %s", strerror(errno));
628 				return -1;
629 			}
630 		}
631 	}
632 #endif /* SO_RCVBUF */
633 
634 #ifdef SO_SNDBUF
635 #  ifdef SO_SNDBUFFORCE
636 	if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd,
637 		(socklen_t)sizeof(snd)) < 0) {
638 		if(errno != EPERM && errno != ENOBUFS) {
639 			log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, "
640                                         "...) failed: %s", strerror(errno));
641 			return -1;
642 		}
643 #  else
644 	if(1) {
645 #  endif /* SO_SNDBUFFORCE */
646 		if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_SNDBUF, (void*)&snd,
647 			 (socklen_t)sizeof(snd)) < 0) {
648 			if(errno != ENOBUFS && errno != ENOSYS) {
649 				log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, "
650                                         "...) failed: %s", strerror(errno));
651 				return -1;
652 			}
653 		}
654 	}
655 #endif /* SO_SNDBUF */
656 
657 	}
658 #endif /* defined(SO_RCVBUF) || defined(SO_SNDBUF) */
659 
660 #if defined(INET6)
661 		if (addr->ai_family == AF_INET6) {
662 # if defined(IPV6_V6ONLY)
663 			if (setsockopt(nsd->udp[i].s,
664 				       IPPROTO_IPV6, IPV6_V6ONLY,
665 				       &on, sizeof(on)) < 0)
666 			{
667 				log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
668 					strerror(errno));
669 				return -1;
670 			}
671 # endif
672 # if defined(IPV6_USE_MIN_MTU)
673 			/*
674 			 * There is no fragmentation of IPv6 datagrams
675 			 * during forwarding in the network. Therefore
676 			 * we do not send UDP datagrams larger than
677 			 * the minimum IPv6 MTU of 1280 octets. The
678 			 * EDNS0 message length can be larger if the
679 			 * network stack supports IPV6_USE_MIN_MTU.
680 			 */
681 			if (setsockopt(nsd->udp[i].s,
682 				       IPPROTO_IPV6, IPV6_USE_MIN_MTU,
683 				       &on, sizeof(on)) < 0)
684 			{
685 				log_msg(LOG_ERR, "setsockopt(..., IPV6_USE_MIN_MTU, ...) failed: %s",
686 					strerror(errno));
687 				return -1;
688 			}
689 # elif defined(IPV6_MTU)
690 			/*
691 			 * On Linux, PMTUD is disabled by default for datagrams
692 			 * so set the MTU equal to the minimum MTU to get the same behavior.
693 			 */
694 			on = IPV6_MIN_MTU;
695 			if (setsockopt(nsd->udp[i].s, IPPROTO_IPV6, IPV6_MTU,
696 				&on, sizeof(on)) < 0)
697 			{
698 				log_msg(LOG_ERR, "setsockopt(..., IPV6_MTU, ...) failed: %s",
699 					strerror(errno));
700 				return -1;
701 			}
702 			on = 1;
703 # endif
704 		}
705 #endif
706 #if defined(AF_INET)
707 		if (addr->ai_family == AF_INET) {
708 #  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
709 			int action = IP_PMTUDISC_DONT;
710 			if (setsockopt(nsd->udp[i].s, IPPROTO_IP,
711 				IP_MTU_DISCOVER, &action, sizeof(action)) < 0)
712 			{
713 				log_msg(LOG_ERR, "setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
714 					strerror(errno));
715 				return -1;
716 			}
717 #  elif defined(IP_DONTFRAG)
718 			int off = 0;
719 			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_DONTFRAG,
720 				&off, sizeof(off)) < 0)
721 			{
722 				log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
723 					strerror(errno));
724 				return -1;
725 			}
726 #  endif
727 		}
728 #endif
729 		/* set it nonblocking */
730 		/* otherwise, on OSes with thundering herd problems, the
731 		   UDP recv could block NSD after select returns readable. */
732 		if (fcntl(nsd->udp[i].s, F_SETFL, O_NONBLOCK) == -1) {
733 			log_msg(LOG_ERR, "cannot fcntl udp: %s", strerror(errno));
734 		}
735 
736 		/* Bind it... */
737 		if (nsd->options->ip_freebind) {
738 #ifdef IP_FREEBIND
739 			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) < 0) {
740 				log_msg(LOG_ERR, "setsockopt(...,IP_FREEBIND, ...) failed for udp: %s",
741 					strerror(errno));
742 			}
743 #endif /* IP_FREEBIND */
744 		}
745 
746 		if (nsd->options->ip_transparent) {
747 #ifdef IP_TRANSPARENT
748 			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)) < 0) {
749 				log_msg(LOG_ERR, "setsockopt(...,IP_TRANSPARENT, ...) failed for udp: %s",
750 					strerror(errno));
751 			}
752 #endif /* IP_TRANSPARENT */
753 		}
754 
755 		if (bind(nsd->udp[i].s, (struct sockaddr *) addr->ai_addr, addr->ai_addrlen) != 0) {
756 			log_msg(LOG_ERR, "can't bind udp socket: %s", strerror(errno));
757 			return -1;
758 		}
759 	}
760 
761 	/* TCP */
762 
763 	/* Make a socket... */
764 	for (i = from; i < to; i++) {
765 		/* for reuseport, copy the socket specs of the first entries */
766 		addr = nsd->tcp[i%nsd->ifs].addr;
767 		if (!addr) {
768 			nsd->tcp[i].s = -1;
769 			continue;
770 		}
771 		nsd->tcp[i].fam = (int)addr->ai_family;
772 		/* turn off REUSEPORT for TCP by copying the socket fd */
773 		if(i >= nsd->ifs) {
774 			nsd->tcp[i].s = nsd->tcp[i%nsd->ifs].s;
775 			continue;
776 		}
777 		if ((nsd->tcp[i].s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
778 #if defined(INET6)
779 			if (addr->ai_family == AF_INET6 &&
780 				errno == EAFNOSUPPORT && nsd->grab_ip6_optional) {
781 				log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: not supported");
782 				continue;
783 			}
784 #endif /* INET6 */
785 			log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
786 			return -1;
787 		}
788 
789 #ifdef SO_REUSEPORT
790 		if(nsd->reuseport && *reuseport_works &&
791 			setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_REUSEPORT,
792 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
793 			if(verbosity >= 3
794 #ifdef ENOPROTOOPT
795 				|| errno != ENOPROTOOPT
796 #endif
797 				)
798 			    log_msg(LOG_ERR, "setsockopt(..., SO_REUSEPORT, "
799 				"...) failed: %s", strerror(errno));
800 			*reuseport_works = 0;
801 		}
802 #endif /* SO_REUSEPORT */
803 #ifdef	SO_REUSEADDR
804 		if (setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0) {
805 			log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s", strerror(errno));
806 		}
807 #endif /* SO_REUSEADDR */
808 
809 #if defined(INET6)
810 		if (addr->ai_family == AF_INET6) {
811 # if defined(IPV6_V6ONLY)
812 			if (setsockopt(nsd->tcp[i].s, IPPROTO_IPV6, IPV6_V6ONLY,
813 				&on, sizeof(on)) < 0) {
814 				log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed: %s", strerror(errno));
815 				return -1;
816 			}
817 # endif
818 # if defined(IPV6_USE_MIN_MTU)
819 			/*
820 			 * Use minimum MTU to minimize delays learning working
821 			 * PMTU when communicating through a tunnel.
822 			 */
823 			if (setsockopt(nsd->tcp[i].s,
824 				       IPPROTO_IPV6, IPV6_USE_MIN_MTU,
825 				       &on, sizeof(on)) < 0) {
826 				log_msg(LOG_ERR, "setsockopt(..., IPV6_USE_MIN_MTU, ...) failed: %s", strerror(errno));
827 				return -1;
828 			}
829 # elif defined(IPV6_MTU)
830 			/*
831 			 * On Linux, PMTUD is disabled by default for datagrams
832 			 * so set the MTU equal to the minimum MTU to get the same behavior.
833 			 */
834 			on = IPV6_MIN_MTU;
835 			if (setsockopt(nsd->tcp[i].s, IPPROTO_IPV6, IPV6_MTU,
836 				&on, sizeof(on)) < 0) {
837 				log_msg(LOG_ERR, "setsockopt(..., IPV6_MTU, ...) failed: %s", strerror(errno));
838 				return -1;
839 			}
840 			on = 1;
841 # endif
842 		}
843 #endif
844 		/* set the maximum segment size on the tcp socket */
845 		if(nsd->tcp_mss > 0) {
846 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
847 			if(setsockopt(nsd->tcp[i].s, IPPROTO_TCP, TCP_MAXSEG,
848 					(void*)&nsd->tcp_mss,
849 					sizeof(nsd->tcp_mss)) < 0) {
850 				log_msg(LOG_ERR,
851 					"setsockopt(...,TCP_MAXSEG,...)"
852 					" failed for tcp: %s", strerror(errno));
853 			}
854 #else
855 			log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
856 #endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
857 		}
858 
859 		/* set it nonblocking */
860 		/* (Stevens UNP p. 463): if the tcp listening socket is blocking,
861 		   it may block in accept, even if select() says readable. */
862 		if (fcntl(nsd->tcp[i].s, F_SETFL, O_NONBLOCK) == -1) {
863 			log_msg(LOG_ERR, "cannot fcntl tcp: %s", strerror(errno));
864 		}
865 
866 		/* Bind it... */
867 		if (nsd->options->ip_freebind) {
868 #ifdef IP_FREEBIND
869 			if (setsockopt(nsd->tcp[i].s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) < 0) {
870 				log_msg(LOG_ERR, "setsockopt(...,IP_FREEBIND, ...) failed for tcp: %s",
871 					strerror(errno));
872 			}
873 #endif /* IP_FREEBIND */
874 		}
875 
876 		if (nsd->options->ip_transparent) {
877 #ifdef IP_TRANSPARENT
878 			if (setsockopt(nsd->tcp[i].s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)) < 0) {
879 				log_msg(LOG_ERR, "setsockopt(...,IP_TRANSPARENT, ...) failed for tcp: %s",
880 					strerror(errno));
881 			}
882 #endif /* IP_TRANSPARENT */
883 		}
884 
885 		if (bind(nsd->tcp[i].s, (struct sockaddr *) addr->ai_addr, addr->ai_addrlen) != 0) {
886 			log_msg(LOG_ERR, "can't bind tcp socket: %s", strerror(errno));
887 			return -1;
888 		}
889 
890 		/* Listen to it... */
891 		if (listen(nsd->tcp[i].s, TCP_BACKLOG) == -1) {
892 			log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
893 			return -1;
894 		}
895 	}
896 
897 	return 0;
898 }
899 
900 /*
901  * Initialize the server: set up reuseport, create and bind the sockets.
902  */
903 int
904 server_init(struct nsd *nsd)
905 {
906 	int reuseport_successful = 1; /* see if reuseport works in OS */
907 	if(nsd->reuseport) {
908 		/* increase the size of the udp and tcp interface arrays,
909 		 * there are going to be separate interface file descriptors
910 		 * for every server instance */
911 		nsd->udp = xrealloc(nsd->udp, (nsd->ifs*nsd->reuseport)*
912 			sizeof(*nsd->udp));
913 		nsd->tcp = xrealloc(nsd->tcp, (nsd->ifs*nsd->reuseport)*
914 			sizeof(*nsd->tcp));
915 		memset(&nsd->udp[nsd->ifs], 0, sizeof(*nsd->udp)*
916 			(nsd->ifs*(nsd->reuseport-1)));
917 		memset(&nsd->tcp[nsd->ifs], 0, sizeof(*nsd->tcp)*
918 			(nsd->ifs*(nsd->reuseport-1)));
919 	}
920 
921 	/* open the server interface ports */
922 	if(server_init_ifs(nsd, 0, nsd->ifs, &reuseport_successful) == -1)
923 		return -1;
924 
925 	/* continue to open the remaining reuseport ports */
926 	if(nsd->reuseport && reuseport_successful) {
927 		if(server_init_ifs(nsd, nsd->ifs, nsd->ifs*nsd->reuseport,
928 			&reuseport_successful) == -1)
929 			return -1;
930 		nsd->ifs *= nsd->reuseport;
931 	} else {
932 		nsd->reuseport = 0;
933 	}
934 	return 0;
935 }
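/*
 * Worked example for the reuseport layout above (hypothetical numbers):
 * with nsd->ifs == 2 interfaces and nsd->reuseport == 4 processes,
 * 2*4 == 8 udp sockets are opened and socket i copies the address
 * specs of entry i % 2; on success nsd->ifs becomes 8.
 */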
936 
937 /*
938  * Prepare the server for takeoff.
939  *
940  */
941 int
942 server_prepare(struct nsd *nsd)
943 {
944 #ifdef RATELIMIT
945 	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
946 #ifdef HAVE_ARC4RANDOM
947 	hash_set_raninit(arc4random());
948 #else
949 	uint32_t v = getpid() ^ time(NULL);
950 	srandom((unsigned long)v);
951 	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
952 		hash_set_raninit(v);
953 	else	hash_set_raninit(random());
954 #endif
955 	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
956 		nsd->options->rrl_ratelimit,
957 		nsd->options->rrl_whitelist_ratelimit,
958 		nsd->options->rrl_slip,
959 		nsd->options->rrl_ipv4_prefix_length,
960 		nsd->options->rrl_ipv6_prefix_length);
961 #endif /* RATELIMIT */
962 
963 	/* Open the database... */
964 	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
965 		log_msg(LOG_ERR, "unable to open the database %s: %s",
966 			nsd->dbfile, strerror(errno));
967 		unlink(nsd->task[0]->fname);
968 		unlink(nsd->task[1]->fname);
969 #ifdef USE_ZONE_STATS
970 		unlink(nsd->zonestatfname[0]);
971 		unlink(nsd->zonestatfname[1]);
972 #endif
973 		xfrd_del_tempdir(nsd);
974 		return -1;
975 	}
976 	/* check if zone files have been modified */
977 	/* NULL for taskudb because we send soainfo in a moment, batched up,
978 	 * for all zones */
979 	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
980 		nsd->options->database[0] == 0))
981 		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
982 	zonestatid_tree_set(nsd);
983 
984 	compression_table_capacity = 0;
985 	initialize_dname_compression_tables(nsd);
986 
987 #ifdef	BIND8_STATS
988 	/* Initialize times... */
989 	time(&nsd->st.boot);
990 	set_bind8_alarm(nsd);
991 #endif /* BIND8_STATS */
992 
993 	return 0;
994 }
995 
996 /*
997  * Fork the required number of servers.
998  */
999 static int
1000 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
1001 	int* xfrd_sock_p)
1002 {
1003 	size_t i;
1004 
1005 	/* Start all child servers initially.  */
1006 	for (i = 0; i < nsd->child_count; ++i) {
1007 		nsd->children[i].pid = 0;
1008 	}
1009 
1010 	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
1011 }
1012 
1013 void
1014 server_close_all_sockets(struct nsd_socket sockets[], size_t n)
1015 {
1016 	size_t i;
1017 
1018 	/* Close all the sockets... */
1019 	for (i = 0; i < n; ++i) {
1020 		if (sockets[i].s != -1) {
1021 			close(sockets[i].s);
1022 			if(sockets[i].addr)
1023 				freeaddrinfo(sockets[i].addr);
1024 			sockets[i].s = -1;
1025 		}
1026 	}
1027 }
1028 
1029 /*
1030  * Close the sockets, shut down the server and exit.
1031  * Does not return.
1032  *
1033  */
1034 void
1035 server_shutdown(struct nsd *nsd)
1036 {
1037 	size_t i;
1038 
1039 	server_close_all_sockets(nsd->udp, nsd->ifs);
1040 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1041 	/* CHILD: close command channel to parent */
1042 	if(nsd->this_child && nsd->this_child->parent_fd != -1)
1043 	{
1044 		close(nsd->this_child->parent_fd);
1045 		nsd->this_child->parent_fd = -1;
1046 	}
1047 	/* SERVER: close command channels to children */
1048 	if(!nsd->this_child)
1049 	{
1050 		for(i=0; i < nsd->child_count; ++i)
1051 			if(nsd->children[i].child_fd != -1)
1052 			{
1053 				close(nsd->children[i].child_fd);
1054 				nsd->children[i].child_fd = -1;
1055 			}
1056 	}
1057 
1058 	tsig_finalize();
1059 #ifdef HAVE_SSL
1060 	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
1061 #endif
1062 
1063 #if 0 /* OS collects memory pages */
1064 	nsd_options_destroy(nsd->options);
1065 	region_destroy(nsd->region);
1066 #endif
1067 	log_finalize();
1068 	exit(0);
1069 }
1070 
1071 void
1072 server_prepare_xfrd(struct nsd* nsd)
1073 {
1074 	char tmpfile[256];
1075 	/* create task mmaps */
1076 	nsd->mytask = 0;
1077 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
1078 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1079 	nsd->task[0] = task_file_create(tmpfile);
1080 	if(!nsd->task[0]) {
1081 #ifdef USE_ZONE_STATS
1082 		unlink(nsd->zonestatfname[0]);
1083 		unlink(nsd->zonestatfname[1]);
1084 #endif
1085 		xfrd_del_tempdir(nsd);
1086 		exit(1);
1087 	}
1088 	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
1089 		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
1090 	nsd->task[1] = task_file_create(tmpfile);
1091 	if(!nsd->task[1]) {
1092 		unlink(nsd->task[0]->fname);
1093 #ifdef USE_ZONE_STATS
1094 		unlink(nsd->zonestatfname[0]);
1095 		unlink(nsd->zonestatfname[1]);
1096 #endif
1097 		xfrd_del_tempdir(nsd);
1098 		exit(1);
1099 	}
1100 	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
1101 	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
1102 	/* create xfrd listener structure */
1103 	nsd->xfrd_listener = region_alloc(nsd->region,
1104 		sizeof(netio_handler_type));
1105 	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
1106 		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
1107 	nsd->xfrd_listener->fd = -1;
1108 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
1109 		nsd;
1110 	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
1111 		xfrd_tcp_create(nsd->region, QIOBUFSZ);
1112 }
1113 
1114 
1115 void
1116 server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
1117 {
1118 	pid_t pid;
1119 	int sockets[2] = {0,0};
1120 	struct ipc_handler_conn_data *data;
1121 
1122 	if(nsd->xfrd_listener->fd != -1)
1123 		close(nsd->xfrd_listener->fd);
1124 	if(del_db) {
1125 		/* recreate taskdb that xfrd was using, it may be corrupt */
1126 		/* we (or reload) use nsd->mytask, and xfrd uses the other */
1127 		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
1128 		nsd->task[1-nsd->mytask]->fname = NULL;
1129 		/* free alloc already, so udb does not shrink itself */
1130 		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
1131 		nsd->task[1-nsd->mytask]->alloc = NULL;
1132 		udb_base_free(nsd->task[1-nsd->mytask]);
1133 		/* create new file, overwrite the old one */
1134 		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
1135 		free(tmpfile);
1136 	}
1137 	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
1138 		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
1139 		return;
1140 	}
1141 	pid = fork();
1142 	switch (pid) {
1143 	case -1:
1144 		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
1145 		break;
1146 	default:
1147 		/* PARENT: close first socket, use second one */
1148 		close(sockets[0]);
1149 		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
1150 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1151 		}
1152 		if(del_db) xfrd_free_namedb(nsd);
1153 		/* use the other task than the one I am using; if xfrd died and
1154 		 * is restarted, the reload is using nsd->mytask */
1155 		nsd->mytask = 1 - nsd->mytask;
1156 		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
1157 		/* NOTREACHED */
1158 		break;
1159 	case 0:
1160 		/* CHILD: close second socket, use first one */
1161 		close(sockets[1]);
1162 		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
1163 			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
1164 		}
1165 		nsd->xfrd_listener->fd = sockets[0];
1166 		break;
1167 	}
1168 	/* server-parent only */
1169 	nsd->xfrd_listener->timeout = NULL;
1170 	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
1171 	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
1172 	/* clear ongoing ipc reads */
1173 	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
1174 	data->conn->is_reading = 0;
1175 }
1176 
1177 /** add all soainfo to taskdb */
1178 static void
1179 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
1180 {
1181 	struct radnode* n;
1182 	udb_ptr task_last; /* last task, mytask is empty so NULL */
1183 	/* add all SOA INFO to mytask */
1184 	udb_ptr_init(&task_last, taskudb);
1185 	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
1186 		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
1187 	}
1188 	udb_ptr_unlink(&task_last, taskudb);
1189 }
1190 
1191 void
1192 server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
1193 {
1194 	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
1195 	 *   parent fills one taskdb with soas, xfrd fills other with expires.
1196 	 *   then they exchange and process.
1197 	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
1198 	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
1199 	 *   expire notifications can be sent back via a normal reload later
1200 	 *   (xfrd will wait for current running reload to finish if any).
1201 	 */
1202 	sig_atomic_t cmd = 0;
1203 	pid_t mypid;
1204 	int xfrd_sock = nsd->xfrd_listener->fd;
1205 	struct udb_base* taskudb = nsd->task[nsd->mytask];
1206 	udb_ptr t;
1207 	if(!shortsoa) {
1208 		if(nsd->signal_hint_shutdown) {
1209 		shutdown:
1210 			log_msg(LOG_WARNING, "signal received, shutting down...");
1211 			server_close_all_sockets(nsd->udp, nsd->ifs);
1212 			server_close_all_sockets(nsd->tcp, nsd->ifs);
1213 #ifdef HAVE_SSL
1214 			daemon_remote_close(nsd->rc);
1215 #endif
1216 			/* Unlink it if possible... */
1217 			unlinkpid(nsd->pidfile);
1218 			unlink(nsd->task[0]->fname);
1219 			unlink(nsd->task[1]->fname);
1220 #ifdef USE_ZONE_STATS
1221 			unlink(nsd->zonestatfname[0]);
1222 			unlink(nsd->zonestatfname[1]);
1223 #endif
1224 			/* write the nsd.db to disk, wait for it to complete */
1225 			udb_base_sync(nsd->db->udb, 1);
1226 			udb_base_close(nsd->db->udb);
1227 			server_shutdown(nsd);
1228 			exit(0);
1229 		}
1230 	}
1231 	if(shortsoa) {
1232 		/* put SOA in xfrd task because mytask may be in use */
1233 		taskudb = nsd->task[1-nsd->mytask];
1234 	}
1235 
1236 	add_all_soa_to_task(nsd, taskudb);
1237 	if(!shortsoa) {
1238 		/* wait for xfrd to signal task is ready, RELOAD signal */
1239 		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
1240 			cmd != NSD_RELOAD) {
1241 			log_msg(LOG_ERR, "did not get start signal from xfrd");
1242 			exit(1);
1243 		}
1244 		if(nsd->signal_hint_shutdown) {
1245 			goto shutdown;
1246 		}
1247 	}
1248 	/* give xfrd our task, signal it with RELOAD_DONE */
1249 	task_process_sync(taskudb);
1250 	cmd = NSD_RELOAD_DONE;
1251 	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1252 		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1253 			(int)nsd->pid, strerror(errno));
1254 	}
1255 	mypid = getpid();
1256 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1257 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1258 			strerror(errno));
1259 	}
1260 
1261 	if(!shortsoa) {
1262 		/* process the xfrd task works (expiry data) */
1263 		nsd->mytask = 1 - nsd->mytask;
1264 		taskudb = nsd->task[nsd->mytask];
1265 		task_remap(taskudb);
1266 		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
1267 		while(!udb_ptr_is_null(&t)) {
1268 			task_process_expire(nsd->db, TASKLIST(&t));
1269 			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
1270 		}
1271 		udb_ptr_unlink(&t, taskudb);
1272 		task_clear(taskudb);
1273 
1274 		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
1275 		cmd = NSD_RELOAD_DONE;
1276 		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
1277 			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
1278 				(int)nsd->pid, strerror(errno));
1279 		}
1280 	}
1281 }
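/*
 * The normal (non-shortsoa) handshake above, summarized from the code:
 *
 *   parent                                 xfrd
 *     fill taskdb with SOA info
 *                    <---- NSD_RELOAD ----  (task ready)
 *     ---- NSD_RELOAD_DONE + pid ---->
 *     swap mytask, process expire tasks
 *     ---- NSD_RELOAD_DONE ---------->
 */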
1282 
1283 /* pass timeout=-1 to block. Returns bytes read, 0 (closed), -1 (error), or -2 (timeout) */
1284 ssize_t
1285 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
1286 {
1287 	uint8_t* buf = (uint8_t*) p;
1288 	ssize_t total = 0;
1289 	struct pollfd fd;
1290 	memset(&fd, 0, sizeof(fd));
1291 	fd.fd = s;
1292 	fd.events = POLLIN;
1293 
1294 	while( total < sz) {
1295 		ssize_t ret;
1296 		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
1297 		if(ret == -1) {
1298 			if(errno == EAGAIN)
1299 				/* not ready yet; retry to emulate a blocking read */
1300 				continue;
1301 			if(errno == EINTR) {
1302 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
1303 					return -1;
1304 				/* other signals can be handled later */
1305 				continue;
1306 			}
1307 			/* some error */
1308 			return -1;
1309 		}
1310 		if(ret == 0) {
1311 			/* operation timed out */
1312 			return -2;
1313 		}
1314 		ret = read(s, buf+total, sz-total);
1315 		if(ret == -1) {
1316 			if(errno == EAGAIN)
1317 				/* not ready yet; retry to emulate a blocking read */
1318 				continue;
1319 			if(errno == EINTR) {
1320 				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
1321 					return -1;
1322 				/* other signals can be handled later */
1323 				continue;
1324 			}
1325 			/* some error */
1326 			return -1;
1327 		}
1328 		if(ret == 0) {
1329 			/* closed connection! */
1330 			return 0;
1331 		}
1332 		total += ret;
1333 	}
1334 	return total;
1335 }
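/*
 * Usage sketch for block_read() (hypothetical caller):
 *
 *   sig_atomic_t cmd;
 *   ssize_t r = block_read(nsd, fd, &cmd, sizeof(cmd), 5);
 *   if (r == sizeof(cmd))  ... full command read ...
 *   else if (r == 0)       ... peer closed the connection ...
 *   else if (r == -2)      ... timed out after 5 seconds ...
 *   else                   ... error, or quit/shutdown signal ...
 */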
1336 
1337 static void
1338 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
1339 {
1340 	sig_atomic_t cmd = NSD_QUIT_SYNC;
1341 	udb_ptr t, next;
1342 	udb_base* u = nsd->task[nsd->mytask];
1343 	udb_ptr_init(&next, u);
1344 	udb_ptr_new(&t, u, udb_base_get_userdata(u));
1345 	udb_base_set_userdata(u, 0);
1346 	while(!udb_ptr_is_null(&t)) {
1347 		/* store next in list so this one can be deleted or reused */
1348 		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
1349 		udb_rptr_zero(&TASKLIST(&t)->next, u);
1350 
1351 		/* process task t */
1352 		/* append results for task t and update last_task */
1353 		task_process_in_reload(nsd, u, last_task, &t);
1354 
1355 		/* go to next */
1356 		udb_ptr_set_ptr(&t, u, &next);
1357 
1358 		/* if the parent has quit, we must quit too, poll the fd for cmds */
1359 		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
1360 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
1361 			if(cmd == NSD_QUIT) {
1362 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
1363 				/* sync to disk (if needed) */
1364 				udb_base_sync(nsd->db->udb, 0);
1365 				/* unlink files of remainder of tasks */
1366 				while(!udb_ptr_is_null(&t)) {
1367 					if(TASKLIST(&t)->task_type == task_apply_xfr) {
1368 						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
1369 					}
1370 					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
1371 				}
1372 				udb_ptr_unlink(&t, u);
1373 				udb_ptr_unlink(&next, u);
1374 				exit(0);
1375 			}
1376 		}
1377 
1378 	}
1379 	udb_ptr_unlink(&t, u);
1380 	udb_ptr_unlink(&next, u);
1381 }
1382 
1383 #ifdef BIND8_STATS
1384 static void
1385 parent_send_stats(struct nsd* nsd, int cmdfd)
1386 {
1387 	size_t i;
1388 	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
1389 		log_msg(LOG_ERR, "could not write stats to reload");
1390 		return;
1391 	}
1392 	for(i=0; i<nsd->child_count; i++)
1393 		if(!write_socket(cmdfd, &nsd->children[i].query_count,
1394 			sizeof(stc_t))) {
1395 			log_msg(LOG_ERR, "could not write stats to reload");
1396 			return;
1397 		}
1398 }
1399 
1400 static void
1401 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
1402 {
1403 	struct nsdst s;
1404 	stc_t* p;
1405 	size_t i;
1406 	if(block_read(nsd, cmdfd, &s, sizeof(s),
1407 		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
1408 		log_msg(LOG_ERR, "could not read stats from oldpar");
1409 		return;
1410 	}
1411 	s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
1412 	s.db_mem = region_get_mem(nsd->db->region);
1413 	p = (stc_t*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
1414 		nsd->child_count);
1415 	if(!p) return;
1416 	for(i=0; i<nsd->child_count; i++) {
1417 		if(block_read(nsd, cmdfd, p++, sizeof(stc_t), 1)!=sizeof(stc_t))
1418 			return;
1419 	}
1420 }
1421 #endif /* BIND8_STATS */
1422 
1423 /*
1424  * Reload the database, stop parent, re-fork children and continue.
1425  * as server_main.
1426  */
1427 static void
1428 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
1429 	int cmdsocket)
1430 {
1431 	pid_t mypid;
1432 	sig_atomic_t cmd = NSD_QUIT_SYNC;
1433 	int ret;
1434 	udb_ptr last_task;
1435 	struct sigaction old_sigchld, ign_sigchld;
1436 	/* ignore SIGCHLD from the previous server_main that used this pid */
1437 	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
1438 	ign_sigchld.sa_handler = SIG_IGN;
1439 	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);
1440 
1441 	/* see what tasks we got from xfrd */
1442 	task_remap(nsd->task[nsd->mytask]);
1443 	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
1444 	udb_compact_inhibited(nsd->db->udb, 1);
1445 	reload_process_tasks(nsd, &last_task, cmdsocket);
1446 	udb_compact_inhibited(nsd->db->udb, 0);
1447 	udb_compact(nsd->db->udb);
1448 
1449 #ifndef NDEBUG
1450 	if(nsd_debug_level >= 1)
1451 		region_log_stats(nsd->db->region);
1452 #endif /* NDEBUG */
1453 	/* sync to disk (if needed) */
1454 	udb_base_sync(nsd->db->udb, 0);
1455 
1456 	initialize_dname_compression_tables(nsd);
1457 
1458 #ifdef BIND8_STATS
1459 	/* Restart dumping stats if required.  */
1460 	time(&nsd->st.boot);
1461 	set_bind8_alarm(nsd);
1462 #endif
1463 #ifdef USE_ZONE_STATS
1464 	server_zonestat_realloc(nsd); /* realloc for new children */
1465 	server_zonestat_switch(nsd);
1466 #endif
1467 
1468 	/* listen for the signals of failed children again */
1469 	sigaction(SIGCHLD, &old_sigchld, NULL);
1470 	/* Start new child processes */
1471 	if (server_start_children(nsd, server_region, netio, &nsd->
1472 		xfrd_listener->fd) != 0) {
1473 		send_children_quit(nsd);
1474 		exit(1);
1475 	}
1476 
1477 	/* if the parent has quit, we must quit too, poll the fd for cmds */
1478 	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
1479 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
1480 		if(cmd == NSD_QUIT) {
1481 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
1482 			send_children_quit(nsd);
1483 			exit(0);
1484 		}
1485 	}
1486 
1487 	/* Send quit command to parent: blocking, wait for receipt. */
1488 	do {
1489 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
1490 		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
1491 		{
1492 			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
1493 				strerror(errno));
1494 		}
1495 		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
1496 		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
1497 		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
1498 			RELOAD_SYNC_TIMEOUT);
1499 		if(ret == -2) {
1500 			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
1501 		}
1502 	} while (ret == -2);
1503 	if(ret == -1) {
1504 		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
1505 			strerror(errno));
1506 	}
1507 	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
1508 	if(cmd == NSD_QUIT) {
1509 		/* a small race condition is possible here: the parent got the quit cmd. */
1510 		send_children_quit(nsd);
1511 		exit(1);
1512 	}
1513 	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
1514 #ifdef BIND8_STATS
1515 	reload_do_stats(cmdsocket, nsd, &last_task);
1516 #endif
1517 	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
1518 	task_process_sync(nsd->task[nsd->mytask]);
1519 #ifdef USE_ZONE_STATS
1520 	server_zonestat_realloc(nsd); /* realloc for next children */
1521 #endif
1522 
1523 	/* send soainfo to the xfrd process, signal it that reload is done,
1524 	 * it picks up the taskudb */
1525 	cmd = NSD_RELOAD_DONE;
1526 	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
1527 		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
1528 			strerror(errno));
1529 	}
1530 	mypid = getpid();
1531 	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1532 		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1533 			strerror(errno));
1534 	}
1535 
1536 	/* try to reopen file */
1537 	if (nsd->file_rotation_ok)
1538 		log_reopen(nsd->log_filename, 1);
1539 	/* exit reload, continue as new server_main */
1540 }
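/*
 * The reload/old-main handoff above, summarized from the code:
 *
 *   reload (becomes new main)             old main
 *     ---- NSD_QUIT_SYNC ----------->
 *     <--- NSD_RELOAD (ack) ---------    (old main quits)
 *   then reload sends NSD_RELOAD_DONE plus its pid to xfrd and
 *   returns to run as the new server_main.
 */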
1541 
1542 /*
1543  * Get the mode depending on the signal hints that have been received.
1544  * Multiple signal hints can be received and will be handled in turn.
1545  */
1546 static sig_atomic_t
1547 server_signal_mode(struct nsd *nsd)
1548 {
1549 	if(nsd->signal_hint_quit) {
1550 		nsd->signal_hint_quit = 0;
1551 		return NSD_QUIT;
1552 	}
1553 	else if(nsd->signal_hint_shutdown) {
1554 		nsd->signal_hint_shutdown = 0;
1555 		return NSD_SHUTDOWN;
1556 	}
1557 	else if(nsd->signal_hint_child) {
1558 		nsd->signal_hint_child = 0;
1559 		return NSD_REAP_CHILDREN;
1560 	}
1561 	else if(nsd->signal_hint_reload) {
1562 		nsd->signal_hint_reload = 0;
1563 		return NSD_RELOAD;
1564 	}
1565 	else if(nsd->signal_hint_reload_hup) {
1566 		nsd->signal_hint_reload_hup = 0;
1567 		return NSD_RELOAD_REQ;
1568 	}
1569 	else if(nsd->signal_hint_stats) {
1570 		nsd->signal_hint_stats = 0;
1571 #ifdef BIND8_STATS
1572 		set_bind8_alarm(nsd);
1573 #endif
1574 		return NSD_STATS;
1575 	}
1576 	else if(nsd->signal_hint_statsusr) {
1577 		nsd->signal_hint_statsusr = 0;
1578 		return NSD_STATS;
1579 	}
1580 	return NSD_RUN;
1581 }
1582 
1583 /*
1584  * The main server simply waits for signals and child processes to
1585  * terminate.  Child processes are restarted as necessary.
1586  */
1587 void
1588 server_main(struct nsd *nsd)
1589 {
1590 	region_type *server_region = region_create(xalloc, free);
1591 	netio_type *netio = netio_create(server_region);
1592 	netio_handler_type reload_listener;
1593 	int reload_sockets[2] = {-1, -1};
1594 	struct timespec timeout_spec;
1595 	int status;
1596 	pid_t child_pid;
1597 	pid_t reload_pid = -1;
1598 	sig_atomic_t mode;
1599 
1600 	/* Ensure we are the main process */
1601 	assert(nsd->server_kind == NSD_SERVER_MAIN);
1602 
1603 	/* Add listener for the XFRD process */
1604 	netio_add_handler(netio, nsd->xfrd_listener);
1605 
1606 	/* Start the child processes that handle incoming queries */
1607 	if (server_start_children(nsd, server_region, netio,
1608 		&nsd->xfrd_listener->fd) != 0) {
1609 		send_children_quit(nsd);
1610 		exit(1);
1611 	}
1612 	reload_listener.fd = -1;
1613 
1614 	/* this_child MUST be NULL, because this is the parent process */
1615 	assert(nsd->this_child == 0);
1616 
1617 	/* Run the server until we get a shutdown signal */
1618 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
1619 		/* Did we receive a signal that changes our mode? */
1620 		if(mode == NSD_RUN) {
1621 			nsd->mode = mode = server_signal_mode(nsd);
1622 		}
1623 
1624 		switch (mode) {
1625 		case NSD_RUN:
1626 			/* see if any child processes terminated */
1627 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
1628 				int is_child = delete_child_pid(nsd, child_pid);
1629 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
1630 					if(nsd->children[is_child].child_fd == -1)
1631 						nsd->children[is_child].has_exited = 1;
1632 					parent_check_all_children_exited(nsd);
1633 				} else if(is_child != -1) {
1634 					log_msg(LOG_WARNING,
1635 					       "server %d died unexpectedly with status %d, restarting",
1636 					       (int) child_pid, status);
1637 					restart_child_servers(nsd, server_region, netio,
1638 						&nsd->xfrd_listener->fd);
1639 				} else if (child_pid == reload_pid) {
1640 					sig_atomic_t cmd = NSD_RELOAD_DONE;
1641 					pid_t mypid;
1642 					log_msg(LOG_WARNING,
1643 					       "Reload process %d failed with status %d, continuing with old database",
1644 					       (int) child_pid, status);
1645 					reload_pid = -1;
1646 					if(reload_listener.fd != -1) close(reload_listener.fd);
1647 					reload_listener.fd = -1;
1648 					reload_listener.event_types = NETIO_EVENT_NONE;
1649 					task_process_sync(nsd->task[nsd->mytask]);
1650 					/* inform xfrd reload attempt ended */
1651 					if(!write_socket(nsd->xfrd_listener->fd,
1652 						&cmd, sizeof(cmd))) {
1653 						log_msg(LOG_ERR, "problems "
1654 						  "sending SOAEND to xfrd: %s",
1655 						  strerror(errno));
1656 					}
1657 					mypid = getpid();
1658 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1659 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1660 							strerror(errno));
1661 					}
1662 				} else if(status != 0) {
1663 					/* check the status, because we can
1664 					 * see the old server-main here (the
1665 					 * reload is the process-parent of
1666 					 * old-main) and also older server
1667 					 * processes that exit after a reload */
1668 					log_msg(LOG_WARNING,
1669 					       "process %d terminated with status %d",
1670 					       (int) child_pid, status);
1671 				}
1672 			}
1673 			if (child_pid == -1) {
1674 				if (errno == EINTR) {
1675 					continue;
1676 				}
1677 				if (errno != ECHILD)
1678 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
1679 			}
1680 			if (nsd->mode != NSD_RUN)
1681 				break;
1682 
1683 			/* timeout to collect processes, in case no SIGCHLD happens */
1684 			timeout_spec.tv_sec = 60;
1685 			timeout_spec.tv_nsec = 0;
1686 
1687 			/* listen on ports, timeout for collecting terminated children */
1688 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
1689 				if (errno != EINTR) {
1690 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
1691 				}
1692 			}
1693 			if(nsd->restart_children) {
1694 				restart_child_servers(nsd, server_region, netio,
1695 					&nsd->xfrd_listener->fd);
1696 				nsd->restart_children = 0;
1697 			}
1698 			if(nsd->reload_failed) {
1699 				sig_atomic_t cmd = NSD_RELOAD_DONE;
1700 				pid_t mypid;
1701 				nsd->reload_failed = 0;
1702 				log_msg(LOG_WARNING,
1703 				       "Reload process %d failed, continuing with old database",
1704 				       (int) reload_pid);
1705 				reload_pid = -1;
1706 				if(reload_listener.fd != -1) close(reload_listener.fd);
1707 				reload_listener.fd = -1;
1708 				reload_listener.event_types = NETIO_EVENT_NONE;
1709 				task_process_sync(nsd->task[nsd->mytask]);
1710 				/* inform xfrd reload attempt ended */
1711 				if(!write_socket(nsd->xfrd_listener->fd,
1712 					&cmd, sizeof(cmd))) {
1713 				log_msg(LOG_ERR, "problems "
1714 				  "sending reload_done to xfrd: %s",
1715 					  strerror(errno));
1716 				}
1717 				mypid = getpid();
1718 				if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
1719 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1720 						strerror(errno));
1721 				}
1722 			}
1723 
1724 			break;
1725 		case NSD_RELOAD_REQ: {
1726 			sig_atomic_t cmd = NSD_RELOAD_REQ;
1727 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
1728 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
1729 				"main: ipc send reload_req to xfrd"));
1730 			if(!write_socket(nsd->xfrd_listener->fd,
1731 				&cmd, sizeof(cmd))) {
1732 				log_msg(LOG_ERR, "server_main: could not send "
1733 				"reload_req to xfrd: %s", strerror(errno));
1734 			}
1735 			nsd->mode = NSD_RUN;
1736 			} break;
1737 		case NSD_RELOAD:
1738 			/* Continue to run nsd after reload */
1739 			nsd->mode = NSD_RUN;
1740 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
1741 			if (reload_pid != -1) {
1742 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
1743 				       (int) reload_pid);
1744 				break;
1745 			}
1746 
1747 			/* switch mytask to keep track of which task file we own */
1748 			nsd->mytask = 1 - nsd->mytask;
1749 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
1750 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
1751 				reload_pid = -1;
1752 				break;
1753 			}
1754 
1755 			/* Do actual reload */
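			/* After the fork below, the parent runs server_reload()
			 * and, on success, continues as the new main process;
			 * the child keeps serving the old database until the
			 * reload process sends NSD_QUIT_SYNC over the
			 * socketpair. */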
1756 			reload_pid = fork();
1757 			switch (reload_pid) {
1758 			case -1:
1759 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
1760 				break;
1761 			default:
1762 				/* PARENT */
1763 				close(reload_sockets[0]);
1764 				server_reload(nsd, server_region, netio,
1765 					reload_sockets[1]);
1766 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
1767 				close(reload_sockets[1]);
1768 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
1769 				/* drop stale xfrd ipc data */
1770 				((struct ipc_handler_conn_data*)nsd->
1771 					xfrd_listener->user_data)
1772 					->conn->is_reading = 0;
1773 				reload_pid = -1;
1774 				reload_listener.fd = -1;
1775 				reload_listener.event_types = NETIO_EVENT_NONE;
1776 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
1777 				break;
1778 			case 0:
1779 				/* CHILD */
1780 				/* server_main keeps running until NSD_QUIT_SYNC
1781 				 * is received from the reload process. */
1782 				close(reload_sockets[1]);
1783 				reload_listener.fd = reload_sockets[0];
1784 				reload_listener.timeout = NULL;
1785 				reload_listener.user_data = nsd;
1786 				reload_listener.event_types = NETIO_EVENT_READ;
1787 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
1788 				netio_add_handler(netio, &reload_listener);
1789 				reload_pid = getppid();
1790 				break;
1791 			}
1792 			break;
1793 		case NSD_QUIT_SYNC:
1794 			/* synchronisation of xfrd, parent and reload */
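			/* The reload process asked us to quit: notify xfrd so
			 * it stops any ipc writes in progress, then wait for
			 * its ack; the final ack to the reload process is
			 * sent from the NSD_QUIT case below. */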
1795 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
1796 				sig_atomic_t cmd = NSD_RELOAD;
1797 				/* stop xfrd ipc writes in progress */
1798 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
1799 					"main: ipc send indication reload"));
1800 				if(!write_socket(nsd->xfrd_listener->fd,
1801 					&cmd, sizeof(cmd))) {
1802 					log_msg(LOG_ERR, "server_main: could not send reload "
1803 					"indication to xfrd: %s", strerror(errno));
1804 				}
1805 				/* wait for ACK from xfrd */
1806 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
1807 				nsd->quit_sync_done = 1;
1808 			}
1809 			nsd->mode = NSD_RUN;
1810 			break;
1811 		case NSD_QUIT:
1812 			/* silent shutdown during reload */
1813 			if(reload_listener.fd != -1) {
1814 				/* acknowledge the quit, to sync reload that we will really quit now */
1815 				sig_atomic_t cmd = NSD_RELOAD;
1816 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
1817 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
1818 					log_msg(LOG_ERR, "server_main: "
1819 						"could not ack quit: %s", strerror(errno));
1820 				}
1821 #ifdef BIND8_STATS
1822 				parent_send_stats(nsd, reload_listener.fd);
1823 #endif /* BIND8_STATS */
1824 				close(reload_listener.fd);
1825 			}
1826 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
1827 			/* only quit children after xfrd has acked */
1828 			send_children_quit(nsd);
1829 
1830 #if 0 /* OS collects memory pages */
1831 			region_destroy(server_region);
1832 #endif
1833 			server_shutdown(nsd);
1834 
1835 			/* NOTREACHED */
1836 			break;
1837 		case NSD_SHUTDOWN:
1838 			break;
1839 		case NSD_REAP_CHILDREN:
1840 			/* continue; wait for child in run loop */
1841 			nsd->mode = NSD_RUN;
1842 			break;
1843 		case NSD_STATS:
1844 #ifdef BIND8_STATS
1845 			set_children_stats(nsd);
1846 #endif
1847 			nsd->mode = NSD_RUN;
1848 			break;
1849 		default:
1850 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
1851 			nsd->mode = NSD_RUN;
1852 			break;
1853 		}
1854 	}
1855 	log_msg(LOG_WARNING, "signal received, shutting down...");
1856 
1857 	/* close opened ports to avoid race with restart of nsd */
1858 	server_close_all_sockets(nsd->udp, nsd->ifs);
1859 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1860 #ifdef HAVE_SSL
1861 	daemon_remote_close(nsd->rc);
1862 #endif
1863 	send_children_quit_and_wait(nsd);
1864 
1865 	/* Unlink it if possible... */
1866 	unlinkpid(nsd->pidfile);
1867 	unlink(nsd->task[0]->fname);
1868 	unlink(nsd->task[1]->fname);
1869 #ifdef USE_ZONE_STATS
1870 	unlink(nsd->zonestatfname[0]);
1871 	unlink(nsd->zonestatfname[1]);
1872 #endif
1873 
1874 	if(reload_listener.fd != -1) {
1875 		sig_atomic_t cmd = NSD_QUIT;
1876 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
1877 			"main: ipc send quit to reload-process"));
1878 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
1879 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
1880 				strerror(errno));
1881 		}
1882 		fsync(reload_listener.fd);
1883 		close(reload_listener.fd);
1884 		/* wait for reload to finish processing */
1885 		while(1) {
1886 			if(waitpid(reload_pid, NULL, 0) == -1) {
1887 				if(errno == EINTR) continue;
1888 				if(errno == ECHILD) break;
1889 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
1890 					(int)reload_pid, strerror(errno));
1891 			}
1892 			break;
1893 		}
1894 	}
1895 	if(nsd->xfrd_listener->fd != -1) {
1896 		/* complete quit, stop xfrd */
1897 		sig_atomic_t cmd = NSD_QUIT;
1898 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
1899 			"main: ipc send quit to xfrd"));
1900 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
1901 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
1902 				strerror(errno));
1903 		}
1904 		fsync(nsd->xfrd_listener->fd);
1905 		close(nsd->xfrd_listener->fd);
1906 		(void)kill(nsd->pid, SIGTERM);
1907 	}
1908 
1909 #if 0 /* OS collects memory pages */
1910 	region_destroy(server_region);
1911 #endif
1912 	/* write the nsd.db to disk, wait for it to complete */
1913 	udb_base_sync(nsd->db->udb, 1);
1914 	udb_base_close(nsd->db->udb);
1915 	server_shutdown(nsd);
1916 }
1917 
1918 static query_state_type
1919 server_process_query(struct nsd *nsd, struct query *query)
1920 {
1921 	return query_process(query, nsd);
1922 }
1923 
1924 static query_state_type
1925 server_process_query_udp(struct nsd *nsd, struct query *query)
1926 {
1927 #ifdef RATELIMIT
1928 	if(query_process(query, nsd) != QUERY_DISCARDED) {
1929 		if(rrl_process_query(query))
1930 			return rrl_slip(query);
1931 		else	return QUERY_PROCESSED;
1932 	}
1933 	return QUERY_DISCARDED;
1934 #else
1935 	return query_process(query, nsd);
1936 #endif
1937 }
1938 
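/*
 * Create the event base for a child server process.  The backend is
 * chosen at compile time: mini_event, libev or libevent.
 */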
1939 struct event_base*
1940 nsd_child_event_base(void)
1941 {
1942 	struct event_base* base;
1943 #ifdef USE_MINI_EVENT
1944 	static time_t secs;
1945 	static struct timeval now;
1946 	base = event_init(&secs, &now);
1947 #else
1948 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
1949 	/* libev */
1950 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
1951 #  else
1952 	/* libevent */
1953 #    ifdef HAVE_EVENT_BASE_NEW
1954 	base = event_base_new();
1955 #    else
1956 	base = event_init();
1957 #    endif
1958 #  endif
1959 #endif
1960 	return base;
1961 }
1962 
1963 /*
1964  * Serve DNS requests.
1965  */
1966 void
1967 server_child(struct nsd *nsd)
1968 {
1969 	size_t i, from, numifs;
1970 	region_type *server_region = region_create(xalloc, free);
1971 	struct event_base* event_base = nsd_child_event_base();
1972 	query_type *udp_query;
1973 	sig_atomic_t mode;
1974 
1975 	if(!event_base) {
1976 		log_msg(LOG_ERR, "nsd server could not create event base");
1977 		exit(1);
1978 	}
1979 
1980 #ifdef RATELIMIT
1981 	rrl_init(nsd->this_child->child_num);
1982 #endif
1983 
1984 	assert(nsd->server_kind != NSD_SERVER_MAIN);
1985 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
1986 
1987 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
1988 		server_close_all_sockets(nsd->tcp, nsd->ifs);
1989 	}
1990 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
1991 		server_close_all_sockets(nsd->udp, nsd->ifs);
1992 	}
1993 
1994 	if (nsd->this_child && nsd->this_child->parent_fd != -1) {
1995 		struct event *handler;
1996 		struct ipc_handler_conn_data* user_data =
1997 			(struct ipc_handler_conn_data*)region_alloc(
1998 			server_region, sizeof(struct ipc_handler_conn_data));
1999 		user_data->nsd = nsd;
2000 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
2001 
2002 		handler = (struct event*) region_alloc(
2003 			server_region, sizeof(*handler));
2004 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
2005 			EV_READ, child_handle_parent_command, user_data);
2006 		if(event_base_set(event_base, handler) != 0)
2007 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
2008 		if(event_add(handler, NULL) != 0)
2009 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
2010 	}
2011 
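	/* With reuseport every child has its own copy of the sockets, so
	 * each child serves a contiguous slice of the interface array:
	 * numifs sockets starting at child_num * numifs. */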
2012 	if(nsd->reuseport) {
2013 		numifs = nsd->ifs / nsd->reuseport;
2014 		from = numifs * nsd->this_child->child_num;
2015 		if(from+numifs > nsd->ifs) { /* should not happen */
2016 			from = 0;
2017 			numifs = nsd->ifs;
2018 		}
2019 	} else {
2020 		from = 0;
2021 		numifs = nsd->ifs;
2022 	}
2023 
2024 	if (nsd->server_kind & NSD_SERVER_UDP) {
2025 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2026 		udp_query = query_create(server_region,
2027 			compressed_dname_offsets, compression_table_size);
2028 #else
2029 		udp_query = NULL;
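		/* Pre-create a query and an iovec for every slot in the
		 * batch, so recvmmsg() can scatter incoming datagrams
		 * directly into the query packet buffers. */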
2030 		memset(msgs, 0, sizeof(msgs));
2031 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
2032 			queries[i] = query_create(server_region,
2033 				compressed_dname_offsets, compression_table_size);
2034 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2035 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
2036 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
2037 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
2038 			msgs[i].msg_hdr.msg_iovlen  = 1;
2039 			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
2040 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2041 		}
2042 #endif
2043 		for (i = from; i < from+numifs; ++i) {
2044 			struct udp_handler_data *data;
2045 			struct event *handler;
2046 
2047 			data = (struct udp_handler_data *) region_alloc(
2048 				server_region,
2049 				sizeof(struct udp_handler_data));
2050 			data->query = udp_query;
2051 			data->nsd = nsd;
2052 			data->socket = &nsd->udp[i];
2053 
2054 			handler = (struct event*) region_alloc(
2055 				server_region, sizeof(*handler));
2056 			event_set(handler, nsd->udp[i].s, EV_PERSIST|EV_READ,
2057 				handle_udp, data);
2058 			if(event_base_set(event_base, handler) != 0)
2059 				log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2060 			if(event_add(handler, NULL) != 0)
2061 				log_msg(LOG_ERR, "nsd udp: event_add failed");
2062 		}
2063 	}
2064 
2065 	/*
2066 	 * Keep track of all the TCP accept handlers so we can enable
2067 	 * and disable them based on the current number of active TCP
2068 	 * connections.
2069 	 */
2070 	tcp_accept_handler_count = numifs;
2071 	tcp_accept_handlers = (struct tcp_accept_handler_data*)
2072 		region_alloc_array(server_region,
2073 		numifs, sizeof(*tcp_accept_handlers));
2074 	if (nsd->server_kind & NSD_SERVER_TCP) {
2075 		for (i = from; i < from+numifs; ++i) {
2076 			struct event *handler = &tcp_accept_handlers[i-from].event;
2077 			struct tcp_accept_handler_data* data =
2078 				&tcp_accept_handlers[i-from];
2079 			data->nsd = nsd;
2080 			data->socket = &nsd->tcp[i];
2081 			event_set(handler, nsd->tcp[i].s, EV_PERSIST|EV_READ,
2082 				handle_tcp_accept, data);
2083 			if(event_base_set(event_base, handler) != 0)
2084 				log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
2085 			if(event_add(handler, NULL) != 0)
2086 				log_msg(LOG_ERR, "nsd tcp: event_add failed");
2087 			data->event_added = 1;
2088 		}
2089 	} else tcp_accept_handler_count = 0;
2090 
2091 	/* The main loop... */
2092 	while ((mode = nsd->mode) != NSD_QUIT) {
2093 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
2094 
2095 		/* Do we need to do the statistics... */
2096 		if (mode == NSD_STATS) {
2097 #ifdef BIND8_STATS
2098 			int p = nsd->st.period;
2099 			nsd->st.period = 1; /* force stats printout */
2100 			/* Dump the statistics */
2101 			bind8_stats(nsd);
2102 			nsd->st.period = p;
2103 #else /* !BIND8_STATS */
2104 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
2105 #endif /* BIND8_STATS */
2106 
2107 			nsd->mode = NSD_RUN;
2108 		}
2109 		else if (mode == NSD_REAP_CHILDREN) {
2110 			/* got signal, notify parent. parent reaps terminated children. */
2111 			if (nsd->this_child->parent_fd != -1) {
2112 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
2113 				if (write(nsd->this_child->parent_fd,
2114 				    &parent_notify,
2115 				    sizeof(parent_notify)) == -1)
2116 				{
2117 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
2118 						(int) nsd->this_child->pid, strerror(errno));
2119 				}
2120 			} else /* no parent, so reap 'em */
2121 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
2122 			nsd->mode = NSD_RUN;
2123 		}
2124 		else if(mode == NSD_RUN) {
2125 			/* Wait for a query... */
2126 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
2127 				if (errno != EINTR) {
2128 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
2129 					break;
2130 				}
2131 			}
2132 		} else if(mode == NSD_QUIT) {
2133 			/* ignore here, quit */
2134 		} else {
2135 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
2136 				(int)mode);
2137 			nsd->mode = NSD_RUN;
2138 		}
2139 	}
2140 
2141 #ifdef	BIND8_STATS
2142 	bind8_stats(nsd);
2143 #endif /* BIND8_STATS */
2144 
2145 #if 0 /* OS collects memory pages */
2146 	event_base_free(event_base);
2147 	region_destroy(server_region);
2148 #endif
2149 	server_shutdown(nsd);
2150 }
2151 
2152 #if defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG)
2153 static void
2154 handle_udp(int fd, short event, void* arg)
2155 {
2156 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
2157 	int received, sent, recvcount, i;
2158 	struct query *q;
2159 
2160 	if (!(event & EV_READ)) {
2161 		return;
2162 	}
2163 	recvcount = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
2164 	/* this printf strangely gave a performance increase on Linux */
2165 	/* printf("recvcount %d \n", recvcount); */
2166 	if (recvcount == -1) {
2167 		if (errno != EAGAIN && errno != EINTR) {
2168 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
2169 			STATUP(data->nsd, rxerr);
2170 			/* No zone statup */
2171 		}
2172 		/* Simply no data available */
2173 		return;
2174 	}
2175 	for (i = 0; i < recvcount; i++) {
2176 	loopstart:
2177 		received = msgs[i].msg_len;
2178 		q = queries[i];
2179 		if (received == -1) {
2180 			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
2181 				msgs[i].msg_hdr.msg_flags));
2182 			STATUP(data->nsd, rxerr);
2183 			/* No zone statup */
2184 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2185 			iovecs[i].iov_len = buffer_remaining(q->packet);
2186 			goto swap_drop;
2187 		}
2188 
2189 		/* Account... */
2190 #ifdef BIND8_STATS
2191 		if (data->socket->fam == AF_INET) {
2192 			STATUP(data->nsd, qudp);
2193 		} else if (data->socket->fam == AF_INET6) {
2194 			STATUP(data->nsd, qudp6);
2195 		}
2196 #endif
2197 
2198 		buffer_skip(q->packet, received);
2199 		buffer_flip(q->packet);
2200 
2201 		/* Process and answer the query... */
2202 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
2203 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
2204 				STATUP(data->nsd, nona);
2205 				ZTATUP(data->nsd, q->zone, nona);
2206 			}
2207 
2208 #ifdef USE_ZONE_STATS
2209 			if (data->socket->fam == AF_INET) {
2210 				ZTATUP(data->nsd, q->zone, qudp);
2211 			} else if (data->socket->fam == AF_INET6) {
2212 				ZTATUP(data->nsd, q->zone, qudp6);
2213 			}
2214 #endif
2215 
2216 			/* Add EDNS0 and TSIG info if necessary.  */
2217 			query_add_optional(q, data->nsd);
2218 
2219 			buffer_flip(q->packet);
2220 			iovecs[i].iov_len = buffer_remaining(q->packet);
2221 #ifdef BIND8_STATS
2222 			/* Account the rcode & TC... */
2223 			STATUP2(data->nsd, rcode, RCODE(q->packet));
2224 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
2225 			if (TC(q->packet)) {
2226 				STATUP(data->nsd, truncated);
2227 				ZTATUP(data->nsd, q->zone, truncated);
2228 			}
2229 #endif /* BIND8_STATS */
2230 		} else {
2231 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2232 			iovecs[i].iov_len = buffer_remaining(q->packet);
2233 		swap_drop:
2234 			STATUP(data->nsd, dropped);
2235 			ZTATUP(data->nsd, q->zone, dropped);
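			/* Swap the dropped slot to the tail of the batch so
			 * msgs[0..recvcount) stays contiguous for the
			 * sendmmsg() below; the entry swapped into slot i is
			 * reprocessed via the goto to loopstart. */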
2236 			if(i != recvcount-1) {
2237 				/* swap with last and decrease recvcount */
2238 				struct mmsghdr mtmp = msgs[i];
2239 				struct iovec iotmp = iovecs[i];
2240 				recvcount--;
2241 				msgs[i] = msgs[recvcount];
2242 				iovecs[i] = iovecs[recvcount];
2243 				queries[i] = queries[recvcount];
2244 				msgs[recvcount] = mtmp;
2245 				iovecs[recvcount] = iotmp;
2246 				queries[recvcount] = q;
2247 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
2248 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
2249 				goto loopstart;
2250 			} else { recvcount --; }
2251 		}
2252 	}
2253 
2254 	/* send until all are sent */
2255 	i = 0;
2256 	while(i<recvcount) {
2257 		sent = sendmmsg(fd, &msgs[i], recvcount-i, 0);
2258 		if(sent == -1) {
2259 			const char* es = strerror(errno);
2260 			char a[48];
2261 			addr2str(&queries[i]->addr, a, sizeof(a));
2262 			log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
2263 #ifdef BIND8_STATS
2264 			data->nsd->st.txerr += recvcount-i;
2265 #endif /* BIND8_STATS */
2266 			break;
2267 		}
2268 		i += sent;
2269 	}
2270 	for(i=0; i<recvcount; i++) {
2271 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2272 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
2273 	}
2274 }
2275 
2276 #else /* defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG) */
2277 
2278 static void
2279 handle_udp(int fd, short event, void* arg)
2280 {
2281 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
2282 	int received, sent;
2283 #ifndef NONBLOCKING_IS_BROKEN
2284 #ifdef HAVE_RECVMMSG
2285 	int recvcount;
2286 #endif /* HAVE_RECVMMSG */
2287 	int i;
2288 #endif /* NONBLOCKING_IS_BROKEN */
2289 	struct query *q;
2290 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2291 	q = data->query;
2292 #endif
2293 
2294 	if (!(event & EV_READ)) {
2295 		return;
2296 	}
2297 #ifndef NONBLOCKING_IS_BROKEN
2298 #ifdef HAVE_RECVMMSG
2299 	recvcount = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
2300 	/* this printf strangely gave a performance increase on Linux */
2301 	/* printf("recvcount %d \n", recvcount); */
2302 	if (recvcount == -1) {
2303 		if (errno != EAGAIN && errno != EINTR) {
2304 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
2305 			STATUP(data->nsd, rxerr);
2306 			/* No zone statup */
2307 		}
2308 		/* Simply no data available */
2309 		return;
2310 	}
2311 	for (i = 0; i < recvcount; i++) {
2312 		received = msgs[i].msg_len;
2313 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2314 		if (received == -1) {
2315 			log_msg(LOG_ERR, "recvmmsg failed");
2316 			STATUP(data->nsd, rxerr);
2317 			/* No zone statup */
2318 			/* the error can be found in msgs[i].msg_hdr.msg_flags */
2319 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2320 			continue;
2321 		}
2322 		q = queries[i];
2323 #else
2324 	for(i=0; i<NUM_RECV_PER_SELECT; i++) {
2325 #endif /* HAVE_RECVMMSG */
2326 #endif /* NONBLOCKING_IS_BROKEN */
2327 
2328 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2329 		/* Initialize the query... */
2330 		query_reset(q, UDP_MAX_MESSAGE_LEN, 0);
2331 
2332 		received = recvfrom(fd,
2333 				    buffer_begin(q->packet),
2334 				    buffer_remaining(q->packet),
2335 				    0,
2336 				    (struct sockaddr *)&q->addr,
2337 				    &q->addrlen);
2338 		if (received == -1) {
2339 			if (errno != EAGAIN && errno != EINTR) {
2340 				log_msg(LOG_ERR, "recvfrom failed: %s", strerror(errno));
2341 				STATUP(data->nsd, rxerr);
2342 				/* No zone statup */
2343 			}
2344 			return;
2345 		}
2346 #endif /* NONBLOCKING_IS_BROKEN || !HAVE_RECVMMSG */
2347 
2348 		/* Account... */
2349 		if (data->socket->fam == AF_INET) {
2350 			STATUP(data->nsd, qudp);
2351 		} else if (data->socket->fam == AF_INET6) {
2352 			STATUP(data->nsd, qudp6);
2353 		}
2354 
2355 		buffer_skip(q->packet, received);
2356 		buffer_flip(q->packet);
2357 
2358 		/* Process and answer the query... */
2359 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
2360 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
2361 				STATUP(data->nsd, nona);
2362 				ZTATUP(data->nsd, q->zone, nona);
2363 			}
2364 
2365 #ifdef USE_ZONE_STATS
2366 			if (data->socket->fam == AF_INET) {
2367 				ZTATUP(data->nsd, q->zone, qudp);
2368 			} else if (data->socket->fam == AF_INET6) {
2369 				ZTATUP(data->nsd, q->zone, qudp6);
2370 			}
2371 #endif
2372 
2373 			/* Add EDNS0 and TSIG info if necessary.  */
2374 			query_add_optional(q, data->nsd);
2375 
2376 			buffer_flip(q->packet);
2377 
2378 			sent = sendto(fd,
2379 				      buffer_begin(q->packet),
2380 				      buffer_remaining(q->packet),
2381 				      0,
2382 				      (struct sockaddr *) &q->addr,
2383 				      q->addrlen);
2384 			if (sent == -1) {
2385 				const char* es = strerror(errno);
2386 				char a[48];
2387 				addr2str(&q->addr, a, sizeof(a));
2388 				log_msg(LOG_ERR, "sendto %s failed: %s", a, es);
2389 				STATUP(data->nsd, txerr);
2390 				ZTATUP(data->nsd, q->zone, txerr);
2391 			} else if ((size_t) sent != buffer_remaining(q->packet)) {
2392 				log_msg(LOG_ERR, "sent %d in place of %d bytes", sent, (int) buffer_remaining(q->packet));
2393 			} else {
2394 #ifdef BIND8_STATS
2395 				/* Account the rcode & TC... */
2396 				STATUP2(data->nsd, rcode, RCODE(q->packet));
2397 				ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
2398 				if (TC(q->packet)) {
2399 					STATUP(data->nsd, truncated);
2400 					ZTATUP(data->nsd, q->zone, truncated);
2401 				}
2402 #endif /* BIND8_STATS */
2403 			}
2404 		} else {
2405 			STATUP(data->nsd, dropped);
2406 			ZTATUP(data->nsd, q->zone, dropped);
2407 		}
2408 #ifndef NONBLOCKING_IS_BROKEN
2409 #ifdef HAVE_RECVMMSG
2410 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2411 #endif
2412 	}
2413 #endif
2414 }
2415 #endif /* defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG) */
2416 
2417 
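/*
 * Tear down a TCP connection: remove its event, close the socket,
 * re-enable the accept handlers if we were at the connection limit (or
 * in slow-accept mode), and free the per-connection region.
 */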
2418 static void
2419 cleanup_tcp_handler(struct tcp_handler_data* data)
2420 {
2421 	event_del(&data->event);
2422 	close(data->event.ev_fd);
2423 
2424 	/*
2425 	 * Enable the TCP accept handlers when the current number of
2426 	 * TCP connections is about to drop below the maximum number
2427 	 * of TCP connections.
2428 	 */
2429 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
2430 		configure_handler_event_types(EV_READ|EV_PERSIST);
2431 		if(slowaccept) {
2432 			event_del(&slowaccept_event);
2433 			slowaccept = 0;
2434 		}
2435 	}
2436 	--data->nsd->current_tcp_count;
2437 	assert(data->nsd->current_tcp_count >= 0);
2438 
2439 	region_destroy(data->region);
2440 }
2441 
2442 static void
2443 handle_tcp_reading(int fd, short event, void* arg)
2444 {
2445 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
2446 	ssize_t received;
2447 	struct event_base* ev_base;
2448 	struct timeval timeout;
2449 
2450 	if ((event & EV_TIMEOUT)) {
2451 		/* Connection timed out.  */
2452 		cleanup_tcp_handler(data);
2453 		return;
2454 	}
2455 
2456 	if (data->nsd->tcp_query_count > 0 &&
2457 		data->query_count >= data->nsd->tcp_query_count) {
2458 		/* No more queries allowed on this tcp connection.  */
2459 		cleanup_tcp_handler(data);
2460 		return;
2461 	}
2462 
2463 	assert((event & EV_READ));
2464 
2465 	if (data->bytes_transmitted == 0) {
2466 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
2467 	}
2468 
2469 	/*
2470 	 * Check if we received the leading packet length bytes yet.
2471 	 */
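	/* DNS over TCP (RFC 1035 section 4.2.2) prefixes each message with
	 * a two-octet length field in network byte order. */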
2472 	if (data->bytes_transmitted < sizeof(uint16_t)) {
2473 		received = read(fd,
2474 				(char *) &data->query->tcplen
2475 				+ data->bytes_transmitted,
2476 				sizeof(uint16_t) - data->bytes_transmitted);
2477 		if (received == -1) {
2478 			if (errno == EAGAIN || errno == EINTR) {
2479 				/*
2480 				 * Read would block, wait until more
2481 				 * data is available.
2482 				 */
2483 				return;
2484 			} else {
2485 				char buf[48];
2486 				addr2str(&data->query->addr, buf, sizeof(buf));
2487 #ifdef ECONNRESET
2488 				if (verbosity >= 2 || errno != ECONNRESET)
2489 #endif /* ECONNRESET */
2490 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
2491 				cleanup_tcp_handler(data);
2492 				return;
2493 			}
2494 		} else if (received == 0) {
2495 			/* EOF */
2496 			cleanup_tcp_handler(data);
2497 			return;
2498 		}
2499 
2500 		data->bytes_transmitted += received;
2501 		if (data->bytes_transmitted < sizeof(uint16_t)) {
2502 			/*
2503 			 * Not done with the tcplen yet, wait for more
2504 			 * data to become available.
2505 			 */
2506 			return;
2507 		}
2508 
2509 		assert(data->bytes_transmitted == sizeof(uint16_t));
2510 
2511 		data->query->tcplen = ntohs(data->query->tcplen);
2512 
2513 		/*
2514 		 * Minimum query size is:
2515 		 *
2516 		 *     Size of the header (12)
2517 		 *   + Root domain name   (1)
2518 		 *   + Query class        (2)
2519 		 *   + Query type         (2)
2520 		 */
2521 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
2522 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
2523 			cleanup_tcp_handler(data);
2524 			return;
2525 		}
2526 
2527 		if (data->query->tcplen > data->query->maxlen) {
2528 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
2529 			cleanup_tcp_handler(data);
2530 			return;
2531 		}
2532 
2533 		buffer_set_limit(data->query->packet, data->query->tcplen);
2534 	}
2535 
2536 	assert(buffer_remaining(data->query->packet) > 0);
2537 
2538 	/* Read the (remaining) query data.  */
2539 	received = read(fd,
2540 			buffer_current(data->query->packet),
2541 			buffer_remaining(data->query->packet));
2542 	if (received == -1) {
2543 		if (errno == EAGAIN || errno == EINTR) {
2544 			/*
2545 			 * Read would block, wait until more data is
2546 			 * available.
2547 			 */
2548 			return;
2549 		} else {
2550 			char buf[48];
2551 			addr2str(&data->query->addr, buf, sizeof(buf));
2552 #ifdef ECONNRESET
2553 			if (verbosity >= 2 || errno != ECONNRESET)
2554 #endif /* ECONNRESET */
2555 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
2556 			cleanup_tcp_handler(data);
2557 			return;
2558 		}
2559 	} else if (received == 0) {
2560 		/* EOF */
2561 		cleanup_tcp_handler(data);
2562 		return;
2563 	}
2564 
2565 	data->bytes_transmitted += received;
2566 	buffer_skip(data->query->packet, received);
2567 	if (buffer_remaining(data->query->packet) > 0) {
2568 		/*
2569 		 * Message not yet complete, wait for more data to
2570 		 * become available.
2571 		 */
2572 		return;
2573 	}
2574 
2575 	assert(buffer_position(data->query->packet) == data->query->tcplen);
2576 
2577 	/* Account... */
2578 #ifdef BIND8_STATS
2579 #ifndef INET6
2580 	STATUP(data->nsd, ctcp);
2581 #else
2582 	if (data->query->addr.ss_family == AF_INET) {
2583 		STATUP(data->nsd, ctcp);
2584 	} else if (data->query->addr.ss_family == AF_INET6) {
2585 		STATUP(data->nsd, ctcp6);
2586 	}
2587 #endif
2588 #endif /* BIND8_STATS */
2589 
2590 	/* We have a complete query, process it.  */
2591 
2592 	/* tcp-query-count: handle query counter ++ */
2593 	data->query_count++;
2594 
2595 	buffer_flip(data->query->packet);
2596 	data->query_state = server_process_query(data->nsd, data->query);
2597 	if (data->query_state == QUERY_DISCARDED) {
2598 		/* Drop the packet and the entire connection... */
2599 		STATUP(data->nsd, dropped);
2600 		ZTATUP(data->nsd, data->query->zone, dropped);
2601 		cleanup_tcp_handler(data);
2602 		return;
2603 	}
2604 
2605 #ifdef BIND8_STATS
2606 	if (RCODE(data->query->packet) == RCODE_OK
2607 	    && !AA(data->query->packet))
2608 	{
2609 		STATUP(data->nsd, nona);
2610 		ZTATUP(data->nsd, data->query->zone, nona);
2611 	}
2612 #endif /* BIND8_STATS */
2613 
2614 #ifdef USE_ZONE_STATS
2615 #ifndef INET6
2616 	ZTATUP(data->nsd, data->query->zone, ctcp);
2617 #else
2618 	if (data->query->addr.ss_family == AF_INET) {
2619 		ZTATUP(data->nsd, data->query->zone, ctcp);
2620 	} else if (data->query->addr.ss_family == AF_INET6) {
2621 		ZTATUP(data->nsd, data->query->zone, ctcp6);
2622 	}
2623 #endif
2624 #endif /* USE_ZONE_STATS */
2625 
2626 	query_add_optional(data->query, data->nsd);
2627 
2628 	/* Switch to the tcp write handler.  */
2629 	buffer_flip(data->query->packet);
2630 	data->query->tcplen = buffer_remaining(data->query->packet);
2631 	data->bytes_transmitted = 0;
2632 
2633 	timeout.tv_sec = data->nsd->tcp_timeout;
2634 	timeout.tv_usec = 0L;
2635 
2636 	ev_base = data->event.ev_base;
2637 	event_del(&data->event);
2638 	event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
2639 		handle_tcp_writing, data);
2640 	if(event_base_set(ev_base, &data->event) != 0)
2641 		log_msg(LOG_ERR, "event base set tcpr failed");
2642 	if(event_add(&data->event, &timeout) != 0)
2643 		log_msg(LOG_ERR, "event add tcpr failed");
2644 	/* see if we can write the answer right away (usually we can; if not we get EAGAIN) */
2645 	handle_tcp_writing(fd, EV_WRITE, data);
2646 }
2647 
2648 static void
2649 handle_tcp_writing(int fd, short event, void* arg)
2650 {
2651 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
2652 	ssize_t sent;
2653 	struct query *q = data->query;
2654 	struct timeval timeout;
2655 	struct event_base* ev_base;
2656 
2657 	if ((event & EV_TIMEOUT)) {
2658 		/* Connection timed out.  */
2659 		cleanup_tcp_handler(data);
2660 		return;
2661 	}
2662 
2663 	assert((event & EV_WRITE));
2664 
2665 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
2666 		/* Writing the response packet length.  */
2667 		uint16_t n_tcplen = htons(q->tcplen);
2668 #ifdef HAVE_WRITEV
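		/* writev() hands the length prefix and the message body to
		 * the kernel in one call, so both can go out in a single TCP
		 * segment instead of two separate small writes. */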
2669 		struct iovec iov[2];
2670 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
2671 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
2672 		iov[1].iov_base = buffer_begin(q->packet);
2673 		iov[1].iov_len = buffer_limit(q->packet);
2674 		sent = writev(fd, iov, 2);
2675 #else /* HAVE_WRITEV */
2676 		sent = write(fd,
2677 			     (const char *) &n_tcplen + data->bytes_transmitted,
2678 			     sizeof(n_tcplen) - data->bytes_transmitted);
2679 #endif /* HAVE_WRITEV */
2680 		if (sent == -1) {
2681 			if (errno == EAGAIN || errno == EINTR) {
2682 				/*
2683 				 * Write would block, wait until
2684 				 * socket becomes writable again.
2685 				 */
2686 				return;
2687 			} else {
2688 #ifdef ECONNRESET
2689 				if(verbosity >= 2 || errno != ECONNRESET)
2690 #endif /* ECONNRESET */
2691 #ifdef EPIPE
2692 				  if(verbosity >= 2 || errno != EPIPE)
2693 #endif /* EPIPE 'broken pipe' */
2694 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
2695 				cleanup_tcp_handler(data);
2696 				return;
2697 			}
2698 		}
2699 
2700 		data->bytes_transmitted += sent;
2701 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
2702 			/*
2703 			 * Writing not complete, wait until socket
2704 			 * becomes writable again.
2705 			 */
2706 			return;
2707 		}
2708 
2709 #ifdef HAVE_WRITEV
2710 		sent -= sizeof(n_tcplen);
2711 		/* writev may have sent (part of) the packet data too;
2712 		 * from here on, sent counts packet bytes only */
2712 		goto packet_could_be_done;
2713 #endif
2714  	}
2715 
2716 	sent = write(fd,
2717 		     buffer_current(q->packet),
2718 		     buffer_remaining(q->packet));
2719 	if (sent == -1) {
2720 		if (errno == EAGAIN || errno == EINTR) {
2721 			/*
2722 			 * Write would block, wait until
2723 			 * socket becomes writable again.
2724 			 */
2725 			return;
2726 		} else {
2727 #ifdef ECONNRESET
2728 			if(verbosity >= 2 || errno != ECONNRESET)
2729 #endif /* ECONNRESET */
2730 #ifdef EPIPE
2731 				  if(verbosity >= 2 || errno != EPIPE)
2732 #endif /* EPIPE 'broken pipe' */
2733 			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
2734 			cleanup_tcp_handler(data);
2735 			return;
2736 		}
2737 	}
2738 
2739 	data->bytes_transmitted += sent;
2740 #ifdef HAVE_WRITEV
2741   packet_could_be_done:
2742 #endif
2743 	buffer_skip(q->packet, sent);
2744 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
2745 		/*
2746 		 * Still more data to write when socket becomes
2747 		 * writable again.
2748 		 */
2749 		return;
2750 	}
2751 
2752 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
2753 
2754 	if (data->query_state == QUERY_IN_AXFR) {
2755 		/* Continue processing AXFR and writing back results.  */
2756 		buffer_clear(q->packet);
2757 		data->query_state = query_axfr(data->nsd, q);
2758 		if (data->query_state != QUERY_PROCESSED) {
2759 			query_add_optional(data->query, data->nsd);
2760 
2761 			/* Reset data. */
2762 			buffer_flip(q->packet);
2763 			q->tcplen = buffer_remaining(q->packet);
2764 			data->bytes_transmitted = 0;
2765 			/* Reset timeout.  */
2766 			timeout.tv_sec = data->nsd->tcp_timeout;
2767 			timeout.tv_usec = 0L;
2768 			ev_base = data->event.ev_base;
2769 			event_del(&data->event);
2770 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
2771 				handle_tcp_writing, data);
2772 			if(event_base_set(ev_base, &data->event) != 0)
2773 				log_msg(LOG_ERR, "event base set tcpw failed");
2774 			if(event_add(&data->event, &timeout) != 0)
2775 				log_msg(LOG_ERR, "event add tcpw failed");
2776 
2777 			/*
2778 			 * Write data if/when the socket is writable
2779 			 * again.
2780 			 */
2781 			return;
2782 		}
2783 	}
2784 
2785 	/*
2786 	 * Done sending, wait for the next request to arrive on the
2787 	 * TCP socket by installing the TCP read handler.
2788 	 */
2789 	if (data->nsd->tcp_query_count > 0 &&
2790 		data->query_count >= data->nsd->tcp_query_count) {
2791 
2792 		(void) shutdown(fd, SHUT_WR);
2793 	}
2794 
2795 	data->bytes_transmitted = 0;
2796 
2797 	timeout.tv_sec = data->nsd->tcp_timeout;
2798 	timeout.tv_usec = 0L;
2799 	ev_base = data->event.ev_base;
2800 	event_del(&data->event);
2801 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
2802 		handle_tcp_reading, data);
2803 	if(event_base_set(ev_base, &data->event) != 0)
2804 		log_msg(LOG_ERR, "event base set tcpw failed");
2805 	if(event_add(&data->event, &timeout) != 0)
2806 		log_msg(LOG_ERR, "event add tcpw failed");
2807 }
2808 
2809 
2810 static void
2811 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
2812 	void* ATTR_UNUSED(arg))
2813 {
2814 	if(slowaccept) {
2815 		configure_handler_event_types(EV_PERSIST | EV_READ);
2816 		slowaccept = 0;
2817 	}
2818 }
2819 
2820 /*
2821  * Handle an incoming TCP connection.  The connection is accepted and
2822  * a new TCP reader event handler is added.  The TCP handler
2823  * is responsible for cleanup when the connection is closed.
2824  */
2825 static void
2826 handle_tcp_accept(int fd, short event, void* arg)
2827 {
2828 	struct tcp_accept_handler_data *data
2829 		= (struct tcp_accept_handler_data *) arg;
2830 	int s;
2831 	struct tcp_handler_data *tcp_data;
2832 	region_type *tcp_region;
2833 #ifdef INET6
2834 	struct sockaddr_storage addr;
2835 #else
2836 	struct sockaddr_in addr;
2837 #endif
2838 	socklen_t addrlen;
2839 	struct timeval timeout;
2840 
2841 	if (!(event & EV_READ)) {
2842 		return;
2843 	}
2844 
2845 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
2846 		return;
2847 	}
2848 
2849 	/* Accept it... */
2850 	addrlen = sizeof(addr);
2851 	s = accept(fd, (struct sockaddr *) &addr, &addrlen);
2852 	if (s == -1) {
2853 		/*
2854 		 * EMFILE and ENFILE signal that the limit of open file
2855 		 * descriptors has been reached; pause accept().
2856 		 * EINTR is a signal interrupt.  The other errno values are
2857 		 * various OS ways of saying that the client has closed the
2858 		 * connection.
2859 		 */
2859 		if (errno == EMFILE || errno == ENFILE) {
2860 			if (!slowaccept) {
2861 				/* disable accept events */
2862 				struct timeval tv;
2863 				configure_handler_event_types(0);
2864 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
2865 				tv.tv_usec = 0L;
2866 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
2867 					handle_slowaccept_timeout, NULL);
2868 				(void)event_base_set(data->event.ev_base,
2869 					&slowaccept_event);
2870 				(void)event_add(&slowaccept_event, &tv);
2871 				slowaccept = 1;
2872 				/* We don't want to spam the logs here */
2873 			}
2874 		} else if (errno != EINTR
2875 			&& errno != EWOULDBLOCK
2876 #ifdef ECONNABORTED
2877 			&& errno != ECONNABORTED
2878 #endif /* ECONNABORTED */
2879 #ifdef EPROTO
2880 			&& errno != EPROTO
2881 #endif /* EPROTO */
2882 			) {
2883 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
2884 		}
2885 		return;
2886 	}
2887 
2888 	if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
2889 		log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
2890 		close(s);
2891 		return;
2892 	}
2893 
2894 	/*
2895 	 * This region is deallocated when the TCP connection is
2896 	 * closed by the TCP handler.
2897 	 */
2898 	tcp_region = region_create(xalloc, free);
2899 	tcp_data = (struct tcp_handler_data *) region_alloc(
2900 		tcp_region, sizeof(struct tcp_handler_data));
2901 	tcp_data->region = tcp_region;
2902 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
2903 		compression_table_size);
2904 	tcp_data->nsd = data->nsd;
2905 	tcp_data->query_count = 0;
2906 
2907 	tcp_data->query_state = QUERY_PROCESSED;
2908 	tcp_data->bytes_transmitted = 0;
2909 	memcpy(&tcp_data->query->addr, &addr, addrlen);
2910 	tcp_data->query->addrlen = addrlen;
2911 
2912 	timeout.tv_sec = data->nsd->tcp_timeout;
2913 	timeout.tv_usec = 0;
2914 
2915 	event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
2916 		handle_tcp_reading, tcp_data);
2917 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
2918 		log_msg(LOG_ERR, "cannot set tcp event base");
2919 		close(s);
2920 		region_destroy(tcp_region);
2921 		return;
2922 	}
2923 	if(event_add(&tcp_data->event, &timeout) != 0) {
2924 		log_msg(LOG_ERR, "cannot add tcp to event base");
2925 		close(s);
2926 		region_destroy(tcp_region);
2927 		return;
2928 	}
2929 
2930 	/*
2931 	 * Keep track of the total number of TCP handlers installed so
2932 	 * we can stop accepting connections when the maximum number
2933 	 * of simultaneous TCP connections is reached.
2934 	 */
2935 	++data->nsd->current_tcp_count;
2936 	if (data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
2937 		configure_handler_event_types(0);
2938 	}
2939 }
2940 
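/*
 * Send a command over the ipc pipe to every child server process and
 * close the pipes.  If timeout is nonzero, wait up to that many seconds
 * for each child to echo the command back as an acknowledgement.
 */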
2941 static void
2942 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
2943 {
2944 	size_t i;
2945 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
2946 	for (i = 0; i < nsd->child_count; ++i) {
2947 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
2948 			if (write(nsd->children[i].child_fd,
2949 				&command,
2950 				sizeof(command)) == -1)
2951 			{
2952 				if(errno != EAGAIN && errno != EINTR)
2953 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
2954 					(int) command,
2955 					(int) nsd->children[i].pid,
2956 					strerror(errno));
2957 			} else if (timeout > 0) {
2958 				(void)block_read(NULL,
2959 					nsd->children[i].child_fd,
2960 					&command, sizeof(command), timeout);
2961 			}
2962 			fsync(nsd->children[i].child_fd);
2963 			close(nsd->children[i].child_fd);
2964 			nsd->children[i].child_fd = -1;
2965 		}
2966 	}
2967 }
2968 
2969 static void
2970 send_children_quit(struct nsd* nsd)
2971 {
2972 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
2973 	send_children_command(nsd, NSD_QUIT, 0);
2974 }
2975 
2976 static void
2977 send_children_quit_and_wait(struct nsd* nsd)
2978 {
2979 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
2980 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
2981 }
2982 
2983 #ifdef BIND8_STATS
2984 static void
2985 set_children_stats(struct nsd* nsd)
2986 {
2987 	size_t i;
2988 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
2989 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
2990 	for (i = 0; i < nsd->child_count; ++i) {
2991 		nsd->children[i].need_to_send_STATS = 1;
2992 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
2993 	}
2994 }
2995 #endif /* BIND8_STATS */
2996 
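/*
 * Set the event types (e.g. EV_READ|EV_PERSIST, or 0 to disable) on all
 * TCP accept handlers.  Passing 0 removes the events, which stops
 * accept() from being called while the connection limit or the file
 * descriptor limit is reached.
 */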
2997 static void
2998 configure_handler_event_types(short event_types)
2999 {
3000 	size_t i;
3001 
3002 	for (i = 0; i < tcp_accept_handler_count; ++i) {
3003 		struct event* handler = &tcp_accept_handlers[i].event;
3004 		if(event_types) {
3005 			/* reassign */
3006 			int fd = handler->ev_fd;
3007 			struct event_base* base = handler->ev_base;
3008 			if(tcp_accept_handlers[i].event_added)
3009 				event_del(handler);
3010 			event_set(handler, fd, event_types,
3011 				handle_tcp_accept, &tcp_accept_handlers[i]);
3012 			if(event_base_set(base, handler) != 0)
3013 				log_msg(LOG_ERR, "conhand: cannot event_base");
3014 			if(event_add(handler, NULL) != 0)
3015 				log_msg(LOG_ERR, "conhand: cannot event_add");
3016 			tcp_accept_handlers[i].event_added = 1;
3017 		} else {
3018 			/* remove */
3019 			if(tcp_accept_handlers[i].event_added) {
3020 				event_del(handler);
3021 				tcp_accept_handlers[i].event_added = 0;
3022 			}
3023 		}
3024 	}
3025 }
3026