/*
 * server.c -- nsd(8) network input/output
 *
 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
 *
 * See LICENSE for the license.
 *
 */

#include "config.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <netinet/in.h>
#include <arpa/inet.h>

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include <netdb.h>
#include <poll.h>
#ifndef SHUT_WR
#define SHUT_WR 1
#endif
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif /* HAVE_MMAP */
#ifdef HAVE_OPENSSL_RAND_H
#include <openssl/rand.h>
#endif
#ifndef USE_MINI_EVENT
#  ifdef HAVE_EVENT_H
#    include <event.h>
#  else
#    include <event2/event.h>
#    include "event2/event_struct.h"
#    include "event2/event_compat.h"
#  endif
#else
#  include "mini_event.h"
#endif

#include "axfr.h"
#include "namedb.h"
#include "netio.h"
#include "xfrd.h"
#include "xfrd-tcp.h"
#include "xfrd-disk.h"
#include "difffile.h"
#include "nsec3.h"
#include "ipc.h"
#include "udb.h"
#include "remote.h"
#include "lookup3.h"
#include "rrl.h"
#ifdef USE_DNSTAP
#include "dnstap/dnstap_collector.h"
#endif

#define RELOAD_SYNC_TIMEOUT 25 /* seconds */

/*
 * Data for the UDP handlers.
 */
struct udp_handler_data
{
	struct nsd        *nsd;
	struct nsd_socket *socket;
	query_type        *query;
};

struct tcp_accept_handler_data {
	struct nsd         *nsd;
	struct nsd_socket  *socket;
	int event_added;
	struct event       event;
};
/*
 * These globals are used to enable the TCP accept handlers
 * when the number of TCP connections drops below the maximum
 * number of TCP connections.
 */
static size_t		tcp_accept_handler_count;
static struct tcp_accept_handler_data*	tcp_accept_handlers;

static struct event slowaccept_event;
static int slowaccept;

#ifndef NONBLOCKING_IS_BROKEN
#  define NUM_RECV_PER_SELECT 100
#endif

#if (!defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG))
struct mmsghdr msgs[NUM_RECV_PER_SELECT];
struct iovec iovecs[NUM_RECV_PER_SELECT];
struct query *queries[NUM_RECV_PER_SELECT];
#endif

/*
 * Data for the TCP connection handlers.
 *
 * The TCP handlers use non-blocking I/O.  This is necessary to avoid
 * blocking the entire server on a slow TCP connection, but does make
 * reading from and writing to the socket more complicated.
 *
 * Basically, whenever a read/write would block (indicated by EAGAIN
 * in errno) we remember the position we were reading from/writing to
 * and return from the TCP reading/writing event handler.  When the
 * socket becomes readable/writable again we continue from the same
 * position; see the sketch after this structure.
 */
struct tcp_handler_data
{
	/*
	 * The region used to allocate all TCP connection related
	 * data, including this structure.  This region is destroyed
	 * when the connection is closed.
	 */
	region_type*		region;

	/*
	 * The global nsd structure.
	 */
	struct nsd*			nsd;

	/*
	 * The current query data for this TCP connection.
	 */
	query_type*			query;

	/*
	 * The query_state is used to remember if we are performing an
	 * AXFR, if we're done processing, or if we should discard the
	 * query and connection.
	 */
	query_state_type	query_state;

	/*
	 * The event for the file descriptor and tcp timeout
	 */
	struct event event;

	/*
	 * The bytes_transmitted field is used to remember the number
	 * of bytes transmitted when receiving or sending a DNS
	 * packet.  The count includes the two additional bytes used
	 * to specify the packet length on a TCP connection.
	 */
	size_t				bytes_transmitted;

	/*
	 * The number of queries handled by this specific TCP connection.
	 */
	int					query_count;

	/*
	 * The timeout in msec for this tcp connection
	 */
	int	tcp_timeout;
};
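
/*
 * Illustrative sketch (compiled out; the helper below is hypothetical,
 * not part of nsd): the resume pattern described above, shown for the
 * write direction.  On EAGAIN the offset survives in bytes_transmitted
 * and the handler simply returns until the next writable event.
 */
#if 0
static void
example_resume_write(struct tcp_handler_data *data, int fd,
	const uint8_t *buf, size_t len)
{
	ssize_t sent = write(fd, buf + data->bytes_transmitted,
		len - data->bytes_transmitted);
	if (sent == -1) {
		if (errno == EAGAIN || errno == EINTR)
			return; /* retry on the next writable event */
		return; /* real error: caller closes the connection */
	}
	data->bytes_transmitted += sent;
	/* the response is complete when bytes_transmitted == len */
}
#endif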

/*
 * Handle incoming queries on the UDP server sockets.
 */
static void handle_udp(int fd, short event, void* arg);

/*
 * Handle incoming connections on the TCP sockets.  These handlers
 * usually wait for the NETIO_EVENT_READ event (indicating an incoming
 * connection) but are disabled when the number of current TCP
 * connections is equal to the maximum number of TCP connections.
 * Disabling is done by changing the handler to wait for the
 * NETIO_EVENT_NONE type.  This is done using the function
 * configure_tcp_accept_handlers.
 */
static void handle_tcp_accept(int fd, short event, void* arg);

/*
 * Handle incoming queries on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete query is received.
 */
static void handle_tcp_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TCP connection.  The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete response is sent.
 */
static void handle_tcp_writing(int fd, short event, void* arg);

/*
 * Send all children the quit command (nonblocking), then close the pipe.
 */
static void send_children_quit(struct nsd* nsd);
/* same, but at shutdown time; waits for the children to exit to avoid
 * restart issues */
static void send_children_quit_and_wait(struct nsd* nsd);

/* set the children's flags to send NSD_STATS to them */
#ifdef BIND8_STATS
static void set_children_stats(struct nsd* nsd);
#endif /* BIND8_STATS */

/*
 * Change the event types the HANDLERS are interested in to EVENT_TYPES.
 */
static void configure_handler_event_types(short event_types);

static uint16_t *compressed_dname_offsets = 0;
static uint32_t compression_table_capacity = 0;
static uint32_t compression_table_size = 0;
static domain_type* compressed_dnames[MAXRRSPP];

/*
 * Remove the specified pid from the list of child pids.  Returns -1 if
 * the pid is not in the list, child_num otherwise.  The stored pid is
 * set to 0.
 */
static int
delete_child_pid(struct nsd *nsd, pid_t pid)
{
	size_t i;
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid == pid) {
			nsd->children[i].pid = 0;
			if(!nsd->children[i].need_to_exit) {
				if(nsd->children[i].child_fd != -1)
					close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
				if(nsd->children[i].handler)
					nsd->children[i].handler->fd = -1;
			}
			return i;
		}
	}
	return -1;
}

/*
 * Restart child servers if necessary.
 */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	struct main_ipc_handler_data *ipc_data;
	size_t i;
	int sv[2];

	/* Fork the child processes... */
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid <= 0) {
			if (nsd->children[i].child_fd != -1)
				close(nsd->children[i].child_fd);
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
				log_msg(LOG_ERR, "socketpair: %s",
					strerror(errno));
				return -1;
			}
			nsd->children[i].child_fd = sv[0];
			nsd->children[i].parent_fd = sv[1];
			nsd->children[i].pid = fork();
			switch (nsd->children[i].pid) {
			default: /* SERVER MAIN */
				close(nsd->children[i].parent_fd);
				nsd->children[i].parent_fd = -1;
				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				if(!nsd->children[i].handler)
				{
					ipc_data = (struct main_ipc_handler_data*) region_alloc(
						region, sizeof(struct main_ipc_handler_data));
					ipc_data->nsd = nsd;
					ipc_data->child = &nsd->children[i];
					ipc_data->child_num = i;
					ipc_data->xfrd_sock = xfrd_sock_p;
					ipc_data->packet = buffer_create(region, QIOBUFSZ);
					ipc_data->forward_mode = 0;
					ipc_data->got_bytes = 0;
					ipc_data->total_bytes = 0;
					ipc_data->acl_num = 0;
					nsd->children[i].handler = (struct netio_handler*) region_alloc(
						region, sizeof(struct netio_handler));
					nsd->children[i].handler->fd = nsd->children[i].child_fd;
					nsd->children[i].handler->timeout = NULL;
					nsd->children[i].handler->user_data = ipc_data;
					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
					nsd->children[i].handler->event_handler = parent_handle_child_command;
					netio_add_handler(netio, nsd->children[i].handler);
				}
				/* clear any ongoing ipc */
				ipc_data = (struct main_ipc_handler_data*)
					nsd->children[i].handler->user_data;
				ipc_data->forward_mode = 0;
				/* restart - update fd */
				nsd->children[i].handler->fd = nsd->children[i].child_fd;
				break;
			case 0: /* CHILD */
				/* the child need not be able to access the
				 * nsd.db file */
				namedb_close_udb(nsd->db);
#ifdef MEMCLEAN /* OS collects memory pages */
				region_destroy(region);
#endif

				if (pledge("stdio rpath inet", NULL) == -1) {
					log_msg(LOG_ERR, "pledge");
					exit(1);
				}

				nsd->pid = 0;
				nsd->child_count = 0;
				nsd->server_kind = nsd->children[i].kind;
				nsd->this_child = &nsd->children[i];
				nsd->this_child->child_num = i;
				/* remove signal flags inherited from parent;
				   the parent will handle them. */
				nsd->signal_hint_reload_hup = 0;
				nsd->signal_hint_reload = 0;
				nsd->signal_hint_child = 0;
				nsd->signal_hint_quit = 0;
				nsd->signal_hint_shutdown = 0;
				nsd->signal_hint_stats = 0;
				nsd->signal_hint_statsusr = 0;
				close(*xfrd_sock_p);
				close(nsd->this_child->child_fd);
				nsd->this_child->child_fd = -1;
				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				server_child(nsd);
				/* NOTREACHED */
				exit(0);
			case -1:
				log_msg(LOG_ERR, "fork failed: %s",
					strerror(errno));
				return -1;
			}
		}
	}
	return 0;
}

#ifdef BIND8_STATS
static void set_bind8_alarm(struct nsd* nsd)
{
	/* resync so that the next alarm is on the next whole period */
	if(nsd->st.period > 0) /* modulo by 0 gives a division-by-zero error */
		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
}
#endif
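
/*
 * Worked example (illustrative): with st.period == 300 and
 * time(NULL) % 300 == 75, the call alarm(300 - 75) fires in 225
 * seconds, landing exactly on the next whole period boundary.
 */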

/* set zone stat ids for zones initially read in */
static void
zonestatid_tree_set(struct nsd* nsd)
{
	struct radnode* n;
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		zone_type* zone = (zone_type*)n->elem;
		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
	}
}

#ifdef USE_ZONE_STATS
void
server_zonestat_alloc(struct nsd* nsd)
{
	size_t num = (nsd->options->zonestatnames->count==0?1:
			nsd->options->zonestatnames->count);
	size_t sz = sizeof(struct nsdst)*num;
	char tmpfile[256];
	uint8_t z = 0;

	/* file names */
	nsd->zonestatfname[0] = 0;
	nsd->zonestatfname[1] = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);

	/* file descriptors */
	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[0] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		close(nsd->zonestatfd[0]);
		unlink(nsd->zonestatfname[0]);
		exit(1);
	}

#ifdef HAVE_MMAP
	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[0], strerror(errno));
		exit(1);
	}
	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[1], strerror(errno));
		exit(1);
	}
	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[0], 0);
	if(nsd->zonestat[0] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[1], 0);
	if(nsd->zonestat[1] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->zonestat[0], 0, sz);
	memset(nsd->zonestat[1], 0, sz);
	nsd->zonestatsize[0] = num;
	nsd->zonestatsize[1] = num;
	nsd->zonestatdesired = num;
	nsd->zonestatsizenow = num;
	nsd->zonestatnow = nsd->zonestat[0];
#endif /* HAVE_MMAP */
}

void
zonestat_remap(struct nsd* nsd, int idx, size_t sz)
{
#ifdef HAVE_MMAP
#ifdef MREMAP_MAYMOVE
	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
		MREMAP_MAYMOVE);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
		exit(1);
	}
#else /* !MREMAP_MAYMOVE */
	if(msync(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
	if(munmap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		exit(1);
	}
#endif /* MREMAP_MAYMOVE */
#endif /* HAVE_MMAP */
}

/* realloc the zonestat array for the one that is not currently in use,
 * to match the desired new size of the array (if applicable) */
void
server_zonestat_realloc(struct nsd* nsd)
{
#ifdef HAVE_MMAP
	uint8_t z = 0;
	size_t sz;
	int idx = 0; /* index of the zonestat array that is not in use */
	if(nsd->zonestatnow == nsd->zonestat[0])
		idx = 1;
	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
		return;
	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[idx], strerror(errno));
		exit(1);
	}
	zonestat_remap(nsd, idx, sz);
	/* zero the newly allocated region */
	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
	}
	nsd->zonestatsize[idx] = nsd->zonestatdesired;
#endif /* HAVE_MMAP */
}

/* switch over to the other array for the new children, which briefly
 * coexist with the old children; this avoids both generations writing
 * to the same statistics array. */
void
server_zonestat_switch(struct nsd* nsd)
{
	if(nsd->zonestatnow == nsd->zonestat[0]) {
		nsd->zonestatnow = nsd->zonestat[1];
		nsd->zonestatsizenow = nsd->zonestatsize[1];
	} else {
		nsd->zonestatnow = nsd->zonestat[0];
		nsd->zonestatsizenow = nsd->zonestatsize[0];
	}
}
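
/*
 * Example (illustrative): during a reload the old children keep writing
 * into the array zonestatnow pointed at before the switch, while the
 * newly forked children use the other mmap'd array, so the two
 * generations never write into the same statistics array.
 */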
#endif /* USE_ZONE_STATS */

static void
cleanup_dname_compression_tables(void *ptr)
{
	free(ptr);
	compressed_dname_offsets = NULL;
	compression_table_capacity = 0;
}

static void
initialize_dname_compression_tables(struct nsd *nsd)
{
	size_t needed = domain_table_count(nsd->db->domains) + 1;
	needed += EXTRA_DOMAIN_NUMBERS;
	if(compression_table_capacity < needed) {
		if(compressed_dname_offsets) {
			region_remove_cleanup(nsd->db->region,
				cleanup_dname_compression_tables,
				compressed_dname_offsets);
			free(compressed_dname_offsets);
		}
		compressed_dname_offsets = (uint16_t *) xmallocarray(
			needed, sizeof(uint16_t));
		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
			compressed_dname_offsets);
		compression_table_capacity = needed;
		compression_table_size=domain_table_count(nsd->db->domains)+1;
	}
	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
}

/* create and bind sockets.  */
static int
server_init_ifs(struct nsd *nsd, size_t from, size_t to, int* reuseport_works)
{
	struct addrinfo* addr;
	size_t i;
#if defined(SO_REUSEPORT) || defined(SO_REUSEADDR) || (defined(INET6) && (defined(IPV6_V6ONLY) || defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU) || defined(IP_TRANSPARENT)) || defined(IP_FREEBIND) || defined(SO_BINDANY))
	int on = 1;
#endif

	/* UDP */

	/* Make a socket... */
	for (i = from; i < to; i++) {
		/* for reuseports copy socket specs of first entries */
		addr = nsd->udp[i%nsd->ifs].addr;
		if (!addr) {
			nsd->udp[i].s = -1;
			continue;
		}
		nsd->udp[i].fam = (int)addr->ai_family;
		if ((nsd->udp[i].s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
#if defined(INET6)
			if (addr->ai_family == AF_INET6 &&
				errno == EAFNOSUPPORT && nsd->grab_ip6_optional) {
				log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: not supported");
				continue;
			}
#endif /* INET6 */
			log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
			return -1;
		}

#ifdef SO_REUSEPORT
#  ifdef SO_REUSEPORT_LB
		/* FreeBSD 12 has SO_REUSEPORT_LB, which load-balances
		 * like SO_REUSEPORT does on Linux.  That is what users
		 * want from the config option in nsd.conf; if they
		 * actually need local address and port reuse as well,
		 * they will have to set SO_REUSEPORT themselves, so
		 * assume it was _LB they wanted.
		 */
		if(nsd->reuseport && *reuseport_works &&
			setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_REUSEPORT_LB,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			if(verbosity >= 3
#ifdef ENOPROTOOPT
				|| errno != ENOPROTOOPT
#endif
				)
			    log_msg(LOG_ERR, "setsockopt(..., SO_REUSEPORT_LB, "
				"...) failed: %s", strerror(errno));
			*reuseport_works = 0;
		}
#  else /* SO_REUSEPORT_LB */
		if(nsd->reuseport && *reuseport_works &&
			setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_REUSEPORT,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			if(verbosity >= 3
#ifdef ENOPROTOOPT
				|| errno != ENOPROTOOPT
#endif
				)
			    log_msg(LOG_ERR, "setsockopt(..., SO_REUSEPORT, "
				"...) failed: %s", strerror(errno));
			*reuseport_works = 0;
		}
#  endif /* SO_REUSEPORT_LB */
#else
		(void)reuseport_works;
#endif /* SO_REUSEPORT */
#if defined(SO_RCVBUF) || defined(SO_SNDBUF)
	if(1) {
	int rcv = 1*1024*1024;
	int snd = 1*1024*1024;

#ifdef SO_RCVBUF
#  ifdef SO_RCVBUFFORCE
	if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv,
		(socklen_t)sizeof(rcv)) < 0) {
		if(errno != EPERM && errno != ENOBUFS) {
			log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, "
				"...) failed: %s", strerror(errno));
			return -1;
		}
#  else
	if(1) {
#  endif /* SO_RCVBUFFORCE */
		if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv,
			 (socklen_t)sizeof(rcv)) < 0) {
			if(errno != ENOBUFS && errno != ENOSYS) {
				log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, "
					"...) failed: %s", strerror(errno));
				return -1;
			}
		}
	}
#endif /* SO_RCVBUF */

#ifdef SO_SNDBUF
#  ifdef SO_SNDBUFFORCE
	if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd,
		(socklen_t)sizeof(snd)) < 0) {
		if(errno != EPERM && errno != ENOBUFS) {
			log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, "
				"...) failed: %s", strerror(errno));
			return -1;
		}
#  else
	if(1) {
#  endif /* SO_SNDBUFFORCE */
		if(setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_SNDBUF, (void*)&snd,
			 (socklen_t)sizeof(snd)) < 0) {
			if(errno != ENOBUFS && errno != ENOSYS) {
				log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, "
					"...) failed: %s", strerror(errno));
				return -1;
			}
		}
	}
#endif /* SO_SNDBUF */

	}
#endif /* defined(SO_RCVBUF) || defined(SO_SNDBUF) */

#if defined(INET6)
		if (addr->ai_family == AF_INET6) {
# if defined(IPV6_V6ONLY)
			if (setsockopt(nsd->udp[i].s,
				       IPPROTO_IPV6, IPV6_V6ONLY,
				       &on, sizeof(on)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
					strerror(errno));
				return -1;
			}
# endif
# if defined(IPV6_USE_MIN_MTU)
			/*
			 * There is no fragmentation of IPv6 datagrams
			 * during forwarding in the network. Therefore
			 * we do not send UDP datagrams larger than
			 * the minimum IPv6 MTU of 1280 octets. The
			 * EDNS0 message length can be larger if the
			 * network stack supports IPV6_USE_MIN_MTU.
			 */
			if (setsockopt(nsd->udp[i].s,
				       IPPROTO_IPV6, IPV6_USE_MIN_MTU,
				       &on, sizeof(on)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IPV6_USE_MIN_MTU, ...) failed: %s",
					strerror(errno));
				return -1;
			}
# elif defined(IPV6_MTU)
			/*
			 * On Linux, PMTUD is disabled by default for datagrams,
			 * so set the MTU to the minimum MTU to get the same effect.
			 */
			on = IPV6_MIN_MTU;
			if (setsockopt(nsd->udp[i].s, IPPROTO_IPV6, IPV6_MTU,
				&on, sizeof(on)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IPV6_MTU, ...) failed: %s",
					strerror(errno));
				return -1;
			}
			on = 1;
# endif
		}
#endif
#if defined(AF_INET)
		if (addr->ai_family == AF_INET) {
#  if defined(IP_MTU_DISCOVER)
			int mtudisc_disabled = 0;
#   if defined(IP_PMTUDISC_OMIT)
			/* Try IP_PMTUDISC_OMIT first */

			/*
			 * Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets
			 * ignore PMTU information and send packets with DF=0.
			 * Fragmentation is allowed if and only if the packet
			 * size exceeds the outgoing interface MTU or the packet
			 * encounters a smaller MTU link in the network.
			 * This mitigates DNS fragmentation attacks by preventing
			 * forged PMTU information.
			 * FreeBSD already has the same semantics without setting
			 * the option.
			 */
			int action_omit = IP_PMTUDISC_OMIT;
			if (!mtudisc_disabled) {
				if(setsockopt(nsd->udp[i].s, IPPROTO_IP,
					IP_MTU_DISCOVER, &action_omit,
					sizeof(action_omit)) < 0)
				{
					log_msg(LOG_ERR, "setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
						strerror(errno));
				} else {
					mtudisc_disabled = 1;
				}
			}
#   endif /* IP_PMTUDISC_OMIT */
#   if defined(IP_PMTUDISC_DONT)
			/*
			 * Use IP_PMTUDISC_DONT
			 * if IP_PMTUDISC_OMIT failed / undefined
			 */
			if (!mtudisc_disabled) {
				int action_dont = IP_PMTUDISC_DONT;
				if (setsockopt(nsd->udp[i].s, IPPROTO_IP,
					IP_MTU_DISCOVER, &action_dont,
					sizeof(action_dont)) < 0)
				{
					log_msg(LOG_ERR, "setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
						strerror(errno));
				} else {
					mtudisc_disabled = 1;
				}
			}
#   endif /* IP_PMTUDISC_DONT */
			/* exit if all methods to disable PMTUD failed */
			if(!mtudisc_disabled) {
				return -1;
			}
#  elif defined(IP_DONTFRAG)
			int off = 0;
			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_DONTFRAG,
				&off, sizeof(off)) < 0)
			{
				log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
					strerror(errno));
				return -1;
			}
#  endif
		}
#endif
		/* set it nonblocking */
		/* otherwise, on OSes with thundering herd problems, the
		   UDP recv could block NSD after select returns readable. */
		if (fcntl(nsd->udp[i].s, F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl udp: %s", strerror(errno));
		}

		/* Bind it... */
		if (nsd->options->ip_freebind) {
#ifdef IP_FREEBIND
			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(...,IP_FREEBIND, ...) failed for udp: %s",
					strerror(errno));
			}
#endif /* IP_FREEBIND */
		}

		if (nsd->options->ip_transparent) {
#ifdef IP_TRANSPARENT
			if (setsockopt(nsd->udp[i].s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(...,IP_TRANSPARENT, ...) failed for udp: %s",
					strerror(errno));
			}
#endif /* IP_TRANSPARENT */
#ifdef SO_BINDANY
			if (setsockopt(nsd->udp[i].s, SOL_SOCKET, SO_BINDANY, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(...,SO_BINDANY, ...) failed for udp: %s",
					strerror(errno));
			}
#endif /* SO_BINDANY */
		}

		if (bind(nsd->udp[i].s, (struct sockaddr *) addr->ai_addr, addr->ai_addrlen) != 0) {
			log_msg(LOG_ERR, "can't bind udp socket: %s", strerror(errno));
			return -1;
		}
	}

	/* TCP */

	/* Make a socket... */
	for (i = from; i < to; i++) {
		/* for reuseports copy socket specs of first entries */
		addr = nsd->tcp[i%nsd->ifs].addr;
		if (!addr) {
			nsd->tcp[i].s = -1;
			continue;
		}
		nsd->tcp[i].fam = (int)addr->ai_family;
		/* turn off REUSEPORT for TCP by copying the socket fd */
		if(i >= nsd->ifs) {
			nsd->tcp[i].s = nsd->tcp[i%nsd->ifs].s;
			continue;
		}
		if ((nsd->tcp[i].s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
#if defined(INET6)
			if (addr->ai_family == AF_INET6 &&
				errno == EAFNOSUPPORT && nsd->grab_ip6_optional) {
				log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: not supported");
				continue;
			}
#endif /* INET6 */
			log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
			return -1;
		}

#ifdef SO_REUSEPORT
		if(nsd->reuseport && *reuseport_works &&
			setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_REUSEPORT,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			if(verbosity >= 3
#ifdef ENOPROTOOPT
				|| errno != ENOPROTOOPT
#endif
				)
			    log_msg(LOG_ERR, "setsockopt(..., SO_REUSEPORT, "
				"...) failed: %s", strerror(errno));
			*reuseport_works = 0;
		}
#endif /* SO_REUSEPORT */
#ifdef	SO_REUSEADDR
		if (setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0) {
			log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s", strerror(errno));
		}
#endif /* SO_REUSEADDR */

#if defined(INET6)
		if (addr->ai_family == AF_INET6) {
# if defined(IPV6_V6ONLY)
			if (setsockopt(nsd->tcp[i].s, IPPROTO_IPV6, IPV6_V6ONLY,
				&on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed: %s", strerror(errno));
				return -1;
			}
# endif
# if defined(IPV6_USE_MIN_MTU)
			/*
			 * Use minimum MTU to minimize delays learning working
			 * PMTU when communicating through a tunnel.
			 */
			if (setsockopt(nsd->tcp[i].s,
				       IPPROTO_IPV6, IPV6_USE_MIN_MTU,
				       &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(..., IPV6_USE_MIN_MTU, ...) failed: %s", strerror(errno));
				return -1;
			}
# elif defined(IPV6_MTU)
			/*
			 * On Linux, PMTUD is disabled by default for datagrams,
			 * so set the MTU to the minimum MTU to get the same effect.
			 */
			on = IPV6_MIN_MTU;
			if (setsockopt(nsd->tcp[i].s, IPPROTO_IPV6, IPV6_MTU,
				&on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(..., IPV6_MTU, ...) failed: %s", strerror(errno));
				return -1;
			}
			on = 1;
# endif
		}
#endif
		/* set maximum segment size to tcp socket */
		if(nsd->tcp_mss > 0) {
#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
			if(setsockopt(nsd->tcp[i].s, IPPROTO_TCP, TCP_MAXSEG,
					(void*)&nsd->tcp_mss,
					sizeof(nsd->tcp_mss)) < 0) {
				log_msg(LOG_ERR,
					"setsockopt(...,TCP_MAXSEG,...)"
					" failed for tcp: %s", strerror(errno));
			}
#else
			log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
#endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
		}

		/* set it nonblocking */
		/* (Stevens UNP p. 463): if the tcp listening socket is blocking,
		   then it may block in accept, even if select() says readable. */
		if (fcntl(nsd->tcp[i].s, F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl tcp: %s", strerror(errno));
		}

		/* Bind it... */
		if (nsd->options->ip_freebind) {
#ifdef IP_FREEBIND
			if (setsockopt(nsd->tcp[i].s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(...,IP_FREEBIND, ...) failed for tcp: %s",
					strerror(errno));
			}
#endif /* IP_FREEBIND */
		}

		if (nsd->options->ip_transparent) {
#ifdef IP_TRANSPARENT
			if (setsockopt(nsd->tcp[i].s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(...,IP_TRANSPARENT, ...) failed for tcp: %s",
					strerror(errno));
			}
#endif /* IP_TRANSPARENT */
#ifdef SO_BINDANY
			if (setsockopt(nsd->tcp[i].s, SOL_SOCKET, SO_BINDANY, &on, sizeof(on)) < 0) {
				log_msg(LOG_ERR, "setsockopt(...,SO_BINDANY, ...) failed for tcp: %s",
					strerror(errno));
			}
#endif /* SO_BINDANY */
		}

		if (bind(nsd->tcp[i].s, (struct sockaddr *) addr->ai_addr, addr->ai_addrlen) != 0) {
			log_msg(LOG_ERR, "can't bind tcp socket: %s", strerror(errno));
			return -1;
		}

		/* Listen to it... */
		if (listen(nsd->tcp[i].s, TCP_BACKLOG) == -1) {
			log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
			return -1;
		}
	}

	return 0;
}

/*
 * Initialize the server, reuseport, create and bind the sockets.
 */
int
server_init(struct nsd *nsd)
{
	int reuseport_successful = 1; /* see if reuseport works in OS */
	if(nsd->reuseport) {
		/* increase the size of the udp and tcp interface arrays,
		 * there are going to be separate interface file descriptors
		 * for every server instance */
		nsd->udp = xrealloc(nsd->udp, (nsd->ifs*nsd->reuseport)*
			sizeof(*nsd->udp));
		nsd->tcp = xrealloc(nsd->tcp, (nsd->ifs*nsd->reuseport)*
			sizeof(*nsd->tcp));
		memset(&nsd->udp[nsd->ifs], 0, sizeof(*nsd->udp)*
			(nsd->ifs*(nsd->reuseport-1)));
		memset(&nsd->tcp[nsd->ifs], 0, sizeof(*nsd->tcp)*
			(nsd->ifs*(nsd->reuseport-1)));
	}

	/* open the server interface ports */
	if(server_init_ifs(nsd, 0, nsd->ifs, &reuseport_successful) == -1)
		return -1;

	/* continue to open the remaining reuseport ports */
	if(nsd->reuseport && reuseport_successful) {
		if(server_init_ifs(nsd, nsd->ifs, nsd->ifs*nsd->reuseport,
			&reuseport_successful) == -1)
			return -1;
		nsd->ifs *= nsd->reuseport;
	} else {
		nsd->reuseport = 0;
	}
	return 0;
}
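
/*
 * Worked example (illustrative): with 2 configured interfaces and
 * reuse-port set to 4 in nsd.conf, server_init grows nsd->udp and
 * nsd->tcp to 8 entries; server_init_ifs copies the address specs of
 * entries 0..1 into entries 2..7 (addr comes from udp[i % nsd->ifs]),
 * and nsd->ifs ends up as 8.
 */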

/*
 * Prepare the server for take off.
 *
 */
int
server_prepare(struct nsd *nsd)
{
#ifdef RATELIMIT
	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
#ifdef HAVE_ARC4RANDOM
	hash_set_raninit(arc4random());
#else
	uint32_t v = getpid() ^ time(NULL);
	srandom((unsigned long)v);
	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
		hash_set_raninit(v);
	else	hash_set_raninit(random());
#endif
	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
		nsd->options->rrl_ratelimit,
		nsd->options->rrl_whitelist_ratelimit,
		nsd->options->rrl_slip,
		nsd->options->rrl_ipv4_prefix_length,
		nsd->options->rrl_ipv6_prefix_length);
#endif /* RATELIMIT */

	/* Open the database... */
	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
		log_msg(LOG_ERR, "unable to open the database %s: %s",
			nsd->dbfile, strerror(errno));
		unlink(nsd->task[0]->fname);
		unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		return -1;
	}
	/* check if zone files have been modified */
	/* NULL for taskudb because we send soainfo in a moment, batched up,
	 * for all zones */
	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
		nsd->options->database[0] == 0))
		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
	zonestatid_tree_set(nsd);

	compression_table_capacity = 0;
	initialize_dname_compression_tables(nsd);

#ifdef	BIND8_STATS
	/* Initialize times... */
	time(&nsd->st.boot);
	set_bind8_alarm(nsd);
#endif /* BIND8_STATS */

	return 0;
}

/*
 * Fork the required number of servers.
 */
static int
server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	size_t i;

	/* Start all child servers initially.  */
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].pid = 0;
	}

	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
}

void
server_close_all_sockets(struct nsd_socket sockets[], size_t n)
{
	size_t i;

	/* Close all the sockets... */
	for (i = 0; i < n; ++i) {
		if (sockets[i].s != -1) {
			close(sockets[i].s);
			if(sockets[i].addr)
				freeaddrinfo(sockets[i].addr);
			sockets[i].s = -1;
		}
	}
}

/*
 * Close the sockets, shutdown the server and exit.
 * Does not return.
 *
 */
void
server_shutdown(struct nsd *nsd)
{
	size_t i;

	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	/* CHILD: close command channel to parent */
	if(nsd->this_child && nsd->this_child->parent_fd != -1)
	{
		close(nsd->this_child->parent_fd);
		nsd->this_child->parent_fd = -1;
	}
	/* SERVER: close command channels to children */
	if(!nsd->this_child)
	{
		for(i=0; i < nsd->child_count; ++i)
			if(nsd->children[i].child_fd != -1)
			{
				close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
			}
	}

	tsig_finalize();
#ifdef HAVE_SSL
	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
#endif

#ifdef MEMCLEAN /* OS collects memory pages */
#ifdef RATELIMIT
	rrl_mmap_deinit_keep_mmap();
#endif
#ifdef USE_DNSTAP
	dt_collector_destroy(nsd->dt_collector, nsd);
#endif
	udb_base_free_keep_mmap(nsd->task[0]);
	udb_base_free_keep_mmap(nsd->task[1]);
	namedb_close_udb(nsd->db); /* keeps mmap */
	namedb_close(nsd->db);
	nsd_options_destroy(nsd->options);
	region_destroy(nsd->region);
#endif
	log_finalize();
	exit(0);
}

void
server_prepare_xfrd(struct nsd* nsd)
{
	char tmpfile[256];
	/* create task mmaps */
	nsd->mytask = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[0] = task_file_create(tmpfile);
	if(!nsd->task[0]) {
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[1] = task_file_create(tmpfile);
	if(!nsd->task[1]) {
		unlink(nsd->task[0]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
	/* create xfrd listener structure */
	nsd->xfrd_listener = region_alloc(nsd->region,
		sizeof(netio_handler_type));
	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
	nsd->xfrd_listener->fd = -1;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
		nsd;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
		xfrd_tcp_create(nsd->region, QIOBUFSZ);
}


void
server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
{
	pid_t pid;
	int sockets[2] = {0,0};
	struct ipc_handler_conn_data *data;

	if(nsd->xfrd_listener->fd != -1)
		close(nsd->xfrd_listener->fd);
	if(del_db) {
		/* recreate taskdb that xfrd was using, it may be corrupt */
		/* we (or reload) use nsd->mytask, and xfrd uses the other */
		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
		nsd->task[1-nsd->mytask]->fname = NULL;
		/* free alloc already, so udb does not shrink itself */
		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
		nsd->task[1-nsd->mytask]->alloc = NULL;
		udb_base_free(nsd->task[1-nsd->mytask]);
		/* create new file, overwrite the old one */
		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
		free(tmpfile);
	}
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
		return;
	}
	pid = fork();
	switch (pid) {
	case -1:
		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
		break;
	default:
		/* PARENT: close first socket, use second one */
		close(sockets[0]);
		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		if(del_db) xfrd_free_namedb(nsd);
		/* use other task than I am using, since if xfrd died and is
		 * restarted, the reload is using nsd->mytask */
		nsd->mytask = 1 - nsd->mytask;
		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
		/* NOTREACHED */
		break;
	case 0:
		/* CHILD: close second socket, use first one */
		close(sockets[1]);
		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		nsd->xfrd_listener->fd = sockets[0];
		break;
	}
	/* server-parent only */
	nsd->xfrd_listener->timeout = NULL;
	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
	/* clear ongoing ipc reads */
	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
	data->conn->is_reading = 0;
}

/** add all soainfo to taskdb */
static void
add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
{
	struct radnode* n;
	udb_ptr task_last; /* last task, mytask is empty so NULL */
	/* add all SOA INFO to mytask */
	udb_ptr_init(&task_last, taskudb);
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
	}
	udb_ptr_unlink(&task_last, taskudb);
}

void
server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
{
	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
	 *   parent fills one taskdb with soas, xfrd fills other with expires.
	 *   then they exchange and process.
	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
	 *   may be in use by reload.  Fill SOA in taskdb and give to xfrd.
	 *   expire notifications can be sent back via a normal reload later
	 *   (xfrd will wait for current running reload to finish if any).
	 */
	sig_atomic_t cmd = 0;
	pid_t mypid;
	int xfrd_sock = nsd->xfrd_listener->fd;
	struct udb_base* taskudb = nsd->task[nsd->mytask];
	udb_ptr t;
	if(!shortsoa) {
		if(nsd->signal_hint_shutdown) {
		shutdown:
			log_msg(LOG_WARNING, "signal received, shutting down...");
			server_close_all_sockets(nsd->udp, nsd->ifs);
			server_close_all_sockets(nsd->tcp, nsd->ifs);
#ifdef HAVE_SSL
			daemon_remote_close(nsd->rc);
#endif
			/* Unlink it if possible... */
			unlinkpid(nsd->pidfile);
			unlink(nsd->task[0]->fname);
			unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
			unlink(nsd->zonestatfname[0]);
			unlink(nsd->zonestatfname[1]);
#endif
			/* write the nsd.db to disk, wait for it to complete */
			udb_base_sync(nsd->db->udb, 1);
			udb_base_close(nsd->db->udb);
			server_shutdown(nsd);
			exit(0);
		}
	}
	if(shortsoa) {
		/* put SOA in xfrd task because mytask may be in use */
		taskudb = nsd->task[1-nsd->mytask];
	}

	add_all_soa_to_task(nsd, taskudb);
	if(!shortsoa) {
		/* wait for xfrd to signal task is ready, RELOAD signal */
		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
			cmd != NSD_RELOAD) {
			log_msg(LOG_ERR, "did not get start signal from xfrd");
			exit(1);
		}
		if(nsd->signal_hint_shutdown) {
			goto shutdown;
		}
	}
	/* give xfrd our task, signal it with RELOAD_DONE */
	task_process_sync(taskudb);
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
			(int)nsd->pid, strerror(errno));
	}
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	if(!shortsoa) {
		/* process the xfrd task list (expiry data) */
		nsd->mytask = 1 - nsd->mytask;
		taskudb = nsd->task[nsd->mytask];
		task_remap(taskudb);
		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
		while(!udb_ptr_is_null(&t)) {
			task_process_expire(nsd->db, TASKLIST(&t));
			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
		}
		udb_ptr_unlink(&t, taskudb);
		task_clear(taskudb);

		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
		cmd = NSD_RELOAD_DONE;
		if(!write_socket(xfrd_sock, &cmd,  sizeof(cmd))) {
			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
				(int)nsd->pid, strerror(errno));
		}
	}
}

/* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */
ssize_t
block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout)
{
	uint8_t* buf = (uint8_t*) p;
	ssize_t total = 0;
	struct pollfd fd;
	memset(&fd, 0, sizeof(fd));
	fd.fd = s;
	fd.events = POLLIN;

	while( total < sz) {
		ssize_t ret;
		ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000);
		if(ret == -1) {
			if(errno == EAGAIN)
				/* blocking read */
				continue;
			if(errno == EINTR) {
				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
					return -1;
				/* other signals can be handled later */
				continue;
			}
			/* some error */
			return -1;
		}
		if(ret == 0) {
			/* operation timed out */
			return -2;
		}
		ret = read(s, buf+total, sz-total);
		if(ret == -1) {
			if(errno == EAGAIN)
				/* blocking read */
				continue;
			if(errno == EINTR) {
				if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown))
					return -1;
				/* other signals can be handled later */
				continue;
			}
			/* some error */
			return -1;
		}
		if(ret == 0) {
			/* closed connection! */
			return 0;
		}
		total += ret;
	}
	return total;
}
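
/*
 * Usage sketch (compiled out; the wrapper below is hypothetical, not
 * part of nsd): read one IPC command with a 5 second timeout and map
 * the four possible results of block_read.
 */
#if 0
static int
example_read_cmd(struct nsd* nsd, int fd, sig_atomic_t* cmd)
{
	ssize_t r = block_read(nsd, fd, cmd, sizeof(*cmd), 5);
	if(r == (ssize_t)sizeof(*cmd))
		return 1;	/* full command received */
	if(r == 0)
		return 0;	/* peer closed the connection */
	if(r == -2)
		return 0;	/* timed out */
	return 0;		/* error, or interrupted by quit/shutdown */
}
#endif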

static void
reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket)
{
	sig_atomic_t cmd = NSD_QUIT_SYNC;
	udb_ptr t, next;
	udb_base* u = nsd->task[nsd->mytask];
	udb_ptr_init(&next, u);
	udb_ptr_new(&t, u, udb_base_get_userdata(u));
	udb_base_set_userdata(u, 0);
	while(!udb_ptr_is_null(&t)) {
		/* store next in list so this one can be deleted or reused */
		udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next);
		udb_rptr_zero(&TASKLIST(&t)->next, u);

		/* process task t */
		/* append results for task t and update last_task */
		task_process_in_reload(nsd, u, last_task, &t);

		/* go to next */
		udb_ptr_set_ptr(&t, u, &next);

		/* if the parent has quit, we must quit too, poll the fd for cmds */
		if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
			if(cmd == NSD_QUIT) {
				DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
				/* sync to disk (if needed) */
				udb_base_sync(nsd->db->udb, 0);
				/* unlink files of remainder of tasks */
				while(!udb_ptr_is_null(&t)) {
					if(TASKLIST(&t)->task_type == task_apply_xfr) {
						xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno);
					}
					udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next);
				}
				udb_ptr_unlink(&t, u);
				udb_ptr_unlink(&next, u);
				exit(0);
			}
		}

	}
	udb_ptr_unlink(&t, u);
	udb_ptr_unlink(&next, u);
}

#ifdef BIND8_STATS
static void
parent_send_stats(struct nsd* nsd, int cmdfd)
{
	size_t i;
	if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) {
		log_msg(LOG_ERR, "could not write stats to reload");
		return;
	}
	for(i=0; i<nsd->child_count; i++)
		if(!write_socket(cmdfd, &nsd->children[i].query_count,
			sizeof(stc_type))) {
			log_msg(LOG_ERR, "could not write stats to reload");
			return;
		}
}

static void
reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last)
{
	struct nsdst s;
	stc_type* p;
	size_t i;
	if(block_read(nsd, cmdfd, &s, sizeof(s),
		RELOAD_SYNC_TIMEOUT) != sizeof(s)) {
		log_msg(LOG_ERR, "could not read stats from oldpar");
		return;
	}
	s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0);
	s.db_mem = region_get_mem(nsd->db->region);
	p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s,
		nsd->child_count);
	if(!p) return;
	for(i=0; i<nsd->child_count; i++) {
		if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!=
			sizeof(stc_type))
			return;
	}
}
#endif /* BIND8_STATS */

/*
 * Reload the database, stop parent, re-fork children and continue
 * as server_main.
 */
static void
server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
	int cmdsocket)
{
	pid_t mypid;
	sig_atomic_t cmd = NSD_QUIT_SYNC;
	int ret;
	udb_ptr last_task;
	struct sigaction old_sigchld, ign_sigchld;
	/* ignore SIGCHLD from the previous server_main that used this pid */
	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
	ign_sigchld.sa_handler = SIG_IGN;
	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);

	/* see what tasks we got from xfrd */
	task_remap(nsd->task[nsd->mytask]);
	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
	udb_compact_inhibited(nsd->db->udb, 1);
	reload_process_tasks(nsd, &last_task, cmdsocket);
	udb_compact_inhibited(nsd->db->udb, 0);
	udb_compact(nsd->db->udb);

#ifndef NDEBUG
	if(nsd_debug_level >= 1)
		region_log_stats(nsd->db->region);
#endif /* NDEBUG */
	/* sync to disk (if needed) */
	udb_base_sync(nsd->db->udb, 0);

	initialize_dname_compression_tables(nsd);

#ifdef BIND8_STATS
	/* Restart dumping stats if required.  */
	time(&nsd->st.boot);
	set_bind8_alarm(nsd);
#endif
#ifdef USE_ZONE_STATS
	server_zonestat_realloc(nsd); /* realloc for new children */
	server_zonestat_switch(nsd);
#endif

	/* listen for the signals of failed children again */
	sigaction(SIGCHLD, &old_sigchld, NULL);
	/* Start new child processes */
	if (server_start_children(nsd, server_region, netio, &nsd->
		xfrd_listener->fd) != 0) {
		send_children_quit(nsd);
		exit(1);
	}

	/* if the parent has quit, we must quit too, poll the fd for cmds */
	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
		if(cmd == NSD_QUIT) {
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
			send_children_quit(nsd);
			exit(0);
		}
	}

	/* Send quit command to parent: blocking, wait for receipt. */
	do {
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
		{
			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
				strerror(errno));
		}
		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
			RELOAD_SYNC_TIMEOUT);
		if(ret == -2) {
			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
		}
	} while (ret == -2);
	if(ret == -1) {
		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
			strerror(errno));
	}
	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
	if(cmd == NSD_QUIT) {
		/* small race condition possible here, parent got quit cmd. */
		send_children_quit(nsd);
		exit(1);
	}
	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
#ifdef BIND8_STATS
	reload_do_stats(cmdsocket, nsd, &last_task);
#endif
	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
	task_process_sync(nsd->task[nsd->mytask]);
#ifdef USE_ZONE_STATS
	server_zonestat_realloc(nsd); /* realloc for next children */
#endif

	/* send soainfo to the xfrd process, signal it that reload is done,
	 * it picks up the taskudb */
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
			strerror(errno));
	}
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	/* try to reopen file */
	if (nsd->file_rotation_ok)
		log_reopen(nsd->log_filename, 1);
	/* exit reload, continue as new server_main */
}

/*
 * Get the mode depending on the signal hints that have been received.
 * Multiple signal hints can be received and will be handled in turn.
 */
static sig_atomic_t
server_signal_mode(struct nsd *nsd)
{
	if(nsd->signal_hint_quit) {
		nsd->signal_hint_quit = 0;
		return NSD_QUIT;
	}
	else if(nsd->signal_hint_shutdown) {
		nsd->signal_hint_shutdown = 0;
		return NSD_SHUTDOWN;
	}
	else if(nsd->signal_hint_child) {
		nsd->signal_hint_child = 0;
		return NSD_REAP_CHILDREN;
	}
	else if(nsd->signal_hint_reload) {
		nsd->signal_hint_reload = 0;
		return NSD_RELOAD;
	}
	else if(nsd->signal_hint_reload_hup) {
		nsd->signal_hint_reload_hup = 0;
		return NSD_RELOAD_REQ;
	}
	else if(nsd->signal_hint_stats) {
		nsd->signal_hint_stats = 0;
#ifdef BIND8_STATS
		set_bind8_alarm(nsd);
#endif
		return NSD_STATS;
	}
	else if(nsd->signal_hint_statsusr) {
		nsd->signal_hint_statsusr = 0;
		return NSD_STATS;
	}
	return NSD_RUN;
}
1684 
1685 /*
1686  * The main server simply waits for signals and child processes to
1687  * terminate.  Child processes are restarted as necessary.
1688  */
1689 void
1690 server_main(struct nsd *nsd)
1691 {
1692 	region_type *server_region = region_create(xalloc, free);
1693 	netio_type *netio = netio_create(server_region);
1694 	netio_handler_type reload_listener;
1695 	int reload_sockets[2] = {-1, -1};
1696 	struct timespec timeout_spec;
1697 	int status;
1698 	pid_t child_pid;
1699 	pid_t reload_pid = -1;
1700 	sig_atomic_t mode;
1701 
1702 	/* Ensure we are the main process */
1703 	assert(nsd->server_kind == NSD_SERVER_MAIN);
1704 
1705 	/* Add listener for the XFRD process */
1706 	netio_add_handler(netio, nsd->xfrd_listener);
1707 
1708 	/* Start the child processes that handle incoming queries */
1709 	if (server_start_children(nsd, server_region, netio,
1710 		&nsd->xfrd_listener->fd) != 0) {
1711 		send_children_quit(nsd);
1712 		exit(1);
1713 	}
1714 	reload_listener.fd = -1;
1715 
1716 	/* this_child MUST be 0, because this is the parent process */
1717 	assert(nsd->this_child == 0);
1718 
1719 	/* Run the server until we get a shutdown signal */
1720 	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
1721 		/* Did we receive a signal that changes our mode? */
1722 		if(mode == NSD_RUN) {
1723 			nsd->mode = mode = server_signal_mode(nsd);
1724 		}
1725 
1726 		switch (mode) {
1727 		case NSD_RUN:
1728 			/* see if any child processes terminated */
1729 			while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) {
1730 				int is_child = delete_child_pid(nsd, child_pid);
1731 				if (is_child != -1 && nsd->children[is_child].need_to_exit) {
1732 					if(nsd->children[is_child].child_fd == -1)
1733 						nsd->children[is_child].has_exited = 1;
1734 					parent_check_all_children_exited(nsd);
1735 				} else if(is_child != -1) {
1736 					log_msg(LOG_WARNING,
1737 					       "server %d died unexpectedly with status %d, restarting",
1738 					       (int) child_pid, status);
1739 					restart_child_servers(nsd, server_region, netio,
1740 						&nsd->xfrd_listener->fd);
1741 				} else if (child_pid == reload_pid) {
1742 					sig_atomic_t cmd = NSD_RELOAD_DONE;
1743 					pid_t mypid;
1744 					log_msg(LOG_WARNING,
1745 					       "Reload process %d failed with status %d, continuing with old database",
1746 					       (int) child_pid, status);
1747 					reload_pid = -1;
1748 					if(reload_listener.fd != -1) close(reload_listener.fd);
1749 					reload_listener.fd = -1;
1750 					reload_listener.event_types = NETIO_EVENT_NONE;
1751 					task_process_sync(nsd->task[nsd->mytask]);
1752 					/* inform xfrd reload attempt ended */
1753 					if(!write_socket(nsd->xfrd_listener->fd,
1754 						&cmd, sizeof(cmd))) {
1755 						log_msg(LOG_ERR, "problems "
1756 						  "sending SOAEND to xfrd: %s",
1757 						  strerror(errno));
1758 					}
1759 					mypid = getpid();
1760 					if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1761 						log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1762 							strerror(errno));
1763 					}
1764 				} else if(status != 0) {
1765 					/* only log a nonzero status: this
1766 					 * waitpid also reaps the old
1767 					 * server_main (the reload process
1768 					 * is its process-parent) and older
1769 					 * server processes that exit after a reload */
1770 					log_msg(LOG_WARNING,
1771 					       "process %d terminated with status %d",
1772 					       (int) child_pid, status);
1773 				}
1774 			}
1775 			if (child_pid == -1) {
1776 				if (errno == EINTR) {
1777 					continue;
1778 				}
1779 				if (errno != ECHILD)
1780 					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
1781 			}
1782 			if (nsd->mode != NSD_RUN)
1783 				break;
1784 
1785 			/* timeout to collect processes, in case no SIGCHLD arrives. */
1786 			timeout_spec.tv_sec = 60;
1787 			timeout_spec.tv_nsec = 0;
1788 
1789 			/* listen on ports, timeout for collecting terminated children */
1790 			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
1791 				if (errno != EINTR) {
1792 					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
1793 				}
1794 			}
1795 			if(nsd->restart_children) {
1796 				restart_child_servers(nsd, server_region, netio,
1797 					&nsd->xfrd_listener->fd);
1798 				nsd->restart_children = 0;
1799 			}
1800 			if(nsd->reload_failed) {
1801 				sig_atomic_t cmd = NSD_RELOAD_DONE;
1802 				pid_t mypid;
1803 				nsd->reload_failed = 0;
1804 				log_msg(LOG_WARNING,
1805 				       "Reload process %d failed, continuing with old database",
1806 				       (int) reload_pid);
1807 				reload_pid = -1;
1808 				if(reload_listener.fd != -1) close(reload_listener.fd);
1809 				reload_listener.fd = -1;
1810 				reload_listener.event_types = NETIO_EVENT_NONE;
1811 				task_process_sync(nsd->task[nsd->mytask]);
1812 				/* inform xfrd reload attempt ended */
1813 				if(!write_socket(nsd->xfrd_listener->fd,
1814 					&cmd, sizeof(cmd))) {
1815 					log_msg(LOG_ERR, "problems "
1816 					  "sending SOAEND to xfrd: %s",
1817 					  strerror(errno));
1818 				}
1819 				mypid = getpid();
1820 				if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
1821 					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
1822 						strerror(errno));
1823 				}
1824 			}
1825 
1826 			break;
1827 		case NSD_RELOAD_REQ: {
1828 			sig_atomic_t cmd = NSD_RELOAD_REQ;
1829 			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
1830 			DEBUG(DEBUG_IPC,1, (LOG_INFO,
1831 				"main: ipc send reload_req to xfrd"));
1832 			if(!write_socket(nsd->xfrd_listener->fd,
1833 				&cmd, sizeof(cmd))) {
1834 				log_msg(LOG_ERR, "server_main: could not send "
1835 				"reload_req to xfrd: %s", strerror(errno));
1836 			}
1837 			nsd->mode = NSD_RUN;
1838 			} break;
1839 		case NSD_RELOAD:
1840 			/* Continue to run nsd after reload */
1841 			nsd->mode = NSD_RUN;
1842 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
1843 			if (reload_pid != -1) {
1844 				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
1845 				       (int) reload_pid);
1846 				break;
1847 			}
1848 
1849 			/* switch mytask to keep track of which task udb we own */
1850 			nsd->mytask = 1 - nsd->mytask;
1851 			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
1852 				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
1853 				reload_pid = -1;
1854 				break;
1855 			}
1856 
1857 			/* Do actual reload */
1858 			reload_pid = fork();
1859 			switch (reload_pid) {
1860 			case -1:
1861 				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
1862 				break;
1863 			default:
1864 				/* PARENT */
1865 				close(reload_sockets[0]);
1866 				server_reload(nsd, server_region, netio,
1867 					reload_sockets[1]);
1868 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
1869 				close(reload_sockets[1]);
1870 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
1871 				/* drop stale xfrd ipc data */
1872 				((struct ipc_handler_conn_data*)nsd->
1873 					xfrd_listener->user_data)
1874 					->conn->is_reading = 0;
1875 				reload_pid = -1;
1876 				reload_listener.fd = -1;
1877 				reload_listener.event_types = NETIO_EVENT_NONE;
1878 				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
1879 				break;
1880 			case 0:
1881 				/* CHILD */
1882 				/* server_main keeps running until NSD_QUIT_SYNC
1883 				 * is received from the reload process. */
1884 				close(reload_sockets[1]);
1885 				reload_listener.fd = reload_sockets[0];
1886 				reload_listener.timeout = NULL;
1887 				reload_listener.user_data = nsd;
1888 				reload_listener.event_types = NETIO_EVENT_READ;
1889 				reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */
1890 				netio_add_handler(netio, &reload_listener);
1891 				reload_pid = getppid();
1892 				break;
1893 			}
1894 			break;
1895 		case NSD_QUIT_SYNC:
1896 			/* synchronisation of xfrd, parent and reload */
1897 			if(!nsd->quit_sync_done && reload_listener.fd != -1) {
1898 				sig_atomic_t cmd = NSD_RELOAD;
1899 				/* stop xfrd ipc writes in progress */
1900 				DEBUG(DEBUG_IPC,1, (LOG_INFO,
1901 					"main: ipc send indication reload"));
1902 				if(!write_socket(nsd->xfrd_listener->fd,
1903 					&cmd, sizeof(cmd))) {
1904 					log_msg(LOG_ERR, "server_main: could not send reload "
1905 					"indication to xfrd: %s", strerror(errno));
1906 				}
1907 				/* wait for ACK from xfrd */
1908 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd"));
1909 				nsd->quit_sync_done = 1;
1910 			}
1911 			nsd->mode = NSD_RUN;
1912 			break;
1913 		case NSD_QUIT:
1914 			/* silent shutdown during reload */
1915 			if(reload_listener.fd != -1) {
1916 				/* acknowledge the quit, so the reload process knows we will really quit now */
1917 				sig_atomic_t cmd = NSD_RELOAD;
1918 				DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload"));
1919 				if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
1920 					log_msg(LOG_ERR, "server_main: "
1921 						"could not ack quit: %s", strerror(errno));
1922 				}
1923 #ifdef BIND8_STATS
1924 				parent_send_stats(nsd, reload_listener.fd);
1925 #endif /* BIND8_STATS */
1926 				close(reload_listener.fd);
1927 			}
1928 			DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence"));
1929 			/* only quit children after xfrd has acked */
1930 			send_children_quit(nsd);
1931 
1932 #ifdef MEMCLEAN /* OS collects memory pages */
1933 			region_destroy(server_region);
1934 #endif
1935 			server_shutdown(nsd);
1936 
1937 			/* NOTREACHED */
1938 			break;
1939 		case NSD_SHUTDOWN:
1940 			break;
1941 		case NSD_REAP_CHILDREN:
1942 			/* continue; wait for child in run loop */
1943 			nsd->mode = NSD_RUN;
1944 			break;
1945 		case NSD_STATS:
1946 #ifdef BIND8_STATS
1947 			set_children_stats(nsd);
1948 #endif
1949 			nsd->mode = NSD_RUN;
1950 			break;
1951 		default:
1952 			log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode);
1953 			nsd->mode = NSD_RUN;
1954 			break;
1955 		}
1956 	}
1957 	log_msg(LOG_WARNING, "signal received, shutting down...");
1958 
1959 	/* close opened ports to avoid race with restart of nsd */
1960 	server_close_all_sockets(nsd->udp, nsd->ifs);
1961 	server_close_all_sockets(nsd->tcp, nsd->ifs);
1962 #ifdef HAVE_SSL
1963 	daemon_remote_close(nsd->rc);
1964 #endif
1965 	send_children_quit_and_wait(nsd);
1966 
1967 	/* Unlink the pidfile and the task files if possible... */
1968 	unlinkpid(nsd->pidfile);
1969 	unlink(nsd->task[0]->fname);
1970 	unlink(nsd->task[1]->fname);
1971 #ifdef USE_ZONE_STATS
1972 	unlink(nsd->zonestatfname[0]);
1973 	unlink(nsd->zonestatfname[1]);
1974 #endif
1975 #ifdef USE_DNSTAP
1976 	dt_collector_close(nsd->dt_collector, nsd);
1977 #endif
1978 
1979 	if(reload_listener.fd != -1) {
1980 		sig_atomic_t cmd = NSD_QUIT;
1981 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
1982 			"main: ipc send quit to reload-process"));
1983 		if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) {
1984 			log_msg(LOG_ERR, "server_main: could not send quit to reload: %s",
1985 				strerror(errno));
1986 		}
1987 		fsync(reload_listener.fd);
1988 		close(reload_listener.fd);
1989 		/* wait for reload to finish processing */
1990 		while(1) {
1991 			if(waitpid(reload_pid, NULL, 0) == -1) {
1992 				if(errno == EINTR) continue;
1993 				if(errno == ECHILD) break;
1994 				log_msg(LOG_ERR, "waitpid(reload %d): %s",
1995 					(int)reload_pid, strerror(errno));
1996 			}
1997 			break;
1998 		}
1999 	}
2000 	if(nsd->xfrd_listener->fd != -1) {
2001 		/* complete quit, stop xfrd */
2002 		sig_atomic_t cmd = NSD_QUIT;
2003 		DEBUG(DEBUG_IPC,1, (LOG_INFO,
2004 			"main: ipc send quit to xfrd"));
2005 		if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) {
2006 			log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s",
2007 				strerror(errno));
2008 		}
2009 		fsync(nsd->xfrd_listener->fd);
2010 		close(nsd->xfrd_listener->fd);
2011 		(void)kill(nsd->pid, SIGTERM);
2012 	}
2013 
2014 #ifdef MEMCLEAN /* OS collects memory pages */
2015 	region_destroy(server_region);
2016 #endif
2017 	/* write the nsd.db to disk, wait for it to complete */
2018 	udb_base_sync(nsd->db->udb, 1);
2019 	udb_base_close(nsd->db->udb);
2020 	server_shutdown(nsd);
2021 }
2022 
2023 static query_state_type
2024 server_process_query(struct nsd *nsd, struct query *query)
2025 {
2026 	return query_process(query, nsd);
2027 }
2028 
2029 static query_state_type
2030 server_process_query_udp(struct nsd *nsd, struct query *query)
2031 {
2032 #ifdef RATELIMIT
2033 	if(query_process(query, nsd) != QUERY_DISCARDED) {
2034 		if(rrl_process_query(query))
2035 			return rrl_slip(query);
2036 		else	return QUERY_PROCESSED;
2037 	}
2038 	return QUERY_DISCARDED;
2039 #else
2040 	return query_process(query, nsd);
2041 #endif
2042 }
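
/*
 * With RATELIMIT, rrl_process_query() above decides whether the query
 * exceeds its rate budget, and rrl_slip() then either discards it or
 * "slips" a reply through so that legitimate clients behind a spoofed
 * source can still retry over TCP.  A minimal sketch of such a slip
 * decision (illustrative only; slip_rate is an assumed parameter, this
 * is not nsd's actual rrl code):
 */
#if 0
static query_state_type
sketch_slip(int slip_rate)
{
	static unsigned counter;
	/* answer roughly 1 in every slip_rate rate-limited queries */
	if (slip_rate > 0 && ++counter % slip_rate == 0)
		return QUERY_PROCESSED;	/* send a (possibly truncated) reply */
	return QUERY_DISCARDED;		/* drop silently */
}
#endif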
2043 
2044 struct event_base*
2045 nsd_child_event_base(void)
2046 {
2047 	struct event_base* base;
2048 #ifdef USE_MINI_EVENT
2049 	static time_t secs;
2050 	static struct timeval now;
2051 	base = event_init(&secs, &now);
2052 #else
2053 #  if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP)
2054 	/* libev */
2055 	base = (struct event_base *)ev_default_loop(EVFLAG_AUTO);
2056 #  else
2057 	/* libevent */
2058 #    ifdef HAVE_EVENT_BASE_NEW
2059 	base = event_base_new();
2060 #    else
2061 	base = event_init();
2062 #    endif
2063 #  endif
2064 #endif
2065 	return base;
2066 }
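
/*
 * A sketch of how a caller typically drives the returned base (the
 * generic libevent pattern that server_child below follows; some_fd
 * and some_callback are placeholders, not real nsd names):
 */
#if 0
static void
sketch_use_event_base(int some_fd, void (*some_callback)(int, short, void*))
{
	struct event_base* base = nsd_child_event_base();
	struct event ev;
	if(!base)
		exit(1);
	event_set(&ev, some_fd, EV_PERSIST|EV_READ, some_callback, NULL);
	if(event_base_set(base, &ev) != 0)	/* attach event to this base */
		log_msg(LOG_ERR, "event_base_set failed");
	if(event_add(&ev, NULL) != 0)		/* NULL timeout: wait forever */
		log_msg(LOG_ERR, "event_add failed");
	while(event_base_loop(base, EVLOOP_ONCE) == 0)
		;	/* dispatch one round of events per iteration */
}
#endif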
2067 
2068 /*
2069  * Serve DNS requests.
2070  */
2071 void
2072 server_child(struct nsd *nsd)
2073 {
2074 	size_t i, from, numifs;
2075 	region_type *server_region = region_create(xalloc, free);
2076 	struct event_base* event_base = nsd_child_event_base();
2077 	query_type *udp_query;
2078 	sig_atomic_t mode;
2079 
2080 	if(!event_base) {
2081 		log_msg(LOG_ERR, "nsd server could not create event base");
2082 		exit(1);
2083 	}
2084 	nsd->event_base = event_base;
2085 	nsd->server_region = server_region;
2086 
2087 #ifdef RATELIMIT
2088 	rrl_init(nsd->this_child->child_num);
2089 #endif
2090 
2091 	assert(nsd->server_kind != NSD_SERVER_MAIN);
2092 	DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started"));
2093 
2094 	if (!(nsd->server_kind & NSD_SERVER_TCP)) {
2095 		server_close_all_sockets(nsd->tcp, nsd->ifs);
2096 	}
2097 	if (!(nsd->server_kind & NSD_SERVER_UDP)) {
2098 		server_close_all_sockets(nsd->udp, nsd->ifs);
2099 	}
2100 
2101 	if (nsd->this_child->parent_fd != -1) {
2102 		struct event *handler;
2103 		struct ipc_handler_conn_data* user_data =
2104 			(struct ipc_handler_conn_data*)region_alloc(
2105 			server_region, sizeof(struct ipc_handler_conn_data));
2106 		user_data->nsd = nsd;
2107 		user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ);
2108 
2109 		handler = (struct event*) region_alloc(
2110 			server_region, sizeof(*handler));
2111 		event_set(handler, nsd->this_child->parent_fd, EV_PERSIST|
2112 			EV_READ, child_handle_parent_command, user_data);
2113 		if(event_base_set(event_base, handler) != 0)
2114 			log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed");
2115 		if(event_add(handler, NULL) != 0)
2116 			log_msg(LOG_ERR, "nsd ipcchild: event_add failed");
2117 	}
2118 
2119 	if(nsd->reuseport) {
2120 		numifs = nsd->ifs / nsd->reuseport;
2121 		from = numifs * nsd->this_child->child_num;
2122 		if(from+numifs > nsd->ifs) { /* should not happen */
2123 			from = 0;
2124 			numifs = nsd->ifs;
2125 		}
2126 	} else {
2127 		from = 0;
2128 		numifs = nsd->ifs;
2129 	}
2130 
2131 	if (nsd->server_kind & NSD_SERVER_UDP) {
2132 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2133 		udp_query = query_create(server_region,
2134 			compressed_dname_offsets, compression_table_size,
2135 			compressed_dnames);
2136 #else
2137 		udp_query = NULL;
2138 		memset(msgs, 0, sizeof(msgs));
2139 		for (i = 0; i < NUM_RECV_PER_SELECT; i++) {
2140 			queries[i] = query_create(server_region,
2141 				compressed_dname_offsets,
2142 				compression_table_size, compressed_dnames);
2143 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2144 			iovecs[i].iov_base          = buffer_begin(queries[i]->packet);
2145 			iovecs[i].iov_len           = buffer_remaining(queries[i]->packet);
2146 			msgs[i].msg_hdr.msg_iov     = &iovecs[i];
2147 			msgs[i].msg_hdr.msg_iovlen  = 1;
2148 			msgs[i].msg_hdr.msg_name    = &queries[i]->addr;
2149 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2150 		}
2151 #endif
2152 		for (i = from; i < from+numifs; ++i) {
2153 			struct udp_handler_data *data;
2154 			struct event *handler;
2155 
2156 			data = (struct udp_handler_data *) region_alloc(
2157 				server_region,
2158 				sizeof(struct udp_handler_data));
2159 			data->query = udp_query;
2160 			data->nsd = nsd;
2161 			data->socket = &nsd->udp[i];
2162 
2163 			handler = (struct event*) region_alloc(
2164 				server_region, sizeof(*handler));
2165 			event_set(handler, nsd->udp[i].s, EV_PERSIST|EV_READ,
2166 				handle_udp, data);
2167 			if(event_base_set(event_base, handler) != 0)
2168 				log_msg(LOG_ERR, "nsd udp: event_base_set failed");
2169 			if(event_add(handler, NULL) != 0)
2170 				log_msg(LOG_ERR, "nsd udp: event_add failed");
2171 		}
2172 	}
2173 
2174 	/*
2175 	 * Keep track of all the TCP accept handlers so we can enable
2176 	 * and disable them based on the current number of active TCP
2177 	 * connections.
2178 	 */
2179 	tcp_accept_handler_count = numifs;
2180 	tcp_accept_handlers = (struct tcp_accept_handler_data*)
2181 		region_alloc_array(server_region,
2182 		numifs, sizeof(*tcp_accept_handlers));
2183 	if (nsd->server_kind & NSD_SERVER_TCP) {
2184 		for (i = from; i < from+numifs; ++i) {
2185 			struct event *handler = &tcp_accept_handlers[i-from].event;
2186 			struct tcp_accept_handler_data* data =
2187 				&tcp_accept_handlers[i-from];
2188 			data->nsd = nsd;
2189 			data->socket = &nsd->tcp[i];
2190 			event_set(handler, nsd->tcp[i].s, EV_PERSIST|EV_READ,
2191 				handle_tcp_accept, data);
2192 			if(event_base_set(event_base, handler) != 0)
2193 				log_msg(LOG_ERR, "nsd tcp: event_base_set failed");
2194 			if(event_add(handler, NULL) != 0)
2195 				log_msg(LOG_ERR, "nsd tcp: event_add failed");
2196 			data->event_added = 1;
2197 		}
2198 	} else tcp_accept_handler_count = 0;
2199 
2200 	/* The main loop... */
2201 	while ((mode = nsd->mode) != NSD_QUIT) {
2202 		if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd);
2203 
2204 		/* Do we need to do the statistics... */
2205 		if (mode == NSD_STATS) {
2206 #ifdef BIND8_STATS
2207 			int p = nsd->st.period;
2208 			nsd->st.period = 1; /* force stats printout */
2209 			/* Dump the statistics */
2210 			bind8_stats(nsd);
2211 			nsd->st.period = p;
2212 #else /* !BIND8_STATS */
2213 			log_msg(LOG_NOTICE, "Statistics support not enabled at compile time.");
2214 #endif /* BIND8_STATS */
2215 
2216 			nsd->mode = NSD_RUN;
2217 		}
2218 		else if (mode == NSD_REAP_CHILDREN) {
2219 			/* got signal, notify parent. parent reaps terminated children. */
2220 			if (nsd->this_child->parent_fd != -1) {
2221 				sig_atomic_t parent_notify = NSD_REAP_CHILDREN;
2222 				if (write(nsd->this_child->parent_fd,
2223 				    &parent_notify,
2224 				    sizeof(parent_notify)) == -1)
2225 				{
2226 					log_msg(LOG_ERR, "problems sending command from %d to parent: %s",
2227 						(int) nsd->this_child->pid, strerror(errno));
2228 				}
2229 			} else /* no parent, so reap 'em */
2230 				while (waitpid(-1, NULL, WNOHANG) > 0) ;
2231 			nsd->mode = NSD_RUN;
2232 		}
2233 		else if(mode == NSD_RUN) {
2234 			/* Wait for a query... */
2235 			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
2236 				if (errno != EINTR) {
2237 					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
2238 					break;
2239 				}
2240 			}
2241 		} else if(mode == NSD_QUIT) {
2242 			/* ignore here, quit */
2243 		} else {
2244 			log_msg(LOG_ERR, "mode bad value %d, back to service.",
2245 				(int)mode);
2246 			nsd->mode = NSD_RUN;
2247 		}
2248 	}
2249 
2250 #ifdef	BIND8_STATS
2251 	bind8_stats(nsd);
2252 #endif /* BIND8_STATS */
2253 
2254 #ifdef MEMCLEAN /* OS collects memory pages */
2255 #ifdef RATELIMIT
2256 	rrl_deinit(nsd->this_child->child_num);
2257 #endif
2258 	event_base_free(event_base);
2259 	region_destroy(server_region);
2260 #endif
2261 	server_shutdown(nsd);
2262 }
2263 
2264 #if defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG)
2265 static void
2266 handle_udp(int fd, short event, void* arg)
2267 {
2268 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
2269 	int received, sent, recvcount, i;
2270 	struct query *q;
2271 
2272 	if (!(event & EV_READ)) {
2273 		return;
2274 	}
2275 	recvcount = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
2276 	/* this printf strangely gave a performance increase on Linux */
2277 	/* printf("recvcount %d \n", recvcount); */
2278 	if (recvcount == -1) {
2279 		if (errno != EAGAIN && errno != EINTR) {
2280 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
2281 			STATUP(data->nsd, rxerr);
2282 			/* No zone statup */
2283 		}
2284 		/* Simply no data available */
2285 		return;
2286 	}
2287 	for (i = 0; i < recvcount; i++) {
2288 	loopstart:
2289 		received = msgs[i].msg_len;
2290 		queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen;
2291 		q = queries[i];
2292 		if (received == -1) {
2293 			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
2294 				msgs[i].msg_hdr.msg_flags));
2295 			STATUP(data->nsd, rxerr);
2296 			/* No zone statup */
2297 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2298 			iovecs[i].iov_len = buffer_remaining(q->packet);
2299 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2300 			goto swap_drop;
2301 		}
2302 
2303 		/* Account... */
2304 #ifdef BIND8_STATS
2305 		if (data->socket->fam == AF_INET) {
2306 			STATUP(data->nsd, qudp);
2307 		} else if (data->socket->fam == AF_INET6) {
2308 			STATUP(data->nsd, qudp6);
2309 		}
2310 #endif
2311 
2312 		buffer_skip(q->packet, received);
2313 		buffer_flip(q->packet);
2314 #ifdef USE_DNSTAP
2315 		dt_collector_submit_auth_query(data->nsd, &q->addr, q->addrlen,
2316 			q->tcp, q->packet);
2317 #endif /* USE_DNSTAP */
2318 
2319 		/* Process and answer the query... */
2320 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
2321 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
2322 				STATUP(data->nsd, nona);
2323 				ZTATUP(data->nsd, q->zone, nona);
2324 			}
2325 
2326 #ifdef USE_ZONE_STATS
2327 			if (data->socket->fam == AF_INET) {
2328 				ZTATUP(data->nsd, q->zone, qudp);
2329 			} else if (data->socket->fam == AF_INET6) {
2330 				ZTATUP(data->nsd, q->zone, qudp6);
2331 			}
2332 #endif
2333 
2334 			/* Add EDNS0 and TSIG info if necessary.  */
2335 			query_add_optional(q, data->nsd);
2336 
2337 			buffer_flip(q->packet);
2338 			iovecs[i].iov_len = buffer_remaining(q->packet);
2339 #ifdef BIND8_STATS
2340 			/* Account the rcode & TC... */
2341 			STATUP2(data->nsd, rcode, RCODE(q->packet));
2342 			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
2343 			if (TC(q->packet)) {
2344 				STATUP(data->nsd, truncated);
2345 				ZTATUP(data->nsd, q->zone, truncated);
2346 			}
2347 #endif /* BIND8_STATS */
2348 #ifdef USE_DNSTAP
2349 			dt_collector_submit_auth_response(data->nsd,
2350 				&q->addr, q->addrlen, q->tcp, q->packet,
2351 				q->zone);
2352 #endif /* USE_DNSTAP */
2353 		} else {
2354 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2355 			iovecs[i].iov_len = buffer_remaining(q->packet);
2356 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2357 		swap_drop:
2358 			STATUP(data->nsd, dropped);
2359 			ZTATUP(data->nsd, q->zone, dropped);
2360 			if(i != recvcount-1) {
2361 				/* swap with last and decrease recvcount */
2362 				struct mmsghdr mtmp = msgs[i];
2363 				struct iovec iotmp = iovecs[i];
2364 				recvcount--;
2365 				msgs[i] = msgs[recvcount];
2366 				iovecs[i] = iovecs[recvcount];
2367 				queries[i] = queries[recvcount];
2368 				msgs[recvcount] = mtmp;
2369 				iovecs[recvcount] = iotmp;
2370 				queries[recvcount] = q;
2371 				msgs[i].msg_hdr.msg_iov = &iovecs[i];
2372 				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
2373 				goto loopstart;
2374 			} else { recvcount--; }
2375 		}
2376 	}
2377 
2378 	/* send until all are sent */
2379 	i = 0;
2380 	while(i<recvcount) {
2381 		sent = sendmmsg(fd, &msgs[i], recvcount-i, 0);
2382 		if(sent == -1) {
2383 			const char* es = strerror(errno);
2384 			char a[48];
2385 			addr2str(&queries[i]->addr, a, sizeof(a));
2386 			log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
2387 #ifdef BIND8_STATS
2388 			data->nsd->st.txerr += recvcount-i;
2389 #endif /* BIND8_STATS */
2390 			break;
2391 		}
2392 		i += sent;
2393 	}
2394 	for(i=0; i<recvcount; i++) {
2395 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2396 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
2397 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2398 	}
2399 }
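
/*
 * The swap_drop logic above compacts the batch in place: a discarded
 * query is swapped with the last live entry and recvcount shrinks, so
 * msgs[0..recvcount-1] stays a contiguous run of answers for sendmmsg.
 * The same technique on a plain array, as a standalone sketch (keep()
 * is a hypothetical predicate, not an nsd function):
 */
#if 0
static int keep(int item);	/* hypothetical: nonzero if item stays */

static int
compact_batch(int *items, int count)
{
	int i = 0;
	while (i < count) {
		if (keep(items[i])) {
			i++;
		} else {
			/* swap the rejected item to the tail and shrink;
			 * do not advance i, the swapped-in item still
			 * needs to be examined */
			int tmp = items[i];
			items[i] = items[count-1];
			items[count-1] = tmp;
			count--;
		}
	}
	return count;	/* items[0..count-1] are the kept entries */
}
#endif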
2400 
2401 #else /* defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG) */
2402 
2403 static void
2404 handle_udp(int fd, short event, void* arg)
2405 {
2406 	struct udp_handler_data *data = (struct udp_handler_data *) arg;
2407 	int received, sent;
2408 #ifndef NONBLOCKING_IS_BROKEN
2409 #ifdef HAVE_RECVMMSG
2410 	int recvcount;
2411 #endif /* HAVE_RECVMMSG */
2412 	int i;
2413 #endif /* NONBLOCKING_IS_BROKEN */
2414 	struct query *q;
2415 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2416 	q = data->query;
2417 #endif
2418 
2419 	if (!(event & EV_READ)) {
2420 		return;
2421 	}
2422 #ifndef NONBLOCKING_IS_BROKEN
2423 #ifdef HAVE_RECVMMSG
2424 	recvcount = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
2425 	/* this printf strangely gave a performance increase on Linux */
2426 	/* printf("recvcount %d \n", recvcount); */
2427 	if (recvcount == -1) {
2428 		if (errno != EAGAIN && errno != EINTR) {
2429 			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
2430 			STATUP(data->nsd, rxerr);
2431 			/* No zone statup */
2432 		}
2433 		/* Simply no data available */
2434 		return;
2435 	}
2436 	for (i = 0; i < recvcount; i++) {
2437 		received = msgs[i].msg_len;
2438 		queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen;
2439 		if (received == -1) {
2440 			log_msg(LOG_ERR, "recvmmsg failed");
2441 			STATUP(data->nsd, rxerr);
2442 			/* No zone statup */
2443 			/* the error can be found in msgs[i].msg_hdr.msg_flags */
2444 			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2445 			iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
2446 			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2447 			continue;
2448 		}
2449 		q = queries[i];
2450 #else
2451 	for(i=0; i<NUM_RECV_PER_SELECT; i++) {
2452 #endif /* HAVE_RECVMMSG */
2453 #endif /* NONBLOCKING_IS_BROKEN */
2454 
2455 #if (defined(NONBLOCKING_IS_BROKEN) || !defined(HAVE_RECVMMSG))
2456 		/* Initialize the query... */
2457 		query_reset(q, UDP_MAX_MESSAGE_LEN, 0);
2458 
2459 		received = recvfrom(fd,
2460 				    buffer_begin(q->packet),
2461 				    buffer_remaining(q->packet),
2462 				    0,
2463 				    (struct sockaddr *)&q->addr,
2464 				    &q->addrlen);
2465 		if (received == -1) {
2466 			if (errno != EAGAIN && errno != EINTR) {
2467 				log_msg(LOG_ERR, "recvfrom failed: %s", strerror(errno));
2468 				STATUP(data->nsd, rxerr);
2469 				/* No zone statup */
2470 			}
2471 			return;
2472 		}
2473 #endif /* NONBLOCKING_IS_BROKEN || !HAVE_RECVMMSG */
2474 
2475 		/* Account... */
2476 		if (data->socket->fam == AF_INET) {
2477 			STATUP(data->nsd, qudp);
2478 		} else if (data->socket->fam == AF_INET6) {
2479 			STATUP(data->nsd, qudp6);
2480 		}
2481 
2482 		buffer_skip(q->packet, received);
2483 		buffer_flip(q->packet);
2484 #ifdef USE_DNSTAP
2485 		dt_collector_submit_auth_query(data->nsd, &q->addr, q->addrlen,
2486 			q->tcp, q->packet);
2487 #endif /* USE_DNSTAP */
2488 
2489 		/* Process and answer the query... */
2490 		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
2491 			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
2492 				STATUP(data->nsd, nona);
2493 				ZTATUP(data->nsd, q->zone, nona);
2494 			}
2495 
2496 #ifdef USE_ZONE_STATS
2497 			if (data->socket->fam == AF_INET) {
2498 				ZTATUP(data->nsd, q->zone, qudp);
2499 			} else if (data->socket->fam == AF_INET6) {
2500 				ZTATUP(data->nsd, q->zone, qudp6);
2501 			}
2502 #endif
2503 
2504 			/* Add EDNS0 and TSIG info if necessary.  */
2505 			query_add_optional(q, data->nsd);
2506 
2507 			buffer_flip(q->packet);
2508 
2509 			sent = sendto(fd,
2510 				      buffer_begin(q->packet),
2511 				      buffer_remaining(q->packet),
2512 				      0,
2513 				      (struct sockaddr *) &q->addr,
2514 				      q->addrlen);
2515 			if (sent == -1) {
2516 				const char* es = strerror(errno);
2517 				char a[48];
2518 				addr2str(&q->addr, a, sizeof(a));
2519 				log_msg(LOG_ERR, "sendto %s failed: %s", a, es);
2520 				STATUP(data->nsd, txerr);
2521 				ZTATUP(data->nsd, q->zone, txerr);
2522 			} else if ((size_t) sent != buffer_remaining(q->packet)) {
2523 				log_msg(LOG_ERR, "sent %d in place of %d bytes", sent, (int) buffer_remaining(q->packet));
2524 			} else {
2525 #ifdef BIND8_STATS
2526 				/* Account the rcode & TC... */
2527 				STATUP2(data->nsd, rcode, RCODE(q->packet));
2528 				ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
2529 				if (TC(q->packet)) {
2530 					STATUP(data->nsd, truncated);
2531 					ZTATUP(data->nsd, q->zone, truncated);
2532 				}
2533 #endif /* BIND8_STATS */
2534 #ifdef USE_DNSTAP
2535 				dt_collector_submit_auth_response(data->nsd,
2536 					&q->addr, q->addrlen, q->tcp,
2537 					q->packet, q->zone);
2538 #endif /* USE_DNSTAP */
2539 			}
2540 		} else {
2541 			STATUP(data->nsd, dropped);
2542 			ZTATUP(data->nsd, q->zone, dropped);
2543 		}
2544 #ifndef NONBLOCKING_IS_BROKEN
2545 #ifdef HAVE_RECVMMSG
2546 		query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
2547 		iovecs[i].iov_len = buffer_remaining(queries[i]->packet);
2548 		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
2549 #endif
2550 	}
2551 #endif
2552 }
2553 #endif /* defined(HAVE_SENDMMSG) && !defined(NONBLOCKING_IS_BROKEN) && defined(HAVE_RECVMMSG) */
2554 
2555 
2556 static void
2557 cleanup_tcp_handler(struct tcp_handler_data* data)
2558 {
2559 	event_del(&data->event);
2560 	close(data->event.ev_fd);
2561 
2562 	/*
2563 	 * Enable the TCP accept handlers when the current number of
2564 	 * TCP connections is about to drop below the maximum number
2565 	 * of TCP connections.
2566 	 */
2567 	if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
2568 		configure_handler_event_types(EV_READ|EV_PERSIST);
2569 		if(slowaccept) {
2570 			event_del(&slowaccept_event);
2571 			slowaccept = 0;
2572 		}
2573 	}
2574 	--data->nsd->current_tcp_count;
2575 	assert(data->nsd->current_tcp_count >= 0);
2576 
2577 	region_destroy(data->region);
2578 }
2579 
2580 static void
2581 handle_tcp_reading(int fd, short event, void* arg)
2582 {
2583 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
2584 	ssize_t received;
2585 	struct event_base* ev_base;
2586 	struct timeval timeout;
2587 
2588 	if ((event & EV_TIMEOUT)) {
2589 		/* Connection timed out.  */
2590 		cleanup_tcp_handler(data);
2591 		return;
2592 	}
2593 
2594 	if (data->nsd->tcp_query_count > 0 &&
2595 		data->query_count >= data->nsd->tcp_query_count) {
2596 		/* No more queries allowed on this tcp connection.  */
2597 		cleanup_tcp_handler(data);
2598 		return;
2599 	}
2600 
2601 	assert((event & EV_READ));
2602 
2603 	if (data->bytes_transmitted == 0) {
2604 		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
2605 	}
2606 
2607 	/*
2608 	 * Check if we received the leading packet length bytes yet.
2609 	 */
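	/*
	 * (DNS over TCP, RFC 1035 section 4.2.2: every message is
	 * prefixed by a two octet length field in network byte order.
	 * For example a 29 octet query arrives as 0x00 0x1d followed by
	 * the 29 message octets; those two octets are what is assembled
	 * into data->query->tcplen below, possibly one byte at a time.)
	 */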
2610 	if (data->bytes_transmitted < sizeof(uint16_t)) {
2611 		received = read(fd,
2612 				(char *) &data->query->tcplen
2613 				+ data->bytes_transmitted,
2614 				sizeof(uint16_t) - data->bytes_transmitted);
2615 		if (received == -1) {
2616 			if (errno == EAGAIN || errno == EINTR) {
2617 				/*
2618 				 * Read would block, wait until more
2619 				 * data is available.
2620 				 */
2621 				return;
2622 			} else {
2623 				char buf[48];
2624 				addr2str(&data->query->addr, buf, sizeof(buf));
2625 #ifdef ECONNRESET
2626 				if (verbosity >= 2 || errno != ECONNRESET)
2627 #endif /* ECONNRESET */
2628 				log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
2629 				cleanup_tcp_handler(data);
2630 				return;
2631 			}
2632 		} else if (received == 0) {
2633 			/* EOF */
2634 			cleanup_tcp_handler(data);
2635 			return;
2636 		}
2637 
2638 		data->bytes_transmitted += received;
2639 		if (data->bytes_transmitted < sizeof(uint16_t)) {
2640 			/*
2641 			 * Not done with the tcplen yet, wait for more
2642 			 * data to become available.
2643 			 */
2644 			return;
2645 		}
2646 
2647 		assert(data->bytes_transmitted == sizeof(uint16_t));
2648 
2649 		data->query->tcplen = ntohs(data->query->tcplen);
2650 
2651 		/*
2652 		 * Minimum query size is:
2653 		 *
2654 		 *     Size of the header (12)
2655 		 *   + Root domain name   (1)
2656 		 *   + Query class        (2)
2657 		 *   + Query type         (2)
2658 		 */
2659 		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
2660 			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
2661 			cleanup_tcp_handler(data);
2662 			return;
2663 		}
2664 
2665 		if (data->query->tcplen > data->query->maxlen) {
2666 			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
2667 			cleanup_tcp_handler(data);
2668 			return;
2669 		}
2670 
2671 		buffer_set_limit(data->query->packet, data->query->tcplen);
2672 	}
2673 
2674 	assert(buffer_remaining(data->query->packet) > 0);
2675 
2676 	/* Read the (remaining) query data.  */
2677 	received = read(fd,
2678 			buffer_current(data->query->packet),
2679 			buffer_remaining(data->query->packet));
2680 	if (received == -1) {
2681 		if (errno == EAGAIN || errno == EINTR) {
2682 			/*
2683 			 * Read would block, wait until more data is
2684 			 * available.
2685 			 */
2686 			return;
2687 		} else {
2688 			char buf[48];
2689 			addr2str(&data->query->addr, buf, sizeof(buf));
2690 #ifdef ECONNRESET
2691 			if (verbosity >= 2 || errno != ECONNRESET)
2692 #endif /* ECONNRESET */
2693 			log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno));
2694 			cleanup_tcp_handler(data);
2695 			return;
2696 		}
2697 	} else if (received == 0) {
2698 		/* EOF */
2699 		cleanup_tcp_handler(data);
2700 		return;
2701 	}
2702 
2703 	data->bytes_transmitted += received;
2704 	buffer_skip(data->query->packet, received);
2705 	if (buffer_remaining(data->query->packet) > 0) {
2706 		/*
2707 		 * Message not yet complete, wait for more data to
2708 		 * become available.
2709 		 */
2710 		return;
2711 	}
2712 
2713 	assert(buffer_position(data->query->packet) == data->query->tcplen);
2714 
2715 	/* Account... */
2716 #ifdef BIND8_STATS
2717 #ifndef INET6
2718 	STATUP(data->nsd, ctcp);
2719 #else
2720 	if (data->query->addr.ss_family == AF_INET) {
2721 		STATUP(data->nsd, ctcp);
2722 	} else if (data->query->addr.ss_family == AF_INET6) {
2723 		STATUP(data->nsd, ctcp6);
2724 	}
2725 #endif
2726 #endif /* BIND8_STATS */
2727 
2728 	/* We have a complete query, process it.  */
2729 
2730 	/* tcp-query-count: handle query counter ++ */
2731 	data->query_count++;
2732 
2733 	buffer_flip(data->query->packet);
2734 #ifdef USE_DNSTAP
2735 	dt_collector_submit_auth_query(data->nsd, &data->query->addr,
2736 		data->query->addrlen, data->query->tcp, data->query->packet);
2737 #endif /* USE_DNSTAP */
2738 	data->query_state = server_process_query(data->nsd, data->query);
2739 	if (data->query_state == QUERY_DISCARDED) {
2740 		/* Drop the packet and the entire connection... */
2741 		STATUP(data->nsd, dropped);
2742 		ZTATUP(data->nsd, data->query->zone, dropped);
2743 		cleanup_tcp_handler(data);
2744 		return;
2745 	}
2746 
2747 #ifdef BIND8_STATS
2748 	if (RCODE(data->query->packet) == RCODE_OK
2749 	    && !AA(data->query->packet))
2750 	{
2751 		STATUP(data->nsd, nona);
2752 		ZTATUP(data->nsd, data->query->zone, nona);
2753 	}
2754 #endif /* BIND8_STATS */
2755 
2756 #ifdef USE_ZONE_STATS
2757 #ifndef INET6
2758 	ZTATUP(data->nsd, data->query->zone, ctcp);
2759 #else
2760 	if (data->query->addr.ss_family == AF_INET) {
2761 		ZTATUP(data->nsd, data->query->zone, ctcp);
2762 	} else if (data->query->addr.ss_family == AF_INET6) {
2763 		ZTATUP(data->nsd, data->query->zone, ctcp6);
2764 	}
2765 #endif
2766 #endif /* USE_ZONE_STATS */
2767 
2768 	query_add_optional(data->query, data->nsd);
2769 
2770 	/* Switch to the tcp write handler.  */
2771 	buffer_flip(data->query->packet);
2772 	data->query->tcplen = buffer_remaining(data->query->packet);
2773 #ifdef USE_DNSTAP
2774 	dt_collector_submit_auth_response(data->nsd, &data->query->addr,
2775 		data->query->addrlen, data->query->tcp, data->query->packet,
2776 		data->query->zone);
2777 #endif /* USE_DNSTAP */
2778 	data->bytes_transmitted = 0;
2779 
2780 	timeout.tv_sec = data->tcp_timeout / 1000;
2781 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2782 
2783 	ev_base = data->event.ev_base;
2784 	event_del(&data->event);
2785 	event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
2786 		handle_tcp_writing, data);
2787 	if(event_base_set(ev_base, &data->event) != 0)
2788 		log_msg(LOG_ERR, "event base set tcpr failed");
2789 	if(event_add(&data->event, &timeout) != 0)
2790 		log_msg(LOG_ERR, "event add tcpr failed");
2791 	/* see if we can write the answer right away (usually so, EAGAIN if not) */
2792 	handle_tcp_writing(fd, EV_WRITE, data);
2793 }
2794 
2795 static void
2796 handle_tcp_writing(int fd, short event, void* arg)
2797 {
2798 	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
2799 	ssize_t sent;
2800 	struct query *q = data->query;
2801 	struct timeval timeout;
2802 	struct event_base* ev_base;
2803 
2804 	if ((event & EV_TIMEOUT)) {
2805 		/* Connection timed out.  */
2806 		cleanup_tcp_handler(data);
2807 		return;
2808 	}
2809 
2810 	assert((event & EV_WRITE));
2811 
2812 	if (data->bytes_transmitted < sizeof(q->tcplen)) {
2813 		/* Writing the response packet length.  */
2814 		uint16_t n_tcplen = htons(q->tcplen);
2815 #ifdef HAVE_WRITEV
2816 		struct iovec iov[2];
2817 		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
2818 		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
2819 		iov[1].iov_base = buffer_begin(q->packet);
2820 		iov[1].iov_len = buffer_limit(q->packet);
2821 		sent = writev(fd, iov, 2);
2822 #else /* HAVE_WRITEV */
2823 		sent = write(fd,
2824 			     (const char *) &n_tcplen + data->bytes_transmitted,
2825 			     sizeof(n_tcplen) - data->bytes_transmitted);
2826 #endif /* HAVE_WRITEV */
2827 		if (sent == -1) {
2828 			if (errno == EAGAIN || errno == EINTR) {
2829 				/*
2830 				 * Write would block, wait until
2831 				 * socket becomes writable again.
2832 				 */
2833 				return;
2834 			} else {
2835 #ifdef ECONNRESET
2836 				if(verbosity >= 2 || errno != ECONNRESET)
2837 #endif /* ECONNRESET */
2838 #ifdef EPIPE
2839 				  if(verbosity >= 2 || errno != EPIPE)
2840 #endif /* EPIPE 'broken pipe' */
2841 				    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
2842 				cleanup_tcp_handler(data);
2843 				return;
2844 			}
2845 		}
2846 
2847 		data->bytes_transmitted += sent;
2848 		if (data->bytes_transmitted < sizeof(q->tcplen)) {
2849 			/*
2850 			 * Writing not complete, wait until socket
2851 			 * becomes writable again.
2852 			 */
2853 			return;
2854 		}
2855 
2856 #ifdef HAVE_WRITEV
2857 		sent -= sizeof(n_tcplen);
2858 		/* the writev may also have sent packet data; fall through to the 'packet done' check */
2859 		goto packet_could_be_done;
2860 #endif
2861 	}
2862 
2863 	sent = write(fd,
2864 		     buffer_current(q->packet),
2865 		     buffer_remaining(q->packet));
2866 	if (sent == -1) {
2867 		if (errno == EAGAIN || errno == EINTR) {
2868 			/*
2869 			 * Write would block, wait until
2870 			 * socket becomes writable again.
2871 			 */
2872 			return;
2873 		} else {
2874 #ifdef ECONNRESET
2875 			if(verbosity >= 2 || errno != ECONNRESET)
2876 #endif /* ECONNRESET */
2877 #ifdef EPIPE
2878 			  if(verbosity >= 2 || errno != EPIPE)
2879 #endif /* EPIPE 'broken pipe' */
2880 			    log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
2881 			cleanup_tcp_handler(data);
2882 			return;
2883 		}
2884 	}
2885 
2886 	data->bytes_transmitted += sent;
2887 #ifdef HAVE_WRITEV
2888   packet_could_be_done:
2889 #endif
2890 	buffer_skip(q->packet, sent);
2891 	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
2892 		/*
2893 		 * Still more data to write when socket becomes
2894 		 * writable again.
2895 		 */
2896 		return;
2897 	}
2898 
2899 	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
2900 
2901 	if (data->query_state == QUERY_IN_AXFR) {
2902 		/* Continue processing AXFR and writing back results.  */
2903 		buffer_clear(q->packet);
2904 		data->query_state = query_axfr(data->nsd, q);
2905 		if (data->query_state != QUERY_PROCESSED) {
2906 			query_add_optional(data->query, data->nsd);
2907 
2908 			/* Reset data. */
2909 			buffer_flip(q->packet);
2910 			q->tcplen = buffer_remaining(q->packet);
2911 			data->bytes_transmitted = 0;
2912 			/* Reset timeout.  */
2913 			timeout.tv_sec = data->tcp_timeout / 1000;
2914 			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2915 			ev_base = data->event.ev_base;
2916 			event_del(&data->event);
2917 			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
2918 				handle_tcp_writing, data);
2919 			if(event_base_set(ev_base, &data->event) != 0)
2920 				log_msg(LOG_ERR, "event base set tcpw failed");
2921 			if(event_add(&data->event, &timeout) != 0)
2922 				log_msg(LOG_ERR, "event add tcpw failed");
2923 
2924 			/*
2925 			 * Write data if/when the socket is writable
2926 			 * again.
2927 			 */
2928 			return;
2929 		}
2930 	}
2931 
2932 	/*
2933 	 * Done sending, wait for the next request to arrive on the
2934 	 * TCP socket by installing the TCP read handler.
2935 	 */
2936 	if (data->nsd->tcp_query_count > 0 &&
2937 		data->query_count >= data->nsd->tcp_query_count) {
2938 
2939 		(void) shutdown(fd, SHUT_WR);
2940 	}
2941 
2942 	data->bytes_transmitted = 0;
2943 
2944 	timeout.tv_sec = data->tcp_timeout / 1000;
2945 	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
2946 	ev_base = data->event.ev_base;
2947 	event_del(&data->event);
2948 	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
2949 		handle_tcp_reading, data);
2950 	if(event_base_set(ev_base, &data->event) != 0)
2951 		log_msg(LOG_ERR, "event base set tcpw failed");
2952 	if(event_add(&data->event, &timeout) != 0)
2953 		log_msg(LOG_ERR, "event add tcpw failed");
2954 }
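
/*
 * Per-connection state machine: handle_tcp_reading assembles the two
 * byte length and the query, processes it, and re-registers the fd
 * with handle_tcp_writing; once the answer (and any AXFR continuation)
 * has been written in full, the fd is handed back to handle_tcp_reading
 * for the next query on the same connection.
 */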
2955 
2956 
2957 static void
2958 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
2959 	void* ATTR_UNUSED(arg))
2960 {
2961 	if(slowaccept) {
2962 		configure_handler_event_types(EV_PERSIST | EV_READ);
2963 		slowaccept = 0;
2964 	}
2965 }
2966 
2967 /*
2968  * Handle an incoming TCP connection.  The connection is accepted and
2969  * a new TCP reader event handler is added.  The TCP handler
2970  * is responsible for cleanup when the connection is closed.
2971  */
2972 static void
2973 handle_tcp_accept(int fd, short event, void* arg)
2974 {
2975 	struct tcp_accept_handler_data *data
2976 		= (struct tcp_accept_handler_data *) arg;
2977 	int s;
2978 	struct tcp_handler_data *tcp_data;
2979 	region_type *tcp_region;
2980 #ifdef INET6
2981 	struct sockaddr_storage addr;
2982 #else
2983 	struct sockaddr_in addr;
2984 #endif
2985 	socklen_t addrlen;
2986 	struct timeval timeout;
2987 
2988 	if (!(event & EV_READ)) {
2989 		return;
2990 	}
2991 
2992 	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
2993 		return;
2994 	}
2995 
2996 	/* Accept it... */
2997 	addrlen = sizeof(addr);
2998 #ifndef HAVE_ACCEPT4
2999 	s = accept(fd, (struct sockaddr *) &addr, &addrlen);
3000 #else
3001 	s = accept4(fd, (struct sockaddr *) &addr, &addrlen, SOCK_NONBLOCK);
3002 #endif
3003 	if (s == -1) {
3004 		/*
3005 		 * EMFILE and ENFILE signal that the limit of open
3006 		 * file descriptors has been reached. Pause accept().
3007 		 * EINTR is a signal interrupt. The others are various OS ways
3008 		 * of saying that the client has closed the connection.
3009 		 */
3010 		if (errno == EMFILE || errno == ENFILE) {
3011 			if (!slowaccept) {
3012 				/* disable accept events */
3013 				struct timeval tv;
3014 				configure_handler_event_types(0);
3015 				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
3016 				tv.tv_usec = 0L;
3017 				event_set(&slowaccept_event, -1, EV_TIMEOUT,
3018 					handle_slowaccept_timeout, NULL);
3019 				(void)event_base_set(data->event.ev_base,
3020 					&slowaccept_event);
3021 				(void)event_add(&slowaccept_event, &tv);
3022 				slowaccept = 1;
3023 				/* We don't want to spam the logs here */
3024 			}
3025 		} else if (errno != EINTR
3026 			&& errno != EWOULDBLOCK
3027 #ifdef ECONNABORTED
3028 			&& errno != ECONNABORTED
3029 #endif /* ECONNABORTED */
3030 #ifdef EPROTO
3031 			&& errno != EPROTO
3032 #endif /* EPROTO */
3033 			) {
3034 			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
3035 		}
3036 		return;
3037 	}
3038 
3039 #ifndef HAVE_ACCEPT4
3040 	if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
3041 		log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
3042 		close(s);
3043 		return;
3044 	}
3045 #endif
3046 
3047 	/*
3048 	 * This region is deallocated when the TCP connection is
3049 	 * closed by the TCP handler.
3050 	 */
3051 	tcp_region = region_create(xalloc, free);
3052 	tcp_data = (struct tcp_handler_data *) region_alloc(
3053 		tcp_region, sizeof(struct tcp_handler_data));
3054 	tcp_data->region = tcp_region;
3055 	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
3056 		compression_table_size, compressed_dnames);
3057 	tcp_data->nsd = data->nsd;
3058 	tcp_data->query_count = 0;
3059 
3060 	tcp_data->query_state = QUERY_PROCESSED;
3061 	tcp_data->bytes_transmitted = 0;
3062 	memcpy(&tcp_data->query->addr, &addr, addrlen);
3063 	tcp_data->query->addrlen = addrlen;
3064 
3065 	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
3066 	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
3067 		/* very busy, give smaller timeout */
3068 		tcp_data->tcp_timeout = 200;
3069 	}
3070 	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
3071 	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;
3072 
3073 	event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
3074 		handle_tcp_reading, tcp_data);
3075 	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
3076 		log_msg(LOG_ERR, "cannot set tcp event base");
3077 		close(s);
3078 		region_destroy(tcp_region);
3079 		return;
3080 	}
3081 	if(event_add(&tcp_data->event, &timeout) != 0) {
3082 		log_msg(LOG_ERR, "cannot add tcp to event base");
3083 		close(s);
3084 		region_destroy(tcp_region);
3085 		return;
3086 	}
3087 
3088 	/*
3089 	 * Keep track of the total number of TCP handlers installed so
3090 	 * we can stop accepting connections when the maximum number
3091 	 * of simultaneous TCP connections is reached.
3092 	 */
3093 	++data->nsd->current_tcp_count;
3094 	if (data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) {
3095 		configure_handler_event_types(0);
3096 	}
3097 }
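
/*
 * Accept throttling life cycle: handle_tcp_accept above disables all
 * accept events when current_tcp_count reaches maximum_tcp_count (or,
 * via slowaccept, when the process runs out of file descriptors);
 * cleanup_tcp_handler and handle_slowaccept_timeout re-enable them
 * through configure_handler_event_types() below.
 */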
3098 
3099 static void
3100 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
3101 {
3102 	size_t i;
3103 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
3104 	for (i = 0; i < nsd->child_count; ++i) {
3105 		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
3106 			if (write(nsd->children[i].child_fd,
3107 				&command,
3108 				sizeof(command)) == -1)
3109 			{
3110 				if(errno != EAGAIN && errno != EINTR)
3111 					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
3112 					(int) command,
3113 					(int) nsd->children[i].pid,
3114 					strerror(errno));
3115 			} else if (timeout > 0) {
3116 				(void)block_read(NULL,
3117 					nsd->children[i].child_fd,
3118 					&command, sizeof(command), timeout);
3119 			}
3120 			fsync(nsd->children[i].child_fd);
3121 			close(nsd->children[i].child_fd);
3122 			nsd->children[i].child_fd = -1;
3123 		}
3124 	}
3125 }
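
/*
 * On the other end of child_fd, the child's IPC handler reads the
 * sig_atomic_t command and acts on it.  A sketch of one plausible read
 * side, assuming (as the block_read() above suggests) that the child
 * echoes the command back as the acknowledgement; illustrative only,
 * not the actual ipc.c code:
 */
#if 0
	sig_atomic_t cmd;
	if(read(nsd->this_child->parent_fd, &cmd, sizeof(cmd)) ==
		(ssize_t)sizeof(cmd)) {
		if(cmd == NSD_QUIT_CHILD)	/* parent waits for this echo */
			(void)write(nsd->this_child->parent_fd, &cmd, sizeof(cmd));
		nsd->mode = cmd;		/* picked up by the main loop */
	}
#endif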
3126 
3127 static void
3128 send_children_quit(struct nsd* nsd)
3129 {
3130 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
3131 	send_children_command(nsd, NSD_QUIT, 0);
3132 }
3133 
3134 static void
3135 send_children_quit_and_wait(struct nsd* nsd)
3136 {
3137 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
3138 	send_children_command(nsd, NSD_QUIT_CHILD, 3);
3139 }
3140 
3141 #ifdef BIND8_STATS
3142 static void
3143 set_children_stats(struct nsd* nsd)
3144 {
3145 	size_t i;
3146 	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
3147 	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
3148 	for (i = 0; i < nsd->child_count; ++i) {
3149 		nsd->children[i].need_to_send_STATS = 1;
3150 		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
3151 	}
3152 }
3153 #endif /* BIND8_STATS */
3154 
3155 static void
3156 configure_handler_event_types(short event_types)
3157 {
3158 	size_t i;
3159 
3160 	for (i = 0; i < tcp_accept_handler_count; ++i) {
3161 		struct event* handler = &tcp_accept_handlers[i].event;
3162 		if(event_types) {
3163 			/* reassign */
3164 			int fd = handler->ev_fd;
3165 			struct event_base* base = handler->ev_base;
3166 			if(tcp_accept_handlers[i].event_added)
3167 				event_del(handler);
3168 			event_set(handler, fd, event_types,
3169 				handle_tcp_accept, &tcp_accept_handlers[i]);
3170 			if(event_base_set(base, handler) != 0)
3171 				log_msg(LOG_ERR, "conhand: cannot event_base");
3172 			if(event_add(handler, NULL) != 0)
3173 				log_msg(LOG_ERR, "conhand: cannot event_add");
3174 			tcp_accept_handlers[i].event_added = 1;
3175 		} else {
3176 			/* remove */
3177 			if(tcp_accept_handlers[i].event_added) {
3178 				event_del(handler);
3179 				tcp_accept_handlers[i].event_added = 0;
3180 			}
3181 		}
3182 	}
3183 }
3184