1 /* 2 * server.c -- nsd(8) network input/output 3 * 4 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. 5 * 6 * See LICENSE for the license. 7 * 8 */ 9 10 #include "config.h" 11 12 #include <sys/types.h> 13 #include <sys/param.h> 14 #include <limits.h> 15 #include <sys/socket.h> 16 #include <sys/uio.h> 17 #include <sys/wait.h> 18 19 #include <netinet/in.h> 20 #ifdef USE_TCP_FASTOPEN 21 #include <netinet/tcp.h> 22 #endif 23 #include <arpa/inet.h> 24 25 #include <assert.h> 26 #include <ctype.h> 27 #include <errno.h> 28 #include <fcntl.h> 29 #include <stddef.h> 30 #include <stdio.h> 31 #include <stdlib.h> 32 #include <string.h> 33 #include <time.h> 34 #include <unistd.h> 35 #include <signal.h> 36 #include <netdb.h> 37 #include <poll.h> 38 #ifdef HAVE_SYS_RANDOM_H 39 #include <sys/random.h> 40 #endif 41 #ifndef SHUT_WR 42 #define SHUT_WR 1 43 #endif 44 #ifdef HAVE_MMAP 45 #include <sys/mman.h> 46 #endif /* HAVE_MMAP */ 47 #ifdef HAVE_OPENSSL_RAND_H 48 #include <openssl/rand.h> 49 #endif 50 #ifdef HAVE_OPENSSL_SSL_H 51 #include <openssl/ssl.h> 52 #endif 53 #ifdef HAVE_OPENSSL_ERR_H 54 #include <openssl/err.h> 55 #endif 56 #ifdef HAVE_OPENSSL_OCSP_H 57 #include <openssl/ocsp.h> 58 #endif 59 #ifndef USE_MINI_EVENT 60 # ifdef HAVE_EVENT_H 61 # include <event.h> 62 # else 63 # include <event2/event.h> 64 # include "event2/event_struct.h" 65 # include "event2/event_compat.h" 66 # endif 67 #else 68 # include "mini_event.h" 69 #endif 70 71 #include "axfr.h" 72 #include "namedb.h" 73 #include "netio.h" 74 #include "xfrd.h" 75 #include "xfrd-tcp.h" 76 #include "xfrd-disk.h" 77 #include "difffile.h" 78 #include "nsec3.h" 79 #include "ipc.h" 80 #include "udb.h" 81 #include "remote.h" 82 #include "lookup3.h" 83 #include "rrl.h" 84 #ifdef USE_DNSTAP 85 #include "dnstap/dnstap_collector.h" 86 #endif 87 88 #define RELOAD_SYNC_TIMEOUT 25 /* seconds */ 89 90 #ifdef USE_DNSTAP 91 /* 92 * log_addr() - the function to print sockaddr_in/sockaddr_in6 structures content 93 * just like its done in Unbound via the same log_addr(VERB_LEVEL, const char*, sockaddr_storage*) 94 */ 95 static void 96 log_addr(const char* descr, 97 #ifdef INET6 98 struct sockaddr_storage* addr, 99 #else 100 struct sockaddr_in* addr, 101 #endif 102 short family) 103 { 104 char str_buf[64]; 105 if(verbosity < 6) 106 return; 107 if(family == AF_INET) { 108 struct sockaddr_in* s = (struct sockaddr_in*)addr; 109 inet_ntop(AF_INET, &s->sin_addr.s_addr, str_buf, sizeof(str_buf)); 110 VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s->sin_port))); 111 #ifdef INET6 112 } else { 113 struct sockaddr_in6* s6 = (struct sockaddr_in6*)addr; 114 inet_ntop(AF_INET6, &s6->sin6_addr.s6_addr, str_buf, sizeof(str_buf)); 115 VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s6->sin6_port))); 116 #endif 117 } 118 } 119 #endif /* USE_DNSTAP */ 120 121 #ifdef USE_TCP_FASTOPEN 122 #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen" 123 #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2 124 #endif 125 126 /* 127 * Data for the UDP handlers. 128 */ 129 struct udp_handler_data 130 { 131 struct nsd *nsd; 132 struct nsd_socket *socket; 133 struct event event; 134 }; 135 136 struct tcp_accept_handler_data { 137 struct nsd *nsd; 138 struct nsd_socket *socket; 139 int event_added; 140 struct event event; 141 #ifdef HAVE_SSL 142 /* handler accepts TLS connections on the dedicated port */ 143 int tls_accept; 144 #endif 145 }; 146 147 /* 148 * These globals are used to enable the TCP accept handlers 149 * when the number of TCP connection drops below the maximum 150 * number of TCP connections. 151 */ 152 static size_t tcp_accept_handler_count; 153 static struct tcp_accept_handler_data *tcp_accept_handlers; 154 155 static struct event slowaccept_event; 156 static int slowaccept; 157 158 #ifdef HAVE_SSL 159 static unsigned char *ocspdata = NULL; 160 static long ocspdata_len = 0; 161 #endif 162 163 #ifdef NONBLOCKING_IS_BROKEN 164 /* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to 165 read multiple times from a socket when reported ready by select. */ 166 # define NUM_RECV_PER_SELECT (1) 167 #else /* !NONBLOCKING_IS_BROKEN */ 168 # define NUM_RECV_PER_SELECT (100) 169 #endif /* NONBLOCKING_IS_BROKEN */ 170 171 #ifndef HAVE_MMSGHDR 172 struct mmsghdr { 173 struct msghdr msg_hdr; 174 unsigned int msg_len; 175 }; 176 #endif 177 178 static struct mmsghdr msgs[NUM_RECV_PER_SELECT]; 179 static struct iovec iovecs[NUM_RECV_PER_SELECT]; 180 static struct query *queries[NUM_RECV_PER_SELECT]; 181 182 /* 183 * Data for the TCP connection handlers. 184 * 185 * The TCP handlers use non-blocking I/O. This is necessary to avoid 186 * blocking the entire server on a slow TCP connection, but does make 187 * reading from and writing to the socket more complicated. 188 * 189 * Basically, whenever a read/write would block (indicated by the 190 * EAGAIN errno variable) we remember the position we were reading 191 * from/writing to and return from the TCP reading/writing event 192 * handler. When the socket becomes readable/writable again we 193 * continue from the same position. 194 */ 195 struct tcp_handler_data 196 { 197 /* 198 * The region used to allocate all TCP connection related 199 * data, including this structure. This region is destroyed 200 * when the connection is closed. 201 */ 202 region_type* region; 203 204 /* 205 * The global nsd structure. 206 */ 207 struct nsd* nsd; 208 209 /* 210 * The current query data for this TCP connection. 211 */ 212 query_type* query; 213 214 /* 215 * The query_state is used to remember if we are performing an 216 * AXFR, if we're done processing, or if we should discard the 217 * query and connection. 218 */ 219 query_state_type query_state; 220 221 /* 222 * The event for the file descriptor and tcp timeout 223 */ 224 struct event event; 225 226 /* 227 * The bytes_transmitted field is used to remember the number 228 * of bytes transmitted when receiving or sending a DNS 229 * packet. The count includes the two additional bytes used 230 * to specify the packet length on a TCP connection. 231 */ 232 size_t bytes_transmitted; 233 234 /* 235 * The number of queries handled by this specific TCP connection. 236 */ 237 int query_count; 238 239 /* 240 * The timeout in msec for this tcp connection 241 */ 242 int tcp_timeout; 243 244 /* 245 * If the connection is allowed to have further queries on it. 246 */ 247 int tcp_no_more_queries; 248 249 #ifdef USE_DNSTAP 250 /* the socket of the accept socket to find proper service (local) address the socket is bound to. */ 251 struct nsd_socket *socket; 252 #endif /* USE_DNSTAP */ 253 254 #ifdef HAVE_SSL 255 /* 256 * TLS object. 257 */ 258 SSL* tls; 259 260 /* 261 * TLS handshake state. 262 */ 263 enum { tls_hs_none, tls_hs_read, tls_hs_write, 264 tls_hs_read_event, tls_hs_write_event } shake_state; 265 #endif 266 /* list of connections, for service of remaining tcp channels */ 267 struct tcp_handler_data *prev, *next; 268 }; 269 /* global that is the list of active tcp channels */ 270 static struct tcp_handler_data *tcp_active_list = NULL; 271 272 /* 273 * Handle incoming queries on the UDP server sockets. 274 */ 275 static void handle_udp(int fd, short event, void* arg); 276 277 /* 278 * Handle incoming connections on the TCP sockets. These handlers 279 * usually wait for the NETIO_EVENT_READ event (indicating an incoming 280 * connection) but are disabled when the number of current TCP 281 * connections is equal to the maximum number of TCP connections. 282 * Disabling is done by changing the handler to wait for the 283 * NETIO_EVENT_NONE type. This is done using the function 284 * configure_tcp_accept_handlers. 285 */ 286 static void handle_tcp_accept(int fd, short event, void* arg); 287 288 /* 289 * Handle incoming queries on a TCP connection. The TCP connections 290 * are configured to be non-blocking and the handler may be called 291 * multiple times before a complete query is received. 292 */ 293 static void handle_tcp_reading(int fd, short event, void* arg); 294 295 /* 296 * Handle outgoing responses on a TCP connection. The TCP connections 297 * are configured to be non-blocking and the handler may be called 298 * multiple times before a complete response is sent. 299 */ 300 static void handle_tcp_writing(int fd, short event, void* arg); 301 302 #ifdef HAVE_SSL 303 /* Create SSL object and associate fd */ 304 static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd); 305 /* 306 * Handle TLS handshake. May be called multiple times if incomplete. 307 */ 308 static int tls_handshake(struct tcp_handler_data* data, int fd, int writing); 309 310 /* 311 * Handle incoming queries on a TLS over TCP connection. The TLS 312 * connections are configured to be non-blocking and the handler may 313 * be called multiple times before a complete query is received. 314 */ 315 static void handle_tls_reading(int fd, short event, void* arg); 316 317 /* 318 * Handle outgoing responses on a TLS over TCP connection. The TLS 319 * connections are configured to be non-blocking and the handler may 320 * be called multiple times before a complete response is sent. 321 */ 322 static void handle_tls_writing(int fd, short event, void* arg); 323 #endif 324 325 /* 326 * Send all children the quit nonblocking, then close pipe. 327 */ 328 static void send_children_quit(struct nsd* nsd); 329 /* same, for shutdown time, waits for child to exit to avoid restart issues */ 330 static void send_children_quit_and_wait(struct nsd* nsd); 331 332 /* set childrens flags to send NSD_STATS to them */ 333 #ifdef BIND8_STATS 334 static void set_children_stats(struct nsd* nsd); 335 #endif /* BIND8_STATS */ 336 337 /* 338 * Change the event types the HANDLERS are interested in to EVENT_TYPES. 339 */ 340 static void configure_handler_event_types(short event_types); 341 342 static uint16_t *compressed_dname_offsets = 0; 343 static uint32_t compression_table_capacity = 0; 344 static uint32_t compression_table_size = 0; 345 static domain_type* compressed_dnames[MAXRRSPP]; 346 347 #ifdef USE_TCP_FASTOPEN 348 /* Checks to see if the kernel value must be manually changed in order for 349 TCP Fast Open to support server mode */ 350 static void report_tcp_fastopen_config() { 351 352 int tcp_fastopen_fp; 353 uint8_t tcp_fastopen_value; 354 355 if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) { 356 log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno)); 357 } 358 if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) { 359 log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno)); 360 close(tcp_fastopen_fp); 361 } 362 if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) { 363 log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n"); 364 log_msg(LOG_WARNING, "However the kernel paramenters are not configured to support TCP_FASTOPEN in server mode.\n"); 365 log_msg(LOG_WARNING, "To enable TFO use the command:"); 366 log_msg(LOG_WARNING, " 'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n"); 367 log_msg(LOG_WARNING, " 'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n"); 368 log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n"); 369 close(tcp_fastopen_fp); 370 } 371 close(tcp_fastopen_fp); 372 } 373 #endif 374 375 /* 376 * Remove the specified pid from the list of child pids. Returns -1 if 377 * the pid is not in the list, child_num otherwise. The field is set to 0. 378 */ 379 static int 380 delete_child_pid(struct nsd *nsd, pid_t pid) 381 { 382 size_t i; 383 for (i = 0; i < nsd->child_count; ++i) { 384 if (nsd->children[i].pid == pid) { 385 nsd->children[i].pid = 0; 386 if(!nsd->children[i].need_to_exit) { 387 if(nsd->children[i].child_fd != -1) 388 close(nsd->children[i].child_fd); 389 nsd->children[i].child_fd = -1; 390 if(nsd->children[i].handler) 391 nsd->children[i].handler->fd = -1; 392 } 393 return i; 394 } 395 } 396 return -1; 397 } 398 399 /* 400 * Restart child servers if necessary. 401 */ 402 static int 403 restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio, 404 int* xfrd_sock_p) 405 { 406 struct main_ipc_handler_data *ipc_data; 407 size_t i; 408 int sv[2]; 409 410 /* Fork the child processes... */ 411 for (i = 0; i < nsd->child_count; ++i) { 412 if (nsd->children[i].pid <= 0) { 413 if (nsd->children[i].child_fd != -1) 414 close(nsd->children[i].child_fd); 415 if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) { 416 log_msg(LOG_ERR, "socketpair: %s", 417 strerror(errno)); 418 return -1; 419 } 420 nsd->children[i].child_fd = sv[0]; 421 nsd->children[i].parent_fd = sv[1]; 422 nsd->children[i].pid = fork(); 423 switch (nsd->children[i].pid) { 424 default: /* SERVER MAIN */ 425 close(nsd->children[i].parent_fd); 426 nsd->children[i].parent_fd = -1; 427 if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) { 428 log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); 429 } 430 if(!nsd->children[i].handler) 431 { 432 ipc_data = (struct main_ipc_handler_data*) region_alloc( 433 region, sizeof(struct main_ipc_handler_data)); 434 ipc_data->nsd = nsd; 435 ipc_data->child = &nsd->children[i]; 436 ipc_data->child_num = i; 437 ipc_data->xfrd_sock = xfrd_sock_p; 438 ipc_data->packet = buffer_create(region, QIOBUFSZ); 439 ipc_data->forward_mode = 0; 440 ipc_data->got_bytes = 0; 441 ipc_data->total_bytes = 0; 442 ipc_data->acl_num = 0; 443 nsd->children[i].handler = (struct netio_handler*) region_alloc( 444 region, sizeof(struct netio_handler)); 445 nsd->children[i].handler->fd = nsd->children[i].child_fd; 446 nsd->children[i].handler->timeout = NULL; 447 nsd->children[i].handler->user_data = ipc_data; 448 nsd->children[i].handler->event_types = NETIO_EVENT_READ; 449 nsd->children[i].handler->event_handler = parent_handle_child_command; 450 netio_add_handler(netio, nsd->children[i].handler); 451 } 452 /* clear any ongoing ipc */ 453 ipc_data = (struct main_ipc_handler_data*) 454 nsd->children[i].handler->user_data; 455 ipc_data->forward_mode = 0; 456 /* restart - update fd */ 457 nsd->children[i].handler->fd = nsd->children[i].child_fd; 458 break; 459 case 0: /* CHILD */ 460 /* the child need not be able to access the 461 * nsd.db file */ 462 namedb_close_udb(nsd->db); 463 #ifdef MEMCLEAN /* OS collects memory pages */ 464 region_destroy(region); 465 #endif 466 467 if (pledge("stdio rpath inet", NULL) == -1) { 468 log_msg(LOG_ERR, "pledge"); 469 exit(1); 470 } 471 472 nsd->pid = 0; 473 nsd->child_count = 0; 474 nsd->server_kind = nsd->children[i].kind; 475 nsd->this_child = &nsd->children[i]; 476 nsd->this_child->child_num = i; 477 /* remove signal flags inherited from parent 478 the parent will handle them. */ 479 nsd->signal_hint_reload_hup = 0; 480 nsd->signal_hint_reload = 0; 481 nsd->signal_hint_child = 0; 482 nsd->signal_hint_quit = 0; 483 nsd->signal_hint_shutdown = 0; 484 nsd->signal_hint_stats = 0; 485 nsd->signal_hint_statsusr = 0; 486 close(*xfrd_sock_p); 487 close(nsd->this_child->child_fd); 488 nsd->this_child->child_fd = -1; 489 if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) { 490 log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); 491 } 492 server_child(nsd); 493 /* NOTREACH */ 494 exit(0); 495 case -1: 496 log_msg(LOG_ERR, "fork failed: %s", 497 strerror(errno)); 498 return -1; 499 } 500 } 501 } 502 return 0; 503 } 504 505 #ifdef BIND8_STATS 506 static void set_bind8_alarm(struct nsd* nsd) 507 { 508 /* resync so that the next alarm is on the next whole minute */ 509 if(nsd->st.period > 0) /* % by 0 gives divbyzero error */ 510 alarm(nsd->st.period - (time(NULL) % nsd->st.period)); 511 } 512 #endif 513 514 /* set zone stat ids for zones initially read in */ 515 static void 516 zonestatid_tree_set(struct nsd* nsd) 517 { 518 struct radnode* n; 519 for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) { 520 zone_type* zone = (zone_type*)n->elem; 521 zone->zonestatid = getzonestatid(nsd->options, zone->opts); 522 } 523 } 524 525 #ifdef USE_ZONE_STATS 526 void 527 server_zonestat_alloc(struct nsd* nsd) 528 { 529 size_t num = (nsd->options->zonestatnames->count==0?1: 530 nsd->options->zonestatnames->count); 531 size_t sz = sizeof(struct nsdst)*num; 532 char tmpfile[256]; 533 uint8_t z = 0; 534 535 /* file names */ 536 nsd->zonestatfname[0] = 0; 537 nsd->zonestatfname[1] = 0; 538 snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0", 539 nsd->options->xfrdir, (int)getpid(), (unsigned)getpid()); 540 nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile); 541 snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1", 542 nsd->options->xfrdir, (int)getpid(), (unsigned)getpid()); 543 nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile); 544 545 /* file descriptors */ 546 nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600); 547 if(nsd->zonestatfd[0] == -1) { 548 log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0], 549 strerror(errno)); 550 exit(1); 551 } 552 nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600); 553 if(nsd->zonestatfd[0] == -1) { 554 log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1], 555 strerror(errno)); 556 close(nsd->zonestatfd[0]); 557 unlink(nsd->zonestatfname[0]); 558 exit(1); 559 } 560 561 #ifdef HAVE_MMAP 562 if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) { 563 log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0], 564 strerror(errno)); 565 exit(1); 566 } 567 if(write(nsd->zonestatfd[0], &z, 1) == -1) { 568 log_msg(LOG_ERR, "cannot extend stat file %s (%s)", 569 nsd->zonestatfname[0], strerror(errno)); 570 exit(1); 571 } 572 if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) { 573 log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1], 574 strerror(errno)); 575 exit(1); 576 } 577 if(write(nsd->zonestatfd[1], &z, 1) == -1) { 578 log_msg(LOG_ERR, "cannot extend stat file %s (%s)", 579 nsd->zonestatfname[1], strerror(errno)); 580 exit(1); 581 } 582 nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE, 583 MAP_SHARED, nsd->zonestatfd[0], 0); 584 if(nsd->zonestat[0] == MAP_FAILED) { 585 log_msg(LOG_ERR, "mmap failed: %s", strerror(errno)); 586 unlink(nsd->zonestatfname[0]); 587 unlink(nsd->zonestatfname[1]); 588 exit(1); 589 } 590 nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE, 591 MAP_SHARED, nsd->zonestatfd[1], 0); 592 if(nsd->zonestat[1] == MAP_FAILED) { 593 log_msg(LOG_ERR, "mmap failed: %s", strerror(errno)); 594 unlink(nsd->zonestatfname[0]); 595 unlink(nsd->zonestatfname[1]); 596 exit(1); 597 } 598 memset(nsd->zonestat[0], 0, sz); 599 memset(nsd->zonestat[1], 0, sz); 600 nsd->zonestatsize[0] = num; 601 nsd->zonestatsize[1] = num; 602 nsd->zonestatdesired = num; 603 nsd->zonestatsizenow = num; 604 nsd->zonestatnow = nsd->zonestat[0]; 605 #endif /* HAVE_MMAP */ 606 } 607 608 void 609 zonestat_remap(struct nsd* nsd, int idx, size_t sz) 610 { 611 #ifdef HAVE_MMAP 612 #ifdef MREMAP_MAYMOVE 613 nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx], 614 sizeof(struct nsdst)*nsd->zonestatsize[idx], sz, 615 MREMAP_MAYMOVE); 616 if(nsd->zonestat[idx] == MAP_FAILED) { 617 log_msg(LOG_ERR, "mremap failed: %s", strerror(errno)); 618 exit(1); 619 } 620 #else /* !HAVE MREMAP */ 621 if(msync(nsd->zonestat[idx], 622 sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0) 623 log_msg(LOG_ERR, "msync failed: %s", strerror(errno)); 624 if(munmap(nsd->zonestat[idx], 625 sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0) 626 log_msg(LOG_ERR, "munmap failed: %s", strerror(errno)); 627 nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz, 628 PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0); 629 if(nsd->zonestat[idx] == MAP_FAILED) { 630 log_msg(LOG_ERR, "mmap failed: %s", strerror(errno)); 631 exit(1); 632 } 633 #endif /* MREMAP */ 634 #endif /* HAVE_MMAP */ 635 } 636 637 /* realloc the zonestat array for the one that is not currently in use, 638 * to match the desired new size of the array (if applicable) */ 639 void 640 server_zonestat_realloc(struct nsd* nsd) 641 { 642 #ifdef HAVE_MMAP 643 uint8_t z = 0; 644 size_t sz; 645 int idx = 0; /* index of the zonestat array that is not in use */ 646 if(nsd->zonestatnow == nsd->zonestat[0]) 647 idx = 1; 648 if(nsd->zonestatsize[idx] == nsd->zonestatdesired) 649 return; 650 sz = sizeof(struct nsdst)*nsd->zonestatdesired; 651 if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) { 652 log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx], 653 strerror(errno)); 654 exit(1); 655 } 656 if(write(nsd->zonestatfd[idx], &z, 1) == -1) { 657 log_msg(LOG_ERR, "cannot extend stat file %s (%s)", 658 nsd->zonestatfname[idx], strerror(errno)); 659 exit(1); 660 } 661 zonestat_remap(nsd, idx, sz); 662 /* zero the newly allocated region */ 663 if(nsd->zonestatdesired > nsd->zonestatsize[idx]) { 664 memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) * 665 nsd->zonestatsize[idx], 0, sizeof(struct nsdst) * 666 (nsd->zonestatdesired - nsd->zonestatsize[idx])); 667 } 668 nsd->zonestatsize[idx] = nsd->zonestatdesired; 669 #endif /* HAVE_MMAP */ 670 } 671 672 /* switchover to use the other array for the new children, that 673 * briefly coexist with the old children. And we want to avoid them 674 * both writing to the same statistics arrays. */ 675 void 676 server_zonestat_switch(struct nsd* nsd) 677 { 678 if(nsd->zonestatnow == nsd->zonestat[0]) { 679 nsd->zonestatnow = nsd->zonestat[1]; 680 nsd->zonestatsizenow = nsd->zonestatsize[1]; 681 } else { 682 nsd->zonestatnow = nsd->zonestat[0]; 683 nsd->zonestatsizenow = nsd->zonestatsize[0]; 684 } 685 } 686 #endif /* USE_ZONE_STATS */ 687 688 static void 689 cleanup_dname_compression_tables(void *ptr) 690 { 691 free(ptr); 692 compressed_dname_offsets = NULL; 693 compression_table_capacity = 0; 694 } 695 696 static void 697 initialize_dname_compression_tables(struct nsd *nsd) 698 { 699 size_t needed = domain_table_count(nsd->db->domains) + 1; 700 needed += EXTRA_DOMAIN_NUMBERS; 701 if(compression_table_capacity < needed) { 702 if(compressed_dname_offsets) { 703 region_remove_cleanup(nsd->db->region, 704 cleanup_dname_compression_tables, 705 compressed_dname_offsets); 706 free(compressed_dname_offsets); 707 } 708 compressed_dname_offsets = (uint16_t *) xmallocarray( 709 needed, sizeof(uint16_t)); 710 region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables, 711 compressed_dname_offsets); 712 compression_table_capacity = needed; 713 compression_table_size=domain_table_count(nsd->db->domains)+1; 714 } 715 memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t)); 716 compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */ 717 } 718 719 static int 720 set_cloexec(struct nsd_socket *sock) 721 { 722 assert(sock != NULL); 723 724 if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) { 725 const char *socktype = 726 sock->addr.ai_family == SOCK_DGRAM ? "udp" : "tcp"; 727 log_msg(LOG_ERR, "fcntl(..., O_CLOEXEC) failed for %s: %s", 728 socktype, strerror(errno)); 729 return -1; 730 } 731 732 return 1; 733 } 734 735 static int 736 set_reuseport(struct nsd_socket *sock) 737 { 738 #ifdef SO_REUSEPORT 739 int on = 1; 740 #ifdef SO_REUSEPORT_LB 741 /* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like 742 * SO_REUSEPORT on Linux. This is what the users want with the config 743 * option in nsd.conf; if we actually need local address and port reuse 744 * they'll also need to have SO_REUSEPORT set for them, assume it was 745 * _LB they want. 746 */ 747 int opt = SO_REUSEPORT_LB; 748 static const char optname[] = "SO_REUSEPORT_LB"; 749 #else /* !SO_REUSEPORT_LB */ 750 int opt = SO_REUSEPORT; 751 static const char optname[] = "SO_REUSEPORT"; 752 #endif /* SO_REUSEPORT_LB */ 753 754 if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) { 755 return 1; 756 } else if(verbosity >= 3 || errno != ENOPROTOOPT) { 757 log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s", 758 optname, strerror(errno)); 759 } 760 return -1; 761 #else 762 (void)sock; 763 #endif /* SO_REUSEPORT */ 764 765 return 0; 766 } 767 768 static int 769 set_reuseaddr(struct nsd_socket *sock) 770 { 771 #ifdef SO_REUSEADDR 772 int on = 1; 773 if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) { 774 return 1; 775 } 776 log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s", 777 strerror(errno)); 778 return -1; 779 #endif /* SO_REUSEADDR */ 780 return 0; 781 } 782 783 static int 784 set_rcvbuf(struct nsd_socket *sock, int rcv) 785 { 786 #ifdef SO_RCVBUF 787 #ifdef SO_RCVBUFFORCE 788 if(0 == setsockopt( 789 sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv))) 790 { 791 return 1; 792 } 793 if(errno == EPERM || errno == ENOBUFS) { 794 return 0; 795 } 796 log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s", 797 strerror(errno)); 798 return -1; 799 #else /* !SO_RCVBUFFORCE */ 800 if (0 == setsockopt( 801 sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv))) 802 { 803 return 1; 804 } 805 if(errno == ENOSYS || errno == ENOBUFS) { 806 return 0; 807 } 808 log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s", 809 strerror(errno)); 810 return -1; 811 #endif /* SO_RCVBUFFORCE */ 812 #endif /* SO_RCVBUF */ 813 814 return 0; 815 } 816 817 static int 818 set_sndbuf(struct nsd_socket *sock, int snd) 819 { 820 #ifdef SO_SNDBUF 821 #ifdef SO_SNDBUFFORCE 822 if(0 == setsockopt( 823 sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd))) 824 { 825 return 1; 826 } 827 if(errno == EPERM || errno == ENOBUFS) { 828 return 0; 829 } 830 log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s", 831 strerror(errno)); 832 return -1; 833 #else /* !SO_SNDBUFFORCE */ 834 if(0 == setsockopt( 835 sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd))) 836 { 837 return 1; 838 } 839 if(errno == ENOSYS || errno == ENOBUFS) { 840 return 0; 841 } 842 log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s", 843 strerror(errno)); 844 return -1; 845 #endif /* SO_SNDBUFFORCE */ 846 #endif /* SO_SNDBUF */ 847 848 return 0; 849 } 850 851 static int 852 set_nonblock(struct nsd_socket *sock) 853 { 854 const char *socktype = 855 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 856 857 if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) { 858 log_msg(LOG_ERR, "fctnl(..., O_NONBLOCK) failed for %s: %s", 859 socktype, strerror(errno)); 860 return -1; 861 } 862 863 return 1; 864 } 865 866 static int 867 set_ipv6_v6only(struct nsd_socket *sock) 868 { 869 #ifdef INET6 870 #ifdef IPV6_V6ONLY 871 int on = 1; 872 const char *socktype = 873 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 874 875 if(0 == setsockopt( 876 sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on))) 877 { 878 return 1; 879 } 880 881 log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s", 882 socktype, strerror(errno)); 883 return -1; 884 #endif /* IPV6_V6ONLY */ 885 #endif /* INET6 */ 886 887 return 0; 888 } 889 890 static int 891 set_ipv6_use_min_mtu(struct nsd_socket *sock) 892 { 893 #if defined(INET6) && (defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU)) 894 #if defined(IPV6_USE_MIN_MTU) 895 /* There is no fragmentation of IPv6 datagrams during forwarding in the 896 * network. Therefore we do not send UDP datagrams larger than the 897 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be 898 * larger if the network stack supports IPV6_USE_MIN_MTU. 899 */ 900 int opt = IPV6_USE_MIN_MTU; 901 int optval = 1; 902 static const char optname[] = "IPV6_USE_MIN_MTU"; 903 #elif defined(IPV6_MTU) 904 /* On Linux, PMTUD is disabled by default for datagrams so set the MTU 905 * to the MIN MTU to get the same. 906 */ 907 int opt = IPV6_MTU; 908 int optval = IPV6_MIN_MTU; 909 static const char optname[] = "IPV6_MTU"; 910 #endif 911 if(0 == setsockopt( 912 sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval))) 913 { 914 return 1; 915 } 916 917 log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s", 918 optname, strerror(errno)); 919 return -1; 920 #else 921 (void)sock; 922 #endif /* INET6 */ 923 924 return 0; 925 } 926 927 static int 928 set_ipv4_no_pmtu_disc(struct nsd_socket *sock) 929 { 930 int ret = 0; 931 932 #if defined(IP_MTU_DISCOVER) 933 int opt = IP_MTU_DISCOVER; 934 int optval; 935 # if defined(IP_PMTUDISC_OMIT) 936 /* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU 937 * information and send packets with DF=0. Fragmentation is allowed if 938 * and only if the packet size exceeds the outgoing interface MTU or 939 * the packet encounters smaller MTU link in network. This mitigates 940 * DNS fragmentation attacks by preventing forged PMTU information. 941 * FreeBSD already has same semantics without setting the option. 942 */ 943 optval = IP_PMTUDISC_OMIT; 944 if(0 == setsockopt( 945 sock->s, IPPROTO_IP, opt, &optval, sizeof(optval))) 946 { 947 return 1; 948 } 949 950 log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s", 951 "IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno)); 952 # endif /* IP_PMTUDISC_OMIT */ 953 # if defined(IP_PMTUDISC_DONT) 954 /* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */ 955 optval = IP_PMTUDISC_DONT; 956 if(0 == setsockopt( 957 sock->s, IPPROTO_IP, opt, &optval, sizeof(optval))) 958 { 959 return 1; 960 } 961 962 log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s", 963 "IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno)); 964 # endif 965 ret = -1; 966 #elif defined(IP_DONTFRAG) 967 int off = 0; 968 if (0 == setsockopt( 969 sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off))) 970 { 971 return 1; 972 } 973 974 log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s", 975 strerror(errno)); 976 ret = -1; 977 #else 978 (void)sock; 979 #endif 980 981 return ret; 982 } 983 984 static int 985 set_ip_freebind(struct nsd_socket *sock) 986 { 987 #ifdef IP_FREEBIND 988 int on = 1; 989 const char *socktype = 990 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 991 if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0) 992 { 993 return 1; 994 } 995 log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s", 996 socktype, strerror(errno)); 997 return -1; 998 #else 999 (void)sock; 1000 #endif /* IP_FREEBIND */ 1001 1002 return 0; 1003 } 1004 1005 static int 1006 set_ip_transparent(struct nsd_socket *sock) 1007 { 1008 /* 1009 The scandalous preprocessor blob here calls for some explanation :) 1010 POSIX does not specify an option to bind non-local IPs, so 1011 platforms developed several implementation-specific options, 1012 all set in the same way, but with different names. 1013 For additional complexity, some platform manage this setting 1014 differently for different address families (IPv4 vs IPv6). 1015 This scandalous preprocessor blob below abstracts such variability 1016 in the way which leaves the C code as lean and clear as possible. 1017 */ 1018 1019 #if defined(IP_TRANSPARENT) 1020 # define NSD_SOCKET_OPTION_TRANSPARENT IP_TRANSPARENT 1021 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP 1022 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_TRANSPARENT" 1023 // as of 2020-01, Linux does not support this on IPv6 programmatically 1024 #elif defined(SO_BINDANY) 1025 # define NSD_SOCKET_OPTION_TRANSPARENT SO_BINDANY 1026 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL SOL_SOCKET 1027 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME "SO_BINDANY" 1028 #elif defined(IP_BINDANY) 1029 # define NSD_SOCKET_OPTION_TRANSPARENT IP_BINDANY 1030 # define NSD_SOCKET_OPTION_TRANSPARENT6 IPV6_BINDANY 1031 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP 1032 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 IPPROTO_IPV6 1033 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_BINDANY" 1034 #endif 1035 1036 #ifndef NSD_SOCKET_OPTION_TRANSPARENT 1037 (void)sock; 1038 #else 1039 # ifndef NSD_SOCKET_OPTION_TRANSPARENT6 1040 # define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT 1041 # endif 1042 # ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 1043 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL 1044 # endif 1045 # ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6 1046 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME 1047 # endif 1048 1049 int on = 1; 1050 const char *socktype = 1051 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 1052 const int is_ip6 = (sock->addr.ai_family == AF_INET6); 1053 1054 if(0 == setsockopt( 1055 sock->s, 1056 is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL, 1057 is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT, 1058 &on, sizeof(on))) 1059 { 1060 return 1; 1061 } 1062 1063 log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s", 1064 is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno)); 1065 return -1; 1066 #endif 1067 1068 return 0; 1069 } 1070 1071 static int 1072 set_tcp_maxseg(struct nsd_socket *sock, int mss) 1073 { 1074 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG) 1075 if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) { 1076 return 1; 1077 } 1078 log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s", 1079 strerror(errno)); 1080 return -1; 1081 #else 1082 log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported"); 1083 #endif 1084 return 0; 1085 } 1086 1087 #ifdef USE_TCP_FASTOPEN 1088 static int 1089 set_tcp_fastopen(struct nsd_socket *sock) 1090 { 1091 /* qlen specifies how many outstanding TFO requests to allow. Limit is 1092 * a defense against IP spoofing attacks as suggested in RFC7413. 1093 */ 1094 int qlen; 1095 1096 #ifdef __APPLE__ 1097 /* macOS X implementation only supports qlen of 1 via this call. The 1098 * actual value is configured by the net.inet.tcp.fastopen_backlog 1099 * kernel parameter. 1100 */ 1101 qlen = 1; 1102 #else 1103 /* 5 is recommended on Linux. */ 1104 qlen = 5; 1105 #endif 1106 if (0 == setsockopt( 1107 sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen))) 1108 { 1109 return 1; 1110 } 1111 1112 if (errno == EPERM) { 1113 log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s " 1114 "; this could likely be because sysctl " 1115 "net.inet.tcp.fastopen.enabled, " 1116 "net.inet.tcp.fastopen.server_enable, or " 1117 "net.ipv4.tcp_fastopen is disabled", 1118 strerror(errno)); 1119 /* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support 1120 * disabled, except when verbosity enabled for debugging 1121 */ 1122 } else if(errno != ENOPROTOOPT || verbosity >= 3) { 1123 log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s", 1124 strerror(errno)); 1125 } 1126 1127 return (errno == ENOPROTOOPT ? 0 : -1); 1128 } 1129 #endif /* USE_TCP_FASTOPEN */ 1130 1131 static int 1132 set_bindtodevice(struct nsd_socket *sock) 1133 { 1134 #if defined(SO_BINDTODEVICE) 1135 if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE, 1136 sock->device, strlen(sock->device)) == -1) 1137 { 1138 log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s", 1139 "SO_BINDTODEVICE", sock->device, strerror(errno)); 1140 return -1; 1141 } 1142 1143 return 1; 1144 #else 1145 (void)sock; 1146 return 0; 1147 #endif 1148 } 1149 1150 static int 1151 set_setfib(struct nsd_socket *sock) 1152 { 1153 #if defined(SO_SETFIB) 1154 if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB, 1155 (const void *)&sock->fib, sizeof(sock->fib)) == -1) 1156 { 1157 log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s", 1158 "SO_SETFIB", sock->fib, strerror(errno)); 1159 return -1; 1160 } 1161 1162 return 1; 1163 #else 1164 (void)sock; 1165 return 0; 1166 #endif 1167 } 1168 1169 static int 1170 open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works) 1171 { 1172 int rcv = 1*1024*1024, snd = 1*1024*1024; 1173 1174 if(-1 == (sock->s = socket( 1175 sock->addr.ai_family, sock->addr.ai_socktype, 0))) 1176 { 1177 #ifdef INET6 1178 if((sock->flags & NSD_SOCKET_IS_OPTIONAL) && 1179 (sock->addr.ai_family == AF_INET6) && 1180 (errno == EAFNOSUPPORT)) 1181 { 1182 log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: " 1183 "not supported"); 1184 return 0; 1185 } 1186 #endif 1187 log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno)); 1188 return -1; 1189 } 1190 1191 set_cloexec(sock); 1192 1193 if(nsd->reuseport && reuseport_works && *reuseport_works) 1194 *reuseport_works = (set_reuseport(sock) == 1); 1195 1196 if(nsd->options->receive_buffer_size > 0) 1197 rcv = nsd->options->receive_buffer_size; 1198 if(set_rcvbuf(sock, rcv) == -1) 1199 return -1; 1200 1201 if(nsd->options->send_buffer_size > 0) 1202 snd = nsd->options->send_buffer_size; 1203 if(set_sndbuf(sock, snd) == -1) 1204 return -1; 1205 #ifdef INET6 1206 if(sock->addr.ai_family == AF_INET6) { 1207 if(set_ipv6_v6only(sock) == -1 || 1208 set_ipv6_use_min_mtu(sock) == -1) 1209 return -1; 1210 } else 1211 #endif /* INET6 */ 1212 if(sock->addr.ai_family == AF_INET) { 1213 if(set_ipv4_no_pmtu_disc(sock) == -1) 1214 return -1; 1215 } 1216 1217 /* Set socket to non-blocking. Otherwise, on operating systems 1218 * with thundering herd problems, the UDP recv could block 1219 * after select returns readable. 1220 */ 1221 set_nonblock(sock); 1222 1223 if(nsd->options->ip_freebind) 1224 (void)set_ip_freebind(sock); 1225 if(nsd->options->ip_transparent) 1226 (void)set_ip_transparent(sock); 1227 if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1) 1228 return -1; 1229 if(sock->fib != -1 && set_setfib(sock) == -1) 1230 return -1; 1231 1232 if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) { 1233 char buf[256]; 1234 addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf)); 1235 log_msg(LOG_ERR, "can't bind udp socket %s: %s", 1236 buf, strerror(errno)); 1237 return -1; 1238 } 1239 1240 return 1; 1241 } 1242 1243 static int 1244 open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works) 1245 { 1246 #ifdef USE_TCP_FASTOPEN 1247 report_tcp_fastopen_config(); 1248 #endif 1249 1250 (void)reuseport_works; 1251 1252 if(-1 == (sock->s = socket( 1253 sock->addr.ai_family, sock->addr.ai_socktype, 0))) 1254 { 1255 #ifdef INET6 1256 if((sock->flags & NSD_SOCKET_IS_OPTIONAL) && 1257 (sock->addr.ai_family == AF_INET6) && 1258 (errno == EAFNOSUPPORT)) 1259 { 1260 log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: " 1261 "not supported"); 1262 return 0; 1263 } 1264 #endif /* INET6 */ 1265 log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno)); 1266 return -1; 1267 } 1268 1269 set_cloexec(sock); 1270 1271 if(nsd->reuseport && reuseport_works && *reuseport_works) 1272 *reuseport_works = (set_reuseport(sock) == 1); 1273 1274 (void)set_reuseaddr(sock); 1275 1276 #ifdef INET6 1277 if(sock->addr.ai_family == AF_INET6) { 1278 if (set_ipv6_v6only(sock) == -1 || 1279 set_ipv6_use_min_mtu(sock) == -1) 1280 return -1; 1281 } 1282 #endif 1283 1284 if(nsd->tcp_mss > 0) 1285 set_tcp_maxseg(sock, nsd->tcp_mss); 1286 /* (StevensUNP p463), if TCP listening socket is blocking, then 1287 it may block in accept, even if select() says readable. */ 1288 (void)set_nonblock(sock); 1289 if(nsd->options->ip_freebind) 1290 (void)set_ip_freebind(sock); 1291 if(nsd->options->ip_transparent) 1292 (void)set_ip_transparent(sock); 1293 if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1) 1294 return -1; 1295 if(sock->fib != -1 && set_setfib(sock) == -1) 1296 return -1; 1297 1298 if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) { 1299 char buf[256]; 1300 addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf)); 1301 log_msg(LOG_ERR, "can't bind tcp socket %s: %s", 1302 buf, strerror(errno)); 1303 return -1; 1304 } 1305 1306 #ifdef USE_TCP_FASTOPEN 1307 (void)set_tcp_fastopen(sock); 1308 #endif 1309 1310 if(listen(sock->s, TCP_BACKLOG) == -1) { 1311 log_msg(LOG_ERR, "can't listen: %s", strerror(errno)); 1312 return -1; 1313 } 1314 1315 return 1; 1316 } 1317 1318 /* 1319 * Initialize the server, reuseport, create and bind the sockets. 1320 */ 1321 int 1322 server_init(struct nsd *nsd) 1323 { 1324 size_t i; 1325 int reuseport = 1; /* Determine if REUSEPORT works. */ 1326 1327 /* open server interface ports */ 1328 for(i = 0; i < nsd->ifs; i++) { 1329 if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 || 1330 open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1) 1331 { 1332 return -1; 1333 } 1334 } 1335 1336 if(nsd->reuseport && reuseport) { 1337 size_t ifs = nsd->ifs * nsd->reuseport; 1338 1339 /* increase the size of the interface arrays, there are going 1340 * to be separate interface file descriptors for every server 1341 * instance */ 1342 region_remove_cleanup(nsd->region, free, nsd->udp); 1343 region_remove_cleanup(nsd->region, free, nsd->tcp); 1344 1345 nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp)); 1346 nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp)); 1347 region_add_cleanup(nsd->region, free, nsd->udp); 1348 region_add_cleanup(nsd->region, free, nsd->tcp); 1349 if(ifs > nsd->ifs) { 1350 memset(&nsd->udp[nsd->ifs], 0, 1351 (ifs-nsd->ifs)*sizeof(*nsd->udp)); 1352 memset(&nsd->tcp[nsd->ifs], 0, 1353 (ifs-nsd->ifs)*sizeof(*nsd->tcp)); 1354 } 1355 1356 for(i = nsd->ifs; i < ifs; i++) { 1357 nsd->udp[i] = nsd->udp[i%nsd->ifs]; 1358 nsd->udp[i].s = -1; 1359 if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) { 1360 return -1; 1361 } 1362 /* Turn off REUSEPORT for TCP by copying the socket 1363 * file descriptor. 1364 * This means we should not close TCP used by 1365 * other servers in reuseport enabled mode, in 1366 * server_child(). 1367 */ 1368 nsd->tcp[i] = nsd->tcp[i%nsd->ifs]; 1369 } 1370 1371 nsd->ifs = ifs; 1372 } else { 1373 nsd->reuseport = 0; 1374 } 1375 1376 return 0; 1377 } 1378 1379 /* 1380 * Prepare the server for take off. 1381 * 1382 */ 1383 int 1384 server_prepare(struct nsd *nsd) 1385 { 1386 #ifdef RATELIMIT 1387 /* set secret modifier for hashing (udb ptr buckets and rate limits) */ 1388 #ifdef HAVE_GETRANDOM 1389 uint32_t v; 1390 if(getrandom(&v, sizeof(v), 0) == -1) { 1391 log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno)); 1392 exit(1); 1393 } 1394 hash_set_raninit(v); 1395 #elif defined(HAVE_ARC4RANDOM) 1396 hash_set_raninit(arc4random()); 1397 #else 1398 uint32_t v = getpid() ^ time(NULL); 1399 srandom((unsigned long)v); 1400 # ifdef HAVE_SSL 1401 if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0) 1402 hash_set_raninit(v); 1403 else 1404 # endif 1405 hash_set_raninit(random()); 1406 #endif 1407 rrl_mmap_init(nsd->child_count, nsd->options->rrl_size, 1408 nsd->options->rrl_ratelimit, 1409 nsd->options->rrl_whitelist_ratelimit, 1410 nsd->options->rrl_slip, 1411 nsd->options->rrl_ipv4_prefix_length, 1412 nsd->options->rrl_ipv6_prefix_length); 1413 #endif /* RATELIMIT */ 1414 1415 /* Open the database... */ 1416 if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) { 1417 log_msg(LOG_ERR, "unable to open the database %s: %s", 1418 nsd->dbfile, strerror(errno)); 1419 unlink(nsd->task[0]->fname); 1420 unlink(nsd->task[1]->fname); 1421 #ifdef USE_ZONE_STATS 1422 unlink(nsd->zonestatfname[0]); 1423 unlink(nsd->zonestatfname[1]); 1424 #endif 1425 xfrd_del_tempdir(nsd); 1426 return -1; 1427 } 1428 /* check if zone files have been modified */ 1429 /* NULL for taskudb because we send soainfo in a moment, batched up, 1430 * for all zones */ 1431 if(nsd->options->zonefiles_check || (nsd->options->database == NULL || 1432 nsd->options->database[0] == 0)) 1433 namedb_check_zonefiles(nsd, nsd->options, NULL, NULL); 1434 zonestatid_tree_set(nsd); 1435 1436 compression_table_capacity = 0; 1437 initialize_dname_compression_tables(nsd); 1438 1439 #ifdef BIND8_STATS 1440 /* Initialize times... */ 1441 time(&nsd->st.boot); 1442 set_bind8_alarm(nsd); 1443 #endif /* BIND8_STATS */ 1444 1445 return 0; 1446 } 1447 1448 /* 1449 * Fork the required number of servers. 1450 */ 1451 static int 1452 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio, 1453 int* xfrd_sock_p) 1454 { 1455 size_t i; 1456 1457 /* Start all child servers initially. */ 1458 for (i = 0; i < nsd->child_count; ++i) { 1459 nsd->children[i].pid = 0; 1460 } 1461 1462 return restart_child_servers(nsd, region, netio, xfrd_sock_p); 1463 } 1464 1465 static void 1466 server_close_socket(struct nsd_socket *sock) 1467 { 1468 if(sock->s != -1) { 1469 close(sock->s); 1470 sock->s = -1; 1471 } 1472 } 1473 1474 void 1475 server_close_all_sockets(struct nsd_socket sockets[], size_t n) 1476 { 1477 size_t i; 1478 1479 /* Close all the sockets... */ 1480 for (i = 0; i < n; ++i) { 1481 server_close_socket(&sockets[i]); 1482 } 1483 } 1484 1485 /* 1486 * Close the sockets, shutdown the server and exit. 1487 * Does not return. 1488 */ 1489 void 1490 server_shutdown(struct nsd *nsd) 1491 { 1492 size_t i; 1493 1494 server_close_all_sockets(nsd->udp, nsd->ifs); 1495 server_close_all_sockets(nsd->tcp, nsd->ifs); 1496 /* CHILD: close command channel to parent */ 1497 if(nsd->this_child && nsd->this_child->parent_fd != -1) 1498 { 1499 close(nsd->this_child->parent_fd); 1500 nsd->this_child->parent_fd = -1; 1501 } 1502 /* SERVER: close command channels to children */ 1503 if(!nsd->this_child) 1504 { 1505 for(i=0; i < nsd->child_count; ++i) 1506 if(nsd->children[i].child_fd != -1) 1507 { 1508 close(nsd->children[i].child_fd); 1509 nsd->children[i].child_fd = -1; 1510 } 1511 } 1512 1513 tsig_finalize(); 1514 #ifdef HAVE_SSL 1515 daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */ 1516 if (nsd->tls_ctx) 1517 SSL_CTX_free(nsd->tls_ctx); 1518 #endif 1519 1520 #ifdef MEMCLEAN /* OS collects memory pages */ 1521 #ifdef RATELIMIT 1522 rrl_mmap_deinit_keep_mmap(); 1523 #endif 1524 #ifdef USE_DNSTAP 1525 dt_collector_destroy(nsd->dt_collector, nsd); 1526 #endif 1527 udb_base_free_keep_mmap(nsd->task[0]); 1528 udb_base_free_keep_mmap(nsd->task[1]); 1529 namedb_close_udb(nsd->db); /* keeps mmap */ 1530 namedb_close(nsd->db); 1531 nsd_options_destroy(nsd->options); 1532 region_destroy(nsd->region); 1533 #endif 1534 log_finalize(); 1535 exit(0); 1536 } 1537 1538 void 1539 server_prepare_xfrd(struct nsd* nsd) 1540 { 1541 char tmpfile[256]; 1542 /* create task mmaps */ 1543 nsd->mytask = 0; 1544 snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0", 1545 nsd->options->xfrdir, (int)getpid(), (unsigned)getpid()); 1546 nsd->task[0] = task_file_create(tmpfile); 1547 if(!nsd->task[0]) { 1548 #ifdef USE_ZONE_STATS 1549 unlink(nsd->zonestatfname[0]); 1550 unlink(nsd->zonestatfname[1]); 1551 #endif 1552 xfrd_del_tempdir(nsd); 1553 exit(1); 1554 } 1555 snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1", 1556 nsd->options->xfrdir, (int)getpid(), (unsigned)getpid()); 1557 nsd->task[1] = task_file_create(tmpfile); 1558 if(!nsd->task[1]) { 1559 unlink(nsd->task[0]->fname); 1560 #ifdef USE_ZONE_STATS 1561 unlink(nsd->zonestatfname[0]); 1562 unlink(nsd->zonestatfname[1]); 1563 #endif 1564 xfrd_del_tempdir(nsd); 1565 exit(1); 1566 } 1567 assert(udb_base_get_userdata(nsd->task[0])->data == 0); 1568 assert(udb_base_get_userdata(nsd->task[1])->data == 0); 1569 /* create xfrd listener structure */ 1570 nsd->xfrd_listener = region_alloc(nsd->region, 1571 sizeof(netio_handler_type)); 1572 nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*) 1573 region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data)); 1574 nsd->xfrd_listener->fd = -1; 1575 ((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd = 1576 nsd; 1577 ((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn = 1578 xfrd_tcp_create(nsd->region, QIOBUFSZ); 1579 } 1580 1581 1582 void 1583 server_start_xfrd(struct nsd *nsd, int del_db, int reload_active) 1584 { 1585 pid_t pid; 1586 int sockets[2] = {0,0}; 1587 struct ipc_handler_conn_data *data; 1588 1589 if(nsd->xfrd_listener->fd != -1) 1590 close(nsd->xfrd_listener->fd); 1591 if(del_db) { 1592 /* recreate taskdb that xfrd was using, it may be corrupt */ 1593 /* we (or reload) use nsd->mytask, and xfrd uses the other */ 1594 char* tmpfile = nsd->task[1-nsd->mytask]->fname; 1595 nsd->task[1-nsd->mytask]->fname = NULL; 1596 /* free alloc already, so udb does not shrink itself */ 1597 udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc); 1598 nsd->task[1-nsd->mytask]->alloc = NULL; 1599 udb_base_free(nsd->task[1-nsd->mytask]); 1600 /* create new file, overwrite the old one */ 1601 nsd->task[1-nsd->mytask] = task_file_create(tmpfile); 1602 free(tmpfile); 1603 } 1604 if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) { 1605 log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno)); 1606 return; 1607 } 1608 pid = fork(); 1609 switch (pid) { 1610 case -1: 1611 log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno)); 1612 break; 1613 default: 1614 /* PARENT: close first socket, use second one */ 1615 close(sockets[0]); 1616 if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) { 1617 log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); 1618 } 1619 if(del_db) xfrd_free_namedb(nsd); 1620 /* use other task than I am using, since if xfrd died and is 1621 * restarted, the reload is using nsd->mytask */ 1622 nsd->mytask = 1 - nsd->mytask; 1623 1624 #ifdef HAVE_SETPROCTITLE 1625 setproctitle("xfrd"); 1626 #endif 1627 #ifdef HAVE_CPUSET_T 1628 if(nsd->use_cpu_affinity) { 1629 set_cpu_affinity(nsd->xfrd_cpuset); 1630 } 1631 #endif 1632 1633 xfrd_init(sockets[1], nsd, del_db, reload_active, pid); 1634 /* ENOTREACH */ 1635 break; 1636 case 0: 1637 /* CHILD: close second socket, use first one */ 1638 close(sockets[1]); 1639 if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) { 1640 log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); 1641 } 1642 nsd->xfrd_listener->fd = sockets[0]; 1643 break; 1644 } 1645 /* server-parent only */ 1646 nsd->xfrd_listener->timeout = NULL; 1647 nsd->xfrd_listener->event_types = NETIO_EVENT_READ; 1648 nsd->xfrd_listener->event_handler = parent_handle_xfrd_command; 1649 /* clear ongoing ipc reads */ 1650 data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data; 1651 data->conn->is_reading = 0; 1652 } 1653 1654 /** add all soainfo to taskdb */ 1655 static void 1656 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb) 1657 { 1658 struct radnode* n; 1659 udb_ptr task_last; /* last task, mytask is empty so NULL */ 1660 /* add all SOA INFO to mytask */ 1661 udb_ptr_init(&task_last, taskudb); 1662 for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) { 1663 task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0); 1664 } 1665 udb_ptr_unlink(&task_last, taskudb); 1666 } 1667 1668 void 1669 server_send_soa_xfrd(struct nsd* nsd, int shortsoa) 1670 { 1671 /* normally this exchanges the SOA from nsd->xfrd and the expire back. 1672 * parent fills one taskdb with soas, xfrd fills other with expires. 1673 * then they exchange and process. 1674 * shortsoa: xfrd crashes and needs to be restarted and one taskdb 1675 * may be in use by reload. Fill SOA in taskdb and give to xfrd. 1676 * expire notifications can be sent back via a normal reload later 1677 * (xfrd will wait for current running reload to finish if any). 1678 */ 1679 sig_atomic_t cmd = 0; 1680 pid_t mypid; 1681 int xfrd_sock = nsd->xfrd_listener->fd; 1682 struct udb_base* taskudb = nsd->task[nsd->mytask]; 1683 udb_ptr t; 1684 if(!shortsoa) { 1685 if(nsd->signal_hint_shutdown) { 1686 shutdown: 1687 log_msg(LOG_WARNING, "signal received, shutting down..."); 1688 server_close_all_sockets(nsd->udp, nsd->ifs); 1689 server_close_all_sockets(nsd->tcp, nsd->ifs); 1690 #ifdef HAVE_SSL 1691 daemon_remote_close(nsd->rc); 1692 #endif 1693 /* Unlink it if possible... */ 1694 unlinkpid(nsd->pidfile); 1695 unlink(nsd->task[0]->fname); 1696 unlink(nsd->task[1]->fname); 1697 #ifdef USE_ZONE_STATS 1698 unlink(nsd->zonestatfname[0]); 1699 unlink(nsd->zonestatfname[1]); 1700 #endif 1701 /* write the nsd.db to disk, wait for it to complete */ 1702 udb_base_sync(nsd->db->udb, 1); 1703 udb_base_close(nsd->db->udb); 1704 server_shutdown(nsd); 1705 /* ENOTREACH */ 1706 exit(0); 1707 } 1708 } 1709 if(shortsoa) { 1710 /* put SOA in xfrd task because mytask may be in use */ 1711 taskudb = nsd->task[1-nsd->mytask]; 1712 } 1713 1714 add_all_soa_to_task(nsd, taskudb); 1715 if(!shortsoa) { 1716 /* wait for xfrd to signal task is ready, RELOAD signal */ 1717 if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) || 1718 cmd != NSD_RELOAD) { 1719 log_msg(LOG_ERR, "did not get start signal from xfrd"); 1720 exit(1); 1721 } 1722 if(nsd->signal_hint_shutdown) { 1723 goto shutdown; 1724 } 1725 } 1726 /* give xfrd our task, signal it with RELOAD_DONE */ 1727 task_process_sync(taskudb); 1728 cmd = NSD_RELOAD_DONE; 1729 if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) { 1730 log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s", 1731 (int)nsd->pid, strerror(errno)); 1732 } 1733 mypid = getpid(); 1734 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 1735 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 1736 strerror(errno)); 1737 } 1738 1739 if(!shortsoa) { 1740 /* process the xfrd task works (expiry data) */ 1741 nsd->mytask = 1 - nsd->mytask; 1742 taskudb = nsd->task[nsd->mytask]; 1743 task_remap(taskudb); 1744 udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb)); 1745 while(!udb_ptr_is_null(&t)) { 1746 task_process_expire(nsd->db, TASKLIST(&t)); 1747 udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next); 1748 } 1749 udb_ptr_unlink(&t, taskudb); 1750 task_clear(taskudb); 1751 1752 /* tell xfrd that the task is emptied, signal with RELOAD_DONE */ 1753 cmd = NSD_RELOAD_DONE; 1754 if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) { 1755 log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s", 1756 (int)nsd->pid, strerror(errno)); 1757 } 1758 } 1759 } 1760 1761 #ifdef HAVE_SSL 1762 static void 1763 log_crypto_from_err(const char* str, unsigned long err) 1764 { 1765 /* error:[error code]:[library name]:[function name]:[reason string] */ 1766 char buf[128]; 1767 unsigned long e; 1768 ERR_error_string_n(err, buf, sizeof(buf)); 1769 log_msg(LOG_ERR, "%s crypto %s", str, buf); 1770 while( (e=ERR_get_error()) ) { 1771 ERR_error_string_n(e, buf, sizeof(buf)); 1772 log_msg(LOG_ERR, "and additionally crypto %s", buf); 1773 } 1774 } 1775 1776 void 1777 log_crypto_err(const char* str) 1778 { 1779 log_crypto_from_err(str, ERR_get_error()); 1780 } 1781 1782 /** true if the ssl handshake error has to be squelched from the logs */ 1783 static int 1784 squelch_err_ssl_handshake(unsigned long err) 1785 { 1786 if(verbosity >= 3) 1787 return 0; /* only squelch on low verbosity */ 1788 /* this is very specific, we could filter on ERR_GET_REASON() 1789 * (the third element in ERR_PACK) */ 1790 if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) || 1791 err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) || 1792 err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) || 1793 err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE) 1794 #ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO 1795 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER) 1796 #endif 1797 #ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO 1798 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL) 1799 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL) 1800 # ifdef SSL_R_VERSION_TOO_LOW 1801 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW) 1802 # endif 1803 #endif 1804 ) 1805 return 1; 1806 return 0; 1807 } 1808 1809 void 1810 perform_openssl_init(void) 1811 { 1812 /* init SSL library */ 1813 #ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS 1814 ERR_load_crypto_strings(); 1815 #endif 1816 ERR_load_SSL_strings(); 1817 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO) 1818 OpenSSL_add_all_algorithms(); 1819 #else 1820 OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS 1821 | OPENSSL_INIT_ADD_ALL_DIGESTS 1822 | OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL); 1823 #endif 1824 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL) 1825 (void)SSL_library_init(); 1826 #else 1827 OPENSSL_init_ssl(0, NULL); 1828 #endif 1829 1830 if(!RAND_status()) { 1831 /* try to seed it */ 1832 unsigned char buf[256]; 1833 unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid(); 1834 size_t i; 1835 v = seed; 1836 for(i=0; i<256/sizeof(v); i++) { 1837 memmove(buf+i*sizeof(v), &v, sizeof(v)); 1838 v = v*seed + (unsigned int)i; 1839 } 1840 RAND_seed(buf, 256); 1841 log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time"); 1842 } 1843 } 1844 1845 static int 1846 get_ocsp(char *filename, unsigned char **ocsp) 1847 { 1848 BIO *bio; 1849 OCSP_RESPONSE *response; 1850 int len = -1; 1851 unsigned char *p, *buf; 1852 assert(filename); 1853 1854 if ((bio = BIO_new_file(filename, "r")) == NULL) { 1855 log_crypto_err("get_ocsp: BIO_new_file failed"); 1856 return -1; 1857 } 1858 1859 if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) { 1860 log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed"); 1861 BIO_free(bio); 1862 return -1; 1863 } 1864 1865 if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) { 1866 log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed"); 1867 OCSP_RESPONSE_free(response); 1868 BIO_free(bio); 1869 return -1; 1870 } 1871 1872 if ((buf = malloc((size_t) len)) == NULL) { 1873 log_msg(LOG_ERR, "get_ocsp: malloc failed"); 1874 OCSP_RESPONSE_free(response); 1875 BIO_free(bio); 1876 return -1; 1877 } 1878 1879 p = buf; 1880 if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) { 1881 log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed"); 1882 free(buf); 1883 OCSP_RESPONSE_free(response); 1884 BIO_free(bio); 1885 return -1; 1886 } 1887 1888 OCSP_RESPONSE_free(response); 1889 BIO_free(bio); 1890 1891 *ocsp = buf; 1892 return len; 1893 } 1894 1895 /* further setup ssl ctx after the keys are loaded */ 1896 static void 1897 listen_sslctx_setup_2(void* ctxt) 1898 { 1899 SSL_CTX* ctx = (SSL_CTX*)ctxt; 1900 (void)ctx; 1901 #if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO 1902 if(!SSL_CTX_set_ecdh_auto(ctx,1)) { 1903 /* ENOTREACH */ 1904 log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE"); 1905 } 1906 #elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME) 1907 if(1) { 1908 EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1); 1909 if (!ecdh) { 1910 log_crypto_err("could not find p256, not enabling ECDHE"); 1911 } else { 1912 if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) { 1913 log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE"); 1914 } 1915 EC_KEY_free (ecdh); 1916 } 1917 } 1918 #endif 1919 } 1920 1921 static int 1922 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg)) 1923 { 1924 if(ocspdata) { 1925 unsigned char *p; 1926 if ((p=malloc(ocspdata_len)) == NULL) { 1927 log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure"); 1928 return SSL_TLSEXT_ERR_NOACK; 1929 } 1930 memcpy(p, ocspdata, ocspdata_len); 1931 if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) { 1932 log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp"); 1933 free(p); 1934 return SSL_TLSEXT_ERR_NOACK; 1935 } 1936 return SSL_TLSEXT_ERR_OK; 1937 } else { 1938 return SSL_TLSEXT_ERR_NOACK; 1939 } 1940 } 1941 1942 SSL_CTX* 1943 server_tls_ctx_setup(char* key, char* pem, char* verifypem) 1944 { 1945 SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method()); 1946 if(!ctx) { 1947 log_crypto_err("could not SSL_CTX_new"); 1948 return NULL; 1949 } 1950 /* no SSLv2, SSLv3 because has defects */ 1951 #if SSL_OP_NO_SSLv2 != 0 1952 if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){ 1953 log_crypto_err("could not set SSL_OP_NO_SSLv2"); 1954 SSL_CTX_free(ctx); 1955 return NULL; 1956 } 1957 #endif 1958 if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3) 1959 != SSL_OP_NO_SSLv3){ 1960 log_crypto_err("could not set SSL_OP_NO_SSLv3"); 1961 SSL_CTX_free(ctx); 1962 return 0; 1963 } 1964 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1) 1965 /* if we have tls 1.1 disable 1.0 */ 1966 if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1) 1967 != SSL_OP_NO_TLSv1){ 1968 log_crypto_err("could not set SSL_OP_NO_TLSv1"); 1969 SSL_CTX_free(ctx); 1970 return 0; 1971 } 1972 #endif 1973 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2) 1974 /* if we have tls 1.2 disable 1.1 */ 1975 if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1) 1976 != SSL_OP_NO_TLSv1_1){ 1977 log_crypto_err("could not set SSL_OP_NO_TLSv1_1"); 1978 SSL_CTX_free(ctx); 1979 return 0; 1980 } 1981 #endif 1982 #if defined(SSL_OP_NO_RENEGOTIATION) 1983 /* disable client renegotiation */ 1984 if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) & 1985 SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) { 1986 log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION"); 1987 SSL_CTX_free(ctx); 1988 return 0; 1989 } 1990 #endif 1991 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20) 1992 /* if we have sha256, set the cipher list to have no known vulns */ 1993 if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20")) 1994 log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list"); 1995 #endif 1996 if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) & 1997 SSL_OP_CIPHER_SERVER_PREFERENCE) != 1998 SSL_OP_CIPHER_SERVER_PREFERENCE) { 1999 log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE"); 2000 SSL_CTX_free(ctx); 2001 return 0; 2002 } 2003 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL 2004 SSL_CTX_set_security_level(ctx, 0); 2005 #endif 2006 if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) { 2007 log_msg(LOG_ERR, "error for cert file: %s", pem); 2008 log_crypto_err("error in SSL_CTX use_certificate_chain_file"); 2009 SSL_CTX_free(ctx); 2010 return NULL; 2011 } 2012 if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) { 2013 log_msg(LOG_ERR, "error for private key file: %s", key); 2014 log_crypto_err("Error in SSL_CTX use_PrivateKey_file"); 2015 SSL_CTX_free(ctx); 2016 return NULL; 2017 } 2018 if(!SSL_CTX_check_private_key(ctx)) { 2019 log_msg(LOG_ERR, "error for key file: %s", key); 2020 log_crypto_err("Error in SSL_CTX check_private_key"); 2021 SSL_CTX_free(ctx); 2022 return NULL; 2023 } 2024 listen_sslctx_setup_2(ctx); 2025 if(verifypem && verifypem[0]) { 2026 if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) { 2027 log_crypto_err("Error in SSL_CTX verify locations"); 2028 SSL_CTX_free(ctx); 2029 return NULL; 2030 } 2031 SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem)); 2032 SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL); 2033 } 2034 return ctx; 2035 } 2036 2037 SSL_CTX* 2038 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile) 2039 { 2040 char *key, *pem; 2041 SSL_CTX *ctx; 2042 2043 key = nsd->options->tls_service_key; 2044 pem = nsd->options->tls_service_pem; 2045 if(!key || key[0] == 0) { 2046 log_msg(LOG_ERR, "error: no tls-service-key file specified"); 2047 return NULL; 2048 } 2049 if(!pem || pem[0] == 0) { 2050 log_msg(LOG_ERR, "error: no tls-service-pem file specified"); 2051 return NULL; 2052 } 2053 2054 /* NOTE:This mimics the existing code in Unbound 1.5.1 by supporting SSL but 2055 * raft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2*/ 2056 ctx = server_tls_ctx_setup(key, pem, verifypem); 2057 if(!ctx) { 2058 log_msg(LOG_ERR, "could not setup server TLS context"); 2059 return NULL; 2060 } 2061 if(ocspfile && ocspfile[0]) { 2062 if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) { 2063 log_crypto_err("Error reading OCSPfile"); 2064 SSL_CTX_free(ctx); 2065 return NULL; 2066 } else { 2067 VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile)); 2068 if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) { 2069 log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb"); 2070 SSL_CTX_free(ctx); 2071 return NULL; 2072 } 2073 } 2074 } 2075 return ctx; 2076 } 2077 2078 /* check if tcp_handler_accept_data created for TLS dedicated port */ 2079 int 2080 using_tls_port(struct sockaddr* addr, const char* tls_port) 2081 { 2082 in_port_t port = 0; 2083 2084 if (addr->sa_family == AF_INET) 2085 port = ((struct sockaddr_in*)addr)->sin_port; 2086 #ifndef HAVE_STRUCT_SOCKADDR_IN6 2087 else 2088 port = ((struct sockaddr_in6*)addr)->sin6_port; 2089 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */ 2090 if (atoi(tls_port) == ntohs(port)) 2091 return 1; 2092 2093 return 0; 2094 } 2095 #endif 2096 2097 /* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */ 2098 ssize_t 2099 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout) 2100 { 2101 uint8_t* buf = (uint8_t*) p; 2102 ssize_t total = 0; 2103 struct pollfd fd; 2104 memset(&fd, 0, sizeof(fd)); 2105 fd.fd = s; 2106 fd.events = POLLIN; 2107 2108 while( total < sz) { 2109 ssize_t ret; 2110 ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000); 2111 if(ret == -1) { 2112 if(errno == EAGAIN) 2113 /* blocking read */ 2114 continue; 2115 if(errno == EINTR) { 2116 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) 2117 return -1; 2118 /* other signals can be handled later */ 2119 continue; 2120 } 2121 /* some error */ 2122 return -1; 2123 } 2124 if(ret == 0) { 2125 /* operation timed out */ 2126 return -2; 2127 } 2128 ret = read(s, buf+total, sz-total); 2129 if(ret == -1) { 2130 if(errno == EAGAIN) 2131 /* blocking read */ 2132 continue; 2133 if(errno == EINTR) { 2134 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) 2135 return -1; 2136 /* other signals can be handled later */ 2137 continue; 2138 } 2139 /* some error */ 2140 return -1; 2141 } 2142 if(ret == 0) { 2143 /* closed connection! */ 2144 return 0; 2145 } 2146 total += ret; 2147 } 2148 return total; 2149 } 2150 2151 static void 2152 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket) 2153 { 2154 sig_atomic_t cmd = NSD_QUIT_SYNC; 2155 udb_ptr t, next; 2156 udb_base* u = nsd->task[nsd->mytask]; 2157 udb_ptr_init(&next, u); 2158 udb_ptr_new(&t, u, udb_base_get_userdata(u)); 2159 udb_base_set_userdata(u, 0); 2160 while(!udb_ptr_is_null(&t)) { 2161 /* store next in list so this one can be deleted or reused */ 2162 udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next); 2163 udb_rptr_zero(&TASKLIST(&t)->next, u); 2164 2165 /* process task t */ 2166 /* append results for task t and update last_task */ 2167 task_process_in_reload(nsd, u, last_task, &t); 2168 2169 /* go to next */ 2170 udb_ptr_set_ptr(&t, u, &next); 2171 2172 /* if the parent has quit, we must quit too, poll the fd for cmds */ 2173 if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) { 2174 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd)); 2175 if(cmd == NSD_QUIT) { 2176 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd")); 2177 /* sync to disk (if needed) */ 2178 udb_base_sync(nsd->db->udb, 0); 2179 /* unlink files of remainder of tasks */ 2180 while(!udb_ptr_is_null(&t)) { 2181 if(TASKLIST(&t)->task_type == task_apply_xfr) { 2182 xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno); 2183 } 2184 udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next); 2185 } 2186 udb_ptr_unlink(&t, u); 2187 udb_ptr_unlink(&next, u); 2188 exit(0); 2189 } 2190 } 2191 2192 } 2193 udb_ptr_unlink(&t, u); 2194 udb_ptr_unlink(&next, u); 2195 } 2196 2197 #ifdef BIND8_STATS 2198 static void 2199 parent_send_stats(struct nsd* nsd, int cmdfd) 2200 { 2201 size_t i; 2202 if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) { 2203 log_msg(LOG_ERR, "could not write stats to reload"); 2204 return; 2205 } 2206 for(i=0; i<nsd->child_count; i++) 2207 if(!write_socket(cmdfd, &nsd->children[i].query_count, 2208 sizeof(stc_type))) { 2209 log_msg(LOG_ERR, "could not write stats to reload"); 2210 return; 2211 } 2212 } 2213 2214 static void 2215 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last) 2216 { 2217 struct nsdst s; 2218 stc_type* p; 2219 size_t i; 2220 if(block_read(nsd, cmdfd, &s, sizeof(s), 2221 RELOAD_SYNC_TIMEOUT) != sizeof(s)) { 2222 log_msg(LOG_ERR, "could not read stats from oldpar"); 2223 return; 2224 } 2225 s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0); 2226 s.db_mem = region_get_mem(nsd->db->region); 2227 p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s, 2228 nsd->child_count); 2229 if(!p) return; 2230 for(i=0; i<nsd->child_count; i++) { 2231 if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!= 2232 sizeof(stc_type)) 2233 return; 2234 } 2235 } 2236 #endif /* BIND8_STATS */ 2237 2238 /* 2239 * Reload the database, stop parent, re-fork children and continue. 2240 * as server_main. 2241 */ 2242 static void 2243 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio, 2244 int cmdsocket) 2245 { 2246 pid_t mypid; 2247 sig_atomic_t cmd = NSD_QUIT_SYNC; 2248 int ret; 2249 udb_ptr last_task; 2250 struct sigaction old_sigchld, ign_sigchld; 2251 /* ignore SIGCHLD from the previous server_main that used this pid */ 2252 memset(&ign_sigchld, 0, sizeof(ign_sigchld)); 2253 ign_sigchld.sa_handler = SIG_IGN; 2254 sigaction(SIGCHLD, &ign_sigchld, &old_sigchld); 2255 2256 #ifdef HAVE_SETPROCTITLE 2257 setproctitle("main"); 2258 #endif 2259 #ifdef HAVE_CPUSET_T 2260 if(nsd->use_cpu_affinity) { 2261 set_cpu_affinity(nsd->cpuset); 2262 } 2263 #endif 2264 2265 /* see what tasks we got from xfrd */ 2266 task_remap(nsd->task[nsd->mytask]); 2267 udb_ptr_init(&last_task, nsd->task[nsd->mytask]); 2268 udb_compact_inhibited(nsd->db->udb, 1); 2269 reload_process_tasks(nsd, &last_task, cmdsocket); 2270 udb_compact_inhibited(nsd->db->udb, 0); 2271 udb_compact(nsd->db->udb); 2272 2273 #ifndef NDEBUG 2274 if(nsd_debug_level >= 1) 2275 region_log_stats(nsd->db->region); 2276 #endif /* NDEBUG */ 2277 /* sync to disk (if needed) */ 2278 udb_base_sync(nsd->db->udb, 0); 2279 2280 initialize_dname_compression_tables(nsd); 2281 2282 #ifdef BIND8_STATS 2283 /* Restart dumping stats if required. */ 2284 time(&nsd->st.boot); 2285 set_bind8_alarm(nsd); 2286 #endif 2287 #ifdef USE_ZONE_STATS 2288 server_zonestat_realloc(nsd); /* realloc for new children */ 2289 server_zonestat_switch(nsd); 2290 #endif 2291 2292 /* listen for the signals of failed children again */ 2293 sigaction(SIGCHLD, &old_sigchld, NULL); 2294 /* Start new child processes */ 2295 if (server_start_children(nsd, server_region, netio, &nsd-> 2296 xfrd_listener->fd) != 0) { 2297 send_children_quit(nsd); 2298 exit(1); 2299 } 2300 2301 /* if the parent has quit, we must quit too, poll the fd for cmds */ 2302 if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) { 2303 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd)); 2304 if(cmd == NSD_QUIT) { 2305 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd")); 2306 send_children_quit(nsd); 2307 exit(0); 2308 } 2309 } 2310 2311 /* Send quit command to parent: blocking, wait for receipt. */ 2312 do { 2313 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main")); 2314 if (!write_socket(cmdsocket, &cmd, sizeof(cmd))) 2315 { 2316 log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s", 2317 strerror(errno)); 2318 } 2319 /* blocking: wait for parent to really quit. (it sends RELOAD as ack) */ 2320 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main")); 2321 ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 2322 RELOAD_SYNC_TIMEOUT); 2323 if(ret == -2) { 2324 DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry")); 2325 } 2326 } while (ret == -2); 2327 if(ret == -1) { 2328 log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s", 2329 strerror(errno)); 2330 } 2331 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd)); 2332 if(cmd == NSD_QUIT) { 2333 /* small race condition possible here, parent got quit cmd. */ 2334 send_children_quit(nsd); 2335 exit(1); 2336 } 2337 assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD); 2338 #ifdef BIND8_STATS 2339 reload_do_stats(cmdsocket, nsd, &last_task); 2340 #endif 2341 udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]); 2342 task_process_sync(nsd->task[nsd->mytask]); 2343 #ifdef USE_ZONE_STATS 2344 server_zonestat_realloc(nsd); /* realloc for next children */ 2345 #endif 2346 2347 /* send soainfo to the xfrd process, signal it that reload is done, 2348 * it picks up the taskudb */ 2349 cmd = NSD_RELOAD_DONE; 2350 if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { 2351 log_msg(LOG_ERR, "problems sending reload_done xfrd: %s", 2352 strerror(errno)); 2353 } 2354 mypid = getpid(); 2355 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2356 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2357 strerror(errno)); 2358 } 2359 2360 /* try to reopen file */ 2361 if (nsd->file_rotation_ok) 2362 log_reopen(nsd->log_filename, 1); 2363 /* exit reload, continue as new server_main */ 2364 } 2365 2366 /* 2367 * Get the mode depending on the signal hints that have been received. 2368 * Multiple signal hints can be received and will be handled in turn. 2369 */ 2370 static sig_atomic_t 2371 server_signal_mode(struct nsd *nsd) 2372 { 2373 if(nsd->signal_hint_quit) { 2374 nsd->signal_hint_quit = 0; 2375 return NSD_QUIT; 2376 } 2377 else if(nsd->signal_hint_shutdown) { 2378 nsd->signal_hint_shutdown = 0; 2379 return NSD_SHUTDOWN; 2380 } 2381 else if(nsd->signal_hint_child) { 2382 nsd->signal_hint_child = 0; 2383 return NSD_REAP_CHILDREN; 2384 } 2385 else if(nsd->signal_hint_reload) { 2386 nsd->signal_hint_reload = 0; 2387 return NSD_RELOAD; 2388 } 2389 else if(nsd->signal_hint_reload_hup) { 2390 nsd->signal_hint_reload_hup = 0; 2391 return NSD_RELOAD_REQ; 2392 } 2393 else if(nsd->signal_hint_stats) { 2394 nsd->signal_hint_stats = 0; 2395 #ifdef BIND8_STATS 2396 set_bind8_alarm(nsd); 2397 #endif 2398 return NSD_STATS; 2399 } 2400 else if(nsd->signal_hint_statsusr) { 2401 nsd->signal_hint_statsusr = 0; 2402 return NSD_STATS; 2403 } 2404 return NSD_RUN; 2405 } 2406 2407 /* 2408 * The main server simply waits for signals and child processes to 2409 * terminate. Child processes are restarted as necessary. 2410 */ 2411 void 2412 server_main(struct nsd *nsd) 2413 { 2414 region_type *server_region = region_create(xalloc, free); 2415 netio_type *netio = netio_create(server_region); 2416 netio_handler_type reload_listener; 2417 int reload_sockets[2] = {-1, -1}; 2418 struct timespec timeout_spec; 2419 int status; 2420 pid_t child_pid; 2421 pid_t reload_pid = -1; 2422 sig_atomic_t mode; 2423 2424 /* Ensure we are the main process */ 2425 assert(nsd->server_kind == NSD_SERVER_MAIN); 2426 2427 /* Add listener for the XFRD process */ 2428 netio_add_handler(netio, nsd->xfrd_listener); 2429 2430 /* Start the child processes that handle incoming queries */ 2431 if (server_start_children(nsd, server_region, netio, 2432 &nsd->xfrd_listener->fd) != 0) { 2433 send_children_quit(nsd); 2434 exit(1); 2435 } 2436 reload_listener.fd = -1; 2437 2438 /* This_child MUST be 0, because this is the parent process */ 2439 assert(nsd->this_child == 0); 2440 2441 /* Run the server until we get a shutdown signal */ 2442 while ((mode = nsd->mode) != NSD_SHUTDOWN) { 2443 /* Did we receive a signal that changes our mode? */ 2444 if(mode == NSD_RUN) { 2445 nsd->mode = mode = server_signal_mode(nsd); 2446 } 2447 2448 switch (mode) { 2449 case NSD_RUN: 2450 /* see if any child processes terminated */ 2451 while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) { 2452 int is_child = delete_child_pid(nsd, child_pid); 2453 if (is_child != -1 && nsd->children[is_child].need_to_exit) { 2454 if(nsd->children[is_child].child_fd == -1) 2455 nsd->children[is_child].has_exited = 1; 2456 parent_check_all_children_exited(nsd); 2457 } else if(is_child != -1) { 2458 log_msg(LOG_WARNING, 2459 "server %d died unexpectedly with status %d, restarting", 2460 (int) child_pid, status); 2461 restart_child_servers(nsd, server_region, netio, 2462 &nsd->xfrd_listener->fd); 2463 } else if (child_pid == reload_pid) { 2464 sig_atomic_t cmd = NSD_RELOAD_DONE; 2465 pid_t mypid; 2466 log_msg(LOG_WARNING, 2467 "Reload process %d failed with status %d, continuing with old database", 2468 (int) child_pid, status); 2469 reload_pid = -1; 2470 if(reload_listener.fd != -1) close(reload_listener.fd); 2471 reload_listener.fd = -1; 2472 reload_listener.event_types = NETIO_EVENT_NONE; 2473 task_process_sync(nsd->task[nsd->mytask]); 2474 /* inform xfrd reload attempt ended */ 2475 if(!write_socket(nsd->xfrd_listener->fd, 2476 &cmd, sizeof(cmd))) { 2477 log_msg(LOG_ERR, "problems " 2478 "sending SOAEND to xfrd: %s", 2479 strerror(errno)); 2480 } 2481 mypid = getpid(); 2482 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2483 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2484 strerror(errno)); 2485 } 2486 } else if(status != 0) { 2487 /* check for status, because we get 2488 * the old-servermain because reload 2489 * is the process-parent of old-main, 2490 * and we get older server-processes 2491 * that are exiting after a reload */ 2492 log_msg(LOG_WARNING, 2493 "process %d terminated with status %d", 2494 (int) child_pid, status); 2495 } 2496 } 2497 if (child_pid == -1) { 2498 if (errno == EINTR) { 2499 continue; 2500 } 2501 if (errno != ECHILD) 2502 log_msg(LOG_WARNING, "wait failed: %s", strerror(errno)); 2503 } 2504 if (nsd->mode != NSD_RUN) 2505 break; 2506 2507 /* timeout to collect processes. In case no sigchild happens. */ 2508 timeout_spec.tv_sec = 60; 2509 timeout_spec.tv_nsec = 0; 2510 2511 /* listen on ports, timeout for collecting terminated children */ 2512 if(netio_dispatch(netio, &timeout_spec, 0) == -1) { 2513 if (errno != EINTR) { 2514 log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno)); 2515 } 2516 } 2517 if(nsd->restart_children) { 2518 restart_child_servers(nsd, server_region, netio, 2519 &nsd->xfrd_listener->fd); 2520 nsd->restart_children = 0; 2521 } 2522 if(nsd->reload_failed) { 2523 sig_atomic_t cmd = NSD_RELOAD_DONE; 2524 pid_t mypid; 2525 nsd->reload_failed = 0; 2526 log_msg(LOG_WARNING, 2527 "Reload process %d failed, continuing with old database", 2528 (int) reload_pid); 2529 reload_pid = -1; 2530 if(reload_listener.fd != -1) close(reload_listener.fd); 2531 reload_listener.fd = -1; 2532 reload_listener.event_types = NETIO_EVENT_NONE; 2533 task_process_sync(nsd->task[nsd->mytask]); 2534 /* inform xfrd reload attempt ended */ 2535 if(!write_socket(nsd->xfrd_listener->fd, 2536 &cmd, sizeof(cmd))) { 2537 log_msg(LOG_ERR, "problems " 2538 "sending SOAEND to xfrd: %s", 2539 strerror(errno)); 2540 } 2541 mypid = getpid(); 2542 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2543 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2544 strerror(errno)); 2545 } 2546 } 2547 2548 break; 2549 case NSD_RELOAD_REQ: { 2550 sig_atomic_t cmd = NSD_RELOAD_REQ; 2551 log_msg(LOG_WARNING, "SIGHUP received, reloading..."); 2552 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2553 "main: ipc send reload_req to xfrd")); 2554 if(!write_socket(nsd->xfrd_listener->fd, 2555 &cmd, sizeof(cmd))) { 2556 log_msg(LOG_ERR, "server_main: could not send " 2557 "reload_req to xfrd: %s", strerror(errno)); 2558 } 2559 nsd->mode = NSD_RUN; 2560 } break; 2561 case NSD_RELOAD: 2562 /* Continue to run nsd after reload */ 2563 nsd->mode = NSD_RUN; 2564 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading...")); 2565 if (reload_pid != -1) { 2566 log_msg(LOG_WARNING, "Reload already in progress (pid = %d)", 2567 (int) reload_pid); 2568 break; 2569 } 2570 2571 /* switch the mytask to keep track of who owns task*/ 2572 nsd->mytask = 1 - nsd->mytask; 2573 if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) { 2574 log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno)); 2575 reload_pid = -1; 2576 break; 2577 } 2578 2579 /* Do actual reload */ 2580 reload_pid = fork(); 2581 switch (reload_pid) { 2582 case -1: 2583 log_msg(LOG_ERR, "fork failed: %s", strerror(errno)); 2584 break; 2585 default: 2586 /* PARENT */ 2587 close(reload_sockets[0]); 2588 server_reload(nsd, server_region, netio, 2589 reload_sockets[1]); 2590 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main")); 2591 close(reload_sockets[1]); 2592 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed")); 2593 /* drop stale xfrd ipc data */ 2594 ((struct ipc_handler_conn_data*)nsd-> 2595 xfrd_listener->user_data) 2596 ->conn->is_reading = 0; 2597 reload_pid = -1; 2598 reload_listener.fd = -1; 2599 reload_listener.event_types = NETIO_EVENT_NONE; 2600 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run")); 2601 break; 2602 case 0: 2603 /* CHILD */ 2604 /* server_main keep running until NSD_QUIT_SYNC 2605 * received from reload. */ 2606 close(reload_sockets[1]); 2607 reload_listener.fd = reload_sockets[0]; 2608 reload_listener.timeout = NULL; 2609 reload_listener.user_data = nsd; 2610 reload_listener.event_types = NETIO_EVENT_READ; 2611 reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */ 2612 netio_add_handler(netio, &reload_listener); 2613 reload_pid = getppid(); 2614 break; 2615 } 2616 break; 2617 case NSD_QUIT_SYNC: 2618 /* synchronisation of xfrd, parent and reload */ 2619 if(!nsd->quit_sync_done && reload_listener.fd != -1) { 2620 sig_atomic_t cmd = NSD_RELOAD; 2621 /* stop xfrd ipc writes in progress */ 2622 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2623 "main: ipc send indication reload")); 2624 if(!write_socket(nsd->xfrd_listener->fd, 2625 &cmd, sizeof(cmd))) { 2626 log_msg(LOG_ERR, "server_main: could not send reload " 2627 "indication to xfrd: %s", strerror(errno)); 2628 } 2629 /* wait for ACK from xfrd */ 2630 DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd")); 2631 nsd->quit_sync_done = 1; 2632 } 2633 nsd->mode = NSD_RUN; 2634 break; 2635 case NSD_QUIT: 2636 /* silent shutdown during reload */ 2637 if(reload_listener.fd != -1) { 2638 /* acknowledge the quit, to sync reload that we will really quit now */ 2639 sig_atomic_t cmd = NSD_RELOAD; 2640 DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload")); 2641 if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) { 2642 log_msg(LOG_ERR, "server_main: " 2643 "could not ack quit: %s", strerror(errno)); 2644 } 2645 #ifdef BIND8_STATS 2646 parent_send_stats(nsd, reload_listener.fd); 2647 #endif /* BIND8_STATS */ 2648 close(reload_listener.fd); 2649 } 2650 DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence")); 2651 /* only quit children after xfrd has acked */ 2652 send_children_quit(nsd); 2653 2654 #ifdef MEMCLEAN /* OS collects memory pages */ 2655 region_destroy(server_region); 2656 #endif 2657 server_shutdown(nsd); 2658 2659 /* ENOTREACH */ 2660 break; 2661 case NSD_SHUTDOWN: 2662 break; 2663 case NSD_REAP_CHILDREN: 2664 /* continue; wait for child in run loop */ 2665 nsd->mode = NSD_RUN; 2666 break; 2667 case NSD_STATS: 2668 #ifdef BIND8_STATS 2669 set_children_stats(nsd); 2670 #endif 2671 nsd->mode = NSD_RUN; 2672 break; 2673 default: 2674 log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode); 2675 nsd->mode = NSD_RUN; 2676 break; 2677 } 2678 } 2679 log_msg(LOG_WARNING, "signal received, shutting down..."); 2680 2681 /* close opened ports to avoid race with restart of nsd */ 2682 server_close_all_sockets(nsd->udp, nsd->ifs); 2683 server_close_all_sockets(nsd->tcp, nsd->ifs); 2684 #ifdef HAVE_SSL 2685 daemon_remote_close(nsd->rc); 2686 #endif 2687 send_children_quit_and_wait(nsd); 2688 2689 /* Unlink it if possible... */ 2690 unlinkpid(nsd->pidfile); 2691 unlink(nsd->task[0]->fname); 2692 unlink(nsd->task[1]->fname); 2693 #ifdef USE_ZONE_STATS 2694 unlink(nsd->zonestatfname[0]); 2695 unlink(nsd->zonestatfname[1]); 2696 #endif 2697 #ifdef USE_DNSTAP 2698 dt_collector_close(nsd->dt_collector, nsd); 2699 #endif 2700 2701 if(reload_listener.fd != -1) { 2702 sig_atomic_t cmd = NSD_QUIT; 2703 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2704 "main: ipc send quit to reload-process")); 2705 if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) { 2706 log_msg(LOG_ERR, "server_main: could not send quit to reload: %s", 2707 strerror(errno)); 2708 } 2709 fsync(reload_listener.fd); 2710 close(reload_listener.fd); 2711 /* wait for reload to finish processing */ 2712 while(1) { 2713 if(waitpid(reload_pid, NULL, 0) == -1) { 2714 if(errno == EINTR) continue; 2715 if(errno == ECHILD) break; 2716 log_msg(LOG_ERR, "waitpid(reload %d): %s", 2717 (int)reload_pid, strerror(errno)); 2718 } 2719 break; 2720 } 2721 } 2722 if(nsd->xfrd_listener->fd != -1) { 2723 /* complete quit, stop xfrd */ 2724 sig_atomic_t cmd = NSD_QUIT; 2725 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2726 "main: ipc send quit to xfrd")); 2727 if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { 2728 log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s", 2729 strerror(errno)); 2730 } 2731 fsync(nsd->xfrd_listener->fd); 2732 close(nsd->xfrd_listener->fd); 2733 (void)kill(nsd->pid, SIGTERM); 2734 } 2735 2736 #ifdef MEMCLEAN /* OS collects memory pages */ 2737 region_destroy(server_region); 2738 #endif 2739 /* write the nsd.db to disk, wait for it to complete */ 2740 udb_base_sync(nsd->db->udb, 1); 2741 udb_base_close(nsd->db->udb); 2742 server_shutdown(nsd); 2743 } 2744 2745 static query_state_type 2746 server_process_query(struct nsd *nsd, struct query *query) 2747 { 2748 return query_process(query, nsd); 2749 } 2750 2751 static query_state_type 2752 server_process_query_udp(struct nsd *nsd, struct query *query) 2753 { 2754 #ifdef RATELIMIT 2755 if(query_process(query, nsd) != QUERY_DISCARDED) { 2756 if(rrl_process_query(query)) 2757 return rrl_slip(query); 2758 else return QUERY_PROCESSED; 2759 } 2760 return QUERY_DISCARDED; 2761 #else 2762 return query_process(query, nsd); 2763 #endif 2764 } 2765 2766 const char* 2767 nsd_event_vs(void) 2768 { 2769 #ifdef USE_MINI_EVENT 2770 return ""; 2771 #else 2772 return event_get_version(); 2773 #endif 2774 } 2775 2776 #if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS) 2777 static const char* ub_ev_backend2str(int b) 2778 { 2779 switch(b) { 2780 case EVBACKEND_SELECT: return "select"; 2781 case EVBACKEND_POLL: return "poll"; 2782 case EVBACKEND_EPOLL: return "epoll"; 2783 case EVBACKEND_KQUEUE: return "kqueue"; 2784 case EVBACKEND_DEVPOLL: return "devpoll"; 2785 case EVBACKEND_PORT: return "evport"; 2786 } 2787 return "unknown"; 2788 } 2789 #endif 2790 2791 const char* 2792 nsd_event_method(void) 2793 { 2794 #ifdef USE_MINI_EVENT 2795 return "select"; 2796 #else 2797 struct event_base* b = nsd_child_event_base(); 2798 const char* m = "?"; 2799 # ifdef EV_FEATURE_BACKENDS 2800 m = ub_ev_backend2str(ev_backend((struct ev_loop*)b)); 2801 # elif defined(HAVE_EVENT_BASE_GET_METHOD) 2802 m = event_base_get_method(b); 2803 # endif 2804 # ifdef MEMCLEAN 2805 event_base_free(b); 2806 # endif 2807 return m; 2808 #endif 2809 } 2810 2811 struct event_base* 2812 nsd_child_event_base(void) 2813 { 2814 struct event_base* base; 2815 #ifdef USE_MINI_EVENT 2816 static time_t secs; 2817 static struct timeval now; 2818 base = event_init(&secs, &now); 2819 #else 2820 # if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP) 2821 /* libev */ 2822 base = (struct event_base *)ev_default_loop(EVFLAG_AUTO); 2823 # else 2824 /* libevent */ 2825 # ifdef HAVE_EVENT_BASE_NEW 2826 base = event_base_new(); 2827 # else 2828 base = event_init(); 2829 # endif 2830 # endif 2831 #endif 2832 return base; 2833 } 2834 2835 static void 2836 add_udp_handler( 2837 struct nsd *nsd, 2838 struct nsd_socket *sock, 2839 struct udp_handler_data *data) 2840 { 2841 struct event *handler = &data->event; 2842 2843 data->nsd = nsd; 2844 data->socket = sock; 2845 2846 memset(handler, 0, sizeof(*handler)); 2847 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data); 2848 if(event_base_set(nsd->event_base, handler) != 0) 2849 log_msg(LOG_ERR, "nsd udp: event_base_set failed"); 2850 if(event_add(handler, NULL) != 0) 2851 log_msg(LOG_ERR, "nsd udp: event_add failed"); 2852 } 2853 2854 void 2855 add_tcp_handler( 2856 struct nsd *nsd, 2857 struct nsd_socket *sock, 2858 struct tcp_accept_handler_data *data) 2859 { 2860 struct event *handler = &data->event; 2861 2862 data->nsd = nsd; 2863 data->socket = sock; 2864 2865 #ifdef HAVE_SSL 2866 if (nsd->tls_ctx && 2867 nsd->options->tls_port && 2868 using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port)) 2869 { 2870 data->tls_accept = 1; 2871 if(verbosity >= 2) { 2872 char buf[48]; 2873 addrport2str((struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf)); 2874 VERBOSITY(2, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf)); 2875 } 2876 } else { 2877 data->tls_accept = 0; 2878 } 2879 #endif 2880 2881 memset(handler, 0, sizeof(*handler)); 2882 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_tcp_accept, data); 2883 if(event_base_set(nsd->event_base, handler) != 0) 2884 log_msg(LOG_ERR, "nsd tcp: event_base_set failed"); 2885 if(event_add(handler, NULL) != 0) 2886 log_msg(LOG_ERR, "nsd tcp: event_add failed"); 2887 data->event_added = 1; 2888 } 2889 2890 /* 2891 * Serve DNS requests. 2892 */ 2893 void 2894 server_child(struct nsd *nsd) 2895 { 2896 size_t i, from, numifs; 2897 region_type *server_region = region_create(xalloc, free); 2898 struct event_base* event_base = nsd_child_event_base(); 2899 sig_atomic_t mode; 2900 2901 if(!event_base) { 2902 log_msg(LOG_ERR, "nsd server could not create event base"); 2903 exit(1); 2904 } 2905 nsd->event_base = event_base; 2906 nsd->server_region = server_region; 2907 2908 #ifdef RATELIMIT 2909 rrl_init(nsd->this_child->child_num); 2910 #endif 2911 2912 assert(nsd->server_kind != NSD_SERVER_MAIN); 2913 DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started")); 2914 2915 #ifdef HAVE_SETPROCTITLE 2916 setproctitle("server %d", nsd->this_child->child_num + 1); 2917 #endif 2918 #ifdef HAVE_CPUSET_T 2919 if(nsd->use_cpu_affinity) { 2920 set_cpu_affinity(nsd->this_child->cpuset); 2921 } 2922 #endif 2923 2924 if (!(nsd->server_kind & NSD_SERVER_TCP)) { 2925 server_close_all_sockets(nsd->tcp, nsd->ifs); 2926 } 2927 if (!(nsd->server_kind & NSD_SERVER_UDP)) { 2928 server_close_all_sockets(nsd->udp, nsd->ifs); 2929 } 2930 2931 if (nsd->this_child->parent_fd != -1) { 2932 struct event *handler; 2933 struct ipc_handler_conn_data* user_data = 2934 (struct ipc_handler_conn_data*)region_alloc( 2935 server_region, sizeof(struct ipc_handler_conn_data)); 2936 user_data->nsd = nsd; 2937 user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ); 2938 2939 handler = (struct event*) region_alloc( 2940 server_region, sizeof(*handler)); 2941 memset(handler, 0, sizeof(*handler)); 2942 event_set(handler, nsd->this_child->parent_fd, EV_PERSIST| 2943 EV_READ, child_handle_parent_command, user_data); 2944 if(event_base_set(event_base, handler) != 0) 2945 log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed"); 2946 if(event_add(handler, NULL) != 0) 2947 log_msg(LOG_ERR, "nsd ipcchild: event_add failed"); 2948 } 2949 2950 if(nsd->reuseport) { 2951 numifs = nsd->ifs / nsd->reuseport; 2952 from = numifs * nsd->this_child->child_num; 2953 if(from+numifs > nsd->ifs) { /* should not happen */ 2954 from = 0; 2955 numifs = nsd->ifs; 2956 } 2957 } else { 2958 from = 0; 2959 numifs = nsd->ifs; 2960 } 2961 2962 if (nsd->server_kind & NSD_SERVER_UDP) { 2963 int child = nsd->this_child->child_num; 2964 memset(msgs, 0, sizeof(msgs)); 2965 for (i = 0; i < NUM_RECV_PER_SELECT; i++) { 2966 queries[i] = query_create(server_region, 2967 compressed_dname_offsets, 2968 compression_table_size, compressed_dnames); 2969 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 2970 iovecs[i].iov_base = buffer_begin(queries[i]->packet); 2971 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 2972 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 2973 msgs[i].msg_hdr.msg_iovlen = 1; 2974 msgs[i].msg_hdr.msg_name = &queries[i]->addr; 2975 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 2976 } 2977 2978 for (i = 0; i < nsd->ifs; i++) { 2979 int listen; 2980 struct udp_handler_data *data; 2981 2982 listen = nsd_bitset_isset(nsd->udp[i].servers, child); 2983 2984 if(i >= from && i < (from + numifs) && listen) { 2985 data = region_alloc_zero( 2986 nsd->server_region, sizeof(*data)); 2987 add_udp_handler(nsd, &nsd->udp[i], data); 2988 } else { 2989 /* close sockets intended for other servers */ 2990 server_close_socket(&nsd->udp[i]); 2991 } 2992 } 2993 } 2994 2995 /* 2996 * Keep track of all the TCP accept handlers so we can enable 2997 * and disable them based on the current number of active TCP 2998 * connections. 2999 */ 3000 if (nsd->server_kind & NSD_SERVER_TCP) { 3001 int child = nsd->this_child->child_num; 3002 tcp_accept_handler_count = numifs; 3003 tcp_accept_handlers = region_alloc_array(server_region, 3004 numifs, sizeof(*tcp_accept_handlers)); 3005 3006 for (i = 0; i < nsd->ifs; i++) { 3007 int listen; 3008 struct tcp_accept_handler_data *data; 3009 3010 listen = nsd_bitset_isset(nsd->tcp[i].servers, child); 3011 3012 if(i >= from && i < (from + numifs) && listen) { 3013 data = &tcp_accept_handlers[i-from]; 3014 memset(data, 0, sizeof(*data)); 3015 add_tcp_handler(nsd, &nsd->tcp[i], data); 3016 } else { 3017 /* close sockets intended for other servers */ 3018 /* 3019 * uncomment this once tcp servers are no 3020 * longer copied in the tcp fd copy line 3021 * in server_init(). 3022 server_close_socket(&nsd->tcp[i]); 3023 */ 3024 /* close sockets not meant for this server*/ 3025 if(!listen) 3026 server_close_socket(&nsd->tcp[i]); 3027 } 3028 } 3029 } else { 3030 tcp_accept_handler_count = 0; 3031 } 3032 3033 /* The main loop... */ 3034 while ((mode = nsd->mode) != NSD_QUIT) { 3035 if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd); 3036 3037 /* Do we need to do the statistics... */ 3038 if (mode == NSD_STATS) { 3039 #ifdef BIND8_STATS 3040 int p = nsd->st.period; 3041 nsd->st.period = 1; /* force stats printout */ 3042 /* Dump the statistics */ 3043 bind8_stats(nsd); 3044 nsd->st.period = p; 3045 #else /* !BIND8_STATS */ 3046 log_msg(LOG_NOTICE, "Statistics support not enabled at compile time."); 3047 #endif /* BIND8_STATS */ 3048 3049 nsd->mode = NSD_RUN; 3050 } 3051 else if (mode == NSD_REAP_CHILDREN) { 3052 /* got signal, notify parent. parent reaps terminated children. */ 3053 if (nsd->this_child->parent_fd != -1) { 3054 sig_atomic_t parent_notify = NSD_REAP_CHILDREN; 3055 if (write(nsd->this_child->parent_fd, 3056 &parent_notify, 3057 sizeof(parent_notify)) == -1) 3058 { 3059 log_msg(LOG_ERR, "problems sending command from %d to parent: %s", 3060 (int) nsd->this_child->pid, strerror(errno)); 3061 } 3062 } else /* no parent, so reap 'em */ 3063 while (waitpid(-1, NULL, WNOHANG) > 0) ; 3064 nsd->mode = NSD_RUN; 3065 } 3066 else if(mode == NSD_RUN) { 3067 /* Wait for a query... */ 3068 if(event_base_loop(event_base, EVLOOP_ONCE) == -1) { 3069 if (errno != EINTR) { 3070 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); 3071 break; 3072 } 3073 } 3074 } else if(mode == NSD_QUIT) { 3075 /* ignore here, quit */ 3076 } else { 3077 log_msg(LOG_ERR, "mode bad value %d, back to service.", 3078 (int)mode); 3079 nsd->mode = NSD_RUN; 3080 } 3081 } 3082 3083 service_remaining_tcp(nsd); 3084 #ifdef BIND8_STATS 3085 bind8_stats(nsd); 3086 #endif /* BIND8_STATS */ 3087 3088 #ifdef MEMCLEAN /* OS collects memory pages */ 3089 #ifdef RATELIMIT 3090 rrl_deinit(nsd->this_child->child_num); 3091 #endif 3092 event_base_free(event_base); 3093 region_destroy(server_region); 3094 #endif 3095 server_shutdown(nsd); 3096 } 3097 3098 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg) 3099 { 3100 int* timed_out = (int*)arg; 3101 assert(event & EV_TIMEOUT); (void)event; 3102 /* wake up the service tcp thread, note event is no longer 3103 * registered */ 3104 *timed_out = 1; 3105 } 3106 3107 void 3108 service_remaining_tcp(struct nsd* nsd) 3109 { 3110 struct tcp_handler_data* p; 3111 struct event_base* event_base; 3112 /* check if it is needed */ 3113 if(nsd->current_tcp_count == 0 || tcp_active_list == NULL) 3114 return; 3115 VERBOSITY(4, (LOG_INFO, "service remaining TCP connections")); 3116 3117 /* setup event base */ 3118 event_base = nsd_child_event_base(); 3119 if(!event_base) { 3120 log_msg(LOG_ERR, "nsd remain tcp could not create event base"); 3121 return; 3122 } 3123 /* register tcp connections */ 3124 for(p = tcp_active_list; p != NULL; p = p->next) { 3125 struct timeval timeout; 3126 int fd = p->event.ev_fd; 3127 #ifdef USE_MINI_EVENT 3128 short event = p->event.ev_flags & (EV_READ|EV_WRITE); 3129 #else 3130 short event = p->event.ev_events & (EV_READ|EV_WRITE); 3131 #endif 3132 void (*fn)(int, short, void*); 3133 #ifdef HAVE_SSL 3134 if(p->tls) { 3135 if((event&EV_READ)) 3136 fn = handle_tls_reading; 3137 else fn = handle_tls_writing; 3138 } else { 3139 #endif 3140 if((event&EV_READ)) 3141 fn = handle_tcp_reading; 3142 else fn = handle_tcp_writing; 3143 #ifdef HAVE_SSL 3144 } 3145 #endif 3146 3147 p->tcp_no_more_queries = 1; 3148 /* set timeout to 1/10 second */ 3149 if(p->tcp_timeout > 100) 3150 p->tcp_timeout = 100; 3151 timeout.tv_sec = p->tcp_timeout / 1000; 3152 timeout.tv_usec = (p->tcp_timeout % 1000)*1000; 3153 event_del(&p->event); 3154 memset(&p->event, 0, sizeof(p->event)); 3155 event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT, 3156 fn, p); 3157 if(event_base_set(event_base, &p->event) != 0) 3158 log_msg(LOG_ERR, "event base set failed"); 3159 if(event_add(&p->event, &timeout) != 0) 3160 log_msg(LOG_ERR, "event add failed"); 3161 } 3162 3163 /* handle it */ 3164 while(nsd->current_tcp_count > 0) { 3165 mode_t m = server_signal_mode(nsd); 3166 struct event timeout; 3167 struct timeval tv; 3168 int timed_out = 0; 3169 if(m == NSD_QUIT || m == NSD_SHUTDOWN || 3170 m == NSD_REAP_CHILDREN) { 3171 /* quit */ 3172 break; 3173 } 3174 /* timer */ 3175 /* have to do something every second */ 3176 tv.tv_sec = 1; 3177 tv.tv_usec = 0; 3178 memset(&timeout, 0, sizeof(timeout)); 3179 event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout, 3180 &timed_out); 3181 if(event_base_set(event_base, &timeout) != 0) 3182 log_msg(LOG_ERR, "remaintcp timer: event_base_set failed"); 3183 if(event_add(&timeout, &tv) != 0) 3184 log_msg(LOG_ERR, "remaintcp timer: event_add failed"); 3185 3186 /* service loop */ 3187 if(event_base_loop(event_base, EVLOOP_ONCE) == -1) { 3188 if (errno != EINTR) { 3189 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); 3190 break; 3191 } 3192 } 3193 if(!timed_out) { 3194 event_del(&timeout); 3195 } else { 3196 /* timed out, quit */ 3197 VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit")); 3198 break; 3199 } 3200 } 3201 #ifdef MEMCLEAN 3202 event_base_free(event_base); 3203 #endif 3204 /* continue to quit after return */ 3205 } 3206 3207 /* Implement recvmmsg and sendmmsg if the platform does not. These functions 3208 * are always used, even if nonblocking operations are broken, in which case 3209 * NUM_RECV_PER_SELECT is defined to 1 (one). 3210 */ 3211 #if defined(HAVE_RECVMMSG) 3212 #define nsd_recvmmsg recvmmsg 3213 #else /* !HAVE_RECVMMSG */ 3214 3215 static int 3216 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, 3217 int flags, struct timespec *timeout) 3218 { 3219 unsigned int vpos = 0; 3220 ssize_t rcvd; 3221 3222 /* timeout is ignored, ensure caller does not expect it to work */ 3223 assert(timeout == NULL); (void)timeout; 3224 3225 while(vpos < vlen) { 3226 rcvd = recvfrom(sockfd, 3227 msgvec[vpos].msg_hdr.msg_iov->iov_base, 3228 msgvec[vpos].msg_hdr.msg_iov->iov_len, 3229 flags, 3230 msgvec[vpos].msg_hdr.msg_name, 3231 &msgvec[vpos].msg_hdr.msg_namelen); 3232 if(rcvd < 0) { 3233 break; 3234 } else { 3235 assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX); 3236 msgvec[vpos].msg_len = (unsigned int)rcvd; 3237 vpos++; 3238 } 3239 } 3240 3241 if(vpos) { 3242 /* error will be picked up next time */ 3243 return (int)vpos; 3244 } else if(errno == 0) { 3245 return 0; 3246 } else if(errno == EAGAIN) { 3247 return 0; 3248 } 3249 3250 return -1; 3251 } 3252 #endif /* HAVE_RECVMMSG */ 3253 3254 #ifdef HAVE_SENDMMSG 3255 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__) 3256 #else /* !HAVE_SENDMMSG */ 3257 3258 static int 3259 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags) 3260 { 3261 unsigned int vpos = 0; 3262 ssize_t snd; 3263 3264 while(vpos < vlen) { 3265 assert(msgvec[vpos].msg_hdr.msg_iovlen == 1); 3266 snd = sendto(sockfd, 3267 msgvec[vpos].msg_hdr.msg_iov->iov_base, 3268 msgvec[vpos].msg_hdr.msg_iov->iov_len, 3269 flags, 3270 msgvec[vpos].msg_hdr.msg_name, 3271 msgvec[vpos].msg_hdr.msg_namelen); 3272 if(snd < 0) { 3273 break; 3274 } else { 3275 msgvec[vpos].msg_len = (unsigned int)snd; 3276 vpos++; 3277 } 3278 } 3279 3280 if(vpos) { 3281 return (int)vpos; 3282 } else if(errno == 0) { 3283 return 0; 3284 } 3285 3286 return -1; 3287 } 3288 #endif /* HAVE_SENDMMSG */ 3289 3290 static void 3291 handle_udp(int fd, short event, void* arg) 3292 { 3293 struct udp_handler_data *data = (struct udp_handler_data *) arg; 3294 int received, sent, recvcount, i; 3295 struct query *q; 3296 3297 if (!(event & EV_READ)) { 3298 return; 3299 } 3300 recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL); 3301 /* this printf strangely gave a performance increase on Linux */ 3302 /* printf("recvcount %d \n", recvcount); */ 3303 if (recvcount == -1) { 3304 if (errno != EAGAIN && errno != EINTR) { 3305 log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno)); 3306 STATUP(data->nsd, rxerr); 3307 /* No zone statup */ 3308 } 3309 /* Simply no data available */ 3310 return; 3311 } 3312 for (i = 0; i < recvcount; i++) { 3313 loopstart: 3314 received = msgs[i].msg_len; 3315 queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen; 3316 q = queries[i]; 3317 if (received == -1) { 3318 log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror( 3319 #if defined(HAVE_RECVMMSG) 3320 msgs[i].msg_hdr.msg_flags 3321 #else 3322 errno 3323 #endif 3324 )); 3325 STATUP(data->nsd, rxerr); 3326 /* No zone statup */ 3327 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3328 iovecs[i].iov_len = buffer_remaining(q->packet); 3329 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3330 goto swap_drop; 3331 } 3332 3333 /* Account... */ 3334 #ifdef BIND8_STATS 3335 if (data->socket->addr.ai_family == AF_INET) { 3336 STATUP(data->nsd, qudp); 3337 } else if (data->socket->addr.ai_family == AF_INET6) { 3338 STATUP(data->nsd, qudp6); 3339 } 3340 #endif 3341 3342 buffer_skip(q->packet, received); 3343 buffer_flip(q->packet); 3344 #ifdef USE_DNSTAP 3345 /* 3346 * sending UDP-query with server address (local) and client address to dnstap process 3347 */ 3348 log_addr("query from client", &q->addr, data->socket->addr.ai_family); 3349 log_addr("to server (local)", &data->socket->addr.ai_addr, data->socket->addr.ai_family); 3350 dt_collector_submit_auth_query(data->nsd, &data->socket->addr.ai_addr, &q->addr, q->addrlen, 3351 q->tcp, q->packet); 3352 #endif /* USE_DNSTAP */ 3353 3354 /* Process and answer the query... */ 3355 if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) { 3356 if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) { 3357 STATUP(data->nsd, nona); 3358 ZTATUP(data->nsd, q->zone, nona); 3359 } 3360 3361 #ifdef USE_ZONE_STATS 3362 if (data->socket->addr.ai_family == AF_INET) { 3363 ZTATUP(data->nsd, q->zone, qudp); 3364 } else if (data->socket->addr.ai_family == AF_INET6) { 3365 ZTATUP(data->nsd, q->zone, qudp6); 3366 } 3367 #endif 3368 3369 /* Add EDNS0 and TSIG info if necessary. */ 3370 query_add_optional(q, data->nsd); 3371 3372 buffer_flip(q->packet); 3373 iovecs[i].iov_len = buffer_remaining(q->packet); 3374 #ifdef BIND8_STATS 3375 /* Account the rcode & TC... */ 3376 STATUP2(data->nsd, rcode, RCODE(q->packet)); 3377 ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet)); 3378 if (TC(q->packet)) { 3379 STATUP(data->nsd, truncated); 3380 ZTATUP(data->nsd, q->zone, truncated); 3381 } 3382 #endif /* BIND8_STATS */ 3383 #ifdef USE_DNSTAP 3384 /* 3385 * sending UDP-response with server address (local) and client address to dnstap process 3386 */ 3387 log_addr("from server (local)", &data->socket->addr.ai_addr, data->socket->addr.ai_family); 3388 log_addr("response to client", &q->addr, data->socket->addr.ai_family); 3389 dt_collector_submit_auth_response(data->nsd, &data->socket->addr.ai_addr, 3390 &q->addr, q->addrlen, q->tcp, q->packet, 3391 q->zone); 3392 #endif /* USE_DNSTAP */ 3393 } else { 3394 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3395 iovecs[i].iov_len = buffer_remaining(q->packet); 3396 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3397 swap_drop: 3398 STATUP(data->nsd, dropped); 3399 ZTATUP(data->nsd, q->zone, dropped); 3400 if(i != recvcount-1) { 3401 /* swap with last and decrease recvcount */ 3402 struct mmsghdr mtmp = msgs[i]; 3403 struct iovec iotmp = iovecs[i]; 3404 recvcount--; 3405 msgs[i] = msgs[recvcount]; 3406 iovecs[i] = iovecs[recvcount]; 3407 queries[i] = queries[recvcount]; 3408 msgs[recvcount] = mtmp; 3409 iovecs[recvcount] = iotmp; 3410 queries[recvcount] = q; 3411 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 3412 msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount]; 3413 goto loopstart; 3414 } else { recvcount --; } 3415 } 3416 } 3417 3418 /* send until all are sent */ 3419 i = 0; 3420 while(i<recvcount) { 3421 sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0); 3422 if(sent == -1) { 3423 if(errno == ENOBUFS || 3424 #ifdef EWOULDBLOCK 3425 errno == EWOULDBLOCK || 3426 #endif 3427 errno == EAGAIN) { 3428 /* block to wait until send buffer avail */ 3429 int flag, errstore; 3430 if((flag = fcntl(fd, F_GETFL)) == -1) { 3431 log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno)); 3432 flag = 0; 3433 } 3434 flag &= ~O_NONBLOCK; 3435 if(fcntl(fd, F_SETFL, flag) == -1) 3436 log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno)); 3437 sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0); 3438 errstore = errno; 3439 flag |= O_NONBLOCK; 3440 if(fcntl(fd, F_SETFL, flag) == -1) 3441 log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno)); 3442 if(sent != -1) { 3443 i += sent; 3444 continue; 3445 } 3446 errno = errstore; 3447 } 3448 /* don't log transient network full errors, unless 3449 * on higher verbosity */ 3450 if(!(errno == ENOBUFS && verbosity < 1) && 3451 #ifdef EWOULDBLOCK 3452 errno != EWOULDBLOCK && 3453 #endif 3454 errno != EAGAIN) { 3455 const char* es = strerror(errno); 3456 char a[64]; 3457 addrport2str(&queries[i]->addr, a, sizeof(a)); 3458 log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es); 3459 } 3460 #ifdef BIND8_STATS 3461 data->nsd->st.txerr += recvcount-i; 3462 #endif /* BIND8_STATS */ 3463 break; 3464 } 3465 i += sent; 3466 } 3467 for(i=0; i<recvcount; i++) { 3468 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3469 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 3470 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3471 } 3472 } 3473 3474 #ifdef HAVE_SSL 3475 /* 3476 * Setup an event for the tcp handler. 3477 */ 3478 static void 3479 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *), 3480 int fd, short event) 3481 { 3482 struct timeval timeout; 3483 struct event_base* ev_base; 3484 3485 timeout.tv_sec = data->nsd->tcp_timeout; 3486 timeout.tv_usec = 0L; 3487 3488 ev_base = data->event.ev_base; 3489 event_del(&data->event); 3490 memset(&data->event, 0, sizeof(data->event)); 3491 event_set(&data->event, fd, event, fn, data); 3492 if(event_base_set(ev_base, &data->event) != 0) 3493 log_msg(LOG_ERR, "event base set failed"); 3494 if(event_add(&data->event, &timeout) != 0) 3495 log_msg(LOG_ERR, "event add failed"); 3496 } 3497 #endif /* HAVE_SSL */ 3498 3499 static void 3500 cleanup_tcp_handler(struct tcp_handler_data* data) 3501 { 3502 event_del(&data->event); 3503 #ifdef HAVE_SSL 3504 if(data->tls) { 3505 SSL_shutdown(data->tls); 3506 SSL_free(data->tls); 3507 data->tls = NULL; 3508 } 3509 #endif 3510 close(data->event.ev_fd); 3511 if(data->prev) 3512 data->prev->next = data->next; 3513 else tcp_active_list = data->next; 3514 if(data->next) 3515 data->next->prev = data->prev; 3516 3517 /* 3518 * Enable the TCP accept handlers when the current number of 3519 * TCP connections is about to drop below the maximum number 3520 * of TCP connections. 3521 */ 3522 if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) { 3523 configure_handler_event_types(EV_READ|EV_PERSIST); 3524 if(slowaccept) { 3525 event_del(&slowaccept_event); 3526 slowaccept = 0; 3527 } 3528 } 3529 --data->nsd->current_tcp_count; 3530 assert(data->nsd->current_tcp_count >= 0); 3531 3532 region_destroy(data->region); 3533 } 3534 3535 static void 3536 handle_tcp_reading(int fd, short event, void* arg) 3537 { 3538 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 3539 ssize_t received; 3540 struct event_base* ev_base; 3541 struct timeval timeout; 3542 3543 if ((event & EV_TIMEOUT)) { 3544 /* Connection timed out. */ 3545 cleanup_tcp_handler(data); 3546 return; 3547 } 3548 3549 if ((data->nsd->tcp_query_count > 0 && 3550 data->query_count >= data->nsd->tcp_query_count) || 3551 data->tcp_no_more_queries) { 3552 /* No more queries allowed on this tcp connection. */ 3553 cleanup_tcp_handler(data); 3554 return; 3555 } 3556 3557 assert((event & EV_READ)); 3558 3559 if (data->bytes_transmitted == 0) { 3560 query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1); 3561 } 3562 3563 /* 3564 * Check if we received the leading packet length bytes yet. 3565 */ 3566 if (data->bytes_transmitted < sizeof(uint16_t)) { 3567 received = read(fd, 3568 (char *) &data->query->tcplen 3569 + data->bytes_transmitted, 3570 sizeof(uint16_t) - data->bytes_transmitted); 3571 if (received == -1) { 3572 if (errno == EAGAIN || errno == EINTR) { 3573 /* 3574 * Read would block, wait until more 3575 * data is available. 3576 */ 3577 return; 3578 } else { 3579 char buf[48]; 3580 addr2str(&data->query->addr, buf, sizeof(buf)); 3581 #ifdef ECONNRESET 3582 if (verbosity >= 2 || errno != ECONNRESET) 3583 #endif /* ECONNRESET */ 3584 log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno)); 3585 cleanup_tcp_handler(data); 3586 return; 3587 } 3588 } else if (received == 0) { 3589 /* EOF */ 3590 cleanup_tcp_handler(data); 3591 return; 3592 } 3593 3594 data->bytes_transmitted += received; 3595 if (data->bytes_transmitted < sizeof(uint16_t)) { 3596 /* 3597 * Not done with the tcplen yet, wait for more 3598 * data to become available. 3599 */ 3600 return; 3601 } 3602 3603 assert(data->bytes_transmitted == sizeof(uint16_t)); 3604 3605 data->query->tcplen = ntohs(data->query->tcplen); 3606 3607 /* 3608 * Minimum query size is: 3609 * 3610 * Size of the header (12) 3611 * + Root domain name (1) 3612 * + Query class (2) 3613 * + Query type (2) 3614 */ 3615 if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) { 3616 VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection")); 3617 cleanup_tcp_handler(data); 3618 return; 3619 } 3620 3621 if (data->query->tcplen > data->query->maxlen) { 3622 VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection")); 3623 cleanup_tcp_handler(data); 3624 return; 3625 } 3626 3627 buffer_set_limit(data->query->packet, data->query->tcplen); 3628 } 3629 3630 assert(buffer_remaining(data->query->packet) > 0); 3631 3632 /* Read the (remaining) query data. */ 3633 received = read(fd, 3634 buffer_current(data->query->packet), 3635 buffer_remaining(data->query->packet)); 3636 if (received == -1) { 3637 if (errno == EAGAIN || errno == EINTR) { 3638 /* 3639 * Read would block, wait until more data is 3640 * available. 3641 */ 3642 return; 3643 } else { 3644 char buf[48]; 3645 addr2str(&data->query->addr, buf, sizeof(buf)); 3646 #ifdef ECONNRESET 3647 if (verbosity >= 2 || errno != ECONNRESET) 3648 #endif /* ECONNRESET */ 3649 log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno)); 3650 cleanup_tcp_handler(data); 3651 return; 3652 } 3653 } else if (received == 0) { 3654 /* EOF */ 3655 cleanup_tcp_handler(data); 3656 return; 3657 } 3658 3659 data->bytes_transmitted += received; 3660 buffer_skip(data->query->packet, received); 3661 if (buffer_remaining(data->query->packet) > 0) { 3662 /* 3663 * Message not yet complete, wait for more data to 3664 * become available. 3665 */ 3666 return; 3667 } 3668 3669 assert(buffer_position(data->query->packet) == data->query->tcplen); 3670 3671 /* Account... */ 3672 #ifdef BIND8_STATS 3673 #ifndef INET6 3674 STATUP(data->nsd, ctcp); 3675 #else 3676 if (data->query->addr.ss_family == AF_INET) { 3677 STATUP(data->nsd, ctcp); 3678 } else if (data->query->addr.ss_family == AF_INET6) { 3679 STATUP(data->nsd, ctcp6); 3680 } 3681 #endif 3682 #endif /* BIND8_STATS */ 3683 3684 /* We have a complete query, process it. */ 3685 3686 /* tcp-query-count: handle query counter ++ */ 3687 data->query_count++; 3688 3689 buffer_flip(data->query->packet); 3690 #ifdef USE_DNSTAP 3691 /* 3692 * and send TCP-query with found address (local) and client address to dnstap process 3693 */ 3694 log_addr("query from client", &data->query->addr, data->query->addr.ss_family); 3695 log_addr("to server (local)", &data->socket->addr.ai_addr, data->query->addr.ss_family); 3696 dt_collector_submit_auth_query(data->nsd, &data->socket->addr.ai_addr, &data->query->addr, 3697 data->query->addrlen, data->query->tcp, data->query->packet); 3698 #endif /* USE_DNSTAP */ 3699 data->query_state = server_process_query(data->nsd, data->query); 3700 if (data->query_state == QUERY_DISCARDED) { 3701 /* Drop the packet and the entire connection... */ 3702 STATUP(data->nsd, dropped); 3703 ZTATUP(data->nsd, data->query->zone, dropped); 3704 cleanup_tcp_handler(data); 3705 return; 3706 } 3707 3708 #ifdef BIND8_STATS 3709 if (RCODE(data->query->packet) == RCODE_OK 3710 && !AA(data->query->packet)) 3711 { 3712 STATUP(data->nsd, nona); 3713 ZTATUP(data->nsd, data->query->zone, nona); 3714 } 3715 #endif /* BIND8_STATS */ 3716 3717 #ifdef USE_ZONE_STATS 3718 #ifndef INET6 3719 ZTATUP(data->nsd, data->query->zone, ctcp); 3720 #else 3721 if (data->query->addr.ss_family == AF_INET) { 3722 ZTATUP(data->nsd, data->query->zone, ctcp); 3723 } else if (data->query->addr.ss_family == AF_INET6) { 3724 ZTATUP(data->nsd, data->query->zone, ctcp6); 3725 } 3726 #endif 3727 #endif /* USE_ZONE_STATS */ 3728 3729 query_add_optional(data->query, data->nsd); 3730 3731 /* Switch to the tcp write handler. */ 3732 buffer_flip(data->query->packet); 3733 data->query->tcplen = buffer_remaining(data->query->packet); 3734 #ifdef BIND8_STATS 3735 /* Account the rcode & TC... */ 3736 STATUP2(data->nsd, rcode, RCODE(data->query->packet)); 3737 ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet)); 3738 if (TC(data->query->packet)) { 3739 STATUP(data->nsd, truncated); 3740 ZTATUP(data->nsd, data->query->zone, truncated); 3741 } 3742 #endif /* BIND8_STATS */ 3743 #ifdef USE_DNSTAP 3744 /* 3745 * sending TCP-response with found (earlier) address (local) and client address to dnstap process 3746 */ 3747 log_addr("from server (local)", &data->socket->addr.ai_addr, data->query->addr.ss_family); 3748 log_addr("response to client", &data->query->addr, data->query->addr.ss_family); 3749 dt_collector_submit_auth_response(data->nsd, &data->socket->addr.ai_addr, &data->query->addr, 3750 data->query->addrlen, data->query->tcp, data->query->packet, 3751 data->query->zone); 3752 #endif /* USE_DNSTAP */ 3753 data->bytes_transmitted = 0; 3754 3755 timeout.tv_sec = data->tcp_timeout / 1000; 3756 timeout.tv_usec = (data->tcp_timeout % 1000)*1000; 3757 3758 ev_base = data->event.ev_base; 3759 event_del(&data->event); 3760 memset(&data->event, 0, sizeof(data->event)); 3761 event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT, 3762 handle_tcp_reading, data); 3763 if(event_base_set(ev_base, &data->event) != 0) 3764 log_msg(LOG_ERR, "event base set tcpr failed"); 3765 if(event_add(&data->event, &timeout) != 0) 3766 log_msg(LOG_ERR, "event add tcpr failed"); 3767 /* see if we can write the answer right away(usually so,EAGAIN ifnot)*/ 3768 handle_tcp_writing(fd, EV_WRITE, data); 3769 } 3770 3771 static void 3772 handle_tcp_writing(int fd, short event, void* arg) 3773 { 3774 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 3775 ssize_t sent; 3776 struct query *q = data->query; 3777 struct timeval timeout; 3778 struct event_base* ev_base; 3779 3780 if ((event & EV_TIMEOUT)) { 3781 /* Connection timed out. */ 3782 cleanup_tcp_handler(data); 3783 return; 3784 } 3785 3786 assert((event & EV_WRITE)); 3787 3788 if (data->bytes_transmitted < sizeof(q->tcplen)) { 3789 /* Writing the response packet length. */ 3790 uint16_t n_tcplen = htons(q->tcplen); 3791 #ifdef HAVE_WRITEV 3792 struct iovec iov[2]; 3793 iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted; 3794 iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted; 3795 iov[1].iov_base = buffer_begin(q->packet); 3796 iov[1].iov_len = buffer_limit(q->packet); 3797 sent = writev(fd, iov, 2); 3798 #else /* HAVE_WRITEV */ 3799 sent = write(fd, 3800 (const char *) &n_tcplen + data->bytes_transmitted, 3801 sizeof(n_tcplen) - data->bytes_transmitted); 3802 #endif /* HAVE_WRITEV */ 3803 if (sent == -1) { 3804 if (errno == EAGAIN || errno == EINTR) { 3805 /* 3806 * Write would block, wait until 3807 * socket becomes writable again. 3808 */ 3809 return; 3810 } else { 3811 #ifdef ECONNRESET 3812 if(verbosity >= 2 || errno != ECONNRESET) 3813 #endif /* ECONNRESET */ 3814 #ifdef EPIPE 3815 if(verbosity >= 2 || errno != EPIPE) 3816 #endif /* EPIPE 'broken pipe' */ 3817 log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno)); 3818 cleanup_tcp_handler(data); 3819 return; 3820 } 3821 } 3822 3823 data->bytes_transmitted += sent; 3824 if (data->bytes_transmitted < sizeof(q->tcplen)) { 3825 /* 3826 * Writing not complete, wait until socket 3827 * becomes writable again. 3828 */ 3829 return; 3830 } 3831 3832 #ifdef HAVE_WRITEV 3833 sent -= sizeof(n_tcplen); 3834 /* handle potential 'packet done' code */ 3835 goto packet_could_be_done; 3836 #endif 3837 } 3838 3839 sent = write(fd, 3840 buffer_current(q->packet), 3841 buffer_remaining(q->packet)); 3842 if (sent == -1) { 3843 if (errno == EAGAIN || errno == EINTR) { 3844 /* 3845 * Write would block, wait until 3846 * socket becomes writable again. 3847 */ 3848 return; 3849 } else { 3850 #ifdef ECONNRESET 3851 if(verbosity >= 2 || errno != ECONNRESET) 3852 #endif /* ECONNRESET */ 3853 #ifdef EPIPE 3854 if(verbosity >= 2 || errno != EPIPE) 3855 #endif /* EPIPE 'broken pipe' */ 3856 log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno)); 3857 cleanup_tcp_handler(data); 3858 return; 3859 } 3860 } 3861 3862 data->bytes_transmitted += sent; 3863 #ifdef HAVE_WRITEV 3864 packet_could_be_done: 3865 #endif 3866 buffer_skip(q->packet, sent); 3867 if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) { 3868 /* 3869 * Still more data to write when socket becomes 3870 * writable again. 3871 */ 3872 return; 3873 } 3874 3875 assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen)); 3876 3877 if (data->query_state == QUERY_IN_AXFR) { 3878 /* Continue processing AXFR and writing back results. */ 3879 buffer_clear(q->packet); 3880 data->query_state = query_axfr(data->nsd, q); 3881 if (data->query_state != QUERY_PROCESSED) { 3882 query_add_optional(data->query, data->nsd); 3883 3884 /* Reset data. */ 3885 buffer_flip(q->packet); 3886 q->tcplen = buffer_remaining(q->packet); 3887 data->bytes_transmitted = 0; 3888 /* Reset timeout. */ 3889 timeout.tv_sec = data->tcp_timeout / 1000; 3890 timeout.tv_usec = (data->tcp_timeout % 1000)*1000; 3891 ev_base = data->event.ev_base; 3892 event_del(&data->event); 3893 memset(&data->event, 0, sizeof(data->event)); 3894 event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT, 3895 handle_tcp_writing, data); 3896 if(event_base_set(ev_base, &data->event) != 0) 3897 log_msg(LOG_ERR, "event base set tcpw failed"); 3898 if(event_add(&data->event, &timeout) != 0) 3899 log_msg(LOG_ERR, "event add tcpw failed"); 3900 3901 /* 3902 * Write data if/when the socket is writable 3903 * again. 3904 */ 3905 return; 3906 } 3907 } 3908 3909 /* 3910 * Done sending, wait for the next request to arrive on the 3911 * TCP socket by installing the TCP read handler. 3912 */ 3913 if ((data->nsd->tcp_query_count > 0 && 3914 data->query_count >= data->nsd->tcp_query_count) || 3915 data->tcp_no_more_queries) { 3916 3917 (void) shutdown(fd, SHUT_WR); 3918 } 3919 3920 data->bytes_transmitted = 0; 3921 3922 timeout.tv_sec = data->tcp_timeout / 1000; 3923 timeout.tv_usec = (data->tcp_timeout % 1000)*1000; 3924 ev_base = data->event.ev_base; 3925 event_del(&data->event); 3926 memset(&data->event, 0, sizeof(data->event)); 3927 event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT, 3928 handle_tcp_reading, data); 3929 if(event_base_set(ev_base, &data->event) != 0) 3930 log_msg(LOG_ERR, "event base set tcpw failed"); 3931 if(event_add(&data->event, &timeout) != 0) 3932 log_msg(LOG_ERR, "event add tcpw failed"); 3933 } 3934 3935 #ifdef HAVE_SSL 3936 /** create SSL object and associate fd */ 3937 static SSL* 3938 incoming_ssl_fd(SSL_CTX* ctx, int fd) 3939 { 3940 SSL* ssl = SSL_new((SSL_CTX*)ctx); 3941 if(!ssl) { 3942 log_crypto_err("could not SSL_new"); 3943 return NULL; 3944 } 3945 SSL_set_accept_state(ssl); 3946 (void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY); 3947 if(!SSL_set_fd(ssl, fd)) { 3948 log_crypto_err("could not SSL_set_fd"); 3949 SSL_free(ssl); 3950 return NULL; 3951 } 3952 return ssl; 3953 } 3954 3955 /** TLS handshake to upgrade TCP connection */ 3956 static int 3957 tls_handshake(struct tcp_handler_data* data, int fd, int writing) 3958 { 3959 int r; 3960 if(data->shake_state == tls_hs_read_event) { 3961 /* read condition satisfied back to writing */ 3962 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE); 3963 data->shake_state = tls_hs_none; 3964 return 1; 3965 } 3966 if(data->shake_state == tls_hs_write_event) { 3967 /* write condition satisfied back to reading */ 3968 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ); 3969 data->shake_state = tls_hs_none; 3970 return 1; 3971 } 3972 3973 /* (continue to) setup the TLS connection */ 3974 ERR_clear_error(); 3975 r = SSL_do_handshake(data->tls); 3976 3977 if(r != 1) { 3978 int want = SSL_get_error(data->tls, r); 3979 if(want == SSL_ERROR_WANT_READ) { 3980 if(data->shake_state == tls_hs_read) { 3981 /* try again later */ 3982 return 1; 3983 } 3984 data->shake_state = tls_hs_read; 3985 /* switch back to reading mode */ 3986 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ); 3987 return 1; 3988 } else if(want == SSL_ERROR_WANT_WRITE) { 3989 if(data->shake_state == tls_hs_write) { 3990 /* try again later */ 3991 return 1; 3992 } 3993 data->shake_state = tls_hs_write; 3994 /* switch back to writing mode */ 3995 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE); 3996 return 1; 3997 } else { 3998 if(r == 0) 3999 VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely")); 4000 else { 4001 unsigned long err = ERR_get_error(); 4002 if(!squelch_err_ssl_handshake(err)) { 4003 char a[64], s[256]; 4004 addr2str(&data->query->addr, a, sizeof(a)); 4005 snprintf(s, sizeof(s), "TLS handshake failed from %s", a); 4006 log_crypto_from_err(s, err); 4007 } 4008 } 4009 cleanup_tcp_handler(data); 4010 return 0; 4011 } 4012 } 4013 4014 /* Use to log successful upgrade for testing - could be removed*/ 4015 VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded.")); 4016 /* set back to the event we need to have when reading (or writing) */ 4017 if(data->shake_state == tls_hs_read && writing) { 4018 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE); 4019 } else if(data->shake_state == tls_hs_write && !writing) { 4020 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ); 4021 } 4022 data->shake_state = tls_hs_none; 4023 return 1; 4024 } 4025 4026 /** handle TLS reading of incoming query */ 4027 static void 4028 handle_tls_reading(int fd, short event, void* arg) 4029 { 4030 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 4031 ssize_t received; 4032 4033 if ((event & EV_TIMEOUT)) { 4034 /* Connection timed out. */ 4035 cleanup_tcp_handler(data); 4036 return; 4037 } 4038 4039 if ((data->nsd->tcp_query_count > 0 && 4040 data->query_count >= data->nsd->tcp_query_count) || 4041 data->tcp_no_more_queries) { 4042 /* No more queries allowed on this tcp connection. */ 4043 cleanup_tcp_handler(data); 4044 return; 4045 } 4046 4047 assert((event & EV_READ)); 4048 4049 if (data->bytes_transmitted == 0) { 4050 query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1); 4051 } 4052 4053 if(data->shake_state != tls_hs_none) { 4054 if(!tls_handshake(data, fd, 0)) 4055 return; 4056 if(data->shake_state != tls_hs_none) 4057 return; 4058 } 4059 4060 /* 4061 * Check if we received the leading packet length bytes yet. 4062 */ 4063 if(data->bytes_transmitted < sizeof(uint16_t)) { 4064 ERR_clear_error(); 4065 if((received=SSL_read(data->tls, (char *) &data->query->tcplen 4066 + data->bytes_transmitted, 4067 sizeof(uint16_t) - data->bytes_transmitted)) <= 0) { 4068 int want = SSL_get_error(data->tls, received); 4069 if(want == SSL_ERROR_ZERO_RETURN) { 4070 cleanup_tcp_handler(data); 4071 return; /* shutdown, closed */ 4072 } else if(want == SSL_ERROR_WANT_READ) { 4073 /* wants to be called again */ 4074 return; 4075 } 4076 else if(want == SSL_ERROR_WANT_WRITE) { 4077 /* switch to writing */ 4078 data->shake_state = tls_hs_write_event; 4079 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT); 4080 return; 4081 } 4082 cleanup_tcp_handler(data); 4083 log_crypto_err("could not SSL_read"); 4084 return; 4085 } 4086 4087 data->bytes_transmitted += received; 4088 if (data->bytes_transmitted < sizeof(uint16_t)) { 4089 /* 4090 * Not done with the tcplen yet, wait for more 4091 * data to become available. 4092 */ 4093 return; 4094 } 4095 4096 assert(data->bytes_transmitted == sizeof(uint16_t)); 4097 4098 data->query->tcplen = ntohs(data->query->tcplen); 4099 4100 /* 4101 * Minimum query size is: 4102 * 4103 * Size of the header (12) 4104 * + Root domain name (1) 4105 * + Query class (2) 4106 * + Query type (2) 4107 */ 4108 if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) { 4109 VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection")); 4110 cleanup_tcp_handler(data); 4111 return; 4112 } 4113 4114 if (data->query->tcplen > data->query->maxlen) { 4115 VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection")); 4116 cleanup_tcp_handler(data); 4117 return; 4118 } 4119 4120 buffer_set_limit(data->query->packet, data->query->tcplen); 4121 } 4122 4123 assert(buffer_remaining(data->query->packet) > 0); 4124 4125 /* Read the (remaining) query data. */ 4126 ERR_clear_error(); 4127 received = SSL_read(data->tls, (void*)buffer_current(data->query->packet), 4128 (int)buffer_remaining(data->query->packet)); 4129 if(received <= 0) { 4130 int want = SSL_get_error(data->tls, received); 4131 if(want == SSL_ERROR_ZERO_RETURN) { 4132 cleanup_tcp_handler(data); 4133 return; /* shutdown, closed */ 4134 } else if(want == SSL_ERROR_WANT_READ) { 4135 /* wants to be called again */ 4136 return; 4137 } 4138 else if(want == SSL_ERROR_WANT_WRITE) { 4139 /* switch back writing */ 4140 data->shake_state = tls_hs_write_event; 4141 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT); 4142 return; 4143 } 4144 cleanup_tcp_handler(data); 4145 log_crypto_err("could not SSL_read"); 4146 return; 4147 } 4148 4149 data->bytes_transmitted += received; 4150 buffer_skip(data->query->packet, received); 4151 if (buffer_remaining(data->query->packet) > 0) { 4152 /* 4153 * Message not yet complete, wait for more data to 4154 * become available. 4155 */ 4156 return; 4157 } 4158 4159 assert(buffer_position(data->query->packet) == data->query->tcplen); 4160 4161 /* Account... */ 4162 #ifndef INET6 4163 STATUP(data->nsd, ctls); 4164 #else 4165 if (data->query->addr.ss_family == AF_INET) { 4166 STATUP(data->nsd, ctls); 4167 } else if (data->query->addr.ss_family == AF_INET6) { 4168 STATUP(data->nsd, ctls6); 4169 } 4170 #endif 4171 4172 /* We have a complete query, process it. */ 4173 4174 /* tcp-query-count: handle query counter ++ */ 4175 data->query_count++; 4176 4177 buffer_flip(data->query->packet); 4178 #ifdef USE_DNSTAP 4179 /* 4180 * and send TCP-query with found address (local) and client address to dnstap process 4181 */ 4182 log_addr("query from client", &data->query->addr, data->query->addr.ss_family); 4183 log_addr("to server (local)", &data->socket->addr.ai_addr, data->query->addr.ss_family); 4184 dt_collector_submit_auth_query(data->nsd, &data->socket->addr.ai_addr, &data->query->addr, 4185 data->query->addrlen, data->query->tcp, data->query->packet); 4186 #endif /* USE_DNSTAP */ 4187 data->query_state = server_process_query(data->nsd, data->query); 4188 if (data->query_state == QUERY_DISCARDED) { 4189 /* Drop the packet and the entire connection... */ 4190 STATUP(data->nsd, dropped); 4191 ZTATUP(data->nsd, data->query->zone, dropped); 4192 cleanup_tcp_handler(data); 4193 return; 4194 } 4195 4196 #ifdef BIND8_STATS 4197 if (RCODE(data->query->packet) == RCODE_OK 4198 && !AA(data->query->packet)) 4199 { 4200 STATUP(data->nsd, nona); 4201 ZTATUP(data->nsd, data->query->zone, nona); 4202 } 4203 #endif /* BIND8_STATS */ 4204 4205 #ifdef USE_ZONE_STATS 4206 #ifndef INET6 4207 ZTATUP(data->nsd, data->query->zone, ctls); 4208 #else 4209 if (data->query->addr.ss_family == AF_INET) { 4210 ZTATUP(data->nsd, data->query->zone, ctls); 4211 } else if (data->query->addr.ss_family == AF_INET6) { 4212 ZTATUP(data->nsd, data->query->zone, ctls6); 4213 } 4214 #endif 4215 #endif /* USE_ZONE_STATS */ 4216 4217 query_add_optional(data->query, data->nsd); 4218 4219 /* Switch to the tcp write handler. */ 4220 buffer_flip(data->query->packet); 4221 data->query->tcplen = buffer_remaining(data->query->packet); 4222 #ifdef BIND8_STATS 4223 /* Account the rcode & TC... */ 4224 STATUP2(data->nsd, rcode, RCODE(data->query->packet)); 4225 ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet)); 4226 if (TC(data->query->packet)) { 4227 STATUP(data->nsd, truncated); 4228 ZTATUP(data->nsd, data->query->zone, truncated); 4229 } 4230 #endif /* BIND8_STATS */ 4231 #ifdef USE_DNSTAP 4232 /* 4233 * sending TCP-response with found (earlier) address (local) and client address to dnstap process 4234 */ 4235 log_addr("from server (local)", &data->socket->addr.ai_addr, data->query->addr.ss_family); 4236 log_addr("response to client", &data->query->addr, data->query->addr.ss_family); 4237 dt_collector_submit_auth_response(data->nsd, &data->socket->addr.ai_addr, &data->query->addr, 4238 data->query->addrlen, data->query->tcp, data->query->packet, 4239 data->query->zone); 4240 #endif /* USE_DNSTAP */ 4241 data->bytes_transmitted = 0; 4242 4243 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT); 4244 4245 /* see if we can write the answer right away(usually so,EAGAIN ifnot)*/ 4246 handle_tls_writing(fd, EV_WRITE, data); 4247 } 4248 4249 /** handle TLS writing of outgoing response */ 4250 static void 4251 handle_tls_writing(int fd, short event, void* arg) 4252 { 4253 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 4254 ssize_t sent; 4255 struct query *q = data->query; 4256 /* static variable that holds reassembly buffer used to put the 4257 * TCP length in front of the packet, like writev. */ 4258 static buffer_type* global_tls_temp_buffer = NULL; 4259 buffer_type* write_buffer; 4260 4261 if ((event & EV_TIMEOUT)) { 4262 /* Connection timed out. */ 4263 cleanup_tcp_handler(data); 4264 return; 4265 } 4266 4267 assert((event & EV_WRITE)); 4268 4269 if(data->shake_state != tls_hs_none) { 4270 if(!tls_handshake(data, fd, 1)) 4271 return; 4272 if(data->shake_state != tls_hs_none) 4273 return; 4274 } 4275 4276 (void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE); 4277 4278 /* If we are writing the start of a message, we must include the length 4279 * this is done with a copy into write_buffer. */ 4280 write_buffer = NULL; 4281 if (data->bytes_transmitted == 0) { 4282 if(!global_tls_temp_buffer) { 4283 /* gets deallocated when nsd shuts down from 4284 * nsd.region */ 4285 global_tls_temp_buffer = buffer_create(nsd.region, 4286 QIOBUFSZ + sizeof(q->tcplen)); 4287 if (!global_tls_temp_buffer) { 4288 return; 4289 } 4290 } 4291 write_buffer = global_tls_temp_buffer; 4292 buffer_clear(write_buffer); 4293 buffer_write_u16(write_buffer, q->tcplen); 4294 buffer_write(write_buffer, buffer_current(q->packet), 4295 (int)buffer_remaining(q->packet)); 4296 buffer_flip(write_buffer); 4297 } else { 4298 write_buffer = q->packet; 4299 } 4300 4301 /* Write the response */ 4302 ERR_clear_error(); 4303 sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer)); 4304 if(sent <= 0) { 4305 int want = SSL_get_error(data->tls, sent); 4306 if(want == SSL_ERROR_ZERO_RETURN) { 4307 cleanup_tcp_handler(data); 4308 /* closed */ 4309 } else if(want == SSL_ERROR_WANT_READ) { 4310 /* switch back to reading */ 4311 data->shake_state = tls_hs_read_event; 4312 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT); 4313 } else if(want != SSL_ERROR_WANT_WRITE) { 4314 cleanup_tcp_handler(data); 4315 log_crypto_err("could not SSL_write"); 4316 } 4317 return; 4318 } 4319 4320 buffer_skip(write_buffer, sent); 4321 if(buffer_remaining(write_buffer) != 0) { 4322 /* If not all sent, sync up the real buffer if it wasn't used.*/ 4323 if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) { 4324 buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen)); 4325 } 4326 } 4327 4328 data->bytes_transmitted += sent; 4329 if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) { 4330 /* 4331 * Still more data to write when socket becomes 4332 * writable again. 4333 */ 4334 return; 4335 } 4336 4337 assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen)); 4338 4339 if (data->query_state == QUERY_IN_AXFR) { 4340 /* Continue processing AXFR and writing back results. */ 4341 buffer_clear(q->packet); 4342 data->query_state = query_axfr(data->nsd, q); 4343 if (data->query_state != QUERY_PROCESSED) { 4344 query_add_optional(data->query, data->nsd); 4345 4346 /* Reset data. */ 4347 buffer_flip(q->packet); 4348 q->tcplen = buffer_remaining(q->packet); 4349 data->bytes_transmitted = 0; 4350 /* Reset to writing mode. */ 4351 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT); 4352 4353 /* 4354 * Write data if/when the socket is writable 4355 * again. 4356 */ 4357 return; 4358 } 4359 } 4360 4361 /* 4362 * Done sending, wait for the next request to arrive on the 4363 * TCP socket by installing the TCP read handler. 4364 */ 4365 if ((data->nsd->tcp_query_count > 0 && 4366 data->query_count >= data->nsd->tcp_query_count) || 4367 data->tcp_no_more_queries) { 4368 4369 (void) shutdown(fd, SHUT_WR); 4370 } 4371 4372 data->bytes_transmitted = 0; 4373 4374 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT); 4375 } 4376 #endif 4377 4378 static void 4379 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event), 4380 void* ATTR_UNUSED(arg)) 4381 { 4382 if(slowaccept) { 4383 configure_handler_event_types(EV_PERSIST | EV_READ); 4384 slowaccept = 0; 4385 } 4386 } 4387 4388 static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen) 4389 { 4390 #ifndef HAVE_ACCEPT4 4391 int s = accept(fd, addr, addrlen); 4392 if (s != -1) { 4393 if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) { 4394 log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno)); 4395 close(s); 4396 s = -1; 4397 errno=EINTR; /* stop error printout as error in accept4 4398 by setting this errno, it omits printout, in 4399 later code that calls nsd_accept4 */ 4400 } 4401 } 4402 return s; 4403 #else 4404 return accept4(fd, addr, addrlen, SOCK_NONBLOCK); 4405 #endif /* HAVE_ACCEPT4 */ 4406 } 4407 4408 /* 4409 * Handle an incoming TCP connection. The connection is accepted and 4410 * a new TCP reader event handler is added. The TCP handler 4411 * is responsible for cleanup when the connection is closed. 4412 */ 4413 static void 4414 handle_tcp_accept(int fd, short event, void* arg) 4415 { 4416 struct tcp_accept_handler_data *data 4417 = (struct tcp_accept_handler_data *) arg; 4418 int s; 4419 int reject = 0; 4420 struct tcp_handler_data *tcp_data; 4421 region_type *tcp_region; 4422 #ifdef INET6 4423 struct sockaddr_storage addr; 4424 #else 4425 struct sockaddr_in addr; 4426 #endif 4427 socklen_t addrlen; 4428 struct timeval timeout; 4429 4430 if (!(event & EV_READ)) { 4431 return; 4432 } 4433 4434 if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) { 4435 reject = data->nsd->options->tcp_reject_overflow; 4436 if (!reject) { 4437 return; 4438 } 4439 } 4440 4441 /* Accept it... */ 4442 addrlen = sizeof(addr); 4443 s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen); 4444 if (s == -1) { 4445 /** 4446 * EMFILE and ENFILE is a signal that the limit of open 4447 * file descriptors has been reached. Pause accept(). 4448 * EINTR is a signal interrupt. The others are various OS ways 4449 * of saying that the client has closed the connection. 4450 */ 4451 if (errno == EMFILE || errno == ENFILE) { 4452 if (!slowaccept) { 4453 /* disable accept events */ 4454 struct timeval tv; 4455 configure_handler_event_types(0); 4456 tv.tv_sec = SLOW_ACCEPT_TIMEOUT; 4457 tv.tv_usec = 0L; 4458 memset(&slowaccept_event, 0, 4459 sizeof(slowaccept_event)); 4460 event_set(&slowaccept_event, -1, EV_TIMEOUT, 4461 handle_slowaccept_timeout, NULL); 4462 (void)event_base_set(data->event.ev_base, 4463 &slowaccept_event); 4464 (void)event_add(&slowaccept_event, &tv); 4465 slowaccept = 1; 4466 /* We don't want to spam the logs here */ 4467 } 4468 } else if (errno != EINTR 4469 && errno != EWOULDBLOCK 4470 #ifdef ECONNABORTED 4471 && errno != ECONNABORTED 4472 #endif /* ECONNABORTED */ 4473 #ifdef EPROTO 4474 && errno != EPROTO 4475 #endif /* EPROTO */ 4476 ) { 4477 log_msg(LOG_ERR, "accept failed: %s", strerror(errno)); 4478 } 4479 return; 4480 } 4481 4482 if (reject) { 4483 shutdown(s, SHUT_RDWR); 4484 close(s); 4485 return; 4486 } 4487 4488 /* 4489 * This region is deallocated when the TCP connection is 4490 * closed by the TCP handler. 4491 */ 4492 tcp_region = region_create(xalloc, free); 4493 tcp_data = (struct tcp_handler_data *) region_alloc( 4494 tcp_region, sizeof(struct tcp_handler_data)); 4495 tcp_data->region = tcp_region; 4496 tcp_data->query = query_create(tcp_region, compressed_dname_offsets, 4497 compression_table_size, compressed_dnames); 4498 tcp_data->nsd = data->nsd; 4499 tcp_data->query_count = 0; 4500 #ifdef HAVE_SSL 4501 tcp_data->shake_state = tls_hs_none; 4502 tcp_data->tls = NULL; 4503 #endif 4504 tcp_data->prev = NULL; 4505 tcp_data->next = NULL; 4506 4507 tcp_data->query_state = QUERY_PROCESSED; 4508 tcp_data->bytes_transmitted = 0; 4509 memcpy(&tcp_data->query->addr, &addr, addrlen); 4510 tcp_data->query->addrlen = addrlen; 4511 4512 tcp_data->tcp_no_more_queries = 0; 4513 tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000; 4514 if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) { 4515 /* very busy, give smaller timeout */ 4516 tcp_data->tcp_timeout = 200; 4517 } 4518 memset(&tcp_data->event, 0, sizeof(tcp_data->event)); 4519 timeout.tv_sec = tcp_data->tcp_timeout / 1000; 4520 timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000; 4521 4522 #ifdef USE_DNSTAP 4523 /* save the address of the connection */ 4524 tcp_data->socket = data->socket; 4525 #endif /* USE_DNSTAP */ 4526 4527 #ifdef HAVE_SSL 4528 if (data->tls_accept) { 4529 tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s); 4530 if(!tcp_data->tls) { 4531 close(s); 4532 return; 4533 } 4534 tcp_data->shake_state = tls_hs_read; 4535 memset(&tcp_data->event, 0, sizeof(tcp_data->event)); 4536 event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT, 4537 handle_tls_reading, tcp_data); 4538 } else { 4539 #endif 4540 memset(&tcp_data->event, 0, sizeof(tcp_data->event)); 4541 event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT, 4542 handle_tcp_reading, tcp_data); 4543 #ifdef HAVE_SSL 4544 } 4545 #endif 4546 if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) { 4547 log_msg(LOG_ERR, "cannot set tcp event base"); 4548 close(s); 4549 region_destroy(tcp_region); 4550 return; 4551 } 4552 if(event_add(&tcp_data->event, &timeout) != 0) { 4553 log_msg(LOG_ERR, "cannot add tcp to event base"); 4554 close(s); 4555 region_destroy(tcp_region); 4556 return; 4557 } 4558 if(tcp_active_list) { 4559 tcp_active_list->prev = tcp_data; 4560 tcp_data->next = tcp_active_list; 4561 } 4562 tcp_active_list = tcp_data; 4563 4564 /* 4565 * Keep track of the total number of TCP handlers installed so 4566 * we can stop accepting connections when the maximum number 4567 * of simultaneous TCP connections is reached. 4568 * 4569 * If tcp-reject-overflow is enabled, however, then we do not 4570 * change the handler event type; we keep it as-is and accept 4571 * overflow TCP connections only so that we can forcibly kill 4572 * them off. 4573 */ 4574 ++data->nsd->current_tcp_count; 4575 if (!data->nsd->options->tcp_reject_overflow && 4576 data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) 4577 { 4578 configure_handler_event_types(0); 4579 } 4580 } 4581 4582 static void 4583 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout) 4584 { 4585 size_t i; 4586 assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0); 4587 for (i = 0; i < nsd->child_count; ++i) { 4588 if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) { 4589 if (write(nsd->children[i].child_fd, 4590 &command, 4591 sizeof(command)) == -1) 4592 { 4593 if(errno != EAGAIN && errno != EINTR) 4594 log_msg(LOG_ERR, "problems sending command %d to server %d: %s", 4595 (int) command, 4596 (int) nsd->children[i].pid, 4597 strerror(errno)); 4598 } else if (timeout > 0) { 4599 (void)block_read(NULL, 4600 nsd->children[i].child_fd, 4601 &command, sizeof(command), timeout); 4602 } 4603 fsync(nsd->children[i].child_fd); 4604 close(nsd->children[i].child_fd); 4605 nsd->children[i].child_fd = -1; 4606 } 4607 } 4608 } 4609 4610 static void 4611 send_children_quit(struct nsd* nsd) 4612 { 4613 DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit")); 4614 send_children_command(nsd, NSD_QUIT, 0); 4615 } 4616 4617 static void 4618 send_children_quit_and_wait(struct nsd* nsd) 4619 { 4620 DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait")); 4621 send_children_command(nsd, NSD_QUIT_CHILD, 3); 4622 } 4623 4624 #ifdef BIND8_STATS 4625 static void 4626 set_children_stats(struct nsd* nsd) 4627 { 4628 size_t i; 4629 assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0); 4630 DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children")); 4631 for (i = 0; i < nsd->child_count; ++i) { 4632 nsd->children[i].need_to_send_STATS = 1; 4633 nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE; 4634 } 4635 } 4636 #endif /* BIND8_STATS */ 4637 4638 static void 4639 configure_handler_event_types(short event_types) 4640 { 4641 size_t i; 4642 4643 for (i = 0; i < tcp_accept_handler_count; ++i) { 4644 struct event* handler = &tcp_accept_handlers[i].event; 4645 if(event_types) { 4646 /* reassign */ 4647 int fd = handler->ev_fd; 4648 struct event_base* base = handler->ev_base; 4649 if(tcp_accept_handlers[i].event_added) 4650 event_del(handler); 4651 memset(handler, 0, sizeof(*handler)); 4652 event_set(handler, fd, event_types, 4653 handle_tcp_accept, &tcp_accept_handlers[i]); 4654 if(event_base_set(base, handler) != 0) 4655 log_msg(LOG_ERR, "conhand: cannot event_base"); 4656 if(event_add(handler, NULL) != 0) 4657 log_msg(LOG_ERR, "conhand: cannot event_add"); 4658 tcp_accept_handlers[i].event_added = 1; 4659 } else { 4660 /* remove */ 4661 if(tcp_accept_handlers[i].event_added) { 4662 event_del(handler); 4663 tcp_accept_handlers[i].event_added = 0; 4664 } 4665 } 4666 } 4667 } 4668