/*
 * server.c -- nsd(8) network input/output
 *
 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
 *
 * See LICENSE for the license.
 *
 */

#include "config.h"

#include <sys/types.h>
#include <sys/param.h>
#include <limits.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <netinet/in.h>
#ifdef USE_TCP_FASTOPEN
#include <netinet/tcp.h>
#endif
#include <arpa/inet.h>

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include <netdb.h>
#include <poll.h>
#ifdef HAVE_SYS_RANDOM_H
#include <sys/random.h>
#endif
#ifndef SHUT_WR
#define SHUT_WR 1
#endif
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif /* HAVE_MMAP */
#ifdef HAVE_OPENSSL_RAND_H
#include <openssl/rand.h>
#endif
#ifdef HAVE_OPENSSL_SSL_H
#include <openssl/ssl.h>
#endif
#ifdef HAVE_OPENSSL_ERR_H
#include <openssl/err.h>
#endif
#ifdef HAVE_OPENSSL_OCSP_H
#include <openssl/ocsp.h>
#endif
#ifndef USE_MINI_EVENT
#  ifdef HAVE_EVENT_H
#    include <event.h>
#  else
#    include <event2/event.h>
#    include "event2/event_struct.h"
#    include "event2/event_compat.h"
#  endif
#else
#  include "mini_event.h"
#endif

#include "axfr.h"
#include "namedb.h"
#include "netio.h"
#include "xfrd.h"
#include "xfrd-tcp.h"
#include "xfrd-disk.h"
#include "difffile.h"
#include "nsec3.h"
#include "ipc.h"
#include "udb.h"
#include "remote.h"
#include "lookup3.h"
#include "rrl.h"
#ifdef USE_DNSTAP
#include "dnstap/dnstap_collector.h"
#endif

#define RELOAD_SYNC_TIMEOUT 25 /* seconds */

#ifdef USE_DNSTAP
/*
 * log_addr() - print the content of a sockaddr_in/sockaddr_in6 structure,
 * just like it is done in Unbound via the same
 * log_addr(VERB_LEVEL, const char*, sockaddr_storage*)
 */
static void
log_addr(const char* descr,
#ifdef INET6
	struct sockaddr_storage* addr
#else
	struct sockaddr_in* addr
#endif
	)
{
	char str_buf[64];
	if(verbosity < 6)
		return;
	if(
#ifdef INET6
		addr->ss_family == AF_INET
#else
		addr->sin_family == AF_INET
#endif
		) {
		struct sockaddr_in* s = (struct sockaddr_in*)addr;
		inet_ntop(AF_INET, &s->sin_addr.s_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s->sin_port)));
#ifdef INET6
	} else {
		struct sockaddr_in6* s6 = (struct sockaddr_in6*)addr;
		inet_ntop(AF_INET6, &s6->sin6_addr.s6_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s6->sin6_port)));
#endif
	}
}
#endif /* USE_DNSTAP */

#ifdef USE_TCP_FASTOPEN
#define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
#define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
#endif

/*
 * Data for the UDP handlers.
 */
struct udp_handler_data
{
	struct nsd *nsd;
	struct nsd_socket *socket;
	struct event event;
};

struct tcp_accept_handler_data {
	struct nsd *nsd;
	struct nsd_socket *socket;
	int event_added;
	struct event event;
#ifdef HAVE_SSL
	/* handler accepts TLS connections on the dedicated port */
	int tls_accept;
#endif
};

/*
 * These globals are used to enable the TCP accept handlers
 * when the number of TCP connections drops below the maximum
 * number of TCP connections.
 */
static size_t tcp_accept_handler_count;
static struct tcp_accept_handler_data *tcp_accept_handlers;

static struct event slowaccept_event;
static int slowaccept;

#ifdef HAVE_SSL
static unsigned char *ocspdata = NULL;
static long ocspdata_len = 0;
#endif

#ifdef NONBLOCKING_IS_BROKEN
/* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
   read multiple times from a socket when reported ready by select. */
# define NUM_RECV_PER_SELECT (1)
#else /* !NONBLOCKING_IS_BROKEN */
# define NUM_RECV_PER_SELECT (100)
#endif /* NONBLOCKING_IS_BROKEN */

#ifndef HAVE_MMSGHDR
struct mmsghdr {
	struct msghdr msg_hdr;
	unsigned int msg_len;
};
#endif

static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
static struct iovec iovecs[NUM_RECV_PER_SELECT];
static struct query *queries[NUM_RECV_PER_SELECT];
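
/*
 * The three parallel arrays above support batched datagram reception.
 * Where recvmmsg(2) is available, one event-loop wakeup reads up to
 * NUM_RECV_PER_SELECT packets at once, with iovecs[i] pointing into the
 * buffer of queries[i].  A minimal sketch of the pattern, assuming a
 * hypothetical prepared buffer array "base" and handler "process()"
 * (illustrative only, not the exact code used later in this file):
 *
 *	for(i = 0; i < NUM_RECV_PER_SELECT; i++) {
 *		iovecs[i].iov_base = base[i];
 *		iovecs[i].iov_len = bufsz;
 *		msgs[i].msg_hdr.msg_iov = &iovecs[i];
 *		msgs[i].msg_hdr.msg_iovlen = 1;
 *	}
 *	n = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
 *	for(i = 0; i < n; i++)
 *		process(queries[i], msgs[i].msg_len);
 *
 * The struct mmsghdr fallback above only supplies the type so the same
 * code compiles where the syscall is absent.
 */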

/*
 * Data for the TCP connection handlers.
 *
 * The TCP handlers use non-blocking I/O. This is necessary to avoid
 * blocking the entire server on a slow TCP connection, but does make
 * reading from and writing to the socket more complicated.
 *
 * Basically, whenever a read/write would block (indicated by the
 * EAGAIN errno variable) we remember the position we were reading
 * from/writing to and return from the TCP reading/writing event
 * handler. When the socket becomes readable/writable again we
 * continue from the same position.
 */
struct tcp_handler_data
{
	/*
	 * The region used to allocate all TCP connection related
	 * data, including this structure. This region is destroyed
	 * when the connection is closed.
	 */
	region_type* region;

	/*
	 * The global nsd structure.
	 */
	struct nsd* nsd;

	/*
	 * The current query data for this TCP connection.
	 */
	query_type* query;

	/*
	 * The query_state is used to remember if we are performing an
	 * AXFR, if we're done processing, or if we should discard the
	 * query and connection.
	 */
	query_state_type query_state;

	/*
	 * The event for the file descriptor and tcp timeout
	 */
	struct event event;

	/*
	 * The bytes_transmitted field is used to remember the number
	 * of bytes transmitted when receiving or sending a DNS
	 * packet. The count includes the two additional bytes used
	 * to specify the packet length on a TCP connection.
	 */
	size_t bytes_transmitted;

	/*
	 * The number of queries handled by this specific TCP connection.
	 */
	int query_count;

	/*
	 * The timeout in msec for this tcp connection
	 */
	int tcp_timeout;

	/*
	 * If the connection is allowed to have further queries on it.
	 */
	int tcp_no_more_queries;

#ifdef USE_DNSTAP
	/* the accept socket, used to find the proper service (local)
	 * address this socket is bound to. */
	struct nsd_socket *socket;
#endif /* USE_DNSTAP */

#ifdef HAVE_SSL
	/*
	 * TLS object.
	 */
	SSL* tls;

	/*
	 * TLS handshake state.
	 */
	enum { tls_hs_none, tls_hs_read, tls_hs_write,
		tls_hs_read_event, tls_hs_write_event } shake_state;
#endif
	/* list of connections, for service of remaining tcp channels */
	struct tcp_handler_data *prev, *next;
};
/* global that is the list of active tcp channels */
static struct tcp_handler_data *tcp_active_list = NULL;

/*
 * Handle incoming queries on the UDP server sockets.
 */
static void handle_udp(int fd, short event, void* arg);

/*
 * Handle incoming connections on the TCP sockets. These handlers
 * usually wait for the NETIO_EVENT_READ event (indicating an incoming
 * connection) but are disabled when the number of current TCP
 * connections is equal to the maximum number of TCP connections.
 * Disabling is done by changing the handler to wait for the
 * NETIO_EVENT_NONE type. This is done using the function
 * configure_tcp_accept_handlers.
 */
static void handle_tcp_accept(int fd, short event, void* arg);

/*
 * Handle incoming queries on a TCP connection. The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete query is received.
 */
static void handle_tcp_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TCP connection. The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete response is sent.
 */
static void handle_tcp_writing(int fd, short event, void* arg);

#ifdef HAVE_SSL
/* Create SSL object and associate fd */
static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
/*
 * Handle TLS handshake. May be called multiple times if incomplete.
 */
static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);

/*
 * Handle incoming queries on a TLS over TCP connection. The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete query is received.
 */
static void handle_tls_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TLS over TCP connection. The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete response is sent.
 */
static void handle_tls_writing(int fd, short event, void* arg);
#endif

/*
 * Send all children the quit nonblocking, then close pipe.
 */
static void send_children_quit(struct nsd* nsd);
/* same, for shutdown time, waits for child to exit to avoid restart issues */
static void send_children_quit_and_wait(struct nsd* nsd);

/* set children's flags to send NSD_STATS to them */
#ifdef BIND8_STATS
static void set_children_stats(struct nsd* nsd);
#endif /* BIND8_STATS */

/*
 * Change the event types the HANDLERS are interested in to EVENT_TYPES.
 */
static void configure_handler_event_types(short event_types);

static uint16_t *compressed_dname_offsets = 0;
static uint32_t compression_table_capacity = 0;
static uint32_t compression_table_size = 0;
static domain_type* compressed_dnames[MAXRRSPP];
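
/*
 * The handlers declared above follow the libevent callback signature
 * (fd, event bits, user argument).  A minimal sketch of how such a
 * handler is attached to a socket, assuming an event_base "base" and a
 * struct udp_handler_data "data" (illustrative only; the actual wiring
 * is done elsewhere in this file):
 *
 *	struct event ev;
 *	event_set(&ev, fd, EV_READ | EV_PERSIST, handle_udp, &data);
 *	if(event_base_set(base, &ev) != 0 || event_add(&ev, NULL) != 0)
 *		log_msg(LOG_ERR, "cannot add event");
 *
 * The same calls work with mini_event when USE_MINI_EVENT is defined,
 * as it provides this API subset.
 */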

#ifdef USE_TCP_FASTOPEN
/* Checks to see if the kernel value must be manually changed in order for
   TCP Fast Open to support server mode */
static void report_tcp_fastopen_config(void) {

	int tcp_fastopen_fp;
	uint8_t tcp_fastopen_value;

	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n",
			strerror(errno));
		return;
	}
	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n",
			strerror(errno));
		close(tcp_fastopen_fp);
		return;
	}
	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
		log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n");
		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
		log_msg(LOG_WARNING, "To enable TFO use the command:");
		log_msg(LOG_WARNING, " 'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
		log_msg(LOG_WARNING, " 'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
	}
	close(tcp_fastopen_fp);
}
#endif

/*
 * Remove the specified pid from the list of child pids. Returns -1 if
 * the pid is not in the list, child_num otherwise. The field is set to 0.
 */
static int
delete_child_pid(struct nsd *nsd, pid_t pid)
{
	size_t i;
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid == pid) {
			nsd->children[i].pid = 0;
			if(!nsd->children[i].need_to_exit) {
				if(nsd->children[i].child_fd != -1)
					close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
				if(nsd->children[i].handler)
					nsd->children[i].handler->fd = -1;
			}
			return i;
		}
	}
	return -1;
}

/*
 * Restart child servers if necessary.
 */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	struct main_ipc_handler_data *ipc_data;
	size_t i;
	int sv[2];

	/* Fork the child processes... */
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid <= 0) {
			if (nsd->children[i].child_fd != -1)
				close(nsd->children[i].child_fd);
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
				log_msg(LOG_ERR, "socketpair: %s",
					strerror(errno));
				return -1;
			}
			nsd->children[i].child_fd = sv[0];
			nsd->children[i].parent_fd = sv[1];
			nsd->children[i].pid = fork();
			switch (nsd->children[i].pid) {
			default: /* SERVER MAIN */
				close(nsd->children[i].parent_fd);
				nsd->children[i].parent_fd = -1;
				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				if(!nsd->children[i].handler)
				{
					ipc_data = (struct main_ipc_handler_data*) region_alloc(
						region, sizeof(struct main_ipc_handler_data));
					ipc_data->nsd = nsd;
					ipc_data->child = &nsd->children[i];
					ipc_data->child_num = i;
					ipc_data->xfrd_sock = xfrd_sock_p;
					ipc_data->packet = buffer_create(region, QIOBUFSZ);
					ipc_data->forward_mode = 0;
					ipc_data->got_bytes = 0;
					ipc_data->total_bytes = 0;
					ipc_data->acl_num = 0;
					nsd->children[i].handler = (struct netio_handler*) region_alloc(
						region, sizeof(struct netio_handler));
					nsd->children[i].handler->fd = nsd->children[i].child_fd;
					nsd->children[i].handler->timeout = NULL;
					nsd->children[i].handler->user_data = ipc_data;
					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
					nsd->children[i].handler->event_handler = parent_handle_child_command;
					netio_add_handler(netio, nsd->children[i].handler);
				}
				/* clear any ongoing ipc */
				ipc_data = (struct main_ipc_handler_data*)
					nsd->children[i].handler->user_data;
				ipc_data->forward_mode = 0;
				/* restart - update fd */
				nsd->children[i].handler->fd = nsd->children[i].child_fd;
				break;
			case 0: /* CHILD */
				/* the child need not be able to access the
				 * nsd.db file */
				namedb_close_udb(nsd->db);
#ifdef MEMCLEAN /* OS collects memory pages */
				region_destroy(region);
#endif

				if (pledge("stdio rpath inet", NULL) == -1) {
					log_msg(LOG_ERR, "pledge");
					exit(1);
				}

				nsd->pid = 0;
				nsd->child_count = 0;
				nsd->server_kind = nsd->children[i].kind;
				nsd->this_child = &nsd->children[i];
				nsd->this_child->child_num = i;
				/* remove signal flags inherited from parent
				   the parent will handle them. */
				nsd->signal_hint_reload_hup = 0;
				nsd->signal_hint_reload = 0;
				nsd->signal_hint_child = 0;
				nsd->signal_hint_quit = 0;
				nsd->signal_hint_shutdown = 0;
				nsd->signal_hint_stats = 0;
				nsd->signal_hint_statsusr = 0;
				close(*xfrd_sock_p);
				close(nsd->this_child->child_fd);
				nsd->this_child->child_fd = -1;
				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				server_child(nsd);
				/* NOTREACHED */
				exit(0);
			case -1:
				log_msg(LOG_ERR, "fork failed: %s",
					strerror(errno));
				return -1;
			}
		}
	}
	return 0;
}
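
/*
 * The restart logic above is the classic socketpair-then-fork IPC
 * pattern: each side closes the end it does not own, leaving one
 * full-duplex channel per child.  A minimal sketch of the pattern,
 * assuming a single child and a hypothetical child_main() entry point
 * (illustrative only):
 *
 *	int sv[2];
 *	if(socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1)
 *		return -1;
 *	switch(fork()) {
 *	case -1: return -1;
 *	case 0:              // child: keep sv[1]
 *		close(sv[0]);
 *		child_main(sv[1]);
 *	default:             // parent: keep sv[0]
 *		close(sv[1]);
 *	}
 *
 * Both ends are set O_NONBLOCK above so a stalled peer cannot block the
 * main event loop.
 */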

#ifdef BIND8_STATS
static void set_bind8_alarm(struct nsd* nsd)
{
	/* resync so that the next alarm is on the next whole minute */
	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
}
#endif
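
/*
 * A quick worked example of the resync arithmetic above: with a
 * statistics period of 60 seconds and time(NULL) % 60 == 25, the call
 * becomes alarm(60 - 25) == alarm(35), so SIGALRM fires exactly on the
 * next period boundary rather than a full period from now.
 */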

/* set zone stat ids for zones initially read in */
static void
zonestatid_tree_set(struct nsd* nsd)
{
	struct radnode* n;
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		zone_type* zone = (zone_type*)n->elem;
		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
	}
}

#ifdef USE_ZONE_STATS
void
server_zonestat_alloc(struct nsd* nsd)
{
	size_t num = (nsd->options->zonestatnames->count==0?1:
			nsd->options->zonestatnames->count);
	size_t sz = sizeof(struct nsdst)*num;
	char tmpfile[256];
	uint8_t z = 0;

	/* file names */
	nsd->zonestatfname[0] = 0;
	nsd->zonestatfname[1] = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);

	/* file descriptors */
	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[0] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		close(nsd->zonestatfd[0]);
		unlink(nsd->zonestatfname[0]);
		exit(1);
	}

#ifdef HAVE_MMAP
	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[0], strerror(errno));
		exit(1);
	}
	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[1], strerror(errno));
		exit(1);
	}
	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[0], 0);
	if(nsd->zonestat[0] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[1], 0);
	if(nsd->zonestat[1] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->zonestat[0], 0, sz);
	memset(nsd->zonestat[1], 0, sz);
	nsd->zonestatsize[0] = num;
	nsd->zonestatsize[1] = num;
	nsd->zonestatdesired = num;
	nsd->zonestatsizenow = num;
	nsd->zonestatnow = nsd->zonestat[0];
#endif /* HAVE_MMAP */
}

void
zonestat_remap(struct nsd* nsd, int idx, size_t sz)
{
#ifdef HAVE_MMAP
#ifdef MREMAP_MAYMOVE
	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
		MREMAP_MAYMOVE);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
		exit(1);
	}
#else /* !MREMAP_MAYMOVE */
	if(msync(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
	if(munmap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		exit(1);
	}
#endif /* MREMAP_MAYMOVE */
#endif /* HAVE_MMAP */
}

/* realloc the zonestat array for the one that is not currently in use,
 * to match the desired new size of the array (if applicable) */
void
server_zonestat_realloc(struct nsd* nsd)
{
#ifdef HAVE_MMAP
	uint8_t z = 0;
	size_t sz;
	int idx = 0; /* index of the zonestat array that is not in use */
	if(nsd->zonestatnow == nsd->zonestat[0])
		idx = 1;
	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
		return;
	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[idx], strerror(errno));
		exit(1);
	}
	zonestat_remap(nsd, idx, sz);
	/* zero the newly allocated region */
	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
	}
	nsd->zonestatsize[idx] = nsd->zonestatdesired;
#endif /* HAVE_MMAP */
}

/* switch over to the other array for the new children, which briefly
 * coexist with the old children; we want to avoid them both writing
 * to the same statistics array. */
void
server_zonestat_switch(struct nsd* nsd)
{
	if(nsd->zonestatnow == nsd->zonestat[0]) {
		nsd->zonestatnow = nsd->zonestat[1];
		nsd->zonestatsizenow = nsd->zonestatsize[1];
	} else {
		nsd->zonestatnow = nsd->zonestat[0];
		nsd->zonestatsizenow = nsd->zonestatsize[0];
	}
}
#endif /* USE_ZONE_STATS */
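
/*
 * Note on the scheme above: the zone statistics are double buffered.
 * Two files are mmap()ed MAP_SHARED so the forked children and the
 * parent see the same counters; zonestatnow points at the active array.
 * On reload the parent grows the idle array (server_zonestat_realloc)
 * and flips to it (server_zonestat_switch), so old and new children
 * never update the same array.  The lseek()+write() of one zero byte
 * before each mmap() is the portable way to extend a file to the
 * mapped size.
 */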

static void
cleanup_dname_compression_tables(void *ptr)
{
	free(ptr);
	compressed_dname_offsets = NULL;
	compression_table_capacity = 0;
}

static void
initialize_dname_compression_tables(struct nsd *nsd)
{
	size_t needed = domain_table_count(nsd->db->domains) + 1;
	needed += EXTRA_DOMAIN_NUMBERS;
	if(compression_table_capacity < needed) {
		if(compressed_dname_offsets) {
			region_remove_cleanup(nsd->db->region,
				cleanup_dname_compression_tables,
				compressed_dname_offsets);
			free(compressed_dname_offsets);
		}
		compressed_dname_offsets = (uint16_t *) xmallocarray(
			needed, sizeof(uint16_t));
		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
			compressed_dname_offsets);
		compression_table_capacity = needed;
		compression_table_size=domain_table_count(nsd->db->domains)+1;
	}
	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
}
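
/*
 * Background on the table above: compressed_dname_offsets is per-domain
 * scratch space for RFC 1035 name compression.  It is indexed by the
 * number assigned to each domain in the domain table and stores the
 * packet offset at which that name was last written, so a later
 * occurrence can emit a two-byte compression pointer instead of the
 * full name.  Entry 0 is preset to QHEADERSZ because the query name
 * always directly follows the 12-byte DNS header.
 */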
failed: %s", 803 strerror(errno)); 804 return -1; 805 #else /* !SO_RCVBUFFORCE */ 806 if (0 == setsockopt( 807 sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv))) 808 { 809 return 1; 810 } 811 if(errno == ENOSYS || errno == ENOBUFS) { 812 return 0; 813 } 814 log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s", 815 strerror(errno)); 816 return -1; 817 #endif /* SO_RCVBUFFORCE */ 818 #endif /* SO_RCVBUF */ 819 820 return 0; 821 } 822 823 static int 824 set_sndbuf(struct nsd_socket *sock, int snd) 825 { 826 #ifdef SO_SNDBUF 827 #ifdef SO_SNDBUFFORCE 828 if(0 == setsockopt( 829 sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd))) 830 { 831 return 1; 832 } 833 if(errno == EPERM || errno == ENOBUFS) { 834 return 0; 835 } 836 log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s", 837 strerror(errno)); 838 return -1; 839 #else /* !SO_SNDBUFFORCE */ 840 if(0 == setsockopt( 841 sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd))) 842 { 843 return 1; 844 } 845 if(errno == ENOSYS || errno == ENOBUFS) { 846 return 0; 847 } 848 log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s", 849 strerror(errno)); 850 return -1; 851 #endif /* SO_SNDBUFFORCE */ 852 #endif /* SO_SNDBUF */ 853 854 return 0; 855 } 856 857 static int 858 set_nonblock(struct nsd_socket *sock) 859 { 860 const char *socktype = 861 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 862 863 if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) { 864 log_msg(LOG_ERR, "fctnl(..., O_NONBLOCK) failed for %s: %s", 865 socktype, strerror(errno)); 866 return -1; 867 } 868 869 return 1; 870 } 871 872 #ifdef INET6 873 static int 874 set_ipv6_v6only(struct nsd_socket *sock) 875 { 876 #ifdef IPV6_V6ONLY 877 int on = 1; 878 const char *socktype = 879 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 880 881 if(0 == setsockopt( 882 sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on))) 883 { 884 return 1; 885 } 886 887 log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s", 888 socktype, strerror(errno)); 889 return -1; 890 #else 891 (void)sock; 892 #endif /* IPV6_V6ONLY */ 893 894 return 0; 895 } 896 #endif /* INET6 */ 897 898 #ifdef INET6 899 static int 900 set_ipv6_use_min_mtu(struct nsd_socket *sock) 901 { 902 #if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU) 903 #if defined(IPV6_USE_MIN_MTU) 904 /* There is no fragmentation of IPv6 datagrams during forwarding in the 905 * network. Therefore we do not send UDP datagrams larger than the 906 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be 907 * larger if the network stack supports IPV6_USE_MIN_MTU. 908 */ 909 int opt = IPV6_USE_MIN_MTU; 910 int optval = 1; 911 static const char optname[] = "IPV6_USE_MIN_MTU"; 912 #elif defined(IPV6_MTU) 913 /* On Linux, PMTUD is disabled by default for datagrams so set the MTU 914 * to the MIN MTU to get the same. 915 */ 916 int opt = IPV6_MTU; 917 int optval = IPV6_MIN_MTU; 918 static const char optname[] = "IPV6_MTU"; 919 #endif 920 if(0 == setsockopt( 921 sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval))) 922 { 923 return 1; 924 } 925 926 log_msg(LOG_ERR, "setsockopt(..., %s, ...) 
failed: %s", 927 optname, strerror(errno)); 928 return -1; 929 #else 930 (void)sock; 931 #endif /* INET6 */ 932 933 return 0; 934 } 935 #endif /* INET6 */ 936 937 static int 938 set_ipv4_no_pmtu_disc(struct nsd_socket *sock) 939 { 940 int ret = 0; 941 942 #if defined(IP_MTU_DISCOVER) 943 int opt = IP_MTU_DISCOVER; 944 int optval; 945 # if defined(IP_PMTUDISC_OMIT) 946 /* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU 947 * information and send packets with DF=0. Fragmentation is allowed if 948 * and only if the packet size exceeds the outgoing interface MTU or 949 * the packet encounters smaller MTU link in network. This mitigates 950 * DNS fragmentation attacks by preventing forged PMTU information. 951 * FreeBSD already has same semantics without setting the option. 952 */ 953 optval = IP_PMTUDISC_OMIT; 954 if(0 == setsockopt( 955 sock->s, IPPROTO_IP, opt, &optval, sizeof(optval))) 956 { 957 return 1; 958 } 959 960 log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s", 961 "IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno)); 962 # endif /* IP_PMTUDISC_OMIT */ 963 # if defined(IP_PMTUDISC_DONT) 964 /* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */ 965 optval = IP_PMTUDISC_DONT; 966 if(0 == setsockopt( 967 sock->s, IPPROTO_IP, opt, &optval, sizeof(optval))) 968 { 969 return 1; 970 } 971 972 log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s", 973 "IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno)); 974 # endif 975 ret = -1; 976 #elif defined(IP_DONTFRAG) 977 int off = 0; 978 if (0 == setsockopt( 979 sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off))) 980 { 981 return 1; 982 } 983 984 log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s", 985 strerror(errno)); 986 ret = -1; 987 #else 988 (void)sock; 989 #endif 990 991 return ret; 992 } 993 994 static int 995 set_ip_freebind(struct nsd_socket *sock) 996 { 997 #ifdef IP_FREEBIND 998 int on = 1; 999 const char *socktype = 1000 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 1001 if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0) 1002 { 1003 return 1; 1004 } 1005 log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s", 1006 socktype, strerror(errno)); 1007 return -1; 1008 #else 1009 (void)sock; 1010 #endif /* IP_FREEBIND */ 1011 1012 return 0; 1013 } 1014 1015 static int 1016 set_ip_transparent(struct nsd_socket *sock) 1017 { 1018 /* 1019 The scandalous preprocessor blob here calls for some explanation :) 1020 POSIX does not specify an option to bind non-local IPs, so 1021 platforms developed several implementation-specific options, 1022 all set in the same way, but with different names. 1023 For additional complexity, some platform manage this setting 1024 differently for different address families (IPv4 vs IPv6). 1025 This scandalous preprocessor blob below abstracts such variability 1026 in the way which leaves the C code as lean and clear as possible. 

static int
set_ip_freebind(struct nsd_socket *sock)
{
#ifdef IP_FREEBIND
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
	{
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IP_FREEBIND */

	return 0;
}

static int
set_ip_transparent(struct nsd_socket *sock)
{
	/*
	  The scandalous preprocessor blob here calls for some explanation :)
	  POSIX does not specify an option to bind non-local IPs, so
	  platforms developed several implementation-specific options,
	  all set in the same way, but with different names.
	  For additional complexity, some platforms manage this setting
	  differently for different address families (IPv4 vs IPv6).
	  The scandalous preprocessor blob below abstracts such variability
	  in a way that leaves the C code as lean and clear as possible.
	*/

#if defined(IP_TRANSPARENT)
#	define NSD_SOCKET_OPTION_TRANSPARENT IP_TRANSPARENT
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_TRANSPARENT"
	/* as of 2020-01, Linux does not support this on IPv6 programmatically */
#elif defined(SO_BINDANY)
#	define NSD_SOCKET_OPTION_TRANSPARENT SO_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL SOL_SOCKET
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME "SO_BINDANY"
#elif defined(IP_BINDANY)
#	define NSD_SOCKET_OPTION_TRANSPARENT IP_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT6 IPV6_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 IPPROTO_IPV6
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_BINDANY"
#endif

#ifndef NSD_SOCKET_OPTION_TRANSPARENT
	(void)sock;
#else
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT6
#		define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
#	endif
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
#		define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
#	endif
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
#		define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
#	endif

	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	const int is_ip6 = (sock->addr.ai_family == AF_INET6);

	if(0 == setsockopt(
		sock->s,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
		&on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME,
		socktype, strerror(errno));
	return -1;
#endif

	return 0;
}
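
/*
 * A concrete expansion of the abstraction above: on Linux
 * (IP_TRANSPARENT defined, no per-family split) the setsockopt() call
 * in set_ip_transparent() reduces, for both address families, to:
 *
 *	setsockopt(sock->s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on));
 *
 * while on FreeBSD (IP_BINDANY/IPV6_BINDANY) an IPv6 socket instead gets:
 *
 *	setsockopt(sock->s, IPPROTO_IPV6, IPV6_BINDANY, &on, sizeof(on));
 */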

static int
set_tcp_maxseg(struct nsd_socket *sock, int mss)
{
#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
		strerror(errno));
	return -1;
#else
	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
#endif
	return 0;
}

#ifdef USE_TCP_FASTOPEN
static int
set_tcp_fastopen(struct nsd_socket *sock)
{
	/* qlen specifies how many outstanding TFO requests to allow. Limit is
	 * a defense against IP spoofing attacks as suggested in RFC 7413.
	 */
	int qlen;

#ifdef __APPLE__
	/* the macOS implementation only supports qlen of 1 via this call. The
	 * actual value is configured by the net.inet.tcp.fastopen_backlog
	 * kernel parameter.
	 */
	qlen = 1;
#else
	/* 5 is recommended on Linux. */
	qlen = 5;
#endif
	if (0 == setsockopt(
		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
	{
		return 1;
	}

	if (errno == EPERM) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s "
				 "; this could likely be because sysctl "
				 "net.inet.tcp.fastopen.enabled, "
				 "net.inet.tcp.fastopen.server_enable, or "
				 "net.ipv4.tcp_fastopen is disabled",
			strerror(errno));
	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
	 * disabled, except when verbosity enabled for debugging
	 */
	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
			strerror(errno));
	}

	return (errno == ENOPROTOOPT ? 0 : -1);
}
#endif /* USE_TCP_FASTOPEN */

static int
set_bindtodevice(struct nsd_socket *sock)
{
#if defined(SO_BINDTODEVICE)
	if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
		sock->device, strlen(sock->device)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
			"SO_BINDTODEVICE", sock->device, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}

static int
set_setfib(struct nsd_socket *sock)
{
#if defined(SO_SETFIB)
	if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
		(const void *)&sock->fib, sizeof(sock->fib)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
			"SO_SETFIB", sock->fib, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}

static int
open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
	int rcv = 1*1024*1024, snd = 1*1024*1024;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
				"not supported");
			return 0;
		}
#endif
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	if(nsd->options->receive_buffer_size > 0)
		rcv = nsd->options->receive_buffer_size;
	if(set_rcvbuf(sock, rcv) == -1)
		return -1;

	if(nsd->options->send_buffer_size > 0)
		snd = nsd->options->send_buffer_size;
	if(set_sndbuf(sock, snd) == -1)
		return -1;
#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if(set_ipv6_v6only(sock) == -1 ||
		   set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	} else
#endif /* INET6 */
	if(sock->addr.ai_family == AF_INET) {
		if(set_ipv4_no_pmtu_disc(sock) == -1)
			return -1;
	}

	/* Set socket to non-blocking. Otherwise, on operating systems
	 * with thundering herd problems, the UDP recv could block
	 * after select returns readable.
	 */
	set_nonblock(sock);

	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

	return 1;
}
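
/*
 * Like the option helpers earlier in this file, the open_*_socket()
 * functions use a tri-state return: 1 on success, 0 when the socket is
 * optional and the address family is unsupported (so the caller skips
 * it), and -1 on a hard error.  *reuseport_works starts out as 1 and is
 * latched to 0 the first time SO_REUSEPORT cannot be set, which lets
 * server_init() below decide whether per-server sockets are usable.
 */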

static int
open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
#ifdef USE_TCP_FASTOPEN
	report_tcp_fastopen_config();
#endif

	(void)reuseport_works;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
				"not supported");
			return 0;
		}
#endif /* INET6 */
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	(void)set_reuseaddr(sock);

#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if (set_ipv6_v6only(sock) == -1 ||
		    set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	}
#endif

	if(nsd->tcp_mss > 0)
		set_tcp_maxseg(sock, nsd->tcp_mss);
	/* (StevensUNP p463), if TCP listening socket is blocking, then
	   it may block in accept, even if select() says readable. */
	(void)set_nonblock(sock);
	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

#ifdef USE_TCP_FASTOPEN
	(void)set_tcp_fastopen(sock);
#endif

	if(listen(sock->s, TCP_BACKLOG) == -1) {
		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
		return -1;
	}

	return 1;
}

/*
 * Initialize the server, reuseport, create and bind the sockets.
 */
int
server_init(struct nsd *nsd)
{
	size_t i;
	int reuseport = 1; /* Determine if REUSEPORT works. */

	/* open server interface ports */
	for(i = 0; i < nsd->ifs; i++) {
		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
		{
			return -1;
		}
	}

	if(nsd->reuseport && reuseport) {
		size_t ifs = nsd->ifs * nsd->reuseport;

		/* increase the size of the interface arrays, there are going
		 * to be separate interface file descriptors for every server
		 * instance */
		region_remove_cleanup(nsd->region, free, nsd->udp);
		region_remove_cleanup(nsd->region, free, nsd->tcp);

		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
		region_add_cleanup(nsd->region, free, nsd->udp);
		region_add_cleanup(nsd->region, free, nsd->tcp);
		if(ifs > nsd->ifs) {
			memset(&nsd->udp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->udp));
			memset(&nsd->tcp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
		}

		for(i = nsd->ifs; i < ifs; i++) {
			nsd->udp[i] = nsd->udp[i%nsd->ifs];
			nsd->udp[i].s = -1;
			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
				return -1;
			}
			/* Turn off REUSEPORT for TCP by copying the socket
			 * file descriptor.
			 * This means we should not close TCP used by
			 * other servers in reuseport enabled mode, in
			 * server_child().
			 */
			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
		}

		nsd->ifs = ifs;
	} else {
		nsd->reuseport = 0;
	}

	return 0;
}
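
/*
 * A worked example of the reuseport expansion above: with 2 configured
 * interfaces and reuseport: 4 in nsd.conf, ifs becomes 2*4 = 8.  Slots
 * 0..1 keep the already-open sockets; for slots 2..7 the address of
 * interface i%2 is copied and a fresh UDP socket is opened per slot, so
 * every server process gets its own SO_REUSEPORT UDP descriptor, while
 * the TCP descriptor of interface i%2 is shared by all of them.
 */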

/*
 * Prepare the server for take off.
 *
 */
int
server_prepare(struct nsd *nsd)
{
#ifdef RATELIMIT
	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
#ifdef HAVE_GETRANDOM
	uint32_t v;
	if(getrandom(&v, sizeof(v), 0) == -1) {
		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
		exit(1);
	}
	hash_set_raninit(v);
#elif defined(HAVE_ARC4RANDOM)
	hash_set_raninit(arc4random());
#else
	uint32_t v = getpid() ^ time(NULL);
	srandom((unsigned long)v);
# ifdef HAVE_SSL
	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
		hash_set_raninit(v);
	else
# endif
		hash_set_raninit(random());
#endif
	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
		nsd->options->rrl_ratelimit,
		nsd->options->rrl_whitelist_ratelimit,
		nsd->options->rrl_slip,
		nsd->options->rrl_ipv4_prefix_length,
		nsd->options->rrl_ipv6_prefix_length);
#endif /* RATELIMIT */

	/* Open the database... */
	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
		log_msg(LOG_ERR, "unable to open the database %s: %s",
			nsd->dbfile, strerror(errno));
		unlink(nsd->task[0]->fname);
		unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		return -1;
	}
	/* check if zone files have been modified */
	/* NULL for taskudb because we send soainfo in a moment, batched up,
	 * for all zones */
	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
		nsd->options->database[0] == 0))
		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
	zonestatid_tree_set(nsd);

	compression_table_capacity = 0;
	initialize_dname_compression_tables(nsd);

#ifdef BIND8_STATS
	/* Initialize times... */
	time(&nsd->st.boot);
	set_bind8_alarm(nsd);
#endif /* BIND8_STATS */

	return 0;
}

/*
 * Fork the required number of servers.
 */
static int
server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	size_t i;

	/* Start all child servers initially. */
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].pid = 0;
	}

	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
}

static void
server_close_socket(struct nsd_socket *sock)
{
	if(sock->s != -1) {
		close(sock->s);
		sock->s = -1;
	}
}

void
server_close_all_sockets(struct nsd_socket sockets[], size_t n)
{
	size_t i;

	/* Close all the sockets... */
	for (i = 0; i < n; ++i) {
		server_close_socket(&sockets[i]);
	}
}

/*
 * Close the sockets, shutdown the server and exit.
 * Does not return.
 */
void
server_shutdown(struct nsd *nsd)
{
	size_t i;

	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	/* CHILD: close command channel to parent */
	if(nsd->this_child && nsd->this_child->parent_fd != -1)
	{
		close(nsd->this_child->parent_fd);
		nsd->this_child->parent_fd = -1;
	}
	/* SERVER: close command channels to children */
	if(!nsd->this_child)
	{
		for(i=0; i < nsd->child_count; ++i)
			if(nsd->children[i].child_fd != -1)
			{
				close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
			}
	}

	tsig_finalize();
#ifdef HAVE_SSL
	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
	if (nsd->tls_ctx)
		SSL_CTX_free(nsd->tls_ctx);
#endif

#ifdef MEMCLEAN /* OS collects memory pages */
#ifdef RATELIMIT
	rrl_mmap_deinit_keep_mmap();
#endif
#ifdef USE_DNSTAP
	dt_collector_destroy(nsd->dt_collector, nsd);
#endif
	udb_base_free_keep_mmap(nsd->task[0]);
	udb_base_free_keep_mmap(nsd->task[1]);
	namedb_close_udb(nsd->db); /* keeps mmap */
	namedb_close(nsd->db);
	nsd_options_destroy(nsd->options);
	region_destroy(nsd->region);
#endif
	log_finalize();
	exit(0);
}

void
server_prepare_xfrd(struct nsd* nsd)
{
	char tmpfile[256];
	/* create task mmaps */
	nsd->mytask = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[0] = task_file_create(tmpfile);
	if(!nsd->task[0]) {
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[1] = task_file_create(tmpfile);
	if(!nsd->task[1]) {
		unlink(nsd->task[0]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
	/* create xfrd listener structure */
	nsd->xfrd_listener = region_alloc(nsd->region,
		sizeof(netio_handler_type));
	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
	nsd->xfrd_listener->fd = -1;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
		nsd;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
		xfrd_tcp_create(nsd->region, QIOBUFSZ);
}


void
server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
{
	pid_t pid;
	int sockets[2] = {0,0};
	struct ipc_handler_conn_data *data;

	if(nsd->xfrd_listener->fd != -1)
		close(nsd->xfrd_listener->fd);
	if(del_db) {
		/* recreate taskdb that xfrd was using, it may be corrupt */
		/* we (or reload) use nsd->mytask, and xfrd uses the other */
		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
		nsd->task[1-nsd->mytask]->fname = NULL;
		/* free alloc already, so udb does not shrink itself */
		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
		nsd->task[1-nsd->mytask]->alloc = NULL;
		udb_base_free(nsd->task[1-nsd->mytask]);
		/* create new file, overwrite the old one */
		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
		free(tmpfile);
	}
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
		return;
	}
	pid = fork();
	switch (pid) {
	case -1:
		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
		break;
	default:
		/* PARENT: close first socket, use second one */
		close(sockets[0]);
		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		if(del_db) xfrd_free_namedb(nsd);
		/* use other task than I am using, since if xfrd died and is
		 * restarted, the reload is using nsd->mytask */
		nsd->mytask = 1 - nsd->mytask;

#ifdef HAVE_SETPROCTITLE
		setproctitle("xfrd");
#endif
#ifdef HAVE_CPUSET_T
		if(nsd->use_cpu_affinity) {
			set_cpu_affinity(nsd->xfrd_cpuset);
		}
#endif

		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
		/* NOTREACHED */
		break;
	case 0:
		/* CHILD: close second socket, use first one */
		close(sockets[1]);
		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		nsd->xfrd_listener->fd = sockets[0];
		break;
	}
	/* server-parent only */
	nsd->xfrd_listener->timeout = NULL;
	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
	/* clear ongoing ipc reads */
	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
	data->conn->is_reading = 0;
}

/** add all soainfo to taskdb */
static void
add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
{
	struct radnode* n;
	udb_ptr task_last; /* last task, mytask is empty so NULL */
	/* add all SOA INFO to mytask */
	udb_ptr_init(&task_last, taskudb);
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
	}
	udb_ptr_unlink(&task_last, taskudb);
}

void
server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
{
	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
	 * parent fills one taskdb with soas, xfrd fills other with expires.
	 * then they exchange and process.
	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
	 * may be in use by reload. Fill SOA in taskdb and give to xfrd.
	 * expire notifications can be sent back via a normal reload later
	 * (xfrd will wait for current running reload to finish if any).
	 */
	sig_atomic_t cmd = 0;
	pid_t mypid;
	int xfrd_sock = nsd->xfrd_listener->fd;
	struct udb_base* taskudb = nsd->task[nsd->mytask];
	udb_ptr t;
	if(!shortsoa) {
		if(nsd->signal_hint_shutdown) {
		shutdown:
			log_msg(LOG_WARNING, "signal received, shutting down...");
			server_close_all_sockets(nsd->udp, nsd->ifs);
			server_close_all_sockets(nsd->tcp, nsd->ifs);
#ifdef HAVE_SSL
			daemon_remote_close(nsd->rc);
#endif
			/* Unlink it if possible... */
			unlinkpid(nsd->pidfile);
			unlink(nsd->task[0]->fname);
			unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
			unlink(nsd->zonestatfname[0]);
			unlink(nsd->zonestatfname[1]);
#endif
			/* write the nsd.db to disk, wait for it to complete */
			udb_base_sync(nsd->db->udb, 1);
			udb_base_close(nsd->db->udb);
			server_shutdown(nsd);
			/* NOTREACHED */
			exit(0);
		}
	}
	if(shortsoa) {
		/* put SOA in xfrd task because mytask may be in use */
		taskudb = nsd->task[1-nsd->mytask];
	}

	add_all_soa_to_task(nsd, taskudb);
	if(!shortsoa) {
		/* wait for xfrd to signal task is ready, RELOAD signal */
		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
			cmd != NSD_RELOAD) {
			log_msg(LOG_ERR, "did not get start signal from xfrd");
			exit(1);
		}
		if(nsd->signal_hint_shutdown) {
			goto shutdown;
		}
	}
	/* give xfrd our task, signal it with RELOAD_DONE */
	task_process_sync(taskudb);
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
			(int)nsd->pid, strerror(errno));
	}
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	if(!shortsoa) {
		/* process the xfrd task works (expiry data) */
		nsd->mytask = 1 - nsd->mytask;
		taskudb = nsd->task[nsd->mytask];
		task_remap(taskudb);
		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
		while(!udb_ptr_is_null(&t)) {
			task_process_expire(nsd->db, TASKLIST(&t));
			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
		}
		udb_ptr_unlink(&t, taskudb);
		task_clear(taskudb);

		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
		cmd = NSD_RELOAD_DONE;
		if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) {
			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
				(int)nsd->pid, strerror(errno));
		}
	}
}
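
/*
 * The control flow above amounts to a small fixed handshake over the
 * xfrd socketpair (full exchange, shortsoa == 0):
 *
 *	xfrd   -> parent : NSD_RELOAD       (task ready)
 *	parent -> xfrd   : NSD_RELOAD_DONE  (SOA info written, tasks swapped)
 *	parent -> xfrd   : pid_t of reload  (so xfrd can track the process)
 *	parent processes the expiry tasks xfrd left in the other taskdb
 *	parent -> xfrd   : NSD_RELOAD_DONE  (taskdb emptied, xfrd may reuse)
 *
 * With shortsoa != 0 (xfrd restarted) the wait for NSD_RELOAD and the
 * expiry processing are skipped; the SOA info and pid are still sent.
 */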

#ifdef HAVE_SSL
static void
log_crypto_from_err(const char* str, unsigned long err)
{
	/* error:[error code]:[library name]:[function name]:[reason string] */
	char buf[128];
	unsigned long e;
	ERR_error_string_n(err, buf, sizeof(buf));
	log_msg(LOG_ERR, "%s crypto %s", str, buf);
	while( (e=ERR_get_error()) ) {
		ERR_error_string_n(e, buf, sizeof(buf));
		log_msg(LOG_ERR, "and additionally crypto %s", buf);
	}
}

void
log_crypto_err(const char* str)
{
	log_crypto_from_err(str, ERR_get_error());
}

/** true if the ssl handshake error has to be squelched from the logs */
static int
squelch_err_ssl_handshake(unsigned long err)
{
	if(verbosity >= 3)
		return 0; /* only squelch on low verbosity */
	/* this is very specific, we could filter on ERR_GET_REASON()
	 * (the third element in ERR_PACK) */
	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
#ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
#endif
#ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
# ifdef SSL_R_VERSION_TOO_LOW
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
# endif
#endif
		)
		return 1;
	return 0;
}

void
perform_openssl_init(void)
{
	/* init SSL library */
#ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
	ERR_load_crypto_strings();
#endif
	ERR_load_SSL_strings();
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
	OpenSSL_add_all_algorithms();
#else
	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
		| OPENSSL_INIT_ADD_ALL_DIGESTS
		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
#endif
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
	(void)SSL_library_init();
#else
	OPENSSL_init_ssl(0, NULL);
#endif

	if(!RAND_status()) {
		/* try to seed it */
		unsigned char buf[256];
		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
		size_t i;
		v = seed;
		for(i=0; i<256/sizeof(v); i++) {
			memmove(buf+i*sizeof(v), &v, sizeof(v));
			v = v*seed + (unsigned int)i;
		}
		RAND_seed(buf, 256);
		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
	}
}

static int
get_ocsp(char *filename, unsigned char **ocsp)
{
	BIO *bio;
	OCSP_RESPONSE *response;
	int len = -1;
	unsigned char *p, *buf;
	assert(filename);

	if ((bio = BIO_new_file(filename, "r")) == NULL) {
		log_crypto_err("get_ocsp: BIO_new_file failed");
		return -1;
	}

	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
		BIO_free(bio);
		return -1;
	}

	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
		OCSP_RESPONSE_free(response);
		BIO_free(bio);
		return -1;
	}

	if ((buf = malloc((size_t) len)) == NULL) {
		log_msg(LOG_ERR, "get_ocsp: malloc failed");
		OCSP_RESPONSE_free(response);
		BIO_free(bio);
		return -1;
	}

	p = buf;
	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
		free(buf);
		OCSP_RESPONSE_free(response);
		BIO_free(bio);
		return -1;
	}

	OCSP_RESPONSE_free(response);
	BIO_free(bio);

	*ocsp = buf;
	return len;
}

/* further setup ssl ctx after the keys are loaded */
static void
listen_sslctx_setup_2(void* ctxt)
{
	SSL_CTX* ctx = (SSL_CTX*)ctxt;
	(void)ctx;
#if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO
	if(!SSL_CTX_set_ecdh_auto(ctx,1)) {
		/* NOTREACHED */
		log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE");
	}
#elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME)
	if(1) {
		EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1);
		if (!ecdh) {
			log_crypto_err("could not find p256, not enabling ECDHE");
		} else {
			if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) {
				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
			}
			EC_KEY_free (ecdh);
		}
	}
#endif
}

static int
add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
{
	if(ocspdata) {
		unsigned char *p;
		if ((p=malloc(ocspdata_len)) == NULL) {
			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
			return SSL_TLSEXT_ERR_NOACK;
		}
		memcpy(p, ocspdata, ocspdata_len);
		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
			free(p);
			return SSL_TLSEXT_ERR_NOACK;
		}
		return SSL_TLSEXT_ERR_OK;
	} else {
		return SSL_TLSEXT_ERR_NOACK;
	}
}
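
/*
 * Note: the chain of SSL_OP_NO_SSLv2/SSLv3/TLSv1/TLSv1_1 options set in
 * server_tls_ctx_setup() below is the portable way to pin the minimum
 * protocol version on older OpenSSL.  On OpenSSL 1.1.0 and later the
 * same effect could be had in one call, for example:
 *
 *	if(!SSL_CTX_set_min_proto_version(ctx, TLS1_2_VERSION))
 *		log_crypto_err("could not set minimum TLS version");
 *
 * The option chain is presumably kept for compatibility with older
 * OpenSSL releases.
 */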
(ctx, ecdh)) { 1923 log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE"); 1924 } 1925 EC_KEY_free (ecdh); 1926 } 1927 } 1928 #endif 1929 } 1930 1931 static int 1932 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg)) 1933 { 1934 if(ocspdata) { 1935 unsigned char *p; 1936 if ((p=malloc(ocspdata_len)) == NULL) { 1937 log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure"); 1938 return SSL_TLSEXT_ERR_NOACK; 1939 } 1940 memcpy(p, ocspdata, ocspdata_len); 1941 if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) { 1942 log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp"); 1943 free(p); 1944 return SSL_TLSEXT_ERR_NOACK; 1945 } 1946 return SSL_TLSEXT_ERR_OK; 1947 } else { 1948 return SSL_TLSEXT_ERR_NOACK; 1949 } 1950 } 1951 1952 SSL_CTX* 1953 server_tls_ctx_setup(char* key, char* pem, char* verifypem) 1954 { 1955 SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method()); 1956 if(!ctx) { 1957 log_crypto_err("could not SSL_CTX_new"); 1958 return NULL; 1959 } 1960 /* no SSLv2, SSLv3 because they have defects */ 1961 #if SSL_OP_NO_SSLv2 != 0 1962 if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){ 1963 log_crypto_err("could not set SSL_OP_NO_SSLv2"); 1964 SSL_CTX_free(ctx); 1965 return NULL; 1966 } 1967 #endif 1968 if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3) 1969 != SSL_OP_NO_SSLv3){ 1970 log_crypto_err("could not set SSL_OP_NO_SSLv3"); 1971 SSL_CTX_free(ctx); 1972 return NULL; 1973 } 1974 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1) 1975 /* if we have tls 1.1 disable 1.0 */ 1976 if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1) 1977 != SSL_OP_NO_TLSv1){ 1978 log_crypto_err("could not set SSL_OP_NO_TLSv1"); 1979 SSL_CTX_free(ctx); 1980 return NULL; 1981 } 1982 #endif 1983 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2) 1984 /* if we have tls 1.2 disable 1.1 */ 1985 if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1) 1986 != SSL_OP_NO_TLSv1_1){ 1987 log_crypto_err("could not set SSL_OP_NO_TLSv1_1"); 1988 SSL_CTX_free(ctx); 1989 return NULL; 1990 } 1991 #endif 1992 #if defined(SSL_OP_NO_RENEGOTIATION) 1993 /* disable client renegotiation */ 1994 if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) & 1995 SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) { 1996 log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION"); 1997 SSL_CTX_free(ctx); 1998 return NULL; 1999 } 2000 #endif 2001 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20) 2002 /* if we have sha256, set the cipher list to have no known vulns */ 2003 if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20")) 2004 log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list"); 2005 #endif 2006 if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) & 2007 SSL_OP_CIPHER_SERVER_PREFERENCE) != 2008 SSL_OP_CIPHER_SERVER_PREFERENCE) { 2009 log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE"); 2010 SSL_CTX_free(ctx); 2011 return NULL; 2012 } 2013 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL 2014 SSL_CTX_set_security_level(ctx, 0); 2015 #endif 2016 if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) { 2017 log_msg(LOG_ERR, "error for cert file: %s", pem); 2018 log_crypto_err("error in SSL_CTX use_certificate_chain_file"); 2019 SSL_CTX_free(ctx); 2020 return NULL; 2021 } 2022 if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) { 2023 log_msg(LOG_ERR, "error for private key file: %s", key); 2024 log_crypto_err("Error in SSL_CTX use_PrivateKey_file"); 2025 SSL_CTX_free(ctx); 2026 return NULL; 2027
} 2028 if(!SSL_CTX_check_private_key(ctx)) { 2029 log_msg(LOG_ERR, "error for key file: %s", key); 2030 log_crypto_err("Error in SSL_CTX check_private_key"); 2031 SSL_CTX_free(ctx); 2032 return NULL; 2033 } 2034 listen_sslctx_setup_2(ctx); 2035 if(verifypem && verifypem[0]) { 2036 if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) { 2037 log_crypto_err("Error in SSL_CTX verify locations"); 2038 SSL_CTX_free(ctx); 2039 return NULL; 2040 } 2041 SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem)); 2042 SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL); 2043 } 2044 return ctx; 2045 } 2046 2047 SSL_CTX* 2048 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile) 2049 { 2050 char *key, *pem; 2051 SSL_CTX *ctx; 2052 2053 key = nsd->options->tls_service_key; 2054 pem = nsd->options->tls_service_pem; 2055 if(!key || key[0] == 0) { 2056 log_msg(LOG_ERR, "error: no tls-service-key file specified"); 2057 return NULL; 2058 } 2059 if(!pem || pem[0] == 0) { 2060 log_msg(LOG_ERR, "error: no tls-service-pem file specified"); 2061 return NULL; 2062 } 2063 2064 /* NOTE: This mimics the existing code in Unbound 1.5.1 by supporting SSL, but 2065 * draft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2 */ 2066 ctx = server_tls_ctx_setup(key, pem, verifypem); 2067 if(!ctx) { 2068 log_msg(LOG_ERR, "could not setup server TLS context"); 2069 return NULL; 2070 } 2071 if(ocspfile && ocspfile[0]) { 2072 if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) { 2073 log_crypto_err("Error reading OCSPfile"); 2074 SSL_CTX_free(ctx); 2075 return NULL; 2076 } else { 2077 VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile)); 2078 if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) { 2079 log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb"); 2080 SSL_CTX_free(ctx); 2081 return NULL; 2082 } 2083 } 2084 } 2085 return ctx; 2086 } 2087 2088 /* check if tcp_accept_handler_data is created for the dedicated TLS port */ 2089 int 2090 using_tls_port(struct sockaddr* addr, const char* tls_port) 2091 { 2092 in_port_t port = 0; 2093 2094 if (addr->sa_family == AF_INET) 2095 port = ((struct sockaddr_in*)addr)->sin_port; 2096 #ifdef HAVE_STRUCT_SOCKADDR_IN6 2097 else 2098 port = ((struct sockaddr_in6*)addr)->sin6_port; 2099 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */ 2100 if (atoi(tls_port) == ntohs(port)) 2101 return 1; 2102 2103 return 0; 2104 } 2105 #endif 2106 2107 /* pass timeout=-1 for blocking.
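   The timeout is given in seconds; block_read converts it to milliseconds for poll().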
Returns size, 0, -1(err), or -2(timeout) */ 2108 ssize_t 2109 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout) 2110 { 2111 uint8_t* buf = (uint8_t*) p; 2112 ssize_t total = 0; 2113 struct pollfd fd; 2114 memset(&fd, 0, sizeof(fd)); 2115 fd.fd = s; 2116 fd.events = POLLIN; 2117 2118 while( total < sz) { 2119 ssize_t ret; 2120 ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000); 2121 if(ret == -1) { 2122 if(errno == EAGAIN) 2123 /* blocking read */ 2124 continue; 2125 if(errno == EINTR) { 2126 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) 2127 return -1; 2128 /* other signals can be handled later */ 2129 continue; 2130 } 2131 /* some error */ 2132 return -1; 2133 } 2134 if(ret == 0) { 2135 /* operation timed out */ 2136 return -2; 2137 } 2138 ret = read(s, buf+total, sz-total); 2139 if(ret == -1) { 2140 if(errno == EAGAIN) 2141 /* blocking read */ 2142 continue; 2143 if(errno == EINTR) { 2144 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) 2145 return -1; 2146 /* other signals can be handled later */ 2147 continue; 2148 } 2149 /* some error */ 2150 return -1; 2151 } 2152 if(ret == 0) { 2153 /* closed connection! */ 2154 return 0; 2155 } 2156 total += ret; 2157 } 2158 return total; 2159 } 2160 2161 static void 2162 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket) 2163 { 2164 sig_atomic_t cmd = NSD_QUIT_SYNC; 2165 udb_ptr t, next; 2166 udb_base* u = nsd->task[nsd->mytask]; 2167 udb_ptr_init(&next, u); 2168 udb_ptr_new(&t, u, udb_base_get_userdata(u)); 2169 udb_base_set_userdata(u, 0); 2170 while(!udb_ptr_is_null(&t)) { 2171 /* store next in list so this one can be deleted or reused */ 2172 udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next); 2173 udb_rptr_zero(&TASKLIST(&t)->next, u); 2174 2175 /* process task t */ 2176 /* append results for task t and update last_task */ 2177 task_process_in_reload(nsd, u, last_task, &t); 2178 2179 /* go to next */ 2180 udb_ptr_set_ptr(&t, u, &next); 2181 2182 /* if the parent has quit, we must quit too, poll the fd for cmds */ 2183 if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) { 2184 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd)); 2185 if(cmd == NSD_QUIT) { 2186 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd")); 2187 /* sync to disk (if needed) */ 2188 udb_base_sync(nsd->db->udb, 0); 2189 /* unlink files of remainder of tasks */ 2190 while(!udb_ptr_is_null(&t)) { 2191 if(TASKLIST(&t)->task_type == task_apply_xfr) { 2192 xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno); 2193 } 2194 udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next); 2195 } 2196 udb_ptr_unlink(&t, u); 2197 udb_ptr_unlink(&next, u); 2198 exit(0); 2199 } 2200 } 2201 2202 } 2203 udb_ptr_unlink(&t, u); 2204 udb_ptr_unlink(&next, u); 2205 } 2206 2207 #ifdef BIND8_STATS 2208 static void 2209 parent_send_stats(struct nsd* nsd, int cmdfd) 2210 { 2211 size_t i; 2212 if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) { 2213 log_msg(LOG_ERR, "could not write stats to reload"); 2214 return; 2215 } 2216 for(i=0; i<nsd->child_count; i++) 2217 if(!write_socket(cmdfd, &nsd->children[i].query_count, 2218 sizeof(stc_type))) { 2219 log_msg(LOG_ERR, "could not write stats to reload"); 2220 return; 2221 } 2222 } 2223 2224 static void 2225 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last) 2226 { 2227 struct nsdst s; 2228 stc_type* p; 2229 size_t i; 2230 if(block_read(nsd, cmdfd, &s, sizeof(s), 2231 RELOAD_SYNC_TIMEOUT) != sizeof(s)) { 2232 log_msg(LOG_ERR, "could not read stats 
from oldpar"); 2233 return; 2234 } 2235 s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0); 2236 s.db_mem = region_get_mem(nsd->db->region); 2237 p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s, 2238 nsd->child_count); 2239 if(!p) return; 2240 for(i=0; i<nsd->child_count; i++) { 2241 if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!= 2242 sizeof(stc_type)) 2243 return; 2244 } 2245 } 2246 #endif /* BIND8_STATS */ 2247 2248 /* 2249 * Reload the database, stop parent, re-fork children and continue. 2250 * as server_main. 2251 */ 2252 static void 2253 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio, 2254 int cmdsocket) 2255 { 2256 pid_t mypid; 2257 sig_atomic_t cmd = NSD_QUIT_SYNC; 2258 int ret; 2259 udb_ptr last_task; 2260 struct sigaction old_sigchld, ign_sigchld; 2261 /* ignore SIGCHLD from the previous server_main that used this pid */ 2262 memset(&ign_sigchld, 0, sizeof(ign_sigchld)); 2263 ign_sigchld.sa_handler = SIG_IGN; 2264 sigaction(SIGCHLD, &ign_sigchld, &old_sigchld); 2265 2266 #ifdef HAVE_SETPROCTITLE 2267 setproctitle("main"); 2268 #endif 2269 #ifdef HAVE_CPUSET_T 2270 if(nsd->use_cpu_affinity) { 2271 set_cpu_affinity(nsd->cpuset); 2272 } 2273 #endif 2274 2275 /* see what tasks we got from xfrd */ 2276 task_remap(nsd->task[nsd->mytask]); 2277 udb_ptr_init(&last_task, nsd->task[nsd->mytask]); 2278 udb_compact_inhibited(nsd->db->udb, 1); 2279 reload_process_tasks(nsd, &last_task, cmdsocket); 2280 udb_compact_inhibited(nsd->db->udb, 0); 2281 udb_compact(nsd->db->udb); 2282 2283 #ifndef NDEBUG 2284 if(nsd_debug_level >= 1) 2285 region_log_stats(nsd->db->region); 2286 #endif /* NDEBUG */ 2287 /* sync to disk (if needed) */ 2288 udb_base_sync(nsd->db->udb, 0); 2289 2290 initialize_dname_compression_tables(nsd); 2291 2292 #ifdef BIND8_STATS 2293 /* Restart dumping stats if required. */ 2294 time(&nsd->st.boot); 2295 set_bind8_alarm(nsd); 2296 #endif 2297 #ifdef USE_ZONE_STATS 2298 server_zonestat_realloc(nsd); /* realloc for new children */ 2299 server_zonestat_switch(nsd); 2300 #endif 2301 2302 /* listen for the signals of failed children again */ 2303 sigaction(SIGCHLD, &old_sigchld, NULL); 2304 #ifdef USE_DNSTAP 2305 if (nsd->dt_collector) { 2306 int *swap_fd_send; 2307 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: swap dnstap collector pipes")); 2308 /* Swap fd_send with fd_swap so old serve child and new serve 2309 * childs will not write to the same pipe ends simultaneously */ 2310 swap_fd_send = nsd->dt_collector_fd_send; 2311 nsd->dt_collector_fd_send = nsd->dt_collector_fd_swap; 2312 nsd->dt_collector_fd_swap = swap_fd_send; 2313 2314 } 2315 #endif 2316 /* Start new child processes */ 2317 if (server_start_children(nsd, server_region, netio, &nsd-> 2318 xfrd_listener->fd) != 0) { 2319 send_children_quit(nsd); 2320 exit(1); 2321 } 2322 2323 /* if the parent has quit, we must quit too, poll the fd for cmds */ 2324 if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) { 2325 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd)); 2326 if(cmd == NSD_QUIT) { 2327 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd")); 2328 send_children_quit(nsd); 2329 exit(0); 2330 } 2331 } 2332 2333 /* Send quit command to parent: blocking, wait for receipt. 
*/ 2334 do { 2335 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main")); 2336 if (!write_socket(cmdsocket, &cmd, sizeof(cmd))) 2337 { 2338 log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s", 2339 strerror(errno)); 2340 } 2341 /* blocking: wait for parent to really quit. (it sends RELOAD as ack) */ 2342 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main")); 2343 ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 2344 RELOAD_SYNC_TIMEOUT); 2345 if(ret == -2) { 2346 DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry")); 2347 } 2348 } while (ret == -2); 2349 if(ret == -1) { 2350 log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s", 2351 strerror(errno)); 2352 } 2353 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd)); 2354 if(cmd == NSD_QUIT) { 2355 /* small race condition possible here, parent got quit cmd. */ 2356 send_children_quit(nsd); 2357 exit(1); 2358 } 2359 assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD); 2360 #ifdef BIND8_STATS 2361 reload_do_stats(cmdsocket, nsd, &last_task); 2362 #endif 2363 udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]); 2364 task_process_sync(nsd->task[nsd->mytask]); 2365 #ifdef USE_ZONE_STATS 2366 server_zonestat_realloc(nsd); /* realloc for next children */ 2367 #endif 2368 2369 /* send soainfo to the xfrd process, signal it that reload is done, 2370 * it picks up the taskudb */ 2371 cmd = NSD_RELOAD_DONE; 2372 if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { 2373 log_msg(LOG_ERR, "problems sending reload_done xfrd: %s", 2374 strerror(errno)); 2375 } 2376 mypid = getpid(); 2377 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2378 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2379 strerror(errno)); 2380 } 2381 2382 /* try to reopen file */ 2383 if (nsd->file_rotation_ok) 2384 log_reopen(nsd->log_filename, 1); 2385 /* exit reload, continue as new server_main */ 2386 } 2387 2388 /* 2389 * Get the mode depending on the signal hints that have been received. 2390 * Multiple signal hints can be received and will be handled in turn. 2391 */ 2392 static sig_atomic_t 2393 server_signal_mode(struct nsd *nsd) 2394 { 2395 if(nsd->signal_hint_quit) { 2396 nsd->signal_hint_quit = 0; 2397 return NSD_QUIT; 2398 } 2399 else if(nsd->signal_hint_shutdown) { 2400 nsd->signal_hint_shutdown = 0; 2401 return NSD_SHUTDOWN; 2402 } 2403 else if(nsd->signal_hint_child) { 2404 nsd->signal_hint_child = 0; 2405 return NSD_REAP_CHILDREN; 2406 } 2407 else if(nsd->signal_hint_reload) { 2408 nsd->signal_hint_reload = 0; 2409 return NSD_RELOAD; 2410 } 2411 else if(nsd->signal_hint_reload_hup) { 2412 nsd->signal_hint_reload_hup = 0; 2413 return NSD_RELOAD_REQ; 2414 } 2415 else if(nsd->signal_hint_stats) { 2416 nsd->signal_hint_stats = 0; 2417 #ifdef BIND8_STATS 2418 set_bind8_alarm(nsd); 2419 #endif 2420 return NSD_STATS; 2421 } 2422 else if(nsd->signal_hint_statsusr) { 2423 nsd->signal_hint_statsusr = 0; 2424 return NSD_STATS; 2425 } 2426 return NSD_RUN; 2427 } 2428 2429 /* 2430 * The main server simply waits for signals and child processes to 2431 * terminate. Child processes are restarted as necessary. 
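 * The switch below also handles the other modes: NSD_RELOAD_REQ is
 * forwarded to the xfrd process, NSD_RELOAD forks the reload process,
 * NSD_QUIT_SYNC and NSD_QUIT synchronise shutdown with the reload
 * process, and NSD_STATS and NSD_REAP_CHILDREN are handled in place.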
2432 */ 2433 void 2434 server_main(struct nsd *nsd) 2435 { 2436 region_type *server_region = region_create(xalloc, free); 2437 netio_type *netio = netio_create(server_region); 2438 netio_handler_type reload_listener; 2439 int reload_sockets[2] = {-1, -1}; 2440 struct timespec timeout_spec; 2441 int status; 2442 pid_t child_pid; 2443 pid_t reload_pid = -1; 2444 sig_atomic_t mode; 2445 2446 /* Ensure we are the main process */ 2447 assert(nsd->server_kind == NSD_SERVER_MAIN); 2448 2449 /* Add listener for the XFRD process */ 2450 netio_add_handler(netio, nsd->xfrd_listener); 2451 2452 /* Start the child processes that handle incoming queries */ 2453 if (server_start_children(nsd, server_region, netio, 2454 &nsd->xfrd_listener->fd) != 0) { 2455 send_children_quit(nsd); 2456 exit(1); 2457 } 2458 reload_listener.fd = -1; 2459 2460 /* This_child MUST be 0, because this is the parent process */ 2461 assert(nsd->this_child == 0); 2462 2463 /* Run the server until we get a shutdown signal */ 2464 while ((mode = nsd->mode) != NSD_SHUTDOWN) { 2465 /* Did we receive a signal that changes our mode? */ 2466 if(mode == NSD_RUN) { 2467 nsd->mode = mode = server_signal_mode(nsd); 2468 } 2469 2470 switch (mode) { 2471 case NSD_RUN: 2472 /* see if any child processes terminated */ 2473 while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) { 2474 int is_child = delete_child_pid(nsd, child_pid); 2475 if (is_child != -1 && nsd->children[is_child].need_to_exit) { 2476 if(nsd->children[is_child].child_fd == -1) 2477 nsd->children[is_child].has_exited = 1; 2478 parent_check_all_children_exited(nsd); 2479 } else if(is_child != -1) { 2480 log_msg(LOG_WARNING, 2481 "server %d died unexpectedly with status %d, restarting", 2482 (int) child_pid, status); 2483 restart_child_servers(nsd, server_region, netio, 2484 &nsd->xfrd_listener->fd); 2485 } else if (child_pid == reload_pid) { 2486 sig_atomic_t cmd = NSD_RELOAD_DONE; 2487 pid_t mypid; 2488 log_msg(LOG_WARNING, 2489 "Reload process %d failed with status %d, continuing with old database", 2490 (int) child_pid, status); 2491 reload_pid = -1; 2492 if(reload_listener.fd != -1) close(reload_listener.fd); 2493 reload_listener.fd = -1; 2494 reload_listener.event_types = NETIO_EVENT_NONE; 2495 task_process_sync(nsd->task[nsd->mytask]); 2496 /* inform xfrd reload attempt ended */ 2497 if(!write_socket(nsd->xfrd_listener->fd, 2498 &cmd, sizeof(cmd))) { 2499 log_msg(LOG_ERR, "problems " 2500 "sending SOAEND to xfrd: %s", 2501 strerror(errno)); 2502 } 2503 mypid = getpid(); 2504 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2505 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2506 strerror(errno)); 2507 } 2508 #ifdef USE_DNSTAP 2509 } else if(nsd->dt_collector && child_pid == nsd->dt_collector->dt_pid) { 2510 log_msg(LOG_WARNING, 2511 "dnstap-collector %d terminated with status %d", 2512 (int) child_pid, status); 2513 if(nsd->dt_collector) { 2514 dt_collector_close(nsd->dt_collector, nsd); 2515 dt_collector_destroy(nsd->dt_collector, nsd); 2516 nsd->dt_collector = NULL; 2517 } 2518 /* Only respawn a crashed (or exited) 2519 * dnstap-collector when not reloading, 2520 * to not induce a reload during a 2521 * reload (which would seriously 2522 * disrupt nsd procedures and lead to 2523 * unpredictable results)! 
* 2525 * This will *leave* a dnstap-collector 2526 * process terminated, but because 2527 * signalling the main process to 2528 * respawn the collector in this 2529 * situation would be cumbersome, and 2530 * because this situation is so 2531 * specific (and therefore hopefully 2532 * extremely rare or non-existent 2533 * altogether), plus the fact that we are left 2534 * with a perfectly functioning NSD 2535 * (besides not logging dnstap 2536 * messages), I consider it acceptable 2537 * to leave this unresolved. 2538 */ 2539 if(reload_pid == -1 && nsd->options->dnstap_enable) { 2540 nsd->dt_collector = dt_collector_create(nsd); 2541 dt_collector_start(nsd->dt_collector, nsd); 2542 nsd->mode = NSD_RELOAD_REQ; 2543 } 2544 #endif 2545 } else if(status != 0) { 2546 /* check the status, because we also 2547 * reap the old server-main (reload 2548 * is the process-parent of old-main) 2549 * and older server processes 2550 * that exit after a reload */ 2551 log_msg(LOG_WARNING, 2552 "process %d terminated with status %d", 2553 (int) child_pid, status); 2554 } 2555 } 2556 if (child_pid == -1) { 2557 if (errno == EINTR) { 2558 continue; 2559 } 2560 if (errno != ECHILD) 2561 log_msg(LOG_WARNING, "wait failed: %s", strerror(errno)); 2562 } 2563 if (nsd->mode != NSD_RUN) 2564 break; 2565 2566 /* timeout to collect processes, in case no SIGCHLD happens */ 2567 timeout_spec.tv_sec = 60; 2568 timeout_spec.tv_nsec = 0; 2569 2570 /* listen on ports, timeout for collecting terminated children */ 2571 if(netio_dispatch(netio, &timeout_spec, 0) == -1) { 2572 if (errno != EINTR) { 2573 log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno)); 2574 } 2575 } 2576 if(nsd->restart_children) { 2577 restart_child_servers(nsd, server_region, netio, 2578 &nsd->xfrd_listener->fd); 2579 nsd->restart_children = 0; 2580 } 2581 if(nsd->reload_failed) { 2582 sig_atomic_t cmd = NSD_RELOAD_DONE; 2583 pid_t mypid; 2584 nsd->reload_failed = 0; 2585 log_msg(LOG_WARNING, 2586 "Reload process %d failed, continuing with old database", 2587 (int) reload_pid); 2588 reload_pid = -1; 2589 if(reload_listener.fd != -1) close(reload_listener.fd); 2590 reload_listener.fd = -1; 2591 reload_listener.event_types = NETIO_EVENT_NONE; 2592 task_process_sync(nsd->task[nsd->mytask]); 2593 /* inform xfrd reload attempt ended */ 2594 if(!write_socket(nsd->xfrd_listener->fd, 2595 &cmd, sizeof(cmd))) { 2596 log_msg(LOG_ERR, "problems " 2597 "sending SOAEND to xfrd: %s", 2598 strerror(errno)); 2599 } 2600 mypid = getpid(); 2601 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2602 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2603 strerror(errno)); 2604 } 2605 } 2606 2607 break; 2608 case NSD_RELOAD_REQ: { 2609 sig_atomic_t cmd = NSD_RELOAD_REQ; 2610 log_msg(LOG_WARNING, "SIGHUP received, reloading..."); 2611 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2612 "main: ipc send reload_req to xfrd")); 2613 if(!write_socket(nsd->xfrd_listener->fd, 2614 &cmd, sizeof(cmd))) { 2615 log_msg(LOG_ERR, "server_main: could not send " 2616 "reload_req to xfrd: %s", strerror(errno)); 2617 } 2618 nsd->mode = NSD_RUN; 2619 } break; 2620 case NSD_RELOAD: 2621 /* Continue to run nsd after reload */ 2622 nsd->mode = NSD_RUN; 2623 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading...")); 2624 if (reload_pid != -1) { 2625 log_msg(LOG_WARNING, "Reload already in progress (pid = %d)", 2626 (int) reload_pid); 2627 break; 2628 } 2629 2630 /* switch the mytask to keep track of who owns the task */ 2631 nsd->mytask = 1 - nsd->mytask; 2632 if
(socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) { 2633 log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno)); 2634 reload_pid = -1; 2635 break; 2636 } 2637 2638 /* Do actual reload */ 2639 reload_pid = fork(); 2640 switch (reload_pid) { 2641 case -1: 2642 log_msg(LOG_ERR, "fork failed: %s", strerror(errno)); 2643 break; 2644 default: 2645 /* PARENT */ 2646 close(reload_sockets[0]); 2647 server_reload(nsd, server_region, netio, 2648 reload_sockets[1]); 2649 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main")); 2650 close(reload_sockets[1]); 2651 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed")); 2652 /* drop stale xfrd ipc data */ 2653 ((struct ipc_handler_conn_data*)nsd-> 2654 xfrd_listener->user_data) 2655 ->conn->is_reading = 0; 2656 reload_pid = -1; 2657 reload_listener.fd = -1; 2658 reload_listener.event_types = NETIO_EVENT_NONE; 2659 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run")); 2660 break; 2661 case 0: 2662 /* CHILD */ 2663 /* server_main keep running until NSD_QUIT_SYNC 2664 * received from reload. */ 2665 close(reload_sockets[1]); 2666 reload_listener.fd = reload_sockets[0]; 2667 reload_listener.timeout = NULL; 2668 reload_listener.user_data = nsd; 2669 reload_listener.event_types = NETIO_EVENT_READ; 2670 reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */ 2671 netio_add_handler(netio, &reload_listener); 2672 reload_pid = getppid(); 2673 break; 2674 } 2675 break; 2676 case NSD_QUIT_SYNC: 2677 /* synchronisation of xfrd, parent and reload */ 2678 if(!nsd->quit_sync_done && reload_listener.fd != -1) { 2679 sig_atomic_t cmd = NSD_RELOAD; 2680 /* stop xfrd ipc writes in progress */ 2681 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2682 "main: ipc send indication reload")); 2683 if(!write_socket(nsd->xfrd_listener->fd, 2684 &cmd, sizeof(cmd))) { 2685 log_msg(LOG_ERR, "server_main: could not send reload " 2686 "indication to xfrd: %s", strerror(errno)); 2687 } 2688 /* wait for ACK from xfrd */ 2689 DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd")); 2690 nsd->quit_sync_done = 1; 2691 } 2692 nsd->mode = NSD_RUN; 2693 break; 2694 case NSD_QUIT: 2695 /* silent shutdown during reload */ 2696 if(reload_listener.fd != -1) { 2697 /* acknowledge the quit, to sync reload that we will really quit now */ 2698 sig_atomic_t cmd = NSD_RELOAD; 2699 DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload")); 2700 if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) { 2701 log_msg(LOG_ERR, "server_main: " 2702 "could not ack quit: %s", strerror(errno)); 2703 } 2704 #ifdef BIND8_STATS 2705 parent_send_stats(nsd, reload_listener.fd); 2706 #endif /* BIND8_STATS */ 2707 close(reload_listener.fd); 2708 } 2709 DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence")); 2710 /* only quit children after xfrd has acked */ 2711 send_children_quit(nsd); 2712 2713 #ifdef MEMCLEAN /* OS collects memory pages */ 2714 region_destroy(server_region); 2715 #endif 2716 server_shutdown(nsd); 2717 2718 /* ENOTREACH */ 2719 break; 2720 case NSD_SHUTDOWN: 2721 break; 2722 case NSD_REAP_CHILDREN: 2723 /* continue; wait for child in run loop */ 2724 nsd->mode = NSD_RUN; 2725 break; 2726 case NSD_STATS: 2727 #ifdef BIND8_STATS 2728 set_children_stats(nsd); 2729 #endif 2730 nsd->mode = NSD_RUN; 2731 break; 2732 default: 2733 log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode); 2734 nsd->mode = NSD_RUN; 2735 break; 2736 } 2737 } 2738 log_msg(LOG_WARNING, "signal received, shutting down..."); 2739 2740 /* close opened ports to avoid 
race with restart of nsd */ 2741 server_close_all_sockets(nsd->udp, nsd->ifs); 2742 server_close_all_sockets(nsd->tcp, nsd->ifs); 2743 #ifdef HAVE_SSL 2744 daemon_remote_close(nsd->rc); 2745 #endif 2746 send_children_quit_and_wait(nsd); 2747 2748 /* Unlink it if possible... */ 2749 unlinkpid(nsd->pidfile); 2750 unlink(nsd->task[0]->fname); 2751 unlink(nsd->task[1]->fname); 2752 #ifdef USE_ZONE_STATS 2753 unlink(nsd->zonestatfname[0]); 2754 unlink(nsd->zonestatfname[1]); 2755 #endif 2756 #ifdef USE_DNSTAP 2757 dt_collector_close(nsd->dt_collector, nsd); 2758 #endif 2759 2760 if(reload_listener.fd != -1) { 2761 sig_atomic_t cmd = NSD_QUIT; 2762 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2763 "main: ipc send quit to reload-process")); 2764 if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) { 2765 log_msg(LOG_ERR, "server_main: could not send quit to reload: %s", 2766 strerror(errno)); 2767 } 2768 fsync(reload_listener.fd); 2769 close(reload_listener.fd); 2770 /* wait for reload to finish processing */ 2771 while(1) { 2772 if(waitpid(reload_pid, NULL, 0) == -1) { 2773 if(errno == EINTR) continue; 2774 if(errno == ECHILD) break; 2775 log_msg(LOG_ERR, "waitpid(reload %d): %s", 2776 (int)reload_pid, strerror(errno)); 2777 } 2778 break; 2779 } 2780 } 2781 if(nsd->xfrd_listener->fd != -1) { 2782 /* complete quit, stop xfrd */ 2783 sig_atomic_t cmd = NSD_QUIT; 2784 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2785 "main: ipc send quit to xfrd")); 2786 if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { 2787 log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s", 2788 strerror(errno)); 2789 } 2790 fsync(nsd->xfrd_listener->fd); 2791 close(nsd->xfrd_listener->fd); 2792 (void)kill(nsd->pid, SIGTERM); 2793 } 2794 2795 #ifdef MEMCLEAN /* OS collects memory pages */ 2796 region_destroy(server_region); 2797 #endif 2798 /* write the nsd.db to disk, wait for it to complete */ 2799 udb_base_sync(nsd->db->udb, 1); 2800 udb_base_close(nsd->db->udb); 2801 server_shutdown(nsd); 2802 } 2803 2804 static query_state_type 2805 server_process_query(struct nsd *nsd, struct query *query, uint32_t *now_p) 2806 { 2807 return query_process(query, nsd, now_p); 2808 } 2809 2810 static query_state_type 2811 server_process_query_udp(struct nsd *nsd, struct query *query, uint32_t *now_p) 2812 { 2813 #ifdef RATELIMIT 2814 if(query_process(query, nsd, now_p) != QUERY_DISCARDED) { 2815 if(query->edns.cookie_status != COOKIE_VALID 2816 && query->edns.cookie_status != COOKIE_VALID_REUSE 2817 && rrl_process_query(query)) 2818 return rrl_slip(query); 2819 else return QUERY_PROCESSED; 2820 } 2821 return QUERY_DISCARDED; 2822 #else 2823 return query_process(query, nsd, now_p); 2824 #endif 2825 } 2826 2827 const char* 2828 nsd_event_vs(void) 2829 { 2830 #ifdef USE_MINI_EVENT 2831 return ""; 2832 #else 2833 return event_get_version(); 2834 #endif 2835 } 2836 2837 #if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS) 2838 static const char* ub_ev_backend2str(int b) 2839 { 2840 switch(b) { 2841 case EVBACKEND_SELECT: return "select"; 2842 case EVBACKEND_POLL: return "poll"; 2843 case EVBACKEND_EPOLL: return "epoll"; 2844 case EVBACKEND_KQUEUE: return "kqueue"; 2845 case EVBACKEND_DEVPOLL: return "devpoll"; 2846 case EVBACKEND_PORT: return "evport"; 2847 } 2848 return "unknown"; 2849 } 2850 #endif 2851 2852 const char* 2853 nsd_event_method(void) 2854 { 2855 #ifdef USE_MINI_EVENT 2856 return "select"; 2857 #else 2858 struct event_base* b = nsd_child_event_base(); 2859 const char* m = "?"; 2860 # ifdef EV_FEATURE_BACKENDS 2861 m = 
ub_ev_backend2str(ev_backend((struct ev_loop*)b)); 2862 # elif defined(HAVE_EVENT_BASE_GET_METHOD) 2863 m = event_base_get_method(b); 2864 # endif 2865 # ifdef MEMCLEAN 2866 event_base_free(b); 2867 # endif 2868 return m; 2869 #endif 2870 } 2871 2872 struct event_base* 2873 nsd_child_event_base(void) 2874 { 2875 struct event_base* base; 2876 #ifdef USE_MINI_EVENT 2877 static time_t secs; 2878 static struct timeval now; 2879 base = event_init(&secs, &now); 2880 #else 2881 # if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP) 2882 /* libev */ 2883 base = (struct event_base *)ev_default_loop(EVFLAG_AUTO); 2884 # else 2885 /* libevent */ 2886 # ifdef HAVE_EVENT_BASE_NEW 2887 base = event_base_new(); 2888 # else 2889 base = event_init(); 2890 # endif 2891 # endif 2892 #endif 2893 return base; 2894 } 2895 2896 static void 2897 add_udp_handler( 2898 struct nsd *nsd, 2899 struct nsd_socket *sock, 2900 struct udp_handler_data *data) 2901 { 2902 struct event *handler = &data->event; 2903 2904 data->nsd = nsd; 2905 data->socket = sock; 2906 2907 memset(handler, 0, sizeof(*handler)); 2908 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data); 2909 if(event_base_set(nsd->event_base, handler) != 0) 2910 log_msg(LOG_ERR, "nsd udp: event_base_set failed"); 2911 if(event_add(handler, NULL) != 0) 2912 log_msg(LOG_ERR, "nsd udp: event_add failed"); 2913 } 2914 2915 void 2916 add_tcp_handler( 2917 struct nsd *nsd, 2918 struct nsd_socket *sock, 2919 struct tcp_accept_handler_data *data) 2920 { 2921 struct event *handler = &data->event; 2922 2923 data->nsd = nsd; 2924 data->socket = sock; 2925 2926 #ifdef HAVE_SSL 2927 if (nsd->tls_ctx && 2928 nsd->options->tls_port && 2929 using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port)) 2930 { 2931 data->tls_accept = 1; 2932 if(verbosity >= 2) { 2933 char buf[48]; 2934 addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf)); 2935 VERBOSITY(4, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf)); 2936 } 2937 } else { 2938 data->tls_accept = 0; 2939 } 2940 #endif 2941 2942 memset(handler, 0, sizeof(*handler)); 2943 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_tcp_accept, data); 2944 if(event_base_set(nsd->event_base, handler) != 0) 2945 log_msg(LOG_ERR, "nsd tcp: event_base_set failed"); 2946 if(event_add(handler, NULL) != 0) 2947 log_msg(LOG_ERR, "nsd tcp: event_add failed"); 2948 data->event_added = 1; 2949 } 2950 2951 /* 2952 * Serve DNS requests. 
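   Each child sets up its own event base, registers a handler for the
   parent IPC socket, attaches the UDP and TCP sockets assigned to it
   (the interfaces are partitioned over the children when reuseport is
   in use), and then runs event_base_loop until told to quit.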
2953 */ 2954 void 2955 server_child(struct nsd *nsd) 2956 { 2957 size_t i, from, numifs; 2958 region_type *server_region = region_create(xalloc, free); 2959 struct event_base* event_base = nsd_child_event_base(); 2960 sig_atomic_t mode; 2961 2962 if(!event_base) { 2963 log_msg(LOG_ERR, "nsd server could not create event base"); 2964 exit(1); 2965 } 2966 nsd->event_base = event_base; 2967 nsd->server_region = server_region; 2968 2969 #ifdef RATELIMIT 2970 rrl_init(nsd->this_child->child_num); 2971 #endif 2972 2973 assert(nsd->server_kind != NSD_SERVER_MAIN); 2974 DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started")); 2975 2976 #ifdef HAVE_SETPROCTITLE 2977 setproctitle("server %d", nsd->this_child->child_num + 1); 2978 #endif 2979 #ifdef HAVE_CPUSET_T 2980 if(nsd->use_cpu_affinity) { 2981 set_cpu_affinity(nsd->this_child->cpuset); 2982 } 2983 #endif 2984 2985 if (!(nsd->server_kind & NSD_SERVER_TCP)) { 2986 server_close_all_sockets(nsd->tcp, nsd->ifs); 2987 } 2988 if (!(nsd->server_kind & NSD_SERVER_UDP)) { 2989 server_close_all_sockets(nsd->udp, nsd->ifs); 2990 } 2991 2992 if (nsd->this_child->parent_fd != -1) { 2993 struct event *handler; 2994 struct ipc_handler_conn_data* user_data = 2995 (struct ipc_handler_conn_data*)region_alloc( 2996 server_region, sizeof(struct ipc_handler_conn_data)); 2997 user_data->nsd = nsd; 2998 user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ); 2999 3000 handler = (struct event*) region_alloc( 3001 server_region, sizeof(*handler)); 3002 memset(handler, 0, sizeof(*handler)); 3003 event_set(handler, nsd->this_child->parent_fd, EV_PERSIST| 3004 EV_READ, child_handle_parent_command, user_data); 3005 if(event_base_set(event_base, handler) != 0) 3006 log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed"); 3007 if(event_add(handler, NULL) != 0) 3008 log_msg(LOG_ERR, "nsd ipcchild: event_add failed"); 3009 } 3010 3011 if(nsd->reuseport) { 3012 numifs = nsd->ifs / nsd->reuseport; 3013 from = numifs * nsd->this_child->child_num; 3014 if(from+numifs > nsd->ifs) { /* should not happen */ 3015 from = 0; 3016 numifs = nsd->ifs; 3017 } 3018 } else { 3019 from = 0; 3020 numifs = nsd->ifs; 3021 } 3022 3023 if (nsd->server_kind & NSD_SERVER_UDP) { 3024 int child = nsd->this_child->child_num; 3025 memset(msgs, 0, sizeof(msgs)); 3026 for (i = 0; i < NUM_RECV_PER_SELECT; i++) { 3027 queries[i] = query_create(server_region, 3028 compressed_dname_offsets, 3029 compression_table_size, compressed_dnames); 3030 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3031 iovecs[i].iov_base = buffer_begin(queries[i]->packet); 3032 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 3033 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 3034 msgs[i].msg_hdr.msg_iovlen = 1; 3035 msgs[i].msg_hdr.msg_name = &queries[i]->addr; 3036 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3037 } 3038 3039 for (i = 0; i < nsd->ifs; i++) { 3040 int listen; 3041 struct udp_handler_data *data; 3042 3043 listen = nsd_bitset_isset(nsd->udp[i].servers, child); 3044 3045 if(i >= from && i < (from + numifs) && listen) { 3046 data = region_alloc_zero( 3047 nsd->server_region, sizeof(*data)); 3048 add_udp_handler(nsd, &nsd->udp[i], data); 3049 } else { 3050 /* close sockets intended for other servers */ 3051 server_close_socket(&nsd->udp[i]); 3052 } 3053 } 3054 } 3055 3056 /* 3057 * Keep track of all the TCP accept handlers so we can enable 3058 * and disable them based on the current number of active TCP 3059 * connections. 
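 * cleanup_tcp_handler re-enables the accept events when the connection
 * count drops from the maximum (and clears a pending slow-accept
 * timeout), so accepting resumes as soon as a slot frees up.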
3060 */ 3061 if (nsd->server_kind & NSD_SERVER_TCP) { 3062 int child = nsd->this_child->child_num; 3063 tcp_accept_handler_count = numifs; 3064 tcp_accept_handlers = region_alloc_array(server_region, 3065 numifs, sizeof(*tcp_accept_handlers)); 3066 3067 for (i = 0; i < nsd->ifs; i++) { 3068 int listen; 3069 struct tcp_accept_handler_data *data; 3070 3071 listen = nsd_bitset_isset(nsd->tcp[i].servers, child); 3072 3073 if(i >= from && i < (from + numifs) && listen) { 3074 data = &tcp_accept_handlers[i-from]; 3075 memset(data, 0, sizeof(*data)); 3076 add_tcp_handler(nsd, &nsd->tcp[i], data); 3077 } else { 3078 /* close sockets intended for other servers */ 3079 /* 3080 * uncomment this once tcp servers are no 3081 * longer copied in the tcp fd copy line 3082 * in server_init(). 3083 server_close_socket(&nsd->tcp[i]); 3084 */ 3085 /* close sockets not meant for this server*/ 3086 if(!listen) 3087 server_close_socket(&nsd->tcp[i]); 3088 } 3089 } 3090 } else { 3091 tcp_accept_handler_count = 0; 3092 } 3093 3094 /* The main loop... */ 3095 while ((mode = nsd->mode) != NSD_QUIT) { 3096 if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd); 3097 3098 /* Do we need to do the statistics... */ 3099 if (mode == NSD_STATS) { 3100 #ifdef BIND8_STATS 3101 int p = nsd->st.period; 3102 nsd->st.period = 1; /* force stats printout */ 3103 /* Dump the statistics */ 3104 bind8_stats(nsd); 3105 nsd->st.period = p; 3106 #else /* !BIND8_STATS */ 3107 log_msg(LOG_NOTICE, "Statistics support not enabled at compile time."); 3108 #endif /* BIND8_STATS */ 3109 3110 nsd->mode = NSD_RUN; 3111 } 3112 else if (mode == NSD_REAP_CHILDREN) { 3113 /* got signal, notify parent. parent reaps terminated children. */ 3114 if (nsd->this_child->parent_fd != -1) { 3115 sig_atomic_t parent_notify = NSD_REAP_CHILDREN; 3116 if (write(nsd->this_child->parent_fd, 3117 &parent_notify, 3118 sizeof(parent_notify)) == -1) 3119 { 3120 log_msg(LOG_ERR, "problems sending command from %d to parent: %s", 3121 (int) nsd->this_child->pid, strerror(errno)); 3122 } 3123 } else /* no parent, so reap 'em */ 3124 while (waitpid(-1, NULL, WNOHANG) > 0) ; 3125 nsd->mode = NSD_RUN; 3126 } 3127 else if(mode == NSD_RUN) { 3128 /* Wait for a query... 
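 * EVLOOP_ONCE makes event_base_loop return after one round of event
 * dispatch, so this loop can notice mode changes set by signal hints.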
*/ 3129 if(event_base_loop(event_base, EVLOOP_ONCE) == -1) { 3130 if (errno != EINTR) { 3131 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); 3132 break; 3133 } 3134 } 3135 } else if(mode == NSD_QUIT) { 3136 /* ignore here, quit */ 3137 } else { 3138 log_msg(LOG_ERR, "mode bad value %d, back to service.", 3139 (int)mode); 3140 nsd->mode = NSD_RUN; 3141 } 3142 } 3143 3144 service_remaining_tcp(nsd); 3145 #ifdef BIND8_STATS 3146 bind8_stats(nsd); 3147 #endif /* BIND8_STATS */ 3148 3149 #ifdef MEMCLEAN /* OS collects memory pages */ 3150 #ifdef RATELIMIT 3151 rrl_deinit(nsd->this_child->child_num); 3152 #endif 3153 event_base_free(event_base); 3154 region_destroy(server_region); 3155 #endif 3156 server_shutdown(nsd); 3157 } 3158 3159 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg) 3160 { 3161 int* timed_out = (int*)arg; 3162 assert(event & EV_TIMEOUT); (void)event; 3163 /* wake up the remaining-tcp service loop; note the event is no 3164 * longer registered */ 3165 *timed_out = 1; 3166 } 3167 3168 void 3169 service_remaining_tcp(struct nsd* nsd) 3170 { 3171 struct tcp_handler_data* p; 3172 struct event_base* event_base; 3173 /* check if it is needed */ 3174 if(nsd->current_tcp_count == 0 || tcp_active_list == NULL) 3175 return; 3176 VERBOSITY(4, (LOG_INFO, "service remaining TCP connections")); 3177 #ifdef USE_DNSTAP 3178 /* remove dnstap collector, we cannot write there because the new 3179 * child process is using the file descriptor, or the child 3180 * process after that. */ 3181 dt_collector_destroy(nsd->dt_collector, nsd); 3182 nsd->dt_collector = NULL; 3183 #endif 3184 /* setup event base */ 3185 event_base = nsd_child_event_base(); 3186 if(!event_base) { 3187 log_msg(LOG_ERR, "nsd remain tcp could not create event base"); 3188 return; 3189 } 3190 /* register tcp connections */ 3191 for(p = tcp_active_list; p != NULL; p = p->next) { 3192 struct timeval timeout; 3193 int fd = p->event.ev_fd; 3194 #ifdef USE_MINI_EVENT 3195 short event = p->event.ev_flags & (EV_READ|EV_WRITE); 3196 #else 3197 short event = p->event.ev_events & (EV_READ|EV_WRITE); 3198 #endif 3199 void (*fn)(int, short, void*); 3200 #ifdef HAVE_SSL 3201 if(p->tls) { 3202 if((event&EV_READ)) 3203 fn = handle_tls_reading; 3204 else fn = handle_tls_writing; 3205 } else { 3206 #endif 3207 if((event&EV_READ)) 3208 fn = handle_tcp_reading; 3209 else fn = handle_tcp_writing; 3210 #ifdef HAVE_SSL 3211 } 3212 #endif 3213 3214 p->tcp_no_more_queries = 1; 3215 /* set timeout to 1/10 second */ 3216 if(p->tcp_timeout > 100) 3217 p->tcp_timeout = 100; 3218 timeout.tv_sec = p->tcp_timeout / 1000; 3219 timeout.tv_usec = (p->tcp_timeout % 1000)*1000; 3220 event_del(&p->event); 3221 memset(&p->event, 0, sizeof(p->event)); 3222 event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT, 3223 fn, p); 3224 if(event_base_set(event_base, &p->event) != 0) 3225 log_msg(LOG_ERR, "event base set failed"); 3226 if(event_add(&p->event, &timeout) != 0) 3227 log_msg(LOG_ERR, "event add failed"); 3228 } 3229 3230 /* handle it */ 3231 while(nsd->current_tcp_count > 0) { 3232 sig_atomic_t m = server_signal_mode(nsd); 3233 struct event timeout; 3234 struct timeval tv; 3235 int timed_out = 0; 3236 if(m == NSD_QUIT || m == NSD_SHUTDOWN || 3237 m == NSD_REAP_CHILDREN) { 3238 /* quit */ 3239 break; 3240 } 3241 /* timer */ 3242 /* have to do something every second */ 3243 tv.tv_sec = 1; 3244 tv.tv_usec = 0; 3245 memset(&timeout, 0, sizeof(timeout)); 3246 event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout, 3247 &timed_out); 3248
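/* fd -1 makes this a pure one-second timer; if it fires before any
 * connection event does, remaining_tcp_timeout sets timed_out and the
 * loop below gives up instead of waiting forever */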
if(event_base_set(event_base, &timeout) != 0) 3249 log_msg(LOG_ERR, "remaintcp timer: event_base_set failed"); 3250 if(event_add(&timeout, &tv) != 0) 3251 log_msg(LOG_ERR, "remaintcp timer: event_add failed"); 3252 3253 /* service loop */ 3254 if(event_base_loop(event_base, EVLOOP_ONCE) == -1) { 3255 if (errno != EINTR) { 3256 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); 3257 break; 3258 } 3259 } 3260 if(!timed_out) { 3261 event_del(&timeout); 3262 } else { 3263 /* timed out, quit */ 3264 VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit")); 3265 break; 3266 } 3267 } 3268 #ifdef MEMCLEAN 3269 event_base_free(event_base); 3270 #endif 3271 /* continue to quit after return */ 3272 } 3273 3274 /* Implement recvmmsg and sendmmsg if the platform does not. These functions 3275 * are always used, even if nonblocking operations are broken, in which case 3276 * NUM_RECV_PER_SELECT is defined to 1 (one). 3277 */ 3278 #if defined(HAVE_RECVMMSG) 3279 #define nsd_recvmmsg recvmmsg 3280 #else /* !HAVE_RECVMMSG */ 3281 3282 static int 3283 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, 3284 int flags, struct timespec *timeout) 3285 { 3286 unsigned int vpos = 0; 3287 ssize_t rcvd; 3288 3289 /* timeout is ignored, ensure caller does not expect it to work */ 3290 assert(timeout == NULL); (void)timeout; 3291 3292 while(vpos < vlen) { 3293 rcvd = recvfrom(sockfd, 3294 msgvec[vpos].msg_hdr.msg_iov->iov_base, 3295 msgvec[vpos].msg_hdr.msg_iov->iov_len, 3296 flags, 3297 msgvec[vpos].msg_hdr.msg_name, 3298 &msgvec[vpos].msg_hdr.msg_namelen); 3299 if(rcvd < 0) { 3300 break; 3301 } else { 3302 assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX); 3303 msgvec[vpos].msg_len = (unsigned int)rcvd; 3304 vpos++; 3305 } 3306 } 3307 3308 if(vpos) { 3309 /* error will be picked up next time */ 3310 return (int)vpos; 3311 } else if(errno == 0) { 3312 return 0; 3313 } else if(errno == EAGAIN) { 3314 return 0; 3315 } 3316 3317 return -1; 3318 } 3319 #endif /* HAVE_RECVMMSG */ 3320 3321 #ifdef HAVE_SENDMMSG 3322 #define nsd_sendmmsg(...) 
sendmmsg(__VA_ARGS__) 3323 #else /* !HAVE_SENDMMSG */ 3324 3325 static int 3326 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags) 3327 { 3328 unsigned int vpos = 0; 3329 ssize_t snd; 3330 3331 while(vpos < vlen) { 3332 assert(msgvec[vpos].msg_hdr.msg_iovlen == 1); 3333 snd = sendto(sockfd, 3334 msgvec[vpos].msg_hdr.msg_iov->iov_base, 3335 msgvec[vpos].msg_hdr.msg_iov->iov_len, 3336 flags, 3337 msgvec[vpos].msg_hdr.msg_name, 3338 msgvec[vpos].msg_hdr.msg_namelen); 3339 if(snd < 0) { 3340 break; 3341 } else { 3342 msgvec[vpos].msg_len = (unsigned int)snd; 3343 vpos++; 3344 } 3345 } 3346 3347 if(vpos) { 3348 return (int)vpos; 3349 } else if(errno == 0) { 3350 return 0; 3351 } 3352 3353 return -1; 3354 } 3355 #endif /* HAVE_SENDMMSG */ 3356 3357 static int 3358 port_is_zero( 3359 #ifdef INET6 3360 struct sockaddr_storage *addr 3361 #else 3362 struct sockaddr_in *addr 3363 #endif 3364 ) 3365 { 3366 #ifdef INET6 3367 if(addr->ss_family == AF_INET6) { 3368 return (((struct sockaddr_in6 *)addr)->sin6_port) == 0; 3369 } else if(addr->ss_family == AF_INET) { 3370 return (((struct sockaddr_in *)addr)->sin_port) == 0; 3371 } 3372 return 0; 3373 #else 3374 if(addr->sin_family == AF_INET) { 3375 return addr->sin_port == 0; 3376 } 3377 return 0; 3378 #endif 3379 } 3380 3381 static void 3382 handle_udp(int fd, short event, void* arg) 3383 { 3384 struct udp_handler_data *data = (struct udp_handler_data *) arg; 3385 int received, sent, recvcount, i; 3386 struct query *q; 3387 uint32_t now = 0; 3388 3389 if (!(event & EV_READ)) { 3390 return; 3391 } 3392 recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL); 3393 /* this printf strangely gave a performance increase on Linux */ 3394 /* printf("recvcount %d \n", recvcount); */ 3395 if (recvcount == -1) { 3396 if (errno != EAGAIN && errno != EINTR) { 3397 log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno)); 3398 STATUP(data->nsd, rxerr); 3399 /* No zone statup */ 3400 } 3401 /* Simply no data available */ 3402 return; 3403 } 3404 for (i = 0; i < recvcount; i++) { 3405 loopstart: 3406 received = msgs[i].msg_len; 3407 queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen; 3408 q = queries[i]; 3409 if (received == -1) { 3410 log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror( 3411 #if defined(HAVE_RECVMMSG) 3412 msgs[i].msg_hdr.msg_flags 3413 #else 3414 errno 3415 #endif 3416 )); 3417 STATUP(data->nsd, rxerr); 3418 /* No zone statup */ 3419 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3420 iovecs[i].iov_len = buffer_remaining(q->packet); 3421 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3422 goto swap_drop; 3423 } 3424 3425 /* Account... */ 3426 #ifdef BIND8_STATS 3427 if (data->socket->addr.ai_family == AF_INET) { 3428 STATUP(data->nsd, qudp); 3429 } else if (data->socket->addr.ai_family == AF_INET6) { 3430 STATUP(data->nsd, qudp6); 3431 } 3432 #endif 3433 3434 buffer_skip(q->packet, received); 3435 buffer_flip(q->packet); 3436 #ifdef USE_DNSTAP 3437 /* 3438 * sending UDP-query with server address (local) and client address to dnstap process 3439 */ 3440 log_addr("query from client", &q->addr); 3441 log_addr("to server (local)", (void*)&data->socket->addr.ai_addr); 3442 dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &q->addr, q->addrlen, 3443 q->tcp, q->packet); 3444 #endif /* USE_DNSTAP */ 3445 3446 /* Process and answer the query... 
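    Dropped queries are swapped to the tail of the msgs array (see
    swap_drop below), so that the leading recvcount entries are exactly
    the responses for the sendmmsg loop to transmit.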
*/ 3447 if (server_process_query_udp(data->nsd, q, &now) != QUERY_DISCARDED) { 3448 if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) { 3449 STATUP(data->nsd, nona); 3450 ZTATUP(data->nsd, q->zone, nona); 3451 } 3452 3453 #ifdef USE_ZONE_STATS 3454 if (data->socket->addr.ai_family == AF_INET) { 3455 ZTATUP(data->nsd, q->zone, qudp); 3456 } else if (data->socket->addr.ai_family == AF_INET6) { 3457 ZTATUP(data->nsd, q->zone, qudp6); 3458 } 3459 #endif 3460 3461 /* Add EDNS0 and TSIG info if necessary. */ 3462 query_add_optional(q, data->nsd, &now); 3463 3464 buffer_flip(q->packet); 3465 iovecs[i].iov_len = buffer_remaining(q->packet); 3466 #ifdef BIND8_STATS 3467 /* Account the rcode & TC... */ 3468 STATUP2(data->nsd, rcode, RCODE(q->packet)); 3469 ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet)); 3470 if (TC(q->packet)) { 3471 STATUP(data->nsd, truncated); 3472 ZTATUP(data->nsd, q->zone, truncated); 3473 } 3474 #endif /* BIND8_STATS */ 3475 #ifdef USE_DNSTAP 3476 /* 3477 * sending UDP-response with server address (local) and client address to dnstap process 3478 */ 3479 log_addr("from server (local)", (void*)&data->socket->addr.ai_addr); 3480 log_addr("response to client", &q->addr); 3481 dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, 3482 &q->addr, q->addrlen, q->tcp, q->packet, 3483 q->zone); 3484 #endif /* USE_DNSTAP */ 3485 } else { 3486 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3487 iovecs[i].iov_len = buffer_remaining(q->packet); 3488 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3489 swap_drop: 3490 STATUP(data->nsd, dropped); 3491 ZTATUP(data->nsd, q->zone, dropped); 3492 if(i != recvcount-1) { 3493 /* swap with last and decrease recvcount */ 3494 struct mmsghdr mtmp = msgs[i]; 3495 struct iovec iotmp = iovecs[i]; 3496 recvcount--; 3497 msgs[i] = msgs[recvcount]; 3498 iovecs[i] = iovecs[recvcount]; 3499 queries[i] = queries[recvcount]; 3500 msgs[recvcount] = mtmp; 3501 iovecs[recvcount] = iotmp; 3502 queries[recvcount] = q; 3503 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 3504 msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount]; 3505 goto loopstart; 3506 } else { recvcount --; } 3507 } 3508 } 3509 3510 /* send until all are sent */ 3511 i = 0; 3512 while(i<recvcount) { 3513 sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0); 3514 if(sent == -1) { 3515 if(errno == ENOBUFS || 3516 #ifdef EWOULDBLOCK 3517 errno == EWOULDBLOCK || 3518 #endif 3519 errno == EAGAIN) { 3520 /* block to wait until send buffer avail */ 3521 int flag, errstore; 3522 if((flag = fcntl(fd, F_GETFL)) == -1) { 3523 log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno)); 3524 flag = 0; 3525 } 3526 flag &= ~O_NONBLOCK; 3527 if(fcntl(fd, F_SETFL, flag) == -1) 3528 log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno)); 3529 sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0); 3530 errstore = errno; 3531 flag |= O_NONBLOCK; 3532 if(fcntl(fd, F_SETFL, flag) == -1) 3533 log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno)); 3534 if(sent != -1) { 3535 i += sent; 3536 continue; 3537 } 3538 errno = errstore; 3539 } 3540 if(errno == EINVAL) { 3541 /* skip the invalid argument entry, 3542 * send the remaining packets in the list */ 3543 if(!(port_is_zero((void*)&queries[i]->addr) && 3544 verbosity < 3)) { 3545 const char* es = strerror(errno); 3546 char a[64]; 3547 addrport2str((void*)&queries[i]->addr, a, sizeof(a)); 3548 log_msg(LOG_ERR, "sendmmsg skip invalid argument [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es); 3549 } 3550 i += 1; 
3551 continue; 3552 } 3553 /* don't log transient network full errors, unless 3554 * on higher verbosity */ 3555 if(!(errno == ENOBUFS && verbosity < 1) && 3556 #ifdef EWOULDBLOCK 3557 errno != EWOULDBLOCK && 3558 #endif 3559 errno != EAGAIN) { 3560 const char* es = strerror(errno); 3561 char a[64]; 3562 addrport2str((void*)&queries[i]->addr, a, sizeof(a)); 3563 log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es); 3564 } 3565 #ifdef BIND8_STATS 3566 data->nsd->st.txerr += recvcount-i; 3567 #endif /* BIND8_STATS */ 3568 break; 3569 } 3570 i += sent; 3571 } 3572 for(i=0; i<recvcount; i++) { 3573 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3574 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 3575 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3576 } 3577 } 3578 3579 #ifdef HAVE_SSL 3580 /* 3581 * Setup an event for the tcp handler. 3582 */ 3583 static void 3584 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *), 3585 int fd, short event) 3586 { 3587 struct timeval timeout; 3588 struct event_base* ev_base; 3589 3590 timeout.tv_sec = data->nsd->tcp_timeout; 3591 timeout.tv_usec = 0L; 3592 3593 ev_base = data->event.ev_base; 3594 event_del(&data->event); 3595 memset(&data->event, 0, sizeof(data->event)); 3596 event_set(&data->event, fd, event, fn, data); 3597 if(event_base_set(ev_base, &data->event) != 0) 3598 log_msg(LOG_ERR, "event base set failed"); 3599 if(event_add(&data->event, &timeout) != 0) 3600 log_msg(LOG_ERR, "event add failed"); 3601 } 3602 #endif /* HAVE_SSL */ 3603 3604 static void 3605 cleanup_tcp_handler(struct tcp_handler_data* data) 3606 { 3607 event_del(&data->event); 3608 #ifdef HAVE_SSL 3609 if(data->tls) { 3610 SSL_shutdown(data->tls); 3611 SSL_free(data->tls); 3612 data->tls = NULL; 3613 } 3614 #endif 3615 close(data->event.ev_fd); 3616 if(data->prev) 3617 data->prev->next = data->next; 3618 else tcp_active_list = data->next; 3619 if(data->next) 3620 data->next->prev = data->prev; 3621 3622 /* 3623 * Enable the TCP accept handlers when the current number of 3624 * TCP connections is about to drop below the maximum number 3625 * of TCP connections. 3626 */ 3627 if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) { 3628 configure_handler_event_types(EV_READ|EV_PERSIST); 3629 if(slowaccept) { 3630 event_del(&slowaccept_event); 3631 slowaccept = 0; 3632 } 3633 } 3634 --data->nsd->current_tcp_count; 3635 assert(data->nsd->current_tcp_count >= 0); 3636 3637 region_destroy(data->region); 3638 } 3639 3640 static void 3641 handle_tcp_reading(int fd, short event, void* arg) 3642 { 3643 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 3644 ssize_t received; 3645 struct event_base* ev_base; 3646 struct timeval timeout; 3647 uint32_t now = 0; 3648 3649 if ((event & EV_TIMEOUT)) { 3650 /* Connection timed out. */ 3651 cleanup_tcp_handler(data); 3652 return; 3653 } 3654 3655 if ((data->nsd->tcp_query_count > 0 && 3656 data->query_count >= data->nsd->tcp_query_count) || 3657 data->tcp_no_more_queries) { 3658 /* No more queries allowed on this tcp connection. */ 3659 cleanup_tcp_handler(data); 3660 return; 3661 } 3662 3663 assert((event & EV_READ)); 3664 3665 if (data->bytes_transmitted == 0) { 3666 query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1); 3667 } 3668 3669 /* 3670 * Check if we received the leading packet length bytes yet. 
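 * DNS over TCP prefixes each message with a two-octet length field
 * (RFC 1035, section 4.2.2); it is read into query->tcplen before the
 * message itself.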
3671 */ 3672 if (data->bytes_transmitted < sizeof(uint16_t)) { 3673 received = read(fd, 3674 (char *) &data->query->tcplen 3675 + data->bytes_transmitted, 3676 sizeof(uint16_t) - data->bytes_transmitted); 3677 if (received == -1) { 3678 if (errno == EAGAIN || errno == EINTR) { 3679 /* 3680 * Read would block, wait until more 3681 * data is available. 3682 */ 3683 return; 3684 } else { 3685 char buf[48]; 3686 addr2str(&data->query->addr, buf, sizeof(buf)); 3687 #ifdef ECONNRESET 3688 if (verbosity >= 2 || errno != ECONNRESET) 3689 #endif /* ECONNRESET */ 3690 log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno)); 3691 cleanup_tcp_handler(data); 3692 return; 3693 } 3694 } else if (received == 0) { 3695 /* EOF */ 3696 cleanup_tcp_handler(data); 3697 return; 3698 } 3699 3700 data->bytes_transmitted += received; 3701 if (data->bytes_transmitted < sizeof(uint16_t)) { 3702 /* 3703 * Not done with the tcplen yet, wait for more 3704 * data to become available. 3705 */ 3706 return; 3707 } 3708 3709 assert(data->bytes_transmitted == sizeof(uint16_t)); 3710 3711 data->query->tcplen = ntohs(data->query->tcplen); 3712 3713 /* 3714 * Minimum query size is: 3715 * 3716 * Size of the header (12) 3717 * + Root domain name (1) 3718 * + Query class (2) 3719 * + Query type (2) 3720 */ 3721 if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) { 3722 VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection")); 3723 cleanup_tcp_handler(data); 3724 return; 3725 } 3726 3727 if (data->query->tcplen > data->query->maxlen) { 3728 VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection")); 3729 cleanup_tcp_handler(data); 3730 return; 3731 } 3732 3733 buffer_set_limit(data->query->packet, data->query->tcplen); 3734 } 3735 3736 assert(buffer_remaining(data->query->packet) > 0); 3737 3738 /* Read the (remaining) query data. */ 3739 received = read(fd, 3740 buffer_current(data->query->packet), 3741 buffer_remaining(data->query->packet)); 3742 if (received == -1) { 3743 if (errno == EAGAIN || errno == EINTR) { 3744 /* 3745 * Read would block, wait until more data is 3746 * available. 3747 */ 3748 return; 3749 } else { 3750 char buf[48]; 3751 addr2str(&data->query->addr, buf, sizeof(buf)); 3752 #ifdef ECONNRESET 3753 if (verbosity >= 2 || errno != ECONNRESET) 3754 #endif /* ECONNRESET */ 3755 log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno)); 3756 cleanup_tcp_handler(data); 3757 return; 3758 } 3759 } else if (received == 0) { 3760 /* EOF */ 3761 cleanup_tcp_handler(data); 3762 return; 3763 } 3764 3765 data->bytes_transmitted += received; 3766 buffer_skip(data->query->packet, received); 3767 if (buffer_remaining(data->query->packet) > 0) { 3768 /* 3769 * Message not yet complete, wait for more data to 3770 * become available. 3771 */ 3772 return; 3773 } 3774 3775 assert(buffer_position(data->query->packet) == data->query->tcplen); 3776 3777 /* Account... */ 3778 #ifdef BIND8_STATS 3779 #ifndef INET6 3780 STATUP(data->nsd, ctcp); 3781 #else 3782 if (data->query->addr.ss_family == AF_INET) { 3783 STATUP(data->nsd, ctcp); 3784 } else if (data->query->addr.ss_family == AF_INET6) { 3785 STATUP(data->nsd, ctcp6); 3786 } 3787 #endif 3788 #endif /* BIND8_STATS */ 3789 3790 /* We have a complete query, process it. 
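   Once processed, the answer is sent from the same buffer: the event is
   re-registered for reading the next query and handle_tcp_writing is
   invoked directly below, in case the socket is already writable.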
	/* Account... */
#ifdef BIND8_STATS
#ifndef INET6
	STATUP(data->nsd, ctcp);
#else
	if (data->query->addr.ss_family == AF_INET) {
		STATUP(data->nsd, ctcp);
	} else if (data->query->addr.ss_family == AF_INET6) {
		STATUP(data->nsd, ctcp6);
	}
#endif
#endif /* BIND8_STATS */

	/* We have a complete query, process it. */

	/* tcp-query-count: handle query counter ++ */
	data->query_count++;

	buffer_flip(data->query->packet);
#ifdef USE_DNSTAP
	/*
	 * Send the TCP query, together with the local (server) address
	 * it arrived on and the client address, to the dnstap collector
	 * process.
	 */
	log_addr("query from client", &data->query->addr);
	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet);
#endif /* USE_DNSTAP */
	data->query_state = server_process_query(data->nsd, data->query, &now);
	if (data->query_state == QUERY_DISCARDED) {
		/* Drop the packet and the entire connection... */
		STATUP(data->nsd, dropped);
		ZTATUP(data->nsd, data->query->zone, dropped);
		cleanup_tcp_handler(data);
		return;
	}

#ifdef BIND8_STATS
	if (RCODE(data->query->packet) == RCODE_OK
		&& !AA(data->query->packet))
	{
		STATUP(data->nsd, nona);
		ZTATUP(data->nsd, data->query->zone, nona);
	}
#endif /* BIND8_STATS */

#ifdef USE_ZONE_STATS
#ifndef INET6
	ZTATUP(data->nsd, data->query->zone, ctcp);
#else
	if (data->query->addr.ss_family == AF_INET) {
		ZTATUP(data->nsd, data->query->zone, ctcp);
	} else if (data->query->addr.ss_family == AF_INET6) {
		ZTATUP(data->nsd, data->query->zone, ctcp6);
	}
#endif
#endif /* USE_ZONE_STATS */

	query_add_optional(data->query, data->nsd, &now);

	/* Switch to the tcp write handler. */
	buffer_flip(data->query->packet);
	data->query->tcplen = buffer_remaining(data->query->packet);
#ifdef BIND8_STATS
	/* Account the rcode & TC... */
	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
	if (TC(data->query->packet)) {
		STATUP(data->nsd, truncated);
		ZTATUP(data->nsd, data->query->zone, truncated);
	}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
	/*
	 * Send the TCP response, with the local (server) address found
	 * earlier and the client address, to the dnstap collector
	 * process.
	 */
	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
	log_addr("response to client", &data->query->addr);
	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet,
		data->query->zone);
#endif /* USE_DNSTAP */
	data->bytes_transmitted = 0;

	timeout.tv_sec = data->tcp_timeout / 1000;
	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;

	ev_base = data->event.ev_base;
	event_del(&data->event);
	memset(&data->event, 0, sizeof(data->event));
	event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
		handle_tcp_writing, data);
	if(event_base_set(ev_base, &data->event) != 0)
		log_msg(LOG_ERR, "event base set tcpr failed");
	if(event_add(&data->event, &timeout) != 0)
		log_msg(LOG_ERR, "event add tcpr failed");
	/* see if we can write the answer right away (usually so; EAGAIN if not) */
	handle_tcp_writing(fd, EV_WRITE, data);
}
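
/*
 * Note on the event juggling above and below: with the classic libevent
 * API an installed event cannot change its callback or timeout in
 * place, so every read<->write transition follows the same five-step
 * pattern: event_del(), memset() the struct event, event_set() with the
 * new callback and flags, event_base_set(), then event_add() with a
 * fresh timeout.  For the TLS handlers this sequence is wrapped in
 * tcp_handler_setup_event() (above, under HAVE_SSL).
 */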
static void
handle_tcp_writing(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t sent;
	struct query *q = data->query;
	struct timeval timeout;
	struct event_base* ev_base;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_WRITE));

	if (data->bytes_transmitted < sizeof(q->tcplen)) {
		/* Writing the response packet length. */
		uint16_t n_tcplen = htons(q->tcplen);
#ifdef HAVE_WRITEV
		struct iovec iov[2];
		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
		iov[1].iov_base = buffer_begin(q->packet);
		iov[1].iov_len = buffer_limit(q->packet);
		sent = writev(fd, iov, 2);
#else /* HAVE_WRITEV */
		sent = write(fd,
			(const char *) &n_tcplen + data->bytes_transmitted,
			sizeof(n_tcplen) - data->bytes_transmitted);
#endif /* HAVE_WRITEV */
		if (sent == -1) {
			if (errno == EAGAIN || errno == EINTR) {
				/*
				 * Write would block, wait until
				 * socket becomes writable again.
				 */
				return;
			} else {
#ifdef ECONNRESET
				if(verbosity >= 2 || errno != ECONNRESET)
#endif /* ECONNRESET */
#ifdef EPIPE
				if(verbosity >= 2 || errno != EPIPE)
#endif /* EPIPE 'broken pipe' */
				log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
				cleanup_tcp_handler(data);
				return;
			}
		}

		data->bytes_transmitted += sent;
		if (data->bytes_transmitted < sizeof(q->tcplen)) {
			/*
			 * Writing not complete, wait until socket
			 * becomes writable again.
			 */
			return;
		}

#ifdef HAVE_WRITEV
		sent -= sizeof(n_tcplen);
		/* handle potential 'packet done' code */
		goto packet_could_be_done;
#endif
	}
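
	/*
	 * We get here without writev, or on a later call after the
	 * length prefix went out separately: write (the rest of) the
	 * packet payload.  With writev the payload bytes were already
	 * part of the same system call; in that case sent was adjusted
	 * above and control jumped straight to the shared "is the
	 * packet done" accounting below.
	 */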
	sent = write(fd,
		buffer_current(q->packet),
		buffer_remaining(q->packet));
	if (sent == -1) {
		if (errno == EAGAIN || errno == EINTR) {
			/*
			 * Write would block, wait until
			 * socket becomes writable again.
			 */
			return;
		} else {
#ifdef ECONNRESET
			if(verbosity >= 2 || errno != ECONNRESET)
#endif /* ECONNRESET */
#ifdef EPIPE
			if(verbosity >= 2 || errno != EPIPE)
#endif /* EPIPE 'broken pipe' */
			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
			cleanup_tcp_handler(data);
			return;
		}
	}

	data->bytes_transmitted += sent;
#ifdef HAVE_WRITEV
  packet_could_be_done:
#endif
	buffer_skip(q->packet, sent);
	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
		/*
		 * Still more data to write when socket becomes
		 * writable again.
		 */
		return;
	}

	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
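
	/*
	 * An AXFR answer does not fit in a single packet: each call to
	 * query_axfr() refills q->packet with the next batch of RRs,
	 * and the connection stays in the write handler (query_state
	 * QUERY_IN_AXFR) until query_axfr() returns QUERY_PROCESSED.
	 */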
	if (data->query_state == QUERY_IN_AXFR) {
		/* Continue processing AXFR and writing back results. */
		buffer_clear(q->packet);
		data->query_state = query_axfr(data->nsd, q);
		if (data->query_state != QUERY_PROCESSED) {
			query_add_optional(data->query, data->nsd, &now);

			/* Reset data. */
			buffer_flip(q->packet);
			q->tcplen = buffer_remaining(q->packet);
			data->bytes_transmitted = 0;
			/* Reset timeout. */
			timeout.tv_sec = data->tcp_timeout / 1000;
			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
			ev_base = data->event.ev_base;
			event_del(&data->event);
			memset(&data->event, 0, sizeof(data->event));
			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
				handle_tcp_writing, data);
			if(event_base_set(ev_base, &data->event) != 0)
				log_msg(LOG_ERR, "event base set tcpw failed");
			if(event_add(&data->event, &timeout) != 0)
				log_msg(LOG_ERR, "event add tcpw failed");

			/*
			 * Write data if/when the socket is writable
			 * again.
			 */
			return;
		}
	}

	/*
	 * Done sending, wait for the next request to arrive on the
	 * TCP socket by installing the TCP read handler.
	 */
	if ((data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) ||
		data->tcp_no_more_queries) {

		(void) shutdown(fd, SHUT_WR);
	}

	data->bytes_transmitted = 0;

	timeout.tv_sec = data->tcp_timeout / 1000;
	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
	ev_base = data->event.ev_base;
	event_del(&data->event);
	memset(&data->event, 0, sizeof(data->event));
	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
		handle_tcp_reading, data);
	if(event_base_set(ev_base, &data->event) != 0)
		log_msg(LOG_ERR, "event base set tcpw failed");
	if(event_add(&data->event, &timeout) != 0)
		log_msg(LOG_ERR, "event add tcpw failed");
}
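
/*
 * The handlers below are the DNS-over-TLS counterparts of the plain TCP
 * handlers above.  The framing and per-connection state machine are the
 * same; the extra complication is that OpenSSL may need the opposite
 * I/O direction from the one the application wants: SSL_read() can
 * return SSL_ERROR_WANT_WRITE and SSL_write() SSL_ERROR_WANT_READ (for
 * example during a handshake or renegotiation).  The canonical
 * non-blocking triage, sketched here for illustration only (not
 * compiled):
 *
 *	ERR_clear_error();
 *	n = SSL_read(ssl, buf, len);
 *	if(n <= 0) {
 *		switch(SSL_get_error(ssl, n)) {
 *		case SSL_ERROR_ZERO_RETURN:	(peer closed: clean up)
 *		case SSL_ERROR_WANT_READ:	(wait for readability)
 *		case SSL_ERROR_WANT_WRITE:	(wait for writability)
 *		default:			(log error and close)
 *		}
 *	}
 *
 * The handlers implement exactly this triage, recording a pending
 * direction switch in data->shake_state so the event can be re-armed
 * for the direction OpenSSL asked for.
 */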
#ifdef HAVE_SSL
/** create SSL object and associate fd */
static SSL*
incoming_ssl_fd(SSL_CTX* ctx, int fd)
{
	SSL* ssl = SSL_new((SSL_CTX*)ctx);
	if(!ssl) {
		log_crypto_err("could not SSL_new");
		return NULL;
	}
	SSL_set_accept_state(ssl);
	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
	if(!SSL_set_fd(ssl, fd)) {
		log_crypto_err("could not SSL_set_fd");
		SSL_free(ssl);
		return NULL;
	}
	return ssl;
}

/** TLS handshake to upgrade TCP connection */
static int
tls_handshake(struct tcp_handler_data* data, int fd, int writing)
{
	int r;
	if(data->shake_state == tls_hs_read_event) {
		/* read condition satisfied, switch back to writing */
		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
		data->shake_state = tls_hs_none;
		return 1;
	}
	if(data->shake_state == tls_hs_write_event) {
		/* write condition satisfied, switch back to reading */
		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
		data->shake_state = tls_hs_none;
		return 1;
	}

	/* (continue to) set up the TLS connection */
	ERR_clear_error();
	r = SSL_do_handshake(data->tls);

	if(r != 1) {
		int want = SSL_get_error(data->tls, r);
		if(want == SSL_ERROR_WANT_READ) {
			if(data->shake_state == tls_hs_read) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_read;
			/* switch back to reading mode */
			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
			return 1;
		} else if(want == SSL_ERROR_WANT_WRITE) {
			if(data->shake_state == tls_hs_write) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_write;
			/* switch back to writing mode */
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
			return 1;
		} else {
			if(r == 0)
				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
			else {
				unsigned long err = ERR_get_error();
				if(!squelch_err_ssl_handshake(err)) {
					char a[64], s[256];
					addr2str(&data->query->addr, a, sizeof(a));
					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
					log_crypto_from_err(s, err);
				}
			}
			cleanup_tcp_handler(data);
			return 0;
		}
	}

	/* Logs successful upgrades for testing; could be removed. */
	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
	/* set back to the event we need to have when reading (or writing) */
	if(data->shake_state == tls_hs_read && writing) {
		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
	} else if(data->shake_state == tls_hs_write && !writing) {
		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
	}
	data->shake_state = tls_hs_none;
	return 1;
}
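
/*
 * data->shake_state transitions, for reference:
 *
 *   tls_hs_none        - no handshake or direction switch pending
 *   tls_hs_read        - SSL_do_handshake() wants the socket readable
 *   tls_hs_write       - SSL_do_handshake() wants the socket writable
 *   tls_hs_read_event  - SSL_write() wanted a read; once the read event
 *                        fires, switch back to the write handler
 *   tls_hs_write_event - SSL_read() wanted a write; once the write
 *                        event fires, switch back to the read handler
 */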
/** handle TLS reading of incoming query */
static void
handle_tls_reading(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t received;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out. */
		cleanup_tcp_handler(data);
		return;
	}

	if ((data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) ||
		data->tcp_no_more_queries) {
		/* No more queries allowed on this tcp connection. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_READ));

	if (data->bytes_transmitted == 0) {
		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
	}

	if(data->shake_state != tls_hs_none) {
		if(!tls_handshake(data, fd, 0))
			return;
		if(data->shake_state != tls_hs_none)
			return;
	}

	/*
	 * Check if we received the leading packet length bytes yet.
	 */
	if(data->bytes_transmitted < sizeof(uint16_t)) {
		ERR_clear_error();
		if((received=SSL_read(data->tls, (char *) &data->query->tcplen
			+ data->bytes_transmitted,
			sizeof(uint16_t) - data->bytes_transmitted)) <= 0) {
			int want = SSL_get_error(data->tls, received);
			if(want == SSL_ERROR_ZERO_RETURN) {
				cleanup_tcp_handler(data);
				return; /* shutdown, closed */
			} else if(want == SSL_ERROR_WANT_READ) {
				/* wants to be called again */
				return;
			}
			else if(want == SSL_ERROR_WANT_WRITE) {
				/* switch to writing */
				data->shake_state = tls_hs_write_event;
				tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
				return;
			}
			cleanup_tcp_handler(data);
			log_crypto_err("could not SSL_read");
			return;
		}

		data->bytes_transmitted += received;
		if (data->bytes_transmitted < sizeof(uint16_t)) {
			/*
			 * Not done with the tcplen yet, wait for more
			 * data to become available.
			 */
			return;
		}

		assert(data->bytes_transmitted == sizeof(uint16_t));

		data->query->tcplen = ntohs(data->query->tcplen);

		/*
		 * Minimum query size is:
		 *
		 *     Size of the header (12)
		 *   + Root domain name (1)
		 *   + Query class (2)
		 *   + Query type (2)
		 */
		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
			cleanup_tcp_handler(data);
			return;
		}

		if (data->query->tcplen > data->query->maxlen) {
			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
			cleanup_tcp_handler(data);
			return;
		}

		buffer_set_limit(data->query->packet, data->query->tcplen);
	}

	assert(buffer_remaining(data->query->packet) > 0);

	/* Read the (remaining) query data. */
	ERR_clear_error();
	received = SSL_read(data->tls, (void*)buffer_current(data->query->packet),
		(int)buffer_remaining(data->query->packet));
	if(received <= 0) {
		int want = SSL_get_error(data->tls, received);
		if(want == SSL_ERROR_ZERO_RETURN) {
			cleanup_tcp_handler(data);
			return; /* shutdown, closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* wants to be called again */
			return;
		}
		else if(want == SSL_ERROR_WANT_WRITE) {
			/* switch back to writing */
			data->shake_state = tls_hs_write_event;
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
			return;
		}
		cleanup_tcp_handler(data);
		log_crypto_err("could not SSL_read");
		return;
	}

	data->bytes_transmitted += received;
	buffer_skip(data->query->packet, received);
	if (buffer_remaining(data->query->packet) > 0) {
		/*
		 * Message not yet complete, wait for more data to
		 * become available.
		 */
		return;
	}

	assert(buffer_position(data->query->packet) == data->query->tcplen);
	/* Account... */
#ifndef INET6
	STATUP(data->nsd, ctls);
#else
	if (data->query->addr.ss_family == AF_INET) {
		STATUP(data->nsd, ctls);
	} else if (data->query->addr.ss_family == AF_INET6) {
		STATUP(data->nsd, ctls6);
	}
#endif

	/* We have a complete query, process it. */

	/* tcp-query-count: handle query counter ++ */
	data->query_count++;

	buffer_flip(data->query->packet);
#ifdef USE_DNSTAP
	/*
	 * Send the TLS query, together with the local (server) address
	 * it arrived on and the client address, to the dnstap collector
	 * process.
	 */
	log_addr("query from client", &data->query->addr);
	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet);
#endif /* USE_DNSTAP */
	data->query_state = server_process_query(data->nsd, data->query, &now);
	if (data->query_state == QUERY_DISCARDED) {
		/* Drop the packet and the entire connection... */
		STATUP(data->nsd, dropped);
		ZTATUP(data->nsd, data->query->zone, dropped);
		cleanup_tcp_handler(data);
		return;
	}

#ifdef BIND8_STATS
	if (RCODE(data->query->packet) == RCODE_OK
		&& !AA(data->query->packet))
	{
		STATUP(data->nsd, nona);
		ZTATUP(data->nsd, data->query->zone, nona);
	}
#endif /* BIND8_STATS */

#ifdef USE_ZONE_STATS
#ifndef INET6
	ZTATUP(data->nsd, data->query->zone, ctls);
#else
	if (data->query->addr.ss_family == AF_INET) {
		ZTATUP(data->nsd, data->query->zone, ctls);
	} else if (data->query->addr.ss_family == AF_INET6) {
		ZTATUP(data->nsd, data->query->zone, ctls6);
	}
#endif
#endif /* USE_ZONE_STATS */

	query_add_optional(data->query, data->nsd, &now);

	/* Switch to the tcp write handler. */
	buffer_flip(data->query->packet);
	data->query->tcplen = buffer_remaining(data->query->packet);
#ifdef BIND8_STATS
	/* Account the rcode & TC... */
	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
	if (TC(data->query->packet)) {
		STATUP(data->nsd, truncated);
		ZTATUP(data->nsd, data->query->zone, truncated);
	}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
	/*
	 * Send the TLS response, with the local (server) address found
	 * earlier and the client address, to the dnstap collector
	 * process.
	 */
	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
	log_addr("response to client", &data->query->addr);
	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet,
		data->query->zone);
#endif /* USE_DNSTAP */
	data->bytes_transmitted = 0;

	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);

	/* see if we can write the answer right away (usually so; EAGAIN if not) */
	handle_tls_writing(fd, EV_WRITE, data);
}
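
/*
 * There is no writev() equivalent for SSL_write(), so the write handler
 * below cannot scatter-gather the two-octet length prefix and the
 * packet payload the way handle_tcp_writing does.  Instead, the first
 * write of every response goes through a static temporary buffer that
 * holds length plus packet; once the prefix is out, subsequent partial
 * writes come straight from q->packet.
 */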
/** handle TLS writing of outgoing response */
static void
handle_tls_writing(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t sent;
	struct query *q = data->query;
	/* static variable that holds the reassembly buffer used to put
	 * the TCP length in front of the packet, like writev. */
	static buffer_type* global_tls_temp_buffer = NULL;
	buffer_type* write_buffer;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_WRITE));

	if(data->shake_state != tls_hs_none) {
		if(!tls_handshake(data, fd, 1))
			return;
		if(data->shake_state != tls_hs_none)
			return;
	}

	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);

	/* If we are writing the start of a message, we must include the
	 * length; this is done with a copy into write_buffer. */
	write_buffer = NULL;
	if (data->bytes_transmitted == 0) {
		if(!global_tls_temp_buffer) {
			/* Allocated in nsd.region, so it is deallocated
			 * when nsd shuts down. */
			global_tls_temp_buffer = buffer_create(nsd.region,
				QIOBUFSZ + sizeof(q->tcplen));
			if (!global_tls_temp_buffer) {
				return;
			}
		}
		write_buffer = global_tls_temp_buffer;
		buffer_clear(write_buffer);
		buffer_write_u16(write_buffer, q->tcplen);
		buffer_write(write_buffer, buffer_current(q->packet),
			(int)buffer_remaining(q->packet));
		buffer_flip(write_buffer);
	} else {
		write_buffer = q->packet;
	}

	/* Write the response */
	ERR_clear_error();
	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
	if(sent <= 0) {
		int want = SSL_get_error(data->tls, sent);
		if(want == SSL_ERROR_ZERO_RETURN) {
			cleanup_tcp_handler(data);
			/* closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* switch back to reading */
			data->shake_state = tls_hs_read_event;
			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
		} else if(want != SSL_ERROR_WANT_WRITE) {
			cleanup_tcp_handler(data);
			log_crypto_err("could not SSL_write");
		}
		return;
	}

	buffer_skip(write_buffer, sent);
	if(buffer_remaining(write_buffer) != 0) {
		/* Not everything was sent.  If the temporary buffer was
		 * in use, advance the real packet buffer past the
		 * payload bytes that did go out, so the next call
		 * resumes from the right position. */
		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
		}
	}

	data->bytes_transmitted += sent;
	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
		/*
		 * Still more data to write when socket becomes
		 * writable again.
		 */
		return;
	}

	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));
	if (data->query_state == QUERY_IN_AXFR) {
		/* Continue processing AXFR and writing back results. */
		buffer_clear(q->packet);
		data->query_state = query_axfr(data->nsd, q);
		if (data->query_state != QUERY_PROCESSED) {
			query_add_optional(data->query, data->nsd, &now);

			/* Reset data. */
			buffer_flip(q->packet);
			q->tcplen = buffer_remaining(q->packet);
			data->bytes_transmitted = 0;
			/* Reset to writing mode. */
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);

			/*
			 * Write data if/when the socket is writable
			 * again.
			 */
			return;
		}
	}

	/*
	 * Done sending, wait for the next request to arrive on the
	 * TCP socket by installing the TCP read handler.
	 */
	if ((data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) ||
		data->tcp_no_more_queries) {

		(void) shutdown(fd, SHUT_WR);
	}

	data->bytes_transmitted = 0;

	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
}
#endif

static void
handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
	void* ATTR_UNUSED(arg))
{
	if(slowaccept) {
		configure_handler_event_types(EV_PERSIST | EV_READ);
		slowaccept = 0;
	}
}

static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
{
#ifndef HAVE_ACCEPT4
	int s = accept(fd, addr, addrlen);
	if (s != -1) {
		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
			close(s);
			s = -1;
			/* Set errno to EINTR so the caller treats this
			 * like an interrupted accept and suppresses the
			 * error printout, as it does for EINTR from
			 * accept4. */
			errno = EINTR;
		}
	}
	return s;
#else
	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
#endif /* HAVE_ACCEPT4 */
}
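
/*
 * When accept() fails with EMFILE/ENFILE (file descriptor exhaustion),
 * handle_tcp_accept below disables all accept events and arms
 * slowaccept_event as a one-shot timer; handle_slowaccept_timeout
 * above re-enables accepting after SLOW_ACCEPT_TIMEOUT seconds.  This
 * keeps the server from spinning on a full descriptor table.
 */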
/*
 * Handle an incoming TCP connection.  The connection is accepted and
 * a new TCP reader event handler is added.  The TCP handler
 * is responsible for cleanup when the connection is closed.
 */
static void
handle_tcp_accept(int fd, short event, void* arg)
{
	struct tcp_accept_handler_data *data
		= (struct tcp_accept_handler_data *) arg;
	int s;
	int reject = 0;
	struct tcp_handler_data *tcp_data;
	region_type *tcp_region;
#ifdef INET6
	struct sockaddr_storage addr;
#else
	struct sockaddr_in addr;
#endif
	socklen_t addrlen;
	struct timeval timeout;

	if (!(event & EV_READ)) {
		return;
	}

	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
		reject = data->nsd->options->tcp_reject_overflow;
		if (!reject) {
			return;
		}
	}

	/* Accept it... */
	addrlen = sizeof(addr);
	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
	if (s == -1) {
		/*
		 * EMFILE and ENFILE signal that the limit of open
		 * file descriptors has been reached.  Pause accept().
		 * EINTR is a signal interrupt.  The others are various
		 * OS ways of saying that the client has closed the
		 * connection.
		 */
		if (errno == EMFILE || errno == ENFILE) {
			if (!slowaccept) {
				/* disable accept events */
				struct timeval tv;
				configure_handler_event_types(0);
				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
				tv.tv_usec = 0L;
				memset(&slowaccept_event, 0,
					sizeof(slowaccept_event));
				event_set(&slowaccept_event, -1, EV_TIMEOUT,
					handle_slowaccept_timeout, NULL);
				(void)event_base_set(data->event.ev_base,
					&slowaccept_event);
				(void)event_add(&slowaccept_event, &tv);
				slowaccept = 1;
				/* We don't want to spam the logs here */
			}
		} else if (errno != EINTR
			&& errno != EWOULDBLOCK
#ifdef ECONNABORTED
			&& errno != ECONNABORTED
#endif /* ECONNABORTED */
#ifdef EPROTO
			&& errno != EPROTO
#endif /* EPROTO */
			) {
			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
		}
		return;
	}

	if (reject) {
		shutdown(s, SHUT_RDWR);
		close(s);
		return;
	}

	/*
	 * This region is deallocated when the TCP connection is
	 * closed by the TCP handler.
	 */
	tcp_region = region_create(xalloc, free);
	tcp_data = (struct tcp_handler_data *) region_alloc(
		tcp_region, sizeof(struct tcp_handler_data));
	tcp_data->region = tcp_region;
	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
		compression_table_size, compressed_dnames);
	tcp_data->nsd = data->nsd;
	tcp_data->query_count = 0;
#ifdef HAVE_SSL
	tcp_data->shake_state = tls_hs_none;
	tcp_data->tls = NULL;
#endif
	tcp_data->prev = NULL;
	tcp_data->next = NULL;

	tcp_data->query_state = QUERY_PROCESSED;
	tcp_data->bytes_transmitted = 0;
	memcpy(&tcp_data->query->addr, &addr, addrlen);
	tcp_data->query->addrlen = addrlen;

	tcp_data->tcp_no_more_queries = 0;
	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
		/* very busy, give smaller timeout */
		tcp_data->tcp_timeout = 200;
	}
	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;

#ifdef USE_DNSTAP
	/* save the address of the connection */
	tcp_data->socket = data->socket;
#endif /* USE_DNSTAP */

#ifdef HAVE_SSL
	if (data->tls_accept) {
		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
		if(!tcp_data->tls) {
			close(s);
			return;
		}
		tcp_data->shake_state = tls_hs_read;
		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
			handle_tls_reading, tcp_data);
	} else {
#endif
		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
			handle_tcp_reading, tcp_data);
#ifdef HAVE_SSL
	}
#endif
	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
		log_msg(LOG_ERR, "cannot set tcp event base");
		close(s);
		region_destroy(tcp_region);
		return;
	}
	if(event_add(&tcp_data->event, &timeout) != 0) {
		log_msg(LOG_ERR, "cannot add tcp to event base");
		close(s);
		region_destroy(tcp_region);
		return;
	}
	if(tcp_active_list) {
		tcp_active_list->prev = tcp_data;
		tcp_data->next = tcp_active_list;
	}
	tcp_active_list = tcp_data;

	/*
	 * Keep track of the total number of TCP handlers installed so
	 * we can stop accepting connections when the maximum number
	 * of simultaneous TCP connections is reached.
	 *
	 * If tcp-reject-overflow is enabled, however, then we do not
	 * change the handler event type; we keep it as-is and accept
	 * overflow TCP connections only so that we can forcibly kill
	 * them off.
	 */
	++data->nsd->current_tcp_count;
	if (!data->nsd->options->tcp_reject_overflow &&
		data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
	{
		configure_handler_event_types(0);
	}
}
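
/*
 * Parent-to-child IPC: the main server process writes a one-word
 * command (a sig_atomic_t such as NSD_QUIT or NSD_QUIT_CHILD) down each
 * child's command pipe.  With a nonzero timeout, the parent also waits
 * via block_read() for the child to write an acknowledgement back
 * before the pipe is closed.
 */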
static void
send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
{
	size_t i;
	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
			if (write(nsd->children[i].child_fd,
				&command,
				sizeof(command)) == -1)
			{
				if(errno != EAGAIN && errno != EINTR)
					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
						(int) command,
						(int) nsd->children[i].pid,
						strerror(errno));
			} else if (timeout > 0) {
				(void)block_read(NULL,
					nsd->children[i].child_fd,
					&command, sizeof(command), timeout);
			}
			fsync(nsd->children[i].child_fd);
			close(nsd->children[i].child_fd);
			nsd->children[i].child_fd = -1;
		}
	}
}

static void
send_children_quit(struct nsd* nsd)
{
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
	send_children_command(nsd, NSD_QUIT, 0);
}

static void
send_children_quit_and_wait(struct nsd* nsd)
{
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
	send_children_command(nsd, NSD_QUIT_CHILD, 3);
}

#ifdef BIND8_STATS
static void
set_children_stats(struct nsd* nsd)
{
	size_t i;
	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].need_to_send_STATS = 1;
		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
	}
}
#endif /* BIND8_STATS */

static void
configure_handler_event_types(short event_types)
{
	size_t i;

	for (i = 0; i < tcp_accept_handler_count; ++i) {
		struct event* handler = &tcp_accept_handlers[i].event;
		if(event_types) {
			/* reassign */
			int fd = handler->ev_fd;
			struct event_base* base = handler->ev_base;
			if(tcp_accept_handlers[i].event_added)
				event_del(handler);
			memset(handler, 0, sizeof(*handler));
			event_set(handler, fd, event_types,
				handle_tcp_accept, &tcp_accept_handlers[i]);
			if(event_base_set(base, handler) != 0)
				log_msg(LOG_ERR, "conhand: cannot event_base");
			if(event_add(handler, NULL) != 0)
				log_msg(LOG_ERR, "conhand: cannot event_add");
			tcp_accept_handlers[i].event_added = 1;
		} else {
			/* remove */
			if(tcp_accept_handlers[i].event_added) {
				event_del(handler);
				tcp_accept_handlers[i].event_added = 0;
			}
		}
	}
}
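
/*
 * configure_handler_event_types() above is the single switch for TCP
 * accept: it is called with EV_READ|EV_PERSIST to (re)enable accepting
 * (from cleanup_tcp_handler and handle_slowaccept_timeout) and with 0
 * to disable it (from handle_tcp_accept, on descriptor exhaustion or
 * when maximum_tcp_count is reached without tcp-reject-overflow).
 */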