/*
 * server.c -- nsd(8) network input/output
 *
 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
 *
 * See LICENSE for the license.
 *
 */

#include "config.h"

#include <sys/types.h>
#include <sys/param.h>
#include <limits.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <netinet/in.h>
#ifdef USE_TCP_FASTOPEN
#include <netinet/tcp.h>
#endif
#include <arpa/inet.h>

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include <netdb.h>
#include <poll.h>
#ifdef HAVE_SYS_RANDOM_H
#include <sys/random.h>
#endif
#ifndef SHUT_WR
#define SHUT_WR 1
#endif
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif /* HAVE_MMAP */
#ifdef HAVE_OPENSSL_RAND_H
#include <openssl/rand.h>
#endif
#ifdef HAVE_OPENSSL_SSL_H
#include <openssl/ssl.h>
#endif
#ifdef HAVE_OPENSSL_ERR_H
#include <openssl/err.h>
#endif
#ifdef HAVE_OPENSSL_OCSP_H
#include <openssl/ocsp.h>
#endif
#ifndef USE_MINI_EVENT
# ifdef HAVE_EVENT_H
#  include <event.h>
# else
#  include <event2/event.h>
#  include "event2/event_struct.h"
#  include "event2/event_compat.h"
# endif
#else
# include "mini_event.h"
#endif

#include "axfr.h"
#include "namedb.h"
#include "netio.h"
#include "xfrd.h"
#include "xfrd-tcp.h"
#include "xfrd-disk.h"
#include "difffile.h"
#include "nsec3.h"
#include "ipc.h"
#include "udb.h"
#include "remote.h"
#include "lookup3.h"
#include "rrl.h"
#ifdef USE_DNSTAP
#include "dnstap/dnstap_collector.h"
#endif

#define RELOAD_SYNC_TIMEOUT 25 /* seconds */

#ifdef USE_DNSTAP
/*
 * log_addr() - prints the contents of a sockaddr_in/sockaddr_in6
 * structure, just like it is done in Unbound via the same
 * log_addr(VERB_LEVEL, const char*, sockaddr_storage*)
 */
static void
log_addr(const char* descr,
#ifdef INET6
	struct sockaddr_storage* addr
#else
	struct sockaddr_in* addr
#endif
	)
{
	char str_buf[64];
	if(verbosity < 6)
		return;
	if(
#ifdef INET6
		addr->ss_family == AF_INET
#else
		addr->sin_family == AF_INET
#endif
		) {
		struct sockaddr_in* s = (struct sockaddr_in*)addr;
		inet_ntop(AF_INET, &s->sin_addr.s_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s->sin_port)));
#ifdef INET6
	} else {
		struct sockaddr_in6* s6 = (struct sockaddr_in6*)addr;
		inet_ntop(AF_INET6, &s6->sin6_addr.s6_addr, str_buf, sizeof(str_buf));
		VERBOSITY(6, (LOG_INFO, "%s: address is: %s, port is: %d", descr, str_buf, ntohs(s6->sin6_port)));
#endif
	}
}
#endif /* USE_DNSTAP */

#ifdef USE_TCP_FASTOPEN
#define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
#define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
#endif
/*
 * Data for the UDP handlers.
 */
struct udp_handler_data
{
	struct nsd *nsd;
	struct nsd_socket *socket;
	struct event event;
};

struct tcp_accept_handler_data {
	struct nsd *nsd;
	struct nsd_socket *socket;
	int event_added;
	struct event event;
#ifdef HAVE_SSL
	/* handler accepts TLS connections on the dedicated port */
	int tls_accept;
#endif
};

/*
 * These globals are used to enable the TCP accept handlers
 * when the number of TCP connections drops below the maximum
 * number of TCP connections.
 */
static size_t tcp_accept_handler_count;
static struct tcp_accept_handler_data *tcp_accept_handlers;

static struct event slowaccept_event;
static int slowaccept;

#ifdef HAVE_SSL
static unsigned char *ocspdata = NULL;
static long ocspdata_len = 0;
#endif

#ifdef NONBLOCKING_IS_BROKEN
/* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
   read multiple times from a socket when reported ready by select. */
# define NUM_RECV_PER_SELECT (1)
#else /* !NONBLOCKING_IS_BROKEN */
# define NUM_RECV_PER_SELECT (100)
#endif /* NONBLOCKING_IS_BROKEN */

#ifndef HAVE_MMSGHDR
struct mmsghdr {
	struct msghdr msg_hdr;
	unsigned int msg_len;
};
#endif

static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
static struct iovec iovecs[NUM_RECV_PER_SELECT];
static struct query *queries[NUM_RECV_PER_SELECT];
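
/*
 * Illustrative sketch, not part of NSD: how arrays like msgs[] and
 * iovecs[] above are typically used to batch UDP reads, so that one
 * wakeup can service up to NUM_RECV_PER_SELECT queries. The
 * HAVE_RECVMMSG guard is an assumption here; on platforms without
 * recvmmsg(2) the same effect takes one recvmsg(2) call per slot, as
 * the struct mmsghdr fallback above suggests.
 */
#if 0 /* example only, not compiled */
static int
example_batched_receive(int fd)
{
	int i = 0;
#ifdef HAVE_RECVMMSG
	/* read up to NUM_RECV_PER_SELECT datagrams in one system call */
	i = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
#else
	while(i < NUM_RECV_PER_SELECT) {
		ssize_t r = recvmsg(fd, &msgs[i].msg_hdr, 0);
		if(r == -1)
			break; /* EAGAIN: no more datagrams queued */
		msgs[i].msg_len = (unsigned int)r;
		i++;
	}
#endif
	/* msgs[0..i-1] now each hold one query of msg_len octets */
	return i;
}
#endif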
/*
 * Data for the TCP connection handlers.
 *
 * The TCP handlers use non-blocking I/O. This is necessary to avoid
 * blocking the entire server on a slow TCP connection, but does make
 * reading from and writing to the socket more complicated.
 *
 * Basically, whenever a read/write would block (indicated by the
 * EAGAIN errno variable) we remember the position we were reading
 * from/writing to and return from the TCP reading/writing event
 * handler. When the socket becomes readable/writable again we
 * continue from the same position.
 */
struct tcp_handler_data
{
	/*
	 * The region used to allocate all TCP connection related
	 * data, including this structure. This region is destroyed
	 * when the connection is closed.
	 */
	region_type* region;

	/*
	 * The global nsd structure.
	 */
	struct nsd* nsd;

	/*
	 * The current query data for this TCP connection.
	 */
	query_type* query;

	/*
	 * The query_state is used to remember if we are performing an
	 * AXFR, if we're done processing, or if we should discard the
	 * query and connection.
	 */
	query_state_type query_state;

	/*
	 * The event for the file descriptor and tcp timeout
	 */
	struct event event;

	/*
	 * The bytes_transmitted field is used to remember the number
	 * of bytes transmitted when receiving or sending a DNS
	 * packet. The count includes the two additional bytes used
	 * to specify the packet length on a TCP connection.
	 */
	size_t bytes_transmitted;

	/*
	 * The number of queries handled by this specific TCP connection.
	 */
	int query_count;

	/*
	 * The timeout in msec for this tcp connection
	 */
	int tcp_timeout;

	/*
	 * If the connection is allowed to have further queries on it.
	 */
	int tcp_no_more_queries;

#ifdef USE_DNSTAP
	/* the socket of the accept socket, to find the proper service
	 * (local) address the socket is bound to. */
	struct nsd_socket *socket;
#endif /* USE_DNSTAP */

#ifdef HAVE_SSL
	/*
	 * TLS object.
	 */
	SSL* tls;

	/*
	 * TLS handshake state.
	 */
	enum { tls_hs_none, tls_hs_read, tls_hs_write,
		tls_hs_read_event, tls_hs_write_event } shake_state;
#endif
	/* list of connections, for service of remaining tcp channels */
	struct tcp_handler_data *prev, *next;
};
/* global that is the list of active tcp channels */
static struct tcp_handler_data *tcp_active_list = NULL;
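
/*
 * Illustrative sketch, not part of NSD: the resume pattern described
 * above, for the receiving direction. bytes_transmitted counts the
 * two-octet length prefix of DNS-over-TCP (RFC 1035, section 4.2.2)
 * plus the message octets read so far; on EAGAIN the function simply
 * returns and the event loop calls it again when the socket is
 * readable. Assumes buf can hold 2 + 65535 octets. Returns 1 when a
 * full message is in, 0 to wait for more data, -1 on error or EOF.
 */
#if 0 /* example only, not compiled */
static int
example_resume_read(int fd, uint8_t* buf, size_t* bytes_transmitted)
{
	size_t target = 2; /* first read up to the length prefix */
	uint16_t msglen;
	ssize_t received;

	if(*bytes_transmitted >= 2) {
		memcpy(&msglen, buf, sizeof(msglen));
		target = 2 + (size_t)ntohs(msglen);
	}
	received = read(fd, buf + *bytes_transmitted,
		target - *bytes_transmitted);
	if(received == -1)
		return (errno == EAGAIN || errno == EINTR) ? 0 : -1;
	if(received == 0)
		return -1; /* connection closed mid-message */
	*bytes_transmitted += (size_t)received;
	if(*bytes_transmitted < 2)
		return 0; /* length prefix itself is still incomplete */
	memcpy(&msglen, buf, sizeof(msglen));
	return *bytes_transmitted == 2 + (size_t)ntohs(msglen) ? 1 : 0;
}
#endif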
/*
 * Handle incoming queries on the UDP server sockets.
 */
static void handle_udp(int fd, short event, void* arg);

/*
 * Handle incoming connections on the TCP sockets. These handlers
 * usually wait for the NETIO_EVENT_READ event (indicating an incoming
 * connection) but are disabled when the number of current TCP
 * connections is equal to the maximum number of TCP connections.
 * Disabling is done by changing the handler to wait for the
 * NETIO_EVENT_NONE type. This is done using the function
 * configure_tcp_accept_handlers.
 */
static void handle_tcp_accept(int fd, short event, void* arg);

/*
 * Handle incoming queries on a TCP connection. The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete query is received.
 */
static void handle_tcp_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TCP connection. The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete response is sent.
 */
static void handle_tcp_writing(int fd, short event, void* arg);

#ifdef HAVE_SSL
/* Create SSL object and associate fd */
static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
/*
 * Handle the TLS handshake. May be called multiple times if incomplete.
 */
static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);

/*
 * Handle incoming queries on a TLS over TCP connection. The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete query is received.
 */
static void handle_tls_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TLS over TCP connection. The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete response is sent.
 */
static void handle_tls_writing(int fd, short event, void* arg);
#endif

/*
 * Send all children the quit nonblocking, then close pipe.
 */
static void send_children_quit(struct nsd* nsd);
/* same, for shutdown time, waits for child to exit to avoid restart issues */
static void send_children_quit_and_wait(struct nsd* nsd);

/* set children's flags to send NSD_STATS to them */
#ifdef BIND8_STATS
static void set_children_stats(struct nsd* nsd);
#endif /* BIND8_STATS */

/*
 * Change the event types the HANDLERS are interested in to EVENT_TYPES.
 */
static void configure_handler_event_types(short event_types);

static uint16_t *compressed_dname_offsets = 0;
static uint32_t compression_table_capacity = 0;
static uint32_t compression_table_size = 0;
static domain_type* compressed_dnames[MAXRRSPP];

#ifdef USE_TCP_FASTOPEN
/* Checks to see if the kernel value must be manually changed in order for
   TCP Fast Open to support server mode */
static void report_tcp_fastopen_config(void) {

	int tcp_fastopen_fd;
	uint8_t tcp_fastopen_value;

	if ( (tcp_fastopen_fd = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n",
			strerror(errno));
		return;
	}
	if (read(tcp_fastopen_fd, &tcp_fastopen_value, 1) == -1 ) {
		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n",
			strerror(errno));
		close(tcp_fastopen_fd);
		return;
	}
	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
		log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n");
		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
		log_msg(LOG_WARNING, "To enable TFO use the command:");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
	}
	close(tcp_fastopen_fd);
}
#endif

/*
 * Remove the specified pid from the list of child pids. Returns -1 if
 * the pid is not in the list, child_num otherwise. The field is set to 0.
 */
static int
delete_child_pid(struct nsd *nsd, pid_t pid)
{
	size_t i;
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid == pid) {
			nsd->children[i].pid = 0;
			if(!nsd->children[i].need_to_exit) {
				if(nsd->children[i].child_fd != -1)
					close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
				if(nsd->children[i].handler)
					nsd->children[i].handler->fd = -1;
			}
			return i;
		}
	}
	return -1;
}

/*
 * Restart child servers if necessary.
 */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	struct main_ipc_handler_data *ipc_data;
	size_t i;
	int sv[2];

	/* Fork the child processes... */
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid <= 0) {
			if (nsd->children[i].child_fd != -1)
				close(nsd->children[i].child_fd);
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
				log_msg(LOG_ERR, "socketpair: %s",
					strerror(errno));
				return -1;
			}
			nsd->children[i].child_fd = sv[0];
			nsd->children[i].parent_fd = sv[1];
			nsd->children[i].pid = fork();
			switch (nsd->children[i].pid) {
			default: /* SERVER MAIN */
				close(nsd->children[i].parent_fd);
				nsd->children[i].parent_fd = -1;
				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				if(!nsd->children[i].handler)
				{
					ipc_data = (struct main_ipc_handler_data*) region_alloc(
						region, sizeof(struct main_ipc_handler_data));
					ipc_data->nsd = nsd;
					ipc_data->child = &nsd->children[i];
					ipc_data->child_num = i;
					ipc_data->xfrd_sock = xfrd_sock_p;
					ipc_data->packet = buffer_create(region, QIOBUFSZ);
					ipc_data->forward_mode = 0;
					ipc_data->got_bytes = 0;
					ipc_data->total_bytes = 0;
					ipc_data->acl_num = 0;
					nsd->children[i].handler = (struct netio_handler*) region_alloc(
						region, sizeof(struct netio_handler));
					nsd->children[i].handler->fd = nsd->children[i].child_fd;
					nsd->children[i].handler->timeout = NULL;
					nsd->children[i].handler->user_data = ipc_data;
					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
					nsd->children[i].handler->event_handler = parent_handle_child_command;
					netio_add_handler(netio, nsd->children[i].handler);
				}
				/* clear any ongoing ipc */
				ipc_data = (struct main_ipc_handler_data*)
					nsd->children[i].handler->user_data;
				ipc_data->forward_mode = 0;
				/* restart - update fd */
				nsd->children[i].handler->fd = nsd->children[i].child_fd;
				break;
			case 0: /* CHILD */
				/* the child need not be able to access the
				 * nsd.db file */
				namedb_close_udb(nsd->db);
#ifdef MEMCLEAN /* OS collects memory pages */
				region_destroy(region);
#endif

				if (pledge("stdio rpath inet", NULL) == -1) {
					log_msg(LOG_ERR, "pledge failed: %s", strerror(errno));
					exit(1);
				}

				nsd->pid = 0;
				nsd->child_count = 0;
				nsd->server_kind = nsd->children[i].kind;
				nsd->this_child = &nsd->children[i];
				nsd->this_child->child_num = i;
				/* remove signal flags inherited from parent;
				   the parent will handle them. */
				nsd->signal_hint_reload_hup = 0;
				nsd->signal_hint_reload = 0;
				nsd->signal_hint_child = 0;
				nsd->signal_hint_quit = 0;
				nsd->signal_hint_shutdown = 0;
				nsd->signal_hint_stats = 0;
				nsd->signal_hint_statsusr = 0;
				close(*xfrd_sock_p);
				close(nsd->this_child->child_fd);
				nsd->this_child->child_fd = -1;
				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				server_child(nsd);
				/* NOTREACH */
				exit(0);
			case -1:
				log_msg(LOG_ERR, "fork failed: %s",
					strerror(errno));
				return -1;
			}
		}
	}
	return 0;
}

#ifdef BIND8_STATS
static void set_bind8_alarm(struct nsd* nsd)
{
	/* resync so that the next alarm is on the next whole minute */
	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
}
#endif
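
/*
 * Worked example for the resync above (illustrative): with a
 * statistics period of 3600 seconds and time(NULL) = 7250, the alarm
 * is set to 3600 - (7250 % 3600) = 3600 - 50 = 3550 seconds, so it
 * fires at t = 10800, a whole multiple of the period.
 */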
/* set zone stat ids for zones initially read in */
static void
zonestatid_tree_set(struct nsd* nsd)
{
	struct radnode* n;
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		zone_type* zone = (zone_type*)n->elem;
		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
	}
}

#ifdef USE_ZONE_STATS
void
server_zonestat_alloc(struct nsd* nsd)
{
	size_t num = (nsd->options->zonestatnames->count==0?1:
			nsd->options->zonestatnames->count);
	size_t sz = sizeof(struct nsdst)*num;
	char tmpfile[256];
	uint8_t z = 0;

	/* file names */
	nsd->zonestatfname[0] = 0;
	nsd->zonestatfname[1] = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);

	/* file descriptors */
	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[0] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		close(nsd->zonestatfd[0]);
		unlink(nsd->zonestatfname[0]);
		exit(1);
	}

#ifdef HAVE_MMAP
	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[0], strerror(errno));
		exit(1);
	}
	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[1], strerror(errno));
		exit(1);
	}
	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[0], 0);
	if(nsd->zonestat[0] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[1], 0);
	if(nsd->zonestat[1] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->zonestat[0], 0, sz);
	memset(nsd->zonestat[1], 0, sz);
	nsd->zonestatsize[0] = num;
	nsd->zonestatsize[1] = num;
	nsd->zonestatdesired = num;
	nsd->zonestatsizenow = num;
	nsd->zonestatnow = nsd->zonestat[0];
#endif /* HAVE_MMAP */
}

void
zonestat_remap(struct nsd* nsd, int idx, size_t sz)
{
#ifdef HAVE_MMAP
#ifdef MREMAP_MAYMOVE
	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
		MREMAP_MAYMOVE);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
		exit(1);
	}
#else /* !MREMAP_MAYMOVE */
	if(msync(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
	if(munmap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		exit(1);
	}
#endif /* MREMAP_MAYMOVE */
#endif /* HAVE_MMAP */
}

/* realloc the zonestat array for the one that is not currently in use,
 * to match the desired new size of the array (if applicable) */
void
server_zonestat_realloc(struct nsd* nsd)
{
#ifdef HAVE_MMAP
	uint8_t z = 0;
	size_t sz;
	int idx = 0; /* index of the zonestat array that is not in use */
	if(nsd->zonestatnow == nsd->zonestat[0])
		idx = 1;
	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
		return;
	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[idx], strerror(errno));
		exit(1);
	}
	zonestat_remap(nsd, idx, sz);
	/* zero the newly allocated region */
	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
	}
	nsd->zonestatsize[idx] = nsd->zonestatdesired;
#endif /* HAVE_MMAP */
}

/* switch over to the other array for the new children, which briefly
 * coexist with the old children; we want to avoid both generations
 * writing to the same statistics arrays. */
void
server_zonestat_switch(struct nsd* nsd)
{
	if(nsd->zonestatnow == nsd->zonestat[0]) {
		nsd->zonestatnow = nsd->zonestat[1];
		nsd->zonestatsizenow = nsd->zonestatsize[1];
	} else {
		nsd->zonestatnow = nsd->zonestat[0];
		nsd->zonestatsizenow = nsd->zonestatsize[0];
	}
}
#endif /* USE_ZONE_STATS */

static void
cleanup_dname_compression_tables(void *ptr)
{
	free(ptr);
	compressed_dname_offsets = NULL;
	compression_table_capacity = 0;
}

static void
initialize_dname_compression_tables(struct nsd *nsd)
{
	size_t needed = domain_table_count(nsd->db->domains) + 1;
	needed += EXTRA_DOMAIN_NUMBERS;
	if(compression_table_capacity < needed) {
		if(compressed_dname_offsets) {
			region_remove_cleanup(nsd->db->region,
				cleanup_dname_compression_tables,
				compressed_dname_offsets);
			free(compressed_dname_offsets);
		}
		compressed_dname_offsets = (uint16_t *) xmallocarray(
			needed, sizeof(uint16_t));
		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
			compressed_dname_offsets);
		compression_table_capacity = needed;
		compression_table_size=domain_table_count(nsd->db->domains)+1;
	}
	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
}

static int
set_cloexec(struct nsd_socket *sock)
{
	assert(sock != NULL);

	if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) {
		const char *socktype =
			sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
		log_msg(LOG_ERR, "fcntl(..., FD_CLOEXEC) failed for %s: %s",
			socktype, strerror(errno));
		return -1;
	}

	return 1;
}

static int
set_reuseport(struct nsd_socket *sock)
{
#ifdef SO_REUSEPORT
	int on = 1;
#ifdef SO_REUSEPORT_LB
	/* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like
	 * SO_REUSEPORT on Linux. This is what the users want with the config
	 * option in nsd.conf; if we actually need local address and port reuse
	 * they'll also need to have SO_REUSEPORT set for them, assume it was
	 * _LB they want.
	 */
	int opt = SO_REUSEPORT_LB;
	static const char optname[] = "SO_REUSEPORT_LB";
#else /* !SO_REUSEPORT_LB */
	int opt = SO_REUSEPORT;
	static const char optname[] = "SO_REUSEPORT";
#endif /* SO_REUSEPORT_LB */

	if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) {
		return 1;
	} else if(verbosity >= 3 || errno != ENOPROTOOPT) {
		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
			optname, strerror(errno));
	}
	return -1;
#else
	(void)sock;
#endif /* SO_REUSEPORT */

	return 0;
}
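
/*
 * Illustrative sketch, not part of NSD: what the reuseport option
 * enables. Every server process opens its own socket and sets
 * SO_REUSEPORT (or SO_REUSEPORT_LB) before bind(); the kernel then
 * load-balances incoming packets across all sockets bound to the same
 * address and port. Error details are omitted for brevity.
 */
#if 0 /* example only, not compiled */
static int
example_reuseport_socket(const struct sockaddr_in* addr)
{
	int on = 1;
	int s = socket(AF_INET, SOCK_DGRAM, 0);
	if(s == -1)
		return -1;
	/* set before bind(); all processes do the same for one port */
	if(setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on)) == -1 ||
	   bind(s, (const struct sockaddr*)addr, sizeof(*addr)) == -1) {
		close(s);
		return -1;
	}
	return s;
}
#endif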
static int
set_reuseaddr(struct nsd_socket *sock)
{
#ifdef SO_REUSEADDR
	int on = 1;
	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_REUSEADDR */
	return 0;
}

static int
set_rcvbuf(struct nsd_socket *sock, int rcv)
{
#ifdef SO_RCVBUF
#ifdef SO_RCVBUFFORCE
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	if(errno == EPERM || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
		strerror(errno));
	return -1;
#else /* !SO_RCVBUFFORCE */
	if (0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	if(errno == ENOSYS || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_RCVBUFFORCE */
#endif /* SO_RCVBUF */

	return 0;
}

static int
set_sndbuf(struct nsd_socket *sock, int snd)
{
#ifdef SO_SNDBUF
#ifdef SO_SNDBUFFORCE
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
	{
		return 1;
	}
	if(errno == EPERM || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
		strerror(errno));
	return -1;
#else /* !SO_SNDBUFFORCE */
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
	{
		return 1;
	}
	if(errno == ENOSYS || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_SNDBUFFORCE */
#endif /* SO_SNDBUF */

	return 0;
}

static int
set_nonblock(struct nsd_socket *sock)
{
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";

	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
		log_msg(LOG_ERR, "fcntl(..., O_NONBLOCK) failed for %s: %s",
			socktype, strerror(errno));
		return -1;
	}

	return 1;
}

#ifdef INET6
static int
set_ipv6_v6only(struct nsd_socket *sock)
{
#ifdef IPV6_V6ONLY
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";

	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IPV6_V6ONLY */

	return 0;
}
#endif /* INET6 */

#ifdef INET6
static int
set_ipv6_use_min_mtu(struct nsd_socket *sock)
{
#if defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU)
#if defined(IPV6_USE_MIN_MTU)
	/* There is no fragmentation of IPv6 datagrams during forwarding in the
	 * network. Therefore we do not send UDP datagrams larger than the
	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
	 * larger if the network stack supports IPV6_USE_MIN_MTU.
	 */
	int opt = IPV6_USE_MIN_MTU;
	int optval = 1;
	static const char optname[] = "IPV6_USE_MIN_MTU";
#elif defined(IPV6_MTU)
	/* On Linux, PMTUD is disabled by default for datagrams, so set the
	 * MTU to the minimum MTU to get the same effect.
	 */
	int opt = IPV6_MTU;
	int optval = IPV6_MIN_MTU;
	static const char optname[] = "IPV6_MTU";
#endif
	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
		optname, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IPV6_USE_MIN_MTU || IPV6_MTU */

	return 0;
}
#endif /* INET6 */

static int
set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
{
	int ret = 0;

#if defined(IP_MTU_DISCOVER)
	int opt = IP_MTU_DISCOVER;
	int optval;
# if defined(IP_PMTUDISC_OMIT)
	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
	 * information and send packets with DF=0. Fragmentation is allowed
	 * if and only if the packet size exceeds the outgoing interface MTU
	 * or the packet encounters a smaller MTU link in the network. This
	 * mitigates DNS fragmentation attacks by preventing forged PMTU
	 * information. FreeBSD already has the same semantics without
	 * setting the option.
	 */
	optval = IP_PMTUDISC_OMIT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
# endif /* IP_PMTUDISC_OMIT */
# if defined(IP_PMTUDISC_DONT)
	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / is undefined. */
	optval = IP_PMTUDISC_DONT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
# endif
	ret = -1;
#elif defined(IP_DONTFRAG)
	int off = 0;
	if (0 == setsockopt(
		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
		strerror(errno));
	ret = -1;
#else
	(void)sock;
#endif

	return ret;
}

static int
set_ip_freebind(struct nsd_socket *sock)
{
#ifdef IP_FREEBIND
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
	{
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IP_FREEBIND */

	return 0;
}

static int
set_ip_transparent(struct nsd_socket *sock)
{
	/*
	The scandalous preprocessor blob here calls for some explanation :)
	POSIX does not specify an option to bind non-local IPs, so
	platforms developed several implementation-specific options,
	all set in the same way, but with different names.
	For additional complexity, some platforms manage this setting
	differently for different address families (IPv4 vs IPv6).
	The scandalous preprocessor blob below abstracts such variability
	in a way which leaves the C code as lean and clear as possible.
	*/
#if defined(IP_TRANSPARENT)
# define NSD_SOCKET_OPTION_TRANSPARENT IP_TRANSPARENT
# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP
# define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_TRANSPARENT"
/* as of 2020-01, Linux does not support this on IPv6 programmatically */
#elif defined(SO_BINDANY)
# define NSD_SOCKET_OPTION_TRANSPARENT SO_BINDANY
# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL SOL_SOCKET
# define NSD_SOCKET_OPTION_TRANSPARENT_NAME "SO_BINDANY"
#elif defined(IP_BINDANY)
# define NSD_SOCKET_OPTION_TRANSPARENT IP_BINDANY
# define NSD_SOCKET_OPTION_TRANSPARENT6 IPV6_BINDANY
# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP
# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 IPPROTO_IPV6
# define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_BINDANY"
#endif

#ifndef NSD_SOCKET_OPTION_TRANSPARENT
	(void)sock;
#else
# ifndef NSD_SOCKET_OPTION_TRANSPARENT6
#  define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
# endif
# ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
#  define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
# endif
# ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
#  define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
# endif

	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	const int is_ip6 = (sock->addr.ai_family == AF_INET6);

	if(0 == setsockopt(
		sock->s,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
		&on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno));
	return -1;
#endif

	return 0;
}

static int
set_tcp_maxseg(struct nsd_socket *sock, int mss)
{
#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
		strerror(errno));
	return -1;
#else
	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
#endif
	return 0;
}

#ifdef USE_TCP_FASTOPEN
static int
set_tcp_fastopen(struct nsd_socket *sock)
{
	/* qlen specifies how many outstanding TFO requests to allow. The
	 * limit is a defense against IP spoofing attacks as suggested in
	 * RFC 7413.
	 */
	int qlen;

#ifdef __APPLE__
	/* The macOS implementation only supports a qlen of 1 via this call.
	 * The actual value is configured by the
	 * net.inet.tcp.fastopen_backlog kernel parameter.
	 */
	qlen = 1;
#else
	/* 5 is recommended on Linux. */
	qlen = 5;
#endif
	if (0 == setsockopt(
		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
	{
		return 1;
	}

	if (errno == EPERM) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s "
				 "; this could likely be because sysctl "
				 "net.inet.tcp.fastopen.enabled, "
				 "net.inet.tcp.fastopen.server_enable, or "
				 "net.ipv4.tcp_fastopen is disabled",
			strerror(errno));
	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
	 * disabled, except when verbosity enabled for debugging
	 */
	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
			strerror(errno));
	}

	return (errno == ENOPROTOOPT ? 0 : -1);
}
#endif /* USE_TCP_FASTOPEN */

static int
set_bindtodevice(struct nsd_socket *sock)
{
#if defined(SO_BINDTODEVICE)
	if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
		sock->device, strlen(sock->device)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
			"SO_BINDTODEVICE", sock->device, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}

static int
set_setfib(struct nsd_socket *sock)
{
#if defined(SO_SETFIB)
	if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
		(const void *)&sock->fib, sizeof(sock->fib)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
			"SO_SETFIB", sock->fib, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}

static int
open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
	int rcv = 1*1024*1024, snd = 1*1024*1024;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
				"not supported");
			return 0;
		}
#endif
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	if(nsd->options->receive_buffer_size > 0)
		rcv = nsd->options->receive_buffer_size;
	if(set_rcvbuf(sock, rcv) == -1)
		return -1;

	if(nsd->options->send_buffer_size > 0)
		snd = nsd->options->send_buffer_size;
	if(set_sndbuf(sock, snd) == -1)
		return -1;
#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if(set_ipv6_v6only(sock) == -1 ||
		   set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	} else
#endif /* INET6 */
	if(sock->addr.ai_family == AF_INET) {
		if(set_ipv4_no_pmtu_disc(sock) == -1)
			return -1;
	}

	/* Set socket to non-blocking. Otherwise, on operating systems
	 * with thundering herd problems, the UDP recv could block
	 * after select returns readable. */
	set_nonblock(sock);

	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

	return 1;
}

static int
open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
#ifdef USE_TCP_FASTOPEN
	report_tcp_fastopen_config();
#endif

	(void)reuseport_works;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
				"not supported");
			return 0;
		}
#endif /* INET6 */
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	(void)set_reuseaddr(sock);

#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if (set_ipv6_v6only(sock) == -1 ||
		    set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	}
#endif

	if(nsd->tcp_mss > 0)
		set_tcp_maxseg(sock, nsd->tcp_mss);
	/* (Stevens UNP p463): if the TCP listening socket is blocking, then
	   it may block in accept, even if select() says readable. */
	(void)set_nonblock(sock);
	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

#ifdef USE_TCP_FASTOPEN
	(void)set_tcp_fastopen(sock);
#endif

	if(listen(sock->s, TCP_BACKLOG) == -1) {
		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
		return -1;
	}

	return 1;
}

/*
 * Initialize the server, set up reuseport, create and bind the sockets.
 */
int
server_init(struct nsd *nsd)
{
	size_t i;
	int reuseport = 1; /* Determine if REUSEPORT works. */
	/* open server interface ports */
	for(i = 0; i < nsd->ifs; i++) {
		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
		{
			return -1;
		}
	}

	if(nsd->reuseport && reuseport) {
		size_t ifs = nsd->ifs * nsd->reuseport;

		/* increase the size of the interface arrays, there are going
		 * to be separate interface file descriptors for every server
		 * instance */
		region_remove_cleanup(nsd->region, free, nsd->udp);
		region_remove_cleanup(nsd->region, free, nsd->tcp);

		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
		region_add_cleanup(nsd->region, free, nsd->udp);
		region_add_cleanup(nsd->region, free, nsd->tcp);
		if(ifs > nsd->ifs) {
			memset(&nsd->udp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->udp));
			memset(&nsd->tcp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
		}

		for(i = nsd->ifs; i < ifs; i++) {
			nsd->udp[i] = nsd->udp[i%nsd->ifs];
			nsd->udp[i].s = -1;
			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
				return -1;
			}
			/* Turn off REUSEPORT for TCP by copying the socket
			 * file descriptor.
			 * This means we should not close TCP used by
			 * other servers in reuseport enabled mode, in
			 * server_child().
			 */
			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
		}

		nsd->ifs = ifs;
	} else {
		nsd->reuseport = 0;
	}

	return 0;
}
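
/*
 * Worked example for the reuseport expansion above (illustrative):
 * with 2 configured interfaces and reuseport: 4 in nsd.conf,
 * nsd->ifs becomes 8. udp[0..7] each hold their own descriptor
 * (udp[i] clones the settings of udp[i % 2] and is re-opened), while
 * tcp[2..7] share the file descriptors of tcp[0..1]; TCP does not use
 * SO_REUSEPORT, so those shared descriptors must not be closed
 * per-server in server_child().
 */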
/*
 * Prepare the server for take off.
 *
 */
int
server_prepare(struct nsd *nsd)
{
#ifdef RATELIMIT
	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
#ifdef HAVE_GETRANDOM
	uint32_t v;
	if(getrandom(&v, sizeof(v), 0) == -1) {
		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
		exit(1);
	}
	hash_set_raninit(v);
#elif defined(HAVE_ARC4RANDOM)
	hash_set_raninit(arc4random());
#else
	uint32_t v = getpid() ^ time(NULL);
	srandom((unsigned long)v);
# ifdef HAVE_SSL
	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
		hash_set_raninit(v);
	else
# endif
		hash_set_raninit(random());
#endif
	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
		nsd->options->rrl_ratelimit,
		nsd->options->rrl_whitelist_ratelimit,
		nsd->options->rrl_slip,
		nsd->options->rrl_ipv4_prefix_length,
		nsd->options->rrl_ipv6_prefix_length);
#endif /* RATELIMIT */

	/* Open the database... */
	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
		log_msg(LOG_ERR, "unable to open the database %s: %s",
			nsd->dbfile, strerror(errno));
		unlink(nsd->task[0]->fname);
		unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		return -1;
	}
	/* check if zone files have been modified */
	/* NULL for taskudb because we send soainfo in a moment, batched up,
	 * for all zones */
	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
		nsd->options->database[0] == 0))
		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
	zonestatid_tree_set(nsd);

	compression_table_capacity = 0;
	initialize_dname_compression_tables(nsd);

#ifdef BIND8_STATS
	/* Initialize times... */
	time(&nsd->st.boot);
	set_bind8_alarm(nsd);
#endif /* BIND8_STATS */

	return 0;
}

/*
 * Fork the required number of servers.
 */
static int
server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	size_t i;

	/* Start all child servers initially. */
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].pid = 0;
	}

	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
}

static void
server_close_socket(struct nsd_socket *sock)
{
	if(sock->s != -1) {
		close(sock->s);
		sock->s = -1;
	}
}

void
server_close_all_sockets(struct nsd_socket sockets[], size_t n)
{
	size_t i;

	/* Close all the sockets... */
	for (i = 0; i < n; ++i) {
		server_close_socket(&sockets[i]);
	}
}

/*
 * Close the sockets, shutdown the server and exit.
 * Does not return.
 */
void
server_shutdown(struct nsd *nsd)
{
	size_t i;

	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	/* CHILD: close command channel to parent */
	if(nsd->this_child && nsd->this_child->parent_fd != -1)
	{
		close(nsd->this_child->parent_fd);
		nsd->this_child->parent_fd = -1;
	}
	/* SERVER: close command channels to children */
	if(!nsd->this_child)
	{
		for(i=0; i < nsd->child_count; ++i)
			if(nsd->children[i].child_fd != -1)
			{
				close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
			}
	}

	tsig_finalize();
#ifdef HAVE_SSL
	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
	if (nsd->tls_ctx)
		SSL_CTX_free(nsd->tls_ctx);
#endif

#ifdef MEMCLEAN /* OS collects memory pages */
#ifdef RATELIMIT
	rrl_mmap_deinit_keep_mmap();
#endif
#ifdef USE_DNSTAP
	dt_collector_destroy(nsd->dt_collector, nsd);
#endif
	udb_base_free_keep_mmap(nsd->task[0]);
	udb_base_free_keep_mmap(nsd->task[1]);
	namedb_close_udb(nsd->db); /* keeps mmap */
	namedb_close(nsd->db);
	nsd_options_destroy(nsd->options);
	region_destroy(nsd->region);
#endif
	log_finalize();
	exit(0);
}

void
server_prepare_xfrd(struct nsd* nsd)
{
	char tmpfile[256];
	/* create task mmaps */
	nsd->mytask = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[0] = task_file_create(tmpfile);
	if(!nsd->task[0]) {
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[1] = task_file_create(tmpfile);
	if(!nsd->task[1]) {
		unlink(nsd->task[0]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
	/* create xfrd listener structure */
	nsd->xfrd_listener = region_alloc(nsd->region,
		sizeof(netio_handler_type));
	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
	nsd->xfrd_listener->fd = -1;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
		nsd;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
		xfrd_tcp_create(nsd->region, QIOBUFSZ);
}
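
/*
 * Note (illustrative summary, condensed from the comments in the
 * functions around here): nsd->task[0] and nsd->task[1] form a double
 * buffer. nsd->mytask is the side the parent (and reload) writes to;
 * xfrd reads the other side. server_start_xfrd() flips nsd->mytask
 * when xfrd is (re)started, and server_send_soa_xfrd() flips it again
 * after the SOA/expire exchange, so the two processes never write the
 * same task file at the same time.
 */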
void
server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
{
	pid_t pid;
	int sockets[2] = {0,0};
	struct ipc_handler_conn_data *data;

	if(nsd->xfrd_listener->fd != -1)
		close(nsd->xfrd_listener->fd);
	if(del_db) {
		/* recreate taskdb that xfrd was using, it may be corrupt */
		/* we (or reload) use nsd->mytask, and xfrd uses the other */
		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
		nsd->task[1-nsd->mytask]->fname = NULL;
		/* free alloc already, so udb does not shrink itself */
		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
		nsd->task[1-nsd->mytask]->alloc = NULL;
		udb_base_free(nsd->task[1-nsd->mytask]);
		/* create new file, overwrite the old one */
		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
		free(tmpfile);
	}
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
		return;
	}
	pid = fork();
	switch (pid) {
	case -1:
		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
		break;
	default:
		/* PARENT: close first socket, use second one */
		close(sockets[0]);
		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		if(del_db) xfrd_free_namedb(nsd);
		/* use other task than I am using, since if xfrd died and is
		 * restarted, the reload is using nsd->mytask */
		nsd->mytask = 1 - nsd->mytask;

#ifdef HAVE_SETPROCTITLE
		setproctitle("xfrd");
#endif
#ifdef HAVE_CPUSET_T
		if(nsd->use_cpu_affinity) {
			set_cpu_affinity(nsd->xfrd_cpuset);
		}
#endif

		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
		/* ENOTREACH */
		break;
	case 0:
		/* CHILD: close second socket, use first one */
		close(sockets[1]);
		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		nsd->xfrd_listener->fd = sockets[0];
		break;
	}
	/* server-parent only */
	nsd->xfrd_listener->timeout = NULL;
	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
	/* clear ongoing ipc reads */
	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
	data->conn->is_reading = 0;
}

/** add all soainfo to taskdb */
static void
add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
{
	struct radnode* n;
	udb_ptr task_last; /* last task, mytask is empty so NULL */
	/* add all SOA INFO to mytask */
	udb_ptr_init(&task_last, taskudb);
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
	}
	udb_ptr_unlink(&task_last, taskudb);
}

void
server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
{
	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
	 * parent fills one taskdb with soas, xfrd fills other with expires.
	 * then they exchange and process.
	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
	 * may be in use by reload. Fill SOA in taskdb and give to xfrd.
	 * expire notifications can be sent back via a normal reload later
	 * (xfrd will wait for current running reload to finish if any).
	 */
	sig_atomic_t cmd = 0;
	pid_t mypid;
	int xfrd_sock = nsd->xfrd_listener->fd;
	struct udb_base* taskudb = nsd->task[nsd->mytask];
	udb_ptr t;
	if(!shortsoa) {
		if(nsd->signal_hint_shutdown) {
		shutdown:
			log_msg(LOG_WARNING, "signal received, shutting down...");
			server_close_all_sockets(nsd->udp, nsd->ifs);
			server_close_all_sockets(nsd->tcp, nsd->ifs);
#ifdef HAVE_SSL
			daemon_remote_close(nsd->rc);
#endif
			/* Unlink it if possible... */
			unlinkpid(nsd->pidfile);
			unlink(nsd->task[0]->fname);
			unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
			unlink(nsd->zonestatfname[0]);
			unlink(nsd->zonestatfname[1]);
#endif
			/* write the nsd.db to disk, wait for it to complete */
			udb_base_sync(nsd->db->udb, 1);
			udb_base_close(nsd->db->udb);
			server_shutdown(nsd);
			/* ENOTREACH */
			exit(0);
		}
	}
	if(shortsoa) {
		/* put SOA in xfrd task because mytask may be in use */
		taskudb = nsd->task[1-nsd->mytask];
	}

	add_all_soa_to_task(nsd, taskudb);
	if(!shortsoa) {
		/* wait for xfrd to signal task is ready, RELOAD signal */
		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
			cmd != NSD_RELOAD) {
			log_msg(LOG_ERR, "did not get start signal from xfrd");
			exit(1);
		}
		if(nsd->signal_hint_shutdown) {
			goto shutdown;
		}
	}
	/* give xfrd our task, signal it with RELOAD_DONE */
	task_process_sync(taskudb);
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
			(int)nsd->pid, strerror(errno));
	}
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	if(!shortsoa) {
		/* process the xfrd task works (expiry data) */
		nsd->mytask = 1 - nsd->mytask;
		taskudb = nsd->task[nsd->mytask];
		task_remap(taskudb);
		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
		while(!udb_ptr_is_null(&t)) {
			task_process_expire(nsd->db, TASKLIST(&t));
			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
		}
		udb_ptr_unlink(&t, taskudb);
		task_clear(taskudb);

		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
		cmd = NSD_RELOAD_DONE;
		if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) {
			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
				(int)nsd->pid, strerror(errno));
		}
	}
}
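
/*
 * Illustrative summary of the exchange in server_send_soa_xfrd()
 * above, for the normal (!shortsoa) case:
 *
 *   server-parent                        xfrd
 *     fill task[mytask] with SOA INFO
 *                    <-- NSD_RELOAD      (task ready)
 *     task_process_sync(), hand over
 *     NSD_RELOAD_DONE -->
 *     reload pid      -->
 *     swap mytask, process expire data
 *     from the other task file
 *     NSD_RELOAD_DONE -->                (task emptied)
 */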
#ifdef HAVE_SSL
static void
log_crypto_from_err(const char* str, unsigned long err)
{
	/* error:[error code]:[library name]:[function name]:[reason string] */
	char buf[128];
	unsigned long e;
	ERR_error_string_n(err, buf, sizeof(buf));
	log_msg(LOG_ERR, "%s crypto %s", str, buf);
	while( (e=ERR_get_error()) ) {
		ERR_error_string_n(e, buf, sizeof(buf));
		log_msg(LOG_ERR, "and additionally crypto %s", buf);
	}
}

void
log_crypto_err(const char* str)
{
	log_crypto_from_err(str, ERR_get_error());
}

/** true if the ssl handshake error has to be squelched from the logs */
static int
squelch_err_ssl_handshake(unsigned long err)
{
	if(verbosity >= 3)
		return 0; /* only squelch on low verbosity */
	/* this is very specific, we could filter on ERR_GET_REASON()
	 * (the third element in ERR_PACK) */
	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
#ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
#endif
#ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
# ifdef SSL_R_VERSION_TOO_LOW
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
# endif
#endif
		)
		return 1;
	return 0;
}

void
perform_openssl_init(void)
{
	/* init SSL library */
#ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
	ERR_load_crypto_strings();
#endif
#if defined(HAVE_ERR_LOAD_SSL_STRINGS) && !defined(DEPRECATED_ERR_LOAD_SSL_STRINGS)
	ERR_load_SSL_strings();
#endif
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
	OpenSSL_add_all_algorithms();
#else
	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
		| OPENSSL_INIT_ADD_ALL_DIGESTS
		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
#endif
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
	(void)SSL_library_init();
#else
	OPENSSL_init_ssl(0, NULL);
#endif

	if(!RAND_status()) {
		/* try to seed it */
		unsigned char buf[256];
		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
		size_t i;
		v = seed;
		for(i=0; i<256/sizeof(v); i++) {
			memmove(buf+i*sizeof(v), &v, sizeof(v));
			v = v*seed + (unsigned int)i;
		}
		RAND_seed(buf, 256);
		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
	}
}
log_crypto_err("could not find p256, not enabling ECDHE"); 1923 } else { 1924 if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) { 1925 log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE"); 1926 } 1927 EC_KEY_free (ecdh); 1928 } 1929 } 1930 #endif 1931 } 1932 1933 static int 1934 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg)) 1935 { 1936 if(ocspdata) { 1937 unsigned char *p; 1938 if ((p=malloc(ocspdata_len)) == NULL) { 1939 log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure"); 1940 return SSL_TLSEXT_ERR_NOACK; 1941 } 1942 memcpy(p, ocspdata, ocspdata_len); 1943 if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) { 1944 log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp"); 1945 free(p); 1946 return SSL_TLSEXT_ERR_NOACK; 1947 } 1948 return SSL_TLSEXT_ERR_OK; 1949 } else { 1950 return SSL_TLSEXT_ERR_NOACK; 1951 } 1952 } 1953 1954 SSL_CTX* 1955 server_tls_ctx_setup(char* key, char* pem, char* verifypem) 1956 { 1957 SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method()); 1958 if(!ctx) { 1959 log_crypto_err("could not SSL_CTX_new"); 1960 return NULL; 1961 } 1962 /* no SSLv2, SSLv3 because has defects */ 1963 #if SSL_OP_NO_SSLv2 != 0 1964 if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){ 1965 log_crypto_err("could not set SSL_OP_NO_SSLv2"); 1966 SSL_CTX_free(ctx); 1967 return NULL; 1968 } 1969 #endif 1970 if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3) 1971 != SSL_OP_NO_SSLv3){ 1972 log_crypto_err("could not set SSL_OP_NO_SSLv3"); 1973 SSL_CTX_free(ctx); 1974 return 0; 1975 } 1976 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1) 1977 /* if we have tls 1.1 disable 1.0 */ 1978 if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1) 1979 != SSL_OP_NO_TLSv1){ 1980 log_crypto_err("could not set SSL_OP_NO_TLSv1"); 1981 SSL_CTX_free(ctx); 1982 return 0; 1983 } 1984 #endif 1985 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2) 1986 /* if we have tls 1.2 disable 1.1 */ 1987 if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1) 1988 != SSL_OP_NO_TLSv1_1){ 1989 log_crypto_err("could not set SSL_OP_NO_TLSv1_1"); 1990 SSL_CTX_free(ctx); 1991 return 0; 1992 } 1993 #endif 1994 #if defined(SSL_OP_NO_RENEGOTIATION) 1995 /* disable client renegotiation */ 1996 if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) & 1997 SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) { 1998 log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION"); 1999 SSL_CTX_free(ctx); 2000 return 0; 2001 } 2002 #endif 2003 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20) 2004 /* if we detect system-wide crypto policies, use those */ 2005 if (access( "/etc/crypto-policies/config", F_OK ) != 0 ) { 2006 /* if we have sha256, set the cipher list to have no known vulns */ 2007 if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20")) 2008 log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list"); 2009 } 2010 #endif 2011 if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) & 2012 SSL_OP_CIPHER_SERVER_PREFERENCE) != 2013 SSL_OP_CIPHER_SERVER_PREFERENCE) { 2014 log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE"); 2015 SSL_CTX_free(ctx); 2016 return 0; 2017 } 2018 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL 2019 SSL_CTX_set_security_level(ctx, 0); 2020 #endif 2021 if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) { 2022 log_msg(LOG_ERR, "error for cert file: %s", pem); 2023 log_crypto_err("error in SSL_CTX use_certificate_chain_file"); 2024 SSL_CTX_free(ctx); 2025 return NULL; 
	}
	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
		log_msg(LOG_ERR, "error for private key file: %s", key);
		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
		SSL_CTX_free(ctx);
		return NULL;
	}
	if(!SSL_CTX_check_private_key(ctx)) {
		log_msg(LOG_ERR, "error for key file: %s", key);
		log_crypto_err("Error in SSL_CTX check_private_key");
		SSL_CTX_free(ctx);
		return NULL;
	}
	listen_sslctx_setup_2(ctx);
	if(verifypem && verifypem[0]) {
		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
			log_crypto_err("Error in SSL_CTX verify locations");
			SSL_CTX_free(ctx);
			return NULL;
		}
		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
	}
	return ctx;
}

SSL_CTX*
server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
{
	char *key, *pem;
	SSL_CTX *ctx;

	key = nsd->options->tls_service_key;
	pem = nsd->options->tls_service_pem;
	if(!key || key[0] == 0) {
		log_msg(LOG_ERR, "error: no tls-service-key file specified");
		return NULL;
	}
	if(!pem || pem[0] == 0) {
		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
		return NULL;
	}

	/* NOTE: This mimics the existing code in Unbound 1.5.1 by supporting
	 * SSL, but draft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2 */
	ctx = server_tls_ctx_setup(key, pem, verifypem);
	if(!ctx) {
		log_msg(LOG_ERR, "could not setup server TLS context");
		return NULL;
	}
	if(ocspfile && ocspfile[0]) {
		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
			log_crypto_err("Error reading OCSP file");
			SSL_CTX_free(ctx);
			return NULL;
		} else {
			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
				SSL_CTX_free(ctx);
				return NULL;
			}
		}
	}
	return ctx;
}

/* check if the tcp accept handler was created for a TLS dedicated port */
int
using_tls_port(struct sockaddr* addr, const char* tls_port)
{
	in_port_t port = 0;

	if (addr->sa_family == AF_INET)
		port = ((struct sockaddr_in*)addr)->sin_port;
#ifdef HAVE_STRUCT_SOCKADDR_IN6
	else
		port = ((struct sockaddr_in6*)addr)->sin6_port;
#endif /* HAVE_STRUCT_SOCKADDR_IN6 */
	if (atoi(tls_port) == ntohs(port))
		return 1;

	return 0;
}
#endif

/* pass timeout=-1 for blocking.
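   The timeout is in seconds; poll() waits for readability and the read
   is restarted after EINTR/EAGAIN until sz bytes have arrived.
   Illustrative use (a sketch only; the fd and 5 second timeout here are
   hypothetical):
	sig_atomic_t cmd;
	if(block_read(nsd, fd, &cmd, sizeof(cmd), 5) != sizeof(cmd))
		handle closed socket, error or timeout;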
Returns size, 0, -1(err), or -2(timeout) */ 2113 ssize_t 2114 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout) 2115 { 2116 uint8_t* buf = (uint8_t*) p; 2117 ssize_t total = 0; 2118 struct pollfd fd; 2119 memset(&fd, 0, sizeof(fd)); 2120 fd.fd = s; 2121 fd.events = POLLIN; 2122 2123 while( total < sz) { 2124 ssize_t ret; 2125 ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000); 2126 if(ret == -1) { 2127 if(errno == EAGAIN) 2128 /* blocking read */ 2129 continue; 2130 if(errno == EINTR) { 2131 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) 2132 return -1; 2133 /* other signals can be handled later */ 2134 continue; 2135 } 2136 /* some error */ 2137 return -1; 2138 } 2139 if(ret == 0) { 2140 /* operation timed out */ 2141 return -2; 2142 } 2143 ret = read(s, buf+total, sz-total); 2144 if(ret == -1) { 2145 if(errno == EAGAIN) 2146 /* blocking read */ 2147 continue; 2148 if(errno == EINTR) { 2149 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) 2150 return -1; 2151 /* other signals can be handled later */ 2152 continue; 2153 } 2154 /* some error */ 2155 return -1; 2156 } 2157 if(ret == 0) { 2158 /* closed connection! */ 2159 return 0; 2160 } 2161 total += ret; 2162 } 2163 return total; 2164 } 2165 2166 static void 2167 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket) 2168 { 2169 sig_atomic_t cmd = NSD_QUIT_SYNC; 2170 udb_ptr t, next; 2171 udb_base* u = nsd->task[nsd->mytask]; 2172 udb_ptr_init(&next, u); 2173 udb_ptr_new(&t, u, udb_base_get_userdata(u)); 2174 udb_base_set_userdata(u, 0); 2175 while(!udb_ptr_is_null(&t)) { 2176 /* store next in list so this one can be deleted or reused */ 2177 udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next); 2178 udb_rptr_zero(&TASKLIST(&t)->next, u); 2179 2180 /* process task t */ 2181 /* append results for task t and update last_task */ 2182 task_process_in_reload(nsd, u, last_task, &t); 2183 2184 /* go to next */ 2185 udb_ptr_set_ptr(&t, u, &next); 2186 2187 /* if the parent has quit, we must quit too, poll the fd for cmds */ 2188 if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) { 2189 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd)); 2190 if(cmd == NSD_QUIT) { 2191 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd")); 2192 /* sync to disk (if needed) */ 2193 udb_base_sync(nsd->db->udb, 0); 2194 /* unlink files of remainder of tasks */ 2195 while(!udb_ptr_is_null(&t)) { 2196 if(TASKLIST(&t)->task_type == task_apply_xfr) { 2197 xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno); 2198 } 2199 udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next); 2200 } 2201 udb_ptr_unlink(&t, u); 2202 udb_ptr_unlink(&next, u); 2203 exit(0); 2204 } 2205 } 2206 2207 } 2208 udb_ptr_unlink(&t, u); 2209 udb_ptr_unlink(&next, u); 2210 } 2211 2212 #ifdef BIND8_STATS 2213 static void 2214 parent_send_stats(struct nsd* nsd, int cmdfd) 2215 { 2216 size_t i; 2217 if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) { 2218 log_msg(LOG_ERR, "could not write stats to reload"); 2219 return; 2220 } 2221 for(i=0; i<nsd->child_count; i++) 2222 if(!write_socket(cmdfd, &nsd->children[i].query_count, 2223 sizeof(stc_type))) { 2224 log_msg(LOG_ERR, "could not write stats to reload"); 2225 return; 2226 } 2227 } 2228 2229 static void 2230 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last) 2231 { 2232 struct nsdst s; 2233 stc_type* p; 2234 size_t i; 2235 if(block_read(nsd, cmdfd, &s, sizeof(s), 2236 RELOAD_SYNC_TIMEOUT) != sizeof(s)) { 2237 log_msg(LOG_ERR, "could not read stats 
from oldpar"); 2238 return; 2239 } 2240 s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0); 2241 s.db_mem = region_get_mem(nsd->db->region); 2242 p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s, 2243 nsd->child_count); 2244 if(!p) return; 2245 for(i=0; i<nsd->child_count; i++) { 2246 if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!= 2247 sizeof(stc_type)) 2248 return; 2249 } 2250 } 2251 #endif /* BIND8_STATS */ 2252 2253 /* 2254 * Reload the database, stop parent, re-fork children and continue. 2255 * as server_main. 2256 */ 2257 static void 2258 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio, 2259 int cmdsocket) 2260 { 2261 pid_t mypid; 2262 sig_atomic_t cmd = NSD_QUIT_SYNC; 2263 int ret; 2264 udb_ptr last_task; 2265 struct sigaction old_sigchld, ign_sigchld; 2266 /* ignore SIGCHLD from the previous server_main that used this pid */ 2267 memset(&ign_sigchld, 0, sizeof(ign_sigchld)); 2268 ign_sigchld.sa_handler = SIG_IGN; 2269 sigaction(SIGCHLD, &ign_sigchld, &old_sigchld); 2270 2271 #ifdef HAVE_SETPROCTITLE 2272 setproctitle("main"); 2273 #endif 2274 #ifdef HAVE_CPUSET_T 2275 if(nsd->use_cpu_affinity) { 2276 set_cpu_affinity(nsd->cpuset); 2277 } 2278 #endif 2279 2280 /* see what tasks we got from xfrd */ 2281 task_remap(nsd->task[nsd->mytask]); 2282 udb_ptr_init(&last_task, nsd->task[nsd->mytask]); 2283 udb_compact_inhibited(nsd->db->udb, 1); 2284 reload_process_tasks(nsd, &last_task, cmdsocket); 2285 udb_compact_inhibited(nsd->db->udb, 0); 2286 udb_compact(nsd->db->udb); 2287 2288 #ifndef NDEBUG 2289 if(nsd_debug_level >= 1) 2290 region_log_stats(nsd->db->region); 2291 #endif /* NDEBUG */ 2292 /* sync to disk (if needed) */ 2293 udb_base_sync(nsd->db->udb, 0); 2294 2295 initialize_dname_compression_tables(nsd); 2296 2297 #ifdef BIND8_STATS 2298 /* Restart dumping stats if required. */ 2299 time(&nsd->st.boot); 2300 set_bind8_alarm(nsd); 2301 #endif 2302 #ifdef USE_ZONE_STATS 2303 server_zonestat_realloc(nsd); /* realloc for new children */ 2304 server_zonestat_switch(nsd); 2305 #endif 2306 2307 /* listen for the signals of failed children again */ 2308 sigaction(SIGCHLD, &old_sigchld, NULL); 2309 #ifdef USE_DNSTAP 2310 if (nsd->dt_collector) { 2311 int *swap_fd_send; 2312 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: swap dnstap collector pipes")); 2313 /* Swap fd_send with fd_swap so old serve child and new serve 2314 * childs will not write to the same pipe ends simultaneously */ 2315 swap_fd_send = nsd->dt_collector_fd_send; 2316 nsd->dt_collector_fd_send = nsd->dt_collector_fd_swap; 2317 nsd->dt_collector_fd_swap = swap_fd_send; 2318 2319 } 2320 #endif 2321 /* Start new child processes */ 2322 if (server_start_children(nsd, server_region, netio, &nsd-> 2323 xfrd_listener->fd) != 0) { 2324 send_children_quit(nsd); 2325 exit(1); 2326 } 2327 2328 /* if the parent has quit, we must quit too, poll the fd for cmds */ 2329 if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) { 2330 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd)); 2331 if(cmd == NSD_QUIT) { 2332 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd")); 2333 send_children_quit(nsd); 2334 exit(0); 2335 } 2336 } 2337 2338 /* Send quit command to parent: blocking, wait for receipt. 
*/ 2339 do { 2340 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main")); 2341 if (!write_socket(cmdsocket, &cmd, sizeof(cmd))) 2342 { 2343 log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s", 2344 strerror(errno)); 2345 } 2346 /* blocking: wait for parent to really quit. (it sends RELOAD as ack) */ 2347 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main")); 2348 ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 2349 RELOAD_SYNC_TIMEOUT); 2350 if(ret == -2) { 2351 DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry")); 2352 } 2353 } while (ret == -2); 2354 if(ret == -1) { 2355 log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s", 2356 strerror(errno)); 2357 } 2358 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd)); 2359 if(cmd == NSD_QUIT) { 2360 /* small race condition possible here, parent got quit cmd. */ 2361 send_children_quit(nsd); 2362 exit(1); 2363 } 2364 assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD); 2365 #ifdef BIND8_STATS 2366 reload_do_stats(cmdsocket, nsd, &last_task); 2367 #endif 2368 udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]); 2369 task_process_sync(nsd->task[nsd->mytask]); 2370 #ifdef USE_ZONE_STATS 2371 server_zonestat_realloc(nsd); /* realloc for next children */ 2372 #endif 2373 2374 /* send soainfo to the xfrd process, signal it that reload is done, 2375 * it picks up the taskudb */ 2376 cmd = NSD_RELOAD_DONE; 2377 if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { 2378 log_msg(LOG_ERR, "problems sending reload_done xfrd: %s", 2379 strerror(errno)); 2380 } 2381 mypid = getpid(); 2382 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2383 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2384 strerror(errno)); 2385 } 2386 2387 /* try to reopen file */ 2388 if (nsd->file_rotation_ok) 2389 log_reopen(nsd->log_filename, 1); 2390 /* exit reload, continue as new server_main */ 2391 } 2392 2393 /* 2394 * Get the mode depending on the signal hints that have been received. 2395 * Multiple signal hints can be received and will be handled in turn. 2396 */ 2397 static sig_atomic_t 2398 server_signal_mode(struct nsd *nsd) 2399 { 2400 if(nsd->signal_hint_quit) { 2401 nsd->signal_hint_quit = 0; 2402 return NSD_QUIT; 2403 } 2404 else if(nsd->signal_hint_shutdown) { 2405 nsd->signal_hint_shutdown = 0; 2406 return NSD_SHUTDOWN; 2407 } 2408 else if(nsd->signal_hint_child) { 2409 nsd->signal_hint_child = 0; 2410 return NSD_REAP_CHILDREN; 2411 } 2412 else if(nsd->signal_hint_reload) { 2413 nsd->signal_hint_reload = 0; 2414 return NSD_RELOAD; 2415 } 2416 else if(nsd->signal_hint_reload_hup) { 2417 nsd->signal_hint_reload_hup = 0; 2418 return NSD_RELOAD_REQ; 2419 } 2420 else if(nsd->signal_hint_stats) { 2421 nsd->signal_hint_stats = 0; 2422 #ifdef BIND8_STATS 2423 set_bind8_alarm(nsd); 2424 #endif 2425 return NSD_STATS; 2426 } 2427 else if(nsd->signal_hint_statsusr) { 2428 nsd->signal_hint_statsusr = 0; 2429 return NSD_STATS; 2430 } 2431 return NSD_RUN; 2432 } 2433 2434 /* 2435 * The main server simply waits for signals and child processes to 2436 * terminate. Child processes are restarted as necessary. 
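 *
 * Signal handlers only set hint flags; server_signal_mode() above maps
 * them to a mode for the switch statement below, roughly (a sketch; the
 * order gives the priority):
 *
 *   signal_hint_quit       -> NSD_QUIT
 *   signal_hint_shutdown   -> NSD_SHUTDOWN
 *   signal_hint_child      -> NSD_REAP_CHILDREN
 *   signal_hint_reload     -> NSD_RELOAD
 *   signal_hint_reload_hup -> NSD_RELOAD_REQ (SIGHUP)
 *   signal_hint_stats      -> NSD_STATS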
2437 */ 2438 void 2439 server_main(struct nsd *nsd) 2440 { 2441 region_type *server_region = region_create(xalloc, free); 2442 netio_type *netio = netio_create(server_region); 2443 netio_handler_type reload_listener; 2444 int reload_sockets[2] = {-1, -1}; 2445 struct timespec timeout_spec; 2446 int status; 2447 pid_t child_pid; 2448 pid_t reload_pid = -1; 2449 sig_atomic_t mode; 2450 2451 /* Ensure we are the main process */ 2452 assert(nsd->server_kind == NSD_SERVER_MAIN); 2453 2454 /* Add listener for the XFRD process */ 2455 netio_add_handler(netio, nsd->xfrd_listener); 2456 2457 /* Start the child processes that handle incoming queries */ 2458 if (server_start_children(nsd, server_region, netio, 2459 &nsd->xfrd_listener->fd) != 0) { 2460 send_children_quit(nsd); 2461 exit(1); 2462 } 2463 reload_listener.fd = -1; 2464 2465 /* This_child MUST be 0, because this is the parent process */ 2466 assert(nsd->this_child == 0); 2467 2468 /* Run the server until we get a shutdown signal */ 2469 while ((mode = nsd->mode) != NSD_SHUTDOWN) { 2470 /* Did we receive a signal that changes our mode? */ 2471 if(mode == NSD_RUN) { 2472 nsd->mode = mode = server_signal_mode(nsd); 2473 } 2474 2475 switch (mode) { 2476 case NSD_RUN: 2477 /* see if any child processes terminated */ 2478 while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) { 2479 int is_child = delete_child_pid(nsd, child_pid); 2480 if (is_child != -1 && nsd->children[is_child].need_to_exit) { 2481 if(nsd->children[is_child].child_fd == -1) 2482 nsd->children[is_child].has_exited = 1; 2483 parent_check_all_children_exited(nsd); 2484 } else if(is_child != -1) { 2485 log_msg(LOG_WARNING, 2486 "server %d died unexpectedly with status %d, restarting", 2487 (int) child_pid, status); 2488 restart_child_servers(nsd, server_region, netio, 2489 &nsd->xfrd_listener->fd); 2490 } else if (child_pid == reload_pid) { 2491 sig_atomic_t cmd = NSD_RELOAD_DONE; 2492 pid_t mypid; 2493 log_msg(LOG_WARNING, 2494 "Reload process %d failed with status %d, continuing with old database", 2495 (int) child_pid, status); 2496 reload_pid = -1; 2497 if(reload_listener.fd != -1) close(reload_listener.fd); 2498 reload_listener.fd = -1; 2499 reload_listener.event_types = NETIO_EVENT_NONE; 2500 task_process_sync(nsd->task[nsd->mytask]); 2501 /* inform xfrd reload attempt ended */ 2502 if(!write_socket(nsd->xfrd_listener->fd, 2503 &cmd, sizeof(cmd))) { 2504 log_msg(LOG_ERR, "problems " 2505 "sending SOAEND to xfrd: %s", 2506 strerror(errno)); 2507 } 2508 mypid = getpid(); 2509 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2510 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2511 strerror(errno)); 2512 } 2513 #ifdef USE_DNSTAP 2514 } else if(nsd->dt_collector && child_pid == nsd->dt_collector->dt_pid) { 2515 log_msg(LOG_WARNING, 2516 "dnstap-collector %d terminated with status %d", 2517 (int) child_pid, status); 2518 if(nsd->dt_collector) { 2519 dt_collector_close(nsd->dt_collector, nsd); 2520 dt_collector_destroy(nsd->dt_collector, nsd); 2521 nsd->dt_collector = NULL; 2522 } 2523 /* Only respawn a crashed (or exited) 2524 * dnstap-collector when not reloading, 2525 * to not induce a reload during a 2526 * reload (which would seriously 2527 * disrupt nsd procedures and lead to 2528 * unpredictable results)! 
					 *
					 * This will *leave* a dnstap-collector
					 * process terminated, but because
					 * signalling from the reload process
					 * to the main process to respawn it
					 * would be cumbersome in this
					 * situation, and because this
					 * situation is so specific (and
					 * therefore hopefully extremely rare
					 * or non-existent), plus the fact
					 * that we are left with a perfectly
					 * functioning NSD (besides not
					 * logging dnstap messages), I
					 * consider it acceptable to leave
					 * this unresolved.
					 */
					if(reload_pid == -1 && nsd->options->dnstap_enable) {
						nsd->dt_collector = dt_collector_create(nsd);
						dt_collector_start(nsd->dt_collector, nsd);
						nsd->mode = NSD_RELOAD_REQ;
					}
#endif
				} else if(status != 0) {
					/* check the status: we may reap the
					 * old server-main here (reload is the
					 * process parent of the old main),
					 * and older server processes that
					 * exit after a reload */
					log_msg(LOG_WARNING,
						"process %d terminated with status %d",
						(int) child_pid, status);
				}
			}
			if (child_pid == -1) {
				if (errno == EINTR) {
					continue;
				}
				if (errno != ECHILD)
					log_msg(LOG_WARNING, "wait failed: %s", strerror(errno));
			}
			if (nsd->mode != NSD_RUN)
				break;

			/* timeout to collect processes, in case no SIGCHLD happens */
			timeout_spec.tv_sec = 60;
			timeout_spec.tv_nsec = 0;

			/* listen on ports, timeout for collecting terminated children */
			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
				if (errno != EINTR) {
					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
				}
			}
			if(nsd->restart_children) {
				restart_child_servers(nsd, server_region, netio,
					&nsd->xfrd_listener->fd);
				nsd->restart_children = 0;
			}
			if(nsd->reload_failed) {
				sig_atomic_t cmd = NSD_RELOAD_DONE;
				pid_t mypid;
				nsd->reload_failed = 0;
				log_msg(LOG_WARNING,
					"Reload process %d failed, continuing with old database",
					(int) reload_pid);
				reload_pid = -1;
				if(reload_listener.fd != -1) close(reload_listener.fd);
				reload_listener.fd = -1;
				reload_listener.event_types = NETIO_EVENT_NONE;
				task_process_sync(nsd->task[nsd->mytask]);
				/* inform xfrd reload attempt ended */
				if(!write_socket(nsd->xfrd_listener->fd,
					&cmd, sizeof(cmd))) {
					log_msg(LOG_ERR, "problems "
						"sending SOAEND to xfrd: %s",
						strerror(errno));
				}
				mypid = getpid();
				if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
						strerror(errno));
				}
			}

			break;
		case NSD_RELOAD_REQ: {
			sig_atomic_t cmd = NSD_RELOAD_REQ;
			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
			DEBUG(DEBUG_IPC,1, (LOG_INFO,
				"main: ipc send reload_req to xfrd"));
			if(!write_socket(nsd->xfrd_listener->fd,
				&cmd, sizeof(cmd))) {
				log_msg(LOG_ERR, "server_main: could not send "
					"reload_req to xfrd: %s", strerror(errno));
			}
			nsd->mode = NSD_RUN;
		} break;
		case NSD_RELOAD:
			/* Continue to run nsd after reload */
			nsd->mode = NSD_RUN;
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
			if (reload_pid != -1) {
				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
					(int) reload_pid);
				break;
			}

			/* switch the mytask to keep track of who owns the task */
			nsd->mytask = 1 - nsd->mytask;
			if
(socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) { 2638 log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno)); 2639 reload_pid = -1; 2640 break; 2641 } 2642 2643 /* Do actual reload */ 2644 reload_pid = fork(); 2645 switch (reload_pid) { 2646 case -1: 2647 log_msg(LOG_ERR, "fork failed: %s", strerror(errno)); 2648 break; 2649 default: 2650 /* PARENT */ 2651 close(reload_sockets[0]); 2652 server_reload(nsd, server_region, netio, 2653 reload_sockets[1]); 2654 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main")); 2655 close(reload_sockets[1]); 2656 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed")); 2657 /* drop stale xfrd ipc data */ 2658 ((struct ipc_handler_conn_data*)nsd-> 2659 xfrd_listener->user_data) 2660 ->conn->is_reading = 0; 2661 reload_pid = -1; 2662 reload_listener.fd = -1; 2663 reload_listener.event_types = NETIO_EVENT_NONE; 2664 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run")); 2665 break; 2666 case 0: 2667 /* CHILD */ 2668 /* server_main keep running until NSD_QUIT_SYNC 2669 * received from reload. */ 2670 close(reload_sockets[1]); 2671 reload_listener.fd = reload_sockets[0]; 2672 reload_listener.timeout = NULL; 2673 reload_listener.user_data = nsd; 2674 reload_listener.event_types = NETIO_EVENT_READ; 2675 reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */ 2676 netio_add_handler(netio, &reload_listener); 2677 reload_pid = getppid(); 2678 break; 2679 } 2680 break; 2681 case NSD_QUIT_SYNC: 2682 /* synchronisation of xfrd, parent and reload */ 2683 if(!nsd->quit_sync_done && reload_listener.fd != -1) { 2684 sig_atomic_t cmd = NSD_RELOAD; 2685 /* stop xfrd ipc writes in progress */ 2686 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2687 "main: ipc send indication reload")); 2688 if(!write_socket(nsd->xfrd_listener->fd, 2689 &cmd, sizeof(cmd))) { 2690 log_msg(LOG_ERR, "server_main: could not send reload " 2691 "indication to xfrd: %s", strerror(errno)); 2692 } 2693 /* wait for ACK from xfrd */ 2694 DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd")); 2695 nsd->quit_sync_done = 1; 2696 } 2697 nsd->mode = NSD_RUN; 2698 break; 2699 case NSD_QUIT: 2700 /* silent shutdown during reload */ 2701 if(reload_listener.fd != -1) { 2702 /* acknowledge the quit, to sync reload that we will really quit now */ 2703 sig_atomic_t cmd = NSD_RELOAD; 2704 DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload")); 2705 if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) { 2706 log_msg(LOG_ERR, "server_main: " 2707 "could not ack quit: %s", strerror(errno)); 2708 } 2709 #ifdef BIND8_STATS 2710 parent_send_stats(nsd, reload_listener.fd); 2711 #endif /* BIND8_STATS */ 2712 close(reload_listener.fd); 2713 } 2714 DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence")); 2715 /* only quit children after xfrd has acked */ 2716 send_children_quit(nsd); 2717 2718 #ifdef MEMCLEAN /* OS collects memory pages */ 2719 region_destroy(server_region); 2720 #endif 2721 server_shutdown(nsd); 2722 2723 /* ENOTREACH */ 2724 break; 2725 case NSD_SHUTDOWN: 2726 break; 2727 case NSD_REAP_CHILDREN: 2728 /* continue; wait for child in run loop */ 2729 nsd->mode = NSD_RUN; 2730 break; 2731 case NSD_STATS: 2732 #ifdef BIND8_STATS 2733 set_children_stats(nsd); 2734 #endif 2735 nsd->mode = NSD_RUN; 2736 break; 2737 default: 2738 log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode); 2739 nsd->mode = NSD_RUN; 2740 break; 2741 } 2742 } 2743 log_msg(LOG_WARNING, "signal received, shutting down..."); 2744 2745 /* close opened ports to avoid 
race with restart of nsd */ 2746 server_close_all_sockets(nsd->udp, nsd->ifs); 2747 server_close_all_sockets(nsd->tcp, nsd->ifs); 2748 #ifdef HAVE_SSL 2749 daemon_remote_close(nsd->rc); 2750 #endif 2751 send_children_quit_and_wait(nsd); 2752 2753 /* Unlink it if possible... */ 2754 unlinkpid(nsd->pidfile); 2755 unlink(nsd->task[0]->fname); 2756 unlink(nsd->task[1]->fname); 2757 #ifdef USE_ZONE_STATS 2758 unlink(nsd->zonestatfname[0]); 2759 unlink(nsd->zonestatfname[1]); 2760 #endif 2761 #ifdef USE_DNSTAP 2762 dt_collector_close(nsd->dt_collector, nsd); 2763 #endif 2764 2765 if(reload_listener.fd != -1) { 2766 sig_atomic_t cmd = NSD_QUIT; 2767 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2768 "main: ipc send quit to reload-process")); 2769 if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) { 2770 log_msg(LOG_ERR, "server_main: could not send quit to reload: %s", 2771 strerror(errno)); 2772 } 2773 fsync(reload_listener.fd); 2774 close(reload_listener.fd); 2775 /* wait for reload to finish processing */ 2776 while(1) { 2777 if(waitpid(reload_pid, NULL, 0) == -1) { 2778 if(errno == EINTR) continue; 2779 if(errno == ECHILD) break; 2780 log_msg(LOG_ERR, "waitpid(reload %d): %s", 2781 (int)reload_pid, strerror(errno)); 2782 } 2783 break; 2784 } 2785 } 2786 if(nsd->xfrd_listener->fd != -1) { 2787 /* complete quit, stop xfrd */ 2788 sig_atomic_t cmd = NSD_QUIT; 2789 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2790 "main: ipc send quit to xfrd")); 2791 if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { 2792 log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s", 2793 strerror(errno)); 2794 } 2795 fsync(nsd->xfrd_listener->fd); 2796 close(nsd->xfrd_listener->fd); 2797 (void)kill(nsd->pid, SIGTERM); 2798 } 2799 2800 #ifdef MEMCLEAN /* OS collects memory pages */ 2801 region_destroy(server_region); 2802 #endif 2803 /* write the nsd.db to disk, wait for it to complete */ 2804 udb_base_sync(nsd->db->udb, 1); 2805 udb_base_close(nsd->db->udb); 2806 server_shutdown(nsd); 2807 } 2808 2809 static query_state_type 2810 server_process_query(struct nsd *nsd, struct query *query, uint32_t *now_p) 2811 { 2812 return query_process(query, nsd, now_p); 2813 } 2814 2815 static query_state_type 2816 server_process_query_udp(struct nsd *nsd, struct query *query, uint32_t *now_p) 2817 { 2818 #ifdef RATELIMIT 2819 if(query_process(query, nsd, now_p) != QUERY_DISCARDED) { 2820 if(query->edns.cookie_status != COOKIE_VALID 2821 && query->edns.cookie_status != COOKIE_VALID_REUSE 2822 && rrl_process_query(query)) 2823 return rrl_slip(query); 2824 else return QUERY_PROCESSED; 2825 } 2826 return QUERY_DISCARDED; 2827 #else 2828 return query_process(query, nsd, now_p); 2829 #endif 2830 } 2831 2832 const char* 2833 nsd_event_vs(void) 2834 { 2835 #ifdef USE_MINI_EVENT 2836 return ""; 2837 #else 2838 return event_get_version(); 2839 #endif 2840 } 2841 2842 #if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS) 2843 static const char* ub_ev_backend2str(int b) 2844 { 2845 switch(b) { 2846 case EVBACKEND_SELECT: return "select"; 2847 case EVBACKEND_POLL: return "poll"; 2848 case EVBACKEND_EPOLL: return "epoll"; 2849 case EVBACKEND_KQUEUE: return "kqueue"; 2850 case EVBACKEND_DEVPOLL: return "devpoll"; 2851 case EVBACKEND_PORT: return "evport"; 2852 } 2853 return "unknown"; 2854 } 2855 #endif 2856 2857 const char* 2858 nsd_event_method(void) 2859 { 2860 #ifdef USE_MINI_EVENT 2861 return "select"; 2862 #else 2863 struct event_base* b = nsd_child_event_base(); 2864 const char* m = "?"; 2865 # ifdef EV_FEATURE_BACKENDS 2866 m = 
ub_ev_backend2str(ev_backend((struct ev_loop*)b)); 2867 # elif defined(HAVE_EVENT_BASE_GET_METHOD) 2868 m = event_base_get_method(b); 2869 # endif 2870 # ifdef MEMCLEAN 2871 event_base_free(b); 2872 # endif 2873 return m; 2874 #endif 2875 } 2876 2877 struct event_base* 2878 nsd_child_event_base(void) 2879 { 2880 struct event_base* base; 2881 #ifdef USE_MINI_EVENT 2882 static time_t secs; 2883 static struct timeval now; 2884 base = event_init(&secs, &now); 2885 #else 2886 # if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP) 2887 /* libev */ 2888 base = (struct event_base *)ev_default_loop(EVFLAG_AUTO); 2889 # else 2890 /* libevent */ 2891 # ifdef HAVE_EVENT_BASE_NEW 2892 base = event_base_new(); 2893 # else 2894 base = event_init(); 2895 # endif 2896 # endif 2897 #endif 2898 return base; 2899 } 2900 2901 static void 2902 add_udp_handler( 2903 struct nsd *nsd, 2904 struct nsd_socket *sock, 2905 struct udp_handler_data *data) 2906 { 2907 struct event *handler = &data->event; 2908 2909 data->nsd = nsd; 2910 data->socket = sock; 2911 2912 memset(handler, 0, sizeof(*handler)); 2913 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data); 2914 if(event_base_set(nsd->event_base, handler) != 0) 2915 log_msg(LOG_ERR, "nsd udp: event_base_set failed"); 2916 if(event_add(handler, NULL) != 0) 2917 log_msg(LOG_ERR, "nsd udp: event_add failed"); 2918 } 2919 2920 void 2921 add_tcp_handler( 2922 struct nsd *nsd, 2923 struct nsd_socket *sock, 2924 struct tcp_accept_handler_data *data) 2925 { 2926 struct event *handler = &data->event; 2927 2928 data->nsd = nsd; 2929 data->socket = sock; 2930 2931 #ifdef HAVE_SSL 2932 if (nsd->tls_ctx && 2933 nsd->options->tls_port && 2934 using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port)) 2935 { 2936 data->tls_accept = 1; 2937 if(verbosity >= 2) { 2938 char buf[48]; 2939 addrport2str((void*)(struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf)); 2940 VERBOSITY(4, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf)); 2941 } 2942 } else { 2943 data->tls_accept = 0; 2944 } 2945 #endif 2946 2947 memset(handler, 0, sizeof(*handler)); 2948 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_tcp_accept, data); 2949 if(event_base_set(nsd->event_base, handler) != 0) 2950 log_msg(LOG_ERR, "nsd tcp: event_base_set failed"); 2951 if(event_add(handler, NULL) != 0) 2952 log_msg(LOG_ERR, "nsd tcp: event_add failed"); 2953 data->event_added = 1; 2954 } 2955 2956 /* 2957 * Serve DNS requests. 
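 *
 * Each child registers a persistent EV_READ event per socket it serves,
 * via the add_udp_handler()/add_tcp_handler() helpers above; the
 * pattern, in sketch form (mirroring the calls made below):
 *
 *   data = region_alloc_zero(region, sizeof(*data));
 *   add_udp_handler(nsd, &nsd->udp[i], data);
 *
 * after which handle_udp() runs whenever the socket becomes readable.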
2958 */ 2959 void 2960 server_child(struct nsd *nsd) 2961 { 2962 size_t i, from, numifs; 2963 region_type *server_region = region_create(xalloc, free); 2964 struct event_base* event_base = nsd_child_event_base(); 2965 sig_atomic_t mode; 2966 2967 if(!event_base) { 2968 log_msg(LOG_ERR, "nsd server could not create event base"); 2969 exit(1); 2970 } 2971 nsd->event_base = event_base; 2972 nsd->server_region = server_region; 2973 2974 #ifdef RATELIMIT 2975 rrl_init(nsd->this_child->child_num); 2976 #endif 2977 2978 assert(nsd->server_kind != NSD_SERVER_MAIN); 2979 DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started")); 2980 2981 #ifdef HAVE_SETPROCTITLE 2982 setproctitle("server %d", nsd->this_child->child_num + 1); 2983 #endif 2984 #ifdef HAVE_CPUSET_T 2985 if(nsd->use_cpu_affinity) { 2986 set_cpu_affinity(nsd->this_child->cpuset); 2987 } 2988 #endif 2989 2990 if (!(nsd->server_kind & NSD_SERVER_TCP)) { 2991 server_close_all_sockets(nsd->tcp, nsd->ifs); 2992 } 2993 if (!(nsd->server_kind & NSD_SERVER_UDP)) { 2994 server_close_all_sockets(nsd->udp, nsd->ifs); 2995 } 2996 2997 if (nsd->this_child->parent_fd != -1) { 2998 struct event *handler; 2999 struct ipc_handler_conn_data* user_data = 3000 (struct ipc_handler_conn_data*)region_alloc( 3001 server_region, sizeof(struct ipc_handler_conn_data)); 3002 user_data->nsd = nsd; 3003 user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ); 3004 3005 handler = (struct event*) region_alloc( 3006 server_region, sizeof(*handler)); 3007 memset(handler, 0, sizeof(*handler)); 3008 event_set(handler, nsd->this_child->parent_fd, EV_PERSIST| 3009 EV_READ, child_handle_parent_command, user_data); 3010 if(event_base_set(event_base, handler) != 0) 3011 log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed"); 3012 if(event_add(handler, NULL) != 0) 3013 log_msg(LOG_ERR, "nsd ipcchild: event_add failed"); 3014 } 3015 3016 if(nsd->reuseport) { 3017 numifs = nsd->ifs / nsd->reuseport; 3018 from = numifs * nsd->this_child->child_num; 3019 if(from+numifs > nsd->ifs) { /* should not happen */ 3020 from = 0; 3021 numifs = nsd->ifs; 3022 } 3023 } else { 3024 from = 0; 3025 numifs = nsd->ifs; 3026 } 3027 3028 if (nsd->server_kind & NSD_SERVER_UDP) { 3029 int child = nsd->this_child->child_num; 3030 memset(msgs, 0, sizeof(msgs)); 3031 for (i = 0; i < NUM_RECV_PER_SELECT; i++) { 3032 queries[i] = query_create(server_region, 3033 compressed_dname_offsets, 3034 compression_table_size, compressed_dnames); 3035 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3036 iovecs[i].iov_base = buffer_begin(queries[i]->packet); 3037 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 3038 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 3039 msgs[i].msg_hdr.msg_iovlen = 1; 3040 msgs[i].msg_hdr.msg_name = &queries[i]->addr; 3041 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3042 } 3043 3044 for (i = 0; i < nsd->ifs; i++) { 3045 int listen; 3046 struct udp_handler_data *data; 3047 3048 listen = nsd_bitset_isset(nsd->udp[i].servers, child); 3049 3050 if(i >= from && i < (from + numifs) && listen) { 3051 data = region_alloc_zero( 3052 nsd->server_region, sizeof(*data)); 3053 add_udp_handler(nsd, &nsd->udp[i], data); 3054 } else { 3055 /* close sockets intended for other servers */ 3056 server_close_socket(&nsd->udp[i]); 3057 } 3058 } 3059 } 3060 3061 /* 3062 * Keep track of all the TCP accept handlers so we can enable 3063 * and disable them based on the current number of active TCP 3064 * connections. 
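	 *
	 * With reuse-port, each child serves a contiguous slice of the
	 * sockets: numifs = ifs / reuseport and from = numifs * child_num.
	 * A worked example with hypothetical numbers: ifs=8 and reuseport=4
	 * give numifs=2, so child 0 serves sockets 0..1, child 1 serves
	 * sockets 2..3, and so on; sockets outside the slice are closed.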
3065 */ 3066 if (nsd->server_kind & NSD_SERVER_TCP) { 3067 int child = nsd->this_child->child_num; 3068 tcp_accept_handler_count = numifs; 3069 tcp_accept_handlers = region_alloc_array(server_region, 3070 numifs, sizeof(*tcp_accept_handlers)); 3071 3072 for (i = 0; i < nsd->ifs; i++) { 3073 int listen; 3074 struct tcp_accept_handler_data *data; 3075 3076 listen = nsd_bitset_isset(nsd->tcp[i].servers, child); 3077 3078 if(i >= from && i < (from + numifs) && listen) { 3079 data = &tcp_accept_handlers[i-from]; 3080 memset(data, 0, sizeof(*data)); 3081 add_tcp_handler(nsd, &nsd->tcp[i], data); 3082 } else { 3083 /* close sockets intended for other servers */ 3084 /* 3085 * uncomment this once tcp servers are no 3086 * longer copied in the tcp fd copy line 3087 * in server_init(). 3088 server_close_socket(&nsd->tcp[i]); 3089 */ 3090 /* close sockets not meant for this server*/ 3091 if(!listen) 3092 server_close_socket(&nsd->tcp[i]); 3093 } 3094 } 3095 } else { 3096 tcp_accept_handler_count = 0; 3097 } 3098 3099 /* The main loop... */ 3100 while ((mode = nsd->mode) != NSD_QUIT) { 3101 if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd); 3102 3103 /* Do we need to do the statistics... */ 3104 if (mode == NSD_STATS) { 3105 #ifdef BIND8_STATS 3106 int p = nsd->st.period; 3107 nsd->st.period = 1; /* force stats printout */ 3108 /* Dump the statistics */ 3109 bind8_stats(nsd); 3110 nsd->st.period = p; 3111 #else /* !BIND8_STATS */ 3112 log_msg(LOG_NOTICE, "Statistics support not enabled at compile time."); 3113 #endif /* BIND8_STATS */ 3114 3115 nsd->mode = NSD_RUN; 3116 } 3117 else if (mode == NSD_REAP_CHILDREN) { 3118 /* got signal, notify parent. parent reaps terminated children. */ 3119 if (nsd->this_child->parent_fd != -1) { 3120 sig_atomic_t parent_notify = NSD_REAP_CHILDREN; 3121 if (write(nsd->this_child->parent_fd, 3122 &parent_notify, 3123 sizeof(parent_notify)) == -1) 3124 { 3125 log_msg(LOG_ERR, "problems sending command from %d to parent: %s", 3126 (int) nsd->this_child->pid, strerror(errno)); 3127 } 3128 } else /* no parent, so reap 'em */ 3129 while (waitpid(-1, NULL, WNOHANG) > 0) ; 3130 nsd->mode = NSD_RUN; 3131 } 3132 else if(mode == NSD_RUN) { 3133 /* Wait for a query... 
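		   EVLOOP_ONCE makes event_base_loop() return after
		   dispatching one batch of events instead of looping
		   internally, so control returns here and nsd->mode is
		   re-checked between batches.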
*/ 3134 if(event_base_loop(event_base, EVLOOP_ONCE) == -1) { 3135 if (errno != EINTR) { 3136 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); 3137 break; 3138 } 3139 } 3140 } else if(mode == NSD_QUIT) { 3141 /* ignore here, quit */ 3142 } else { 3143 log_msg(LOG_ERR, "mode bad value %d, back to service.", 3144 (int)mode); 3145 nsd->mode = NSD_RUN; 3146 } 3147 } 3148 3149 service_remaining_tcp(nsd); 3150 #ifdef BIND8_STATS 3151 bind8_stats(nsd); 3152 #endif /* BIND8_STATS */ 3153 3154 #ifdef MEMCLEAN /* OS collects memory pages */ 3155 #ifdef RATELIMIT 3156 rrl_deinit(nsd->this_child->child_num); 3157 #endif 3158 event_base_free(event_base); 3159 region_destroy(server_region); 3160 #endif 3161 server_shutdown(nsd); 3162 } 3163 3164 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg) 3165 { 3166 int* timed_out = (int*)arg; 3167 assert(event & EV_TIMEOUT); (void)event; 3168 /* wake up the service tcp thread, note event is no longer 3169 * registered */ 3170 *timed_out = 1; 3171 } 3172 3173 void 3174 service_remaining_tcp(struct nsd* nsd) 3175 { 3176 struct tcp_handler_data* p; 3177 struct event_base* event_base; 3178 /* check if it is needed */ 3179 if(nsd->current_tcp_count == 0 || tcp_active_list == NULL) 3180 return; 3181 VERBOSITY(4, (LOG_INFO, "service remaining TCP connections")); 3182 #ifdef USE_DNSTAP 3183 /* remove dnstap collector, we cannot write there because the new 3184 * child process is using the file descriptor, or the child 3185 * process after that. */ 3186 dt_collector_destroy(nsd->dt_collector, nsd); 3187 nsd->dt_collector = NULL; 3188 #endif 3189 /* setup event base */ 3190 event_base = nsd_child_event_base(); 3191 if(!event_base) { 3192 log_msg(LOG_ERR, "nsd remain tcp could not create event base"); 3193 return; 3194 } 3195 /* register tcp connections */ 3196 for(p = tcp_active_list; p != NULL; p = p->next) { 3197 struct timeval timeout; 3198 int fd = p->event.ev_fd; 3199 #ifdef USE_MINI_EVENT 3200 short event = p->event.ev_flags & (EV_READ|EV_WRITE); 3201 #else 3202 short event = p->event.ev_events & (EV_READ|EV_WRITE); 3203 #endif 3204 void (*fn)(int, short, void*); 3205 #ifdef HAVE_SSL 3206 if(p->tls) { 3207 if((event&EV_READ)) 3208 fn = handle_tls_reading; 3209 else fn = handle_tls_writing; 3210 } else { 3211 #endif 3212 if((event&EV_READ)) 3213 fn = handle_tcp_reading; 3214 else fn = handle_tcp_writing; 3215 #ifdef HAVE_SSL 3216 } 3217 #endif 3218 3219 p->tcp_no_more_queries = 1; 3220 /* set timeout to 1/10 second */ 3221 if(p->tcp_timeout > 100) 3222 p->tcp_timeout = 100; 3223 timeout.tv_sec = p->tcp_timeout / 1000; 3224 timeout.tv_usec = (p->tcp_timeout % 1000)*1000; 3225 event_del(&p->event); 3226 memset(&p->event, 0, sizeof(p->event)); 3227 event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT, 3228 fn, p); 3229 if(event_base_set(event_base, &p->event) != 0) 3230 log_msg(LOG_ERR, "event base set failed"); 3231 if(event_add(&p->event, &timeout) != 0) 3232 log_msg(LOG_ERR, "event add failed"); 3233 } 3234 3235 /* handle it */ 3236 while(nsd->current_tcp_count > 0) { 3237 mode_t m = server_signal_mode(nsd); 3238 struct event timeout; 3239 struct timeval tv; 3240 int timed_out = 0; 3241 if(m == NSD_QUIT || m == NSD_SHUTDOWN || 3242 m == NSD_REAP_CHILDREN) { 3243 /* quit */ 3244 break; 3245 } 3246 /* timer */ 3247 /* have to do something every second */ 3248 tv.tv_sec = 1; 3249 tv.tv_usec = 0; 3250 memset(&timeout, 0, sizeof(timeout)); 3251 event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout, 3252 &timed_out); 3253 
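		/* With fd -1 this is a pure timer event: it can only fire
		 * with EV_TIMEOUT once tv elapses, never for I/O. It
		 * guarantees the drain loop wakes at least once per second
		 * to re-check the signal hints via server_signal_mode(). */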
if(event_base_set(event_base, &timeout) != 0) 3254 log_msg(LOG_ERR, "remaintcp timer: event_base_set failed"); 3255 if(event_add(&timeout, &tv) != 0) 3256 log_msg(LOG_ERR, "remaintcp timer: event_add failed"); 3257 3258 /* service loop */ 3259 if(event_base_loop(event_base, EVLOOP_ONCE) == -1) { 3260 if (errno != EINTR) { 3261 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); 3262 break; 3263 } 3264 } 3265 if(!timed_out) { 3266 event_del(&timeout); 3267 } else { 3268 /* timed out, quit */ 3269 VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit")); 3270 break; 3271 } 3272 } 3273 #ifdef MEMCLEAN 3274 event_base_free(event_base); 3275 #endif 3276 /* continue to quit after return */ 3277 } 3278 3279 /* Implement recvmmsg and sendmmsg if the platform does not. These functions 3280 * are always used, even if nonblocking operations are broken, in which case 3281 * NUM_RECV_PER_SELECT is defined to 1 (one). 3282 */ 3283 #if defined(HAVE_RECVMMSG) 3284 #define nsd_recvmmsg recvmmsg 3285 #else /* !HAVE_RECVMMSG */ 3286 3287 static int 3288 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, 3289 int flags, struct timespec *timeout) 3290 { 3291 unsigned int vpos = 0; 3292 ssize_t rcvd; 3293 3294 /* timeout is ignored, ensure caller does not expect it to work */ 3295 assert(timeout == NULL); (void)timeout; 3296 3297 while(vpos < vlen) { 3298 rcvd = recvfrom(sockfd, 3299 msgvec[vpos].msg_hdr.msg_iov->iov_base, 3300 msgvec[vpos].msg_hdr.msg_iov->iov_len, 3301 flags, 3302 msgvec[vpos].msg_hdr.msg_name, 3303 &msgvec[vpos].msg_hdr.msg_namelen); 3304 if(rcvd < 0) { 3305 break; 3306 } else { 3307 assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX); 3308 msgvec[vpos].msg_len = (unsigned int)rcvd; 3309 vpos++; 3310 } 3311 } 3312 3313 if(vpos) { 3314 /* error will be picked up next time */ 3315 return (int)vpos; 3316 } else if(errno == 0) { 3317 return 0; 3318 } else if(errno == EAGAIN) { 3319 return 0; 3320 } 3321 3322 return -1; 3323 } 3324 #endif /* HAVE_RECVMMSG */ 3325 3326 #ifdef HAVE_SENDMMSG 3327 #define nsd_sendmmsg(...) 
sendmmsg(__VA_ARGS__) 3328 #else /* !HAVE_SENDMMSG */ 3329 3330 static int 3331 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags) 3332 { 3333 unsigned int vpos = 0; 3334 ssize_t snd; 3335 3336 while(vpos < vlen) { 3337 assert(msgvec[vpos].msg_hdr.msg_iovlen == 1); 3338 snd = sendto(sockfd, 3339 msgvec[vpos].msg_hdr.msg_iov->iov_base, 3340 msgvec[vpos].msg_hdr.msg_iov->iov_len, 3341 flags, 3342 msgvec[vpos].msg_hdr.msg_name, 3343 msgvec[vpos].msg_hdr.msg_namelen); 3344 if(snd < 0) { 3345 break; 3346 } else { 3347 msgvec[vpos].msg_len = (unsigned int)snd; 3348 vpos++; 3349 } 3350 } 3351 3352 if(vpos) { 3353 return (int)vpos; 3354 } else if(errno == 0) { 3355 return 0; 3356 } 3357 3358 return -1; 3359 } 3360 #endif /* HAVE_SENDMMSG */ 3361 3362 static int 3363 port_is_zero( 3364 #ifdef INET6 3365 struct sockaddr_storage *addr 3366 #else 3367 struct sockaddr_in *addr 3368 #endif 3369 ) 3370 { 3371 #ifdef INET6 3372 if(addr->ss_family == AF_INET6) { 3373 return (((struct sockaddr_in6 *)addr)->sin6_port) == 0; 3374 } else if(addr->ss_family == AF_INET) { 3375 return (((struct sockaddr_in *)addr)->sin_port) == 0; 3376 } 3377 return 0; 3378 #else 3379 if(addr->sin_family == AF_INET) { 3380 return addr->sin_port == 0; 3381 } 3382 return 0; 3383 #endif 3384 } 3385 3386 static void 3387 handle_udp(int fd, short event, void* arg) 3388 { 3389 struct udp_handler_data *data = (struct udp_handler_data *) arg; 3390 int received, sent, recvcount, i; 3391 struct query *q; 3392 uint32_t now = 0; 3393 3394 if (!(event & EV_READ)) { 3395 return; 3396 } 3397 recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL); 3398 /* this printf strangely gave a performance increase on Linux */ 3399 /* printf("recvcount %d \n", recvcount); */ 3400 if (recvcount == -1) { 3401 if (errno != EAGAIN && errno != EINTR) { 3402 log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno)); 3403 STATUP(data->nsd, rxerr); 3404 /* No zone statup */ 3405 } 3406 /* Simply no data available */ 3407 return; 3408 } 3409 for (i = 0; i < recvcount; i++) { 3410 loopstart: 3411 received = msgs[i].msg_len; 3412 queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen; 3413 q = queries[i]; 3414 if (received == -1) { 3415 log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror( 3416 #if defined(HAVE_RECVMMSG) 3417 msgs[i].msg_hdr.msg_flags 3418 #else 3419 errno 3420 #endif 3421 )); 3422 STATUP(data->nsd, rxerr); 3423 /* No zone statup */ 3424 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3425 iovecs[i].iov_len = buffer_remaining(q->packet); 3426 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3427 goto swap_drop; 3428 } 3429 3430 /* Account... */ 3431 #ifdef BIND8_STATS 3432 if (data->socket->addr.ai_family == AF_INET) { 3433 STATUP(data->nsd, qudp); 3434 } else if (data->socket->addr.ai_family == AF_INET6) { 3435 STATUP(data->nsd, qudp6); 3436 } 3437 #endif 3438 3439 buffer_skip(q->packet, received); 3440 buffer_flip(q->packet); 3441 #ifdef USE_DNSTAP 3442 /* 3443 * sending UDP-query with server address (local) and client address to dnstap process 3444 */ 3445 log_addr("query from client", &q->addr); 3446 log_addr("to server (local)", (void*)&data->socket->addr.ai_addr); 3447 dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &q->addr, q->addrlen, 3448 q->tcp, q->packet); 3449 #endif /* USE_DNSTAP */ 3450 3451 /* Process and answer the query... 
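	   Answers are written in place into the buffers that recvmmsg()
	   filled; dropped queries are swapped to the tail of
	   msgs[]/iovecs[]/queries[] (the swap_drop path below) so the
	   surviving answers stay contiguous and can be transmitted with
	   batched sendmmsg() calls afterwards.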
*/ 3452 if (server_process_query_udp(data->nsd, q, &now) != QUERY_DISCARDED) { 3453 if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) { 3454 STATUP(data->nsd, nona); 3455 ZTATUP(data->nsd, q->zone, nona); 3456 } 3457 3458 #ifdef USE_ZONE_STATS 3459 if (data->socket->addr.ai_family == AF_INET) { 3460 ZTATUP(data->nsd, q->zone, qudp); 3461 } else if (data->socket->addr.ai_family == AF_INET6) { 3462 ZTATUP(data->nsd, q->zone, qudp6); 3463 } 3464 #endif 3465 3466 /* Add EDNS0 and TSIG info if necessary. */ 3467 query_add_optional(q, data->nsd, &now); 3468 3469 buffer_flip(q->packet); 3470 iovecs[i].iov_len = buffer_remaining(q->packet); 3471 #ifdef BIND8_STATS 3472 /* Account the rcode & TC... */ 3473 STATUP2(data->nsd, rcode, RCODE(q->packet)); 3474 ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet)); 3475 if (TC(q->packet)) { 3476 STATUP(data->nsd, truncated); 3477 ZTATUP(data->nsd, q->zone, truncated); 3478 } 3479 #endif /* BIND8_STATS */ 3480 #ifdef USE_DNSTAP 3481 /* 3482 * sending UDP-response with server address (local) and client address to dnstap process 3483 */ 3484 log_addr("from server (local)", (void*)&data->socket->addr.ai_addr); 3485 log_addr("response to client", &q->addr); 3486 dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, 3487 &q->addr, q->addrlen, q->tcp, q->packet, 3488 q->zone); 3489 #endif /* USE_DNSTAP */ 3490 } else { 3491 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3492 iovecs[i].iov_len = buffer_remaining(q->packet); 3493 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3494 swap_drop: 3495 STATUP(data->nsd, dropped); 3496 ZTATUP(data->nsd, q->zone, dropped); 3497 if(i != recvcount-1) { 3498 /* swap with last and decrease recvcount */ 3499 struct mmsghdr mtmp = msgs[i]; 3500 struct iovec iotmp = iovecs[i]; 3501 recvcount--; 3502 msgs[i] = msgs[recvcount]; 3503 iovecs[i] = iovecs[recvcount]; 3504 queries[i] = queries[recvcount]; 3505 msgs[recvcount] = mtmp; 3506 iovecs[recvcount] = iotmp; 3507 queries[recvcount] = q; 3508 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 3509 msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount]; 3510 goto loopstart; 3511 } else { recvcount --; } 3512 } 3513 } 3514 3515 /* send until all are sent */ 3516 i = 0; 3517 while(i<recvcount) { 3518 sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0); 3519 if(sent == -1) { 3520 if(errno == ENOBUFS || 3521 #ifdef EWOULDBLOCK 3522 errno == EWOULDBLOCK || 3523 #endif 3524 errno == EAGAIN) { 3525 /* block to wait until send buffer avail */ 3526 int flag, errstore; 3527 if((flag = fcntl(fd, F_GETFL)) == -1) { 3528 log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno)); 3529 flag = 0; 3530 } 3531 flag &= ~O_NONBLOCK; 3532 if(fcntl(fd, F_SETFL, flag) == -1) 3533 log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno)); 3534 sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0); 3535 errstore = errno; 3536 flag |= O_NONBLOCK; 3537 if(fcntl(fd, F_SETFL, flag) == -1) 3538 log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno)); 3539 if(sent != -1) { 3540 i += sent; 3541 continue; 3542 } 3543 errno = errstore; 3544 } 3545 if(errno == EINVAL) { 3546 /* skip the invalid argument entry, 3547 * send the remaining packets in the list */ 3548 if(!(port_is_zero((void*)&queries[i]->addr) && 3549 verbosity < 3)) { 3550 const char* es = strerror(errno); 3551 char a[64]; 3552 addrport2str((void*)&queries[i]->addr, a, sizeof(a)); 3553 log_msg(LOG_ERR, "sendmmsg skip invalid argument [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es); 3554 } 3555 i += 1; 
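			/* Skipping a single entry and continuing implements
			 * "drop the bad datagram, keep the rest of the
			 * batch"; the fcntl() fallback above instead blocks
			 * the whole batch until buffer space is available. */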
3556 continue; 3557 } 3558 /* don't log transient network full errors, unless 3559 * on higher verbosity */ 3560 if(!(errno == ENOBUFS && verbosity < 1) && 3561 #ifdef EWOULDBLOCK 3562 errno != EWOULDBLOCK && 3563 #endif 3564 errno != EAGAIN) { 3565 const char* es = strerror(errno); 3566 char a[64]; 3567 addrport2str((void*)&queries[i]->addr, a, sizeof(a)); 3568 log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es); 3569 } 3570 #ifdef BIND8_STATS 3571 data->nsd->st.txerr += recvcount-i; 3572 #endif /* BIND8_STATS */ 3573 break; 3574 } 3575 i += sent; 3576 } 3577 for(i=0; i<recvcount; i++) { 3578 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3579 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 3580 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3581 } 3582 } 3583 3584 #ifdef HAVE_SSL 3585 /* 3586 * Setup an event for the tcp handler. 3587 */ 3588 static void 3589 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *), 3590 int fd, short event) 3591 { 3592 struct timeval timeout; 3593 struct event_base* ev_base; 3594 3595 timeout.tv_sec = data->nsd->tcp_timeout; 3596 timeout.tv_usec = 0L; 3597 3598 ev_base = data->event.ev_base; 3599 event_del(&data->event); 3600 memset(&data->event, 0, sizeof(data->event)); 3601 event_set(&data->event, fd, event, fn, data); 3602 if(event_base_set(ev_base, &data->event) != 0) 3603 log_msg(LOG_ERR, "event base set failed"); 3604 if(event_add(&data->event, &timeout) != 0) 3605 log_msg(LOG_ERR, "event add failed"); 3606 } 3607 #endif /* HAVE_SSL */ 3608 3609 static void 3610 cleanup_tcp_handler(struct tcp_handler_data* data) 3611 { 3612 event_del(&data->event); 3613 #ifdef HAVE_SSL 3614 if(data->tls) { 3615 SSL_shutdown(data->tls); 3616 SSL_free(data->tls); 3617 data->tls = NULL; 3618 } 3619 #endif 3620 close(data->event.ev_fd); 3621 if(data->prev) 3622 data->prev->next = data->next; 3623 else tcp_active_list = data->next; 3624 if(data->next) 3625 data->next->prev = data->prev; 3626 3627 /* 3628 * Enable the TCP accept handlers when the current number of 3629 * TCP connections is about to drop below the maximum number 3630 * of TCP connections. 3631 */ 3632 if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) { 3633 configure_handler_event_types(EV_READ|EV_PERSIST); 3634 if(slowaccept) { 3635 event_del(&slowaccept_event); 3636 slowaccept = 0; 3637 } 3638 } 3639 --data->nsd->current_tcp_count; 3640 assert(data->nsd->current_tcp_count >= 0); 3641 3642 region_destroy(data->region); 3643 } 3644 3645 static void 3646 handle_tcp_reading(int fd, short event, void* arg) 3647 { 3648 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 3649 ssize_t received; 3650 struct event_base* ev_base; 3651 struct timeval timeout; 3652 uint32_t now = 0; 3653 3654 if ((event & EV_TIMEOUT)) { 3655 /* Connection timed out. */ 3656 cleanup_tcp_handler(data); 3657 return; 3658 } 3659 3660 if ((data->nsd->tcp_query_count > 0 && 3661 data->query_count >= data->nsd->tcp_query_count) || 3662 data->tcp_no_more_queries) { 3663 /* No more queries allowed on this tcp connection. */ 3664 cleanup_tcp_handler(data); 3665 return; 3666 } 3667 3668 assert((event & EV_READ)); 3669 3670 if (data->bytes_transmitted == 0) { 3671 query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1); 3672 } 3673 3674 /* 3675 * Check if we received the leading packet length bytes yet. 
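	 *
	 * DNS over TCP (RFC 1035, section 4.2.2) prefixes each message
	 * with a two-octet length in network byte order:
	 *
	 *   +--------+--------+-------------------------+
	 *   | len hi | len lo | DNS message (len bytes) |
	 *   +--------+--------+-------------------------+
	 *
	 * bytes_transmitted tracks progress across partial reads of both
	 * the length prefix and the message body.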
3676 */ 3677 if (data->bytes_transmitted < sizeof(uint16_t)) { 3678 received = read(fd, 3679 (char *) &data->query->tcplen 3680 + data->bytes_transmitted, 3681 sizeof(uint16_t) - data->bytes_transmitted); 3682 if (received == -1) { 3683 if (errno == EAGAIN || errno == EINTR) { 3684 /* 3685 * Read would block, wait until more 3686 * data is available. 3687 */ 3688 return; 3689 } else { 3690 char buf[48]; 3691 addr2str(&data->query->addr, buf, sizeof(buf)); 3692 #ifdef ECONNRESET 3693 if (verbosity >= 2 || errno != ECONNRESET) 3694 #endif /* ECONNRESET */ 3695 log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno)); 3696 cleanup_tcp_handler(data); 3697 return; 3698 } 3699 } else if (received == 0) { 3700 /* EOF */ 3701 cleanup_tcp_handler(data); 3702 return; 3703 } 3704 3705 data->bytes_transmitted += received; 3706 if (data->bytes_transmitted < sizeof(uint16_t)) { 3707 /* 3708 * Not done with the tcplen yet, wait for more 3709 * data to become available. 3710 */ 3711 return; 3712 } 3713 3714 assert(data->bytes_transmitted == sizeof(uint16_t)); 3715 3716 data->query->tcplen = ntohs(data->query->tcplen); 3717 3718 /* 3719 * Minimum query size is: 3720 * 3721 * Size of the header (12) 3722 * + Root domain name (1) 3723 * + Query class (2) 3724 * + Query type (2) 3725 */ 3726 if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) { 3727 VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection")); 3728 cleanup_tcp_handler(data); 3729 return; 3730 } 3731 3732 if (data->query->tcplen > data->query->maxlen) { 3733 VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection")); 3734 cleanup_tcp_handler(data); 3735 return; 3736 } 3737 3738 buffer_set_limit(data->query->packet, data->query->tcplen); 3739 } 3740 3741 assert(buffer_remaining(data->query->packet) > 0); 3742 3743 /* Read the (remaining) query data. */ 3744 received = read(fd, 3745 buffer_current(data->query->packet), 3746 buffer_remaining(data->query->packet)); 3747 if (received == -1) { 3748 if (errno == EAGAIN || errno == EINTR) { 3749 /* 3750 * Read would block, wait until more data is 3751 * available. 3752 */ 3753 return; 3754 } else { 3755 char buf[48]; 3756 addr2str(&data->query->addr, buf, sizeof(buf)); 3757 #ifdef ECONNRESET 3758 if (verbosity >= 2 || errno != ECONNRESET) 3759 #endif /* ECONNRESET */ 3760 log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno)); 3761 cleanup_tcp_handler(data); 3762 return; 3763 } 3764 } else if (received == 0) { 3765 /* EOF */ 3766 cleanup_tcp_handler(data); 3767 return; 3768 } 3769 3770 data->bytes_transmitted += received; 3771 buffer_skip(data->query->packet, received); 3772 if (buffer_remaining(data->query->packet) > 0) { 3773 /* 3774 * Message not yet complete, wait for more data to 3775 * become available. 3776 */ 3777 return; 3778 } 3779 3780 assert(buffer_position(data->query->packet) == data->query->tcplen); 3781 3782 /* Account... */ 3783 #ifdef BIND8_STATS 3784 #ifndef INET6 3785 STATUP(data->nsd, ctcp); 3786 #else 3787 if (data->query->addr.ss_family == AF_INET) { 3788 STATUP(data->nsd, ctcp); 3789 } else if (data->query->addr.ss_family == AF_INET6) { 3790 STATUP(data->nsd, ctcp6); 3791 } 3792 #endif 3793 #endif /* BIND8_STATS */ 3794 3795 /* We have a complete query, process it. 

	/* tcp-query-count: handle query counter ++ */
	data->query_count++;

	buffer_flip(data->query->packet);
#ifdef USE_DNSTAP
	/*
	 * Send the TCP query, with the (local) server address and the
	 * client address, to the dnstap collector process.
	 */
	log_addr("query from client", &data->query->addr);
	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet);
#endif /* USE_DNSTAP */
	data->query_state = server_process_query(data->nsd, data->query, &now);
	if (data->query_state == QUERY_DISCARDED) {
		/* Drop the packet and the entire connection... */
		STATUP(data->nsd, dropped);
		ZTATUP(data->nsd, data->query->zone, dropped);
		cleanup_tcp_handler(data);
		return;
	}

#ifdef BIND8_STATS
	if (RCODE(data->query->packet) == RCODE_OK
		&& !AA(data->query->packet))
	{
		STATUP(data->nsd, nona);
		ZTATUP(data->nsd, data->query->zone, nona);
	}
#endif /* BIND8_STATS */

#ifdef USE_ZONE_STATS
#ifndef INET6
	ZTATUP(data->nsd, data->query->zone, ctcp);
#else
	if (data->query->addr.ss_family == AF_INET) {
		ZTATUP(data->nsd, data->query->zone, ctcp);
	} else if (data->query->addr.ss_family == AF_INET6) {
		ZTATUP(data->nsd, data->query->zone, ctcp6);
	}
#endif
#endif /* USE_ZONE_STATS */

	query_add_optional(data->query, data->nsd, &now);

	/* Switch to the tcp write handler. */
	buffer_flip(data->query->packet);
	data->query->tcplen = buffer_remaining(data->query->packet);
#ifdef BIND8_STATS
	/* Account the rcode & TC... */
	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
	if (TC(data->query->packet)) {
		STATUP(data->nsd, truncated);
		ZTATUP(data->nsd, data->query->zone, truncated);
	}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
	/*
	 * Send the TCP response, with the (local) server address found
	 * earlier and the client address, to the dnstap collector process.
	 */
	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
	log_addr("response to client", &data->query->addr);
	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet,
		data->query->zone);
#endif /* USE_DNSTAP */
	data->bytes_transmitted = 0;

	timeout.tv_sec = data->tcp_timeout / 1000;
	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;

	ev_base = data->event.ev_base;
	event_del(&data->event);
	memset(&data->event, 0, sizeof(data->event));
	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
		handle_tcp_reading, data);
	if(event_base_set(ev_base, &data->event) != 0)
		log_msg(LOG_ERR, "event base set tcpr failed");
	if(event_add(&data->event, &timeout) != 0)
		log_msg(LOG_ERR, "event add tcpr failed");
	/* see if we can write the answer right away (usually we can; EAGAIN if not) */
	handle_tcp_writing(fd, EV_WRITE, data);
}

static void
handle_tcp_writing(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t sent;
	struct query *q = data->query;
	struct timeval timeout;
	struct event_base* ev_base;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_WRITE));

	if (data->bytes_transmitted < sizeof(q->tcplen)) {
		/* Writing the response packet length. */
		uint16_t n_tcplen = htons(q->tcplen);
#ifdef HAVE_WRITEV
		struct iovec iov[2];
		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
		iov[1].iov_base = buffer_begin(q->packet);
		iov[1].iov_len = buffer_limit(q->packet);
		sent = writev(fd, iov, 2);
#else /* HAVE_WRITEV */
		sent = write(fd,
			     (const char *) &n_tcplen + data->bytes_transmitted,
			     sizeof(n_tcplen) - data->bytes_transmitted);
#endif /* HAVE_WRITEV */
		if (sent == -1) {
			if (errno == EAGAIN || errno == EINTR) {
				/*
				 * Write would block, wait until
				 * socket becomes writable again.
				 */
				return;
			} else {
#ifdef ECONNRESET
				if(verbosity >= 2 || errno != ECONNRESET)
#endif /* ECONNRESET */
#ifdef EPIPE
				if(verbosity >= 2 || errno != EPIPE)
#endif /* EPIPE 'broken pipe' */
				log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
				cleanup_tcp_handler(data);
				return;
			}
		}

		data->bytes_transmitted += sent;
		if (data->bytes_transmitted < sizeof(q->tcplen)) {
			/*
			 * Writing not complete, wait until socket
			 * becomes writable again.
			 */
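			/*
			 * Note that write()/writev() may report a short
			 * count covering only part of the two-octet length
			 * prefix (e.g. sent == 1); we then resume from the
			 * second prefix octet on the next invocation.
			 */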
			return;
		}

#ifdef HAVE_WRITEV
		sent -= sizeof(n_tcplen);
		/* writev may also have written (part of) the packet;
		 * fall through to the packet-done check below */
		goto packet_could_be_done;
#endif
	}

	sent = write(fd,
		     buffer_current(q->packet),
		     buffer_remaining(q->packet));
	if (sent == -1) {
		if (errno == EAGAIN || errno == EINTR) {
			/*
			 * Write would block, wait until
			 * socket becomes writable again.
			 */
			return;
		} else {
#ifdef ECONNRESET
			if(verbosity >= 2 || errno != ECONNRESET)
#endif /* ECONNRESET */
#ifdef EPIPE
			if(verbosity >= 2 || errno != EPIPE)
#endif /* EPIPE 'broken pipe' */
			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
			cleanup_tcp_handler(data);
			return;
		}
	}

	data->bytes_transmitted += sent;
#ifdef HAVE_WRITEV
  packet_could_be_done:
#endif
	buffer_skip(q->packet, sent);
	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
		/*
		 * Still more data to write when socket becomes
		 * writable again.
		 */
		return;
	}

	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));

	if (data->query_state == QUERY_IN_AXFR) {
		/* Continue processing AXFR and writing back results. */
		buffer_clear(q->packet);
		data->query_state = query_axfr(data->nsd, q);
		if (data->query_state != QUERY_PROCESSED) {
			query_add_optional(data->query, data->nsd, &now);

			/* Reset data. */
			buffer_flip(q->packet);
			q->tcplen = buffer_remaining(q->packet);
			data->bytes_transmitted = 0;
			/* Reset timeout. */
			timeout.tv_sec = data->tcp_timeout / 1000;
			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
			ev_base = data->event.ev_base;
			event_del(&data->event);
			memset(&data->event, 0, sizeof(data->event));
			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
				handle_tcp_writing, data);
			if(event_base_set(ev_base, &data->event) != 0)
				log_msg(LOG_ERR, "event base set tcpw failed");
			if(event_add(&data->event, &timeout) != 0)
				log_msg(LOG_ERR, "event add tcpw failed");

			/*
			 * Write data if/when the socket is writable
			 * again.
			 */
			return;
		}
	}

	/*
	 * Done sending, wait for the next request to arrive on the
	 * TCP socket by installing the TCP read handler.
	 */
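	/*
	 * If the query limit for this connection has been reached, the
	 * socket is half-closed with shutdown(SHUT_WR) below: the client
	 * sees EOF once the final answer has drained, while anything
	 * still in flight from the client can be read and discarded
	 * instead of triggering a connection reset.
	 */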
	if ((data->nsd->tcp_query_count > 0 &&
	     data->query_count >= data->nsd->tcp_query_count) ||
	    data->tcp_no_more_queries) {

		(void) shutdown(fd, SHUT_WR);
	}

	data->bytes_transmitted = 0;

	timeout.tv_sec = data->tcp_timeout / 1000;
	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
	ev_base = data->event.ev_base;
	event_del(&data->event);
	memset(&data->event, 0, sizeof(data->event));
	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
		handle_tcp_reading, data);
	if(event_base_set(ev_base, &data->event) != 0)
		log_msg(LOG_ERR, "event base set tcpw failed");
	if(event_add(&data->event, &timeout) != 0)
		log_msg(LOG_ERR, "event add tcpw failed");
}

#ifdef HAVE_SSL
/** create SSL object and associate fd */
static SSL*
incoming_ssl_fd(SSL_CTX* ctx, int fd)
{
	SSL* ssl = SSL_new(ctx);
	if(!ssl) {
		log_crypto_err("could not SSL_new");
		return NULL;
	}
	SSL_set_accept_state(ssl);
	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
	if(!SSL_set_fd(ssl, fd)) {
		log_crypto_err("could not SSL_set_fd");
		SSL_free(ssl);
		return NULL;
	}
	return ssl;
}

/** TLS handshake to upgrade TCP connection */
static int
tls_handshake(struct tcp_handler_data* data, int fd, int writing)
{
	int r;
	if(data->shake_state == tls_hs_read_event) {
		/* read condition satisfied; switch back to writing */
		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
		data->shake_state = tls_hs_none;
		return 1;
	}
	if(data->shake_state == tls_hs_write_event) {
		/* write condition satisfied; switch back to reading */
		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
		data->shake_state = tls_hs_none;
		return 1;
	}

	/* (continue to) set up the TLS connection */
	ERR_clear_error();
	r = SSL_do_handshake(data->tls);

	if(r != 1) {
		int want = SSL_get_error(data->tls, r);
		if(want == SSL_ERROR_WANT_READ) {
			if(data->shake_state == tls_hs_read) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_read;
			/* switch back to reading mode */
			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
			return 1;
		} else if(want == SSL_ERROR_WANT_WRITE) {
			if(data->shake_state == tls_hs_write) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_write;
			/* switch back to writing mode */
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
			return 1;
		} else {
			if(r == 0)
				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
			else {
				unsigned long err = ERR_get_error();
				if(!squelch_err_ssl_handshake(err)) {
					char a[64], s[256];
					addr2str(&data->query->addr, a, sizeof(a));
					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
					log_crypto_from_err(s, err);
				}
			}
			cleanup_tcp_handler(data);
			return 0;
		}
	}

	/* Log the successful upgrade, for testing; could be removed. */
	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
	/* set back to the event we need to have when reading (or writing) */
	if(data->shake_state == tls_hs_read && writing) {
		tcp_handler_setup_event(data, handle_tls_writing, fd,
			EV_PERSIST|EV_TIMEOUT|EV_WRITE);
	} else if(data->shake_state == tls_hs_write && !writing) {
		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
	}
	data->shake_state = tls_hs_none;
	return 1;
}

/** handle TLS reading of incoming query */
static void
handle_tls_reading(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t received;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out. */
		cleanup_tcp_handler(data);
		return;
	}

	if ((data->nsd->tcp_query_count > 0 &&
	     data->query_count >= data->nsd->tcp_query_count) ||
	    data->tcp_no_more_queries) {
		/* No more queries allowed on this tcp connection. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_READ));

	if (data->bytes_transmitted == 0) {
		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
	}

	if(data->shake_state != tls_hs_none) {
		if(!tls_handshake(data, fd, 0))
			return;
		if(data->shake_state != tls_hs_none)
			return;
	}

	/*
	 * Check if we received the leading packet length bytes yet.
	 */
	if(data->bytes_transmitted < sizeof(uint16_t)) {
		ERR_clear_error();
		if((received=SSL_read(data->tls, (char *) &data->query->tcplen
			+ data->bytes_transmitted,
			sizeof(uint16_t) - data->bytes_transmitted)) <= 0) {
			int want = SSL_get_error(data->tls, received);
			if(want == SSL_ERROR_ZERO_RETURN) {
				cleanup_tcp_handler(data);
				return; /* shutdown, closed */
			} else if(want == SSL_ERROR_WANT_READ) {
				/* wants to be called again */
				return;
			} else if(want == SSL_ERROR_WANT_WRITE) {
				/* switch to writing */
				data->shake_state = tls_hs_write_event;
				tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
				return;
			}
			cleanup_tcp_handler(data);
			log_crypto_err("could not SSL_read");
			return;
		}

		data->bytes_transmitted += received;
		if (data->bytes_transmitted < sizeof(uint16_t)) {
			/*
			 * Not done with the tcplen yet, wait for more
			 * data to become available.
			 */
			return;
		}

		assert(data->bytes_transmitted == sizeof(uint16_t));

		data->query->tcplen = ntohs(data->query->tcplen);

		/*
		 * Minimum query size is:
		 *
		 *     Size of the header (12)
		 *   + Root domain name   (1)
		 *   + Query class        (2)
		 *   + Query type         (2)
		 */
		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
			cleanup_tcp_handler(data);
			return;
		}

		if (data->query->tcplen > data->query->maxlen) {
			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
			cleanup_tcp_handler(data);
			return;
		}

		buffer_set_limit(data->query->packet, data->query->tcplen);
	}

	assert(buffer_remaining(data->query->packet) > 0);

	/* Read the (remaining) query data. */
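	/*
	 * Note: SSL_read() operates on whole TLS records and may itself
	 * need to write to the socket (SSL_ERROR_WANT_WRITE, e.g. during
	 * renegotiation). The WANT_WRITE case below therefore parks the
	 * handler on the write event and resumes reading once the write
	 * condition is satisfied (see tls_handshake()).
	 */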
	ERR_clear_error();
	received = SSL_read(data->tls, (void*)buffer_current(data->query->packet),
		(int)buffer_remaining(data->query->packet));
	if(received <= 0) {
		int want = SSL_get_error(data->tls, received);
		if(want == SSL_ERROR_ZERO_RETURN) {
			cleanup_tcp_handler(data);
			return; /* shutdown, closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* wants to be called again */
			return;
		} else if(want == SSL_ERROR_WANT_WRITE) {
			/* switch back to writing */
			data->shake_state = tls_hs_write_event;
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
			return;
		}
		cleanup_tcp_handler(data);
		log_crypto_err("could not SSL_read");
		return;
	}

	data->bytes_transmitted += received;
	buffer_skip(data->query->packet, received);
	if (buffer_remaining(data->query->packet) > 0) {
		/*
		 * Message not yet complete, wait for more data to
		 * become available.
		 */
		return;
	}

	assert(buffer_position(data->query->packet) == data->query->tcplen);

	/* Account... */
#ifndef INET6
	STATUP(data->nsd, ctls);
#else
	if (data->query->addr.ss_family == AF_INET) {
		STATUP(data->nsd, ctls);
	} else if (data->query->addr.ss_family == AF_INET6) {
		STATUP(data->nsd, ctls6);
	}
#endif

	/* We have a complete query, process it. */

	/* tcp-query-count: handle query counter ++ */
	data->query_count++;

	buffer_flip(data->query->packet);
#ifdef USE_DNSTAP
	/*
	 * Send the TCP query, with the (local) server address and the
	 * client address, to the dnstap collector process.
	 */
	log_addr("query from client", &data->query->addr);
	log_addr("to server (local)", (void*)&data->socket->addr.ai_addr);
	dt_collector_submit_auth_query(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet);
#endif /* USE_DNSTAP */
	data->query_state = server_process_query(data->nsd, data->query, &now);
	if (data->query_state == QUERY_DISCARDED) {
		/* Drop the packet and the entire connection... */
		STATUP(data->nsd, dropped);
		ZTATUP(data->nsd, data->query->zone, dropped);
		cleanup_tcp_handler(data);
		return;
	}

#ifdef BIND8_STATS
	if (RCODE(data->query->packet) == RCODE_OK
		&& !AA(data->query->packet))
	{
		STATUP(data->nsd, nona);
		ZTATUP(data->nsd, data->query->zone, nona);
	}
#endif /* BIND8_STATS */

#ifdef USE_ZONE_STATS
#ifndef INET6
	ZTATUP(data->nsd, data->query->zone, ctls);
#else
	if (data->query->addr.ss_family == AF_INET) {
		ZTATUP(data->nsd, data->query->zone, ctls);
	} else if (data->query->addr.ss_family == AF_INET6) {
		ZTATUP(data->nsd, data->query->zone, ctls6);
	}
#endif
#endif /* USE_ZONE_STATS */

	query_add_optional(data->query, data->nsd, &now);

	/* Switch to the tcp write handler. */
	buffer_flip(data->query->packet);
	data->query->tcplen = buffer_remaining(data->query->packet);
#ifdef BIND8_STATS
	/* Account the rcode & TC... */
	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
	if (TC(data->query->packet)) {
		STATUP(data->nsd, truncated);
		ZTATUP(data->nsd, data->query->zone, truncated);
	}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
	/*
	 * Send the TCP response, with the (local) server address found
	 * earlier and the client address, to the dnstap collector process.
	 */
	log_addr("from server (local)", (void*)&data->socket->addr.ai_addr);
	log_addr("response to client", &data->query->addr);
	dt_collector_submit_auth_response(data->nsd, (void*)&data->socket->addr.ai_addr, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet,
		data->query->zone);
#endif /* USE_DNSTAP */
	data->bytes_transmitted = 0;

	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);

	/* see if we can write the answer right away (usually we can; EAGAIN if not) */
	handle_tls_writing(fd, EV_WRITE, data);
}

/** handle TLS writing of outgoing response */
static void
handle_tls_writing(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t sent;
	struct query *q = data->query;
	/* static reassembly buffer used to put the TCP length in front
	 * of the packet, serving the same purpose as writev() in the
	 * plain TCP path. */
	static buffer_type* global_tls_temp_buffer = NULL;
	buffer_type* write_buffer;
	uint32_t now = 0;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_WRITE));

	if(data->shake_state != tls_hs_none) {
		if(!tls_handshake(data, fd, 1))
			return;
		if(data->shake_state != tls_hs_none)
			return;
	}

	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);

	/* If we are writing the start of a message, we must include the
	 * length; this is done with a copy into write_buffer. */
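	/*
	 * Unlike the plain TCP path, which hands the length prefix and
	 * the packet to writev() as two iovecs, SSL_write() takes a
	 * single contiguous buffer; the copy below also avoids emitting
	 * the two-octet prefix as a tiny TLS record of its own.
	 */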
	write_buffer = NULL;
	if (data->bytes_transmitted == 0) {
		if(!global_tls_temp_buffer) {
			/* allocated in nsd.region, so it is deallocated
			 * when nsd shuts down */
			global_tls_temp_buffer = buffer_create(nsd.region,
				QIOBUFSZ + sizeof(q->tcplen));
			if (!global_tls_temp_buffer) {
				return;
			}
		}
		write_buffer = global_tls_temp_buffer;
		buffer_clear(write_buffer);
		buffer_write_u16(write_buffer, q->tcplen);
		buffer_write(write_buffer, buffer_current(q->packet),
			(int)buffer_remaining(q->packet));
		buffer_flip(write_buffer);
	} else {
		write_buffer = q->packet;
	}

	/* Write the response */
	ERR_clear_error();
	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
	if(sent <= 0) {
		int want = SSL_get_error(data->tls, sent);
		if(want == SSL_ERROR_ZERO_RETURN) {
			cleanup_tcp_handler(data);
			/* closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* switch back to reading */
			data->shake_state = tls_hs_read_event;
			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
		} else if(want != SSL_ERROR_WANT_WRITE) {
			cleanup_tcp_handler(data);
			log_crypto_err("could not SSL_write");
		}
		return;
	}

	buffer_skip(write_buffer, sent);
	if(buffer_remaining(write_buffer) != 0) {
		/* Not all data was sent. If the temporary buffer was in
		 * use, advance the real packet buffer past the portion
		 * that was written. */
		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
		}
	}

	data->bytes_transmitted += sent;
	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
		/*
		 * Still more data to write when socket becomes
		 * writable again.
		 */
		return;
	}

	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));

	if (data->query_state == QUERY_IN_AXFR) {
		/* Continue processing AXFR and writing back results. */
		buffer_clear(q->packet);
		data->query_state = query_axfr(data->nsd, q);
		if (data->query_state != QUERY_PROCESSED) {
			query_add_optional(data->query, data->nsd, &now);

			/* Reset data. */
			buffer_flip(q->packet);
			q->tcplen = buffer_remaining(q->packet);
			data->bytes_transmitted = 0;
			/* Reset to writing mode. */
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);

			/*
			 * Write data if/when the socket is writable
			 * again.
			 */
			return;
		}
	}

	/*
	 * Done sending, wait for the next request to arrive on the
	 * TCP socket by installing the TCP read handler.
	 */
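	/*
	 * As in the plain TCP path, this is a TCP-level half-close; the
	 * TLS close_notify alert is left to SSL_shutdown(), which is
	 * called from cleanup_tcp_handler().
	 */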
	if ((data->nsd->tcp_query_count > 0 &&
	     data->query_count >= data->nsd->tcp_query_count) ||
	    data->tcp_no_more_queries) {

		(void) shutdown(fd, SHUT_WR);
	}

	data->bytes_transmitted = 0;

	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
}
#endif

static void
handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
	void* ATTR_UNUSED(arg))
{
	if(slowaccept) {
		configure_handler_event_types(EV_PERSIST | EV_READ);
		slowaccept = 0;
	}
}

static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
{
#ifndef HAVE_ACCEPT4
	int s = accept(fd, addr, addrlen);
	if (s != -1) {
		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
			close(s);
			s = -1;
			/* set errno to EINTR so that the caller treats
			 * this like an interrupted accept() and does not
			 * log it; the fcntl() error was already reported
			 * above */
			errno = EINTR;
		}
	}
	return s;
#else
	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
#endif /* HAVE_ACCEPT4 */
}

/*
 * Handle an incoming TCP connection. The connection is accepted and
 * a new TCP reader event handler is added. The TCP handler
 * is responsible for cleanup when the connection is closed.
 */
static void
handle_tcp_accept(int fd, short event, void* arg)
{
	struct tcp_accept_handler_data *data
		= (struct tcp_accept_handler_data *) arg;
	int s;
	int reject = 0;
	struct tcp_handler_data *tcp_data;
	region_type *tcp_region;
#ifdef INET6
	struct sockaddr_storage addr;
#else
	struct sockaddr_in addr;
#endif
	socklen_t addrlen;
	struct timeval timeout;

	if (!(event & EV_READ)) {
		return;
	}

	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
		reject = data->nsd->options->tcp_reject_overflow;
		if (!reject) {
			return;
		}
	}

	/* Accept it... */
	addrlen = sizeof(addr);
	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
	if (s == -1) {
		/**
		 * EMFILE and ENFILE signal that the limit of open
		 * file descriptors has been reached. Pause accept().
		 * EINTR is a signal interrupt. The other errno values
		 * are various OS ways of saying that the client has
		 * closed the connection.
		 */
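		/*
		 * For EMFILE/ENFILE the code below pauses all accept
		 * handlers for SLOW_ACCEPT_TIMEOUT seconds rather than
		 * busy-looping on a descriptor limit that every
		 * subsequent accept() would hit as well.
		 */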
		if (errno == EMFILE || errno == ENFILE) {
			if (!slowaccept) {
				/* disable accept events */
				struct timeval tv;
				configure_handler_event_types(0);
				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
				tv.tv_usec = 0L;
				memset(&slowaccept_event, 0,
					sizeof(slowaccept_event));
				event_set(&slowaccept_event, -1, EV_TIMEOUT,
					handle_slowaccept_timeout, NULL);
				(void)event_base_set(data->event.ev_base,
					&slowaccept_event);
				(void)event_add(&slowaccept_event, &tv);
				slowaccept = 1;
				/* We don't want to spam the logs here */
			}
		} else if (errno != EINTR
			&& errno != EWOULDBLOCK
#ifdef ECONNABORTED
			&& errno != ECONNABORTED
#endif /* ECONNABORTED */
#ifdef EPROTO
			&& errno != EPROTO
#endif /* EPROTO */
			) {
			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
		}
		return;
	}

	if (reject) {
		shutdown(s, SHUT_RDWR);
		close(s);
		return;
	}

	/*
	 * This region is deallocated when the TCP connection is
	 * closed by the TCP handler.
	 */
	tcp_region = region_create(xalloc, free);
	tcp_data = (struct tcp_handler_data *) region_alloc(
		tcp_region, sizeof(struct tcp_handler_data));
	tcp_data->region = tcp_region;
	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
		compression_table_size, compressed_dnames);
	tcp_data->nsd = data->nsd;
	tcp_data->query_count = 0;
#ifdef HAVE_SSL
	tcp_data->shake_state = tls_hs_none;
	tcp_data->tls = NULL;
#endif
	tcp_data->prev = NULL;
	tcp_data->next = NULL;

	tcp_data->query_state = QUERY_PROCESSED;
	tcp_data->bytes_transmitted = 0;
	memcpy(&tcp_data->query->addr, &addr, addrlen);
	tcp_data->query->addrlen = addrlen;

	tcp_data->tcp_no_more_queries = 0;
	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
		/* very busy, so use a shorter timeout */
		tcp_data->tcp_timeout = 200;
	}
	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;

#ifdef USE_DNSTAP
	/* remember the accept socket, so the local (service) address of
	 * the connection can be found */
	tcp_data->socket = data->socket;
#endif /* USE_DNSTAP */

#ifdef HAVE_SSL
	if (data->tls_accept) {
		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
		if(!tcp_data->tls) {
			close(s);
			return;
		}
		tcp_data->shake_state = tls_hs_read;
		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
			handle_tls_reading, tcp_data);
	} else {
#endif
		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
			handle_tcp_reading, tcp_data);
#ifdef HAVE_SSL
	}
#endif
	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
		log_msg(LOG_ERR, "cannot set tcp event base");
		close(s);
		region_destroy(tcp_region);
		return;
	}
	if(event_add(&tcp_data->event, &timeout) != 0) {
		log_msg(LOG_ERR, "cannot add tcp to event base");
		close(s);
		region_destroy(tcp_region);
		return;
	}
	if(tcp_active_list) {
		tcp_active_list->prev = tcp_data;
		tcp_data->next = tcp_active_list;
	}
	tcp_active_list = tcp_data;

	/*
	 * Keep track of the total number of TCP handlers installed so
	 * we can stop accepting connections when the maximum number
	 * of simultaneous TCP connections is reached.
	 *
	 * If tcp-reject-overflow is enabled, however, then we do not
	 * change the handler event type; we keep it as-is and accept
	 * overflow TCP connections only so that we can forcibly kill
	 * them off.
	 */
	++data->nsd->current_tcp_count;
	if (!data->nsd->options->tcp_reject_overflow &&
	    data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
	{
		configure_handler_event_types(0);
	}
}

static void
send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
{
	size_t i;
	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
			if (write(nsd->children[i].child_fd,
				&command,
				sizeof(command)) == -1)
			{
				if(errno != EAGAIN && errno != EINTR)
					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
						(int) command,
						(int) nsd->children[i].pid,
						strerror(errno));
			} else if (timeout > 0) {
				(void)block_read(NULL,
					nsd->children[i].child_fd,
					&command, sizeof(command), timeout);
			}
			fsync(nsd->children[i].child_fd);
			close(nsd->children[i].child_fd);
			nsd->children[i].child_fd = -1;
		}
	}
}

static void
send_children_quit(struct nsd* nsd)
{
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
	send_children_command(nsd, NSD_QUIT, 0);
}

static void
send_children_quit_and_wait(struct nsd* nsd)
{
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
	send_children_command(nsd, NSD_QUIT_CHILD, 3);
}

#ifdef BIND8_STATS
static void
set_children_stats(struct nsd* nsd)
{
	size_t i;
	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].need_to_send_STATS = 1;
		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
	}
}
#endif /* BIND8_STATS */

static void
configure_handler_event_types(short event_types)
{
	size_t i;

	for (i = 0; i < tcp_accept_handler_count; ++i) {
		struct event* handler = &tcp_accept_handlers[i].event;
		if(event_types) {
			/* reassign */
			int fd = handler->ev_fd;
			struct event_base* base = handler->ev_base;
			if(tcp_accept_handlers[i].event_added)
				event_del(handler);
			memset(handler, 0, sizeof(*handler));
			event_set(handler, fd, event_types,
				handle_tcp_accept, &tcp_accept_handlers[i]);
			if(event_base_set(base, handler) != 0)
				log_msg(LOG_ERR, "conhand: cannot event_base");
			if(event_add(handler, NULL) != 0)
				log_msg(LOG_ERR, "conhand: cannot event_add");
			tcp_accept_handlers[i].event_added = 1;
		} else {
			/* remove */
			if(tcp_accept_handlers[i].event_added) {
				event_del(handler);
				tcp_accept_handlers[i].event_added = 0;
			}
		}
	}
}
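/*
 * Note: configure_handler_event_types() above is the single switch for
 * throttling TCP accepts. It is called with EV_READ|EV_PERSIST to
 * (re)enable the accept handlers and with 0 to disable them, e.g. when
 * the maximum number of TCP connections is reached or when accept()
 * runs out of file descriptors.
 */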