1 /* 2 * server.c -- nsd(8) network input/output 3 * 4 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. 5 * 6 * See LICENSE for the license. 7 * 8 */ 9 10 #include "config.h" 11 12 #include <sys/types.h> 13 #include <sys/param.h> 14 #include <limits.h> 15 #include <sys/socket.h> 16 #include <sys/uio.h> 17 #include <sys/wait.h> 18 19 #include <netinet/in.h> 20 #ifdef USE_TCP_FASTOPEN 21 #include <netinet/tcp.h> 22 #endif 23 #include <arpa/inet.h> 24 25 #include <assert.h> 26 #include <ctype.h> 27 #include <errno.h> 28 #include <fcntl.h> 29 #include <stddef.h> 30 #include <stdio.h> 31 #include <stdlib.h> 32 #include <string.h> 33 #include <time.h> 34 #include <unistd.h> 35 #include <signal.h> 36 #include <netdb.h> 37 #include <poll.h> 38 #ifdef HAVE_SYS_RANDOM_H 39 #include <sys/random.h> 40 #endif 41 #ifndef SHUT_WR 42 #define SHUT_WR 1 43 #endif 44 #ifdef HAVE_MMAP 45 #include <sys/mman.h> 46 #endif /* HAVE_MMAP */ 47 #ifdef HAVE_OPENSSL_RAND_H 48 #include <openssl/rand.h> 49 #endif 50 #ifdef HAVE_OPENSSL_SSL_H 51 #include <openssl/ssl.h> 52 #endif 53 #ifdef HAVE_OPENSSL_ERR_H 54 #include <openssl/err.h> 55 #endif 56 #ifdef HAVE_OPENSSL_OCSP_H 57 #include <openssl/ocsp.h> 58 #endif 59 #ifndef USE_MINI_EVENT 60 # ifdef HAVE_EVENT_H 61 # include <event.h> 62 # else 63 # include <event2/event.h> 64 # include "event2/event_struct.h" 65 # include "event2/event_compat.h" 66 # endif 67 #else 68 # include "mini_event.h" 69 #endif 70 71 #include "axfr.h" 72 #include "namedb.h" 73 #include "netio.h" 74 #include "xfrd.h" 75 #include "xfrd-tcp.h" 76 #include "xfrd-disk.h" 77 #include "difffile.h" 78 #include "nsec3.h" 79 #include "ipc.h" 80 #include "udb.h" 81 #include "remote.h" 82 #include "lookup3.h" 83 #include "rrl.h" 84 #ifdef USE_DNSTAP 85 #include "dnstap/dnstap_collector.h" 86 #endif 87 88 #define RELOAD_SYNC_TIMEOUT 25 /* seconds */ 89 90 #ifdef USE_TCP_FASTOPEN 91 #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen" 92 #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2 93 #endif 94 95 /* 96 * Data for the UDP handlers. 97 */ 98 struct udp_handler_data 99 { 100 struct nsd *nsd; 101 struct nsd_socket *socket; 102 struct event event; 103 }; 104 105 struct tcp_accept_handler_data { 106 struct nsd *nsd; 107 struct nsd_socket *socket; 108 int event_added; 109 struct event event; 110 #ifdef HAVE_SSL 111 /* handler accepts TLS connections on the dedicated port */ 112 int tls_accept; 113 #endif 114 }; 115 116 /* 117 * These globals are used to enable the TCP accept handlers 118 * when the number of TCP connection drops below the maximum 119 * number of TCP connections. 120 */ 121 static size_t tcp_accept_handler_count; 122 static struct tcp_accept_handler_data *tcp_accept_handlers; 123 124 static struct event slowaccept_event; 125 static int slowaccept; 126 127 #ifdef HAVE_SSL 128 static unsigned char *ocspdata = NULL; 129 static long ocspdata_len = 0; 130 #endif 131 132 #ifdef NONBLOCKING_IS_BROKEN 133 /* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to 134 read multiple times from a socket when reported ready by select. */ 135 # define NUM_RECV_PER_SELECT (1) 136 #else /* !NONBLOCKING_IS_BROKEN */ 137 # define NUM_RECV_PER_SELECT (100) 138 #endif /* NONBLOCKING_IS_BROKEN */ 139 140 #ifndef HAVE_MMSGHDR 141 struct mmsghdr { 142 struct msghdr msg_hdr; 143 unsigned int msg_len; 144 }; 145 #endif 146 147 static struct mmsghdr msgs[NUM_RECV_PER_SELECT]; 148 static struct iovec iovecs[NUM_RECV_PER_SELECT]; 149 static struct query *queries[NUM_RECV_PER_SELECT]; 150 151 /* 152 * Data for the TCP connection handlers. 153 * 154 * The TCP handlers use non-blocking I/O. This is necessary to avoid 155 * blocking the entire server on a slow TCP connection, but does make 156 * reading from and writing to the socket more complicated. 157 * 158 * Basically, whenever a read/write would block (indicated by the 159 * EAGAIN errno variable) we remember the position we were reading 160 * from/writing to and return from the TCP reading/writing event 161 * handler. When the socket becomes readable/writable again we 162 * continue from the same position. 163 */ 164 struct tcp_handler_data 165 { 166 /* 167 * The region used to allocate all TCP connection related 168 * data, including this structure. This region is destroyed 169 * when the connection is closed. 170 */ 171 region_type* region; 172 173 /* 174 * The global nsd structure. 175 */ 176 struct nsd* nsd; 177 178 /* 179 * The current query data for this TCP connection. 180 */ 181 query_type* query; 182 183 /* 184 * The query_state is used to remember if we are performing an 185 * AXFR, if we're done processing, or if we should discard the 186 * query and connection. 187 */ 188 query_state_type query_state; 189 190 /* 191 * The event for the file descriptor and tcp timeout 192 */ 193 struct event event; 194 195 /* 196 * The bytes_transmitted field is used to remember the number 197 * of bytes transmitted when receiving or sending a DNS 198 * packet. The count includes the two additional bytes used 199 * to specify the packet length on a TCP connection. 200 */ 201 size_t bytes_transmitted; 202 203 /* 204 * The number of queries handled by this specific TCP connection. 205 */ 206 int query_count; 207 208 /* 209 * The timeout in msec for this tcp connection 210 */ 211 int tcp_timeout; 212 #ifdef HAVE_SSL 213 /* 214 * TLS object. 215 */ 216 SSL* tls; 217 218 /* 219 * TLS handshake state. 220 */ 221 enum { tls_hs_none, tls_hs_read, tls_hs_write, 222 tls_hs_read_event, tls_hs_write_event } shake_state; 223 #endif 224 /* list of connections, for service of remaining tcp channels */ 225 struct tcp_handler_data *prev, *next; 226 }; 227 /* global that is the list of active tcp channels */ 228 static struct tcp_handler_data *tcp_active_list = NULL; 229 230 /* 231 * Handle incoming queries on the UDP server sockets. 232 */ 233 static void handle_udp(int fd, short event, void* arg); 234 235 /* 236 * Handle incoming connections on the TCP sockets. These handlers 237 * usually wait for the NETIO_EVENT_READ event (indicating an incoming 238 * connection) but are disabled when the number of current TCP 239 * connections is equal to the maximum number of TCP connections. 240 * Disabling is done by changing the handler to wait for the 241 * NETIO_EVENT_NONE type. This is done using the function 242 * configure_tcp_accept_handlers. 243 */ 244 static void handle_tcp_accept(int fd, short event, void* arg); 245 246 /* 247 * Handle incoming queries on a TCP connection. The TCP connections 248 * are configured to be non-blocking and the handler may be called 249 * multiple times before a complete query is received. 250 */ 251 static void handle_tcp_reading(int fd, short event, void* arg); 252 253 /* 254 * Handle outgoing responses on a TCP connection. The TCP connections 255 * are configured to be non-blocking and the handler may be called 256 * multiple times before a complete response is sent. 257 */ 258 static void handle_tcp_writing(int fd, short event, void* arg); 259 260 #ifdef HAVE_SSL 261 /* Create SSL object and associate fd */ 262 static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd); 263 /* 264 * Handle TLS handshake. May be called multiple times if incomplete. 265 */ 266 static int tls_handshake(struct tcp_handler_data* data, int fd, int writing); 267 268 /* 269 * Handle incoming queries on a TLS over TCP connection. The TLS 270 * connections are configured to be non-blocking and the handler may 271 * be called multiple times before a complete query is received. 272 */ 273 static void handle_tls_reading(int fd, short event, void* arg); 274 275 /* 276 * Handle outgoing responses on a TLS over TCP connection. The TLS 277 * connections are configured to be non-blocking and the handler may 278 * be called multiple times before a complete response is sent. 279 */ 280 static void handle_tls_writing(int fd, short event, void* arg); 281 #endif 282 283 /* 284 * Send all children the quit nonblocking, then close pipe. 285 */ 286 static void send_children_quit(struct nsd* nsd); 287 /* same, for shutdown time, waits for child to exit to avoid restart issues */ 288 static void send_children_quit_and_wait(struct nsd* nsd); 289 290 /* set childrens flags to send NSD_STATS to them */ 291 #ifdef BIND8_STATS 292 static void set_children_stats(struct nsd* nsd); 293 #endif /* BIND8_STATS */ 294 295 /* 296 * Change the event types the HANDLERS are interested in to EVENT_TYPES. 297 */ 298 static void configure_handler_event_types(short event_types); 299 300 static uint16_t *compressed_dname_offsets = 0; 301 static uint32_t compression_table_capacity = 0; 302 static uint32_t compression_table_size = 0; 303 static domain_type* compressed_dnames[MAXRRSPP]; 304 305 #ifdef USE_TCP_FASTOPEN 306 /* Checks to see if the kernel value must be manually changed in order for 307 TCP Fast Open to support server mode */ 308 static void report_tcp_fastopen_config() { 309 310 int tcp_fastopen_fp; 311 uint8_t tcp_fastopen_value; 312 313 if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) { 314 log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno)); 315 } 316 if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) { 317 log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno)); 318 close(tcp_fastopen_fp); 319 } 320 if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) { 321 log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n"); 322 log_msg(LOG_WARNING, "However the kernel paramenters are not configured to support TCP_FASTOPEN in server mode.\n"); 323 log_msg(LOG_WARNING, "To enable TFO use the command:"); 324 log_msg(LOG_WARNING, " 'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n"); 325 log_msg(LOG_WARNING, " 'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n"); 326 log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n"); 327 close(tcp_fastopen_fp); 328 } 329 close(tcp_fastopen_fp); 330 } 331 #endif 332 333 /* 334 * Remove the specified pid from the list of child pids. Returns -1 if 335 * the pid is not in the list, child_num otherwise. The field is set to 0. 336 */ 337 static int 338 delete_child_pid(struct nsd *nsd, pid_t pid) 339 { 340 size_t i; 341 for (i = 0; i < nsd->child_count; ++i) { 342 if (nsd->children[i].pid == pid) { 343 nsd->children[i].pid = 0; 344 if(!nsd->children[i].need_to_exit) { 345 if(nsd->children[i].child_fd != -1) 346 close(nsd->children[i].child_fd); 347 nsd->children[i].child_fd = -1; 348 if(nsd->children[i].handler) 349 nsd->children[i].handler->fd = -1; 350 } 351 return i; 352 } 353 } 354 return -1; 355 } 356 357 /* 358 * Restart child servers if necessary. 359 */ 360 static int 361 restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio, 362 int* xfrd_sock_p) 363 { 364 struct main_ipc_handler_data *ipc_data; 365 size_t i; 366 int sv[2]; 367 368 /* Fork the child processes... */ 369 for (i = 0; i < nsd->child_count; ++i) { 370 if (nsd->children[i].pid <= 0) { 371 if (nsd->children[i].child_fd != -1) 372 close(nsd->children[i].child_fd); 373 if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) { 374 log_msg(LOG_ERR, "socketpair: %s", 375 strerror(errno)); 376 return -1; 377 } 378 nsd->children[i].child_fd = sv[0]; 379 nsd->children[i].parent_fd = sv[1]; 380 nsd->children[i].pid = fork(); 381 switch (nsd->children[i].pid) { 382 default: /* SERVER MAIN */ 383 close(nsd->children[i].parent_fd); 384 nsd->children[i].parent_fd = -1; 385 if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) { 386 log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); 387 } 388 if(!nsd->children[i].handler) 389 { 390 ipc_data = (struct main_ipc_handler_data*) region_alloc( 391 region, sizeof(struct main_ipc_handler_data)); 392 ipc_data->nsd = nsd; 393 ipc_data->child = &nsd->children[i]; 394 ipc_data->child_num = i; 395 ipc_data->xfrd_sock = xfrd_sock_p; 396 ipc_data->packet = buffer_create(region, QIOBUFSZ); 397 ipc_data->forward_mode = 0; 398 ipc_data->got_bytes = 0; 399 ipc_data->total_bytes = 0; 400 ipc_data->acl_num = 0; 401 nsd->children[i].handler = (struct netio_handler*) region_alloc( 402 region, sizeof(struct netio_handler)); 403 nsd->children[i].handler->fd = nsd->children[i].child_fd; 404 nsd->children[i].handler->timeout = NULL; 405 nsd->children[i].handler->user_data = ipc_data; 406 nsd->children[i].handler->event_types = NETIO_EVENT_READ; 407 nsd->children[i].handler->event_handler = parent_handle_child_command; 408 netio_add_handler(netio, nsd->children[i].handler); 409 } 410 /* clear any ongoing ipc */ 411 ipc_data = (struct main_ipc_handler_data*) 412 nsd->children[i].handler->user_data; 413 ipc_data->forward_mode = 0; 414 /* restart - update fd */ 415 nsd->children[i].handler->fd = nsd->children[i].child_fd; 416 break; 417 case 0: /* CHILD */ 418 /* the child need not be able to access the 419 * nsd.db file */ 420 namedb_close_udb(nsd->db); 421 #ifdef MEMCLEAN /* OS collects memory pages */ 422 region_destroy(region); 423 #endif 424 425 if (pledge("stdio rpath inet", NULL) == -1) { 426 log_msg(LOG_ERR, "pledge"); 427 exit(1); 428 } 429 430 nsd->pid = 0; 431 nsd->child_count = 0; 432 nsd->server_kind = nsd->children[i].kind; 433 nsd->this_child = &nsd->children[i]; 434 nsd->this_child->child_num = i; 435 /* remove signal flags inherited from parent 436 the parent will handle them. */ 437 nsd->signal_hint_reload_hup = 0; 438 nsd->signal_hint_reload = 0; 439 nsd->signal_hint_child = 0; 440 nsd->signal_hint_quit = 0; 441 nsd->signal_hint_shutdown = 0; 442 nsd->signal_hint_stats = 0; 443 nsd->signal_hint_statsusr = 0; 444 close(*xfrd_sock_p); 445 close(nsd->this_child->child_fd); 446 nsd->this_child->child_fd = -1; 447 if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) { 448 log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); 449 } 450 server_child(nsd); 451 /* NOTREACH */ 452 exit(0); 453 case -1: 454 log_msg(LOG_ERR, "fork failed: %s", 455 strerror(errno)); 456 return -1; 457 } 458 } 459 } 460 return 0; 461 } 462 463 #ifdef BIND8_STATS 464 static void set_bind8_alarm(struct nsd* nsd) 465 { 466 /* resync so that the next alarm is on the next whole minute */ 467 if(nsd->st.period > 0) /* % by 0 gives divbyzero error */ 468 alarm(nsd->st.period - (time(NULL) % nsd->st.period)); 469 } 470 #endif 471 472 /* set zone stat ids for zones initially read in */ 473 static void 474 zonestatid_tree_set(struct nsd* nsd) 475 { 476 struct radnode* n; 477 for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) { 478 zone_type* zone = (zone_type*)n->elem; 479 zone->zonestatid = getzonestatid(nsd->options, zone->opts); 480 } 481 } 482 483 #ifdef USE_ZONE_STATS 484 void 485 server_zonestat_alloc(struct nsd* nsd) 486 { 487 size_t num = (nsd->options->zonestatnames->count==0?1: 488 nsd->options->zonestatnames->count); 489 size_t sz = sizeof(struct nsdst)*num; 490 char tmpfile[256]; 491 uint8_t z = 0; 492 493 /* file names */ 494 nsd->zonestatfname[0] = 0; 495 nsd->zonestatfname[1] = 0; 496 snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0", 497 nsd->options->xfrdir, (int)getpid(), (unsigned)getpid()); 498 nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile); 499 snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1", 500 nsd->options->xfrdir, (int)getpid(), (unsigned)getpid()); 501 nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile); 502 503 /* file descriptors */ 504 nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600); 505 if(nsd->zonestatfd[0] == -1) { 506 log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0], 507 strerror(errno)); 508 exit(1); 509 } 510 nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600); 511 if(nsd->zonestatfd[0] == -1) { 512 log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1], 513 strerror(errno)); 514 close(nsd->zonestatfd[0]); 515 unlink(nsd->zonestatfname[0]); 516 exit(1); 517 } 518 519 #ifdef HAVE_MMAP 520 if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) { 521 log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0], 522 strerror(errno)); 523 exit(1); 524 } 525 if(write(nsd->zonestatfd[0], &z, 1) == -1) { 526 log_msg(LOG_ERR, "cannot extend stat file %s (%s)", 527 nsd->zonestatfname[0], strerror(errno)); 528 exit(1); 529 } 530 if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) { 531 log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1], 532 strerror(errno)); 533 exit(1); 534 } 535 if(write(nsd->zonestatfd[1], &z, 1) == -1) { 536 log_msg(LOG_ERR, "cannot extend stat file %s (%s)", 537 nsd->zonestatfname[1], strerror(errno)); 538 exit(1); 539 } 540 nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE, 541 MAP_SHARED, nsd->zonestatfd[0], 0); 542 if(nsd->zonestat[0] == MAP_FAILED) { 543 log_msg(LOG_ERR, "mmap failed: %s", strerror(errno)); 544 unlink(nsd->zonestatfname[0]); 545 unlink(nsd->zonestatfname[1]); 546 exit(1); 547 } 548 nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE, 549 MAP_SHARED, nsd->zonestatfd[1], 0); 550 if(nsd->zonestat[1] == MAP_FAILED) { 551 log_msg(LOG_ERR, "mmap failed: %s", strerror(errno)); 552 unlink(nsd->zonestatfname[0]); 553 unlink(nsd->zonestatfname[1]); 554 exit(1); 555 } 556 memset(nsd->zonestat[0], 0, sz); 557 memset(nsd->zonestat[1], 0, sz); 558 nsd->zonestatsize[0] = num; 559 nsd->zonestatsize[1] = num; 560 nsd->zonestatdesired = num; 561 nsd->zonestatsizenow = num; 562 nsd->zonestatnow = nsd->zonestat[0]; 563 #endif /* HAVE_MMAP */ 564 } 565 566 void 567 zonestat_remap(struct nsd* nsd, int idx, size_t sz) 568 { 569 #ifdef HAVE_MMAP 570 #ifdef MREMAP_MAYMOVE 571 nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx], 572 sizeof(struct nsdst)*nsd->zonestatsize[idx], sz, 573 MREMAP_MAYMOVE); 574 if(nsd->zonestat[idx] == MAP_FAILED) { 575 log_msg(LOG_ERR, "mremap failed: %s", strerror(errno)); 576 exit(1); 577 } 578 #else /* !HAVE MREMAP */ 579 if(msync(nsd->zonestat[idx], 580 sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0) 581 log_msg(LOG_ERR, "msync failed: %s", strerror(errno)); 582 if(munmap(nsd->zonestat[idx], 583 sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0) 584 log_msg(LOG_ERR, "munmap failed: %s", strerror(errno)); 585 nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz, 586 PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0); 587 if(nsd->zonestat[idx] == MAP_FAILED) { 588 log_msg(LOG_ERR, "mmap failed: %s", strerror(errno)); 589 exit(1); 590 } 591 #endif /* MREMAP */ 592 #endif /* HAVE_MMAP */ 593 } 594 595 /* realloc the zonestat array for the one that is not currently in use, 596 * to match the desired new size of the array (if applicable) */ 597 void 598 server_zonestat_realloc(struct nsd* nsd) 599 { 600 #ifdef HAVE_MMAP 601 uint8_t z = 0; 602 size_t sz; 603 int idx = 0; /* index of the zonestat array that is not in use */ 604 if(nsd->zonestatnow == nsd->zonestat[0]) 605 idx = 1; 606 if(nsd->zonestatsize[idx] == nsd->zonestatdesired) 607 return; 608 sz = sizeof(struct nsdst)*nsd->zonestatdesired; 609 if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) { 610 log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx], 611 strerror(errno)); 612 exit(1); 613 } 614 if(write(nsd->zonestatfd[idx], &z, 1) == -1) { 615 log_msg(LOG_ERR, "cannot extend stat file %s (%s)", 616 nsd->zonestatfname[idx], strerror(errno)); 617 exit(1); 618 } 619 zonestat_remap(nsd, idx, sz); 620 /* zero the newly allocated region */ 621 if(nsd->zonestatdesired > nsd->zonestatsize[idx]) { 622 memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) * 623 nsd->zonestatsize[idx], 0, sizeof(struct nsdst) * 624 (nsd->zonestatdesired - nsd->zonestatsize[idx])); 625 } 626 nsd->zonestatsize[idx] = nsd->zonestatdesired; 627 #endif /* HAVE_MMAP */ 628 } 629 630 /* switchover to use the other array for the new children, that 631 * briefly coexist with the old children. And we want to avoid them 632 * both writing to the same statistics arrays. */ 633 void 634 server_zonestat_switch(struct nsd* nsd) 635 { 636 if(nsd->zonestatnow == nsd->zonestat[0]) { 637 nsd->zonestatnow = nsd->zonestat[1]; 638 nsd->zonestatsizenow = nsd->zonestatsize[1]; 639 } else { 640 nsd->zonestatnow = nsd->zonestat[0]; 641 nsd->zonestatsizenow = nsd->zonestatsize[0]; 642 } 643 } 644 #endif /* USE_ZONE_STATS */ 645 646 static void 647 cleanup_dname_compression_tables(void *ptr) 648 { 649 free(ptr); 650 compressed_dname_offsets = NULL; 651 compression_table_capacity = 0; 652 } 653 654 static void 655 initialize_dname_compression_tables(struct nsd *nsd) 656 { 657 size_t needed = domain_table_count(nsd->db->domains) + 1; 658 needed += EXTRA_DOMAIN_NUMBERS; 659 if(compression_table_capacity < needed) { 660 if(compressed_dname_offsets) { 661 region_remove_cleanup(nsd->db->region, 662 cleanup_dname_compression_tables, 663 compressed_dname_offsets); 664 free(compressed_dname_offsets); 665 } 666 compressed_dname_offsets = (uint16_t *) xmallocarray( 667 needed, sizeof(uint16_t)); 668 region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables, 669 compressed_dname_offsets); 670 compression_table_capacity = needed; 671 compression_table_size=domain_table_count(nsd->db->domains)+1; 672 } 673 memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t)); 674 compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */ 675 } 676 677 static int 678 set_cloexec(struct nsd_socket *sock) 679 { 680 assert(sock != NULL); 681 682 if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) { 683 const char *socktype = 684 sock->addr.ai_family == SOCK_DGRAM ? "udp" : "tcp"; 685 log_msg(LOG_ERR, "fcntl(..., O_CLOEXEC) failed for %s: %s", 686 socktype, strerror(errno)); 687 return -1; 688 } 689 690 return 1; 691 } 692 693 static int 694 set_reuseport(struct nsd_socket *sock) 695 { 696 #ifdef SO_REUSEPORT 697 int on = 1; 698 #ifdef SO_REUSEPORT_LB 699 /* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like 700 * SO_REUSEPORT on Linux. This is what the users want with the config 701 * option in nsd.conf; if we actually need local address and port reuse 702 * they'll also need to have SO_REUSEPORT set for them, assume it was 703 * _LB they want. 704 */ 705 int opt = SO_REUSEPORT_LB; 706 static const char optname[] = "SO_REUSEPORT_LB"; 707 #else /* !SO_REUSEPORT_LB */ 708 int opt = SO_REUSEPORT; 709 static const char optname[] = "SO_REUSEPORT"; 710 #endif /* SO_REUSEPORT_LB */ 711 712 if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) { 713 return 1; 714 } else if(verbosity >= 3 || errno != ENOPROTOOPT) { 715 log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s", 716 optname, strerror(errno)); 717 } 718 return -1; 719 #else 720 (void)sock; 721 #endif /* SO_REUSEPORT */ 722 723 return 0; 724 } 725 726 static int 727 set_reuseaddr(struct nsd_socket *sock) 728 { 729 #ifdef SO_REUSEADDR 730 int on = 1; 731 if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) { 732 return 1; 733 } 734 log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s", 735 strerror(errno)); 736 return -1; 737 #endif /* SO_REUSEADDR */ 738 return 0; 739 } 740 741 static int 742 set_rcvbuf(struct nsd_socket *sock, int rcv) 743 { 744 #ifdef SO_RCVBUF 745 #ifdef SO_RCVBUFFORCE 746 if(0 == setsockopt( 747 sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv))) 748 { 749 return 1; 750 } 751 if(errno == EPERM || errno == ENOBUFS) { 752 return 0; 753 } 754 log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s", 755 strerror(errno)); 756 return -1; 757 #else /* !SO_RCVBUFFORCE */ 758 if (0 == setsockopt( 759 sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv))) 760 { 761 return 1; 762 } 763 if(errno == ENOSYS || errno == ENOBUFS) { 764 return 0; 765 } 766 log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s", 767 strerror(errno)); 768 return -1; 769 #endif /* SO_RCVBUFFORCE */ 770 #endif /* SO_RCVBUF */ 771 772 return 0; 773 } 774 775 static int 776 set_sndbuf(struct nsd_socket *sock, int snd) 777 { 778 #ifdef SO_SNDBUF 779 #ifdef SO_SNDBUFFORCE 780 if(0 == setsockopt( 781 sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd))) 782 { 783 return 1; 784 } 785 if(errno == EPERM || errno == ENOBUFS) { 786 return 0; 787 } 788 log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s", 789 strerror(errno)); 790 return -1; 791 #else /* !SO_SNDBUFFORCE */ 792 if(0 == setsockopt( 793 sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd))) 794 { 795 return 1; 796 } 797 if(errno == ENOSYS || errno == ENOBUFS) { 798 return 0; 799 } 800 log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s", 801 strerror(errno)); 802 return -1; 803 #endif /* SO_SNDBUFFORCE */ 804 #endif /* SO_SNDBUF */ 805 806 return 0; 807 } 808 809 static int 810 set_nonblock(struct nsd_socket *sock) 811 { 812 const char *socktype = 813 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 814 815 if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) { 816 log_msg(LOG_ERR, "fctnl(..., O_NONBLOCK) failed for %s: %s", 817 socktype, strerror(errno)); 818 return -1; 819 } 820 821 return 1; 822 } 823 824 static int 825 set_ipv6_v6only(struct nsd_socket *sock) 826 { 827 #ifdef INET6 828 #ifdef IPV6_V6ONLY 829 int on = 1; 830 const char *socktype = 831 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 832 833 if(0 == setsockopt( 834 sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on))) 835 { 836 return 1; 837 } 838 839 log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s", 840 socktype, strerror(errno)); 841 return -1; 842 #endif /* IPV6_V6ONLY */ 843 #endif /* INET6 */ 844 845 return 0; 846 } 847 848 static int 849 set_ipv6_use_min_mtu(struct nsd_socket *sock) 850 { 851 #if defined(INET6) && (defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU)) 852 #if defined(IPV6_USE_MIN_MTU) 853 /* There is no fragmentation of IPv6 datagrams during forwarding in the 854 * network. Therefore we do not send UDP datagrams larger than the 855 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be 856 * larger if the network stack supports IPV6_USE_MIN_MTU. 857 */ 858 int opt = IPV6_USE_MIN_MTU; 859 int optval = 1; 860 static const char optname[] = "IPV6_USE_MIN_MTU"; 861 #elif defined(IPV6_MTU) 862 /* On Linux, PMTUD is disabled by default for datagrams so set the MTU 863 * to the MIN MTU to get the same. 864 */ 865 int opt = IPV6_MTU; 866 int optval = IPV6_MIN_MTU; 867 static const char optname[] = "IPV6_MTU"; 868 #endif 869 if(0 == setsockopt( 870 sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval))) 871 { 872 return 1; 873 } 874 875 log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s", 876 optname, strerror(errno)); 877 return -1; 878 #else 879 (void)sock; 880 #endif /* INET6 */ 881 882 return 0; 883 } 884 885 static int 886 set_ipv4_no_pmtu_disc(struct nsd_socket *sock) 887 { 888 int ret = 0; 889 890 #if defined(IP_MTU_DISCOVER) 891 int opt = IP_MTU_DISCOVER; 892 int optval; 893 # if defined(IP_PMTUDISC_OMIT) 894 /* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU 895 * information and send packets with DF=0. Fragmentation is allowed if 896 * and only if the packet size exceeds the outgoing interface MTU or 897 * the packet encounters smaller MTU link in network. This mitigates 898 * DNS fragmentation attacks by preventing forged PMTU information. 899 * FreeBSD already has same semantics without setting the option. 900 */ 901 optval = IP_PMTUDISC_OMIT; 902 if(0 == setsockopt( 903 sock->s, IPPROTO_IP, opt, &optval, sizeof(optval))) 904 { 905 return 1; 906 } 907 908 log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s", 909 "IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno)); 910 # endif /* IP_PMTUDISC_OMIT */ 911 # if defined(IP_PMTUDISC_DONT) 912 /* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */ 913 optval = IP_PMTUDISC_DONT; 914 if(0 == setsockopt( 915 sock->s, IPPROTO_IP, opt, &optval, sizeof(optval))) 916 { 917 return 1; 918 } 919 920 log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s", 921 "IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno)); 922 # endif 923 ret = -1; 924 #elif defined(IP_DONTFRAG) 925 int off = 0; 926 if (0 == setsockopt( 927 sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off))) 928 { 929 return 1; 930 } 931 932 log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s", 933 strerror(errno)); 934 ret = -1; 935 #else 936 (void)sock; 937 #endif 938 939 return ret; 940 } 941 942 static int 943 set_ip_freebind(struct nsd_socket *sock) 944 { 945 #ifdef IP_FREEBIND 946 int on = 1; 947 const char *socktype = 948 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 949 if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0) 950 { 951 return 1; 952 } 953 log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s", 954 socktype, strerror(errno)); 955 return -1; 956 #else 957 (void)sock; 958 #endif /* IP_FREEBIND */ 959 960 return 0; 961 } 962 963 static int 964 set_ip_transparent(struct nsd_socket *sock) 965 { 966 /* 967 The scandalous preprocessor blob here calls for some explanation :) 968 POSIX does not specify an option to bind non-local IPs, so 969 platforms developed several implementation-specific options, 970 all set in the same way, but with different names. 971 For additional complexity, some platform manage this setting 972 differently for different address families (IPv4 vs IPv6). 973 This scandalous preprocessor blob below abstracts such variability 974 in the way which leaves the C code as lean and clear as possible. 975 */ 976 977 #if defined(IP_TRANSPARENT) 978 # define NSD_SOCKET_OPTION_TRANSPARENT IP_TRANSPARENT 979 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP 980 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_TRANSPARENT" 981 // as of 2020-01, Linux does not support this on IPv6 programmatically 982 #elif defined(SO_BINDANY) 983 # define NSD_SOCKET_OPTION_TRANSPARENT SO_BINDANY 984 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL SOL_SOCKET 985 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME "SO_BINDANY" 986 #elif defined(IP_BINDANY) 987 # define NSD_SOCKET_OPTION_TRANSPARENT IP_BINDANY 988 # define NSD_SOCKET_OPTION_TRANSPARENT6 IPV6_BINDANY 989 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP 990 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 IPPROTO_IPV6 991 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_BINDANY" 992 #endif 993 994 #ifndef NSD_SOCKET_OPTION_TRANSPARENT 995 (void)sock; 996 #else 997 # ifndef NSD_SOCKET_OPTION_TRANSPARENT6 998 # define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT 999 # endif 1000 # ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 1001 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL 1002 # endif 1003 # ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6 1004 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME 1005 # endif 1006 1007 int on = 1; 1008 const char *socktype = 1009 sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; 1010 const int is_ip6 = (sock->addr.ai_family == AF_INET6); 1011 1012 if(0 == setsockopt( 1013 sock->s, 1014 is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL, 1015 is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT, 1016 &on, sizeof(on))) 1017 { 1018 return 1; 1019 } 1020 1021 log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s", 1022 is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno)); 1023 return -1; 1024 #endif 1025 1026 return 0; 1027 } 1028 1029 static int 1030 set_tcp_maxseg(struct nsd_socket *sock, int mss) 1031 { 1032 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG) 1033 if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) { 1034 return 1; 1035 } 1036 log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s", 1037 strerror(errno)); 1038 return -1; 1039 #else 1040 log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported"); 1041 #endif 1042 return 0; 1043 } 1044 1045 #ifdef USE_TCP_FASTOPEN 1046 static int 1047 set_tcp_fastopen(struct nsd_socket *sock) 1048 { 1049 /* qlen specifies how many outstanding TFO requests to allow. Limit is 1050 * a defense against IP spoofing attacks as suggested in RFC7413. 1051 */ 1052 int qlen; 1053 1054 #ifdef __APPLE__ 1055 /* macOS X implementation only supports qlen of 1 via this call. The 1056 * actual value is configured by the net.inet.tcp.fastopen_backlog 1057 * kernel parameter. 1058 */ 1059 qlen = 1; 1060 #else 1061 /* 5 is recommended on Linux. */ 1062 qlen = 5; 1063 #endif 1064 if (0 == setsockopt( 1065 sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen))) 1066 { 1067 return 1; 1068 } 1069 1070 if (errno == EPERM) { 1071 log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s " 1072 "; this could likely be because sysctl " 1073 "net.inet.tcp.fastopen.enabled, " 1074 "net.inet.tcp.fastopen.server_enable, or " 1075 "net.ipv4.tcp_fastopen is disabled", 1076 strerror(errno)); 1077 /* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support 1078 * disabled, except when verbosity enabled for debugging 1079 */ 1080 } else if(errno != ENOPROTOOPT || verbosity >= 3) { 1081 log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s", 1082 strerror(errno)); 1083 } 1084 1085 return (errno == ENOPROTOOPT ? 0 : -1); 1086 } 1087 #endif /* USE_TCP_FASTOPEN */ 1088 1089 static int 1090 set_bindtodevice(struct nsd_socket *sock) 1091 { 1092 #if defined(SO_BINDTODEVICE) 1093 if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE, 1094 sock->device, strlen(sock->device)) == -1) 1095 { 1096 log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s", 1097 "SO_BINDTODEVICE", sock->device, strerror(errno)); 1098 return -1; 1099 } 1100 1101 return 1; 1102 #else 1103 (void)sock; 1104 return 0; 1105 #endif 1106 } 1107 1108 static int 1109 set_setfib(struct nsd_socket *sock) 1110 { 1111 #if defined(SO_SETFIB) 1112 if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB, 1113 (const void *)&sock->fib, sizeof(sock->fib)) == -1) 1114 { 1115 log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s", 1116 "SO_SETFIB", sock->fib, strerror(errno)); 1117 return -1; 1118 } 1119 1120 return 1; 1121 #else 1122 (void)sock; 1123 return 0; 1124 #endif 1125 } 1126 1127 static int 1128 open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works) 1129 { 1130 int rcv = 1*1024*1024, snd = 1*1024*1024; 1131 1132 if(-1 == (sock->s = socket( 1133 sock->addr.ai_family, sock->addr.ai_socktype, 0))) 1134 { 1135 #ifdef INET6 1136 if((sock->flags & NSD_SOCKET_IS_OPTIONAL) && 1137 (sock->addr.ai_family == AF_INET6) && 1138 (errno == EAFNOSUPPORT)) 1139 { 1140 log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: " 1141 "not supported"); 1142 return 0; 1143 } 1144 #endif 1145 log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno)); 1146 return -1; 1147 } 1148 1149 set_cloexec(sock); 1150 1151 if(nsd->reuseport && reuseport_works && *reuseport_works) 1152 *reuseport_works = (set_reuseport(sock) == 1); 1153 1154 if(nsd->options->receive_buffer_size > 0) 1155 rcv = nsd->options->receive_buffer_size; 1156 if(set_rcvbuf(sock, rcv) == -1) 1157 return -1; 1158 1159 if(nsd->options->send_buffer_size > 0) 1160 snd = nsd->options->send_buffer_size; 1161 if(set_sndbuf(sock, snd) == -1) 1162 return -1; 1163 #ifdef INET6 1164 if(sock->addr.ai_family == AF_INET6) { 1165 if(set_ipv6_v6only(sock) == -1 || 1166 set_ipv6_use_min_mtu(sock) == -1) 1167 return -1; 1168 } else 1169 #endif /* INET6 */ 1170 if(sock->addr.ai_family == AF_INET) { 1171 if(set_ipv4_no_pmtu_disc(sock) == -1) 1172 return -1; 1173 } 1174 1175 /* Set socket to non-blocking. Otherwise, on operating systems 1176 * with thundering herd problems, the UDP recv could block 1177 * after select returns readable. 1178 */ 1179 set_nonblock(sock); 1180 1181 if(nsd->options->ip_freebind) 1182 (void)set_ip_freebind(sock); 1183 if(nsd->options->ip_transparent) 1184 (void)set_ip_transparent(sock); 1185 if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1) 1186 return -1; 1187 if(sock->fib != -1 && set_setfib(sock) == -1) 1188 return -1; 1189 1190 if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) { 1191 char buf[256]; 1192 addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf)); 1193 log_msg(LOG_ERR, "can't bind udp socket %s: %s", 1194 buf, strerror(errno)); 1195 return -1; 1196 } 1197 1198 return 1; 1199 } 1200 1201 static int 1202 open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works) 1203 { 1204 #ifdef USE_TCP_FASTOPEN 1205 report_tcp_fastopen_config(); 1206 #endif 1207 1208 (void)reuseport_works; 1209 1210 if(-1 == (sock->s = socket( 1211 sock->addr.ai_family, sock->addr.ai_socktype, 0))) 1212 { 1213 #ifdef INET6 1214 if((sock->flags & NSD_SOCKET_IS_OPTIONAL) && 1215 (sock->addr.ai_family == AF_INET6) && 1216 (errno == EAFNOSUPPORT)) 1217 { 1218 log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: " 1219 "not supported"); 1220 return 0; 1221 } 1222 #endif /* INET6 */ 1223 log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno)); 1224 return -1; 1225 } 1226 1227 set_cloexec(sock); 1228 1229 if(nsd->reuseport && reuseport_works && *reuseport_works) 1230 *reuseport_works = (set_reuseport(sock) == 1); 1231 1232 (void)set_reuseaddr(sock); 1233 1234 #ifdef INET6 1235 if(sock->addr.ai_family == AF_INET6) { 1236 if (set_ipv6_v6only(sock) == -1 || 1237 set_ipv6_use_min_mtu(sock) == -1) 1238 return -1; 1239 } 1240 #endif 1241 1242 if(nsd->tcp_mss > 0) 1243 set_tcp_maxseg(sock, nsd->tcp_mss); 1244 /* (StevensUNP p463), if TCP listening socket is blocking, then 1245 it may block in accept, even if select() says readable. */ 1246 (void)set_nonblock(sock); 1247 if(nsd->options->ip_freebind) 1248 (void)set_ip_freebind(sock); 1249 if(nsd->options->ip_transparent) 1250 (void)set_ip_transparent(sock); 1251 if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1) 1252 return -1; 1253 if(sock->fib != -1 && set_setfib(sock) == -1) 1254 return -1; 1255 1256 if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) { 1257 char buf[256]; 1258 addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf)); 1259 log_msg(LOG_ERR, "can't bind tcp socket %s: %s", 1260 buf, strerror(errno)); 1261 return -1; 1262 } 1263 1264 #ifdef USE_TCP_FASTOPEN 1265 (void)set_tcp_fastopen(sock); 1266 #endif 1267 1268 if(listen(sock->s, TCP_BACKLOG) == -1) { 1269 log_msg(LOG_ERR, "can't listen: %s", strerror(errno)); 1270 return -1; 1271 } 1272 1273 return 1; 1274 } 1275 1276 /* 1277 * Initialize the server, reuseport, create and bind the sockets. 1278 */ 1279 int 1280 server_init(struct nsd *nsd) 1281 { 1282 size_t i; 1283 int reuseport = 1; /* Determine if REUSEPORT works. */ 1284 1285 /* open server interface ports */ 1286 for(i = 0; i < nsd->ifs; i++) { 1287 if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 || 1288 open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1) 1289 { 1290 return -1; 1291 } 1292 } 1293 1294 if(nsd->reuseport && reuseport) { 1295 size_t ifs = nsd->ifs * nsd->reuseport; 1296 1297 /* increase the size of the interface arrays, there are going 1298 * to be separate interface file descriptors for every server 1299 * instance */ 1300 region_remove_cleanup(nsd->region, free, nsd->udp); 1301 region_remove_cleanup(nsd->region, free, nsd->tcp); 1302 1303 nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp)); 1304 nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp)); 1305 region_add_cleanup(nsd->region, free, nsd->udp); 1306 region_add_cleanup(nsd->region, free, nsd->tcp); 1307 1308 for(i = nsd->ifs; i < ifs; i++) { 1309 nsd->udp[i].addr = nsd->udp[i%nsd->ifs].addr; 1310 nsd->udp[i].servers = nsd->udp[i%nsd->ifs].servers; 1311 if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) { 1312 return -1; 1313 } 1314 /* Turn off REUSEPORT for TCP by copying the socket 1315 * file descriptor. 1316 * This means we should not close TCP used by 1317 * other servers in reuseport enabled mode, in 1318 * server_child(). 1319 */ 1320 nsd->tcp[i] = nsd->tcp[i%nsd->ifs]; 1321 } 1322 1323 nsd->ifs = ifs; 1324 } else { 1325 nsd->reuseport = 0; 1326 } 1327 1328 return 0; 1329 } 1330 1331 /* 1332 * Prepare the server for take off. 1333 * 1334 */ 1335 int 1336 server_prepare(struct nsd *nsd) 1337 { 1338 #ifdef RATELIMIT 1339 /* set secret modifier for hashing (udb ptr buckets and rate limits) */ 1340 #ifdef HAVE_GETRANDOM 1341 uint32_t v; 1342 if(getrandom(&v, sizeof(v), 0) == -1) { 1343 log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno)); 1344 exit(1); 1345 } 1346 hash_set_raninit(v); 1347 #elif defined(HAVE_ARC4RANDOM) 1348 hash_set_raninit(arc4random()); 1349 #else 1350 uint32_t v = getpid() ^ time(NULL); 1351 srandom((unsigned long)v); 1352 # ifdef HAVE_SSL 1353 if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0) 1354 hash_set_raninit(v); 1355 else 1356 # endif 1357 hash_set_raninit(random()); 1358 #endif 1359 rrl_mmap_init(nsd->child_count, nsd->options->rrl_size, 1360 nsd->options->rrl_ratelimit, 1361 nsd->options->rrl_whitelist_ratelimit, 1362 nsd->options->rrl_slip, 1363 nsd->options->rrl_ipv4_prefix_length, 1364 nsd->options->rrl_ipv6_prefix_length); 1365 #endif /* RATELIMIT */ 1366 1367 /* Open the database... */ 1368 if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) { 1369 log_msg(LOG_ERR, "unable to open the database %s: %s", 1370 nsd->dbfile, strerror(errno)); 1371 unlink(nsd->task[0]->fname); 1372 unlink(nsd->task[1]->fname); 1373 #ifdef USE_ZONE_STATS 1374 unlink(nsd->zonestatfname[0]); 1375 unlink(nsd->zonestatfname[1]); 1376 #endif 1377 xfrd_del_tempdir(nsd); 1378 return -1; 1379 } 1380 /* check if zone files have been modified */ 1381 /* NULL for taskudb because we send soainfo in a moment, batched up, 1382 * for all zones */ 1383 if(nsd->options->zonefiles_check || (nsd->options->database == NULL || 1384 nsd->options->database[0] == 0)) 1385 namedb_check_zonefiles(nsd, nsd->options, NULL, NULL); 1386 zonestatid_tree_set(nsd); 1387 1388 compression_table_capacity = 0; 1389 initialize_dname_compression_tables(nsd); 1390 1391 #ifdef BIND8_STATS 1392 /* Initialize times... */ 1393 time(&nsd->st.boot); 1394 set_bind8_alarm(nsd); 1395 #endif /* BIND8_STATS */ 1396 1397 return 0; 1398 } 1399 1400 /* 1401 * Fork the required number of servers. 1402 */ 1403 static int 1404 server_start_children(struct nsd *nsd, region_type* region, netio_type* netio, 1405 int* xfrd_sock_p) 1406 { 1407 size_t i; 1408 1409 /* Start all child servers initially. */ 1410 for (i = 0; i < nsd->child_count; ++i) { 1411 nsd->children[i].pid = 0; 1412 } 1413 1414 return restart_child_servers(nsd, region, netio, xfrd_sock_p); 1415 } 1416 1417 static void 1418 server_close_socket(struct nsd_socket *sock) 1419 { 1420 if(sock->s != -1) { 1421 close(sock->s); 1422 sock->s = -1; 1423 } 1424 } 1425 1426 void 1427 server_close_all_sockets(struct nsd_socket sockets[], size_t n) 1428 { 1429 size_t i; 1430 1431 /* Close all the sockets... */ 1432 for (i = 0; i < n; ++i) { 1433 server_close_socket(&sockets[i]); 1434 } 1435 } 1436 1437 /* 1438 * Close the sockets, shutdown the server and exit. 1439 * Does not return. 1440 */ 1441 void 1442 server_shutdown(struct nsd *nsd) 1443 { 1444 size_t i; 1445 1446 server_close_all_sockets(nsd->udp, nsd->ifs); 1447 server_close_all_sockets(nsd->tcp, nsd->ifs); 1448 /* CHILD: close command channel to parent */ 1449 if(nsd->this_child && nsd->this_child->parent_fd != -1) 1450 { 1451 close(nsd->this_child->parent_fd); 1452 nsd->this_child->parent_fd = -1; 1453 } 1454 /* SERVER: close command channels to children */ 1455 if(!nsd->this_child) 1456 { 1457 for(i=0; i < nsd->child_count; ++i) 1458 if(nsd->children[i].child_fd != -1) 1459 { 1460 close(nsd->children[i].child_fd); 1461 nsd->children[i].child_fd = -1; 1462 } 1463 } 1464 1465 tsig_finalize(); 1466 #ifdef HAVE_SSL 1467 daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */ 1468 if (nsd->tls_ctx) 1469 SSL_CTX_free(nsd->tls_ctx); 1470 #endif 1471 1472 #ifdef MEMCLEAN /* OS collects memory pages */ 1473 #ifdef RATELIMIT 1474 rrl_mmap_deinit_keep_mmap(); 1475 #endif 1476 #ifdef USE_DNSTAP 1477 dt_collector_destroy(nsd->dt_collector, nsd); 1478 #endif 1479 udb_base_free_keep_mmap(nsd->task[0]); 1480 udb_base_free_keep_mmap(nsd->task[1]); 1481 namedb_close_udb(nsd->db); /* keeps mmap */ 1482 namedb_close(nsd->db); 1483 nsd_options_destroy(nsd->options); 1484 region_destroy(nsd->region); 1485 #endif 1486 log_finalize(); 1487 exit(0); 1488 } 1489 1490 void 1491 server_prepare_xfrd(struct nsd* nsd) 1492 { 1493 char tmpfile[256]; 1494 /* create task mmaps */ 1495 nsd->mytask = 0; 1496 snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0", 1497 nsd->options->xfrdir, (int)getpid(), (unsigned)getpid()); 1498 nsd->task[0] = task_file_create(tmpfile); 1499 if(!nsd->task[0]) { 1500 #ifdef USE_ZONE_STATS 1501 unlink(nsd->zonestatfname[0]); 1502 unlink(nsd->zonestatfname[1]); 1503 #endif 1504 xfrd_del_tempdir(nsd); 1505 exit(1); 1506 } 1507 snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1", 1508 nsd->options->xfrdir, (int)getpid(), (unsigned)getpid()); 1509 nsd->task[1] = task_file_create(tmpfile); 1510 if(!nsd->task[1]) { 1511 unlink(nsd->task[0]->fname); 1512 #ifdef USE_ZONE_STATS 1513 unlink(nsd->zonestatfname[0]); 1514 unlink(nsd->zonestatfname[1]); 1515 #endif 1516 xfrd_del_tempdir(nsd); 1517 exit(1); 1518 } 1519 assert(udb_base_get_userdata(nsd->task[0])->data == 0); 1520 assert(udb_base_get_userdata(nsd->task[1])->data == 0); 1521 /* create xfrd listener structure */ 1522 nsd->xfrd_listener = region_alloc(nsd->region, 1523 sizeof(netio_handler_type)); 1524 nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*) 1525 region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data)); 1526 nsd->xfrd_listener->fd = -1; 1527 ((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd = 1528 nsd; 1529 ((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn = 1530 xfrd_tcp_create(nsd->region, QIOBUFSZ); 1531 } 1532 1533 1534 void 1535 server_start_xfrd(struct nsd *nsd, int del_db, int reload_active) 1536 { 1537 pid_t pid; 1538 int sockets[2] = {0,0}; 1539 struct ipc_handler_conn_data *data; 1540 1541 if(nsd->xfrd_listener->fd != -1) 1542 close(nsd->xfrd_listener->fd); 1543 if(del_db) { 1544 /* recreate taskdb that xfrd was using, it may be corrupt */ 1545 /* we (or reload) use nsd->mytask, and xfrd uses the other */ 1546 char* tmpfile = nsd->task[1-nsd->mytask]->fname; 1547 nsd->task[1-nsd->mytask]->fname = NULL; 1548 /* free alloc already, so udb does not shrink itself */ 1549 udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc); 1550 nsd->task[1-nsd->mytask]->alloc = NULL; 1551 udb_base_free(nsd->task[1-nsd->mytask]); 1552 /* create new file, overwrite the old one */ 1553 nsd->task[1-nsd->mytask] = task_file_create(tmpfile); 1554 free(tmpfile); 1555 } 1556 if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) { 1557 log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno)); 1558 return; 1559 } 1560 pid = fork(); 1561 switch (pid) { 1562 case -1: 1563 log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno)); 1564 break; 1565 default: 1566 /* PARENT: close first socket, use second one */ 1567 close(sockets[0]); 1568 if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) { 1569 log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); 1570 } 1571 if(del_db) xfrd_free_namedb(nsd); 1572 /* use other task than I am using, since if xfrd died and is 1573 * restarted, the reload is using nsd->mytask */ 1574 nsd->mytask = 1 - nsd->mytask; 1575 1576 #ifdef HAVE_SETPROCTITLE 1577 setproctitle("xfrd"); 1578 #endif 1579 #ifdef HAVE_CPUSET_T 1580 if(nsd->use_cpu_affinity) { 1581 set_cpu_affinity(nsd->xfrd_cpuset); 1582 } 1583 #endif 1584 1585 xfrd_init(sockets[1], nsd, del_db, reload_active, pid); 1586 /* ENOTREACH */ 1587 break; 1588 case 0: 1589 /* CHILD: close second socket, use first one */ 1590 close(sockets[1]); 1591 if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) { 1592 log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); 1593 } 1594 nsd->xfrd_listener->fd = sockets[0]; 1595 break; 1596 } 1597 /* server-parent only */ 1598 nsd->xfrd_listener->timeout = NULL; 1599 nsd->xfrd_listener->event_types = NETIO_EVENT_READ; 1600 nsd->xfrd_listener->event_handler = parent_handle_xfrd_command; 1601 /* clear ongoing ipc reads */ 1602 data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data; 1603 data->conn->is_reading = 0; 1604 } 1605 1606 /** add all soainfo to taskdb */ 1607 static void 1608 add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb) 1609 { 1610 struct radnode* n; 1611 udb_ptr task_last; /* last task, mytask is empty so NULL */ 1612 /* add all SOA INFO to mytask */ 1613 udb_ptr_init(&task_last, taskudb); 1614 for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) { 1615 task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0); 1616 } 1617 udb_ptr_unlink(&task_last, taskudb); 1618 } 1619 1620 void 1621 server_send_soa_xfrd(struct nsd* nsd, int shortsoa) 1622 { 1623 /* normally this exchanges the SOA from nsd->xfrd and the expire back. 1624 * parent fills one taskdb with soas, xfrd fills other with expires. 1625 * then they exchange and process. 1626 * shortsoa: xfrd crashes and needs to be restarted and one taskdb 1627 * may be in use by reload. Fill SOA in taskdb and give to xfrd. 1628 * expire notifications can be sent back via a normal reload later 1629 * (xfrd will wait for current running reload to finish if any). 1630 */ 1631 sig_atomic_t cmd = 0; 1632 pid_t mypid; 1633 int xfrd_sock = nsd->xfrd_listener->fd; 1634 struct udb_base* taskudb = nsd->task[nsd->mytask]; 1635 udb_ptr t; 1636 if(!shortsoa) { 1637 if(nsd->signal_hint_shutdown) { 1638 shutdown: 1639 log_msg(LOG_WARNING, "signal received, shutting down..."); 1640 server_close_all_sockets(nsd->udp, nsd->ifs); 1641 server_close_all_sockets(nsd->tcp, nsd->ifs); 1642 #ifdef HAVE_SSL 1643 daemon_remote_close(nsd->rc); 1644 #endif 1645 /* Unlink it if possible... */ 1646 unlinkpid(nsd->pidfile); 1647 unlink(nsd->task[0]->fname); 1648 unlink(nsd->task[1]->fname); 1649 #ifdef USE_ZONE_STATS 1650 unlink(nsd->zonestatfname[0]); 1651 unlink(nsd->zonestatfname[1]); 1652 #endif 1653 /* write the nsd.db to disk, wait for it to complete */ 1654 udb_base_sync(nsd->db->udb, 1); 1655 udb_base_close(nsd->db->udb); 1656 server_shutdown(nsd); 1657 exit(0); 1658 } 1659 } 1660 if(shortsoa) { 1661 /* put SOA in xfrd task because mytask may be in use */ 1662 taskudb = nsd->task[1-nsd->mytask]; 1663 } 1664 1665 add_all_soa_to_task(nsd, taskudb); 1666 if(!shortsoa) { 1667 /* wait for xfrd to signal task is ready, RELOAD signal */ 1668 if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) || 1669 cmd != NSD_RELOAD) { 1670 log_msg(LOG_ERR, "did not get start signal from xfrd"); 1671 exit(1); 1672 } 1673 if(nsd->signal_hint_shutdown) { 1674 goto shutdown; 1675 } 1676 } 1677 /* give xfrd our task, signal it with RELOAD_DONE */ 1678 task_process_sync(taskudb); 1679 cmd = NSD_RELOAD_DONE; 1680 if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) { 1681 log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s", 1682 (int)nsd->pid, strerror(errno)); 1683 } 1684 mypid = getpid(); 1685 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 1686 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 1687 strerror(errno)); 1688 } 1689 1690 if(!shortsoa) { 1691 /* process the xfrd task works (expiry data) */ 1692 nsd->mytask = 1 - nsd->mytask; 1693 taskudb = nsd->task[nsd->mytask]; 1694 task_remap(taskudb); 1695 udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb)); 1696 while(!udb_ptr_is_null(&t)) { 1697 task_process_expire(nsd->db, TASKLIST(&t)); 1698 udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next); 1699 } 1700 udb_ptr_unlink(&t, taskudb); 1701 task_clear(taskudb); 1702 1703 /* tell xfrd that the task is emptied, signal with RELOAD_DONE */ 1704 cmd = NSD_RELOAD_DONE; 1705 if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) { 1706 log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s", 1707 (int)nsd->pid, strerror(errno)); 1708 } 1709 } 1710 } 1711 1712 #ifdef HAVE_SSL 1713 static void 1714 log_crypto_from_err(const char* str, unsigned long err) 1715 { 1716 /* error:[error code]:[library name]:[function name]:[reason string] */ 1717 char buf[128]; 1718 unsigned long e; 1719 ERR_error_string_n(err, buf, sizeof(buf)); 1720 log_msg(LOG_ERR, "%s crypto %s", str, buf); 1721 while( (e=ERR_get_error()) ) { 1722 ERR_error_string_n(e, buf, sizeof(buf)); 1723 log_msg(LOG_ERR, "and additionally crypto %s", buf); 1724 } 1725 } 1726 1727 void 1728 log_crypto_err(const char* str) 1729 { 1730 log_crypto_from_err(str, ERR_get_error()); 1731 } 1732 1733 /** true if the ssl handshake error has to be squelched from the logs */ 1734 static int 1735 squelch_err_ssl_handshake(unsigned long err) 1736 { 1737 if(verbosity >= 3) 1738 return 0; /* only squelch on low verbosity */ 1739 /* this is very specific, we could filter on ERR_GET_REASON() 1740 * (the third element in ERR_PACK) */ 1741 if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) || 1742 err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) || 1743 err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) || 1744 err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE) 1745 #ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO 1746 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER) 1747 #endif 1748 #ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO 1749 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL) 1750 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL) 1751 # ifdef SSL_R_VERSION_TOO_LOW 1752 || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW) 1753 # endif 1754 #endif 1755 ) 1756 return 1; 1757 return 0; 1758 } 1759 1760 void 1761 perform_openssl_init(void) 1762 { 1763 /* init SSL library */ 1764 #ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS 1765 ERR_load_crypto_strings(); 1766 #endif 1767 ERR_load_SSL_strings(); 1768 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO) 1769 OpenSSL_add_all_algorithms(); 1770 #else 1771 OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS 1772 | OPENSSL_INIT_ADD_ALL_DIGESTS 1773 | OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL); 1774 #endif 1775 #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL) 1776 (void)SSL_library_init(); 1777 #else 1778 OPENSSL_init_ssl(0, NULL); 1779 #endif 1780 1781 if(!RAND_status()) { 1782 /* try to seed it */ 1783 unsigned char buf[256]; 1784 unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid(); 1785 size_t i; 1786 v = seed; 1787 for(i=0; i<256/sizeof(v); i++) { 1788 memmove(buf+i*sizeof(v), &v, sizeof(v)); 1789 v = v*seed + (unsigned int)i; 1790 } 1791 RAND_seed(buf, 256); 1792 log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time"); 1793 } 1794 } 1795 1796 static int 1797 get_ocsp(char *filename, unsigned char **ocsp) 1798 { 1799 BIO *bio; 1800 OCSP_RESPONSE *response; 1801 int len = -1; 1802 unsigned char *p, *buf; 1803 assert(filename); 1804 1805 if ((bio = BIO_new_file(filename, "r")) == NULL) { 1806 log_crypto_err("get_ocsp: BIO_new_file failed"); 1807 return -1; 1808 } 1809 1810 if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) { 1811 log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed"); 1812 BIO_free(bio); 1813 return -1; 1814 } 1815 1816 if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) { 1817 log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed"); 1818 OCSP_RESPONSE_free(response); 1819 BIO_free(bio); 1820 return -1; 1821 } 1822 1823 if ((buf = malloc((size_t) len)) == NULL) { 1824 log_msg(LOG_ERR, "get_ocsp: malloc failed"); 1825 OCSP_RESPONSE_free(response); 1826 BIO_free(bio); 1827 return -1; 1828 } 1829 1830 p = buf; 1831 if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) { 1832 log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed"); 1833 free(buf); 1834 OCSP_RESPONSE_free(response); 1835 BIO_free(bio); 1836 return -1; 1837 } 1838 1839 OCSP_RESPONSE_free(response); 1840 BIO_free(bio); 1841 1842 *ocsp = buf; 1843 return len; 1844 } 1845 1846 /* further setup ssl ctx after the keys are loaded */ 1847 static void 1848 listen_sslctx_setup_2(void* ctxt) 1849 { 1850 SSL_CTX* ctx = (SSL_CTX*)ctxt; 1851 (void)ctx; 1852 #if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO 1853 if(!SSL_CTX_set_ecdh_auto(ctx,1)) { 1854 log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE"); 1855 } 1856 #elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME) 1857 if(1) { 1858 EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1); 1859 if (!ecdh) { 1860 log_crypto_err("could not find p256, not enabling ECDHE"); 1861 } else { 1862 if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) { 1863 log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE"); 1864 } 1865 EC_KEY_free (ecdh); 1866 } 1867 } 1868 #endif 1869 } 1870 1871 static int 1872 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg)) 1873 { 1874 if(ocspdata) { 1875 unsigned char *p; 1876 if ((p=malloc(ocspdata_len)) == NULL) { 1877 log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure"); 1878 return SSL_TLSEXT_ERR_NOACK; 1879 } 1880 memcpy(p, ocspdata, ocspdata_len); 1881 if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) { 1882 log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp"); 1883 free(p); 1884 return SSL_TLSEXT_ERR_NOACK; 1885 } 1886 return SSL_TLSEXT_ERR_OK; 1887 } else { 1888 return SSL_TLSEXT_ERR_NOACK; 1889 } 1890 } 1891 1892 SSL_CTX* 1893 server_tls_ctx_setup(char* key, char* pem, char* verifypem) 1894 { 1895 SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method()); 1896 if(!ctx) { 1897 log_crypto_err("could not SSL_CTX_new"); 1898 return NULL; 1899 } 1900 /* no SSLv2, SSLv3 because has defects */ 1901 #if SSL_OP_NO_SSLv2 != 0 1902 if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){ 1903 log_crypto_err("could not set SSL_OP_NO_SSLv2"); 1904 SSL_CTX_free(ctx); 1905 return NULL; 1906 } 1907 #endif 1908 if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3) 1909 != SSL_OP_NO_SSLv3){ 1910 log_crypto_err("could not set SSL_OP_NO_SSLv3"); 1911 SSL_CTX_free(ctx); 1912 return 0; 1913 } 1914 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1) 1915 /* if we have tls 1.1 disable 1.0 */ 1916 if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1) 1917 != SSL_OP_NO_TLSv1){ 1918 log_crypto_err("could not set SSL_OP_NO_TLSv1"); 1919 SSL_CTX_free(ctx); 1920 return 0; 1921 } 1922 #endif 1923 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2) 1924 /* if we have tls 1.2 disable 1.1 */ 1925 if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1) 1926 != SSL_OP_NO_TLSv1_1){ 1927 log_crypto_err("could not set SSL_OP_NO_TLSv1_1"); 1928 SSL_CTX_free(ctx); 1929 return 0; 1930 } 1931 #endif 1932 #if defined(SSL_OP_NO_RENEGOTIATION) 1933 /* disable client renegotiation */ 1934 if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) & 1935 SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) { 1936 log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION"); 1937 SSL_CTX_free(ctx); 1938 return 0; 1939 } 1940 #endif 1941 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20) 1942 /* if we have sha256, set the cipher list to have no known vulns */ 1943 if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20")) 1944 log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list"); 1945 #endif 1946 if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) & 1947 SSL_OP_CIPHER_SERVER_PREFERENCE) != 1948 SSL_OP_CIPHER_SERVER_PREFERENCE) { 1949 log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE"); 1950 SSL_CTX_free(ctx); 1951 return 0; 1952 } 1953 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL 1954 SSL_CTX_set_security_level(ctx, 0); 1955 #endif 1956 if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) { 1957 log_msg(LOG_ERR, "error for cert file: %s", pem); 1958 log_crypto_err("error in SSL_CTX use_certificate_chain_file"); 1959 SSL_CTX_free(ctx); 1960 return NULL; 1961 } 1962 if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) { 1963 log_msg(LOG_ERR, "error for private key file: %s", key); 1964 log_crypto_err("Error in SSL_CTX use_PrivateKey_file"); 1965 SSL_CTX_free(ctx); 1966 return NULL; 1967 } 1968 if(!SSL_CTX_check_private_key(ctx)) { 1969 log_msg(LOG_ERR, "error for key file: %s", key); 1970 log_crypto_err("Error in SSL_CTX check_private_key"); 1971 SSL_CTX_free(ctx); 1972 return NULL; 1973 } 1974 listen_sslctx_setup_2(ctx); 1975 if(verifypem && verifypem[0]) { 1976 if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) { 1977 log_crypto_err("Error in SSL_CTX verify locations"); 1978 SSL_CTX_free(ctx); 1979 return NULL; 1980 } 1981 SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem)); 1982 SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL); 1983 } 1984 return ctx; 1985 } 1986 1987 SSL_CTX* 1988 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile) 1989 { 1990 char *key, *pem; 1991 SSL_CTX *ctx; 1992 1993 key = nsd->options->tls_service_key; 1994 pem = nsd->options->tls_service_pem; 1995 if(!key || key[0] == 0) { 1996 log_msg(LOG_ERR, "error: no tls-service-key file specified"); 1997 return NULL; 1998 } 1999 if(!pem || pem[0] == 0) { 2000 log_msg(LOG_ERR, "error: no tls-service-pem file specified"); 2001 return NULL; 2002 } 2003 2004 /* NOTE:This mimics the existing code in Unbound 1.5.1 by supporting SSL but 2005 * raft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2*/ 2006 ctx = server_tls_ctx_setup(key, pem, verifypem); 2007 if(!ctx) { 2008 log_msg(LOG_ERR, "could not setup server TLS context"); 2009 return NULL; 2010 } 2011 if(ocspfile && ocspfile[0]) { 2012 if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) { 2013 log_crypto_err("Error reading OCSPfile"); 2014 SSL_CTX_free(ctx); 2015 return NULL; 2016 } else { 2017 VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile)); 2018 if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) { 2019 log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb"); 2020 SSL_CTX_free(ctx); 2021 return NULL; 2022 } 2023 } 2024 } 2025 return ctx; 2026 } 2027 2028 /* check if tcp_handler_accept_data created for TLS dedicated port */ 2029 int 2030 using_tls_port(struct sockaddr* addr, const char* tls_port) 2031 { 2032 in_port_t port = 0; 2033 2034 if (addr->sa_family == AF_INET) 2035 port = ((struct sockaddr_in*)addr)->sin_port; 2036 #ifndef HAVE_STRUCT_SOCKADDR_IN6 2037 else 2038 port = ((struct sockaddr_in6*)addr)->sin6_port; 2039 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */ 2040 if (atoi(tls_port) == ntohs(port)) 2041 return 1; 2042 2043 return 0; 2044 } 2045 #endif 2046 2047 /* pass timeout=-1 for blocking. Returns size, 0, -1(err), or -2(timeout) */ 2048 ssize_t 2049 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout) 2050 { 2051 uint8_t* buf = (uint8_t*) p; 2052 ssize_t total = 0; 2053 struct pollfd fd; 2054 memset(&fd, 0, sizeof(fd)); 2055 fd.fd = s; 2056 fd.events = POLLIN; 2057 2058 while( total < sz) { 2059 ssize_t ret; 2060 ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000); 2061 if(ret == -1) { 2062 if(errno == EAGAIN) 2063 /* blocking read */ 2064 continue; 2065 if(errno == EINTR) { 2066 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) 2067 return -1; 2068 /* other signals can be handled later */ 2069 continue; 2070 } 2071 /* some error */ 2072 return -1; 2073 } 2074 if(ret == 0) { 2075 /* operation timed out */ 2076 return -2; 2077 } 2078 ret = read(s, buf+total, sz-total); 2079 if(ret == -1) { 2080 if(errno == EAGAIN) 2081 /* blocking read */ 2082 continue; 2083 if(errno == EINTR) { 2084 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) 2085 return -1; 2086 /* other signals can be handled later */ 2087 continue; 2088 } 2089 /* some error */ 2090 return -1; 2091 } 2092 if(ret == 0) { 2093 /* closed connection! */ 2094 return 0; 2095 } 2096 total += ret; 2097 } 2098 return total; 2099 } 2100 2101 static void 2102 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket) 2103 { 2104 sig_atomic_t cmd = NSD_QUIT_SYNC; 2105 udb_ptr t, next; 2106 udb_base* u = nsd->task[nsd->mytask]; 2107 udb_ptr_init(&next, u); 2108 udb_ptr_new(&t, u, udb_base_get_userdata(u)); 2109 udb_base_set_userdata(u, 0); 2110 while(!udb_ptr_is_null(&t)) { 2111 /* store next in list so this one can be deleted or reused */ 2112 udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next); 2113 udb_rptr_zero(&TASKLIST(&t)->next, u); 2114 2115 /* process task t */ 2116 /* append results for task t and update last_task */ 2117 task_process_in_reload(nsd, u, last_task, &t); 2118 2119 /* go to next */ 2120 udb_ptr_set_ptr(&t, u, &next); 2121 2122 /* if the parent has quit, we must quit too, poll the fd for cmds */ 2123 if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) { 2124 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd)); 2125 if(cmd == NSD_QUIT) { 2126 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd")); 2127 /* sync to disk (if needed) */ 2128 udb_base_sync(nsd->db->udb, 0); 2129 /* unlink files of remainder of tasks */ 2130 while(!udb_ptr_is_null(&t)) { 2131 if(TASKLIST(&t)->task_type == task_apply_xfr) { 2132 xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno); 2133 } 2134 udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next); 2135 } 2136 udb_ptr_unlink(&t, u); 2137 udb_ptr_unlink(&next, u); 2138 exit(0); 2139 } 2140 } 2141 2142 } 2143 udb_ptr_unlink(&t, u); 2144 udb_ptr_unlink(&next, u); 2145 } 2146 2147 #ifdef BIND8_STATS 2148 static void 2149 parent_send_stats(struct nsd* nsd, int cmdfd) 2150 { 2151 size_t i; 2152 if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) { 2153 log_msg(LOG_ERR, "could not write stats to reload"); 2154 return; 2155 } 2156 for(i=0; i<nsd->child_count; i++) 2157 if(!write_socket(cmdfd, &nsd->children[i].query_count, 2158 sizeof(stc_type))) { 2159 log_msg(LOG_ERR, "could not write stats to reload"); 2160 return; 2161 } 2162 } 2163 2164 static void 2165 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last) 2166 { 2167 struct nsdst s; 2168 stc_type* p; 2169 size_t i; 2170 if(block_read(nsd, cmdfd, &s, sizeof(s), 2171 RELOAD_SYNC_TIMEOUT) != sizeof(s)) { 2172 log_msg(LOG_ERR, "could not read stats from oldpar"); 2173 return; 2174 } 2175 s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0); 2176 s.db_mem = region_get_mem(nsd->db->region); 2177 p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s, 2178 nsd->child_count); 2179 if(!p) return; 2180 for(i=0; i<nsd->child_count; i++) { 2181 if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!= 2182 sizeof(stc_type)) 2183 return; 2184 } 2185 } 2186 #endif /* BIND8_STATS */ 2187 2188 /* 2189 * Reload the database, stop parent, re-fork children and continue. 2190 * as server_main. 2191 */ 2192 static void 2193 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio, 2194 int cmdsocket) 2195 { 2196 pid_t mypid; 2197 sig_atomic_t cmd = NSD_QUIT_SYNC; 2198 int ret; 2199 udb_ptr last_task; 2200 struct sigaction old_sigchld, ign_sigchld; 2201 /* ignore SIGCHLD from the previous server_main that used this pid */ 2202 memset(&ign_sigchld, 0, sizeof(ign_sigchld)); 2203 ign_sigchld.sa_handler = SIG_IGN; 2204 sigaction(SIGCHLD, &ign_sigchld, &old_sigchld); 2205 2206 #ifdef HAVE_SETPROCTITLE 2207 setproctitle("main"); 2208 #endif 2209 #ifdef HAVE_CPUSET_T 2210 if(nsd->use_cpu_affinity) { 2211 set_cpu_affinity(nsd->cpuset); 2212 } 2213 #endif 2214 2215 /* see what tasks we got from xfrd */ 2216 task_remap(nsd->task[nsd->mytask]); 2217 udb_ptr_init(&last_task, nsd->task[nsd->mytask]); 2218 udb_compact_inhibited(nsd->db->udb, 1); 2219 reload_process_tasks(nsd, &last_task, cmdsocket); 2220 udb_compact_inhibited(nsd->db->udb, 0); 2221 udb_compact(nsd->db->udb); 2222 2223 #ifndef NDEBUG 2224 if(nsd_debug_level >= 1) 2225 region_log_stats(nsd->db->region); 2226 #endif /* NDEBUG */ 2227 /* sync to disk (if needed) */ 2228 udb_base_sync(nsd->db->udb, 0); 2229 2230 initialize_dname_compression_tables(nsd); 2231 2232 #ifdef BIND8_STATS 2233 /* Restart dumping stats if required. */ 2234 time(&nsd->st.boot); 2235 set_bind8_alarm(nsd); 2236 #endif 2237 #ifdef USE_ZONE_STATS 2238 server_zonestat_realloc(nsd); /* realloc for new children */ 2239 server_zonestat_switch(nsd); 2240 #endif 2241 2242 /* listen for the signals of failed children again */ 2243 sigaction(SIGCHLD, &old_sigchld, NULL); 2244 /* Start new child processes */ 2245 if (server_start_children(nsd, server_region, netio, &nsd-> 2246 xfrd_listener->fd) != 0) { 2247 send_children_quit(nsd); 2248 exit(1); 2249 } 2250 2251 /* if the parent has quit, we must quit too, poll the fd for cmds */ 2252 if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) { 2253 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd)); 2254 if(cmd == NSD_QUIT) { 2255 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd")); 2256 send_children_quit(nsd); 2257 exit(0); 2258 } 2259 } 2260 2261 /* Send quit command to parent: blocking, wait for receipt. */ 2262 do { 2263 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main")); 2264 if (!write_socket(cmdsocket, &cmd, sizeof(cmd))) 2265 { 2266 log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s", 2267 strerror(errno)); 2268 } 2269 /* blocking: wait for parent to really quit. (it sends RELOAD as ack) */ 2270 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main")); 2271 ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 2272 RELOAD_SYNC_TIMEOUT); 2273 if(ret == -2) { 2274 DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry")); 2275 } 2276 } while (ret == -2); 2277 if(ret == -1) { 2278 log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s", 2279 strerror(errno)); 2280 } 2281 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd)); 2282 if(cmd == NSD_QUIT) { 2283 /* small race condition possible here, parent got quit cmd. */ 2284 send_children_quit(nsd); 2285 exit(1); 2286 } 2287 assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD); 2288 #ifdef BIND8_STATS 2289 reload_do_stats(cmdsocket, nsd, &last_task); 2290 #endif 2291 udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]); 2292 task_process_sync(nsd->task[nsd->mytask]); 2293 #ifdef USE_ZONE_STATS 2294 server_zonestat_realloc(nsd); /* realloc for next children */ 2295 #endif 2296 2297 /* send soainfo to the xfrd process, signal it that reload is done, 2298 * it picks up the taskudb */ 2299 cmd = NSD_RELOAD_DONE; 2300 if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { 2301 log_msg(LOG_ERR, "problems sending reload_done xfrd: %s", 2302 strerror(errno)); 2303 } 2304 mypid = getpid(); 2305 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2306 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2307 strerror(errno)); 2308 } 2309 2310 /* try to reopen file */ 2311 if (nsd->file_rotation_ok) 2312 log_reopen(nsd->log_filename, 1); 2313 /* exit reload, continue as new server_main */ 2314 } 2315 2316 /* 2317 * Get the mode depending on the signal hints that have been received. 2318 * Multiple signal hints can be received and will be handled in turn. 2319 */ 2320 static sig_atomic_t 2321 server_signal_mode(struct nsd *nsd) 2322 { 2323 if(nsd->signal_hint_quit) { 2324 nsd->signal_hint_quit = 0; 2325 return NSD_QUIT; 2326 } 2327 else if(nsd->signal_hint_shutdown) { 2328 nsd->signal_hint_shutdown = 0; 2329 return NSD_SHUTDOWN; 2330 } 2331 else if(nsd->signal_hint_child) { 2332 nsd->signal_hint_child = 0; 2333 return NSD_REAP_CHILDREN; 2334 } 2335 else if(nsd->signal_hint_reload) { 2336 nsd->signal_hint_reload = 0; 2337 return NSD_RELOAD; 2338 } 2339 else if(nsd->signal_hint_reload_hup) { 2340 nsd->signal_hint_reload_hup = 0; 2341 return NSD_RELOAD_REQ; 2342 } 2343 else if(nsd->signal_hint_stats) { 2344 nsd->signal_hint_stats = 0; 2345 #ifdef BIND8_STATS 2346 set_bind8_alarm(nsd); 2347 #endif 2348 return NSD_STATS; 2349 } 2350 else if(nsd->signal_hint_statsusr) { 2351 nsd->signal_hint_statsusr = 0; 2352 return NSD_STATS; 2353 } 2354 return NSD_RUN; 2355 } 2356 2357 /* 2358 * The main server simply waits for signals and child processes to 2359 * terminate. Child processes are restarted as necessary. 2360 */ 2361 void 2362 server_main(struct nsd *nsd) 2363 { 2364 region_type *server_region = region_create(xalloc, free); 2365 netio_type *netio = netio_create(server_region); 2366 netio_handler_type reload_listener; 2367 int reload_sockets[2] = {-1, -1}; 2368 struct timespec timeout_spec; 2369 int status; 2370 pid_t child_pid; 2371 pid_t reload_pid = -1; 2372 sig_atomic_t mode; 2373 2374 /* Ensure we are the main process */ 2375 assert(nsd->server_kind == NSD_SERVER_MAIN); 2376 2377 /* Add listener for the XFRD process */ 2378 netio_add_handler(netio, nsd->xfrd_listener); 2379 2380 /* Start the child processes that handle incoming queries */ 2381 if (server_start_children(nsd, server_region, netio, 2382 &nsd->xfrd_listener->fd) != 0) { 2383 send_children_quit(nsd); 2384 exit(1); 2385 } 2386 reload_listener.fd = -1; 2387 2388 /* This_child MUST be 0, because this is the parent process */ 2389 assert(nsd->this_child == 0); 2390 2391 /* Run the server until we get a shutdown signal */ 2392 while ((mode = nsd->mode) != NSD_SHUTDOWN) { 2393 /* Did we receive a signal that changes our mode? */ 2394 if(mode == NSD_RUN) { 2395 nsd->mode = mode = server_signal_mode(nsd); 2396 } 2397 2398 switch (mode) { 2399 case NSD_RUN: 2400 /* see if any child processes terminated */ 2401 while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) { 2402 int is_child = delete_child_pid(nsd, child_pid); 2403 if (is_child != -1 && nsd->children[is_child].need_to_exit) { 2404 if(nsd->children[is_child].child_fd == -1) 2405 nsd->children[is_child].has_exited = 1; 2406 parent_check_all_children_exited(nsd); 2407 } else if(is_child != -1) { 2408 log_msg(LOG_WARNING, 2409 "server %d died unexpectedly with status %d, restarting", 2410 (int) child_pid, status); 2411 restart_child_servers(nsd, server_region, netio, 2412 &nsd->xfrd_listener->fd); 2413 } else if (child_pid == reload_pid) { 2414 sig_atomic_t cmd = NSD_RELOAD_DONE; 2415 pid_t mypid; 2416 log_msg(LOG_WARNING, 2417 "Reload process %d failed with status %d, continuing with old database", 2418 (int) child_pid, status); 2419 reload_pid = -1; 2420 if(reload_listener.fd != -1) close(reload_listener.fd); 2421 reload_listener.fd = -1; 2422 reload_listener.event_types = NETIO_EVENT_NONE; 2423 task_process_sync(nsd->task[nsd->mytask]); 2424 /* inform xfrd reload attempt ended */ 2425 if(!write_socket(nsd->xfrd_listener->fd, 2426 &cmd, sizeof(cmd))) { 2427 log_msg(LOG_ERR, "problems " 2428 "sending SOAEND to xfrd: %s", 2429 strerror(errno)); 2430 } 2431 mypid = getpid(); 2432 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2433 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2434 strerror(errno)); 2435 } 2436 } else if(status != 0) { 2437 /* check for status, because we get 2438 * the old-servermain because reload 2439 * is the process-parent of old-main, 2440 * and we get older server-processes 2441 * that are exiting after a reload */ 2442 log_msg(LOG_WARNING, 2443 "process %d terminated with status %d", 2444 (int) child_pid, status); 2445 } 2446 } 2447 if (child_pid == -1) { 2448 if (errno == EINTR) { 2449 continue; 2450 } 2451 if (errno != ECHILD) 2452 log_msg(LOG_WARNING, "wait failed: %s", strerror(errno)); 2453 } 2454 if (nsd->mode != NSD_RUN) 2455 break; 2456 2457 /* timeout to collect processes. In case no sigchild happens. */ 2458 timeout_spec.tv_sec = 60; 2459 timeout_spec.tv_nsec = 0; 2460 2461 /* listen on ports, timeout for collecting terminated children */ 2462 if(netio_dispatch(netio, &timeout_spec, 0) == -1) { 2463 if (errno != EINTR) { 2464 log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno)); 2465 } 2466 } 2467 if(nsd->restart_children) { 2468 restart_child_servers(nsd, server_region, netio, 2469 &nsd->xfrd_listener->fd); 2470 nsd->restart_children = 0; 2471 } 2472 if(nsd->reload_failed) { 2473 sig_atomic_t cmd = NSD_RELOAD_DONE; 2474 pid_t mypid; 2475 nsd->reload_failed = 0; 2476 log_msg(LOG_WARNING, 2477 "Reload process %d failed, continuing with old database", 2478 (int) reload_pid); 2479 reload_pid = -1; 2480 if(reload_listener.fd != -1) close(reload_listener.fd); 2481 reload_listener.fd = -1; 2482 reload_listener.event_types = NETIO_EVENT_NONE; 2483 task_process_sync(nsd->task[nsd->mytask]); 2484 /* inform xfrd reload attempt ended */ 2485 if(!write_socket(nsd->xfrd_listener->fd, 2486 &cmd, sizeof(cmd))) { 2487 log_msg(LOG_ERR, "problems " 2488 "sending SOAEND to xfrd: %s", 2489 strerror(errno)); 2490 } 2491 mypid = getpid(); 2492 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2493 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2494 strerror(errno)); 2495 } 2496 } 2497 2498 break; 2499 case NSD_RELOAD_REQ: { 2500 sig_atomic_t cmd = NSD_RELOAD_REQ; 2501 log_msg(LOG_WARNING, "SIGHUP received, reloading..."); 2502 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2503 "main: ipc send reload_req to xfrd")); 2504 if(!write_socket(nsd->xfrd_listener->fd, 2505 &cmd, sizeof(cmd))) { 2506 log_msg(LOG_ERR, "server_main: could not send " 2507 "reload_req to xfrd: %s", strerror(errno)); 2508 } 2509 nsd->mode = NSD_RUN; 2510 } break; 2511 case NSD_RELOAD: 2512 /* Continue to run nsd after reload */ 2513 nsd->mode = NSD_RUN; 2514 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading...")); 2515 if (reload_pid != -1) { 2516 log_msg(LOG_WARNING, "Reload already in progress (pid = %d)", 2517 (int) reload_pid); 2518 break; 2519 } 2520 2521 /* switch the mytask to keep track of who owns task*/ 2522 nsd->mytask = 1 - nsd->mytask; 2523 if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) { 2524 log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno)); 2525 reload_pid = -1; 2526 break; 2527 } 2528 2529 /* Do actual reload */ 2530 reload_pid = fork(); 2531 switch (reload_pid) { 2532 case -1: 2533 log_msg(LOG_ERR, "fork failed: %s", strerror(errno)); 2534 break; 2535 default: 2536 /* PARENT */ 2537 close(reload_sockets[0]); 2538 server_reload(nsd, server_region, netio, 2539 reload_sockets[1]); 2540 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main")); 2541 close(reload_sockets[1]); 2542 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed")); 2543 /* drop stale xfrd ipc data */ 2544 ((struct ipc_handler_conn_data*)nsd-> 2545 xfrd_listener->user_data) 2546 ->conn->is_reading = 0; 2547 reload_pid = -1; 2548 reload_listener.fd = -1; 2549 reload_listener.event_types = NETIO_EVENT_NONE; 2550 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run")); 2551 break; 2552 case 0: 2553 /* CHILD */ 2554 /* server_main keep running until NSD_QUIT_SYNC 2555 * received from reload. */ 2556 close(reload_sockets[1]); 2557 reload_listener.fd = reload_sockets[0]; 2558 reload_listener.timeout = NULL; 2559 reload_listener.user_data = nsd; 2560 reload_listener.event_types = NETIO_EVENT_READ; 2561 reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */ 2562 netio_add_handler(netio, &reload_listener); 2563 reload_pid = getppid(); 2564 break; 2565 } 2566 break; 2567 case NSD_QUIT_SYNC: 2568 /* synchronisation of xfrd, parent and reload */ 2569 if(!nsd->quit_sync_done && reload_listener.fd != -1) { 2570 sig_atomic_t cmd = NSD_RELOAD; 2571 /* stop xfrd ipc writes in progress */ 2572 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2573 "main: ipc send indication reload")); 2574 if(!write_socket(nsd->xfrd_listener->fd, 2575 &cmd, sizeof(cmd))) { 2576 log_msg(LOG_ERR, "server_main: could not send reload " 2577 "indication to xfrd: %s", strerror(errno)); 2578 } 2579 /* wait for ACK from xfrd */ 2580 DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd")); 2581 nsd->quit_sync_done = 1; 2582 } 2583 nsd->mode = NSD_RUN; 2584 break; 2585 case NSD_QUIT: 2586 /* silent shutdown during reload */ 2587 if(reload_listener.fd != -1) { 2588 /* acknowledge the quit, to sync reload that we will really quit now */ 2589 sig_atomic_t cmd = NSD_RELOAD; 2590 DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload")); 2591 if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) { 2592 log_msg(LOG_ERR, "server_main: " 2593 "could not ack quit: %s", strerror(errno)); 2594 } 2595 #ifdef BIND8_STATS 2596 parent_send_stats(nsd, reload_listener.fd); 2597 #endif /* BIND8_STATS */ 2598 close(reload_listener.fd); 2599 } 2600 DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence")); 2601 /* only quit children after xfrd has acked */ 2602 send_children_quit(nsd); 2603 2604 #ifdef MEMCLEAN /* OS collects memory pages */ 2605 region_destroy(server_region); 2606 #endif 2607 server_shutdown(nsd); 2608 2609 /* ENOTREACH */ 2610 break; 2611 case NSD_SHUTDOWN: 2612 break; 2613 case NSD_REAP_CHILDREN: 2614 /* continue; wait for child in run loop */ 2615 nsd->mode = NSD_RUN; 2616 break; 2617 case NSD_STATS: 2618 #ifdef BIND8_STATS 2619 set_children_stats(nsd); 2620 #endif 2621 nsd->mode = NSD_RUN; 2622 break; 2623 default: 2624 log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode); 2625 nsd->mode = NSD_RUN; 2626 break; 2627 } 2628 } 2629 log_msg(LOG_WARNING, "signal received, shutting down..."); 2630 2631 /* close opened ports to avoid race with restart of nsd */ 2632 server_close_all_sockets(nsd->udp, nsd->ifs); 2633 server_close_all_sockets(nsd->tcp, nsd->ifs); 2634 #ifdef HAVE_SSL 2635 daemon_remote_close(nsd->rc); 2636 #endif 2637 send_children_quit_and_wait(nsd); 2638 2639 /* Unlink it if possible... */ 2640 unlinkpid(nsd->pidfile); 2641 unlink(nsd->task[0]->fname); 2642 unlink(nsd->task[1]->fname); 2643 #ifdef USE_ZONE_STATS 2644 unlink(nsd->zonestatfname[0]); 2645 unlink(nsd->zonestatfname[1]); 2646 #endif 2647 #ifdef USE_DNSTAP 2648 dt_collector_close(nsd->dt_collector, nsd); 2649 #endif 2650 2651 if(reload_listener.fd != -1) { 2652 sig_atomic_t cmd = NSD_QUIT; 2653 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2654 "main: ipc send quit to reload-process")); 2655 if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) { 2656 log_msg(LOG_ERR, "server_main: could not send quit to reload: %s", 2657 strerror(errno)); 2658 } 2659 fsync(reload_listener.fd); 2660 close(reload_listener.fd); 2661 /* wait for reload to finish processing */ 2662 while(1) { 2663 if(waitpid(reload_pid, NULL, 0) == -1) { 2664 if(errno == EINTR) continue; 2665 if(errno == ECHILD) break; 2666 log_msg(LOG_ERR, "waitpid(reload %d): %s", 2667 (int)reload_pid, strerror(errno)); 2668 } 2669 break; 2670 } 2671 } 2672 if(nsd->xfrd_listener->fd != -1) { 2673 /* complete quit, stop xfrd */ 2674 sig_atomic_t cmd = NSD_QUIT; 2675 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2676 "main: ipc send quit to xfrd")); 2677 if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { 2678 log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s", 2679 strerror(errno)); 2680 } 2681 fsync(nsd->xfrd_listener->fd); 2682 close(nsd->xfrd_listener->fd); 2683 (void)kill(nsd->pid, SIGTERM); 2684 } 2685 2686 #ifdef MEMCLEAN /* OS collects memory pages */ 2687 region_destroy(server_region); 2688 #endif 2689 /* write the nsd.db to disk, wait for it to complete */ 2690 udb_base_sync(nsd->db->udb, 1); 2691 udb_base_close(nsd->db->udb); 2692 server_shutdown(nsd); 2693 } 2694 2695 static query_state_type 2696 server_process_query(struct nsd *nsd, struct query *query) 2697 { 2698 return query_process(query, nsd); 2699 } 2700 2701 static query_state_type 2702 server_process_query_udp(struct nsd *nsd, struct query *query) 2703 { 2704 #ifdef RATELIMIT 2705 if(query_process(query, nsd) != QUERY_DISCARDED) { 2706 if(rrl_process_query(query)) 2707 return rrl_slip(query); 2708 else return QUERY_PROCESSED; 2709 } 2710 return QUERY_DISCARDED; 2711 #else 2712 return query_process(query, nsd); 2713 #endif 2714 } 2715 2716 struct event_base* 2717 nsd_child_event_base(void) 2718 { 2719 struct event_base* base; 2720 #ifdef USE_MINI_EVENT 2721 static time_t secs; 2722 static struct timeval now; 2723 base = event_init(&secs, &now); 2724 #else 2725 # if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP) 2726 /* libev */ 2727 base = (struct event_base *)ev_default_loop(EVFLAG_AUTO); 2728 # else 2729 /* libevent */ 2730 # ifdef HAVE_EVENT_BASE_NEW 2731 base = event_base_new(); 2732 # else 2733 base = event_init(); 2734 # endif 2735 # endif 2736 #endif 2737 return base; 2738 } 2739 2740 static void 2741 add_udp_handler( 2742 struct nsd *nsd, 2743 struct nsd_socket *sock, 2744 struct udp_handler_data *data) 2745 { 2746 struct event *handler = &data->event; 2747 2748 data->nsd = nsd; 2749 data->socket = sock; 2750 2751 memset(handler, 0, sizeof(*handler)); 2752 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data); 2753 if(event_base_set(nsd->event_base, handler) != 0) 2754 log_msg(LOG_ERR, "nsd udp: event_base_set failed"); 2755 if(event_add(handler, NULL) != 0) 2756 log_msg(LOG_ERR, "nsd udp: event_add failed"); 2757 } 2758 2759 void 2760 add_tcp_handler( 2761 struct nsd *nsd, 2762 struct nsd_socket *sock, 2763 struct tcp_accept_handler_data *data) 2764 { 2765 struct event *handler = &data->event; 2766 2767 data->nsd = nsd; 2768 data->socket = sock; 2769 2770 #ifdef HAVE_SSL 2771 if (nsd->tls_ctx && 2772 nsd->options->tls_port && 2773 using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port)) 2774 { 2775 data->tls_accept = 1; 2776 if(verbosity >= 2) { 2777 char buf[48]; 2778 addrport2str((struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf)); 2779 VERBOSITY(2, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf)); 2780 } 2781 } else { 2782 data->tls_accept = 0; 2783 } 2784 #endif 2785 2786 memset(handler, 0, sizeof(*handler)); 2787 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_tcp_accept, data); 2788 if(event_base_set(nsd->event_base, handler) != 0) 2789 log_msg(LOG_ERR, "nsd tcp: event_base_set failed"); 2790 if(event_add(handler, NULL) != 0) 2791 log_msg(LOG_ERR, "nsd tcp: event_add failed"); 2792 data->event_added = 1; 2793 } 2794 2795 /* 2796 * Serve DNS requests. 2797 */ 2798 void 2799 server_child(struct nsd *nsd) 2800 { 2801 size_t i, from, numifs; 2802 region_type *server_region = region_create(xalloc, free); 2803 struct event_base* event_base = nsd_child_event_base(); 2804 sig_atomic_t mode; 2805 2806 if(!event_base) { 2807 log_msg(LOG_ERR, "nsd server could not create event base"); 2808 exit(1); 2809 } 2810 nsd->event_base = event_base; 2811 nsd->server_region = server_region; 2812 2813 #ifdef RATELIMIT 2814 rrl_init(nsd->this_child->child_num); 2815 #endif 2816 2817 assert(nsd->server_kind != NSD_SERVER_MAIN); 2818 DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started")); 2819 2820 #ifdef HAVE_SETPROCTITLE 2821 setproctitle("server %d", nsd->this_child->child_num + 1); 2822 #endif 2823 #ifdef HAVE_CPUSET_T 2824 if(nsd->use_cpu_affinity) { 2825 set_cpu_affinity(nsd->this_child->cpuset); 2826 } 2827 #endif 2828 2829 if (!(nsd->server_kind & NSD_SERVER_TCP)) { 2830 server_close_all_sockets(nsd->tcp, nsd->ifs); 2831 } 2832 if (!(nsd->server_kind & NSD_SERVER_UDP)) { 2833 server_close_all_sockets(nsd->udp, nsd->ifs); 2834 } 2835 2836 if (nsd->this_child->parent_fd != -1) { 2837 struct event *handler; 2838 struct ipc_handler_conn_data* user_data = 2839 (struct ipc_handler_conn_data*)region_alloc( 2840 server_region, sizeof(struct ipc_handler_conn_data)); 2841 user_data->nsd = nsd; 2842 user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ); 2843 2844 handler = (struct event*) region_alloc( 2845 server_region, sizeof(*handler)); 2846 memset(handler, 0, sizeof(*handler)); 2847 event_set(handler, nsd->this_child->parent_fd, EV_PERSIST| 2848 EV_READ, child_handle_parent_command, user_data); 2849 if(event_base_set(event_base, handler) != 0) 2850 log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed"); 2851 if(event_add(handler, NULL) != 0) 2852 log_msg(LOG_ERR, "nsd ipcchild: event_add failed"); 2853 } 2854 2855 if(nsd->reuseport) { 2856 numifs = nsd->ifs / nsd->reuseport; 2857 from = numifs * nsd->this_child->child_num; 2858 if(from+numifs > nsd->ifs) { /* should not happen */ 2859 from = 0; 2860 numifs = nsd->ifs; 2861 } 2862 } else { 2863 from = 0; 2864 numifs = nsd->ifs; 2865 } 2866 2867 if (nsd->server_kind & NSD_SERVER_UDP) { 2868 int child = nsd->this_child->child_num; 2869 memset(msgs, 0, sizeof(msgs)); 2870 for (i = 0; i < NUM_RECV_PER_SELECT; i++) { 2871 queries[i] = query_create(server_region, 2872 compressed_dname_offsets, 2873 compression_table_size, compressed_dnames); 2874 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 2875 iovecs[i].iov_base = buffer_begin(queries[i]->packet); 2876 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 2877 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 2878 msgs[i].msg_hdr.msg_iovlen = 1; 2879 msgs[i].msg_hdr.msg_name = &queries[i]->addr; 2880 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 2881 } 2882 2883 for (i = 0; i < nsd->ifs; i++) { 2884 int listen; 2885 struct udp_handler_data *data; 2886 2887 listen = nsd_bitset_isset(nsd->udp[i].servers, child); 2888 2889 if(i >= from && i < (from + numifs) && listen) { 2890 data = region_alloc_zero( 2891 nsd->server_region, sizeof(*data)); 2892 add_udp_handler(nsd, &nsd->udp[i], data); 2893 } else { 2894 /* close sockets intended for other servers */ 2895 server_close_socket(&nsd->udp[i]); 2896 } 2897 } 2898 } 2899 2900 /* 2901 * Keep track of all the TCP accept handlers so we can enable 2902 * and disable them based on the current number of active TCP 2903 * connections. 2904 */ 2905 if (nsd->server_kind & NSD_SERVER_TCP) { 2906 int child = nsd->this_child->child_num; 2907 tcp_accept_handler_count = numifs; 2908 tcp_accept_handlers = region_alloc_array(server_region, 2909 numifs, sizeof(*tcp_accept_handlers)); 2910 2911 for (i = 0; i < nsd->ifs; i++) { 2912 int listen; 2913 struct tcp_accept_handler_data *data; 2914 2915 listen = nsd_bitset_isset(nsd->tcp[i].servers, child); 2916 2917 if(i >= from && i < (from + numifs) && listen) { 2918 data = &tcp_accept_handlers[i-from]; 2919 memset(data, 0, sizeof(*data)); 2920 add_tcp_handler(nsd, &nsd->tcp[i], data); 2921 } else { 2922 /* close sockets intended for other servers */ 2923 /* 2924 * uncomment this once tcp servers are no 2925 * longer copied in the tcp fd copy line 2926 * in server_init(). 2927 server_close_socket(&nsd->tcp[i]); 2928 */ 2929 /* close sockets not meant for this server*/ 2930 if(!listen) 2931 server_close_socket(&nsd->tcp[i]); 2932 } 2933 } 2934 } else { 2935 tcp_accept_handler_count = 0; 2936 } 2937 2938 /* The main loop... */ 2939 while ((mode = nsd->mode) != NSD_QUIT) { 2940 if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd); 2941 2942 /* Do we need to do the statistics... */ 2943 if (mode == NSD_STATS) { 2944 #ifdef BIND8_STATS 2945 int p = nsd->st.period; 2946 nsd->st.period = 1; /* force stats printout */ 2947 /* Dump the statistics */ 2948 bind8_stats(nsd); 2949 nsd->st.period = p; 2950 #else /* !BIND8_STATS */ 2951 log_msg(LOG_NOTICE, "Statistics support not enabled at compile time."); 2952 #endif /* BIND8_STATS */ 2953 2954 nsd->mode = NSD_RUN; 2955 } 2956 else if (mode == NSD_REAP_CHILDREN) { 2957 /* got signal, notify parent. parent reaps terminated children. */ 2958 if (nsd->this_child->parent_fd != -1) { 2959 sig_atomic_t parent_notify = NSD_REAP_CHILDREN; 2960 if (write(nsd->this_child->parent_fd, 2961 &parent_notify, 2962 sizeof(parent_notify)) == -1) 2963 { 2964 log_msg(LOG_ERR, "problems sending command from %d to parent: %s", 2965 (int) nsd->this_child->pid, strerror(errno)); 2966 } 2967 } else /* no parent, so reap 'em */ 2968 while (waitpid(-1, NULL, WNOHANG) > 0) ; 2969 nsd->mode = NSD_RUN; 2970 } 2971 else if(mode == NSD_RUN) { 2972 /* Wait for a query... */ 2973 if(event_base_loop(event_base, EVLOOP_ONCE) == -1) { 2974 if (errno != EINTR) { 2975 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); 2976 break; 2977 } 2978 } 2979 } else if(mode == NSD_QUIT) { 2980 /* ignore here, quit */ 2981 } else { 2982 log_msg(LOG_ERR, "mode bad value %d, back to service.", 2983 (int)mode); 2984 nsd->mode = NSD_RUN; 2985 } 2986 } 2987 2988 service_remaining_tcp(nsd); 2989 #ifdef BIND8_STATS 2990 bind8_stats(nsd); 2991 #endif /* BIND8_STATS */ 2992 2993 #ifdef MEMCLEAN /* OS collects memory pages */ 2994 #ifdef RATELIMIT 2995 rrl_deinit(nsd->this_child->child_num); 2996 #endif 2997 event_base_free(event_base); 2998 region_destroy(server_region); 2999 #endif 3000 server_shutdown(nsd); 3001 } 3002 3003 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg) 3004 { 3005 int* timed_out = (int*)arg; 3006 assert(event & EV_TIMEOUT); 3007 /* wake up the service tcp thread, note event is no longer 3008 * registered */ 3009 *timed_out = 1; 3010 } 3011 3012 void 3013 service_remaining_tcp(struct nsd* nsd) 3014 { 3015 struct tcp_handler_data* p; 3016 struct event_base* event_base; 3017 /* check if it is needed */ 3018 if(nsd->current_tcp_count == 0 || tcp_active_list == NULL) 3019 return; 3020 VERBOSITY(4, (LOG_INFO, "service remaining TCP connections")); 3021 3022 /* setup event base */ 3023 event_base = nsd_child_event_base(); 3024 if(!event_base) { 3025 log_msg(LOG_ERR, "nsd remain tcp could not create event base"); 3026 return; 3027 } 3028 /* register tcp connections */ 3029 for(p = tcp_active_list; p != NULL; p = p->next) { 3030 struct timeval timeout; 3031 int fd = p->event.ev_fd; 3032 #ifdef USE_MINI_EVENT 3033 short event = p->event.ev_flags & (EV_READ|EV_WRITE); 3034 #else 3035 short event = p->event.ev_events & (EV_READ|EV_WRITE); 3036 #endif 3037 void (*fn)(int, short, void*); 3038 #ifdef HAVE_SSL 3039 if(p->tls) { 3040 if((event&EV_READ)) 3041 fn = handle_tls_reading; 3042 else fn = handle_tls_writing; 3043 } else { 3044 #endif 3045 if((event&EV_READ)) 3046 fn = handle_tcp_reading; 3047 else fn = handle_tcp_writing; 3048 #ifdef HAVE_SSL 3049 } 3050 #endif 3051 3052 /* set timeout to 1/10 second */ 3053 if(p->tcp_timeout > 100) 3054 p->tcp_timeout = 100; 3055 timeout.tv_sec = p->tcp_timeout / 1000; 3056 timeout.tv_usec = (p->tcp_timeout % 1000)*1000; 3057 event_del(&p->event); 3058 memset(&p->event, 0, sizeof(p->event)); 3059 event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT, 3060 fn, p); 3061 if(event_base_set(event_base, &p->event) != 0) 3062 log_msg(LOG_ERR, "event base set failed"); 3063 if(event_add(&p->event, &timeout) != 0) 3064 log_msg(LOG_ERR, "event add failed"); 3065 } 3066 3067 /* handle it */ 3068 while(nsd->current_tcp_count > 0) { 3069 mode_t m = server_signal_mode(nsd); 3070 struct event timeout; 3071 struct timeval tv; 3072 int timed_out = 0; 3073 if(m == NSD_QUIT || m == NSD_SHUTDOWN || 3074 m == NSD_REAP_CHILDREN) { 3075 /* quit */ 3076 break; 3077 } 3078 /* timer */ 3079 /* have to do something every second */ 3080 tv.tv_sec = 1; 3081 tv.tv_usec = 0; 3082 memset(&timeout, 0, sizeof(timeout)); 3083 event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout, 3084 &timed_out); 3085 if(event_base_set(event_base, &timeout) != 0) 3086 log_msg(LOG_ERR, "remaintcp timer: event_base_set failed"); 3087 if(event_add(&timeout, &tv) != 0) 3088 log_msg(LOG_ERR, "remaintcp timer: event_add failed"); 3089 3090 /* service loop */ 3091 if(event_base_loop(event_base, EVLOOP_ONCE) == -1) { 3092 if (errno != EINTR) { 3093 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); 3094 break; 3095 } 3096 } 3097 if(!timed_out) { 3098 event_del(&timeout); 3099 } else { 3100 /* timed out, quit */ 3101 VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit")); 3102 break; 3103 } 3104 } 3105 #ifdef MEMCLEAN 3106 event_base_free(event_base); 3107 #endif 3108 /* continue to quit after return */ 3109 } 3110 3111 /* Implement recvmmsg and sendmmsg if the platform does not. These functions 3112 * are always used, even if nonblocking operations are broken, in which case 3113 * NUM_RECV_PER_SELECT is defined to 1 (one). 3114 */ 3115 #if defined(HAVE_RECVMMSG) 3116 #define nsd_recvmmsg recvmmsg 3117 #else /* !HAVE_RECVMMSG */ 3118 3119 static int 3120 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, 3121 int flags, struct timespec *timeout) 3122 { 3123 int orig_errno; 3124 unsigned int vpos = 0; 3125 ssize_t rcvd; 3126 3127 /* timeout is ignored, ensure caller does not expect it to work */ 3128 assert(timeout == NULL); 3129 3130 orig_errno = errno; 3131 errno = 0; 3132 while(vpos < vlen) { 3133 rcvd = recvfrom(sockfd, 3134 msgvec[vpos].msg_hdr.msg_iov->iov_base, 3135 msgvec[vpos].msg_hdr.msg_iov->iov_len, 3136 flags, 3137 msgvec[vpos].msg_hdr.msg_name, 3138 &msgvec[vpos].msg_hdr.msg_namelen); 3139 if(rcvd < 0) { 3140 break; 3141 } else { 3142 assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX); 3143 msgvec[vpos].msg_len = (unsigned int)rcvd; 3144 vpos++; 3145 } 3146 } 3147 3148 if(vpos) { 3149 /* error will be picked up next time */ 3150 return (int)vpos; 3151 } else if(errno == 0) { 3152 errno = orig_errno; 3153 return 0; 3154 } else if(errno == EAGAIN) { 3155 return 0; 3156 } 3157 3158 return -1; 3159 } 3160 #endif /* HAVE_RECVMMSG */ 3161 3162 #ifdef HAVE_SENDMMSG 3163 #define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__) 3164 #else /* !HAVE_SENDMMSG */ 3165 3166 static int 3167 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags) 3168 { 3169 int orig_errno; 3170 unsigned int vpos = 0; 3171 ssize_t snd; 3172 3173 orig_errno = errno; 3174 errno = 0; 3175 while(vpos < vlen) { 3176 assert(msgvec[vpos].msg_hdr.msg_iovlen == 1); 3177 snd = sendto(sockfd, 3178 msgvec[vpos].msg_hdr.msg_iov->iov_base, 3179 msgvec[vpos].msg_hdr.msg_iov->iov_len, 3180 flags, 3181 msgvec[vpos].msg_hdr.msg_name, 3182 msgvec[vpos].msg_hdr.msg_namelen); 3183 if(snd < 0) { 3184 break; 3185 } else { 3186 msgvec[vpos].msg_len = (unsigned int)snd; 3187 vpos++; 3188 } 3189 } 3190 3191 if(vpos) { 3192 return (int)vpos; 3193 } else if(errno == 0) { 3194 errno = orig_errno; 3195 return 0; 3196 } 3197 3198 return -1; 3199 } 3200 #endif /* HAVE_SENDMMSG */ 3201 3202 static void 3203 handle_udp(int fd, short event, void* arg) 3204 { 3205 struct udp_handler_data *data = (struct udp_handler_data *) arg; 3206 int received, sent, recvcount, i; 3207 struct query *q; 3208 3209 if (!(event & EV_READ)) { 3210 return; 3211 } 3212 recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL); 3213 /* this printf strangely gave a performance increase on Linux */ 3214 /* printf("recvcount %d \n", recvcount); */ 3215 if (recvcount == -1) { 3216 if (errno != EAGAIN && errno != EINTR) { 3217 log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno)); 3218 STATUP(data->nsd, rxerr); 3219 /* No zone statup */ 3220 } 3221 /* Simply no data available */ 3222 return; 3223 } 3224 for (i = 0; i < recvcount; i++) { 3225 loopstart: 3226 received = msgs[i].msg_len; 3227 queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen; 3228 q = queries[i]; 3229 if (received == -1) { 3230 log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror( 3231 #if defined(HAVE_RECVMMSG) 3232 msgs[i].msg_hdr.msg_flags 3233 #else 3234 errno 3235 #endif 3236 )); 3237 STATUP(data->nsd, rxerr); 3238 /* No zone statup */ 3239 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3240 iovecs[i].iov_len = buffer_remaining(q->packet); 3241 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3242 goto swap_drop; 3243 } 3244 3245 /* Account... */ 3246 #ifdef BIND8_STATS 3247 if (data->socket->addr.ai_family == AF_INET) { 3248 STATUP(data->nsd, qudp); 3249 } else if (data->socket->addr.ai_family == AF_INET6) { 3250 STATUP(data->nsd, qudp6); 3251 } 3252 #endif 3253 3254 buffer_skip(q->packet, received); 3255 buffer_flip(q->packet); 3256 #ifdef USE_DNSTAP 3257 dt_collector_submit_auth_query(data->nsd, &q->addr, q->addrlen, 3258 q->tcp, q->packet); 3259 #endif /* USE_DNSTAP */ 3260 3261 /* Process and answer the query... */ 3262 if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) { 3263 if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) { 3264 STATUP(data->nsd, nona); 3265 ZTATUP(data->nsd, q->zone, nona); 3266 } 3267 3268 #ifdef USE_ZONE_STATS 3269 if (data->socket->addr.ai_family == AF_INET) { 3270 ZTATUP(data->nsd, q->zone, qudp); 3271 } else if (data->socket->addr.ai_family == AF_INET6) { 3272 ZTATUP(data->nsd, q->zone, qudp6); 3273 } 3274 #endif 3275 3276 /* Add EDNS0 and TSIG info if necessary. */ 3277 query_add_optional(q, data->nsd); 3278 3279 buffer_flip(q->packet); 3280 iovecs[i].iov_len = buffer_remaining(q->packet); 3281 #ifdef BIND8_STATS 3282 /* Account the rcode & TC... */ 3283 STATUP2(data->nsd, rcode, RCODE(q->packet)); 3284 ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet)); 3285 if (TC(q->packet)) { 3286 STATUP(data->nsd, truncated); 3287 ZTATUP(data->nsd, q->zone, truncated); 3288 } 3289 #endif /* BIND8_STATS */ 3290 #ifdef USE_DNSTAP 3291 dt_collector_submit_auth_response(data->nsd, 3292 &q->addr, q->addrlen, q->tcp, q->packet, 3293 q->zone); 3294 #endif /* USE_DNSTAP */ 3295 } else { 3296 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3297 iovecs[i].iov_len = buffer_remaining(q->packet); 3298 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3299 swap_drop: 3300 STATUP(data->nsd, dropped); 3301 ZTATUP(data->nsd, q->zone, dropped); 3302 if(i != recvcount-1) { 3303 /* swap with last and decrease recvcount */ 3304 struct mmsghdr mtmp = msgs[i]; 3305 struct iovec iotmp = iovecs[i]; 3306 recvcount--; 3307 msgs[i] = msgs[recvcount]; 3308 iovecs[i] = iovecs[recvcount]; 3309 queries[i] = queries[recvcount]; 3310 msgs[recvcount] = mtmp; 3311 iovecs[recvcount] = iotmp; 3312 queries[recvcount] = q; 3313 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 3314 msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount]; 3315 goto loopstart; 3316 } else { recvcount --; } 3317 } 3318 } 3319 3320 /* send until all are sent */ 3321 i = 0; 3322 while(i<recvcount) { 3323 sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0); 3324 if(sent == -1) { 3325 /* don't log transient network full errors, unless 3326 * on higher verbosity */ 3327 if(!(errno == ENOBUFS && verbosity < 1) && 3328 #ifdef EWOULDBLOCK 3329 !(errno == EWOULDBLOCK && verbosity < 1) && 3330 #endif 3331 !(errno == EAGAIN && verbosity < 1)) { 3332 const char* es = strerror(errno); 3333 char a[48]; 3334 addr2str(&queries[i]->addr, a, sizeof(a)); 3335 log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es); 3336 } 3337 #ifdef BIND8_STATS 3338 data->nsd->st.txerr += recvcount-i; 3339 #endif /* BIND8_STATS */ 3340 break; 3341 } 3342 i += sent; 3343 } 3344 for(i=0; i<recvcount; i++) { 3345 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3346 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 3347 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3348 } 3349 } 3350 3351 #ifdef HAVE_SSL 3352 /* 3353 * Setup an event for the tcp handler. 3354 */ 3355 static void 3356 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *), 3357 int fd, short event) 3358 { 3359 struct timeval timeout; 3360 struct event_base* ev_base; 3361 3362 timeout.tv_sec = data->nsd->tcp_timeout; 3363 timeout.tv_usec = 0L; 3364 3365 ev_base = data->event.ev_base; 3366 event_del(&data->event); 3367 memset(&data->event, 0, sizeof(data->event)); 3368 event_set(&data->event, fd, event, fn, data); 3369 if(event_base_set(ev_base, &data->event) != 0) 3370 log_msg(LOG_ERR, "event base set failed"); 3371 if(event_add(&data->event, &timeout) != 0) 3372 log_msg(LOG_ERR, "event add failed"); 3373 } 3374 #endif /* HAVE_SSL */ 3375 3376 static void 3377 cleanup_tcp_handler(struct tcp_handler_data* data) 3378 { 3379 event_del(&data->event); 3380 #ifdef HAVE_SSL 3381 if(data->tls) { 3382 SSL_shutdown(data->tls); 3383 SSL_free(data->tls); 3384 data->tls = NULL; 3385 } 3386 #endif 3387 close(data->event.ev_fd); 3388 if(data->prev) 3389 data->prev->next = data->next; 3390 else tcp_active_list = data->next; 3391 if(data->next) 3392 data->next->prev = data->prev; 3393 3394 /* 3395 * Enable the TCP accept handlers when the current number of 3396 * TCP connections is about to drop below the maximum number 3397 * of TCP connections. 3398 */ 3399 if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) { 3400 configure_handler_event_types(EV_READ|EV_PERSIST); 3401 if(slowaccept) { 3402 event_del(&slowaccept_event); 3403 slowaccept = 0; 3404 } 3405 } 3406 --data->nsd->current_tcp_count; 3407 assert(data->nsd->current_tcp_count >= 0); 3408 3409 region_destroy(data->region); 3410 } 3411 3412 static void 3413 handle_tcp_reading(int fd, short event, void* arg) 3414 { 3415 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 3416 ssize_t received; 3417 struct event_base* ev_base; 3418 struct timeval timeout; 3419 3420 if ((event & EV_TIMEOUT)) { 3421 /* Connection timed out. */ 3422 cleanup_tcp_handler(data); 3423 return; 3424 } 3425 3426 if (data->nsd->tcp_query_count > 0 && 3427 data->query_count >= data->nsd->tcp_query_count) { 3428 /* No more queries allowed on this tcp connection. */ 3429 cleanup_tcp_handler(data); 3430 return; 3431 } 3432 3433 assert((event & EV_READ)); 3434 3435 if (data->bytes_transmitted == 0) { 3436 query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1); 3437 } 3438 3439 /* 3440 * Check if we received the leading packet length bytes yet. 3441 */ 3442 if (data->bytes_transmitted < sizeof(uint16_t)) { 3443 received = read(fd, 3444 (char *) &data->query->tcplen 3445 + data->bytes_transmitted, 3446 sizeof(uint16_t) - data->bytes_transmitted); 3447 if (received == -1) { 3448 if (errno == EAGAIN || errno == EINTR) { 3449 /* 3450 * Read would block, wait until more 3451 * data is available. 3452 */ 3453 return; 3454 } else { 3455 char buf[48]; 3456 addr2str(&data->query->addr, buf, sizeof(buf)); 3457 #ifdef ECONNRESET 3458 if (verbosity >= 2 || errno != ECONNRESET) 3459 #endif /* ECONNRESET */ 3460 log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno)); 3461 cleanup_tcp_handler(data); 3462 return; 3463 } 3464 } else if (received == 0) { 3465 /* EOF */ 3466 cleanup_tcp_handler(data); 3467 return; 3468 } 3469 3470 data->bytes_transmitted += received; 3471 if (data->bytes_transmitted < sizeof(uint16_t)) { 3472 /* 3473 * Not done with the tcplen yet, wait for more 3474 * data to become available. 3475 */ 3476 return; 3477 } 3478 3479 assert(data->bytes_transmitted == sizeof(uint16_t)); 3480 3481 data->query->tcplen = ntohs(data->query->tcplen); 3482 3483 /* 3484 * Minimum query size is: 3485 * 3486 * Size of the header (12) 3487 * + Root domain name (1) 3488 * + Query class (2) 3489 * + Query type (2) 3490 */ 3491 if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) { 3492 VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection")); 3493 cleanup_tcp_handler(data); 3494 return; 3495 } 3496 3497 if (data->query->tcplen > data->query->maxlen) { 3498 VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection")); 3499 cleanup_tcp_handler(data); 3500 return; 3501 } 3502 3503 buffer_set_limit(data->query->packet, data->query->tcplen); 3504 } 3505 3506 assert(buffer_remaining(data->query->packet) > 0); 3507 3508 /* Read the (remaining) query data. */ 3509 received = read(fd, 3510 buffer_current(data->query->packet), 3511 buffer_remaining(data->query->packet)); 3512 if (received == -1) { 3513 if (errno == EAGAIN || errno == EINTR) { 3514 /* 3515 * Read would block, wait until more data is 3516 * available. 3517 */ 3518 return; 3519 } else { 3520 char buf[48]; 3521 addr2str(&data->query->addr, buf, sizeof(buf)); 3522 #ifdef ECONNRESET 3523 if (verbosity >= 2 || errno != ECONNRESET) 3524 #endif /* ECONNRESET */ 3525 log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno)); 3526 cleanup_tcp_handler(data); 3527 return; 3528 } 3529 } else if (received == 0) { 3530 /* EOF */ 3531 cleanup_tcp_handler(data); 3532 return; 3533 } 3534 3535 data->bytes_transmitted += received; 3536 buffer_skip(data->query->packet, received); 3537 if (buffer_remaining(data->query->packet) > 0) { 3538 /* 3539 * Message not yet complete, wait for more data to 3540 * become available. 3541 */ 3542 return; 3543 } 3544 3545 assert(buffer_position(data->query->packet) == data->query->tcplen); 3546 3547 /* Account... */ 3548 #ifdef BIND8_STATS 3549 #ifndef INET6 3550 STATUP(data->nsd, ctcp); 3551 #else 3552 if (data->query->addr.ss_family == AF_INET) { 3553 STATUP(data->nsd, ctcp); 3554 } else if (data->query->addr.ss_family == AF_INET6) { 3555 STATUP(data->nsd, ctcp6); 3556 } 3557 #endif 3558 #endif /* BIND8_STATS */ 3559 3560 /* We have a complete query, process it. */ 3561 3562 /* tcp-query-count: handle query counter ++ */ 3563 data->query_count++; 3564 3565 buffer_flip(data->query->packet); 3566 #ifdef USE_DNSTAP 3567 dt_collector_submit_auth_query(data->nsd, &data->query->addr, 3568 data->query->addrlen, data->query->tcp, data->query->packet); 3569 #endif /* USE_DNSTAP */ 3570 data->query_state = server_process_query(data->nsd, data->query); 3571 if (data->query_state == QUERY_DISCARDED) { 3572 /* Drop the packet and the entire connection... */ 3573 STATUP(data->nsd, dropped); 3574 ZTATUP(data->nsd, data->query->zone, dropped); 3575 cleanup_tcp_handler(data); 3576 return; 3577 } 3578 3579 #ifdef BIND8_STATS 3580 if (RCODE(data->query->packet) == RCODE_OK 3581 && !AA(data->query->packet)) 3582 { 3583 STATUP(data->nsd, nona); 3584 ZTATUP(data->nsd, data->query->zone, nona); 3585 } 3586 #endif /* BIND8_STATS */ 3587 3588 #ifdef USE_ZONE_STATS 3589 #ifndef INET6 3590 ZTATUP(data->nsd, data->query->zone, ctcp); 3591 #else 3592 if (data->query->addr.ss_family == AF_INET) { 3593 ZTATUP(data->nsd, data->query->zone, ctcp); 3594 } else if (data->query->addr.ss_family == AF_INET6) { 3595 ZTATUP(data->nsd, data->query->zone, ctcp6); 3596 } 3597 #endif 3598 #endif /* USE_ZONE_STATS */ 3599 3600 query_add_optional(data->query, data->nsd); 3601 3602 /* Switch to the tcp write handler. */ 3603 buffer_flip(data->query->packet); 3604 data->query->tcplen = buffer_remaining(data->query->packet); 3605 #ifdef BIND8_STATS 3606 /* Account the rcode & TC... */ 3607 STATUP2(data->nsd, rcode, RCODE(data->query->packet)); 3608 ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet)); 3609 if (TC(data->query->packet)) { 3610 STATUP(data->nsd, truncated); 3611 ZTATUP(data->nsd, data->query->zone, truncated); 3612 } 3613 #endif /* BIND8_STATS */ 3614 #ifdef USE_DNSTAP 3615 dt_collector_submit_auth_response(data->nsd, &data->query->addr, 3616 data->query->addrlen, data->query->tcp, data->query->packet, 3617 data->query->zone); 3618 #endif /* USE_DNSTAP */ 3619 data->bytes_transmitted = 0; 3620 3621 timeout.tv_sec = data->tcp_timeout / 1000; 3622 timeout.tv_usec = (data->tcp_timeout % 1000)*1000; 3623 3624 ev_base = data->event.ev_base; 3625 event_del(&data->event); 3626 memset(&data->event, 0, sizeof(data->event)); 3627 event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT, 3628 handle_tcp_reading, data); 3629 if(event_base_set(ev_base, &data->event) != 0) 3630 log_msg(LOG_ERR, "event base set tcpr failed"); 3631 if(event_add(&data->event, &timeout) != 0) 3632 log_msg(LOG_ERR, "event add tcpr failed"); 3633 /* see if we can write the answer right away(usually so,EAGAIN ifnot)*/ 3634 handle_tcp_writing(fd, EV_WRITE, data); 3635 } 3636 3637 static void 3638 handle_tcp_writing(int fd, short event, void* arg) 3639 { 3640 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 3641 ssize_t sent; 3642 struct query *q = data->query; 3643 struct timeval timeout; 3644 struct event_base* ev_base; 3645 3646 if ((event & EV_TIMEOUT)) { 3647 /* Connection timed out. */ 3648 cleanup_tcp_handler(data); 3649 return; 3650 } 3651 3652 assert((event & EV_WRITE)); 3653 3654 if (data->bytes_transmitted < sizeof(q->tcplen)) { 3655 /* Writing the response packet length. */ 3656 uint16_t n_tcplen = htons(q->tcplen); 3657 #ifdef HAVE_WRITEV 3658 struct iovec iov[2]; 3659 iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted; 3660 iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted; 3661 iov[1].iov_base = buffer_begin(q->packet); 3662 iov[1].iov_len = buffer_limit(q->packet); 3663 sent = writev(fd, iov, 2); 3664 #else /* HAVE_WRITEV */ 3665 sent = write(fd, 3666 (const char *) &n_tcplen + data->bytes_transmitted, 3667 sizeof(n_tcplen) - data->bytes_transmitted); 3668 #endif /* HAVE_WRITEV */ 3669 if (sent == -1) { 3670 if (errno == EAGAIN || errno == EINTR) { 3671 /* 3672 * Write would block, wait until 3673 * socket becomes writable again. 3674 */ 3675 return; 3676 } else { 3677 #ifdef ECONNRESET 3678 if(verbosity >= 2 || errno != ECONNRESET) 3679 #endif /* ECONNRESET */ 3680 #ifdef EPIPE 3681 if(verbosity >= 2 || errno != EPIPE) 3682 #endif /* EPIPE 'broken pipe' */ 3683 log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno)); 3684 cleanup_tcp_handler(data); 3685 return; 3686 } 3687 } 3688 3689 data->bytes_transmitted += sent; 3690 if (data->bytes_transmitted < sizeof(q->tcplen)) { 3691 /* 3692 * Writing not complete, wait until socket 3693 * becomes writable again. 3694 */ 3695 return; 3696 } 3697 3698 #ifdef HAVE_WRITEV 3699 sent -= sizeof(n_tcplen); 3700 /* handle potential 'packet done' code */ 3701 goto packet_could_be_done; 3702 #endif 3703 } 3704 3705 sent = write(fd, 3706 buffer_current(q->packet), 3707 buffer_remaining(q->packet)); 3708 if (sent == -1) { 3709 if (errno == EAGAIN || errno == EINTR) { 3710 /* 3711 * Write would block, wait until 3712 * socket becomes writable again. 3713 */ 3714 return; 3715 } else { 3716 #ifdef ECONNRESET 3717 if(verbosity >= 2 || errno != ECONNRESET) 3718 #endif /* ECONNRESET */ 3719 #ifdef EPIPE 3720 if(verbosity >= 2 || errno != EPIPE) 3721 #endif /* EPIPE 'broken pipe' */ 3722 log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno)); 3723 cleanup_tcp_handler(data); 3724 return; 3725 } 3726 } 3727 3728 data->bytes_transmitted += sent; 3729 #ifdef HAVE_WRITEV 3730 packet_could_be_done: 3731 #endif 3732 buffer_skip(q->packet, sent); 3733 if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) { 3734 /* 3735 * Still more data to write when socket becomes 3736 * writable again. 3737 */ 3738 return; 3739 } 3740 3741 assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen)); 3742 3743 if (data->query_state == QUERY_IN_AXFR) { 3744 /* Continue processing AXFR and writing back results. */ 3745 buffer_clear(q->packet); 3746 data->query_state = query_axfr(data->nsd, q); 3747 if (data->query_state != QUERY_PROCESSED) { 3748 query_add_optional(data->query, data->nsd); 3749 3750 /* Reset data. */ 3751 buffer_flip(q->packet); 3752 q->tcplen = buffer_remaining(q->packet); 3753 data->bytes_transmitted = 0; 3754 /* Reset timeout. */ 3755 timeout.tv_sec = data->tcp_timeout / 1000; 3756 timeout.tv_usec = (data->tcp_timeout % 1000)*1000; 3757 ev_base = data->event.ev_base; 3758 event_del(&data->event); 3759 memset(&data->event, 0, sizeof(data->event)); 3760 event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT, 3761 handle_tcp_writing, data); 3762 if(event_base_set(ev_base, &data->event) != 0) 3763 log_msg(LOG_ERR, "event base set tcpw failed"); 3764 if(event_add(&data->event, &timeout) != 0) 3765 log_msg(LOG_ERR, "event add tcpw failed"); 3766 3767 /* 3768 * Write data if/when the socket is writable 3769 * again. 3770 */ 3771 return; 3772 } 3773 } 3774 3775 /* 3776 * Done sending, wait for the next request to arrive on the 3777 * TCP socket by installing the TCP read handler. 3778 */ 3779 if (data->nsd->tcp_query_count > 0 && 3780 data->query_count >= data->nsd->tcp_query_count) { 3781 3782 (void) shutdown(fd, SHUT_WR); 3783 } 3784 3785 data->bytes_transmitted = 0; 3786 3787 timeout.tv_sec = data->tcp_timeout / 1000; 3788 timeout.tv_usec = (data->tcp_timeout % 1000)*1000; 3789 ev_base = data->event.ev_base; 3790 event_del(&data->event); 3791 memset(&data->event, 0, sizeof(data->event)); 3792 event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT, 3793 handle_tcp_reading, data); 3794 if(event_base_set(ev_base, &data->event) != 0) 3795 log_msg(LOG_ERR, "event base set tcpw failed"); 3796 if(event_add(&data->event, &timeout) != 0) 3797 log_msg(LOG_ERR, "event add tcpw failed"); 3798 } 3799 3800 #ifdef HAVE_SSL 3801 /** create SSL object and associate fd */ 3802 static SSL* 3803 incoming_ssl_fd(SSL_CTX* ctx, int fd) 3804 { 3805 SSL* ssl = SSL_new((SSL_CTX*)ctx); 3806 if(!ssl) { 3807 log_crypto_err("could not SSL_new"); 3808 return NULL; 3809 } 3810 SSL_set_accept_state(ssl); 3811 (void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY); 3812 if(!SSL_set_fd(ssl, fd)) { 3813 log_crypto_err("could not SSL_set_fd"); 3814 SSL_free(ssl); 3815 return NULL; 3816 } 3817 return ssl; 3818 } 3819 3820 /** TLS handshake to upgrade TCP connection */ 3821 static int 3822 tls_handshake(struct tcp_handler_data* data, int fd, int writing) 3823 { 3824 int r; 3825 if(data->shake_state == tls_hs_read_event) { 3826 /* read condition satisfied back to writing */ 3827 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE); 3828 data->shake_state = tls_hs_none; 3829 return 1; 3830 } 3831 if(data->shake_state == tls_hs_write_event) { 3832 /* write condition satisfied back to reading */ 3833 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ); 3834 data->shake_state = tls_hs_none; 3835 return 1; 3836 } 3837 3838 /* (continue to) setup the TLS connection */ 3839 ERR_clear_error(); 3840 r = SSL_do_handshake(data->tls); 3841 3842 if(r != 1) { 3843 int want = SSL_get_error(data->tls, r); 3844 if(want == SSL_ERROR_WANT_READ) { 3845 if(data->shake_state == tls_hs_read) { 3846 /* try again later */ 3847 return 1; 3848 } 3849 data->shake_state = tls_hs_read; 3850 /* switch back to reading mode */ 3851 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ); 3852 return 1; 3853 } else if(want == SSL_ERROR_WANT_WRITE) { 3854 if(data->shake_state == tls_hs_write) { 3855 /* try again later */ 3856 return 1; 3857 } 3858 data->shake_state = tls_hs_write; 3859 /* switch back to writing mode */ 3860 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE); 3861 return 1; 3862 } else { 3863 if(r == 0) 3864 VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely")); 3865 else { 3866 unsigned long err = ERR_get_error(); 3867 if(!squelch_err_ssl_handshake(err)) { 3868 char a[64], s[256]; 3869 addr2str(&data->query->addr, a, sizeof(a)); 3870 snprintf(s, sizeof(s), "TLS handshake failed from %s", a); 3871 log_crypto_from_err(s, err); 3872 } 3873 } 3874 cleanup_tcp_handler(data); 3875 return 0; 3876 } 3877 } 3878 3879 /* Use to log successful upgrade for testing - could be removed*/ 3880 VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded.")); 3881 /* set back to the event we need to have when reading (or writing) */ 3882 if(data->shake_state == tls_hs_read && writing) { 3883 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE); 3884 } else if(data->shake_state == tls_hs_write && !writing) { 3885 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ); 3886 } 3887 data->shake_state = tls_hs_none; 3888 return 1; 3889 } 3890 3891 /** handle TLS reading of incoming query */ 3892 static void 3893 handle_tls_reading(int fd, short event, void* arg) 3894 { 3895 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 3896 ssize_t received; 3897 3898 if ((event & EV_TIMEOUT)) { 3899 /* Connection timed out. */ 3900 cleanup_tcp_handler(data); 3901 return; 3902 } 3903 3904 if (data->nsd->tcp_query_count > 0 && 3905 data->query_count >= data->nsd->tcp_query_count) { 3906 /* No more queries allowed on this tcp connection. */ 3907 cleanup_tcp_handler(data); 3908 return; 3909 } 3910 3911 assert((event & EV_READ)); 3912 3913 if (data->bytes_transmitted == 0) { 3914 query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1); 3915 } 3916 3917 if(data->shake_state != tls_hs_none) { 3918 if(!tls_handshake(data, fd, 0)) 3919 return; 3920 if(data->shake_state != tls_hs_none) 3921 return; 3922 } 3923 3924 /* 3925 * Check if we received the leading packet length bytes yet. 3926 */ 3927 if(data->bytes_transmitted < sizeof(uint16_t)) { 3928 ERR_clear_error(); 3929 if((received=SSL_read(data->tls, (char *) &data->query->tcplen 3930 + data->bytes_transmitted, 3931 sizeof(uint16_t) - data->bytes_transmitted)) <= 0) { 3932 int want = SSL_get_error(data->tls, received); 3933 if(want == SSL_ERROR_ZERO_RETURN) { 3934 cleanup_tcp_handler(data); 3935 return; /* shutdown, closed */ 3936 } else if(want == SSL_ERROR_WANT_READ) { 3937 /* wants to be called again */ 3938 return; 3939 } 3940 else if(want == SSL_ERROR_WANT_WRITE) { 3941 /* switch to writing */ 3942 data->shake_state = tls_hs_write_event; 3943 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT); 3944 return; 3945 } 3946 cleanup_tcp_handler(data); 3947 log_crypto_err("could not SSL_read"); 3948 return; 3949 } 3950 3951 data->bytes_transmitted += received; 3952 if (data->bytes_transmitted < sizeof(uint16_t)) { 3953 /* 3954 * Not done with the tcplen yet, wait for more 3955 * data to become available. 3956 */ 3957 return; 3958 } 3959 3960 assert(data->bytes_transmitted == sizeof(uint16_t)); 3961 3962 data->query->tcplen = ntohs(data->query->tcplen); 3963 3964 /* 3965 * Minimum query size is: 3966 * 3967 * Size of the header (12) 3968 * + Root domain name (1) 3969 * + Query class (2) 3970 * + Query type (2) 3971 */ 3972 if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) { 3973 VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection")); 3974 cleanup_tcp_handler(data); 3975 return; 3976 } 3977 3978 if (data->query->tcplen > data->query->maxlen) { 3979 VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection")); 3980 cleanup_tcp_handler(data); 3981 return; 3982 } 3983 3984 buffer_set_limit(data->query->packet, data->query->tcplen); 3985 } 3986 3987 assert(buffer_remaining(data->query->packet) > 0); 3988 3989 /* Read the (remaining) query data. */ 3990 ERR_clear_error(); 3991 received = SSL_read(data->tls, (void*)buffer_current(data->query->packet), 3992 (int)buffer_remaining(data->query->packet)); 3993 if(received <= 0) { 3994 int want = SSL_get_error(data->tls, received); 3995 if(want == SSL_ERROR_ZERO_RETURN) { 3996 cleanup_tcp_handler(data); 3997 return; /* shutdown, closed */ 3998 } else if(want == SSL_ERROR_WANT_READ) { 3999 /* wants to be called again */ 4000 return; 4001 } 4002 else if(want == SSL_ERROR_WANT_WRITE) { 4003 /* switch back writing */ 4004 data->shake_state = tls_hs_write_event; 4005 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT); 4006 return; 4007 } 4008 cleanup_tcp_handler(data); 4009 log_crypto_err("could not SSL_read"); 4010 return; 4011 } 4012 4013 data->bytes_transmitted += received; 4014 buffer_skip(data->query->packet, received); 4015 if (buffer_remaining(data->query->packet) > 0) { 4016 /* 4017 * Message not yet complete, wait for more data to 4018 * become available. 4019 */ 4020 return; 4021 } 4022 4023 assert(buffer_position(data->query->packet) == data->query->tcplen); 4024 4025 /* Account... */ 4026 #ifndef INET6 4027 STATUP(data->nsd, ctls); 4028 #else 4029 if (data->query->addr.ss_family == AF_INET) { 4030 STATUP(data->nsd, ctls); 4031 } else if (data->query->addr.ss_family == AF_INET6) { 4032 STATUP(data->nsd, ctls6); 4033 } 4034 #endif 4035 4036 /* We have a complete query, process it. */ 4037 4038 /* tcp-query-count: handle query counter ++ */ 4039 data->query_count++; 4040 4041 buffer_flip(data->query->packet); 4042 #ifdef USE_DNSTAP 4043 dt_collector_submit_auth_query(data->nsd, &data->query->addr, 4044 data->query->addrlen, data->query->tcp, data->query->packet); 4045 #endif /* USE_DNSTAP */ 4046 data->query_state = server_process_query(data->nsd, data->query); 4047 if (data->query_state == QUERY_DISCARDED) { 4048 /* Drop the packet and the entire connection... */ 4049 STATUP(data->nsd, dropped); 4050 ZTATUP(data->nsd, data->query->zone, dropped); 4051 cleanup_tcp_handler(data); 4052 return; 4053 } 4054 4055 #ifdef BIND8_STATS 4056 if (RCODE(data->query->packet) == RCODE_OK 4057 && !AA(data->query->packet)) 4058 { 4059 STATUP(data->nsd, nona); 4060 ZTATUP(data->nsd, data->query->zone, nona); 4061 } 4062 #endif /* BIND8_STATS */ 4063 4064 #ifdef USE_ZONE_STATS 4065 #ifndef INET6 4066 ZTATUP(data->nsd, data->query->zone, ctls); 4067 #else 4068 if (data->query->addr.ss_family == AF_INET) { 4069 ZTATUP(data->nsd, data->query->zone, ctls); 4070 } else if (data->query->addr.ss_family == AF_INET6) { 4071 ZTATUP(data->nsd, data->query->zone, ctls6); 4072 } 4073 #endif 4074 #endif /* USE_ZONE_STATS */ 4075 4076 query_add_optional(data->query, data->nsd); 4077 4078 /* Switch to the tcp write handler. */ 4079 buffer_flip(data->query->packet); 4080 data->query->tcplen = buffer_remaining(data->query->packet); 4081 #ifdef BIND8_STATS 4082 /* Account the rcode & TC... */ 4083 STATUP2(data->nsd, rcode, RCODE(data->query->packet)); 4084 ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet)); 4085 if (TC(data->query->packet)) { 4086 STATUP(data->nsd, truncated); 4087 ZTATUP(data->nsd, data->query->zone, truncated); 4088 } 4089 #endif /* BIND8_STATS */ 4090 #ifdef USE_DNSTAP 4091 dt_collector_submit_auth_response(data->nsd, &data->query->addr, 4092 data->query->addrlen, data->query->tcp, data->query->packet, 4093 data->query->zone); 4094 #endif /* USE_DNSTAP */ 4095 data->bytes_transmitted = 0; 4096 4097 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT); 4098 4099 /* see if we can write the answer right away(usually so,EAGAIN ifnot)*/ 4100 handle_tls_writing(fd, EV_WRITE, data); 4101 } 4102 4103 /** handle TLS writing of outgoing response */ 4104 static void 4105 handle_tls_writing(int fd, short event, void* arg) 4106 { 4107 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 4108 ssize_t sent; 4109 struct query *q = data->query; 4110 /* static variable that holds reassembly buffer used to put the 4111 * TCP length in front of the packet, like writev. */ 4112 static buffer_type* global_tls_temp_buffer = NULL; 4113 buffer_type* write_buffer; 4114 4115 if ((event & EV_TIMEOUT)) { 4116 /* Connection timed out. */ 4117 cleanup_tcp_handler(data); 4118 return; 4119 } 4120 4121 assert((event & EV_WRITE)); 4122 4123 if(data->shake_state != tls_hs_none) { 4124 if(!tls_handshake(data, fd, 1)) 4125 return; 4126 if(data->shake_state != tls_hs_none) 4127 return; 4128 } 4129 4130 (void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE); 4131 4132 /* If we are writing the start of a message, we must include the length 4133 * this is done with a copy into write_buffer. */ 4134 write_buffer = NULL; 4135 if (data->bytes_transmitted == 0) { 4136 if(!global_tls_temp_buffer) { 4137 /* gets deallocated when nsd shuts down from 4138 * nsd.region */ 4139 global_tls_temp_buffer = buffer_create(nsd.region, 4140 QIOBUFSZ + sizeof(q->tcplen)); 4141 if (!global_tls_temp_buffer) { 4142 return; 4143 } 4144 } 4145 write_buffer = global_tls_temp_buffer; 4146 buffer_clear(write_buffer); 4147 buffer_write_u16(write_buffer, q->tcplen); 4148 buffer_write(write_buffer, buffer_current(q->packet), 4149 (int)buffer_remaining(q->packet)); 4150 buffer_flip(write_buffer); 4151 } else { 4152 write_buffer = q->packet; 4153 } 4154 4155 /* Write the response */ 4156 ERR_clear_error(); 4157 sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer)); 4158 if(sent <= 0) { 4159 int want = SSL_get_error(data->tls, sent); 4160 if(want == SSL_ERROR_ZERO_RETURN) { 4161 cleanup_tcp_handler(data); 4162 /* closed */ 4163 } else if(want == SSL_ERROR_WANT_READ) { 4164 /* switch back to reading */ 4165 data->shake_state = tls_hs_read_event; 4166 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT); 4167 } else if(want != SSL_ERROR_WANT_WRITE) { 4168 cleanup_tcp_handler(data); 4169 log_crypto_err("could not SSL_write"); 4170 } 4171 return; 4172 } 4173 4174 buffer_skip(write_buffer, sent); 4175 if(buffer_remaining(write_buffer) != 0) { 4176 /* If not all sent, sync up the real buffer if it wasn't used.*/ 4177 if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) { 4178 buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen)); 4179 } 4180 } 4181 4182 data->bytes_transmitted += sent; 4183 if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) { 4184 /* 4185 * Still more data to write when socket becomes 4186 * writable again. 4187 */ 4188 return; 4189 } 4190 4191 assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen)); 4192 4193 if (data->query_state == QUERY_IN_AXFR) { 4194 /* Continue processing AXFR and writing back results. */ 4195 buffer_clear(q->packet); 4196 data->query_state = query_axfr(data->nsd, q); 4197 if (data->query_state != QUERY_PROCESSED) { 4198 query_add_optional(data->query, data->nsd); 4199 4200 /* Reset data. */ 4201 buffer_flip(q->packet); 4202 q->tcplen = buffer_remaining(q->packet); 4203 data->bytes_transmitted = 0; 4204 /* Reset to writing mode. */ 4205 tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT); 4206 4207 /* 4208 * Write data if/when the socket is writable 4209 * again. 4210 */ 4211 return; 4212 } 4213 } 4214 4215 /* 4216 * Done sending, wait for the next request to arrive on the 4217 * TCP socket by installing the TCP read handler. 4218 */ 4219 if (data->nsd->tcp_query_count > 0 && 4220 data->query_count >= data->nsd->tcp_query_count) { 4221 4222 (void) shutdown(fd, SHUT_WR); 4223 } 4224 4225 data->bytes_transmitted = 0; 4226 4227 tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT); 4228 } 4229 #endif 4230 4231 static void 4232 handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event), 4233 void* ATTR_UNUSED(arg)) 4234 { 4235 if(slowaccept) { 4236 configure_handler_event_types(EV_PERSIST | EV_READ); 4237 slowaccept = 0; 4238 } 4239 } 4240 4241 static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen) 4242 { 4243 #ifndef HAVE_ACCEPT4 4244 int s = accept(fd, addr, addrlen); 4245 if (s != -1) { 4246 if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) { 4247 log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno)); 4248 close(s); 4249 s = -1; 4250 errno=EINTR; /* stop error printout as error in accept4 4251 by setting this errno, it omits printout, in 4252 later code that calls nsd_accept4 */ 4253 } 4254 } 4255 return s; 4256 #else 4257 return accept4(fd, addr, addrlen, SOCK_NONBLOCK); 4258 #endif /* HAVE_ACCEPT4 */ 4259 } 4260 4261 /* 4262 * Handle an incoming TCP connection. The connection is accepted and 4263 * a new TCP reader event handler is added. The TCP handler 4264 * is responsible for cleanup when the connection is closed. 4265 */ 4266 static void 4267 handle_tcp_accept(int fd, short event, void* arg) 4268 { 4269 struct tcp_accept_handler_data *data 4270 = (struct tcp_accept_handler_data *) arg; 4271 int s; 4272 int reject = 0; 4273 struct tcp_handler_data *tcp_data; 4274 region_type *tcp_region; 4275 #ifdef INET6 4276 struct sockaddr_storage addr; 4277 #else 4278 struct sockaddr_in addr; 4279 #endif 4280 socklen_t addrlen; 4281 struct timeval timeout; 4282 4283 if (!(event & EV_READ)) { 4284 return; 4285 } 4286 4287 if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) { 4288 reject = data->nsd->options->tcp_reject_overflow; 4289 if (!reject) { 4290 return; 4291 } 4292 } 4293 4294 /* Accept it... */ 4295 addrlen = sizeof(addr); 4296 s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen); 4297 if (s == -1) { 4298 /** 4299 * EMFILE and ENFILE is a signal that the limit of open 4300 * file descriptors has been reached. Pause accept(). 4301 * EINTR is a signal interrupt. The others are various OS ways 4302 * of saying that the client has closed the connection. 4303 */ 4304 if (errno == EMFILE || errno == ENFILE) { 4305 if (!slowaccept) { 4306 /* disable accept events */ 4307 struct timeval tv; 4308 configure_handler_event_types(0); 4309 tv.tv_sec = SLOW_ACCEPT_TIMEOUT; 4310 tv.tv_usec = 0L; 4311 memset(&slowaccept_event, 0, 4312 sizeof(slowaccept_event)); 4313 event_set(&slowaccept_event, -1, EV_TIMEOUT, 4314 handle_slowaccept_timeout, NULL); 4315 (void)event_base_set(data->event.ev_base, 4316 &slowaccept_event); 4317 (void)event_add(&slowaccept_event, &tv); 4318 slowaccept = 1; 4319 /* We don't want to spam the logs here */ 4320 } 4321 } else if (errno != EINTR 4322 && errno != EWOULDBLOCK 4323 #ifdef ECONNABORTED 4324 && errno != ECONNABORTED 4325 #endif /* ECONNABORTED */ 4326 #ifdef EPROTO 4327 && errno != EPROTO 4328 #endif /* EPROTO */ 4329 ) { 4330 log_msg(LOG_ERR, "accept failed: %s", strerror(errno)); 4331 } 4332 return; 4333 } 4334 4335 if (reject) { 4336 shutdown(s, SHUT_RDWR); 4337 close(s); 4338 return; 4339 } 4340 4341 /* 4342 * This region is deallocated when the TCP connection is 4343 * closed by the TCP handler. 4344 */ 4345 tcp_region = region_create(xalloc, free); 4346 tcp_data = (struct tcp_handler_data *) region_alloc( 4347 tcp_region, sizeof(struct tcp_handler_data)); 4348 tcp_data->region = tcp_region; 4349 tcp_data->query = query_create(tcp_region, compressed_dname_offsets, 4350 compression_table_size, compressed_dnames); 4351 tcp_data->nsd = data->nsd; 4352 tcp_data->query_count = 0; 4353 #ifdef HAVE_SSL 4354 tcp_data->shake_state = tls_hs_none; 4355 tcp_data->tls = NULL; 4356 #endif 4357 tcp_data->prev = NULL; 4358 tcp_data->next = NULL; 4359 4360 tcp_data->query_state = QUERY_PROCESSED; 4361 tcp_data->bytes_transmitted = 0; 4362 memcpy(&tcp_data->query->addr, &addr, addrlen); 4363 tcp_data->query->addrlen = addrlen; 4364 4365 tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000; 4366 if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) { 4367 /* very busy, give smaller timeout */ 4368 tcp_data->tcp_timeout = 200; 4369 } 4370 memset(&tcp_data->event, 0, sizeof(tcp_data->event)); 4371 timeout.tv_sec = tcp_data->tcp_timeout / 1000; 4372 timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000; 4373 4374 #ifdef HAVE_SSL 4375 if (data->tls_accept) { 4376 tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s); 4377 if(!tcp_data->tls) { 4378 close(s); 4379 return; 4380 } 4381 tcp_data->shake_state = tls_hs_read; 4382 memset(&tcp_data->event, 0, sizeof(tcp_data->event)); 4383 event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT, 4384 handle_tls_reading, tcp_data); 4385 } else { 4386 #endif 4387 memset(&tcp_data->event, 0, sizeof(tcp_data->event)); 4388 event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT, 4389 handle_tcp_reading, tcp_data); 4390 #ifdef HAVE_SSL 4391 } 4392 #endif 4393 if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) { 4394 log_msg(LOG_ERR, "cannot set tcp event base"); 4395 close(s); 4396 region_destroy(tcp_region); 4397 return; 4398 } 4399 if(event_add(&tcp_data->event, &timeout) != 0) { 4400 log_msg(LOG_ERR, "cannot add tcp to event base"); 4401 close(s); 4402 region_destroy(tcp_region); 4403 return; 4404 } 4405 if(tcp_active_list) { 4406 tcp_active_list->prev = tcp_data; 4407 tcp_data->next = tcp_active_list; 4408 } 4409 tcp_active_list = tcp_data; 4410 4411 /* 4412 * Keep track of the total number of TCP handlers installed so 4413 * we can stop accepting connections when the maximum number 4414 * of simultaneous TCP connections is reached. 4415 * 4416 * If tcp-reject-overflow is enabled, however, then we do not 4417 * change the handler event type; we keep it as-is and accept 4418 * overflow TCP connections only so that we can forcibly kill 4419 * them off. 4420 */ 4421 ++data->nsd->current_tcp_count; 4422 if (!data->nsd->options->tcp_reject_overflow && 4423 data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) 4424 { 4425 configure_handler_event_types(0); 4426 } 4427 } 4428 4429 static void 4430 send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout) 4431 { 4432 size_t i; 4433 assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0); 4434 for (i = 0; i < nsd->child_count; ++i) { 4435 if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) { 4436 if (write(nsd->children[i].child_fd, 4437 &command, 4438 sizeof(command)) == -1) 4439 { 4440 if(errno != EAGAIN && errno != EINTR) 4441 log_msg(LOG_ERR, "problems sending command %d to server %d: %s", 4442 (int) command, 4443 (int) nsd->children[i].pid, 4444 strerror(errno)); 4445 } else if (timeout > 0) { 4446 (void)block_read(NULL, 4447 nsd->children[i].child_fd, 4448 &command, sizeof(command), timeout); 4449 } 4450 fsync(nsd->children[i].child_fd); 4451 close(nsd->children[i].child_fd); 4452 nsd->children[i].child_fd = -1; 4453 } 4454 } 4455 } 4456 4457 static void 4458 send_children_quit(struct nsd* nsd) 4459 { 4460 DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit")); 4461 send_children_command(nsd, NSD_QUIT, 0); 4462 } 4463 4464 static void 4465 send_children_quit_and_wait(struct nsd* nsd) 4466 { 4467 DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait")); 4468 send_children_command(nsd, NSD_QUIT_CHILD, 3); 4469 } 4470 4471 #ifdef BIND8_STATS 4472 static void 4473 set_children_stats(struct nsd* nsd) 4474 { 4475 size_t i; 4476 assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0); 4477 DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children")); 4478 for (i = 0; i < nsd->child_count; ++i) { 4479 nsd->children[i].need_to_send_STATS = 1; 4480 nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE; 4481 } 4482 } 4483 #endif /* BIND8_STATS */ 4484 4485 static void 4486 configure_handler_event_types(short event_types) 4487 { 4488 size_t i; 4489 4490 for (i = 0; i < tcp_accept_handler_count; ++i) { 4491 struct event* handler = &tcp_accept_handlers[i].event; 4492 if(event_types) { 4493 /* reassign */ 4494 int fd = handler->ev_fd; 4495 struct event_base* base = handler->ev_base; 4496 if(tcp_accept_handlers[i].event_added) 4497 event_del(handler); 4498 memset(handler, 0, sizeof(*handler)); 4499 event_set(handler, fd, event_types, 4500 handle_tcp_accept, &tcp_accept_handlers[i]); 4501 if(event_base_set(base, handler) != 0) 4502 log_msg(LOG_ERR, "conhand: cannot event_base"); 4503 if(event_add(handler, NULL) != 0) 4504 log_msg(LOG_ERR, "conhand: cannot event_add"); 4505 tcp_accept_handlers[i].event_added = 1; 4506 } else { 4507 /* remove */ 4508 if(tcp_accept_handlers[i].event_added) { 4509 event_del(handler); 4510 tcp_accept_handlers[i].event_added = 0; 4511 } 4512 } 4513 } 4514 } 4515