/*
 * server.c -- nsd(8) network input/output
 *
 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
 *
 * See LICENSE for the license.
 *
 */

#include "config.h"

#include <sys/types.h>
#include <sys/param.h>
#include <limits.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <netinet/in.h>
#ifdef USE_TCP_FASTOPEN
#include <netinet/tcp.h>
#endif
#include <arpa/inet.h>

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include <netdb.h>
#include <poll.h>
#ifdef HAVE_SYS_RANDOM_H
#include <sys/random.h>
#endif
#ifndef SHUT_WR
#define SHUT_WR 1
#endif
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif /* HAVE_MMAP */
#ifdef HAVE_OPENSSL_RAND_H
#include <openssl/rand.h>
#endif
#ifdef HAVE_OPENSSL_SSL_H
#include <openssl/ssl.h>
#endif
#ifdef HAVE_OPENSSL_ERR_H
#include <openssl/err.h>
#endif
#ifdef HAVE_OPENSSL_OCSP_H
#include <openssl/ocsp.h>
#endif
#ifndef USE_MINI_EVENT
# ifdef HAVE_EVENT_H
#  include <event.h>
# else
#  include <event2/event.h>
#  include "event2/event_struct.h"
#  include "event2/event_compat.h"
# endif
#else
# include "mini_event.h"
#endif

#include "axfr.h"
#include "namedb.h"
#include "netio.h"
#include "xfrd.h"
#include "xfrd-tcp.h"
#include "xfrd-disk.h"
#include "difffile.h"
#include "nsec3.h"
#include "ipc.h"
#include "udb.h"
#include "remote.h"
#include "lookup3.h"
#include "rrl.h"
#ifdef USE_DNSTAP
#include "dnstap/dnstap_collector.h"
#endif

#define RELOAD_SYNC_TIMEOUT 25 /* seconds */

#ifdef USE_TCP_FASTOPEN
#define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
#define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
#endif

/*
 * Data for the UDP handlers.
 */
struct udp_handler_data
{
	struct nsd *nsd;
	struct nsd_socket *socket;
	struct event event;
};

struct tcp_accept_handler_data {
	struct nsd *nsd;
	struct nsd_socket *socket;
	int event_added;
	struct event event;
#ifdef HAVE_SSL
	/* handler accepts TLS connections on the dedicated port */
	int tls_accept;
#endif
};

/*
 * These globals are used to enable the TCP accept handlers
 * when the number of TCP connections drops below the maximum
 * number of TCP connections.
 */
static size_t tcp_accept_handler_count;
static struct tcp_accept_handler_data *tcp_accept_handlers;

static struct event slowaccept_event;
static int slowaccept;

#ifdef HAVE_SSL
static unsigned char *ocspdata = NULL;
static long ocspdata_len = 0;
#endif

#ifdef NONBLOCKING_IS_BROKEN
/* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
   read multiple times from a socket when reported ready by select. */
# define NUM_RECV_PER_SELECT (1)
#else /* !NONBLOCKING_IS_BROKEN */
# define NUM_RECV_PER_SELECT (100)
#endif /* NONBLOCKING_IS_BROKEN */

#ifndef HAVE_MMSGHDR
struct mmsghdr {
	struct msghdr msg_hdr;
	unsigned int msg_len;
};
#endif

static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
static struct iovec iovecs[NUM_RECV_PER_SELECT];
static struct query *queries[NUM_RECV_PER_SELECT];
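
/*
 * Illustrative sketch (not part of NSD): how arrays like msgs/iovecs above
 * can be used to drain up to NUM_RECV_PER_SELECT datagrams in a single
 * wakeup with recvmmsg(2), where available (Linux). The buffer sizes and
 * the process_datagram() helper are hypothetical stand-ins.
 */
#if 0
static void
drain_udp_example(int fd)
{
	unsigned char bufs[NUM_RECV_PER_SELECT][512];
	int i, n;
	for(i = 0; i < NUM_RECV_PER_SELECT; i++) {
		iovecs[i].iov_base = bufs[i];
		iovecs[i].iov_len = sizeof(bufs[i]);
		memset(&msgs[i].msg_hdr, 0, sizeof(msgs[i].msg_hdr));
		msgs[i].msg_hdr.msg_iov = &iovecs[i];
		msgs[i].msg_hdr.msg_iovlen = 1;
	}
	/* one system call, up to NUM_RECV_PER_SELECT datagrams */
	n = recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
	for(i = 0; i < n; i++) {
		/* msg_len holds the size of the i-th received datagram */
		process_datagram(bufs[i], msgs[i].msg_len); /* hypothetical */
	}
}
#endif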

/*
 * Data for the TCP connection handlers.
 *
 * The TCP handlers use non-blocking I/O. This is necessary to avoid
 * blocking the entire server on a slow TCP connection, but does make
 * reading from and writing to the socket more complicated.
 *
 * Basically, whenever a read/write would block (indicated by the
 * EAGAIN errno variable) we remember the position we were reading
 * from/writing to and return from the TCP reading/writing event
 * handler. When the socket becomes readable/writable again we
 * continue from the same position.
 */
struct tcp_handler_data
{
	/*
	 * The region used to allocate all TCP connection related
	 * data, including this structure. This region is destroyed
	 * when the connection is closed.
	 */
	region_type* region;

	/*
	 * The global nsd structure.
	 */
	struct nsd* nsd;

	/*
	 * The current query data for this TCP connection.
	 */
	query_type* query;

	/*
	 * The query_state is used to remember if we are performing an
	 * AXFR, if we're done processing, or if we should discard the
	 * query and connection.
	 */
	query_state_type query_state;

	/*
	 * The event for the file descriptor and tcp timeout
	 */
	struct event event;

	/*
	 * The bytes_transmitted field is used to remember the number
	 * of bytes transmitted when receiving or sending a DNS
	 * packet. The count includes the two additional bytes used
	 * to specify the packet length on a TCP connection.
	 */
	size_t bytes_transmitted;

	/*
	 * The number of queries handled by this specific TCP connection.
	 */
	int query_count;

	/*
	 * The timeout in msec for this tcp connection
	 */
	int tcp_timeout;
#ifdef HAVE_SSL
	/*
	 * TLS object.
	 */
	SSL* tls;

	/*
	 * TLS handshake state.
	 */
	enum { tls_hs_none, tls_hs_read, tls_hs_write,
		tls_hs_read_event, tls_hs_write_event } shake_state;
#endif
	/* list of connections, for service of remaining tcp channels */
	struct tcp_handler_data *prev, *next;
};
/* global that is the list of active tcp channels */
static struct tcp_handler_data *tcp_active_list = NULL;
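
/*
 * Illustrative sketch (not part of NSD): the resume-on-EAGAIN bookkeeping
 * described above, shown for the 2-byte length prefix that precedes every
 * DNS message on a TCP connection. Function name is hypothetical.
 */
#if 0
static int
read_tcp_length_example(struct tcp_handler_data *data, int fd)
{
	uint16_t nlen;
	ssize_t r;
	/* the first two bytes on the wire carry the query length */
	while(data->bytes_transmitted < sizeof(nlen)) {
		r = read(fd, (char*)&nlen + data->bytes_transmitted,
			sizeof(nlen) - data->bytes_transmitted);
		if(r == -1 && (errno == EAGAIN || errno == EINTR))
			return 0; /* retry when the fd is readable again */
		if(r <= 0)
			return -1; /* error or EOF: discard the connection */
		data->bytes_transmitted += r;
	}
	/* ... then read ntohs(nlen) payload bytes, continuing from
	 * bytes_transmitted - 2 on every subsequent wakeup ... */
	return 1;
}
#endif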

/*
 * Handle incoming queries on the UDP server sockets.
 */
static void handle_udp(int fd, short event, void* arg);

/*
 * Handle incoming connections on the TCP sockets. These handlers
 * usually wait for the NETIO_EVENT_READ event (indicating an incoming
 * connection) but are disabled when the number of current TCP
 * connections is equal to the maximum number of TCP connections.
 * Disabling is done by changing the handler to wait for the
 * NETIO_EVENT_NONE type. This is done using the function
 * configure_tcp_accept_handlers.
 */
static void handle_tcp_accept(int fd, short event, void* arg);

/*
 * Handle incoming queries on a TCP connection. The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete query is received.
 */
static void handle_tcp_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TCP connection. The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete response is sent.
 */
static void handle_tcp_writing(int fd, short event, void* arg);

#ifdef HAVE_SSL
/* Create SSL object and associate fd */
static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
/*
 * Handle TLS handshake. May be called multiple times if incomplete.
 */
static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);

/*
 * Handle incoming queries on a TLS over TCP connection. The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete query is received.
 */
static void handle_tls_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TLS over TCP connection. The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete response is sent.
 */
static void handle_tls_writing(int fd, short event, void* arg);
#endif

/*
 * Send all children the quit nonblocking, then close pipe.
 */
static void send_children_quit(struct nsd* nsd);
/* same, for shutdown time, waits for child to exit to avoid restart issues */
static void send_children_quit_and_wait(struct nsd* nsd);

/* set children's flags to send NSD_STATS to them */
#ifdef BIND8_STATS
static void set_children_stats(struct nsd* nsd);
#endif /* BIND8_STATS */
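
/*
 * Illustrative sketch (not part of NSD): with libevent, disabling and
 * re-enabling the accept handlers around the connection limit amounts to
 * event_del()/event_add() on each accept event, tracked by event_added.
 * The function name is hypothetical; this is not the actual NSD routine.
 */
#if 0
static void
toggle_accept_handlers_example(int enable)
{
	size_t i;
	for(i = 0; i < tcp_accept_handler_count; i++) {
		if(enable && !tcp_accept_handlers[i].event_added) {
			event_add(&tcp_accept_handlers[i].event, NULL);
			tcp_accept_handlers[i].event_added = 1;
		} else if(!enable && tcp_accept_handlers[i].event_added) {
			event_del(&tcp_accept_handlers[i].event);
			tcp_accept_handlers[i].event_added = 0;
		}
	}
}
#endif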

/*
 * Change the event types the HANDLERS are interested in to EVENT_TYPES.
 */
static void configure_handler_event_types(short event_types);

static uint16_t *compressed_dname_offsets = 0;
static uint32_t compression_table_capacity = 0;
static uint32_t compression_table_size = 0;
static domain_type* compressed_dnames[MAXRRSPP];

#ifdef USE_TCP_FASTOPEN
/* Checks to see if the kernel value must be manually changed in order for
   TCP Fast Open to support server mode */
static void report_tcp_fastopen_config(void)
{
	int tcp_fastopen_fp;
	uint8_t tcp_fastopen_value;

	if((tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1) {
		log_msg(LOG_INFO, "Error opening " TCP_FASTOPEN_FILE ": %s\n",
			strerror(errno));
		return;
	}
	if(read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1) {
		log_msg(LOG_INFO, "Error reading " TCP_FASTOPEN_FILE ": %s\n",
			strerror(errno));
		close(tcp_fastopen_fp);
		return;
	}
	if(!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
		log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n");
		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
		log_msg(LOG_WARNING, "To enable TFO use the command:");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
	}
	close(tcp_fastopen_fp);
}
#endif

/*
 * Remove the specified pid from the list of child pids. Returns -1 if
 * the pid is not in the list, child_num otherwise. The field is set to 0.
 */
static int
delete_child_pid(struct nsd *nsd, pid_t pid)
{
	size_t i;
	for(i = 0; i < nsd->child_count; ++i) {
		if(nsd->children[i].pid == pid) {
			nsd->children[i].pid = 0;
			if(!nsd->children[i].need_to_exit) {
				if(nsd->children[i].child_fd != -1)
					close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
				if(nsd->children[i].handler)
					nsd->children[i].handler->fd = -1;
			}
			return i;
		}
	}
	return -1;
}

/*
 * Restart child servers if necessary.
 */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	struct main_ipc_handler_data *ipc_data;
	size_t i;
	int sv[2];

	/* Fork the child processes... */
	for(i = 0; i < nsd->child_count; ++i) {
		if(nsd->children[i].pid <= 0) {
			if(nsd->children[i].child_fd != -1)
				close(nsd->children[i].child_fd);
			if(socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
				log_msg(LOG_ERR, "socketpair: %s",
					strerror(errno));
				return -1;
			}
			nsd->children[i].child_fd = sv[0];
			nsd->children[i].parent_fd = sv[1];
			nsd->children[i].pid = fork();
			switch(nsd->children[i].pid) {
			default: /* SERVER MAIN */
				close(nsd->children[i].parent_fd);
				nsd->children[i].parent_fd = -1;
				if(fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				if(!nsd->children[i].handler)
				{
					ipc_data = (struct main_ipc_handler_data*) region_alloc(
						region, sizeof(struct main_ipc_handler_data));
					ipc_data->nsd = nsd;
					ipc_data->child = &nsd->children[i];
					ipc_data->child_num = i;
					ipc_data->xfrd_sock = xfrd_sock_p;
					ipc_data->packet = buffer_create(region, QIOBUFSZ);
					ipc_data->forward_mode = 0;
					ipc_data->got_bytes = 0;
					ipc_data->total_bytes = 0;
					ipc_data->acl_num = 0;
					nsd->children[i].handler = (struct netio_handler*) region_alloc(
						region, sizeof(struct netio_handler));
					nsd->children[i].handler->fd = nsd->children[i].child_fd;
					nsd->children[i].handler->timeout = NULL;
					nsd->children[i].handler->user_data = ipc_data;
					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
					nsd->children[i].handler->event_handler = parent_handle_child_command;
					netio_add_handler(netio, nsd->children[i].handler);
				}
				/* clear any ongoing ipc */
				ipc_data = (struct main_ipc_handler_data*)
					nsd->children[i].handler->user_data;
				ipc_data->forward_mode = 0;
				/* restart - update fd */
				nsd->children[i].handler->fd = nsd->children[i].child_fd;
				break;
			case 0: /* CHILD */
				/* the child need not be able to access the
				 * nsd.db file */
				namedb_close_udb(nsd->db);
#ifdef MEMCLEAN /* OS collects memory pages */
				region_destroy(region);
#endif

				if(pledge("stdio rpath inet", NULL) == -1) {
					log_msg(LOG_ERR, "pledge");
					exit(1);
				}

				nsd->pid = 0;
				nsd->child_count = 0;
				nsd->server_kind = nsd->children[i].kind;
				nsd->this_child = &nsd->children[i];
				nsd->this_child->child_num = i;
				/* remove signal flags inherited from parent;
				   the parent will handle them. */
				nsd->signal_hint_reload_hup = 0;
				nsd->signal_hint_reload = 0;
				nsd->signal_hint_child = 0;
				nsd->signal_hint_quit = 0;
				nsd->signal_hint_shutdown = 0;
				nsd->signal_hint_stats = 0;
				nsd->signal_hint_statsusr = 0;
				close(*xfrd_sock_p);
				close(nsd->this_child->child_fd);
				nsd->this_child->child_fd = -1;
				if(fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				server_child(nsd);
				/* NOTREACH */
				exit(0);
			case -1:
				log_msg(LOG_ERR, "fork failed: %s",
					strerror(errno));
				return -1;
			}
		}
	}
	return 0;
}
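
/*
 * Illustrative sketch (not part of NSD): the socketpair()/fork() IPC
 * pattern used above, reduced to its core. Each side keeps one end of the
 * pair, closes the other, and makes its end non-blocking.
 */
#if 0
static void
ipc_pattern_example(void)
{
	int sv[2];
	if(socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1)
		return;
	switch(fork()) {
	default: /* parent keeps sv[0] and talks to the child over it */
		close(sv[1]);
		(void)fcntl(sv[0], F_SETFL, O_NONBLOCK);
		break;
	case 0: /* child keeps sv[1] */
		close(sv[0]);
		(void)fcntl(sv[1], F_SETFL, O_NONBLOCK);
		break;
	case -1: /* fork failed; both ends must be closed */
		close(sv[0]);
		close(sv[1]);
		break;
	}
}
#endif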

#ifdef BIND8_STATS
static void
set_bind8_alarm(struct nsd* nsd)
{
	/* resync so that the next alarm is on the next whole minute */
	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
}
#endif

/* set zone stat ids for zones initially read in */
static void
zonestatid_tree_set(struct nsd* nsd)
{
	struct radnode* n;
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		zone_type* zone = (zone_type*)n->elem;
		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
	}
}

#ifdef USE_ZONE_STATS
void
server_zonestat_alloc(struct nsd* nsd)
{
	size_t num = (nsd->options->zonestatnames->count==0?1:
			nsd->options->zonestatnames->count);
	size_t sz = sizeof(struct nsdst)*num;
	char tmpfile[256];
	uint8_t z = 0;

	/* file names */
	nsd->zonestatfname[0] = 0;
	nsd->zonestatfname[1] = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);

	/* file descriptors */
	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[0] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		close(nsd->zonestatfd[0]);
		unlink(nsd->zonestatfname[0]);
		exit(1);
	}

#ifdef HAVE_MMAP
	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[0], strerror(errno));
		exit(1);
	}
	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[1], strerror(errno));
		exit(1);
	}
	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[0], 0);
	if(nsd->zonestat[0] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[1], 0);
	if(nsd->zonestat[1] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->zonestat[0], 0, sz);
	memset(nsd->zonestat[1], 0, sz);
	nsd->zonestatsize[0] = num;
	nsd->zonestatsize[1] = num;
	nsd->zonestatdesired = num;
	nsd->zonestatsizenow = num;
	nsd->zonestatnow = nsd->zonestat[0];
#endif /* HAVE_MMAP */
}
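
/*
 * Note on the idiom above (not in the original sources): a freshly created
 * file has length zero, and accessing a MAP_SHARED mapping beyond end of
 * file raises SIGBUS, so the file is first grown to sz bytes by seeking to
 * sz-1 and writing one byte. A sketch of the equivalent using POSIX
 * ftruncate(2) (an assumption; NSD itself uses the lseek+write form):
 */
#if 0
static void*
map_stat_file_example(int fd, size_t sz)
{
	/* grow the file so that all sz mapped bytes are backed */
	if(ftruncate(fd, (off_t)sz) == -1)
		return NULL;
	return mmap(NULL, sz, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
}
#endif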

void
zonestat_remap(struct nsd* nsd, int idx, size_t sz)
{
#ifdef HAVE_MMAP
#ifdef MREMAP_MAYMOVE
	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
		MREMAP_MAYMOVE);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
		exit(1);
	}
#else /* !HAVE MREMAP */
	if(msync(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
	if(munmap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		exit(1);
	}
#endif /* MREMAP */
#endif /* HAVE_MMAP */
}

/* realloc the zonestat array for the one that is not currently in use,
 * to match the desired new size of the array (if applicable) */
void
server_zonestat_realloc(struct nsd* nsd)
{
#ifdef HAVE_MMAP
	uint8_t z = 0;
	size_t sz;
	int idx = 0; /* index of the zonestat array that is not in use */
	if(nsd->zonestatnow == nsd->zonestat[0])
		idx = 1;
	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
		return;
	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[idx], strerror(errno));
		exit(1);
	}
	zonestat_remap(nsd, idx, sz);
	/* zero the newly allocated region */
	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
	}
	nsd->zonestatsize[idx] = nsd->zonestatdesired;
#endif /* HAVE_MMAP */
}

/* switch over to the other array for the new children, which briefly
 * coexist with the old children; this avoids both generations writing
 * to the same statistics array. */
void
server_zonestat_switch(struct nsd* nsd)
{
	if(nsd->zonestatnow == nsd->zonestat[0]) {
		nsd->zonestatnow = nsd->zonestat[1];
		nsd->zonestatsizenow = nsd->zonestatsize[1];
	} else {
		nsd->zonestatnow = nsd->zonestat[0];
		nsd->zonestatsizenow = nsd->zonestatsize[0];
	}
}
#endif /* USE_ZONE_STATS */

static void
cleanup_dname_compression_tables(void *ptr)
{
	free(ptr);
	compressed_dname_offsets = NULL;
	compression_table_capacity = 0;
}

static void
initialize_dname_compression_tables(struct nsd *nsd)
{
	size_t needed = domain_table_count(nsd->db->domains) + 1;
	needed += EXTRA_DOMAIN_NUMBERS;
	if(compression_table_capacity < needed) {
		if(compressed_dname_offsets) {
			region_remove_cleanup(nsd->db->region,
				cleanup_dname_compression_tables,
				compressed_dname_offsets);
			free(compressed_dname_offsets);
		}
		compressed_dname_offsets = (uint16_t *) xmallocarray(
			needed, sizeof(uint16_t));
		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
			compressed_dname_offsets);
		compression_table_capacity = needed;
		compression_table_size=domain_table_count(nsd->db->domains)+1;
	}
	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
}

static int
set_cloexec(struct nsd_socket *sock)
{
	assert(sock != NULL);

	if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) {
		const char *socktype =
			sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
		log_msg(LOG_ERR, "fcntl(..., FD_CLOEXEC) failed for %s: %s",
			socktype, strerror(errno));
		return -1;
	}

	return 1;
}

static int
set_reuseport(struct nsd_socket *sock)
{
#ifdef SO_REUSEPORT
	int on = 1;
#ifdef SO_REUSEPORT_LB
	/* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like
	 * SO_REUSEPORT does on Linux. This is what the user wants with the
	 * config option in nsd.conf; if local address and port reuse is
	 * actually needed, SO_REUSEPORT must be set as well, so assume it
	 * is _LB they want.
	 */
	int opt = SO_REUSEPORT_LB;
	static const char optname[] = "SO_REUSEPORT_LB";
#else /* !SO_REUSEPORT_LB */
	int opt = SO_REUSEPORT;
	static const char optname[] = "SO_REUSEPORT";
#endif /* SO_REUSEPORT_LB */

	if(0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) {
		return 1;
	} else if(verbosity >= 3 || errno != ENOPROTOOPT) {
		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
			optname, strerror(errno));
	}
	return -1;
#else
	(void)sock;
#endif /* SO_REUSEPORT */

	return 0;
}

static int
set_reuseaddr(struct nsd_socket *sock)
{
#ifdef SO_REUSEADDR
	int on = 1;
	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_REUSEADDR */
	return 0;
}

static int
set_rcvbuf(struct nsd_socket *sock, int rcv)
{
#ifdef SO_RCVBUF
#ifdef SO_RCVBUFFORCE
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	if(errno == EPERM || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
		strerror(errno));
	return -1;
#else /* !SO_RCVBUFFORCE */
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	if(errno == ENOSYS || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_RCVBUFFORCE */
#endif /* SO_RCVBUF */

	return 0;
}
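
/*
 * Note (not in the original sources): on Linux, SO_RCVBUFFORCE and
 * SO_SNDBUFFORCE may exceed the net.core.rmem_max/wmem_max sysctl limits,
 * but they require CAP_NET_ADMIN. That is why EPERM is treated as
 * non-fatal above and in set_sndbuf below: without the capability the
 * buffer simply keeps its default size instead of aborting socket setup.
 */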

static int
set_sndbuf(struct nsd_socket *sock, int snd)
{
#ifdef SO_SNDBUF
#ifdef SO_SNDBUFFORCE
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
	{
		return 1;
	}
	if(errno == EPERM || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
		strerror(errno));
	return -1;
#else /* !SO_SNDBUFFORCE */
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
	{
		return 1;
	}
	if(errno == ENOSYS || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_SNDBUFFORCE */
#endif /* SO_SNDBUF */

	return 0;
}

static int
set_nonblock(struct nsd_socket *sock)
{
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";

	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
		log_msg(LOG_ERR, "fcntl(..., O_NONBLOCK) failed for %s: %s",
			socktype, strerror(errno));
		return -1;
	}

	return 1;
}

static int
set_ipv6_v6only(struct nsd_socket *sock)
{
#ifdef INET6
#ifdef IPV6_V6ONLY
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";

	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#endif /* IPV6_V6ONLY */
#endif /* INET6 */

	return 0;
}

static int
set_ipv6_use_min_mtu(struct nsd_socket *sock)
{
#if defined(INET6) && (defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU))
#if defined(IPV6_USE_MIN_MTU)
	/* There is no fragmentation of IPv6 datagrams during forwarding in the
	 * network. Therefore we do not send UDP datagrams larger than the
	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
	 * larger if the network stack supports IPV6_USE_MIN_MTU.
	 */
	int opt = IPV6_USE_MIN_MTU;
	int optval = 1;
	static const char optname[] = "IPV6_USE_MIN_MTU";
#elif defined(IPV6_MTU)
	/* On Linux, PMTUD is disabled by default for datagrams, so set the
	 * MTU to the minimum MTU to get the same effect.
	 */
	int opt = IPV6_MTU;
	int optval = IPV6_MIN_MTU;
	static const char optname[] = "IPV6_MTU";
#endif
	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
		optname, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* INET6 */

	return 0;
}

static int
set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
{
	int ret = 0;

#if defined(IP_MTU_DISCOVER)
	int opt = IP_MTU_DISCOVER;
	int optval;
# if defined(IP_PMTUDISC_OMIT)
	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
	 * information and send packets with DF=0. Fragmentation is allowed
	 * if and only if the packet size exceeds the outgoing interface MTU
	 * or the packet encounters a smaller MTU link in the network. This
	 * mitigates DNS fragmentation attacks by preventing forged PMTU
	 * information. FreeBSD already has the same semantics without
	 * setting the option.
	 */
	optval = IP_PMTUDISC_OMIT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
# endif /* IP_PMTUDISC_OMIT */
# if defined(IP_PMTUDISC_DONT)
	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / is undefined. */
	optval = IP_PMTUDISC_DONT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
# endif
	ret = -1;
#elif defined(IP_DONTFRAG)
	int off = 0;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
		strerror(errno));
	ret = -1;
#else
	(void)sock;
#endif

	return ret;
}

static int
set_ip_freebind(struct nsd_socket *sock)
{
#ifdef IP_FREEBIND
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
	{
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IP_FREEBIND */

	return 0;
}

static int
set_ip_transparent(struct nsd_socket *sock)
{
	/*
	 * The scandalous preprocessor blob here calls for some explanation :)
	 * POSIX does not specify an option to bind non-local IPs, so
	 * platforms developed several implementation-specific options,
	 * all set in the same way, but with different names.
	 * For additional complexity, some platforms manage this setting
	 * differently for different address families (IPv4 vs IPv6).
	 * The scandalous preprocessor blob below abstracts such variability
	 * in a way that leaves the C code as lean and clear as possible.
	 */

#if defined(IP_TRANSPARENT)
# define NSD_SOCKET_OPTION_TRANSPARENT IP_TRANSPARENT
# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP
# define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_TRANSPARENT"
	/* as of 2020-01, Linux does not support this on IPv6 programmatically */
#elif defined(SO_BINDANY)
# define NSD_SOCKET_OPTION_TRANSPARENT SO_BINDANY
# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL SOL_SOCKET
# define NSD_SOCKET_OPTION_TRANSPARENT_NAME "SO_BINDANY"
#elif defined(IP_BINDANY)
# define NSD_SOCKET_OPTION_TRANSPARENT IP_BINDANY
# define NSD_SOCKET_OPTION_TRANSPARENT6 IPV6_BINDANY
# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP
# define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 IPPROTO_IPV6
# define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_BINDANY"
#endif

#ifndef NSD_SOCKET_OPTION_TRANSPARENT
	(void)sock;
#else
# ifndef NSD_SOCKET_OPTION_TRANSPARENT6
#  define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
# endif
# ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
#  define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
# endif
# ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
#  define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
# endif

	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	const int is_ip6 = (sock->addr.ai_family == AF_INET6);

	if(0 == setsockopt(
		sock->s,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
		&on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME,
		socktype, strerror(errno));
	return -1;
#endif

	return 0;
}

static int
set_tcp_maxseg(struct nsd_socket *sock, int mss)
{
#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
		strerror(errno));
	return -1;
#else
	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
#endif
	return 0;
}

#ifdef USE_TCP_FASTOPEN
static int
set_tcp_fastopen(struct nsd_socket *sock)
{
	/* qlen specifies how many outstanding TFO requests to allow. Limit is
	 * a defense against IP spoofing attacks as suggested in RFC 7413.
	 */
	int qlen;

#ifdef __APPLE__
	/* The macOS implementation only supports qlen of 1 via this call. The
	 * actual value is configured by the net.inet.tcp.fastopen_backlog
	 * kernel parameter.
	 */
	qlen = 1;
#else
	/* 5 is recommended on Linux. */
	qlen = 5;
#endif
	if(0 == setsockopt(
		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
	{
		return 1;
	}

	if(errno == EPERM) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s "
				 "; this could likely be because sysctl "
				 "net.inet.tcp.fastopen.enabled, "
				 "net.inet.tcp.fastopen.server_enable, or "
				 "net.ipv4.tcp_fastopen is disabled",
			strerror(errno));
	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
	 * disabled, except when verbosity enabled for debugging
	 */
	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
			strerror(errno));
	}

	return (errno == ENOPROTOOPT ? 0 : -1);
}
#endif /* USE_TCP_FASTOPEN */
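
/*
 * Side note (not part of the original sources): the server side above only
 * needs the TCP_FASTOPEN listen option; a client wanting to use TFO would,
 * on Linux, send its first payload during the handshake instead of calling
 * connect(2), roughly as sketched below (names are hypothetical).
 */
#if 0
static ssize_t
tfo_client_send_example(int fd, const void *buf, size_t len,
	const struct sockaddr *addr, socklen_t addrlen)
{
	/* the kernel carries buf in the SYN when a TFO cookie is cached,
	 * and otherwise falls back to a regular connect-then-send */
	return sendto(fd, buf, len, MSG_FASTOPEN, addr, addrlen);
}
#endif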

static int
set_bindtodevice(struct nsd_socket *sock)
{
#if defined(SO_BINDTODEVICE)
	if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
		sock->device, strlen(sock->device)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
			"SO_BINDTODEVICE", sock->device, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}

static int
set_setfib(struct nsd_socket *sock)
{
#if defined(SO_SETFIB)
	if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
		(const void *)&sock->fib, sizeof(sock->fib)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
			"SO_SETFIB", sock->fib, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}

static int
open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
	int rcv = 1*1024*1024, snd = 1*1024*1024;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
				"not supported");
			return 0;
		}
#endif
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	if(nsd->options->receive_buffer_size > 0)
		rcv = nsd->options->receive_buffer_size;
	if(set_rcvbuf(sock, rcv) == -1)
		return -1;

	if(nsd->options->send_buffer_size > 0)
		snd = nsd->options->send_buffer_size;
	if(set_sndbuf(sock, snd) == -1)
		return -1;
#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if(set_ipv6_v6only(sock) == -1 ||
		   set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	} else
#endif /* INET6 */
	if(sock->addr.ai_family == AF_INET) {
		if(set_ipv4_no_pmtu_disc(sock) == -1)
			return -1;
	}

	/* Set socket to non-blocking. Otherwise, on operating systems
	 * with thundering herd problems, the UDP recv could block
	 * after select returns readable.
	 */
	set_nonblock(sock);

	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

	return 1;
}

static int
open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
#ifdef USE_TCP_FASTOPEN
	report_tcp_fastopen_config();
#endif

	(void)reuseport_works;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
				"not supported");
			return 0;
		}
#endif /* INET6 */
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	(void)set_reuseaddr(sock);

#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if(set_ipv6_v6only(sock) == -1 ||
		   set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	}
#endif

	if(nsd->tcp_mss > 0)
		set_tcp_maxseg(sock, nsd->tcp_mss);
	/* (StevensUNP p463), if TCP listening socket is blocking, then
	   it may block in accept, even if select() says readable. */
	(void)set_nonblock(sock);
	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

#ifdef USE_TCP_FASTOPEN
	(void)set_tcp_fastopen(sock);
#endif

	if(listen(sock->s, TCP_BACKLOG) == -1) {
		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
		return -1;
	}

	return 1;
}

/*
 * Initialize the server, reuseport, create and bind the sockets.
 */
int
server_init(struct nsd *nsd)
{
	size_t i;
	int reuseport = 1; /* Determine if REUSEPORT works. */

	/* open server interface ports */
	for(i = 0; i < nsd->ifs; i++) {
		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
		{
			return -1;
		}
	}

	if(nsd->reuseport && reuseport) {
		size_t ifs = nsd->ifs * nsd->reuseport;

		/* increase the size of the interface arrays, there are going
		 * to be separate interface file descriptors for every server
		 * instance */
		region_remove_cleanup(nsd->region, free, nsd->udp);
		region_remove_cleanup(nsd->region, free, nsd->tcp);

		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
		region_add_cleanup(nsd->region, free, nsd->udp);
		region_add_cleanup(nsd->region, free, nsd->tcp);
		if(ifs > nsd->ifs) {
			memset(&nsd->udp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->udp));
			memset(&nsd->tcp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
		}

		for(i = nsd->ifs; i < ifs; i++) {
			nsd->udp[i] = nsd->udp[i%nsd->ifs];
			nsd->udp[i].s = -1;
			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
				return -1;
			}
			/* Turn off REUSEPORT for TCP by copying the socket
			 * file descriptor.
			 * This means we should not close TCP used by
			 * other servers in reuseport enabled mode, in
			 * server_child().
			 */
			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
		}

		nsd->ifs = ifs;
	} else {
		nsd->reuseport = 0;
	}

	return 0;
}
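
/*
 * Illustrative sketch (not part of NSD): the effect of the loop above is
 * one UDP socket per server instance, each bound to the same address with
 * SO_REUSEPORT set, so the kernel spreads incoming datagrams over the
 * group. The function and its parameters are hypothetical.
 */
#if 0
static void
reuseport_example(const struct sockaddr *addr, socklen_t addrlen, int workers)
{
	int i, on = 1;
	for(i = 0; i < workers; i++) {
		int fd = socket(addr->sa_family, SOCK_DGRAM, 0);
		/* every worker binds its own fd to the same address */
		setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on));
		bind(fd, addr, addrlen);
		/* ... hand fd to worker i ... */
	}
}
#endif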

/*
 * Prepare the server for take off.
 *
 */
int
server_prepare(struct nsd *nsd)
{
#ifdef RATELIMIT
	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
#ifdef HAVE_GETRANDOM
	uint32_t v;
	if(getrandom(&v, sizeof(v), 0) == -1) {
		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
		exit(1);
	}
	hash_set_raninit(v);
#elif defined(HAVE_ARC4RANDOM)
	hash_set_raninit(arc4random());
#else
	uint32_t v = getpid() ^ time(NULL);
	srandom((unsigned long)v);
# ifdef HAVE_SSL
	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
		hash_set_raninit(v);
	else
# endif
		hash_set_raninit(random());
#endif
	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
		nsd->options->rrl_ratelimit,
		nsd->options->rrl_whitelist_ratelimit,
		nsd->options->rrl_slip,
		nsd->options->rrl_ipv4_prefix_length,
		nsd->options->rrl_ipv6_prefix_length);
#endif /* RATELIMIT */

	/* Open the database... */
	if((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
		log_msg(LOG_ERR, "unable to open the database %s: %s",
			nsd->dbfile, strerror(errno));
		unlink(nsd->task[0]->fname);
		unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		return -1;
	}
	/* check if zone files have been modified */
	/* NULL for taskudb because we send soainfo in a moment, batched up,
	 * for all zones */
	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
		nsd->options->database[0] == 0))
		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
	zonestatid_tree_set(nsd);

	compression_table_capacity = 0;
	initialize_dname_compression_tables(nsd);

#ifdef BIND8_STATS
	/* Initialize times... */
	time(&nsd->st.boot);
	set_bind8_alarm(nsd);
#endif /* BIND8_STATS */

	return 0;
}

/*
 * Fork the required number of servers.
 */
static int
server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	size_t i;

	/* Start all child servers initially. */
	for(i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].pid = 0;
	}

	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
}

static void
server_close_socket(struct nsd_socket *sock)
{
	if(sock->s != -1) {
		close(sock->s);
		sock->s = -1;
	}
}

void
server_close_all_sockets(struct nsd_socket sockets[], size_t n)
{
	size_t i;

	/* Close all the sockets... */
	for(i = 0; i < n; ++i) {
		server_close_socket(&sockets[i]);
	}
}

/*
 * Close the sockets, shutdown the server and exit.
 * Does not return.
 */
void
server_shutdown(struct nsd *nsd)
{
	size_t i;

	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	/* CHILD: close command channel to parent */
	if(nsd->this_child && nsd->this_child->parent_fd != -1)
	{
		close(nsd->this_child->parent_fd);
		nsd->this_child->parent_fd = -1;
	}
	/* SERVER: close command channels to children */
	if(!nsd->this_child)
	{
		for(i=0; i < nsd->child_count; ++i)
			if(nsd->children[i].child_fd != -1)
			{
				close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
			}
	}

	tsig_finalize();
#ifdef HAVE_SSL
	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
	if(nsd->tls_ctx)
		SSL_CTX_free(nsd->tls_ctx);
#endif

#ifdef MEMCLEAN /* OS collects memory pages */
#ifdef RATELIMIT
	rrl_mmap_deinit_keep_mmap();
#endif
#ifdef USE_DNSTAP
	dt_collector_destroy(nsd->dt_collector, nsd);
#endif
	udb_base_free_keep_mmap(nsd->task[0]);
	udb_base_free_keep_mmap(nsd->task[1]);
	namedb_close_udb(nsd->db); /* keeps mmap */
	namedb_close(nsd->db);
	nsd_options_destroy(nsd->options);
	region_destroy(nsd->region);
#endif
	log_finalize();
	exit(0);
}

void
server_prepare_xfrd(struct nsd* nsd)
{
	char tmpfile[256];
	/* create task mmaps */
	nsd->mytask = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[0] = task_file_create(tmpfile);
	if(!nsd->task[0]) {
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[1] = task_file_create(tmpfile);
	if(!nsd->task[1]) {
		unlink(nsd->task[0]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
	/* create xfrd listener structure */
	nsd->xfrd_listener = region_alloc(nsd->region,
		sizeof(netio_handler_type));
	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
	nsd->xfrd_listener->fd = -1;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
		nsd;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
		xfrd_tcp_create(nsd->region, QIOBUFSZ);
}

void
server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
{
	pid_t pid;
	int sockets[2] = {0,0};
	struct ipc_handler_conn_data *data;

	if(nsd->xfrd_listener->fd != -1)
		close(nsd->xfrd_listener->fd);
	if(del_db) {
		/* recreate taskdb that xfrd was using, it may be corrupt */
		/* we (or reload) use nsd->mytask, and xfrd uses the other */
		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
		nsd->task[1-nsd->mytask]->fname = NULL;
		/* free alloc already, so udb does not shrink itself */
		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
		nsd->task[1-nsd->mytask]->alloc = NULL;
		udb_base_free(nsd->task[1-nsd->mytask]);
		/* create new file, overwrite the old one */
		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
		free(tmpfile);
	}
	if(socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
		return;
	}
	pid = fork();
	switch(pid) {
	case -1:
		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
		break;
	default:
		/* PARENT: close first socket, use second one */
		close(sockets[0]);
		if(fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		if(del_db) xfrd_free_namedb(nsd);
		/* use other task than I am using, since if xfrd died and is
		 * restarted, the reload is using nsd->mytask */
		nsd->mytask = 1 - nsd->mytask;

#ifdef HAVE_SETPROCTITLE
		setproctitle("xfrd");
#endif
#ifdef HAVE_CPUSET_T
		if(nsd->use_cpu_affinity) {
			set_cpu_affinity(nsd->xfrd_cpuset);
		}
#endif

		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
		/* ENOTREACH */
		break;
	case 0:
		/* CHILD: close second socket, use first one */
		close(sockets[1]);
		if(fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		nsd->xfrd_listener->fd = sockets[0];
		break;
	}
	/* server-parent only */
	nsd->xfrd_listener->timeout = NULL;
	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
	/* clear ongoing ipc reads */
	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
	data->conn->is_reading = 0;
}

/** add all soainfo to taskdb */
static void
add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
{
	struct radnode* n;
	udb_ptr task_last; /* last task, mytask is empty so NULL */
	/* add all SOA INFO to mytask */
	udb_ptr_init(&task_last, taskudb);
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
	}
	udb_ptr_unlink(&task_last, taskudb);
}

void
server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
{
	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
	 * parent fills one taskdb with soas, xfrd fills the other with
	 * expires. then they exchange and process.
	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
	 * may be in use by reload. Fill SOA in taskdb and give to xfrd.
	 * expire notifications can be sent back via a normal reload later
	 * (xfrd will wait for current running reload to finish if any).
	 */
	sig_atomic_t cmd = 0;
	pid_t mypid;
	int xfrd_sock = nsd->xfrd_listener->fd;
	struct udb_base* taskudb = nsd->task[nsd->mytask];
	udb_ptr t;
	if(!shortsoa) {
		if(nsd->signal_hint_shutdown) {
		shutdown:
			log_msg(LOG_WARNING, "signal received, shutting down...");
			server_close_all_sockets(nsd->udp, nsd->ifs);
			server_close_all_sockets(nsd->tcp, nsd->ifs);
#ifdef HAVE_SSL
			daemon_remote_close(nsd->rc);
#endif
			/* Unlink it if possible... */
			unlinkpid(nsd->pidfile);
			unlink(nsd->task[0]->fname);
			unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
			unlink(nsd->zonestatfname[0]);
			unlink(nsd->zonestatfname[1]);
#endif
			/* write the nsd.db to disk, wait for it to complete */
			udb_base_sync(nsd->db->udb, 1);
			udb_base_close(nsd->db->udb);
			server_shutdown(nsd);
			/* ENOTREACH */
			exit(0);
		}
	}
	if(shortsoa) {
		/* put SOA in xfrd task because mytask may be in use */
		taskudb = nsd->task[1-nsd->mytask];
	}

	add_all_soa_to_task(nsd, taskudb);
	if(!shortsoa) {
		/* wait for xfrd to signal task is ready, RELOAD signal */
		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
			cmd != NSD_RELOAD) {
			log_msg(LOG_ERR, "did not get start signal from xfrd");
			exit(1);
		}
		if(nsd->signal_hint_shutdown) {
			goto shutdown;
		}
	}
	/* give xfrd our task, signal it with RELOAD_DONE */
	task_process_sync(taskudb);
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
			(int)nsd->pid, strerror(errno));
	}
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	if(!shortsoa) {
		/* process the xfrd task works (expiry data) */
		nsd->mytask = 1 - nsd->mytask;
		taskudb = nsd->task[nsd->mytask];
		task_remap(taskudb);
		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
		while(!udb_ptr_is_null(&t)) {
			task_process_expire(nsd->db, TASKLIST(&t));
			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
		}
		udb_ptr_unlink(&t, taskudb);
		task_clear(taskudb);

		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
		cmd = NSD_RELOAD_DONE;
		if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) {
			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
				(int)nsd->pid, strerror(errno));
		}
	}
}
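
/*
 * Summary (not in the original sources) of the exchange implemented above,
 * for the normal (!shortsoa) case:
 *
 *   server (parent)                          xfrd
 *   ---------------                          ----
 *   add SOA info to task[mytask]
 *   <- NSD_RELOAD ......................... task with expiry data is ready
 *   task_process_sync(), NSD_RELOAD_DONE ->
 *   send reload pid ->
 *   swap mytask, process expiry data
 *   NSD_RELOAD_DONE ->                       task emptied, may be reused
 */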

#ifdef HAVE_SSL
static void
log_crypto_from_err(const char* str, unsigned long err)
{
	/* error:[error code]:[library name]:[function name]:[reason string] */
	char buf[128];
	unsigned long e;
	ERR_error_string_n(err, buf, sizeof(buf));
	log_msg(LOG_ERR, "%s crypto %s", str, buf);
	while( (e=ERR_get_error()) ) {
		ERR_error_string_n(e, buf, sizeof(buf));
		log_msg(LOG_ERR, "and additionally crypto %s", buf);
	}
}

void
log_crypto_err(const char* str)
{
	log_crypto_from_err(str, ERR_get_error());
}

/** true if the ssl handshake error has to be squelched from the logs */
static int
squelch_err_ssl_handshake(unsigned long err)
{
	if(verbosity >= 3)
		return 0; /* only squelch on low verbosity */
	/* this is very specific, we could filter on ERR_GET_REASON()
	 * (the third element in ERR_PACK) */
	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
#ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
#endif
#ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
# ifdef SSL_R_VERSION_TOO_LOW
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
# endif
#endif
		)
		return 1;
	return 0;
}

void
perform_openssl_init(void)
{
	/* init SSL library */
#ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
	ERR_load_crypto_strings();
#endif
	ERR_load_SSL_strings();
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
	OpenSSL_add_all_algorithms();
#else
	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
		| OPENSSL_INIT_ADD_ALL_DIGESTS
		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
#endif
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
	(void)SSL_library_init();
#else
	OPENSSL_init_ssl(0, NULL);
#endif

	if(!RAND_status()) {
		/* try to seed it */
		unsigned char buf[256];
		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
		size_t i;
		v = seed;
		for(i=0; i<256/sizeof(v); i++) {
			memmove(buf+i*sizeof(v), &v, sizeof(v));
			v = v*seed + (unsigned int)i;
		}
		RAND_seed(buf, 256);
		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
	}
}

static int
get_ocsp(char *filename, unsigned char **ocsp)
{
	BIO *bio;
	OCSP_RESPONSE *response;
	int len = -1;
	unsigned char *p, *buf;
	assert(filename);

	if((bio = BIO_new_file(filename, "r")) == NULL) {
		log_crypto_err("get_ocsp: BIO_new_file failed");
		return -1;
	}

	if((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
		BIO_free(bio);
		return -1;
	}

	if((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
		OCSP_RESPONSE_free(response);
		BIO_free(bio);
		return -1;
	}

	if((buf = malloc((size_t) len)) == NULL) {
		log_msg(LOG_ERR, "get_ocsp: malloc failed");
		OCSP_RESPONSE_free(response);
		BIO_free(bio);
		return -1;
	}

	p = buf;
	if((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
		free(buf);
		OCSP_RESPONSE_free(response);
		BIO_free(bio);
		return -1;
	}

	OCSP_RESPONSE_free(response);
	BIO_free(bio);

	*ocsp = buf;
	return len;
}
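
/*
 * Note (not in the original sources): get_ocsp() uses the standard OpenSSL
 * i2d_* two-pass idiom. The first i2d_OCSP_RESPONSE() call with a NULL
 * output pointer only computes the DER length; after the buffer is
 * allocated, the second call encodes into it and advances the output
 * pointer, which is why the copy p is passed rather than buf itself.
 */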
(ctx, ecdh)) {
				log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE");
			}
			EC_KEY_free (ecdh);
		}
	}
#endif
}

static int
add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg))
{
	if(ocspdata) {
		unsigned char *p;
		if ((p=malloc(ocspdata_len)) == NULL) {
			log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure");
			return SSL_TLSEXT_ERR_NOACK;
		}
		memcpy(p, ocspdata, ocspdata_len);
		if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) {
			log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp");
			free(p);
			return SSL_TLSEXT_ERR_NOACK;
		}
		return SSL_TLSEXT_ERR_OK;
	} else {
		return SSL_TLSEXT_ERR_NOACK;
	}
}

SSL_CTX*
server_tls_ctx_setup(char* key, char* pem, char* verifypem)
{
	SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method());
	if(!ctx) {
		log_crypto_err("could not SSL_CTX_new");
		return NULL;
	}
	/* disable SSLv2 and SSLv3, because they have known defects */
#if SSL_OP_NO_SSLv2 != 0
	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){
		log_crypto_err("could not set SSL_OP_NO_SSLv2");
		SSL_CTX_free(ctx);
		return NULL;
	}
#endif
	if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3)
		!= SSL_OP_NO_SSLv3){
		log_crypto_err("could not set SSL_OP_NO_SSLv3");
		SSL_CTX_free(ctx);
		return NULL;
	}
#if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1)
	/* if we have tls 1.1 disable 1.0 */
	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1)
		!= SSL_OP_NO_TLSv1){
		log_crypto_err("could not set SSL_OP_NO_TLSv1");
		SSL_CTX_free(ctx);
		return NULL;
	}
#endif
#if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2)
	/* if we have tls 1.2 disable 1.1 */
	if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1)
		!= SSL_OP_NO_TLSv1_1){
		log_crypto_err("could not set SSL_OP_NO_TLSv1_1");
		SSL_CTX_free(ctx);
		return NULL;
	}
#endif
#if defined(SSL_OP_NO_RENEGOTIATION)
	/* disable client renegotiation */
	if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) &
		SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) {
		log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION");
		SSL_CTX_free(ctx);
		return NULL;
	}
#endif
#if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20)
	/* if we have sha256, set the cipher list to have no known vulns */
	if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20"))
		log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list");
#endif
	if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) &
		SSL_OP_CIPHER_SERVER_PREFERENCE) !=
		SSL_OP_CIPHER_SERVER_PREFERENCE) {
		log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE");
		SSL_CTX_free(ctx);
		return NULL;
	}
#ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL
	SSL_CTX_set_security_level(ctx, 0);
#endif
	if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) {
		log_msg(LOG_ERR, "error for cert file: %s", pem);
		log_crypto_err("error in SSL_CTX use_certificate_chain_file");
		SSL_CTX_free(ctx);
		return NULL;
	}
	if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) {
		log_msg(LOG_ERR, "error for private key file: %s", key);
		log_crypto_err("Error in SSL_CTX use_PrivateKey_file");
		SSL_CTX_free(ctx);
		return NULL;
	}
	if(!SSL_CTX_check_private_key(ctx)) {
		log_msg(LOG_ERR, "error for key file: %s", key);
		log_crypto_err("Error in SSL_CTX check_private_key");
		SSL_CTX_free(ctx);
		return NULL;
	}
	listen_sslctx_setup_2(ctx);
	if(verifypem && verifypem[0]) {
		if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) {
			log_crypto_err("Error in SSL_CTX verify locations");
			SSL_CTX_free(ctx);
			return NULL;
		}
		SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem));
		SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);
	}
	return ctx;
}

SSL_CTX*
server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile)
{
	char *key, *pem;
	SSL_CTX *ctx;

	key = nsd->options->tls_service_key;
	pem = nsd->options->tls_service_pem;
	if(!key || key[0] == 0) {
		log_msg(LOG_ERR, "error: no tls-service-key file specified");
		return NULL;
	}
	if(!pem || pem[0] == 0) {
		log_msg(LOG_ERR, "error: no tls-service-pem file specified");
		return NULL;
	}

	/* NOTE: This mimics the existing code in Unbound 1.5.1 by supporting
	 * SSL, but draft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2 */
	ctx = server_tls_ctx_setup(key, pem, verifypem);
	if(!ctx) {
		log_msg(LOG_ERR, "could not setup server TLS context");
		return NULL;
	}
	if(ocspfile && ocspfile[0]) {
		if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) {
			log_crypto_err("Error reading OCSP file");
			SSL_CTX_free(ctx);
			return NULL;
		} else {
			VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile));
			if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) {
				log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb");
				SSL_CTX_free(ctx);
				return NULL;
			}
		}
	}
	return ctx;
}

/* check if tcp_accept_handler_data is created for the dedicated TLS port */
int
using_tls_port(struct sockaddr* addr, const char* tls_port)
{
	in_port_t port = 0;

	if (addr->sa_family == AF_INET)
		port = ((struct sockaddr_in*)addr)->sin_port;
#ifdef HAVE_STRUCT_SOCKADDR_IN6
	else
		port = ((struct sockaddr_in6*)addr)->sin6_port;
#endif /* HAVE_STRUCT_SOCKADDR_IN6 */
	if (atoi(tls_port) == ntohs(port))
		return 1;

	return 0;
}
#endif

/* pass timeout=-1 for blocking.
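   Timeout is in seconds; timeout=0 polls once without waiting, which is
   how the reload code below checks the command socket for pending ipc
   commands, e.g. block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0).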
Returns size, 0, -1(err), or -2(timeout) */ 2056 ssize_t 2057 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout) 2058 { 2059 uint8_t* buf = (uint8_t*) p; 2060 ssize_t total = 0; 2061 struct pollfd fd; 2062 memset(&fd, 0, sizeof(fd)); 2063 fd.fd = s; 2064 fd.events = POLLIN; 2065 2066 while( total < sz) { 2067 ssize_t ret; 2068 ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000); 2069 if(ret == -1) { 2070 if(errno == EAGAIN) 2071 /* blocking read */ 2072 continue; 2073 if(errno == EINTR) { 2074 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) 2075 return -1; 2076 /* other signals can be handled later */ 2077 continue; 2078 } 2079 /* some error */ 2080 return -1; 2081 } 2082 if(ret == 0) { 2083 /* operation timed out */ 2084 return -2; 2085 } 2086 ret = read(s, buf+total, sz-total); 2087 if(ret == -1) { 2088 if(errno == EAGAIN) 2089 /* blocking read */ 2090 continue; 2091 if(errno == EINTR) { 2092 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) 2093 return -1; 2094 /* other signals can be handled later */ 2095 continue; 2096 } 2097 /* some error */ 2098 return -1; 2099 } 2100 if(ret == 0) { 2101 /* closed connection! */ 2102 return 0; 2103 } 2104 total += ret; 2105 } 2106 return total; 2107 } 2108 2109 static void 2110 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket) 2111 { 2112 sig_atomic_t cmd = NSD_QUIT_SYNC; 2113 udb_ptr t, next; 2114 udb_base* u = nsd->task[nsd->mytask]; 2115 udb_ptr_init(&next, u); 2116 udb_ptr_new(&t, u, udb_base_get_userdata(u)); 2117 udb_base_set_userdata(u, 0); 2118 while(!udb_ptr_is_null(&t)) { 2119 /* store next in list so this one can be deleted or reused */ 2120 udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next); 2121 udb_rptr_zero(&TASKLIST(&t)->next, u); 2122 2123 /* process task t */ 2124 /* append results for task t and update last_task */ 2125 task_process_in_reload(nsd, u, last_task, &t); 2126 2127 /* go to next */ 2128 udb_ptr_set_ptr(&t, u, &next); 2129 2130 /* if the parent has quit, we must quit too, poll the fd for cmds */ 2131 if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) { 2132 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd)); 2133 if(cmd == NSD_QUIT) { 2134 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd")); 2135 /* sync to disk (if needed) */ 2136 udb_base_sync(nsd->db->udb, 0); 2137 /* unlink files of remainder of tasks */ 2138 while(!udb_ptr_is_null(&t)) { 2139 if(TASKLIST(&t)->task_type == task_apply_xfr) { 2140 xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno); 2141 } 2142 udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next); 2143 } 2144 udb_ptr_unlink(&t, u); 2145 udb_ptr_unlink(&next, u); 2146 exit(0); 2147 } 2148 } 2149 2150 } 2151 udb_ptr_unlink(&t, u); 2152 udb_ptr_unlink(&next, u); 2153 } 2154 2155 #ifdef BIND8_STATS 2156 static void 2157 parent_send_stats(struct nsd* nsd, int cmdfd) 2158 { 2159 size_t i; 2160 if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) { 2161 log_msg(LOG_ERR, "could not write stats to reload"); 2162 return; 2163 } 2164 for(i=0; i<nsd->child_count; i++) 2165 if(!write_socket(cmdfd, &nsd->children[i].query_count, 2166 sizeof(stc_type))) { 2167 log_msg(LOG_ERR, "could not write stats to reload"); 2168 return; 2169 } 2170 } 2171 2172 static void 2173 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last) 2174 { 2175 struct nsdst s; 2176 stc_type* p; 2177 size_t i; 2178 if(block_read(nsd, cmdfd, &s, sizeof(s), 2179 RELOAD_SYNC_TIMEOUT) != sizeof(s)) { 2180 log_msg(LOG_ERR, "could not read stats 
from oldpar"); 2181 return; 2182 } 2183 s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0); 2184 s.db_mem = region_get_mem(nsd->db->region); 2185 p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s, 2186 nsd->child_count); 2187 if(!p) return; 2188 for(i=0; i<nsd->child_count; i++) { 2189 if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!= 2190 sizeof(stc_type)) 2191 return; 2192 } 2193 } 2194 #endif /* BIND8_STATS */ 2195 2196 /* 2197 * Reload the database, stop parent, re-fork children and continue. 2198 * as server_main. 2199 */ 2200 static void 2201 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio, 2202 int cmdsocket) 2203 { 2204 pid_t mypid; 2205 sig_atomic_t cmd = NSD_QUIT_SYNC; 2206 int ret; 2207 udb_ptr last_task; 2208 struct sigaction old_sigchld, ign_sigchld; 2209 /* ignore SIGCHLD from the previous server_main that used this pid */ 2210 memset(&ign_sigchld, 0, sizeof(ign_sigchld)); 2211 ign_sigchld.sa_handler = SIG_IGN; 2212 sigaction(SIGCHLD, &ign_sigchld, &old_sigchld); 2213 2214 #ifdef HAVE_SETPROCTITLE 2215 setproctitle("main"); 2216 #endif 2217 #ifdef HAVE_CPUSET_T 2218 if(nsd->use_cpu_affinity) { 2219 set_cpu_affinity(nsd->cpuset); 2220 } 2221 #endif 2222 2223 /* see what tasks we got from xfrd */ 2224 task_remap(nsd->task[nsd->mytask]); 2225 udb_ptr_init(&last_task, nsd->task[nsd->mytask]); 2226 udb_compact_inhibited(nsd->db->udb, 1); 2227 reload_process_tasks(nsd, &last_task, cmdsocket); 2228 udb_compact_inhibited(nsd->db->udb, 0); 2229 udb_compact(nsd->db->udb); 2230 2231 #ifndef NDEBUG 2232 if(nsd_debug_level >= 1) 2233 region_log_stats(nsd->db->region); 2234 #endif /* NDEBUG */ 2235 /* sync to disk (if needed) */ 2236 udb_base_sync(nsd->db->udb, 0); 2237 2238 initialize_dname_compression_tables(nsd); 2239 2240 #ifdef BIND8_STATS 2241 /* Restart dumping stats if required. */ 2242 time(&nsd->st.boot); 2243 set_bind8_alarm(nsd); 2244 #endif 2245 #ifdef USE_ZONE_STATS 2246 server_zonestat_realloc(nsd); /* realloc for new children */ 2247 server_zonestat_switch(nsd); 2248 #endif 2249 2250 /* listen for the signals of failed children again */ 2251 sigaction(SIGCHLD, &old_sigchld, NULL); 2252 /* Start new child processes */ 2253 if (server_start_children(nsd, server_region, netio, &nsd-> 2254 xfrd_listener->fd) != 0) { 2255 send_children_quit(nsd); 2256 exit(1); 2257 } 2258 2259 /* if the parent has quit, we must quit too, poll the fd for cmds */ 2260 if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) { 2261 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd)); 2262 if(cmd == NSD_QUIT) { 2263 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd")); 2264 send_children_quit(nsd); 2265 exit(0); 2266 } 2267 } 2268 2269 /* Send quit command to parent: blocking, wait for receipt. */ 2270 do { 2271 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main")); 2272 if (!write_socket(cmdsocket, &cmd, sizeof(cmd))) 2273 { 2274 log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s", 2275 strerror(errno)); 2276 } 2277 /* blocking: wait for parent to really quit. (it sends RELOAD as ack) */ 2278 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main")); 2279 ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 2280 RELOAD_SYNC_TIMEOUT); 2281 if(ret == -2) { 2282 DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. 
retry")); 2283 } 2284 } while (ret == -2); 2285 if(ret == -1) { 2286 log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s", 2287 strerror(errno)); 2288 } 2289 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd)); 2290 if(cmd == NSD_QUIT) { 2291 /* small race condition possible here, parent got quit cmd. */ 2292 send_children_quit(nsd); 2293 exit(1); 2294 } 2295 assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD); 2296 #ifdef BIND8_STATS 2297 reload_do_stats(cmdsocket, nsd, &last_task); 2298 #endif 2299 udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]); 2300 task_process_sync(nsd->task[nsd->mytask]); 2301 #ifdef USE_ZONE_STATS 2302 server_zonestat_realloc(nsd); /* realloc for next children */ 2303 #endif 2304 2305 /* send soainfo to the xfrd process, signal it that reload is done, 2306 * it picks up the taskudb */ 2307 cmd = NSD_RELOAD_DONE; 2308 if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { 2309 log_msg(LOG_ERR, "problems sending reload_done xfrd: %s", 2310 strerror(errno)); 2311 } 2312 mypid = getpid(); 2313 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2314 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2315 strerror(errno)); 2316 } 2317 2318 /* try to reopen file */ 2319 if (nsd->file_rotation_ok) 2320 log_reopen(nsd->log_filename, 1); 2321 /* exit reload, continue as new server_main */ 2322 } 2323 2324 /* 2325 * Get the mode depending on the signal hints that have been received. 2326 * Multiple signal hints can be received and will be handled in turn. 2327 */ 2328 static sig_atomic_t 2329 server_signal_mode(struct nsd *nsd) 2330 { 2331 if(nsd->signal_hint_quit) { 2332 nsd->signal_hint_quit = 0; 2333 return NSD_QUIT; 2334 } 2335 else if(nsd->signal_hint_shutdown) { 2336 nsd->signal_hint_shutdown = 0; 2337 return NSD_SHUTDOWN; 2338 } 2339 else if(nsd->signal_hint_child) { 2340 nsd->signal_hint_child = 0; 2341 return NSD_REAP_CHILDREN; 2342 } 2343 else if(nsd->signal_hint_reload) { 2344 nsd->signal_hint_reload = 0; 2345 return NSD_RELOAD; 2346 } 2347 else if(nsd->signal_hint_reload_hup) { 2348 nsd->signal_hint_reload_hup = 0; 2349 return NSD_RELOAD_REQ; 2350 } 2351 else if(nsd->signal_hint_stats) { 2352 nsd->signal_hint_stats = 0; 2353 #ifdef BIND8_STATS 2354 set_bind8_alarm(nsd); 2355 #endif 2356 return NSD_STATS; 2357 } 2358 else if(nsd->signal_hint_statsusr) { 2359 nsd->signal_hint_statsusr = 0; 2360 return NSD_STATS; 2361 } 2362 return NSD_RUN; 2363 } 2364 2365 /* 2366 * The main server simply waits for signals and child processes to 2367 * terminate. Child processes are restarted as necessary. 
2368 */ 2369 void 2370 server_main(struct nsd *nsd) 2371 { 2372 region_type *server_region = region_create(xalloc, free); 2373 netio_type *netio = netio_create(server_region); 2374 netio_handler_type reload_listener; 2375 int reload_sockets[2] = {-1, -1}; 2376 struct timespec timeout_spec; 2377 int status; 2378 pid_t child_pid; 2379 pid_t reload_pid = -1; 2380 sig_atomic_t mode; 2381 2382 /* Ensure we are the main process */ 2383 assert(nsd->server_kind == NSD_SERVER_MAIN); 2384 2385 /* Add listener for the XFRD process */ 2386 netio_add_handler(netio, nsd->xfrd_listener); 2387 2388 /* Start the child processes that handle incoming queries */ 2389 if (server_start_children(nsd, server_region, netio, 2390 &nsd->xfrd_listener->fd) != 0) { 2391 send_children_quit(nsd); 2392 exit(1); 2393 } 2394 reload_listener.fd = -1; 2395 2396 /* This_child MUST be 0, because this is the parent process */ 2397 assert(nsd->this_child == 0); 2398 2399 /* Run the server until we get a shutdown signal */ 2400 while ((mode = nsd->mode) != NSD_SHUTDOWN) { 2401 /* Did we receive a signal that changes our mode? */ 2402 if(mode == NSD_RUN) { 2403 nsd->mode = mode = server_signal_mode(nsd); 2404 } 2405 2406 switch (mode) { 2407 case NSD_RUN: 2408 /* see if any child processes terminated */ 2409 while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) { 2410 int is_child = delete_child_pid(nsd, child_pid); 2411 if (is_child != -1 && nsd->children[is_child].need_to_exit) { 2412 if(nsd->children[is_child].child_fd == -1) 2413 nsd->children[is_child].has_exited = 1; 2414 parent_check_all_children_exited(nsd); 2415 } else if(is_child != -1) { 2416 log_msg(LOG_WARNING, 2417 "server %d died unexpectedly with status %d, restarting", 2418 (int) child_pid, status); 2419 restart_child_servers(nsd, server_region, netio, 2420 &nsd->xfrd_listener->fd); 2421 } else if (child_pid == reload_pid) { 2422 sig_atomic_t cmd = NSD_RELOAD_DONE; 2423 pid_t mypid; 2424 log_msg(LOG_WARNING, 2425 "Reload process %d failed with status %d, continuing with old database", 2426 (int) child_pid, status); 2427 reload_pid = -1; 2428 if(reload_listener.fd != -1) close(reload_listener.fd); 2429 reload_listener.fd = -1; 2430 reload_listener.event_types = NETIO_EVENT_NONE; 2431 task_process_sync(nsd->task[nsd->mytask]); 2432 /* inform xfrd reload attempt ended */ 2433 if(!write_socket(nsd->xfrd_listener->fd, 2434 &cmd, sizeof(cmd))) { 2435 log_msg(LOG_ERR, "problems " 2436 "sending SOAEND to xfrd: %s", 2437 strerror(errno)); 2438 } 2439 mypid = getpid(); 2440 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2441 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2442 strerror(errno)); 2443 } 2444 } else if(status != 0) { 2445 /* check for status, because we get 2446 * the old-servermain because reload 2447 * is the process-parent of old-main, 2448 * and we get older server-processes 2449 * that are exiting after a reload */ 2450 log_msg(LOG_WARNING, 2451 "process %d terminated with status %d", 2452 (int) child_pid, status); 2453 } 2454 } 2455 if (child_pid == -1) { 2456 if (errno == EINTR) { 2457 continue; 2458 } 2459 if (errno != ECHILD) 2460 log_msg(LOG_WARNING, "wait failed: %s", strerror(errno)); 2461 } 2462 if (nsd->mode != NSD_RUN) 2463 break; 2464 2465 /* timeout to collect processes. In case no sigchild happens. 
			 */
			timeout_spec.tv_sec = 60;
			timeout_spec.tv_nsec = 0;

			/* listen on ports, timeout for collecting terminated children */
			if(netio_dispatch(netio, &timeout_spec, 0) == -1) {
				if (errno != EINTR) {
					log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno));
				}
			}
			if(nsd->restart_children) {
				restart_child_servers(nsd, server_region, netio,
					&nsd->xfrd_listener->fd);
				nsd->restart_children = 0;
			}
			if(nsd->reload_failed) {
				sig_atomic_t cmd = NSD_RELOAD_DONE;
				pid_t mypid;
				nsd->reload_failed = 0;
				log_msg(LOG_WARNING,
					"Reload process %d failed, continuing with old database",
					(int) reload_pid);
				reload_pid = -1;
				if(reload_listener.fd != -1) close(reload_listener.fd);
				reload_listener.fd = -1;
				reload_listener.event_types = NETIO_EVENT_NONE;
				task_process_sync(nsd->task[nsd->mytask]);
				/* inform xfrd reload attempt ended */
				if(!write_socket(nsd->xfrd_listener->fd,
					&cmd, sizeof(cmd))) {
					log_msg(LOG_ERR, "problems "
						"sending SOAEND to xfrd: %s",
						strerror(errno));
				}
				mypid = getpid();
				if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
					log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
						strerror(errno));
				}
			}

			break;
		case NSD_RELOAD_REQ: {
			sig_atomic_t cmd = NSD_RELOAD_REQ;
			log_msg(LOG_WARNING, "SIGHUP received, reloading...");
			DEBUG(DEBUG_IPC,1, (LOG_INFO,
				"main: ipc send reload_req to xfrd"));
			if(!write_socket(nsd->xfrd_listener->fd,
				&cmd, sizeof(cmd))) {
				log_msg(LOG_ERR, "server_main: could not send "
					"reload_req to xfrd: %s", strerror(errno));
			}
			nsd->mode = NSD_RUN;
		} break;
		case NSD_RELOAD:
			/* Continue to run nsd after reload */
			nsd->mode = NSD_RUN;
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading..."));
			if (reload_pid != -1) {
				log_msg(LOG_WARNING, "Reload already in progress (pid = %d)",
					(int) reload_pid);
				break;
			}

			/* switch mytask to keep track of which process owns the task udb */
			nsd->mytask = 1 - nsd->mytask;
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) {
				log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno));
				reload_pid = -1;
				break;
			}

			/* Do actual reload */
			reload_pid = fork();
			switch (reload_pid) {
			case -1:
				log_msg(LOG_ERR, "fork failed: %s", strerror(errno));
				break;
			default:
				/* PARENT */
				close(reload_sockets[0]);
				server_reload(nsd, server_region, netio,
					reload_sockets[1]);
				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main"));
				close(reload_sockets[1]);
				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed"));
				/* drop stale xfrd ipc data */
				((struct ipc_handler_conn_data*)nsd->
					xfrd_listener->user_data)
					->conn->is_reading = 0;
				reload_pid = -1;
				reload_listener.fd = -1;
				reload_listener.event_types = NETIO_EVENT_NONE;
				DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run"));
				break;
			case 0:
				/* CHILD */
				/* server_main keeps running until NSD_QUIT_SYNC
				 * is received from the reload process.
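				 * Note the role reversal: the fork parent
				 * performs the reload, so reload_pid is set
				 * to getppid() below.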
*/ 2564 close(reload_sockets[1]); 2565 reload_listener.fd = reload_sockets[0]; 2566 reload_listener.timeout = NULL; 2567 reload_listener.user_data = nsd; 2568 reload_listener.event_types = NETIO_EVENT_READ; 2569 reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */ 2570 netio_add_handler(netio, &reload_listener); 2571 reload_pid = getppid(); 2572 break; 2573 } 2574 break; 2575 case NSD_QUIT_SYNC: 2576 /* synchronisation of xfrd, parent and reload */ 2577 if(!nsd->quit_sync_done && reload_listener.fd != -1) { 2578 sig_atomic_t cmd = NSD_RELOAD; 2579 /* stop xfrd ipc writes in progress */ 2580 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2581 "main: ipc send indication reload")); 2582 if(!write_socket(nsd->xfrd_listener->fd, 2583 &cmd, sizeof(cmd))) { 2584 log_msg(LOG_ERR, "server_main: could not send reload " 2585 "indication to xfrd: %s", strerror(errno)); 2586 } 2587 /* wait for ACK from xfrd */ 2588 DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd")); 2589 nsd->quit_sync_done = 1; 2590 } 2591 nsd->mode = NSD_RUN; 2592 break; 2593 case NSD_QUIT: 2594 /* silent shutdown during reload */ 2595 if(reload_listener.fd != -1) { 2596 /* acknowledge the quit, to sync reload that we will really quit now */ 2597 sig_atomic_t cmd = NSD_RELOAD; 2598 DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload")); 2599 if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) { 2600 log_msg(LOG_ERR, "server_main: " 2601 "could not ack quit: %s", strerror(errno)); 2602 } 2603 #ifdef BIND8_STATS 2604 parent_send_stats(nsd, reload_listener.fd); 2605 #endif /* BIND8_STATS */ 2606 close(reload_listener.fd); 2607 } 2608 DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence")); 2609 /* only quit children after xfrd has acked */ 2610 send_children_quit(nsd); 2611 2612 #ifdef MEMCLEAN /* OS collects memory pages */ 2613 region_destroy(server_region); 2614 #endif 2615 server_shutdown(nsd); 2616 2617 /* ENOTREACH */ 2618 break; 2619 case NSD_SHUTDOWN: 2620 break; 2621 case NSD_REAP_CHILDREN: 2622 /* continue; wait for child in run loop */ 2623 nsd->mode = NSD_RUN; 2624 break; 2625 case NSD_STATS: 2626 #ifdef BIND8_STATS 2627 set_children_stats(nsd); 2628 #endif 2629 nsd->mode = NSD_RUN; 2630 break; 2631 default: 2632 log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode); 2633 nsd->mode = NSD_RUN; 2634 break; 2635 } 2636 } 2637 log_msg(LOG_WARNING, "signal received, shutting down..."); 2638 2639 /* close opened ports to avoid race with restart of nsd */ 2640 server_close_all_sockets(nsd->udp, nsd->ifs); 2641 server_close_all_sockets(nsd->tcp, nsd->ifs); 2642 #ifdef HAVE_SSL 2643 daemon_remote_close(nsd->rc); 2644 #endif 2645 send_children_quit_and_wait(nsd); 2646 2647 /* Unlink it if possible... 
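 (the pidfile and the task udb files, so a restarted nsd does not pick
 up stale state)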
*/ 2648 unlinkpid(nsd->pidfile); 2649 unlink(nsd->task[0]->fname); 2650 unlink(nsd->task[1]->fname); 2651 #ifdef USE_ZONE_STATS 2652 unlink(nsd->zonestatfname[0]); 2653 unlink(nsd->zonestatfname[1]); 2654 #endif 2655 #ifdef USE_DNSTAP 2656 dt_collector_close(nsd->dt_collector, nsd); 2657 #endif 2658 2659 if(reload_listener.fd != -1) { 2660 sig_atomic_t cmd = NSD_QUIT; 2661 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2662 "main: ipc send quit to reload-process")); 2663 if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) { 2664 log_msg(LOG_ERR, "server_main: could not send quit to reload: %s", 2665 strerror(errno)); 2666 } 2667 fsync(reload_listener.fd); 2668 close(reload_listener.fd); 2669 /* wait for reload to finish processing */ 2670 while(1) { 2671 if(waitpid(reload_pid, NULL, 0) == -1) { 2672 if(errno == EINTR) continue; 2673 if(errno == ECHILD) break; 2674 log_msg(LOG_ERR, "waitpid(reload %d): %s", 2675 (int)reload_pid, strerror(errno)); 2676 } 2677 break; 2678 } 2679 } 2680 if(nsd->xfrd_listener->fd != -1) { 2681 /* complete quit, stop xfrd */ 2682 sig_atomic_t cmd = NSD_QUIT; 2683 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2684 "main: ipc send quit to xfrd")); 2685 if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { 2686 log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s", 2687 strerror(errno)); 2688 } 2689 fsync(nsd->xfrd_listener->fd); 2690 close(nsd->xfrd_listener->fd); 2691 (void)kill(nsd->pid, SIGTERM); 2692 } 2693 2694 #ifdef MEMCLEAN /* OS collects memory pages */ 2695 region_destroy(server_region); 2696 #endif 2697 /* write the nsd.db to disk, wait for it to complete */ 2698 udb_base_sync(nsd->db->udb, 1); 2699 udb_base_close(nsd->db->udb); 2700 server_shutdown(nsd); 2701 } 2702 2703 static query_state_type 2704 server_process_query(struct nsd *nsd, struct query *query) 2705 { 2706 return query_process(query, nsd); 2707 } 2708 2709 static query_state_type 2710 server_process_query_udp(struct nsd *nsd, struct query *query) 2711 { 2712 #ifdef RATELIMIT 2713 if(query_process(query, nsd) != QUERY_DISCARDED) { 2714 if(rrl_process_query(query)) 2715 return rrl_slip(query); 2716 else return QUERY_PROCESSED; 2717 } 2718 return QUERY_DISCARDED; 2719 #else 2720 return query_process(query, nsd); 2721 #endif 2722 } 2723 2724 const char* 2725 nsd_event_vs(void) 2726 { 2727 #ifdef USE_MINI_EVENT 2728 return ""; 2729 #else 2730 return event_get_version(); 2731 #endif 2732 } 2733 2734 #if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS) 2735 static const char* ub_ev_backend2str(int b) 2736 { 2737 switch(b) { 2738 case EVBACKEND_SELECT: return "select"; 2739 case EVBACKEND_POLL: return "poll"; 2740 case EVBACKEND_EPOLL: return "epoll"; 2741 case EVBACKEND_KQUEUE: return "kqueue"; 2742 case EVBACKEND_DEVPOLL: return "devpoll"; 2743 case EVBACKEND_PORT: return "evport"; 2744 } 2745 return "unknown"; 2746 } 2747 #endif 2748 2749 const char* 2750 nsd_event_method(void) 2751 { 2752 #ifdef USE_MINI_EVENT 2753 return "select"; 2754 #else 2755 struct event_base* b = nsd_child_event_base(); 2756 const char* m = "?"; 2757 # ifdef EV_FEATURE_BACKENDS 2758 m = ub_ev_backend2str(ev_backend((struct ev_loop*)b)); 2759 # elif defined(HAVE_EVENT_BASE_GET_METHOD) 2760 m = event_base_get_method(b); 2761 # endif 2762 # ifdef MEMCLEAN 2763 event_base_free(b); 2764 # endif 2765 return m; 2766 #endif 2767 } 2768 2769 struct event_base* 2770 nsd_child_event_base(void) 2771 { 2772 struct event_base* base; 2773 #ifdef USE_MINI_EVENT 2774 static time_t secs; 2775 static struct timeval now; 2776 base = 
event_init(&secs, &now); 2777 #else 2778 # if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP) 2779 /* libev */ 2780 base = (struct event_base *)ev_default_loop(EVFLAG_AUTO); 2781 # else 2782 /* libevent */ 2783 # ifdef HAVE_EVENT_BASE_NEW 2784 base = event_base_new(); 2785 # else 2786 base = event_init(); 2787 # endif 2788 # endif 2789 #endif 2790 return base; 2791 } 2792 2793 static void 2794 add_udp_handler( 2795 struct nsd *nsd, 2796 struct nsd_socket *sock, 2797 struct udp_handler_data *data) 2798 { 2799 struct event *handler = &data->event; 2800 2801 data->nsd = nsd; 2802 data->socket = sock; 2803 2804 memset(handler, 0, sizeof(*handler)); 2805 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data); 2806 if(event_base_set(nsd->event_base, handler) != 0) 2807 log_msg(LOG_ERR, "nsd udp: event_base_set failed"); 2808 if(event_add(handler, NULL) != 0) 2809 log_msg(LOG_ERR, "nsd udp: event_add failed"); 2810 } 2811 2812 void 2813 add_tcp_handler( 2814 struct nsd *nsd, 2815 struct nsd_socket *sock, 2816 struct tcp_accept_handler_data *data) 2817 { 2818 struct event *handler = &data->event; 2819 2820 data->nsd = nsd; 2821 data->socket = sock; 2822 2823 #ifdef HAVE_SSL 2824 if (nsd->tls_ctx && 2825 nsd->options->tls_port && 2826 using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port)) 2827 { 2828 data->tls_accept = 1; 2829 if(verbosity >= 2) { 2830 char buf[48]; 2831 addrport2str((struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf)); 2832 VERBOSITY(2, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf)); 2833 } 2834 } else { 2835 data->tls_accept = 0; 2836 } 2837 #endif 2838 2839 memset(handler, 0, sizeof(*handler)); 2840 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_tcp_accept, data); 2841 if(event_base_set(nsd->event_base, handler) != 0) 2842 log_msg(LOG_ERR, "nsd tcp: event_base_set failed"); 2843 if(event_add(handler, NULL) != 0) 2844 log_msg(LOG_ERR, "nsd tcp: event_add failed"); 2845 data->event_added = 1; 2846 } 2847 2848 /* 2849 * Serve DNS requests. 
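 * Each child sets up its own event base, registers the ipc handler to
 * its parent and the UDP/TCP handlers for the sockets assigned to it
 * (with reuseport the interfaces are divided over the children), and
 * then loops in event_base_loop until told to quit.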
2850 */ 2851 void 2852 server_child(struct nsd *nsd) 2853 { 2854 size_t i, from, numifs; 2855 region_type *server_region = region_create(xalloc, free); 2856 struct event_base* event_base = nsd_child_event_base(); 2857 sig_atomic_t mode; 2858 2859 if(!event_base) { 2860 log_msg(LOG_ERR, "nsd server could not create event base"); 2861 exit(1); 2862 } 2863 nsd->event_base = event_base; 2864 nsd->server_region = server_region; 2865 2866 #ifdef RATELIMIT 2867 rrl_init(nsd->this_child->child_num); 2868 #endif 2869 2870 assert(nsd->server_kind != NSD_SERVER_MAIN); 2871 DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started")); 2872 2873 #ifdef HAVE_SETPROCTITLE 2874 setproctitle("server %d", nsd->this_child->child_num + 1); 2875 #endif 2876 #ifdef HAVE_CPUSET_T 2877 if(nsd->use_cpu_affinity) { 2878 set_cpu_affinity(nsd->this_child->cpuset); 2879 } 2880 #endif 2881 2882 if (!(nsd->server_kind & NSD_SERVER_TCP)) { 2883 server_close_all_sockets(nsd->tcp, nsd->ifs); 2884 } 2885 if (!(nsd->server_kind & NSD_SERVER_UDP)) { 2886 server_close_all_sockets(nsd->udp, nsd->ifs); 2887 } 2888 2889 if (nsd->this_child->parent_fd != -1) { 2890 struct event *handler; 2891 struct ipc_handler_conn_data* user_data = 2892 (struct ipc_handler_conn_data*)region_alloc( 2893 server_region, sizeof(struct ipc_handler_conn_data)); 2894 user_data->nsd = nsd; 2895 user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ); 2896 2897 handler = (struct event*) region_alloc( 2898 server_region, sizeof(*handler)); 2899 memset(handler, 0, sizeof(*handler)); 2900 event_set(handler, nsd->this_child->parent_fd, EV_PERSIST| 2901 EV_READ, child_handle_parent_command, user_data); 2902 if(event_base_set(event_base, handler) != 0) 2903 log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed"); 2904 if(event_add(handler, NULL) != 0) 2905 log_msg(LOG_ERR, "nsd ipcchild: event_add failed"); 2906 } 2907 2908 if(nsd->reuseport) { 2909 numifs = nsd->ifs / nsd->reuseport; 2910 from = numifs * nsd->this_child->child_num; 2911 if(from+numifs > nsd->ifs) { /* should not happen */ 2912 from = 0; 2913 numifs = nsd->ifs; 2914 } 2915 } else { 2916 from = 0; 2917 numifs = nsd->ifs; 2918 } 2919 2920 if (nsd->server_kind & NSD_SERVER_UDP) { 2921 int child = nsd->this_child->child_num; 2922 memset(msgs, 0, sizeof(msgs)); 2923 for (i = 0; i < NUM_RECV_PER_SELECT; i++) { 2924 queries[i] = query_create(server_region, 2925 compressed_dname_offsets, 2926 compression_table_size, compressed_dnames); 2927 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 2928 iovecs[i].iov_base = buffer_begin(queries[i]->packet); 2929 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 2930 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 2931 msgs[i].msg_hdr.msg_iovlen = 1; 2932 msgs[i].msg_hdr.msg_name = &queries[i]->addr; 2933 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 2934 } 2935 2936 for (i = 0; i < nsd->ifs; i++) { 2937 int listen; 2938 struct udp_handler_data *data; 2939 2940 listen = nsd_bitset_isset(nsd->udp[i].servers, child); 2941 2942 if(i >= from && i < (from + numifs) && listen) { 2943 data = region_alloc_zero( 2944 nsd->server_region, sizeof(*data)); 2945 add_udp_handler(nsd, &nsd->udp[i], data); 2946 } else { 2947 /* close sockets intended for other servers */ 2948 server_close_socket(&nsd->udp[i]); 2949 } 2950 } 2951 } 2952 2953 /* 2954 * Keep track of all the TCP accept handlers so we can enable 2955 * and disable them based on the current number of active TCP 2956 * connections. 
2957 */ 2958 if (nsd->server_kind & NSD_SERVER_TCP) { 2959 int child = nsd->this_child->child_num; 2960 tcp_accept_handler_count = numifs; 2961 tcp_accept_handlers = region_alloc_array(server_region, 2962 numifs, sizeof(*tcp_accept_handlers)); 2963 2964 for (i = 0; i < nsd->ifs; i++) { 2965 int listen; 2966 struct tcp_accept_handler_data *data; 2967 2968 listen = nsd_bitset_isset(nsd->tcp[i].servers, child); 2969 2970 if(i >= from && i < (from + numifs) && listen) { 2971 data = &tcp_accept_handlers[i-from]; 2972 memset(data, 0, sizeof(*data)); 2973 add_tcp_handler(nsd, &nsd->tcp[i], data); 2974 } else { 2975 /* close sockets intended for other servers */ 2976 /* 2977 * uncomment this once tcp servers are no 2978 * longer copied in the tcp fd copy line 2979 * in server_init(). 2980 server_close_socket(&nsd->tcp[i]); 2981 */ 2982 /* close sockets not meant for this server*/ 2983 if(!listen) 2984 server_close_socket(&nsd->tcp[i]); 2985 } 2986 } 2987 } else { 2988 tcp_accept_handler_count = 0; 2989 } 2990 2991 /* The main loop... */ 2992 while ((mode = nsd->mode) != NSD_QUIT) { 2993 if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd); 2994 2995 /* Do we need to do the statistics... */ 2996 if (mode == NSD_STATS) { 2997 #ifdef BIND8_STATS 2998 int p = nsd->st.period; 2999 nsd->st.period = 1; /* force stats printout */ 3000 /* Dump the statistics */ 3001 bind8_stats(nsd); 3002 nsd->st.period = p; 3003 #else /* !BIND8_STATS */ 3004 log_msg(LOG_NOTICE, "Statistics support not enabled at compile time."); 3005 #endif /* BIND8_STATS */ 3006 3007 nsd->mode = NSD_RUN; 3008 } 3009 else if (mode == NSD_REAP_CHILDREN) { 3010 /* got signal, notify parent. parent reaps terminated children. */ 3011 if (nsd->this_child->parent_fd != -1) { 3012 sig_atomic_t parent_notify = NSD_REAP_CHILDREN; 3013 if (write(nsd->this_child->parent_fd, 3014 &parent_notify, 3015 sizeof(parent_notify)) == -1) 3016 { 3017 log_msg(LOG_ERR, "problems sending command from %d to parent: %s", 3018 (int) nsd->this_child->pid, strerror(errno)); 3019 } 3020 } else /* no parent, so reap 'em */ 3021 while (waitpid(-1, NULL, WNOHANG) > 0) ; 3022 nsd->mode = NSD_RUN; 3023 } 3024 else if(mode == NSD_RUN) { 3025 /* Wait for a query... 
			 */
			if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
				if (errno != EINTR) {
					log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno));
					break;
				}
			}
		} else if(mode == NSD_QUIT) {
			/* ignore here, quit */
		} else {
			log_msg(LOG_ERR, "mode bad value %d, back to service.",
				(int)mode);
			nsd->mode = NSD_RUN;
		}
	}

	service_remaining_tcp(nsd);
#ifdef BIND8_STATS
	bind8_stats(nsd);
#endif /* BIND8_STATS */

#ifdef MEMCLEAN /* OS collects memory pages */
#ifdef RATELIMIT
	rrl_deinit(nsd->this_child->child_num);
#endif
	event_base_free(event_base);
	region_destroy(server_region);
#endif
	server_shutdown(nsd);
}

static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg)
{
	int* timed_out = (int*)arg;
	assert(event & EV_TIMEOUT); (void)event;
	/* wake up the service tcp thread, note event is no longer
	 * registered */
	*timed_out = 1;
}

void
service_remaining_tcp(struct nsd* nsd)
{
	struct tcp_handler_data* p;
	struct event_base* event_base;
	/* check if it is needed */
	if(nsd->current_tcp_count == 0 || tcp_active_list == NULL)
		return;
	VERBOSITY(4, (LOG_INFO, "service remaining TCP connections"));

	/* setup event base */
	event_base = nsd_child_event_base();
	if(!event_base) {
		log_msg(LOG_ERR, "nsd remain tcp could not create event base");
		return;
	}
	/* register tcp connections */
	for(p = tcp_active_list; p != NULL; p = p->next) {
		struct timeval timeout;
		int fd = p->event.ev_fd;
#ifdef USE_MINI_EVENT
		short event = p->event.ev_flags & (EV_READ|EV_WRITE);
#else
		short event = p->event.ev_events & (EV_READ|EV_WRITE);
#endif
		void (*fn)(int, short, void*);
#ifdef HAVE_SSL
		if(p->tls) {
			if((event&EV_READ))
				fn = handle_tls_reading;
			else fn = handle_tls_writing;
		} else {
#endif
			if((event&EV_READ))
				fn = handle_tcp_reading;
			else fn = handle_tcp_writing;
#ifdef HAVE_SSL
		}
#endif

		/* set timeout to 1/10 second */
		if(p->tcp_timeout > 100)
			p->tcp_timeout = 100;
		timeout.tv_sec = p->tcp_timeout / 1000;
		timeout.tv_usec = (p->tcp_timeout % 1000)*1000;
		event_del(&p->event);
		memset(&p->event, 0, sizeof(p->event));
		event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT,
			fn, p);
		if(event_base_set(event_base, &p->event) != 0)
			log_msg(LOG_ERR, "event base set failed");
		if(event_add(&p->event, &timeout) != 0)
			log_msg(LOG_ERR, "event add failed");
	}

	/* handle it */
	while(nsd->current_tcp_count > 0) {
		sig_atomic_t m = server_signal_mode(nsd);
		struct event timeout;
		struct timeval tv;
		int timed_out = 0;
		if(m == NSD_QUIT || m == NSD_SHUTDOWN ||
			m == NSD_REAP_CHILDREN) {
			/* quit */
			break;
		}
		/* timer */
		/* have to do something every second */
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		memset(&timeout, 0, sizeof(timeout));
		event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout,
			&timed_out);
		if(event_base_set(event_base, &timeout) != 0)
			log_msg(LOG_ERR, "remaintcp timer: event_base_set failed");
		if(event_add(&timeout, &tv) != 0)
			log_msg(LOG_ERR, "remaintcp timer: event_add failed");

		/* service loop */
		if(event_base_loop(event_base, EVLOOP_ONCE) == -1) {
			if (errno !=
EINTR) { 3146 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); 3147 break; 3148 } 3149 } 3150 if(!timed_out) { 3151 event_del(&timeout); 3152 } else { 3153 /* timed out, quit */ 3154 VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit")); 3155 break; 3156 } 3157 } 3158 #ifdef MEMCLEAN 3159 event_base_free(event_base); 3160 #endif 3161 /* continue to quit after return */ 3162 } 3163 3164 /* Implement recvmmsg and sendmmsg if the platform does not. These functions 3165 * are always used, even if nonblocking operations are broken, in which case 3166 * NUM_RECV_PER_SELECT is defined to 1 (one). 3167 */ 3168 #if defined(HAVE_RECVMMSG) 3169 #define nsd_recvmmsg recvmmsg 3170 #else /* !HAVE_RECVMMSG */ 3171 3172 static int 3173 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, 3174 int flags, struct timespec *timeout) 3175 { 3176 unsigned int vpos = 0; 3177 ssize_t rcvd; 3178 3179 /* timeout is ignored, ensure caller does not expect it to work */ 3180 assert(timeout == NULL); (void)timeout; 3181 3182 while(vpos < vlen) { 3183 rcvd = recvfrom(sockfd, 3184 msgvec[vpos].msg_hdr.msg_iov->iov_base, 3185 msgvec[vpos].msg_hdr.msg_iov->iov_len, 3186 flags, 3187 msgvec[vpos].msg_hdr.msg_name, 3188 &msgvec[vpos].msg_hdr.msg_namelen); 3189 if(rcvd < 0) { 3190 break; 3191 } else { 3192 assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX); 3193 msgvec[vpos].msg_len = (unsigned int)rcvd; 3194 vpos++; 3195 } 3196 } 3197 3198 if(vpos) { 3199 /* error will be picked up next time */ 3200 return (int)vpos; 3201 } else if(errno == 0) { 3202 return 0; 3203 } else if(errno == EAGAIN) { 3204 return 0; 3205 } 3206 3207 return -1; 3208 } 3209 #endif /* HAVE_RECVMMSG */ 3210 3211 #ifdef HAVE_SENDMMSG 3212 #define nsd_sendmmsg(...) 
sendmmsg(__VA_ARGS__) 3213 #else /* !HAVE_SENDMMSG */ 3214 3215 static int 3216 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags) 3217 { 3218 unsigned int vpos = 0; 3219 ssize_t snd; 3220 3221 while(vpos < vlen) { 3222 assert(msgvec[vpos].msg_hdr.msg_iovlen == 1); 3223 snd = sendto(sockfd, 3224 msgvec[vpos].msg_hdr.msg_iov->iov_base, 3225 msgvec[vpos].msg_hdr.msg_iov->iov_len, 3226 flags, 3227 msgvec[vpos].msg_hdr.msg_name, 3228 msgvec[vpos].msg_hdr.msg_namelen); 3229 if(snd < 0) { 3230 break; 3231 } else { 3232 msgvec[vpos].msg_len = (unsigned int)snd; 3233 vpos++; 3234 } 3235 } 3236 3237 if(vpos) { 3238 return (int)vpos; 3239 } else if(errno == 0) { 3240 return 0; 3241 } 3242 3243 return -1; 3244 } 3245 #endif /* HAVE_SENDMMSG */ 3246 3247 static void 3248 handle_udp(int fd, short event, void* arg) 3249 { 3250 struct udp_handler_data *data = (struct udp_handler_data *) arg; 3251 int received, sent, recvcount, i; 3252 struct query *q; 3253 3254 if (!(event & EV_READ)) { 3255 return; 3256 } 3257 recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL); 3258 /* this printf strangely gave a performance increase on Linux */ 3259 /* printf("recvcount %d \n", recvcount); */ 3260 if (recvcount == -1) { 3261 if (errno != EAGAIN && errno != EINTR) { 3262 log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno)); 3263 STATUP(data->nsd, rxerr); 3264 /* No zone statup */ 3265 } 3266 /* Simply no data available */ 3267 return; 3268 } 3269 for (i = 0; i < recvcount; i++) { 3270 loopstart: 3271 received = msgs[i].msg_len; 3272 queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen; 3273 q = queries[i]; 3274 if (received == -1) { 3275 log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror( 3276 #if defined(HAVE_RECVMMSG) 3277 msgs[i].msg_hdr.msg_flags 3278 #else 3279 errno 3280 #endif 3281 )); 3282 STATUP(data->nsd, rxerr); 3283 /* No zone statup */ 3284 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3285 iovecs[i].iov_len = buffer_remaining(q->packet); 3286 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3287 goto swap_drop; 3288 } 3289 3290 /* Account... */ 3291 #ifdef BIND8_STATS 3292 if (data->socket->addr.ai_family == AF_INET) { 3293 STATUP(data->nsd, qudp); 3294 } else if (data->socket->addr.ai_family == AF_INET6) { 3295 STATUP(data->nsd, qudp6); 3296 } 3297 #endif 3298 3299 buffer_skip(q->packet, received); 3300 buffer_flip(q->packet); 3301 #ifdef USE_DNSTAP 3302 dt_collector_submit_auth_query(data->nsd, &q->addr, q->addrlen, 3303 q->tcp, q->packet); 3304 #endif /* USE_DNSTAP */ 3305 3306 /* Process and answer the query... */ 3307 if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) { 3308 if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) { 3309 STATUP(data->nsd, nona); 3310 ZTATUP(data->nsd, q->zone, nona); 3311 } 3312 3313 #ifdef USE_ZONE_STATS 3314 if (data->socket->addr.ai_family == AF_INET) { 3315 ZTATUP(data->nsd, q->zone, qudp); 3316 } else if (data->socket->addr.ai_family == AF_INET6) { 3317 ZTATUP(data->nsd, q->zone, qudp6); 3318 } 3319 #endif 3320 3321 /* Add EDNS0 and TSIG info if necessary. */ 3322 query_add_optional(q, data->nsd); 3323 3324 buffer_flip(q->packet); 3325 iovecs[i].iov_len = buffer_remaining(q->packet); 3326 #ifdef BIND8_STATS 3327 /* Account the rcode & TC... 
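 (bump the per-rcode counter, and the truncated counter if the TC bit
 is set)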
*/ 3328 STATUP2(data->nsd, rcode, RCODE(q->packet)); 3329 ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet)); 3330 if (TC(q->packet)) { 3331 STATUP(data->nsd, truncated); 3332 ZTATUP(data->nsd, q->zone, truncated); 3333 } 3334 #endif /* BIND8_STATS */ 3335 #ifdef USE_DNSTAP 3336 dt_collector_submit_auth_response(data->nsd, 3337 &q->addr, q->addrlen, q->tcp, q->packet, 3338 q->zone); 3339 #endif /* USE_DNSTAP */ 3340 } else { 3341 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3342 iovecs[i].iov_len = buffer_remaining(q->packet); 3343 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3344 swap_drop: 3345 STATUP(data->nsd, dropped); 3346 ZTATUP(data->nsd, q->zone, dropped); 3347 if(i != recvcount-1) { 3348 /* swap with last and decrease recvcount */ 3349 struct mmsghdr mtmp = msgs[i]; 3350 struct iovec iotmp = iovecs[i]; 3351 recvcount--; 3352 msgs[i] = msgs[recvcount]; 3353 iovecs[i] = iovecs[recvcount]; 3354 queries[i] = queries[recvcount]; 3355 msgs[recvcount] = mtmp; 3356 iovecs[recvcount] = iotmp; 3357 queries[recvcount] = q; 3358 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 3359 msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount]; 3360 goto loopstart; 3361 } else { recvcount --; } 3362 } 3363 } 3364 3365 /* send until all are sent */ 3366 i = 0; 3367 while(i<recvcount) { 3368 sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0); 3369 if(sent == -1) { 3370 if(errno == ENOBUFS || 3371 #ifdef EWOULDBLOCK 3372 errno == EWOULDBLOCK || 3373 #endif 3374 errno == EAGAIN) { 3375 /* block to wait until send buffer avail */ 3376 int flag; 3377 if((flag = fcntl(fd, F_GETFL)) == -1) { 3378 log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno)); 3379 flag = 0; 3380 } 3381 flag &= ~O_NONBLOCK; 3382 if(fcntl(fd, F_SETFL, flag) == -1) 3383 log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno)); 3384 sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0); 3385 flag |= O_NONBLOCK; 3386 if(fcntl(fd, F_SETFL, flag) == -1) 3387 log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno)); 3388 if(sent != -1) { 3389 i += sent; 3390 continue; 3391 } 3392 } 3393 /* don't log transient network full errors, unless 3394 * on higher verbosity */ 3395 if(!(errno == ENOBUFS && verbosity < 1) && 3396 #ifdef EWOULDBLOCK 3397 errno != EWOULDBLOCK && 3398 #endif 3399 errno != EAGAIN) { 3400 const char* es = strerror(errno); 3401 char a[48]; 3402 addr2str(&queries[i]->addr, a, sizeof(a)); 3403 log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es); 3404 } 3405 #ifdef BIND8_STATS 3406 data->nsd->st.txerr += recvcount-i; 3407 #endif /* BIND8_STATS */ 3408 break; 3409 } 3410 i += sent; 3411 } 3412 for(i=0; i<recvcount; i++) { 3413 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3414 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 3415 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3416 } 3417 } 3418 3419 #ifdef HAVE_SSL 3420 /* 3421 * Setup an event for the tcp handler. 
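 * Deletes the current registration and re-registers fd with the given
 * callback and event bits, re-arming the timeout from nsd->tcp_timeout.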
3422 */ 3423 static void 3424 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *), 3425 int fd, short event) 3426 { 3427 struct timeval timeout; 3428 struct event_base* ev_base; 3429 3430 timeout.tv_sec = data->nsd->tcp_timeout; 3431 timeout.tv_usec = 0L; 3432 3433 ev_base = data->event.ev_base; 3434 event_del(&data->event); 3435 memset(&data->event, 0, sizeof(data->event)); 3436 event_set(&data->event, fd, event, fn, data); 3437 if(event_base_set(ev_base, &data->event) != 0) 3438 log_msg(LOG_ERR, "event base set failed"); 3439 if(event_add(&data->event, &timeout) != 0) 3440 log_msg(LOG_ERR, "event add failed"); 3441 } 3442 #endif /* HAVE_SSL */ 3443 3444 static void 3445 cleanup_tcp_handler(struct tcp_handler_data* data) 3446 { 3447 event_del(&data->event); 3448 #ifdef HAVE_SSL 3449 if(data->tls) { 3450 SSL_shutdown(data->tls); 3451 SSL_free(data->tls); 3452 data->tls = NULL; 3453 } 3454 #endif 3455 close(data->event.ev_fd); 3456 if(data->prev) 3457 data->prev->next = data->next; 3458 else tcp_active_list = data->next; 3459 if(data->next) 3460 data->next->prev = data->prev; 3461 3462 /* 3463 * Enable the TCP accept handlers when the current number of 3464 * TCP connections is about to drop below the maximum number 3465 * of TCP connections. 3466 */ 3467 if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) { 3468 configure_handler_event_types(EV_READ|EV_PERSIST); 3469 if(slowaccept) { 3470 event_del(&slowaccept_event); 3471 slowaccept = 0; 3472 } 3473 } 3474 --data->nsd->current_tcp_count; 3475 assert(data->nsd->current_tcp_count >= 0); 3476 3477 region_destroy(data->region); 3478 } 3479 3480 static void 3481 handle_tcp_reading(int fd, short event, void* arg) 3482 { 3483 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 3484 ssize_t received; 3485 struct event_base* ev_base; 3486 struct timeval timeout; 3487 3488 if ((event & EV_TIMEOUT)) { 3489 /* Connection timed out. */ 3490 cleanup_tcp_handler(data); 3491 return; 3492 } 3493 3494 if (data->nsd->tcp_query_count > 0 && 3495 data->query_count >= data->nsd->tcp_query_count) { 3496 /* No more queries allowed on this tcp connection. */ 3497 cleanup_tcp_handler(data); 3498 return; 3499 } 3500 3501 assert((event & EV_READ)); 3502 3503 if (data->bytes_transmitted == 0) { 3504 query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1); 3505 } 3506 3507 /* 3508 * Check if we received the leading packet length bytes yet. 3509 */ 3510 if (data->bytes_transmitted < sizeof(uint16_t)) { 3511 received = read(fd, 3512 (char *) &data->query->tcplen 3513 + data->bytes_transmitted, 3514 sizeof(uint16_t) - data->bytes_transmitted); 3515 if (received == -1) { 3516 if (errno == EAGAIN || errno == EINTR) { 3517 /* 3518 * Read would block, wait until more 3519 * data is available. 3520 */ 3521 return; 3522 } else { 3523 char buf[48]; 3524 addr2str(&data->query->addr, buf, sizeof(buf)); 3525 #ifdef ECONNRESET 3526 if (verbosity >= 2 || errno != ECONNRESET) 3527 #endif /* ECONNRESET */ 3528 log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno)); 3529 cleanup_tcp_handler(data); 3530 return; 3531 } 3532 } else if (received == 0) { 3533 /* EOF */ 3534 cleanup_tcp_handler(data); 3535 return; 3536 } 3537 3538 data->bytes_transmitted += received; 3539 if (data->bytes_transmitted < sizeof(uint16_t)) { 3540 /* 3541 * Not done with the tcplen yet, wait for more 3542 * data to become available. 
3543 */ 3544 return; 3545 } 3546 3547 assert(data->bytes_transmitted == sizeof(uint16_t)); 3548 3549 data->query->tcplen = ntohs(data->query->tcplen); 3550 3551 /* 3552 * Minimum query size is: 3553 * 3554 * Size of the header (12) 3555 * + Root domain name (1) 3556 * + Query class (2) 3557 * + Query type (2) 3558 */ 3559 if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) { 3560 VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection")); 3561 cleanup_tcp_handler(data); 3562 return; 3563 } 3564 3565 if (data->query->tcplen > data->query->maxlen) { 3566 VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection")); 3567 cleanup_tcp_handler(data); 3568 return; 3569 } 3570 3571 buffer_set_limit(data->query->packet, data->query->tcplen); 3572 } 3573 3574 assert(buffer_remaining(data->query->packet) > 0); 3575 3576 /* Read the (remaining) query data. */ 3577 received = read(fd, 3578 buffer_current(data->query->packet), 3579 buffer_remaining(data->query->packet)); 3580 if (received == -1) { 3581 if (errno == EAGAIN || errno == EINTR) { 3582 /* 3583 * Read would block, wait until more data is 3584 * available. 3585 */ 3586 return; 3587 } else { 3588 char buf[48]; 3589 addr2str(&data->query->addr, buf, sizeof(buf)); 3590 #ifdef ECONNRESET 3591 if (verbosity >= 2 || errno != ECONNRESET) 3592 #endif /* ECONNRESET */ 3593 log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno)); 3594 cleanup_tcp_handler(data); 3595 return; 3596 } 3597 } else if (received == 0) { 3598 /* EOF */ 3599 cleanup_tcp_handler(data); 3600 return; 3601 } 3602 3603 data->bytes_transmitted += received; 3604 buffer_skip(data->query->packet, received); 3605 if (buffer_remaining(data->query->packet) > 0) { 3606 /* 3607 * Message not yet complete, wait for more data to 3608 * become available. 3609 */ 3610 return; 3611 } 3612 3613 assert(buffer_position(data->query->packet) == data->query->tcplen); 3614 3615 /* Account... */ 3616 #ifdef BIND8_STATS 3617 #ifndef INET6 3618 STATUP(data->nsd, ctcp); 3619 #else 3620 if (data->query->addr.ss_family == AF_INET) { 3621 STATUP(data->nsd, ctcp); 3622 } else if (data->query->addr.ss_family == AF_INET6) { 3623 STATUP(data->nsd, ctcp6); 3624 } 3625 #endif 3626 #endif /* BIND8_STATS */ 3627 3628 /* We have a complete query, process it. */ 3629 3630 /* tcp-query-count: handle query counter ++ */ 3631 data->query_count++; 3632 3633 buffer_flip(data->query->packet); 3634 #ifdef USE_DNSTAP 3635 dt_collector_submit_auth_query(data->nsd, &data->query->addr, 3636 data->query->addrlen, data->query->tcp, data->query->packet); 3637 #endif /* USE_DNSTAP */ 3638 data->query_state = server_process_query(data->nsd, data->query); 3639 if (data->query_state == QUERY_DISCARDED) { 3640 /* Drop the packet and the entire connection... 
		 */
		STATUP(data->nsd, dropped);
		ZTATUP(data->nsd, data->query->zone, dropped);
		cleanup_tcp_handler(data);
		return;
	}

#ifdef BIND8_STATS
	if (RCODE(data->query->packet) == RCODE_OK
		&& !AA(data->query->packet))
	{
		STATUP(data->nsd, nona);
		ZTATUP(data->nsd, data->query->zone, nona);
	}
#endif /* BIND8_STATS */

#ifdef USE_ZONE_STATS
#ifndef INET6
	ZTATUP(data->nsd, data->query->zone, ctcp);
#else
	if (data->query->addr.ss_family == AF_INET) {
		ZTATUP(data->nsd, data->query->zone, ctcp);
	} else if (data->query->addr.ss_family == AF_INET6) {
		ZTATUP(data->nsd, data->query->zone, ctcp6);
	}
#endif
#endif /* USE_ZONE_STATS */

	query_add_optional(data->query, data->nsd);

	/* Switch to the tcp write handler. */
	buffer_flip(data->query->packet);
	data->query->tcplen = buffer_remaining(data->query->packet);
#ifdef BIND8_STATS
	/* Account the rcode & TC... */
	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
	if (TC(data->query->packet)) {
		STATUP(data->nsd, truncated);
		ZTATUP(data->nsd, data->query->zone, truncated);
	}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
	dt_collector_submit_auth_response(data->nsd, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet,
		data->query->zone);
#endif /* USE_DNSTAP */
	data->bytes_transmitted = 0;

	timeout.tv_sec = data->tcp_timeout / 1000;
	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;

	ev_base = data->event.ev_base;
	event_del(&data->event);
	memset(&data->event, 0, sizeof(data->event));
	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
		handle_tcp_reading, data);
	if(event_base_set(ev_base, &data->event) != 0)
		log_msg(LOG_ERR, "event base set tcpr failed");
	if(event_add(&data->event, &timeout) != 0)
		log_msg(LOG_ERR, "event add tcpr failed");
	/* see if we can write the answer right away (usually so, EAGAIN if not) */
	handle_tcp_writing(fd, EV_WRITE, data);
}

static void
handle_tcp_writing(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t sent;
	struct query *q = data->query;
	struct timeval timeout;
	struct event_base* ev_base;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_WRITE));

	if (data->bytes_transmitted < sizeof(q->tcplen)) {
		/* Writing the response packet length. */
		uint16_t n_tcplen = htons(q->tcplen);
#ifdef HAVE_WRITEV
		struct iovec iov[2];
		iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted;
		iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted;
		iov[1].iov_base = buffer_begin(q->packet);
		iov[1].iov_len = buffer_limit(q->packet);
		sent = writev(fd, iov, 2);
#else /* HAVE_WRITEV */
		sent = write(fd,
			(const char *) &n_tcplen + data->bytes_transmitted,
			sizeof(n_tcplen) - data->bytes_transmitted);
#endif /* HAVE_WRITEV */
		if (sent == -1) {
			if (errno == EAGAIN || errno == EINTR) {
				/*
				 * Write would block, wait until
				 * socket becomes writable again.
				return;
			} else {
#ifdef ECONNRESET
				if(verbosity >= 2 || errno != ECONNRESET)
#endif /* ECONNRESET */
#ifdef EPIPE
				if(verbosity >= 2 || errno != EPIPE)
#endif /* EPIPE 'broken pipe' */
				log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
				cleanup_tcp_handler(data);
				return;
			}
		}

		data->bytes_transmitted += sent;
		if (data->bytes_transmitted < sizeof(q->tcplen)) {
			/*
			 * Writing not complete, wait until socket
			 * becomes writable again.
			 */
			return;
		}

#ifdef HAVE_WRITEV
		sent -= sizeof(n_tcplen);
		/* handle potential 'packet done' code */
		goto packet_could_be_done;
#endif
	}

	sent = write(fd,
		buffer_current(q->packet),
		buffer_remaining(q->packet));
	if (sent == -1) {
		if (errno == EAGAIN || errno == EINTR) {
			/*
			 * Write would block, wait until
			 * socket becomes writable again.
			 */
			return;
		} else {
#ifdef ECONNRESET
			if(verbosity >= 2 || errno != ECONNRESET)
#endif /* ECONNRESET */
#ifdef EPIPE
			if(verbosity >= 2 || errno != EPIPE)
#endif /* EPIPE 'broken pipe' */
			log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno));
			cleanup_tcp_handler(data);
			return;
		}
	}

	data->bytes_transmitted += sent;
#ifdef HAVE_WRITEV
  packet_could_be_done:
#endif
	buffer_skip(q->packet, sent);
	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
		/*
		 * Still more data to write when socket becomes
		 * writable again.
		 */
		return;
	}

	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));

	if (data->query_state == QUERY_IN_AXFR) {
		/* Continue processing AXFR and writing back results. */
		buffer_clear(q->packet);
		data->query_state = query_axfr(data->nsd, q);
		if (data->query_state != QUERY_PROCESSED) {
			query_add_optional(data->query, data->nsd);

			/* Reset data. */
			buffer_flip(q->packet);
			q->tcplen = buffer_remaining(q->packet);
			data->bytes_transmitted = 0;
			/* Reset timeout. */
			timeout.tv_sec = data->tcp_timeout / 1000;
			timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
			ev_base = data->event.ev_base;
			event_del(&data->event);
			memset(&data->event, 0, sizeof(data->event));
			event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT,
				handle_tcp_writing, data);
			if(event_base_set(ev_base, &data->event) != 0)
				log_msg(LOG_ERR, "event base set tcpw failed");
			if(event_add(&data->event, &timeout) != 0)
				log_msg(LOG_ERR, "event add tcpw failed");

			/*
			 * Write data if/when the socket is writable
			 * again.
			 */
			return;
		}
	}

	/*
	 * Done sending, wait for the next request to arrive on the
	 * TCP socket by installing the TCP read handler.
	 */
	if (data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) {

		(void) shutdown(fd, SHUT_WR);
	}

	data->bytes_transmitted = 0;

	timeout.tv_sec = data->tcp_timeout / 1000;
	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
	ev_base = data->event.ev_base;
	event_del(&data->event);
	memset(&data->event, 0, sizeof(data->event));
	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
		handle_tcp_reading, data);
	if(event_base_set(ev_base, &data->event) != 0)
		log_msg(LOG_ERR, "event base set tcpw failed");
	if(event_add(&data->event, &timeout) != 0)
		log_msg(LOG_ERR, "event add tcpw failed");
}
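/*
 * Editorial sketch, not part of nsd: the handlers above keep the
 * per-connection timeout in milliseconds and convert it to a struct
 * timeval for libevent at every handler switch. The conversion,
 * expressed as a hypothetical stand-alone helper:
 */
static void
example_msec_to_timeval(int msec, struct timeval* tv)
{
	tv->tv_sec = msec / 1000;		/* whole seconds */
	tv->tv_usec = (msec % 1000) * 1000;	/* remainder, in microseconds */
}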
#ifdef HAVE_SSL
/** create SSL object and associate fd */
static SSL*
incoming_ssl_fd(SSL_CTX* ctx, int fd)
{
	SSL* ssl = SSL_new((SSL_CTX*)ctx);
	if(!ssl) {
		log_crypto_err("could not SSL_new");
		return NULL;
	}
	SSL_set_accept_state(ssl);
	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
	if(!SSL_set_fd(ssl, fd)) {
		log_crypto_err("could not SSL_set_fd");
		SSL_free(ssl);
		return NULL;
	}
	return ssl;
}

/** TLS handshake to upgrade TCP connection */
static int
tls_handshake(struct tcp_handler_data* data, int fd, int writing)
{
	int r;
	if(data->shake_state == tls_hs_read_event) {
		/* read condition satisfied, switch back to writing */
		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
		data->shake_state = tls_hs_none;
		return 1;
	}
	if(data->shake_state == tls_hs_write_event) {
		/* write condition satisfied, switch back to reading */
		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
		data->shake_state = tls_hs_none;
		return 1;
	}

	/* (continue to) set up the TLS connection */
	ERR_clear_error();
	r = SSL_do_handshake(data->tls);

	if(r != 1) {
		int want = SSL_get_error(data->tls, r);
		if(want == SSL_ERROR_WANT_READ) {
			if(data->shake_state == tls_hs_read) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_read;
			/* switch back to reading mode */
			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
			return 1;
		} else if(want == SSL_ERROR_WANT_WRITE) {
			if(data->shake_state == tls_hs_write) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_write;
			/* switch back to writing mode */
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
			return 1;
		} else {
			if(r == 0)
				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
			else {
				unsigned long err = ERR_get_error();
				if(!squelch_err_ssl_handshake(err)) {
					char a[64], s[256];
					addr2str(&data->query->addr, a, sizeof(a));
					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
					log_crypto_from_err(s, err);
				}
			}
			cleanup_tcp_handler(data);
			return 0;
		}
	}

	/* Log the successful upgrade for testing; this could be removed. */
	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
	/* set back to the event we need to have when reading (or writing) */
	if(data->shake_state == tls_hs_read && writing) {
		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
	} else if(data->shake_state == tls_hs_write && !writing) {
		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
	}
	data->shake_state = tls_hs_none;
	return 1;
}
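/*
 * Editorial sketch, not part of nsd: the handshake and I/O handlers
 * all branch on SSL_get_error() after a non-blocking OpenSSL call.
 * A condensed, hypothetical classification of the cases these
 * handlers distinguish (names are illustrative only):
 */
enum example_tls_action {
	EXAMPLE_TLS_WAIT_READ,	/* retry when the socket is readable */
	EXAMPLE_TLS_WAIT_WRITE,	/* retry when the socket is writable */
	EXAMPLE_TLS_CLOSED,	/* clean TLS shutdown by the peer */
	EXAMPLE_TLS_FATAL	/* unrecoverable, drop the channel */
};

static enum example_tls_action
example_classify_ssl_error(SSL* ssl, int ret)
{
	switch(SSL_get_error(ssl, ret)) {
	case SSL_ERROR_WANT_READ:
		return EXAMPLE_TLS_WAIT_READ;
	case SSL_ERROR_WANT_WRITE:
		return EXAMPLE_TLS_WAIT_WRITE;
	case SSL_ERROR_ZERO_RETURN:
		return EXAMPLE_TLS_CLOSED;
	default:
		return EXAMPLE_TLS_FATAL;
	}
}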
/** handle TLS reading of incoming query */
static void
handle_tls_reading(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t received;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out. */
		cleanup_tcp_handler(data);
		return;
	}

	if (data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) {
		/* No more queries allowed on this tcp connection. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_READ));

	if (data->bytes_transmitted == 0) {
		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
	}

	if(data->shake_state != tls_hs_none) {
		if(!tls_handshake(data, fd, 0))
			return;
		if(data->shake_state != tls_hs_none)
			return;
	}

	/*
	 * Check if we received the leading packet length bytes yet.
	 */
	if(data->bytes_transmitted < sizeof(uint16_t)) {
		ERR_clear_error();
		if((received=SSL_read(data->tls, (char *) &data->query->tcplen
			+ data->bytes_transmitted,
			sizeof(uint16_t) - data->bytes_transmitted)) <= 0) {
			int want = SSL_get_error(data->tls, received);
			if(want == SSL_ERROR_ZERO_RETURN) {
				cleanup_tcp_handler(data);
				return; /* shutdown, closed */
			} else if(want == SSL_ERROR_WANT_READ) {
				/* wants to be called again */
				return;
			}
			else if(want == SSL_ERROR_WANT_WRITE) {
				/* switch to writing */
				data->shake_state = tls_hs_write_event;
				tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
				return;
			}
			cleanup_tcp_handler(data);
			log_crypto_err("could not SSL_read");
			return;
		}

		data->bytes_transmitted += received;
		if (data->bytes_transmitted < sizeof(uint16_t)) {
			/*
			 * Not done with the tcplen yet, wait for more
			 * data to become available.
			 */
			return;
		}

		assert(data->bytes_transmitted == sizeof(uint16_t));

		data->query->tcplen = ntohs(data->query->tcplen);

		/*
		 * Minimum query size is:
		 *
		 *     Size of the header (12)
		 *   + Root domain name   (1)
		 *   + Query class        (2)
		 *   + Query type         (2)
		 */
		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
			cleanup_tcp_handler(data);
			return;
		}

		if (data->query->tcplen > data->query->maxlen) {
			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
			cleanup_tcp_handler(data);
			return;
		}

		buffer_set_limit(data->query->packet, data->query->tcplen);
	}

	assert(buffer_remaining(data->query->packet) > 0);

	/* Read the (remaining) query data. */
	ERR_clear_error();
	received = SSL_read(data->tls, (void*)buffer_current(data->query->packet),
		(int)buffer_remaining(data->query->packet));
	if(received <= 0) {
		int want = SSL_get_error(data->tls, received);
		if(want == SSL_ERROR_ZERO_RETURN) {
			cleanup_tcp_handler(data);
			return; /* shutdown, closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* wants to be called again */
			return;
		}
		else if(want == SSL_ERROR_WANT_WRITE) {
			/* switch back to writing */
			data->shake_state = tls_hs_write_event;
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
			return;
		}
		cleanup_tcp_handler(data);
		log_crypto_err("could not SSL_read");
		return;
	}

	data->bytes_transmitted += received;
	buffer_skip(data->query->packet, received);
	if (buffer_remaining(data->query->packet) > 0) {
		/*
		 * Message not yet complete, wait for more data to
		 * become available.
		 */
		return;
	}

	assert(buffer_position(data->query->packet) == data->query->tcplen);

	/* Account... */
#ifndef INET6
	STATUP(data->nsd, ctls);
#else
	if (data->query->addr.ss_family == AF_INET) {
		STATUP(data->nsd, ctls);
	} else if (data->query->addr.ss_family == AF_INET6) {
		STATUP(data->nsd, ctls6);
	}
#endif

	/* We have a complete query, process it. */

	/* tcp-query-count: handle query counter ++ */
	data->query_count++;

	buffer_flip(data->query->packet);
#ifdef USE_DNSTAP
	dt_collector_submit_auth_query(data->nsd, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet);
#endif /* USE_DNSTAP */
	data->query_state = server_process_query(data->nsd, data->query);
	if (data->query_state == QUERY_DISCARDED) {
		/* Drop the packet and the entire connection... */
		STATUP(data->nsd, dropped);
		ZTATUP(data->nsd, data->query->zone, dropped);
		cleanup_tcp_handler(data);
		return;
	}

#ifdef BIND8_STATS
	if (RCODE(data->query->packet) == RCODE_OK
		&& !AA(data->query->packet))
	{
		STATUP(data->nsd, nona);
		ZTATUP(data->nsd, data->query->zone, nona);
	}
#endif /* BIND8_STATS */

#ifdef USE_ZONE_STATS
#ifndef INET6
	ZTATUP(data->nsd, data->query->zone, ctls);
#else
	if (data->query->addr.ss_family == AF_INET) {
		ZTATUP(data->nsd, data->query->zone, ctls);
	} else if (data->query->addr.ss_family == AF_INET6) {
		ZTATUP(data->nsd, data->query->zone, ctls6);
	}
#endif
#endif /* USE_ZONE_STATS */

	query_add_optional(data->query, data->nsd);

	/* Switch to the tcp write handler. */
	buffer_flip(data->query->packet);
	data->query->tcplen = buffer_remaining(data->query->packet);
#ifdef BIND8_STATS
	/* Account the rcode & TC... */
	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
	if (TC(data->query->packet)) {
		STATUP(data->nsd, truncated);
		ZTATUP(data->nsd, data->query->zone, truncated);
	}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
	dt_collector_submit_auth_response(data->nsd, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet,
		data->query->zone);
#endif /* USE_DNSTAP */
	data->bytes_transmitted = 0;

	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);

	/* see if we can write the answer right away (usually so; EAGAIN if not) */
	handle_tls_writing(fd, EV_WRITE, data);
}
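/*
 * Editorial sketch, not part of nsd: both the TCP and TLS readers
 * reject frames shorter than the smallest well-formed query, which is
 * the 12-octet header plus the 1-octet root domain name plus 2 octets
 * each for query type and class, 17 octets in total. The bounds check,
 * stated as a hypothetical predicate:
 */
static int
example_tcplen_is_plausible(uint16_t tcplen, size_t bufmax)
{
	const size_t min_query = 12 /* header */ + 1 /* root name */
		+ 2 /* query type */ + 2 /* query class */;
	return (size_t)tcplen >= min_query && (size_t)tcplen <= bufmax;
}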
/** handle TLS writing of outgoing response */
static void
handle_tls_writing(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t sent;
	struct query *q = data->query;
	/* static variable that holds reassembly buffer used to put the
	 * TCP length in front of the packet, like writev. */
	static buffer_type* global_tls_temp_buffer = NULL;
	buffer_type* write_buffer;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_WRITE));

	if(data->shake_state != tls_hs_none) {
		if(!tls_handshake(data, fd, 1))
			return;
		if(data->shake_state != tls_hs_none)
			return;
	}

	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);

	/* If we are writing the start of a message, we must include the
	 * length; this is done with a copy into write_buffer. */
	write_buffer = NULL;
	if (data->bytes_transmitted == 0) {
		if(!global_tls_temp_buffer) {
			/* gets deallocated when nsd shuts down from
			 * nsd.region */
			global_tls_temp_buffer = buffer_create(nsd.region,
				QIOBUFSZ + sizeof(q->tcplen));
			if (!global_tls_temp_buffer) {
				return;
			}
		}
		write_buffer = global_tls_temp_buffer;
		buffer_clear(write_buffer);
		buffer_write_u16(write_buffer, q->tcplen);
		buffer_write(write_buffer, buffer_current(q->packet),
			(int)buffer_remaining(q->packet));
		buffer_flip(write_buffer);
	} else {
		write_buffer = q->packet;
	}

	/* Write the response */
	ERR_clear_error();
	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
	if(sent <= 0) {
		int want = SSL_get_error(data->tls, sent);
		if(want == SSL_ERROR_ZERO_RETURN) {
			cleanup_tcp_handler(data);
			/* closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* switch back to reading */
			data->shake_state = tls_hs_read_event;
			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
		} else if(want != SSL_ERROR_WANT_WRITE) {
			cleanup_tcp_handler(data);
			log_crypto_err("could not SSL_write");
		}
		return;
	}

	buffer_skip(write_buffer, sent);
	if(buffer_remaining(write_buffer) != 0) {
		/* Not all was sent; sync up the real packet buffer,
		 * since the temporary buffer was written instead of it. */
		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
		}
	}

	data->bytes_transmitted += sent;
	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
		/*
		 * Still more data to write when socket becomes
		 * writable again.
		 */
		return;
	}

	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));

	if (data->query_state == QUERY_IN_AXFR) {
		/* Continue processing AXFR and writing back results. */
		buffer_clear(q->packet);
		data->query_state = query_axfr(data->nsd, q);
		if (data->query_state != QUERY_PROCESSED) {
			query_add_optional(data->query, data->nsd);

			/* Reset data. */
			buffer_flip(q->packet);
			q->tcplen = buffer_remaining(q->packet);
			data->bytes_transmitted = 0;
			/* Reset to writing mode. */
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);

			/*
			 * Write data if/when the socket is writable
			 * again.
			 */
			return;
		}
	}

	/*
	 * Done sending, wait for the next request to arrive on the
	 * TCP socket by installing the TCP read handler.
	 */
	if (data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) {

		(void) shutdown(fd, SHUT_WR);
	}

	data->bytes_transmitted = 0;

	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
}
#endif
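/*
 * Editorial sketch, not part of nsd: the plain-TCP writer can hand
 * writev() two separate pieces (length prefix and packet), but
 * SSL_write() takes a single contiguous buffer, which is why the TLS
 * writer above reassembles the frame in a temporary buffer first.
 * The copy it performs amounts to this hypothetical helper; 'out'
 * must have room for len + 2 octets:
 */
static size_t
example_prepend_tcp_length(uint8_t* out, const uint8_t* msg, uint16_t len)
{
	uint16_t n_len = htons(len);
	memcpy(out, &n_len, sizeof(n_len));	/* 2-octet length prefix */
	memcpy(out + sizeof(n_len), msg, len);	/* message body */
	return sizeof(n_len) + (size_t)len;
}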
static void
handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
	void* ATTR_UNUSED(arg))
{
	if(slowaccept) {
		configure_handler_event_types(EV_PERSIST | EV_READ);
		slowaccept = 0;
	}
}

static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
{
#ifndef HAVE_ACCEPT4
	int s = accept(fd, addr, addrlen);
	if (s != -1) {
		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
			close(s);
			s = -1;
			/* Pretend the failure was an interrupted accept:
			 * setting errno to EINTR makes the caller of
			 * perform_accept() skip its error printout, so the
			 * error is not logged twice. */
			errno = EINTR;
		}
	}
	return s;
#else
	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
#endif /* HAVE_ACCEPT4 */
}

/*
 * Handle an incoming TCP connection. The connection is accepted and
 * a new TCP reader event handler is added. The TCP handler
 * is responsible for cleanup when the connection is closed.
 */
static void
handle_tcp_accept(int fd, short event, void* arg)
{
	struct tcp_accept_handler_data *data
		= (struct tcp_accept_handler_data *) arg;
	int s;
	int reject = 0;
	struct tcp_handler_data *tcp_data;
	region_type *tcp_region;
#ifdef INET6
	struct sockaddr_storage addr;
#else
	struct sockaddr_in addr;
#endif
	socklen_t addrlen;
	struct timeval timeout;

	if (!(event & EV_READ)) {
		return;
	}

	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
		reject = data->nsd->options->tcp_reject_overflow;
		if (!reject) {
			return;
		}
	}

	/* Accept it... */
	addrlen = sizeof(addr);
	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
	if (s == -1) {
		/**
		 * EMFILE and ENFILE signal that the limit of open
		 * file descriptors has been reached. Pause accept().
		 * EINTR is a signal interrupt. The others are various
		 * OS ways of saying that the client has closed the
		 * connection.
		 */
		if (errno == EMFILE || errno == ENFILE) {
			if (!slowaccept) {
				/* disable accept events */
				struct timeval tv;
				configure_handler_event_types(0);
				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
				tv.tv_usec = 0L;
				memset(&slowaccept_event, 0,
					sizeof(slowaccept_event));
				event_set(&slowaccept_event, -1, EV_TIMEOUT,
					handle_slowaccept_timeout, NULL);
				(void)event_base_set(data->event.ev_base,
					&slowaccept_event);
				(void)event_add(&slowaccept_event, &tv);
				slowaccept = 1;
				/* We don't want to spam the logs here */
			}
		} else if (errno != EINTR
			&& errno != EWOULDBLOCK
#ifdef ECONNABORTED
			&& errno != ECONNABORTED
#endif /* ECONNABORTED */
#ifdef EPROTO
			&& errno != EPROTO
#endif /* EPROTO */
			) {
			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
		}
		return;
	}

	if (reject) {
		shutdown(s, SHUT_RDWR);
		close(s);
		return;
	}

	/*
	 * This region is deallocated when the TCP connection is
	 * closed by the TCP handler.
	 */
	tcp_region = region_create(xalloc, free);
	tcp_data = (struct tcp_handler_data *) region_alloc(
		tcp_region, sizeof(struct tcp_handler_data));
	tcp_data->region = tcp_region;
	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
		compression_table_size, compressed_dnames);
	tcp_data->nsd = data->nsd;
	tcp_data->query_count = 0;
#ifdef HAVE_SSL
	tcp_data->shake_state = tls_hs_none;
	tcp_data->tls = NULL;
#endif
	tcp_data->prev = NULL;
	tcp_data->next = NULL;

	tcp_data->query_state = QUERY_PROCESSED;
	tcp_data->bytes_transmitted = 0;
	memcpy(&tcp_data->query->addr, &addr, addrlen);
	tcp_data->query->addrlen = addrlen;

	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
		/* very busy, give smaller timeout */
		tcp_data->tcp_timeout = 200;
	}
	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;

#ifdef HAVE_SSL
	if (data->tls_accept) {
		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
		if(!tcp_data->tls) {
			close(s);
			return;
		}
		tcp_data->shake_state = tls_hs_read;
		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
			handle_tls_reading, tcp_data);
	} else {
#endif
		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
			handle_tcp_reading, tcp_data);
#ifdef HAVE_SSL
	}
#endif
	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
		log_msg(LOG_ERR, "cannot set tcp event base");
		close(s);
		region_destroy(tcp_region);
		return;
	}
	if(event_add(&tcp_data->event, &timeout) != 0) {
		log_msg(LOG_ERR, "cannot add tcp to event base");
		close(s);
		region_destroy(tcp_region);
		return;
	}
	if(tcp_active_list) {
		tcp_active_list->prev = tcp_data;
		tcp_data->next = tcp_active_list;
	}
	tcp_active_list = tcp_data;

	/*
	 * Keep track of the total number of TCP handlers installed so
	 * we can stop accepting connections when the maximum number
	 * of simultaneous TCP connections is reached.
	 *
	 * If tcp-reject-overflow is enabled, however, then we do not
	 * change the handler event type; we keep it as-is and accept
	 * overflow TCP connections only so that we can forcibly kill
	 * them off.
	 */
	++data->nsd->current_tcp_count;
	if (!data->nsd->options->tcp_reject_overflow &&
		data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
	{
		configure_handler_event_types(0);
	}
}
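/*
 * Editorial sketch, not part of nsd: handle_tcp_accept() above pushes
 * each new connection onto the front of the doubly-linked
 * tcp_active_list (the node's prev/next were initialized to NULL), so
 * cleanup can later unlink it from an arbitrary position. The push,
 * written as a hypothetical stand-alone helper:
 */
static void
example_active_list_push(struct tcp_handler_data** head,
	struct tcp_handler_data* node)
{
	node->prev = NULL;
	node->next = *head;
	if(*head)
		(*head)->prev = node;	/* old head now points back */
	*head = node;
}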
static void
send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
{
	size_t i;
	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
			if (write(nsd->children[i].child_fd,
				&command,
				sizeof(command)) == -1)
			{
				if(errno != EAGAIN && errno != EINTR)
					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
						(int) command,
						(int) nsd->children[i].pid,
						strerror(errno));
			} else if (timeout > 0) {
				(void)block_read(NULL,
					nsd->children[i].child_fd,
					&command, sizeof(command), timeout);
			}
			fsync(nsd->children[i].child_fd);
			close(nsd->children[i].child_fd);
			nsd->children[i].child_fd = -1;
		}
	}
}

static void
send_children_quit(struct nsd* nsd)
{
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
	send_children_command(nsd, NSD_QUIT, 0);
}

static void
send_children_quit_and_wait(struct nsd* nsd)
{
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
	send_children_command(nsd, NSD_QUIT_CHILD, 3);
}

#ifdef BIND8_STATS
static void
set_children_stats(struct nsd* nsd)
{
	size_t i;
	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].need_to_send_STATS = 1;
		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
	}
}
#endif /* BIND8_STATS */

static void
configure_handler_event_types(short event_types)
{
	size_t i;

	for (i = 0; i < tcp_accept_handler_count; ++i) {
		struct event* handler = &tcp_accept_handlers[i].event;
		if(event_types) {
			/* reassign */
			int fd = handler->ev_fd;
			struct event_base* base = handler->ev_base;
			if(tcp_accept_handlers[i].event_added)
				event_del(handler);
			memset(handler, 0, sizeof(*handler));
			event_set(handler, fd, event_types,
				handle_tcp_accept, &tcp_accept_handlers[i]);
			if(event_base_set(base, handler) != 0)
				log_msg(LOG_ERR, "conhand: cannot event_base");
			if(event_add(handler, NULL) != 0)
				log_msg(LOG_ERR, "conhand: cannot event_add");
			tcp_accept_handlers[i].event_added = 1;
		} else {
			/* remove */
			if(tcp_accept_handlers[i].event_added) {
				event_del(handler);
				tcp_accept_handlers[i].event_added = 0;
			}
		}
	}
}
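/*
 * Editorial sketch, not part of nsd: configure_handler_event_types()
 * above cannot simply change the flags of a pending libevent event;
 * it must delete the event, zero the struct, set it up again and
 * re-add it. That reassignment pattern in isolation, as a
 * hypothetical helper with the same caveats:
 */
static int
example_event_reassign(struct event* ev, struct event_base* base, int fd,
	short flags, void (*cb)(int, short, void*), void* arg)
{
	event_del(ev);			/* must not stay pending */
	memset(ev, 0, sizeof(*ev));
	event_set(ev, fd, flags, cb, arg);
	if(event_base_set(base, ev) != 0)
		return -1;		/* could not attach to base */
	return event_add(ev, NULL) == 0 ? 0 : -1;
}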