/*
 * server.c -- nsd(8) network input/output
 *
 * Copyright (c) 2001-2006, NLnet Labs. All rights reserved.
 *
 * See LICENSE for the license.
 *
 */

#include "config.h"

#include <sys/types.h>
#include <sys/param.h>
#include <limits.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <netinet/in.h>
#ifdef USE_TCP_FASTOPEN
#include <netinet/tcp.h>
#endif
#include <arpa/inet.h>

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include <netdb.h>
#include <poll.h>
#ifdef HAVE_SYS_RANDOM_H
#include <sys/random.h>
#endif
#ifndef SHUT_WR
#define SHUT_WR 1
#endif
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif /* HAVE_MMAP */
#ifdef HAVE_OPENSSL_RAND_H
#include <openssl/rand.h>
#endif
#ifdef HAVE_OPENSSL_SSL_H
#include <openssl/ssl.h>
#endif
#ifdef HAVE_OPENSSL_ERR_H
#include <openssl/err.h>
#endif
#ifdef HAVE_OPENSSL_OCSP_H
#include <openssl/ocsp.h>
#endif
#ifndef USE_MINI_EVENT
#  ifdef HAVE_EVENT_H
#    include <event.h>
#  else
#    include <event2/event.h>
#    include "event2/event_struct.h"
#    include "event2/event_compat.h"
#  endif
#else
#  include "mini_event.h"
#endif

#include "axfr.h"
#include "namedb.h"
#include "netio.h"
#include "xfrd.h"
#include "xfrd-tcp.h"
#include "xfrd-disk.h"
#include "difffile.h"
#include "nsec3.h"
#include "ipc.h"
#include "udb.h"
#include "remote.h"
#include "lookup3.h"
#include "rrl.h"
#ifdef USE_DNSTAP
#include "dnstap/dnstap_collector.h"
#endif

#define RELOAD_SYNC_TIMEOUT 25 /* seconds */

#ifdef USE_TCP_FASTOPEN
#define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen"
#define TCP_FASTOPEN_SERVER_BIT_MASK 0x2
#endif

/*
 * Data for the UDP handlers.
 */
struct udp_handler_data
{
	struct nsd        *nsd;
	struct nsd_socket *socket;
	struct event       event;
};

struct tcp_accept_handler_data {
	struct nsd        *nsd;
	struct nsd_socket *socket;
	int                event_added;
	struct event       event;
#ifdef HAVE_SSL
	/* handler accepts TLS connections on the dedicated port */
	int                tls_accept;
#endif
};

/*
 * These globals are used to enable the TCP accept handlers
 * when the number of TCP connections drops below the maximum
 * number of TCP connections.
 */
static size_t tcp_accept_handler_count;
static struct tcp_accept_handler_data *tcp_accept_handlers;

static struct event slowaccept_event;
static int slowaccept;

#ifdef HAVE_SSL
static unsigned char *ocspdata = NULL;
static long ocspdata_len = 0;
#endif

#ifdef NONBLOCKING_IS_BROKEN
/* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to
   read multiple times from a socket when reported ready by select. */
# define NUM_RECV_PER_SELECT (1)
#else /* !NONBLOCKING_IS_BROKEN */
# define NUM_RECV_PER_SELECT (100)
#endif /* NONBLOCKING_IS_BROKEN */
#ifndef HAVE_MMSGHDR
struct mmsghdr {
	struct msghdr msg_hdr;
	unsigned int  msg_len;
};
#endif

static struct mmsghdr msgs[NUM_RECV_PER_SELECT];
static struct iovec iovecs[NUM_RECV_PER_SELECT];
static struct query *queries[NUM_RECV_PER_SELECT];

/*
 * Data for the TCP connection handlers.
 *
 * The TCP handlers use non-blocking I/O. This is necessary to avoid
 * blocking the entire server on a slow TCP connection, but does make
 * reading from and writing to the socket more complicated.
 *
 * Basically, whenever a read/write would block (indicated by the
 * EAGAIN errno variable) we remember the position we were reading
 * from/writing to and return from the TCP reading/writing event
 * handler. When the socket becomes readable/writable again we
 * continue from the same position.
 */
struct tcp_handler_data
{
	/*
	 * The region used to allocate all TCP connection related
	 * data, including this structure. This region is destroyed
	 * when the connection is closed.
	 */
	region_type* region;

	/*
	 * The global nsd structure.
	 */
	struct nsd* nsd;

	/*
	 * The current query data for this TCP connection.
	 */
	query_type* query;

	/*
	 * The query_state is used to remember if we are performing an
	 * AXFR, if we're done processing, or if we should discard the
	 * query and connection.
	 */
	query_state_type query_state;

	/*
	 * The event for the file descriptor and tcp timeout
	 */
	struct event event;

	/*
	 * The bytes_transmitted field is used to remember the number
	 * of bytes transmitted when receiving or sending a DNS
	 * packet. The count includes the two additional bytes used
	 * to specify the packet length on a TCP connection.
	 */
	size_t bytes_transmitted;

	/*
	 * The number of queries handled by this specific TCP connection.
	 */
	int query_count;

	/*
	 * The timeout in msec for this tcp connection
	 */
	int tcp_timeout;

	/*
	 * If the connection is allowed to have further queries on it.
	 */
	int tcp_no_more_queries;

#ifdef HAVE_SSL
	/*
	 * TLS object.
	 */
	SSL* tls;

	/*
	 * TLS handshake state.
	 */
	enum { tls_hs_none, tls_hs_read, tls_hs_write,
		tls_hs_read_event, tls_hs_write_event } shake_state;
#endif
	/* list of connections, for service of remaining tcp channels */
	struct tcp_handler_data *prev, *next;
};
/* global that is the list of active tcp channels */
static struct tcp_handler_data *tcp_active_list = NULL;

/*
 * Handle incoming queries on the UDP server sockets.
 */
static void handle_udp(int fd, short event, void* arg);

/*
 * Handle incoming connections on the TCP sockets. These handlers
 * usually wait for the NETIO_EVENT_READ event (indicating an incoming
 * connection) but are disabled when the number of current TCP
 * connections is equal to the maximum number of TCP connections.
 * Disabling is done by changing the handler to wait for the
 * NETIO_EVENT_NONE type. This is done using the function
 * configure_tcp_accept_handlers.
 */
static void handle_tcp_accept(int fd, short event, void* arg);

/*
 * Handle incoming queries on a TCP connection. The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete query is received.
 */
static void handle_tcp_reading(int fd, short event, void* arg);
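/*
 * A minimal sketch (illustrative only, not compiled) of the
 * resume-on-EAGAIN scheme described for tcp_handler_data above: a short
 * or would-block write keeps bytes_transmitted as the saved position and
 * returns to the event loop; the next writable event continues from that
 * offset. The buf/total parameters are hypothetical stand-ins for the
 * query packet buffer.
 */
#if 0
static void
example_resumable_write(struct tcp_handler_data *data, int fd,
	const uint8_t *buf, size_t total)
{
	while(data->bytes_transmitted < total) {
		ssize_t sent = write(fd, buf + data->bytes_transmitted,
			total - data->bytes_transmitted);
		if(sent == -1) {
			if(errno == EAGAIN || errno == EINTR)
				return; /* resume on the next writable event */
			return; /* real error: caller closes the connection */
		}
		data->bytes_transmitted += (size_t)sent;
	}
	data->bytes_transmitted = 0; /* complete; reset for the next packet */
}
#endif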
/*
 * Handle outgoing responses on a TCP connection. The TCP connections
 * are configured to be non-blocking and the handler may be called
 * multiple times before a complete response is sent.
 */
static void handle_tcp_writing(int fd, short event, void* arg);

#ifdef HAVE_SSL
/* Create SSL object and associate fd */
static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd);
/*
 * Handle TLS handshake. May be called multiple times if incomplete.
 */
static int tls_handshake(struct tcp_handler_data* data, int fd, int writing);

/*
 * Handle incoming queries on a TLS over TCP connection. The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete query is received.
 */
static void handle_tls_reading(int fd, short event, void* arg);

/*
 * Handle outgoing responses on a TLS over TCP connection. The TLS
 * connections are configured to be non-blocking and the handler may
 * be called multiple times before a complete response is sent.
 */
static void handle_tls_writing(int fd, short event, void* arg);
#endif

/*
 * Send all children the quit nonblocking, then close pipe.
 */
static void send_children_quit(struct nsd* nsd);
/* same, for shutdown time, waits for child to exit to avoid restart issues */
static void send_children_quit_and_wait(struct nsd* nsd);

/* set children's flags to send NSD_STATS to them */
#ifdef BIND8_STATS
static void set_children_stats(struct nsd* nsd);
#endif /* BIND8_STATS */

/*
 * Change the event types the HANDLERS are interested in to EVENT_TYPES.
 */
static void configure_handler_event_types(short event_types);
static uint16_t *compressed_dname_offsets = 0;
static uint32_t compression_table_capacity = 0;
static uint32_t compression_table_size = 0;
static domain_type* compressed_dnames[MAXRRSPP];

#ifdef USE_TCP_FASTOPEN
/* Checks to see if the kernel value must be manually changed in order for
   TCP Fast Open to support server mode */
static void report_tcp_fastopen_config(void) {

	int tcp_fastopen_fp;
	uint8_t tcp_fastopen_value;

	if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) {
		log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n",
			strerror(errno));
		return;
	}
	if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) {
		log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n",
			strerror(errno));
		close(tcp_fastopen_fp);
		return;
	}
	if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) {
		log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n");
		log_msg(LOG_WARNING, "However the kernel parameters are not configured to support TCP_FASTOPEN in server mode.\n");
		log_msg(LOG_WARNING, "To enable TFO use the command:");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n");
		log_msg(LOG_WARNING, "  'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n");
		log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n");
	}
	close(tcp_fastopen_fp);
}
#endif

/*
 * Remove the specified pid from the list of child pids. Returns -1 if
 * the pid is not in the list, child_num otherwise. The field is set to 0.
 */
static int
delete_child_pid(struct nsd *nsd, pid_t pid)
{
	size_t i;
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid == pid) {
			nsd->children[i].pid = 0;
			if(!nsd->children[i].need_to_exit) {
				if(nsd->children[i].child_fd != -1)
					close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
				if(nsd->children[i].handler)
					nsd->children[i].handler->fd = -1;
			}
			return i;
		}
	}
	return -1;
}
/*
 * Restart child servers if necessary.
 */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	struct main_ipc_handler_data *ipc_data;
	size_t i;
	int sv[2];

	/* Fork the child processes... */
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid <= 0) {
			if (nsd->children[i].child_fd != -1)
				close(nsd->children[i].child_fd);
			if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1) {
				log_msg(LOG_ERR, "socketpair: %s",
					strerror(errno));
				return -1;
			}
			nsd->children[i].child_fd = sv[0];
			nsd->children[i].parent_fd = sv[1];
			nsd->children[i].pid = fork();
			switch (nsd->children[i].pid) {
			default: /* SERVER MAIN */
				close(nsd->children[i].parent_fd);
				nsd->children[i].parent_fd = -1;
				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				if(!nsd->children[i].handler)
				{
					ipc_data = (struct main_ipc_handler_data*) region_alloc(
						region, sizeof(struct main_ipc_handler_data));
					ipc_data->nsd = nsd;
					ipc_data->child = &nsd->children[i];
					ipc_data->child_num = i;
					ipc_data->xfrd_sock = xfrd_sock_p;
					ipc_data->packet = buffer_create(region, QIOBUFSZ);
					ipc_data->forward_mode = 0;
					ipc_data->got_bytes = 0;
					ipc_data->total_bytes = 0;
					ipc_data->acl_num = 0;
					nsd->children[i].handler = (struct netio_handler*) region_alloc(
						region, sizeof(struct netio_handler));
					nsd->children[i].handler->fd = nsd->children[i].child_fd;
					nsd->children[i].handler->timeout = NULL;
					nsd->children[i].handler->user_data = ipc_data;
					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
					nsd->children[i].handler->event_handler = parent_handle_child_command;
					netio_add_handler(netio, nsd->children[i].handler);
				}
				/* clear any ongoing ipc */
				ipc_data = (struct main_ipc_handler_data*)
					nsd->children[i].handler->user_data;
				ipc_data->forward_mode = 0;
				/* restart - update fd */
				nsd->children[i].handler->fd = nsd->children[i].child_fd;
				break;
			case 0: /* CHILD */
				/* the child need not be able to access the
				 * nsd.db file */
				namedb_close_udb(nsd->db);
#ifdef MEMCLEAN /* OS collects memory pages */
				region_destroy(region);
#endif
				nsd->pid = 0;
				nsd->child_count = 0;
				nsd->server_kind = nsd->children[i].kind;
				nsd->this_child = &nsd->children[i];
				nsd->this_child->child_num = i;
				/* remove signal flags inherited from parent;
				   the parent will handle them. */
				nsd->signal_hint_reload_hup = 0;
				nsd->signal_hint_reload = 0;
				nsd->signal_hint_child = 0;
				nsd->signal_hint_quit = 0;
				nsd->signal_hint_shutdown = 0;
				nsd->signal_hint_stats = 0;
				nsd->signal_hint_statsusr = 0;
				close(*xfrd_sock_p);
				close(nsd->this_child->child_fd);
				nsd->this_child->child_fd = -1;
				if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				server_child(nsd);
				/* NOTREACH */
				exit(0);
			case -1:
				log_msg(LOG_ERR, "fork failed: %s",
					strerror(errno));
				return -1;
			}
		}
	}
	return 0;
}
#ifdef BIND8_STATS
static void set_bind8_alarm(struct nsd* nsd)
{
	/* resync so that the next alarm is on the next whole period;
	 * e.g. with a period of 300s at epoch time 1137 the alarm is set
	 * to fire in 300 - (1137 % 300) = 63 seconds, exactly on the
	 * next multiple of the period */
	if(nsd->st.period > 0) /* % by 0 gives divbyzero error */
		alarm(nsd->st.period - (time(NULL) % nsd->st.period));
}
#endif

/* set zone stat ids for zones initially read in */
static void
zonestatid_tree_set(struct nsd* nsd)
{
	struct radnode* n;
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		zone_type* zone = (zone_type*)n->elem;
		zone->zonestatid = getzonestatid(nsd->options, zone->opts);
	}
}
#ifdef USE_ZONE_STATS
void
server_zonestat_alloc(struct nsd* nsd)
{
	size_t num = (nsd->options->zonestatnames->count==0?1:
			nsd->options->zonestatnames->count);
	size_t sz = sizeof(struct nsdst)*num;
	char tmpfile[256];
	uint8_t z = 0;

	/* file names */
	nsd->zonestatfname[0] = 0;
	nsd->zonestatfname[1] = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile);
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile);

	/* file descriptors */
	nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[0] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600);
	if(nsd->zonestatfd[1] == -1) {
		log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		close(nsd->zonestatfd[0]);
		unlink(nsd->zonestatfname[0]);
		exit(1);
	}

#ifdef HAVE_MMAP
	if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[0], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[0], strerror(errno));
		exit(1);
	}
	if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[1], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[1], strerror(errno));
		exit(1);
	}
	nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[0], 0);
	if(nsd->zonestat[0] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	nsd->zonestat[1] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE,
		MAP_SHARED, nsd->zonestatfd[1], 0);
	if(nsd->zonestat[1] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
		exit(1);
	}
	memset(nsd->zonestat[0], 0, sz);
	memset(nsd->zonestat[1], 0, sz);
	nsd->zonestatsize[0] = num;
	nsd->zonestatsize[1] = num;
	nsd->zonestatdesired = num;
	nsd->zonestatsizenow = num;
	nsd->zonestatnow = nsd->zonestat[0];
#endif /* HAVE_MMAP */
}

void
zonestat_remap(struct nsd* nsd, int idx, size_t sz)
{
#ifdef HAVE_MMAP
#ifdef MREMAP_MAYMOVE
	nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], sz,
		MREMAP_MAYMOVE);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap failed: %s", strerror(errno));
		exit(1);
	}
#else /* !MREMAP_MAYMOVE */
	if(msync(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0)
		log_msg(LOG_ERR, "msync failed: %s", strerror(errno));
	if(munmap(nsd->zonestat[idx],
		sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0)
		log_msg(LOG_ERR, "munmap failed: %s", strerror(errno));
	nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz,
		PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0);
	if(nsd->zonestat[idx] == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap failed: %s", strerror(errno));
		exit(1);
	}
#endif /* MREMAP_MAYMOVE */
#endif /* HAVE_MMAP */
}

/* realloc the zonestat array for the one that is not currently in use,
 * to match the desired new size of the array (if applicable) */
void
server_zonestat_realloc(struct nsd* nsd)
{
#ifdef HAVE_MMAP
	uint8_t z = 0;
	size_t sz;
	int idx = 0; /* index of the zonestat array that is not in use */
	if(nsd->zonestatnow == nsd->zonestat[0])
		idx = 1;
	if(nsd->zonestatsize[idx] == nsd->zonestatdesired)
		return;
	sz = sizeof(struct nsdst)*nsd->zonestatdesired;
	if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx],
			strerror(errno));
		exit(1);
	}
	if(write(nsd->zonestatfd[idx], &z, 1) == -1) {
		log_msg(LOG_ERR, "cannot extend stat file %s (%s)",
			nsd->zonestatfname[idx], strerror(errno));
		exit(1);
	}
	zonestat_remap(nsd, idx, sz);
	/* zero the newly allocated region */
	if(nsd->zonestatdesired > nsd->zonestatsize[idx]) {
		memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) *
			nsd->zonestatsize[idx], 0, sizeof(struct nsdst) *
			(nsd->zonestatdesired - nsd->zonestatsize[idx]));
	}
	nsd->zonestatsize[idx] = nsd->zonestatdesired;
#endif /* HAVE_MMAP */
}
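/*
 * A minimal sketch (illustrative only, not compiled) of the
 * double-buffer scheme behind the two mmap'd zonestat arrays allocated
 * above: one array is written by the current children while the other is
 * resized and handed to the new generation, so the two generations never
 * write to the same memory. The variable names here are illustrative.
 */
#if 0
static struct nsdst *stats[2];	/* the two mmap'd arrays */
static struct nsdst *stats_now;	/* the array current children write to */

static void
example_switch_generation(void)
{
	/* new children get the array the old generation is not using */
	stats_now = (stats_now == stats[0]) ? stats[1] : stats[0];
}
#endif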
/* switch over to the other array for the new children, which briefly
 * coexist with the old children; we want to avoid both generations
 * writing to the same statistics array. */
void
server_zonestat_switch(struct nsd* nsd)
{
	if(nsd->zonestatnow == nsd->zonestat[0]) {
		nsd->zonestatnow = nsd->zonestat[1];
		nsd->zonestatsizenow = nsd->zonestatsize[1];
	} else {
		nsd->zonestatnow = nsd->zonestat[0];
		nsd->zonestatsizenow = nsd->zonestatsize[0];
	}
}
#endif /* USE_ZONE_STATS */

static void
cleanup_dname_compression_tables(void *ptr)
{
	free(ptr);
	compressed_dname_offsets = NULL;
	compression_table_capacity = 0;
}

static void
initialize_dname_compression_tables(struct nsd *nsd)
{
	size_t needed = domain_table_count(nsd->db->domains) + 1;
	needed += EXTRA_DOMAIN_NUMBERS;
	if(compression_table_capacity < needed) {
		if(compressed_dname_offsets) {
			region_remove_cleanup(nsd->db->region,
				cleanup_dname_compression_tables,
				compressed_dname_offsets);
			free(compressed_dname_offsets);
		}
		compressed_dname_offsets = (uint16_t *) xmallocarray(
			needed, sizeof(uint16_t));
		region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables,
			compressed_dname_offsets);
		compression_table_capacity = needed;
		compression_table_size=domain_table_count(nsd->db->domains)+1;
	}
	memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t));
	compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */
}

static int
set_cloexec(struct nsd_socket *sock)
{
	assert(sock != NULL);

	if(fcntl(sock->s, F_SETFD, FD_CLOEXEC) == -1) {
		const char *socktype =
			sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
		log_msg(LOG_ERR, "fcntl(..., FD_CLOEXEC) failed for %s: %s",
			socktype, strerror(errno));
		return -1;
	}

	return 1;
}

static int
set_reuseport(struct nsd_socket *sock)
{
#ifdef SO_REUSEPORT
	int on = 1;
#ifdef SO_REUSEPORT_LB
	/* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like
	 * SO_REUSEPORT on Linux. This is what the users want with the config
	 * option in nsd.conf; if we actually need local address and port reuse
	 * they'll also need to have SO_REUSEPORT set for them, assume it was
	 * _LB they want.
	 */
	int opt = SO_REUSEPORT_LB;
	static const char optname[] = "SO_REUSEPORT_LB";
#else /* !SO_REUSEPORT_LB */
	int opt = SO_REUSEPORT;
	static const char optname[] = "SO_REUSEPORT";
#endif /* SO_REUSEPORT_LB */

	if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) {
		return 1;
	} else if(verbosity >= 3 || errno != ENOPROTOOPT) {
		log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
			optname, strerror(errno));
	}
	return -1;
#else
	(void)sock;
#endif /* SO_REUSEPORT */

	return 0;
}

static int
set_reuseaddr(struct nsd_socket *sock)
{
#ifdef SO_REUSEADDR
	int on = 1;
	if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) {
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_REUSEADDR */
	return 0;
}
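/*
 * A minimal sketch (illustrative only, not compiled) of what
 * set_reuseport() enables: every server process opens and binds its own
 * socket to the same address and port, and the kernel load-balances
 * incoming packets across them. server_init() below relies on this when
 * nsd->reuseport is configured. Assumes a platform with SO_REUSEPORT.
 */
#if 0
static int
example_reuseport_socket(const struct sockaddr *addr, socklen_t addrlen)
{
	int on = 1;
	int s = socket(addr->sa_family, SOCK_DGRAM, 0);
	if(s == -1)
		return -1;
	if(setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on)) == -1 ||
	   bind(s, addr, addrlen) == -1) {
		close(s);
		return -1;
	}
	return s; /* one such socket per server process */
}
#endif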
static int
set_rcvbuf(struct nsd_socket *sock, int rcv)
{
#ifdef SO_RCVBUF
#ifdef SO_RCVBUFFORCE
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	if(errno == EPERM || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s",
		strerror(errno));
	return -1;
#else /* !SO_RCVBUFFORCE */
	if (0 == setsockopt(
		sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv)))
	{
		return 1;
	}
	if(errno == ENOSYS || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_RCVBUFFORCE */
#endif /* SO_RCVBUF */

	return 0;
}

static int
set_sndbuf(struct nsd_socket *sock, int snd)
{
#ifdef SO_SNDBUF
#ifdef SO_SNDBUFFORCE
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd)))
	{
		return 1;
	}
	if(errno == EPERM || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s",
		strerror(errno));
	return -1;
#else /* !SO_SNDBUFFORCE */
	if(0 == setsockopt(
		sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd)))
	{
		return 1;
	}
	if(errno == ENOSYS || errno == ENOBUFS) {
		return 0;
	}
	log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s",
		strerror(errno));
	return -1;
#endif /* SO_SNDBUFFORCE */
#endif /* SO_SNDBUF */

	return 0;
}

static int
set_nonblock(struct nsd_socket *sock)
{
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";

	if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) {
		log_msg(LOG_ERR, "fcntl(..., O_NONBLOCK) failed for %s: %s",
			socktype, strerror(errno));
		return -1;
	}

	return 1;
}

static int
set_ipv6_v6only(struct nsd_socket *sock)
{
#ifdef INET6
#ifdef IPV6_V6ONLY
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";

	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#endif /* IPV6_V6ONLY */
#endif /* INET6 */

	return 0;
}
static int
set_ipv6_use_min_mtu(struct nsd_socket *sock)
{
#if defined(INET6) && (defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU))
#if defined(IPV6_USE_MIN_MTU)
	/* There is no fragmentation of IPv6 datagrams during forwarding in the
	 * network. Therefore we do not send UDP datagrams larger than the
	 * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be
	 * larger if the network stack supports IPV6_USE_MIN_MTU.
	 */
	int opt = IPV6_USE_MIN_MTU;
	int optval = 1;
	static const char optname[] = "IPV6_USE_MIN_MTU";
#elif defined(IPV6_MTU)
	/* On Linux, PMTUD is disabled by default for datagrams, so set the
	 * MTU to the minimum IPv6 MTU to get the same effect.
	 */
	int opt = IPV6_MTU;
	int optval = IPV6_MIN_MTU;
	static const char optname[] = "IPV6_MTU";
#endif
	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
		optname, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* INET6 */

	return 0;
}

static int
set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
{
	int ret = 0;

#if defined(IP_MTU_DISCOVER)
	int opt = IP_MTU_DISCOVER;
	int optval;
# if defined(IP_PMTUDISC_OMIT)
	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
	 * information and send packets with DF=0. Fragmentation is allowed
	 * if and only if the packet size exceeds the outgoing interface MTU
	 * or the packet encounters a smaller MTU link in the network. This
	 * mitigates DNS fragmentation attacks by preventing forged PMTU
	 * information. FreeBSD already has the same semantics without
	 * setting the option.
	 */
	optval = IP_PMTUDISC_OMIT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
# endif /* IP_PMTUDISC_OMIT */
# if defined(IP_PMTUDISC_DONT)
	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed or is undefined. */
	optval = IP_PMTUDISC_DONT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
# endif
	ret = -1;
#elif defined(IP_DONTFRAG)
	int off = 0;
	if (0 == setsockopt(
		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
		strerror(errno));
	ret = -1;
#else
	(void)sock;
#endif

	return ret;
}

static int
set_ip_freebind(struct nsd_socket *sock)
{
#ifdef IP_FREEBIND
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
	{
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IP_FREEBIND */

	return 0;
}
static int
set_ip_transparent(struct nsd_socket *sock)
{
	/*
	The scandalous preprocessor blob here calls for some explanation :)
	POSIX does not specify an option to bind non-local IPs, so
	platforms developed several implementation-specific options,
	all set in the same way, but with different names.
	For additional complexity, some platforms manage this setting
	differently for different address families (IPv4 vs IPv6).
	The preprocessor blob below abstracts such variability
	in a way that leaves the C code as lean and clear as possible.
	*/

#if defined(IP_TRANSPARENT)
#	define NSD_SOCKET_OPTION_TRANSPARENT IP_TRANSPARENT
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_TRANSPARENT"
	/* as of 2020-01, Linux does not support this on IPv6 programmatically */
#elif defined(SO_BINDANY)
#	define NSD_SOCKET_OPTION_TRANSPARENT SO_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL SOL_SOCKET
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME "SO_BINDANY"
#elif defined(IP_BINDANY)
#	define NSD_SOCKET_OPTION_TRANSPARENT IP_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT6 IPV6_BINDANY
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP
#	define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 IPPROTO_IPV6
#	define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_BINDANY"
#endif

#ifndef NSD_SOCKET_OPTION_TRANSPARENT
	(void)sock;
#else
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT6
#		define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT
#	endif
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6
#		define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL
#	endif
#	ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6
#		define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME
#	endif

	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	const int is_ip6 = (sock->addr.ai_family == AF_INET6);

	if(0 == setsockopt(
		sock->s,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL,
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT,
		&on, sizeof(on)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s",
		is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME,
		socktype, strerror(errno));
	return -1;
#endif

	return 0;
}
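/*
 * A minimal sketch (illustrative only, not compiled) of what the
 * "transparent" option set above permits: a privileged process may
 * bind() an address that is not configured on any local interface, e.g.
 * a failover address that moves between hosts. Assumes Linux
 * IP_TRANSPARENT, which requires CAP_NET_ADMIN.
 */
#if 0
static int
example_bind_nonlocal(int s, const struct sockaddr_in *nonlocal)
{
	int on = 1;
	if(setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, &on, sizeof(on)) == -1)
		return -1;
	return bind(s, (const struct sockaddr *)nonlocal, sizeof(*nonlocal));
}
#endif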
static int
set_tcp_maxseg(struct nsd_socket *sock, int mss)
{
#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
	if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) {
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s",
		strerror(errno));
	return -1;
#else
	log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported");
#endif
	return 0;
}

#ifdef USE_TCP_FASTOPEN
static int
set_tcp_fastopen(struct nsd_socket *sock)
{
	/* qlen specifies how many outstanding TFO requests to allow. The limit
	 * is a defense against IP spoofing attacks as suggested in RFC 7413.
	 */
	int qlen;

#ifdef __APPLE__
	/* The macOS implementation only supports a qlen of 1 via this call.
	 * The actual value is configured by the net.inet.tcp.fastopen_backlog
	 * kernel parameter.
	 */
	qlen = 1;
#else
	/* 5 is recommended on Linux. */
	qlen = 5;
#endif
	if (0 == setsockopt(
		sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)))
	{
		return 1;
	}

	if (errno == EPERM) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s; "
				"this could likely be because sysctl "
				"net.inet.tcp.fastopen.enabled, "
				"net.inet.tcp.fastopen.server_enable, or "
				"net.ipv4.tcp_fastopen is disabled",
			strerror(errno));
	/* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support
	 * disabled, except when verbosity is enabled for debugging
	 */
	} else if(errno != ENOPROTOOPT || verbosity >= 3) {
		log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s",
			strerror(errno));
	}

	return (errno == ENOPROTOOPT ? 0 : -1);
}
#endif /* USE_TCP_FASTOPEN */

static int
set_bindtodevice(struct nsd_socket *sock)
{
#if defined(SO_BINDTODEVICE)
	if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE,
		sock->device, strlen(sock->device)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
			"SO_BINDTODEVICE", sock->device, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}

static int
set_setfib(struct nsd_socket *sock)
{
#if defined(SO_SETFIB)
	if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB,
		(const void *)&sock->fib, sizeof(sock->fib)) == -1)
	{
		log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s",
			"SO_SETFIB", sock->fib, strerror(errno));
		return -1;
	}

	return 1;
#else
	(void)sock;
	return 0;
#endif
}
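/*
 * For context, a minimal sketch (illustrative only, not compiled) of the
 * client side of TCP Fast Open on Linux: once a TFO cookie is cached,
 * sendto() with MSG_FASTOPEN combines connect+send and carries the data
 * in the SYN. The server-side TCP_FASTOPEN qlen set above is what allows
 * such SYN data to be accepted.
 */
#if 0
static ssize_t
example_tfo_client_send(int s, const void *buf, size_t len,
	const struct sockaddr *addr, socklen_t addrlen)
{
	return sendto(s, buf, len, MSG_FASTOPEN, addr, addrlen);
}
#endif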
static int
open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
	int rcv = 1*1024*1024, snd = 1*1024*1024;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: "
				"not supported");
			return 0;
		}
#endif
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	if(nsd->options->receive_buffer_size > 0)
		rcv = nsd->options->receive_buffer_size;
	if(set_rcvbuf(sock, rcv) == -1)
		return -1;

	if(nsd->options->send_buffer_size > 0)
		snd = nsd->options->send_buffer_size;
	if(set_sndbuf(sock, snd) == -1)
		return -1;

#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if(set_ipv6_v6only(sock) == -1 ||
		   set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	} else
#endif /* INET6 */
	if(sock->addr.ai_family == AF_INET) {
		if(set_ipv4_no_pmtu_disc(sock) == -1)
			return -1;
	}

	/* Set socket to non-blocking. Otherwise, on operating systems
	 * with thundering herd problems, the UDP recv could block
	 * after select returns readable.
	 */
	set_nonblock(sock);

	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

	return 1;
}

static int
open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
#ifdef USE_TCP_FASTOPEN
	report_tcp_fastopen_config();
#endif

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
				"not supported");
			return 0;
		}
#endif /* INET6 */
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	(void)set_reuseaddr(sock);

#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if (set_ipv6_v6only(sock) == -1 ||
		    set_ipv6_use_min_mtu(sock) == -1)
			return -1;
	}
#endif

	if(nsd->tcp_mss > 0)
		set_tcp_maxseg(sock, nsd->tcp_mss);
	/* (StevensUNP p463), if the TCP listening socket is blocking, then
	   it may block in accept, even if select() says readable. */
	(void)set_nonblock(sock);
	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

#ifdef USE_TCP_FASTOPEN
	(void)set_tcp_fastopen(sock);
#endif

	if(listen(sock->s, TCP_BACKLOG) == -1) {
		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
		return -1;
	}

	return 1;
}
/*
 * Initialize the server, set up reuseport, create and bind the sockets.
 */
int
server_init(struct nsd *nsd)
{
	size_t i;
	int reuseport = 1; /* Determine if REUSEPORT works. */

	/* open server interface ports */
	for(i = 0; i < nsd->ifs; i++) {
		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
		{
			return -1;
		}
	}

	if(nsd->reuseport && reuseport) {
		size_t ifs = nsd->ifs * nsd->reuseport;

		/* increase the size of the interface arrays, there are going
		 * to be separate interface file descriptors for every server
		 * instance */
		region_remove_cleanup(nsd->region, free, nsd->udp);
		region_remove_cleanup(nsd->region, free, nsd->tcp);

		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
		region_add_cleanup(nsd->region, free, nsd->udp);
		region_add_cleanup(nsd->region, free, nsd->tcp);
		if(ifs > nsd->ifs) {
			memset(&nsd->udp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->udp));
			memset(&nsd->tcp[nsd->ifs], 0,
				(ifs-nsd->ifs)*sizeof(*nsd->tcp));
		}

		for(i = nsd->ifs; i < ifs; i++) {
			nsd->udp[i] = nsd->udp[i%nsd->ifs];
			nsd->udp[i].s = -1;
			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
				return -1;
			}
			/* Turn off REUSEPORT for TCP by copying the socket
			 * file descriptor.
			 * This means we should not close TCP used by
			 * other servers in reuseport enabled mode, in
			 * server_child().
			 */
			nsd->tcp[i] = nsd->tcp[i%nsd->ifs];
		}

		nsd->ifs = ifs;
	} else {
		nsd->reuseport = 0;
	}

	return 0;
}
/*
 * Prepare the server for take off.
 */
int
server_prepare(struct nsd *nsd)
{
#ifdef RATELIMIT
	/* set secret modifier for hashing (udb ptr buckets and rate limits) */
#ifdef HAVE_GETRANDOM
	uint32_t v;
	if(getrandom(&v, sizeof(v), 0) == -1) {
		log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno));
		exit(1);
	}
	hash_set_raninit(v);
#elif defined(HAVE_ARC4RANDOM)
	hash_set_raninit(arc4random());
#else
	uint32_t v = getpid() ^ time(NULL);
	srandom((unsigned long)v);
#  ifdef HAVE_SSL
	if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0)
		hash_set_raninit(v);
	else
#  endif
		hash_set_raninit(random());
#endif
	rrl_mmap_init(nsd->child_count, nsd->options->rrl_size,
		nsd->options->rrl_ratelimit,
		nsd->options->rrl_whitelist_ratelimit,
		nsd->options->rrl_slip,
		nsd->options->rrl_ipv4_prefix_length,
		nsd->options->rrl_ipv6_prefix_length);
#endif /* RATELIMIT */

	/* Open the database... */
	if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) {
		log_msg(LOG_ERR, "unable to open the database %s: %s",
			nsd->dbfile, strerror(errno));
		unlink(nsd->task[0]->fname);
		unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		return -1;
	}
	/* check if zone files have been modified */
	/* NULL for taskudb because we send soainfo in a moment, batched up,
	 * for all zones */
	if(nsd->options->zonefiles_check || (nsd->options->database == NULL ||
		nsd->options->database[0] == 0))
		namedb_check_zonefiles(nsd, nsd->options, NULL, NULL);
	zonestatid_tree_set(nsd);

	compression_table_capacity = 0;
	initialize_dname_compression_tables(nsd);

#ifdef BIND8_STATS
	/* Initialize times... */
	time(&nsd->st.boot);
	set_bind8_alarm(nsd);
#endif /* BIND8_STATS */

	return 0;
}

/*
 * Fork the required number of servers.
 */
static int
server_start_children(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	size_t i;

	/* Start all child servers initially. */
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].pid = 0;
	}

	return restart_child_servers(nsd, region, netio, xfrd_sock_p);
}

static void
server_close_socket(struct nsd_socket *sock)
{
	if(sock->s != -1) {
		close(sock->s);
		sock->s = -1;
	}
}

void
server_close_all_sockets(struct nsd_socket sockets[], size_t n)
{
	size_t i;

	/* Close all the sockets... */
	for (i = 0; i < n; ++i) {
		server_close_socket(&sockets[i]);
	}
}
/*
 * Close the sockets, shutdown the server and exit.
 * Does not return.
 */
void
server_shutdown(struct nsd *nsd)
{
	size_t i;

	server_close_all_sockets(nsd->udp, nsd->ifs);
	server_close_all_sockets(nsd->tcp, nsd->ifs);
	/* CHILD: close command channel to parent */
	if(nsd->this_child && nsd->this_child->parent_fd != -1)
	{
		close(nsd->this_child->parent_fd);
		nsd->this_child->parent_fd = -1;
	}
	/* SERVER: close command channels to children */
	if(!nsd->this_child)
	{
		for(i=0; i < nsd->child_count; ++i)
			if(nsd->children[i].child_fd != -1)
			{
				close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
			}
	}

	tsig_finalize();
#ifdef HAVE_SSL
	daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */
	if (nsd->tls_ctx)
		SSL_CTX_free(nsd->tls_ctx);
#endif

#ifdef MEMCLEAN /* OS collects memory pages */
#ifdef RATELIMIT
	rrl_mmap_deinit_keep_mmap();
#endif
#ifdef USE_DNSTAP
	dt_collector_destroy(nsd->dt_collector, nsd);
#endif
	udb_base_free_keep_mmap(nsd->task[0]);
	udb_base_free_keep_mmap(nsd->task[1]);
	namedb_close_udb(nsd->db); /* keeps mmap */
	namedb_close(nsd->db);
	nsd_options_destroy(nsd->options);
	region_destroy(nsd->region);
#endif
	log_finalize();
	exit(0);
}

void
server_prepare_xfrd(struct nsd* nsd)
{
	char tmpfile[256];
	/* create task mmaps */
	nsd->mytask = 0;
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[0] = task_file_create(tmpfile);
	if(!nsd->task[0]) {
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1",
		nsd->options->xfrdir, (int)getpid(), (unsigned)getpid());
	nsd->task[1] = task_file_create(tmpfile);
	if(!nsd->task[1]) {
		unlink(nsd->task[0]->fname);
#ifdef USE_ZONE_STATS
		unlink(nsd->zonestatfname[0]);
		unlink(nsd->zonestatfname[1]);
#endif
		xfrd_del_tempdir(nsd);
		exit(1);
	}
	assert(udb_base_get_userdata(nsd->task[0])->data == 0);
	assert(udb_base_get_userdata(nsd->task[1])->data == 0);
	/* create xfrd listener structure */
	nsd->xfrd_listener = region_alloc(nsd->region,
		sizeof(netio_handler_type));
	nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*)
		region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data));
	nsd->xfrd_listener->fd = -1;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd =
		nsd;
	((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn =
		xfrd_tcp_create(nsd->region, QIOBUFSZ);
}
void
server_start_xfrd(struct nsd *nsd, int del_db, int reload_active)
{
	pid_t pid;
	int sockets[2] = {0,0};
	struct ipc_handler_conn_data *data;

	if(nsd->xfrd_listener->fd != -1)
		close(nsd->xfrd_listener->fd);
	if(del_db) {
		/* recreate taskdb that xfrd was using, it may be corrupt */
		/* we (or reload) use nsd->mytask, and xfrd uses the other */
		char* tmpfile = nsd->task[1-nsd->mytask]->fname;
		nsd->task[1-nsd->mytask]->fname = NULL;
		/* free alloc already, so udb does not shrink itself */
		udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc);
		nsd->task[1-nsd->mytask]->alloc = NULL;
		udb_base_free(nsd->task[1-nsd->mytask]);
		/* create new file, overwrite the old one */
		nsd->task[1-nsd->mytask] = task_file_create(tmpfile);
		free(tmpfile);
	}
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) {
		log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno));
		return;
	}
	pid = fork();
	switch (pid) {
	case -1:
		log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno));
		break;
	default:
		/* PARENT: close first socket, use second one */
		close(sockets[0]);
		if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		if(del_db) xfrd_free_namedb(nsd);
		/* use other task than I am using, since if xfrd died and is
		 * restarted, the reload is using nsd->mytask */
		nsd->mytask = 1 - nsd->mytask;

#ifdef HAVE_SETPROCTITLE
		setproctitle("xfrd");
#endif
#ifdef HAVE_CPUSET_T
		if(nsd->use_cpu_affinity) {
			set_cpu_affinity(nsd->xfrd_cpuset);
		}
#endif

		xfrd_init(sockets[1], nsd, del_db, reload_active, pid);
		/* ENOTREACH */
		break;
	case 0:
		/* CHILD: close second socket, use first one */
		close(sockets[1]);
		if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
		}
		nsd->xfrd_listener->fd = sockets[0];
		break;
	}
	/* server-parent only */
	nsd->xfrd_listener->timeout = NULL;
	nsd->xfrd_listener->event_types = NETIO_EVENT_READ;
	nsd->xfrd_listener->event_handler = parent_handle_xfrd_command;
	/* clear ongoing ipc reads */
	data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data;
	data->conn->is_reading = 0;
}

/** add all soainfo to taskdb */
static void
add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb)
{
	struct radnode* n;
	udb_ptr task_last; /* last task, mytask is empty so NULL */
	/* add all SOA INFO to mytask */
	udb_ptr_init(&task_last, taskudb);
	for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) {
		task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0);
	}
	udb_ptr_unlink(&task_last, taskudb);
}
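/*
 * A minimal sketch (illustrative only, not compiled) of the task
 * double-buffering used below: the parent and xfrd each own one of the
 * two task files at a time; a handover fills the owned file, signals the
 * peer, and flips mytask so the roles swap. Names are illustrative.
 */
#if 0
static int mytask = 0;			/* index of the taskdb this process fills */
static struct udb_base *task[2];	/* the two task files */

static struct udb_base *
example_swap_tasks(void)
{
	mytask = 1 - mytask;	/* now process what the peer filled */
	return task[mytask];
}
#endif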
void
server_send_soa_xfrd(struct nsd* nsd, int shortsoa)
{
	/* normally this exchanges the SOA from nsd->xfrd and the expire back.
	 * parent fills one taskdb with soas, xfrd fills other with expires.
	 * then they exchange and process.
	 * shortsoa: xfrd crashes and needs to be restarted and one taskdb
	 * may be in use by reload. Fill SOA in taskdb and give to xfrd.
	 * expire notifications can be sent back via a normal reload later
	 * (xfrd will wait for current running reload to finish if any).
	 */
	sig_atomic_t cmd = 0;
	pid_t mypid;
	int xfrd_sock = nsd->xfrd_listener->fd;
	struct udb_base* taskudb = nsd->task[nsd->mytask];
	udb_ptr t;
	if(!shortsoa) {
		if(nsd->signal_hint_shutdown) {
		shutdown:
			log_msg(LOG_WARNING, "signal received, shutting down...");
			server_close_all_sockets(nsd->udp, nsd->ifs);
			server_close_all_sockets(nsd->tcp, nsd->ifs);
#ifdef HAVE_SSL
			daemon_remote_close(nsd->rc);
#endif
			/* Unlink it if possible... */
			unlinkpid(nsd->pidfile);
			unlink(nsd->task[0]->fname);
			unlink(nsd->task[1]->fname);
#ifdef USE_ZONE_STATS
			unlink(nsd->zonestatfname[0]);
			unlink(nsd->zonestatfname[1]);
#endif
			/* write the nsd.db to disk, wait for it to complete */
			udb_base_sync(nsd->db->udb, 1);
			udb_base_close(nsd->db->udb);
			server_shutdown(nsd);
			/* ENOTREACH */
			exit(0);
		}
	}
	if(shortsoa) {
		/* put SOA in xfrd task because mytask may be in use */
		taskudb = nsd->task[1-nsd->mytask];
	}

	add_all_soa_to_task(nsd, taskudb);
	if(!shortsoa) {
		/* wait for xfrd to signal task is ready, RELOAD signal */
		if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) ||
			cmd != NSD_RELOAD) {
			log_msg(LOG_ERR, "did not get start signal from xfrd");
			exit(1);
		}
		if(nsd->signal_hint_shutdown) {
			goto shutdown;
		}
	}
	/* give xfrd our task, signal it with RELOAD_DONE */
	task_process_sync(taskudb);
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
			(int)nsd->pid, strerror(errno));
	}
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	if(!shortsoa) {
		/* process the xfrd task works (expiry data) */
		nsd->mytask = 1 - nsd->mytask;
		taskudb = nsd->task[nsd->mytask];
		task_remap(taskudb);
		udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb));
		while(!udb_ptr_is_null(&t)) {
			task_process_expire(nsd->db, TASKLIST(&t));
			udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next);
		}
		udb_ptr_unlink(&t, taskudb);
		task_clear(taskudb);

		/* tell xfrd that the task is emptied, signal with RELOAD_DONE */
		cmd = NSD_RELOAD_DONE;
		if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) {
			log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s",
				(int)nsd->pid, strerror(errno));
		}
	}
}

#ifdef HAVE_SSL
static void
log_crypto_from_err(const char* str, unsigned long err)
{
	/* error:[error code]:[library name]:[function name]:[reason string] */
	char buf[128];
	unsigned long e;
	ERR_error_string_n(err, buf, sizeof(buf));
	log_msg(LOG_ERR, "%s crypto %s", str, buf);
	while( (e=ERR_get_error()) ) {
		ERR_error_string_n(e, buf, sizeof(buf));
		log_msg(LOG_ERR, "and additionally crypto %s", buf);
	}
}

void
log_crypto_err(const char* str)
{
	log_crypto_from_err(str, ERR_get_error());
}

/** true if the ssl handshake error has to be squelched from the logs */
static int
squelch_err_ssl_handshake(unsigned long err)
{
	if(verbosity >= 3)
		return 0; /* only squelch on low verbosity */
	/* this is very specific; we could filter on ERR_GET_REASON()
	 * (the third element in ERR_PACK) */
	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
#ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
#endif
#ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
# ifdef SSL_R_VERSION_TOO_LOW
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
# endif
#endif
		)
		return 1;
	return 0;
}
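/*
 * A minimal sketch (illustrative only, not compiled) of the alternative
 * hinted at in the comment above: squelch on ERR_GET_REASON() alone
 * instead of matching full ERR_PACK() tuples. Less precise (it ignores
 * which function raised the error) but robust against OpenSSL changing
 * function codes.
 */
#if 0
static int
example_squelch_by_reason(unsigned long err)
{
	switch(ERR_GET_REASON(err)) {
	case SSL_R_HTTPS_PROXY_REQUEST:
	case SSL_R_HTTP_REQUEST:
	case SSL_R_WRONG_VERSION_NUMBER:
		return 1;
	default:
		return 0;
	}
}
#endif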
void
perform_openssl_init(void)
{
	/* init SSL library */
#ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS
	ERR_load_crypto_strings();
#endif
	ERR_load_SSL_strings();
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO)
	OpenSSL_add_all_algorithms();
#else
	OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS
		| OPENSSL_INIT_ADD_ALL_DIGESTS
		| OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL);
#endif
#if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL)
	(void)SSL_library_init();
#else
	OPENSSL_init_ssl(0, NULL);
#endif

	if(!RAND_status()) {
		/* try to seed it */
		unsigned char buf[256];
		unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid();
		size_t i;
		v = seed;
		for(i=0; i<256/sizeof(v); i++) {
			memmove(buf+i*sizeof(v), &v, sizeof(v));
			v = v*seed + (unsigned int)i;
		}
		RAND_seed(buf, 256);
		log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time");
	}
}

static int
get_ocsp(char *filename, unsigned char **ocsp)
{
	BIO *bio;
	OCSP_RESPONSE *response;
	int len = -1;
	unsigned char *p, *buf;
	assert(filename);

	if ((bio = BIO_new_file(filename, "r")) == NULL) {
		log_crypto_err("get_ocsp: BIO_new_file failed");
		return -1;
	}

	if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) {
		log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed");
		BIO_free(bio);
		return -1;
	}

	if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) {
		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed");
		OCSP_RESPONSE_free(response);
		BIO_free(bio);
		return -1;
	}

	if ((buf = malloc((size_t) len)) == NULL) {
		log_msg(LOG_ERR, "get_ocsp: malloc failed");
		OCSP_RESPONSE_free(response);
		BIO_free(bio);
		return -1;
	}

	p = buf;
	if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) {
		log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed");
		free(buf);
		OCSP_RESPONSE_free(response);
		BIO_free(bio);
		return -1;
	}

	OCSP_RESPONSE_free(response);
	BIO_free(bio);

	*ocsp = buf;
	return len;
}
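/*
 * get_ocsp() above uses the standard OpenSSL i2d two-pass pattern; a
 * minimal sketch (illustrative only, not compiled): the first call with
 * a NULL output pointer only computes the encoded length, the second
 * call writes into the buffer and advances the pointer it is given.
 */
#if 0
static unsigned char *
example_i2d_two_pass(OCSP_RESPONSE *response, int *lenp)
{
	unsigned char *buf, *p;
	int len = i2d_OCSP_RESPONSE(response, NULL); /* pass 1: length */
	if(len <= 0 || (buf = malloc((size_t)len)) == NULL)
		return NULL;
	p = buf; /* pass 2: i2d writes and advances p */
	if(i2d_OCSP_RESPONSE(response, &p) != len) {
		free(buf);
		return NULL;
	}
	*lenp = len;
	return buf;
}
#endif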
(ctx, ecdh)) { 1870 log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE"); 1871 } 1872 EC_KEY_free (ecdh); 1873 } 1874 } 1875 #endif 1876 } 1877 1878 static int 1879 add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg)) 1880 { 1881 if(ocspdata) { 1882 unsigned char *p; 1883 if ((p=malloc(ocspdata_len)) == NULL) { 1884 log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure"); 1885 return SSL_TLSEXT_ERR_NOACK; 1886 } 1887 memcpy(p, ocspdata, ocspdata_len); 1888 if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) { 1889 log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp"); 1890 free(p); 1891 return SSL_TLSEXT_ERR_NOACK; 1892 } 1893 return SSL_TLSEXT_ERR_OK; 1894 } else { 1895 return SSL_TLSEXT_ERR_NOACK; 1896 } 1897 } 1898 1899 SSL_CTX* 1900 server_tls_ctx_setup(char* key, char* pem, char* verifypem) 1901 { 1902 SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method()); 1903 if(!ctx) { 1904 log_crypto_err("could not SSL_CTX_new"); 1905 return NULL; 1906 } 1907 /* no SSLv2, SSLv3 because they have defects */ 1908 #if SSL_OP_NO_SSLv2 != 0 1909 if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){ 1910 log_crypto_err("could not set SSL_OP_NO_SSLv2"); 1911 SSL_CTX_free(ctx); 1912 return NULL; 1913 } 1914 #endif 1915 if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3) 1916 != SSL_OP_NO_SSLv3){ 1917 log_crypto_err("could not set SSL_OP_NO_SSLv3"); 1918 SSL_CTX_free(ctx); 1919 return NULL; 1920 } 1921 #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1) 1922 /* if we have tls 1.1 disable 1.0 */ 1923 if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1) 1924 != SSL_OP_NO_TLSv1){ 1925 log_crypto_err("could not set SSL_OP_NO_TLSv1"); 1926 SSL_CTX_free(ctx); 1927 return NULL; 1928 } 1929 #endif 1930 #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2) 1931 /* if we have tls 1.2 disable 1.1 */ 1932 if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1) 1933 != SSL_OP_NO_TLSv1_1){ 1934 log_crypto_err("could not set SSL_OP_NO_TLSv1_1"); 1935 SSL_CTX_free(ctx); 1936 return NULL; 1937 } 1938 #endif 1939 #if defined(SSL_OP_NO_RENEGOTIATION) 1940 /* disable client renegotiation */ 1941 if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) & 1942 SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) { 1943 log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION"); 1944 SSL_CTX_free(ctx); 1945 return NULL; 1946 } 1947 #endif 1948 #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20) 1949 /* if we have sha256, set the cipher list to have no known vulns */ 1950 if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20")) 1951 log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list"); 1952 #endif 1953 if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) & 1954 SSL_OP_CIPHER_SERVER_PREFERENCE) != 1955 SSL_OP_CIPHER_SERVER_PREFERENCE) { 1956 log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE"); 1957 SSL_CTX_free(ctx); 1958 return NULL; 1959 } 1960 #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL 1961 SSL_CTX_set_security_level(ctx, 0); 1962 #endif 1963 if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) { 1964 log_msg(LOG_ERR, "error for cert file: %s", pem); 1965 log_crypto_err("error in SSL_CTX use_certificate_chain_file"); 1966 SSL_CTX_free(ctx); 1967 return NULL; 1968 } 1969 if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) { 1970 log_msg(LOG_ERR, "error for private key file: %s", key); 1971 log_crypto_err("Error in SSL_CTX use_PrivateKey_file"); 1972 SSL_CTX_free(ctx); 1973 return NULL; 1974
} 1975 if(!SSL_CTX_check_private_key(ctx)) { 1976 log_msg(LOG_ERR, "error for key file: %s", key); 1977 log_crypto_err("Error in SSL_CTX check_private_key"); 1978 SSL_CTX_free(ctx); 1979 return NULL; 1980 } 1981 listen_sslctx_setup_2(ctx); 1982 if(verifypem && verifypem[0]) { 1983 if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) { 1984 log_crypto_err("Error in SSL_CTX verify locations"); 1985 SSL_CTX_free(ctx); 1986 return NULL; 1987 } 1988 SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem)); 1989 SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL); 1990 } 1991 return ctx; 1992 } 1993 1994 SSL_CTX* 1995 server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile) 1996 { 1997 char *key, *pem; 1998 SSL_CTX *ctx; 1999 2000 key = nsd->options->tls_service_key; 2001 pem = nsd->options->tls_service_pem; 2002 if(!key || key[0] == 0) { 2003 log_msg(LOG_ERR, "error: no tls-service-key file specified"); 2004 return NULL; 2005 } 2006 if(!pem || pem[0] == 0) { 2007 log_msg(LOG_ERR, "error: no tls-service-pem file specified"); 2008 return NULL; 2009 } 2010 2011 /* NOTE: This mimics the existing code in Unbound 1.5.1 by supporting SSL, but 2012 * draft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2 */ 2013 ctx = server_tls_ctx_setup(key, pem, verifypem); 2014 if(!ctx) { 2015 log_msg(LOG_ERR, "could not setup server TLS context"); 2016 return NULL; 2017 } 2018 if(ocspfile && ocspfile[0]) { 2019 if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) { 2020 log_crypto_err("Error reading OCSP file"); 2021 SSL_CTX_free(ctx); 2022 return NULL; 2023 } else { 2024 VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile)); 2025 if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) { 2026 log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb"); 2027 SSL_CTX_free(ctx); 2028 return NULL; 2029 } 2030 } 2031 } 2032 return ctx; 2033 } 2034 2035 /* check if tcp_handler_accept_data is created for the dedicated TLS port */ 2036 int 2037 using_tls_port(struct sockaddr* addr, const char* tls_port) 2038 { 2039 in_port_t port = 0; 2040 2041 if (addr->sa_family == AF_INET) 2042 port = ((struct sockaddr_in*)addr)->sin_port; 2043 #ifdef HAVE_STRUCT_SOCKADDR_IN6 2044 else 2045 port = ((struct sockaddr_in6*)addr)->sin6_port; 2046 #endif /* HAVE_STRUCT_SOCKADDR_IN6 */ 2047 if (atoi(tls_port) == ntohs(port)) 2048 return 1; 2049 2050 return 0; 2051 } 2052 #endif 2053 2054 /* pass timeout=-1 for blocking.
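   For example (usage sketch drawn from reload_process_tasks() below,
   which polls its IPC command socket with a zero timeout):
     sig_atomic_t cmd;
     if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd))
             ... handle cmd ...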
Returns size, 0, -1(err), or -2(timeout) */ 2055 ssize_t 2056 block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout) 2057 { 2058 uint8_t* buf = (uint8_t*) p; 2059 ssize_t total = 0; 2060 struct pollfd fd; 2061 memset(&fd, 0, sizeof(fd)); 2062 fd.fd = s; 2063 fd.events = POLLIN; 2064 2065 while( total < sz) { 2066 ssize_t ret; 2067 ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000); 2068 if(ret == -1) { 2069 if(errno == EAGAIN) 2070 /* blocking read */ 2071 continue; 2072 if(errno == EINTR) { 2073 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) 2074 return -1; 2075 /* other signals can be handled later */ 2076 continue; 2077 } 2078 /* some error */ 2079 return -1; 2080 } 2081 if(ret == 0) { 2082 /* operation timed out */ 2083 return -2; 2084 } 2085 ret = read(s, buf+total, sz-total); 2086 if(ret == -1) { 2087 if(errno == EAGAIN) 2088 /* blocking read */ 2089 continue; 2090 if(errno == EINTR) { 2091 if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) 2092 return -1; 2093 /* other signals can be handled later */ 2094 continue; 2095 } 2096 /* some error */ 2097 return -1; 2098 } 2099 if(ret == 0) { 2100 /* closed connection! */ 2101 return 0; 2102 } 2103 total += ret; 2104 } 2105 return total; 2106 } 2107 2108 static void 2109 reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket) 2110 { 2111 sig_atomic_t cmd = NSD_QUIT_SYNC; 2112 udb_ptr t, next; 2113 udb_base* u = nsd->task[nsd->mytask]; 2114 udb_ptr_init(&next, u); 2115 udb_ptr_new(&t, u, udb_base_get_userdata(u)); 2116 udb_base_set_userdata(u, 0); 2117 while(!udb_ptr_is_null(&t)) { 2118 /* store next in list so this one can be deleted or reused */ 2119 udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next); 2120 udb_rptr_zero(&TASKLIST(&t)->next, u); 2121 2122 /* process task t */ 2123 /* append results for task t and update last_task */ 2124 task_process_in_reload(nsd, u, last_task, &t); 2125 2126 /* go to next */ 2127 udb_ptr_set_ptr(&t, u, &next); 2128 2129 /* if the parent has quit, we must quit too, poll the fd for cmds */ 2130 if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) { 2131 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd)); 2132 if(cmd == NSD_QUIT) { 2133 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd")); 2134 /* sync to disk (if needed) */ 2135 udb_base_sync(nsd->db->udb, 0); 2136 /* unlink files of remainder of tasks */ 2137 while(!udb_ptr_is_null(&t)) { 2138 if(TASKLIST(&t)->task_type == task_apply_xfr) { 2139 xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno); 2140 } 2141 udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next); 2142 } 2143 udb_ptr_unlink(&t, u); 2144 udb_ptr_unlink(&next, u); 2145 exit(0); 2146 } 2147 } 2148 2149 } 2150 udb_ptr_unlink(&t, u); 2151 udb_ptr_unlink(&next, u); 2152 } 2153 2154 #ifdef BIND8_STATS 2155 static void 2156 parent_send_stats(struct nsd* nsd, int cmdfd) 2157 { 2158 size_t i; 2159 if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) { 2160 log_msg(LOG_ERR, "could not write stats to reload"); 2161 return; 2162 } 2163 for(i=0; i<nsd->child_count; i++) 2164 if(!write_socket(cmdfd, &nsd->children[i].query_count, 2165 sizeof(stc_type))) { 2166 log_msg(LOG_ERR, "could not write stats to reload"); 2167 return; 2168 } 2169 } 2170 2171 static void 2172 reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last) 2173 { 2174 struct nsdst s; 2175 stc_type* p; 2176 size_t i; 2177 if(block_read(nsd, cmdfd, &s, sizeof(s), 2178 RELOAD_SYNC_TIMEOUT) != sizeof(s)) { 2179 log_msg(LOG_ERR, "could not read stats 
from oldpar"); 2180 return; 2181 } 2182 s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0); 2183 s.db_mem = region_get_mem(nsd->db->region); 2184 p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s, 2185 nsd->child_count); 2186 if(!p) return; 2187 for(i=0; i<nsd->child_count; i++) { 2188 if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!= 2189 sizeof(stc_type)) 2190 return; 2191 } 2192 } 2193 #endif /* BIND8_STATS */ 2194 2195 /* 2196 * Reload the database, stop parent, re-fork children and continue. 2197 * as server_main. 2198 */ 2199 static void 2200 server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio, 2201 int cmdsocket) 2202 { 2203 pid_t mypid; 2204 sig_atomic_t cmd = NSD_QUIT_SYNC; 2205 int ret; 2206 udb_ptr last_task; 2207 struct sigaction old_sigchld, ign_sigchld; 2208 /* ignore SIGCHLD from the previous server_main that used this pid */ 2209 memset(&ign_sigchld, 0, sizeof(ign_sigchld)); 2210 ign_sigchld.sa_handler = SIG_IGN; 2211 sigaction(SIGCHLD, &ign_sigchld, &old_sigchld); 2212 2213 #ifdef HAVE_SETPROCTITLE 2214 setproctitle("main"); 2215 #endif 2216 #ifdef HAVE_CPUSET_T 2217 if(nsd->use_cpu_affinity) { 2218 set_cpu_affinity(nsd->cpuset); 2219 } 2220 #endif 2221 2222 /* see what tasks we got from xfrd */ 2223 task_remap(nsd->task[nsd->mytask]); 2224 udb_ptr_init(&last_task, nsd->task[nsd->mytask]); 2225 udb_compact_inhibited(nsd->db->udb, 1); 2226 reload_process_tasks(nsd, &last_task, cmdsocket); 2227 udb_compact_inhibited(nsd->db->udb, 0); 2228 udb_compact(nsd->db->udb); 2229 2230 #ifndef NDEBUG 2231 if(nsd_debug_level >= 1) 2232 region_log_stats(nsd->db->region); 2233 #endif /* NDEBUG */ 2234 /* sync to disk (if needed) */ 2235 udb_base_sync(nsd->db->udb, 0); 2236 2237 initialize_dname_compression_tables(nsd); 2238 2239 #ifdef BIND8_STATS 2240 /* Restart dumping stats if required. */ 2241 time(&nsd->st.boot); 2242 set_bind8_alarm(nsd); 2243 #endif 2244 #ifdef USE_ZONE_STATS 2245 server_zonestat_realloc(nsd); /* realloc for new children */ 2246 server_zonestat_switch(nsd); 2247 #endif 2248 2249 /* listen for the signals of failed children again */ 2250 sigaction(SIGCHLD, &old_sigchld, NULL); 2251 /* Start new child processes */ 2252 if (server_start_children(nsd, server_region, netio, &nsd-> 2253 xfrd_listener->fd) != 0) { 2254 send_children_quit(nsd); 2255 exit(1); 2256 } 2257 2258 /* if the parent has quit, we must quit too, poll the fd for cmds */ 2259 if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) { 2260 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd)); 2261 if(cmd == NSD_QUIT) { 2262 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd")); 2263 send_children_quit(nsd); 2264 exit(0); 2265 } 2266 } 2267 2268 /* Send quit command to parent: blocking, wait for receipt. */ 2269 do { 2270 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main")); 2271 if (!write_socket(cmdsocket, &cmd, sizeof(cmd))) 2272 { 2273 log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s", 2274 strerror(errno)); 2275 } 2276 /* blocking: wait for parent to really quit. (it sends RELOAD as ack) */ 2277 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main")); 2278 ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 2279 RELOAD_SYNC_TIMEOUT); 2280 if(ret == -2) { 2281 DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. 
retry")); 2282 } 2283 } while (ret == -2); 2284 if(ret == -1) { 2285 log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s", 2286 strerror(errno)); 2287 } 2288 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd)); 2289 if(cmd == NSD_QUIT) { 2290 /* small race condition possible here, parent got quit cmd. */ 2291 send_children_quit(nsd); 2292 exit(1); 2293 } 2294 assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD); 2295 #ifdef BIND8_STATS 2296 reload_do_stats(cmdsocket, nsd, &last_task); 2297 #endif 2298 udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]); 2299 task_process_sync(nsd->task[nsd->mytask]); 2300 #ifdef USE_ZONE_STATS 2301 server_zonestat_realloc(nsd); /* realloc for next children */ 2302 #endif 2303 2304 /* send soainfo to the xfrd process, signal it that reload is done, 2305 * it picks up the taskudb */ 2306 cmd = NSD_RELOAD_DONE; 2307 if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { 2308 log_msg(LOG_ERR, "problems sending reload_done xfrd: %s", 2309 strerror(errno)); 2310 } 2311 mypid = getpid(); 2312 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2313 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2314 strerror(errno)); 2315 } 2316 2317 /* try to reopen file */ 2318 if (nsd->file_rotation_ok) 2319 log_reopen(nsd->log_filename, 1); 2320 /* exit reload, continue as new server_main */ 2321 } 2322 2323 /* 2324 * Get the mode depending on the signal hints that have been received. 2325 * Multiple signal hints can be received and will be handled in turn. 2326 */ 2327 static sig_atomic_t 2328 server_signal_mode(struct nsd *nsd) 2329 { 2330 if(nsd->signal_hint_quit) { 2331 nsd->signal_hint_quit = 0; 2332 return NSD_QUIT; 2333 } 2334 else if(nsd->signal_hint_shutdown) { 2335 nsd->signal_hint_shutdown = 0; 2336 return NSD_SHUTDOWN; 2337 } 2338 else if(nsd->signal_hint_child) { 2339 nsd->signal_hint_child = 0; 2340 return NSD_REAP_CHILDREN; 2341 } 2342 else if(nsd->signal_hint_reload) { 2343 nsd->signal_hint_reload = 0; 2344 return NSD_RELOAD; 2345 } 2346 else if(nsd->signal_hint_reload_hup) { 2347 nsd->signal_hint_reload_hup = 0; 2348 return NSD_RELOAD_REQ; 2349 } 2350 else if(nsd->signal_hint_stats) { 2351 nsd->signal_hint_stats = 0; 2352 #ifdef BIND8_STATS 2353 set_bind8_alarm(nsd); 2354 #endif 2355 return NSD_STATS; 2356 } 2357 else if(nsd->signal_hint_statsusr) { 2358 nsd->signal_hint_statsusr = 0; 2359 return NSD_STATS; 2360 } 2361 return NSD_RUN; 2362 } 2363 2364 /* 2365 * The main server simply waits for signals and child processes to 2366 * terminate. Child processes are restarted as necessary. 
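 * The loop below is driven by nsd->mode: signal handlers merely set
 * hint flags, server_signal_mode() above folds them into one mode
 * value, and each switch case that finishes its work drops back to
 * NSD_RUN. Terminated children are reaped in the NSD_RUN case with
 * the usual non-blocking idiom,
 *   while((child_pid = waitpid(-1, &status, WNOHANG)) != -1
 *         && child_pid != 0) { ... }
 * so a single pass collects every exited process.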
2367 */ 2368 void 2369 server_main(struct nsd *nsd) 2370 { 2371 region_type *server_region = region_create(xalloc, free); 2372 netio_type *netio = netio_create(server_region); 2373 netio_handler_type reload_listener; 2374 int reload_sockets[2] = {-1, -1}; 2375 struct timespec timeout_spec; 2376 int status; 2377 pid_t child_pid; 2378 pid_t reload_pid = -1; 2379 sig_atomic_t mode; 2380 2381 /* Ensure we are the main process */ 2382 assert(nsd->server_kind == NSD_SERVER_MAIN); 2383 2384 /* Add listener for the XFRD process */ 2385 netio_add_handler(netio, nsd->xfrd_listener); 2386 2387 /* Start the child processes that handle incoming queries */ 2388 if (server_start_children(nsd, server_region, netio, 2389 &nsd->xfrd_listener->fd) != 0) { 2390 send_children_quit(nsd); 2391 exit(1); 2392 } 2393 reload_listener.fd = -1; 2394 2395 /* This_child MUST be 0, because this is the parent process */ 2396 assert(nsd->this_child == 0); 2397 2398 /* Run the server until we get a shutdown signal */ 2399 while ((mode = nsd->mode) != NSD_SHUTDOWN) { 2400 /* Did we receive a signal that changes our mode? */ 2401 if(mode == NSD_RUN) { 2402 nsd->mode = mode = server_signal_mode(nsd); 2403 } 2404 2405 switch (mode) { 2406 case NSD_RUN: 2407 /* see if any child processes terminated */ 2408 while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) { 2409 int is_child = delete_child_pid(nsd, child_pid); 2410 if (is_child != -1 && nsd->children[is_child].need_to_exit) { 2411 if(nsd->children[is_child].child_fd == -1) 2412 nsd->children[is_child].has_exited = 1; 2413 parent_check_all_children_exited(nsd); 2414 } else if(is_child != -1) { 2415 log_msg(LOG_WARNING, 2416 "server %d died unexpectedly with status %d, restarting", 2417 (int) child_pid, status); 2418 restart_child_servers(nsd, server_region, netio, 2419 &nsd->xfrd_listener->fd); 2420 } else if (child_pid == reload_pid) { 2421 sig_atomic_t cmd = NSD_RELOAD_DONE; 2422 pid_t mypid; 2423 log_msg(LOG_WARNING, 2424 "Reload process %d failed with status %d, continuing with old database", 2425 (int) child_pid, status); 2426 reload_pid = -1; 2427 if(reload_listener.fd != -1) close(reload_listener.fd); 2428 reload_listener.fd = -1; 2429 reload_listener.event_types = NETIO_EVENT_NONE; 2430 task_process_sync(nsd->task[nsd->mytask]); 2431 /* inform xfrd reload attempt ended */ 2432 if(!write_socket(nsd->xfrd_listener->fd, 2433 &cmd, sizeof(cmd))) { 2434 log_msg(LOG_ERR, "problems " 2435 "sending SOAEND to xfrd: %s", 2436 strerror(errno)); 2437 } 2438 mypid = getpid(); 2439 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2440 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2441 strerror(errno)); 2442 } 2443 } else if(status != 0) { 2444 /* check for status, because we get 2445 * the old-servermain because reload 2446 * is the process-parent of old-main, 2447 * and we get older server-processes 2448 * that are exiting after a reload */ 2449 log_msg(LOG_WARNING, 2450 "process %d terminated with status %d", 2451 (int) child_pid, status); 2452 } 2453 } 2454 if (child_pid == -1) { 2455 if (errno == EINTR) { 2456 continue; 2457 } 2458 if (errno != ECHILD) 2459 log_msg(LOG_WARNING, "wait failed: %s", strerror(errno)); 2460 } 2461 if (nsd->mode != NSD_RUN) 2462 break; 2463 2464 /* timeout to collect processes. In case no sigchild happens. 
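   The 60 second timeout below bounds how long netio_dispatch() may
   sleep, so the waitpid() loop above still runs periodically even if
   a SIGCHLD was missed.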
*/ 2465 timeout_spec.tv_sec = 60; 2466 timeout_spec.tv_nsec = 0; 2467 2468 /* listen on ports, timeout for collecting terminated children */ 2469 if(netio_dispatch(netio, &timeout_spec, 0) == -1) { 2470 if (errno != EINTR) { 2471 log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno)); 2472 } 2473 } 2474 if(nsd->restart_children) { 2475 restart_child_servers(nsd, server_region, netio, 2476 &nsd->xfrd_listener->fd); 2477 nsd->restart_children = 0; 2478 } 2479 if(nsd->reload_failed) { 2480 sig_atomic_t cmd = NSD_RELOAD_DONE; 2481 pid_t mypid; 2482 nsd->reload_failed = 0; 2483 log_msg(LOG_WARNING, 2484 "Reload process %d failed, continuing with old database", 2485 (int) reload_pid); 2486 reload_pid = -1; 2487 if(reload_listener.fd != -1) close(reload_listener.fd); 2488 reload_listener.fd = -1; 2489 reload_listener.event_types = NETIO_EVENT_NONE; 2490 task_process_sync(nsd->task[nsd->mytask]); 2491 /* inform xfrd reload attempt ended */ 2492 if(!write_socket(nsd->xfrd_listener->fd, 2493 &cmd, sizeof(cmd))) { 2494 log_msg(LOG_ERR, "problems " 2495 "sending SOAEND to xfrd: %s", 2496 strerror(errno)); 2497 } 2498 mypid = getpid(); 2499 if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { 2500 log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", 2501 strerror(errno)); 2502 } 2503 } 2504 2505 break; 2506 case NSD_RELOAD_REQ: { 2507 sig_atomic_t cmd = NSD_RELOAD_REQ; 2508 log_msg(LOG_WARNING, "SIGHUP received, reloading..."); 2509 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2510 "main: ipc send reload_req to xfrd")); 2511 if(!write_socket(nsd->xfrd_listener->fd, 2512 &cmd, sizeof(cmd))) { 2513 log_msg(LOG_ERR, "server_main: could not send " 2514 "reload_req to xfrd: %s", strerror(errno)); 2515 } 2516 nsd->mode = NSD_RUN; 2517 } break; 2518 case NSD_RELOAD: 2519 /* Continue to run nsd after reload */ 2520 nsd->mode = NSD_RUN; 2521 DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading...")); 2522 if (reload_pid != -1) { 2523 log_msg(LOG_WARNING, "Reload already in progress (pid = %d)", 2524 (int) reload_pid); 2525 break; 2526 } 2527 2528 /* switch the mytask to keep track of who owns the task udb */ 2529 nsd->mytask = 1 - nsd->mytask; 2530 if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) { 2531 log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno)); 2532 reload_pid = -1; 2533 break; 2534 } 2535 2536 /* Do actual reload */ 2537 reload_pid = fork(); 2538 switch (reload_pid) { 2539 case -1: 2540 log_msg(LOG_ERR, "fork failed: %s", strerror(errno)); 2541 break; 2542 default: 2543 /* PARENT */ 2544 close(reload_sockets[0]); 2545 server_reload(nsd, server_region, netio, 2546 reload_sockets[1]); 2547 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main")); 2548 close(reload_sockets[1]); 2549 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed")); 2550 /* drop stale xfrd ipc data */ 2551 ((struct ipc_handler_conn_data*)nsd-> 2552 xfrd_listener->user_data) 2553 ->conn->is_reading = 0; 2554 reload_pid = -1; 2555 reload_listener.fd = -1; 2556 reload_listener.event_types = NETIO_EVENT_NONE; 2557 DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run")); 2558 break; 2559 case 0: 2560 /* CHILD */ 2561 /* server_main keeps running until NSD_QUIT_SYNC 2562 * is received from reload.
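 * The handshake, as implemented in server_reload() above, then runs:
 * the reload process sends NSD_QUIT_SYNC over the socketpair, this
 * process synchronises with xfrd (NSD_QUIT_SYNC case below) and, once
 * told to quit, acks with NSD_RELOAD from its NSD_QUIT case (pushing
 * its BIND8_STATS counters first), after which the reload process
 * continues as the new server_main.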
*/ 2563 close(reload_sockets[1]); 2564 reload_listener.fd = reload_sockets[0]; 2565 reload_listener.timeout = NULL; 2566 reload_listener.user_data = nsd; 2567 reload_listener.event_types = NETIO_EVENT_READ; 2568 reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */ 2569 netio_add_handler(netio, &reload_listener); 2570 reload_pid = getppid(); 2571 break; 2572 } 2573 break; 2574 case NSD_QUIT_SYNC: 2575 /* synchronisation of xfrd, parent and reload */ 2576 if(!nsd->quit_sync_done && reload_listener.fd != -1) { 2577 sig_atomic_t cmd = NSD_RELOAD; 2578 /* stop xfrd ipc writes in progress */ 2579 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2580 "main: ipc send indication reload")); 2581 if(!write_socket(nsd->xfrd_listener->fd, 2582 &cmd, sizeof(cmd))) { 2583 log_msg(LOG_ERR, "server_main: could not send reload " 2584 "indication to xfrd: %s", strerror(errno)); 2585 } 2586 /* wait for ACK from xfrd */ 2587 DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd")); 2588 nsd->quit_sync_done = 1; 2589 } 2590 nsd->mode = NSD_RUN; 2591 break; 2592 case NSD_QUIT: 2593 /* silent shutdown during reload */ 2594 if(reload_listener.fd != -1) { 2595 /* acknowledge the quit, to sync reload that we will really quit now */ 2596 sig_atomic_t cmd = NSD_RELOAD; 2597 DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload")); 2598 if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) { 2599 log_msg(LOG_ERR, "server_main: " 2600 "could not ack quit: %s", strerror(errno)); 2601 } 2602 #ifdef BIND8_STATS 2603 parent_send_stats(nsd, reload_listener.fd); 2604 #endif /* BIND8_STATS */ 2605 close(reload_listener.fd); 2606 } 2607 DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence")); 2608 /* only quit children after xfrd has acked */ 2609 send_children_quit(nsd); 2610 2611 #ifdef MEMCLEAN /* OS collects memory pages */ 2612 region_destroy(server_region); 2613 #endif 2614 server_shutdown(nsd); 2615 2616 /* ENOTREACH */ 2617 break; 2618 case NSD_SHUTDOWN: 2619 break; 2620 case NSD_REAP_CHILDREN: 2621 /* continue; wait for child in run loop */ 2622 nsd->mode = NSD_RUN; 2623 break; 2624 case NSD_STATS: 2625 #ifdef BIND8_STATS 2626 set_children_stats(nsd); 2627 #endif 2628 nsd->mode = NSD_RUN; 2629 break; 2630 default: 2631 log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode); 2632 nsd->mode = NSD_RUN; 2633 break; 2634 } 2635 } 2636 log_msg(LOG_WARNING, "signal received, shutting down..."); 2637 2638 /* close opened ports to avoid race with restart of nsd */ 2639 server_close_all_sockets(nsd->udp, nsd->ifs); 2640 server_close_all_sockets(nsd->tcp, nsd->ifs); 2641 #ifdef HAVE_SSL 2642 daemon_remote_close(nsd->rc); 2643 #endif 2644 send_children_quit_and_wait(nsd); 2645 2646 /* Unlink it if possible... 
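   the pidfile and both task udb files are removed below (plus the
   zonestat files when compiled in), so a restart does not pick up
   stale state.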
*/ 2647 unlinkpid(nsd->pidfile); 2648 unlink(nsd->task[0]->fname); 2649 unlink(nsd->task[1]->fname); 2650 #ifdef USE_ZONE_STATS 2651 unlink(nsd->zonestatfname[0]); 2652 unlink(nsd->zonestatfname[1]); 2653 #endif 2654 #ifdef USE_DNSTAP 2655 dt_collector_close(nsd->dt_collector, nsd); 2656 #endif 2657 2658 if(reload_listener.fd != -1) { 2659 sig_atomic_t cmd = NSD_QUIT; 2660 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2661 "main: ipc send quit to reload-process")); 2662 if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) { 2663 log_msg(LOG_ERR, "server_main: could not send quit to reload: %s", 2664 strerror(errno)); 2665 } 2666 fsync(reload_listener.fd); 2667 close(reload_listener.fd); 2668 /* wait for reload to finish processing */ 2669 while(1) { 2670 if(waitpid(reload_pid, NULL, 0) == -1) { 2671 if(errno == EINTR) continue; 2672 if(errno == ECHILD) break; 2673 log_msg(LOG_ERR, "waitpid(reload %d): %s", 2674 (int)reload_pid, strerror(errno)); 2675 } 2676 break; 2677 } 2678 } 2679 if(nsd->xfrd_listener->fd != -1) { 2680 /* complete quit, stop xfrd */ 2681 sig_atomic_t cmd = NSD_QUIT; 2682 DEBUG(DEBUG_IPC,1, (LOG_INFO, 2683 "main: ipc send quit to xfrd")); 2684 if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { 2685 log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s", 2686 strerror(errno)); 2687 } 2688 fsync(nsd->xfrd_listener->fd); 2689 close(nsd->xfrd_listener->fd); 2690 (void)kill(nsd->pid, SIGTERM); 2691 } 2692 2693 #ifdef MEMCLEAN /* OS collects memory pages */ 2694 region_destroy(server_region); 2695 #endif 2696 /* write the nsd.db to disk, wait for it to complete */ 2697 udb_base_sync(nsd->db->udb, 1); 2698 udb_base_close(nsd->db->udb); 2699 server_shutdown(nsd); 2700 } 2701 2702 static query_state_type 2703 server_process_query(struct nsd *nsd, struct query *query) 2704 { 2705 return query_process(query, nsd); 2706 } 2707 2708 static query_state_type 2709 server_process_query_udp(struct nsd *nsd, struct query *query) 2710 { 2711 #ifdef RATELIMIT 2712 if(query_process(query, nsd) != QUERY_DISCARDED) { 2713 if(rrl_process_query(query)) 2714 return rrl_slip(query); 2715 else return QUERY_PROCESSED; 2716 } 2717 return QUERY_DISCARDED; 2718 #else 2719 return query_process(query, nsd); 2720 #endif 2721 } 2722 2723 const char* 2724 nsd_event_vs(void) 2725 { 2726 #ifdef USE_MINI_EVENT 2727 return ""; 2728 #else 2729 return event_get_version(); 2730 #endif 2731 } 2732 2733 #if !defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS) 2734 static const char* ub_ev_backend2str(int b) 2735 { 2736 switch(b) { 2737 case EVBACKEND_SELECT: return "select"; 2738 case EVBACKEND_POLL: return "poll"; 2739 case EVBACKEND_EPOLL: return "epoll"; 2740 case EVBACKEND_KQUEUE: return "kqueue"; 2741 case EVBACKEND_DEVPOLL: return "devpoll"; 2742 case EVBACKEND_PORT: return "evport"; 2743 } 2744 return "unknown"; 2745 } 2746 #endif 2747 2748 const char* 2749 nsd_event_method(void) 2750 { 2751 #ifdef USE_MINI_EVENT 2752 return "select"; 2753 #else 2754 struct event_base* b = nsd_child_event_base(); 2755 const char* m = "?"; 2756 # ifdef EV_FEATURE_BACKENDS 2757 m = ub_ev_backend2str(ev_backend((struct ev_loop*)b)); 2758 # elif defined(HAVE_EVENT_BASE_GET_METHOD) 2759 m = event_base_get_method(b); 2760 # endif 2761 # ifdef MEMCLEAN 2762 event_base_free(b); 2763 # endif 2764 return m; 2765 #endif 2766 } 2767 2768 struct event_base* 2769 nsd_child_event_base(void) 2770 { 2771 struct event_base* base; 2772 #ifdef USE_MINI_EVENT 2773 static time_t secs; 2774 static struct timeval now; 2775 base = 
event_init(&secs, &now); 2776 #else 2777 # if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP) 2778 /* libev */ 2779 base = (struct event_base *)ev_default_loop(EVFLAG_AUTO); 2780 # else 2781 /* libevent */ 2782 # ifdef HAVE_EVENT_BASE_NEW 2783 base = event_base_new(); 2784 # else 2785 base = event_init(); 2786 # endif 2787 # endif 2788 #endif 2789 return base; 2790 } 2791 2792 static void 2793 add_udp_handler( 2794 struct nsd *nsd, 2795 struct nsd_socket *sock, 2796 struct udp_handler_data *data) 2797 { 2798 struct event *handler = &data->event; 2799 2800 data->nsd = nsd; 2801 data->socket = sock; 2802 2803 memset(handler, 0, sizeof(*handler)); 2804 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data); 2805 if(event_base_set(nsd->event_base, handler) != 0) 2806 log_msg(LOG_ERR, "nsd udp: event_base_set failed"); 2807 if(event_add(handler, NULL) != 0) 2808 log_msg(LOG_ERR, "nsd udp: event_add failed"); 2809 } 2810 2811 void 2812 add_tcp_handler( 2813 struct nsd *nsd, 2814 struct nsd_socket *sock, 2815 struct tcp_accept_handler_data *data) 2816 { 2817 struct event *handler = &data->event; 2818 2819 data->nsd = nsd; 2820 data->socket = sock; 2821 2822 #ifdef HAVE_SSL 2823 if (nsd->tls_ctx && 2824 nsd->options->tls_port && 2825 using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port)) 2826 { 2827 data->tls_accept = 1; 2828 if(verbosity >= 2) { 2829 char buf[48]; 2830 addrport2str((struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf)); 2831 VERBOSITY(2, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf)); 2832 } 2833 } else { 2834 data->tls_accept = 0; 2835 } 2836 #endif 2837 2838 memset(handler, 0, sizeof(*handler)); 2839 event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_tcp_accept, data); 2840 if(event_base_set(nsd->event_base, handler) != 0) 2841 log_msg(LOG_ERR, "nsd tcp: event_base_set failed"); 2842 if(event_add(handler, NULL) != 0) 2843 log_msg(LOG_ERR, "nsd tcp: event_add failed"); 2844 data->event_added = 1; 2845 } 2846 2847 /* 2848 * Serve DNS requests. 
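 * Each child creates its own event base, registers a handler for IPC
 * commands from the parent (parent_fd), and then adds UDP and TCP
 * accept handlers for its slice of the sockets. With reuseport, each
 * child serves numifs = nsd->ifs / nsd->reuseport sockets starting at
 * from = numifs * child_num; e.g. (illustrative numbers only) with
 * nsd->ifs == 8 and reuseport == 4, child 1 serves sockets 2 and 3.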
2849 */ 2850 void 2851 server_child(struct nsd *nsd) 2852 { 2853 size_t i, from, numifs; 2854 region_type *server_region = region_create(xalloc, free); 2855 struct event_base* event_base = nsd_child_event_base(); 2856 sig_atomic_t mode; 2857 2858 if(!event_base) { 2859 log_msg(LOG_ERR, "nsd server could not create event base"); 2860 exit(1); 2861 } 2862 nsd->event_base = event_base; 2863 nsd->server_region = server_region; 2864 2865 #ifdef RATELIMIT 2866 rrl_init(nsd->this_child->child_num); 2867 #endif 2868 2869 assert(nsd->server_kind != NSD_SERVER_MAIN); 2870 DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started")); 2871 2872 #ifdef HAVE_SETPROCTITLE 2873 setproctitle("server %d", nsd->this_child->child_num + 1); 2874 #endif 2875 #ifdef HAVE_CPUSET_T 2876 if(nsd->use_cpu_affinity) { 2877 set_cpu_affinity(nsd->this_child->cpuset); 2878 } 2879 #endif 2880 2881 if (!(nsd->server_kind & NSD_SERVER_TCP)) { 2882 server_close_all_sockets(nsd->tcp, nsd->ifs); 2883 } 2884 if (!(nsd->server_kind & NSD_SERVER_UDP)) { 2885 server_close_all_sockets(nsd->udp, nsd->ifs); 2886 } 2887 2888 if (nsd->this_child->parent_fd != -1) { 2889 struct event *handler; 2890 struct ipc_handler_conn_data* user_data = 2891 (struct ipc_handler_conn_data*)region_alloc( 2892 server_region, sizeof(struct ipc_handler_conn_data)); 2893 user_data->nsd = nsd; 2894 user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ); 2895 2896 handler = (struct event*) region_alloc( 2897 server_region, sizeof(*handler)); 2898 memset(handler, 0, sizeof(*handler)); 2899 event_set(handler, nsd->this_child->parent_fd, EV_PERSIST| 2900 EV_READ, child_handle_parent_command, user_data); 2901 if(event_base_set(event_base, handler) != 0) 2902 log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed"); 2903 if(event_add(handler, NULL) != 0) 2904 log_msg(LOG_ERR, "nsd ipcchild: event_add failed"); 2905 } 2906 2907 if(nsd->reuseport) { 2908 numifs = nsd->ifs / nsd->reuseport; 2909 from = numifs * nsd->this_child->child_num; 2910 if(from+numifs > nsd->ifs) { /* should not happen */ 2911 from = 0; 2912 numifs = nsd->ifs; 2913 } 2914 } else { 2915 from = 0; 2916 numifs = nsd->ifs; 2917 } 2918 2919 if (nsd->server_kind & NSD_SERVER_UDP) { 2920 int child = nsd->this_child->child_num; 2921 memset(msgs, 0, sizeof(msgs)); 2922 for (i = 0; i < NUM_RECV_PER_SELECT; i++) { 2923 queries[i] = query_create(server_region, 2924 compressed_dname_offsets, 2925 compression_table_size, compressed_dnames); 2926 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 2927 iovecs[i].iov_base = buffer_begin(queries[i]->packet); 2928 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 2929 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 2930 msgs[i].msg_hdr.msg_iovlen = 1; 2931 msgs[i].msg_hdr.msg_name = &queries[i]->addr; 2932 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 2933 } 2934 2935 for (i = 0; i < nsd->ifs; i++) { 2936 int listen; 2937 struct udp_handler_data *data; 2938 2939 listen = nsd_bitset_isset(nsd->udp[i].servers, child); 2940 2941 if(i >= from && i < (from + numifs) && listen) { 2942 data = region_alloc_zero( 2943 nsd->server_region, sizeof(*data)); 2944 add_udp_handler(nsd, &nsd->udp[i], data); 2945 } else { 2946 /* close sockets intended for other servers */ 2947 server_close_socket(&nsd->udp[i]); 2948 } 2949 } 2950 } 2951 2952 /* 2953 * Keep track of all the TCP accept handlers so we can enable 2954 * and disable them based on the current number of active TCP 2955 * connections. 
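 * The re-enable half of that lives in cleanup_tcp_handler() below:
 * when a connection closes while the count sits at the maximum (or a
 * slow accept is pending), it calls
 * configure_handler_event_types(EV_READ|EV_PERSIST) so the accept
 * handlers start firing again.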
2956 */ 2957 if (nsd->server_kind & NSD_SERVER_TCP) { 2958 int child = nsd->this_child->child_num; 2959 tcp_accept_handler_count = numifs; 2960 tcp_accept_handlers = region_alloc_array(server_region, 2961 numifs, sizeof(*tcp_accept_handlers)); 2962 2963 for (i = 0; i < nsd->ifs; i++) { 2964 int listen; 2965 struct tcp_accept_handler_data *data; 2966 2967 listen = nsd_bitset_isset(nsd->tcp[i].servers, child); 2968 2969 if(i >= from && i < (from + numifs) && listen) { 2970 data = &tcp_accept_handlers[i-from]; 2971 memset(data, 0, sizeof(*data)); 2972 add_tcp_handler(nsd, &nsd->tcp[i], data); 2973 } else { 2974 /* close sockets intended for other servers */ 2975 /* 2976 * uncomment this once tcp servers are no 2977 * longer copied in the tcp fd copy line 2978 * in server_init(). 2979 server_close_socket(&nsd->tcp[i]); 2980 */ 2981 /* close sockets not meant for this server*/ 2982 if(!listen) 2983 server_close_socket(&nsd->tcp[i]); 2984 } 2985 } 2986 } else { 2987 tcp_accept_handler_count = 0; 2988 } 2989 2990 /* The main loop... */ 2991 while ((mode = nsd->mode) != NSD_QUIT) { 2992 if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd); 2993 2994 /* Do we need to do the statistics... */ 2995 if (mode == NSD_STATS) { 2996 #ifdef BIND8_STATS 2997 int p = nsd->st.period; 2998 nsd->st.period = 1; /* force stats printout */ 2999 /* Dump the statistics */ 3000 bind8_stats(nsd); 3001 nsd->st.period = p; 3002 #else /* !BIND8_STATS */ 3003 log_msg(LOG_NOTICE, "Statistics support not enabled at compile time."); 3004 #endif /* BIND8_STATS */ 3005 3006 nsd->mode = NSD_RUN; 3007 } 3008 else if (mode == NSD_REAP_CHILDREN) { 3009 /* got signal, notify parent. parent reaps terminated children. */ 3010 if (nsd->this_child->parent_fd != -1) { 3011 sig_atomic_t parent_notify = NSD_REAP_CHILDREN; 3012 if (write(nsd->this_child->parent_fd, 3013 &parent_notify, 3014 sizeof(parent_notify)) == -1) 3015 { 3016 log_msg(LOG_ERR, "problems sending command from %d to parent: %s", 3017 (int) nsd->this_child->pid, strerror(errno)); 3018 } 3019 } else /* no parent, so reap 'em */ 3020 while (waitpid(-1, NULL, WNOHANG) > 0) ; 3021 nsd->mode = NSD_RUN; 3022 } 3023 else if(mode == NSD_RUN) { 3024 /* Wait for a query... 
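   EVLOOP_ONCE makes event_base_loop() return after one dispatch
   round, so the signal-driven mode flags are re-examined on every
   iteration of the surrounding while loop.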
*/ 3025 if(event_base_loop(event_base, EVLOOP_ONCE) == -1) { 3026 if (errno != EINTR) { 3027 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); 3028 break; 3029 } 3030 } 3031 } else if(mode == NSD_QUIT) { 3032 /* ignore here, quit */ 3033 } else { 3034 log_msg(LOG_ERR, "mode bad value %d, back to service.", 3035 (int)mode); 3036 nsd->mode = NSD_RUN; 3037 } 3038 } 3039 3040 service_remaining_tcp(nsd); 3041 #ifdef BIND8_STATS 3042 bind8_stats(nsd); 3043 #endif /* BIND8_STATS */ 3044 3045 #ifdef MEMCLEAN /* OS collects memory pages */ 3046 #ifdef RATELIMIT 3047 rrl_deinit(nsd->this_child->child_num); 3048 #endif 3049 event_base_free(event_base); 3050 region_destroy(server_region); 3051 #endif 3052 server_shutdown(nsd); 3053 } 3054 3055 static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg) 3056 { 3057 int* timed_out = (int*)arg; 3058 assert(event & EV_TIMEOUT); (void)event; 3059 /* wake up the service tcp thread, note event is no longer 3060 * registered */ 3061 *timed_out = 1; 3062 } 3063 3064 void 3065 service_remaining_tcp(struct nsd* nsd) 3066 { 3067 struct tcp_handler_data* p; 3068 struct event_base* event_base; 3069 /* check if it is needed */ 3070 if(nsd->current_tcp_count == 0 || tcp_active_list == NULL) 3071 return; 3072 VERBOSITY(4, (LOG_INFO, "service remaining TCP connections")); 3073 3074 /* setup event base */ 3075 event_base = nsd_child_event_base(); 3076 if(!event_base) { 3077 log_msg(LOG_ERR, "nsd remain tcp could not create event base"); 3078 return; 3079 } 3080 /* register tcp connections */ 3081 for(p = tcp_active_list; p != NULL; p = p->next) { 3082 struct timeval timeout; 3083 int fd = p->event.ev_fd; 3084 #ifdef USE_MINI_EVENT 3085 short event = p->event.ev_flags & (EV_READ|EV_WRITE); 3086 #else 3087 short event = p->event.ev_events & (EV_READ|EV_WRITE); 3088 #endif 3089 void (*fn)(int, short, void*); 3090 #ifdef HAVE_SSL 3091 if(p->tls) { 3092 if((event&EV_READ)) 3093 fn = handle_tls_reading; 3094 else fn = handle_tls_writing; 3095 } else { 3096 #endif 3097 if((event&EV_READ)) 3098 fn = handle_tcp_reading; 3099 else fn = handle_tcp_writing; 3100 #ifdef HAVE_SSL 3101 } 3102 #endif 3103 3104 p->tcp_no_more_queries = 1; 3105 /* set timeout to 1/10 second */ 3106 if(p->tcp_timeout > 100) 3107 p->tcp_timeout = 100; 3108 timeout.tv_sec = p->tcp_timeout / 1000; 3109 timeout.tv_usec = (p->tcp_timeout % 1000)*1000; 3110 event_del(&p->event); 3111 memset(&p->event, 0, sizeof(p->event)); 3112 event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT, 3113 fn, p); 3114 if(event_base_set(event_base, &p->event) != 0) 3115 log_msg(LOG_ERR, "event base set failed"); 3116 if(event_add(&p->event, &timeout) != 0) 3117 log_msg(LOG_ERR, "event add failed"); 3118 } 3119 3120 /* handle it */ 3121 while(nsd->current_tcp_count > 0) { 3122 mode_t m = server_signal_mode(nsd); 3123 struct event timeout; 3124 struct timeval tv; 3125 int timed_out = 0; 3126 if(m == NSD_QUIT || m == NSD_SHUTDOWN || 3127 m == NSD_REAP_CHILDREN) { 3128 /* quit */ 3129 break; 3130 } 3131 /* timer */ 3132 /* have to do something every second */ 3133 tv.tv_sec = 1; 3134 tv.tv_usec = 0; 3135 memset(&timeout, 0, sizeof(timeout)); 3136 event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout, 3137 &timed_out); 3138 if(event_base_set(event_base, &timeout) != 0) 3139 log_msg(LOG_ERR, "remaintcp timer: event_base_set failed"); 3140 if(event_add(&timeout, &tv) != 0) 3141 log_msg(LOG_ERR, "remaintcp timer: event_add failed"); 3142 3143 /* service loop */ 3144 if(event_base_loop(event_base, 
EVLOOP_ONCE) == -1) { 3145 if (errno != EINTR) { 3146 log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); 3147 break; 3148 } 3149 } 3150 if(!timed_out) { 3151 event_del(&timeout); 3152 } else { 3153 /* timed out, quit */ 3154 VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit")); 3155 break; 3156 } 3157 } 3158 #ifdef MEMCLEAN 3159 event_base_free(event_base); 3160 #endif 3161 /* continue to quit after return */ 3162 } 3163 3164 /* Implement recvmmsg and sendmmsg if the platform does not. These functions 3165 * are always used, even if nonblocking operations are broken, in which case 3166 * NUM_RECV_PER_SELECT is defined to 1 (one). 3167 */ 3168 #if defined(HAVE_RECVMMSG) 3169 #define nsd_recvmmsg recvmmsg 3170 #else /* !HAVE_RECVMMSG */ 3171 3172 static int 3173 nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, 3174 int flags, struct timespec *timeout) 3175 { 3176 unsigned int vpos = 0; 3177 ssize_t rcvd; 3178 3179 /* timeout is ignored, ensure caller does not expect it to work */ 3180 assert(timeout == NULL); (void)timeout; 3181 3182 while(vpos < vlen) { 3183 rcvd = recvfrom(sockfd, 3184 msgvec[vpos].msg_hdr.msg_iov->iov_base, 3185 msgvec[vpos].msg_hdr.msg_iov->iov_len, 3186 flags, 3187 msgvec[vpos].msg_hdr.msg_name, 3188 &msgvec[vpos].msg_hdr.msg_namelen); 3189 if(rcvd < 0) { 3190 break; 3191 } else { 3192 assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX); 3193 msgvec[vpos].msg_len = (unsigned int)rcvd; 3194 vpos++; 3195 } 3196 } 3197 3198 if(vpos) { 3199 /* error will be picked up next time */ 3200 return (int)vpos; 3201 } else if(errno == 0) { 3202 return 0; 3203 } else if(errno == EAGAIN) { 3204 return 0; 3205 } 3206 3207 return -1; 3208 } 3209 #endif /* HAVE_RECVMMSG */ 3210 3211 #ifdef HAVE_SENDMMSG 3212 #define nsd_sendmmsg(...) 
sendmmsg(__VA_ARGS__) 3213 #else /* !HAVE_SENDMMSG */ 3214 3215 static int 3216 nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags) 3217 { 3218 unsigned int vpos = 0; 3219 ssize_t snd; 3220 3221 while(vpos < vlen) { 3222 assert(msgvec[vpos].msg_hdr.msg_iovlen == 1); 3223 snd = sendto(sockfd, 3224 msgvec[vpos].msg_hdr.msg_iov->iov_base, 3225 msgvec[vpos].msg_hdr.msg_iov->iov_len, 3226 flags, 3227 msgvec[vpos].msg_hdr.msg_name, 3228 msgvec[vpos].msg_hdr.msg_namelen); 3229 if(snd < 0) { 3230 break; 3231 } else { 3232 msgvec[vpos].msg_len = (unsigned int)snd; 3233 vpos++; 3234 } 3235 } 3236 3237 if(vpos) { 3238 return (int)vpos; 3239 } else if(errno == 0) { 3240 return 0; 3241 } 3242 3243 return -1; 3244 } 3245 #endif /* HAVE_SENDMMSG */ 3246 3247 static void 3248 handle_udp(int fd, short event, void* arg) 3249 { 3250 struct udp_handler_data *data = (struct udp_handler_data *) arg; 3251 int received, sent, recvcount, i; 3252 struct query *q; 3253 3254 if (!(event & EV_READ)) { 3255 return; 3256 } 3257 recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL); 3258 /* this printf strangely gave a performance increase on Linux */ 3259 /* printf("recvcount %d \n", recvcount); */ 3260 if (recvcount == -1) { 3261 if (errno != EAGAIN && errno != EINTR) { 3262 log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno)); 3263 STATUP(data->nsd, rxerr); 3264 /* No zone statup */ 3265 } 3266 /* Simply no data available */ 3267 return; 3268 } 3269 for (i = 0; i < recvcount; i++) { 3270 loopstart: 3271 received = msgs[i].msg_len; 3272 queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen; 3273 q = queries[i]; 3274 if (received == -1) { 3275 log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror( 3276 #if defined(HAVE_RECVMMSG) 3277 msgs[i].msg_hdr.msg_flags 3278 #else 3279 errno 3280 #endif 3281 )); 3282 STATUP(data->nsd, rxerr); 3283 /* No zone statup */ 3284 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3285 iovecs[i].iov_len = buffer_remaining(q->packet); 3286 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3287 goto swap_drop; 3288 } 3289 3290 /* Account... */ 3291 #ifdef BIND8_STATS 3292 if (data->socket->addr.ai_family == AF_INET) { 3293 STATUP(data->nsd, qudp); 3294 } else if (data->socket->addr.ai_family == AF_INET6) { 3295 STATUP(data->nsd, qudp6); 3296 } 3297 #endif 3298 3299 buffer_skip(q->packet, received); 3300 buffer_flip(q->packet); 3301 #ifdef USE_DNSTAP 3302 dt_collector_submit_auth_query(data->nsd, &q->addr, q->addrlen, 3303 q->tcp, q->packet); 3304 #endif /* USE_DNSTAP */ 3305 3306 /* Process and answer the query... */ 3307 if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) { 3308 if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) { 3309 STATUP(data->nsd, nona); 3310 ZTATUP(data->nsd, q->zone, nona); 3311 } 3312 3313 #ifdef USE_ZONE_STATS 3314 if (data->socket->addr.ai_family == AF_INET) { 3315 ZTATUP(data->nsd, q->zone, qudp); 3316 } else if (data->socket->addr.ai_family == AF_INET6) { 3317 ZTATUP(data->nsd, q->zone, qudp6); 3318 } 3319 #endif 3320 3321 /* Add EDNS0 and TSIG info if necessary. */ 3322 query_add_optional(q, data->nsd); 3323 3324 buffer_flip(q->packet); 3325 iovecs[i].iov_len = buffer_remaining(q->packet); 3326 #ifdef BIND8_STATS 3327 /* Account the rcode & TC... 
*/ 3328 STATUP2(data->nsd, rcode, RCODE(q->packet)); 3329 ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet)); 3330 if (TC(q->packet)) { 3331 STATUP(data->nsd, truncated); 3332 ZTATUP(data->nsd, q->zone, truncated); 3333 } 3334 #endif /* BIND8_STATS */ 3335 #ifdef USE_DNSTAP 3336 dt_collector_submit_auth_response(data->nsd, 3337 &q->addr, q->addrlen, q->tcp, q->packet, 3338 q->zone); 3339 #endif /* USE_DNSTAP */ 3340 } else { 3341 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3342 iovecs[i].iov_len = buffer_remaining(q->packet); 3343 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3344 swap_drop: 3345 STATUP(data->nsd, dropped); 3346 ZTATUP(data->nsd, q->zone, dropped); 3347 if(i != recvcount-1) { 3348 /* swap with last and decrease recvcount */ 3349 struct mmsghdr mtmp = msgs[i]; 3350 struct iovec iotmp = iovecs[i]; 3351 recvcount--; 3352 msgs[i] = msgs[recvcount]; 3353 iovecs[i] = iovecs[recvcount]; 3354 queries[i] = queries[recvcount]; 3355 msgs[recvcount] = mtmp; 3356 iovecs[recvcount] = iotmp; 3357 queries[recvcount] = q; 3358 msgs[i].msg_hdr.msg_iov = &iovecs[i]; 3359 msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount]; 3360 goto loopstart; 3361 } else { recvcount --; } 3362 } 3363 } 3364 3365 /* send until all are sent */ 3366 i = 0; 3367 while(i<recvcount) { 3368 sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0); 3369 if(sent == -1) { 3370 if(errno == ENOBUFS || 3371 #ifdef EWOULDBLOCK 3372 errno == EWOULDBLOCK || 3373 #endif 3374 errno == EAGAIN) { 3375 /* block to wait until send buffer avail */ 3376 int flag, errstore; 3377 if((flag = fcntl(fd, F_GETFL)) == -1) { 3378 log_msg(LOG_ERR, "cannot fcntl F_GETFL: %s", strerror(errno)); 3379 flag = 0; 3380 } 3381 flag &= ~O_NONBLOCK; 3382 if(fcntl(fd, F_SETFL, flag) == -1) 3383 log_msg(LOG_ERR, "cannot fcntl F_SETFL 0: %s", strerror(errno)); 3384 sent = nsd_sendmmsg(fd, &msgs[i], recvcount-i, 0); 3385 errstore = errno; 3386 flag |= O_NONBLOCK; 3387 if(fcntl(fd, F_SETFL, flag) == -1) 3388 log_msg(LOG_ERR, "cannot fcntl F_SETFL O_NONBLOCK: %s", strerror(errno)); 3389 if(sent != -1) { 3390 i += sent; 3391 continue; 3392 } 3393 errno = errstore; 3394 } 3395 /* don't log transient network full errors, unless 3396 * on higher verbosity */ 3397 if(!(errno == ENOBUFS && verbosity < 1) && 3398 #ifdef EWOULDBLOCK 3399 errno != EWOULDBLOCK && 3400 #endif 3401 errno != EAGAIN) { 3402 const char* es = strerror(errno); 3403 char a[64]; 3404 addrport2str(&queries[i]->addr, a, sizeof(a)); 3405 log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es); 3406 } 3407 #ifdef BIND8_STATS 3408 data->nsd->st.txerr += recvcount-i; 3409 #endif /* BIND8_STATS */ 3410 break; 3411 } 3412 i += sent; 3413 } 3414 for(i=0; i<recvcount; i++) { 3415 query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); 3416 iovecs[i].iov_len = buffer_remaining(queries[i]->packet); 3417 msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; 3418 } 3419 } 3420 3421 #ifdef HAVE_SSL 3422 /* 3423 * Setup an event for the tcp handler. 
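 * It re-registers data->event on its existing event base with a fresh
 * nsd->tcp_timeout. The TLS code below uses it to flip a connection
 * between read and write interest, e.g.
 *   tcp_handler_setup_event(data, handle_tls_reading, fd,
 *           EV_PERSIST|EV_TIMEOUT|EV_READ);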
3424 */ 3425 static void 3426 tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *), 3427 int fd, short event) 3428 { 3429 struct timeval timeout; 3430 struct event_base* ev_base; 3431 3432 timeout.tv_sec = data->nsd->tcp_timeout; 3433 timeout.tv_usec = 0L; 3434 3435 ev_base = data->event.ev_base; 3436 event_del(&data->event); 3437 memset(&data->event, 0, sizeof(data->event)); 3438 event_set(&data->event, fd, event, fn, data); 3439 if(event_base_set(ev_base, &data->event) != 0) 3440 log_msg(LOG_ERR, "event base set failed"); 3441 if(event_add(&data->event, &timeout) != 0) 3442 log_msg(LOG_ERR, "event add failed"); 3443 } 3444 #endif /* HAVE_SSL */ 3445 3446 static void 3447 cleanup_tcp_handler(struct tcp_handler_data* data) 3448 { 3449 event_del(&data->event); 3450 #ifdef HAVE_SSL 3451 if(data->tls) { 3452 SSL_shutdown(data->tls); 3453 SSL_free(data->tls); 3454 data->tls = NULL; 3455 } 3456 #endif 3457 close(data->event.ev_fd); 3458 if(data->prev) 3459 data->prev->next = data->next; 3460 else tcp_active_list = data->next; 3461 if(data->next) 3462 data->next->prev = data->prev; 3463 3464 /* 3465 * Enable the TCP accept handlers when the current number of 3466 * TCP connections is about to drop below the maximum number 3467 * of TCP connections. 3468 */ 3469 if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) { 3470 configure_handler_event_types(EV_READ|EV_PERSIST); 3471 if(slowaccept) { 3472 event_del(&slowaccept_event); 3473 slowaccept = 0; 3474 } 3475 } 3476 --data->nsd->current_tcp_count; 3477 assert(data->nsd->current_tcp_count >= 0); 3478 3479 region_destroy(data->region); 3480 } 3481 3482 static void 3483 handle_tcp_reading(int fd, short event, void* arg) 3484 { 3485 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 3486 ssize_t received; 3487 struct event_base* ev_base; 3488 struct timeval timeout; 3489 3490 if ((event & EV_TIMEOUT)) { 3491 /* Connection timed out. */ 3492 cleanup_tcp_handler(data); 3493 return; 3494 } 3495 3496 if ((data->nsd->tcp_query_count > 0 && 3497 data->query_count >= data->nsd->tcp_query_count) || 3498 data->tcp_no_more_queries) { 3499 /* No more queries allowed on this tcp connection. */ 3500 cleanup_tcp_handler(data); 3501 return; 3502 } 3503 3504 assert((event & EV_READ)); 3505 3506 if (data->bytes_transmitted == 0) { 3507 query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1); 3508 } 3509 3510 /* 3511 * Check if we received the leading packet length bytes yet. 3512 */ 3513 if (data->bytes_transmitted < sizeof(uint16_t)) { 3514 received = read(fd, 3515 (char *) &data->query->tcplen 3516 + data->bytes_transmitted, 3517 sizeof(uint16_t) - data->bytes_transmitted); 3518 if (received == -1) { 3519 if (errno == EAGAIN || errno == EINTR) { 3520 /* 3521 * Read would block, wait until more 3522 * data is available. 3523 */ 3524 return; 3525 } else { 3526 char buf[48]; 3527 addr2str(&data->query->addr, buf, sizeof(buf)); 3528 #ifdef ECONNRESET 3529 if (verbosity >= 2 || errno != ECONNRESET) 3530 #endif /* ECONNRESET */ 3531 log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno)); 3532 cleanup_tcp_handler(data); 3533 return; 3534 } 3535 } else if (received == 0) { 3536 /* EOF */ 3537 cleanup_tcp_handler(data); 3538 return; 3539 } 3540 3541 data->bytes_transmitted += received; 3542 if (data->bytes_transmitted < sizeof(uint16_t)) { 3543 /* 3544 * Not done with the tcplen yet, wait for more 3545 * data to become available. 
3546 */ 3547 return; 3548 } 3549 3550 assert(data->bytes_transmitted == sizeof(uint16_t)); 3551 3552 data->query->tcplen = ntohs(data->query->tcplen); 3553 3554 /* 3555 * Minimum query size is: 3556 * 3557 * Size of the header (12) 3558 * + Root domain name (1) 3559 * + Query class (2) 3560 * + Query type (2) 3561 */ 3562 if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) { 3563 VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection")); 3564 cleanup_tcp_handler(data); 3565 return; 3566 } 3567 3568 if (data->query->tcplen > data->query->maxlen) { 3569 VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection")); 3570 cleanup_tcp_handler(data); 3571 return; 3572 } 3573 3574 buffer_set_limit(data->query->packet, data->query->tcplen); 3575 } 3576 3577 assert(buffer_remaining(data->query->packet) > 0); 3578 3579 /* Read the (remaining) query data. */ 3580 received = read(fd, 3581 buffer_current(data->query->packet), 3582 buffer_remaining(data->query->packet)); 3583 if (received == -1) { 3584 if (errno == EAGAIN || errno == EINTR) { 3585 /* 3586 * Read would block, wait until more data is 3587 * available. 3588 */ 3589 return; 3590 } else { 3591 char buf[48]; 3592 addr2str(&data->query->addr, buf, sizeof(buf)); 3593 #ifdef ECONNRESET 3594 if (verbosity >= 2 || errno != ECONNRESET) 3595 #endif /* ECONNRESET */ 3596 log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno)); 3597 cleanup_tcp_handler(data); 3598 return; 3599 } 3600 } else if (received == 0) { 3601 /* EOF */ 3602 cleanup_tcp_handler(data); 3603 return; 3604 } 3605 3606 data->bytes_transmitted += received; 3607 buffer_skip(data->query->packet, received); 3608 if (buffer_remaining(data->query->packet) > 0) { 3609 /* 3610 * Message not yet complete, wait for more data to 3611 * become available. 3612 */ 3613 return; 3614 } 3615 3616 assert(buffer_position(data->query->packet) == data->query->tcplen); 3617 3618 /* Account... */ 3619 #ifdef BIND8_STATS 3620 #ifndef INET6 3621 STATUP(data->nsd, ctcp); 3622 #else 3623 if (data->query->addr.ss_family == AF_INET) { 3624 STATUP(data->nsd, ctcp); 3625 } else if (data->query->addr.ss_family == AF_INET6) { 3626 STATUP(data->nsd, ctcp6); 3627 } 3628 #endif 3629 #endif /* BIND8_STATS */ 3630 3631 /* We have a complete query, process it. */ 3632 3633 /* tcp-query-count: handle query counter ++ */ 3634 data->query_count++; 3635 3636 buffer_flip(data->query->packet); 3637 #ifdef USE_DNSTAP 3638 dt_collector_submit_auth_query(data->nsd, &data->query->addr, 3639 data->query->addrlen, data->query->tcp, data->query->packet); 3640 #endif /* USE_DNSTAP */ 3641 data->query_state = server_process_query(data->nsd, data->query); 3642 if (data->query_state == QUERY_DISCARDED) { 3643 /* Drop the packet and the entire connection... 
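   a discarded query means the rest of the stream can no longer be
   trusted to be framed correctly, so the whole connection is torn
   down rather than only this message...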
*/ 3644 STATUP(data->nsd, dropped); 3645 ZTATUP(data->nsd, data->query->zone, dropped); 3646 cleanup_tcp_handler(data); 3647 return; 3648 } 3649 3650 #ifdef BIND8_STATS 3651 if (RCODE(data->query->packet) == RCODE_OK 3652 && !AA(data->query->packet)) 3653 { 3654 STATUP(data->nsd, nona); 3655 ZTATUP(data->nsd, data->query->zone, nona); 3656 } 3657 #endif /* BIND8_STATS */ 3658 3659 #ifdef USE_ZONE_STATS 3660 #ifndef INET6 3661 ZTATUP(data->nsd, data->query->zone, ctcp); 3662 #else 3663 if (data->query->addr.ss_family == AF_INET) { 3664 ZTATUP(data->nsd, data->query->zone, ctcp); 3665 } else if (data->query->addr.ss_family == AF_INET6) { 3666 ZTATUP(data->nsd, data->query->zone, ctcp6); 3667 } 3668 #endif 3669 #endif /* USE_ZONE_STATS */ 3670 3671 query_add_optional(data->query, data->nsd); 3672 3673 /* Switch to the tcp write handler. */ 3674 buffer_flip(data->query->packet); 3675 data->query->tcplen = buffer_remaining(data->query->packet); 3676 #ifdef BIND8_STATS 3677 /* Account the rcode & TC... */ 3678 STATUP2(data->nsd, rcode, RCODE(data->query->packet)); 3679 ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet)); 3680 if (TC(data->query->packet)) { 3681 STATUP(data->nsd, truncated); 3682 ZTATUP(data->nsd, data->query->zone, truncated); 3683 } 3684 #endif /* BIND8_STATS */ 3685 #ifdef USE_DNSTAP 3686 dt_collector_submit_auth_response(data->nsd, &data->query->addr, 3687 data->query->addrlen, data->query->tcp, data->query->packet, 3688 data->query->zone); 3689 #endif /* USE_DNSTAP */ 3690 data->bytes_transmitted = 0; 3691 3692 timeout.tv_sec = data->tcp_timeout / 1000; 3693 timeout.tv_usec = (data->tcp_timeout % 1000)*1000; 3694 3695 ev_base = data->event.ev_base; 3696 event_del(&data->event); 3697 memset(&data->event, 0, sizeof(data->event)); 3698 event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT, 3699 handle_tcp_reading, data); 3700 if(event_base_set(ev_base, &data->event) != 0) 3701 log_msg(LOG_ERR, "event base set tcpr failed"); 3702 if(event_add(&data->event, &timeout) != 0) 3703 log_msg(LOG_ERR, "event add tcpr failed"); 3704 /* see if we can write the answer right away (usually possible, EAGAIN if not) */ 3705 handle_tcp_writing(fd, EV_WRITE, data); 3706 } 3707 3708 static void 3709 handle_tcp_writing(int fd, short event, void* arg) 3710 { 3711 struct tcp_handler_data *data = (struct tcp_handler_data *) arg; 3712 ssize_t sent; 3713 struct query *q = data->query; 3714 struct timeval timeout; 3715 struct event_base* ev_base; 3716 3717 if ((event & EV_TIMEOUT)) { 3718 /* Connection timed out. */ 3719 cleanup_tcp_handler(data); 3720 return; 3721 } 3722 3723 assert((event & EV_WRITE)); 3724 3725 if (data->bytes_transmitted < sizeof(q->tcplen)) { 3726 /* Writing the response packet length. */ 3727 uint16_t n_tcplen = htons(q->tcplen); 3728 #ifdef HAVE_WRITEV 3729 struct iovec iov[2]; 3730 iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted; 3731 iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted; 3732 iov[1].iov_base = buffer_begin(q->packet); 3733 iov[1].iov_len = buffer_limit(q->packet); 3734 sent = writev(fd, iov, 2); 3735 #else /* HAVE_WRITEV */ 3736 sent = write(fd, 3737 (const char *) &n_tcplen + data->bytes_transmitted, 3738 sizeof(n_tcplen) - data->bytes_transmitted); 3739 #endif /* HAVE_WRITEV */ 3740 if (sent == -1) { 3741 if (errno == EAGAIN || errno == EINTR) { 3742 /* 3743 * Write would block, wait until 3744 * socket becomes writable again.

#ifdef HAVE_SSL
/** create SSL object and associate fd */
static SSL*
incoming_ssl_fd(SSL_CTX* ctx, int fd)
{
	SSL* ssl = SSL_new((SSL_CTX*)ctx);
	if(!ssl) {
		log_crypto_err("could not SSL_new");
		return NULL;
	}
	SSL_set_accept_state(ssl);
	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
	if(!SSL_set_fd(ssl, fd)) {
		log_crypto_err("could not SSL_set_fd");
		SSL_free(ssl);
		return NULL;
	}
	return ssl;
}
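
/*
 * Editorial note: SSL_MODE_AUTO_RETRY only spares the application from
 * retries on blocking transports.  The sockets here are non-blocking, so
 * SSL_read()/SSL_write() can still return SSL_ERROR_WANT_READ or
 * SSL_ERROR_WANT_WRITE, and the handlers below deal with both
 * explicitly.
 */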

/** TLS handshake to upgrade TCP connection */
static int
tls_handshake(struct tcp_handler_data* data, int fd, int writing)
{
	int r;
	if(data->shake_state == tls_hs_read_event) {
		/* the read condition was satisfied; switch back to writing */
		tcp_handler_setup_event(data, handle_tls_writing, fd,
			EV_PERSIST|EV_TIMEOUT|EV_WRITE);
		data->shake_state = tls_hs_none;
		return 1;
	}
	if(data->shake_state == tls_hs_write_event) {
		/* the write condition was satisfied; switch back to reading */
		tcp_handler_setup_event(data, handle_tls_reading, fd,
			EV_PERSIST|EV_TIMEOUT|EV_READ);
		data->shake_state = tls_hs_none;
		return 1;
	}

	/* (continue to) set up the TLS connection */
	ERR_clear_error();
	r = SSL_do_handshake(data->tls);

	if(r != 1) {
		int want = SSL_get_error(data->tls, r);
		if(want == SSL_ERROR_WANT_READ) {
			if(data->shake_state == tls_hs_read) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_read;
			/* switch back to reading mode */
			tcp_handler_setup_event(data, handle_tls_reading, fd,
				EV_PERSIST|EV_TIMEOUT|EV_READ);
			return 1;
		} else if(want == SSL_ERROR_WANT_WRITE) {
			if(data->shake_state == tls_hs_write) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_write;
			/* switch back to writing mode */
			tcp_handler_setup_event(data, handle_tls_writing, fd,
				EV_PERSIST|EV_TIMEOUT|EV_WRITE);
			return 1;
		} else {
			if(r == 0)
				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
			else {
				unsigned long err = ERR_get_error();
				if(!squelch_err_ssl_handshake(err)) {
					char a[64], s[256];
					addr2str(&data->query->addr, a, sizeof(a));
					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
					log_crypto_from_err(s, err);
				}
			}
			cleanup_tcp_handler(data);
			return 0;
		}
	}

	/* Log the successful upgrade; useful for testing and could be removed. */
	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
	/* set back to the event we need to have when reading (or writing) */
	if(data->shake_state == tls_hs_read && writing) {
		tcp_handler_setup_event(data, handle_tls_writing, fd,
			EV_PERSIST|EV_TIMEOUT|EV_WRITE);
	} else if(data->shake_state == tls_hs_write && !writing) {
		tcp_handler_setup_event(data, handle_tls_reading, fd,
			EV_PERSIST|EV_TIMEOUT|EV_READ);
	}
	data->shake_state = tls_hs_none;
	return 1;
}
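
/*
 * Editorial note on the state machine above: the TLS layer may need to
 * read when the application wants to write, and vice versa (most notably
 * during the handshake), so the libevent condition we must wait for does
 * not always match the operation we want to perform.  shake_state
 * records which socket condition we are waiting for; the
 * tls_hs_read_event/tls_hs_write_event states mean "that condition fired
 * on behalf of the opposite operation", so tls_handshake() simply
 * reinstalls the handler we actually wanted and continues.
 */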

/** handle TLS reading of incoming query */
static void
handle_tls_reading(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t received;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out. */
		cleanup_tcp_handler(data);
		return;
	}

	if ((data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) ||
		data->tcp_no_more_queries) {
		/* No more queries allowed on this tcp connection. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_READ));

	if (data->bytes_transmitted == 0) {
		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
	}

	if(data->shake_state != tls_hs_none) {
		if(!tls_handshake(data, fd, 0))
			return;
		if(data->shake_state != tls_hs_none)
			return;
	}

	/*
	 * Check if we received the leading packet length bytes yet.
	 */
	if(data->bytes_transmitted < sizeof(uint16_t)) {
		ERR_clear_error();
		if((received=SSL_read(data->tls, (char *) &data->query->tcplen
			+ data->bytes_transmitted,
			sizeof(uint16_t) - data->bytes_transmitted)) <= 0) {
			int want = SSL_get_error(data->tls, received);
			if(want == SSL_ERROR_ZERO_RETURN) {
				cleanup_tcp_handler(data);
				return; /* shutdown, closed */
			} else if(want == SSL_ERROR_WANT_READ) {
				/* wants to be called again */
				return;
			}
			else if(want == SSL_ERROR_WANT_WRITE) {
				/* switch to writing */
				data->shake_state = tls_hs_write_event;
				tcp_handler_setup_event(data, handle_tls_writing, fd,
					EV_PERSIST | EV_WRITE | EV_TIMEOUT);
				return;
			}
			cleanup_tcp_handler(data);
			log_crypto_err("could not SSL_read");
			return;
		}

		data->bytes_transmitted += received;
		if (data->bytes_transmitted < sizeof(uint16_t)) {
			/*
			 * Not done with the tcplen yet, wait for more
			 * data to become available.
			 */
			return;
		}

		assert(data->bytes_transmitted == sizeof(uint16_t));

		data->query->tcplen = ntohs(data->query->tcplen);

		/*
		 * Minimum query size is:
		 *
		 *     Size of the header (12)
		 *   + Root domain name (1)
		 *   + Query class (2)
		 *   + Query type (2)
		 */
		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
			cleanup_tcp_handler(data);
			return;
		}

		if (data->query->tcplen > data->query->maxlen) {
			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
			cleanup_tcp_handler(data);
			return;
		}

		buffer_set_limit(data->query->packet, data->query->tcplen);
	}

	assert(buffer_remaining(data->query->packet) > 0);

	/* Read the (remaining) query data. */
	ERR_clear_error();
	received = SSL_read(data->tls, (void*)buffer_current(data->query->packet),
		(int)buffer_remaining(data->query->packet));
	if(received <= 0) {
		int want = SSL_get_error(data->tls, received);
		if(want == SSL_ERROR_ZERO_RETURN) {
			cleanup_tcp_handler(data);
			return; /* shutdown, closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* wants to be called again */
			return;
		}
		else if(want == SSL_ERROR_WANT_WRITE) {
			/* switch back to writing */
			data->shake_state = tls_hs_write_event;
			tcp_handler_setup_event(data, handle_tls_writing, fd,
				EV_PERSIST | EV_WRITE | EV_TIMEOUT);
			return;
		}
		cleanup_tcp_handler(data);
		log_crypto_err("could not SSL_read");
		return;
	}

	data->bytes_transmitted += received;
	buffer_skip(data->query->packet, received);
	if (buffer_remaining(data->query->packet) > 0) {
		/*
		 * Message not yet complete, wait for more data to
		 * become available.
		 */
		return;
	}

	assert(buffer_position(data->query->packet) == data->query->tcplen);

	/* Account... */
#ifndef INET6
	STATUP(data->nsd, ctls);
#else
	if (data->query->addr.ss_family == AF_INET) {
		STATUP(data->nsd, ctls);
	} else if (data->query->addr.ss_family == AF_INET6) {
		STATUP(data->nsd, ctls6);
	}
#endif

	/* We have a complete query, process it. */

	/* tcp-query-count: increment the per-connection query counter */
	data->query_count++;

	buffer_flip(data->query->packet);
#ifdef USE_DNSTAP
	dt_collector_submit_auth_query(data->nsd, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet);
#endif /* USE_DNSTAP */
	data->query_state = server_process_query(data->nsd, data->query);
	if (data->query_state == QUERY_DISCARDED) {
		/* Drop the packet and the entire connection... */
		STATUP(data->nsd, dropped);
		ZTATUP(data->nsd, data->query->zone, dropped);
		cleanup_tcp_handler(data);
		return;
	}

#ifdef BIND8_STATS
	if (RCODE(data->query->packet) == RCODE_OK
		&& !AA(data->query->packet))
	{
		STATUP(data->nsd, nona);
		ZTATUP(data->nsd, data->query->zone, nona);
	}
#endif /* BIND8_STATS */

#ifdef USE_ZONE_STATS
#ifndef INET6
	ZTATUP(data->nsd, data->query->zone, ctls);
#else
	if (data->query->addr.ss_family == AF_INET) {
		ZTATUP(data->nsd, data->query->zone, ctls);
	} else if (data->query->addr.ss_family == AF_INET6) {
		ZTATUP(data->nsd, data->query->zone, ctls6);
	}
#endif
#endif /* USE_ZONE_STATS */

	query_add_optional(data->query, data->nsd);

	/* Switch to the tcp write handler. */
	buffer_flip(data->query->packet);
	data->query->tcplen = buffer_remaining(data->query->packet);
#ifdef BIND8_STATS
	/* Account the rcode & TC... */
	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
	if (TC(data->query->packet)) {
		STATUP(data->nsd, truncated);
		ZTATUP(data->nsd, data->query->zone, truncated);
	}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
	dt_collector_submit_auth_response(data->nsd, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet,
		data->query->zone);
#endif /* USE_DNSTAP */
	data->bytes_transmitted = 0;

	tcp_handler_setup_event(data, handle_tls_writing, fd,
		EV_PERSIST | EV_WRITE | EV_TIMEOUT);

	/* see if we can write the answer right away (usually so; EAGAIN if not) */
	handle_tls_writing(fd, EV_WRITE, data);
}
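
/*
 * Editorial note (condensed sketch, not part of the original code): the
 * SSL_read() error discipline used above is
 *
 *	ERR_clear_error();
 *	r = SSL_read(ssl, buf, (int)len);
 *	if(r <= 0) switch(SSL_get_error(ssl, r)) {
 *	case SSL_ERROR_ZERO_RETURN: clean TLS EOF, close the connection;
 *	case SSL_ERROR_WANT_READ:   return, wait for EV_READ and retry;
 *	case SSL_ERROR_WANT_WRITE:  install the write handler and retry;
 *	default:                    log the error, drop the connection;
 *	}
 *
 * ERR_clear_error() before each call keeps stale entries in the thread's
 * error queue from being misattributed to this SSL_read() when
 * SSL_get_error() consults it.
 */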

/** handle TLS writing of outgoing response */
static void
handle_tls_writing(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t sent;
	struct query *q = data->query;
	/* static variable that holds the reassembly buffer used to put the
	 * TCP length in front of the packet, like writev() does for plain
	 * TCP. */
	static buffer_type* global_tls_temp_buffer = NULL;
	buffer_type* write_buffer;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_WRITE));

	if(data->shake_state != tls_hs_none) {
		if(!tls_handshake(data, fd, 1))
			return;
		if(data->shake_state != tls_hs_none)
			return;
	}

	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);

	/* If we are writing the start of a message, we must include the
	 * length; this is done with a copy into write_buffer. */
	write_buffer = NULL;
	if (data->bytes_transmitted == 0) {
		if(!global_tls_temp_buffer) {
			/* allocated in nsd.region, so it is deallocated
			 * when nsd shuts down */
			global_tls_temp_buffer = buffer_create(nsd.region,
				QIOBUFSZ + sizeof(q->tcplen));
			if (!global_tls_temp_buffer) {
				return;
			}
		}
		write_buffer = global_tls_temp_buffer;
		buffer_clear(write_buffer);
		buffer_write_u16(write_buffer, q->tcplen);
		buffer_write(write_buffer, buffer_current(q->packet),
			(int)buffer_remaining(q->packet));
		buffer_flip(write_buffer);
	} else {
		write_buffer = q->packet;
	}

	/* Write the response */
	ERR_clear_error();
	sent = SSL_write(data->tls, buffer_current(write_buffer),
		buffer_remaining(write_buffer));
	if(sent <= 0) {
		int want = SSL_get_error(data->tls, sent);
		if(want == SSL_ERROR_ZERO_RETURN) {
			cleanup_tcp_handler(data);
			/* closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* switch back to reading */
			data->shake_state = tls_hs_read_event;
			tcp_handler_setup_event(data, handle_tls_reading, fd,
				EV_PERSIST | EV_READ | EV_TIMEOUT);
		} else if(want != SSL_ERROR_WANT_WRITE) {
			cleanup_tcp_handler(data);
			log_crypto_err("could not SSL_write");
		}
		return;
	}

	buffer_skip(write_buffer, sent);
	if(buffer_remaining(write_buffer) != 0) {
		/* Not everything was sent; if the temporary buffer was
		 * written from, bring the real packet buffer in sync
		 * with it. */
		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
		}
	}

	data->bytes_transmitted += sent;
	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
		/*
		 * Still more data to write when socket becomes
		 * writable again.
		 */
		return;
	}

	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));

	if (data->query_state == QUERY_IN_AXFR) {
		/* Continue processing AXFR and writing back results. */
		buffer_clear(q->packet);
		data->query_state = query_axfr(data->nsd, q);
		if (data->query_state != QUERY_PROCESSED) {
			query_add_optional(data->query, data->nsd);

			/* Reset data. */
			buffer_flip(q->packet);
			q->tcplen = buffer_remaining(q->packet);
			data->bytes_transmitted = 0;
			/* Reset to writing mode. */
			tcp_handler_setup_event(data, handle_tls_writing, fd,
				EV_PERSIST | EV_WRITE | EV_TIMEOUT);

			/*
			 * Write data if/when the socket is writable
			 * again.
			 */
			return;
		}
	}

	/*
	 * Done sending, wait for the next request to arrive on the
	 * TCP socket by installing the TCP read handler.
	 */
	if ((data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) ||
		data->tcp_no_more_queries) {

		(void) shutdown(fd, SHUT_WR);
	}

	data->bytes_transmitted = 0;

	tcp_handler_setup_event(data, handle_tls_reading, fd,
		EV_PERSIST | EV_READ | EV_TIMEOUT);
}
#endif
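
/*
 * Editorial note: there is no writev() equivalent for SSL_write(), so
 * the first write of each response copies the two length bytes plus the
 * packet into global_tls_temp_buffer and sends them as one unit.
 * SSL_MODE_ENABLE_PARTIAL_WRITE lets SSL_write() report partial progress
 * (a return of 0 < r < n) instead of insisting that the entire buffer be
 * written before reporting success, which matches the
 * resume-where-we-left-off style of these non-blocking handlers.
 */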

static void
handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
	void* ATTR_UNUSED(arg))
{
	if(slowaccept) {
		configure_handler_event_types(EV_PERSIST | EV_READ);
		slowaccept = 0;
	}
}

static int
perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
{
#ifndef HAVE_ACCEPT4
	int s = accept(fd, addr, addrlen);
	if (s != -1) {
		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
			close(s);
			s = -1;
			/*
			 * Set errno to EINTR so the caller treats this
			 * failure like an interrupted accept4() and omits
			 * a second error printout.
			 */
			errno = EINTR;
		}
	}
	return s;
#else
	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
#endif /* HAVE_ACCEPT4 */
}
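
/*
 * Editorial note: accept4() with SOCK_NONBLOCK makes the accepted socket
 * non-blocking atomically; the fallback above needs a separate
 * fcntl(F_SETFL, O_NONBLOCK), which can fail on its own and is therefore
 * checked.  Either way the caller gets back a socket that is already
 * non-blocking, or -1.
 */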

/*
 * Handle an incoming TCP connection. The connection is accepted and
 * a new TCP reader event handler is added. The TCP handler
 * is responsible for cleanup when the connection is closed.
 */
static void
handle_tcp_accept(int fd, short event, void* arg)
{
	struct tcp_accept_handler_data *data
		= (struct tcp_accept_handler_data *) arg;
	int s;
	int reject = 0;
	struct tcp_handler_data *tcp_data;
	region_type *tcp_region;
#ifdef INET6
	struct sockaddr_storage addr;
#else
	struct sockaddr_in addr;
#endif
	socklen_t addrlen;
	struct timeval timeout;

	if (!(event & EV_READ)) {
		return;
	}

	if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) {
		reject = data->nsd->options->tcp_reject_overflow;
		if (!reject) {
			return;
		}
	}

	/* Accept it... */
	addrlen = sizeof(addr);
	s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen);
	if (s == -1) {
		/*
		 * EMFILE and ENFILE signal that the limit of open
		 * file descriptors has been reached. Pause accept().
		 * EINTR is a signal interrupt. The others are various
		 * OS ways of saying that the client has closed the
		 * connection.
		 */
		if (errno == EMFILE || errno == ENFILE) {
			if (!slowaccept) {
				/* disable accept events */
				struct timeval tv;
				configure_handler_event_types(0);
				tv.tv_sec = SLOW_ACCEPT_TIMEOUT;
				tv.tv_usec = 0L;
				memset(&slowaccept_event, 0,
					sizeof(slowaccept_event));
				event_set(&slowaccept_event, -1, EV_TIMEOUT,
					handle_slowaccept_timeout, NULL);
				(void)event_base_set(data->event.ev_base,
					&slowaccept_event);
				(void)event_add(&slowaccept_event, &tv);
				slowaccept = 1;
				/* We don't want to spam the logs here */
			}
		} else if (errno != EINTR
			&& errno != EWOULDBLOCK
#ifdef ECONNABORTED
			&& errno != ECONNABORTED
#endif /* ECONNABORTED */
#ifdef EPROTO
			&& errno != EPROTO
#endif /* EPROTO */
			) {
			log_msg(LOG_ERR, "accept failed: %s", strerror(errno));
		}
		return;
	}

	if (reject) {
		shutdown(s, SHUT_RDWR);
		close(s);
		return;
	}

	/*
	 * This region is deallocated when the TCP connection is
	 * closed by the TCP handler.
	 */
	tcp_region = region_create(xalloc, free);
	tcp_data = (struct tcp_handler_data *) region_alloc(
		tcp_region, sizeof(struct tcp_handler_data));
	tcp_data->region = tcp_region;
	tcp_data->query = query_create(tcp_region, compressed_dname_offsets,
		compression_table_size, compressed_dnames);
	tcp_data->nsd = data->nsd;
	tcp_data->query_count = 0;
#ifdef HAVE_SSL
	tcp_data->shake_state = tls_hs_none;
	tcp_data->tls = NULL;
#endif
	tcp_data->prev = NULL;
	tcp_data->next = NULL;

	tcp_data->query_state = QUERY_PROCESSED;
	tcp_data->bytes_transmitted = 0;
	memcpy(&tcp_data->query->addr, &addr, addrlen);
	tcp_data->query->addrlen = addrlen;

	tcp_data->tcp_no_more_queries = 0;
	tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000;
	if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) {
		/* very busy, give smaller timeout */
		tcp_data->tcp_timeout = 200;
	}
	memset(&tcp_data->event, 0, sizeof(tcp_data->event));
	timeout.tv_sec = tcp_data->tcp_timeout / 1000;
	timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000;

#ifdef HAVE_SSL
	if (data->tls_accept) {
		tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s);
		if(!tcp_data->tls) {
			close(s);
			return;
		}
		tcp_data->shake_state = tls_hs_read;
		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
			handle_tls_reading, tcp_data);
	} else {
#endif
		memset(&tcp_data->event, 0, sizeof(tcp_data->event));
		event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT,
			handle_tcp_reading, tcp_data);
#ifdef HAVE_SSL
	}
#endif
	if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) {
		log_msg(LOG_ERR, "cannot set tcp event base");
		close(s);
		region_destroy(tcp_region);
		return;
	}
	if(event_add(&tcp_data->event, &timeout) != 0) {
		log_msg(LOG_ERR, "cannot add tcp to event base");
		close(s);
		region_destroy(tcp_region);
		return;
	}
	if(tcp_active_list) {
		tcp_active_list->prev = tcp_data;
		tcp_data->next = tcp_active_list;
	}
	tcp_active_list = tcp_data;

	/*
	 * Keep track of the total number of TCP handlers installed so
	 * we can stop accepting connections when the maximum number
	 * of simultaneous TCP connections is reached.
	 *
	 * If tcp-reject-overflow is enabled, however, then we do not
	 * change the handler event type; we keep it as-is and accept
	 * overflow TCP connections only so that we can forcibly kill
	 * them off.
	 */
	++data->nsd->current_tcp_count;
	if (!data->nsd->options->tcp_reject_overflow &&
		data->nsd->current_tcp_count == data->nsd->maximum_tcp_count)
	{
		configure_handler_event_types(0);
	}
}
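
/*
 * Editorial note: the EMFILE/ENFILE branch above is a simple accept
 * backoff.  With level-triggered readiness the listening socket would
 * stay readable while the process is out of file descriptors, so the
 * event loop would spin on a failing accept() and flood the log.
 * Disabling the accept events and re-enabling them from
 * handle_slowaccept_timeout() after SLOW_ACCEPT_TIMEOUT seconds avoids
 * both.
 */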

static void
send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout)
{
	size_t i;
	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) {
			if (write(nsd->children[i].child_fd,
				&command,
				sizeof(command)) == -1)
			{
				if(errno != EAGAIN && errno != EINTR)
					log_msg(LOG_ERR, "problems sending command %d to server %d: %s",
						(int) command,
						(int) nsd->children[i].pid,
						strerror(errno));
			} else if (timeout > 0) {
				(void)block_read(NULL,
					nsd->children[i].child_fd,
					&command, sizeof(command), timeout);
			}
			fsync(nsd->children[i].child_fd);
			close(nsd->children[i].child_fd);
			nsd->children[i].child_fd = -1;
		}
	}
}

static void
send_children_quit(struct nsd* nsd)
{
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit"));
	send_children_command(nsd, NSD_QUIT, 0);
}

static void
send_children_quit_and_wait(struct nsd* nsd)
{
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait"));
	send_children_command(nsd, NSD_QUIT_CHILD, 3);
}
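
/*
 * Editorial note: with a non-zero timeout, send_children_command() also
 * does a blocking read on the same channel, giving the child up to that
 * many seconds to acknowledge the command before the pipe is closed;
 * send_children_quit_and_wait() relies on this so the parent does not
 * tear down state while a child is still shutting down.
 */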

#ifdef BIND8_STATS
static void
set_children_stats(struct nsd* nsd)
{
	size_t i;
	assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0);
	DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children"));
	for (i = 0; i < nsd->child_count; ++i) {
		nsd->children[i].need_to_send_STATS = 1;
		nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE;
	}
}
#endif /* BIND8_STATS */

static void
configure_handler_event_types(short event_types)
{
	size_t i;

	for (i = 0; i < tcp_accept_handler_count; ++i) {
		struct event* handler = &tcp_accept_handlers[i].event;
		if(event_types) {
			/* reassign */
			int fd = handler->ev_fd;
			struct event_base* base = handler->ev_base;
			if(tcp_accept_handlers[i].event_added)
				event_del(handler);
			memset(handler, 0, sizeof(*handler));
			event_set(handler, fd, event_types,
				handle_tcp_accept, &tcp_accept_handlers[i]);
			if(event_base_set(base, handler) != 0)
				log_msg(LOG_ERR, "conhand: cannot event_base");
			if(event_add(handler, NULL) != 0)
				log_msg(LOG_ERR, "conhand: cannot event_add");
			tcp_accept_handlers[i].event_added = 1;
		} else {
			/* remove */
			if(tcp_accept_handlers[i].event_added) {
				event_del(handler);
				tcp_accept_handlers[i].event_added = 0;
			}
		}
	}
}
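
/*
 * Editorial note: libevent events cannot be modified while added, so to
 * change the monitored event types the loop above deletes each accept
 * event, reinitialises it with event_set(), reattaches its base and adds
 * it again.  Passing 0 for event_types leaves the handler removed, which
 * is how accepting is paused by the slow-accept and max-connection
 * logic.
 */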