1 /* 2 * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 3 * 4 * Permission to use, copy, modify, and/or distribute this software for any 5 * purpose with or without fee is hereby granted, provided that the above 6 * copyright notice and this permission notice appear in all copies. 7 * 8 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH 9 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 10 * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, 11 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 12 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 13 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 14 * PERFORMANCE OF THIS SOFTWARE. 15 */ 16 17 /*! \file */ 18 19 #include <sys/socket.h> 20 #include <sys/time.h> 21 #include <sys/uio.h> 22 23 #include <netinet/tcp.h> 24 25 #include <errno.h> 26 #include <fcntl.h> 27 #include <stddef.h> 28 #include <stdlib.h> 29 #include <string.h> 30 #include <unistd.h> 31 32 #include <isc/buffer.h> 33 #include <isc/bufferlist.h> 34 35 #include <isc/list.h> 36 #include <isc/log.h> 37 #include <isc/net.h> 38 #include <isc/region.h> 39 #include <isc/socket.h> 40 #include <isc/task.h> 41 #include <isc/util.h> 42 43 #include "errno2result.h" 44 45 #include "socket_p.h" 46 #include "../task_p.h" 47 48 struct isc_socketwait { 49 fd_set *readset; 50 fd_set *writeset; 51 int nfds; 52 int maxfd; 53 }; 54 55 /* 56 * Set by the -T dscp option on the command line. If set to a value 57 * other than -1, we check to make sure DSCP values match it, and 58 * assert if not. 59 */ 60 int isc_dscp_check_value = -1; 61 62 /*% 63 * Some systems define the socket length argument as an int, some as size_t, 64 * some as socklen_t. This is here so it can be easily changed if needed. 65 */ 66 67 /*% 68 * Define what the possible "soft" errors can be. 
These are non-fatal returns 69 * of various network related functions, like recv() and so on. 70 * 71 * For some reason, BSDI (and perhaps others) will sometimes return <0 72 * from recv() but will have errno==0. This is broken, but we have to 73 * work around it here. 74 */ 75 #define SOFT_ERROR(e) ((e) == EAGAIN || \ 76 (e) == EWOULDBLOCK || \ 77 (e) == EINTR || \ 78 (e) == 0) 79 80 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x) 81 82 /*!< 83 * DLVL(90) -- Function entry/exit and other tracing. 84 * DLVL(60) -- Socket data send/receive 85 * DLVL(50) -- Event tracing, including receiving/sending completion events. 86 * DLVL(20) -- Socket creation/destruction. 87 */ 88 #define TRACE_LEVEL 90 89 #define IOEVENT_LEVEL 60 90 #define EVENT_LEVEL 50 91 #define CREATION_LEVEL 20 92 93 #define TRACE DLVL(TRACE_LEVEL) 94 #define IOEVENT DLVL(IOEVENT_LEVEL) 95 #define EVENT DLVL(EVENT_LEVEL) 96 #define CREATION DLVL(CREATION_LEVEL) 97 98 typedef isc_event_t intev_t; 99 100 /*! 101 * IPv6 control information. If the socket is an IPv6 socket we want 102 * to collect the destination address and interface so the client can 103 * set them on outgoing packets. 104 */ 105 106 /*% 107 * NetBSD and FreeBSD can timestamp packets. XXXMLG Should we have 108 * a setsockopt() like interface to request timestamps, and if the OS 109 * doesn't do it for us, call gettimeofday() on every UDP receive? 110 */ 111 112 /*% 113 * Instead of calculating the cmsgbuf lengths every time we take 114 * a rule of thumb approach - sizes are taken from x86_64 linux, 115 * multiplied by 2, everything should fit. Those sizes are not 116 * large enough to cause any concern. 
117 */ 118 #define CMSG_SP_IN6PKT 40 119 120 #define CMSG_SP_TIMESTAMP 32 121 122 #define CMSG_SP_TCTOS 24 123 124 #define CMSG_SP_INT 24 125 126 #define RECVCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS)+1) 127 #define SENDCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS)+1) 128 129 /*% 130 * The number of times a send operation is repeated if the result is EINTR. 131 */ 132 #define NRETRIES 10 133 134 struct isc_socket { 135 /* Not locked. */ 136 isc_socketmgr_t *manager; 137 isc_sockettype_t type; 138 139 /* Locked by socket lock. */ 140 ISC_LINK(isc_socket_t) link; 141 unsigned int references; 142 int fd; 143 int pf; 144 145 ISC_LIST(isc_socketevent_t) send_list; 146 ISC_LIST(isc_socketevent_t) recv_list; 147 isc_socket_connev_t *connect_ev; 148 149 /* 150 * Internal events. Posted when a descriptor is readable or 151 * writable. These are statically allocated and never freed. 152 * They will be set to non-purgable before use. 153 */ 154 intev_t readable_ev; 155 intev_t writable_ev; 156 157 isc_sockaddr_t peer_address; /* remote address */ 158 159 unsigned int pending_recv : 1, 160 pending_send : 1, 161 connected : 1, 162 connecting : 1, /* connect pending */ 163 bound : 1, /* bound to local addr */ 164 active : 1, /* currently active */ 165 pktdscp : 1; /* per packet dscp */ 166 unsigned int dscp; 167 }; 168 169 struct isc_socketmgr { 170 /* Not locked. */ 171 int fd_bufsize; 172 unsigned int maxsocks; 173 174 isc_socket_t **fds; 175 int *fdstate; 176 177 /* Locked by manager lock. 
*/ 178 ISC_LIST(isc_socket_t) socklist; 179 fd_set *read_fds; 180 fd_set *read_fds_copy; 181 fd_set *write_fds; 182 fd_set *write_fds_copy; 183 int maxfd; 184 unsigned int refs; 185 }; 186 187 static isc_socketmgr_t *socketmgr = NULL; 188 189 #define CLOSED 0 /* this one must be zero */ 190 #define MANAGED 1 191 #define CLOSE_PENDING 2 192 193 /* 194 * send() and recv() iovec counts 195 */ 196 #define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER) 197 #define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER) 198 199 static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf, 200 isc_sockettype_t type, 201 isc_socket_t **socketp); 202 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **); 203 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **); 204 static void free_socket(isc_socket_t **); 205 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t, 206 isc_socket_t **); 207 static void destroy(isc_socket_t **); 208 static void internal_connect(isc_task_t *, isc_event_t *); 209 static void internal_recv(isc_task_t *, isc_event_t *); 210 static void internal_send(isc_task_t *, isc_event_t *); 211 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *); 212 static void build_msghdr_send(isc_socket_t *, char *, isc_socketevent_t *, 213 struct msghdr *, struct iovec *, size_t *); 214 static void build_msghdr_recv(isc_socket_t *, char *, isc_socketevent_t *, 215 struct msghdr *, struct iovec *, size_t *); 216 217 #define SELECT_POKE_SHUTDOWN (-1) 218 #define SELECT_POKE_READ (-3) 219 #define SELECT_POKE_WRITE (-4) 220 #define SELECT_POKE_CONNECT (-4) /*%< Same as _WRITE */ 221 #define SELECT_POKE_CLOSE (-5) 222 223 #define SOCK_DEAD(s) ((s)->references == 0) 224 225 /*% 226 * Shortcut index arrays to get access to statistics counters. 
227 */ 228 enum { 229 STATID_OPEN = 0, 230 STATID_OPENFAIL = 1, 231 STATID_CLOSE = 2, 232 STATID_BINDFAIL = 3, 233 STATID_CONNECTFAIL = 4, 234 STATID_CONNECT = 5, 235 STATID_ACCEPTFAIL = 6, 236 STATID_ACCEPT = 7, 237 STATID_SENDFAIL = 8, 238 STATID_RECVFAIL = 9, 239 STATID_ACTIVE = 10 240 }; 241 242 243 static void 244 socket_log(isc_socket_t *sock, isc_sockaddr_t *address, 245 isc_logcategory_t *category, isc_logmodule_t *module, int level, 246 const char *fmt, ...) __attribute__((__format__(__printf__, 6, 7))); 247 static void 248 socket_log(isc_socket_t *sock, isc_sockaddr_t *address, 249 isc_logcategory_t *category, isc_logmodule_t *module, int level, 250 const char *fmt, ...) 251 { 252 char msgbuf[2048]; 253 char peerbuf[ISC_SOCKADDR_FORMATSIZE]; 254 va_list ap; 255 256 if (! isc_log_wouldlog(isc_lctx, level)) 257 return; 258 259 va_start(ap, fmt); 260 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap); 261 va_end(ap); 262 263 if (address == NULL) { 264 isc_log_write(isc_lctx, category, module, level, 265 "socket %p: %s", sock, msgbuf); 266 } else { 267 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf)); 268 isc_log_write(isc_lctx, category, module, level, 269 "socket %p %s: %s", sock, peerbuf, msgbuf); 270 } 271 } 272 273 static inline isc_result_t 274 watch_fd(isc_socketmgr_t *manager, int fd, int msg) { 275 isc_result_t result = ISC_R_SUCCESS; 276 277 if (msg == SELECT_POKE_READ) 278 FD_SET(fd, manager->read_fds); 279 if (msg == SELECT_POKE_WRITE) 280 FD_SET(fd, manager->write_fds); 281 282 return (result); 283 } 284 285 static inline isc_result_t 286 unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) { 287 isc_result_t result = ISC_R_SUCCESS; 288 289 if (msg == SELECT_POKE_READ) 290 FD_CLR(fd, manager->read_fds); 291 else if (msg == SELECT_POKE_WRITE) 292 FD_CLR(fd, manager->write_fds); 293 294 return (result); 295 } 296 297 static void 298 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) { 299 isc_result_t result; 300 301 /* 302 * This is a wakeup 
on a socket. If the socket is not in the 303 * process of being closed, start watching it for either reads 304 * or writes. 305 */ 306 307 INSIST(fd >= 0 && fd < (int)manager->maxsocks); 308 309 if (msg == SELECT_POKE_CLOSE) { 310 /* No one should be updating fdstate, so no need to lock it */ 311 INSIST(manager->fdstate[fd] == CLOSE_PENDING); 312 manager->fdstate[fd] = CLOSED; 313 (void)unwatch_fd(manager, fd, SELECT_POKE_READ); 314 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); 315 (void)close(fd); 316 return; 317 } 318 319 if (manager->fdstate[fd] == CLOSE_PENDING) { 320 321 /* 322 * We accept (and ignore) any error from unwatch_fd() as we are 323 * closing the socket, hoping it doesn't leave dangling state in 324 * the kernel. 325 */ 326 (void)unwatch_fd(manager, fd, SELECT_POKE_READ); 327 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); 328 return; 329 } 330 if (manager->fdstate[fd] != MANAGED) { 331 return; 332 } 333 334 /* 335 * Set requested bit. 336 */ 337 result = watch_fd(manager, fd, msg); 338 if (result != ISC_R_SUCCESS) { 339 /* 340 * XXXJT: what should we do? Ignoring the failure of watching 341 * a socket will make the application dysfunctional, but there 342 * seems to be no reasonable recovery process. 343 */ 344 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 345 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 346 "failed to start watching FD (%d): %s", 347 fd, isc_result_totext(result)); 348 } 349 } 350 351 /* 352 * Update the state of the socketmgr when something changes. 353 */ 354 static void 355 select_poke(isc_socketmgr_t *manager, int fd, int msg) { 356 if (msg == SELECT_POKE_SHUTDOWN) 357 return; 358 else if (fd >= 0) 359 wakeup_socket(manager, fd, msg); 360 return; 361 } 362 363 /* 364 * Make a fd non-blocking. 
365 */ 366 static isc_result_t 367 make_nonblock(int fd) { 368 int ret; 369 int flags; 370 371 flags = fcntl(fd, F_GETFL, 0); 372 flags |= O_NONBLOCK; 373 ret = fcntl(fd, F_SETFL, flags); 374 375 if (ret == -1) { 376 UNEXPECTED_ERROR(__FILE__, __LINE__, 377 "fcntl(%d, F_SETFL, %d): %s", fd, flags, 378 strerror(errno)); 379 return (ISC_R_UNEXPECTED); 380 } 381 382 return (ISC_R_SUCCESS); 383 } 384 385 /* 386 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE. 387 * In order to ensure as much portability as possible, we provide wrapper 388 * functions of these macros. 389 * Note that cmsg_space() could run slow on OSes that do not have 390 * CMSG_SPACE. 391 */ 392 static inline socklen_t 393 cmsg_len(socklen_t len) { 394 return (CMSG_LEN(len)); 395 } 396 397 static inline socklen_t 398 cmsg_space(socklen_t len) { 399 return (CMSG_SPACE(len)); 400 } 401 402 /* 403 * Process control messages received on a socket. 404 */ 405 static void 406 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) { 407 struct cmsghdr *cmsgp; 408 struct in6_pktinfo *pktinfop; 409 void *timevalp; 410 411 /* 412 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined. 413 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined. 414 * They are all here, outside of the CPP tests, because it is 415 * more consistent with the usual ISC coding style. 
416 */ 417 UNUSED(sock); 418 UNUSED(msg); 419 UNUSED(dev); 420 421 if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC) 422 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC; 423 424 if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC) 425 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC; 426 427 if (msg->msg_controllen == 0U || msg->msg_control == NULL) 428 return; 429 430 timevalp = NULL; 431 pktinfop = NULL; 432 433 cmsgp = CMSG_FIRSTHDR(msg); 434 while (cmsgp != NULL) { 435 socket_log(sock, NULL, TRACE, 436 "processing cmsg %p", cmsgp); 437 438 if (cmsgp->cmsg_level == IPPROTO_IPV6 439 && cmsgp->cmsg_type == IPV6_PKTINFO) { 440 441 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp); 442 memmove(&dev->pktinfo, pktinfop, 443 sizeof(struct in6_pktinfo)); 444 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO; 445 socket_log(sock, NULL, TRACE, 446 "interface received on ifindex %u", 447 dev->pktinfo.ipi6_ifindex); 448 if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) 449 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST; 450 goto next; 451 } 452 453 if (cmsgp->cmsg_level == SOL_SOCKET 454 && cmsgp->cmsg_type == SCM_TIMESTAMP) { 455 struct timeval tv; 456 timevalp = CMSG_DATA(cmsgp); 457 memmove(&tv, timevalp, sizeof(tv)); 458 TIMEVAL_TO_TIMESPEC(&tv, &dev->timestamp); 459 dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP; 460 goto next; 461 } 462 463 if (cmsgp->cmsg_level == IPPROTO_IPV6 464 && cmsgp->cmsg_type == IPV6_TCLASS) { 465 dev->dscp = *(int *)CMSG_DATA(cmsgp); 466 dev->dscp >>= 2; 467 dev->attributes |= ISC_SOCKEVENTATTR_DSCP; 468 goto next; 469 } 470 471 if (cmsgp->cmsg_level == IPPROTO_IP 472 && (cmsgp->cmsg_type == IP_TOS)) { 473 dev->dscp = (int) *(unsigned char *)CMSG_DATA(cmsgp); 474 dev->dscp >>= 2; 475 dev->attributes |= ISC_SOCKEVENTATTR_DSCP; 476 goto next; 477 } 478 next: 479 cmsgp = CMSG_NXTHDR(msg, cmsgp); 480 } 481 482 } 483 484 /* 485 * Construct an iov array and attach it to the msghdr passed in. 
This is
 * the SEND constructor, which will use the used region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
 */
static void
build_msghdr_send(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t used;
	size_t write_count;
	size_t skip_count;
	struct cmsghdr *cmsgp;

	memset(msg, 0, sizeof(*msg));

	/* A connected socket must not supply a destination address. */
	if (!sock->connected) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	write_count = 0;
	iovcount = 0;

	/*
	 * Single buffer I/O?  Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		write_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = write_count;
		iovcount = 1;

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip the data in the buffer list that we have already written.
	 */
	skip_count = dev->n;
	while (buffer != NULL) {
		if (skip_count < isc_buffer_usedlength(buffer))
			break;
		skip_count -= isc_buffer_usedlength(buffer);
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	while (buffer != NULL) {
		INSIST(iovcount < MAXSCATTERGATHER_SEND);

		isc_buffer_usedregion(buffer, &used);

		if (used.length > 0) {
			/* skip_count is only non-zero for the first buffer. */
			iov[iovcount].iov_base = (void *)(used.base
							  + skip_count);
			iov[iovcount].iov_len = used.length - skip_count;
			write_count += (used.length - skip_count);
			skip_count = 0;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	INSIST(skip_count == 0U);

 config:
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;

	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
	{
		struct in6_pktinfo *pktinfop;

		socket_log(sock, NULL, TRACE,
			   "sendto pktinfo data, ifindex %u",
			   dev->pktinfo.ipi6_ifindex);

		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp = (struct cmsghdr *)cmsgbuf;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
	}

	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
	{
		int use_min_mtu = 1;	/* -1, 0, 1 */

		/* Append after any cmsg already written above. */
		cmsgp = (struct cmsghdr *)(cmsgbuf +
					   msg->msg_controllen);

		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
	}

	if (isc_dscp_check_value > -1) {
		/* -T dscp debugging check; see isc_dscp_check_value above. */
		if (sock->type == isc_sockettype_udp)
			INSIST((int)dev->dscp == isc_dscp_check_value);
		else if (sock->type == isc_sockettype_tcp)
			INSIST((int)sock->dscp == isc_dscp_check_value);
	}

	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
	{
		/* DSCP occupies the top 6 bits of the TOS/TCLASS octet. */
		int dscp = (dev->dscp << 2) & 0xff;

		INSIST(dev->dscp < 0x40);

		if (sock->pf == AF_INET && sock->pktdscp) {
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IP;
			cmsgp->cmsg_type = IP_TOS;
			cmsgp->cmsg_len = cmsg_len(sizeof(char));
			*(unsigned char*)CMSG_DATA(cmsgp) = dscp;
		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
			/* No per-packet DSCP: set it on the socket instead. */
			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
				       (void *)&dscp, sizeof(int)) < 0)
			{
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IP_TOS, %.02x)"
						 " %s: %s",
						 sock->fd, dscp >> 2,
						 "failed", strerror(errno));
			} else
				sock->dscp = dscp;
		}

		if (sock->pf == AF_INET6 && sock->pktdscp) {
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IPV6;
			cmsgp->cmsg_type = IPV6_TCLASS;
			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
				       (void *)&dscp, sizeof(int)) < 0) {
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IPV6_TCLASS, "
						 "%.02x) %s: %s",
						 sock->fd, dscp >> 2,
						 "failed", strerror(errno));
			} else
				sock->dscp = dscp;
		}

		/* Zero the unused tail of the control buffer. */
		if (msg->msg_controllen != 0 &&
		    msg->msg_controllen < SENDCMSGBUFLEN)
		{
			memset(cmsgbuf + msg->msg_controllen, 0,
			       SENDCMSGBUFLEN - msg->msg_controllen);
		}
	}

	if (write_countp != NULL)
		*write_countp = write_count;
}

/*
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the RECV constructor, which will use the available region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If read_countp != NULL, *read_countp will hold the number of bytes
 * this transaction can receive.
 */
static void
build_msghdr_recv(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t available;
	size_t read_count;

	memset(msg, 0, sizeof(struct msghdr));

	if (sock->type == isc_sockettype_udp) {
		memset(&dev->address, 0, sizeof(dev->address));
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = sizeof(dev->address.type);
	} else {	/* TCP */
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
		dev->address = sock->peer_address;
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	read_count = 0;

	/*
	 * Single buffer I/O?  Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		read_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = read_count;
		iovcount = 1;

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip empty buffers.
	 */
	while (buffer != NULL) {
		if (isc_buffer_availablelength(buffer) != 0)
			break;
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	iovcount = 0;
	while (buffer != NULL) {
		INSIST(iovcount < MAXSCATTERGATHER_RECV);

		isc_buffer_availableregion(buffer, &available);

		if (available.length > 0) {
			iov[iovcount].iov_base = (void *)(available.base);
			iov[iovcount].iov_len = available.length;
			read_count += available.length;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

 config:

	/*
	 * If needed, set up to receive that one extra byte.
	 */
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

	msg->msg_control = cmsgbuf;
	msg->msg_controllen = RECVCMSGBUFLEN;
	msg->msg_flags = 0;

	if (read_countp != NULL)
		*read_countp = read_count;
}

/*
 * Fill in dev->address: explicit 'address' for UDP (or the peer address
 * when none is given); always the peer address for TCP.
 */
static void
set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
		isc_socketevent_t *dev)
{
	if (sock->type == isc_sockettype_udp) {
		if (address != NULL)
			dev->address = *address;
		else
			dev->address = sock->peer_address;
	} else if (sock->type == isc_sockettype_tcp) {
		INSIST(address == NULL);
		dev->address = sock->peer_address;
	}
}

/*
 * Event destructor installed by allocate_socketevent(); chains to the
 * original destructor saved in ev->destroy.
 */
static void
destroy_socketevent(isc_event_t *event) {
	isc_socketevent_t *ev = (isc_socketevent_t *)event;

	INSIST(ISC_LIST_EMPTY(ev->bufferlist));

	(ev->destroy)(event);
}

/*
 * Allocate and initialize a socket event; returns NULL on allocation
 * failure.  The event's destructor is wrapped with destroy_socketevent().
 */
static isc_socketevent_t *
allocate_socketevent(void *sender,
		     isc_eventtype_t eventtype, isc_taskaction_t action,
		     void *arg)
{
	isc_socketevent_t *ev;

	ev = (isc_socketevent_t *)isc_event_allocate(sender,
						     eventtype, action, arg,
						     sizeof(*ev));

	if (ev == NULL)
		return (NULL);

	ev->result = ISC_R_UNSET;
	ISC_LINK_INIT(ev, ev_link);
	ISC_LIST_INIT(ev->bufferlist);
	ev->region.base = NULL;
	ev->n = 0;
	ev->offset = 0;
	ev->attributes = 0;
	ev->destroy = ev->ev_destroy;
	ev->ev_destroy = destroy_socketevent;
	ev->dscp = 0;

	return (ev);
}

#define DOIO_SUCCESS		0	/* i/o ok, event sent */
#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD		2	/* i/o error, event sent */
#define DOIO_EOF		3	/* EOF, no event sent */

static int
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	size_t actual_count;
	struct msghdr msghdr;
	isc_buffer_t *buffer;
	int recv_errno;
	char cmsgbuf[RECVCMSGBUFLEN] = {0};

	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);

	cc = recvmsg(sock->fd, &msghdr, 0);
	/* Save errno immediately; later calls may overwrite it. */
	recv_errno = errno;

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno))
			return (DOIO_SOFT);

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			socket_log(sock, NULL, IOEVENT,
				   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno,
				   strerror(recv_errno));
		}

#define SOFT_OR_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		if (sock->connected) { \
			dev->result = _isc; \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
#define ALWAYS_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		dev->result = _isc; \
		return (DOIO_HARD); \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		/* HPUX 11.11 can return EADDRNOTAVAIL. */
		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/* Should never get this one but it was seen. */
		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
		/*
		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
		 * errors.
		 */
		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		dev->result = isc__errno2result(recv_errno);
		return (DOIO_HARD);
	}

	/*
	 * On TCP and UNIX sockets, zero length reads indicate EOF,
	 * while on UDP sockets, zero length reads are perfectly valid,
	 * although strange.
	 */
	switch (sock->type) {
	case isc_sockettype_tcp:
		if (cc == 0)
			return (DOIO_EOF);
		break;
	case isc_sockettype_udp:
		break;
	default:
		INSIST(0);
	}

	if (sock->type == isc_sockettype_udp) {
		dev->address.length = msghdr.msg_namelen;
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
	}

	socket_log(sock, &dev->address, IOEVENT,
		   "packet received correctly");

	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
931 */ 932 process_cmsg(sock, &msghdr, dev); 933 934 /* 935 * update the buffers (if any) and the i/o count 936 */ 937 dev->n += cc; 938 actual_count = cc; 939 buffer = ISC_LIST_HEAD(dev->bufferlist); 940 while (buffer != NULL && actual_count > 0U) { 941 if (isc_buffer_availablelength(buffer) <= actual_count) { 942 actual_count -= isc_buffer_availablelength(buffer); 943 isc_buffer_add(buffer, 944 isc_buffer_availablelength(buffer)); 945 } else { 946 isc_buffer_add(buffer, actual_count); 947 actual_count = 0; 948 POST(actual_count); 949 break; 950 } 951 buffer = ISC_LIST_NEXT(buffer, link); 952 if (buffer == NULL) { 953 INSIST(actual_count == 0U); 954 } 955 } 956 957 /* 958 * If we read less than we expected, update counters, 959 * and let the upper layer poke the descriptor. 960 */ 961 if (((size_t)cc != read_count) && (dev->n < dev->minimum)) 962 return (DOIO_SOFT); 963 964 /* 965 * Full reads are posted, or partials if partials are ok. 966 */ 967 dev->result = ISC_R_SUCCESS; 968 return (DOIO_SUCCESS); 969 } 970 971 /* 972 * Returns: 973 * DOIO_SUCCESS The operation succeeded. dev->result contains 974 * ISC_R_SUCCESS. 975 * 976 * DOIO_HARD A hard or unexpected I/O error was encountered. 977 * dev->result contains the appropriate error. 978 * 979 * DOIO_SOFT A soft I/O error was encountered. No senddone 980 * event was sent. The operation should be retried. 981 * 982 * No other return values are possible. 983 */ 984 static int 985 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) { 986 int cc; 987 struct iovec iov[MAXSCATTERGATHER_SEND]; 988 size_t write_count; 989 struct msghdr msghdr; 990 char addrbuf[ISC_SOCKADDR_FORMATSIZE]; 991 int attempts = 0; 992 int send_errno; 993 char cmsgbuf[SENDCMSGBUFLEN] = {0}; 994 995 build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count); 996 997 resend: 998 cc = sendmsg(sock->fd, &msghdr, 0); 999 send_errno = errno; 1000 1001 /* 1002 * Check for error or block condition. 
1003 */ 1004 if (cc < 0) { 1005 if (send_errno == EINTR && ++attempts < NRETRIES) 1006 goto resend; 1007 1008 if (SOFT_ERROR(send_errno)) { 1009 if (errno == EWOULDBLOCK || errno == EAGAIN) 1010 dev->result = ISC_R_WOULDBLOCK; 1011 return (DOIO_SOFT); 1012 } 1013 1014 #define SOFT_OR_HARD(_system, _isc) \ 1015 if (send_errno == _system) { \ 1016 if (sock->connected) { \ 1017 dev->result = _isc; \ 1018 return (DOIO_HARD); \ 1019 } \ 1020 return (DOIO_SOFT); \ 1021 } 1022 #define ALWAYS_HARD(_system, _isc) \ 1023 if (send_errno == _system) { \ 1024 dev->result = _isc; \ 1025 return (DOIO_HARD); \ 1026 } 1027 1028 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED); 1029 ALWAYS_HARD(EACCES, ISC_R_NOPERM); 1030 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL); 1031 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 1032 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH); 1033 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH); 1034 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH); 1035 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES); 1036 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH); 1037 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED); 1038 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET); 1039 1040 #undef SOFT_OR_HARD 1041 #undef ALWAYS_HARD 1042 1043 /* 1044 * The other error types depend on whether or not the 1045 * socket is UDP or TCP. If it is UDP, some errors 1046 * that we expect to be fatal under TCP are merely 1047 * annoying, and are really soft errors. 1048 * 1049 * However, these soft errors are still returned as 1050 * a status. 1051 */ 1052 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf)); 1053 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s", 1054 addrbuf, strerror(send_errno)); 1055 dev->result = isc__errno2result(send_errno); 1056 return (DOIO_HARD); 1057 } 1058 1059 if (cc == 0) { 1060 UNEXPECTED_ERROR(__FILE__, __LINE__, 1061 "doio_send: send() %s 0", "returned"); 1062 } 1063 1064 /* 1065 * If we write less than we expected, update counters, poke. 
1066 */ 1067 dev->n += cc; 1068 if ((size_t)cc != write_count) 1069 return (DOIO_SOFT); 1070 1071 /* 1072 * Exactly what we wanted to write. We're done with this 1073 * entry. Post its completion event. 1074 */ 1075 dev->result = ISC_R_SUCCESS; 1076 return (DOIO_SUCCESS); 1077 } 1078 1079 /* 1080 * Kill. 1081 * 1082 * Caller must ensure that the socket is not locked and no external 1083 * references exist. 1084 */ 1085 static void 1086 socketclose(isc_socketmgr_t *manager, isc_socket_t *sock, int fd) { 1087 /* 1088 * No one has this socket open, so the watcher doesn't have to be 1089 * poked, and the socket doesn't have to be locked. 1090 */ 1091 manager->fds[fd] = NULL; 1092 manager->fdstate[fd] = CLOSE_PENDING; 1093 select_poke(manager, fd, SELECT_POKE_CLOSE); 1094 1095 if (sock->active == 1) { 1096 sock->active = 0; 1097 } 1098 1099 /* 1100 * update manager->maxfd here (XXX: this should be implemented more 1101 * efficiently) 1102 */ 1103 if (manager->maxfd == fd) { 1104 int i; 1105 1106 manager->maxfd = 0; 1107 for (i = fd - 1; i >= 0; i--) { 1108 if (manager->fdstate[i] == MANAGED) { 1109 manager->maxfd = i; 1110 break; 1111 } 1112 } 1113 } 1114 1115 } 1116 1117 static void 1118 destroy(isc_socket_t **sockp) { 1119 int fd; 1120 isc_socket_t *sock = *sockp; 1121 isc_socketmgr_t *manager = sock->manager; 1122 1123 socket_log(sock, NULL, CREATION, "destroying"); 1124 1125 INSIST(ISC_LIST_EMPTY(sock->recv_list)); 1126 INSIST(ISC_LIST_EMPTY(sock->send_list)); 1127 INSIST(sock->connect_ev == NULL); 1128 INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks); 1129 1130 if (sock->fd >= 0) { 1131 fd = sock->fd; 1132 sock->fd = -1; 1133 socketclose(manager, sock, fd); 1134 } 1135 1136 ISC_LIST_UNLINK(manager->socklist, sock, link); 1137 1138 /* can't unlock manager as its memory context is still used */ 1139 free_socket(sockp); 1140 } 1141 1142 static isc_result_t 1143 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type, 1144 isc_socket_t **socketp) 
1145 { 1146 isc_socket_t *sock; 1147 1148 sock = malloc(sizeof(*sock)); 1149 1150 if (sock == NULL) 1151 return (ISC_R_NOMEMORY); 1152 1153 sock->references = 0; 1154 1155 sock->manager = manager; 1156 sock->type = type; 1157 sock->fd = -1; 1158 sock->dscp = 0; /* TOS/TCLASS is zero until set. */ 1159 sock->active = 0; 1160 1161 ISC_LINK_INIT(sock, link); 1162 1163 /* 1164 * Set up list of readers and writers to be initially empty. 1165 */ 1166 ISC_LIST_INIT(sock->recv_list); 1167 ISC_LIST_INIT(sock->send_list); 1168 sock->connect_ev = NULL; 1169 sock->pending_recv = 0; 1170 sock->pending_send = 0; 1171 sock->connected = 0; 1172 sock->connecting = 0; 1173 sock->bound = 0; 1174 sock->pktdscp = 0; 1175 1176 /* 1177 * Initialize readable and writable events. 1178 */ 1179 ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t), 1180 ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR, 1181 NULL, sock, sock, NULL); 1182 ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t), 1183 ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW, 1184 NULL, sock, sock, NULL); 1185 1186 *socketp = sock; 1187 1188 return (ISC_R_SUCCESS); 1189 } 1190 1191 /* 1192 * This event requires that the various lists be empty, that the reference 1193 * count be 1. The other socket bits, 1194 * like the lock, must be initialized as well. The fd associated must be 1195 * marked as closed, by setting it to -1 on close, or this routine will 1196 * also close the socket. 
 */
/*
 * Release a fully quiescent socket object.  All INSISTs below verify
 * the quiescence contract described in the comment above; the caller
 * must already have closed (or never opened) the fd.  On return
 * *socketp is NULL.
 */
static void
free_socket(isc_socket_t **socketp) {
	isc_socket_t *sock = *socketp;

	INSIST(sock->references == 0);
	INSIST(!sock->connecting);
	INSIST(!sock->pending_recv);
	INSIST(!sock->pending_send);
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(!ISC_LINK_LINKED(sock, link));

	free(sock);

	*socketp = NULL;
}

/*
 * Ask the kernel to use the IPv6 minimum MTU (RFC 3542 IPV6_USE_MIN_MTU)
 * on an AF_INET6 socket.  Failure is deliberately ignored: the option is
 * a best-effort optimization.
 */
static void
use_min_mtu(isc_socket_t *sock) {
	/* use minimum MTU */
	if (sock->pf == AF_INET6) {
		int on = 1;
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
				 (void *)&on, sizeof(on));
	}
}

/*
 * Clamp the TCP maximum segment size.  Best-effort: errors from
 * setsockopt() are ignored, and non-TCP sockets are untouched.
 */
static void
set_tcp_maxseg(isc_socket_t *sock, int size) {
	if (sock->type == isc_sockettype_tcp)
		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
				 (void *)&size, sizeof(size));
}

/*
 * Create the underlying OS descriptor for 'sock', make it non-blocking,
 * and apply per-type socket options (min-MTU/MSS for TCP over IPv6,
 * timestamps and RFC 3542 packet info for UDP).
 *
 * Returns ISC_R_SUCCESS, ISC_R_NORESOURCES, ISC_R_FAMILYNOSUPPORT, or
 * ISC_R_UNEXPECTED; on any failure the fd is not left open.
 */
static isc_result_t
opensocket(isc_socket_t *sock)
{
	isc_result_t result;
	const char *err = "socket";
	int on = 1;

	switch (sock->type) {
	case isc_sockettype_udp:
		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
		break;
	case isc_sockettype_tcp:
		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
		break;
	}

	if (sock->fd < 0) {
		switch (errno) {
		case EMFILE:
		case ENFILE:
			/* Descriptor exhaustion: log, then report as
			 * a resource shortage (shared with ENOBUFS). */
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "%s: %s", err, strerror(errno));
			/* fallthrough */
		case ENOBUFS:
			return (ISC_R_NORESOURCES);

		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "%s() %s: %s", err, "failed",
					 strerror(errno));
			return (ISC_R_UNEXPECTED);
		}
	}

	result = make_nonblock(sock->fd);
	if (result != ISC_R_SUCCESS) {
		(void)close(sock->fd);
		return (result);
	}

	/*
	 * Use minimum mtu if possible.
	 */
	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
		use_min_mtu(sock);
		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
	}

	if (sock->type == isc_sockettype_udp) {

		/* Request receive timestamps; ENOPROTOOPT just means the
		 * platform lacks SO_TIMESTAMP, which is tolerable. */
		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
			       (void *)&on, sizeof(on)) < 0
		    && errno != ENOPROTOOPT) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
					 sock->fd, "failed", strerror(errno));
			/* Press on... */
		}

		/* RFC 3542 */
		if ((sock->pf == AF_INET6)
		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				   (void *)&on, sizeof(on)) < 0)) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "%s: %s", sock->fd, "failed",
					 strerror(errno));
		}
	}

	if (sock->active == 0) {
		sock->active = 1;
	}

	return (ISC_R_SUCCESS);
}

/*
 * Create a 'type' socket managed
 * by 'manager'.  Events will be posted to 'task' and when dispatched
 * 'action' will be called with 'arg' as the arg value.  The new
 * socket is returned in 'socketp'.
1325 */ 1326 static isc_result_t 1327 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type, 1328 isc_socket_t **socketp) 1329 { 1330 isc_socket_t *sock = NULL; 1331 isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0; 1332 isc_result_t result; 1333 1334 REQUIRE(socketp != NULL && *socketp == NULL); 1335 1336 result = allocate_socket(manager, type, &sock); 1337 if (result != ISC_R_SUCCESS) 1338 return (result); 1339 1340 switch (sock->type) { 1341 case isc_sockettype_udp: 1342 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6) 1343 sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0; 1344 break; 1345 case isc_sockettype_tcp: 1346 break; 1347 default: 1348 INSIST(0); 1349 } 1350 1351 sock->pf = pf; 1352 1353 result = opensocket(sock); 1354 if (result != ISC_R_SUCCESS) { 1355 free_socket(&sock); 1356 return (result); 1357 } 1358 1359 sock->references = 1; 1360 *socketp = (isc_socket_t *)sock; 1361 1362 /* 1363 * Note we don't have to lock the socket like we normally would because 1364 * there are no external references to it yet. 1365 */ 1366 1367 manager->fds[sock->fd] = sock; 1368 manager->fdstate[sock->fd] = MANAGED; 1369 1370 ISC_LIST_APPEND(manager->socklist, sock, link); 1371 if (manager->maxfd < sock->fd) 1372 manager->maxfd = sock->fd; 1373 1374 socket_log(sock, NULL, CREATION, "created"); 1375 1376 return (ISC_R_SUCCESS); 1377 } 1378 1379 /*% 1380 * Create a new 'type' socket managed by 'manager'. Events 1381 * will be posted to 'task' and when dispatched 'action' will be 1382 * called with 'arg' as the arg value. The new socket is returned 1383 * in 'socketp'. 1384 */ 1385 isc_result_t 1386 isc_socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type, 1387 isc_socket_t **socketp) 1388 { 1389 return (socket_create(manager0, pf, type, socketp)); 1390 } 1391 1392 /* 1393 * Attach to a socket. Caller must explicitly detach when it is done. 
1394 */ 1395 void 1396 isc_socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) { 1397 isc_socket_t *sock = (isc_socket_t *)sock0; 1398 1399 REQUIRE(socketp != NULL && *socketp == NULL); 1400 1401 sock->references++; 1402 1403 *socketp = (isc_socket_t *)sock; 1404 } 1405 1406 /* 1407 * Dereference a socket. If this is the last reference to it, clean things 1408 * up by destroying the socket. 1409 */ 1410 void 1411 isc_socket_detach(isc_socket_t **socketp) { 1412 isc_socket_t *sock; 1413 isc_boolean_t kill_socket = ISC_FALSE; 1414 1415 REQUIRE(socketp != NULL); 1416 sock = (isc_socket_t *)*socketp; 1417 1418 REQUIRE(sock->references > 0); 1419 sock->references--; 1420 if (sock->references == 0) 1421 kill_socket = ISC_TRUE; 1422 1423 if (kill_socket) 1424 destroy(&sock); 1425 1426 *socketp = NULL; 1427 } 1428 1429 /* 1430 * I/O is possible on a given socket. Schedule an event to this task that 1431 * will call an internal function to do the I/O. This will charge the 1432 * task with the I/O operation and let our select loop handler get back 1433 * to doing something real as fast as possible. 1434 * 1435 * The socket and manager must be locked before calling this function. 
 */
/*
 * Queue an internal "readable" event to the task that owns the first
 * pending receive request.  Takes an extra socket reference so the
 * socket outlives the in-flight internal event; internal_recv() drops it.
 */
static void
dispatch_recv(isc_socket_t *sock) {
	intev_t *iev;
	isc_socketevent_t *ev;
	isc_task_t *sender;

	INSIST(!sock->pending_recv);

	ev = ISC_LIST_HEAD(sock->recv_list);
	if (ev == NULL)
		return;
	socket_log(sock, NULL, EVENT, NULL, 0, 0,
		   "dispatch_recv: event %p -> task %p",
		   ev, ev->ev_sender);
	sender = ev->ev_sender;

	sock->pending_recv = 1;
	iev = &sock->readable_ev;

	sock->references++;
	iev->ev_sender = sock;
	iev->ev_action = internal_recv;
	iev->ev_arg = sock;

	isc_task_send(sender, (isc_event_t **)&iev);
}

/*
 * Same pattern as dispatch_recv(), but for the send queue: posts the
 * internal "writable" event to the task owning the first queued send.
 */
static void
dispatch_send(isc_socket_t *sock) {
	intev_t *iev;
	isc_socketevent_t *ev;
	isc_task_t *sender;

	INSIST(!sock->pending_send);

	ev = ISC_LIST_HEAD(sock->send_list);
	if (ev == NULL)
		return;
	socket_log(sock, NULL, EVENT, NULL, 0, 0,
		   "dispatch_send: event %p -> task %p",
		   ev, ev->ev_sender);
	sender = ev->ev_sender;

	sock->pending_send = 1;
	iev = &sock->writable_ev;

	sock->references++;
	iev->ev_sender = sock;
	iev->ev_action = internal_send;
	iev->ev_arg = sock;

	isc_task_send(sender, (isc_event_t **)&iev);
}

/*
 * A pending connect() became writable: post the internal event that
 * will run internal_connect() on the connecting task.  Reuses
 * writable_ev, which is safe because a connecting socket has no sends.
 */
static void
dispatch_connect(isc_socket_t *sock) {
	intev_t *iev;
	isc_socket_connev_t *ev;

	iev = &sock->writable_ev;

	ev = sock->connect_ev;
	INSIST(ev != NULL); /* XXX */

	INSIST(sock->connecting);

	sock->references++;	/* keep socket around for this internal event */
	iev->ev_sender = sock;
	iev->ev_action = internal_connect;
	iev->ev_arg = sock;

	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
}

/*
 * Dequeue an item off the given socket's read queue, set the result code
 * in the done event to the one provided, and send it to the task it was
 * destined for.
 *
 * If the event to be sent is on a list, remove it before sending.  If
 * asked to, send and detach from the socket as well.
 *
 * Caller must have the socket locked if the event is attached to the socket.
 */
static void
send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
	isc_task_t *task;

	task = (*dev)->ev_sender;

	/* Completion events report the socket as their sender. */
	(*dev)->ev_sender = sock;

	if (ISC_LINK_LINKED(*dev, ev_link))
		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);

	/* ATTACHED means we hold a task reference taken at queue time;
	 * sendanddetach releases it along with delivering the event. */
	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
	    == ISC_SOCKEVENTATTR_ATTACHED)
		isc_task_sendanddetach(&task, (isc_event_t **)dev);
	else
		isc_task_send(task, (isc_event_t **)dev);
}

/*
 * See comments for send_recvdone_event() above.
 *
 * Caller must have the socket locked if the event is attached to the socket.
 */
static void
send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
	isc_task_t *task;

	INSIST(dev != NULL && *dev != NULL);

	task = (*dev)->ev_sender;
	(*dev)->ev_sender = sock;

	if (ISC_LINK_LINKED(*dev, ev_link))
		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);

	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
	    == ISC_SOCKEVENTATTR_ATTACHED)
		isc_task_sendanddetach(&task, (isc_event_t **)dev);
	else
		isc_task_send(task, (isc_event_t **)dev);
}

/*
 * Task action for the internal "readable" event: drain as many queued
 * receive requests as the socket will satisfy, then re-arm the watcher
 * if requests remain.  Drops the reference taken by dispatch_recv().
 */
static void
internal_recv(isc_task_t *me, isc_event_t *ev) {
	isc_socketevent_t *dev;
	isc_socket_t *sock;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);

	sock = ev->ev_sender;

	socket_log(sock, NULL, IOEVENT,
		   "internal_recv: task %p got event %p", me, ev);

	INSIST(sock->pending_recv == 1);
	sock->pending_recv = 0;

	INSIST(sock->references > 0);
	sock->references--;	/* the internal event is done with this socket */
	if (sock->references == 0) {
		destroy(&sock);
		return;
	}

	/*
	 * Try to do as much I/O as possible on this socket.  There are no
	 * limits here, currently.
	 */
	dev = ISC_LIST_HEAD(sock->recv_list);
	while (dev != NULL) {
		switch (doio_recv(sock, dev)) {
		case DOIO_SOFT:
			/* Would block: stop and wait to be re-armed. */
			goto poke;

		case DOIO_EOF:
			/*
			 * read of 0 means the remote end was closed.
			 * Run through the event queue and dispatch all
			 * the events with an EOF result code.
			 */
			do {
				dev->result = ISC_R_EOF;
				send_recvdone_event(sock, &dev);
				dev = ISC_LIST_HEAD(sock->recv_list);
			} while (dev != NULL);
			goto poke;

		case DOIO_SUCCESS:
		case DOIO_HARD:
			/* Completed (or failed hard): deliver and
			 * move on to the next queued request. */
			send_recvdone_event(sock, &dev);
			break;
		}

		dev = ISC_LIST_HEAD(sock->recv_list);
	}

 poke:
	/* Requests still queued: ask the watcher to keep polling read. */
	if (!ISC_LIST_EMPTY(sock->recv_list))
		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
}

/*
 * Task action for the internal "writable" event: the send-side twin of
 * internal_recv().  Drops the reference taken by dispatch_send().
 */
static void
internal_send(isc_task_t *me, isc_event_t *ev) {
	isc_socketevent_t *dev;
	isc_socket_t *sock;

	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);

	/*
	 * Find out what socket this is and lock it.
	 */
	sock = (isc_socket_t *)ev->ev_sender;
	socket_log(sock, NULL, IOEVENT,
		   "internal_send: task %p got event %p", me, ev);

	INSIST(sock->pending_send == 1);
	sock->pending_send = 0;

	INSIST(sock->references > 0);
	sock->references--;	/* the internal event is done with this socket */
	if (sock->references == 0) {
		destroy(&sock);
		return;
	}

	/*
	 * Try to do as much I/O as possible on this socket.  There are no
	 * limits here, currently.
	 */
	dev = ISC_LIST_HEAD(sock->send_list);
	while (dev != NULL) {
		switch (doio_send(sock, dev)) {
		case DOIO_SOFT:
			goto poke;

		case DOIO_HARD:
		case DOIO_SUCCESS:
			send_senddone_event(sock, &dev);
			break;
		}

		dev = ISC_LIST_HEAD(sock->send_list);
	}

 poke:
	if (!ISC_LIST_EMPTY(sock->send_list))
		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
}

/*
 * Process read/writes on each fd here.  Avoid locking
 * and unlocking twice if both reads and writes are possible.
 */
static void
process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable,
	   isc_boolean_t writeable)
{
	isc_socket_t *sock;
	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;

	/*
	 * If the socket is going to be closed, don't do more I/O.
	 */
	if (manager->fdstate[fd] == CLOSE_PENDING) {
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		return;
	}

	sock = manager->fds[fd];
	if (readable) {
		if (sock == NULL) {
			/* fd ready but no socket attached: just unwatch. */
			unwatch_read = ISC_TRUE;
			goto check_write;
		}
		if (!SOCK_DEAD(sock)) {
			dispatch_recv(sock);
		}
		unwatch_read = ISC_TRUE;
	}
check_write:
	if (writeable) {
		if (sock == NULL) {
			unwatch_write = ISC_TRUE;
			goto unlock_fd;
		}
		if (!SOCK_DEAD(sock)) {
			/* Writability on a connecting socket signals
			 * connect() completion, not send readiness. */
			if (sock->connecting)
				dispatch_connect(sock);
			else
				dispatch_send(sock);
		}
		unwatch_write = ISC_TRUE;
	}

 unlock_fd:
	/* One-shot: the fd is re-armed only when more work is queued. */
	if (unwatch_read)
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
	if (unwatch_write)
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);

}

/*
 * Walk every fd up to maxfd (exclusive; callers pass highest fd + 1 —
 * TODO confirm against the select loop caller, which is outside this
 * view) and process any that select() reported ready.
 */
static void
process_fds(isc_socketmgr_t *manager, int maxfd, fd_set *readfds,
	    fd_set *writefds)
{
	int i;

	REQUIRE(maxfd <= (int)manager->maxsocks);

	for (i =
0; i < maxfd; i++) { 1733 process_fd(manager, i, FD_ISSET(i, readfds), 1734 FD_ISSET(i, writefds)); 1735 } 1736 } 1737 1738 /* 1739 * Create a new socket manager. 1740 */ 1741 1742 static isc_result_t 1743 setup_watcher(isc_socketmgr_t *manager) { 1744 isc_result_t result; 1745 1746 UNUSED(result); 1747 1748 manager->fd_bufsize = sizeof(fd_set); 1749 1750 manager->read_fds = NULL; 1751 manager->read_fds_copy = NULL; 1752 manager->write_fds = NULL; 1753 manager->write_fds_copy = NULL; 1754 1755 manager->read_fds = malloc(manager->fd_bufsize); 1756 if (manager->read_fds != NULL) 1757 manager->read_fds_copy = malloc(manager->fd_bufsize); 1758 if (manager->read_fds_copy != NULL) 1759 manager->write_fds = malloc(manager->fd_bufsize); 1760 if (manager->write_fds != NULL) { 1761 manager->write_fds_copy = malloc(manager->fd_bufsize); 1762 } 1763 if (manager->write_fds_copy == NULL) { 1764 if (manager->write_fds != NULL) { 1765 free(manager->write_fds); 1766 } 1767 if (manager->read_fds_copy != NULL) { 1768 free(manager->read_fds_copy); 1769 } 1770 if (manager->read_fds != NULL) { 1771 free(manager->read_fds); 1772 } 1773 return (ISC_R_NOMEMORY); 1774 } 1775 memset(manager->read_fds, 0, manager->fd_bufsize); 1776 memset(manager->write_fds, 0, manager->fd_bufsize); 1777 1778 manager->maxfd = 0; 1779 1780 return (ISC_R_SUCCESS); 1781 } 1782 1783 static void 1784 cleanup_watcher(isc_socketmgr_t *manager) { 1785 1786 if (manager->read_fds != NULL) 1787 free(manager->read_fds); 1788 if (manager->read_fds_copy != NULL) 1789 free(manager->read_fds_copy); 1790 if (manager->write_fds != NULL) 1791 free(manager->write_fds); 1792 if (manager->write_fds_copy != NULL) 1793 free(manager->write_fds_copy); 1794 } 1795 1796 static isc_result_t 1797 isc_socketmgr_create2(isc_socketmgr_t **managerp, 1798 unsigned int maxsocks) 1799 { 1800 isc_socketmgr_t *manager; 1801 isc_result_t result; 1802 1803 REQUIRE(managerp != NULL && *managerp == NULL); 1804 1805 if (socketmgr != NULL) { 1806 /* 
Don't allow maxsocks to be updated */ 1807 if (maxsocks > 0 && socketmgr->maxsocks != maxsocks) 1808 return (ISC_R_EXISTS); 1809 1810 socketmgr->refs++; 1811 *managerp = (isc_socketmgr_t *)socketmgr; 1812 return (ISC_R_SUCCESS); 1813 } 1814 1815 if (maxsocks == 0) 1816 maxsocks = FD_SETSIZE; 1817 1818 manager = malloc(sizeof(*manager)); 1819 if (manager == NULL) 1820 return (ISC_R_NOMEMORY); 1821 1822 /* zero-clear so that necessary cleanup on failure will be easy */ 1823 memset(manager, 0, sizeof(*manager)); 1824 manager->maxsocks = maxsocks; 1825 manager->fds = malloc(manager->maxsocks * sizeof(isc_socket_t *)); 1826 if (manager->fds == NULL) { 1827 result = ISC_R_NOMEMORY; 1828 goto free_manager; 1829 } 1830 manager->fdstate = malloc(manager->maxsocks * sizeof(int)); 1831 if (manager->fdstate == NULL) { 1832 result = ISC_R_NOMEMORY; 1833 goto free_manager; 1834 } 1835 1836 memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *)); 1837 ISC_LIST_INIT(manager->socklist); 1838 1839 manager->refs = 1; 1840 1841 /* 1842 * Set up initial state for the select loop 1843 */ 1844 result = setup_watcher(manager); 1845 if (result != ISC_R_SUCCESS) 1846 goto cleanup; 1847 1848 memset(manager->fdstate, 0, manager->maxsocks * sizeof(int)); 1849 1850 socketmgr = manager; 1851 *managerp = (isc_socketmgr_t *)manager; 1852 1853 return (ISC_R_SUCCESS); 1854 1855 cleanup: 1856 1857 free_manager: 1858 if (manager->fdstate != NULL) { 1859 free(manager->fdstate); 1860 } 1861 if (manager->fds != NULL) { 1862 free(manager->fds); 1863 } 1864 free(manager); 1865 1866 return (result); 1867 } 1868 1869 isc_result_t 1870 isc_socketmgr_create(isc_socketmgr_t **managerp) { 1871 return (isc_socketmgr_create2(managerp, 0)); 1872 } 1873 1874 void 1875 isc_socketmgr_destroy(isc_socketmgr_t **managerp) { 1876 isc_socketmgr_t *manager; 1877 int i; 1878 1879 /* 1880 * Destroy a socket manager. 
1881 */ 1882 1883 REQUIRE(managerp != NULL); 1884 manager = (isc_socketmgr_t *)*managerp; 1885 1886 manager->refs--; 1887 if (manager->refs > 0) { 1888 *managerp = NULL; 1889 return; 1890 } 1891 socketmgr = NULL; 1892 1893 /* 1894 * Wait for all sockets to be destroyed. 1895 */ 1896 while (!ISC_LIST_EMPTY(manager->socklist)) { 1897 isc_taskmgr_dispatch(NULL); 1898 } 1899 1900 /* 1901 * Here, poke our select/poll thread. Do this by closing the write 1902 * half of the pipe, which will send EOF to the read half. 1903 * This is currently a no-op in the non-threaded case. 1904 */ 1905 select_poke(manager, 0, SELECT_POKE_SHUTDOWN); 1906 1907 /* 1908 * Clean up. 1909 */ 1910 cleanup_watcher(manager); 1911 1912 for (i = 0; i < (int)manager->maxsocks; i++) 1913 if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */ 1914 (void)close(i); 1915 1916 free(manager->fds); 1917 free(manager->fdstate); 1918 1919 free(manager); 1920 1921 *managerp = NULL; 1922 1923 socketmgr = NULL; 1924 } 1925 1926 static isc_result_t 1927 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, 1928 unsigned int flags) 1929 { 1930 int io_state; 1931 isc_task_t *ntask = NULL; 1932 isc_result_t result = ISC_R_SUCCESS; 1933 1934 dev->ev_sender = task; 1935 1936 if (sock->type == isc_sockettype_udp) { 1937 io_state = doio_recv(sock, dev); 1938 } else { 1939 if (ISC_LIST_EMPTY(sock->recv_list)) 1940 io_state = doio_recv(sock, dev); 1941 else 1942 io_state = DOIO_SOFT; 1943 } 1944 1945 switch (io_state) { 1946 case DOIO_SOFT: 1947 /* 1948 * We couldn't read all or part of the request right now, so 1949 * queue it. 1950 * 1951 * Attach to socket and to task 1952 */ 1953 isc_task_attach(task, &ntask); 1954 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED; 1955 1956 /* 1957 * Enqueue the request. If the socket was previously not being 1958 * watched, poke the watcher to start paying attention to it. 
	 */
		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);

		socket_log(sock, NULL, EVENT, NULL, 0, 0,
			   "socket_recv: event %p -> task %p",
			   dev, ntask);

		/* IMMEDIATE callers get INPROGRESS instead of an event. */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
			result = ISC_R_INPROGRESS;
		break;

	case DOIO_EOF:
		dev->result = ISC_R_EOF;
		/* fallthrough */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		/* Completed immediately: deliver the event unless the
		 * caller asked to inspect the result synchronously. */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
			send_recvdone_event(sock, &dev);
		break;
	}

	return (result);
}

/*
 * Receive into a list of buffers.  The buffers are moved from 'buflist'
 * onto the event's internal list (ownership transfers for the duration
 * of the request).  'minimum' is the number of bytes that must arrive
 * before completion (0 means "fill all buffers"); UDP is always
 * satisfied by a single datagram.
 */
isc_result_t
isc_socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
		 unsigned int minimum, isc_task_t *task,
		 isc_taskaction_t action, void *arg)
{
	isc_socket_t *sock = (isc_socket_t *)sock0;
	isc_socketevent_t *dev;
	unsigned int iocount;
	isc_buffer_t *buffer;

	REQUIRE(buflist != NULL);
	REQUIRE(!ISC_LIST_EMPTY(*buflist));
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	iocount = isc_bufferlist_availablecount(buflist);
	REQUIRE(iocount > 0);

	INSIST(sock->bound);

	dev = allocate_socketevent(sock,
				   ISC_SOCKEVENT_RECVDONE, action, arg);
	if (dev == NULL)
		return (ISC_R_NOMEMORY);

	/*
	 * UDP sockets are always partial read
	 */
	if (sock->type == isc_sockettype_udp)
		dev->minimum = 1;
	else {
		if (minimum == 0)
			dev->minimum = iocount;
		else
			dev->minimum = minimum;
	}

	/*
	 * Move each buffer from the passed in list to our internal one.
	 */
	buffer = ISC_LIST_HEAD(*buflist);
	while (buffer != NULL) {
		ISC_LIST_DEQUEUE(*buflist, buffer, link);
		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
		buffer = ISC_LIST_HEAD(*buflist);
	}

	return (socket_recv(sock, dev, task, 0));
}

/*
 * Common send path: record the destination (and optional IPv6 packet
 * info), try immediate I/O (always for UDP; for TCP only when the send
 * queue is empty, preserving ordering), and on a soft result queue the
 * request unless ISC_SOCKFLAG_NORETRY was given.
 */
static isc_result_t
socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
	    unsigned int flags)
{
	int io_state;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	set_dev_address(address, sock, dev);
	if (pktinfo != NULL) {
		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
		dev->pktinfo = *pktinfo;

		if (!isc_sockaddr_issitelocal(&dev->address) &&
		    !isc_sockaddr_islinklocal(&dev->address)) {
			socket_log(sock, NULL, TRACE,
				   "pktinfo structure provided, ifindex %u "
				   "(set to 0)", pktinfo->ipi6_ifindex);

			/*
			 * Set the pktinfo index to 0 here, to let the
			 * kernel decide what interface it should send on.
			 */
			dev->pktinfo.ipi6_ifindex = 0;
		}
	}

	if (sock->type == isc_sockettype_udp)
		io_state = doio_send(sock, dev);
	else {
		if (ISC_LIST_EMPTY(sock->send_list))
			io_state = doio_send(sock, dev);
		else
			io_state = DOIO_SOFT;
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't send all or part of the request right now, so
		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
		 */
		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
			isc_task_attach(task, &ntask);
			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

			/*
			 * Enqueue the request.  If the socket was previously
			 * not being watched, poke the watcher to start
			 * paying attention to it.
			 */
			if (ISC_LIST_EMPTY(sock->send_list) &&
			    !sock->pending_send)
				select_poke(sock->manager, sock->fd,
					    SELECT_POKE_WRITE);
			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);

			socket_log(sock, NULL, EVENT, NULL, 0, 0,
				   "socket_send: event %p -> task %p",
				   dev, ntask);

			/* IMMEDIATE callers learn the request is queued. */
			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
				result = ISC_R_INPROGRESS;
			break;
		}

		/* FALLTHROUGH */

	case DOIO_HARD:
	case DOIO_SUCCESS:
		/* Completed (or NORETRY soft-failed) right away: deliver
		 * the event unless the caller wants the result inline. */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
			send_senddone_event(sock, &dev);
		break;
	}

	return (result);
}

/*
 * Send a buffer list with no explicit destination or pktinfo; thin
 * wrapper over isc_socket_sendtov2().
 */
isc_result_t
isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
		 isc_task_t *task, isc_taskaction_t action, void *arg)
{
	return (isc_socket_sendtov2(sock, buflist, task, action, arg, NULL,
				    NULL, 0));
}

/*
 * Send a buffer list to 'address' (NULL means the socket's connected
 * peer — TODO confirm against set_dev_address, which is outside this
 * view), optionally with IPv6 packet info and ISC_SOCKFLAG_* flags.
 * Buffers are moved onto the event's internal list for the duration of
 * the request.
 */
isc_result_t
isc_socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist,
		    isc_task_t *task, isc_taskaction_t action, void *arg,
		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
		    unsigned int flags)
{
	isc_socket_t *sock = (isc_socket_t *)sock0;
	isc_socketevent_t *dev;
	unsigned int iocount;
	isc_buffer_t *buffer;

	REQUIRE(buflist != NULL);
	REQUIRE(!ISC_LIST_EMPTY(*buflist));
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	iocount = isc_bufferlist_usedcount(buflist);
	REQUIRE(iocount > 0);

	dev = allocate_socketevent(sock,
				   ISC_SOCKEVENT_SENDDONE, action, arg);
	if (dev == NULL)
		return (ISC_R_NOMEMORY);

	/*
	 * Move each buffer from the passed in list to our internal one.
	 */
	buffer = ISC_LIST_HEAD(*buflist);
	while (buffer != NULL) {
		ISC_LIST_DEQUEUE(*buflist, buffer, link);
		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
		buffer = ISC_LIST_HEAD(*buflist);
	}

	return (socket_send(sock, dev, task, address, pktinfo, flags));
}

/*
 * Bind the socket to 'sockaddr'.  SO_REUSEADDR is set only when the
 * caller requested it AND a specific (non-zero) port is being bound.
 * Returns ISC_R_SUCCESS or a translated bind(2) failure.
 */
isc_result_t
isc_socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
		unsigned int options) {
	isc_socket_t *sock = (isc_socket_t *)sock0;
	int on = 1;

	INSIST(!sock->bound);

	if (sock->pf != sockaddr->type.sa.sa_family) {
		return (ISC_R_FAMILYMISMATCH);
	}

	/*
	 * Only set SO_REUSEADDR when we want a specific port.
	 */
	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
		       sizeof(on)) < 0) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d) %s", sock->fd, "failed");
		/* Press on...
*/ 2184 } 2185 if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) { 2186 switch (errno) { 2187 case EACCES: 2188 return (ISC_R_NOPERM); 2189 case EADDRNOTAVAIL: 2190 return (ISC_R_ADDRNOTAVAIL); 2191 case EADDRINUSE: 2192 return (ISC_R_ADDRINUSE); 2193 case EINVAL: 2194 return (ISC_R_BOUND); 2195 default: 2196 UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s", 2197 strerror(errno)); 2198 return (ISC_R_UNEXPECTED); 2199 } 2200 } 2201 2202 socket_log(sock, sockaddr, TRACE, "bound"); 2203 sock->bound = 1; 2204 2205 return (ISC_R_SUCCESS); 2206 } 2207 2208 isc_result_t 2209 isc_socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr, 2210 isc_task_t *task, isc_taskaction_t action, void *arg) 2211 { 2212 isc_socket_t *sock = (isc_socket_t *)sock0; 2213 isc_socket_connev_t *dev; 2214 isc_task_t *ntask = NULL; 2215 isc_socketmgr_t *manager; 2216 int cc; 2217 char addrbuf[ISC_SOCKADDR_FORMATSIZE]; 2218 2219 REQUIRE(addr != NULL); 2220 REQUIRE(task != NULL); 2221 REQUIRE(action != NULL); 2222 2223 manager = sock->manager; 2224 REQUIRE(addr != NULL); 2225 2226 if (isc_sockaddr_ismulticast(addr)) 2227 return (ISC_R_MULTICAST); 2228 2229 REQUIRE(!sock->connecting); 2230 2231 dev = (isc_socket_connev_t *)isc_event_allocate(sock, 2232 ISC_SOCKEVENT_CONNECT, 2233 action, arg, 2234 sizeof(*dev)); 2235 if (dev == NULL) { 2236 return (ISC_R_NOMEMORY); 2237 } 2238 ISC_LINK_INIT(dev, ev_link); 2239 2240 /* 2241 * Try to do the connect right away, as there can be only one 2242 * outstanding, and it might happen to complete. 2243 */ 2244 sock->peer_address = *addr; 2245 cc = connect(sock->fd, &addr->type.sa, addr->length); 2246 if (cc < 0) { 2247 /* 2248 * HP-UX "fails" to connect a UDP socket and sets errno to 2249 * EINPROGRESS if it's non-blocking. We'd rather regard this as 2250 * a success and let the user detect it if it's really an error 2251 * at the time of sending a packet on the socket. 
2252 */ 2253 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) { 2254 cc = 0; 2255 goto success; 2256 } 2257 if (SOFT_ERROR(errno) || errno == EINPROGRESS) 2258 goto queue; 2259 2260 switch (errno) { 2261 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit; 2262 ERROR_MATCH(EACCES, ISC_R_NOPERM); 2263 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 2264 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL); 2265 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED); 2266 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH); 2267 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH); 2268 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH); 2269 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES); 2270 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH); 2271 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED); 2272 ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET); 2273 #undef ERROR_MATCH 2274 } 2275 2276 sock->connected = 0; 2277 2278 isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf)); 2279 UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s", 2280 addrbuf, errno, strerror(errno)); 2281 2282 isc_event_free(ISC_EVENT_PTR(&dev)); 2283 return (ISC_R_UNEXPECTED); 2284 2285 err_exit: 2286 sock->connected = 0; 2287 isc_task_send(task, ISC_EVENT_PTR(&dev)); 2288 2289 return (ISC_R_SUCCESS); 2290 } 2291 2292 /* 2293 * If connect completed, fire off the done event. 2294 */ 2295 success: 2296 if (cc == 0) { 2297 sock->connected = 1; 2298 sock->bound = 1; 2299 dev->result = ISC_R_SUCCESS; 2300 isc_task_send(task, ISC_EVENT_PTR(&dev)); 2301 2302 return (ISC_R_SUCCESS); 2303 } 2304 2305 queue: 2306 2307 /* 2308 * Attach to task. 2309 */ 2310 isc_task_attach(task, &ntask); 2311 2312 sock->connecting = 1; 2313 2314 dev->ev_sender = ntask; 2315 2316 /* 2317 * Poke watcher here. We still have the socket locked, so there 2318 * is no race condition. We will keep the lock for such a short 2319 * bit of time waking it up now or later won't matter all that much. 
	 */
	if (sock->connect_ev == NULL)
		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);

	sock->connect_ev = dev;

	return (ISC_R_SUCCESS);
}

/*
 * Called when a socket with a pending connect() finishes.
 */
static void
internal_connect(isc_task_t *me, isc_event_t *ev) {
	isc_socket_t *sock;
	isc_socket_connev_t *dev;
	isc_task_t *task;
	int cc;
	socklen_t optlen;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];

	UNUSED(me);
	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);

	sock = ev->ev_sender;

	/*
	 * When the internal event was sent the reference count was bumped
	 * to keep the socket around for us.  Decrement the count here.
	 */
	INSIST(sock->references > 0);
	sock->references--;
	if (sock->references == 0) {
		destroy(&sock);
		return;
	}

	/*
	 * Has this event been canceled?
	 */
	dev = sock->connect_ev;
	if (dev == NULL) {
		INSIST(!sock->connecting);
		return;
	}

	INSIST(sock->connecting);
	sock->connecting = 0;

	/*
	 * Get any possible error status here.
	 */
	/* Standard async-connect idiom: SO_ERROR yields the deferred
	 * connect(2) result; mirror it into errno for the checks below. */
	optlen = sizeof(cc);
	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
		       (void *)&cc, (void *)&optlen) < 0)
		cc = errno;
	else
		errno = cc;

	if (errno != 0) {
		/*
		 * If the error is EAGAIN, just re-select on this
		 * fd and pretend nothing strange happened.
		 */
		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
			sock->connecting = 1;
			select_poke(sock->manager, sock->fd,
				    SELECT_POKE_CONNECT);
			return;
		}


		/*
		 * Translate other errors into ISC_R_* flavors.
		 */
		switch (errno) {
#define ERROR_MATCH(a, b) case a: dev->result = b; break;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		default:
			dev->result = ISC_R_UNEXPECTED;
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_connect: connect(%s) %s",
					 peerbuf, strerror(errno));
		}
	} else {
		dev->result = ISC_R_SUCCESS;
		sock->connected = 1;
		sock->bound = 1;
	}

	sock->connect_ev = NULL;

	/* Deliver the CONNECT event and drop the task reference taken
	 * when the connect was queued. */
	task = dev->ev_sender;
	dev->ev_sender = sock;
	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
}

/*
 * Run through the list of events on this socket, and cancel the ones
 * queued for task "task" of type "how".  "how" is a bitmask.
 */
void
isc_socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
	isc_socket_t *sock = (isc_socket_t *)sock0;

	/*
	 * Quick exit if there is nothing to do.  Don't even bother locking
	 * in this case.
	 */
	if (how == 0)
		return;

	/*
	 * All of these do the same thing, more or less.
	 * Each will:
	 *	o If the internal event is marked as "posted" try to
	 *	  remove it from the task's queue.  If this fails, mark it
	 *	  as canceled instead, and let the task clean it up later.
2452 * o For each I/O request for that task of that type, post 2453 * its done event with status of "ISC_R_CANCELED". 2454 * o Reset any state needed. 2455 */ 2456 if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV) 2457 && !ISC_LIST_EMPTY(sock->recv_list)) { 2458 isc_socketevent_t *dev; 2459 isc_socketevent_t *next; 2460 isc_task_t *current_task; 2461 2462 dev = ISC_LIST_HEAD(sock->recv_list); 2463 2464 while (dev != NULL) { 2465 current_task = dev->ev_sender; 2466 next = ISC_LIST_NEXT(dev, ev_link); 2467 2468 if ((task == NULL) || (task == current_task)) { 2469 dev->result = ISC_R_CANCELED; 2470 send_recvdone_event(sock, &dev); 2471 } 2472 dev = next; 2473 } 2474 } 2475 2476 if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND) 2477 && !ISC_LIST_EMPTY(sock->send_list)) { 2478 isc_socketevent_t *dev; 2479 isc_socketevent_t *next; 2480 isc_task_t *current_task; 2481 2482 dev = ISC_LIST_HEAD(sock->send_list); 2483 2484 while (dev != NULL) { 2485 current_task = dev->ev_sender; 2486 next = ISC_LIST_NEXT(dev, ev_link); 2487 2488 if ((task == NULL) || (task == current_task)) { 2489 dev->result = ISC_R_CANCELED; 2490 send_senddone_event(sock, &dev); 2491 } 2492 dev = next; 2493 } 2494 } 2495 2496 /* 2497 * Connecting is not a list. 2498 */ 2499 if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT) 2500 && sock->connect_ev != NULL) { 2501 isc_socket_connev_t *dev; 2502 isc_task_t *current_task; 2503 2504 INSIST(sock->connecting); 2505 sock->connecting = 0; 2506 2507 dev = sock->connect_ev; 2508 current_task = dev->ev_sender; 2509 2510 if ((task == NULL) || (task == current_task)) { 2511 sock->connect_ev = NULL; 2512 2513 dev->result = ISC_R_CANCELED; 2514 dev->ev_sender = sock; 2515 isc_task_sendanddetach(¤t_task, 2516 ISC_EVENT_PTR(&dev)); 2517 } 2518 } 2519 2520 } 2521 2522 /* 2523 * In our assumed scenario, we can simply use a single static object. 2524 * XXX: this is not true if the application uses multiple threads with 2525 * 'multi-context' mode. 
Fixing this is a future TODO item. 2526 */ 2527 static isc_socketwait_t swait_private; 2528 2529 int 2530 isc_socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp, 2531 isc_socketwait_t **swaitp) 2532 { 2533 isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0; 2534 int n; 2535 2536 REQUIRE(swaitp != NULL && *swaitp == NULL); 2537 2538 if (manager == NULL) 2539 manager = socketmgr; 2540 if (manager == NULL) 2541 return (0); 2542 2543 memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize); 2544 memmove(manager->write_fds_copy, manager->write_fds, 2545 manager->fd_bufsize); 2546 2547 swait_private.readset = manager->read_fds_copy; 2548 swait_private.writeset = manager->write_fds_copy; 2549 swait_private.maxfd = manager->maxfd + 1; 2550 2551 n = select(swait_private.maxfd, swait_private.readset, 2552 swait_private.writeset, NULL, tvp); 2553 2554 *swaitp = &swait_private; 2555 return (n); 2556 } 2557 2558 isc_result_t 2559 isc_socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) { 2560 isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0; 2561 2562 REQUIRE(swait == &swait_private); 2563 2564 if (manager == NULL) 2565 manager = socketmgr; 2566 if (manager == NULL) 2567 return (ISC_R_NOTFOUND); 2568 2569 process_fds(manager, swait->maxfd, swait->readset, swait->writeset); 2570 return (ISC_R_SUCCESS); 2571 } 2572