/* $NetBSD: socket.c,v 1.17 2015/07/08 17:29:00 christos Exp $ */

/*
 * Copyright (C) 2004-2014 Internet Systems Consortium, Inc. ("ISC")
 * Copyright (C) 1998-2003 Internet Software Consortium.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
 */

/* Id */

/*! \file */

#include <config.h>

#include <sys/param.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/uio.h>

#if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#endif

#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#ifdef HAVE_INTTYPES_H
#include <inttypes.h> /* uintptr_t */
#endif

#include <isc/buffer.h>
#include <isc/bufferlist.h>
#include <isc/condition.h>
#include <isc/formatcheck.h>
#include <isc/json.h>
#include <isc/list.h>
#include <isc/log.h>
#include <isc/mem.h>
#include <isc/msgs.h>
#include <isc/mutex.h>
#include <isc/net.h>
#include <isc/once.h>
#include <isc/platform.h>
#include <isc/print.h>
#include <isc/region.h>
#include <isc/resource.h>
#include <isc/socket.h>
#include <isc/stats.h>
#include <isc/strerror.h>
#include <isc/task.h>
#include <isc/thread.h>
#include <isc/util.h>
#include <isc/xml.h>

#ifdef ISC_PLATFORM_HAVESYSUNH
#include <sys/un.h>
#endif
#ifdef ISC_PLATFORM_HAVEKQUEUE
#include <sys/event.h>
#endif
#ifdef ISC_PLATFORM_HAVEEPOLL
#include <sys/epoll.h>
#endif
#ifdef ISC_PLATFORM_HAVEDEVPOLL
#if defined(HAVE_SYS_DEVPOLL_H)
#include <sys/devpoll.h>
#elif defined(HAVE_DEVPOLL_H)
#include <devpoll.h>
#endif
#endif

#include "errno2result.h"

/* See task.c about the following definition: */
#ifdef ISC_PLATFORM_USETHREADS
#define USE_WATCHER_THREAD
#else
#define USE_SHARED_MANAGER
#endif /* ISC_PLATFORM_USETHREADS */

#ifndef USE_WATCHER_THREAD
#include "socket_p.h"
#include "../task_p.h"
#endif /* USE_WATCHER_THREAD */

#if defined(SO_BSDCOMPAT) && defined(__linux__)
#include <sys/utsname.h>
#endif

/*%
 * Choose the most preferable multiplex method.
 *
 * Exactly one of USE_KQUEUE, USE_EPOLL, USE_DEVPOLL or USE_SELECT ends up
 * defined, in that order of preference.
 */
#ifdef ISC_PLATFORM_HAVEKQUEUE
#define USE_KQUEUE
#elif defined (ISC_PLATFORM_HAVEEPOLL)
#define USE_EPOLL
#elif defined (ISC_PLATFORM_HAVEDEVPOLL)
#define USE_DEVPOLL
/* Per-FD record of which directions are currently polled via /dev/poll. */
typedef struct {
	unsigned int want_read : 1,
		want_write : 1;
} pollinfo_t;
#else
#define USE_SELECT
#endif /* ISC_PLATFORM_HAVEKQUEUE */

#ifndef USE_WATCHER_THREAD
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
struct isc_socketwait {
	int nevents;
};
#elif defined (USE_SELECT)
struct isc_socketwait {
	fd_set *readset;
	fd_set *writeset;
	int nfds;
	int maxfd;
};
#endif /* USE_KQUEUE */
#endif /* !USE_WATCHER_THREAD */

/*
 * Set by the -T dscp option on the command line. If set to a value
 * other than -1, we check to make sure DSCP values match it, and
 * assert if not.
 */
int isc_dscp_check_value = -1;

/*%
 * Maximum number of allowable open sockets. This is also the maximum
 * allowable socket file descriptor.
 *
 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure select() accepts more than (the system
 * default of) FD_SETSIZE descriptors, and the default size should in fact be
 * fine in the vast majority of cases. This constant should therefore be
 * increased only when absolutely necessary and possible, i.e., the server is
 * exhausting all available file descriptors (up to FD_SETSIZE) and the
 * select() function and FD_xxx macros support larger values than FD_SETSIZE
 * (which may not always be true, but we keep using some of them to ensure as
 * much portability as possible). Note also that overall server performance
 * may be rather worsened with a larger value of this constant due to
 * inherent scalability problems of select().
 *
 * As a special note, this value shouldn't have to be touched if
 * this is a build for an authoritative only DNS server.
 */
#ifndef ISC_SOCKET_MAXSOCKETS
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXSOCKETS 21000
#else
#define ISC_SOCKET_MAXSOCKETS 4096
#endif /* TUNE_LARGE */
#elif defined(USE_SELECT)
#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
#endif /* USE_KQUEUE... */
#endif /* ISC_SOCKET_MAXSOCKETS */

#ifdef USE_SELECT
/*%
 * Mac OS X needs a special definition to support larger values in select().
 * We always define this because a larger value can be specified run-time.
 */
#ifdef __APPLE__
#define _DARWIN_UNLIMITED_SELECT
#endif /* __APPLE__ */
#endif /* USE_SELECT */

#ifdef ISC_SOCKET_USE_POLLWATCH
/*%
 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
 * some of the specified FD. The idea is based on the observation that it's
 * likely for a busy server to keep receiving packets. It specifically works
 * as follows: the socket watcher is first initialized with the state of
 * "poll_idle". While it's in the idle state it keeps sleeping until a socket
 * event occurs. When it wakes up for a socket I/O event, it moves to the
 * poll_active state, and sets the poll timeout to a short period
 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec). If timeout occurs in this state, the
 * watcher goes to the poll_checking state with the same timeout period.
 * In this state, the watcher tries to detect whether this is a break
 * during intermittent events or the kernel bug is triggered. If the next
 * polling reports an event within the short period, the previous timeout is
 * likely to be a kernel bug, and so the watcher goes back to the active state.
 * Otherwise, it moves to the idle state again.
 *
 * It's not clear whether this is a thread-related bug, but since we've only
 * seen this with threads, this workaround is used only when enabling threads.
 */

typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;

#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
#endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */
#endif /* ISC_SOCKET_USE_POLLWATCH */

/*%
 * Size of per-FD lock buckets.
 *
 * FDLOCK_ID() maps a descriptor to the index of the lock that guards it;
 * without threads there is a single bucket and every FD maps to index 0.
 */
#ifdef ISC_PLATFORM_USETHREADS
#define FDLOCK_COUNT 1024
#define FDLOCK_ID(fd) ((fd) % FDLOCK_COUNT)
#else
#define FDLOCK_COUNT 1
#define FDLOCK_ID(fd) 0
#endif /* ISC_PLATFORM_USETHREADS */

/*%
 * Maximum number of events communicated with the kernel. There should normally
 * be no need for having a large number.
 */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#ifndef ISC_SOCKET_MAXEVENTS
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXEVENTS 2048
#else
#define ISC_SOCKET_MAXEVENTS 64
#endif /* TUNE_LARGE */
#endif
#endif

/*%
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t. This is here so it can be easily changed if needed.
 */
#ifndef ISC_SOCKADDR_LEN_T
#define ISC_SOCKADDR_LEN_T unsigned int
#endif

/*%
 * Define what the possible "soft" errors can be. These are non-fatal returns
 * of various network related functions, like recv() and so on.
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0. This is broken, but we have to
 * work around it here.
 *
 * Note that the argument is evaluated more than once.
 */
#define SOFT_ERROR(e) ((e) == EAGAIN || \
		       (e) == EWOULDBLOCK || \
		       (e) == EINTR || \
		       (e) == 0)

#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*!<
 * DLVL(90) -- Function entry/exit and other tracing.
 * DLVL(70) -- Socket "correctness" -- including returning of events, etc.
 * DLVL(60) -- Socket data send/receive
 * DLVL(50) -- Event tracing, including receiving/sending completion events.
 * DLVL(20) -- Socket creation/destruction.
 */
#define TRACE_LEVEL 90
#define CORRECTNESS_LEVEL 70
#define IOEVENT_LEVEL 60
#define EVENT_LEVEL 50
#define CREATION_LEVEL 20

#define TRACE DLVL(TRACE_LEVEL)
#define CORRECTNESS DLVL(CORRECTNESS_LEVEL)
#define IOEVENT DLVL(IOEVENT_LEVEL)
#define EVENT DLVL(EVENT_LEVEL)
#define CREATION DLVL(CREATION_LEVEL)

typedef isc_event_t intev_t;

#define SOCKET_MAGIC ISC_MAGIC('I', 'O', 'i', 'o')
#define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC)

/*!
 * IPv6 control information. If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
#ifndef USE_CMSG
#define USE_CMSG 1
#endif
#endif

/*%
 * NetBSD and FreeBSD can timestamp packets. XXXMLG Should we have
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#ifdef SO_TIMESTAMP
#ifndef USE_CMSG
#define USE_CMSG 1
#endif
#endif

/*%
 * The size to raise the receive buffer to (from BIND 8).
 */
#ifdef TUNE_LARGE
#ifdef sun
#define RCVBUFSIZE (1*1024*1024)
#else
#define RCVBUFSIZE (16*1024*1024)
#endif
#else
#define RCVBUFSIZE (32*1024)
#endif /* TUNE_LARGE */

/*%
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10

typedef struct isc__socket isc__socket_t;
typedef struct isc__socketmgr isc__socketmgr_t;

#define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)

/*%
 * Internal representation of a socket.  It embeds the public isc_socket_t
 * as 'common' and carries the descriptor, the queues of pending
 * send/receive/accept events, the state flag bits and the control-message
 * buffers.
 */
struct isc__socket {
	/* Not locked. */
	isc_socket_t common;
	isc__socketmgr_t *manager;
	isc_mutex_t lock;
	isc_sockettype_t type;
	const isc_statscounter_t *statsindex;

	/* Locked by socket lock. */
	ISC_LINK(isc__socket_t) link;
	unsigned int references;
	int fd;
	int pf;
	char name[16];
	void * tag;

	ISC_LIST(isc_socketevent_t) send_list;
	ISC_LIST(isc_socketevent_t) recv_list;
	ISC_LIST(isc_socket_newconnev_t) accept_list;
	isc_socket_connev_t *connect_ev;

	/*
	 * Internal events. Posted when a descriptor is readable or
	 * writable. These are statically allocated and never freed.
	 * They will be set to non-purgeable before use.
	 */
	intev_t readable_ev;
	intev_t writable_ev;

	isc_sockaddr_t peer_address; /* remote address */

	unsigned int pending_recv : 1,
		pending_send : 1,
		pending_accept : 1,
		listener : 1, /* listener socket */
		connected : 1,
		connecting : 1, /* connect pending */
		bound : 1, /* bound to local addr */
		dupped : 1,
		active : 1, /* currently active */
		pktdscp : 1; /* per packet dscp */

#ifdef ISC_NET_RECVOVERFLOW
	unsigned char overflow; /* used for MSG_TRUNC fake */
#endif

	char *recvcmsgbuf;
	ISC_SOCKADDR_LEN_T recvcmsgbuflen;
	char *sendcmsgbuf;
	ISC_SOCKADDR_LEN_T sendcmsgbuflen;

	/*
	 * NOTE(review): the fdwatch* members are presumably only meaningful
	 * for sockets created through isc__socket_fdwatchcreate() -- confirm
	 * against that function.
	 */
	void *fdwatcharg;
	isc_sockfdwatch_t fdwatchcb;
	int fdwatchflags;
	isc_task_t *fdwatchtask;
	unsigned int dscp;
};

#define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m) ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

/*%
 * Internal socket manager state.  It embeds the public isc_socketmgr_t as
 * 'common'; the multiplexer-specific members are selected at compile time
 * by USE_KQUEUE / USE_EPOLL / USE_DEVPOLL / USE_SELECT, followed by the
 * per-FD tables ('fds', 'fdstate') and the list of sockets.
 */
struct isc__socketmgr {
	/* Not locked. */
	isc_socketmgr_t common;
	isc_mem_t *mctx;
	isc_mutex_t lock;
	isc_mutex_t *fdlock;
	isc_stats_t *stats;
#ifdef USE_KQUEUE
	int kqueue_fd;
	int nevents;
	struct kevent *events;
#endif /* USE_KQUEUE */
#ifdef USE_EPOLL
	int epoll_fd;
	int nevents;
	struct epoll_event *events;
#endif /* USE_EPOLL */
#ifdef USE_DEVPOLL
	int devpoll_fd;
	isc_resourcevalue_t open_max;
	unsigned int calls;
	int nevents;
	struct pollfd *events;
#endif /* USE_DEVPOLL */
#ifdef USE_SELECT
	int fd_bufsize;
#endif /* USE_SELECT */
	unsigned int maxsocks;
#ifdef ISC_PLATFORM_USETHREADS
	int pipe_fds[2];
#endif

	/* Locked by fdlock. */
	isc__socket_t **fds;
	int *fdstate;
#ifdef USE_DEVPOLL
	pollinfo_t *fdpollinfo;
#endif

	/* Locked by manager lock. */
	ISC_LIST(isc__socket_t) socklist;
#ifdef USE_SELECT
	fd_set *read_fds;
	fd_set *read_fds_copy;
	fd_set *write_fds;
	fd_set *write_fds_copy;
	int maxfd;
#endif /* USE_SELECT */
	int reserved; /* unlocked */
#ifdef USE_WATCHER_THREAD
	isc_thread_t watcher;
	isc_condition_t shutdown_ok;
#else /* USE_WATCHER_THREAD */
	unsigned int refs;
#endif /* USE_WATCHER_THREAD */
	int maxudp;
};

#ifdef USE_SHARED_MANAGER
static isc__socketmgr_t *socketmgr = NULL;
#endif /* USE_SHARED_MANAGER */

/* Per-descriptor states kept in the manager's fdstate[] table. */
#define CLOSED 0 /* this one must be zero */
#define MANAGED 1
#define CLOSE_PENDING 2

/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_NET_RECVOVERFLOW
# define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER)
#endif

static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
				  isc_sockettype_t type,
				  isc_socket_t **socketp,
				  isc_socket_t *dup_socket);
/*
 * Forward declarations of the file-local (static) helpers.
 */
static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
static void free_socket(isc__socket_t **);
static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
				    isc__socket_t **);
static void destroy(isc__socket_t **);
static void internal_accept(isc_task_t *, isc_event_t *);
static void internal_connect(isc_task_t *, isc_event_t *);
static void internal_recv(isc_task_t *, isc_event_t *);
static void internal_send(isc_task_t *, isc_event_t *);
static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
static void build_msghdr_send(isc__socket_t *, isc_socketevent_t *,
			      struct msghdr *, struct iovec *, size_t *);
static void build_msghdr_recv(isc__socket_t *, isc_socketevent_t *,
			      struct msghdr *, struct iovec *, size_t *);
#ifdef USE_WATCHER_THREAD
static isc_boolean_t process_ctlfd(isc__socketmgr_t *manager);
#endif
static void setdscp(isc__socket_t *sock, isc_dscp_t dscp);

/*%
 * The following are intended for internal use (indicated by "isc__"
 * prefix) but are not declared as static, allowing direct access from
 * unit tests etc.
 */

isc_result_t
isc__socket_open(isc_socket_t *sock0);
isc_result_t
isc__socket_close(isc_socket_t *sock0);
isc_result_t
isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
		   isc_socket_t **socketp);
void
isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
void
isc__socket_detach(isc_socket_t **socketp);
isc_result_t
isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
		  unsigned int minimum, isc_task_t *task,
		  isc_taskaction_t action, void *arg);
isc_result_t
isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
		 unsigned int minimum, isc_task_t *task,
		 isc_taskaction_t action, void *arg);
isc_result_t
isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
		  unsigned int minimum, isc_task_t *task,
		  isc_socketevent_t *event, unsigned int flags);
isc_result_t
isc__socket_send(isc_socket_t *sock, isc_region_t *region,
		 isc_task_t *task, isc_taskaction_t action, void *arg);
isc_result_t
isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
		   isc_task_t *task, isc_taskaction_t action, void *arg,
		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
isc_result_t
isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
		  isc_task_t *task, isc_taskaction_t action, void *arg);
isc_result_t
isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
		    isc_task_t *task, isc_taskaction_t action, void *arg,
		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
isc_result_t
isc__socket_sendtov2(isc_socket_t *sock, isc_bufferlist_t *buflist,
		     isc_task_t *task, isc_taskaction_t action, void *arg,
		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
		     unsigned int flags);
isc_result_t
isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
		    isc_task_t *task,
		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
		    isc_socketevent_t *event, unsigned int flags);
isc_socketevent_t *
isc_socket_socketevent(isc_mem_t *mctx, void *sender,
		       isc_eventtype_t eventtype, isc_taskaction_t action,
		       void *arg);

void
isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active);
isc_result_t
isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
		     isc_uint32_t owner, isc_uint32_t group);
isc_result_t
isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
		 unsigned int options);
isc_result_t
isc__socket_filter(isc_socket_t *sock, const char *filter);
isc_result_t
isc__socket_listen(isc_socket_t *sock, unsigned int backlog);
isc_result_t
isc__socket_accept(isc_socket_t *sock,
		   isc_task_t *task, isc_taskaction_t action, void *arg);
isc_result_t
isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
		    isc_task_t *task, isc_taskaction_t action,
		    void *arg);
isc_result_t
isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp);
isc_result_t
isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp);
void
isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
isc_sockettype_t
isc__socket_gettype(isc_socket_t *sock);
isc_boolean_t
isc__socket_isbound(isc_socket_t *sock);
void
isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes);
void
isc__socket_dscp(isc_socket_t *sock, isc_dscp_t dscp);
isc_result_t
isc__socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
			  isc_sockfdwatch_t callback, void *cbarg,
			  isc_task_t *task, isc_socket_t **socketp);
isc_result_t
isc__socket_fdwatchpoke(isc_socket_t *sock, int flags);
isc_result_t
isc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp);
int
isc__socket_getfd(isc_socket_t *sock);

isc_result_t
isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp);
isc_result_t 601 isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp, 602 unsigned int maxsocks); 603 isc_result_t 604 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp); 605 void 606 isc_socketmgr_setstats(isc_socketmgr_t *manager0, isc_stats_t *stats); 607 void 608 isc__socketmgr_destroy(isc_socketmgr_t **managerp); 609 void 610 isc__socket_setname(isc_socket_t *socket0, const char *name, void *tag); 611 const char * 612 isc__socket_getname(isc_socket_t *socket0); 613 void * 614 isc__socket_gettag(isc_socket_t *socket0); 615 616 #ifdef HAVE_LIBXML2 617 void 618 isc__socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer); 619 #endif 620 #ifdef HAVE_JSON 621 isc_result_t 622 isc__socketmgr_renderjson(isc_socketmgr_t *mgr0, json_object *stats); 623 #endif 624 625 static struct { 626 isc_socketmethods_t methods; 627 628 /*% 629 * The following are defined just for avoiding unused static functions. 630 */ 631 void *recvv, *send, *sendv, *sendto2, *cleanunix, *permunix, *filter, 632 *listen, *accept, *getpeername, *isbound; 633 } socketmethods = { 634 { 635 isc__socket_attach, 636 isc__socket_detach, 637 isc__socket_bind, 638 isc__socket_sendto, 639 isc__socket_sendto2, 640 isc__socket_connect, 641 isc__socket_recv, 642 isc__socket_recv2, 643 isc__socket_cancel, 644 isc__socket_getsockname, 645 isc__socket_gettype, 646 isc__socket_ipv6only, 647 isc__socket_fdwatchpoke, 648 isc__socket_dup, 649 isc__socket_getfd, 650 isc__socket_dscp 651 }, 652 (void *)isc__socket_recvv, (void *)isc__socket_send, 653 (void *)isc__socket_sendv, (void *)isc__socket_sendto2, 654 (void *)isc__socket_cleanunix, (void *)isc__socket_permunix, 655 (void *)isc__socket_filter, (void *)isc__socket_listen, 656 (void *)isc__socket_accept, (void *)isc__socket_getpeername, 657 (void *)isc__socket_isbound 658 }; 659 660 static isc_socketmgrmethods_t socketmgrmethods = { 661 isc__socketmgr_destroy, 662 isc__socket_create, 663 
isc__socket_fdwatchcreate 664 }; 665 666 #define SELECT_POKE_SHUTDOWN (-1) 667 #define SELECT_POKE_NOTHING (-2) 668 #define SELECT_POKE_READ (-3) 669 #define SELECT_POKE_ACCEPT (-3) /*%< Same as _READ */ 670 #define SELECT_POKE_WRITE (-4) 671 #define SELECT_POKE_CONNECT (-4) /*%< Same as _WRITE */ 672 #define SELECT_POKE_CLOSE (-5) 673 674 #define SOCK_DEAD(s) ((s)->references == 0) 675 676 /*% 677 * Shortcut index arrays to get access to statistics counters. 678 */ 679 enum { 680 STATID_OPEN = 0, 681 STATID_OPENFAIL = 1, 682 STATID_CLOSE = 2, 683 STATID_BINDFAIL = 3, 684 STATID_CONNECTFAIL = 4, 685 STATID_CONNECT = 5, 686 STATID_ACCEPTFAIL = 6, 687 STATID_ACCEPT = 7, 688 STATID_SENDFAIL = 8, 689 STATID_RECVFAIL = 9, 690 STATID_ACTIVE = 10 691 }; 692 static const isc_statscounter_t udp4statsindex[] = { 693 isc_sockstatscounter_udp4open, 694 isc_sockstatscounter_udp4openfail, 695 isc_sockstatscounter_udp4close, 696 isc_sockstatscounter_udp4bindfail, 697 isc_sockstatscounter_udp4connectfail, 698 isc_sockstatscounter_udp4connect, 699 -1, 700 -1, 701 isc_sockstatscounter_udp4sendfail, 702 isc_sockstatscounter_udp4recvfail, 703 isc_sockstatscounter_udp4active 704 }; 705 static const isc_statscounter_t udp6statsindex[] = { 706 isc_sockstatscounter_udp6open, 707 isc_sockstatscounter_udp6openfail, 708 isc_sockstatscounter_udp6close, 709 isc_sockstatscounter_udp6bindfail, 710 isc_sockstatscounter_udp6connectfail, 711 isc_sockstatscounter_udp6connect, 712 -1, 713 -1, 714 isc_sockstatscounter_udp6sendfail, 715 isc_sockstatscounter_udp6recvfail, 716 isc_sockstatscounter_udp6active 717 }; 718 static const isc_statscounter_t tcp4statsindex[] = { 719 isc_sockstatscounter_tcp4open, 720 isc_sockstatscounter_tcp4openfail, 721 isc_sockstatscounter_tcp4close, 722 isc_sockstatscounter_tcp4bindfail, 723 isc_sockstatscounter_tcp4connectfail, 724 isc_sockstatscounter_tcp4connect, 725 isc_sockstatscounter_tcp4acceptfail, 726 isc_sockstatscounter_tcp4accept, 727 
isc_sockstatscounter_tcp4sendfail, 728 isc_sockstatscounter_tcp4recvfail, 729 isc_sockstatscounter_tcp4active 730 }; 731 static const isc_statscounter_t tcp6statsindex[] = { 732 isc_sockstatscounter_tcp6open, 733 isc_sockstatscounter_tcp6openfail, 734 isc_sockstatscounter_tcp6close, 735 isc_sockstatscounter_tcp6bindfail, 736 isc_sockstatscounter_tcp6connectfail, 737 isc_sockstatscounter_tcp6connect, 738 isc_sockstatscounter_tcp6acceptfail, 739 isc_sockstatscounter_tcp6accept, 740 isc_sockstatscounter_tcp6sendfail, 741 isc_sockstatscounter_tcp6recvfail, 742 isc_sockstatscounter_tcp6active 743 }; 744 static const isc_statscounter_t unixstatsindex[] = { 745 isc_sockstatscounter_unixopen, 746 isc_sockstatscounter_unixopenfail, 747 isc_sockstatscounter_unixclose, 748 isc_sockstatscounter_unixbindfail, 749 isc_sockstatscounter_unixconnectfail, 750 isc_sockstatscounter_unixconnect, 751 isc_sockstatscounter_unixacceptfail, 752 isc_sockstatscounter_unixaccept, 753 isc_sockstatscounter_unixsendfail, 754 isc_sockstatscounter_unixrecvfail, 755 isc_sockstatscounter_unixactive 756 }; 757 static const isc_statscounter_t fdwatchstatsindex[] = { 758 -1, 759 -1, 760 isc_sockstatscounter_fdwatchclose, 761 isc_sockstatscounter_fdwatchbindfail, 762 isc_sockstatscounter_fdwatchconnectfail, 763 isc_sockstatscounter_fdwatchconnect, 764 -1, 765 -1, 766 isc_sockstatscounter_fdwatchsendfail, 767 isc_sockstatscounter_fdwatchrecvfail, 768 -1 769 }; 770 static const isc_statscounter_t rawstatsindex[] = { 771 isc_sockstatscounter_rawopen, 772 isc_sockstatscounter_rawopenfail, 773 isc_sockstatscounter_rawclose, 774 -1, 775 -1, 776 -1, 777 -1, 778 -1, 779 -1, 780 isc_sockstatscounter_rawrecvfail, 781 isc_sockstatscounter_rawactive 782 }; 783 784 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) || \ 785 defined(USE_WATCHER_THREAD) 786 static void 787 manager_log(isc__socketmgr_t *sockmgr, 788 isc_logcategory_t *category, isc_logmodule_t *module, int level, 789 const char *fmt, 
...) ISC_FORMAT_PRINTF(5, 6); 790 static void 791 manager_log(isc__socketmgr_t *sockmgr, 792 isc_logcategory_t *category, isc_logmodule_t *module, int level, 793 const char *fmt, ...) 794 { 795 char msgbuf[2048]; 796 va_list ap; 797 798 if (! isc_log_wouldlog(isc_lctx, level)) 799 return; 800 801 va_start(ap, fmt); 802 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap); 803 va_end(ap); 804 805 isc_log_write(isc_lctx, category, module, level, 806 "sockmgr %p: %s", sockmgr, msgbuf); 807 } 808 #endif 809 810 static void 811 socket_log(isc__socket_t *sock, isc_sockaddr_t *address, 812 isc_logcategory_t *category, isc_logmodule_t *module, int level, 813 isc_msgcat_t *msgcat, int msgset, int message, 814 const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10); 815 static void 816 socket_log(isc__socket_t *sock, isc_sockaddr_t *address, 817 isc_logcategory_t *category, isc_logmodule_t *module, int level, 818 isc_msgcat_t *msgcat, int msgset, int message, 819 const char *fmt, ...) 820 { 821 char msgbuf[2048]; 822 char peerbuf[ISC_SOCKADDR_FORMATSIZE]; 823 va_list ap; 824 825 if (! isc_log_wouldlog(isc_lctx, level)) 826 return; 827 828 va_start(ap, fmt); 829 vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap); 830 va_end(ap); 831 832 if (address == NULL) { 833 isc_log_iwrite(isc_lctx, category, module, level, 834 msgcat, msgset, message, 835 "socket %p: %s", sock, msgbuf); 836 } else { 837 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf)); 838 isc_log_iwrite(isc_lctx, category, module, level, 839 msgcat, msgset, message, 840 "socket %p %s: %s", sock, peerbuf, msgbuf); 841 } 842 } 843 844 #if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \ 845 defined(USE_CMSG) && defined(IPV6_RECVPKTINFO) 846 /* 847 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by 848 * setting IPV6_V6ONLY. 
849 */ 850 static void 851 FIX_IPV6_RECVPKTINFO(isc__socket_t *sock) 852 { 853 char strbuf[ISC_STRERRORSIZE]; 854 int on = 1; 855 856 if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp) 857 return; 858 859 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, 860 (void *)&on, sizeof(on)) < 0) { 861 862 isc__strerror(errno, strbuf, sizeof(strbuf)); 863 UNEXPECTED_ERROR(__FILE__, __LINE__, 864 "setsockopt(%d, IPV6_RECVPKTINFO) " 865 "%s: %s", sock->fd, 866 isc_msgcat_get(isc_msgcat, 867 ISC_MSGSET_GENERAL, 868 ISC_MSG_FAILED, 869 "failed"), 870 strbuf); 871 } 872 } 873 #else 874 #define FIX_IPV6_RECVPKTINFO(sock) (void)0 875 #endif 876 877 /*% 878 * Increment socket-related statistics counters. 879 */ 880 static inline void 881 inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) { 882 REQUIRE(counterid != -1); 883 884 if (stats != NULL) 885 isc_stats_increment(stats, counterid); 886 } 887 888 /*% 889 * Decrement socket-related statistics counters. 890 */ 891 static inline void 892 dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) { 893 REQUIRE(counterid != -1); 894 895 if (stats != NULL) 896 isc_stats_decrement(stats, counterid); 897 } 898 899 static inline isc_result_t 900 watch_fd(isc__socketmgr_t *manager, int fd, int msg) { 901 isc_result_t result = ISC_R_SUCCESS; 902 903 #ifdef USE_KQUEUE 904 struct kevent evchange; 905 906 memset(&evchange, 0, sizeof(evchange)); 907 if (msg == SELECT_POKE_READ) 908 evchange.filter = EVFILT_READ; 909 else 910 evchange.filter = EVFILT_WRITE; 911 evchange.flags = EV_ADD; 912 evchange.ident = fd; 913 if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) 914 result = isc__errno2result(errno); 915 916 return (result); 917 #elif defined(USE_EPOLL) 918 struct epoll_event event; 919 920 if (msg == SELECT_POKE_READ) 921 event.events = EPOLLIN; 922 else 923 event.events = EPOLLOUT; 924 memset(&event.data, 0, sizeof(event.data)); 925 event.data.fd = fd; 926 if (epoll_ctl(manager->epoll_fd, 
EPOLL_CTL_ADD, fd, &event) == -1 && 927 errno != EEXIST) { 928 result = isc__errno2result(errno); 929 } 930 931 return (result); 932 #elif defined(USE_DEVPOLL) 933 struct pollfd pfd; 934 int lockid = FDLOCK_ID(fd); 935 936 memset(&pfd, 0, sizeof(pfd)); 937 if (msg == SELECT_POKE_READ) 938 pfd.events = POLLIN; 939 else 940 pfd.events = POLLOUT; 941 pfd.fd = fd; 942 pfd.revents = 0; 943 LOCK(&manager->fdlock[lockid]); 944 if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1) 945 result = isc__errno2result(errno); 946 else { 947 if (msg == SELECT_POKE_READ) 948 manager->fdpollinfo[fd].want_read = 1; 949 else 950 manager->fdpollinfo[fd].want_write = 1; 951 } 952 UNLOCK(&manager->fdlock[lockid]); 953 954 return (result); 955 #elif defined(USE_SELECT) 956 LOCK(&manager->lock); 957 if (msg == SELECT_POKE_READ) 958 FD_SET(fd, manager->read_fds); 959 if (msg == SELECT_POKE_WRITE) 960 FD_SET(fd, manager->write_fds); 961 UNLOCK(&manager->lock); 962 963 return (result); 964 #endif 965 } 966 967 static inline isc_result_t 968 unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) { 969 isc_result_t result = ISC_R_SUCCESS; 970 971 #ifdef USE_KQUEUE 972 struct kevent evchange; 973 974 memset(&evchange, 0, sizeof(evchange)); 975 if (msg == SELECT_POKE_READ) 976 evchange.filter = EVFILT_READ; 977 else 978 evchange.filter = EVFILT_WRITE; 979 evchange.flags = EV_DELETE; 980 evchange.ident = fd; 981 if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) 982 result = isc__errno2result(errno); 983 984 return (result); 985 #elif defined(USE_EPOLL) 986 struct epoll_event event; 987 988 if (msg == SELECT_POKE_READ) 989 event.events = EPOLLIN; 990 else 991 event.events = EPOLLOUT; 992 memset(&event.data, 0, sizeof(event.data)); 993 event.data.fd = fd; 994 if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 && 995 errno != ENOENT) { 996 char strbuf[ISC_STRERRORSIZE]; 997 isc__strerror(errno, strbuf, sizeof(strbuf)); 998 UNEXPECTED_ERROR(__FILE__, __LINE__, 999 
"epoll_ctl(DEL), %d: %s", fd, strbuf); 1000 result = ISC_R_UNEXPECTED; 1001 } 1002 return (result); 1003 #elif defined(USE_DEVPOLL) 1004 struct pollfd pfds[2]; 1005 size_t writelen = sizeof(pfds[0]); 1006 int lockid = FDLOCK_ID(fd); 1007 1008 memset(pfds, 0, sizeof(pfds)); 1009 pfds[0].events = POLLREMOVE; 1010 pfds[0].fd = fd; 1011 1012 /* 1013 * Canceling read or write polling via /dev/poll is tricky. Since it 1014 * only provides a way of canceling per FD, we may need to re-poll the 1015 * socket for the other operation. 1016 */ 1017 LOCK(&manager->fdlock[lockid]); 1018 if (msg == SELECT_POKE_READ && 1019 manager->fdpollinfo[fd].want_write == 1) { 1020 pfds[1].events = POLLOUT; 1021 pfds[1].fd = fd; 1022 writelen += sizeof(pfds[1]); 1023 } 1024 if (msg == SELECT_POKE_WRITE && 1025 manager->fdpollinfo[fd].want_read == 1) { 1026 pfds[1].events = POLLIN; 1027 pfds[1].fd = fd; 1028 writelen += sizeof(pfds[1]); 1029 } 1030 1031 if (write(manager->devpoll_fd, pfds, writelen) == -1) 1032 result = isc__errno2result(errno); 1033 else { 1034 if (msg == SELECT_POKE_READ) 1035 manager->fdpollinfo[fd].want_read = 0; 1036 else 1037 manager->fdpollinfo[fd].want_write = 0; 1038 } 1039 UNLOCK(&manager->fdlock[lockid]); 1040 1041 return (result); 1042 #elif defined(USE_SELECT) 1043 LOCK(&manager->lock); 1044 if (msg == SELECT_POKE_READ) 1045 FD_CLR(fd, manager->read_fds); 1046 else if (msg == SELECT_POKE_WRITE) 1047 FD_CLR(fd, manager->write_fds); 1048 UNLOCK(&manager->lock); 1049 1050 return (result); 1051 #endif 1052 } 1053 1054 static void 1055 wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) { 1056 isc_result_t result; 1057 int lockid = FDLOCK_ID(fd); 1058 1059 /* 1060 * This is a wakeup on a socket. If the socket is not in the 1061 * process of being closed, start watching it for either reads 1062 * or writes. 
1063 */ 1064 1065 INSIST(fd >= 0 && fd < (int)manager->maxsocks); 1066 1067 if (msg == SELECT_POKE_CLOSE) { 1068 /* No one should be updating fdstate, so no need to lock it */ 1069 INSIST(manager->fdstate[fd] == CLOSE_PENDING); 1070 manager->fdstate[fd] = CLOSED; 1071 (void)unwatch_fd(manager, fd, SELECT_POKE_READ); 1072 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); 1073 (void)close(fd); 1074 return; 1075 } 1076 1077 LOCK(&manager->fdlock[lockid]); 1078 if (manager->fdstate[fd] == CLOSE_PENDING) { 1079 UNLOCK(&manager->fdlock[lockid]); 1080 1081 /* 1082 * We accept (and ignore) any error from unwatch_fd() as we are 1083 * closing the socket, hoping it doesn't leave dangling state in 1084 * the kernel. 1085 * Note that unwatch_fd() must be called after releasing the 1086 * fdlock; otherwise it could cause deadlock due to a lock order 1087 * reversal. 1088 */ 1089 (void)unwatch_fd(manager, fd, SELECT_POKE_READ); 1090 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); 1091 return; 1092 } 1093 if (manager->fdstate[fd] != MANAGED) { 1094 UNLOCK(&manager->fdlock[lockid]); 1095 return; 1096 } 1097 UNLOCK(&manager->fdlock[lockid]); 1098 1099 /* 1100 * Set requested bit. 1101 */ 1102 result = watch_fd(manager, fd, msg); 1103 if (result != ISC_R_SUCCESS) { 1104 /* 1105 * XXXJT: what should we do? Ignoring the failure of watching 1106 * a socket will make the application dysfunctional, but there 1107 * seems to be no reasonable recovery process. 1108 */ 1109 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 1110 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 1111 "failed to start watching FD (%d): %s", 1112 fd, isc_result_totext(result)); 1113 } 1114 } 1115 1116 #ifdef USE_WATCHER_THREAD 1117 /* 1118 * Poke the select loop when there is something for us to do. 1119 * The write is required (by POSIX) to complete. That is, we 1120 * will not get partial writes. 
1121 */ 1122 static void 1123 select_poke(isc__socketmgr_t *mgr, int fd, int msg) { 1124 int cc; 1125 int buf[2]; 1126 char strbuf[ISC_STRERRORSIZE]; 1127 1128 buf[0] = fd; 1129 buf[1] = msg; 1130 1131 do { 1132 cc = write(mgr->pipe_fds[1], buf, sizeof(buf)); 1133 #ifdef ENOSR 1134 /* 1135 * Treat ENOSR as EAGAIN but loop slowly as it is 1136 * unlikely to clear fast. 1137 */ 1138 if (cc < 0 && errno == ENOSR) { 1139 sleep(1); 1140 errno = EAGAIN; 1141 } 1142 #endif 1143 } while (cc < 0 && SOFT_ERROR(errno)); 1144 1145 if (cc < 0) { 1146 isc__strerror(errno, strbuf, sizeof(strbuf)); 1147 FATAL_ERROR(__FILE__, __LINE__, 1148 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, 1149 ISC_MSG_WRITEFAILED, 1150 "write() failed " 1151 "during watcher poke: %s"), 1152 strbuf); 1153 } 1154 1155 INSIST(cc == sizeof(buf)); 1156 } 1157 1158 /* 1159 * Read a message on the internal fd. 1160 */ 1161 static void 1162 select_readmsg(isc__socketmgr_t *mgr, int *fd, int *msg) { 1163 int buf[2]; 1164 int cc; 1165 char strbuf[ISC_STRERRORSIZE]; 1166 1167 cc = read(mgr->pipe_fds[0], buf, sizeof(buf)); 1168 if (cc < 0) { 1169 *msg = SELECT_POKE_NOTHING; 1170 *fd = -1; /* Silence compiler. */ 1171 if (SOFT_ERROR(errno)) 1172 return; 1173 1174 isc__strerror(errno, strbuf, sizeof(strbuf)); 1175 FATAL_ERROR(__FILE__, __LINE__, 1176 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, 1177 ISC_MSG_READFAILED, 1178 "read() failed " 1179 "during watcher poke: %s"), 1180 strbuf); 1181 } 1182 INSIST(cc == sizeof(buf)); 1183 1184 *fd = buf[0]; 1185 *msg = buf[1]; 1186 } 1187 #else /* USE_WATCHER_THREAD */ 1188 /* 1189 * Update the state of the socketmgr when something changes. 1190 */ 1191 static void 1192 select_poke(isc__socketmgr_t *manager, int fd, int msg) { 1193 if (msg == SELECT_POKE_SHUTDOWN) 1194 return; 1195 else if (fd >= 0) 1196 wakeup_socket(manager, fd, msg); 1197 return; 1198 } 1199 #endif /* USE_WATCHER_THREAD */ 1200 1201 /* 1202 * Make a fd non-blocking. 
1203 */ 1204 static isc_result_t 1205 make_nonblock(int fd) { 1206 int ret; 1207 int flags; 1208 char strbuf[ISC_STRERRORSIZE]; 1209 #ifdef USE_FIONBIO_IOCTL 1210 int on = 1; 1211 1212 ret = ioctl(fd, FIONBIO, (char *)&on); 1213 #else 1214 flags = fcntl(fd, F_GETFL, 0); 1215 flags |= PORT_NONBLOCK; 1216 ret = fcntl(fd, F_SETFL, flags); 1217 #endif 1218 1219 if (ret == -1) { 1220 isc__strerror(errno, strbuf, sizeof(strbuf)); 1221 UNEXPECTED_ERROR(__FILE__, __LINE__, 1222 #ifdef USE_FIONBIO_IOCTL 1223 "ioctl(%d, FIONBIO, &on): %s", fd, 1224 #else 1225 "fcntl(%d, F_SETFL, %d): %s", fd, flags, 1226 #endif 1227 strbuf); 1228 1229 return (ISC_R_UNEXPECTED); 1230 } 1231 1232 return (ISC_R_SUCCESS); 1233 } 1234 1235 #ifdef USE_CMSG 1236 /* 1237 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE. 1238 * In order to ensure as much portability as possible, we provide wrapper 1239 * functions of these macros. 1240 * Note that cmsg_space() could run slow on OSes that do not have 1241 * CMSG_SPACE. 1242 */ 1243 static inline ISC_SOCKADDR_LEN_T 1244 cmsg_len(ISC_SOCKADDR_LEN_T len) { 1245 #ifdef CMSG_LEN 1246 return (CMSG_LEN(len)); 1247 #else 1248 ISC_SOCKADDR_LEN_T hdrlen; 1249 1250 /* 1251 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA 1252 * is correct. 1253 */ 1254 hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL)); 1255 return (hdrlen + len); 1256 #endif 1257 } 1258 1259 static inline ISC_SOCKADDR_LEN_T 1260 cmsg_space(ISC_SOCKADDR_LEN_T len) { 1261 #ifdef CMSG_SPACE 1262 return (CMSG_SPACE(len)); 1263 #else 1264 struct msghdr msg; 1265 struct cmsghdr *cmsgp; 1266 /* 1267 * XXX: The buffer length is an ad-hoc value, but should be enough 1268 * in a practical sense. 
1269 */ 1270 char dummybuf[sizeof(struct cmsghdr) + 1024]; 1271 1272 memset(&msg, 0, sizeof(msg)); 1273 msg.msg_control = dummybuf; 1274 msg.msg_controllen = sizeof(dummybuf); 1275 1276 cmsgp = (struct cmsghdr *)dummybuf; 1277 cmsgp->cmsg_len = cmsg_len(len); 1278 1279 cmsgp = CMSG_NXTHDR(&msg, cmsgp); 1280 if (cmsgp != NULL) 1281 return ((char *)cmsgp - (char *)msg.msg_control); 1282 else 1283 return (0); 1284 #endif 1285 } 1286 #endif /* USE_CMSG */ 1287 1288 /* 1289 * Process control messages received on a socket. 1290 */ 1291 static void 1292 process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) { 1293 #ifdef USE_CMSG 1294 struct cmsghdr *cmsgp; 1295 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO 1296 struct in6_pktinfo *pktinfop; 1297 #endif 1298 #ifdef SO_TIMESTAMP 1299 void *timevalp; 1300 #endif 1301 #endif 1302 1303 /* 1304 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined. 1305 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined. 1306 * They are all here, outside of the CPP tests, because it is 1307 * more consistent with the usual ISC coding style. 
1308 */ 1309 UNUSED(sock); 1310 UNUSED(msg); 1311 UNUSED(dev); 1312 1313 #ifdef ISC_NET_BSD44MSGHDR 1314 1315 #ifdef MSG_TRUNC 1316 if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC) 1317 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC; 1318 #endif 1319 1320 #ifdef MSG_CTRUNC 1321 if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC) 1322 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC; 1323 #endif 1324 1325 #ifndef USE_CMSG 1326 return; 1327 #else 1328 if (msg->msg_controllen == 0U || msg->msg_control == NULL) 1329 return; 1330 1331 #ifdef SO_TIMESTAMP 1332 timevalp = NULL; 1333 #endif 1334 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO 1335 pktinfop = NULL; 1336 #endif 1337 1338 cmsgp = CMSG_FIRSTHDR(msg); 1339 while (cmsgp != NULL) { 1340 socket_log(sock, NULL, TRACE, 1341 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG, 1342 "processing cmsg %p", cmsgp); 1343 1344 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO 1345 if (cmsgp->cmsg_level == IPPROTO_IPV6 1346 && cmsgp->cmsg_type == IPV6_PKTINFO) { 1347 1348 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp); 1349 memmove(&dev->pktinfo, pktinfop, 1350 sizeof(struct in6_pktinfo)); 1351 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO; 1352 socket_log(sock, NULL, TRACE, 1353 isc_msgcat, ISC_MSGSET_SOCKET, 1354 ISC_MSG_IFRECEIVED, 1355 "interface received on ifindex %u", 1356 dev->pktinfo.ipi6_ifindex); 1357 if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) 1358 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST; 1359 goto next; 1360 } 1361 #endif 1362 1363 #ifdef SO_TIMESTAMP 1364 if (cmsgp->cmsg_level == SOL_SOCKET 1365 && cmsgp->cmsg_type == SCM_TIMESTAMP) { 1366 struct timeval tv; 1367 timevalp = CMSG_DATA(cmsgp); 1368 memmove(&tv, timevalp, sizeof(tv)); 1369 dev->timestamp.seconds = tv.tv_sec; 1370 dev->timestamp.nanoseconds = tv.tv_usec * 1000; 1371 dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP; 1372 goto next; 1373 } 1374 #endif 1375 1376 #ifdef IPV6_TCLASS 1377 if (cmsgp->cmsg_level == IPPROTO_IPV6 1378 && cmsgp->cmsg_type == IPV6_TCLASS) { 1379 dev->dscp 
= *(int *)CMSG_DATA(cmsgp); 1380 dev->dscp >>= 2; 1381 dev->attributes |= ISC_SOCKEVENTATTR_DSCP; 1382 goto next; 1383 } 1384 #endif 1385 1386 #ifdef IP_TOS 1387 if (cmsgp->cmsg_level == IPPROTO_IP 1388 && (cmsgp->cmsg_type == IP_TOS 1389 #ifdef IP_RECVTOS 1390 || cmsgp->cmsg_type == IP_RECVTOS 1391 #endif 1392 )) { 1393 dev->dscp = (int) *(unsigned char *)CMSG_DATA(cmsgp); 1394 dev->dscp >>= 2; 1395 dev->attributes |= ISC_SOCKEVENTATTR_DSCP; 1396 goto next; 1397 } 1398 #endif 1399 next: 1400 cmsgp = CMSG_NXTHDR(msg, cmsgp); 1401 } 1402 #endif /* USE_CMSG */ 1403 1404 #endif /* ISC_NET_BSD44MSGHDR */ 1405 } 1406 1407 /* 1408 * Construct an iov array and attach it to the msghdr passed in. This is 1409 * the SEND constructor, which will use the used region of the buffer 1410 * (if using a buffer list) or will use the internal region (if a single 1411 * buffer I/O is requested). 1412 * 1413 * Nothing can be NULL, and the done event must list at least one buffer 1414 * on the buffer linked list for this function to be meaningful. 1415 * 1416 * If write_countp != NULL, *write_countp will hold the number of bytes 1417 * this transaction can send. 1418 */ 1419 static void 1420 build_msghdr_send(isc__socket_t *sock, isc_socketevent_t *dev, 1421 struct msghdr *msg, struct iovec *iov, size_t *write_countp) 1422 { 1423 unsigned int iovcount; 1424 isc_buffer_t *buffer; 1425 isc_region_t used; 1426 size_t write_count; 1427 size_t skip_count; 1428 #ifdef ISC_NET_BSD44MSGHDR 1429 struct cmsghdr *cmsgp; 1430 #endif 1431 1432 memset(msg, 0, sizeof(*msg)); 1433 1434 if (!sock->connected) { 1435 msg->msg_name = (void *)&dev->address.type.sa; 1436 msg->msg_namelen = dev->address.length; 1437 } else { 1438 msg->msg_name = NULL; 1439 msg->msg_namelen = 0; 1440 } 1441 1442 buffer = ISC_LIST_HEAD(dev->bufferlist); 1443 write_count = 0; 1444 iovcount = 0; 1445 1446 /* 1447 * Single buffer I/O? Skip what we've done so far in this region. 
1448 */ 1449 if (buffer == NULL) { 1450 write_count = dev->region.length - dev->n; 1451 iov[0].iov_base = (void *)(dev->region.base + dev->n); 1452 iov[0].iov_len = write_count; 1453 iovcount = 1; 1454 1455 goto config; 1456 } 1457 1458 /* 1459 * Multibuffer I/O. 1460 * Skip the data in the buffer list that we have already written. 1461 */ 1462 skip_count = dev->n; 1463 while (buffer != NULL) { 1464 REQUIRE(ISC_BUFFER_VALID(buffer)); 1465 if (skip_count < isc_buffer_usedlength(buffer)) 1466 break; 1467 skip_count -= isc_buffer_usedlength(buffer); 1468 buffer = ISC_LIST_NEXT(buffer, link); 1469 } 1470 1471 while (buffer != NULL) { 1472 INSIST(iovcount < MAXSCATTERGATHER_SEND); 1473 1474 isc_buffer_usedregion(buffer, &used); 1475 1476 if (used.length > 0) { 1477 iov[iovcount].iov_base = (void *)(used.base 1478 + skip_count); 1479 iov[iovcount].iov_len = used.length - skip_count; 1480 write_count += (used.length - skip_count); 1481 skip_count = 0; 1482 iovcount++; 1483 } 1484 buffer = ISC_LIST_NEXT(buffer, link); 1485 } 1486 1487 INSIST(skip_count == 0U); 1488 1489 config: 1490 msg->msg_iov = iov; 1491 msg->msg_iovlen = iovcount; 1492 1493 #ifdef ISC_NET_BSD44MSGHDR 1494 msg->msg_control = NULL; 1495 msg->msg_controllen = 0; 1496 msg->msg_flags = 0; 1497 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO) 1498 if ((sock->type == isc_sockettype_udp) 1499 && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) { 1500 #if defined(IPV6_USE_MIN_MTU) 1501 int use_min_mtu = 1; /* -1, 0, 1 */ 1502 #endif 1503 struct in6_pktinfo *pktinfop; 1504 1505 socket_log(sock, NULL, TRACE, 1506 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA, 1507 "sendto pktinfo data, ifindex %u", 1508 dev->pktinfo.ipi6_ifindex); 1509 1510 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo)); 1511 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen); 1512 msg->msg_control = (void *)sock->sendcmsgbuf; 1513 1514 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf; 1515 cmsgp->cmsg_level = 
IPPROTO_IPV6; 1516 cmsgp->cmsg_type = IPV6_PKTINFO; 1517 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo)); 1518 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp); 1519 memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo)); 1520 #if defined(IPV6_USE_MIN_MTU) 1521 /* 1522 * Set IPV6_USE_MIN_MTU as a per packet option as FreeBSD 1523 * ignores setsockopt(IPV6_USE_MIN_MTU) when IPV6_PKTINFO 1524 * is used. 1525 */ 1526 cmsgp = (struct cmsghdr *)(sock->sendcmsgbuf + 1527 msg->msg_controllen); 1528 msg->msg_controllen += cmsg_space(sizeof(use_min_mtu)); 1529 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen); 1530 1531 cmsgp->cmsg_level = IPPROTO_IPV6; 1532 cmsgp->cmsg_type = IPV6_USE_MIN_MTU; 1533 cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu)); 1534 memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu)); 1535 #endif 1536 } 1537 1538 if (isc_dscp_check_value > -1) { 1539 if (sock->type == isc_sockettype_udp) 1540 INSIST((int)dev->dscp == isc_dscp_check_value); 1541 else if (sock->type == isc_sockettype_tcp) 1542 INSIST((int)sock->dscp == isc_dscp_check_value); 1543 } 1544 1545 if ((sock->type == isc_sockettype_udp) && 1546 ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0)) 1547 { 1548 int dscp = (dev->dscp << 2) & 0xff; 1549 1550 INSIST(dev->dscp < 0x40); 1551 1552 #ifdef IP_TOS 1553 if (sock->pf == AF_INET && sock->pktdscp) { 1554 cmsgp = (struct cmsghdr *)(sock->sendcmsgbuf + 1555 msg->msg_controllen); 1556 msg->msg_control = (void *)sock->sendcmsgbuf; 1557 msg->msg_controllen += cmsg_space(sizeof(dscp)); 1558 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen); 1559 1560 cmsgp->cmsg_level = IPPROTO_IP; 1561 cmsgp->cmsg_type = IP_TOS; 1562 cmsgp->cmsg_len = cmsg_len(sizeof(char)); 1563 *(unsigned char*)CMSG_DATA(cmsgp) = dscp; 1564 } else if (sock->pf == AF_INET && sock->dscp != dev->dscp) { 1565 if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS, 1566 (void *)&dscp, sizeof(int)) < 0) 1567 { 1568 char strbuf[ISC_STRERRORSIZE]; 1569 isc__strerror(errno, 
strbuf, sizeof(strbuf)); 1570 UNEXPECTED_ERROR(__FILE__, __LINE__, 1571 "setsockopt(%d, IP_TOS, %.02x)" 1572 " %s: %s", 1573 sock->fd, dscp >> 2, 1574 isc_msgcat_get(isc_msgcat, 1575 ISC_MSGSET_GENERAL, 1576 ISC_MSG_FAILED, 1577 "failed"), 1578 strbuf); 1579 } else 1580 sock->dscp = dscp; 1581 } 1582 #endif 1583 #if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS) 1584 if (sock->pf == AF_INET6 && sock->pktdscp) { 1585 cmsgp = (struct cmsghdr *)(sock->sendcmsgbuf + 1586 msg->msg_controllen); 1587 msg->msg_control = (void *)sock->sendcmsgbuf; 1588 msg->msg_controllen += cmsg_space(sizeof(dscp)); 1589 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen); 1590 1591 cmsgp->cmsg_level = IPPROTO_IPV6; 1592 cmsgp->cmsg_type = IPV6_TCLASS; 1593 cmsgp->cmsg_len = cmsg_len(sizeof(dscp)); 1594 memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp)); 1595 } else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) { 1596 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS, 1597 (void *)&dscp, sizeof(int)) < 0) { 1598 char strbuf[ISC_STRERRORSIZE]; 1599 isc__strerror(errno, strbuf, sizeof(strbuf)); 1600 UNEXPECTED_ERROR(__FILE__, __LINE__, 1601 "setsockopt(%d, IPV6_TCLASS, " 1602 "%.02x) %s: %s", 1603 sock->fd, dscp >> 2, 1604 isc_msgcat_get(isc_msgcat, 1605 ISC_MSGSET_GENERAL, 1606 ISC_MSG_FAILED, 1607 "failed"), 1608 strbuf); 1609 } else 1610 sock->dscp = dscp; 1611 } 1612 #endif 1613 } 1614 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */ 1615 #else /* ISC_NET_BSD44MSGHDR */ 1616 msg->msg_accrights = NULL; 1617 msg->msg_accrightslen = 0; 1618 #endif /* ISC_NET_BSD44MSGHDR */ 1619 1620 if (write_countp != NULL) 1621 *write_countp = write_count; 1622 } 1623 1624 /* 1625 * Construct an iov array and attach it to the msghdr passed in. This is 1626 * the RECV constructor, which will use the available region of the buffer 1627 * (if using a buffer list) or will use the internal region (if a single 1628 * buffer I/O is requested). 
1629 * 1630 * Nothing can be NULL, and the done event must list at least one buffer 1631 * on the buffer linked list for this function to be meaningful. 1632 * 1633 * If read_countp != NULL, *read_countp will hold the number of bytes 1634 * this transaction can receive. 1635 */ 1636 static void 1637 build_msghdr_recv(isc__socket_t *sock, isc_socketevent_t *dev, 1638 struct msghdr *msg, struct iovec *iov, size_t *read_countp) 1639 { 1640 unsigned int iovcount; 1641 isc_buffer_t *buffer; 1642 isc_region_t available; 1643 size_t read_count; 1644 1645 memset(msg, 0, sizeof(struct msghdr)); 1646 1647 if (sock->type == isc_sockettype_udp) { 1648 memset(&dev->address, 0, sizeof(dev->address)); 1649 #ifdef BROKEN_RECVMSG 1650 if (sock->pf == AF_INET) { 1651 msg->msg_name = (void *)&dev->address.type.sin; 1652 msg->msg_namelen = sizeof(dev->address.type.sin6); 1653 } else if (sock->pf == AF_INET6) { 1654 msg->msg_name = (void *)&dev->address.type.sin6; 1655 msg->msg_namelen = sizeof(dev->address.type.sin6); 1656 #ifdef ISC_PLATFORM_HAVESYSUNH 1657 } else if (sock->pf == AF_UNIX) { 1658 msg->msg_name = (void *)&dev->address.type.sunix; 1659 msg->msg_namelen = sizeof(dev->address.type.sunix); 1660 #endif 1661 } else { 1662 msg->msg_name = (void *)&dev->address.type.sa; 1663 msg->msg_namelen = sizeof(dev->address.type); 1664 } 1665 #else 1666 msg->msg_name = (void *)&dev->address.type.sa; 1667 msg->msg_namelen = sizeof(dev->address.type); 1668 #endif 1669 #ifdef ISC_NET_RECVOVERFLOW 1670 /* If needed, steal one iovec for overflow detection. */ 1671 maxiov--; 1672 #endif 1673 } else { /* TCP */ 1674 msg->msg_name = NULL; 1675 msg->msg_namelen = 0; 1676 dev->address = sock->peer_address; 1677 } 1678 1679 buffer = ISC_LIST_HEAD(dev->bufferlist); 1680 read_count = 0; 1681 1682 /* 1683 * Single buffer I/O? Skip what we've done so far in this region. 
1684 */ 1685 if (buffer == NULL) { 1686 read_count = dev->region.length - dev->n; 1687 iov[0].iov_base = (void *)(dev->region.base + dev->n); 1688 iov[0].iov_len = read_count; 1689 iovcount = 1; 1690 1691 goto config; 1692 } 1693 1694 /* 1695 * Multibuffer I/O. 1696 * Skip empty buffers. 1697 */ 1698 while (buffer != NULL) { 1699 REQUIRE(ISC_BUFFER_VALID(buffer)); 1700 if (isc_buffer_availablelength(buffer) != 0) 1701 break; 1702 buffer = ISC_LIST_NEXT(buffer, link); 1703 } 1704 1705 iovcount = 0; 1706 while (buffer != NULL) { 1707 INSIST(iovcount < MAXSCATTERGATHER_RECV); 1708 1709 isc_buffer_availableregion(buffer, &available); 1710 1711 if (available.length > 0) { 1712 iov[iovcount].iov_base = (void *)(available.base); 1713 iov[iovcount].iov_len = available.length; 1714 read_count += available.length; 1715 iovcount++; 1716 } 1717 buffer = ISC_LIST_NEXT(buffer, link); 1718 } 1719 1720 config: 1721 1722 /* 1723 * If needed, set up to receive that one extra byte. Note that 1724 * we know there is at least one iov left, since we stole it 1725 * at the top of this function. 
1726 */ 1727 #ifdef ISC_NET_RECVOVERFLOW 1728 if (sock->type == isc_sockettype_udp) { 1729 iov[iovcount].iov_base = (void *)(&sock->overflow); 1730 iov[iovcount].iov_len = 1; 1731 iovcount++; 1732 } 1733 #endif 1734 1735 msg->msg_iov = iov; 1736 msg->msg_iovlen = iovcount; 1737 1738 #ifdef ISC_NET_BSD44MSGHDR 1739 msg->msg_control = NULL; 1740 msg->msg_controllen = 0; 1741 msg->msg_flags = 0; 1742 #if defined(USE_CMSG) 1743 msg->msg_control = sock->recvcmsgbuf; 1744 msg->msg_controllen = sock->recvcmsgbuflen; 1745 #endif /* USE_CMSG */ 1746 #else /* ISC_NET_BSD44MSGHDR */ 1747 msg->msg_accrights = NULL; 1748 msg->msg_accrightslen = 0; 1749 #endif /* ISC_NET_BSD44MSGHDR */ 1750 1751 if (read_countp != NULL) 1752 *read_countp = read_count; 1753 } 1754 1755 static void 1756 set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock, 1757 isc_socketevent_t *dev) 1758 { 1759 if (sock->type == isc_sockettype_udp) { 1760 if (address != NULL) 1761 dev->address = *address; 1762 else 1763 dev->address = sock->peer_address; 1764 } else if (sock->type == isc_sockettype_tcp) { 1765 INSIST(address == NULL); 1766 dev->address = sock->peer_address; 1767 } 1768 } 1769 1770 static void 1771 destroy_socketevent(isc_event_t *event) { 1772 isc_socketevent_t *ev = (isc_socketevent_t *)event; 1773 1774 INSIST(ISC_LIST_EMPTY(ev->bufferlist)); 1775 1776 (ev->destroy)(event); 1777 } 1778 1779 static isc_socketevent_t * 1780 allocate_socketevent(isc_mem_t *mctx, void *sender, 1781 isc_eventtype_t eventtype, isc_taskaction_t action, 1782 void *arg) 1783 { 1784 isc_socketevent_t *ev; 1785 1786 ev = (isc_socketevent_t *)isc_event_allocate(mctx, sender, 1787 eventtype, action, arg, 1788 sizeof(*ev)); 1789 1790 if (ev == NULL) 1791 return (NULL); 1792 1793 ev->result = ISC_R_UNSET; 1794 ISC_LINK_INIT(ev, ev_link); 1795 ISC_LIST_INIT(ev->bufferlist); 1796 ev->region.base = NULL; 1797 ev->n = 0; 1798 ev->offset = 0; 1799 ev->attributes = 0; 1800 ev->destroy = ev->ev_destroy; 1801 ev->ev_destroy = 
destroy_socketevent; 1802 ev->dscp = 0; 1803 1804 return (ev); 1805 } 1806 1807 #if defined(ISC_SOCKET_DEBUG) 1808 static void 1809 dump_msg(struct msghdr *msg) { 1810 unsigned int i; 1811 1812 printf("MSGHDR %p\n", msg); 1813 printf("\tname %p, namelen %ld\n", msg->msg_name, 1814 (long) msg->msg_namelen); 1815 printf("\tiov %p, iovlen %ld\n", msg->msg_iov, 1816 (long) msg->msg_iovlen); 1817 for (i = 0; i < (unsigned int)msg->msg_iovlen; i++) 1818 printf("\t\t%d\tbase %p, len %ld\n", i, 1819 msg->msg_iov[i].iov_base, 1820 (long) msg->msg_iov[i].iov_len); 1821 #ifdef ISC_NET_BSD44MSGHDR 1822 printf("\tcontrol %p, controllen %ld\n", msg->msg_control, 1823 (long) msg->msg_controllen); 1824 #endif 1825 } 1826 #endif 1827 1828 #define DOIO_SUCCESS 0 /* i/o ok, event sent */ 1829 #define DOIO_SOFT 1 /* i/o ok, soft error, no event sent */ 1830 #define DOIO_HARD 2 /* i/o error, event sent */ 1831 #define DOIO_EOF 3 /* EOF, no event sent */ 1832 1833 static int 1834 doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) { 1835 int cc; 1836 struct iovec iov[MAXSCATTERGATHER_RECV]; 1837 size_t read_count; 1838 size_t actual_count; 1839 struct msghdr msghdr; 1840 isc_buffer_t *buffer; 1841 int recv_errno; 1842 char strbuf[ISC_STRERRORSIZE]; 1843 1844 build_msghdr_recv(sock, dev, &msghdr, iov, &read_count); 1845 1846 #if defined(ISC_SOCKET_DEBUG) 1847 dump_msg(&msghdr); 1848 #endif 1849 1850 cc = recvmsg(sock->fd, &msghdr, 0); 1851 recv_errno = errno; 1852 1853 #if defined(ISC_SOCKET_DEBUG) 1854 dump_msg(&msghdr); 1855 #endif 1856 1857 if (cc < 0) { 1858 if (SOFT_ERROR(recv_errno)) 1859 return (DOIO_SOFT); 1860 1861 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) { 1862 isc__strerror(recv_errno, strbuf, sizeof(strbuf)); 1863 socket_log(sock, NULL, IOEVENT, 1864 isc_msgcat, ISC_MSGSET_SOCKET, 1865 ISC_MSG_DOIORECV, 1866 "doio_recv: recvmsg(%d) %d bytes, err %d/%s", 1867 sock->fd, cc, recv_errno, strbuf); 1868 } 1869 1870 #define SOFT_OR_HARD(_system, _isc) \ 1871 if (recv_errno 
== _system) { \ 1872 if (sock->connected) { \ 1873 dev->result = _isc; \ 1874 inc_stats(sock->manager->stats, \ 1875 sock->statsindex[STATID_RECVFAIL]); \ 1876 return (DOIO_HARD); \ 1877 } \ 1878 return (DOIO_SOFT); \ 1879 } 1880 #define ALWAYS_HARD(_system, _isc) \ 1881 if (recv_errno == _system) { \ 1882 dev->result = _isc; \ 1883 inc_stats(sock->manager->stats, \ 1884 sock->statsindex[STATID_RECVFAIL]); \ 1885 return (DOIO_HARD); \ 1886 } 1887 1888 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED); 1889 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH); 1890 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH); 1891 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN); 1892 /* HPUX 11.11 can return EADDRNOTAVAIL. */ 1893 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 1894 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES); 1895 /* Should never get this one but it was seen. */ 1896 #ifdef ENOPROTOOPT 1897 SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH); 1898 #endif 1899 /* 1900 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6 1901 * errors. 1902 */ 1903 #ifdef EPROTO 1904 SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH); 1905 #endif 1906 SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH); 1907 1908 #undef SOFT_OR_HARD 1909 #undef ALWAYS_HARD 1910 1911 dev->result = isc__errno2result(recv_errno); 1912 inc_stats(sock->manager->stats, 1913 sock->statsindex[STATID_RECVFAIL]); 1914 return (DOIO_HARD); 1915 } 1916 1917 /* 1918 * On TCP and UNIX sockets, zero length reads indicate EOF, 1919 * while on UDP sockets, zero length reads are perfectly valid, 1920 * although strange. 
1921 */ 1922 switch (sock->type) { 1923 case isc_sockettype_tcp: 1924 case isc_sockettype_unix: 1925 if (cc == 0) 1926 return (DOIO_EOF); 1927 break; 1928 case isc_sockettype_udp: 1929 case isc_sockettype_raw: 1930 break; 1931 case isc_sockettype_fdwatch: 1932 default: 1933 INSIST(0); 1934 } 1935 1936 if (sock->type == isc_sockettype_udp) { 1937 dev->address.length = msghdr.msg_namelen; 1938 if (isc_sockaddr_getport(&dev->address) == 0) { 1939 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) { 1940 socket_log(sock, &dev->address, IOEVENT, 1941 isc_msgcat, ISC_MSGSET_SOCKET, 1942 ISC_MSG_ZEROPORT, 1943 "dropping source port zero packet"); 1944 } 1945 return (DOIO_SOFT); 1946 } 1947 /* 1948 * Simulate a firewall blocking UDP responses bigger than 1949 * 'maxudp' bytes. 1950 */ 1951 if (sock->manager->maxudp != 0 && cc > sock->manager->maxudp) 1952 return (DOIO_SOFT); 1953 } 1954 1955 socket_log(sock, &dev->address, IOEVENT, 1956 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV, 1957 "packet received correctly"); 1958 1959 /* 1960 * Overflow bit detection. If we received MORE bytes than we should, 1961 * this indicates an overflow situation. Set the flag in the 1962 * dev entry and adjust how much we read by one. 1963 */ 1964 #ifdef ISC_NET_RECVOVERFLOW 1965 if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) { 1966 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC; 1967 cc--; 1968 } 1969 #endif 1970 1971 /* 1972 * If there are control messages attached, run through them and pull 1973 * out the interesting bits. 
1974 */ 1975 process_cmsg(sock, &msghdr, dev); 1976 1977 /* 1978 * update the buffers (if any) and the i/o count 1979 */ 1980 dev->n += cc; 1981 actual_count = cc; 1982 buffer = ISC_LIST_HEAD(dev->bufferlist); 1983 while (buffer != NULL && actual_count > 0U) { 1984 REQUIRE(ISC_BUFFER_VALID(buffer)); 1985 if (isc_buffer_availablelength(buffer) <= actual_count) { 1986 actual_count -= isc_buffer_availablelength(buffer); 1987 isc_buffer_add(buffer, 1988 isc_buffer_availablelength(buffer)); 1989 } else { 1990 isc_buffer_add(buffer, actual_count); 1991 actual_count = 0; 1992 POST(actual_count); 1993 break; 1994 } 1995 buffer = ISC_LIST_NEXT(buffer, link); 1996 if (buffer == NULL) { 1997 INSIST(actual_count == 0U); 1998 } 1999 } 2000 2001 /* 2002 * If we read less than we expected, update counters, 2003 * and let the upper layer poke the descriptor. 2004 */ 2005 if (((size_t)cc != read_count) && (dev->n < dev->minimum)) 2006 return (DOIO_SOFT); 2007 2008 /* 2009 * Full reads are posted, or partials if partials are ok. 2010 */ 2011 dev->result = ISC_R_SUCCESS; 2012 return (DOIO_SUCCESS); 2013 } 2014 2015 /* 2016 * Returns: 2017 * DOIO_SUCCESS The operation succeeded. dev->result contains 2018 * ISC_R_SUCCESS. 2019 * 2020 * DOIO_HARD A hard or unexpected I/O error was encountered. 2021 * dev->result contains the appropriate error. 2022 * 2023 * DOIO_SOFT A soft I/O error was encountered. No senddone 2024 * event was sent. The operation should be retried. 2025 * 2026 * No other return values are possible. 
2027 */ 2028 static int 2029 doio_send(isc__socket_t *sock, isc_socketevent_t *dev) { 2030 int cc; 2031 struct iovec iov[MAXSCATTERGATHER_SEND]; 2032 size_t write_count; 2033 struct msghdr msghdr; 2034 char addrbuf[ISC_SOCKADDR_FORMATSIZE]; 2035 int attempts = 0; 2036 int send_errno; 2037 char strbuf[ISC_STRERRORSIZE]; 2038 2039 build_msghdr_send(sock, dev, &msghdr, iov, &write_count); 2040 2041 resend: 2042 if (sock->type == isc_sockettype_udp && 2043 sock->manager->maxudp != 0 && 2044 write_count > (size_t)sock->manager->maxudp) 2045 cc = write_count; 2046 else 2047 cc = sendmsg(sock->fd, &msghdr, 0); 2048 send_errno = errno; 2049 2050 /* 2051 * Check for error or block condition. 2052 */ 2053 if (cc < 0) { 2054 if (send_errno == EINTR && ++attempts < NRETRIES) 2055 goto resend; 2056 2057 if (SOFT_ERROR(send_errno)) 2058 return (DOIO_SOFT); 2059 2060 #define SOFT_OR_HARD(_system, _isc) \ 2061 if (send_errno == _system) { \ 2062 if (sock->connected) { \ 2063 dev->result = _isc; \ 2064 inc_stats(sock->manager->stats, \ 2065 sock->statsindex[STATID_SENDFAIL]); \ 2066 return (DOIO_HARD); \ 2067 } \ 2068 return (DOIO_SOFT); \ 2069 } 2070 #define ALWAYS_HARD(_system, _isc) \ 2071 if (send_errno == _system) { \ 2072 dev->result = _isc; \ 2073 inc_stats(sock->manager->stats, \ 2074 sock->statsindex[STATID_SENDFAIL]); \ 2075 return (DOIO_HARD); \ 2076 } 2077 2078 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED); 2079 ALWAYS_HARD(EACCES, ISC_R_NOPERM); 2080 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL); 2081 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 2082 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH); 2083 #ifdef EHOSTDOWN 2084 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH); 2085 #endif 2086 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH); 2087 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES); 2088 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH); 2089 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED); 2090 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET); 2091 2092 #undef SOFT_OR_HARD 2093 #undef ALWAYS_HARD 2094 
2095 /* 2096 * The other error types depend on whether or not the 2097 * socket is UDP or TCP. If it is UDP, some errors 2098 * that we expect to be fatal under TCP are merely 2099 * annoying, and are really soft errors. 2100 * 2101 * However, these soft errors are still returned as 2102 * a status. 2103 */ 2104 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf)); 2105 isc__strerror(send_errno, strbuf, sizeof(strbuf)); 2106 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s", 2107 addrbuf, strbuf); 2108 dev->result = isc__errno2result(send_errno); 2109 inc_stats(sock->manager->stats, 2110 sock->statsindex[STATID_SENDFAIL]); 2111 return (DOIO_HARD); 2112 } 2113 2114 if (cc == 0) { 2115 inc_stats(sock->manager->stats, 2116 sock->statsindex[STATID_SENDFAIL]); 2117 UNEXPECTED_ERROR(__FILE__, __LINE__, 2118 "doio_send: send() %s 0", 2119 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 2120 ISC_MSG_RETURNED, "returned")); 2121 } 2122 2123 /* 2124 * If we write less than we expected, update counters, poke. 2125 */ 2126 dev->n += cc; 2127 if ((size_t)cc != write_count) 2128 return (DOIO_SOFT); 2129 2130 /* 2131 * Exactly what we wanted to write. We're done with this 2132 * entry. Post its completion event. 2133 */ 2134 dev->result = ISC_R_SUCCESS; 2135 return (DOIO_SUCCESS); 2136 } 2137 2138 /* 2139 * Kill. 2140 * 2141 * Caller must ensure that the socket is not locked and no external 2142 * references exist. 2143 */ 2144 static void 2145 socketclose(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) { 2146 isc_sockettype_t type = sock->type; 2147 int lockid = FDLOCK_ID(fd); 2148 2149 /* 2150 * No one has this socket open, so the watcher doesn't have to be 2151 * poked, and the socket doesn't have to be locked. 
2152 */ 2153 LOCK(&manager->fdlock[lockid]); 2154 manager->fds[fd] = NULL; 2155 if (type == isc_sockettype_fdwatch) 2156 manager->fdstate[fd] = CLOSED; 2157 else 2158 manager->fdstate[fd] = CLOSE_PENDING; 2159 UNLOCK(&manager->fdlock[lockid]); 2160 if (type == isc_sockettype_fdwatch) { 2161 /* 2162 * The caller may close the socket once this function returns, 2163 * and `fd' may be reassigned for a new socket. So we do 2164 * unwatch_fd() here, rather than defer it via select_poke(). 2165 * Note: this may complicate data protection among threads and 2166 * may reduce performance due to additional locks. One way to 2167 * solve this would be to dup() the watched descriptor, but we 2168 * take a simpler approach at this moment. 2169 */ 2170 (void)unwatch_fd(manager, fd, SELECT_POKE_READ); 2171 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); 2172 } else 2173 select_poke(manager, fd, SELECT_POKE_CLOSE); 2174 2175 inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]); 2176 if (sock->active == 1) { 2177 dec_stats(manager->stats, sock->statsindex[STATID_ACTIVE]); 2178 sock->active = 0; 2179 } 2180 2181 /* 2182 * update manager->maxfd here (XXX: this should be implemented more 2183 * efficiently) 2184 */ 2185 #ifdef USE_SELECT 2186 LOCK(&manager->lock); 2187 if (manager->maxfd == fd) { 2188 int i; 2189 2190 manager->maxfd = 0; 2191 for (i = fd - 1; i >= 0; i--) { 2192 lockid = FDLOCK_ID(i); 2193 2194 LOCK(&manager->fdlock[lockid]); 2195 if (manager->fdstate[i] == MANAGED) { 2196 manager->maxfd = i; 2197 UNLOCK(&manager->fdlock[lockid]); 2198 break; 2199 } 2200 UNLOCK(&manager->fdlock[lockid]); 2201 } 2202 #ifdef ISC_PLATFORM_USETHREADS 2203 if (manager->maxfd < manager->pipe_fds[0]) 2204 manager->maxfd = manager->pipe_fds[0]; 2205 #endif 2206 } 2207 2208 UNLOCK(&manager->lock); 2209 #endif /* USE_SELECT */ 2210 } 2211 2212 static void 2213 destroy(isc__socket_t **sockp) { 2214 int fd; 2215 isc__socket_t *sock = *sockp; 2216 isc__socketmgr_t *manager = sock->manager; 
2217 2218 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, 2219 ISC_MSG_DESTROYING, "destroying"); 2220 2221 INSIST(ISC_LIST_EMPTY(sock->accept_list)); 2222 INSIST(ISC_LIST_EMPTY(sock->recv_list)); 2223 INSIST(ISC_LIST_EMPTY(sock->send_list)); 2224 INSIST(sock->connect_ev == NULL); 2225 REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks); 2226 2227 if (sock->fd >= 0) { 2228 fd = sock->fd; 2229 sock->fd = -1; 2230 socketclose(manager, sock, fd); 2231 } 2232 2233 LOCK(&manager->lock); 2234 2235 ISC_LIST_UNLINK(manager->socklist, sock, link); 2236 2237 #ifdef USE_WATCHER_THREAD 2238 if (ISC_LIST_EMPTY(manager->socklist)) 2239 SIGNAL(&manager->shutdown_ok); 2240 #endif /* USE_WATCHER_THREAD */ 2241 2242 /* can't unlock manager as its memory context is still used */ 2243 free_socket(sockp); 2244 2245 UNLOCK(&manager->lock); 2246 } 2247 2248 static isc_result_t 2249 allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type, 2250 isc__socket_t **socketp) 2251 { 2252 isc__socket_t *sock; 2253 isc_result_t result; 2254 ISC_SOCKADDR_LEN_T cmsgbuflen; 2255 2256 sock = isc_mem_get(manager->mctx, sizeof(*sock)); 2257 2258 if (sock == NULL) 2259 return (ISC_R_NOMEMORY); 2260 2261 sock->common.magic = 0; 2262 sock->common.impmagic = 0; 2263 sock->references = 0; 2264 2265 sock->manager = manager; 2266 sock->type = type; 2267 sock->fd = -1; 2268 sock->dscp = 0; /* TOS/TCLASS is zero until set. */ 2269 sock->dupped = 0; 2270 sock->statsindex = NULL; 2271 2272 ISC_LINK_INIT(sock, link); 2273 2274 sock->recvcmsgbuf = NULL; 2275 sock->sendcmsgbuf = NULL; 2276 2277 /* 2278 * Set up cmsg buffers. 
2279 */ 2280 cmsgbuflen = 0; 2281 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO) 2282 cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo)); 2283 #endif 2284 #if defined(USE_CMSG) && defined(SO_TIMESTAMP) 2285 cmsgbuflen += cmsg_space(sizeof(struct timeval)); 2286 #endif 2287 #if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) 2288 cmsgbuflen += cmsg_space(sizeof(int)); 2289 #endif 2290 sock->recvcmsgbuflen = cmsgbuflen; 2291 if (sock->recvcmsgbuflen != 0U) { 2292 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen); 2293 if (sock->recvcmsgbuf == NULL) { 2294 result = ISC_R_NOMEMORY; 2295 goto error; 2296 } 2297 } 2298 2299 cmsgbuflen = 0; 2300 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO) 2301 cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo)); 2302 #if defined(IPV6_USE_MIN_MTU) 2303 /* 2304 * Provide space for working around FreeBSD's broken IPV6_USE_MIN_MTU 2305 * support. 2306 */ 2307 cmsgbuflen += cmsg_space(sizeof(int)); 2308 #endif 2309 #endif 2310 #if defined(USE_CMSG) && (defined(IP_TOS) || defined(IPV6_TCLASS)) 2311 cmsgbuflen += cmsg_space(sizeof(int)); 2312 #endif 2313 sock->sendcmsgbuflen = cmsgbuflen; 2314 if (sock->sendcmsgbuflen != 0U) { 2315 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen); 2316 if (sock->sendcmsgbuf == NULL) { 2317 result = ISC_R_NOMEMORY; 2318 goto error; 2319 } 2320 } 2321 2322 memset(sock->name, 0, sizeof(sock->name)); 2323 sock->tag = NULL; 2324 2325 /* 2326 * Set up list of readers and writers to be initially empty. 2327 */ 2328 ISC_LIST_INIT(sock->recv_list); 2329 ISC_LIST_INIT(sock->send_list); 2330 ISC_LIST_INIT(sock->accept_list); 2331 sock->connect_ev = NULL; 2332 sock->pending_recv = 0; 2333 sock->pending_send = 0; 2334 sock->pending_accept = 0; 2335 sock->listener = 0; 2336 sock->connected = 0; 2337 sock->connecting = 0; 2338 sock->bound = 0; 2339 sock->pktdscp = 0; 2340 2341 /* 2342 * Initialize the lock. 
2343 */ 2344 result = isc_mutex_init(&sock->lock); 2345 if (result != ISC_R_SUCCESS) { 2346 sock->common.magic = 0; 2347 sock->common.impmagic = 0; 2348 goto error; 2349 } 2350 2351 /* 2352 * Initialize readable and writable events. 2353 */ 2354 ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t), 2355 ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR, 2356 NULL, sock, sock, NULL, NULL); 2357 ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t), 2358 ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW, 2359 NULL, sock, sock, NULL, NULL); 2360 2361 sock->common.magic = ISCAPI_SOCKET_MAGIC; 2362 sock->common.impmagic = SOCKET_MAGIC; 2363 *socketp = sock; 2364 2365 return (ISC_R_SUCCESS); 2366 2367 error: 2368 if (sock->recvcmsgbuf != NULL) 2369 isc_mem_put(manager->mctx, sock->recvcmsgbuf, 2370 sock->recvcmsgbuflen); 2371 if (sock->sendcmsgbuf != NULL) 2372 isc_mem_put(manager->mctx, sock->sendcmsgbuf, 2373 sock->sendcmsgbuflen); 2374 isc_mem_put(manager->mctx, sock, sizeof(*sock)); 2375 2376 return (result); 2377 } 2378 2379 /* 2380 * This event requires that the various lists be empty, that the reference 2381 * count be 1, and that the magic number is valid. The other socket bits, 2382 * like the lock, must be initialized as well. The fd associated must be 2383 * marked as closed, by setting it to -1 on close, or this routine will 2384 * also close the socket. 
2385 */ 2386 static void 2387 free_socket(isc__socket_t **socketp) { 2388 isc__socket_t *sock = *socketp; 2389 2390 INSIST(sock->references == 0); 2391 INSIST(VALID_SOCKET(sock)); 2392 INSIST(!sock->connecting); 2393 INSIST(!sock->pending_recv); 2394 INSIST(!sock->pending_send); 2395 INSIST(!sock->pending_accept); 2396 INSIST(ISC_LIST_EMPTY(sock->recv_list)); 2397 INSIST(ISC_LIST_EMPTY(sock->send_list)); 2398 INSIST(ISC_LIST_EMPTY(sock->accept_list)); 2399 INSIST(!ISC_LINK_LINKED(sock, link)); 2400 2401 if (sock->recvcmsgbuf != NULL) 2402 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf, 2403 sock->recvcmsgbuflen); 2404 if (sock->sendcmsgbuf != NULL) 2405 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf, 2406 sock->sendcmsgbuflen); 2407 2408 sock->common.magic = 0; 2409 sock->common.impmagic = 0; 2410 2411 DESTROYLOCK(&sock->lock); 2412 2413 isc_mem_put(sock->manager->mctx, sock, sizeof(*sock)); 2414 2415 *socketp = NULL; 2416 } 2417 2418 #ifdef SO_RCVBUF 2419 static isc_once_t rcvbuf_once = ISC_ONCE_INIT; 2420 static int rcvbuf = RCVBUFSIZE; 2421 2422 static void 2423 set_rcvbuf(void) { 2424 int fd; 2425 int max = rcvbuf, min; 2426 ISC_SOCKADDR_LEN_T len; 2427 2428 fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); 2429 #if defined(ISC_PLATFORM_HAVEIPV6) 2430 if (fd == -1) { 2431 switch (errno) { 2432 case EPROTONOSUPPORT: 2433 case EPFNOSUPPORT: 2434 case EAFNOSUPPORT: 2435 /* 2436 * Linux 2.2 (and maybe others) return EINVAL instead of 2437 * EAFNOSUPPORT. 
2438 */ 2439 case EINVAL: 2440 fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); 2441 break; 2442 } 2443 } 2444 #endif 2445 if (fd == -1) 2446 return; 2447 2448 len = sizeof(min); 2449 if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&min, &len) >= 0 && 2450 min < rcvbuf) { 2451 again: 2452 if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&rcvbuf, 2453 sizeof(rcvbuf)) == -1) { 2454 if (errno == ENOBUFS && rcvbuf > min) { 2455 max = rcvbuf - 1; 2456 rcvbuf = (rcvbuf + min) / 2; 2457 goto again; 2458 } else { 2459 rcvbuf = min; 2460 goto cleanup; 2461 } 2462 } else 2463 min = rcvbuf; 2464 if (min != max) { 2465 rcvbuf = max; 2466 goto again; 2467 } 2468 } 2469 cleanup: 2470 close (fd); 2471 } 2472 #endif 2473 2474 #ifdef SO_BSDCOMPAT 2475 /* 2476 * This really should not be necessary to do. Having to workout 2477 * which kernel version we are on at run time so that we don't cause 2478 * the kernel to issue a warning about us using a deprecated socket option. 2479 * Such warnings should *never* be on by default in production kernels. 2480 * 2481 * We can't do this a build time because executables are moved between 2482 * machines and hence kernels. 2483 * 2484 * We can't just not set SO_BSDCOMAT because some kernels require it. 2485 */ 2486 2487 static isc_once_t bsdcompat_once = ISC_ONCE_INIT; 2488 isc_boolean_t bsdcompat = ISC_TRUE; 2489 2490 static void 2491 clear_bsdcompat(void) { 2492 #ifdef __linux__ 2493 struct utsname buf; 2494 char *endp; 2495 long int major; 2496 long int minor; 2497 2498 uname(&buf); /* Can only fail if buf is bad in Linux. */ 2499 2500 /* Paranoia in parsing can be increased, but we trust uname(). 
*/ 2501 major = strtol(buf.release, &endp, 10); 2502 if (*endp == '.') { 2503 minor = strtol(endp+1, &endp, 10); 2504 if ((major > 2) || ((major == 2) && (minor >= 4))) { 2505 bsdcompat = ISC_FALSE; 2506 } 2507 } 2508 #endif /* __linux __ */ 2509 } 2510 #endif 2511 2512 static void 2513 use_min_mtu(isc__socket_t *sock) { 2514 #if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU) 2515 UNUSED(sock); 2516 #endif 2517 #ifdef IPV6_USE_MIN_MTU 2518 /* use minimum MTU */ 2519 if (sock->pf == AF_INET6) { 2520 int on = 1; 2521 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU, 2522 (void *)&on, sizeof(on)); 2523 } 2524 #endif 2525 #if defined(IPV6_MTU) 2526 /* 2527 * Use minimum MTU on IPv6 sockets. 2528 */ 2529 if (sock->pf == AF_INET6) { 2530 int mtu = 1280; 2531 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU, 2532 &mtu, sizeof(mtu)); 2533 } 2534 #endif 2535 } 2536 2537 static isc_result_t 2538 opensocket(isc__socketmgr_t *manager, isc__socket_t *sock, 2539 isc__socket_t *dup_socket) 2540 { 2541 isc_result_t result; 2542 char strbuf[ISC_STRERRORSIZE]; 2543 const char *err = "socket"; 2544 int tries = 0; 2545 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT) || defined(SO_NOSIGPIPE) 2546 int on = 1; 2547 #endif 2548 #if defined(SO_RCVBUF) 2549 ISC_SOCKADDR_LEN_T optlen; 2550 int size; 2551 #endif 2552 2553 again: 2554 if (dup_socket == NULL) { 2555 switch (sock->type) { 2556 case isc_sockettype_udp: 2557 sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP); 2558 break; 2559 case isc_sockettype_tcp: 2560 sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP); 2561 break; 2562 case isc_sockettype_unix: 2563 sock->fd = socket(sock->pf, SOCK_STREAM, 0); 2564 break; 2565 case isc_sockettype_raw: 2566 errno = EPFNOSUPPORT; 2567 /* 2568 * PF_ROUTE is a alias for PF_NETLINK on linux. 
2569 */ 2570 #if defined(PF_ROUTE) 2571 if (sock->fd == -1 && sock->pf == PF_ROUTE) { 2572 #ifdef NETLINK_ROUTE 2573 sock->fd = socket(sock->pf, SOCK_RAW, 2574 NETLINK_ROUTE); 2575 #else 2576 sock->fd = socket(sock->pf, SOCK_RAW, 0); 2577 #endif 2578 if (sock->fd != -1) { 2579 #ifdef NETLINK_ROUTE 2580 struct sockaddr_nl sa; 2581 int n; 2582 2583 /* 2584 * Do an implicit bind. 2585 */ 2586 memset(&sa, 0, sizeof(sa)); 2587 sa.nl_family = AF_NETLINK; 2588 sa.nl_groups = RTMGRP_IPV4_IFADDR | 2589 RTMGRP_IPV6_IFADDR; 2590 n = bind(sock->fd, 2591 (struct sockaddr *) &sa, 2592 sizeof(sa)); 2593 if (n < 0) { 2594 close(sock->fd); 2595 sock->fd = -1; 2596 } 2597 #endif 2598 sock->bound = 1; 2599 } 2600 } 2601 #endif 2602 break; 2603 case isc_sockettype_fdwatch: 2604 /* 2605 * We should not be called for isc_sockettype_fdwatch 2606 * sockets. 2607 */ 2608 INSIST(0); 2609 break; 2610 } 2611 } else { 2612 sock->fd = dup(dup_socket->fd); 2613 sock->dupped = 1; 2614 sock->bound = dup_socket->bound; 2615 } 2616 if (sock->fd == -1 && errno == EINTR && tries++ < 42) 2617 goto again; 2618 2619 #ifdef F_DUPFD 2620 /* 2621 * Leave a space for stdio and TCP to work in. 
2622 */ 2623 if (manager->reserved != 0 && sock->type == isc_sockettype_udp && 2624 sock->fd >= 0 && sock->fd < manager->reserved) { 2625 int new, tmp; 2626 new = fcntl(sock->fd, F_DUPFD, manager->reserved); 2627 tmp = errno; 2628 (void)close(sock->fd); 2629 errno = tmp; 2630 sock->fd = new; 2631 err = "isc_socket_create: fcntl/reserved"; 2632 } else if (sock->fd >= 0 && sock->fd < 20) { 2633 int new, tmp; 2634 new = fcntl(sock->fd, F_DUPFD, 20); 2635 tmp = errno; 2636 (void)close(sock->fd); 2637 errno = tmp; 2638 sock->fd = new; 2639 err = "isc_socket_create: fcntl"; 2640 } 2641 #endif 2642 2643 if (sock->fd >= (int)manager->maxsocks) { 2644 (void)close(sock->fd); 2645 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, 2646 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 2647 isc_msgcat, ISC_MSGSET_SOCKET, 2648 ISC_MSG_TOOMANYFDS, 2649 "socket: file descriptor exceeds limit (%d/%u)", 2650 sock->fd, manager->maxsocks); 2651 inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]); 2652 return (ISC_R_NORESOURCES); 2653 } 2654 2655 if (sock->fd < 0) { 2656 switch (errno) { 2657 case EMFILE: 2658 case ENFILE: 2659 isc__strerror(errno, strbuf, sizeof(strbuf)); 2660 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, 2661 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 2662 isc_msgcat, ISC_MSGSET_SOCKET, 2663 ISC_MSG_TOOMANYFDS, 2664 "%s: %s", err, strbuf); 2665 /* fallthrough */ 2666 case ENOBUFS: 2667 inc_stats(manager->stats, 2668 sock->statsindex[STATID_OPENFAIL]); 2669 return (ISC_R_NORESOURCES); 2670 2671 case EPROTONOSUPPORT: 2672 case EPFNOSUPPORT: 2673 case EAFNOSUPPORT: 2674 /* 2675 * Linux 2.2 (and maybe others) return EINVAL instead of 2676 * EAFNOSUPPORT. 
2677 */ 2678 case EINVAL: 2679 inc_stats(manager->stats, 2680 sock->statsindex[STATID_OPENFAIL]); 2681 return (ISC_R_FAMILYNOSUPPORT); 2682 2683 default: 2684 isc__strerror(errno, strbuf, sizeof(strbuf)); 2685 UNEXPECTED_ERROR(__FILE__, __LINE__, 2686 "%s() %s: %s", err, 2687 isc_msgcat_get(isc_msgcat, 2688 ISC_MSGSET_GENERAL, 2689 ISC_MSG_FAILED, 2690 "failed"), 2691 strbuf); 2692 inc_stats(manager->stats, 2693 sock->statsindex[STATID_OPENFAIL]); 2694 return (ISC_R_UNEXPECTED); 2695 } 2696 } 2697 2698 if (dup_socket != NULL) 2699 goto setup_done; 2700 2701 result = make_nonblock(sock->fd); 2702 if (result != ISC_R_SUCCESS) { 2703 (void)close(sock->fd); 2704 inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]); 2705 return (result); 2706 } 2707 2708 #ifdef SO_BSDCOMPAT 2709 RUNTIME_CHECK(isc_once_do(&bsdcompat_once, 2710 clear_bsdcompat) == ISC_R_SUCCESS); 2711 if (sock->type != isc_sockettype_unix && bsdcompat && 2712 setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT, 2713 (void *)&on, sizeof(on)) < 0) { 2714 isc__strerror(errno, strbuf, sizeof(strbuf)); 2715 UNEXPECTED_ERROR(__FILE__, __LINE__, 2716 "setsockopt(%d, SO_BSDCOMPAT) %s: %s", 2717 sock->fd, 2718 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 2719 ISC_MSG_FAILED, "failed"), 2720 strbuf); 2721 /* Press on... */ 2722 } 2723 #endif 2724 2725 #ifdef SO_NOSIGPIPE 2726 if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE, 2727 (void *)&on, sizeof(on)) < 0) { 2728 isc__strerror(errno, strbuf, sizeof(strbuf)); 2729 UNEXPECTED_ERROR(__FILE__, __LINE__, 2730 "setsockopt(%d, SO_NOSIGPIPE) %s: %s", 2731 sock->fd, 2732 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 2733 ISC_MSG_FAILED, "failed"), 2734 strbuf); 2735 /* Press on... */ 2736 } 2737 #endif 2738 2739 /* 2740 * Use minimum mtu if possible. 
2741 */ 2742 use_min_mtu(sock); 2743 2744 #if defined(USE_CMSG) || defined(SO_RCVBUF) 2745 if (sock->type == isc_sockettype_udp) { 2746 2747 #if defined(USE_CMSG) 2748 #if defined(SO_TIMESTAMP) 2749 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, 2750 (void *)&on, sizeof(on)) < 0 2751 && errno != ENOPROTOOPT) { 2752 isc__strerror(errno, strbuf, sizeof(strbuf)); 2753 UNEXPECTED_ERROR(__FILE__, __LINE__, 2754 "setsockopt(%d, SO_TIMESTAMP) %s: %s", 2755 sock->fd, 2756 isc_msgcat_get(isc_msgcat, 2757 ISC_MSGSET_GENERAL, 2758 ISC_MSG_FAILED, 2759 "failed"), 2760 strbuf); 2761 /* Press on... */ 2762 } 2763 #endif /* SO_TIMESTAMP */ 2764 2765 #if defined(ISC_PLATFORM_HAVEIPV6) 2766 if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) { 2767 /* 2768 * Warn explicitly because this anomaly can be hidden 2769 * in usual operation (and unexpectedly appear later). 2770 */ 2771 UNEXPECTED_ERROR(__FILE__, __LINE__, 2772 "No buffer available to receive " 2773 "IPv6 destination"); 2774 } 2775 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO 2776 #ifdef IPV6_RECVPKTINFO 2777 /* RFC 3542 */ 2778 if ((sock->pf == AF_INET6) 2779 && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, 2780 (void *)&on, sizeof(on)) < 0)) { 2781 isc__strerror(errno, strbuf, sizeof(strbuf)); 2782 UNEXPECTED_ERROR(__FILE__, __LINE__, 2783 "setsockopt(%d, IPV6_RECVPKTINFO) " 2784 "%s: %s", sock->fd, 2785 isc_msgcat_get(isc_msgcat, 2786 ISC_MSGSET_GENERAL, 2787 ISC_MSG_FAILED, 2788 "failed"), 2789 strbuf); 2790 } 2791 #else 2792 /* RFC 2292 */ 2793 if ((sock->pf == AF_INET6) 2794 && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO, 2795 (void *)&on, sizeof(on)) < 0)) { 2796 isc__strerror(errno, strbuf, sizeof(strbuf)); 2797 UNEXPECTED_ERROR(__FILE__, __LINE__, 2798 "setsockopt(%d, IPV6_PKTINFO) %s: %s", 2799 sock->fd, 2800 isc_msgcat_get(isc_msgcat, 2801 ISC_MSGSET_GENERAL, 2802 ISC_MSG_FAILED, 2803 "failed"), 2804 strbuf); 2805 } 2806 #endif /* IPV6_RECVPKTINFO */ 2807 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */ 2808 
#if defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DONT) 2809 /* 2810 * Turn off Path MTU discovery on IPv6/UDP sockets. 2811 */ 2812 if (sock->pf == AF_INET6) { 2813 int action = IPV6_PMTUDISC_DONT; 2814 (void)setsockopt(sock->fd, IPPROTO_IPV6, 2815 IPV6_MTU_DISCOVER, &action, 2816 sizeof(action)); 2817 } 2818 #endif 2819 #endif /* ISC_PLATFORM_HAVEIPV6 */ 2820 #endif /* defined(USE_CMSG) */ 2821 2822 #if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT) 2823 /* 2824 * Turn off Path MTU discovery on IPv4/UDP sockets. 2825 */ 2826 if (sock->pf == AF_INET) { 2827 int action = IP_PMTUDISC_DONT; 2828 (void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER, 2829 &action, sizeof(action)); 2830 } 2831 #endif 2832 #if defined(IP_DONTFRAG) 2833 /* 2834 * Turn off Path MTU discovery on IPv4/UDP sockets. 2835 */ 2836 if (sock->pf == AF_INET) { 2837 int off = 0; 2838 (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG, 2839 &off, sizeof(off)); 2840 } 2841 #endif 2842 2843 #if defined(SO_RCVBUF) 2844 optlen = sizeof(size); 2845 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, 2846 (void *)&size, &optlen) >= 0 && size < rcvbuf) { 2847 RUNTIME_CHECK(isc_once_do(&rcvbuf_once, 2848 set_rcvbuf) == ISC_R_SUCCESS); 2849 if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, 2850 (void *)&rcvbuf, sizeof(rcvbuf)) == -1) { 2851 isc__strerror(errno, strbuf, sizeof(strbuf)); 2852 UNEXPECTED_ERROR(__FILE__, __LINE__, 2853 "setsockopt(%d, SO_RCVBUF, %d) %s: %s", 2854 sock->fd, rcvbuf, 2855 isc_msgcat_get(isc_msgcat, 2856 ISC_MSGSET_GENERAL, 2857 ISC_MSG_FAILED, 2858 "failed"), 2859 strbuf); 2860 } 2861 } 2862 #endif 2863 } 2864 #ifdef IPV6_RECVTCLASS 2865 if ((sock->pf == AF_INET6) 2866 && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVTCLASS, 2867 (void *)&on, sizeof(on)) < 0)) { 2868 isc__strerror(errno, strbuf, sizeof(strbuf)); 2869 UNEXPECTED_ERROR(__FILE__, __LINE__, 2870 "setsockopt(%d, IPV6_RECVTCLASS) " 2871 "%s: %s", sock->fd, 2872 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 2873 
ISC_MSG_FAILED, "failed"), 2874 strbuf); 2875 } 2876 #endif 2877 #ifdef IP_RECVTOS 2878 if ((sock->pf == AF_INET) 2879 && (setsockopt(sock->fd, IPPROTO_IP, IP_RECVTOS, 2880 (void *)&on, sizeof(on)) < 0)) { 2881 isc__strerror(errno, strbuf, sizeof(strbuf)); 2882 UNEXPECTED_ERROR(__FILE__, __LINE__, 2883 "setsockopt(%d, IP_RECVTOS) " 2884 "%s: %s", sock->fd, 2885 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 2886 ISC_MSG_FAILED, "failed"), 2887 strbuf); 2888 } 2889 #endif 2890 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */ 2891 2892 setup_done: 2893 inc_stats(manager->stats, sock->statsindex[STATID_OPEN]); 2894 if (sock->active == 0) { 2895 inc_stats(manager->stats, sock->statsindex[STATID_ACTIVE]); 2896 sock->active = 1; 2897 } 2898 2899 return (ISC_R_SUCCESS); 2900 } 2901 2902 /* 2903 * Create a 'type' socket or duplicate an existing socket, managed 2904 * by 'manager'. Events will be posted to 'task' and when dispatched 2905 * 'action' will be called with 'arg' as the arg value. The new 2906 * socket is returned in 'socketp'. 2907 */ 2908 static isc_result_t 2909 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type, 2910 isc_socket_t **socketp, isc_socket_t *dup_socket) 2911 { 2912 isc__socket_t *sock = NULL; 2913 isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0; 2914 isc_result_t result; 2915 int lockid; 2916 2917 REQUIRE(VALID_MANAGER(manager)); 2918 REQUIRE(socketp != NULL && *socketp == NULL); 2919 REQUIRE(type != isc_sockettype_fdwatch); 2920 2921 result = allocate_socket(manager, type, &sock); 2922 if (result != ISC_R_SUCCESS) 2923 return (result); 2924 2925 switch (sock->type) { 2926 case isc_sockettype_udp: 2927 sock->statsindex = 2928 (pf == AF_INET) ? udp4statsindex : udp6statsindex; 2929 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6) 2930 sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0; 2931 break; 2932 case isc_sockettype_tcp: 2933 sock->statsindex = 2934 (pf == AF_INET) ? 
tcp4statsindex : tcp6statsindex; 2935 break; 2936 case isc_sockettype_unix: 2937 sock->statsindex = unixstatsindex; 2938 break; 2939 case isc_sockettype_raw: 2940 sock->statsindex = rawstatsindex; 2941 break; 2942 default: 2943 INSIST(0); 2944 } 2945 2946 sock->active = 0; 2947 sock->pf = pf; 2948 2949 result = opensocket(manager, sock, (isc__socket_t *)dup_socket); 2950 if (result != ISC_R_SUCCESS) { 2951 free_socket(&sock); 2952 return (result); 2953 } 2954 2955 sock->common.methods = (isc_socketmethods_t *)&socketmethods; 2956 sock->references = 1; 2957 *socketp = (isc_socket_t *)sock; 2958 2959 /* 2960 * Note we don't have to lock the socket like we normally would because 2961 * there are no external references to it yet. 2962 */ 2963 2964 lockid = FDLOCK_ID(sock->fd); 2965 LOCK(&manager->fdlock[lockid]); 2966 manager->fds[sock->fd] = sock; 2967 manager->fdstate[sock->fd] = MANAGED; 2968 #ifdef USE_DEVPOLL 2969 INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 && 2970 sock->manager->fdpollinfo[sock->fd].want_write == 0); 2971 #endif 2972 UNLOCK(&manager->fdlock[lockid]); 2973 2974 LOCK(&manager->lock); 2975 ISC_LIST_APPEND(manager->socklist, sock, link); 2976 #ifdef USE_SELECT 2977 if (manager->maxfd < sock->fd) 2978 manager->maxfd = sock->fd; 2979 #endif 2980 UNLOCK(&manager->lock); 2981 2982 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, 2983 ISC_MSG_CREATED, dup_socket != NULL ? "dupped" : "created"); 2984 2985 return (ISC_R_SUCCESS); 2986 } 2987 2988 /*% 2989 * Create a new 'type' socket managed by 'manager'. Events 2990 * will be posted to 'task' and when dispatched 'action' will be 2991 * called with 'arg' as the arg value. The new socket is returned 2992 * in 'socketp'. 
2993 */ 2994 isc_result_t 2995 isc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type, 2996 isc_socket_t **socketp) 2997 { 2998 return (socket_create(manager0, pf, type, socketp, NULL)); 2999 } 3000 3001 /*% 3002 * Duplicate an existing socket. The new socket is returned 3003 * in 'socketp'. 3004 */ 3005 isc_result_t 3006 isc__socket_dup(isc_socket_t *sock0, isc_socket_t **socketp) { 3007 isc__socket_t *sock = (isc__socket_t *)sock0; 3008 3009 REQUIRE(VALID_SOCKET(sock)); 3010 REQUIRE(socketp != NULL && *socketp == NULL); 3011 3012 return (socket_create((isc_socketmgr_t *) sock->manager, 3013 sock->pf, sock->type, socketp, 3014 sock0)); 3015 } 3016 3017 isc_result_t 3018 isc__socket_open(isc_socket_t *sock0) { 3019 isc_result_t result; 3020 isc__socket_t *sock = (isc__socket_t *)sock0; 3021 3022 REQUIRE(VALID_SOCKET(sock)); 3023 3024 LOCK(&sock->lock); 3025 REQUIRE(sock->references == 1); 3026 REQUIRE(sock->type != isc_sockettype_fdwatch); 3027 UNLOCK(&sock->lock); 3028 /* 3029 * We don't need to retain the lock hereafter, since no one else has 3030 * this socket. 3031 */ 3032 REQUIRE(sock->fd == -1); 3033 3034 result = opensocket(sock->manager, sock, NULL); 3035 if (result != ISC_R_SUCCESS) 3036 sock->fd = -1; 3037 3038 if (result == ISC_R_SUCCESS) { 3039 int lockid = FDLOCK_ID(sock->fd); 3040 3041 LOCK(&sock->manager->fdlock[lockid]); 3042 sock->manager->fds[sock->fd] = sock; 3043 sock->manager->fdstate[sock->fd] = MANAGED; 3044 #ifdef USE_DEVPOLL 3045 INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 && 3046 sock->manager->fdpollinfo[sock->fd].want_write == 0); 3047 #endif 3048 UNLOCK(&sock->manager->fdlock[lockid]); 3049 3050 #ifdef USE_SELECT 3051 LOCK(&sock->manager->lock); 3052 if (sock->manager->maxfd < sock->fd) 3053 sock->manager->maxfd = sock->fd; 3054 UNLOCK(&sock->manager->lock); 3055 #endif 3056 } 3057 3058 return (result); 3059 } 3060 3061 /* 3062 * Create a new 'type' socket managed by 'manager'. 
Events 3063 * will be posted to 'task' and when dispatched 'action' will be 3064 * called with 'arg' as the arg value. The new socket is returned 3065 * in 'socketp'. 3066 */ 3067 isc_result_t 3068 isc__socket_fdwatchcreate(isc_socketmgr_t *manager0, int fd, int flags, 3069 isc_sockfdwatch_t callback, void *cbarg, 3070 isc_task_t *task, isc_socket_t **socketp) 3071 { 3072 isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0; 3073 isc__socket_t *sock = NULL; 3074 isc_result_t result; 3075 int lockid; 3076 3077 REQUIRE(VALID_MANAGER(manager)); 3078 REQUIRE(socketp != NULL && *socketp == NULL); 3079 3080 result = allocate_socket(manager, isc_sockettype_fdwatch, &sock); 3081 if (result != ISC_R_SUCCESS) 3082 return (result); 3083 3084 sock->fd = fd; 3085 sock->fdwatcharg = cbarg; 3086 sock->fdwatchcb = callback; 3087 sock->fdwatchflags = flags; 3088 sock->fdwatchtask = task; 3089 sock->statsindex = fdwatchstatsindex; 3090 3091 sock->common.methods = (isc_socketmethods_t *)&socketmethods; 3092 sock->references = 1; 3093 *socketp = (isc_socket_t *)sock; 3094 3095 /* 3096 * Note we don't have to lock the socket like we normally would because 3097 * there are no external references to it yet. 
3098 */ 3099 3100 lockid = FDLOCK_ID(sock->fd); 3101 LOCK(&manager->fdlock[lockid]); 3102 manager->fds[sock->fd] = sock; 3103 manager->fdstate[sock->fd] = MANAGED; 3104 UNLOCK(&manager->fdlock[lockid]); 3105 3106 LOCK(&manager->lock); 3107 ISC_LIST_APPEND(manager->socklist, sock, link); 3108 #ifdef USE_SELECT 3109 if (manager->maxfd < sock->fd) 3110 manager->maxfd = sock->fd; 3111 #endif 3112 UNLOCK(&manager->lock); 3113 3114 if (flags & ISC_SOCKFDWATCH_READ) 3115 select_poke(sock->manager, sock->fd, SELECT_POKE_READ); 3116 if (flags & ISC_SOCKFDWATCH_WRITE) 3117 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE); 3118 3119 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, 3120 ISC_MSG_CREATED, "fdwatch-created"); 3121 3122 return (ISC_R_SUCCESS); 3123 } 3124 3125 /* 3126 * Indicate to the manager that it should watch the socket again. 3127 * This can be used to restart watching if the previous event handler 3128 * didn't indicate there was more data to be processed. Primarily 3129 * it is for writing but could be used for reading if desired 3130 */ 3131 3132 isc_result_t 3133 isc__socket_fdwatchpoke(isc_socket_t *sock0, int flags) 3134 { 3135 isc__socket_t *sock = (isc__socket_t *)sock0; 3136 3137 REQUIRE(VALID_SOCKET(sock)); 3138 3139 /* 3140 * We check both flags first to allow us to get the lock 3141 * once but only if we need it. 
3142 */ 3143 3144 if ((flags & (ISC_SOCKFDWATCH_READ | ISC_SOCKFDWATCH_WRITE)) != 0) { 3145 LOCK(&sock->lock); 3146 if (((flags & ISC_SOCKFDWATCH_READ) != 0) && 3147 !sock->pending_recv) 3148 select_poke(sock->manager, sock->fd, 3149 SELECT_POKE_READ); 3150 if (((flags & ISC_SOCKFDWATCH_WRITE) != 0) && 3151 !sock->pending_send) 3152 select_poke(sock->manager, sock->fd, 3153 SELECT_POKE_WRITE); 3154 UNLOCK(&sock->lock); 3155 } 3156 3157 socket_log(sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET, 3158 ISC_MSG_POKED, "fdwatch-poked flags: %d", flags); 3159 3160 return (ISC_R_SUCCESS); 3161 } 3162 3163 /* 3164 * Attach to a socket. Caller must explicitly detach when it is done. 3165 */ 3166 void 3167 isc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) { 3168 isc__socket_t *sock = (isc__socket_t *)sock0; 3169 3170 REQUIRE(VALID_SOCKET(sock)); 3171 REQUIRE(socketp != NULL && *socketp == NULL); 3172 3173 LOCK(&sock->lock); 3174 sock->references++; 3175 UNLOCK(&sock->lock); 3176 3177 *socketp = (isc_socket_t *)sock; 3178 } 3179 3180 /* 3181 * Dereference a socket. If this is the last reference to it, clean things 3182 * up by destroying the socket. 
3183 */ 3184 void 3185 isc__socket_detach(isc_socket_t **socketp) { 3186 isc__socket_t *sock; 3187 isc_boolean_t kill_socket = ISC_FALSE; 3188 3189 REQUIRE(socketp != NULL); 3190 sock = (isc__socket_t *)*socketp; 3191 REQUIRE(VALID_SOCKET(sock)); 3192 3193 LOCK(&sock->lock); 3194 REQUIRE(sock->references > 0); 3195 sock->references--; 3196 if (sock->references == 0) 3197 kill_socket = ISC_TRUE; 3198 UNLOCK(&sock->lock); 3199 3200 if (kill_socket) 3201 destroy(&sock); 3202 3203 *socketp = NULL; 3204 } 3205 3206 isc_result_t 3207 isc__socket_close(isc_socket_t *sock0) { 3208 isc__socket_t *sock = (isc__socket_t *)sock0; 3209 int fd; 3210 isc__socketmgr_t *manager; 3211 3212 fflush(stdout); 3213 REQUIRE(VALID_SOCKET(sock)); 3214 3215 LOCK(&sock->lock); 3216 3217 REQUIRE(sock->references == 1); 3218 REQUIRE(sock->type != isc_sockettype_fdwatch); 3219 REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks); 3220 3221 INSIST(!sock->connecting); 3222 INSIST(!sock->pending_recv); 3223 INSIST(!sock->pending_send); 3224 INSIST(!sock->pending_accept); 3225 INSIST(ISC_LIST_EMPTY(sock->recv_list)); 3226 INSIST(ISC_LIST_EMPTY(sock->send_list)); 3227 INSIST(ISC_LIST_EMPTY(sock->accept_list)); 3228 INSIST(sock->connect_ev == NULL); 3229 3230 manager = sock->manager; 3231 fd = sock->fd; 3232 sock->fd = -1; 3233 sock->dupped = 0; 3234 memset(sock->name, 0, sizeof(sock->name)); 3235 sock->tag = NULL; 3236 sock->listener = 0; 3237 sock->connected = 0; 3238 sock->connecting = 0; 3239 sock->bound = 0; 3240 isc_sockaddr_any(&sock->peer_address); 3241 3242 UNLOCK(&sock->lock); 3243 3244 socketclose(manager, sock, fd); 3245 3246 return (ISC_R_SUCCESS); 3247 } 3248 3249 /* 3250 * I/O is possible on a given socket. Schedule an event to this task that 3251 * will call an internal function to do the I/O. This will charge the 3252 * task with the I/O operation and let our select loop handler get back 3253 * to doing something real as fast as possible. 
3254 * 3255 * The socket and manager must be locked before calling this function. 3256 */ 3257 static void 3258 dispatch_recv(isc__socket_t *sock) { 3259 intev_t *iev; 3260 isc_socketevent_t *ev; 3261 isc_task_t *sender; 3262 3263 INSIST(!sock->pending_recv); 3264 3265 if (sock->type != isc_sockettype_fdwatch) { 3266 ev = ISC_LIST_HEAD(sock->recv_list); 3267 if (ev == NULL) 3268 return; 3269 socket_log(sock, NULL, EVENT, NULL, 0, 0, 3270 "dispatch_recv: event %p -> task %p", 3271 ev, ev->ev_sender); 3272 sender = ev->ev_sender; 3273 } else { 3274 sender = sock->fdwatchtask; 3275 } 3276 3277 sock->pending_recv = 1; 3278 iev = &sock->readable_ev; 3279 3280 sock->references++; 3281 iev->ev_sender = sock; 3282 if (sock->type == isc_sockettype_fdwatch) 3283 iev->ev_action = internal_fdwatch_read; 3284 else 3285 iev->ev_action = internal_recv; 3286 iev->ev_arg = sock; 3287 3288 isc_task_send(sender, (isc_event_t **)&iev); 3289 } 3290 3291 static void 3292 dispatch_send(isc__socket_t *sock) { 3293 intev_t *iev; 3294 isc_socketevent_t *ev; 3295 isc_task_t *sender; 3296 3297 INSIST(!sock->pending_send); 3298 3299 if (sock->type != isc_sockettype_fdwatch) { 3300 ev = ISC_LIST_HEAD(sock->send_list); 3301 if (ev == NULL) 3302 return; 3303 socket_log(sock, NULL, EVENT, NULL, 0, 0, 3304 "dispatch_send: event %p -> task %p", 3305 ev, ev->ev_sender); 3306 sender = ev->ev_sender; 3307 } else { 3308 sender = sock->fdwatchtask; 3309 } 3310 3311 sock->pending_send = 1; 3312 iev = &sock->writable_ev; 3313 3314 sock->references++; 3315 iev->ev_sender = sock; 3316 if (sock->type == isc_sockettype_fdwatch) 3317 iev->ev_action = internal_fdwatch_write; 3318 else 3319 iev->ev_action = internal_send; 3320 iev->ev_arg = sock; 3321 3322 isc_task_send(sender, (isc_event_t **)&iev); 3323 } 3324 3325 /* 3326 * Dispatch an internal accept event. 
3327 */ 3328 static void 3329 dispatch_accept(isc__socket_t *sock) { 3330 intev_t *iev; 3331 isc_socket_newconnev_t *ev; 3332 3333 INSIST(!sock->pending_accept); 3334 3335 /* 3336 * Are there any done events left, or were they all canceled 3337 * before the manager got the socket lock? 3338 */ 3339 ev = ISC_LIST_HEAD(sock->accept_list); 3340 if (ev == NULL) 3341 return; 3342 3343 sock->pending_accept = 1; 3344 iev = &sock->readable_ev; 3345 3346 sock->references++; /* keep socket around for this internal event */ 3347 iev->ev_sender = sock; 3348 iev->ev_action = internal_accept; 3349 iev->ev_arg = sock; 3350 3351 isc_task_send(ev->ev_sender, (isc_event_t **)&iev); 3352 } 3353 3354 static void 3355 dispatch_connect(isc__socket_t *sock) { 3356 intev_t *iev; 3357 isc_socket_connev_t *ev; 3358 3359 iev = &sock->writable_ev; 3360 3361 ev = sock->connect_ev; 3362 INSIST(ev != NULL); /* XXX */ 3363 3364 INSIST(sock->connecting); 3365 3366 sock->references++; /* keep socket around for this internal event */ 3367 iev->ev_sender = sock; 3368 iev->ev_action = internal_connect; 3369 iev->ev_arg = sock; 3370 3371 isc_task_send(ev->ev_sender, (isc_event_t **)&iev); 3372 } 3373 3374 /* 3375 * Dequeue an item off the given socket's read queue, set the result code 3376 * in the done event to the one provided, and send it to the task it was 3377 * destined for. 3378 * 3379 * If the event to be sent is on a list, remove it before sending. If 3380 * asked to, send and detach from the socket as well. 3381 * 3382 * Caller must have the socket locked if the event is attached to the socket. 
3383 */ 3384 static void 3385 send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) { 3386 isc_task_t *task; 3387 3388 task = (*dev)->ev_sender; 3389 3390 (*dev)->ev_sender = sock; 3391 3392 if (ISC_LINK_LINKED(*dev, ev_link)) 3393 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link); 3394 3395 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) 3396 == ISC_SOCKEVENTATTR_ATTACHED) 3397 isc_task_sendanddetach(&task, (isc_event_t **)dev); 3398 else 3399 isc_task_send(task, (isc_event_t **)dev); 3400 } 3401 3402 /* 3403 * See comments for send_recvdone_event() above. 3404 * 3405 * Caller must have the socket locked if the event is attached to the socket. 3406 */ 3407 static void 3408 send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) { 3409 isc_task_t *task; 3410 3411 INSIST(dev != NULL && *dev != NULL); 3412 3413 task = (*dev)->ev_sender; 3414 (*dev)->ev_sender = sock; 3415 3416 if (ISC_LINK_LINKED(*dev, ev_link)) 3417 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link); 3418 3419 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) 3420 == ISC_SOCKEVENTATTR_ATTACHED) 3421 isc_task_sendanddetach(&task, (isc_event_t **)dev); 3422 else 3423 isc_task_send(task, (isc_event_t **)dev); 3424 } 3425 3426 /* 3427 * Call accept() on a socket, to get the new file descriptor. The listen 3428 * socket is used as a prototype to create a new isc_socket_t. The new 3429 * socket has one outstanding reference. The task receiving the event 3430 * will be detached from just after the event is delivered. 3431 * 3432 * On entry to this function, the event delivered is the internal 3433 * readable event, and the first item on the accept_list should be 3434 * the done event we want to send. If the list is empty, this is a no-op, 3435 * so just unlock and return. 
3436 */ 3437 static void 3438 internal_accept(isc_task_t *me, isc_event_t *ev) { 3439 isc__socket_t *sock; 3440 isc__socketmgr_t *manager; 3441 isc_socket_newconnev_t *dev; 3442 isc_task_t *task; 3443 ISC_SOCKADDR_LEN_T addrlen; 3444 int fd; 3445 isc_result_t result = ISC_R_SUCCESS; 3446 char strbuf[ISC_STRERRORSIZE]; 3447 const char *err = "accept"; 3448 3449 UNUSED(me); 3450 3451 sock = ev->ev_sender; 3452 INSIST(VALID_SOCKET(sock)); 3453 3454 LOCK(&sock->lock); 3455 socket_log(sock, NULL, TRACE, 3456 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK, 3457 "internal_accept called, locked socket"); 3458 3459 manager = sock->manager; 3460 INSIST(VALID_MANAGER(manager)); 3461 3462 INSIST(sock->listener); 3463 INSIST(sock->pending_accept == 1); 3464 sock->pending_accept = 0; 3465 3466 INSIST(sock->references > 0); 3467 sock->references--; /* the internal event is done with this socket */ 3468 if (sock->references == 0) { 3469 UNLOCK(&sock->lock); 3470 destroy(&sock); 3471 return; 3472 } 3473 3474 /* 3475 * Get the first item off the accept list. 3476 * If it is empty, unlock the socket and return. 3477 */ 3478 dev = ISC_LIST_HEAD(sock->accept_list); 3479 if (dev == NULL) { 3480 UNLOCK(&sock->lock); 3481 return; 3482 } 3483 3484 /* 3485 * Try to accept the new connection. If the accept fails with 3486 * EAGAIN or EINTR, simply poke the watcher to watch this socket 3487 * again. Also ignore ECONNRESET, which has been reported to 3488 * be spuriously returned on Linux 2.2.19 although it is not 3489 * a documented error for accept(). ECONNABORTED has been 3490 * reported for Solaris 8. The rest are thrown in not because 3491 * we have seen them but because they are ignored by other 3492 * daemons such as BIND 8 and Apache. 
3493 */ 3494 3495 addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type); 3496 memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen); 3497 fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa, 3498 (void *)&addrlen); 3499 3500 #ifdef F_DUPFD 3501 /* 3502 * Leave a space for stdio to work in. 3503 */ 3504 if (fd >= 0 && fd < 20) { 3505 int new, tmp; 3506 new = fcntl(fd, F_DUPFD, 20); 3507 tmp = errno; 3508 (void)close(fd); 3509 errno = tmp; 3510 fd = new; 3511 err = "accept/fcntl"; 3512 } 3513 #endif 3514 3515 if (fd < 0) { 3516 if (SOFT_ERROR(errno)) 3517 goto soft_error; 3518 switch (errno) { 3519 case ENFILE: 3520 case EMFILE: 3521 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, 3522 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 3523 isc_msgcat, ISC_MSGSET_SOCKET, 3524 ISC_MSG_TOOMANYFDS, 3525 "%s: too many open file descriptors", 3526 err); 3527 goto soft_error; 3528 3529 case ENOBUFS: 3530 case ENOMEM: 3531 case ECONNRESET: 3532 case ECONNABORTED: 3533 case EHOSTUNREACH: 3534 case EHOSTDOWN: 3535 case ENETUNREACH: 3536 case ENETDOWN: 3537 case ECONNREFUSED: 3538 #ifdef EPROTO 3539 case EPROTO: 3540 #endif 3541 #ifdef ENONET 3542 case ENONET: 3543 #endif 3544 goto soft_error; 3545 default: 3546 break; 3547 } 3548 isc__strerror(errno, strbuf, sizeof(strbuf)); 3549 UNEXPECTED_ERROR(__FILE__, __LINE__, 3550 "internal_accept: %s() %s: %s", err, 3551 isc_msgcat_get(isc_msgcat, 3552 ISC_MSGSET_GENERAL, 3553 ISC_MSG_FAILED, 3554 "failed"), 3555 strbuf); 3556 fd = -1; 3557 result = ISC_R_UNEXPECTED; 3558 } else { 3559 if (addrlen == 0U) { 3560 UNEXPECTED_ERROR(__FILE__, __LINE__, 3561 "internal_accept(): " 3562 "accept() failed to return " 3563 "remote address"); 3564 3565 (void)close(fd); 3566 goto soft_error; 3567 } else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family != 3568 sock->pf) 3569 { 3570 UNEXPECTED_ERROR(__FILE__, __LINE__, 3571 "internal_accept(): " 3572 "accept() returned peer address " 3573 "family %u (expected %u)", 3574 
NEWCONNSOCK(dev)->peer_address. 3575 type.sa.sa_family, 3576 sock->pf); 3577 (void)close(fd); 3578 goto soft_error; 3579 } else if (fd >= (int)manager->maxsocks) { 3580 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, 3581 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 3582 isc_msgcat, ISC_MSGSET_SOCKET, 3583 ISC_MSG_TOOMANYFDS, 3584 "accept: " 3585 "file descriptor exceeds limit (%d/%u)", 3586 fd, manager->maxsocks); 3587 (void)close(fd); 3588 goto soft_error; 3589 } 3590 } 3591 3592 if (fd != -1) { 3593 NEWCONNSOCK(dev)->peer_address.length = addrlen; 3594 NEWCONNSOCK(dev)->pf = sock->pf; 3595 } 3596 3597 /* 3598 * Pull off the done event. 3599 */ 3600 ISC_LIST_UNLINK(sock->accept_list, dev, ev_link); 3601 3602 /* 3603 * Poke watcher if there are more pending accepts. 3604 */ 3605 if (!ISC_LIST_EMPTY(sock->accept_list)) 3606 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT); 3607 3608 UNLOCK(&sock->lock); 3609 3610 if (fd != -1) { 3611 result = make_nonblock(fd); 3612 if (result != ISC_R_SUCCESS) { 3613 (void)close(fd); 3614 fd = -1; 3615 } 3616 } 3617 3618 /* 3619 * -1 means the new socket didn't happen. 3620 */ 3621 if (fd != -1) { 3622 int lockid = FDLOCK_ID(fd); 3623 3624 NEWCONNSOCK(dev)->fd = fd; 3625 NEWCONNSOCK(dev)->bound = 1; 3626 NEWCONNSOCK(dev)->connected = 1; 3627 3628 /* 3629 * Use minimum mtu if possible. 3630 */ 3631 use_min_mtu(NEWCONNSOCK(dev)); 3632 3633 /* 3634 * Ensure DSCP settings are inherited across accept. 
3635 */ 3636 setdscp(NEWCONNSOCK(dev), sock->dscp); 3637 3638 /* 3639 * Save away the remote address 3640 */ 3641 dev->address = NEWCONNSOCK(dev)->peer_address; 3642 3643 LOCK(&manager->fdlock[lockid]); 3644 manager->fds[fd] = NEWCONNSOCK(dev); 3645 manager->fdstate[fd] = MANAGED; 3646 UNLOCK(&manager->fdlock[lockid]); 3647 3648 LOCK(&manager->lock); 3649 3650 #ifdef USE_SELECT 3651 if (manager->maxfd < fd) 3652 manager->maxfd = fd; 3653 #endif 3654 3655 socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION, 3656 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN, 3657 "accepted connection, new socket %p", 3658 dev->newsocket); 3659 3660 ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link); 3661 3662 UNLOCK(&manager->lock); 3663 3664 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]); 3665 inc_stats(manager->stats, sock->statsindex[STATID_ACTIVE]); 3666 } else { 3667 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]); 3668 NEWCONNSOCK(dev)->references--; 3669 free_socket((isc__socket_t **)&dev->newsocket); 3670 } 3671 3672 /* 3673 * Fill in the done event details and send it off. 
3674 */ 3675 dev->result = result; 3676 task = dev->ev_sender; 3677 dev->ev_sender = sock; 3678 3679 isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev)); 3680 return; 3681 3682 soft_error: 3683 select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT); 3684 UNLOCK(&sock->lock); 3685 3686 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]); 3687 return; 3688 } 3689 3690 static void 3691 internal_recv(isc_task_t *me, isc_event_t *ev) { 3692 isc_socketevent_t *dev; 3693 isc__socket_t *sock; 3694 3695 INSIST(ev->ev_type == ISC_SOCKEVENT_INTR); 3696 3697 sock = ev->ev_sender; 3698 INSIST(VALID_SOCKET(sock)); 3699 3700 LOCK(&sock->lock); 3701 socket_log(sock, NULL, IOEVENT, 3702 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV, 3703 "internal_recv: task %p got event %p", me, ev); 3704 3705 INSIST(sock->pending_recv == 1); 3706 sock->pending_recv = 0; 3707 3708 INSIST(sock->references > 0); 3709 sock->references--; /* the internal event is done with this socket */ 3710 if (sock->references == 0) { 3711 UNLOCK(&sock->lock); 3712 destroy(&sock); 3713 return; 3714 } 3715 3716 /* 3717 * Try to do as much I/O as possible on this socket. There are no 3718 * limits here, currently. 3719 */ 3720 dev = ISC_LIST_HEAD(sock->recv_list); 3721 while (dev != NULL) { 3722 switch (doio_recv(sock, dev)) { 3723 case DOIO_SOFT: 3724 goto poke; 3725 3726 case DOIO_EOF: 3727 /* 3728 * read of 0 means the remote end was closed. 3729 * Run through the event queue and dispatch all 3730 * the events with an EOF result code. 
3731 */ 3732 do { 3733 dev->result = ISC_R_EOF; 3734 send_recvdone_event(sock, &dev); 3735 dev = ISC_LIST_HEAD(sock->recv_list); 3736 } while (dev != NULL); 3737 goto poke; 3738 3739 case DOIO_SUCCESS: 3740 case DOIO_HARD: 3741 send_recvdone_event(sock, &dev); 3742 break; 3743 } 3744 3745 dev = ISC_LIST_HEAD(sock->recv_list); 3746 } 3747 3748 poke: 3749 if (!ISC_LIST_EMPTY(sock->recv_list)) 3750 select_poke(sock->manager, sock->fd, SELECT_POKE_READ); 3751 3752 UNLOCK(&sock->lock); 3753 } 3754 3755 static void 3756 internal_send(isc_task_t *me, isc_event_t *ev) { 3757 isc_socketevent_t *dev; 3758 isc__socket_t *sock; 3759 3760 INSIST(ev->ev_type == ISC_SOCKEVENT_INTW); 3761 3762 /* 3763 * Find out what socket this is and lock it. 3764 */ 3765 sock = (isc__socket_t *)ev->ev_sender; 3766 INSIST(VALID_SOCKET(sock)); 3767 3768 LOCK(&sock->lock); 3769 socket_log(sock, NULL, IOEVENT, 3770 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND, 3771 "internal_send: task %p got event %p", me, ev); 3772 3773 INSIST(sock->pending_send == 1); 3774 sock->pending_send = 0; 3775 3776 INSIST(sock->references > 0); 3777 sock->references--; /* the internal event is done with this socket */ 3778 if (sock->references == 0) { 3779 UNLOCK(&sock->lock); 3780 destroy(&sock); 3781 return; 3782 } 3783 3784 /* 3785 * Try to do as much I/O as possible on this socket. There are no 3786 * limits here, currently. 
3787 */ 3788 dev = ISC_LIST_HEAD(sock->send_list); 3789 while (dev != NULL) { 3790 switch (doio_send(sock, dev)) { 3791 case DOIO_SOFT: 3792 goto poke; 3793 3794 case DOIO_HARD: 3795 case DOIO_SUCCESS: 3796 send_senddone_event(sock, &dev); 3797 break; 3798 } 3799 3800 dev = ISC_LIST_HEAD(sock->send_list); 3801 } 3802 3803 poke: 3804 if (!ISC_LIST_EMPTY(sock->send_list)) 3805 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE); 3806 3807 UNLOCK(&sock->lock); 3808 } 3809 3810 static void 3811 internal_fdwatch_write(isc_task_t *me, isc_event_t *ev) { 3812 isc__socket_t *sock; 3813 int more_data; 3814 3815 INSIST(ev->ev_type == ISC_SOCKEVENT_INTW); 3816 3817 /* 3818 * Find out what socket this is and lock it. 3819 */ 3820 sock = (isc__socket_t *)ev->ev_sender; 3821 INSIST(VALID_SOCKET(sock)); 3822 3823 LOCK(&sock->lock); 3824 socket_log(sock, NULL, IOEVENT, 3825 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND, 3826 "internal_fdwatch_write: task %p got event %p", me, ev); 3827 3828 INSIST(sock->pending_send == 1); 3829 3830 UNLOCK(&sock->lock); 3831 more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock, 3832 sock->fdwatcharg, ISC_SOCKFDWATCH_WRITE); 3833 LOCK(&sock->lock); 3834 3835 sock->pending_send = 0; 3836 3837 INSIST(sock->references > 0); 3838 sock->references--; /* the internal event is done with this socket */ 3839 if (sock->references == 0) { 3840 UNLOCK(&sock->lock); 3841 destroy(&sock); 3842 return; 3843 } 3844 3845 if (more_data) 3846 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE); 3847 3848 UNLOCK(&sock->lock); 3849 } 3850 3851 static void 3852 internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) { 3853 isc__socket_t *sock; 3854 int more_data; 3855 3856 INSIST(ev->ev_type == ISC_SOCKEVENT_INTR); 3857 3858 /* 3859 * Find out what socket this is and lock it. 
3860 */ 3861 sock = (isc__socket_t *)ev->ev_sender; 3862 INSIST(VALID_SOCKET(sock)); 3863 3864 LOCK(&sock->lock); 3865 socket_log(sock, NULL, IOEVENT, 3866 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV, 3867 "internal_fdwatch_read: task %p got event %p", me, ev); 3868 3869 INSIST(sock->pending_recv == 1); 3870 3871 UNLOCK(&sock->lock); 3872 more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock, 3873 sock->fdwatcharg, ISC_SOCKFDWATCH_READ); 3874 LOCK(&sock->lock); 3875 3876 sock->pending_recv = 0; 3877 3878 INSIST(sock->references > 0); 3879 sock->references--; /* the internal event is done with this socket */ 3880 if (sock->references == 0) { 3881 UNLOCK(&sock->lock); 3882 destroy(&sock); 3883 return; 3884 } 3885 3886 if (more_data) 3887 select_poke(sock->manager, sock->fd, SELECT_POKE_READ); 3888 3889 UNLOCK(&sock->lock); 3890 } 3891 3892 /* 3893 * Process read/writes on each fd here. Avoid locking 3894 * and unlocking twice if both reads and writes are possible. 3895 */ 3896 static void 3897 process_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable, 3898 isc_boolean_t writeable) 3899 { 3900 isc__socket_t *sock; 3901 isc_boolean_t unlock_sock; 3902 isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE; 3903 int lockid = FDLOCK_ID(fd); 3904 3905 /* 3906 * If the socket is going to be closed, don't do more I/O. 
3907 */ 3908 LOCK(&manager->fdlock[lockid]); 3909 if (manager->fdstate[fd] == CLOSE_PENDING) { 3910 UNLOCK(&manager->fdlock[lockid]); 3911 3912 (void)unwatch_fd(manager, fd, SELECT_POKE_READ); 3913 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); 3914 return; 3915 } 3916 3917 sock = manager->fds[fd]; 3918 unlock_sock = ISC_FALSE; 3919 if (readable) { 3920 if (sock == NULL) { 3921 unwatch_read = ISC_TRUE; 3922 goto check_write; 3923 } 3924 unlock_sock = ISC_TRUE; 3925 LOCK(&sock->lock); 3926 if (!SOCK_DEAD(sock)) { 3927 if (sock->listener) 3928 dispatch_accept(sock); 3929 else 3930 dispatch_recv(sock); 3931 } 3932 unwatch_read = ISC_TRUE; 3933 } 3934 check_write: 3935 if (writeable) { 3936 if (sock == NULL) { 3937 unwatch_write = ISC_TRUE; 3938 goto unlock_fd; 3939 } 3940 if (!unlock_sock) { 3941 unlock_sock = ISC_TRUE; 3942 LOCK(&sock->lock); 3943 } 3944 if (!SOCK_DEAD(sock)) { 3945 if (sock->connecting) 3946 dispatch_connect(sock); 3947 else 3948 dispatch_send(sock); 3949 } 3950 unwatch_write = ISC_TRUE; 3951 } 3952 if (unlock_sock) 3953 UNLOCK(&sock->lock); 3954 3955 unlock_fd: 3956 UNLOCK(&manager->fdlock[lockid]); 3957 if (unwatch_read) 3958 (void)unwatch_fd(manager, fd, SELECT_POKE_READ); 3959 if (unwatch_write) 3960 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); 3961 3962 } 3963 3964 #ifdef USE_KQUEUE 3965 static isc_boolean_t 3966 process_fds(isc__socketmgr_t *manager, struct kevent *events, int nevents) { 3967 int i; 3968 isc_boolean_t readable, writable; 3969 isc_boolean_t done = ISC_FALSE; 3970 #ifdef USE_WATCHER_THREAD 3971 isc_boolean_t have_ctlevent = ISC_FALSE; 3972 #endif 3973 3974 if (nevents == manager->nevents) { 3975 /* 3976 * This is not an error, but something unexpected. If this 3977 * happens, it may indicate the need for increasing 3978 * ISC_SOCKET_MAXEVENTS. 
3979 */ 3980 manager_log(manager, ISC_LOGCATEGORY_GENERAL, 3981 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, 3982 "maximum number of FD events (%d) received", 3983 nevents); 3984 } 3985 3986 for (i = 0; i < nevents; i++) { 3987 REQUIRE(events[i].ident < manager->maxsocks); 3988 #ifdef USE_WATCHER_THREAD 3989 if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) { 3990 have_ctlevent = ISC_TRUE; 3991 continue; 3992 } 3993 #endif 3994 readable = ISC_TF(events[i].filter == EVFILT_READ); 3995 writable = ISC_TF(events[i].filter == EVFILT_WRITE); 3996 process_fd(manager, events[i].ident, readable, writable); 3997 } 3998 3999 #ifdef USE_WATCHER_THREAD 4000 if (have_ctlevent) 4001 done = process_ctlfd(manager); 4002 #endif 4003 4004 return (done); 4005 } 4006 #elif defined(USE_EPOLL) 4007 static isc_boolean_t 4008 process_fds(isc__socketmgr_t *manager, struct epoll_event *events, int nevents) 4009 { 4010 int i; 4011 isc_boolean_t done = ISC_FALSE; 4012 #ifdef USE_WATCHER_THREAD 4013 isc_boolean_t have_ctlevent = ISC_FALSE; 4014 #endif 4015 4016 if (nevents == manager->nevents) { 4017 manager_log(manager, ISC_LOGCATEGORY_GENERAL, 4018 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, 4019 "maximum number of FD events (%d) received", 4020 nevents); 4021 } 4022 4023 for (i = 0; i < nevents; i++) { 4024 REQUIRE(events[i].data.fd < (int)manager->maxsocks); 4025 #ifdef USE_WATCHER_THREAD 4026 if (events[i].data.fd == manager->pipe_fds[0]) { 4027 have_ctlevent = ISC_TRUE; 4028 continue; 4029 } 4030 #endif 4031 if ((events[i].events & EPOLLERR) != 0 || 4032 (events[i].events & EPOLLHUP) != 0) { 4033 /* 4034 * epoll does not set IN/OUT bits on an erroneous 4035 * condition, so we need to try both anyway. This is a 4036 * bit inefficient, but should be okay for such rare 4037 * events. Note also that the read or write attempt 4038 * won't block because we use non-blocking sockets. 
4039 */ 4040 events[i].events |= (EPOLLIN | EPOLLOUT); 4041 } 4042 process_fd(manager, events[i].data.fd, 4043 (events[i].events & EPOLLIN) != 0, 4044 (events[i].events & EPOLLOUT) != 0); 4045 } 4046 4047 #ifdef USE_WATCHER_THREAD 4048 if (have_ctlevent) 4049 done = process_ctlfd(manager); 4050 #endif 4051 4052 return (done); 4053 } 4054 #elif defined(USE_DEVPOLL) 4055 static isc_boolean_t 4056 process_fds(isc__socketmgr_t *manager, struct pollfd *events, int nevents) { 4057 int i; 4058 isc_boolean_t done = ISC_FALSE; 4059 #ifdef USE_WATCHER_THREAD 4060 isc_boolean_t have_ctlevent = ISC_FALSE; 4061 #endif 4062 4063 if (nevents == manager->nevents) { 4064 manager_log(manager, ISC_LOGCATEGORY_GENERAL, 4065 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, 4066 "maximum number of FD events (%d) received", 4067 nevents); 4068 } 4069 4070 for (i = 0; i < nevents; i++) { 4071 REQUIRE(events[i].fd < (int)manager->maxsocks); 4072 #ifdef USE_WATCHER_THREAD 4073 if (events[i].fd == manager->pipe_fds[0]) { 4074 have_ctlevent = ISC_TRUE; 4075 continue; 4076 } 4077 #endif 4078 process_fd(manager, events[i].fd, 4079 (events[i].events & POLLIN) != 0, 4080 (events[i].events & POLLOUT) != 0); 4081 } 4082 4083 #ifdef USE_WATCHER_THREAD 4084 if (have_ctlevent) 4085 done = process_ctlfd(manager); 4086 #endif 4087 4088 return (done); 4089 } 4090 #elif defined(USE_SELECT) 4091 static void 4092 process_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds, 4093 fd_set *writefds) 4094 { 4095 int i; 4096 4097 REQUIRE(maxfd <= (int)manager->maxsocks); 4098 4099 for (i = 0; i < maxfd; i++) { 4100 #ifdef USE_WATCHER_THREAD 4101 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1]) 4102 continue; 4103 #endif /* USE_WATCHER_THREAD */ 4104 process_fd(manager, i, FD_ISSET(i, readfds), 4105 FD_ISSET(i, writefds)); 4106 } 4107 } 4108 #endif 4109 4110 #ifdef USE_WATCHER_THREAD 4111 static isc_boolean_t 4112 process_ctlfd(isc__socketmgr_t *manager) { 4113 int msg, fd; 4114 4115 for (;;) { 4116 
select_readmsg(manager, &fd, &msg); 4117 4118 manager_log(manager, IOEVENT, 4119 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, 4120 ISC_MSG_WATCHERMSG, 4121 "watcher got message %d " 4122 "for socket %d"), msg, fd); 4123 4124 /* 4125 * Nothing to read? 4126 */ 4127 if (msg == SELECT_POKE_NOTHING) 4128 break; 4129 4130 /* 4131 * Handle shutdown message. We really should 4132 * jump out of this loop right away, but 4133 * it doesn't matter if we have to do a little 4134 * more work first. 4135 */ 4136 if (msg == SELECT_POKE_SHUTDOWN) 4137 return (ISC_TRUE); 4138 4139 /* 4140 * This is a wakeup on a socket. Look 4141 * at the event queue for both read and write, 4142 * and decide if we need to watch on it now 4143 * or not. 4144 */ 4145 wakeup_socket(manager, fd, msg); 4146 } 4147 4148 return (ISC_FALSE); 4149 } 4150 4151 /* 4152 * This is the thread that will loop forever, always in a select or poll 4153 * call. 4154 * 4155 * When select returns something to do, track down what thread gets to do 4156 * this I/O and post the event to it. 4157 */ 4158 static isc_threadresult_t 4159 watcher(void *uap) { 4160 isc__socketmgr_t *manager = uap; 4161 isc_boolean_t done; 4162 int cc; 4163 #ifdef USE_KQUEUE 4164 const char *fnname = "kevent()"; 4165 #elif defined (USE_EPOLL) 4166 const char *fnname = "epoll_wait()"; 4167 #elif defined(USE_DEVPOLL) 4168 isc_result_t result; 4169 const char *fnname = "ioctl(DP_POLL)"; 4170 struct dvpoll dvp; 4171 int pass; 4172 #elif defined (USE_SELECT) 4173 const char *fnname = "select()"; 4174 int maxfd; 4175 int ctlfd; 4176 #endif 4177 char strbuf[ISC_STRERRORSIZE]; 4178 #ifdef ISC_SOCKET_USE_POLLWATCH 4179 pollstate_t pollstate = poll_idle; 4180 #endif 4181 4182 #if defined (USE_SELECT) 4183 /* 4184 * Get the control fd here. This will never change. 
4185 */ 4186 ctlfd = manager->pipe_fds[0]; 4187 #endif 4188 done = ISC_FALSE; 4189 while (!done) { 4190 do { 4191 #ifdef USE_KQUEUE 4192 cc = kevent(manager->kqueue_fd, NULL, 0, 4193 manager->events, manager->nevents, NULL); 4194 #elif defined(USE_EPOLL) 4195 cc = epoll_wait(manager->epoll_fd, manager->events, 4196 manager->nevents, -1); 4197 #elif defined(USE_DEVPOLL) 4198 /* 4199 * Re-probe every thousand calls. 4200 */ 4201 if (manager->calls++ > 1000U) { 4202 result = isc_resource_getcurlimit( 4203 isc_resource_openfiles, 4204 &manager->open_max); 4205 if (result != ISC_R_SUCCESS) 4206 manager->open_max = 64; 4207 manager->calls = 0; 4208 } 4209 for (pass = 0; pass < 2; pass++) { 4210 dvp.dp_fds = manager->events; 4211 dvp.dp_nfds = manager->nevents; 4212 if (dvp.dp_nfds >= manager->open_max) 4213 dvp.dp_nfds = manager->open_max - 1; 4214 #ifndef ISC_SOCKET_USE_POLLWATCH 4215 dvp.dp_timeout = -1; 4216 #else 4217 if (pollstate == poll_idle) 4218 dvp.dp_timeout = -1; 4219 else 4220 dvp.dp_timeout = 4221 ISC_SOCKET_POLLWATCH_TIMEOUT; 4222 #endif /* ISC_SOCKET_USE_POLLWATCH */ 4223 cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp); 4224 if (cc == -1 && errno == EINVAL) { 4225 /* 4226 * {OPEN_MAX} may have dropped. Look 4227 * up the current value and try again. 
4228 */ 4229 result = isc_resource_getcurlimit( 4230 isc_resource_openfiles, 4231 &manager->open_max); 4232 if (result != ISC_R_SUCCESS) 4233 manager->open_max = 64; 4234 } else 4235 break; 4236 } 4237 #elif defined(USE_SELECT) 4238 LOCK(&manager->lock); 4239 memmove(manager->read_fds_copy, manager->read_fds, 4240 manager->fd_bufsize); 4241 memmove(manager->write_fds_copy, manager->write_fds, 4242 manager->fd_bufsize); 4243 maxfd = manager->maxfd + 1; 4244 UNLOCK(&manager->lock); 4245 4246 cc = select(maxfd, manager->read_fds_copy, 4247 manager->write_fds_copy, NULL, NULL); 4248 #endif /* USE_KQUEUE */ 4249 4250 if (cc < 0 && !SOFT_ERROR(errno)) { 4251 isc__strerror(errno, strbuf, sizeof(strbuf)); 4252 FATAL_ERROR(__FILE__, __LINE__, 4253 "%s %s: %s", fnname, 4254 isc_msgcat_get(isc_msgcat, 4255 ISC_MSGSET_GENERAL, 4256 ISC_MSG_FAILED, 4257 "failed"), strbuf); 4258 } 4259 4260 #if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) 4261 if (cc == 0) { 4262 if (pollstate == poll_active) 4263 pollstate = poll_checking; 4264 else if (pollstate == poll_checking) 4265 pollstate = poll_idle; 4266 } else if (cc > 0) { 4267 if (pollstate == poll_checking) { 4268 /* 4269 * XXX: We'd like to use a more 4270 * verbose log level as it's actually an 4271 * unexpected event, but the kernel bug 4272 * reportedly happens pretty frequently 4273 * (and it can also be a false positive) 4274 * so it would be just too noisy. 4275 */ 4276 manager_log(manager, 4277 ISC_LOGCATEGORY_GENERAL, 4278 ISC_LOGMODULE_SOCKET, 4279 ISC_LOG_DEBUG(1), 4280 "unexpected POLL timeout"); 4281 } 4282 pollstate = poll_active; 4283 } 4284 #endif 4285 } while (cc < 0); 4286 4287 #if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL) 4288 done = process_fds(manager, manager->events, cc); 4289 #elif defined(USE_SELECT) 4290 process_fds(manager, maxfd, manager->read_fds_copy, 4291 manager->write_fds_copy); 4292 4293 /* 4294 * Process reads on internal, control fd. 
4295 */ 4296 if (FD_ISSET(ctlfd, manager->read_fds_copy)) 4297 done = process_ctlfd(manager); 4298 #endif 4299 } 4300 4301 manager_log(manager, TRACE, "%s", 4302 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 4303 ISC_MSG_EXITING, "watcher exiting")); 4304 4305 return ((isc_threadresult_t)0); 4306 } 4307 #endif /* USE_WATCHER_THREAD */ 4308 4309 void 4310 isc__socketmgr_setreserved(isc_socketmgr_t *manager0, isc_uint32_t reserved) { 4311 isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0; 4312 4313 REQUIRE(VALID_MANAGER(manager)); 4314 4315 manager->reserved = reserved; 4316 } 4317 4318 void 4319 isc__socketmgr_maxudp(isc_socketmgr_t *manager0, int maxudp) { 4320 isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0; 4321 4322 REQUIRE(VALID_MANAGER(manager)); 4323 4324 manager->maxudp = maxudp; 4325 } 4326 4327 /* 4328 * Create a new socket manager. 4329 */ 4330 4331 static isc_result_t 4332 setup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) { 4333 isc_result_t result; 4334 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 4335 char strbuf[ISC_STRERRORSIZE]; 4336 #endif 4337 4338 #ifdef USE_KQUEUE 4339 manager->nevents = ISC_SOCKET_MAXEVENTS; 4340 manager->events = isc_mem_get(mctx, sizeof(struct kevent) * 4341 manager->nevents); 4342 if (manager->events == NULL) 4343 return (ISC_R_NOMEMORY); 4344 manager->kqueue_fd = kqueue(); 4345 if (manager->kqueue_fd == -1) { 4346 result = isc__errno2result(errno); 4347 isc__strerror(errno, strbuf, sizeof(strbuf)); 4348 UNEXPECTED_ERROR(__FILE__, __LINE__, 4349 "kqueue %s: %s", 4350 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 4351 ISC_MSG_FAILED, "failed"), 4352 strbuf); 4353 isc_mem_put(mctx, manager->events, 4354 sizeof(struct kevent) * manager->nevents); 4355 return (result); 4356 } 4357 4358 #ifdef USE_WATCHER_THREAD 4359 result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); 4360 if (result != ISC_R_SUCCESS) { 4361 close(manager->kqueue_fd); 4362 isc_mem_put(mctx, 
manager->events, 4363 sizeof(struct kevent) * manager->nevents); 4364 return (result); 4365 } 4366 #endif /* USE_WATCHER_THREAD */ 4367 #elif defined(USE_EPOLL) 4368 manager->nevents = ISC_SOCKET_MAXEVENTS; 4369 manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) * 4370 manager->nevents); 4371 if (manager->events == NULL) 4372 return (ISC_R_NOMEMORY); 4373 manager->epoll_fd = epoll_create(manager->nevents); 4374 if (manager->epoll_fd == -1) { 4375 result = isc__errno2result(errno); 4376 isc__strerror(errno, strbuf, sizeof(strbuf)); 4377 UNEXPECTED_ERROR(__FILE__, __LINE__, 4378 "epoll_create %s: %s", 4379 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 4380 ISC_MSG_FAILED, "failed"), 4381 strbuf); 4382 isc_mem_put(mctx, manager->events, 4383 sizeof(struct epoll_event) * manager->nevents); 4384 return (result); 4385 } 4386 #ifdef USE_WATCHER_THREAD 4387 result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); 4388 if (result != ISC_R_SUCCESS) { 4389 close(manager->epoll_fd); 4390 isc_mem_put(mctx, manager->events, 4391 sizeof(struct epoll_event) * manager->nevents); 4392 return (result); 4393 } 4394 #endif /* USE_WATCHER_THREAD */ 4395 #elif defined(USE_DEVPOLL) 4396 manager->nevents = ISC_SOCKET_MAXEVENTS; 4397 result = isc_resource_getcurlimit(isc_resource_openfiles, 4398 &manager->open_max); 4399 if (result != ISC_R_SUCCESS) 4400 manager->open_max = 64; 4401 manager->calls = 0; 4402 manager->events = isc_mem_get(mctx, sizeof(struct pollfd) * 4403 manager->nevents); 4404 if (manager->events == NULL) 4405 return (ISC_R_NOMEMORY); 4406 /* 4407 * Note: fdpollinfo should be able to support all possible FDs, so 4408 * it must have maxsocks entries (not nevents). 
4409 */ 4410 manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) * 4411 manager->maxsocks); 4412 if (manager->fdpollinfo == NULL) { 4413 isc_mem_put(mctx, manager->events, 4414 sizeof(struct pollfd) * manager->nevents); 4415 return (ISC_R_NOMEMORY); 4416 } 4417 memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks); 4418 manager->devpoll_fd = open("/dev/poll", O_RDWR); 4419 if (manager->devpoll_fd == -1) { 4420 result = isc__errno2result(errno); 4421 isc__strerror(errno, strbuf, sizeof(strbuf)); 4422 UNEXPECTED_ERROR(__FILE__, __LINE__, 4423 "open(/dev/poll) %s: %s", 4424 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 4425 ISC_MSG_FAILED, "failed"), 4426 strbuf); 4427 isc_mem_put(mctx, manager->events, 4428 sizeof(struct pollfd) * manager->nevents); 4429 isc_mem_put(mctx, manager->fdpollinfo, 4430 sizeof(pollinfo_t) * manager->maxsocks); 4431 return (result); 4432 } 4433 #ifdef USE_WATCHER_THREAD 4434 result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); 4435 if (result != ISC_R_SUCCESS) { 4436 close(manager->devpoll_fd); 4437 isc_mem_put(mctx, manager->events, 4438 sizeof(struct pollfd) * manager->nevents); 4439 isc_mem_put(mctx, manager->fdpollinfo, 4440 sizeof(pollinfo_t) * manager->maxsocks); 4441 return (result); 4442 } 4443 #endif /* USE_WATCHER_THREAD */ 4444 #elif defined(USE_SELECT) 4445 UNUSED(result); 4446 4447 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE 4448 /* 4449 * Note: this code should also cover the case of MAXSOCKETS <= 4450 * FD_SETSIZE, but we separate the cases to avoid possible portability 4451 * issues regarding howmany() and the actual representation of fd_set. 
4452 */ 4453 manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) * 4454 sizeof(fd_mask); 4455 #else 4456 manager->fd_bufsize = sizeof(fd_set); 4457 #endif 4458 4459 manager->read_fds = NULL; 4460 manager->read_fds_copy = NULL; 4461 manager->write_fds = NULL; 4462 manager->write_fds_copy = NULL; 4463 4464 manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize); 4465 if (manager->read_fds != NULL) 4466 manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize); 4467 if (manager->read_fds_copy != NULL) 4468 manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize); 4469 if (manager->write_fds != NULL) { 4470 manager->write_fds_copy = isc_mem_get(mctx, 4471 manager->fd_bufsize); 4472 } 4473 if (manager->write_fds_copy == NULL) { 4474 if (manager->write_fds != NULL) { 4475 isc_mem_put(mctx, manager->write_fds, 4476 manager->fd_bufsize); 4477 } 4478 if (manager->read_fds_copy != NULL) { 4479 isc_mem_put(mctx, manager->read_fds_copy, 4480 manager->fd_bufsize); 4481 } 4482 if (manager->read_fds != NULL) { 4483 isc_mem_put(mctx, manager->read_fds, 4484 manager->fd_bufsize); 4485 } 4486 return (ISC_R_NOMEMORY); 4487 } 4488 memset(manager->read_fds, 0, manager->fd_bufsize); 4489 memset(manager->write_fds, 0, manager->fd_bufsize); 4490 4491 #ifdef USE_WATCHER_THREAD 4492 (void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); 4493 manager->maxfd = manager->pipe_fds[0]; 4494 #else /* USE_WATCHER_THREAD */ 4495 manager->maxfd = 0; 4496 #endif /* USE_WATCHER_THREAD */ 4497 #endif /* USE_KQUEUE */ 4498 4499 return (ISC_R_SUCCESS); 4500 } 4501 4502 static void 4503 cleanup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) { 4504 #ifdef USE_WATCHER_THREAD 4505 isc_result_t result; 4506 4507 result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); 4508 if (result != ISC_R_SUCCESS) { 4509 UNEXPECTED_ERROR(__FILE__, __LINE__, 4510 "epoll_ctl(DEL) %s", 4511 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 4512 ISC_MSG_FAILED, "failed")); 4513 } 4514 
#endif /* USE_WATCHER_THREAD */ 4515 4516 #ifdef USE_KQUEUE 4517 close(manager->kqueue_fd); 4518 isc_mem_put(mctx, manager->events, 4519 sizeof(struct kevent) * manager->nevents); 4520 #elif defined(USE_EPOLL) 4521 close(manager->epoll_fd); 4522 isc_mem_put(mctx, manager->events, 4523 sizeof(struct epoll_event) * manager->nevents); 4524 #elif defined(USE_DEVPOLL) 4525 close(manager->devpoll_fd); 4526 isc_mem_put(mctx, manager->events, 4527 sizeof(struct pollfd) * manager->nevents); 4528 isc_mem_put(mctx, manager->fdpollinfo, 4529 sizeof(pollinfo_t) * manager->maxsocks); 4530 #elif defined(USE_SELECT) 4531 if (manager->read_fds != NULL) 4532 isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize); 4533 if (manager->read_fds_copy != NULL) 4534 isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize); 4535 if (manager->write_fds != NULL) 4536 isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize); 4537 if (manager->write_fds_copy != NULL) 4538 isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize); 4539 #endif /* USE_KQUEUE */ 4540 } 4541 4542 isc_result_t 4543 isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { 4544 return (isc__socketmgr_create2(mctx, managerp, 0)); 4545 } 4546 4547 isc_result_t 4548 isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp, 4549 unsigned int maxsocks) 4550 { 4551 int i; 4552 isc__socketmgr_t *manager; 4553 #ifdef USE_WATCHER_THREAD 4554 char strbuf[ISC_STRERRORSIZE]; 4555 #endif 4556 isc_result_t result; 4557 4558 REQUIRE(managerp != NULL && *managerp == NULL); 4559 4560 #ifdef USE_SHARED_MANAGER 4561 if (socketmgr != NULL) { 4562 /* Don't allow maxsocks to be updated */ 4563 if (maxsocks > 0 && socketmgr->maxsocks != maxsocks) 4564 return (ISC_R_EXISTS); 4565 4566 socketmgr->refs++; 4567 *managerp = (isc_socketmgr_t *)socketmgr; 4568 return (ISC_R_SUCCESS); 4569 } 4570 #endif /* USE_SHARED_MANAGER */ 4571 4572 if (maxsocks == 0) 4573 maxsocks = ISC_SOCKET_MAXSOCKETS; 4574 4575 manager 
= isc_mem_get(mctx, sizeof(*manager)); 4576 if (manager == NULL) 4577 return (ISC_R_NOMEMORY); 4578 4579 /* zero-clear so that necessary cleanup on failure will be easy */ 4580 memset(manager, 0, sizeof(*manager)); 4581 manager->maxsocks = maxsocks; 4582 manager->reserved = 0; 4583 manager->maxudp = 0; 4584 manager->fds = isc_mem_get(mctx, 4585 manager->maxsocks * sizeof(isc__socket_t *)); 4586 if (manager->fds == NULL) { 4587 result = ISC_R_NOMEMORY; 4588 goto free_manager; 4589 } 4590 manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int)); 4591 if (manager->fdstate == NULL) { 4592 result = ISC_R_NOMEMORY; 4593 goto free_manager; 4594 } 4595 manager->stats = NULL; 4596 4597 manager->common.methods = &socketmgrmethods; 4598 manager->common.magic = ISCAPI_SOCKETMGR_MAGIC; 4599 manager->common.impmagic = SOCKET_MANAGER_MAGIC; 4600 manager->mctx = NULL; 4601 memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *)); 4602 ISC_LIST_INIT(manager->socklist); 4603 result = isc_mutex_init(&manager->lock); 4604 if (result != ISC_R_SUCCESS) 4605 goto free_manager; 4606 manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t)); 4607 if (manager->fdlock == NULL) { 4608 result = ISC_R_NOMEMORY; 4609 goto cleanup_lock; 4610 } 4611 for (i = 0; i < FDLOCK_COUNT; i++) { 4612 result = isc_mutex_init(&manager->fdlock[i]); 4613 if (result != ISC_R_SUCCESS) { 4614 while (--i >= 0) 4615 DESTROYLOCK(&manager->fdlock[i]); 4616 isc_mem_put(mctx, manager->fdlock, 4617 FDLOCK_COUNT * sizeof(isc_mutex_t)); 4618 manager->fdlock = NULL; 4619 goto cleanup_lock; 4620 } 4621 } 4622 4623 #ifdef USE_WATCHER_THREAD 4624 if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) { 4625 UNEXPECTED_ERROR(__FILE__, __LINE__, 4626 "isc_condition_init() %s", 4627 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 4628 ISC_MSG_FAILED, "failed")); 4629 result = ISC_R_UNEXPECTED; 4630 goto cleanup_lock; 4631 } 4632 4633 /* 4634 * Create the special fds that will be used 
to wake up the 4635 * select/poll loop when something internal needs to be done. 4636 */ 4637 if (pipe(manager->pipe_fds) != 0) { 4638 isc__strerror(errno, strbuf, sizeof(strbuf)); 4639 UNEXPECTED_ERROR(__FILE__, __LINE__, 4640 "pipe() %s: %s", 4641 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 4642 ISC_MSG_FAILED, "failed"), 4643 strbuf); 4644 result = ISC_R_UNEXPECTED; 4645 goto cleanup_condition; 4646 } 4647 4648 RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS); 4649 #if 0 4650 RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS); 4651 #endif 4652 #endif /* USE_WATCHER_THREAD */ 4653 4654 #ifdef USE_SHARED_MANAGER 4655 manager->refs = 1; 4656 #endif /* USE_SHARED_MANAGER */ 4657 4658 /* 4659 * Set up initial state for the select loop 4660 */ 4661 result = setup_watcher(mctx, manager); 4662 if (result != ISC_R_SUCCESS) 4663 goto cleanup; 4664 memset(manager->fdstate, 0, manager->maxsocks * sizeof(int)); 4665 #ifdef USE_WATCHER_THREAD 4666 /* 4667 * Start up the select/poll thread. 
4668 */ 4669 if (isc_thread_create(watcher, manager, &manager->watcher) != 4670 ISC_R_SUCCESS) { 4671 UNEXPECTED_ERROR(__FILE__, __LINE__, 4672 "isc_thread_create() %s", 4673 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 4674 ISC_MSG_FAILED, "failed")); 4675 cleanup_watcher(mctx, manager); 4676 result = ISC_R_UNEXPECTED; 4677 goto cleanup; 4678 } 4679 #endif /* USE_WATCHER_THREAD */ 4680 isc_mem_attach(mctx, &manager->mctx); 4681 4682 #ifdef USE_SHARED_MANAGER 4683 socketmgr = manager; 4684 #endif /* USE_SHARED_MANAGER */ 4685 *managerp = (isc_socketmgr_t *)manager; 4686 4687 return (ISC_R_SUCCESS); 4688 4689 cleanup: 4690 #ifdef USE_WATCHER_THREAD 4691 (void)close(manager->pipe_fds[0]); 4692 (void)close(manager->pipe_fds[1]); 4693 #endif /* USE_WATCHER_THREAD */ 4694 4695 #ifdef USE_WATCHER_THREAD 4696 cleanup_condition: 4697 (void)isc_condition_destroy(&manager->shutdown_ok); 4698 #endif /* USE_WATCHER_THREAD */ 4699 4700 4701 cleanup_lock: 4702 if (manager->fdlock != NULL) { 4703 for (i = 0; i < FDLOCK_COUNT; i++) 4704 DESTROYLOCK(&manager->fdlock[i]); 4705 } 4706 DESTROYLOCK(&manager->lock); 4707 4708 free_manager: 4709 if (manager->fdlock != NULL) { 4710 isc_mem_put(mctx, manager->fdlock, 4711 FDLOCK_COUNT * sizeof(isc_mutex_t)); 4712 } 4713 if (manager->fdstate != NULL) { 4714 isc_mem_put(mctx, manager->fdstate, 4715 manager->maxsocks * sizeof(int)); 4716 } 4717 if (manager->fds != NULL) { 4718 isc_mem_put(mctx, manager->fds, 4719 manager->maxsocks * sizeof(isc_socket_t *)); 4720 } 4721 isc_mem_put(mctx, manager, sizeof(*manager)); 4722 4723 return (result); 4724 } 4725 4726 isc_result_t 4727 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp) { 4728 isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0; 4729 REQUIRE(VALID_MANAGER(manager)); 4730 REQUIRE(nsockp != NULL); 4731 4732 *nsockp = manager->maxsocks; 4733 4734 return (ISC_R_SUCCESS); 4735 } 4736 4737 void 4738 isc_socketmgr_setstats(isc_socketmgr_t *manager0, isc_stats_t 
*stats) { 4739 isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0; 4740 4741 REQUIRE(VALID_MANAGER(manager)); 4742 REQUIRE(ISC_LIST_EMPTY(manager->socklist)); 4743 REQUIRE(manager->stats == NULL); 4744 REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max); 4745 4746 isc_stats_attach(stats, &manager->stats); 4747 } 4748 4749 void 4750 isc__socketmgr_destroy(isc_socketmgr_t **managerp) { 4751 isc__socketmgr_t *manager; 4752 int i; 4753 isc_mem_t *mctx; 4754 4755 /* 4756 * Destroy a socket manager. 4757 */ 4758 4759 REQUIRE(managerp != NULL); 4760 manager = (isc__socketmgr_t *)*managerp; 4761 REQUIRE(VALID_MANAGER(manager)); 4762 4763 #ifdef USE_SHARED_MANAGER 4764 manager->refs--; 4765 if (manager->refs > 0) { 4766 *managerp = NULL; 4767 return; 4768 } 4769 socketmgr = NULL; 4770 #endif /* USE_SHARED_MANAGER */ 4771 4772 LOCK(&manager->lock); 4773 4774 /* 4775 * Wait for all sockets to be destroyed. 4776 */ 4777 while (!ISC_LIST_EMPTY(manager->socklist)) { 4778 #ifdef USE_WATCHER_THREAD 4779 manager_log(manager, CREATION, "%s", 4780 isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, 4781 ISC_MSG_SOCKETSREMAIN, 4782 "sockets exist")); 4783 WAIT(&manager->shutdown_ok, &manager->lock); 4784 #else /* USE_WATCHER_THREAD */ 4785 UNLOCK(&manager->lock); 4786 isc__taskmgr_dispatch(NULL); 4787 LOCK(&manager->lock); 4788 #endif /* USE_WATCHER_THREAD */ 4789 } 4790 4791 UNLOCK(&manager->lock); 4792 4793 /* 4794 * Here, poke our select/poll thread. Do this by closing the write 4795 * half of the pipe, which will send EOF to the read half. 4796 * This is currently a no-op in the non-threaded case. 4797 */ 4798 select_poke(manager, 0, SELECT_POKE_SHUTDOWN); 4799 4800 #ifdef USE_WATCHER_THREAD 4801 /* 4802 * Wait for thread to exit. 
4803 */ 4804 if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS) 4805 UNEXPECTED_ERROR(__FILE__, __LINE__, 4806 "isc_thread_join() %s", 4807 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 4808 ISC_MSG_FAILED, "failed")); 4809 #endif /* USE_WATCHER_THREAD */ 4810 4811 /* 4812 * Clean up. 4813 */ 4814 cleanup_watcher(manager->mctx, manager); 4815 4816 #ifdef USE_WATCHER_THREAD 4817 (void)close(manager->pipe_fds[0]); 4818 (void)close(manager->pipe_fds[1]); 4819 (void)isc_condition_destroy(&manager->shutdown_ok); 4820 #endif /* USE_WATCHER_THREAD */ 4821 4822 for (i = 0; i < (int)manager->maxsocks; i++) 4823 if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */ 4824 (void)close(i); 4825 4826 isc_mem_put(manager->mctx, manager->fds, 4827 manager->maxsocks * sizeof(isc__socket_t *)); 4828 isc_mem_put(manager->mctx, manager->fdstate, 4829 manager->maxsocks * sizeof(int)); 4830 4831 if (manager->stats != NULL) 4832 isc_stats_detach(&manager->stats); 4833 4834 if (manager->fdlock != NULL) { 4835 for (i = 0; i < FDLOCK_COUNT; i++) 4836 DESTROYLOCK(&manager->fdlock[i]); 4837 isc_mem_put(manager->mctx, manager->fdlock, 4838 FDLOCK_COUNT * sizeof(isc_mutex_t)); 4839 } 4840 DESTROYLOCK(&manager->lock); 4841 manager->common.magic = 0; 4842 manager->common.impmagic = 0; 4843 mctx= manager->mctx; 4844 isc_mem_put(mctx, manager, sizeof(*manager)); 4845 4846 isc_mem_detach(&mctx); 4847 4848 *managerp = NULL; 4849 4850 #ifdef USE_SHARED_MANAGER 4851 socketmgr = NULL; 4852 #endif 4853 } 4854 4855 static isc_result_t 4856 socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, 4857 unsigned int flags) 4858 { 4859 int io_state; 4860 isc_boolean_t have_lock = ISC_FALSE; 4861 isc_task_t *ntask = NULL; 4862 isc_result_t result = ISC_R_SUCCESS; 4863 4864 dev->ev_sender = task; 4865 4866 if (sock->type == isc_sockettype_udp) { 4867 io_state = doio_recv(sock, dev); 4868 } else { 4869 LOCK(&sock->lock); 4870 have_lock = ISC_TRUE; 4871 4872 if 
(ISC_LIST_EMPTY(sock->recv_list)) 4873 io_state = doio_recv(sock, dev); 4874 else 4875 io_state = DOIO_SOFT; 4876 } 4877 4878 switch (io_state) { 4879 case DOIO_SOFT: 4880 /* 4881 * We couldn't read all or part of the request right now, so 4882 * queue it. 4883 * 4884 * Attach to socket and to task 4885 */ 4886 isc_task_attach(task, &ntask); 4887 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED; 4888 4889 if (!have_lock) { 4890 LOCK(&sock->lock); 4891 have_lock = ISC_TRUE; 4892 } 4893 4894 /* 4895 * Enqueue the request. If the socket was previously not being 4896 * watched, poke the watcher to start paying attention to it. 4897 */ 4898 if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv) 4899 select_poke(sock->manager, sock->fd, SELECT_POKE_READ); 4900 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link); 4901 4902 socket_log(sock, NULL, EVENT, NULL, 0, 0, 4903 "socket_recv: event %p -> task %p", 4904 dev, ntask); 4905 4906 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) 4907 result = ISC_R_INPROGRESS; 4908 break; 4909 4910 case DOIO_EOF: 4911 dev->result = ISC_R_EOF; 4912 /* fallthrough */ 4913 4914 case DOIO_HARD: 4915 case DOIO_SUCCESS: 4916 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) 4917 send_recvdone_event(sock, &dev); 4918 break; 4919 } 4920 4921 if (have_lock) 4922 UNLOCK(&sock->lock); 4923 4924 return (result); 4925 } 4926 4927 isc_result_t 4928 isc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist, 4929 unsigned int minimum, isc_task_t *task, 4930 isc_taskaction_t action, void *arg) 4931 { 4932 isc__socket_t *sock = (isc__socket_t *)sock0; 4933 isc_socketevent_t *dev; 4934 isc__socketmgr_t *manager; 4935 unsigned int iocount; 4936 isc_buffer_t *buffer; 4937 4938 REQUIRE(VALID_SOCKET(sock)); 4939 REQUIRE(buflist != NULL); 4940 REQUIRE(!ISC_LIST_EMPTY(*buflist)); 4941 REQUIRE(task != NULL); 4942 REQUIRE(action != NULL); 4943 4944 manager = sock->manager; 4945 REQUIRE(VALID_MANAGER(manager)); 4946 4947 iocount = isc_bufferlist_availablecount(buflist); 
4948 REQUIRE(iocount > 0); 4949 4950 INSIST(sock->bound); 4951 4952 dev = allocate_socketevent(manager->mctx, sock, 4953 ISC_SOCKEVENT_RECVDONE, action, arg); 4954 if (dev == NULL) 4955 return (ISC_R_NOMEMORY); 4956 4957 /* 4958 * UDP sockets are always partial read 4959 */ 4960 if (sock->type == isc_sockettype_udp) 4961 dev->minimum = 1; 4962 else { 4963 if (minimum == 0) 4964 dev->minimum = iocount; 4965 else 4966 dev->minimum = minimum; 4967 } 4968 4969 /* 4970 * Move each buffer from the passed in list to our internal one. 4971 */ 4972 buffer = ISC_LIST_HEAD(*buflist); 4973 while (buffer != NULL) { 4974 ISC_LIST_DEQUEUE(*buflist, buffer, link); 4975 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link); 4976 buffer = ISC_LIST_HEAD(*buflist); 4977 } 4978 4979 return (socket_recv(sock, dev, task, 0)); 4980 } 4981 4982 isc_result_t 4983 isc__socket_recv(isc_socket_t *sock0, isc_region_t *region, 4984 unsigned int minimum, isc_task_t *task, 4985 isc_taskaction_t action, void *arg) 4986 { 4987 isc__socket_t *sock = (isc__socket_t *)sock0; 4988 isc_socketevent_t *dev; 4989 isc__socketmgr_t *manager; 4990 4991 REQUIRE(VALID_SOCKET(sock)); 4992 REQUIRE(action != NULL); 4993 4994 manager = sock->manager; 4995 REQUIRE(VALID_MANAGER(manager)); 4996 4997 INSIST(sock->bound); 4998 4999 dev = allocate_socketevent(manager->mctx, sock, 5000 ISC_SOCKEVENT_RECVDONE, action, arg); 5001 if (dev == NULL) 5002 return (ISC_R_NOMEMORY); 5003 5004 return (isc__socket_recv2(sock0, region, minimum, task, dev, 0)); 5005 } 5006 5007 isc_result_t 5008 isc__socket_recv2(isc_socket_t *sock0, isc_region_t *region, 5009 unsigned int minimum, isc_task_t *task, 5010 isc_socketevent_t *event, unsigned int flags) 5011 { 5012 isc__socket_t *sock = (isc__socket_t *)sock0; 5013 5014 event->ev_sender = sock; 5015 event->result = ISC_R_UNSET; 5016 ISC_LIST_INIT(event->bufferlist); 5017 event->region = *region; 5018 event->n = 0; 5019 event->offset = 0; 5020 event->attributes = 0; 5021 5022 /* 5023 * UDP 
sockets are always partial read. 5024 */ 5025 if (sock->type == isc_sockettype_udp) 5026 event->minimum = 1; 5027 else { 5028 if (minimum == 0) 5029 event->minimum = region->length; 5030 else 5031 event->minimum = minimum; 5032 } 5033 5034 return (socket_recv(sock, event, task, flags)); 5035 } 5036 5037 static isc_result_t 5038 socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, 5039 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo, 5040 unsigned int flags) 5041 { 5042 int io_state; 5043 isc_boolean_t have_lock = ISC_FALSE; 5044 isc_task_t *ntask = NULL; 5045 isc_result_t result = ISC_R_SUCCESS; 5046 5047 dev->ev_sender = task; 5048 5049 set_dev_address(address, sock, dev); 5050 if (pktinfo != NULL) { 5051 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO; 5052 dev->pktinfo = *pktinfo; 5053 5054 if (!isc_sockaddr_issitelocal(&dev->address) && 5055 !isc_sockaddr_islinklocal(&dev->address)) { 5056 socket_log(sock, NULL, TRACE, isc_msgcat, 5057 ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED, 5058 "pktinfo structure provided, ifindex %u " 5059 "(set to 0)", pktinfo->ipi6_ifindex); 5060 5061 /* 5062 * Set the pktinfo index to 0 here, to let the 5063 * kernel decide what interface it should send on. 5064 */ 5065 dev->pktinfo.ipi6_ifindex = 0; 5066 } 5067 } 5068 5069 if (sock->type == isc_sockettype_udp) 5070 io_state = doio_send(sock, dev); 5071 else { 5072 LOCK(&sock->lock); 5073 have_lock = ISC_TRUE; 5074 5075 if (ISC_LIST_EMPTY(sock->send_list)) 5076 io_state = doio_send(sock, dev); 5077 else 5078 io_state = DOIO_SOFT; 5079 } 5080 5081 switch (io_state) { 5082 case DOIO_SOFT: 5083 /* 5084 * We couldn't send all or part of the request right now, so 5085 * queue it unless ISC_SOCKFLAG_NORETRY is set. 
5086 */ 5087 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) { 5088 isc_task_attach(task, &ntask); 5089 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED; 5090 5091 if (!have_lock) { 5092 LOCK(&sock->lock); 5093 have_lock = ISC_TRUE; 5094 } 5095 5096 /* 5097 * Enqueue the request. If the socket was previously 5098 * not being watched, poke the watcher to start 5099 * paying attention to it. 5100 */ 5101 if (ISC_LIST_EMPTY(sock->send_list) && 5102 !sock->pending_send) 5103 select_poke(sock->manager, sock->fd, 5104 SELECT_POKE_WRITE); 5105 ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link); 5106 5107 socket_log(sock, NULL, EVENT, NULL, 0, 0, 5108 "socket_send: event %p -> task %p", 5109 dev, ntask); 5110 5111 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) 5112 result = ISC_R_INPROGRESS; 5113 break; 5114 } 5115 5116 case DOIO_HARD: 5117 case DOIO_SUCCESS: 5118 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) 5119 send_senddone_event(sock, &dev); 5120 break; 5121 } 5122 5123 if (have_lock) 5124 UNLOCK(&sock->lock); 5125 5126 return (result); 5127 } 5128 5129 isc_result_t 5130 isc__socket_send(isc_socket_t *sock, isc_region_t *region, 5131 isc_task_t *task, isc_taskaction_t action, void *arg) 5132 { 5133 /* 5134 * REQUIRE() checking is performed in isc_socket_sendto(). 
5135 */ 5136 return (isc__socket_sendto(sock, region, task, action, arg, NULL, 5137 NULL)); 5138 } 5139 5140 isc_result_t 5141 isc__socket_sendto(isc_socket_t *sock0, isc_region_t *region, 5142 isc_task_t *task, isc_taskaction_t action, void *arg, 5143 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) 5144 { 5145 isc__socket_t *sock = (isc__socket_t *)sock0; 5146 isc_socketevent_t *dev; 5147 isc__socketmgr_t *manager; 5148 5149 REQUIRE(VALID_SOCKET(sock)); 5150 REQUIRE(region != NULL); 5151 REQUIRE(task != NULL); 5152 REQUIRE(action != NULL); 5153 5154 manager = sock->manager; 5155 REQUIRE(VALID_MANAGER(manager)); 5156 5157 INSIST(sock->bound); 5158 5159 dev = allocate_socketevent(manager->mctx, sock, 5160 ISC_SOCKEVENT_SENDDONE, action, arg); 5161 if (dev == NULL) 5162 return (ISC_R_NOMEMORY); 5163 5164 dev->region = *region; 5165 5166 return (socket_send(sock, dev, task, address, pktinfo, 0)); 5167 } 5168 5169 isc_result_t 5170 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist, 5171 isc_task_t *task, isc_taskaction_t action, void *arg) 5172 { 5173 return (isc__socket_sendtov2(sock, buflist, task, action, arg, NULL, 5174 NULL, 0)); 5175 } 5176 5177 isc_result_t 5178 isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist, 5179 isc_task_t *task, isc_taskaction_t action, void *arg, 5180 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) 5181 { 5182 return (isc__socket_sendtov2(sock, buflist, task, action, arg, address, 5183 pktinfo, 0)); 5184 } 5185 5186 isc_result_t 5187 isc__socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist, 5188 isc_task_t *task, isc_taskaction_t action, void *arg, 5189 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo, 5190 unsigned int flags) 5191 { 5192 isc__socket_t *sock = (isc__socket_t *)sock0; 5193 isc_socketevent_t *dev; 5194 isc__socketmgr_t *manager; 5195 unsigned int iocount; 5196 isc_buffer_t *buffer; 5197 5198 REQUIRE(VALID_SOCKET(sock)); 5199 REQUIRE(buflist != NULL); 5200 
REQUIRE(!ISC_LIST_EMPTY(*buflist)); 5201 REQUIRE(task != NULL); 5202 REQUIRE(action != NULL); 5203 5204 manager = sock->manager; 5205 REQUIRE(VALID_MANAGER(manager)); 5206 5207 iocount = isc_bufferlist_usedcount(buflist); 5208 REQUIRE(iocount > 0); 5209 5210 dev = allocate_socketevent(manager->mctx, sock, 5211 ISC_SOCKEVENT_SENDDONE, action, arg); 5212 if (dev == NULL) 5213 return (ISC_R_NOMEMORY); 5214 5215 /* 5216 * Move each buffer from the passed in list to our internal one. 5217 */ 5218 buffer = ISC_LIST_HEAD(*buflist); 5219 while (buffer != NULL) { 5220 ISC_LIST_DEQUEUE(*buflist, buffer, link); 5221 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link); 5222 buffer = ISC_LIST_HEAD(*buflist); 5223 } 5224 5225 return (socket_send(sock, dev, task, address, pktinfo, flags)); 5226 } 5227 5228 isc_result_t 5229 isc__socket_sendto2(isc_socket_t *sock0, isc_region_t *region, 5230 isc_task_t *task, 5231 isc_sockaddr_t *address, struct in6_pktinfo *pktinfo, 5232 isc_socketevent_t *event, unsigned int flags) 5233 { 5234 isc__socket_t *sock = (isc__socket_t *)sock0; 5235 5236 REQUIRE(VALID_SOCKET(sock)); 5237 REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0); 5238 if ((flags & ISC_SOCKFLAG_NORETRY) != 0) 5239 REQUIRE(sock->type == isc_sockettype_udp); 5240 event->ev_sender = sock; 5241 event->result = ISC_R_UNSET; 5242 ISC_LIST_INIT(event->bufferlist); 5243 event->region = *region; 5244 event->n = 0; 5245 event->offset = 0; 5246 event->attributes &= ~ISC_SOCKEVENTATTR_ATTACHED; 5247 5248 return (socket_send(sock, event, task, address, pktinfo, flags)); 5249 } 5250 5251 void 5252 isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) { 5253 #ifdef ISC_PLATFORM_HAVESYSUNH 5254 int s; 5255 struct stat sb; 5256 char strbuf[ISC_STRERRORSIZE]; 5257 5258 if (sockaddr->type.sa.sa_family != AF_UNIX) 5259 return; 5260 5261 #ifndef S_ISSOCK 5262 #if defined(S_IFMT) && defined(S_IFSOCK) 5263 #define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK) 5264 #elif 
defined(_S_IFMT) && defined(S_IFSOCK) 5265 #define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK) 5266 #endif 5267 #endif 5268 5269 #ifndef S_ISFIFO 5270 #if defined(S_IFMT) && defined(S_IFIFO) 5271 #define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO) 5272 #elif defined(_S_IFMT) && defined(S_IFIFO) 5273 #define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO) 5274 #endif 5275 #endif 5276 5277 #if !defined(S_ISFIFO) && !defined(S_ISSOCK) 5278 #error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform. See <sys/stat.h>. 5279 #endif 5280 5281 #ifndef S_ISFIFO 5282 #define S_ISFIFO(mode) 0 5283 #endif 5284 5285 #ifndef S_ISSOCK 5286 #define S_ISSOCK(mode) 0 5287 #endif 5288 5289 if (active) { 5290 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) { 5291 isc__strerror(errno, strbuf, sizeof(strbuf)); 5292 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 5293 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 5294 "isc_socket_cleanunix: stat(%s): %s", 5295 sockaddr->type.sunix.sun_path, strbuf); 5296 return; 5297 } 5298 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) { 5299 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 5300 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 5301 "isc_socket_cleanunix: %s: not a socket", 5302 sockaddr->type.sunix.sun_path); 5303 return; 5304 } 5305 if (unlink(sockaddr->type.sunix.sun_path) < 0) { 5306 isc__strerror(errno, strbuf, sizeof(strbuf)); 5307 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 5308 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 5309 "isc_socket_cleanunix: unlink(%s): %s", 5310 sockaddr->type.sunix.sun_path, strbuf); 5311 } 5312 return; 5313 } 5314 5315 s = socket(AF_UNIX, SOCK_STREAM, 0); 5316 if (s < 0) { 5317 isc__strerror(errno, strbuf, sizeof(strbuf)); 5318 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 5319 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING, 5320 "isc_socket_cleanunix: socket(%s): %s", 5321 sockaddr->type.sunix.sun_path, strbuf); 5322 return; 5323 } 5324 5325 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) { 5326 switch 
(errno) { 5327 case ENOENT: /* We exited cleanly last time */ 5328 break; 5329 default: 5330 isc__strerror(errno, strbuf, sizeof(strbuf)); 5331 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 5332 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING, 5333 "isc_socket_cleanunix: stat(%s): %s", 5334 sockaddr->type.sunix.sun_path, strbuf); 5335 break; 5336 } 5337 goto cleanup; 5338 } 5339 5340 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) { 5341 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 5342 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING, 5343 "isc_socket_cleanunix: %s: not a socket", 5344 sockaddr->type.sunix.sun_path); 5345 goto cleanup; 5346 } 5347 5348 if (connect(s, (struct sockaddr *)&sockaddr->type.sunix, 5349 sizeof(sockaddr->type.sunix)) < 0) { 5350 switch (errno) { 5351 case ECONNREFUSED: 5352 case ECONNRESET: 5353 if (unlink(sockaddr->type.sunix.sun_path) < 0) { 5354 isc__strerror(errno, strbuf, sizeof(strbuf)); 5355 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 5356 ISC_LOGMODULE_SOCKET, 5357 ISC_LOG_WARNING, 5358 "isc_socket_cleanunix: " 5359 "unlink(%s): %s", 5360 sockaddr->type.sunix.sun_path, 5361 strbuf); 5362 } 5363 break; 5364 default: 5365 isc__strerror(errno, strbuf, sizeof(strbuf)); 5366 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 5367 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING, 5368 "isc_socket_cleanunix: connect(%s): %s", 5369 sockaddr->type.sunix.sun_path, strbuf); 5370 break; 5371 } 5372 } 5373 cleanup: 5374 close(s); 5375 #else 5376 UNUSED(sockaddr); 5377 UNUSED(active); 5378 #endif 5379 } 5380 5381 isc_result_t 5382 isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm, 5383 isc_uint32_t owner, isc_uint32_t group) 5384 { 5385 #ifdef ISC_PLATFORM_HAVESYSUNH 5386 isc_result_t result = ISC_R_SUCCESS; 5387 char strbuf[ISC_STRERRORSIZE]; 5388 char path[sizeof(sockaddr->type.sunix.sun_path)]; 5389 #ifdef NEED_SECURE_DIRECTORY 5390 char *slash; 5391 #endif 5392 5393 REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX); 5394 
INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path)); 5395 strcpy(path, sockaddr->type.sunix.sun_path); 5396 5397 #ifdef NEED_SECURE_DIRECTORY 5398 slash = strrchr(path, '/'); 5399 if (slash != NULL) { 5400 if (slash != path) 5401 *slash = '\0'; 5402 else 5403 strcpy(path, "/"); 5404 } else 5405 strcpy(path, "."); 5406 #endif 5407 5408 if (chmod(path, perm) < 0) { 5409 isc__strerror(errno, strbuf, sizeof(strbuf)); 5410 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 5411 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 5412 "isc_socket_permunix: chmod(%s, %d): %s", 5413 path, perm, strbuf); 5414 result = ISC_R_FAILURE; 5415 } 5416 if (chown(path, owner, group) < 0) { 5417 isc__strerror(errno, strbuf, sizeof(strbuf)); 5418 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, 5419 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, 5420 "isc_socket_permunix: chown(%s, %d, %d): %s", 5421 path, owner, group, 5422 strbuf); 5423 result = ISC_R_FAILURE; 5424 } 5425 return (result); 5426 #else 5427 UNUSED(sockaddr); 5428 UNUSED(perm); 5429 UNUSED(owner); 5430 UNUSED(group); 5431 return (ISC_R_NOTIMPLEMENTED); 5432 #endif 5433 } 5434 5435 isc_result_t 5436 isc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr, 5437 unsigned int options) { 5438 isc__socket_t *sock = (isc__socket_t *)sock0; 5439 char strbuf[ISC_STRERRORSIZE]; 5440 int on = 1; 5441 5442 REQUIRE(VALID_SOCKET(sock)); 5443 5444 LOCK(&sock->lock); 5445 5446 INSIST(!sock->bound); 5447 INSIST(!sock->dupped); 5448 5449 if (sock->pf != sockaddr->type.sa.sa_family) { 5450 UNLOCK(&sock->lock); 5451 return (ISC_R_FAMILYMISMATCH); 5452 } 5453 5454 /* 5455 * Only set SO_REUSEADDR when we want a specific port. 
5456 */ 5457 #ifdef AF_UNIX 5458 if (sock->pf == AF_UNIX) 5459 goto bind_socket; 5460 #endif 5461 if ((options & ISC_SOCKET_REUSEADDRESS) != 0 && 5462 isc_sockaddr_getport(sockaddr) != (in_port_t)0 && 5463 setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on, 5464 sizeof(on)) < 0) { 5465 UNEXPECTED_ERROR(__FILE__, __LINE__, 5466 "setsockopt(%d) %s", sock->fd, 5467 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, 5468 ISC_MSG_FAILED, "failed")); 5469 /* Press on... */ 5470 } 5471 #ifdef AF_UNIX 5472 bind_socket: 5473 #endif 5474 if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) { 5475 inc_stats(sock->manager->stats, 5476 sock->statsindex[STATID_BINDFAIL]); 5477 5478 UNLOCK(&sock->lock); 5479 switch (errno) { 5480 case EACCES: 5481 return (ISC_R_NOPERM); 5482 case EADDRNOTAVAIL: 5483 return (ISC_R_ADDRNOTAVAIL); 5484 case EADDRINUSE: 5485 return (ISC_R_ADDRINUSE); 5486 case EINVAL: 5487 return (ISC_R_BOUND); 5488 default: 5489 isc__strerror(errno, strbuf, sizeof(strbuf)); 5490 UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s", 5491 strbuf); 5492 return (ISC_R_UNEXPECTED); 5493 } 5494 } 5495 5496 socket_log(sock, sockaddr, TRACE, 5497 isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound"); 5498 sock->bound = 1; 5499 5500 UNLOCK(&sock->lock); 5501 return (ISC_R_SUCCESS); 5502 } 5503 5504 /* 5505 * Enable this only for specific OS versions, and only when they have repaired 5506 * their problems with it. Until then, this is is broken and needs to be 5507 * diabled by default. See RT22589 for details. 
5508 */ 5509 #undef ENABLE_ACCEPTFILTER 5510 5511 isc_result_t 5512 isc__socket_filter(isc_socket_t *sock0, const char *filter) { 5513 isc__socket_t *sock = (isc__socket_t *)sock0; 5514 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) 5515 char strbuf[ISC_STRERRORSIZE]; 5516 struct accept_filter_arg afa; 5517 #else 5518 UNUSED(sock); 5519 UNUSED(filter); 5520 #endif 5521 5522 REQUIRE(VALID_SOCKET(sock)); 5523 5524 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) 5525 bzero(&afa, sizeof(afa)); 5526 strncpy(afa.af_name, filter, sizeof(afa.af_name)); 5527 if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER, 5528 &afa, sizeof(afa)) == -1) { 5529 isc__strerror(errno, strbuf, sizeof(strbuf)); 5530 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, 5531 ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s", 5532 strbuf); 5533 return (ISC_R_FAILURE); 5534 } 5535 return (ISC_R_SUCCESS); 5536 #else 5537 return (ISC_R_NOTIMPLEMENTED); 5538 #endif 5539 } 5540 5541 /* 5542 * Set up to listen on a given socket. We do this by creating an internal 5543 * event that will be dispatched when the socket has read activity. The 5544 * watcher will send the internal event to the task when there is a new 5545 * connection. 5546 * 5547 * Unlike in read, we don't preallocate a done event here. Every time there 5548 * is a new connection we'll have to allocate a new one anyway, so we might 5549 * as well keep things simple rather than having to track them. 
5550 */ 5551 isc_result_t 5552 isc__socket_listen(isc_socket_t *sock0, unsigned int backlog) { 5553 isc__socket_t *sock = (isc__socket_t *)sock0; 5554 char strbuf[ISC_STRERRORSIZE]; 5555 5556 REQUIRE(VALID_SOCKET(sock)); 5557 5558 LOCK(&sock->lock); 5559 5560 REQUIRE(!sock->listener); 5561 REQUIRE(sock->bound); 5562 REQUIRE(sock->type == isc_sockettype_tcp || 5563 sock->type == isc_sockettype_unix); 5564 5565 if (backlog == 0) 5566 backlog = SOMAXCONN; 5567 5568 if (listen(sock->fd, (int)backlog) < 0) { 5569 UNLOCK(&sock->lock); 5570 isc__strerror(errno, strbuf, sizeof(strbuf)); 5571 5572 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf); 5573 5574 return (ISC_R_UNEXPECTED); 5575 } 5576 5577 sock->listener = 1; 5578 5579 UNLOCK(&sock->lock); 5580 return (ISC_R_SUCCESS); 5581 } 5582 5583 /* 5584 * This should try to do aggressive accept() XXXMLG 5585 */ 5586 isc_result_t 5587 isc__socket_accept(isc_socket_t *sock0, 5588 isc_task_t *task, isc_taskaction_t action, void *arg) 5589 { 5590 isc__socket_t *sock = (isc__socket_t *)sock0; 5591 isc_socket_newconnev_t *dev; 5592 isc__socketmgr_t *manager; 5593 isc_task_t *ntask = NULL; 5594 isc__socket_t *nsock; 5595 isc_result_t result; 5596 isc_boolean_t do_poke = ISC_FALSE; 5597 5598 REQUIRE(VALID_SOCKET(sock)); 5599 manager = sock->manager; 5600 REQUIRE(VALID_MANAGER(manager)); 5601 5602 LOCK(&sock->lock); 5603 5604 REQUIRE(sock->listener); 5605 5606 /* 5607 * Sender field is overloaded here with the task we will be sending 5608 * this event to. Just before the actual event is delivered the 5609 * actual ev_sender will be touched up to be the socket. 
5610 */ 5611 dev = (isc_socket_newconnev_t *) 5612 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN, 5613 action, arg, sizeof(*dev)); 5614 if (dev == NULL) { 5615 UNLOCK(&sock->lock); 5616 return (ISC_R_NOMEMORY); 5617 } 5618 ISC_LINK_INIT(dev, ev_link); 5619 5620 result = allocate_socket(manager, sock->type, &nsock); 5621 if (result != ISC_R_SUCCESS) { 5622 isc_event_free(ISC_EVENT_PTR(&dev)); 5623 UNLOCK(&sock->lock); 5624 return (result); 5625 } 5626 5627 /* 5628 * Attach to socket and to task. 5629 */ 5630 isc_task_attach(task, &ntask); 5631 if (isc_task_exiting(ntask)) { 5632 free_socket(&nsock); 5633 isc_task_detach(&ntask); 5634 isc_event_free(ISC_EVENT_PTR(&dev)); 5635 UNLOCK(&sock->lock); 5636 return (ISC_R_SHUTTINGDOWN); 5637 } 5638 nsock->references++; 5639 nsock->statsindex = sock->statsindex; 5640 5641 dev->ev_sender = ntask; 5642 dev->newsocket = (isc_socket_t *)nsock; 5643 5644 /* 5645 * Poke watcher here. We still have the socket locked, so there 5646 * is no race condition. We will keep the lock for such a short 5647 * bit of time waking it up now or later won't matter all that much. 
5648 */ 5649 if (ISC_LIST_EMPTY(sock->accept_list)) 5650 do_poke = ISC_TRUE; 5651 5652 ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link); 5653 5654 if (do_poke) 5655 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT); 5656 5657 UNLOCK(&sock->lock); 5658 return (ISC_R_SUCCESS); 5659 } 5660 5661 isc_result_t 5662 isc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr, 5663 isc_task_t *task, isc_taskaction_t action, void *arg) 5664 { 5665 isc__socket_t *sock = (isc__socket_t *)sock0; 5666 isc_socket_connev_t *dev; 5667 isc_task_t *ntask = NULL; 5668 isc__socketmgr_t *manager; 5669 int cc; 5670 char strbuf[ISC_STRERRORSIZE]; 5671 char addrbuf[ISC_SOCKADDR_FORMATSIZE]; 5672 5673 REQUIRE(VALID_SOCKET(sock)); 5674 REQUIRE(addr != NULL); 5675 REQUIRE(task != NULL); 5676 REQUIRE(action != NULL); 5677 5678 manager = sock->manager; 5679 REQUIRE(VALID_MANAGER(manager)); 5680 REQUIRE(addr != NULL); 5681 5682 if (isc_sockaddr_ismulticast(addr)) 5683 return (ISC_R_MULTICAST); 5684 5685 LOCK(&sock->lock); 5686 5687 REQUIRE(!sock->connecting); 5688 5689 dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock, 5690 ISC_SOCKEVENT_CONNECT, 5691 action, arg, 5692 sizeof(*dev)); 5693 if (dev == NULL) { 5694 UNLOCK(&sock->lock); 5695 return (ISC_R_NOMEMORY); 5696 } 5697 ISC_LINK_INIT(dev, ev_link); 5698 5699 /* 5700 * Try to do the connect right away, as there can be only one 5701 * outstanding, and it might happen to complete. 5702 */ 5703 sock->peer_address = *addr; 5704 cc = connect(sock->fd, &addr->type.sa, addr->length); 5705 if (cc < 0) { 5706 /* 5707 * HP-UX "fails" to connect a UDP socket and sets errno to 5708 * EINPROGRESS if it's non-blocking. We'd rather regard this as 5709 * a success and let the user detect it if it's really an error 5710 * at the time of sending a packet on the socket. 
5711 */ 5712 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) { 5713 cc = 0; 5714 goto success; 5715 } 5716 if (SOFT_ERROR(errno) || errno == EINPROGRESS) 5717 goto queue; 5718 5719 switch (errno) { 5720 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit; 5721 ERROR_MATCH(EACCES, ISC_R_NOPERM); 5722 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 5723 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL); 5724 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED); 5725 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH); 5726 #ifdef EHOSTDOWN 5727 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH); 5728 #endif 5729 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH); 5730 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES); 5731 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH); 5732 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED); 5733 ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET); 5734 #undef ERROR_MATCH 5735 } 5736 5737 sock->connected = 0; 5738 5739 isc__strerror(errno, strbuf, sizeof(strbuf)); 5740 isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf)); 5741 UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s", 5742 addrbuf, errno, strbuf); 5743 5744 UNLOCK(&sock->lock); 5745 inc_stats(sock->manager->stats, 5746 sock->statsindex[STATID_CONNECTFAIL]); 5747 isc_event_free(ISC_EVENT_PTR(&dev)); 5748 return (ISC_R_UNEXPECTED); 5749 5750 err_exit: 5751 sock->connected = 0; 5752 isc_task_send(task, ISC_EVENT_PTR(&dev)); 5753 5754 UNLOCK(&sock->lock); 5755 inc_stats(sock->manager->stats, 5756 sock->statsindex[STATID_CONNECTFAIL]); 5757 return (ISC_R_SUCCESS); 5758 } 5759 5760 /* 5761 * If connect completed, fire off the done event. 5762 */ 5763 success: 5764 if (cc == 0) { 5765 sock->connected = 1; 5766 sock->bound = 1; 5767 dev->result = ISC_R_SUCCESS; 5768 isc_task_send(task, ISC_EVENT_PTR(&dev)); 5769 5770 UNLOCK(&sock->lock); 5771 5772 inc_stats(sock->manager->stats, 5773 sock->statsindex[STATID_CONNECT]); 5774 5775 return (ISC_R_SUCCESS); 5776 } 5777 5778 queue: 5779 5780 /* 5781 * Attach to task. 
5782 */ 5783 isc_task_attach(task, &ntask); 5784 5785 sock->connecting = 1; 5786 5787 dev->ev_sender = ntask; 5788 5789 /* 5790 * Poke watcher here. We still have the socket locked, so there 5791 * is no race condition. We will keep the lock for such a short 5792 * bit of time waking it up now or later won't matter all that much. 5793 */ 5794 if (sock->connect_ev == NULL) 5795 select_poke(manager, sock->fd, SELECT_POKE_CONNECT); 5796 5797 sock->connect_ev = dev; 5798 5799 UNLOCK(&sock->lock); 5800 return (ISC_R_SUCCESS); 5801 } 5802 5803 /* 5804 * Called when a socket with a pending connect() finishes. 5805 */ 5806 static void 5807 internal_connect(isc_task_t *me, isc_event_t *ev) { 5808 isc__socket_t *sock; 5809 isc_socket_connev_t *dev; 5810 isc_task_t *task; 5811 int cc; 5812 ISC_SOCKADDR_LEN_T optlen; 5813 char strbuf[ISC_STRERRORSIZE]; 5814 char peerbuf[ISC_SOCKADDR_FORMATSIZE]; 5815 5816 UNUSED(me); 5817 INSIST(ev->ev_type == ISC_SOCKEVENT_INTW); 5818 5819 sock = ev->ev_sender; 5820 INSIST(VALID_SOCKET(sock)); 5821 5822 LOCK(&sock->lock); 5823 5824 /* 5825 * When the internal event was sent the reference count was bumped 5826 * to keep the socket around for us. Decrement the count here. 5827 */ 5828 INSIST(sock->references > 0); 5829 sock->references--; 5830 if (sock->references == 0) { 5831 UNLOCK(&sock->lock); 5832 destroy(&sock); 5833 return; 5834 } 5835 5836 /* 5837 * Has this event been canceled? 5838 */ 5839 dev = sock->connect_ev; 5840 if (dev == NULL) { 5841 INSIST(!sock->connecting); 5842 UNLOCK(&sock->lock); 5843 return; 5844 } 5845 5846 INSIST(sock->connecting); 5847 sock->connecting = 0; 5848 5849 /* 5850 * Get any possible error status here. 5851 */ 5852 optlen = sizeof(cc); 5853 if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, 5854 (void *)&cc, (void *)&optlen) < 0) 5855 cc = errno; 5856 else 5857 errno = cc; 5858 5859 if (errno != 0) { 5860 /* 5861 * If the error is EAGAIN, just re-select on this 5862 * fd and pretend nothing strange happened. 
5863 */ 5864 if (SOFT_ERROR(errno) || errno == EINPROGRESS) { 5865 sock->connecting = 1; 5866 select_poke(sock->manager, sock->fd, 5867 SELECT_POKE_CONNECT); 5868 UNLOCK(&sock->lock); 5869 5870 return; 5871 } 5872 5873 inc_stats(sock->manager->stats, 5874 sock->statsindex[STATID_CONNECTFAIL]); 5875 5876 /* 5877 * Translate other errors into ISC_R_* flavors. 5878 */ 5879 switch (errno) { 5880 #define ERROR_MATCH(a, b) case a: dev->result = b; break; 5881 ERROR_MATCH(EACCES, ISC_R_NOPERM); 5882 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); 5883 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL); 5884 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED); 5885 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH); 5886 #ifdef EHOSTDOWN 5887 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH); 5888 #endif 5889 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH); 5890 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES); 5891 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH); 5892 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED); 5893 ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT); 5894 ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET); 5895 #undef ERROR_MATCH 5896 default: 5897 dev->result = ISC_R_UNEXPECTED; 5898 isc_sockaddr_format(&sock->peer_address, peerbuf, 5899 sizeof(peerbuf)); 5900 isc__strerror(errno, strbuf, sizeof(strbuf)); 5901 UNEXPECTED_ERROR(__FILE__, __LINE__, 5902 "internal_connect: connect(%s) %s", 5903 peerbuf, strbuf); 5904 } 5905 } else { 5906 inc_stats(sock->manager->stats, 5907 sock->statsindex[STATID_CONNECT]); 5908 dev->result = ISC_R_SUCCESS; 5909 sock->connected = 1; 5910 sock->bound = 1; 5911 } 5912 5913 sock->connect_ev = NULL; 5914 5915 UNLOCK(&sock->lock); 5916 5917 task = dev->ev_sender; 5918 dev->ev_sender = sock; 5919 isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev)); 5920 } 5921 5922 isc_result_t 5923 isc__socket_getpeername(isc_socket_t *sock0, isc_sockaddr_t *addressp) { 5924 isc__socket_t *sock = (isc__socket_t *)sock0; 5925 isc_result_t result; 5926 5927 REQUIRE(VALID_SOCKET(sock)); 5928 REQUIRE(addressp != 
NULL); 5929 5930 LOCK(&sock->lock); 5931 5932 if (sock->connected) { 5933 *addressp = sock->peer_address; 5934 result = ISC_R_SUCCESS; 5935 } else { 5936 result = ISC_R_NOTCONNECTED; 5937 } 5938 5939 UNLOCK(&sock->lock); 5940 5941 return (result); 5942 } 5943 5944 isc_result_t 5945 isc__socket_getsockname(isc_socket_t *sock0, isc_sockaddr_t *addressp) { 5946 isc__socket_t *sock = (isc__socket_t *)sock0; 5947 ISC_SOCKADDR_LEN_T len; 5948 isc_result_t result; 5949 char strbuf[ISC_STRERRORSIZE]; 5950 5951 REQUIRE(VALID_SOCKET(sock)); 5952 REQUIRE(addressp != NULL); 5953 5954 LOCK(&sock->lock); 5955 5956 if (!sock->bound) { 5957 result = ISC_R_NOTBOUND; 5958 goto out; 5959 } 5960 5961 result = ISC_R_SUCCESS; 5962 5963 len = sizeof(addressp->type); 5964 if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) { 5965 isc__strerror(errno, strbuf, sizeof(strbuf)); 5966 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s", 5967 strbuf); 5968 result = ISC_R_UNEXPECTED; 5969 goto out; 5970 } 5971 addressp->length = (unsigned int)len; 5972 5973 out: 5974 UNLOCK(&sock->lock); 5975 5976 return (result); 5977 } 5978 5979 /* 5980 * Run through the list of events on this socket, and cancel the ones 5981 * queued for task "task" of type "how". "how" is a bitmask. 5982 */ 5983 void 5984 isc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) { 5985 isc__socket_t *sock = (isc__socket_t *)sock0; 5986 5987 REQUIRE(VALID_SOCKET(sock)); 5988 5989 /* 5990 * Quick exit if there is nothing to do. Don't even bother locking 5991 * in this case. 5992 */ 5993 if (how == 0) 5994 return; 5995 5996 LOCK(&sock->lock); 5997 5998 /* 5999 * All of these do the same thing, more or less. 6000 * Each will: 6001 * o If the internal event is marked as "posted" try to 6002 * remove it from the task's queue. If this fails, mark it 6003 * as canceled instead, and let the task clean it up later. 
6004 * o For each I/O request for that task of that type, post 6005 * its done event with status of "ISC_R_CANCELED". 6006 * o Reset any state needed. 6007 */ 6008 if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV) 6009 && !ISC_LIST_EMPTY(sock->recv_list)) { 6010 isc_socketevent_t *dev; 6011 isc_socketevent_t *next; 6012 isc_task_t *current_task; 6013 6014 dev = ISC_LIST_HEAD(sock->recv_list); 6015 6016 while (dev != NULL) { 6017 current_task = dev->ev_sender; 6018 next = ISC_LIST_NEXT(dev, ev_link); 6019 6020 if ((task == NULL) || (task == current_task)) { 6021 dev->result = ISC_R_CANCELED; 6022 send_recvdone_event(sock, &dev); 6023 } 6024 dev = next; 6025 } 6026 } 6027 6028 if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND) 6029 && !ISC_LIST_EMPTY(sock->send_list)) { 6030 isc_socketevent_t *dev; 6031 isc_socketevent_t *next; 6032 isc_task_t *current_task; 6033 6034 dev = ISC_LIST_HEAD(sock->send_list); 6035 6036 while (dev != NULL) { 6037 current_task = dev->ev_sender; 6038 next = ISC_LIST_NEXT(dev, ev_link); 6039 6040 if ((task == NULL) || (task == current_task)) { 6041 dev->result = ISC_R_CANCELED; 6042 send_senddone_event(sock, &dev); 6043 } 6044 dev = next; 6045 } 6046 } 6047 6048 if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT) 6049 && !ISC_LIST_EMPTY(sock->accept_list)) { 6050 isc_socket_newconnev_t *dev; 6051 isc_socket_newconnev_t *next; 6052 isc_task_t *current_task; 6053 6054 dev = ISC_LIST_HEAD(sock->accept_list); 6055 while (dev != NULL) { 6056 current_task = dev->ev_sender; 6057 next = ISC_LIST_NEXT(dev, ev_link); 6058 6059 if ((task == NULL) || (task == current_task)) { 6060 6061 ISC_LIST_UNLINK(sock->accept_list, dev, 6062 ev_link); 6063 6064 NEWCONNSOCK(dev)->references--; 6065 free_socket((isc__socket_t **)&dev->newsocket); 6066 6067 dev->result = ISC_R_CANCELED; 6068 dev->ev_sender = sock; 6069 isc_task_sendanddetach(¤t_task, 6070 ISC_EVENT_PTR(&dev)); 6071 } 6072 6073 dev = next; 6074 } 6075 } 6076 6077 /* 6078 * 
Connecting is not a list. 6079 */ 6080 if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT) 6081 && sock->connect_ev != NULL) { 6082 isc_socket_connev_t *dev; 6083 isc_task_t *current_task; 6084 6085 INSIST(sock->connecting); 6086 sock->connecting = 0; 6087 6088 dev = sock->connect_ev; 6089 current_task = dev->ev_sender; 6090 6091 if ((task == NULL) || (task == current_task)) { 6092 sock->connect_ev = NULL; 6093 6094 dev->result = ISC_R_CANCELED; 6095 dev->ev_sender = sock; 6096 isc_task_sendanddetach(¤t_task, 6097 ISC_EVENT_PTR(&dev)); 6098 } 6099 } 6100 6101 UNLOCK(&sock->lock); 6102 } 6103 6104 isc_sockettype_t 6105 isc__socket_gettype(isc_socket_t *sock0) { 6106 isc__socket_t *sock = (isc__socket_t *)sock0; 6107 6108 REQUIRE(VALID_SOCKET(sock)); 6109 6110 return (sock->type); 6111 } 6112 6113 isc_boolean_t 6114 isc__socket_isbound(isc_socket_t *sock0) { 6115 isc__socket_t *sock = (isc__socket_t *)sock0; 6116 isc_boolean_t val; 6117 6118 REQUIRE(VALID_SOCKET(sock)); 6119 6120 LOCK(&sock->lock); 6121 val = ((sock->bound) ? ISC_TRUE : ISC_FALSE); 6122 UNLOCK(&sock->lock); 6123 6124 return (val); 6125 } 6126 6127 void 6128 isc__socket_ipv6only(isc_socket_t *sock0, isc_boolean_t yes) { 6129 isc__socket_t *sock = (isc__socket_t *)sock0; 6130 #if defined(IPV6_V6ONLY) 6131 int onoff = yes ? 
1 : 0; 6132 #else 6133 UNUSED(yes); 6134 UNUSED(sock); 6135 #endif 6136 6137 REQUIRE(VALID_SOCKET(sock)); 6138 INSIST(!sock->dupped); 6139 6140 #ifdef IPV6_V6ONLY 6141 if (sock->pf == AF_INET6) { 6142 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY, 6143 (void *)&onoff, sizeof(int)) < 0) { 6144 char strbuf[ISC_STRERRORSIZE]; 6145 isc__strerror(errno, strbuf, sizeof(strbuf)); 6146 UNEXPECTED_ERROR(__FILE__, __LINE__, 6147 "setsockopt(%d, IPV6_V6ONLY) " 6148 "%s: %s", sock->fd, 6149 isc_msgcat_get(isc_msgcat, 6150 ISC_MSGSET_GENERAL, 6151 ISC_MSG_FAILED, 6152 "failed"), 6153 strbuf); 6154 } 6155 } 6156 FIX_IPV6_RECVPKTINFO(sock); /* AIX */ 6157 #endif 6158 } 6159 6160 static void 6161 setdscp(isc__socket_t *sock, isc_dscp_t dscp) { 6162 #if defined(IP_TOS) || defined(IPV6_TCLASS) 6163 int value = dscp << 2; 6164 #endif 6165 6166 sock->dscp = dscp; 6167 6168 #ifdef IP_TOS 6169 if (sock->pf == AF_INET) { 6170 if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS, 6171 (void *)&value, sizeof(value)) < 0) { 6172 char strbuf[ISC_STRERRORSIZE]; 6173 isc__strerror(errno, strbuf, sizeof(strbuf)); 6174 UNEXPECTED_ERROR(__FILE__, __LINE__, 6175 "setsockopt(%d, IP_TOS, %.02x) " 6176 "%s: %s", sock->fd, value >> 2, 6177 isc_msgcat_get(isc_msgcat, 6178 ISC_MSGSET_GENERAL, 6179 ISC_MSG_FAILED, 6180 "failed"), 6181 strbuf); 6182 } 6183 } 6184 #endif 6185 #ifdef IPV6_TCLASS 6186 if (sock->pf == AF_INET6) { 6187 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS, 6188 (void *)&value, sizeof(value)) < 0) { 6189 char strbuf[ISC_STRERRORSIZE]; 6190 isc__strerror(errno, strbuf, sizeof(strbuf)); 6191 UNEXPECTED_ERROR(__FILE__, __LINE__, 6192 "setsockopt(%d, IPV6_TCLASS, %.02x) " 6193 "%s: %s", sock->fd, dscp >> 2, 6194 isc_msgcat_get(isc_msgcat, 6195 ISC_MSGSET_GENERAL, 6196 ISC_MSG_FAILED, 6197 "failed"), 6198 strbuf); 6199 } 6200 } 6201 #endif 6202 } 6203 6204 void 6205 isc__socket_dscp(isc_socket_t *sock0, isc_dscp_t dscp) { 6206 isc__socket_t *sock = (isc__socket_t *)sock0; 6207 6208 
REQUIRE(VALID_SOCKET(sock)); 6209 REQUIRE(dscp < 0x40); 6210 6211 #if !defined(IP_TOS) && !defined(IPV6_TCLASS) 6212 UNUSED(dscp); 6213 #else 6214 if (dscp < 0) 6215 return; 6216 6217 /* The DSCP value must not be changed once it has been set. */ 6218 if (isc_dscp_check_value != -1) 6219 INSIST(dscp == isc_dscp_check_value); 6220 #endif 6221 6222 6223 #ifdef notyet 6224 REQUIRE(!sock->dupped); 6225 #endif 6226 6227 setdscp(sock, dscp); 6228 } 6229 6230 isc_socketevent_t * 6231 isc_socket_socketevent(isc_mem_t *mctx, void *sender, 6232 isc_eventtype_t eventtype, isc_taskaction_t action, 6233 void *arg) 6234 { 6235 return (allocate_socketevent(mctx, sender, eventtype, action, arg)); 6236 } 6237 6238 #ifndef USE_WATCHER_THREAD 6239 /* 6240 * In our assumed scenario, we can simply use a single static object. 6241 * XXX: this is not true if the application uses multiple threads with 6242 * 'multi-context' mode. Fixing this is a future TODO item. 6243 */ 6244 static isc_socketwait_t swait_private; 6245 6246 int 6247 isc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp, 6248 isc_socketwait_t **swaitp) 6249 { 6250 isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0; 6251 int n; 6252 #ifdef USE_KQUEUE 6253 struct timespec ts, *tsp; 6254 #endif 6255 #ifdef USE_EPOLL 6256 int timeout; 6257 #endif 6258 #ifdef USE_DEVPOLL 6259 isc_result_t result; 6260 int pass; 6261 struct dvpoll dvp; 6262 #endif 6263 6264 REQUIRE(swaitp != NULL && *swaitp == NULL); 6265 6266 #ifdef USE_SHARED_MANAGER 6267 if (manager == NULL) 6268 manager = socketmgr; 6269 #endif 6270 if (manager == NULL) 6271 return (0); 6272 6273 #ifdef USE_KQUEUE 6274 if (tvp != NULL) { 6275 ts.tv_sec = tvp->tv_sec; 6276 ts.tv_nsec = tvp->tv_usec * 1000; 6277 tsp = &ts; 6278 } else 6279 tsp = NULL; 6280 swait_private.nevents = kevent(manager->kqueue_fd, NULL, 0, 6281 manager->events, manager->nevents, 6282 tsp); 6283 n = swait_private.nevents; 6284 #elif defined(USE_EPOLL) 6285 if (tvp != NULL) 
6286 timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000; 6287 else 6288 timeout = -1; 6289 swait_private.nevents = epoll_wait(manager->epoll_fd, 6290 manager->events, 6291 manager->nevents, timeout); 6292 n = swait_private.nevents; 6293 #elif defined(USE_DEVPOLL) 6294 /* 6295 * Re-probe every thousand calls. 6296 */ 6297 if (manager->calls++ > 1000U) { 6298 result = isc_resource_getcurlimit(isc_resource_openfiles, 6299 &manager->open_max); 6300 if (result != ISC_R_SUCCESS) 6301 manager->open_max = 64; 6302 manager->calls = 0; 6303 } 6304 for (pass = 0; pass < 2; pass++) { 6305 dvp.dp_fds = manager->events; 6306 dvp.dp_nfds = manager->nevents; 6307 if (dvp.dp_nfds >= manager->open_max) 6308 dvp.dp_nfds = manager->open_max - 1; 6309 if (tvp != NULL) { 6310 dvp.dp_timeout = tvp->tv_sec * 1000 + 6311 (tvp->tv_usec + 999) / 1000; 6312 } else 6313 dvp.dp_timeout = -1; 6314 n = ioctl(manager->devpoll_fd, DP_POLL, &dvp); 6315 if (n == -1 && errno == EINVAL) { 6316 /* 6317 * {OPEN_MAX} may have dropped. Look 6318 * up the current value and try again. 
6319 */ 6320 result = isc_resource_getcurlimit( 6321 isc_resource_openfiles, 6322 &manager->open_max); 6323 if (result != ISC_R_SUCCESS) 6324 manager->open_max = 64; 6325 } else 6326 break; 6327 } 6328 swait_private.nevents = n; 6329 #elif defined(USE_SELECT) 6330 memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize); 6331 memmove(manager->write_fds_copy, manager->write_fds, 6332 manager->fd_bufsize); 6333 6334 swait_private.readset = manager->read_fds_copy; 6335 swait_private.writeset = manager->write_fds_copy; 6336 swait_private.maxfd = manager->maxfd + 1; 6337 6338 n = select(swait_private.maxfd, swait_private.readset, 6339 swait_private.writeset, NULL, tvp); 6340 #endif 6341 6342 *swaitp = &swait_private; 6343 return (n); 6344 } 6345 6346 isc_result_t 6347 isc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) { 6348 isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0; 6349 6350 REQUIRE(swait == &swait_private); 6351 6352 #ifdef USE_SHARED_MANAGER 6353 if (manager == NULL) 6354 manager = socketmgr; 6355 #endif 6356 if (manager == NULL) 6357 return (ISC_R_NOTFOUND); 6358 6359 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) 6360 (void)process_fds(manager, manager->events, swait->nevents); 6361 return (ISC_R_SUCCESS); 6362 #elif defined(USE_SELECT) 6363 process_fds(manager, swait->maxfd, swait->readset, swait->writeset); 6364 return (ISC_R_SUCCESS); 6365 #endif 6366 } 6367 #endif /* USE_WATCHER_THREAD */ 6368 6369 void 6370 isc__socket_setname(isc_socket_t *socket0, const char *name, void *tag) { 6371 isc__socket_t *socket = (isc__socket_t *)socket0; 6372 6373 /* 6374 * Name 'socket'. 
6375 */ 6376 6377 REQUIRE(VALID_SOCKET(socket)); 6378 6379 LOCK(&socket->lock); 6380 memset(socket->name, 0, sizeof(socket->name)); 6381 strncpy(socket->name, name, sizeof(socket->name) - 1); 6382 socket->tag = tag; 6383 UNLOCK(&socket->lock); 6384 } 6385 6386 const char * 6387 isc__socket_getname(isc_socket_t *socket0) { 6388 isc__socket_t *socket = (isc__socket_t *)socket0; 6389 6390 return (socket->name); 6391 } 6392 6393 void * 6394 isc__socket_gettag(isc_socket_t *socket0) { 6395 isc__socket_t *socket = (isc__socket_t *)socket0; 6396 6397 return (socket->tag); 6398 } 6399 6400 isc_result_t 6401 isc__socket_register(void) { 6402 return (isc_socket_register(isc__socketmgr_create)); 6403 } 6404 6405 int 6406 isc__socket_getfd(isc_socket_t *socket0) { 6407 isc__socket_t *socket = (isc__socket_t *)socket0; 6408 6409 return ((short) socket->fd); 6410 } 6411 6412 #if defined(HAVE_LIBXML2) || defined(HAVE_JSON) 6413 static const char * 6414 _socktype(isc_sockettype_t type) 6415 { 6416 if (type == isc_sockettype_udp) 6417 return ("udp"); 6418 else if (type == isc_sockettype_tcp) 6419 return ("tcp"); 6420 else if (type == isc_sockettype_unix) 6421 return ("unix"); 6422 else if (type == isc_sockettype_fdwatch) 6423 return ("fdwatch"); 6424 else 6425 return ("not-initialized"); 6426 } 6427 #endif 6428 6429 #ifdef HAVE_LIBXML2 6430 #define TRY0(a) do { xmlrc = (a); if (xmlrc < 0) goto error; } while(/*CONSTCOND*/0) 6431 int 6432 isc_socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer) { 6433 isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0; 6434 isc__socket_t *sock = NULL; 6435 char peerbuf[ISC_SOCKADDR_FORMATSIZE]; 6436 isc_sockaddr_t addr; 6437 ISC_SOCKADDR_LEN_T len; 6438 int xmlrc; 6439 6440 LOCK(&mgr->lock); 6441 6442 #ifdef USE_SHARED_MANAGER 6443 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "references")); 6444 TRY0(xmlTextWriterWriteFormatString(writer, "%d", mgr->refs)); 6445 TRY0(xmlTextWriterEndElement(writer)); 6446 #endif /* 
USE_SHARED_MANAGER */ 6447 6448 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets")); 6449 sock = ISC_LIST_HEAD(mgr->socklist); 6450 while (sock != NULL) { 6451 LOCK(&sock->lock); 6452 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket")); 6453 6454 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id")); 6455 TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock)); 6456 TRY0(xmlTextWriterEndElement(writer)); 6457 6458 if (sock->name[0] != 0) { 6459 TRY0(xmlTextWriterStartElement(writer, 6460 ISC_XMLCHAR "name")); 6461 TRY0(xmlTextWriterWriteFormatString(writer, "%s", 6462 sock->name)); 6463 TRY0(xmlTextWriterEndElement(writer)); /* name */ 6464 } 6465 6466 TRY0(xmlTextWriterStartElement(writer, 6467 ISC_XMLCHAR "references")); 6468 TRY0(xmlTextWriterWriteFormatString(writer, "%d", 6469 sock->references)); 6470 TRY0(xmlTextWriterEndElement(writer)); 6471 6472 TRY0(xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type", 6473 ISC_XMLCHAR _socktype(sock->type))); 6474 6475 if (sock->connected) { 6476 isc_sockaddr_format(&sock->peer_address, peerbuf, 6477 sizeof(peerbuf)); 6478 TRY0(xmlTextWriterWriteElement(writer, 6479 ISC_XMLCHAR "peer-address", 6480 ISC_XMLCHAR peerbuf)); 6481 } 6482 6483 len = sizeof(addr); 6484 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) { 6485 isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf)); 6486 TRY0(xmlTextWriterWriteElement(writer, 6487 ISC_XMLCHAR "local-address", 6488 ISC_XMLCHAR peerbuf)); 6489 } 6490 6491 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states")); 6492 if (sock->pending_recv) 6493 TRY0(xmlTextWriterWriteElement(writer, 6494 ISC_XMLCHAR "state", 6495 ISC_XMLCHAR "pending-receive")); 6496 if (sock->pending_send) 6497 TRY0(xmlTextWriterWriteElement(writer, 6498 ISC_XMLCHAR "state", 6499 ISC_XMLCHAR "pending-send")); 6500 if (sock->pending_accept) 6501 TRY0(xmlTextWriterWriteElement(writer, 6502 ISC_XMLCHAR "state", 6503 ISC_XMLCHAR "pending_accept")); 6504 if (sock->listener) 6505 
TRY0(xmlTextWriterWriteElement(writer, 6506 ISC_XMLCHAR "state", 6507 ISC_XMLCHAR "listener")); 6508 if (sock->connected) 6509 TRY0(xmlTextWriterWriteElement(writer, 6510 ISC_XMLCHAR "state", 6511 ISC_XMLCHAR "connected")); 6512 if (sock->connecting) 6513 TRY0(xmlTextWriterWriteElement(writer, 6514 ISC_XMLCHAR "state", 6515 ISC_XMLCHAR "connecting")); 6516 if (sock->bound) 6517 TRY0(xmlTextWriterWriteElement(writer, 6518 ISC_XMLCHAR "state", 6519 ISC_XMLCHAR "bound")); 6520 6521 TRY0(xmlTextWriterEndElement(writer)); /* states */ 6522 6523 TRY0(xmlTextWriterEndElement(writer)); /* socket */ 6524 6525 UNLOCK(&sock->lock); 6526 sock = ISC_LIST_NEXT(sock, link); 6527 } 6528 TRY0(xmlTextWriterEndElement(writer)); /* sockets */ 6529 6530 error: 6531 if (sock != NULL) 6532 UNLOCK(&sock->lock); 6533 6534 UNLOCK(&mgr->lock); 6535 6536 return (xmlrc); 6537 } 6538 #endif /* HAVE_LIBXML2 */ 6539 6540 #ifdef HAVE_JSON 6541 #define CHECKMEM(m) do { \ 6542 if (m == NULL) { \ 6543 result = ISC_R_NOMEMORY;\ 6544 goto error;\ 6545 } \ 6546 } while(/*CONSTCOND*/0) 6547 6548 isc_result_t 6549 isc_socketmgr_renderjson(isc_socketmgr_t *mgr0, json_object *stats) { 6550 isc_result_t result = ISC_R_SUCCESS; 6551 isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0; 6552 isc__socket_t *sock = NULL; 6553 char peerbuf[ISC_SOCKADDR_FORMATSIZE]; 6554 isc_sockaddr_t addr; 6555 ISC_SOCKADDR_LEN_T len; 6556 json_object *obj, *array = json_object_new_array(); 6557 6558 CHECKMEM(array); 6559 6560 LOCK(&mgr->lock); 6561 6562 #ifdef USE_SHARED_MANAGER 6563 obj = json_object_new_int(mgr->refs); 6564 CHECKMEM(obj); 6565 json_object_object_add(stats, "references", obj); 6566 #endif /* USE_SHARED_MANAGER */ 6567 6568 sock = ISC_LIST_HEAD(mgr->socklist); 6569 while (sock != NULL) { 6570 json_object *states, *entry = json_object_new_object(); 6571 char buf[255]; 6572 6573 CHECKMEM(entry); 6574 json_object_array_add(array, entry); 6575 6576 LOCK(&sock->lock); 6577 6578 sprintf(buf, "%p", sock); 6579 obj = 
json_object_new_string(buf); 6580 CHECKMEM(obj); 6581 json_object_object_add(entry, "id", obj); 6582 6583 if (sock->name[0] != 0) { 6584 obj = json_object_new_string(sock->name); 6585 CHECKMEM(obj); 6586 json_object_object_add(entry, "name", obj); 6587 } 6588 6589 obj = json_object_new_int(sock->references); 6590 CHECKMEM(obj); 6591 json_object_object_add(entry, "references", obj); 6592 6593 obj = json_object_new_string(_socktype(sock->type)); 6594 CHECKMEM(obj); 6595 json_object_object_add(entry, "type", obj); 6596 6597 if (sock->connected) { 6598 isc_sockaddr_format(&sock->peer_address, peerbuf, 6599 sizeof(peerbuf)); 6600 obj = json_object_new_string(peerbuf); 6601 CHECKMEM(obj); 6602 json_object_object_add(entry, "peer-address", obj); 6603 } 6604 6605 len = sizeof(addr); 6606 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) { 6607 isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf)); 6608 obj = json_object_new_string(peerbuf); 6609 CHECKMEM(obj); 6610 json_object_object_add(entry, "local-address", obj); 6611 } 6612 6613 states = json_object_new_array(); 6614 CHECKMEM(states); 6615 json_object_object_add(entry, "states", states); 6616 6617 if (sock->pending_recv) { 6618 obj = json_object_new_string("pending-receive"); 6619 CHECKMEM(obj); 6620 json_object_array_add(states, obj); 6621 } 6622 6623 if (sock->pending_send) { 6624 obj = json_object_new_string("pending-send"); 6625 CHECKMEM(obj); 6626 json_object_array_add(states, obj); 6627 } 6628 6629 if (sock->pending_accept) { 6630 obj = json_object_new_string("pending-accept"); 6631 CHECKMEM(obj); 6632 json_object_array_add(states, obj); 6633 } 6634 6635 if (sock->listener) { 6636 obj = json_object_new_string("listener"); 6637 CHECKMEM(obj); 6638 json_object_array_add(states, obj); 6639 } 6640 6641 if (sock->connected) { 6642 obj = json_object_new_string("connected"); 6643 CHECKMEM(obj); 6644 json_object_array_add(states, obj); 6645 } 6646 6647 if (sock->connecting) { 6648 obj = 
json_object_new_string("connecting"); 6649 CHECKMEM(obj); 6650 json_object_array_add(states, obj); 6651 } 6652 6653 if (sock->bound) { 6654 obj = json_object_new_string("bound"); 6655 CHECKMEM(obj); 6656 json_object_array_add(states, obj); 6657 } 6658 6659 UNLOCK(&sock->lock); 6660 sock = ISC_LIST_NEXT(sock, link); 6661 } 6662 6663 json_object_object_add(stats, "sockets", array); 6664 array = NULL; 6665 result = ISC_R_SUCCESS; 6666 6667 error: 6668 if (array != NULL) 6669 json_object_put(array); 6670 6671 if (sock != NULL) 6672 UNLOCK(&sock->lock); 6673 6674 UNLOCK(&mgr->lock); 6675 6676 return (result); 6677 } 6678 #endif /* HAVE_JSON */ 6679 6680 #include "../socket_api.c" 6681