/*	$NetBSD: socket.c,v 1.1 2024/02/18 20:57:57 christos Exp $	*/

/*
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
 *
 * SPDX-License-Identifier: MPL-2.0
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
 */

/*! \file */

#include <inttypes.h>
#include <stdbool.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/types.h>
#if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__)
#include <sys/sysctl.h>
#endif /* if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__) */
#include <sys/time.h>
#include <sys/uio.h>

#if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#endif /* if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H) */

#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdlib.h>
#include <unistd.h>

#include <isc/app.h>
#include <isc/buffer.h>
#include <isc/condition.h>
#include <isc/formatcheck.h>
#include <isc/list.h>
#include <isc/log.h>
#include <isc/mem.h>
#include <isc/mutex.h>
#include <isc/net.h>
#include <isc/once.h>
#include <isc/platform.h>
#include <isc/print.h>
#include <isc/refcount.h>
#include <isc/region.h>
#include <isc/resource.h>
#include <isc/socket.h>
#include <isc/stats.h>
#include <isc/strerr.h>
#include <isc/string.h>
#include <isc/task.h>
#include <isc/thread.h>
#include <isc/util.h>

#ifdef ISC_PLATFORM_HAVESYSUNH
#include <sys/un.h>
#endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
#ifdef HAVE_KQUEUE
#include <sys/event.h>
#endif /* ifdef HAVE_KQUEUE */
#ifdef HAVE_EPOLL_CREATE1
#include <sys/epoll.h>
#endif /* ifdef HAVE_EPOLL_CREATE1 */
#if defined(HAVE_SYS_DEVPOLL_H)
#include <sys/devpoll.h>
#elif defined(HAVE_DEVPOLL_H)
#include <devpoll.h>
#endif /* if defined(HAVE_SYS_DEVPOLL_H) */

#include <netinet/tcp.h>

#include "errno2result.h"

#ifdef ENABLE_TCP_FASTOPEN
#include <netinet/tcp.h>
#endif /* ifdef ENABLE_TCP_FASTOPEN */

#ifdef HAVE_JSON_C
#include <json_object.h>
#endif /* HAVE_JSON_C */

#ifdef HAVE_LIBXML2
#include <libxml/xmlwriter.h>
#define ISC_XMLCHAR (const xmlChar *)
#endif /* HAVE_LIBXML2 */
/*%
 * Choose the most preferable multiplex method.
 */
#if defined(HAVE_KQUEUE)
#define USE_KQUEUE
#elif defined(HAVE_EPOLL_CREATE1)
#define USE_EPOLL
#elif defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_DEVPOLL_H)
#define USE_DEVPOLL
typedef struct {
	unsigned int want_read : 1, want_write : 1;
} pollinfo_t;
#else /* if defined(HAVE_KQUEUE) */
#define USE_SELECT
#endif /* HAVE_KQUEUE */

/*
 * Set by the -T dscp option on the command line. If set to a value
 * other than -1, we check to make sure DSCP values match it, and
 * assert if not.
 */
int isc_dscp_check_value = -1;

/*%
 * Maximum number of allowable open sockets. This is also the maximum
 * allowable socket file descriptor.
 *
 * Care should be taken before modifying this value for select():
 * The API standard doesn't guarantee that select() accepts more than (the
 * system default of) FD_SETSIZE descriptors, and the default size should in
 * fact be fine in the vast majority of cases. This constant should therefore
 * be increased only when absolutely necessary and possible, i.e., the server
 * is exhausting all available file descriptors (up to FD_SETSIZE) and the
 * select() function and FD_xxx macros support larger values than FD_SETSIZE
 * (which may not always be true, but we keep using some of them to ensure as
 * much portability as possible). Note also that overall server performance
 * may degrade with a larger value of this constant due to the inherent
 * scalability problems of select().
 *
 * As a special note, this value shouldn't have to be touched if
 * this is a build for an authoritative-only DNS server.
 */
#ifndef ISC_SOCKET_MAXSOCKETS
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXSOCKETS 21000
#else /* ifdef TUNE_LARGE */
#define ISC_SOCKET_MAXSOCKETS 4096
#endif /* TUNE_LARGE */
#elif defined(USE_SELECT)
#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
#endif /* USE_KQUEUE... */
#endif /* ISC_SOCKET_MAXSOCKETS */

#ifdef USE_SELECT
/*%
 * Mac OS X needs a special definition to support larger values in select().
 * We always define this because a larger value can be specified at run time.
 */
#ifdef __APPLE__
#define _DARWIN_UNLIMITED_SELECT
#endif /* __APPLE__ */
#endif /* USE_SELECT */

#ifdef ISC_SOCKET_USE_POLLWATCH
/*%
 * If this macro is defined, enable a workaround for a Solaris /dev/poll
 * kernel bug: the DP_POLL ioctl could keep sleeping even if socket I/O is
 * possible for some of the specified FDs. The idea is based on the
 * observation that a busy server is likely to keep receiving packets.
 * It specifically works as follows: the socket watcher is first initialized
 * with the state of "poll_idle". While it's in the idle state it keeps
 * sleeping until a socket event occurs. When it wakes up for a socket I/O
 * event, it moves to the poll_active state, and sets the poll timeout to a
 * short period (ISC_SOCKET_POLLWATCH_TIMEOUT msec). If a timeout occurs in
 * this state, the watcher goes to the poll_checking state with the same
 * timeout period. In this state, the watcher tries to detect whether this
 * is a break during intermittent events or the kernel bug has been
 * triggered. If the next polling reports an event within the short period,
 * the previous timeout was likely caused by the kernel bug, and so the
 * watcher goes back to the active state. Otherwise, it moves to the idle
 * state again.
 *
 * It's not clear whether this is a thread-related bug, but since we've only
 * seen this with threads, this workaround is used only when enabling threads.
 */

typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;

#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
#endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */
#endif /* ISC_SOCKET_USE_POLLWATCH */

/*%
 * Per-FD lock buckets; we shuffle them around a bit because FDs come in
 * herds.
 */
#define FDLOCK_BITS  10
#define FDLOCK_COUNT (1 << FDLOCK_BITS)
#define FDLOCK_ID(fd)                                   \
	(((fd) % (FDLOCK_COUNT) >> (FDLOCK_BITS / 2)) | \
	 (((fd) << (FDLOCK_BITS / 2)) % (FDLOCK_COUNT)))
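
/*
 * A worked example of the FDLOCK_ID() shuffle (not in the original, for
 * illustration): with FDLOCK_BITS = 10 the macro effectively swaps the low
 * 5 bits of the fd with the next 5 bits, modulo FDLOCK_COUNT.  Consecutive
 * fds such as 32, 33 and 34 therefore map to buckets 1, 33 and 65 rather
 * than to adjacent buckets, spreading a "herd" of newly accepted fds across
 * different locks.
 */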

/*%
 * Maximum number of events communicated with the kernel.  There should
 * normally be no need for a large number.
 */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#ifndef ISC_SOCKET_MAXEVENTS
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXEVENTS 2048
#else /* ifdef TUNE_LARGE */
#define ISC_SOCKET_MAXEVENTS 64
#endif /* TUNE_LARGE */
#endif /* ifndef ISC_SOCKET_MAXEVENTS */
#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) */

/*%
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t.  This is here so it can be easily changed if needed.
 */
#ifndef socklen_t
#define socklen_t unsigned int
#endif /* ifndef socklen_t */

/*%
 * Define what the possible "soft" errors can be.  These are non-fatal
 * returns of various network related functions, like recv() and so on.
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here.
 */
#define SOFT_ERROR(e)                                             \
	((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == ENOBUFS || \
	 (e) == EINTR || (e) == 0)

#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*!<
 * DLVL(90) --  Function entry/exit and other tracing.
 * DLVL(70) --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60) --  Socket data send/receive
 * DLVL(50) --  Event tracing, including receiving/sending completion events.
 * DLVL(20) --  Socket creation/destruction.
 */
#define TRACE_LEVEL	  90
#define CORRECTNESS_LEVEL 70
#define IOEVENT_LEVEL	  60
#define EVENT_LEVEL	  50
#define CREATION_LEVEL	  20

#define TRACE	    DLVL(TRACE_LEVEL)
#define CORRECTNESS DLVL(CORRECTNESS_LEVEL)
#define IOEVENT	    DLVL(IOEVENT_LEVEL)
#define EVENT	    DLVL(EVENT_LEVEL)
#define CREATION    DLVL(CREATION_LEVEL)

typedef isc_event_t intev_t;

#define SOCKET_MAGIC	ISC_MAGIC('I', 'O', 'i', 'o')
#define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC)

/*!
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#ifndef USE_CMSG
#define USE_CMSG 1
#endif /* ifndef USE_CMSG */

/*%
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#ifdef SO_TIMESTAMP
#ifndef USE_CMSG
#define USE_CMSG 1
#endif /* ifndef USE_CMSG */
#endif /* ifdef SO_TIMESTAMP */

#if defined(SO_RCVBUF) && defined(ISC_RECV_BUFFER_SIZE)
#define SET_RCVBUF
#endif

#if defined(SO_SNDBUF) && defined(ISC_SEND_BUFFER_SIZE)
#define SET_SNDBUF
#endif

/*%
 * Instead of calculating the cmsgbuf lengths every time, we take a
 * rule-of-thumb approach: the sizes are taken from x86_64 Linux and
 * multiplied by 2, so everything should fit.  The sizes are not large
 * enough to cause any concern.
 */
#if defined(USE_CMSG)
#define CMSG_SP_IN6PKT 40
#else /* if defined(USE_CMSG) */
#define CMSG_SP_IN6PKT 0
#endif /* if defined(USE_CMSG) */

#if defined(USE_CMSG) && defined(SO_TIMESTAMP)
#define CMSG_SP_TIMESTAMP 32
#else /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
#define CMSG_SP_TIMESTAMP 0
#endif /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */

#if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS))
#define CMSG_SP_TCTOS 24
#else /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
#define CMSG_SP_TCTOS 0
#endif /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */

#define CMSG_SP_INT 24

/* Align cmsg buffers to be safe on SPARC etc. */
#define RECVCMSGBUFLEN                                                       \
	ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS) + \
			  1,                                                 \
		  sizeof(void *))
#define SENDCMSGBUFLEN                                                    \
	ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS) + 1, \
		  sizeof(void *))
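
/*
 * Worked numbers (illustrative, assuming an LP64 system where
 * sizeof(void *) == 8): RECVCMSGBUFLEN = align(2 * (40 + 32 + 24) + 1, 8)
 * = align(193, 8) = 200 bytes, and SENDCMSGBUFLEN =
 * align(2 * (40 + 24 + 24) + 1, 8) = align(177, 8) = 184 bytes --
 * comfortably small for the on-stack buffers in doio_recv()/doio_send().
 */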

/*%
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10

typedef struct isc__socketthread isc__socketthread_t;

#define NEWCONNSOCK(ev) ((ev)->newsocket)

struct isc_socket {
	/* Not locked. */
	unsigned int magic;
	isc_socketmgr_t *manager;
	isc_mutex_t lock;
	isc_sockettype_t type;
	const isc_statscounter_t *statsindex;
	isc_refcount_t references;

	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t) link;
	int fd;
	int pf;
	int threadid;
	char name[16];
	void *tag;

	ISC_LIST(isc_socketevent_t) send_list;
	ISC_LIST(isc_socketevent_t) recv_list;
	ISC_LIST(isc_socket_newconnev_t) accept_list;
	ISC_LIST(isc_socket_connev_t) connect_list;

	isc_sockaddr_t peer_address; /* remote address */

	unsigned int listener : 1, /* listener socket */
		connected : 1,
		connecting : 1, /* connect pending */
		bound : 1,	/* bound to local addr */
		dupped : 1,
		active : 1,  /* currently active */
		pktdscp : 1; /* per packet dscp */

#ifdef ISC_PLATFORM_RECVOVERFLOW
	unsigned char overflow; /* used for MSG_TRUNC fake */
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	void *fdwatcharg;
	isc_sockfdwatch_t fdwatchcb;
	int fdwatchflags;
	isc_task_t *fdwatchtask;
	unsigned int dscp;
};

#define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)     ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

struct isc_socketmgr {
	/* Not locked. */
	unsigned int magic;
	isc_mem_t *mctx;
	isc_mutex_t lock;
	isc_stats_t *stats;
	int nthreads;
	isc__socketthread_t *threads;
	unsigned int maxsocks;
	/* Locked by manager lock. */
	ISC_LIST(isc_socket_t) socklist;
	int reserved; /* unlocked */
	isc_condition_t shutdown_ok;
	size_t maxudp;
};

struct isc__socketthread {
	isc_socketmgr_t *manager;
	int threadid;
	isc_thread_t thread;
	int pipe_fds[2];
	isc_mutex_t *fdlock;
	/* Locked by fdlock. */
	isc_socket_t **fds;
	int *fdstate;
#ifdef USE_KQUEUE
	int kqueue_fd;
	int nevents;
	struct kevent *events;
#endif /* USE_KQUEUE */
#ifdef USE_EPOLL
	int epoll_fd;
	int nevents;
	struct epoll_event *events;
	uint32_t *epoll_events;
#endif /* USE_EPOLL */
#ifdef USE_DEVPOLL
	int devpoll_fd;
	isc_resourcevalue_t open_max;
	unsigned int calls;
	int nevents;
	struct pollfd *events;
	pollinfo_t *fdpollinfo;
#endif /* USE_DEVPOLL */
#ifdef USE_SELECT
	int fd_bufsize;
	fd_set *read_fds;
	fd_set *read_fds_copy;
	fd_set *write_fds;
	fd_set *write_fds_copy;
	int maxfd;
#endif /* USE_SELECT */
};

#define CLOSED	      0 /* this one must be zero */
#define MANAGED	      1
#define CLOSE_PENDING 2

/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_PLATFORM_RECVOVERFLOW
#define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1)
#else /* ifdef ISC_PLATFORM_RECVOVERFLOW */
#define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER)
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

static isc_result_t
socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
	      isc_socket_t **socketp, isc_socket_t *dup_socket);
static void
send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
static void
send_senddone_event(isc_socket_t *, isc_socketevent_t **);
static void
send_connectdone_event(isc_socket_t *, isc_socket_connev_t **);
static void
free_socket(isc_socket_t **);
static isc_result_t
allocate_socket(isc_socketmgr_t *, isc_sockettype_t, isc_socket_t **);
static void
destroy(isc_socket_t **);
static void
internal_accept(isc_socket_t *);
static void
internal_connect(isc_socket_t *);
static void
internal_recv(isc_socket_t *);
static void
internal_send(isc_socket_t *);
static void
process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
static void
build_msghdr_send(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
		  struct iovec *, size_t *);
static void
build_msghdr_recv(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
		  struct iovec *, size_t *);
static bool
process_ctlfd(isc__socketthread_t *thread);
static void
setdscp(isc_socket_t *sock, isc_dscp_t dscp);
static void
dispatch_recv(isc_socket_t *sock);
static void
dispatch_send(isc_socket_t *sock);
static void
internal_fdwatch_read(isc_socket_t *sock);
static void
internal_fdwatch_write(isc_socket_t *sock);

#define SELECT_POKE_SHUTDOWN (-1)
#define SELECT_POKE_NOTHING  (-2)
#define SELECT_POKE_READ     (-3)
#define SELECT_POKE_ACCEPT   (-3) /*%< Same as _READ */
#define SELECT_POKE_WRITE    (-4)
#define SELECT_POKE_CONNECT  (-4) /*%< Same as _WRITE */
#define SELECT_POKE_CLOSE    (-5)
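
/*
 * These values travel as the "msg" half of the {fd, msg} pair that
 * select_poke() writes down each watcher thread's control pipe (see
 * select_poke() and select_readmsg() below); they are kept negative,
 * presumably so they can never be mistaken for a valid fd number.
 */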

/*%
 * Shortcut index arrays to get access to statistics counters.
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9,
	STATID_ACTIVE = 10
};
static const isc_statscounter_t udp4statsindex[] = {
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1,
	-1,
	isc_sockstatscounter_udp4sendfail,
	isc_sockstatscounter_udp4recvfail,
	isc_sockstatscounter_udp4active
};
static const isc_statscounter_t udp6statsindex[] = {
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1,
	-1,
	isc_sockstatscounter_udp6sendfail,
	isc_sockstatscounter_udp6recvfail,
	isc_sockstatscounter_udp6active
};
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open,	      isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close,	      isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail, isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail,  isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail,    isc_sockstatscounter_tcp4recvfail,
	isc_sockstatscounter_tcp4active
};
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open,	      isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,	      isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail, isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,  isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,    isc_sockstatscounter_tcp6recvfail,
	isc_sockstatscounter_tcp6active
};
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,	      isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,	      isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail, isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,  isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,    isc_sockstatscounter_unixrecvfail,
	isc_sockstatscounter_unixactive
};
static const isc_statscounter_t rawstatsindex[] = {
	isc_sockstatscounter_rawopen,
	isc_sockstatscounter_rawopenfail,
	isc_sockstatscounter_rawclose,
	-1,
	-1,
	-1,
	-1,
	-1,
	-1,
	isc_sockstatscounter_rawrecvfail,
	isc_sockstatscounter_rawactive
};
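
/*
 * Usage sketch (matches how these arrays are used throughout this file):
 * a socket's statsindex pointer is set to the array for its type, and the
 * STATID_* values index into it, e.g.
 *
 *	sock->statsindex = udp4statsindex;
 *	inc_stats(sock->manager->stats, sock->statsindex[STATID_OPEN]);
 *
 * The -1 entries mark operations that do not apply to that socket type
 * (UDP sockets are never accepted, raw sockets are receive-only, etc.).
 */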

static int
gen_threadid(isc_socket_t *sock);

static int
gen_threadid(isc_socket_t *sock) {
	return (sock->fd % sock->manager->nthreads);
}

static void
manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
	    isc_logmodule_t *module, int level, const char *fmt, ...)
	ISC_FORMAT_PRINTF(5, 6);
static void
manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
	    isc_logmodule_t *module, int level, const char *fmt, ...) {
	char msgbuf[2048];
	va_list ap;

	if (!isc_log_wouldlog(isc_lctx, level)) {
		return;
	}

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level, "sockmgr %p: %s",
		      sockmgr, msgbuf);
}

static void
thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
	   isc_logmodule_t *module, int level, const char *fmt, ...)
	ISC_FORMAT_PRINTF(5, 6);
static void
thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
	   isc_logmodule_t *module, int level, const char *fmt, ...) {
	char msgbuf[2048];
	va_list ap;

	if (!isc_log_wouldlog(isc_lctx, level)) {
		return;
	}

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p thread %d: %s", thread->manager,
		      thread->threadid, msgbuf);
}

static void
socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);
static void
socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   const char *fmt, ...) {
	char msgbuf[2048];
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	va_list ap;

	if (!isc_log_wouldlog(isc_lctx, level)) {
		return;
	}

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	if (address == NULL) {
		isc_log_write(isc_lctx, category, module, level,
			      "socket %p: %s", sock, msgbuf);
	} else {
		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
		isc_log_write(isc_lctx, category, module, level,
			      "socket %p %s: %s", sock, peerbuf, msgbuf);
	}
}

/*%
 * Increment socket-related statistics counters.
 */
static void
inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	REQUIRE(counterid != -1);

	if (stats != NULL) {
		isc_stats_increment(stats, counterid);
	}
}

/*%
 * Decrement socket-related statistics counters.
 */
static void
dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	REQUIRE(counterid != -1);

	if (stats != NULL) {
		isc_stats_decrement(stats, counterid);
	}
}

static isc_result_t
watch_fd(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ) {
		evchange.filter = EVFILT_READ;
	} else {
		evchange.filter = EVFILT_WRITE;
	}
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
	uint32_t oldevents;
	int ret;
	int op;

	oldevents = thread->epoll_events[fd];
	if (msg == SELECT_POKE_READ) {
		thread->epoll_events[fd] |= EPOLLIN;
	} else {
		thread->epoll_events[fd] |= EPOLLOUT;
	}

	event.events = thread->epoll_events[fd];
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;

	op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
	if (thread->fds[fd] != NULL) {
		LOCK(&thread->fds[fd]->lock);
	}
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
	if (thread->fds[fd] != NULL) {
		UNLOCK(&thread->fds[fd]->lock);
	}
	if (ret == -1) {
		if (errno == EEXIST) {
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "epoll_ctl(ADD/MOD) returned "
					 "EEXIST for fd %d",
					 fd);
		}
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;

	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ) {
		pfd.events = POLLIN;
	} else {
		pfd.events = POLLOUT;
	}
	pfd.fd = fd;
	pfd.revents = 0;
	if (write(thread->devpoll_fd, &pfd, sizeof(pfd)) == -1) {
		result = isc__errno2result(errno);
	} else {
		if (msg == SELECT_POKE_READ) {
			thread->fdpollinfo[fd].want_read = 1;
		} else {
			thread->fdpollinfo[fd].want_write = 1;
		}
	}

	return (result);
#elif defined(USE_SELECT)
	LOCK(&thread->manager->lock);
	if (msg == SELECT_POKE_READ) {
		FD_SET(fd, thread->read_fds);
	}
	if (msg == SELECT_POKE_WRITE) {
		FD_SET(fd, thread->write_fds);
	}
	UNLOCK(&thread->manager->lock);

	return (result);
#endif /* ifdef USE_KQUEUE */
}

static isc_result_t
unwatch_fd(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ) {
		evchange.filter = EVFILT_READ;
	} else {
		evchange.filter = EVFILT_WRITE;
	}
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
	int ret;
	int op;

	if (msg == SELECT_POKE_READ) {
		thread->epoll_events[fd] &= ~(EPOLLIN);
	} else {
		thread->epoll_events[fd] &= ~(EPOLLOUT);
	}

	event.events = thread->epoll_events[fd];
	memset(&event.data, 0, sizeof(event.data));
	event.data.fd = fd;

	op = (event.events == 0U) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
	if (ret == -1 && errno != ENOENT) {
		char strbuf[ISC_STRERRORSIZE];
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL), %d: %s",
				 fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	if (msg == SELECT_POKE_READ && thread->fdpollinfo[fd].want_write == 1) {
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE && thread->fdpollinfo[fd].want_read == 1) {
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

	if (write(thread->devpoll_fd, pfds, writelen) == -1) {
		result = isc__errno2result(errno);
	} else {
		if (msg == SELECT_POKE_READ) {
			thread->fdpollinfo[fd].want_read = 0;
		} else {
			thread->fdpollinfo[fd].want_write = 0;
		}
	}

	return (result);
#elif defined(USE_SELECT)
	LOCK(&thread->manager->lock);
	if (msg == SELECT_POKE_READ) {
		FD_CLR(fd, thread->read_fds);
	} else if (msg == SELECT_POKE_WRITE) {
		FD_CLR(fd, thread->write_fds);
	}
	UNLOCK(&thread->manager->lock);

	return (result);
#endif /* ifdef USE_KQUEUE */
}

/*
 * A poke message was received; perform the requested watch/unwatch
 * on the fd provided.
 */
static void
wakeup_socket(isc__socketthread_t *thread, int fd, int msg) {
	isc_result_t result;
	int lockid = FDLOCK_ID(fd);

	/*
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
	 */

	INSIST(fd >= 0 && fd < (int)thread->manager->maxsocks);

	if (msg == SELECT_POKE_CLOSE) {
		LOCK(&thread->fdlock[lockid]);
		INSIST(thread->fdstate[fd] == CLOSE_PENDING);
		thread->fdstate[fd] = CLOSED;
		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
		(void)close(fd);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	LOCK(&thread->fdlock[lockid]);
	if (thread->fdstate[fd] == CLOSE_PENDING) {
		/*
		 * We accept (and ignore) any error from unwatch_fd() as we are
		 * closing the socket, hoping it doesn't leave dangling state
		 * in the kernel.
		 * Note that unwatch_fd() must be called after releasing the
		 * fdlock; otherwise it could cause deadlock due to a lock
		 * order reversal.
		 */
		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}
	if (thread->fdstate[fd] != MANAGED) {
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	/*
	 * Set requested bit.
	 */
	result = watch_fd(thread, fd, msg);
	if (result != ISC_R_SUCCESS) {
		/*
		 * XXXJT: what should we do?  Ignoring the failure of watching
		 * a socket will make the application dysfunctional, but there
		 * seems to be no reasonable recovery process.
		 */
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "failed to start watching FD (%d): %s", fd,
			      isc_result_totext(result));
	}
	UNLOCK(&thread->fdlock[lockid]);
}

/*
 * Poke the select loop when there is something for us to do.
 * The write is required (by POSIX) to complete.  That is, we
 * will not get partial writes.
 */
static void
select_poke(isc_socketmgr_t *mgr, int threadid, int fd, int msg) {
	int cc;
	int buf[2];
	char strbuf[ISC_STRERRORSIZE];

	buf[0] = fd;
	buf[1] = msg;

	do {
		cc = write(mgr->threads[threadid].pipe_fds[1], buf,
			   sizeof(buf));
#ifdef ENOSR
		/*
		 * Treat ENOSR as EAGAIN but loop slowly as it is
		 * unlikely to clear fast.
		 */
		if (cc < 0 && errno == ENOSR) {
			sleep(1);
			errno = EAGAIN;
		}
#endif /* ifdef ENOSR */
	} while (cc < 0 && SOFT_ERROR(errno));

	if (cc < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    "write() failed during watcher poke: %s", strbuf);
	}

	INSIST(cc == sizeof(buf));
}
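
/*
 * The atomicity assumption above holds because the message is two ints
 * (typically 8 bytes), far below PIPE_BUF, which POSIX guarantees to be
 * at least 512 bytes: writes of at most PIPE_BUF bytes to a pipe are
 * atomic, so concurrent pokers cannot interleave their {fd, msg} pairs.
 */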

/*
 * Read a message on the internal fd.
 */
static void
select_readmsg(isc__socketthread_t *thread, int *fd, int *msg) {
	int buf[2];
	int cc;
	char strbuf[ISC_STRERRORSIZE];

	cc = read(thread->pipe_fds[0], buf, sizeof(buf));
	if (cc < 0) {
		*msg = SELECT_POKE_NOTHING;
		*fd = -1; /* Silence compiler. */
		if (SOFT_ERROR(errno)) {
			return;
		}

		strerror_r(errno, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    "read() failed during watcher poke: %s", strbuf);
	}
	INSIST(cc == sizeof(buf));

	*fd = buf[0];
	*msg = buf[1];
}

/*
 * Make an fd non-blocking.
 */
static isc_result_t
make_nonblock(int fd) {
	int ret;
	char strbuf[ISC_STRERRORSIZE];
#ifdef USE_FIONBIO_IOCTL
	int on = 1;
#else /* ifdef USE_FIONBIO_IOCTL */
	int flags;
#endif /* ifdef USE_FIONBIO_IOCTL */

#ifdef USE_FIONBIO_IOCTL
	ret = ioctl(fd, FIONBIO, (char *)&on);
#else /* ifdef USE_FIONBIO_IOCTL */
	flags = fcntl(fd, F_GETFL, 0);
	flags |= PORT_NONBLOCK;
	ret = fcntl(fd, F_SETFL, flags);
#endif /* ifdef USE_FIONBIO_IOCTL */

	if (ret == -1) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
#ifdef USE_FIONBIO_IOCTL
				 "ioctl(%d, FIONBIO, &on): %s", fd,
#else /* ifdef USE_FIONBIO_IOCTL */
				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
#endif /* ifdef USE_FIONBIO_IOCTL */
				 strbuf);

		return (ISC_R_UNEXPECTED);
	}

	return (ISC_R_SUCCESS);
}

#ifdef USE_CMSG
/*
 * Not all OSes support the advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
 * In order to ensure as much portability as possible, we provide wrapper
 * functions of these macros.
 * Note that cmsg_space() could run slowly on OSes that do not have
 * CMSG_SPACE.
 */
static socklen_t
cmsg_len(socklen_t len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else /* ifdef CMSG_LEN */
	socklen_t hdrlen;

	/*
	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
	 * is correct.
	 */
	hdrlen = (socklen_t)CMSG_DATA(((struct cmsghdr *)NULL));
	return (hdrlen + len);
#endif /* ifdef CMSG_LEN */
}

static socklen_t
cmsg_space(socklen_t len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else /* ifdef CMSG_SPACE */
	struct msghdr msg;
	struct cmsghdr *cmsgp;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char dummybuf[sizeof(struct cmsghdr) + 1024];

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = dummybuf;
	msg.msg_controllen = sizeof(dummybuf);

	cmsgp = (struct cmsghdr *)dummybuf;
	cmsgp->cmsg_len = cmsg_len(len);

	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
	if (cmsgp != NULL) {
		return ((char *)cmsgp - (char *)msg.msg_control);
	} else {
		return (0);
	}
#endif /* ifdef CMSG_SPACE */
}
#endif /* USE_CMSG */
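
/*
 * Usage note (illustrative): cmsg_len() sizes an individual control
 * message header plus payload, while cmsg_space() gives its aligned
 * footprint within the control buffer, e.g. in build_msghdr_send() below:
 *
 *	msg->msg_controllen += cmsg_space(sizeof(struct in6_pktinfo));
 *	cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
 */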

/*
 * Process control messages received on a socket.
 */
static void
process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
#ifdef USE_CMSG
	struct cmsghdr *cmsgp;
	struct in6_pktinfo *pktinfop;
#ifdef SO_TIMESTAMP
	void *timevalp;
#endif /* ifdef SO_TIMESTAMP */
#endif /* ifdef USE_CMSG */

	/*
	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
	 * They are all here, outside of the CPP tests, because it is
	 * more consistent with the usual ISC coding style.
	 */
	UNUSED(sock);
	UNUSED(msg);
	UNUSED(dev);

#ifdef MSG_TRUNC
	if ((msg->msg_flags & MSG_TRUNC) != 0) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
	}
#endif /* ifdef MSG_TRUNC */

#ifdef MSG_CTRUNC
	if ((msg->msg_flags & MSG_CTRUNC) != 0) {
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
	}
#endif /* ifdef MSG_CTRUNC */

#ifndef USE_CMSG
	return;
#else /* ifndef USE_CMSG */
	if (msg->msg_controllen == 0U || msg->msg_control == NULL) {
		return;
	}

#ifdef SO_TIMESTAMP
	timevalp = NULL;
#endif /* ifdef SO_TIMESTAMP */
	pktinfop = NULL;

	cmsgp = CMSG_FIRSTHDR(msg);
	while (cmsgp != NULL) {
		socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp);

		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
		    cmsgp->cmsg_type == IPV6_PKTINFO)
		{
			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
			memmove(&dev->pktinfo, pktinfop,
				sizeof(struct in6_pktinfo));
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
			socket_log(sock, NULL, TRACE,
				   "interface received on ifindex %u",
				   dev->pktinfo.ipi6_ifindex);
			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) {
				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
			}
			goto next;
		}

#ifdef SO_TIMESTAMP
		if (cmsgp->cmsg_level == SOL_SOCKET &&
		    cmsgp->cmsg_type == SCM_TIMESTAMP)
		{
			struct timeval tv;
			timevalp = CMSG_DATA(cmsgp);
			memmove(&tv, timevalp, sizeof(tv));
			dev->timestamp.seconds = tv.tv_sec;
			dev->timestamp.nanoseconds = tv.tv_usec * 1000;
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			goto next;
		}
#endif /* ifdef SO_TIMESTAMP */

#ifdef IPV6_TCLASS
		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
		    cmsgp->cmsg_type == IPV6_TCLASS)
		{
			dev->dscp = *(int *)CMSG_DATA(cmsgp);
			dev->dscp >>= 2;
			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
			goto next;
		}
#endif /* ifdef IPV6_TCLASS */

#ifdef IP_TOS
		if (cmsgp->cmsg_level == IPPROTO_IP &&
		    (cmsgp->cmsg_type == IP_TOS
#ifdef IP_RECVTOS
		     || cmsgp->cmsg_type == IP_RECVTOS
#endif /* ifdef IP_RECVTOS */
		     ))
		{
			dev->dscp = (int)*(unsigned char *)CMSG_DATA(cmsgp);
			dev->dscp >>= 2;
			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
			goto next;
		}
#endif /* ifdef IP_TOS */
	next:
		cmsgp = CMSG_NXTHDR(msg, cmsgp);
	}
#endif /* USE_CMSG */
}

/*
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the SEND constructor, which will use the used region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
 */
static void
build_msghdr_send(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp) {
	unsigned int iovcount;
	size_t write_count;
	struct cmsghdr *cmsgp;

	memset(msg, 0, sizeof(*msg));

	if (!sock->connected) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
	}

	write_count = dev->region.length - dev->n;
	iov[0].iov_base = (void *)(dev->region.base + dev->n);
	iov[0].iov_len = write_count;
	iovcount = 1;

	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
#if defined(USE_CMSG)

	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
	{
		struct in6_pktinfo *pktinfop;

		socket_log(sock, NULL, TRACE,
			   "sendto pktinfo data, ifindex %u",
			   dev->pktinfo.ipi6_ifindex);

		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp = (struct cmsghdr *)cmsgbuf;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
	}

#if defined(IPV6_USE_MIN_MTU)
	if ((sock->type == isc_sockettype_udp) && (sock->pf == AF_INET6) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
	{
		int use_min_mtu = 1; /* -1, 0, 1 */

		cmsgp = (struct cmsghdr *)(cmsgbuf + msg->msg_controllen);
		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
	}
#endif /* if defined(IPV6_USE_MIN_MTU) */

	if (isc_dscp_check_value > -1) {
		if (sock->type == isc_sockettype_udp) {
			INSIST((int)dev->dscp == isc_dscp_check_value);
		} else if (sock->type == isc_sockettype_tcp) {
			INSIST((int)sock->dscp == isc_dscp_check_value);
		}
	}

#if defined(IP_TOS) || (defined(IPPROTO_IPV6) && defined(IPV6_TCLASS))
	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
	{
		int dscp = (dev->dscp << 2) & 0xff;

		INSIST(dev->dscp < 0x40);

#ifdef IP_TOS
		if (sock->pf == AF_INET && sock->pktdscp) {
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IP;
			cmsgp->cmsg_type = IP_TOS;
			cmsgp->cmsg_len = cmsg_len(sizeof(char));
			*(unsigned char *)CMSG_DATA(cmsgp) = dscp;
		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
				       (void *)&dscp, sizeof(int)) < 0)
			{
				char strbuf[ISC_STRERRORSIZE];
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IP_TOS, %.02x)"
						 " failed: %s",
						 sock->fd, dscp >> 2, strbuf);
			} else {
				sock->dscp = dscp;
			}
		}
#endif /* ifdef IP_TOS */
#if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS)
		if (sock->pf == AF_INET6 && sock->pktdscp) {
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IPV6;
			cmsgp->cmsg_type = IPV6_TCLASS;
			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
				       (void *)&dscp, sizeof(int)) < 0)
			{
				char strbuf[ISC_STRERRORSIZE];
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IPV6_TCLASS, "
						 "%.02x) failed: %s",
						 sock->fd, dscp >> 2, strbuf);
			} else {
				sock->dscp = dscp;
			}
		}
#endif /* if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS) */
		if (msg->msg_controllen != 0 &&
		    msg->msg_controllen < SENDCMSGBUFLEN)
		{
			memset(cmsgbuf + msg->msg_controllen, 0,
			       SENDCMSGBUFLEN - msg->msg_controllen);
		}
	}
#endif /* if defined(IP_TOS) || (defined(IPPROTO_IPV6) && \
	* defined(IPV6_TCLASS)) */
#endif /* USE_CMSG */

	if (write_countp != NULL) {
		*write_countp = write_count;
	}
}

/*
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the RECV constructor, which will use the available region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If read_countp != NULL, *read_countp will hold the number of bytes
 * this transaction can receive.
 */
static void
build_msghdr_recv(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *read_countp) {
	unsigned int iovcount;
	size_t read_count;

	memset(msg, 0, sizeof(struct msghdr));

	if (sock->type == isc_sockettype_udp) {
		memset(&dev->address, 0, sizeof(dev->address));
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = sizeof(dev->address.type);
	} else { /* TCP */
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
		dev->address = sock->peer_address;
	}

	read_count = dev->region.length - dev->n;
	iov[0].iov_base = (void *)(dev->region.base + dev->n);
	iov[0].iov_len = read_count;
	iovcount = 1;

	/*
	 * If needed, set up to receive that one extra byte.
	 */
#ifdef ISC_PLATFORM_RECVOVERFLOW
	if (sock->type == isc_sockettype_udp) {
		INSIST(iovcount < MAXSCATTERGATHER_RECV);
		iov[iovcount].iov_base = (void *)(&sock->overflow);
		iov[iovcount].iov_len = 1;
		iovcount++;
	}
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

#if defined(USE_CMSG)
	msg->msg_control = cmsgbuf;
	msg->msg_controllen = RECVCMSGBUFLEN;
#else /* if defined(USE_CMSG) */
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
#endif /* USE_CMSG */
	msg->msg_flags = 0;

	if (read_countp != NULL) {
		*read_countp = read_count;
	}
}

static void
set_dev_address(const isc_sockaddr_t *address, isc_socket_t *sock,
		isc_socketevent_t *dev) {
	if (sock->type == isc_sockettype_udp) {
		if (address != NULL) {
			dev->address = *address;
		} else {
			dev->address = sock->peer_address;
		}
	} else if (sock->type == isc_sockettype_tcp) {
		INSIST(address == NULL);
		dev->address = sock->peer_address;
	}
}

static void
destroy_socketevent(isc_event_t *event) {
	isc_socketevent_t *ev = (isc_socketevent_t *)event;

	(ev->destroy)(event);
}

static isc_socketevent_t *
allocate_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
		     isc_taskaction_t action, void *arg) {
	isc_socketevent_t *ev;

	ev = (isc_socketevent_t *)isc_event_allocate(mctx, sender, eventtype,
						     action, arg, sizeof(*ev));

	ev->result = ISC_R_UNSET;
	ISC_LINK_INIT(ev, ev_link);
	ev->region.base = NULL;
	ev->n = 0;
	ev->offset = 0;
	ev->attributes = 0;
	ev->destroy = ev->ev_destroy;
	ev->ev_destroy = destroy_socketevent;
	ev->dscp = 0;

	return (ev);
}

#if defined(ISC_SOCKET_DEBUG)
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long)msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", msg->msg_iov, (long)msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++) {
		printf("\t\t%u\tbase %p, len %ld\n", i,
		       msg->msg_iov[i].iov_base, (long)msg->msg_iov[i].iov_len);
	}
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long)msg->msg_controllen);
}
#endif /* if defined(ISC_SOCKET_DEBUG) */

#define DOIO_SUCCESS 0 /* i/o ok, event sent */
#define DOIO_SOFT    1 /* i/o ok, soft error, no event sent */
#define DOIO_HARD    2 /* i/o error, event sent */
#define DOIO_EOF     3 /* EOF, no event sent */
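
/*
 * Callers (internal_recv()/internal_send(), not included in this excerpt)
 * dispatch on these codes roughly as follows -- a sketch, not verbatim:
 * DOIO_SOFT means leave the request queued and wait for the next poll;
 * DOIO_EOF means fail the request with an end-of-file result; DOIO_SUCCESS
 * and DOIO_HARD mean a completion event is posted via send_recvdone_event()
 * or send_senddone_event() with dev->result already set.
 */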

static int
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	struct msghdr msghdr;
	int recv_errno;
	char strbuf[ISC_STRERRORSIZE];
	char cmsgbuf[RECVCMSGBUFLEN] = { 0 };

	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	cc = recvmsg(sock->fd, &msghdr, 0);
	recv_errno = errno;

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno)) {
			return (DOIO_SOFT);
		}

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			strerror_r(recv_errno, strbuf, sizeof(strbuf));
			socket_log(sock, NULL, IOEVENT,
				   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno, strbuf);
		}

#define SOFT_OR_HARD(_system, _isc)                                   \
	if (recv_errno == _system) {                                  \
		if (sock->connected) {                                \
			dev->result = _isc;                           \
			inc_stats(sock->manager->stats,               \
				  sock->statsindex[STATID_RECVFAIL]); \
			return (DOIO_HARD);                           \
		}                                                     \
		return (DOIO_SOFT);                                   \
	}
#define ALWAYS_HARD(_system, _isc)                            \
	if (recv_errno == _system) {                          \
		dev->result = _isc;                           \
		inc_stats(sock->manager->stats,               \
			  sock->statsindex[STATID_RECVFAIL]); \
		return (DOIO_HARD);                           \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/*
		 * Older operating systems may still return EPROTO in some
		 * situations, for example when receiving ICMP/ICMPv6 errors.
		 * A real life scenario is when ICMPv6 returns code 5 or 6.
		 * These codes were introduced in RFC 4443 in March 2006,
		 * which obsoletes RFC 1885.  But unfortunately not all
		 * operating systems have caught up with the new standard
		 * (in 2020), and thus a generic protocol error is returned.
		 */
		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
		/* Should never get this one but it was seen. */
#ifdef ENOPROTOOPT
		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
#endif /* ifdef ENOPROTOOPT */
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		dev->result = isc__errno2result(recv_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_RECVFAIL]);
		return (DOIO_HARD);
	}

	/*
	 * On TCP and UNIX sockets, zero length reads indicate EOF,
	 * while on UDP sockets, zero length reads are perfectly valid,
	 * although strange.
	 */
	switch (sock->type) {
	case isc_sockettype_tcp:
	case isc_sockettype_unix:
		if (cc == 0) {
			return (DOIO_EOF);
		}
		break;
	case isc_sockettype_udp:
	case isc_sockettype_raw:
		break;
	case isc_sockettype_fdwatch:
	default:
		UNREACHABLE();
	}

	if (sock->type == isc_sockettype_udp) {
		dev->address.length = msghdr.msg_namelen;
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
		/*
		 * Simulate a firewall blocking UDP responses bigger than
		 * 'maxudp' bytes.
		 */
		if (sock->manager->maxudp != 0 &&
		    cc > (int)sock->manager->maxudp)
		{
			return (DOIO_SOFT);
		}
	}

	socket_log(sock, &dev->address, IOEVENT, "packet received correctly");

	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_PLATFORM_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	process_cmsg(sock, &msghdr, dev);

	/*
	 * Update the buffers (if any) and the i/o count.
	 */
	dev->n += cc;

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum)) {
		return (DOIO_SOFT);
	}

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}

/*
 * Returns:
 *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
 *			ISC_R_SUCCESS.
 *
 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
 *			dev->result contains the appropriate error.
 *
 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
 *			event was sent.  The operation should be retried.
 *
 *	No other return values are possible.
 */
static int
doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_SEND];
	size_t write_count;
	struct msghdr msghdr;
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
	int attempts = 0;
	int send_errno;
	char strbuf[ISC_STRERRORSIZE];
	char cmsgbuf[SENDCMSGBUFLEN] = { 0 };

	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);

resend:
	if (sock->type == isc_sockettype_udp && sock->manager->maxudp != 0 &&
	    write_count > sock->manager->maxudp)
	{
		cc = write_count;
	} else {
		cc = sendmsg(sock->fd, &msghdr, 0);
	}
	send_errno = errno;

	/*
	 * Check for error or block condition.
	 */
	if (cc < 0) {
		if (send_errno == EINTR && ++attempts < NRETRIES) {
			goto resend;
		}

		if (SOFT_ERROR(send_errno)) {
			if (errno == EWOULDBLOCK || errno == EAGAIN) {
				dev->result = ISC_R_WOULDBLOCK;
			}
			return (DOIO_SOFT);
		}

#define SOFT_OR_HARD(_system, _isc)                                   \
	if (send_errno == _system) {                                  \
		if (sock->connected) {                                \
			dev->result = _isc;                           \
			inc_stats(sock->manager->stats,               \
				  sock->statsindex[STATID_SENDFAIL]); \
			return (DOIO_HARD);                           \
		}                                                     \
		return (DOIO_SOFT);                                   \
	}
#define ALWAYS_HARD(_system, _isc)                            \
	if (send_errno == _system) {                          \
		dev->result = _isc;                           \
		inc_stats(sock->manager->stats,               \
			  sock->statsindex[STATID_SENDFAIL]); \
		return (DOIO_HARD);                           \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif /* ifdef EHOSTDOWN */
		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/*
		 * The other error types depend on whether or not the
		 * socket is UDP or TCP.  If it is UDP, some errors
		 * that we expect to be fatal under TCP are merely
		 * annoying, and are really soft errors.
		 *
		 * However, these soft errors are still returned as
		 * a status.
		 */
		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
		strerror_r(send_errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
				 addrbuf, strbuf);
		dev->result = isc__errno2result(send_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		return (DOIO_HARD);
	}

	if (cc == 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "doio_send: send() returned 0");
	}

	/*
	 * If we write less than we expected, update counters, poke.
	 */
	dev->n += cc;
	if ((size_t)cc != write_count) {
		return (DOIO_SOFT);
	}

	/*
	 * Exactly what we wanted to write.  We're done with this
	 * entry.  Post its completion event.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}
1790
1791 /*
1792 * Kill.
1793 *
1794 * Caller must ensure that the socket is not locked and no external
1795 * references exist.
1796 */
1797 static void
1798 socketclose(isc__socketthread_t *thread, isc_socket_t *sock, int fd) {
1799 int lockid = FDLOCK_ID(fd);
1800 /*
1801 * No one has this socket open, so the watcher doesn't have to be
1802 * poked, and the socket doesn't have to be locked.
1803 */
1804 LOCK(&thread->fdlock[lockid]);
1805 thread->fds[fd] = NULL;
1806 if (sock->type == isc_sockettype_fdwatch)
1807 thread->fdstate[fd] = CLOSED;
1808 else
1809 thread->fdstate[fd] = CLOSE_PENDING;
1810 UNLOCK(&thread->fdlock[lockid]);
1811 if (sock->type == isc_sockettype_fdwatch) {
1812 /*
1813 * The caller may close the socket once this function returns,
1814 * and `fd' may be reassigned for a new socket. So we do
1815 * unwatch_fd() here, rather than defer it via select_poke().
1816 * Note: this may complicate data protection among threads and
1817 * may reduce performance due to additional locks. One way to
1818 * solve this would be to dup() the watched descriptor, but we
1819 * take a simpler approach at this moment.
1820 */
1821 (void)unwatch_fd(thread, fd, SELECT_POKE_READ);
1822 (void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
1823 } else
1824 select_poke(thread->manager, thread->threadid, fd,
1825 SELECT_POKE_CLOSE);
1826
1827 inc_stats(thread->manager->stats, sock->statsindex[STATID_CLOSE]);
1828
1829 LOCK(&sock->lock);
1830 if (sock->active == 1) {
1831 dec_stats(thread->manager->stats,
1832 sock->statsindex[STATID_ACTIVE]);
1833 sock->active = 0;
1834 }
1835 UNLOCK(&sock->lock);
1836
1837 /*
1838 * update manager->maxfd here (XXX: this should be implemented more
1839 * efficiently)
1840 */
1841 #ifdef USE_SELECT
1842 LOCK(&thread->manager->lock);
1843 if (thread->maxfd == fd) {
1844 int i;
1845
1846 thread->maxfd = 0;
1847 for (i = fd - 1; i >= 0; i--) {
1848 lockid = FDLOCK_ID(i);
1849
1850 LOCK(&thread->fdlock[lockid]);
1851 if (thread->fdstate[i] == MANAGED) {
1852 thread->maxfd = i;
1853 UNLOCK(&thread->fdlock[lockid]);
1854 break;
1855 }
1856 UNLOCK(&thread->fdlock[lockid]);
1857 }
1858 if (thread->maxfd < thread->pipe_fds[0]) {
1859 thread->maxfd = thread->pipe_fds[0];
1860 }
1861 }
1862
1863 UNLOCK(&thread->manager->lock);
1864 #endif /* USE_SELECT */
1865 }
1866
1867 static void
1868 destroy(isc_socket_t **sockp) {
1869 int fd = 0;
1870 isc_socket_t *sock = *sockp;
1871 isc_socketmgr_t *manager = sock->manager;
1872 isc__socketthread_t *thread = NULL;
1873
1874 socket_log(sock, NULL, CREATION, "destroying");
1875
1876 isc_refcount_destroy(&sock->references);
1877
1878 LOCK(&sock->lock);
1879 INSIST(ISC_LIST_EMPTY(sock->connect_list));
1880 INSIST(ISC_LIST_EMPTY(sock->accept_list));
1881 INSIST(ISC_LIST_EMPTY(sock->recv_list));
1882 INSIST(ISC_LIST_EMPTY(sock->send_list));
1883 INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1884
1885 if (sock->fd >= 0) {
1886 fd = sock->fd;
1887 thread = &manager->threads[sock->threadid];
1888 sock->fd = -1;
1889 sock->threadid = -1;
1890 }
1891 UNLOCK(&sock->lock);
1892
1893 if (fd > 0) {
1894 socketclose(thread, sock, fd);
1895 }
1896
1897 LOCK(&manager->lock);
1898
1899 ISC_LIST_UNLINK(manager->socklist, sock, link);
1900
1901 if (ISC_LIST_EMPTY(manager->socklist)) {
1902 SIGNAL(&manager->shutdown_ok);
1903 }
1904
1905 /* can't unlock manager as its memory context is still used */
1906 free_socket(sockp);
1907
1908 UNLOCK(&manager->lock);
1909 }
1910
1911 static isc_result_t
1912 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1913 isc_socket_t **socketp) {
1914 isc_socket_t *sock;
1915
1916 sock = isc_mem_get(manager->mctx, sizeof(*sock));
1917
1918 sock->magic = 0;
1919 isc_refcount_init(&sock->references, 0);
1920
1921 sock->manager = manager;
1922 sock->type = type;
1923 sock->fd = -1;
1924 sock->threadid = -1;
1925 sock->dscp = 0; /* TOS/TCLASS is zero until set. */
1926 sock->dupped = 0;
1927 sock->statsindex = NULL;
1928 sock->active = 0;
1929
1930 ISC_LINK_INIT(sock, link);
1931
1932 memset(sock->name, 0, sizeof(sock->name));
1933 sock->tag = NULL;
1934
1935 /*
1936 * Set up list of readers and writers to be initially empty.
1937 */
1938 ISC_LIST_INIT(sock->recv_list);
1939 ISC_LIST_INIT(sock->send_list);
1940 ISC_LIST_INIT(sock->accept_list);
1941 ISC_LIST_INIT(sock->connect_list);
1942
1943 sock->listener = 0;
1944 sock->connected = 0;
1945 sock->connecting = 0;
1946 sock->bound = 0;
1947 sock->pktdscp = 0;
1948
1949 /*
1950 * Initialize the lock.
1951 */
1952 isc_mutex_init(&sock->lock);
1953
1954 sock->magic = SOCKET_MAGIC;
1955 *socketp = sock;
1956
1957 return (ISC_R_SUCCESS);
1958 }
1959
1960 /*
1961  * This routine requires that the various lists be empty, that the
1962  * reference count be zero, and that the magic number is valid.  The
1963  * other socket bits, like the lock, must be initialized as well.
1964  * The associated fd must already have been closed and set to -1;
1965  * this routine only frees the in-memory structure.
1966 */
1967 static void
1968 free_socket(isc_socket_t **socketp) {
1969 isc_socket_t *sock = *socketp;
1970 *socketp = NULL;
1971
1972 INSIST(VALID_SOCKET(sock));
1973 isc_refcount_destroy(&sock->references);
1974 LOCK(&sock->lock);
1975 INSIST(!sock->connecting);
1976 INSIST(ISC_LIST_EMPTY(sock->recv_list));
1977 INSIST(ISC_LIST_EMPTY(sock->send_list));
1978 INSIST(ISC_LIST_EMPTY(sock->accept_list));
1979 INSIST(ISC_LIST_EMPTY(sock->connect_list));
1980 INSIST(!ISC_LINK_LINKED(sock, link));
1981 UNLOCK(&sock->lock);
1982
1983 sock->magic = 0;
1984
1985 isc_mutex_destroy(&sock->lock);
1986
1987 isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1988 }
1989
1990 #if defined(SET_RCVBUF)
1991 static isc_once_t rcvbuf_once = ISC_ONCE_INIT;
1992 static int rcvbuf = ISC_RECV_BUFFER_SIZE;
1993
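/*
 * Probe (once per process, via rcvbuf_once) for the largest usable
 * SO_RCVBUF at or below ISC_RECV_BUFFER_SIZE: if the kernel rejects the
 * requested size with ENOBUFS, binary-search between the kernel's
 * current value and the requested one.
 */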
1994 static void
1995 set_rcvbuf(void) {
1996 int fd;
1997 int max = rcvbuf, min;
1998 socklen_t len;
1999
2000 fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
2001 if (fd == -1) {
2002 switch (errno) {
2003 case EPROTONOSUPPORT:
2004 case EPFNOSUPPORT:
2005 case EAFNOSUPPORT:
2006 /*
2007 * Linux 2.2 (and maybe others) return EINVAL instead of
2008 * EAFNOSUPPORT.
2009 */
2010 case EINVAL:
2011 fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
2012 break;
2013 }
2014 }
2015 if (fd == -1) {
2016 return;
2017 }
2018
2019 len = sizeof(min);
2020 if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&min, &len) == 0 &&
2021 min < rcvbuf)
2022 {
2023 again:
2024 if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&rcvbuf,
2025 sizeof(rcvbuf)) == -1)
2026 {
2027 if (errno == ENOBUFS && rcvbuf > min) {
2028 max = rcvbuf - 1;
2029 rcvbuf = (rcvbuf + min) / 2;
2030 goto again;
2031 } else {
2032 rcvbuf = min;
2033 goto cleanup;
2034 }
2035 } else {
2036 min = rcvbuf;
2037 }
2038 if (min != max) {
2039 rcvbuf = max;
2040 goto again;
2041 }
2042 }
2043 cleanup:
2044 close(fd);
2045 }
2046 #endif /* if defined(SET_RCVBUF) */
2047
2048 #if defined(SET_SNDBUF)
2049 static isc_once_t sndbuf_once = ISC_ONCE_INIT;
2050 static int sndbuf = ISC_SEND_BUFFER_SIZE;
2051
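/*
 * The same probing strategy as set_rcvbuf() above, applied to
 * SO_SNDBUF and ISC_SEND_BUFFER_SIZE.
 */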
2052 static void
2053 set_sndbuf(void) {
2054 int fd;
2055 int max = sndbuf, min;
2056 socklen_t len;
2057
2058 fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
2059 if (fd == -1) {
2060 switch (errno) {
2061 case EPROTONOSUPPORT:
2062 case EPFNOSUPPORT:
2063 case EAFNOSUPPORT:
2064 /*
2065 * Linux 2.2 (and maybe others) return EINVAL instead of
2066 * EAFNOSUPPORT.
2067 */
2068 case EINVAL:
2069 fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
2070 break;
2071 }
2072 }
2073 if (fd == -1) {
2074 return;
2075 }
2076
2077 len = sizeof(min);
2078 if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&min, &len) == 0 &&
2079 min < sndbuf)
2080 {
2081 again:
2082 if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&sndbuf,
2083 sizeof(sndbuf)) == -1)
2084 {
2085 if (errno == ENOBUFS && sndbuf > min) {
2086 max = sndbuf - 1;
2087 sndbuf = (sndbuf + min) / 2;
2088 goto again;
2089 } else {
2090 sndbuf = min;
2091 goto cleanup;
2092 }
2093 } else {
2094 min = sndbuf;
2095 }
2096 if (min != max) {
2097 sndbuf = max;
2098 goto again;
2099 }
2100 }
2101 cleanup:
2102 close(fd);
2103 }
2104 #endif /* if defined(SET_SNDBUF) */
2105
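/*
 * Request IPv6 minimum-MTU (1280 octet) behaviour where the platform
 * supports it, via IPV6_USE_MIN_MTU (RFC 3542) and/or a fixed IPV6_MTU;
 * failures are deliberately ignored.
 */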
2106 static void
2107 use_min_mtu(isc_socket_t *sock) {
2108 #if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU)
2109 UNUSED(sock);
2110 #endif /* if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU) */
2111 #ifdef IPV6_USE_MIN_MTU
2112 /* use minimum MTU */
2113 if (sock->pf == AF_INET6) {
2114 int on = 1;
2115 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
2116 (void *)&on, sizeof(on));
2117 }
2118 #endif /* ifdef IPV6_USE_MIN_MTU */
2119 #if defined(IPV6_MTU)
2120 /*
2121 * Use minimum MTU on IPv6 sockets.
2122 */
2123 if (sock->pf == AF_INET6) {
2124 int mtu = 1280;
2125 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU, &mtu,
2126 sizeof(mtu));
2127 }
2128 #endif /* if defined(IPV6_MTU) */
2129 }
2130
2131 static void
2132 set_tcp_maxseg(isc_socket_t *sock, int size) {
2133 #ifdef TCP_MAXSEG
2134 if (sock->type == isc_sockettype_tcp) {
2135 (void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
2136 (void *)&size, sizeof(size));
2137 }
2138 #endif /* ifdef TCP_MAXSEG */
2139 }
2140
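/*
 * Disable path-MTU discovery where the platform allows it, by clearing
 * the "don't fragment" behaviour (IPV6_DONTFRAG/IP_DONTFRAG) and/or
 * selecting the PMTUDISC_OMIT policy, so that large datagrams may be
 * fragmented instead of depending on discovered path-MTU state.
 */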
2141 static void
2142 set_ip_disable_pmtud(isc_socket_t *sock) {
2143 /*
2144 	 * Disable Path MTU Discovery on IP packets.
2145 */
2146 if (sock->pf == AF_INET6) {
2147 #if defined(IPV6_DONTFRAG)
2148 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_DONTFRAG,
2149 &(int){ 0 }, sizeof(int));
2150 #endif
2151 #if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
2152 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
2153 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
2154 #endif
2155 } else if (sock->pf == AF_INET) {
2156 #if defined(IP_DONTFRAG)
2157 (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG, &(int){ 0 },
2158 sizeof(int));
2159 #endif
2160 #if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
2161 (void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
2162 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
2163 #endif
2164 }
2165 }
2166
2167 static isc_result_t
2168 opensocket(isc_socketmgr_t *manager, isc_socket_t *sock,
2169 isc_socket_t *dup_socket) {
2170 isc_result_t result;
2171 char strbuf[ISC_STRERRORSIZE];
2172 const char *err = "socket";
2173 int tries = 0;
2174 #if defined(USE_CMSG) || defined(SO_NOSIGPIPE)
2175 int on = 1;
2176 #endif /* if defined(USE_CMSG) || defined(SO_NOSIGPIPE) */
2177 #if defined(SET_RCVBUF) || defined(SET_SNDBUF)
2178 socklen_t optlen;
2179 int size = 0;
2180 #endif
2181
2182 again:
2183 if (dup_socket == NULL) {
2184 switch (sock->type) {
2185 case isc_sockettype_udp:
2186 sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
2187 break;
2188 case isc_sockettype_tcp:
2189 sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
2190 break;
2191 case isc_sockettype_unix:
2192 sock->fd = socket(sock->pf, SOCK_STREAM, 0);
2193 break;
2194 case isc_sockettype_raw:
2195 errno = EPFNOSUPPORT;
2196 /*
2197 			 * PF_ROUTE is an alias for PF_NETLINK on Linux.
2198 */
2199 #if defined(PF_ROUTE)
2200 if (sock->fd == -1 && sock->pf == PF_ROUTE) {
2201 #ifdef NETLINK_ROUTE
2202 sock->fd = socket(sock->pf, SOCK_RAW,
2203 NETLINK_ROUTE);
2204 #else /* ifdef NETLINK_ROUTE */
2205 sock->fd = socket(sock->pf, SOCK_RAW, 0);
2206 #endif /* ifdef NETLINK_ROUTE */
2207 if (sock->fd != -1) {
2208 #ifdef NETLINK_ROUTE
2209 struct sockaddr_nl sa;
2210 int n;
2211
2212 /*
2213 * Do an implicit bind.
2214 */
2215 memset(&sa, 0, sizeof(sa));
2216 sa.nl_family = AF_NETLINK;
2217 sa.nl_groups = RTMGRP_IPV4_IFADDR |
2218 RTMGRP_IPV6_IFADDR;
2219 n = bind(sock->fd,
2220 (struct sockaddr *)&sa,
2221 sizeof(sa));
2222 if (n < 0) {
2223 close(sock->fd);
2224 sock->fd = -1;
2225 }
2226 #endif /* ifdef NETLINK_ROUTE */
2227 sock->bound = 1;
2228 }
2229 }
2230 #endif /* if defined(PF_ROUTE) */
2231 break;
2232 case isc_sockettype_fdwatch:
2233 /*
2234 * We should not be called for isc_sockettype_fdwatch
2235 * sockets.
2236 */
2237 INSIST(0);
2238 break;
2239 }
2240 } else {
2241 sock->fd = dup(dup_socket->fd);
2242 sock->dupped = 1;
2243 sock->bound = dup_socket->bound;
2244 }
2245 if (sock->fd == -1 && errno == EINTR && tries++ < 42) {
2246 goto again;
2247 }
2248
2249 #ifdef F_DUPFD
2250 /*
2251 	 * Keep low descriptors free so that stdio and TCP have room to work in.
2252 */
2253 if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
2254 sock->fd >= 0 && sock->fd < manager->reserved)
2255 {
2256 int newfd, tmp;
2257 newfd = fcntl(sock->fd, F_DUPFD, manager->reserved);
2258 tmp = errno;
2259 (void)close(sock->fd);
2260 errno = tmp;
2261 sock->fd = newfd;
2262 err = "isc_socket_create: fcntl/reserved";
2263 } else if (sock->fd >= 0 && sock->fd < 20) {
2264 int newfd, tmp;
2265 newfd = fcntl(sock->fd, F_DUPFD, 20);
2266 tmp = errno;
2267 (void)close(sock->fd);
2268 errno = tmp;
2269 sock->fd = newfd;
2270 err = "isc_socket_create: fcntl";
2271 }
2272 #endif /* ifdef F_DUPFD */
2273
2274 if (sock->fd >= (int)manager->maxsocks) {
2275 (void)close(sock->fd);
2276 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2277 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2278 "socket: file descriptor exceeds limit (%d/%u)",
2279 sock->fd, manager->maxsocks);
2280 inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
2281 return (ISC_R_NORESOURCES);
2282 }
2283
2284 if (sock->fd < 0) {
2285 switch (errno) {
2286 case EMFILE:
2287 case ENFILE:
2288 strerror_r(errno, strbuf, sizeof(strbuf));
2289 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2290 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2291 "%s: %s", err, strbuf);
2292 FALLTHROUGH;
2293 case ENOBUFS:
2294 inc_stats(manager->stats,
2295 sock->statsindex[STATID_OPENFAIL]);
2296 return (ISC_R_NORESOURCES);
2297
2298 case EPROTONOSUPPORT:
2299 case EPFNOSUPPORT:
2300 case EAFNOSUPPORT:
2301 /*
2302 * Linux 2.2 (and maybe others) return EINVAL instead of
2303 * EAFNOSUPPORT.
2304 */
2305 case EINVAL:
2306 inc_stats(manager->stats,
2307 sock->statsindex[STATID_OPENFAIL]);
2308 return (ISC_R_FAMILYNOSUPPORT);
2309
2310 default:
2311 strerror_r(errno, strbuf, sizeof(strbuf));
2312 UNEXPECTED_ERROR(__FILE__, __LINE__, "%s() failed: %s",
2313 err, strbuf);
2314 inc_stats(manager->stats,
2315 sock->statsindex[STATID_OPENFAIL]);
2316 return (ISC_R_UNEXPECTED);
2317 }
2318 }
2319
2320 if (dup_socket != NULL) {
2321 goto setup_done;
2322 }
2323
2324 result = make_nonblock(sock->fd);
2325 if (result != ISC_R_SUCCESS) {
2326 (void)close(sock->fd);
2327 inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
2328 return (result);
2329 }
2330
2331 #ifdef SO_NOSIGPIPE
2332 if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE, (void *)&on,
2333 sizeof(on)) < 0)
2334 {
2335 strerror_r(errno, strbuf, sizeof(strbuf));
2336 UNEXPECTED_ERROR(__FILE__, __LINE__,
2337 "setsockopt(%d, SO_NOSIGPIPE) failed: %s",
2338 sock->fd, strbuf);
2339 /* Press on... */
2340 }
2341 #endif /* ifdef SO_NOSIGPIPE */
2342
2343 /*
2344 * Use minimum mtu if possible.
2345 */
2346 if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
2347 use_min_mtu(sock);
2348 set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
2349 }
2350
2351 #if defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF)
2352 if (sock->type == isc_sockettype_udp) {
2353 #if defined(USE_CMSG)
2354 #if defined(SO_TIMESTAMP)
2355 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, (void *)&on,
2356 sizeof(on)) < 0 &&
2357 errno != ENOPROTOOPT)
2358 {
2359 strerror_r(errno, strbuf, sizeof(strbuf));
2360 UNEXPECTED_ERROR(__FILE__, __LINE__,
2361 "setsockopt(%d, SO_TIMESTAMP) failed: "
2362 "%s",
2363 sock->fd, strbuf);
2364 /* Press on... */
2365 }
2366 #endif /* SO_TIMESTAMP */
2367
2368 #ifdef IPV6_RECVPKTINFO
2369 /* RFC 3542 */
2370 if ((sock->pf == AF_INET6) &&
2371 (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
2372 (void *)&on, sizeof(on)) < 0))
2373 {
2374 strerror_r(errno, strbuf, sizeof(strbuf));
2375 UNEXPECTED_ERROR(__FILE__, __LINE__,
2376 "setsockopt(%d, IPV6_RECVPKTINFO) "
2377 "failed: %s",
2378 sock->fd, strbuf);
2379 }
2380 #else /* ifdef IPV6_RECVPKTINFO */
2381 /* RFC 2292 */
2382 if ((sock->pf == AF_INET6) &&
2383 (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
2384 (void *)&on, sizeof(on)) < 0))
2385 {
2386 strerror_r(errno, strbuf, sizeof(strbuf));
2387 UNEXPECTED_ERROR(__FILE__, __LINE__,
2388 "setsockopt(%d, IPV6_PKTINFO) failed: "
2389 "%s",
2390 sock->fd, strbuf);
2391 }
2392 #endif /* IPV6_RECVPKTINFO */
2393 #endif /* defined(USE_CMSG) */
2394
2395 #if defined(SET_RCVBUF)
2396 optlen = sizeof(size);
2397 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (void *)&size,
2398 &optlen) == 0 &&
2399 size < rcvbuf)
2400 {
2401 RUNTIME_CHECK(isc_once_do(&rcvbuf_once, set_rcvbuf) ==
2402 ISC_R_SUCCESS);
2403 if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
2404 (void *)&rcvbuf, sizeof(rcvbuf)) == -1)
2405 {
2406 strerror_r(errno, strbuf, sizeof(strbuf));
2407 UNEXPECTED_ERROR(__FILE__, __LINE__,
2408 "setsockopt(%d, SO_RCVBUF, "
2409 "%d) failed: %s",
2410 sock->fd, rcvbuf, strbuf);
2411 }
2412 }
2413 #endif /* if defined(SET_RCVBUF) */
2414
2415 #if defined(SET_SNDBUF)
2416 optlen = sizeof(size);
2417 if (getsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (void *)&size,
2418 &optlen) == 0 &&
2419 size < sndbuf)
2420 {
2421 RUNTIME_CHECK(isc_once_do(&sndbuf_once, set_sndbuf) ==
2422 ISC_R_SUCCESS);
2423 if (setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF,
2424 (void *)&sndbuf, sizeof(sndbuf)) == -1)
2425 {
2426 strerror_r(errno, strbuf, sizeof(strbuf));
2427 UNEXPECTED_ERROR(__FILE__, __LINE__,
2428 "setsockopt(%d, SO_SNDBUF, "
2429 "%d) failed: %s",
2430 sock->fd, sndbuf, strbuf);
2431 }
2432 }
2433 #endif /* if defined(SET_SNDBUF) */
2434 }
2435 #ifdef IPV6_RECVTCLASS
2436 if ((sock->pf == AF_INET6) &&
2437 (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVTCLASS, (void *)&on,
2438 sizeof(on)) < 0))
2439 {
2440 strerror_r(errno, strbuf, sizeof(strbuf));
2441 UNEXPECTED_ERROR(__FILE__, __LINE__,
2442 "setsockopt(%d, IPV6_RECVTCLASS) "
2443 "failed: %s",
2444 sock->fd, strbuf);
2445 }
2446 #endif /* ifdef IPV6_RECVTCLASS */
2447 #ifdef IP_RECVTOS
2448 if ((sock->pf == AF_INET) &&
2449 (setsockopt(sock->fd, IPPROTO_IP, IP_RECVTOS, (void *)&on,
2450 sizeof(on)) < 0))
2451 {
2452 strerror_r(errno, strbuf, sizeof(strbuf));
2453 UNEXPECTED_ERROR(__FILE__, __LINE__,
2454 "setsockopt(%d, IP_RECVTOS) "
2455 "failed: %s",
2456 sock->fd, strbuf);
2457 }
2458 #endif /* ifdef IP_RECVTOS */
2459 #endif /* defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF) */
2460
2461 set_ip_disable_pmtud(sock);
2462
2463 setup_done:
2464 inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
2465 if (sock->active == 0) {
2466 inc_stats(manager->stats, sock->statsindex[STATID_ACTIVE]);
2467 sock->active = 1;
2468 }
2469
2470 return (ISC_R_SUCCESS);
2471 }
2472
2473 /*
2474  * Create a 'type' socket of address family 'pf', managed by 'manager',
2475  * or duplicate the existing socket 'dup_socket' if it is non-NULL.
2476  * The new socket is returned in 'socketp'; completion events are
2477  * posted to the tasks given in the individual I/O requests.
2478 */
2479 static isc_result_t
2480 socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
2481 isc_socket_t **socketp, isc_socket_t *dup_socket) {
2482 isc_socket_t *sock = NULL;
2483 isc__socketthread_t *thread;
2484 isc_result_t result;
2485 int lockid;
2486
2487 REQUIRE(VALID_MANAGER(manager));
2488 REQUIRE(socketp != NULL && *socketp == NULL);
2489 REQUIRE(type != isc_sockettype_fdwatch);
2490
2491 result = allocate_socket(manager, type, &sock);
2492 if (result != ISC_R_SUCCESS) {
2493 return (result);
2494 }
2495
2496 switch (sock->type) {
2497 case isc_sockettype_udp:
2498 sock->statsindex = (pf == AF_INET) ? udp4statsindex
2499 : udp6statsindex;
2500 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
2501 sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
2502 break;
2503 case isc_sockettype_tcp:
2504 sock->statsindex = (pf == AF_INET) ? tcp4statsindex
2505 : tcp6statsindex;
2506 break;
2507 case isc_sockettype_unix:
2508 sock->statsindex = unixstatsindex;
2509 break;
2510 case isc_sockettype_raw:
2511 sock->statsindex = rawstatsindex;
2512 break;
2513 default:
2514 UNREACHABLE();
2515 }
2516
2517 sock->pf = pf;
2518
2519 result = opensocket(manager, sock, dup_socket);
2520 if (result != ISC_R_SUCCESS) {
2521 free_socket(&sock);
2522 return (result);
2523 }
2524
2525 if (sock->fd == -1) {
2526 abort();
2527 }
2528 sock->threadid = gen_threadid(sock);
2529 isc_refcount_increment0(&sock->references);
2530 thread = &manager->threads[sock->threadid];
2531 *socketp = sock;
2532
2533 /*
2534 * Note we don't have to lock the socket like we normally would because
2535 * there are no external references to it yet.
2536 */
2537
2538 lockid = FDLOCK_ID(sock->fd);
2539 LOCK(&thread->fdlock[lockid]);
2540 thread->fds[sock->fd] = sock;
2541 thread->fdstate[sock->fd] = MANAGED;
2542 #if defined(USE_EPOLL)
2543 thread->epoll_events[sock->fd] = 0;
2544 #endif /* if defined(USE_EPOLL) */
2545 #ifdef USE_DEVPOLL
2546 INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
2547 thread->fdpollinfo[sock->fd].want_write == 0);
2548 #endif /* ifdef USE_DEVPOLL */
2549 UNLOCK(&thread->fdlock[lockid]);
2550
2551 LOCK(&manager->lock);
2552 ISC_LIST_APPEND(manager->socklist, sock, link);
2553 #ifdef USE_SELECT
2554 if (thread->maxfd < sock->fd) {
2555 thread->maxfd = sock->fd;
2556 }
2557 #endif /* ifdef USE_SELECT */
2558 UNLOCK(&manager->lock);
2559
2560 socket_log(sock, NULL, CREATION,
2561 dup_socket != NULL ? "dupped" : "created");
2562
2563 return (ISC_R_SUCCESS);
2564 }
2565
2566 /*%
2567  * Create a new 'type' socket of address family 'pf', managed by
2568  * 'manager'.  The new socket is returned in 'socketp'; completion
2569  * events are posted to the tasks given in the individual I/O
2570  * requests made on the socket.
2571 */
2572 isc_result_t
2573 isc_socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
2574 isc_socket_t **socketp) {
2575 return (socket_create(manager0, pf, type, socketp, NULL));
2576 }
2577
2578 /*%
2579 * Duplicate an existing socket. The new socket is returned
2580 * in 'socketp'.
2581 */
2582 isc_result_t
2583 isc_socket_dup(isc_socket_t *sock, isc_socket_t **socketp) {
2584 REQUIRE(VALID_SOCKET(sock));
2585 REQUIRE(socketp != NULL && *socketp == NULL);
2586
2587 return (socket_create(sock->manager, sock->pf, sock->type, socketp,
2588 sock));
2589 }
2590
2591 isc_result_t
2592 isc_socket_open(isc_socket_t *sock) {
2593 isc_result_t result;
2594 isc__socketthread_t *thread;
2595
2596 REQUIRE(VALID_SOCKET(sock));
2597
2598 LOCK(&sock->lock);
2599
2600 REQUIRE(isc_refcount_current(&sock->references) >= 1);
2601 REQUIRE(sock->fd == -1);
2602 REQUIRE(sock->threadid == -1);
2603 REQUIRE(sock->type != isc_sockettype_fdwatch);
2604
2605 result = opensocket(sock->manager, sock, NULL);
2606
2607 UNLOCK(&sock->lock);
2608
2609 if (result != ISC_R_SUCCESS) {
2610 sock->fd = -1;
2611 } else {
2612 sock->threadid = gen_threadid(sock);
2613 thread = &sock->manager->threads[sock->threadid];
2614 int lockid = FDLOCK_ID(sock->fd);
2615
2616 LOCK(&thread->fdlock[lockid]);
2617 thread->fds[sock->fd] = sock;
2618 thread->fdstate[sock->fd] = MANAGED;
2619 #if defined(USE_EPOLL)
2620 thread->epoll_events[sock->fd] = 0;
2621 #endif /* if defined(USE_EPOLL) */
2622 #ifdef USE_DEVPOLL
2623 INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
2624 thread->fdpollinfo[sock->fd].want_write == 0);
2625 #endif /* ifdef USE_DEVPOLL */
2626 UNLOCK(&thread->fdlock[lockid]);
2627
2628 #ifdef USE_SELECT
2629 LOCK(&sock->manager->lock);
2630 if (thread->maxfd < sock->fd) {
2631 thread->maxfd = sock->fd;
2632 }
2633 UNLOCK(&sock->manager->lock);
2634 #endif /* ifdef USE_SELECT */
2635 }
2636
2637 return (result);
2638 }
2639
2640 /*
2641 * Attach to a socket. Caller must explicitly detach when it is done.
2642 */
2643 void
2644 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
2645 REQUIRE(VALID_SOCKET(sock));
2646 REQUIRE(socketp != NULL && *socketp == NULL);
2647
2648 int old_refs = isc_refcount_increment(&sock->references);
2649 REQUIRE(old_refs > 0);
2650
2651 *socketp = sock;
2652 }
2653
2654 /*
2655 * Dereference a socket. If this is the last reference to it, clean things
2656 * up by destroying the socket.
2657 */
2658 void
2659 isc_socket_detach(isc_socket_t **socketp) {
2660 isc_socket_t *sock;
2661
2662 REQUIRE(socketp != NULL);
2663 sock = *socketp;
2664 REQUIRE(VALID_SOCKET(sock));
2665 if (isc_refcount_decrement(&sock->references) == 1) {
2666 destroy(&sock);
2667 }
2668
2669 *socketp = NULL;
2670 }
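
/*
 * Illustrative usage sketch (not taken from this file): a caller that
 * needs to keep 'sock' alive takes its own reference and releases it
 * when done:
 *
 *	isc_socket_t *ref = NULL;
 *	isc_socket_attach(sock, &ref);
 *	...use 'ref'...
 *	isc_socket_detach(&ref);	(ref is NULL afterwards)
 */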
2671
2672 isc_result_t
2673 isc_socket_close(isc_socket_t *sock) {
2674 int fd;
2675 isc_socketmgr_t *manager;
2676 isc__socketthread_t *thread;
2677 fflush(stdout);
2678 REQUIRE(VALID_SOCKET(sock));
2679
2680 LOCK(&sock->lock);
2681
2682 REQUIRE(sock->type != isc_sockettype_fdwatch);
2683 REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
2684
2685 INSIST(!sock->connecting);
2686 INSIST(ISC_LIST_EMPTY(sock->recv_list));
2687 INSIST(ISC_LIST_EMPTY(sock->send_list));
2688 INSIST(ISC_LIST_EMPTY(sock->accept_list));
2689 INSIST(ISC_LIST_EMPTY(sock->connect_list));
2690
2691 manager = sock->manager;
2692 thread = &manager->threads[sock->threadid];
2693 fd = sock->fd;
2694 sock->fd = -1;
2695 sock->threadid = -1;
2696
2697 sock->dupped = 0;
2698 memset(sock->name, 0, sizeof(sock->name));
2699 sock->tag = NULL;
2700 sock->listener = 0;
2701 sock->connected = 0;
2702 sock->connecting = 0;
2703 sock->bound = 0;
2704 isc_sockaddr_any(&sock->peer_address);
2705
2706 UNLOCK(&sock->lock);
2707
2708 socketclose(thread, sock, fd);
2709
2710 return (ISC_R_SUCCESS);
2711 }
2712
2713 static void
2714 dispatch_recv(isc_socket_t *sock) {
2715 if (sock->type != isc_sockettype_fdwatch) {
2716 internal_recv(sock);
2717 } else {
2718 internal_fdwatch_read(sock);
2719 }
2720 }
2721
2722 static void
2723 dispatch_send(isc_socket_t *sock) {
2724 if (sock->type != isc_sockettype_fdwatch) {
2725 internal_send(sock);
2726 } else {
2727 internal_fdwatch_write(sock);
2728 }
2729 }
2730
2731 /*
2732 * Dequeue an item off the given socket's read queue, set the result code
2733 * in the done event to the one provided, and send it to the task it was
2734 * destined for.
2735 *
2736 * If the event to be sent is on a list, remove it before sending. If
2737 * asked to, send and detach from the socket as well.
2738 *
2739 * Caller must have the socket locked if the event is attached to the socket.
2740 */
2741 static void
2742 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2743 isc_task_t *task;
2744
2745 task = (*dev)->ev_sender;
2746
2747 (*dev)->ev_sender = sock;
2748
2749 if (ISC_LINK_LINKED(*dev, ev_link)) {
2750 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2751 }
2752
2753 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2754 isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2755 sock->threadid);
2756 } else {
2757 isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2758 }
2759 }
2760
2761 /*
2762 * See comments for send_recvdone_event() above.
2763 *
2764 * Caller must have the socket locked if the event is attached to the socket.
2765 */
2766 static void
2767 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2768 isc_task_t *task;
2769
2770 INSIST(dev != NULL && *dev != NULL);
2771
2772 task = (*dev)->ev_sender;
2773 (*dev)->ev_sender = sock;
2774
2775 if (ISC_LINK_LINKED(*dev, ev_link)) {
2776 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
2777 }
2778
2779 if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2780 isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2781 sock->threadid);
2782 } else {
2783 isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2784 }
2785 }
2786
2787 /*
2788 * See comments for send_recvdone_event() above.
2789 *
2790 * Caller must have the socket locked if the event is attached to the socket.
2791 */
2792 static void
2793 send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **dev) {
2794 isc_task_t *task;
2795
2796 INSIST(dev != NULL && *dev != NULL);
2797
2798 task = (*dev)->ev_sender;
2799 (*dev)->ev_sender = sock;
2800
2801 if (ISC_LINK_LINKED(*dev, ev_link)) {
2802 ISC_LIST_DEQUEUE(sock->connect_list, *dev, ev_link);
2803 }
2804
2805 isc_task_sendtoanddetach(&task, (isc_event_t **)dev, sock->threadid);
2806 }
2807
2808 /*
2809 * Call accept() on a socket, to get the new file descriptor. The listen
2810 * socket is used as a prototype to create a new isc_socket_t. The new
2811 * socket has one outstanding reference. The task receiving the event
2812 * will be detached from just after the event is delivered.
2813 *
2814 * On entry to this function, the event delivered is the internal
2815 * readable event, and the first item on the accept_list should be
2816 * the done event we want to send. If the list is empty, this is a no-op,
2817 * so just unlock and return.
2818 */
2819 static void
2820 internal_accept(isc_socket_t *sock) {
2821 isc_socketmgr_t *manager;
2822 isc__socketthread_t *thread, *nthread;
2823 isc_socket_newconnev_t *dev;
2824 isc_task_t *task;
2825 socklen_t addrlen;
2826 int fd;
2827 isc_result_t result = ISC_R_SUCCESS;
2828 char strbuf[ISC_STRERRORSIZE];
2829 const char *err = "accept";
2830
2831 INSIST(VALID_SOCKET(sock));
2832 REQUIRE(sock->fd >= 0);
2833
2834 socket_log(sock, NULL, TRACE, "internal_accept called, locked socket");
2835
2836 manager = sock->manager;
2837 INSIST(VALID_MANAGER(manager));
2838 thread = &manager->threads[sock->threadid];
2839
2840 INSIST(sock->listener);
2841
2842 /*
2843 * Get the first item off the accept list.
2844 * If it is empty, unlock the socket and return.
2845 */
2846 dev = ISC_LIST_HEAD(sock->accept_list);
2847 if (dev == NULL) {
2848 unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
2849 UNLOCK(&sock->lock);
2850 return;
2851 }
2852
2853 /*
2854 * Try to accept the new connection. If the accept fails with
2855 * EAGAIN or EINTR, simply poke the watcher to watch this socket
2856 * again. Also ignore ECONNRESET, which has been reported to
2857 * be spuriously returned on Linux 2.2.19 although it is not
2858 * a documented error for accept(). ECONNABORTED has been
2859 * reported for Solaris 8. The rest are thrown in not because
2860 * we have seen them but because they are ignored by other
2861 * daemons such as BIND 8 and Apache.
2862 */
2863
2864 addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
2865 memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
2866 fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
2867 (void *)&addrlen);
2868
2869 #ifdef F_DUPFD
2870 /*
2871 	 * Keep low descriptors free so that stdio has room to work in.
2872 */
2873 if (fd >= 0 && fd < 20) {
2874 int newfd, tmp;
2875 newfd = fcntl(fd, F_DUPFD, 20);
2876 tmp = errno;
2877 (void)close(fd);
2878 errno = tmp;
2879 fd = newfd;
2880 err = "accept/fcntl";
2881 }
2882 #endif /* ifdef F_DUPFD */
2883
2884 if (fd < 0) {
2885 if (SOFT_ERROR(errno)) {
2886 goto soft_error;
2887 }
2888 switch (errno) {
2889 case ENFILE:
2890 case EMFILE:
2891 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2892 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2893 "%s: too many open file descriptors",
2894 err);
2895 goto soft_error;
2896
2897 case ENOBUFS:
2898 case ENOMEM:
2899 case ECONNRESET:
2900 case ECONNABORTED:
2901 case EHOSTUNREACH:
2902 case EHOSTDOWN:
2903 case ENETUNREACH:
2904 case ENETDOWN:
2905 case ECONNREFUSED:
2906 #ifdef EPROTO
2907 case EPROTO:
2908 #endif /* ifdef EPROTO */
2909 #ifdef ENONET
2910 case ENONET:
2911 #endif /* ifdef ENONET */
2912 goto soft_error;
2913 default:
2914 break;
2915 }
2916 strerror_r(errno, strbuf, sizeof(strbuf));
2917 UNEXPECTED_ERROR(__FILE__, __LINE__,
2918 "internal_accept: %s() failed: %s", err,
2919 strbuf);
2920 fd = -1;
2921 result = ISC_R_UNEXPECTED;
2922 } else {
2923 if (addrlen == 0U) {
2924 UNEXPECTED_ERROR(__FILE__, __LINE__,
2925 "internal_accept(): "
2926 "accept() failed to return "
2927 "remote address");
2928
2929 (void)close(fd);
2930 goto soft_error;
2931 } else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
2932 sock->pf)
2933 {
2934 UNEXPECTED_ERROR(
2935 __FILE__, __LINE__,
2936 "internal_accept(): "
2937 "accept() returned peer address "
2938 "family %u (expected %u)",
2939 NEWCONNSOCK(dev)->peer_address.type.sa.sa_family,
2940 sock->pf);
2941 (void)close(fd);
2942 goto soft_error;
2943 } else if (fd >= (int)manager->maxsocks) {
2944 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2945 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2946 "accept: file descriptor exceeds limit "
2947 "(%d/%u)",
2948 fd, manager->maxsocks);
2949 (void)close(fd);
2950 goto soft_error;
2951 }
2952 }
2953
2954 if (fd != -1) {
2955 NEWCONNSOCK(dev)->peer_address.length = addrlen;
2956 NEWCONNSOCK(dev)->pf = sock->pf;
2957 }
2958
2959 /*
2960 * Pull off the done event.
2961 */
2962 ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
2963
2964 /*
2965 * Poke watcher if there are more pending accepts.
2966 */
2967 if (ISC_LIST_EMPTY(sock->accept_list)) {
2968 unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
2969 }
2970
2971 if (fd != -1) {
2972 result = make_nonblock(fd);
2973 if (result != ISC_R_SUCCESS) {
2974 (void)close(fd);
2975 fd = -1;
2976 }
2977 }
2978
2979 /*
2980 * We need to unlock sock->lock now to be able to lock manager->lock
2981 * without risking a deadlock with xmlstats.
2982 */
2983 UNLOCK(&sock->lock);
2984
2985 /*
2986 	 * An fd of -1 means we failed to obtain a new socket.
2987 */
2988 if (fd != -1) {
2989 int lockid = FDLOCK_ID(fd);
2990
2991 NEWCONNSOCK(dev)->fd = fd;
2992 NEWCONNSOCK(dev)->threadid = gen_threadid(NEWCONNSOCK(dev));
2993 NEWCONNSOCK(dev)->bound = 1;
2994 NEWCONNSOCK(dev)->connected = 1;
2995 nthread = &manager->threads[NEWCONNSOCK(dev)->threadid];
2996
2997 /*
2998 * We already hold a lock on one fdlock in accepting thread,
2999 * we need to make sure that we don't double lock.
3000 */
3001 bool same_bucket = (sock->threadid ==
3002 NEWCONNSOCK(dev)->threadid) &&
3003 (FDLOCK_ID(sock->fd) == lockid);
3004
3005 /*
3006 * Use minimum mtu if possible.
3007 */
3008 use_min_mtu(NEWCONNSOCK(dev));
3009 set_tcp_maxseg(NEWCONNSOCK(dev), 1280 - 20 - 40);
3010
3011 /*
3012 * Ensure DSCP settings are inherited across accept.
3013 */
3014 setdscp(NEWCONNSOCK(dev), sock->dscp);
3015
3016 /*
3017 * Save away the remote address
3018 */
3019 dev->address = NEWCONNSOCK(dev)->peer_address;
3020
3021 if (NEWCONNSOCK(dev)->active == 0) {
3022 inc_stats(manager->stats,
3023 NEWCONNSOCK(dev)->statsindex[STATID_ACTIVE]);
3024 NEWCONNSOCK(dev)->active = 1;
3025 }
3026
3027 if (!same_bucket) {
3028 LOCK(&nthread->fdlock[lockid]);
3029 }
3030 nthread->fds[fd] = NEWCONNSOCK(dev);
3031 nthread->fdstate[fd] = MANAGED;
3032 #if defined(USE_EPOLL)
3033 nthread->epoll_events[fd] = 0;
3034 #endif /* if defined(USE_EPOLL) */
3035 if (!same_bucket) {
3036 UNLOCK(&nthread->fdlock[lockid]);
3037 }
3038
3039 LOCK(&manager->lock);
3040
3041 #ifdef USE_SELECT
3042 if (nthread->maxfd < fd) {
3043 nthread->maxfd = fd;
3044 }
3045 #endif /* ifdef USE_SELECT */
3046
3047 socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
3048 "accepted connection, new socket %p",
3049 dev->newsocket);
3050
3051 ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);
3052
3053 UNLOCK(&manager->lock);
3054
3055 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
3056 } else {
3057 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3058 isc_refcount_decrementz(&NEWCONNSOCK(dev)->references);
3059 free_socket((isc_socket_t **)&dev->newsocket);
3060 }
3061
3062 /*
3063 * Fill in the done event details and send it off.
3064 */
3065 dev->result = result;
3066 task = dev->ev_sender;
3067 dev->ev_sender = sock;
3068
3069 isc_task_sendtoanddetach(&task, ISC_EVENT_PTR(&dev), sock->threadid);
3070 return;
3071
3072 soft_error:
3073 watch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
3074 UNLOCK(&sock->lock);
3075
3076 inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3077 return;
3078 }
3079
3080 static void
3081 internal_recv(isc_socket_t *sock) {
3082 isc_socketevent_t *dev;
3083
3084 INSIST(VALID_SOCKET(sock));
3085 REQUIRE(sock->fd >= 0);
3086
3087 dev = ISC_LIST_HEAD(sock->recv_list);
3088 if (dev == NULL) {
3089 goto finish;
3090 }
3091
3092 socket_log(sock, NULL, IOEVENT, "internal_recv: event %p -> task %p",
3093 dev, dev->ev_sender);
3094
3095 /*
3096 * Try to do as much I/O as possible on this socket. There are no
3097 * limits here, currently.
3098 */
3099 while (dev != NULL) {
3100 switch (doio_recv(sock, dev)) {
3101 case DOIO_SOFT:
3102 goto finish;
3103
3104 case DOIO_EOF:
3105 /*
3106 			 * A read of 0 means the remote end was closed.
3107 * Run through the event queue and dispatch all
3108 * the events with an EOF result code.
3109 */
3110 do {
3111 dev->result = ISC_R_EOF;
3112 send_recvdone_event(sock, &dev);
3113 dev = ISC_LIST_HEAD(sock->recv_list);
3114 } while (dev != NULL);
3115 goto finish;
3116
3117 case DOIO_SUCCESS:
3118 case DOIO_HARD:
3119 send_recvdone_event(sock, &dev);
3120 break;
3121 }
3122
3123 dev = ISC_LIST_HEAD(sock->recv_list);
3124 }
3125
3126 finish:
3127 if (ISC_LIST_EMPTY(sock->recv_list)) {
3128 unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
3129 SELECT_POKE_READ);
3130 }
3131 }
3132
3133 static void
3134 internal_send(isc_socket_t *sock) {
3135 isc_socketevent_t *dev;
3136
3137 INSIST(VALID_SOCKET(sock));
3138 REQUIRE(sock->fd >= 0);
3139
3140 dev = ISC_LIST_HEAD(sock->send_list);
3141 if (dev == NULL) {
3142 goto finish;
3143 }
3144 	socket_log(sock, NULL, IOEVENT, "internal_send: event %p -> task %p", dev,
3145 dev->ev_sender);
3146
3147 /*
3148 * Try to do as much I/O as possible on this socket. There are no
3149 * limits here, currently.
3150 */
3151 while (dev != NULL) {
3152 switch (doio_send(sock, dev)) {
3153 case DOIO_SOFT:
3154 goto finish;
3155
3156 case DOIO_HARD:
3157 case DOIO_SUCCESS:
3158 send_senddone_event(sock, &dev);
3159 break;
3160 }
3161
3162 dev = ISC_LIST_HEAD(sock->send_list);
3163 }
3164
3165 finish:
3166 if (ISC_LIST_EMPTY(sock->send_list)) {
3167 unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
3168 SELECT_POKE_WRITE);
3169 }
3170 }
3171
3172 static void
3173 internal_fdwatch_write(isc_socket_t *sock)
3174 {
3175 int more_data;
3176
3177 INSIST(VALID_SOCKET(sock));
3178
3179 isc_refcount_increment(&sock->references);
3180 UNLOCK(&sock->lock);
3181
3182 more_data = (sock->fdwatchcb)(sock->fdwatchtask, (isc_socket_t *)sock,
3183 sock->fdwatcharg, ISC_SOCKFDWATCH_WRITE);
3184
3185 LOCK(&sock->lock);
3186
3187 	if (isc_refcount_decrement(&sock->references) == 1) {
3188 UNLOCK(&sock->lock);
3189 destroy(&sock);
3190 return;
3191 }
3192
3193 if (more_data)
3194 select_poke(sock->manager, sock->threadid, sock->fd,
3195 SELECT_POKE_WRITE);
3196 }
3197
3198 static void
3199 internal_fdwatch_read(isc_socket_t *sock)
3200 {
3201 int more_data;
3202
3203 INSIST(VALID_SOCKET(sock));
3204
3205 isc_refcount_increment(&sock->references);
3206 UNLOCK(&sock->lock);
3207
3208 more_data = (sock->fdwatchcb)(sock->fdwatchtask, (isc_socket_t *)sock,
3209 sock->fdwatcharg, ISC_SOCKFDWATCH_READ);
3210
3211 LOCK(&sock->lock);
3212
3213 	if (isc_refcount_decrement(&sock->references) == 1) {
3214 UNLOCK(&sock->lock);
3215 destroy(&sock);
3216 return;
3217 }
3218
3219 if (more_data)
3220 select_poke(sock->manager, sock->threadid, sock->fd,
3221 SELECT_POKE_READ);
3222 }
3223
3224 /*
3225 * Process read/writes on each fd here. Avoid locking
3226 * and unlocking twice if both reads and writes are possible.
3227 */
3228 static void
3229 process_fd(isc__socketthread_t *thread, int fd, bool readable, bool writeable) {
3230 isc_socket_t *sock;
3231 int lockid = FDLOCK_ID(fd);
3232
3233 /*
3234 * If the socket is going to be closed, don't do more I/O.
3235 */
3236 LOCK(&thread->fdlock[lockid]);
3237 if (thread->fdstate[fd] == CLOSE_PENDING) {
3238 UNLOCK(&thread->fdlock[lockid]);
3239
3240 (void)unwatch_fd(thread, fd, SELECT_POKE_READ);
3241 (void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
3242 return;
3243 }
3244
3245 sock = thread->fds[fd];
3246 if (sock == NULL) {
3247 UNLOCK(&thread->fdlock[lockid]);
3248 return;
3249 }
3250
3251 LOCK(&sock->lock);
3252
3253 if (sock->fd < 0) {
3254 /*
3255 * Sock is being closed - the final external reference
3256 * is gone but it was not yet removed from event loop
3257 * and fdstate[]/fds[] as destroy() is waiting on
3258 * thread->fdlock[lockid] or sock->lock that we're holding.
3259 * Just release the locks and bail.
3260 */
3261 UNLOCK(&sock->lock);
3262 UNLOCK(&thread->fdlock[lockid]);
3263 return;
3264 }
3265
3266 REQUIRE(readable || writeable);
3267 if (writeable) {
3268 if (sock->connecting) {
3269 internal_connect(sock);
3270 } else {
3271 dispatch_send(sock);
3272 }
3273 }
3274
3275 if (readable) {
3276 if (sock->listener) {
3277 internal_accept(sock); /* unlocks sock */
3278 } else {
3279 dispatch_recv(sock);
3280 UNLOCK(&sock->lock);
3281 }
3282 } else {
3283 UNLOCK(&sock->lock);
3284 }
3285
3286 UNLOCK(&thread->fdlock[lockid]);
3287
3288 /*
3289 * Socket destruction might be pending, it will resume
3290 * after releasing fdlock and sock->lock.
3291 */
3292 }
3293
3294 /*
3295  * process_fds() differs per event loop: it takes the events reported
3296  * by the event loop and, for each affected FD, calls process_fd();
3297  * most variants also report whether a shutdown request was seen.
3298 */
3299 #ifdef USE_KQUEUE
3300 static bool
3301 process_fds(isc__socketthread_t *thread, struct kevent *events, int nevents) {
3302 int i;
3303 bool readable, writable;
3304 bool done = false;
3305 bool have_ctlevent = false;
3306 if (nevents == thread->nevents) {
3307 /*
3308 * This is not an error, but something unexpected. If this
3309 * happens, it may indicate the need for increasing
3310 * ISC_SOCKET_MAXEVENTS.
3311 */
3312 thread_log(thread, ISC_LOGCATEGORY_GENERAL,
3313 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3314 "maximum number of FD events (%d) received",
3315 nevents);
3316 }
3317
3318 for (i = 0; i < nevents; i++) {
3319 REQUIRE(events[i].ident < thread->manager->maxsocks);
3320 if (events[i].ident == (uintptr_t)thread->pipe_fds[0]) {
3321 have_ctlevent = true;
3322 continue;
3323 }
3324 readable = (events[i].filter == EVFILT_READ);
3325 writable = (events[i].filter == EVFILT_WRITE);
3326 process_fd(thread, events[i].ident, readable, writable);
3327 }
3328
3329 if (have_ctlevent) {
3330 done = process_ctlfd(thread);
3331 }
3332
3333 return (done);
3334 }
3335 #elif defined(USE_EPOLL)
3336 static bool
3337 process_fds(isc__socketthread_t *thread, struct epoll_event *events,
3338 int nevents) {
3339 int i;
3340 bool done = false;
3341 bool have_ctlevent = false;
3342
3343 if (nevents == thread->nevents) {
3344 thread_log(thread, ISC_LOGCATEGORY_GENERAL,
3345 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3346 "maximum number of FD events (%d) received",
3347 nevents);
3348 }
3349
3350 for (i = 0; i < nevents; i++) {
3351 REQUIRE(events[i].data.fd < (int)thread->manager->maxsocks);
3352 if (events[i].data.fd == thread->pipe_fds[0]) {
3353 have_ctlevent = true;
3354 continue;
3355 }
3356 if ((events[i].events & EPOLLERR) != 0 ||
3357 (events[i].events & EPOLLHUP) != 0)
3358 {
3359 /*
3360 * epoll does not set IN/OUT bits on an erroneous
3361 * condition, so we need to try both anyway. This is a
3362 * bit inefficient, but should be okay for such rare
3363 * events. Note also that the read or write attempt
3364 * won't block because we use non-blocking sockets.
3365 */
3366 int fd = events[i].data.fd;
3367 events[i].events |= thread->epoll_events[fd];
3368 }
3369 process_fd(thread, events[i].data.fd,
3370 (events[i].events & EPOLLIN) != 0,
3371 (events[i].events & EPOLLOUT) != 0);
3372 }
3373
3374 if (have_ctlevent) {
3375 done = process_ctlfd(thread);
3376 }
3377
3378 return (done);
3379 }
3380 #elif defined(USE_DEVPOLL)
3381 static bool
3382 process_fds(isc__socketthread_t *thread, struct pollfd *events, int nevents) {
3383 int i;
3384 bool done = false;
3385 bool have_ctlevent = false;
3386
3387 if (nevents == thread->nevents) {
3388 thread_log(thread, ISC_LOGCATEGORY_GENERAL,
3389 ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3390 "maximum number of FD events (%d) received",
3391 nevents);
3392 }
3393
3394 for (i = 0; i < nevents; i++) {
3395 REQUIRE(events[i].fd < (int)thread->manager->maxsocks);
3396 if (events[i].fd == thread->pipe_fds[0]) {
3397 have_ctlevent = true;
3398 continue;
3399 }
3400 process_fd(thread, events[i].fd,
3401 (events[i].events & POLLIN) != 0,
3402 (events[i].events & POLLOUT) != 0);
3403 }
3404
3405 if (have_ctlevent) {
3406 done = process_ctlfd(thread);
3407 }
3408
3409 return (done);
3410 }
3411 #elif defined(USE_SELECT)
3412 static void
3413 process_fds(isc__socketthread_t *thread, int maxfd, fd_set *readfds,
3414 fd_set *writefds) {
3415 int i;
3416
3417 REQUIRE(maxfd <= (int)thread->manager->maxsocks);
3418
3419 for (i = 0; i < maxfd; i++) {
3420 if (i == thread->pipe_fds[0] || i == thread->pipe_fds[1]) {
3421 continue;
3422 }
3423 process_fd(thread, i, FD_ISSET(i, readfds),
3424 FD_ISSET(i, writefds));
3425 }
3426 }
3427 #endif /* ifdef USE_KQUEUE */
3428
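/*
 * Drain the watcher's control pipe.  Each message is an (fd, msg) pair
 * written by select_poke(): SELECT_POKE_SHUTDOWN terminates the watcher
 * loop, SELECT_POKE_NOTHING means the pipe is empty, and anything else
 * updates the watch state for 'fd' via wakeup_socket().
 */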
3429 static bool
3430 process_ctlfd(isc__socketthread_t *thread) {
3431 int msg, fd;
3432
3433 for (;;) {
3434 select_readmsg(thread, &fd, &msg);
3435
3436 thread_log(thread, IOEVENT,
3437 "watcher got message %d for socket %d", msg, fd);
3438
3439 /*
3440 * Nothing to read?
3441 */
3442 if (msg == SELECT_POKE_NOTHING) {
3443 break;
3444 }
3445
3446 /*
3447 * Handle shutdown message. We really should
3448 * jump out of this loop right away, but
3449 * it doesn't matter if we have to do a little
3450 * more work first.
3451 */
3452 if (msg == SELECT_POKE_SHUTDOWN) {
3453 return (true);
3454 }
3455
3456 /*
3457 * This is a wakeup on a socket. Look
3458 * at the event queue for both read and write,
3459 * and decide if we need to watch on it now
3460 * or not.
3461 */
3462 wakeup_socket(thread, fd, msg);
3463 }
3464
3465 return (false);
3466 }
3467
3468 /*
3469 * This is the thread that will loop forever, always in a select or poll
3470 * call.
3471 *
3472 * When select returns something to do, do whatever's necessary and post
3473 * an event to the task that was requesting the action.
3474 */
3475 static isc_threadresult_t
3476 netthread(void *uap) {
3477 isc__socketthread_t *thread = uap;
3478 isc_socketmgr_t *manager = thread->manager;
3479 (void)manager;
3480 bool done;
3481 int cc;
3482 #ifdef USE_KQUEUE
3483 const char *fnname = "kevent()";
3484 #elif defined(USE_EPOLL)
3485 const char *fnname = "epoll_wait()";
3486 #elif defined(USE_DEVPOLL)
3487 isc_result_t result;
3488 const char *fnname = "ioctl(DP_POLL)";
3489 struct dvpoll dvp;
3490 int pass;
3491 #if defined(ISC_SOCKET_USE_POLLWATCH)
3492 pollstate_t pollstate = poll_idle;
3493 #endif /* if defined(ISC_SOCKET_USE_POLLWATCH) */
3494 #elif defined(USE_SELECT)
3495 const char *fnname = "select()";
3496 int maxfd;
3497 int ctlfd;
3498 #endif /* ifdef USE_KQUEUE */
3499 char strbuf[ISC_STRERRORSIZE];
3500
3501 #if defined(USE_SELECT)
3502 /*
3503 * Get the control fd here. This will never change.
3504 */
3505 ctlfd = thread->pipe_fds[0];
3506 #endif /* if defined(USE_SELECT) */
3507 done = false;
3508 while (!done) {
3509 do {
3510 #ifdef USE_KQUEUE
3511 cc = kevent(thread->kqueue_fd, NULL, 0, thread->events,
3512 thread->nevents, NULL);
3513 #elif defined(USE_EPOLL)
3514 cc = epoll_wait(thread->epoll_fd, thread->events,
3515 thread->nevents, -1);
3516 #elif defined(USE_DEVPOLL)
3517 /*
3518 * Re-probe every thousand calls.
3519 */
3520 if (thread->calls++ > 1000U) {
3521 result = isc_resource_getcurlimit(
3522 isc_resource_openfiles,
3523 &thread->open_max);
3524 if (result != ISC_R_SUCCESS) {
3525 thread->open_max = 64;
3526 }
3527 thread->calls = 0;
3528 }
3529 for (pass = 0; pass < 2; pass++) {
3530 dvp.dp_fds = thread->events;
3531 dvp.dp_nfds = thread->nevents;
3532 if (dvp.dp_nfds >= thread->open_max) {
3533 dvp.dp_nfds = thread->open_max - 1;
3534 }
3535 #ifndef ISC_SOCKET_USE_POLLWATCH
3536 dvp.dp_timeout = -1;
3537 #else /* ifndef ISC_SOCKET_USE_POLLWATCH */
3538 if (pollstate == poll_idle) {
3539 dvp.dp_timeout = -1;
3540 } else {
3541 dvp.dp_timeout =
3542 ISC_SOCKET_POLLWATCH_TIMEOUT;
3543 }
3544 #endif /* ISC_SOCKET_USE_POLLWATCH */
3545 cc = ioctl(thread->devpoll_fd, DP_POLL, &dvp);
3546 if (cc == -1 && errno == EINVAL) {
3547 /*
3548 * {OPEN_MAX} may have dropped. Look
3549 * up the current value and try again.
3550 */
3551 result = isc_resource_getcurlimit(
3552 isc_resource_openfiles,
3553 &thread->open_max);
3554 if (result != ISC_R_SUCCESS) {
3555 thread->open_max = 64;
3556 }
3557 } else {
3558 break;
3559 }
3560 }
3561 #elif defined(USE_SELECT)
3562 /*
3563 			 * With select() there is only ever one thread, so
3564 			 * it is safe to just take the manager lock here.
3565 */
3566 LOCK(&manager->lock);
3567 memmove(thread->read_fds_copy, thread->read_fds,
3568 thread->fd_bufsize);
3569 memmove(thread->write_fds_copy, thread->write_fds,
3570 thread->fd_bufsize);
3571 maxfd = thread->maxfd + 1;
3572 UNLOCK(&manager->lock);
3573
3574 cc = select(maxfd, thread->read_fds_copy,
3575 thread->write_fds_copy, NULL, NULL);
3576 #endif /* USE_KQUEUE */
3577
3578 if (cc < 0 && !SOFT_ERROR(errno)) {
3579 strerror_r(errno, strbuf, sizeof(strbuf));
3580 FATAL_ERROR(__FILE__, __LINE__, "%s failed: %s",
3581 fnname, strbuf);
3582 }
3583
3584 #if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
3585 if (cc == 0) {
3586 if (pollstate == poll_active) {
3587 pollstate = poll_checking;
3588 } else if (pollstate == poll_checking) {
3589 pollstate = poll_idle;
3590 }
3591 } else if (cc > 0) {
3592 if (pollstate == poll_checking) {
3593 /*
3594 * XXX: We'd like to use a more
3595 * verbose log level as it's actually an
3596 * unexpected event, but the kernel bug
3597 * reportedly happens pretty frequently
3598 * (and it can also be a false positive)
3599 * so it would be just too noisy.
3600 */
3601 thread_log(thread,
3602 ISC_LOGCATEGORY_GENERAL,
3603 ISC_LOGMODULE_SOCKET,
3604 ISC_LOG_DEBUG(1),
3605 "unexpected POLL timeout");
3606 }
3607 pollstate = poll_active;
3608 }
3609 #endif /* if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) */
3610 } while (cc < 0);
3611
3612 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
3613 done = process_fds(thread, thread->events, cc);
3614 #elif defined(USE_SELECT)
3615 process_fds(thread, maxfd, thread->read_fds_copy,
3616 thread->write_fds_copy);
3617
3618 /*
3619 * Process reads on internal, control fd.
3620 */
3621 if (FD_ISSET(ctlfd, thread->read_fds_copy)) {
3622 done = process_ctlfd(thread);
3623 }
3624 #endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
3625 * */
3626 }
3627
3628 thread_log(thread, TRACE, "watcher exiting");
3629 return ((isc_threadresult_t)0);
3630 }
3631
3632 void
3633 isc_socketmgr_setreserved(isc_socketmgr_t *manager, uint32_t reserved) {
3634 REQUIRE(VALID_MANAGER(manager));
3635
3636 manager->reserved = reserved;
3637 }
3638
3639 void
3640 isc_socketmgr_maxudp(isc_socketmgr_t *manager, unsigned int maxudp) {
3641 REQUIRE(VALID_MANAGER(manager));
3642
3643 manager->maxudp = maxudp;
3644 }
3645
3646 /*
3647  * Set up a socket thread; thread->manager and thread->threadid must be filled in first.
3648 */
3649
3650 static isc_result_t
3651 setup_thread(isc__socketthread_t *thread) {
3652 isc_result_t result = ISC_R_SUCCESS;
3653 int i;
3654 char strbuf[ISC_STRERRORSIZE];
3655
3656 REQUIRE(thread != NULL);
3657 REQUIRE(VALID_MANAGER(thread->manager));
3658 REQUIRE(thread->threadid >= 0 &&
3659 thread->threadid < thread->manager->nthreads);
3660
3661 thread->fds =
3662 isc_mem_get(thread->manager->mctx,
3663 thread->manager->maxsocks * sizeof(isc_socket_t *));
3664
3665 memset(thread->fds, 0,
3666 thread->manager->maxsocks * sizeof(isc_socket_t *));
3667
3668 thread->fdstate = isc_mem_get(thread->manager->mctx,
3669 thread->manager->maxsocks * sizeof(int));
3670
3671 memset(thread->fdstate, 0, thread->manager->maxsocks * sizeof(int));
3672
3673 thread->fdlock = isc_mem_get(thread->manager->mctx,
3674 FDLOCK_COUNT * sizeof(isc_mutex_t));
3675
3676 for (i = 0; i < FDLOCK_COUNT; i++) {
3677 isc_mutex_init(&thread->fdlock[i]);
3678 }
3679
3680 if (pipe(thread->pipe_fds) != 0) {
3681 strerror_r(errno, strbuf, sizeof(strbuf));
3682 UNEXPECTED_ERROR(__FILE__, __LINE__, "pipe() failed: %s",
3683 strbuf);
3684 return (ISC_R_UNEXPECTED);
3685 }
3686 RUNTIME_CHECK(make_nonblock(thread->pipe_fds[0]) == ISC_R_SUCCESS);
3687
3688 #ifdef USE_KQUEUE
3689 thread->nevents = ISC_SOCKET_MAXEVENTS;
3690 thread->events = isc_mem_get(thread->manager->mctx,
3691 sizeof(struct kevent) * thread->nevents);
3692
3693 thread->kqueue_fd = kqueue();
3694 if (thread->kqueue_fd == -1) {
3695 result = isc__errno2result(errno);
3696 strerror_r(errno, strbuf, sizeof(strbuf));
3697 UNEXPECTED_ERROR(__FILE__, __LINE__, "kqueue failed: %s",
3698 strbuf);
3699 isc_mem_put(thread->manager->mctx, thread->events,
3700 sizeof(struct kevent) * thread->nevents);
3701 return (result);
3702 }
3703
3704 result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3705 if (result != ISC_R_SUCCESS) {
3706 close(thread->kqueue_fd);
3707 isc_mem_put(thread->manager->mctx, thread->events,
3708 sizeof(struct kevent) * thread->nevents);
3709 }
3710 return (result);
3711
3712 #elif defined(USE_EPOLL)
3713 thread->nevents = ISC_SOCKET_MAXEVENTS;
3714 thread->epoll_events =
3715 isc_mem_get(thread->manager->mctx,
3716 (thread->manager->maxsocks * sizeof(uint32_t)));
3717
3718 memset(thread->epoll_events, 0,
3719 thread->manager->maxsocks * sizeof(uint32_t));
3720
3721 thread->events =
3722 isc_mem_get(thread->manager->mctx,
3723 sizeof(struct epoll_event) * thread->nevents);
3724
3725 thread->epoll_fd = epoll_create(thread->nevents);
3726 if (thread->epoll_fd == -1) {
3727 result = isc__errno2result(errno);
3728 strerror_r(errno, strbuf, sizeof(strbuf));
3729 UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_create failed: %s",
3730 strbuf);
3731 return (result);
3732 }
3733
3734 result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3735 return (result);
3736
3737 #elif defined(USE_DEVPOLL)
3738 thread->nevents = ISC_SOCKET_MAXEVENTS;
3739 result = isc_resource_getcurlimit(isc_resource_openfiles,
3740 &thread->open_max);
3741 if (result != ISC_R_SUCCESS) {
3742 thread->open_max = 64;
3743 }
3744 thread->calls = 0;
3745 thread->events = isc_mem_get(thread->manager->mctx,
3746 sizeof(struct pollfd) * thread->nevents);
3747
3748 /*
3749 * Note: fdpollinfo should be able to support all possible FDs, so
3750 * it must have maxsocks entries (not nevents).
3751 */
3752 thread->fdpollinfo =
3753 isc_mem_get(thread->manager->mctx,
3754 sizeof(pollinfo_t) * thread->manager->maxsocks);
3755 memset(thread->fdpollinfo, 0,
3756 sizeof(pollinfo_t) * thread->manager->maxsocks);
3757 thread->devpoll_fd = open("/dev/poll", O_RDWR);
3758 if (thread->devpoll_fd == -1) {
3759 result = isc__errno2result(errno);
3760 strerror_r(errno, strbuf, sizeof(strbuf));
3761 UNEXPECTED_ERROR(__FILE__, __LINE__,
3762 "open(/dev/poll) failed: %s", strbuf);
3763 isc_mem_put(thread->manager->mctx, thread->events,
3764 sizeof(struct pollfd) * thread->nevents);
3765 isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3766 sizeof(pollinfo_t) * thread->manager->maxsocks);
3767 return (result);
3768 }
3769 result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3770 if (result != ISC_R_SUCCESS) {
3771 close(thread->devpoll_fd);
3772 isc_mem_put(thread->manager->mctx, thread->events,
3773 sizeof(struct pollfd) * thread->nevents);
3774 isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3775 sizeof(pollinfo_t) * thread->manager->maxsocks);
3776 return (result);
3777 }
3778
3779 return (ISC_R_SUCCESS);
3780 #elif defined(USE_SELECT)
3781 UNUSED(result);
3782
3783 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
3784 /*
3785 * Note: this code should also cover the case of MAXSOCKETS <=
3786 * FD_SETSIZE, but we separate the cases to avoid possible portability
3787 * issues regarding howmany() and the actual representation of fd_set.
3788 */
3789 	thread->fd_bufsize = howmany(thread->manager->maxsocks, NFDBITS) *
3790 sizeof(fd_mask);
3791 #else /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3792 thread->fd_bufsize = sizeof(fd_set);
3793 #endif /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3794
3795 thread->read_fds = isc_mem_get(thread->manager->mctx,
3796 thread->fd_bufsize);
3797 thread->read_fds_copy = isc_mem_get(thread->manager->mctx,
3798 thread->fd_bufsize);
3799 thread->write_fds = isc_mem_get(thread->manager->mctx,
3800 thread->fd_bufsize);
3801 thread->write_fds_copy = isc_mem_get(thread->manager->mctx,
3802 thread->fd_bufsize);
3803 memset(thread->read_fds, 0, thread->fd_bufsize);
3804 memset(thread->write_fds, 0, thread->fd_bufsize);
3805
3806 (void)watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3807 thread->maxfd = thread->pipe_fds[0];
3808
3809 return (ISC_R_SUCCESS);
3810 #endif /* USE_KQUEUE */
3811 }
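
/*
 * The pipe created above implements the classic self-pipe wakeup
 * trick: each watcher thread monitors the read end of its own pipe
 * alongside the real sockets, and other threads interrupt a blocked
 * kevent()/epoll_wait()/poll()/select() call by writing a small
 * control message to the write end.  A minimal stand-alone sketch of
 * the pattern (not this file's actual select_poke() message
 * encoding):
 *
 *	int fds[2];
 *	RUNTIME_CHECK(pipe(fds) == 0);
 *	RUNTIME_CHECK(make_nonblock(fds[0]) == ISC_R_SUCCESS);
 *	...watcher thread: add fds[0] to the event set; when it becomes
 *	   readable, drain it and act on the control bytes...
 *	...any other thread: write(fds[1], &msg, sizeof(msg)) to wake
 *	   the watcher...
 */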
3812
3813 static void
3814 cleanup_thread(isc_mem_t *mctx, isc__socketthread_t *thread) {
3815 isc_result_t result;
3816 int i;
3817
3818 result = unwatch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3819 if (result != ISC_R_SUCCESS) {
3820 UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL) failed");
3821 }
3822 #ifdef USE_KQUEUE
3823 close(thread->kqueue_fd);
3824 isc_mem_put(mctx, thread->events,
3825 sizeof(struct kevent) * thread->nevents);
3826 #elif defined(USE_EPOLL)
3827 close(thread->epoll_fd);
3828
3829 isc_mem_put(mctx, thread->events,
3830 sizeof(struct epoll_event) * thread->nevents);
3831 #elif defined(USE_DEVPOLL)
3832 close(thread->devpoll_fd);
3833 isc_mem_put(mctx, thread->events,
3834 sizeof(struct pollfd) * thread->nevents);
3835 isc_mem_put(mctx, thread->fdpollinfo,
3836 sizeof(pollinfo_t) * thread->manager->maxsocks);
3837 #elif defined(USE_SELECT)
3838 if (thread->read_fds != NULL) {
3839 isc_mem_put(mctx, thread->read_fds, thread->fd_bufsize);
3840 }
3841 if (thread->read_fds_copy != NULL) {
3842 isc_mem_put(mctx, thread->read_fds_copy, thread->fd_bufsize);
3843 }
3844 if (thread->write_fds != NULL) {
3845 isc_mem_put(mctx, thread->write_fds, thread->fd_bufsize);
3846 }
3847 if (thread->write_fds_copy != NULL) {
3848 isc_mem_put(mctx, thread->write_fds_copy, thread->fd_bufsize);
3849 }
3850 #endif /* USE_KQUEUE */
3851 for (i = 0; i < (int)thread->manager->maxsocks; i++) {
3852 if (thread->fdstate[i] == CLOSE_PENDING) {
3853 /* no need to lock */
3854 (void)close(i);
3855 }
3856 }
3857
3858 #if defined(USE_EPOLL)
3859 isc_mem_put(thread->manager->mctx, thread->epoll_events,
3860 thread->manager->maxsocks * sizeof(uint32_t));
3861 #endif /* if defined(USE_EPOLL) */
3862 isc_mem_put(thread->manager->mctx, thread->fds,
3863 thread->manager->maxsocks * sizeof(isc_socket_t *));
3864 isc_mem_put(thread->manager->mctx, thread->fdstate,
3865 thread->manager->maxsocks * sizeof(int));
3866
3867 for (i = 0; i < FDLOCK_COUNT; i++) {
3868 isc_mutex_destroy(&thread->fdlock[i]);
3869 }
3870 isc_mem_put(thread->manager->mctx, thread->fdlock,
3871 FDLOCK_COUNT * sizeof(isc_mutex_t));
3872 }
3873
3874 isc_result_t
3875 isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
3876 return (isc_socketmgr_create2(mctx, managerp, 0, 1));
3877 }
3878
3879 isc_result_t
3880 isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
3881 unsigned int maxsocks, int nthreads) {
3882 int i;
3883 isc_socketmgr_t *manager;
3884
3885 REQUIRE(managerp != NULL && *managerp == NULL);
3886
3887 if (maxsocks == 0) {
3888 maxsocks = ISC_SOCKET_MAXSOCKETS;
3889 }
3890
3891 manager = isc_mem_get(mctx, sizeof(*manager));
3892
3893 /* zero-clear so that necessary cleanup on failure will be easy */
3894 memset(manager, 0, sizeof(*manager));
3895 manager->maxsocks = maxsocks;
3896 manager->reserved = 0;
3897 manager->maxudp = 0;
3898 manager->nthreads = nthreads;
3899 manager->stats = NULL;
3900
3901 manager->magic = SOCKET_MANAGER_MAGIC;
3902 manager->mctx = NULL;
3903 ISC_LIST_INIT(manager->socklist);
3904 isc_mutex_init(&manager->lock);
3905 isc_condition_init(&manager->shutdown_ok);
3906
3907 /*
3908 * Start up the select/poll thread.
3909 */
3910 manager->threads = isc_mem_get(mctx, sizeof(isc__socketthread_t) *
3911 manager->nthreads);
3912 isc_mem_attach(mctx, &manager->mctx);
3913
3914 for (i = 0; i < manager->nthreads; i++) {
3915 manager->threads[i].manager = manager;
3916 manager->threads[i].threadid = i;
3917 setup_thread(&manager->threads[i]);
3918 isc_thread_create(netthread, &manager->threads[i],
3919 &manager->threads[i].thread);
3920 char tname[1024];
3921 sprintf(tname, "sock-%d", i);
3922 isc_thread_setname(manager->threads[i].thread, tname);
3923 }
3924
3925 *managerp = manager;
3926
3927 return (ISC_R_SUCCESS);
3928 }
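
/*
 * A minimal usage sketch for the manager lifecycle, assuming an
 * existing memory context 'mctx':
 *
 *	isc_socketmgr_t *socketmgr = NULL;
 *	isc_result_t result;
 *
 *	result = isc_socketmgr_create(mctx, &socketmgr);
 *	RUNTIME_CHECK(result == ISC_R_SUCCESS);
 *	...create and use sockets...
 *	isc_socketmgr_destroy(&socketmgr);
 *
 * Note that isc_socketmgr_destroy() blocks until every socket on the
 * manager's list has been destroyed.
 */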
3929
3930 isc_result_t
3931 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
3932 REQUIRE(VALID_MANAGER(manager));
3933 REQUIRE(nsockp != NULL);
3934
3935 *nsockp = manager->maxsocks;
3936
3937 return (ISC_R_SUCCESS);
3938 }
3939
3940 void
3941 isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
3942 REQUIRE(VALID_MANAGER(manager));
3943 REQUIRE(ISC_LIST_EMPTY(manager->socklist));
3944 REQUIRE(manager->stats == NULL);
3945 REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
3946
3947 isc_stats_attach(stats, &manager->stats);
3948 }
3949
3950 void
3951 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
3952 isc_socketmgr_t *manager;
3953
3954 /*
3955 * Destroy a socket manager.
3956 */
3957
3958 REQUIRE(managerp != NULL);
3959 manager = *managerp;
3960 REQUIRE(VALID_MANAGER(manager));
3961
3962 LOCK(&manager->lock);
3963
3964 /*
3965 * Wait for all sockets to be destroyed.
3966 */
3967 while (!ISC_LIST_EMPTY(manager->socklist)) {
3968 manager_log(manager, CREATION, "sockets exist");
3969 WAIT(&manager->shutdown_ok, &manager->lock);
3970 }
3971
3972 UNLOCK(&manager->lock);
3973
3974 /*
3975 * Here, poke our select/poll thread. Do this by closing the write
3976 * half of the pipe, which will send EOF to the read half.
3977 * This is currently a no-op in the non-threaded case.
3978 */
3979 for (int i = 0; i < manager->nthreads; i++) {
3980 select_poke(manager, i, 0, SELECT_POKE_SHUTDOWN);
3981 }
3982
3983 /*
3984 * Wait for thread to exit.
3985 */
3986 for (int i = 0; i < manager->nthreads; i++) {
3987 isc_thread_join(manager->threads[i].thread, NULL);
3988 cleanup_thread(manager->mctx, &manager->threads[i]);
3989 }
3990 /*
3991 * Clean up.
3992 */
3993 isc_mem_put(manager->mctx, manager->threads,
3994 sizeof(isc__socketthread_t) * manager->nthreads);
3995 (void)isc_condition_destroy(&manager->shutdown_ok);
3996
3997 if (manager->stats != NULL) {
3998 isc_stats_detach(&manager->stats);
3999 }
4000 isc_mutex_destroy(&manager->lock);
4001 manager->magic = 0;
4002 isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));
4003
4004 *managerp = NULL;
4005 }
4006
4007 static isc_result_t
4008 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4009 unsigned int flags) {
4010 int io_state;
4011 bool have_lock = false;
4012 isc_task_t *ntask = NULL;
4013 isc_result_t result = ISC_R_SUCCESS;
4014
4015 dev->ev_sender = task;
4016
4017 if (sock->type == isc_sockettype_udp) {
4018 io_state = doio_recv(sock, dev);
4019 } else {
4020 LOCK(&sock->lock);
4021 have_lock = true;
4022
4023 if (ISC_LIST_EMPTY(sock->recv_list)) {
4024 io_state = doio_recv(sock, dev);
4025 } else {
4026 io_state = DOIO_SOFT;
4027 }
4028 }
4029
4030 switch (io_state) {
4031 case DOIO_SOFT:
4032 /*
4033 * We couldn't read all or part of the request right now, so
4034 * queue it.
4035 *
4036 * Attach to socket and to task
4037 */
4038 isc_task_attach(task, &ntask);
4039 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4040
4041 if (!have_lock) {
4042 LOCK(&sock->lock);
4043 have_lock = true;
4044 }
4045
4046 /*
4047 * Enqueue the request. If the socket was previously not being
4048 * watched, poke the watcher to start paying attention to it.
4049 */
4050 bool do_poke = ISC_LIST_EMPTY(sock->recv_list);
4051 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
4052 if (do_poke) {
4053 select_poke(sock->manager, sock->threadid, sock->fd,
4054 SELECT_POKE_READ);
4055 }
4056
4057 socket_log(sock, NULL, EVENT,
4058 "socket_recv: event %p -> task %p", dev, ntask);
4059
4060 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
4061 result = ISC_R_INPROGRESS;
4062 }
4063 break;
4064
4065 case DOIO_EOF:
4066 dev->result = ISC_R_EOF;
4067 FALLTHROUGH;
4068
4069 case DOIO_HARD:
4070 case DOIO_SUCCESS:
4071 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
4072 send_recvdone_event(sock, &dev);
4073 }
4074 break;
4075 }
4076
4077 if (have_lock) {
4078 UNLOCK(&sock->lock);
4079 }
4080
4081 return (result);
4082 }
4083
4084 isc_result_t
4085 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
4086 isc_task_t *task, isc_taskaction_t action, void *arg) {
4087 isc_socketevent_t *dev;
4088 isc_socketmgr_t *manager;
4089
4090 REQUIRE(VALID_SOCKET(sock));
4091 REQUIRE(action != NULL);
4092
4093 manager = sock->manager;
4094 REQUIRE(VALID_MANAGER(manager));
4095
4096 INSIST(sock->bound);
4097
4098 dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_RECVDONE,
4099 action, arg);
4100 if (dev == NULL) {
4101 return (ISC_R_NOMEMORY);
4102 }
4103
4104 return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
4105 }
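
/*
 * A hedged usage sketch for the completion-event flavor of receive;
 * 'recv_done' and 'buf' are hypothetical.  The handler receives the
 * isc_socketevent_t allocated above, with the status and byte count
 * in sev->result and sev->n:
 *
 *	static void
 *	recv_done(isc_task_t *task, isc_event_t *event) {
 *		isc_socketevent_t *sev = (isc_socketevent_t *)event;
 *		if (sev->result == ISC_R_SUCCESS)
 *			...process sev->n bytes at sev->region.base...
 *		isc_event_free(&event);
 *	}
 *
 *	isc_region_t r = { buf, sizeof(buf) };
 *	result = isc_socket_recv(sock, &r, 1, task, recv_done, NULL);
 */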
4106
4107 isc_result_t
4108 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
4109 isc_task_t *task, isc_socketevent_t *event,
4110 unsigned int flags) {
4111 event->ev_sender = sock;
4112 event->result = ISC_R_UNSET;
4113 event->region = *region;
4114 event->n = 0;
4115 event->offset = 0;
4116 event->attributes = 0;
4117
4118 /*
4119 * UDP sockets are always partial read.
4120 */
4121 if (sock->type == isc_sockettype_udp) {
4122 event->minimum = 1;
4123 } else {
4124 if (minimum == 0) {
4125 event->minimum = region->length;
4126 } else {
4127 event->minimum = minimum;
4128 }
4129 }
4130
4131 return (socket_recv(sock, event, task, flags));
4132 }
4133
4134 static isc_result_t
4135 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
4136 const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4137 unsigned int flags) {
4138 int io_state;
4139 bool have_lock = false;
4140 isc_task_t *ntask = NULL;
4141 isc_result_t result = ISC_R_SUCCESS;
4142
4143 dev->ev_sender = task;
4144
4145 set_dev_address(address, sock, dev);
4146 if (pktinfo != NULL) {
4147 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
4148 dev->pktinfo = *pktinfo;
4149
4150 if (!isc_sockaddr_issitelocal(&dev->address) &&
4151 !isc_sockaddr_islinklocal(&dev->address))
4152 {
4153 socket_log(sock, NULL, TRACE,
4154 "pktinfo structure provided, ifindex %u "
4155 "(set to 0)",
4156 pktinfo->ipi6_ifindex);
4157
4158 /*
4159 * Set the pktinfo index to 0 here, to let the
4160 * kernel decide what interface it should send on.
4161 */
4162 dev->pktinfo.ipi6_ifindex = 0;
4163 }
4164 }
4165
4166 if (sock->type == isc_sockettype_udp) {
4167 io_state = doio_send(sock, dev);
4168 } else {
4169 LOCK(&sock->lock);
4170 have_lock = true;
4171
4172 if (ISC_LIST_EMPTY(sock->send_list)) {
4173 io_state = doio_send(sock, dev);
4174 } else {
4175 io_state = DOIO_SOFT;
4176 }
4177 }
4178
4179 switch (io_state) {
4180 case DOIO_SOFT:
4181 /*
4182 * We couldn't send all or part of the request right now, so
4183 * queue it unless ISC_SOCKFLAG_NORETRY is set.
4184 */
4185 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
4186 isc_task_attach(task, &ntask);
4187 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
4188
4189 if (!have_lock) {
4190 LOCK(&sock->lock);
4191 have_lock = true;
4192 }
4193
4194 /*
4195 * Enqueue the request. If the socket was previously
4196 * not being watched, poke the watcher to start
4197 * paying attention to it.
4198 */
4199 bool do_poke = ISC_LIST_EMPTY(sock->send_list);
4200 ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
4201 if (do_poke) {
4202 select_poke(sock->manager, sock->threadid,
4203 sock->fd, SELECT_POKE_WRITE);
4204 }
4205 socket_log(sock, NULL, EVENT,
4206 "socket_send: event %p -> task %p", dev,
4207 ntask);
4208
4209 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
4210 result = ISC_R_INPROGRESS;
4211 }
4212 break;
4213 }
4214
4215 FALLTHROUGH;
4216
4217 case DOIO_HARD:
4218 case DOIO_SUCCESS:
4219 if (!have_lock) {
4220 LOCK(&sock->lock);
4221 have_lock = true;
4222 }
4223 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
4224 send_senddone_event(sock, &dev);
4225 }
4226 break;
4227 }
4228
4229 if (have_lock) {
4230 UNLOCK(&sock->lock);
4231 }
4232
4233 return (result);
4234 }
4235
4236 isc_result_t
4237 isc_socket_send(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
4238 isc_taskaction_t action, void *arg) {
4239 /*
4240 * REQUIRE() checking is performed in isc_socket_sendto().
4241 */
4242 return (isc_socket_sendto(sock, region, task, action, arg, NULL, NULL));
4243 }
4244
4245 isc_result_t
4246 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
4247 isc_taskaction_t action, void *arg,
4248 const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) {
4249 isc_socketevent_t *dev;
4250 isc_socketmgr_t *manager;
4251
4252 REQUIRE(VALID_SOCKET(sock));
4253 REQUIRE(region != NULL);
4254 REQUIRE(task != NULL);
4255 REQUIRE(action != NULL);
4256
4257 manager = sock->manager;
4258 REQUIRE(VALID_MANAGER(manager));
4259
4260 INSIST(sock->bound);
4261
4262 dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_SENDDONE,
4263 action, arg);
4264 if (dev == NULL) {
4265 return (ISC_R_NOMEMORY);
4266 }
4267
4268 dev->region = *region;
4269
4270 return (socket_send(sock, dev, task, address, pktinfo, 0));
4271 }
4272
4273 isc_result_t
4274 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
4275 const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4276 isc_socketevent_t *event, unsigned int flags) {
4277 REQUIRE(VALID_SOCKET(sock));
4278 REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE | ISC_SOCKFLAG_NORETRY)) ==
4279 0);
4280 if ((flags & ISC_SOCKFLAG_NORETRY) != 0) {
4281 REQUIRE(sock->type == isc_sockettype_udp);
4282 }
4283 event->ev_sender = sock;
4284 event->result = ISC_R_UNSET;
4285 event->region = *region;
4286 event->n = 0;
4287 event->offset = 0;
4288 event->attributes &= ~ISC_SOCKEVENTATTR_ATTACHED;
4289
4290 return (socket_send(sock, event, task, address, pktinfo, flags));
4291 }
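
/*
 * Sending mirrors the receive path.  A sketch for a UDP reply, where
 * 'send_done' and 'peeraddr' are hypothetical:
 *
 *	isc_region_t r = { msg, msglen };
 *	result = isc_socket_sendto(sock, &r, task, send_done, NULL,
 *				   &peeraddr, NULL);
 *
 * Passing a NULL pktinfo leaves interface selection to the kernel.
 */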
4292
4293 void
4294 isc_socket_cleanunix(const isc_sockaddr_t *sockaddr, bool active) {
4295 #ifdef ISC_PLATFORM_HAVESYSUNH
4296 int s;
4297 struct stat sb;
4298 char strbuf[ISC_STRERRORSIZE];
4299
4300 if (sockaddr->type.sa.sa_family != AF_UNIX) {
4301 return;
4302 }
4303
4304 #ifndef S_ISSOCK
4305 #if defined(S_IFMT) && defined(S_IFSOCK)
4306 #define S_ISSOCK(mode) ((mode & S_IFMT) == S_IFSOCK)
4307 #elif defined(_S_IFMT) && defined(S_IFSOCK)
4308 #define S_ISSOCK(mode) ((mode & _S_IFMT) == S_IFSOCK)
4309 #endif /* if defined(S_IFMT) && defined(S_IFSOCK) */
4310 #endif /* ifndef S_ISSOCK */
4311
4312 #ifndef S_ISFIFO
4313 #if defined(S_IFMT) && defined(S_IFIFO)
4314 #define S_ISFIFO(mode) ((mode & S_IFMT) == S_IFIFO)
4315 #elif defined(_S_IFMT) && defined(S_IFIFO)
4316 #define S_ISFIFO(mode) ((mode & _S_IFMT) == S_IFIFO)
4317 #endif /* if defined(S_IFMT) && defined(S_IFIFO) */
4318 #endif /* ifndef S_ISFIFO */
4319
4320 #if !defined(S_ISFIFO) && !defined(S_ISSOCK)
4321 /* cppcheck-suppress preprocessorErrorDirective */
4322 #error \
4323 You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform. See <sys/stat.h>.
4324 #endif /* if !defined(S_ISFIFO) && !defined(S_ISSOCK) */
4325
4326 #ifndef S_ISFIFO
4327 #define S_ISFIFO(mode) 0
4328 #endif /* ifndef S_ISFIFO */
4329
4330 #ifndef S_ISSOCK
4331 #define S_ISSOCK(mode) 0
4332 #endif /* ifndef S_ISSOCK */
4333
4334 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
4335 switch (errno) {
4336 case ENOENT:
4337 if (active) { /* We exited cleanly last time */
4338 break;
4339 }
4340 FALLTHROUGH;
4341 default:
4342 strerror_r(errno, strbuf, sizeof(strbuf));
4343 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4344 ISC_LOGMODULE_SOCKET,
4345 active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
4346 "isc_socket_cleanunix: stat(%s): %s",
4347 sockaddr->type.sunix.sun_path, strbuf);
4348 return;
4349 }
4350 } else {
4351 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
4352 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4353 ISC_LOGMODULE_SOCKET,
4354 active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
4355 "isc_socket_cleanunix: %s: not a socket",
4356 sockaddr->type.sunix.sun_path);
4357 return;
4358 }
4359 }
4360
4361 if (active) {
4362 if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4363 strerror_r(errno, strbuf, sizeof(strbuf));
4364 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4365 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4366 "isc_socket_cleanunix: unlink(%s): %s",
4367 sockaddr->type.sunix.sun_path, strbuf);
4368 }
4369 return;
4370 }
4371
4372 s = socket(AF_UNIX, SOCK_STREAM, 0);
4373 if (s < 0) {
4374 strerror_r(errno, strbuf, sizeof(strbuf));
4375 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4376 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4377 "isc_socket_cleanunix: socket(%s): %s",
4378 sockaddr->type.sunix.sun_path, strbuf);
4379 return;
4380 }
4381
4382 if (connect(s, (const struct sockaddr *)&sockaddr->type.sunix,
4383 sizeof(sockaddr->type.sunix)) < 0)
4384 {
4385 switch (errno) {
4386 case ECONNREFUSED:
4387 case ECONNRESET:
4388 if (unlink(sockaddr->type.sunix.sun_path) < 0) {
4389 strerror_r(errno, strbuf, sizeof(strbuf));
4390 isc_log_write(
4391 isc_lctx, ISC_LOGCATEGORY_GENERAL,
4392 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4393 "isc_socket_cleanunix: "
4394 "unlink(%s): %s",
4395 sockaddr->type.sunix.sun_path, strbuf);
4396 }
4397 break;
4398 default:
4399 strerror_r(errno, strbuf, sizeof(strbuf));
4400 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4401 ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
4402 "isc_socket_cleanunix: connect(%s): %s",
4403 sockaddr->type.sunix.sun_path, strbuf);
4404 break;
4405 }
4406 }
4407 close(s);
4408 #else /* ifdef ISC_PLATFORM_HAVESYSUNH */
4409 UNUSED(sockaddr);
4410 UNUSED(active);
4411 #endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
4412 }
4413
4414 isc_result_t
4415 isc_socket_permunix(const isc_sockaddr_t *sockaddr, uint32_t perm,
4416 uint32_t owner, uint32_t group) {
4417 #ifdef ISC_PLATFORM_HAVESYSUNH
4418 isc_result_t result = ISC_R_SUCCESS;
4419 char strbuf[ISC_STRERRORSIZE];
4420 char path[sizeof(sockaddr->type.sunix.sun_path)];
4421 #ifdef NEED_SECURE_DIRECTORY
4422 char *slash;
4423 #endif /* ifdef NEED_SECURE_DIRECTORY */
4424
4425 REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4426 INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4427 strlcpy(path, sockaddr->type.sunix.sun_path, sizeof(path));
4428
4429 #ifdef NEED_SECURE_DIRECTORY
4430 slash = strrchr(path, '/');
4431 if (slash != NULL) {
4432 if (slash != path) {
4433 *slash = '\0';
4434 } else {
4435 strlcpy(path, "/", sizeof(path));
4436 }
4437 } else {
4438 strlcpy(path, ".", sizeof(path));
4439 }
4440 #endif /* ifdef NEED_SECURE_DIRECTORY */
4441
4442 if (chmod(path, perm) < 0) {
4443 strerror_r(errno, strbuf, sizeof(strbuf));
4444 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4445 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4446 "isc_socket_permunix: chmod(%s, %d): %s", path,
4447 perm, strbuf);
4448 result = ISC_R_FAILURE;
4449 }
4450 if (chown(path, owner, group) < 0) {
4451 strerror_r(errno, strbuf, sizeof(strbuf));
4452 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4453 ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4454 "isc_socket_permunix: chown(%s, %d, %d): %s",
4455 path, owner, group, strbuf);
4456 result = ISC_R_FAILURE;
4457 }
4458 return (result);
4459 #else /* ifdef ISC_PLATFORM_HAVESYSUNH */
4460 UNUSED(sockaddr);
4461 UNUSED(perm);
4462 UNUSED(owner);
4463 UNUSED(group);
4464 return (ISC_R_NOTIMPLEMENTED);
4465 #endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
4466 }
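
/*
 * Typical startup/shutdown sequence for a UNIX-domain control socket
 * (a sketch; mode and ids are illustrative):
 *
 *	isc_socket_cleanunix(&addr, false);	remove the path only if
 *						nothing answers on it
 *	result = isc_socket_bind(sock, &addr, 0);
 *	result = isc_socket_permunix(&addr, 0750, owner, group);
 *	...
 *	isc_socket_cleanunix(&addr, true);	unconditional unlink on
 *						clean shutdown
 */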
4467
4468 isc_result_t
4469 isc_socket_bind(isc_socket_t *sock, const isc_sockaddr_t *sockaddr,
4470 isc_socket_options_t options) {
4471 char strbuf[ISC_STRERRORSIZE];
4472 int on = 1;
4473
4474 REQUIRE(VALID_SOCKET(sock));
4475
4476 LOCK(&sock->lock);
4477
4478 INSIST(!sock->bound);
4479 INSIST(!sock->dupped);
4480
4481 if (sock->pf != sockaddr->type.sa.sa_family) {
4482 UNLOCK(&sock->lock);
4483 return (ISC_R_FAMILYMISMATCH);
4484 }
4485
4486 /*
4487 * Only set SO_REUSEADDR when we want a specific port.
4488 */
4489 #ifdef AF_UNIX
4490 if (sock->pf == AF_UNIX) {
4491 goto bind_socket;
4492 }
4493 #endif /* ifdef AF_UNIX */
4494 if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
4495 isc_sockaddr_getport(sockaddr) != (in_port_t)0)
4496 {
4497 if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
4498 sizeof(on)) < 0)
4499 {
4500 UNEXPECTED_ERROR(__FILE__, __LINE__,
4501 "setsockopt(%d) failed", sock->fd);
4502 }
4503 #if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB)
4504 if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT_LB,
4505 (void *)&on, sizeof(on)) < 0)
4506 {
4507 UNEXPECTED_ERROR(__FILE__, __LINE__,
4508 "setsockopt(%d) failed", sock->fd);
4509 }
4510 #elif defined(__linux__) && defined(SO_REUSEPORT)
4511 if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, (void *)&on,
4512 sizeof(on)) < 0)
4513 {
4514 UNEXPECTED_ERROR(__FILE__, __LINE__,
4515 "setsockopt(%d) failed", sock->fd);
4516 }
4517 #endif /* if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB) */
4518 /* Press on... */
4519 }
4520 #ifdef AF_UNIX
4521 bind_socket:
4522 #endif /* ifdef AF_UNIX */
4523 if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
4524 inc_stats(sock->manager->stats,
4525 sock->statsindex[STATID_BINDFAIL]);
4526
4527 UNLOCK(&sock->lock);
4528 switch (errno) {
4529 case EACCES:
4530 return (ISC_R_NOPERM);
4531 case EADDRNOTAVAIL:
4532 return (ISC_R_ADDRNOTAVAIL);
4533 case EADDRINUSE:
4534 return (ISC_R_ADDRINUSE);
4535 case EINVAL:
4536 return (ISC_R_BOUND);
4537 default:
4538 strerror_r(errno, strbuf, sizeof(strbuf));
4539 UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
4540 strbuf);
4541 return (ISC_R_UNEXPECTED);
4542 }
4543 }
4544
4545 socket_log(sock, sockaddr, TRACE, "bound");
4546 sock->bound = 1;
4547
4548 UNLOCK(&sock->lock);
4549 return (ISC_R_SUCCESS);
4550 }
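
/*
 * A bind sketch: request a fixed port with address reuse so that a
 * fast restart does not fail with ISC_R_ADDRINUSE ('localaddr' is a
 * hypothetical, already initialized isc_sockaddr_t):
 *
 *	result = isc_socket_bind(sock, &localaddr,
 *				 ISC_SOCKET_REUSEADDRESS);
 *	switch (result) {
 *	case ISC_R_SUCCESS:
 *		break;
 *	case ISC_R_ADDRINUSE:
 *		...retry with another port or report...
 *	default:
 *		...fatal...
 *	}
 */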
4551
4552 /*
4553 * Enable this only for specific OS versions, and only when they have repaired
4554  * their problems with it. Until then, this is broken and needs to be
4555 * disabled by default. See RT22589 for details.
4556 */
4557 #undef ENABLE_ACCEPTFILTER
4558
4559 isc_result_t
4560 isc_socket_filter(isc_socket_t *sock, const char *filter) {
4561 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
4562 char strbuf[ISC_STRERRORSIZE];
4563 struct accept_filter_arg afa;
4564 #else /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
4565 UNUSED(sock);
4566 UNUSED(filter);
4567 #endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
4568
4569 REQUIRE(VALID_SOCKET(sock));
4570
4571 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
4572 bzero(&afa, sizeof(afa));
4573 strlcpy(afa.af_name, filter, sizeof(afa.af_name));
4574 if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER, &afa,
4575 sizeof(afa)) == -1)
4576 {
4577 strerror_r(errno, strbuf, sizeof(strbuf));
4578 socket_log(sock, NULL, CREATION,
4579 "setsockopt(SO_ACCEPTFILTER): %s", strbuf);
4580 return (ISC_R_FAILURE);
4581 }
4582 return (ISC_R_SUCCESS);
4583 #else /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
4584 return (ISC_R_NOTIMPLEMENTED);
4585 #endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
4586 }
4587
4588 /*
4589 * Try enabling TCP Fast Open for a given socket if the OS supports it.
4590 */
4591 static void
4592 set_tcp_fastopen(isc_socket_t *sock, unsigned int backlog) {
4593 #if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN)
4594 char strbuf[ISC_STRERRORSIZE];
4595
4596 /*
4597 * FreeBSD, as of versions 10.3 and 11.0, defines TCP_FASTOPEN while also
4598 * shipping a default kernel without TFO support, so we special-case it by
4599 * performing an additional runtime check for TFO support using sysctl to
4600 * prevent setsockopt() errors from being logged.
4601 */
4602 #if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME)
4603 #define SYSCTL_TFO "net.inet.tcp.fastopen.enabled"
4604 unsigned int enabled;
4605 size_t enabledlen = sizeof(enabled);
4606 static bool tfo_notice_logged = false;
4607
4608 if (sysctlbyname(SYSCTL_TFO, &enabled, &enabledlen, NULL, 0) < 0) {
4609 /*
4610 * This kernel does not support TCP Fast Open. There is
4611 * nothing more we can do.
4612 */
4613 return;
4614 } else if (enabled == 0) {
4615 /*
4616 * This kernel does support TCP Fast Open, but it is disabled
4617 * by sysctl. Notify the user, but do not nag.
4618 */
4619 if (!tfo_notice_logged) {
4620 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4621 ISC_LOGMODULE_SOCKET, ISC_LOG_NOTICE,
4622 "TCP_FASTOPEN support is disabled by "
4623 "sysctl (" SYSCTL_TFO " = 0)");
4624 tfo_notice_logged = true;
4625 }
4626 return;
4627 }
4628 #endif /* if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME) */
4629
4630 #ifdef __APPLE__
4631 backlog = 1;
4632 #else /* ifdef __APPLE__ */
4633 backlog = backlog / 2;
4634 if (backlog == 0) {
4635 backlog = 1;
4636 }
4637 #endif /* ifdef __APPLE__ */
4638 if (setsockopt(sock->fd, IPPROTO_TCP, TCP_FASTOPEN, (void *)&backlog,
4639 sizeof(backlog)) < 0)
4640 {
4641 strerror_r(errno, strbuf, sizeof(strbuf));
4642 UNEXPECTED_ERROR(__FILE__, __LINE__,
4643 "setsockopt(%d, TCP_FASTOPEN) failed with %s",
4644 sock->fd, strbuf);
4645 /* TCP_FASTOPEN is experimental so ignore failures */
4646 }
4647 #else /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
4648 UNUSED(sock);
4649 UNUSED(backlog);
4650 #endif /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
4651 }
4652
4653 /*
4654 * Set up to listen on a given socket. We do this by creating an internal
4655 * event that will be dispatched when the socket has read activity. The
4656 * watcher will send the internal event to the task when there is a new
4657 * connection.
4658 *
4659 * Unlike in read, we don't preallocate a done event here. Every time there
4660 * is a new connection we'll have to allocate a new one anyway, so we might
4661 * as well keep things simple rather than having to track them.
4662 */
4663 isc_result_t
4664 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
4665 char strbuf[ISC_STRERRORSIZE];
4666
4667 REQUIRE(VALID_SOCKET(sock));
4668
4669 LOCK(&sock->lock);
4670
4671 REQUIRE(!sock->listener);
4672 REQUIRE(sock->bound);
4673 REQUIRE(sock->type == isc_sockettype_tcp ||
4674 sock->type == isc_sockettype_unix);
4675
4676 if (backlog == 0) {
4677 backlog = SOMAXCONN;
4678 }
4679
4680 if (listen(sock->fd, (int)backlog) < 0) {
4681 UNLOCK(&sock->lock);
4682 strerror_r(errno, strbuf, sizeof(strbuf));
4683
4684 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
4685
4686 return (ISC_R_UNEXPECTED);
4687 }
4688
4689 set_tcp_fastopen(sock, backlog);
4690
4691 sock->listener = 1;
4692
4693 UNLOCK(&sock->lock);
4694 return (ISC_R_SUCCESS);
4695 }
4696
4697 /*
4698 * This should try to do aggressive accept() XXXMLG
4699 */
4700 isc_result_t
4701 isc_socket_accept(isc_socket_t *sock, isc_task_t *task, isc_taskaction_t action,
4702 void *arg) {
4703 isc_socket_newconnev_t *dev;
4704 isc_socketmgr_t *manager;
4705 isc_task_t *ntask = NULL;
4706 isc_socket_t *nsock;
4707 isc_result_t result;
4708 bool do_poke = false;
4709
4710 REQUIRE(VALID_SOCKET(sock));
4711 manager = sock->manager;
4712 REQUIRE(VALID_MANAGER(manager));
4713
4714 LOCK(&sock->lock);
4715
4716 REQUIRE(sock->listener);
4717
4718 /*
4719 * Sender field is overloaded here with the task we will be sending
4720 * this event to. Just before the actual event is delivered the
4721 * actual ev_sender will be touched up to be the socket.
4722 */
4723 dev = (isc_socket_newconnev_t *)isc_event_allocate(
4724 manager->mctx, task, ISC_SOCKEVENT_NEWCONN, action, arg,
4725 sizeof(*dev));
4726 ISC_LINK_INIT(dev, ev_link);
4727
4728 result = allocate_socket(manager, sock->type, &nsock);
4729 if (result != ISC_R_SUCCESS) {
4730 isc_event_free(ISC_EVENT_PTR(&dev));
4731 UNLOCK(&sock->lock);
4732 return (result);
4733 }
4734
4735 /*
4736 * Attach to socket and to task.
4737 */
4738 isc_task_attach(task, &ntask);
4739 if (isc_task_exiting(ntask)) {
4740 free_socket(&nsock);
4741 isc_task_detach(&ntask);
4742 isc_event_free(ISC_EVENT_PTR(&dev));
4743 UNLOCK(&sock->lock);
4744 return (ISC_R_SHUTTINGDOWN);
4745 }
4746 isc_refcount_increment0(&nsock->references);
4747 nsock->statsindex = sock->statsindex;
4748
4749 dev->ev_sender = ntask;
4750 dev->newsocket = nsock;
4751
4752 /*
4753 * Poke watcher here. We still have the socket locked, so there
4754 * is no race condition. We will keep the lock for such a short
4755 	 * bit of time that waking it up now or later won't matter all that much.
4756 */
4757 do_poke = ISC_LIST_EMPTY(sock->accept_list);
4758 ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
4759 if (do_poke) {
4760 select_poke(manager, sock->threadid, sock->fd,
4761 SELECT_POKE_ACCEPT);
4762 }
4763 UNLOCK(&sock->lock);
4764 return (ISC_R_SUCCESS);
4765 }
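
/*
 * Listener setup is two calls: isc_socket_listen() arms the kernel
 * queue and isc_socket_accept() queues exactly one NEWCONN event, so
 * the handler must re-arm itself.  A sketch ('accept_done' is
 * hypothetical; per the comment above, ev_sender is touched up to be
 * the listening socket before delivery):
 *
 *	result = isc_socket_listen(sock, 0);	0 means SOMAXCONN
 *	result = isc_socket_accept(sock, task, accept_done, NULL);
 *
 *	static void
 *	accept_done(isc_task_t *task, isc_event_t *event) {
 *		isc_socket_newconnev_t *nev =
 *			(isc_socket_newconnev_t *)event;
 *		isc_socket_t *lsock = (isc_socket_t *)event->ev_sender;
 *		if (nev->result == ISC_R_SUCCESS)
 *			...start I/O on nev->newsocket...
 *		(void)isc_socket_accept(lsock, task, accept_done, NULL);
 *		isc_event_free(&event);
 *	}
 */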
4766
4767 isc_result_t
4768 isc_socket_connect(isc_socket_t *sock, const isc_sockaddr_t *addr,
4769 isc_task_t *task, isc_taskaction_t action, void *arg) {
4770 isc_socket_connev_t *dev;
4771 isc_task_t *ntask = NULL;
4772 isc_socketmgr_t *manager;
4773 int cc;
4774 char strbuf[ISC_STRERRORSIZE];
4775 char addrbuf[ISC_SOCKADDR_FORMATSIZE];
4776
4777 REQUIRE(VALID_SOCKET(sock));
4778 REQUIRE(addr != NULL);
4779 REQUIRE(task != NULL);
4780 REQUIRE(action != NULL);
4781
4782 manager = sock->manager;
4783 REQUIRE(VALID_MANAGER(manager));
4784 REQUIRE(addr != NULL);
4785
4786 if (isc_sockaddr_ismulticast(addr)) {
4787 return (ISC_R_MULTICAST);
4788 }
4789
4790 LOCK(&sock->lock);
4791
4792 dev = (isc_socket_connev_t *)isc_event_allocate(
4793 manager->mctx, sock, ISC_SOCKEVENT_CONNECT, action, arg,
4794 sizeof(*dev));
4795 ISC_LINK_INIT(dev, ev_link);
4796
4797 if (sock->connecting) {
4798 INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
4799 goto queue;
4800 }
4801
4802 if (sock->connected) {
4803 INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
4804 dev->result = ISC_R_SUCCESS;
4805 isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
4806
4807 UNLOCK(&sock->lock);
4808
4809 return (ISC_R_SUCCESS);
4810 }
4811
4812 /*
4813 * Try to do the connect right away, as there can be only one
4814 * outstanding, and it might happen to complete.
4815 */
4816 sock->peer_address = *addr;
4817 cc = connect(sock->fd, &addr->type.sa, addr->length);
4818 if (cc < 0) {
4819 /*
4820 * The socket is nonblocking and the connection cannot be
4821 * completed immediately. It is possible to select(2) or
4822 * poll(2) for completion by selecting the socket for writing.
4823 * After select(2) indicates writability, use getsockopt(2) to
4824 * read the SO_ERROR option at level SOL_SOCKET to determine
4825 * whether connect() completed successfully (SO_ERROR is zero)
4826 * or unsuccessfully (SO_ERROR is one of the usual error codes
4827 * listed here, explaining the reason for the failure).
4828 */
4829 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
4830 cc = 0;
4831 goto success;
4832 }
4833 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
4834 goto queue;
4835 }
4836
4837 switch (errno) {
4838 #define ERROR_MATCH(a, b) \
4839 case a: \
4840 dev->result = b; \
4841 goto err_exit;
4842 ERROR_MATCH(EACCES, ISC_R_NOPERM);
4843 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4844 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4845 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4846 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4847 #ifdef EHOSTDOWN
4848 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4849 #endif /* ifdef EHOSTDOWN */
4850 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4851 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4852 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4853 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4854 ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
4855 ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
4856 #undef ERROR_MATCH
4857 }
4858
4859 sock->connected = 0;
4860
4861 strerror_r(errno, strbuf, sizeof(strbuf));
4862 isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
4863 UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
4864 addrbuf, errno, strbuf);
4865
4866 UNLOCK(&sock->lock);
4867 inc_stats(sock->manager->stats,
4868 sock->statsindex[STATID_CONNECTFAIL]);
4869 isc_event_free(ISC_EVENT_PTR(&dev));
4870 return (ISC_R_UNEXPECTED);
4871
4872 err_exit:
4873 sock->connected = 0;
4874 isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
4875
4876 UNLOCK(&sock->lock);
4877 inc_stats(sock->manager->stats,
4878 sock->statsindex[STATID_CONNECTFAIL]);
4879 return (ISC_R_SUCCESS);
4880 }
4881
4882 /*
4883 * If connect completed, fire off the done event.
4884 */
4885 success:
4886 if (cc == 0) {
4887 sock->connected = 1;
4888 sock->bound = 1;
4889 dev->result = ISC_R_SUCCESS;
4890 isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);
4891
4892 UNLOCK(&sock->lock);
4893
4894 inc_stats(sock->manager->stats,
4895 sock->statsindex[STATID_CONNECT]);
4896
4897 return (ISC_R_SUCCESS);
4898 }
4899
4900 queue:
4901
4902 /*
4903 * Attach to task.
4904 */
4905 isc_task_attach(task, &ntask);
4906
4907 dev->ev_sender = ntask;
4908
4909 /*
4910 * Poke watcher here. We still have the socket locked, so there
4911 * is no race condition. We will keep the lock for such a short
4912 	 * bit of time that waking it up now or later won't matter all that much.
4913 */
4914 bool do_poke = ISC_LIST_EMPTY(sock->connect_list);
4915 ISC_LIST_ENQUEUE(sock->connect_list, dev, ev_link);
4916 if (do_poke && !sock->connecting) {
4917 sock->connecting = 1;
4918 select_poke(manager, sock->threadid, sock->fd,
4919 SELECT_POKE_CONNECT);
4920 }
4921
4922 UNLOCK(&sock->lock);
4923 return (ISC_R_SUCCESS);
4924 }
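
/*
 * A connect sketch ('connect_done' and 'peeraddr' are hypothetical).
 * The done event is delivered exactly once whether the connect
 * completed immediately or was queued above:
 *
 *	result = isc_socket_connect(sock, &peeraddr, task,
 *				    connect_done, NULL);
 *
 *	static void
 *	connect_done(isc_task_t *task, isc_event_t *event) {
 *		isc_socket_connev_t *cev = (isc_socket_connev_t *)event;
 *		if (cev->result == ISC_R_SUCCESS)
 *			...start I/O...
 *		isc_event_free(&event);
 *	}
 */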
4925
4926 /*
4927 * Called when a socket with a pending connect() finishes.
4928 */
4929 static void
4930 internal_connect(isc_socket_t *sock) {
4931 isc_socket_connev_t *dev;
4932 int cc;
4933 isc_result_t result;
4934 socklen_t optlen;
4935 char strbuf[ISC_STRERRORSIZE];
4936 char peerbuf[ISC_SOCKADDR_FORMATSIZE];
4937
4938 INSIST(VALID_SOCKET(sock));
4939 REQUIRE(sock->fd >= 0);
4940
4941 /*
4942 * Get the first item off the connect list.
4943 * If it is empty, unlock the socket and return.
4944 */
4945 dev = ISC_LIST_HEAD(sock->connect_list);
4946 if (dev == NULL) {
4947 INSIST(!sock->connecting);
4948 goto finish;
4949 }
4950
4951 INSIST(sock->connecting);
4952 sock->connecting = 0;
4953
4954 /*
4955 * Get any possible error status here.
4956 */
4957 optlen = sizeof(cc);
4958 if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void *)&cc,
4959 (void *)&optlen) != 0)
4960 {
4961 cc = errno;
4962 } else {
4963 errno = cc;
4964 }
4965
4966 if (errno != 0) {
4967 /*
4968 * If the error is EAGAIN, just re-select on this
4969 * fd and pretend nothing strange happened.
4970 */
4971 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
4972 sock->connecting = 1;
4973 return;
4974 }
4975
4976 inc_stats(sock->manager->stats,
4977 sock->statsindex[STATID_CONNECTFAIL]);
4978
4979 /*
4980 * Translate other errors into ISC_R_* flavors.
4981 */
4982 switch (errno) {
4983 #define ERROR_MATCH(a, b) \
4984 case a: \
4985 result = b; \
4986 break;
4987 ERROR_MATCH(EACCES, ISC_R_NOPERM);
4988 ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
4989 ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
4990 ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
4991 ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
4992 #ifdef EHOSTDOWN
4993 ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
4994 #endif /* ifdef EHOSTDOWN */
4995 ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
4996 ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
4997 ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
4998 ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
4999 ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
5000 ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
5001 #undef ERROR_MATCH
5002 default:
5003 result = ISC_R_UNEXPECTED;
5004 isc_sockaddr_format(&sock->peer_address, peerbuf,
5005 sizeof(peerbuf));
5006 strerror_r(errno, strbuf, sizeof(strbuf));
5007 UNEXPECTED_ERROR(__FILE__, __LINE__,
5008 "internal_connect: connect(%s) %s",
5009 peerbuf, strbuf);
5010 }
5011 } else {
5012 inc_stats(sock->manager->stats,
5013 sock->statsindex[STATID_CONNECT]);
5014 result = ISC_R_SUCCESS;
5015 sock->connected = 1;
5016 sock->bound = 1;
5017 }
5018
5019 do {
5020 dev->result = result;
5021 send_connectdone_event(sock, &dev);
5022 dev = ISC_LIST_HEAD(sock->connect_list);
5023 } while (dev != NULL);
5024
5025 finish:
5026 unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
5027 SELECT_POKE_CONNECT);
5028 }
5029
5030 isc_result_t
5031 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5032 isc_result_t result;
5033
5034 REQUIRE(VALID_SOCKET(sock));
5035 REQUIRE(addressp != NULL);
5036
5037 LOCK(&sock->lock);
5038
5039 if (sock->connected) {
5040 *addressp = sock->peer_address;
5041 result = ISC_R_SUCCESS;
5042 } else {
5043 result = ISC_R_NOTCONNECTED;
5044 }
5045
5046 UNLOCK(&sock->lock);
5047
5048 return (result);
5049 }
5050
5051 isc_result_t
5052 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5053 socklen_t len;
5054 isc_result_t result;
5055 char strbuf[ISC_STRERRORSIZE];
5056
5057 REQUIRE(VALID_SOCKET(sock));
5058 REQUIRE(addressp != NULL);
5059
5060 LOCK(&sock->lock);
5061
5062 if (!sock->bound) {
5063 result = ISC_R_NOTBOUND;
5064 goto out;
5065 }
5066
5067 result = ISC_R_SUCCESS;
5068
5069 len = sizeof(addressp->type);
5070 if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
5071 strerror_r(errno, strbuf, sizeof(strbuf));
5072 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s", strbuf);
5073 result = ISC_R_UNEXPECTED;
5074 goto out;
5075 }
5076 addressp->length = (unsigned int)len;
5077
5078 out:
5079 UNLOCK(&sock->lock);
5080
5081 return (result);
5082 }
5083
5084 /*
5085 * Run through the list of events on this socket, and cancel the ones
5086 * queued for task "task" of type "how". "how" is a bitmask.
5087 */
5088 void
5089 isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
5090 REQUIRE(VALID_SOCKET(sock));
5091
5092 /*
5093 * Quick exit if there is nothing to do. Don't even bother locking
5094 * in this case.
5095 */
5096 if (how == 0) {
5097 return;
5098 }
5099
5100 LOCK(&sock->lock);
5101
5102 /*
5103 * All of these do the same thing, more or less.
5104 * Each will:
5105 * o If the internal event is marked as "posted" try to
5106 * remove it from the task's queue. If this fails, mark it
5107 * as canceled instead, and let the task clean it up later.
5108 * o For each I/O request for that task of that type, post
5109 * its done event with status of "ISC_R_CANCELED".
5110 * o Reset any state needed.
5111 */
5112 if (((how & ISC_SOCKCANCEL_RECV) != 0) &&
5113 !ISC_LIST_EMPTY(sock->recv_list))
5114 {
5115 isc_socketevent_t *dev;
5116 isc_socketevent_t *next;
5117 isc_task_t *current_task;
5118
5119 dev = ISC_LIST_HEAD(sock->recv_list);
5120
5121 while (dev != NULL) {
5122 current_task = dev->ev_sender;
5123 next = ISC_LIST_NEXT(dev, ev_link);
5124
5125 if ((task == NULL) || (task == current_task)) {
5126 dev->result = ISC_R_CANCELED;
5127 send_recvdone_event(sock, &dev);
5128 }
5129 dev = next;
5130 }
5131 }
5132
5133 if (((how & ISC_SOCKCANCEL_SEND) != 0) &&
5134 !ISC_LIST_EMPTY(sock->send_list))
5135 {
5136 isc_socketevent_t *dev;
5137 isc_socketevent_t *next;
5138 isc_task_t *current_task;
5139
5140 dev = ISC_LIST_HEAD(sock->send_list);
5141
5142 while (dev != NULL) {
5143 current_task = dev->ev_sender;
5144 next = ISC_LIST_NEXT(dev, ev_link);
5145
5146 if ((task == NULL) || (task == current_task)) {
5147 dev->result = ISC_R_CANCELED;
5148 send_senddone_event(sock, &dev);
5149 }
5150 dev = next;
5151 }
5152 }
5153
5154 if (((how & ISC_SOCKCANCEL_ACCEPT) != 0) &&
5155 !ISC_LIST_EMPTY(sock->accept_list))
5156 {
5157 isc_socket_newconnev_t *dev;
5158 isc_socket_newconnev_t *next;
5159 isc_task_t *current_task;
5160
5161 dev = ISC_LIST_HEAD(sock->accept_list);
5162 while (dev != NULL) {
5163 current_task = dev->ev_sender;
5164 next = ISC_LIST_NEXT(dev, ev_link);
5165
5166 if ((task == NULL) || (task == current_task)) {
5167 ISC_LIST_UNLINK(sock->accept_list, dev,
5168 ev_link);
5169
5170 isc_refcount_decrementz(
5171 &NEWCONNSOCK(dev)->references);
5172 free_socket((isc_socket_t **)&dev->newsocket);
5173
5174 dev->result = ISC_R_CANCELED;
5175 dev->ev_sender = sock;
5176 				isc_task_sendtoanddetach(&current_task,
5177 ISC_EVENT_PTR(&dev),
5178 sock->threadid);
5179 }
5180
5181 dev = next;
5182 }
5183 }
5184
5185 if (((how & ISC_SOCKCANCEL_CONNECT) != 0) &&
5186 !ISC_LIST_EMPTY(sock->connect_list))
5187 {
5188 isc_socket_connev_t *dev;
5189 isc_socket_connev_t *next;
5190 isc_task_t *current_task;
5191
5192 INSIST(sock->connecting);
5193 sock->connecting = 0;
5194
5195 dev = ISC_LIST_HEAD(sock->connect_list);
5196
5197 while (dev != NULL) {
5198 current_task = dev->ev_sender;
5199 next = ISC_LIST_NEXT(dev, ev_link);
5200
5201 if ((task == NULL) || (task == current_task)) {
5202 dev->result = ISC_R_CANCELED;
5203 send_connectdone_event(sock, &dev);
5204 }
5205 dev = next;
5206 }
5207 }
5208
5209 UNLOCK(&sock->lock);
5210 }
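
/*
 * Cancellation sketch: post ISC_R_CANCELED done events for every
 * pending operation of the requested kinds; a NULL task cancels
 * regardless of the owning task:
 *
 *	isc_socket_cancel(sock, NULL,
 *			  ISC_SOCKCANCEL_RECV | ISC_SOCKCANCEL_SEND);
 *
 * The canceled requests complete through the normal done-event path,
 * so handlers must check ev->result before touching any data.
 */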
5211
5212 isc_sockettype_t
5213 isc_socket_gettype(isc_socket_t *sock) {
5214 REQUIRE(VALID_SOCKET(sock));
5215
5216 return (sock->type);
5217 }
5218
5219 void
5220 isc_socket_ipv6only(isc_socket_t *sock, bool yes) {
5221 #if defined(IPV6_V6ONLY) && !defined(__OpenBSD__)
5222 int onoff = yes ? 1 : 0;
5223 #else /* if defined(IPV6_V6ONLY) */
5224 UNUSED(yes);
5225 UNUSED(sock);
5226 #endif /* if defined(IPV6_V6ONLY) */
5227
5228 REQUIRE(VALID_SOCKET(sock));
5229 INSIST(!sock->dupped);
5230
5231 #if defined(IPV6_V6ONLY) && !defined(__OpenBSD__)
5232 if (sock->pf == AF_INET6) {
5233 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
5234 (void *)&onoff, sizeof(int)) < 0)
5235 {
5236 char strbuf[ISC_STRERRORSIZE];
5237 strerror_r(errno, strbuf, sizeof(strbuf));
5238 UNEXPECTED_ERROR(__FILE__, __LINE__,
5239 "setsockopt(%d, IPV6_V6ONLY) failed: "
5240 "%s",
5241 sock->fd, strbuf);
5242 }
5243 }
5244 #endif /* ifdef IPV6_V6ONLY */
5245 }
5246
5247 static void
5248 setdscp(isc_socket_t *sock, isc_dscp_t dscp) {
5249 #if defined(IP_TOS) || defined(IPV6_TCLASS)
5250 int value = dscp << 2;
5251 #endif /* if defined(IP_TOS) || defined(IPV6_TCLASS) */
5252
5253 sock->dscp = dscp;
5254
5255 #ifdef IP_TOS
5256 if (sock->pf == AF_INET) {
5257 if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS, (void *)&value,
5258 sizeof(value)) < 0)
5259 {
5260 char strbuf[ISC_STRERRORSIZE];
5261 strerror_r(errno, strbuf, sizeof(strbuf));
5262 UNEXPECTED_ERROR(__FILE__, __LINE__,
5263 "setsockopt(%d, IP_TOS, %.02x) "
5264 "failed: %s",
5265 sock->fd, value >> 2, strbuf);
5266 }
5267 }
5268 #endif /* ifdef IP_TOS */
5269 #ifdef IPV6_TCLASS
5270 if (sock->pf == AF_INET6) {
5271 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
5272 (void *)&value, sizeof(value)) < 0)
5273 {
5274 char strbuf[ISC_STRERRORSIZE];
5275 strerror_r(errno, strbuf, sizeof(strbuf));
5276 UNEXPECTED_ERROR(__FILE__, __LINE__,
5277 "setsockopt(%d, IPV6_TCLASS, %.02x) "
5278 "failed: %s",
5279 sock->fd, dscp >> 2, strbuf);
5280 }
5281 }
5282 #endif /* ifdef IPV6_TCLASS */
5283 }
5284
5285 void
5286 isc_socket_dscp(isc_socket_t *sock, isc_dscp_t dscp) {
5287 REQUIRE(VALID_SOCKET(sock));
5288 REQUIRE(dscp < 0x40);
5289
5290 #if !defined(IP_TOS) && !defined(IPV6_TCLASS)
5291 UNUSED(dscp);
5292 #else /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */
5293 if (dscp < 0) {
5294 return;
5295 }
5296
5297 /* The DSCP value must not be changed once it has been set. */
5298 if (isc_dscp_check_value != -1) {
5299 INSIST(dscp == isc_dscp_check_value);
5300 }
5301 #endif /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */
5302
5303 #ifdef notyet
5304 REQUIRE(!sock->dupped);
5305 #endif /* ifdef notyet */
5306
5307 setdscp(sock, dscp);
5308 }
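
/*
 * Worked example for the "dscp << 2" shift in setdscp(): the 6-bit
 * DSCP code point occupies the upper bits of the 8-bit TOS/TCLASS
 * byte, above the two ECN bits.  For Expedited Forwarding,
 * DSCP 46 (0x2e):
 *
 *	value = 46 << 2 = 184 = 0xb8
 *
 * so setsockopt(IP_TOS) is handed 0xb8, and captured packets show a
 * TOS byte of 0xb8 with the ECN bits zero.
 */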
5309
5310 isc_socketevent_t *
5311 isc_socket_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
5312 isc_taskaction_t action, void *arg) {
5313 return (allocate_socketevent(mctx, sender, eventtype, action, arg));
5314 }
5315
5316 void
5317 isc_socket_setname(isc_socket_t *sock, const char *name, void *tag) {
5318 /*
5319 * Name 'sock'.
5320 */
5321
5322 REQUIRE(VALID_SOCKET(sock));
5323
5324 LOCK(&sock->lock);
5325 strlcpy(sock->name, name, sizeof(sock->name));
5326 sock->tag = tag;
5327 UNLOCK(&sock->lock);
5328 }
5329
5330 const char *
5331 isc_socket_getname(isc_socket_t *sock) {
5332 return (sock->name);
5333 }
5334
5335 void *
5336 isc_socket_gettag(isc_socket_t *sock) {
5337 return (sock->tag);
5338 }
5339
5340 int
5341 isc_socket_getfd(isc_socket_t *sock) {
5342 return ((short)sock->fd);
5343 }
5344
5345 static isc_once_t hasreuseport_once = ISC_ONCE_INIT;
5346 static bool hasreuseport = false;
5347
5348 static void
5349 init_hasreuseport(void) {
5350 /*
5351 * SO_REUSEPORT works very differently on *BSD and on Linux (because why not).
5352 * We only want to use it on Linux, if it's available. On BSD we want to dup()
5353 * sockets instead of re-binding them.
5354 */
5355 #if (defined(SO_REUSEPORT) && defined(__linux__)) || \
5356 (defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__))
5357 int sock, yes = 1;
5358 sock = socket(AF_INET, SOCK_DGRAM, 0);
5359 if (sock < 0) {
5360 sock = socket(AF_INET6, SOCK_DGRAM, 0);
5361 if (sock < 0) {
5362 return;
5363 }
5364 }
5365 if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void *)&yes,
5366 sizeof(yes)) < 0)
5367 {
5368 close(sock);
5369 return;
5370 #if defined(__FreeBSD_kernel__)
5371 } else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT_LB, (void *)&yes,
5372 sizeof(yes)) < 0)
5373 #else /* if defined(__FreeBSD_kernel__) */
5374 } else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, (void *)&yes,
5375 sizeof(yes)) < 0)
5376 #endif /* if defined(__FreeBSD_kernel__) */
5377 {
5378 close(sock);
5379 return;
5380 }
5381 hasreuseport = true;
5382 close(sock);
5383 #endif /* if (defined(SO_REUSEPORT) && defined(__linux__)) || \
5384 * (defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__)) */
5385 }
5386
5387 bool
5388 isc_socket_hasreuseport() {
5389 RUNTIME_CHECK(isc_once_do(&hasreuseport_once, init_hasreuseport) ==
5390 ISC_R_SUCCESS);
5391 return (hasreuseport);
5392 }
5393
5394 #if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C)
5395 static const char *
5396 _socktype(isc_sockettype_t type) {
5397 switch (type) {
5398 case isc_sockettype_udp:
5399 return ("udp");
5400 case isc_sockettype_tcp:
5401 return ("tcp");
5402 case isc_sockettype_unix:
5403 return ("unix");
5404 case isc_sockettype_fdwatch:
5405 return ("fdwatch");
5406 default:
5407 return ("not-initialized");
5408 }
5409 }
5410 #endif /* if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C) */
5411
5412 #ifdef HAVE_LIBXML2
5413 #define TRY0(a) \
5414 do { \
5415 xmlrc = (a); \
5416 if (xmlrc < 0) \
5417 goto error; \
5418 } while (0)
5419 int
5420 isc_socketmgr_renderxml(isc_socketmgr_t *mgr, void *writer0) {
5421 isc_socket_t *sock = NULL;
5422 char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5423 isc_sockaddr_t addr;
5424 socklen_t len;
5425 int xmlrc;
5426 xmlTextWriterPtr writer = (xmlTextWriterPtr)writer0;
5427
5428 LOCK(&mgr->lock);
5429
5430 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"));
5431 sock = ISC_LIST_HEAD(mgr->socklist);
5432 while (sock != NULL) {
5433 LOCK(&sock->lock);
5434 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"));
5435
5436 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"));
5437 TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock));
5438 TRY0(xmlTextWriterEndElement(writer));
5439
5440 if (sock->name[0] != 0) {
5441 TRY0(xmlTextWriterStartElement(writer,
5442 ISC_XMLCHAR "name"));
5443 TRY0(xmlTextWriterWriteFormatString(writer, "%s",
5444 sock->name));
5445 TRY0(xmlTextWriterEndElement(writer)); /* name */
5446 }
5447
5448 TRY0(xmlTextWriterStartElement(writer,
5449 ISC_XMLCHAR "references"));
5450 TRY0(xmlTextWriterWriteFormatString(
5451 writer, "%d",
5452 (int)isc_refcount_current(&sock->references)));
5453 TRY0(xmlTextWriterEndElement(writer));
5454
5455 TRY0(xmlTextWriterWriteElement(
5456 writer, ISC_XMLCHAR "type",
5457 ISC_XMLCHAR _socktype(sock->type)));
5458
5459 if (sock->connected) {
5460 isc_sockaddr_format(&sock->peer_address, peerbuf,
5461 sizeof(peerbuf));
5462 TRY0(xmlTextWriterWriteElement(
5463 writer, ISC_XMLCHAR "peer-address",
5464 ISC_XMLCHAR peerbuf));
5465 }
5466
5467 len = sizeof(addr);
5468 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
5469 isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
5470 TRY0(xmlTextWriterWriteElement(
5471 writer, ISC_XMLCHAR "local-address",
5472 ISC_XMLCHAR peerbuf));
5473 }
5474
5475 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"));
5476 if (sock->listener) {
5477 TRY0(xmlTextWriterWriteElement(writer,
5478 ISC_XMLCHAR "state",
5479 ISC_XMLCHAR "listener"));
5480 }
5481 if (sock->connected) {
5482 TRY0(xmlTextWriterWriteElement(
5483 writer, ISC_XMLCHAR "state",
5484 ISC_XMLCHAR "connected"));
5485 }
5486 if (sock->connecting) {
5487 TRY0(xmlTextWriterWriteElement(
5488 writer, ISC_XMLCHAR "state",
5489 ISC_XMLCHAR "connecting"));
5490 }
5491 if (sock->bound) {
5492 TRY0(xmlTextWriterWriteElement(writer,
5493 ISC_XMLCHAR "state",
5494 ISC_XMLCHAR "bound"));
5495 }
5496
5497 TRY0(xmlTextWriterEndElement(writer)); /* states */
5498
5499 TRY0(xmlTextWriterEndElement(writer)); /* socket */
5500
5501 UNLOCK(&sock->lock);
5502 sock = ISC_LIST_NEXT(sock, link);
5503 }
5504 TRY0(xmlTextWriterEndElement(writer)); /* sockets */
5505
5506 error:
5507 if (sock != NULL) {
5508 UNLOCK(&sock->lock);
5509 }
5510
5511 UNLOCK(&mgr->lock);
5512
5513 return (xmlrc);
5514 }
5515 #endif /* HAVE_LIBXML2 */
5516
5517 #ifdef HAVE_JSON_C
5518 #define CHECKMEM(m) \
5519 do { \
5520 if (m == NULL) { \
5521 result = ISC_R_NOMEMORY; \
5522 goto error; \
5523 } \
5524 } while (0)
5525
5526 isc_result_t
5527 isc_socketmgr_renderjson(isc_socketmgr_t *mgr, void *stats0) {
5528 isc_result_t result = ISC_R_SUCCESS;
5529 isc_socket_t *sock = NULL;
5530 char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5531 isc_sockaddr_t addr;
5532 socklen_t len;
5533 json_object *obj, *array = json_object_new_array();
5534 json_object *stats = (json_object *)stats0;
5535
5536 CHECKMEM(array);
5537
5538 LOCK(&mgr->lock);
5539
5540 sock = ISC_LIST_HEAD(mgr->socklist);
5541 while (sock != NULL) {
5542 json_object *states, *entry = json_object_new_object();
5543 char buf[255];
5544
5545 CHECKMEM(entry);
5546 json_object_array_add(array, entry);
5547
5548 LOCK(&sock->lock);
5549
5550 snprintf(buf, sizeof(buf), "%p", sock);
5551 obj = json_object_new_string(buf);
5552 CHECKMEM(obj);
5553 json_object_object_add(entry, "id", obj);
5554
5555 if (sock->name[0] != 0) {
5556 obj = json_object_new_string(sock->name);
5557 CHECKMEM(obj);
5558 json_object_object_add(entry, "name", obj);
5559 }
5560
5561 obj = json_object_new_int(
5562 (int)isc_refcount_current(&sock->references));
5563 CHECKMEM(obj);
5564 json_object_object_add(entry, "references", obj);
5565
5566 obj = json_object_new_string(_socktype(sock->type));
5567 CHECKMEM(obj);
5568 json_object_object_add(entry, "type", obj);
5569
5570 if (sock->connected) {
5571 isc_sockaddr_format(&sock->peer_address, peerbuf,
5572 sizeof(peerbuf));
5573 obj = json_object_new_string(peerbuf);
5574 CHECKMEM(obj);
5575 json_object_object_add(entry, "peer-address", obj);
5576 }
5577
5578 len = sizeof(addr);
5579 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
5580 isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
5581 obj = json_object_new_string(peerbuf);
5582 CHECKMEM(obj);
5583 json_object_object_add(entry, "local-address", obj);
5584 }
5585
5586 states = json_object_new_array();
5587 CHECKMEM(states);
5588 json_object_object_add(entry, "states", states);
5589
5590 if (sock->listener) {
5591 obj = json_object_new_string("listener");
5592 CHECKMEM(obj);
5593 json_object_array_add(states, obj);
5594 }
5595
5596 if (sock->connected) {
5597 obj = json_object_new_string("connected");
5598 CHECKMEM(obj);
5599 json_object_array_add(states, obj);
5600 }
5601
5602 if (sock->connecting) {
5603 obj = json_object_new_string("connecting");
5604 CHECKMEM(obj);
5605 json_object_array_add(states, obj);
5606 }
5607
5608 if (sock->bound) {
5609 obj = json_object_new_string("bound");
5610 CHECKMEM(obj);
5611 json_object_array_add(states, obj);
5612 }
5613
5614 UNLOCK(&sock->lock);
5615 sock = ISC_LIST_NEXT(sock, link);
5616 }
5617
5618 json_object_object_add(stats, "sockets", array);
5619 array = NULL;
5620 result = ISC_R_SUCCESS;
5621
5622 error:
5623 if (array != NULL) {
5624 json_object_put(array);
5625 }
5626
5627 if (sock != NULL) {
5628 UNLOCK(&sock->lock);
5629 }
5630
5631 UNLOCK(&mgr->lock);
5632
5633 return (result);
5634 }
5635 #endif /* HAVE_JSON_C */
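
/*
 * For reference, a single entry in the "sockets" array built above looks
 * roughly like this (a hand-written sketch, not captured output; the key
 * names come from the json_object_object_add() calls):
 *
 *	{ "id": "0x...", "name": "...", "references": 1, "type": "udp",
 *	  "peer-address": "...", "local-address": "...",
 *	  "states": [ "connected", "bound" ] }
 *
 * "name" is present only when the socket has been named, and
 * "peer-address" only when the socket is connected.
 */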

/*
 * Create a new fdwatch socket managed by 'manager', watching file
 * descriptor 'fd' for the conditions given in 'flags'.  Events will be
 * posted to 'task' and when dispatched 'callback' will be called with
 * 'cbarg' as the arg value.  The new socket is returned in 'socketp'.
 */
isc_result_t
isc_socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
			 isc_sockfdwatch_t callback, void *cbarg,
			 isc_task_t *task, isc_socket_t **socketp)
{
	isc_socket_t *sock = NULL;
	isc__socketthread_t *thread;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);

	if (fd < 0 || (unsigned int)fd >= manager->maxsocks) {
		return (ISC_R_RANGE);
	}

	result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
	if (result != ISC_R_SUCCESS) {
		return (result);
	}

	sock->fd = fd;
	sock->fdwatcharg = cbarg;
	sock->fdwatchcb = callback;
	sock->fdwatchflags = flags;
	sock->fdwatchtask = task;

	sock->threadid = gen_threadid(sock);
	isc_refcount_init(&sock->references, 1);
	thread = &manager->threads[sock->threadid];
	*socketp = (isc_socket_t *)sock;

	/*
	 * Note we don't have to lock the socket like we normally would
	 * because there are no external references to it yet.
	 */

	lockid = FDLOCK_ID(sock->fd);
	LOCK(&thread->fdlock[lockid]);
	thread->fds[sock->fd] = sock;
	thread->fdstate[sock->fd] = MANAGED;

#if defined(USE_EPOLL)
	manager->epoll_events[sock->fd] = 0;
#endif /* if defined(USE_EPOLL) */
#ifdef USE_DEVPOLL
	INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
	       thread->fdpollinfo[sock->fd].want_write == 0);
#endif /* ifdef USE_DEVPOLL */
	UNLOCK(&thread->fdlock[lockid]);

	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	if (thread->maxfd < sock->fd) {
		thread->maxfd = sock->fd;
	}
#endif /* ifdef USE_SELECT */
	UNLOCK(&manager->lock);

	sock->active = 1;
	if ((flags & ISC_SOCKFDWATCH_READ) != 0) {
		select_poke(sock->manager, sock->threadid, sock->fd,
			    SELECT_POKE_READ);
	}
	if ((flags & ISC_SOCKFDWATCH_WRITE) != 0) {
		select_poke(sock->manager, sock->threadid, sock->fd,
			    SELECT_POKE_WRITE);
	}

	socket_log(sock, NULL, CREATION, "fdwatch-created");

	return (ISC_R_SUCCESS);
}
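
/*
 * A minimal usage sketch (hypothetical, not part of this file): register
 * an already-open descriptor 'pipefd' for read readiness.  It assumes a
 * handler matching the isc_sockfdwatch_t shape (task, socket, cbarg,
 * flags); 'mgr' and 'task' are presumed to have been created elsewhere.
 *
 *	static int
 *	my_fd_handler(isc_task_t *task, isc_socket_t *sock, void *cbarg,
 *		      int flags) {
 *		... consume data from the descriptor ...
 *	}
 *
 *	isc_socket_t *watched = NULL;
 *	result = isc_socket_fdwatchcreate(mgr, pipefd,
 *					  ISC_SOCKFDWATCH_READ,
 *					  my_fd_handler, NULL, task,
 *					  &watched);
 */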

/*
 * Indicate to the manager that it should watch the socket again.
 * This can be used to restart watching if the previous event handler
 * didn't indicate there was more data to be processed.  Primarily
 * it is for writing, but it could be used for reading if desired.
 */

isc_result_t
isc_socket_fdwatchpoke(isc_socket_t *sock, int flags)
{
	REQUIRE(VALID_SOCKET(sock));

	/*
	 * We check both flags first so that we take the lock only once,
	 * and only if we need it.
	 */

	if ((flags & (ISC_SOCKFDWATCH_READ | ISC_SOCKFDWATCH_WRITE)) != 0) {
		LOCK(&sock->lock);
		if ((flags & ISC_SOCKFDWATCH_READ) != 0) {
			select_poke(sock->manager, sock->threadid, sock->fd,
				    SELECT_POKE_READ);
		}
		if ((flags & ISC_SOCKFDWATCH_WRITE) != 0) {
			select_poke(sock->manager, sock->threadid, sock->fd,
				    SELECT_POKE_WRITE);
		}
		UNLOCK(&sock->lock);
	}

	socket_log(sock, NULL, TRACE, "fdwatch-poked flags: %d", flags);

	return (ISC_R_SUCCESS);
}

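/*
 * A minimal re-arming sketch (hypothetical): a handler that performed a
 * short write and wants to run again once the descriptor is writable can
 * poke the manager, using the 'watched' socket from the earlier sketch:
 *
 *	(void)isc_socket_fdwatchpoke(watched, ISC_SOCKFDWATCH_WRITE);
 */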