xref: /netbsd-src/external/mpl/dhcp/bind/dist/lib/isc/unix/socket.c (revision 4afad4b7fa6d4a0d3dedf41d1587a7250710ae54)
1 /*	$NetBSD: socket.c,v 1.1 2024/02/18 20:57:57 christos Exp $	*/
2 
3 /*
4  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5  *
6  * SPDX-License-Identifier: MPL-2.0
7  *
8  * This Source Code Form is subject to the terms of the Mozilla Public
9  * License, v. 2.0.  If a copy of the MPL was not distributed with this
10  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11  *
12  * See the COPYRIGHT file distributed with this work for additional
13  * information regarding copyright ownership.
14  */
15 
16 /*! \file */
17 
18 #include <inttypes.h>
19 #include <stdbool.h>
20 #include <sys/param.h>
21 #include <sys/socket.h>
22 #include <sys/stat.h>
23 #include <sys/types.h>
24 #if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__)
25 #include <sys/sysctl.h>
26 #endif /* if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__) */
27 #include <sys/time.h>
28 #include <sys/uio.h>
29 
30 #if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
31 #include <linux/netlink.h>
32 #include <linux/rtnetlink.h>
33 #endif /* if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H) \
34 	*/
35 
36 #include <errno.h>
37 #include <fcntl.h>
38 #include <stddef.h>
39 #include <stdlib.h>
40 #include <unistd.h>
41 
42 #include <isc/app.h>
43 #include <isc/buffer.h>
44 #include <isc/condition.h>
45 #include <isc/formatcheck.h>
46 #include <isc/list.h>
47 #include <isc/log.h>
48 #include <isc/mem.h>
49 #include <isc/mutex.h>
50 #include <isc/net.h>
51 #include <isc/once.h>
52 #include <isc/platform.h>
53 #include <isc/print.h>
54 #include <isc/refcount.h>
55 #include <isc/region.h>
56 #include <isc/resource.h>
57 #include <isc/socket.h>
58 #include <isc/stats.h>
59 #include <isc/strerr.h>
60 #include <isc/string.h>
61 #include <isc/task.h>
62 #include <isc/thread.h>
63 #include <isc/util.h>
64 
65 #ifdef ISC_PLATFORM_HAVESYSUNH
66 #include <sys/un.h>
67 #endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
68 #ifdef HAVE_KQUEUE
69 #include <sys/event.h>
70 #endif /* ifdef HAVE_KQUEUE */
71 #ifdef HAVE_EPOLL_CREATE1
72 #include <sys/epoll.h>
73 #endif /* ifdef HAVE_EPOLL_CREATE1 */
74 #if defined(HAVE_SYS_DEVPOLL_H)
75 #include <sys/devpoll.h>
76 #elif defined(HAVE_DEVPOLL_H)
77 #include <devpoll.h>
78 #endif /* if defined(HAVE_SYS_DEVPOLL_H) */
79 
80 #include <netinet/tcp.h>
81 
82 #include "errno2result.h"
83 
84 #ifdef ENABLE_TCP_FASTOPEN
85 #include <netinet/tcp.h>
86 #endif /* ifdef ENABLE_TCP_FASTOPEN */
87 
88 #ifdef HAVE_JSON_C
89 #include <json_object.h>
90 #endif /* HAVE_JSON_C */
91 
92 #ifdef HAVE_LIBXML2
93 #include <libxml/xmlwriter.h>
94 #define ISC_XMLCHAR (const xmlChar *)
95 #endif /* HAVE_LIBXML2 */
96 
97 /*%
98  * Choose the most preferable multiplex method.
99  */
100 #if defined(HAVE_KQUEUE)
101 #define USE_KQUEUE
102 #elif defined(HAVE_EPOLL_CREATE1)
103 #define USE_EPOLL
104 #elif defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_DEVPOLL_H)
105 #define USE_DEVPOLL
106 typedef struct {
107 	unsigned int want_read : 1, want_write : 1;
108 } pollinfo_t;
109 #else /* if defined(HAVE_KQUEUE) */
110 #define USE_SELECT
111 #endif /* HAVE_KQUEUE */
112 
113 /*
114  * Set by the -T dscp option on the command line. If set to a value
115  * other than -1, we check to make sure DSCP values match it, and
116  * assert if not.
117  */
118 int isc_dscp_check_value = -1;
119 
120 /*%
121  * Maximum number of allowable open sockets.  This is also the maximum
122  * allowable socket file descriptor.
123  *
124  * Care should be taken before modifying this value for select():
125  * The API standard doesn't ensure select() accept more than (the system default
126  * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
127  * the vast majority of cases.  This constant should therefore be increased only
128  * when absolutely necessary and possible, i.e., the server is exhausting all
129  * available file descriptors (up to FD_SETSIZE) and the select() function
130  * and FD_xxx macros support larger values than FD_SETSIZE (which may not
131  * always by true, but we keep using some of them to ensure as much
132  * portability as possible).  Note also that overall server performance
133  * may be rather worsened with a larger value of this constant due to
134  * inherent scalability problems of select().
135  *
136  * As a special note, this value shouldn't have to be touched if
137  * this is a build for an authoritative only DNS server.
138  */
139 #ifndef ISC_SOCKET_MAXSOCKETS
140 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
141 #ifdef TUNE_LARGE
142 #define ISC_SOCKET_MAXSOCKETS 21000
143 #else /* ifdef TUNE_LARGE */
144 #define ISC_SOCKET_MAXSOCKETS 4096
145 #endif /* TUNE_LARGE */
146 #elif defined(USE_SELECT)
147 #define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
148 #endif /* USE_KQUEUE... */
149 #endif /* ISC_SOCKET_MAXSOCKETS */
150 
151 #ifdef USE_SELECT
152 /*%
153  * Mac OS X needs a special definition to support larger values in select().
154  * We always define this because a larger value can be specified run-time.
155  */
156 #ifdef __APPLE__
157 #define _DARWIN_UNLIMITED_SELECT
158 #endif /* __APPLE__ */
159 #endif /* USE_SELECT */
160 
161 #ifdef ISC_SOCKET_USE_POLLWATCH
162 /*%
163  * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
164  * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
165  * some of the specified FD.  The idea is based on the observation that it's
166  * likely for a busy server to keep receiving packets.  It specifically works
167  * as follows: the socket watcher is first initialized with the state of
168  * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
169  * event occurs.  When it wakes up for a socket I/O event, it moves to the
170  * poll_active state, and sets the poll timeout to a short period
171  * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
172  * watcher goes to the poll_checking state with the same timeout period.
173  * In this state, the watcher tries to detect whether this is a break
174  * during intermittent events or the kernel bug is triggered.  If the next
175  * polling reports an event within the short period, the previous timeout is
176  * likely to be a kernel bug, and so the watcher goes back to the active state.
177  * Otherwise, it moves to the idle state again.
178  *
179  * It's not clear whether this is a thread-related bug, but since we've only
180  * seen this with threads, this workaround is used only when enabling threads.
181  */
182 
183 typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;
184 
185 #ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
186 #define ISC_SOCKET_POLLWATCH_TIMEOUT 10
187 #endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */
188 #endif /* ISC_SOCKET_USE_POLLWATCH */
189 
190 /*%
191  * Per-FD lock buckets, we shuffle them around a bit as FDs come in herds.
192  */
193 #define FDLOCK_BITS  10
194 #define FDLOCK_COUNT (1 << FDLOCK_BITS)
195 #define FDLOCK_ID(fd)                                   \
196 	(((fd) % (FDLOCK_COUNT) >> (FDLOCK_BITS / 2)) | \
197 	 (((fd) << (FDLOCK_BITS / 2)) % (FDLOCK_COUNT)))
198 
199 /*%
200  * Maximum number of events communicated with the kernel.  There should normally
201  * be no need for having a large number.
202  */
203 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
204 #ifndef ISC_SOCKET_MAXEVENTS
205 #ifdef TUNE_LARGE
206 #define ISC_SOCKET_MAXEVENTS 2048
207 #else /* ifdef TUNE_LARGE */
208 #define ISC_SOCKET_MAXEVENTS 64
209 #endif /* TUNE_LARGE */
210 #endif /* ifndef ISC_SOCKET_MAXEVENTS */
211 #endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
212 	* */
213 
214 /*%
215  * Some systems define the socket length argument as an int, some as size_t,
216  * some as socklen_t.  This is here so it can be easily changed if needed.
217  */
218 #ifndef socklen_t
219 #define socklen_t unsigned int
220 #endif /* ifndef socklen_t */
221 
222 /*%
223  * Define what the possible "soft" errors can be.  These are non-fatal returns
224  * of various network related functions, like recv() and so on.
225  *
226  * For some reason, BSDI (and perhaps others) will sometimes return <0
227  * from recv() but will have errno==0.  This is broken, but we have to
228  * work around it here.
229  */
230 #define SOFT_ERROR(e)                                             \
231 	((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == ENOBUFS || \
232 	 (e) == EINTR || (e) == 0)
233 
234 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
235 
236 /*!<
237  * DLVL(90)  --  Function entry/exit and other tracing.
238  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
239  * DLVL(60)  --  Socket data send/receive
240  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
241  * DLVL(20)  --  Socket creation/destruction.
242  */
243 #define TRACE_LEVEL	  90
244 #define CORRECTNESS_LEVEL 70
245 #define IOEVENT_LEVEL	  60
246 #define EVENT_LEVEL	  50
247 #define CREATION_LEVEL	  20
248 
249 #define TRACE	    DLVL(TRACE_LEVEL)
250 #define CORRECTNESS DLVL(CORRECTNESS_LEVEL)
251 #define IOEVENT	    DLVL(IOEVENT_LEVEL)
252 #define EVENT	    DLVL(EVENT_LEVEL)
253 #define CREATION    DLVL(CREATION_LEVEL)
254 
255 typedef isc_event_t intev_t;
256 
257 #define SOCKET_MAGIC	ISC_MAGIC('I', 'O', 'i', 'o')
258 #define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC)
259 
260 /*!
261  * IPv6 control information.  If the socket is an IPv6 socket we want
262  * to collect the destination address and interface so the client can
263  * set them on outgoing packets.
264  */
265 #ifndef USE_CMSG
266 #define USE_CMSG 1
267 #endif /* ifndef USE_CMSG */
268 
269 /*%
270  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
271  * a setsockopt() like interface to request timestamps, and if the OS
272  * doesn't do it for us, call gettimeofday() on every UDP receive?
273  */
274 #ifdef SO_TIMESTAMP
275 #ifndef USE_CMSG
276 #define USE_CMSG 1
277 #endif /* ifndef USE_CMSG */
278 #endif /* ifdef SO_TIMESTAMP */
279 
280 #if defined(SO_RCVBUF) && defined(ISC_RECV_BUFFER_SIZE)
281 #define SET_RCVBUF
282 #endif
283 
284 #if defined(SO_SNDBUF) && defined(ISC_SEND_BUFFER_SIZE)
285 #define SET_SNDBUF
286 #endif
287 
288 /*%
289  * Instead of calculating the cmsgbuf lengths every time we take
290  * a rule of thumb approach - sizes are taken from x86_64 linux,
291  * multiplied by 2, everything should fit. Those sizes are not
292  * large enough to cause any concern.
293  */
294 #if defined(USE_CMSG)
295 #define CMSG_SP_IN6PKT 40
296 #else /* if defined(USE_CMSG) */
297 #define CMSG_SP_IN6PKT 0
298 #endif /* if defined(USE_CMSG) */
299 
300 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
301 #define CMSG_SP_TIMESTAMP 32
302 #else /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
303 #define CMSG_SP_TIMESTAMP 0
304 #endif /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
305 
306 #if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS))
307 #define CMSG_SP_TCTOS 24
308 #else /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
309 #define CMSG_SP_TCTOS 0
310 #endif /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
311 
312 #define CMSG_SP_INT 24
313 
314 /* Align cmsg buffers to be safe on SPARC etc. */
315 #define RECVCMSGBUFLEN                                                       \
316 	ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS) + \
317 			  1,                                                 \
318 		  sizeof(void *))
319 #define SENDCMSGBUFLEN                                                    \
320 	ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS) + 1, \
321 		  sizeof(void *))
322 
323 /*%
324  * The number of times a send operation is repeated if the result is EINTR.
325  */
326 #define NRETRIES 10
327 
328 typedef struct isc__socketthread isc__socketthread_t;
329 
330 #define NEWCONNSOCK(ev) ((ev)->newsocket)
331 
/*%
 * Per-socket state.  The first group of fields is set at creation time
 * and is not protected by the socket lock; everything from 'link' down
 * is guarded by sock->lock.
 */
332 struct isc_socket {
333 	/* Not locked. */
334 	unsigned int magic;
335 	isc_socketmgr_t *manager;
336 	isc_mutex_t lock;
337 	isc_sockettype_t type;
338 	const isc_statscounter_t *statsindex;
339 	isc_refcount_t references;
340 
341 	/* Locked by socket lock. */
342 	ISC_LINK(isc_socket_t) link;
	/* 'fd' is the OS descriptor, 'pf' the protocol family; 'threadid'
	 * selects the watcher thread (presumably assigned via
	 * gen_threadid() — confirm at the assignment site). */
343 	int fd;
344 	int pf;
345 	int threadid;
346 	char name[16];
347 	void *tag;
348 
	/* Queues of outstanding I/O operations, one list per event type. */
349 	ISC_LIST(isc_socketevent_t) send_list;
350 	ISC_LIST(isc_socketevent_t) recv_list;
351 	ISC_LIST(isc_socket_newconnev_t) accept_list;
352 	ISC_LIST(isc_socket_connev_t) connect_list;
353 
354 	isc_sockaddr_t peer_address; /* remote address */
355 
356 	unsigned int listener : 1,	       /* listener socket */
357 		connected : 1, connecting : 1, /* connect pending
358 						* */
359 		bound  : 1,		       /* bound to local addr */
360 		dupped : 1, active : 1,	       /* currently active */
361 		pktdscp : 1;		       /* per packet dscp */
362 
363 #ifdef ISC_PLATFORM_RECVOVERFLOW
364 	unsigned char overflow; /* used for MSG_TRUNC fake */
365 #endif				/* ifdef ISC_PLATFORM_RECVOVERFLOW */
366 
	/* fd-watch support: user callback, its argument and flags, and
	 * the associated task (see internal_fdwatch_read() /
	 * internal_fdwatch_write()). */
367 	void			*fdwatcharg;
368 	isc_sockfdwatch_t	fdwatchcb;
369 	int			fdwatchflags;
370 	isc_task_t              *fdwatchtask;
371 	unsigned int		dscp;
372 };
373 
374 #define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
375 #define VALID_MANAGER(m)     ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
376 
/*%
 * The socket manager: owns the watcher threads, the list of all managed
 * sockets, and the statistics set shared by them.
 */
377 struct isc_socketmgr {
378 	/* Not locked. */
379 	unsigned int magic;
380 	isc_mem_t *mctx;
381 	isc_mutex_t lock;
382 	isc_stats_t *stats;
	/* 'threads' is an array of 'nthreads' watcher threads; sockets
	 * are distributed over them by fd (see gen_threadid()). */
383 	int nthreads;
384 	isc__socketthread_t *threads;
	/* Upper bound on fd values/open sockets; fds are INSISTed to be
	 * below this (see wakeup_socket()). */
385 	unsigned int maxsocks;
386 	/* Locked by manager lock. */
387 	ISC_LIST(isc_socket_t) socklist;
388 	int reserved; /* unlocked */
389 	isc_condition_t shutdown_ok;
390 	size_t maxudp;
391 };
392 
/*%
 * Per watcher-thread state.  Each thread owns a control pipe used by
 * select_poke()/select_readmsg(), an fd->socket map with per-fd state,
 * and the private data of whichever multiplexing backend is compiled in.
 */
393 struct isc__socketthread {
394 	isc_socketmgr_t *manager;
395 	int threadid;
396 	isc_thread_t thread;
	/* Control pipe: [0] is read by the watcher, [1] is written by
	 * select_poke() to deliver (fd, msg) wakeups. */
397 	int pipe_fds[2];
	/* Array of bucket locks; an fd maps to a bucket via FDLOCK_ID(). */
398 	isc_mutex_t *fdlock;
399 	/* Locked by fdlock. */
400 	isc_socket_t **fds;
	/* Per-fd lifecycle state: CLOSED, MANAGED or CLOSE_PENDING. */
401 	int *fdstate;
402 #ifdef USE_KQUEUE
403 	int kqueue_fd;
404 	int nevents;
405 	struct kevent *events;
406 #endif /* USE_KQUEUE */
407 #ifdef USE_EPOLL
408 	int epoll_fd;
409 	int nevents;
410 	struct epoll_event *events;
	/* Shadow copy of the event mask registered with epoll for each
	 * fd (maintained by watch_fd()/unwatch_fd()). */
411 	uint32_t *epoll_events;
412 #endif /* USE_EPOLL */
413 #ifdef USE_DEVPOLL
414 	int devpoll_fd;
415 	isc_resourcevalue_t open_max;
416 	unsigned int calls;
417 	int nevents;
418 	struct pollfd *events;
	/* Which conditions each fd is currently polled for; needed to
	 * re-arm the other condition after a POLLREMOVE. */
419 	pollinfo_t *fdpollinfo;
420 #endif /* USE_DEVPOLL */
421 #ifdef USE_SELECT
422 	int fd_bufsize;
423 	fd_set *read_fds;
424 	fd_set *read_fds_copy;
425 	fd_set *write_fds;
426 	fd_set *write_fds_copy;
427 	int maxfd;
428 #endif /* USE_SELECT */
429 };
430 
431 #define CLOSED	      0 /* this one must be zero */
432 #define MANAGED	      1
433 #define CLOSE_PENDING 2
434 
435 /*
436  * send() and recv() iovec counts
437  */
438 #define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER)
439 #ifdef ISC_PLATFORM_RECVOVERFLOW
440 #define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1)
441 #else /* ifdef ISC_PLATFORM_RECVOVERFLOW */
442 #define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER)
443 #endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
444 
445 static isc_result_t
446 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
447 	      isc_socket_t **socketp, isc_socket_t *dup_socket);
448 static void
449 send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
450 static void
451 send_senddone_event(isc_socket_t *, isc_socketevent_t **);
452 static void
453 send_connectdone_event(isc_socket_t *, isc_socket_connev_t **);
454 static void
455 free_socket(isc_socket_t **);
456 static isc_result_t
457 allocate_socket(isc_socketmgr_t *, isc_sockettype_t, isc_socket_t **);
458 static void
459 destroy(isc_socket_t **);
460 static void
461 internal_accept(isc_socket_t *);
462 static void
463 internal_connect(isc_socket_t *);
464 static void
465 internal_recv(isc_socket_t *);
466 static void
467 internal_send(isc_socket_t *);
468 static void
469 process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
470 static void
471 build_msghdr_send(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
472 		  struct iovec *, size_t *);
473 static void
474 build_msghdr_recv(isc_socket_t *, char *, isc_socketevent_t *, struct msghdr *,
475 		  struct iovec *, size_t *);
476 static bool
477 process_ctlfd(isc__socketthread_t *thread);
478 static void
479 setdscp(isc_socket_t *sock, isc_dscp_t dscp);
480 static void
481 dispatch_recv(isc_socket_t *sock);
482 static void
483 dispatch_send(isc_socket_t *sock);
484 static void
485 internal_fdwatch_read(isc_socket_t *sock);
486 static void
487 internal_fdwatch_write(isc_socket_t *sock);
488 
489 #define SELECT_POKE_SHUTDOWN (-1)
490 #define SELECT_POKE_NOTHING  (-2)
491 #define SELECT_POKE_READ     (-3)
492 #define SELECT_POKE_ACCEPT   (-3) /*%< Same as _READ */
493 #define SELECT_POKE_WRITE    (-4)
494 #define SELECT_POKE_CONNECT  (-4) /*%< Same as _WRITE */
495 #define SELECT_POKE_CLOSE    (-5)
496 
497 /*%
498  * Shortcut index arrays to get access to statistics counters.
499  */
/*
 * Positions of the individual counters within the *statsindex[] arrays
 * below; used as the index argument to inc_stats()/dec_stats().  A slot
 * holding -1 marks an operation that does not apply to that socket type
 * (inc_stats()/dec_stats() REQUIRE the counter id is not -1, so callers
 * must never use those slots).
 */
500 enum {
501 	STATID_OPEN = 0,
502 	STATID_OPENFAIL = 1,
503 	STATID_CLOSE = 2,
504 	STATID_BINDFAIL = 3,
505 	STATID_CONNECTFAIL = 4,
506 	STATID_CONNECT = 5,
507 	STATID_ACCEPTFAIL = 6,
508 	STATID_ACCEPT = 7,
509 	STATID_SENDFAIL = 8,
510 	STATID_RECVFAIL = 9,
511 	STATID_ACTIVE = 10
512 };
/* UDP sockets have no accept operation, so those slots are -1. */
513 static const isc_statscounter_t udp4statsindex[] = {
514 	isc_sockstatscounter_udp4open,
515 	isc_sockstatscounter_udp4openfail,
516 	isc_sockstatscounter_udp4close,
517 	isc_sockstatscounter_udp4bindfail,
518 	isc_sockstatscounter_udp4connectfail,
519 	isc_sockstatscounter_udp4connect,
520 	-1,
521 	-1,
522 	isc_sockstatscounter_udp4sendfail,
523 	isc_sockstatscounter_udp4recvfail,
524 	isc_sockstatscounter_udp4active
525 };
526 static const isc_statscounter_t udp6statsindex[] = {
527 	isc_sockstatscounter_udp6open,
528 	isc_sockstatscounter_udp6openfail,
529 	isc_sockstatscounter_udp6close,
530 	isc_sockstatscounter_udp6bindfail,
531 	isc_sockstatscounter_udp6connectfail,
532 	isc_sockstatscounter_udp6connect,
533 	-1,
534 	-1,
535 	isc_sockstatscounter_udp6sendfail,
536 	isc_sockstatscounter_udp6recvfail,
537 	isc_sockstatscounter_udp6active
538 };
539 static const isc_statscounter_t tcp4statsindex[] = {
540 	isc_sockstatscounter_tcp4open,	      isc_sockstatscounter_tcp4openfail,
541 	isc_sockstatscounter_tcp4close,	      isc_sockstatscounter_tcp4bindfail,
542 	isc_sockstatscounter_tcp4connectfail, isc_sockstatscounter_tcp4connect,
543 	isc_sockstatscounter_tcp4acceptfail,  isc_sockstatscounter_tcp4accept,
544 	isc_sockstatscounter_tcp4sendfail,    isc_sockstatscounter_tcp4recvfail,
545 	isc_sockstatscounter_tcp4active
546 };
547 static const isc_statscounter_t tcp6statsindex[] = {
548 	isc_sockstatscounter_tcp6open,	      isc_sockstatscounter_tcp6openfail,
549 	isc_sockstatscounter_tcp6close,	      isc_sockstatscounter_tcp6bindfail,
550 	isc_sockstatscounter_tcp6connectfail, isc_sockstatscounter_tcp6connect,
551 	isc_sockstatscounter_tcp6acceptfail,  isc_sockstatscounter_tcp6accept,
552 	isc_sockstatscounter_tcp6sendfail,    isc_sockstatscounter_tcp6recvfail,
553 	isc_sockstatscounter_tcp6active
554 };
555 static const isc_statscounter_t unixstatsindex[] = {
556 	isc_sockstatscounter_unixopen,	      isc_sockstatscounter_unixopenfail,
557 	isc_sockstatscounter_unixclose,	      isc_sockstatscounter_unixbindfail,
558 	isc_sockstatscounter_unixconnectfail, isc_sockstatscounter_unixconnect,
559 	isc_sockstatscounter_unixacceptfail,  isc_sockstatscounter_unixaccept,
560 	isc_sockstatscounter_unixsendfail,    isc_sockstatscounter_unixrecvfail,
561 	isc_sockstatscounter_unixactive
562 };
/* Raw sockets: only open/close/recvfail/active are counted. */
563 static const isc_statscounter_t rawstatsindex[] = {
564 	isc_sockstatscounter_rawopen,
565 	isc_sockstatscounter_rawopenfail,
566 	isc_sockstatscounter_rawclose,
567 	-1,
568 	-1,
569 	-1,
570 	-1,
571 	-1,
572 	-1,
573 	isc_sockstatscounter_rawrecvfail,
574 	isc_sockstatscounter_rawactive
575 };
576 
577 static int
578 gen_threadid(isc_socket_t *sock);
579 
580 static int
gen_threadid(isc_socket_t * sock)581 gen_threadid(isc_socket_t *sock) {
582 	return (sock->fd % sock->manager->nthreads);
583 }
584 
585 static void
586 manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
587 	    isc_logmodule_t *module, int level, const char *fmt, ...)
588 	ISC_FORMAT_PRINTF(5, 6);
589 static void
manager_log(isc_socketmgr_t * sockmgr,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)590 manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
591 	    isc_logmodule_t *module, int level, const char *fmt, ...) {
592 	char msgbuf[2048];
593 	va_list ap;
594 
595 	if (!isc_log_wouldlog(isc_lctx, level)) {
596 		return;
597 	}
598 
599 	va_start(ap, fmt);
600 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
601 	va_end(ap);
602 
603 	isc_log_write(isc_lctx, category, module, level, "sockmgr %p: %s",
604 		      sockmgr, msgbuf);
605 }
606 
607 static void
608 thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
609 	   isc_logmodule_t *module, int level, const char *fmt, ...)
610 	ISC_FORMAT_PRINTF(5, 6);
611 static void
thread_log(isc__socketthread_t * thread,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)612 thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
613 	   isc_logmodule_t *module, int level, const char *fmt, ...) {
614 	char msgbuf[2048];
615 	va_list ap;
616 
617 	if (!isc_log_wouldlog(isc_lctx, level)) {
618 		return;
619 	}
620 
621 	va_start(ap, fmt);
622 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
623 	va_end(ap);
624 
625 	isc_log_write(isc_lctx, category, module, level,
626 		      "sockmgr %p thread %d: %s", thread->manager,
627 		      thread->threadid, msgbuf);
628 }
629 
630 static void
631 socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
632 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
633 	   const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);
634 static void
socket_log(isc_socket_t * sock,const isc_sockaddr_t * address,isc_logcategory_t * category,isc_logmodule_t * module,int level,const char * fmt,...)635 socket_log(isc_socket_t *sock, const isc_sockaddr_t *address,
636 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
637 	   const char *fmt, ...) {
638 	char msgbuf[2048];
639 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
640 	va_list ap;
641 
642 	if (!isc_log_wouldlog(isc_lctx, level)) {
643 		return;
644 	}
645 
646 	va_start(ap, fmt);
647 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
648 	va_end(ap);
649 
650 	if (address == NULL) {
651 		isc_log_write(isc_lctx, category, module, level,
652 			      "socket %p: %s", sock, msgbuf);
653 	} else {
654 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
655 		isc_log_write(isc_lctx, category, module, level,
656 			      "socket %p %s: %s", sock, peerbuf, msgbuf);
657 	}
658 }
659 
660 /*%
661  * Increment socket-related statistics counters.
662  */
663 static void
inc_stats(isc_stats_t * stats,isc_statscounter_t counterid)664 inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
665 	REQUIRE(counterid != -1);
666 
667 	if (stats != NULL) {
668 		isc_stats_increment(stats, counterid);
669 	}
670 }
671 
672 /*%
673  * Decrement socket-related statistics counters.
674  */
675 static void
dec_stats(isc_stats_t * stats,isc_statscounter_t counterid)676 dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
677 	REQUIRE(counterid != -1);
678 
679 	if (stats != NULL) {
680 		isc_stats_decrement(stats, counterid);
681 	}
682 }
683 
/*
 * Start watching 'fd' for the condition encoded in 'msg': readability
 * for SELECT_POKE_READ, writability otherwise.  Exactly one backend
 * branch below is compiled in (kqueue, epoll, /dev/poll or select).
 * Returns ISC_R_SUCCESS, or an errno-derived result if the kernel
 * rejected the registration.
 */
684 static isc_result_t
watch_fd(isc__socketthread_t * thread,int fd,int msg)685 watch_fd(isc__socketthread_t *thread, int fd, int msg) {
686 	isc_result_t result = ISC_R_SUCCESS;
687 
688 #ifdef USE_KQUEUE
689 	struct kevent evchange;
690 
691 	memset(&evchange, 0, sizeof(evchange));
692 	if (msg == SELECT_POKE_READ) {
693 		evchange.filter = EVFILT_READ;
694 	} else {
695 		evchange.filter = EVFILT_WRITE;
696 	}
697 	evchange.flags = EV_ADD;
698 	evchange.ident = fd;
699 	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
700 		result = isc__errno2result(errno);
701 	}
702 
703 	return (result);
704 #elif defined(USE_EPOLL)
705 	struct epoll_event event;
706 	uint32_t oldevents;
707 	int ret;
708 	int op;
709 
710 	oldevents = thread->epoll_events[fd];
711 	if (msg == SELECT_POKE_READ) {
712 		thread->epoll_events[fd] |= EPOLLIN;
713 	} else {
714 		thread->epoll_events[fd] |= EPOLLOUT;
715 	}
716 
717 	event.events = thread->epoll_events[fd];
718 	memset(&event.data, 0, sizeof(event.data));
719 	event.data.fd = fd;
720 
	/* First interest in this fd registers it; later changes modify. */
721 	op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
	/* Hold the owning socket's lock (if any) around epoll_ctl(). */
722 	if (thread->fds[fd] != NULL) {
723 		LOCK(&thread->fds[fd]->lock);
724 	}
725 	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
726 	if (thread->fds[fd] != NULL) {
727 		UNLOCK(&thread->fds[fd]->lock);
728 	}
729 	if (ret == -1) {
730 		if (errno == EEXIST) {
731 			UNEXPECTED_ERROR(__FILE__, __LINE__,
732 					 "epoll_ctl(ADD/MOD) returned "
733 					 "EEXIST for fd %d",
734 					 fd);
735 		}
736 		result = isc__errno2result(errno);
737 	}
738 
739 	return (result);
740 #elif defined(USE_DEVPOLL)
741 	struct pollfd pfd;
742 
743 	memset(&pfd, 0, sizeof(pfd));
744 	if (msg == SELECT_POKE_READ) {
745 		pfd.events = POLLIN;
746 	} else {
747 		pfd.events = POLLOUT;
748 	}
749 	pfd.fd = fd;
750 	pfd.revents = 0;
751 	if (write(thread->devpoll_fd, &pfd, sizeof(pfd)) == -1) {
752 		result = isc__errno2result(errno);
753 	} else {
		/* Remember the armed condition so unwatch_fd() can
		 * re-arm the other one after a POLLREMOVE. */
754 		if (msg == SELECT_POKE_READ) {
755 			thread->fdpollinfo[fd].want_read = 1;
756 		} else {
757 			thread->fdpollinfo[fd].want_write = 1;
758 		}
759 	}
760 
761 	return (result);
762 #elif defined(USE_SELECT)
763 	LOCK(&thread->manager->lock);
764 	if (msg == SELECT_POKE_READ) {
765 		FD_SET(fd, thread->read_fds);
766 	}
767 	if (msg == SELECT_POKE_WRITE) {
768 		FD_SET(fd, thread->write_fds);
769 	}
770 	UNLOCK(&thread->manager->lock);
771 
772 	return (result);
773 #endif /* ifdef USE_KQUEUE */
774 }
775 
/*
 * Stop watching 'fd' for the condition encoded in 'msg' (the inverse of
 * watch_fd()).  Backend-specific; returns ISC_R_SUCCESS, or an error
 * result if deregistration failed.
 */
776 static isc_result_t
unwatch_fd(isc__socketthread_t * thread,int fd,int msg)777 unwatch_fd(isc__socketthread_t *thread, int fd, int msg) {
778 	isc_result_t result = ISC_R_SUCCESS;
779 
780 #ifdef USE_KQUEUE
781 	struct kevent evchange;
782 
783 	memset(&evchange, 0, sizeof(evchange));
784 	if (msg == SELECT_POKE_READ) {
785 		evchange.filter = EVFILT_READ;
786 	} else {
787 		evchange.filter = EVFILT_WRITE;
788 	}
789 	evchange.flags = EV_DELETE;
790 	evchange.ident = fd;
791 	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
792 		result = isc__errno2result(errno);
793 	}
794 
795 	return (result);
796 #elif defined(USE_EPOLL)
797 	struct epoll_event event;
798 	int ret;
799 	int op;
800 
801 	if (msg == SELECT_POKE_READ) {
802 		thread->epoll_events[fd] &= ~(EPOLLIN);
803 	} else {
804 		thread->epoll_events[fd] &= ~(EPOLLOUT);
805 	}
806 
807 	event.events = thread->epoll_events[fd];
808 	memset(&event.data, 0, sizeof(event.data));
809 	event.data.fd = fd;
810 
	/* Dropping the last remaining condition deregisters the fd. */
811 	op = (event.events == 0U) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
812 	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
	/* ENOENT (already gone) is tolerated; anything else is logged. */
813 	if (ret == -1 && errno != ENOENT) {
814 		char strbuf[ISC_STRERRORSIZE];
815 		strerror_r(errno, strbuf, sizeof(strbuf));
816 		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL), %d: %s",
817 				 fd, strbuf);
818 		result = ISC_R_UNEXPECTED;
819 	}
820 	return (result);
821 #elif defined(USE_DEVPOLL)
822 	struct pollfd pfds[2];
823 	size_t writelen = sizeof(pfds[0]);
824 
825 	memset(pfds, 0, sizeof(pfds));
826 	pfds[0].events = POLLREMOVE;
827 	pfds[0].fd = fd;
828 
829 	/*
830 	 * Canceling read or write polling via /dev/poll is tricky.  Since it
831 	 * only provides a way of canceling per FD, we may need to re-poll the
832 	 * socket for the other operation.
833 	 */
834 	if (msg == SELECT_POKE_READ && thread->fdpollinfo[fd].want_write == 1) {
835 		pfds[1].events = POLLOUT;
836 		pfds[1].fd = fd;
837 		writelen += sizeof(pfds[1]);
838 	}
839 	if (msg == SELECT_POKE_WRITE && thread->fdpollinfo[fd].want_read == 1) {
840 		pfds[1].events = POLLIN;
841 		pfds[1].fd = fd;
842 		writelen += sizeof(pfds[1]);
843 	}
844 
845 	if (write(thread->devpoll_fd, pfds, writelen) == -1) {
846 		result = isc__errno2result(errno);
847 	} else {
848 		if (msg == SELECT_POKE_READ) {
849 			thread->fdpollinfo[fd].want_read = 0;
850 		} else {
851 			thread->fdpollinfo[fd].want_write = 0;
852 		}
853 	}
854 
855 	return (result);
856 #elif defined(USE_SELECT)
857 	LOCK(&thread->manager->lock);
858 	if (msg == SELECT_POKE_READ) {
859 		FD_CLR(fd, thread->read_fds);
860 	} else if (msg == SELECT_POKE_WRITE) {
861 		FD_CLR(fd, thread->write_fds);
862 	}
863 	UNLOCK(&thread->manager->lock);
864 
865 	return (result);
866 #endif /* ifdef USE_KQUEUE */
867 }
868 
/*
 * Handle one poke message received over the watcher's control pipe:
 * either finish closing 'fd' (SELECT_POKE_CLOSE), or start watching it
 * for the condition encoded in 'msg'.
 */
873 static void
wakeup_socket(isc__socketthread_t * thread,int fd,int msg)874 wakeup_socket(isc__socketthread_t *thread, int fd, int msg) {
875 	isc_result_t result;
876 	int lockid = FDLOCK_ID(fd);
877 
878 	/*
879 	 * This is a wakeup on a socket.  If the socket is not in the
880 	 * process of being closed, start watching it for either reads
881 	 * or writes.
882 	 */
883 
884 	INSIST(fd >= 0 && fd < (int)thread->manager->maxsocks);
885 
886 	if (msg == SELECT_POKE_CLOSE) {
		/* Complete a pending close: mark the slot CLOSED, drop
		 * both watch conditions, and close the descriptor. */
887 		LOCK(&thread->fdlock[lockid]);
888 		INSIST(thread->fdstate[fd] == CLOSE_PENDING);
889 		thread->fdstate[fd] = CLOSED;
890 		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
891 		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
892 		(void)close(fd);
893 		UNLOCK(&thread->fdlock[lockid]);
894 		return;
895 	}
896 
897 	LOCK(&thread->fdlock[lockid]);
898 	if (thread->fdstate[fd] == CLOSE_PENDING) {
899 		/*
900 		 * We accept (and ignore) any error from unwatch_fd() as we are
901 		 * closing the socket, hoping it doesn't leave dangling state in
902 		 * the kernel.
903 		 * Note that unwatch_fd() must be called after releasing the
904 		 * fdlock; otherwise it could cause deadlock due to a lock order
905 		 * reversal.
906 		 */
		/* NOTE(review): contrary to the comment above, unwatch_fd()
		 * is invoked below while the fdlock is still held — confirm
		 * which of the two is intended. */
907 		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
908 		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
909 		UNLOCK(&thread->fdlock[lockid]);
910 		return;
911 	}
	/* Only fds in the MANAGED state may be (re)watched. */
912 	if (thread->fdstate[fd] != MANAGED) {
913 		UNLOCK(&thread->fdlock[lockid]);
914 		return;
915 	}
916 
917 	/*
918 	 * Set requested bit.
919 	 */
920 	result = watch_fd(thread, fd, msg);
921 	if (result != ISC_R_SUCCESS) {
922 		/*
923 		 * XXXJT: what should we do?  Ignoring the failure of watching
924 		 * a socket will make the application dysfunctional, but there
925 		 * seems to be no reasonable recovery process.
926 		 */
927 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
928 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
929 			      "failed to start watching FD (%d): %s", fd,
930 			      isc_result_totext(result));
931 	}
932 	UNLOCK(&thread->fdlock[lockid]);
933 }
934 
935 /*
936  * Poke the select loop when there is something for us to do.
937  * The write is required (by POSIX) to complete.  That is, we
938  * will not get partial writes.
939  */
940 static void
select_poke(isc_socketmgr_t * mgr,int threadid,int fd,int msg)941 select_poke(isc_socketmgr_t *mgr, int threadid, int fd, int msg) {
942 	int cc;
943 	int buf[2];
944 	char strbuf[ISC_STRERRORSIZE];
945 
946 	buf[0] = fd;
947 	buf[1] = msg;
948 
949 	do {
950 		cc = write(mgr->threads[threadid].pipe_fds[1], buf,
951 			   sizeof(buf));
952 #ifdef ENOSR
953 		/*
954 		 * Treat ENOSR as EAGAIN but loop slowly as it is
955 		 * unlikely to clear fast.
956 		 */
957 		if (cc < 0 && errno == ENOSR) {
958 			sleep(1);
959 			errno = EAGAIN;
960 		}
961 #endif /* ifdef ENOSR */
962 	} while (cc < 0 && SOFT_ERROR(errno));
963 
964 	if (cc < 0) {
965 		strerror_r(errno, strbuf, sizeof(strbuf));
966 		FATAL_ERROR(__FILE__, __LINE__,
967 			    "write() failed during watcher poke: %s", strbuf);
968 	}
969 
970 	INSIST(cc == sizeof(buf));
971 }
972 
973 /*
974  * Read a message on the internal fd.
975  */
976 static void
select_readmsg(isc__socketthread_t * thread,int * fd,int * msg)977 select_readmsg(isc__socketthread_t *thread, int *fd, int *msg) {
978 	int buf[2];
979 	int cc;
980 	char strbuf[ISC_STRERRORSIZE];
981 
982 	cc = read(thread->pipe_fds[0], buf, sizeof(buf));
983 	if (cc < 0) {
984 		*msg = SELECT_POKE_NOTHING;
985 		*fd = -1; /* Silence compiler. */
986 		if (SOFT_ERROR(errno)) {
987 			return;
988 		}
989 
990 		strerror_r(errno, strbuf, sizeof(strbuf));
991 		FATAL_ERROR(__FILE__, __LINE__,
992 			    "read() failed during watcher poke: %s", strbuf);
993 	}
994 	INSIST(cc == sizeof(buf));
995 
996 	*fd = buf[0];
997 	*msg = buf[1];
998 }
999 
1000 /*
1001  * Make a fd non-blocking.
1002  */
1003 static isc_result_t
make_nonblock(int fd)1004 make_nonblock(int fd) {
1005 	int ret;
1006 	char strbuf[ISC_STRERRORSIZE];
1007 #ifdef USE_FIONBIO_IOCTL
1008 	int on = 1;
1009 #else  /* ifdef USE_FIONBIO_IOCTL */
1010 	int flags;
1011 #endif /* ifdef USE_FIONBIO_IOCTL */
1012 
1013 #ifdef USE_FIONBIO_IOCTL
1014 	ret = ioctl(fd, FIONBIO, (char *)&on);
1015 #else  /* ifdef USE_FIONBIO_IOCTL */
1016 	flags = fcntl(fd, F_GETFL, 0);
1017 	flags |= PORT_NONBLOCK;
1018 	ret = fcntl(fd, F_SETFL, flags);
1019 #endif /* ifdef USE_FIONBIO_IOCTL */
1020 
1021 	if (ret == -1) {
1022 		strerror_r(errno, strbuf, sizeof(strbuf));
1023 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1024 #ifdef USE_FIONBIO_IOCTL
1025 				 "ioctl(%d, FIONBIO, &on): %s", fd,
1026 #else  /* ifdef USE_FIONBIO_IOCTL */
1027 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
1028 #endif /* ifdef USE_FIONBIO_IOCTL */
1029 				 strbuf);
1030 
1031 		return (ISC_R_UNEXPECTED);
1032 	}
1033 
1034 	return (ISC_R_SUCCESS);
1035 }
1036 
#ifdef USE_CMSG
/*
 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
 * In order to ensure as much portability as possible, we provide wrapper
 * functions of these macros.
 * Note that cmsg_space() could run slow on OSes that do not have
 * CMSG_SPACE.
 */

/*
 * Return the number of bytes a control message with 'len' bytes of
 * payload occupies, including the cmsghdr itself (i.e. CMSG_LEN(len)).
 */
static socklen_t
cmsg_len(socklen_t len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else  /* ifdef CMSG_LEN */
	socklen_t hdrlen;

	/*
	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
	 * is correct.
	 */
	hdrlen = (socklen_t)CMSG_DATA(((struct cmsghdr *)NULL));
	return (hdrlen + len);
#endif /* ifdef CMSG_LEN */
}

/*
 * Return the total buffer space consumed by a control message with
 * 'len' bytes of payload, including alignment padding to the next
 * cmsghdr (i.e. CMSG_SPACE(len)).  The fallback computes it by laying
 * a fake message into a scratch buffer and asking CMSG_NXTHDR where
 * the next header would start; returns 0 if that fails.
 */
static socklen_t
cmsg_space(socklen_t len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else  /* ifdef CMSG_SPACE */
	struct msghdr msg;
	struct cmsghdr *cmsgp;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char dummybuf[sizeof(struct cmsghdr) + 1024];

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = dummybuf;
	msg.msg_controllen = sizeof(dummybuf);

	cmsgp = (struct cmsghdr *)dummybuf;
	cmsgp->cmsg_len = cmsg_len(len);

	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
	if (cmsgp != NULL) {
		return ((char *)cmsgp - (char *)msg.msg_control);
	} else {
		return (0);
	}
#endif /* ifdef CMSG_SPACE */
}
#endif /* USE_CMSG */
1090 
/*
 * Process control messages received on a socket.
 *
 * Walks the ancillary data attached to 'msg' (and its msg_flags) and
 * records anything interesting on the completion event 'dev':
 *   - MSG_TRUNC / MSG_CTRUNC truncation flags;
 *   - IPv6 packet info (receiving interface, multicast destination);
 *   - kernel receive timestamp (SO_TIMESTAMP);
 *   - DSCP value from IPV6_TCLASS or IP_TOS/IP_RECVTOS.
 */
static void
process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
#ifdef USE_CMSG
	struct cmsghdr *cmsgp;
	struct in6_pktinfo *pktinfop;
#ifdef SO_TIMESTAMP
	void *timevalp;
#endif /* ifdef SO_TIMESTAMP */
#endif /* ifdef USE_CMSG */

	/*
	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
	 * They are all here, outside of the CPP tests, because it is
	 * more consistent with the usual ISC coding style.
	 */
	UNUSED(sock);
	UNUSED(msg);
	UNUSED(dev);

	/* Datagram was larger than the receive buffer. */
#ifdef MSG_TRUNC
	if ((msg->msg_flags & MSG_TRUNC) != 0) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
	}
#endif /* ifdef MSG_TRUNC */

	/* Control data was larger than the control buffer. */
#ifdef MSG_CTRUNC
	if ((msg->msg_flags & MSG_CTRUNC) != 0) {
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
	}
#endif /* ifdef MSG_CTRUNC */

#ifndef USE_CMSG
	return;
#else /* ifndef USE_CMSG */
	if (msg->msg_controllen == 0U || msg->msg_control == NULL) {
		return;
	}

#ifdef SO_TIMESTAMP
	timevalp = NULL;
#endif /* ifdef SO_TIMESTAMP */
	pktinfop = NULL;

	/* Iterate the control messages with the standard CMSG_* macros. */
	cmsgp = CMSG_FIRSTHDR(msg);
	while (cmsgp != NULL) {
		socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp);

		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
		    cmsgp->cmsg_type == IPV6_PKTINFO)
		{
			/* Copy out: CMSG_DATA may be unaligned. */
			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
			memmove(&dev->pktinfo, pktinfop,
				sizeof(struct in6_pktinfo));
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
			socket_log(sock, NULL, TRACE,
				   "interface received on ifindex %u",
				   dev->pktinfo.ipi6_ifindex);
			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) {
				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
			}
			goto next;
		}

#ifdef SO_TIMESTAMP
		if (cmsgp->cmsg_level == SOL_SOCKET &&
		    cmsgp->cmsg_type == SCM_TIMESTAMP)
		{
			struct timeval tv;
			timevalp = CMSG_DATA(cmsgp);
			memmove(&tv, timevalp, sizeof(tv));
			dev->timestamp.seconds = tv.tv_sec;
			dev->timestamp.nanoseconds = tv.tv_usec * 1000;
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			goto next;
		}
#endif /* ifdef SO_TIMESTAMP */

#ifdef IPV6_TCLASS
		if (cmsgp->cmsg_level == IPPROTO_IPV6 &&
		    cmsgp->cmsg_type == IPV6_TCLASS)
		{
			/* DSCP is the upper six bits of the traffic class. */
			dev->dscp = *(int *)CMSG_DATA(cmsgp);
			dev->dscp >>= 2;
			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
			goto next;
		}
#endif /* ifdef IPV6_TCLASS */

#ifdef IP_TOS
		if (cmsgp->cmsg_level == IPPROTO_IP &&
		    (cmsgp->cmsg_type == IP_TOS
#ifdef IP_RECVTOS
		     || cmsgp->cmsg_type == IP_RECVTOS
#endif /* ifdef IP_RECVTOS */
		     ))
		{
			/* DSCP is the upper six bits of the TOS octet. */
			dev->dscp = (int)*(unsigned char *)CMSG_DATA(cmsgp);
			dev->dscp >>= 2;
			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
			goto next;
		}
#endif /* ifdef IP_TOS */
	next:
		cmsgp = CMSG_NXTHDR(msg, cmsgp);
	}
#endif /* USE_CMSG */
}
1202 
/*
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the SEND constructor, which will use the used region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
 *
 * Ancillary data (cmsgbuf, of size SENDCMSGBUFLEN) is appended
 * sequentially: IPV6_PKTINFO, then IPV6_USE_MIN_MTU, then the DSCP
 * cmsg; each section advances msg->msg_controllen past the previous
 * one, so the order of the sections below must not be changed.
 */
static void
build_msghdr_send(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp) {
	unsigned int iovcount;
	size_t write_count;
	struct cmsghdr *cmsgp;

	memset(msg, 0, sizeof(*msg));

	/* A connected socket must not pass a destination address. */
	if (!sock->connected) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
	}

	/* Send the portion of the region not yet written (dev->n bytes done). */
	write_count = dev->region.length - dev->n;
	iov[0].iov_base = (void *)(dev->region.base + dev->n);
	iov[0].iov_len = write_count;
	iovcount = 1;

	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
#if defined(USE_CMSG)

	/* Section 1: IPv6 packet info (selects the sending interface). */
	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
	{
		struct in6_pktinfo *pktinfop;

		socket_log(sock, NULL, TRACE, "sendto pktinfo data, ifindex %u",
			   dev->pktinfo.ipi6_ifindex);

		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp = (struct cmsghdr *)cmsgbuf;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
	}

	/* Section 2: ask the kernel to use the IPv6 minimum MTU. */
#if defined(IPV6_USE_MIN_MTU)
	if ((sock->type == isc_sockettype_udp) && (sock->pf == AF_INET6) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
	{
		int use_min_mtu = 1; /* -1, 0, 1 */

		cmsgp = (struct cmsghdr *)(cmsgbuf + msg->msg_controllen);
		msg->msg_control = (void *)cmsgbuf;
		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
	}
#endif /* if defined(IPV6_USE_MIN_MTU) */

	/* Debug aid: verify the DSCP matches the globally expected value. */
	if (isc_dscp_check_value > -1) {
		if (sock->type == isc_sockettype_udp) {
			INSIST((int)dev->dscp == isc_dscp_check_value);
		} else if (sock->type == isc_sockettype_tcp) {
			INSIST((int)sock->dscp == isc_dscp_check_value);
		}
	}

	/*
	 * Section 3: set the DSCP, either per-packet via a cmsg (when the
	 * socket supports it, sock->pktdscp) or per-socket via setsockopt().
	 */
#if defined(IP_TOS) || (defined(IPPROTO_IPV6) && defined(IPV6_TCLASS))
	if ((sock->type == isc_sockettype_udp) &&
	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
	{
		/* Shift the 6-bit DSCP into the TOS/TCLASS octet position. */
		int dscp = (dev->dscp << 2) & 0xff;

		INSIST(dev->dscp < 0x40);

#ifdef IP_TOS
		if (sock->pf == AF_INET && sock->pktdscp) {
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IP;
			cmsgp->cmsg_type = IP_TOS;
			cmsgp->cmsg_len = cmsg_len(sizeof(char));
			*(unsigned char *)CMSG_DATA(cmsgp) = dscp;
		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
				       (void *)&dscp, sizeof(int)) < 0)
			{
				char strbuf[ISC_STRERRORSIZE];
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IP_TOS, %.02x)"
						 " failed: %s",
						 sock->fd, dscp >> 2, strbuf);
			} else {
				/* Remember so we can skip the next setsockopt. */
				sock->dscp = dscp;
			}
		}
#endif /* ifdef IP_TOS */
#if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS)
		if (sock->pf == AF_INET6 && sock->pktdscp) {
			cmsgp = (struct cmsghdr *)(cmsgbuf +
						   msg->msg_controllen);
			msg->msg_control = (void *)cmsgbuf;
			msg->msg_controllen += cmsg_space(sizeof(dscp));
			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);

			cmsgp->cmsg_level = IPPROTO_IPV6;
			cmsgp->cmsg_type = IPV6_TCLASS;
			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
				       (void *)&dscp, sizeof(int)) < 0)
			{
				char strbuf[ISC_STRERRORSIZE];
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, IPV6_TCLASS, "
						 "%.02x) failed: %s",
						 sock->fd, dscp >> 2, strbuf);
			} else {
				sock->dscp = dscp;
			}
		}
#endif /* if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS) */
		/* Zero the unused tail of the control buffer. */
		if (msg->msg_controllen != 0 &&
		    msg->msg_controllen < SENDCMSGBUFLEN)
		{
			memset(cmsgbuf + msg->msg_controllen, 0,
			       SENDCMSGBUFLEN - msg->msg_controllen);
		}
	}
#endif /* if defined(IP_TOS) || (defined(IPPROTO_IPV6) && \
	* defined(IPV6_TCLASS))                           \
	* */
#endif /* USE_CMSG */

	if (write_countp != NULL) {
		*write_countp = write_count;
	}
}
1368 
/*
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the RECV constructor, which will use the available region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If read_countp != NULL, *read_countp will hold the number of bytes
 * this transaction can receive.
 */
static void
build_msghdr_recv(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, size_t *read_countp) {
	unsigned int iovcount;
	size_t read_count;

	memset(msg, 0, sizeof(struct msghdr));

	/*
	 * For UDP let recvmsg() fill in the peer address; for TCP the
	 * peer is fixed, so copy it from the socket.
	 */
	if (sock->type == isc_sockettype_udp) {
		memset(&dev->address, 0, sizeof(dev->address));
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = sizeof(dev->address.type);
	} else { /* TCP */
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
		dev->address = sock->peer_address;
	}

	/* Read into the portion of the region not yet filled (dev->n). */
	read_count = dev->region.length - dev->n;
	iov[0].iov_base = (void *)(dev->region.base + dev->n);
	iov[0].iov_len = read_count;
	iovcount = 1;

	/*
	 * If needed, set up to receive that one extra byte.
	 */
#ifdef ISC_PLATFORM_RECVOVERFLOW
	if (sock->type == isc_sockettype_udp) {
		INSIST(iovcount < MAXSCATTERGATHER_RECV);
		iov[iovcount].iov_base = (void *)(&sock->overflow);
		iov[iovcount].iov_len = 1;
		iovcount++;
	}
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

	/* Supply a control buffer so process_cmsg() has data to inspect. */
#if defined(USE_CMSG)
	msg->msg_control = cmsgbuf;
	msg->msg_controllen = RECVCMSGBUFLEN;
#else  /* if defined(USE_CMSG) */
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
#endif /* USE_CMSG */
	msg->msg_flags = 0;

	if (read_countp != NULL) {
		*read_countp = read_count;
	}
}
1432 
1433 static void
set_dev_address(const isc_sockaddr_t * address,isc_socket_t * sock,isc_socketevent_t * dev)1434 set_dev_address(const isc_sockaddr_t *address, isc_socket_t *sock,
1435 		isc_socketevent_t *dev) {
1436 	if (sock->type == isc_sockettype_udp) {
1437 		if (address != NULL) {
1438 			dev->address = *address;
1439 		} else {
1440 			dev->address = sock->peer_address;
1441 		}
1442 	} else if (sock->type == isc_sockettype_tcp) {
1443 		INSIST(address == NULL);
1444 		dev->address = sock->peer_address;
1445 	}
1446 }
1447 
1448 static void
destroy_socketevent(isc_event_t * event)1449 destroy_socketevent(isc_event_t *event) {
1450 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
1451 
1452 	(ev->destroy)(event);
1453 }
1454 
1455 static isc_socketevent_t *
allocate_socketevent(isc_mem_t * mctx,void * sender,isc_eventtype_t eventtype,isc_taskaction_t action,void * arg)1456 allocate_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
1457 		     isc_taskaction_t action, void *arg) {
1458 	isc_socketevent_t *ev;
1459 
1460 	ev = (isc_socketevent_t *)isc_event_allocate(mctx, sender, eventtype,
1461 						     action, arg, sizeof(*ev));
1462 
1463 	ev->result = ISC_R_UNSET;
1464 	ISC_LINK_INIT(ev, ev_link);
1465 	ev->region.base = NULL;
1466 	ev->n = 0;
1467 	ev->offset = 0;
1468 	ev->attributes = 0;
1469 	ev->destroy = ev->ev_destroy;
1470 	ev->ev_destroy = destroy_socketevent;
1471 	ev->dscp = 0;
1472 
1473 	return (ev);
1474 }
1475 
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debug helper: print the full contents of a msghdr (name, iov array,
 * control buffer) to stdout.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int idx;

	printf("MSGHDR %p\n", msg);
	printf("\tname %p, namelen %ld\n", msg->msg_name,
	       (long)msg->msg_namelen);
	printf("\tiov %p, iovlen %ld\n", msg->msg_iov, (long)msg->msg_iovlen);
	for (idx = 0; idx < (unsigned int)msg->msg_iovlen; idx++) {
		printf("\t\t%u\tbase %p, len %ld\n", idx,
		       msg->msg_iov[idx].iov_base,
		       (long)msg->msg_iov[idx].iov_len);
	}
	printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
	       (long)msg->msg_controllen);
}
#endif /* if defined(ISC_SOCKET_DEBUG) */
1493 
/*
 * Return codes shared by doio_recv() and doio_send().
 */
#define DOIO_SUCCESS 0 /* i/o ok, event sent */
#define DOIO_SOFT    1 /* i/o ok, soft error, no event sent */
#define DOIO_HARD    2 /* i/o error, event sent */
#define DOIO_EOF     3 /* EOF, no event sent */
1498 
/*
 * Perform one recvmsg() for the outstanding receive request 'dev' on
 * 'sock' and classify the result:
 *
 *	DOIO_SUCCESS	enough data arrived; dev->result is ISC_R_SUCCESS.
 *	DOIO_SOFT	retryable condition (EAGAIN-class error, dropped
 *			packet, short read below dev->minimum); retry later.
 *	DOIO_HARD	hard error; dev->result holds the mapped error.
 *	DOIO_EOF	zero-length read on a stream socket.
 */
static int
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	struct msghdr msghdr;
	int recv_errno;
	char strbuf[ISC_STRERRORSIZE];
	char cmsgbuf[RECVCMSGBUFLEN] = { 0 };

	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	cc = recvmsg(sock->fd, &msghdr, 0);
	/* Save errno immediately; the logging calls below may clobber it. */
	recv_errno = errno;

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif /* if defined(ISC_SOCKET_DEBUG) */

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno)) {
			return (DOIO_SOFT);
		}

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			strerror_r(recv_errno, strbuf, sizeof(strbuf));
			socket_log(sock, NULL, IOEVENT,
				   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno, strbuf);
		}

/*
 * Map errno values to results.  SOFT_OR_HARD treats the error as hard
 * only on a connected socket (for unconnected UDP the error refers to
 * some previous packet and the socket is still usable); ALWAYS_HARD
 * fails unconditionally.
 */
#define SOFT_OR_HARD(_system, _isc)                                   \
	if (recv_errno == _system) {                                  \
		if (sock->connected) {                                \
			dev->result = _isc;                           \
			inc_stats(sock->manager->stats,               \
				  sock->statsindex[STATID_RECVFAIL]); \
			return (DOIO_HARD);                           \
		}                                                     \
		return (DOIO_SOFT);                                   \
	}
#define ALWAYS_HARD(_system, _isc)                            \
	if (recv_errno == _system) {                          \
		dev->result = _isc;                           \
		inc_stats(sock->manager->stats,               \
			  sock->statsindex[STATID_RECVFAIL]); \
		return (DOIO_HARD);                           \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/*
		 * Older operating systems may still return EPROTO in some
		 * situations, for example when receiving ICMP/ICMPv6 errors.
		 * A real life scenario is when ICMPv6 returns code 5 or 6.
		 * These codes are introduced in RFC 4443 from March 2006,
		 * and the document obsoletes RFC 1885. But unfortunately not
		 * all operating systems have caught up with the new standard
		 * (in 2020) and thus a generic protocol error is returned.
		 */
		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
		/* Should never get this one but it was seen. */
#ifdef ENOPROTOOPT
		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
#endif /* ifdef ENOPROTOOPT */
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/* Any other errno is an unconditional hard failure. */
		dev->result = isc__errno2result(recv_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_RECVFAIL]);
		return (DOIO_HARD);
	}

	/*
	 * On TCP and UNIX sockets, zero length reads indicate EOF,
	 * while on UDP sockets, zero length reads are perfectly valid,
	 * although strange.
	 */
	switch (sock->type) {
	case isc_sockettype_tcp:
	case isc_sockettype_unix:
		if (cc == 0) {
			return (DOIO_EOF);
		}
		break;
	case isc_sockettype_udp:
	case isc_sockettype_raw:
		break;
	case isc_sockettype_fdwatch:
	default:
		UNREACHABLE();
	}

	if (sock->type == isc_sockettype_udp) {
		dev->address.length = msghdr.msg_namelen;
		/* Port 0 is never legitimate; drop spoof-friendly packets. */
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
		/*
		 * Simulate a firewall blocking UDP responses bigger than
		 * 'maxudp' bytes.
		 */
		if (sock->manager->maxudp != 0 &&
		    cc > (int)sock->manager->maxudp)
		{
			return (DOIO_SOFT);
		}
	}

	socket_log(sock, &dev->address, IOEVENT, "packet received correctly");

	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_PLATFORM_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	process_cmsg(sock, &msghdr, dev);

	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum)) {
		return (DOIO_SOFT);
	}

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}
1661 
/*
 * Perform one sendmsg() for the outstanding send request 'dev' on 'sock'.
 *
 * Returns:
 *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
 *			ISC_R_SUCCESS.
 *
 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
 *			dev->result contains the appropriate error.
 *
 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
 *			event was sent.  The operation should be retried.
 *
 *	No other return values are possible.
 */
static int
doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_SEND];
	size_t write_count;
	struct msghdr msghdr;
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
	int attempts = 0;
	int send_errno;
	char strbuf[ISC_STRERRORSIZE];
	char cmsgbuf[SENDCMSGBUFLEN] = { 0 };

	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);

resend:
	/*
	 * Simulate a firewall blocking oversized UDP responses: pretend
	 * the whole datagram was sent without touching the wire.
	 */
	if (sock->type == isc_sockettype_udp && sock->manager->maxudp != 0 &&
	    write_count > sock->manager->maxudp)
	{
		cc = write_count;
	} else {
		cc = sendmsg(sock->fd, &msghdr, 0);
	}
	/* Save errno immediately; later calls may clobber it. */
	send_errno = errno;

	/*
	 * Check for error or block condition.
	 */
	if (cc < 0) {
		/* Retry a bounded number of times on signal interruption. */
		if (send_errno == EINTR && ++attempts < NRETRIES) {
			goto resend;
		}

		if (SOFT_ERROR(send_errno)) {
			if (errno == EWOULDBLOCK || errno == EAGAIN) {
				dev->result = ISC_R_WOULDBLOCK;
			}
			return (DOIO_SOFT);
		}

/*
 * Map errno values to results.  SOFT_OR_HARD fails only on a connected
 * socket; ALWAYS_HARD fails unconditionally.
 */
#define SOFT_OR_HARD(_system, _isc)                                   \
	if (send_errno == _system) {                                  \
		if (sock->connected) {                                \
			dev->result = _isc;                           \
			inc_stats(sock->manager->stats,               \
				  sock->statsindex[STATID_SENDFAIL]); \
			return (DOIO_HARD);                           \
		}                                                     \
		return (DOIO_SOFT);                                   \
	}
#define ALWAYS_HARD(_system, _isc)                            \
	if (send_errno == _system) {                          \
		dev->result = _isc;                           \
		inc_stats(sock->manager->stats,               \
			  sock->statsindex[STATID_SENDFAIL]); \
		return (DOIO_HARD);                           \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif /* ifdef EHOSTDOWN */
		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(ENOBUFS, ISC_R_NORESOURCES);
		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/*
		 * The other error types depend on whether or not the
		 * socket is UDP or TCP.  If it is UDP, some errors
		 * that we expect to be fatal under TCP are merely
		 * annoying, and are really soft errors.
		 *
		 * However, these soft errors are still returned as
		 * a status.
		 */
		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
		strerror_r(send_errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
				 addrbuf, strbuf);
		dev->result = isc__errno2result(send_errno);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		return (DOIO_HARD);
	}

	/* A zero-byte send should not happen; log it but keep going. */
	if (cc == 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_SENDFAIL]);
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "doio_send: send() returned 0");
	}

	/*
	 * If we write less than we expected, update counters, poke.
	 */
	dev->n += cc;
	if ((size_t)cc != write_count) {
		return (DOIO_SOFT);
	}

	/*
	 * Exactly what we wanted to write.  We're done with this
	 * entry.  Post its completion event.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}
1790 
/*
 * Kill.
 *
 * Detach 'fd' from its watcher thread and arrange for it to be closed.
 * For ordinary sockets the close is deferred to the watcher via
 * select_poke(); for fdwatch sockets the fd is unwatched synchronously
 * because the caller may close/reuse the descriptor immediately after
 * this returns.
 *
 * Caller must ensure that the socket is not locked and no external
 * references exist.
 */
static void
socketclose(isc__socketthread_t *thread, isc_socket_t *sock, int fd) {
	int lockid = FDLOCK_ID(fd);
	/*
	 * No one has this socket open, so the watcher doesn't have to be
	 * poked, and the socket doesn't have to be locked.
	 */
	LOCK(&thread->fdlock[lockid]);
	thread->fds[fd] = NULL;
	if (sock->type == isc_sockettype_fdwatch)
		thread->fdstate[fd] = CLOSED;
	else
		thread->fdstate[fd] = CLOSE_PENDING;
	UNLOCK(&thread->fdlock[lockid]);
	if (sock->type == isc_sockettype_fdwatch) {
		/*
		 * The caller may close the socket once this function returns,
		 * and `fd' may be reassigned for a new socket.  So we do
		 * unwatch_fd() here, rather than defer it via select_poke().
		 * Note: this may complicate data protection among threads and
		 * may reduce performance due to additional locks.  One way to
		 * solve this would be to dup() the watched descriptor, but we
		 * take a simpler approach at this moment.
		 */
		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
	} else
		select_poke(thread->manager, thread->threadid, fd,
		    SELECT_POKE_CLOSE);

	inc_stats(thread->manager->stats, sock->statsindex[STATID_CLOSE]);

	/* Update the active-socket statistics under the socket lock. */
	LOCK(&sock->lock);
	if (sock->active == 1) {
		dec_stats(thread->manager->stats,
			  sock->statsindex[STATID_ACTIVE]);
		sock->active = 0;
	}
	UNLOCK(&sock->lock);

	/*
	 * update manager->maxfd here (XXX: this should be implemented more
	 * efficiently)
	 */
#ifdef USE_SELECT
	LOCK(&thread->manager->lock);
	if (thread->maxfd == fd) {
		int i;

		/* Scan downward for the next highest managed descriptor. */
		thread->maxfd = 0;
		for (i = fd - 1; i >= 0; i--) {
			lockid = FDLOCK_ID(i);

			LOCK(&thread->fdlock[lockid]);
			if (thread->fdstate[i] == MANAGED) {
				thread->maxfd = i;
				UNLOCK(&thread->fdlock[lockid]);
				break;
			}
			UNLOCK(&thread->fdlock[lockid]);
		}
		/* The watcher's own pipe must always stay in the fd set. */
		if (thread->maxfd < thread->pipe_fds[0]) {
			thread->maxfd = thread->pipe_fds[0];
		}
	}

	UNLOCK(&thread->manager->lock);
#endif /* USE_SELECT */
}
1866 
1867 static void
destroy(isc_socket_t ** sockp)1868 destroy(isc_socket_t **sockp) {
1869 	int fd = 0;
1870 	isc_socket_t *sock = *sockp;
1871 	isc_socketmgr_t *manager = sock->manager;
1872 	isc__socketthread_t *thread = NULL;
1873 
1874 	socket_log(sock, NULL, CREATION, "destroying");
1875 
1876 	isc_refcount_destroy(&sock->references);
1877 
1878 	LOCK(&sock->lock);
1879 	INSIST(ISC_LIST_EMPTY(sock->connect_list));
1880 	INSIST(ISC_LIST_EMPTY(sock->accept_list));
1881 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1882 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1883 	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1884 
1885 	if (sock->fd >= 0) {
1886 		fd = sock->fd;
1887 		thread = &manager->threads[sock->threadid];
1888 		sock->fd = -1;
1889 		sock->threadid = -1;
1890 	}
1891 	UNLOCK(&sock->lock);
1892 
1893 	if (fd > 0) {
1894 		socketclose(thread, sock, fd);
1895 	}
1896 
1897 	LOCK(&manager->lock);
1898 
1899 	ISC_LIST_UNLINK(manager->socklist, sock, link);
1900 
1901 	if (ISC_LIST_EMPTY(manager->socklist)) {
1902 		SIGNAL(&manager->shutdown_ok);
1903 	}
1904 
1905 	/* can't unlock manager as its memory context is still used */
1906 	free_socket(sockp);
1907 
1908 	UNLOCK(&manager->lock);
1909 }
1910 
1911 static isc_result_t
allocate_socket(isc_socketmgr_t * manager,isc_sockettype_t type,isc_socket_t ** socketp)1912 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1913 		isc_socket_t **socketp) {
1914 	isc_socket_t *sock;
1915 
1916 	sock = isc_mem_get(manager->mctx, sizeof(*sock));
1917 
1918 	sock->magic = 0;
1919 	isc_refcount_init(&sock->references, 0);
1920 
1921 	sock->manager = manager;
1922 	sock->type = type;
1923 	sock->fd = -1;
1924 	sock->threadid = -1;
1925 	sock->dscp = 0; /* TOS/TCLASS is zero until set. */
1926 	sock->dupped = 0;
1927 	sock->statsindex = NULL;
1928 	sock->active = 0;
1929 
1930 	ISC_LINK_INIT(sock, link);
1931 
1932 	memset(sock->name, 0, sizeof(sock->name));
1933 	sock->tag = NULL;
1934 
1935 	/*
1936 	 * Set up list of readers and writers to be initially empty.
1937 	 */
1938 	ISC_LIST_INIT(sock->recv_list);
1939 	ISC_LIST_INIT(sock->send_list);
1940 	ISC_LIST_INIT(sock->accept_list);
1941 	ISC_LIST_INIT(sock->connect_list);
1942 
1943 	sock->listener = 0;
1944 	sock->connected = 0;
1945 	sock->connecting = 0;
1946 	sock->bound = 0;
1947 	sock->pktdscp = 0;
1948 
1949 	/*
1950 	 * Initialize the lock.
1951 	 */
1952 	isc_mutex_init(&sock->lock);
1953 
1954 	sock->magic = SOCKET_MAGIC;
1955 	*socketp = sock;
1956 
1957 	return (ISC_R_SUCCESS);
1958 }
1959 
1960 /*
1961  * This event requires that the various lists be empty, that the reference
1962  * count be 1, and that the magic number is valid.  The other socket bits,
1963  * like the lock, must be initialized as well.  The fd associated must be
1964  * marked as closed, by setting it to -1 on close, or this routine will
1965  * also close the socket.
1966  */
1967 static void
free_socket(isc_socket_t ** socketp)1968 free_socket(isc_socket_t **socketp) {
1969 	isc_socket_t *sock = *socketp;
1970 	*socketp = NULL;
1971 
1972 	INSIST(VALID_SOCKET(sock));
1973 	isc_refcount_destroy(&sock->references);
1974 	LOCK(&sock->lock);
1975 	INSIST(!sock->connecting);
1976 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1977 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1978 	INSIST(ISC_LIST_EMPTY(sock->accept_list));
1979 	INSIST(ISC_LIST_EMPTY(sock->connect_list));
1980 	INSIST(!ISC_LINK_LINKED(sock, link));
1981 	UNLOCK(&sock->lock);
1982 
1983 	sock->magic = 0;
1984 
1985 	isc_mutex_destroy(&sock->lock);
1986 
1987 	isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
1988 }
1989 
1990 #if defined(SET_RCVBUF)
1991 static isc_once_t rcvbuf_once = ISC_ONCE_INIT;
1992 static int rcvbuf = ISC_RECV_BUFFER_SIZE;
1993 
/*
 * Probe the kernel for the largest achievable SO_RCVBUF value not
 * exceeding ISC_RECV_BUFFER_SIZE and cache it in the file-scope
 * 'rcvbuf'.  Run once via rcvbuf_once; sockets opened later reuse the
 * cached value.
 */
static void
set_rcvbuf(void) {
	int fd;
	int max = rcvbuf, min;
	socklen_t len;

	/* A throwaway UDP socket is used purely for probing. */
	fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (fd == -1) {
		switch (errno) {
		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			/* No IPv4 support; retry with an IPv6 socket. */
			fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
			break;
		}
	}
	if (fd == -1) {
		/* Nothing to probe with; keep the compiled-in default. */
		return;
	}

	/* 'min' starts as the kernel's current (known-good) size. */
	len = sizeof(min);
	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&min, &len) == 0 &&
	    min < rcvbuf)
	{
	again:
		/*
		 * Binary search between 'min' (known accepted) and 'max'
		 * (candidate ceiling) for the largest size the kernel
		 * will take.
		 */
		if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&rcvbuf,
			       sizeof(rcvbuf)) == -1)
		{
			if (errno == ENOBUFS && rcvbuf > min) {
				/* Too big: bisect downward. */
				max = rcvbuf - 1;
				rcvbuf = (rcvbuf + min) / 2;
				goto again;
			} else {
				/* Unexpected failure: settle for 'min'. */
				rcvbuf = min;
				goto cleanup;
			}
		} else {
			/* Accepted: raise the known-good floor. */
			min = rcvbuf;
		}
		if (min != max) {
			rcvbuf = max;
			goto again;
		}
	}
cleanup:
	close(fd);
}
#endif /* if defined(SET_RCVBUF) */
2047 
2048 #if defined(SET_SNDBUF)
2049 static isc_once_t sndbuf_once = ISC_ONCE_INIT;
2050 static int sndbuf = ISC_SEND_BUFFER_SIZE;
2051 
/*
 * Probe the kernel for the largest achievable SO_SNDBUF value not
 * exceeding ISC_SEND_BUFFER_SIZE and cache it in the file-scope
 * 'sndbuf'.  Run once via sndbuf_once; mirrors set_rcvbuf() above.
 */
static void
set_sndbuf(void) {
	int fd;
	int max = sndbuf, min;
	socklen_t len;

	/* A throwaway UDP socket is used purely for probing. */
	fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (fd == -1) {
		switch (errno) {
		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			/* No IPv4 support; retry with an IPv6 socket. */
			fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
			break;
		}
	}
	if (fd == -1) {
		/* Nothing to probe with; keep the compiled-in default. */
		return;
	}

	/* 'min' starts as the kernel's current (known-good) size. */
	len = sizeof(min);
	if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&min, &len) == 0 &&
	    min < sndbuf)
	{
	again:
		/*
		 * Binary search between 'min' (known accepted) and 'max'
		 * (candidate ceiling) for the largest size the kernel
		 * will take.
		 */
		if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *)&sndbuf,
			       sizeof(sndbuf)) == -1)
		{
			if (errno == ENOBUFS && sndbuf > min) {
				/* Too big: bisect downward. */
				max = sndbuf - 1;
				sndbuf = (sndbuf + min) / 2;
				goto again;
			} else {
				/* Unexpected failure: settle for 'min'. */
				sndbuf = min;
				goto cleanup;
			}
		} else {
			/* Accepted: raise the known-good floor. */
			min = sndbuf;
		}
		if (min != max) {
			sndbuf = max;
			goto again;
		}
	}
cleanup:
	close(fd);
}
#endif /* if defined(SET_SNDBUF) */
2105 
/*
 * On IPv6 sockets, pin the effective MTU to the IPv6 minimum (1280
 * octets) to avoid path-MTU blackholes, via IPV6_USE_MIN_MTU and/or
 * IPV6_MTU where the platform provides them.  A no-op for other
 * address families; setsockopt() failures are deliberately ignored.
 */
static void
use_min_mtu(isc_socket_t *sock) {
#if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU)
	UNUSED(sock);
#endif /* if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU) */
#ifdef IPV6_USE_MIN_MTU
	/* use minimum MTU */
	if (sock->pf == AF_INET6) {
		int on = 1;
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
				 (void *)&on, sizeof(on));
	}
#endif /* ifdef IPV6_USE_MIN_MTU */
#if defined(IPV6_MTU)
	/*
	 * Use minimum MTU on IPv6 sockets.
	 */
	if (sock->pf == AF_INET6) {
		int mtu = 1280;
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU, &mtu,
				 sizeof(mtu));
	}
#endif /* if defined(IPV6_MTU) */
}
2130 
2131 static void
set_tcp_maxseg(isc_socket_t * sock,int size)2132 set_tcp_maxseg(isc_socket_t *sock, int size) {
2133 #ifdef TCP_MAXSEG
2134 	if (sock->type == isc_sockettype_tcp) {
2135 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
2136 				 (void *)&size, sizeof(size));
2137 	}
2138 #endif /* ifdef TCP_MAXSEG */
2139 }
2140 
/*
 * Ask the kernel not to perform Path MTU discovery on this socket
 * (and to permit fragmentation), using whichever per-platform socket
 * options are available.  Best-effort: all failures are ignored.
 */
static void
set_ip_disable_pmtud(isc_socket_t *sock) {
	/*
	 * Disable Path MTU Discover on IP packets
	 */
	if (sock->pf == AF_INET6) {
#if defined(IPV6_DONTFRAG)
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_DONTFRAG,
				 &(int){ 0 }, sizeof(int));
#endif
#if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
				 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
#endif
	} else if (sock->pf == AF_INET) {
#if defined(IP_DONTFRAG)
		(void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG, &(int){ 0 },
				 sizeof(int));
#endif
#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_OMIT)
		(void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
				 &(int){ IP_PMTUDISC_OMIT }, sizeof(int));
#endif
	}
}
2166 
/*
 * Open (or duplicate) the OS-level descriptor for 'sock' and apply the
 * standard per-type socket options.  If 'dup_socket' is non-NULL the
 * descriptor is dup()ed from it and option setup is skipped.
 *
 * Returns ISC_R_SUCCESS, ISC_R_NORESOURCES (descriptor limits or
 * buffer exhaustion), ISC_R_FAMILYNOSUPPORT, or ISC_R_UNEXPECTED;
 * open/active statistics counters are updated accordingly.
 */
static isc_result_t
opensocket(isc_socketmgr_t *manager, isc_socket_t *sock,
	   isc_socket_t *dup_socket) {
	isc_result_t result;
	char strbuf[ISC_STRERRORSIZE];
	const char *err = "socket";
	int tries = 0;
#if defined(USE_CMSG) || defined(SO_NOSIGPIPE)
	int on = 1;
#endif /* if defined(USE_CMSG) || defined(SO_NOSIGPIPE) */
#if defined(SET_RCVBUF) || defined(SET_SNDBUF)
	socklen_t optlen;
	int size = 0;
#endif

again:
	if (dup_socket == NULL) {
		switch (sock->type) {
		case isc_sockettype_udp:
			sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
			break;
		case isc_sockettype_tcp:
			sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
			break;
		case isc_sockettype_unix:
			sock->fd = socket(sock->pf, SOCK_STREAM, 0);
			break;
		case isc_sockettype_raw:
			/* Fallback errno when no raw branch below applies. */
			errno = EPFNOSUPPORT;
			/*
			 * PF_ROUTE is a alias for PF_NETLINK on linux.
			 */
#if defined(PF_ROUTE)
			if (sock->fd == -1 && sock->pf == PF_ROUTE) {
#ifdef NETLINK_ROUTE
				sock->fd = socket(sock->pf, SOCK_RAW,
						  NETLINK_ROUTE);
#else  /* ifdef NETLINK_ROUTE */
				sock->fd = socket(sock->pf, SOCK_RAW, 0);
#endif /* ifdef NETLINK_ROUTE */
				if (sock->fd != -1) {
#ifdef NETLINK_ROUTE
					struct sockaddr_nl sa;
					int n;

					/*
					 * Do an implicit bind.
					 */
					memset(&sa, 0, sizeof(sa));
					sa.nl_family = AF_NETLINK;
					sa.nl_groups = RTMGRP_IPV4_IFADDR |
						       RTMGRP_IPV6_IFADDR;
					n = bind(sock->fd,
						 (struct sockaddr *)&sa,
						 sizeof(sa));
					if (n < 0) {
						close(sock->fd);
						sock->fd = -1;
					}
#endif /* ifdef NETLINK_ROUTE */
					/*
					 * NOTE(review): 'bound' is set even
					 * when the bind() above failed and
					 * sock->fd was reset to -1; harmless
					 * today because the fd < 0 path below
					 * returns an error, but worth
					 * confirming against upstream.
					 */
					sock->bound = 1;
				}
			}
#endif /* if defined(PF_ROUTE) */
			break;
		case isc_sockettype_fdwatch:
			/*
			 * We should not be called for isc_sockettype_fdwatch
			 * sockets.
			 */
			INSIST(0);
			break;
		}
	} else {
		/* Duplicate an existing socket's descriptor and state. */
		sock->fd = dup(dup_socket->fd);
		sock->dupped = 1;
		sock->bound = dup_socket->bound;
	}
	/* Retry (bounded) if the call was interrupted by a signal. */
	if (sock->fd == -1 && errno == EINTR && tries++ < 42) {
		goto again;
	}

#ifdef F_DUPFD
	/*
	 * Leave a space for stdio and TCP to work in.
	 */
	if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
	    sock->fd >= 0 && sock->fd < manager->reserved)
	{
		int newfd, tmp;
		newfd = fcntl(sock->fd, F_DUPFD, manager->reserved);
		tmp = errno; /* preserve fcntl()'s errno across close() */
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = newfd;
		err = "isc_socket_create: fcntl/reserved";
	} else if (sock->fd >= 0 && sock->fd < 20) {
		int newfd, tmp;
		newfd = fcntl(sock->fd, F_DUPFD, 20);
		tmp = errno;
		(void)close(sock->fd);
		errno = tmp;
		sock->fd = newfd;
		err = "isc_socket_create: fcntl";
	}
#endif /* ifdef F_DUPFD */

	if (sock->fd >= (int)manager->maxsocks) {
		(void)close(sock->fd);
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "socket: file descriptor exceeds limit (%d/%u)",
			      sock->fd, manager->maxsocks);
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		return (ISC_R_NORESOURCES);
	}

	/* Map the socket()/dup()/fcntl() errno to an isc_result. */
	if (sock->fd < 0) {
		switch (errno) {
		case EMFILE:
		case ENFILE:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "%s: %s", err, strbuf);
			FALLTHROUGH;
		case ENOBUFS:
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_NORESOURCES);

		case EPROTONOSUPPORT:
		case EPFNOSUPPORT:
		case EAFNOSUPPORT:
		/*
		 * Linux 2.2 (and maybe others) return EINVAL instead of
		 * EAFNOSUPPORT.
		 */
		case EINVAL:
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_FAMILYNOSUPPORT);

		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "%s() failed: %s",
					 err, strbuf);
			inc_stats(manager->stats,
				  sock->statsindex[STATID_OPENFAIL]);
			return (ISC_R_UNEXPECTED);
		}
	}

	/* A dup()ed socket inherits its options; skip the setup below. */
	if (dup_socket != NULL) {
		goto setup_done;
	}

	result = make_nonblock(sock->fd);
	if (result != ISC_R_SUCCESS) {
		(void)close(sock->fd);
		inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
		return (result);
	}

#ifdef SO_NOSIGPIPE
	/* Suppress SIGPIPE on writes to a closed peer (BSD-style). */
	if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE, (void *)&on,
		       sizeof(on)) < 0)
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, SO_NOSIGPIPE) failed: %s",
				 sock->fd, strbuf);
		/* Press on... */
	}
#endif /* ifdef SO_NOSIGPIPE */

	/*
	 * Use minimum mtu if possible.
	 */
	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
		use_min_mtu(sock);
		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
	}

#if defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF)
	if (sock->type == isc_sockettype_udp) {
#if defined(USE_CMSG)
#if defined(SO_TIMESTAMP)
		/* Request kernel receive timestamps via cmsg. */
		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, (void *)&on,
			       sizeof(on)) < 0 &&
		    errno != ENOPROTOOPT)
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, SO_TIMESTAMP) failed: "
					 "%s",
					 sock->fd, strbuf);
			/* Press on... */
		}
#endif /* SO_TIMESTAMP */

#ifdef IPV6_RECVPKTINFO
		/* RFC 3542 */
		if ((sock->pf == AF_INET6) &&
		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
				(void *)&on, sizeof(on)) < 0))
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_RECVPKTINFO) "
					 "failed: %s",
					 sock->fd, strbuf);
		}
#else  /* ifdef IPV6_RECVPKTINFO */
		/* RFC 2292 */
		if ((sock->pf == AF_INET6) &&
		    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
				(void *)&on, sizeof(on)) < 0))
		{
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_PKTINFO) failed: "
					 "%s",
					 sock->fd, strbuf);
		}
#endif /* IPV6_RECVPKTINFO */
#endif /* defined(USE_CMSG) */

#if defined(SET_RCVBUF)
		/* Enlarge the receive buffer to the probed maximum. */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (void *)&size,
			       &optlen) == 0 &&
		    size < rcvbuf)
		{
			RUNTIME_CHECK(isc_once_do(&rcvbuf_once, set_rcvbuf) ==
				      ISC_R_SUCCESS);
			if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
				       (void *)&rcvbuf, sizeof(rcvbuf)) == -1)
			{
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, SO_RCVBUF, "
						 "%d) failed: %s",
						 sock->fd, rcvbuf, strbuf);
			}
		}
#endif /* if defined(SET_RCVBUF) */

#if defined(SET_SNDBUF)
		/* Enlarge the send buffer to the probed maximum. */
		optlen = sizeof(size);
		if (getsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (void *)&size,
			       &optlen) == 0 &&
		    size < sndbuf)
		{
			RUNTIME_CHECK(isc_once_do(&sndbuf_once, set_sndbuf) ==
				      ISC_R_SUCCESS);
			if (setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF,
				       (void *)&sndbuf, sizeof(sndbuf)) == -1)
			{
				strerror_r(errno, strbuf, sizeof(strbuf));
				UNEXPECTED_ERROR(__FILE__, __LINE__,
						 "setsockopt(%d, SO_SNDBUF, "
						 "%d) failed: %s",
						 sock->fd, sndbuf, strbuf);
			}
		}
#endif /* if defined(SET_SNDBUF) */
	}
#ifdef IPV6_RECVTCLASS
	/* Request delivery of the IPv6 traffic class via cmsg. */
	if ((sock->pf == AF_INET6) &&
	    (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVTCLASS, (void *)&on,
			sizeof(on)) < 0))
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVTCLASS) "
				 "failed: %s",
				 sock->fd, strbuf);
	}
#endif /* ifdef IPV6_RECVTCLASS */
#ifdef IP_RECVTOS
	/* Request delivery of the IPv4 TOS byte via cmsg. */
	if ((sock->pf == AF_INET) &&
	    (setsockopt(sock->fd, IPPROTO_IP, IP_RECVTOS, (void *)&on,
			sizeof(on)) < 0))
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IP_RECVTOS) "
				 "failed: %s",
				 sock->fd, strbuf);
	}
#endif /* ifdef IP_RECVTOS */
#endif /* defined(USE_CMSG) || defined(SET_RCVBUF) || defined(SET_SNDBUF) */

	set_ip_disable_pmtud(sock);

setup_done:
	inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
	if (sock->active == 0) {
		inc_stats(manager->stats, sock->statsindex[STATID_ACTIVE]);
		sock->active = 1;
	}

	return (ISC_R_SUCCESS);
}
2472 
2473 /*
2474  * Create a 'type' socket or duplicate an existing socket, managed
2475  * by 'manager'.  Events will be posted to 'task' and when dispatched
2476  * 'action' will be called with 'arg' as the arg value.  The new
2477  * socket is returned in 'socketp'.
2478  */
static isc_result_t
socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
	      isc_socket_t **socketp, isc_socket_t *dup_socket) {
	isc_socket_t *sock = NULL;
	isc__socketthread_t *thread;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);
	REQUIRE(type != isc_sockettype_fdwatch);

	result = allocate_socket(manager, type, &sock);
	if (result != ISC_R_SUCCESS) {
		return (result);
	}

	/* Select the statistics counter set for this type and family. */
	switch (sock->type) {
	case isc_sockettype_udp:
		sock->statsindex = (pf == AF_INET) ? udp4statsindex
						   : udp6statsindex;
/* Can this address family deliver DSCP per packet? */
#define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
		break;
	case isc_sockettype_tcp:
		sock->statsindex = (pf == AF_INET) ? tcp4statsindex
						   : tcp6statsindex;
		break;
	case isc_sockettype_unix:
		sock->statsindex = unixstatsindex;
		break;
	case isc_sockettype_raw:
		sock->statsindex = rawstatsindex;
		break;
	default:
		UNREACHABLE();
	}

	sock->pf = pf;

	result = opensocket(manager, sock, dup_socket);
	if (result != ISC_R_SUCCESS) {
		free_socket(&sock);
		return (result);
	}

	/* opensocket() succeeding with no descriptor would be fatal. */
	if (sock->fd == -1) {
		abort();
	}
	sock->threadid = gen_threadid(sock);
	isc_refcount_increment0(&sock->references);
	thread = &manager->threads[sock->threadid];
	*socketp = sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	/* Register the descriptor with its assigned watcher thread. */
	lockid = FDLOCK_ID(sock->fd);
	LOCK(&thread->fdlock[lockid]);
	thread->fds[sock->fd] = sock;
	thread->fdstate[sock->fd] = MANAGED;
#if defined(USE_EPOLL)
	thread->epoll_events[sock->fd] = 0;
#endif /* if defined(USE_EPOLL) */
#ifdef USE_DEVPOLL
	INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
	       thread->fdpollinfo[sock->fd].want_write == 0);
#endif /* ifdef USE_DEVPOLL */
	UNLOCK(&thread->fdlock[lockid]);

	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	/* Keep the select() upper bound in sync with the new fd. */
	if (thread->maxfd < sock->fd) {
		thread->maxfd = sock->fd;
	}
#endif /* ifdef USE_SELECT */
	UNLOCK(&manager->lock);

	socket_log(sock, NULL, CREATION,
		   dup_socket != NULL ? "dupped" : "created");

	return (ISC_R_SUCCESS);
}
2565 
2566 /*%
2567  * Create a new 'type' socket managed by 'manager'.  Events
2568  * will be posted to 'task' and when dispatched 'action' will be
2569  * called with 'arg' as the arg value.  The new socket is returned
2570  * in 'socketp'.
2571  */
2572 isc_result_t
isc_socket_create(isc_socketmgr_t * manager0,int pf,isc_sockettype_t type,isc_socket_t ** socketp)2573 isc_socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
2574 		  isc_socket_t **socketp) {
2575 	return (socket_create(manager0, pf, type, socketp, NULL));
2576 }
2577 
2578 /*%
2579  * Duplicate an existing socket.  The new socket is returned
2580  * in 'socketp'.
2581  */
2582 isc_result_t
isc_socket_dup(isc_socket_t * sock,isc_socket_t ** socketp)2583 isc_socket_dup(isc_socket_t *sock, isc_socket_t **socketp) {
2584 	REQUIRE(VALID_SOCKET(sock));
2585 	REQUIRE(socketp != NULL && *socketp == NULL);
2586 
2587 	return (socket_create(sock->manager, sock->pf, sock->type, socketp,
2588 			      sock));
2589 }
2590 
/*
 * Re-open the descriptor of an existing, currently-closed socket
 * (fd == -1, threadid == -1) and register it with a watcher thread,
 * mirroring the registration done in socket_create().
 */
isc_result_t
isc_socket_open(isc_socket_t *sock) {
	isc_result_t result;
	isc__socketthread_t *thread;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	REQUIRE(isc_refcount_current(&sock->references) >= 1);
	REQUIRE(sock->fd == -1);
	REQUIRE(sock->threadid == -1);
	REQUIRE(sock->type != isc_sockettype_fdwatch);

	result = opensocket(sock->manager, sock, NULL);

	UNLOCK(&sock->lock);

	/*
	 * NOTE(review): sock->fd and sock->threadid are written below
	 * after the socket lock has been released; this appears to rely
	 * on the caller not sharing the socket with other threads until
	 * the open completes -- confirm against callers.
	 */
	if (result != ISC_R_SUCCESS) {
		sock->fd = -1;
	} else {
		sock->threadid = gen_threadid(sock);
		thread = &sock->manager->threads[sock->threadid];
		int lockid = FDLOCK_ID(sock->fd);

		/* Register the new descriptor with its watcher thread. */
		LOCK(&thread->fdlock[lockid]);
		thread->fds[sock->fd] = sock;
		thread->fdstate[sock->fd] = MANAGED;
#if defined(USE_EPOLL)
		thread->epoll_events[sock->fd] = 0;
#endif /* if defined(USE_EPOLL) */
#ifdef USE_DEVPOLL
		INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
		       thread->fdpollinfo[sock->fd].want_write == 0);
#endif /* ifdef USE_DEVPOLL */
		UNLOCK(&thread->fdlock[lockid]);

#ifdef USE_SELECT
		/* Keep the select() upper bound in sync with the new fd. */
		LOCK(&sock->manager->lock);
		if (thread->maxfd < sock->fd) {
			thread->maxfd = sock->fd;
		}
		UNLOCK(&sock->manager->lock);
#endif /* ifdef USE_SELECT */
	}

	return (result);
}
2639 
2640 /*
2641  * Attach to a socket.  Caller must explicitly detach when it is done.
2642  */
2643 void
isc_socket_attach(isc_socket_t * sock,isc_socket_t ** socketp)2644 isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
2645 	REQUIRE(VALID_SOCKET(sock));
2646 	REQUIRE(socketp != NULL && *socketp == NULL);
2647 
2648 	int old_refs = isc_refcount_increment(&sock->references);
2649 	REQUIRE(old_refs > 0);
2650 
2651 	*socketp = sock;
2652 }
2653 
2654 /*
2655  * Dereference a socket.  If this is the last reference to it, clean things
2656  * up by destroying the socket.
2657  */
2658 void
isc_socket_detach(isc_socket_t ** socketp)2659 isc_socket_detach(isc_socket_t **socketp) {
2660 	isc_socket_t *sock;
2661 
2662 	REQUIRE(socketp != NULL);
2663 	sock = *socketp;
2664 	REQUIRE(VALID_SOCKET(sock));
2665 	if (isc_refcount_decrement(&sock->references) == 1) {
2666 		destroy(&sock);
2667 	}
2668 
2669 	*socketp = NULL;
2670 }
2671 
2672 isc_result_t
isc_socket_close(isc_socket_t * sock)2673 isc_socket_close(isc_socket_t *sock) {
2674 	int fd;
2675 	isc_socketmgr_t *manager;
2676 	isc__socketthread_t *thread;
2677 	fflush(stdout);
2678 	REQUIRE(VALID_SOCKET(sock));
2679 
2680 	LOCK(&sock->lock);
2681 
2682 	REQUIRE(sock->type != isc_sockettype_fdwatch);
2683 	REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
2684 
2685 	INSIST(!sock->connecting);
2686 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
2687 	INSIST(ISC_LIST_EMPTY(sock->send_list));
2688 	INSIST(ISC_LIST_EMPTY(sock->accept_list));
2689 	INSIST(ISC_LIST_EMPTY(sock->connect_list));
2690 
2691 	manager = sock->manager;
2692 	thread = &manager->threads[sock->threadid];
2693 	fd = sock->fd;
2694 	sock->fd = -1;
2695 	sock->threadid = -1;
2696 
2697 	sock->dupped = 0;
2698 	memset(sock->name, 0, sizeof(sock->name));
2699 	sock->tag = NULL;
2700 	sock->listener = 0;
2701 	sock->connected = 0;
2702 	sock->connecting = 0;
2703 	sock->bound = 0;
2704 	isc_sockaddr_any(&sock->peer_address);
2705 
2706 	UNLOCK(&sock->lock);
2707 
2708 	socketclose(thread, sock, fd);
2709 
2710 	return (ISC_R_SUCCESS);
2711 }
2712 
2713 static void
dispatch_recv(isc_socket_t * sock)2714 dispatch_recv(isc_socket_t *sock) {
2715 	if (sock->type != isc_sockettype_fdwatch) {
2716 		internal_recv(sock);
2717 	} else {
2718 		internal_fdwatch_read(sock);
2719 	}
2720 }
2721 
2722 static void
dispatch_send(isc_socket_t * sock)2723 dispatch_send(isc_socket_t *sock) {
2724 	if (sock->type != isc_sockettype_fdwatch) {
2725 		internal_send(sock);
2726 	} else {
2727 		internal_fdwatch_write(sock);
2728 	}
2729 }
2730 
2731 /*
2732  * Dequeue an item off the given socket's read queue, set the result code
2733  * in the done event to the one provided, and send it to the task it was
2734  * destined for.
2735  *
2736  * If the event to be sent is on a list, remove it before sending.  If
2737  * asked to, send and detach from the socket as well.
2738  *
2739  * Caller must have the socket locked if the event is attached to the socket.
2740  */
2741 static void
send_recvdone_event(isc_socket_t * sock,isc_socketevent_t ** dev)2742 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2743 	isc_task_t *task;
2744 
2745 	task = (*dev)->ev_sender;
2746 
2747 	(*dev)->ev_sender = sock;
2748 
2749 	if (ISC_LINK_LINKED(*dev, ev_link)) {
2750 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
2751 	}
2752 
2753 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2754 		isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2755 					 sock->threadid);
2756 	} else {
2757 		isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2758 	}
2759 }
2760 
2761 /*
2762  * See comments for send_recvdone_event() above.
2763  *
2764  * Caller must have the socket locked if the event is attached to the socket.
2765  */
2766 static void
send_senddone_event(isc_socket_t * sock,isc_socketevent_t ** dev)2767 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
2768 	isc_task_t *task;
2769 
2770 	INSIST(dev != NULL && *dev != NULL);
2771 
2772 	task = (*dev)->ev_sender;
2773 	(*dev)->ev_sender = sock;
2774 
2775 	if (ISC_LINK_LINKED(*dev, ev_link)) {
2776 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
2777 	}
2778 
2779 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) != 0) {
2780 		isc_task_sendtoanddetach(&task, (isc_event_t **)dev,
2781 					 sock->threadid);
2782 	} else {
2783 		isc_task_sendto(task, (isc_event_t **)dev, sock->threadid);
2784 	}
2785 }
2786 
2787 /*
2788  * See comments for send_recvdone_event() above.
2789  *
2790  * Caller must have the socket locked if the event is attached to the socket.
2791  */
2792 static void
send_connectdone_event(isc_socket_t * sock,isc_socket_connev_t ** dev)2793 send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **dev) {
2794 	isc_task_t *task;
2795 
2796 	INSIST(dev != NULL && *dev != NULL);
2797 
2798 	task = (*dev)->ev_sender;
2799 	(*dev)->ev_sender = sock;
2800 
2801 	if (ISC_LINK_LINKED(*dev, ev_link)) {
2802 		ISC_LIST_DEQUEUE(sock->connect_list, *dev, ev_link);
2803 	}
2804 
2805 	isc_task_sendtoanddetach(&task, (isc_event_t **)dev, sock->threadid);
2806 }
2807 
2808 /*
2809  * Call accept() on a socket, to get the new file descriptor.  The listen
2810  * socket is used as a prototype to create a new isc_socket_t.  The new
2811  * socket has one outstanding reference.  The task receiving the event
2812  * will be detached from just after the event is delivered.
2813  *
2814  * On entry to this function, the event delivered is the internal
2815  * readable event, and the first item on the accept_list should be
2816  * the done event we want to send.  If the list is empty, this is a no-op,
2817  * so just unlock and return.
2818  */
2819 static void
internal_accept(isc_socket_t * sock)2820 internal_accept(isc_socket_t *sock) {
2821 	isc_socketmgr_t *manager;
2822 	isc__socketthread_t *thread, *nthread;
2823 	isc_socket_newconnev_t *dev;
2824 	isc_task_t *task;
2825 	socklen_t addrlen;
2826 	int fd;
2827 	isc_result_t result = ISC_R_SUCCESS;
2828 	char strbuf[ISC_STRERRORSIZE];
2829 	const char *err = "accept";
2830 
2831 	INSIST(VALID_SOCKET(sock));
2832 	REQUIRE(sock->fd >= 0);
2833 
2834 	socket_log(sock, NULL, TRACE, "internal_accept called, locked socket");
2835 
2836 	manager = sock->manager;
2837 	INSIST(VALID_MANAGER(manager));
2838 	thread = &manager->threads[sock->threadid];
2839 
2840 	INSIST(sock->listener);
2841 
2842 	/*
2843 	 * Get the first item off the accept list.
2844 	 * If it is empty, unlock the socket and return.
2845 	 */
2846 	dev = ISC_LIST_HEAD(sock->accept_list);
2847 	if (dev == NULL) {
2848 		unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
2849 		UNLOCK(&sock->lock);
2850 		return;
2851 	}
2852 
2853 	/*
2854 	 * Try to accept the new connection.  If the accept fails with
2855 	 * EAGAIN or EINTR, simply poke the watcher to watch this socket
2856 	 * again.  Also ignore ECONNRESET, which has been reported to
2857 	 * be spuriously returned on Linux 2.2.19 although it is not
2858 	 * a documented error for accept().  ECONNABORTED has been
2859 	 * reported for Solaris 8.  The rest are thrown in not because
2860 	 * we have seen them but because they are ignored by other
2861 	 * daemons such as BIND 8 and Apache.
2862 	 */
2863 
2864 	addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
2865 	memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
2866 	fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
2867 		    (void *)&addrlen);
2868 
2869 #ifdef F_DUPFD
2870 	/*
2871 	 * Leave a space for stdio to work in.
2872 	 */
2873 	if (fd >= 0 && fd < 20) {
2874 		int newfd, tmp;
2875 		newfd = fcntl(fd, F_DUPFD, 20);
2876 		tmp = errno;
2877 		(void)close(fd);
2878 		errno = tmp;
2879 		fd = newfd;
2880 		err = "accept/fcntl";
2881 	}
2882 #endif /* ifdef F_DUPFD */
2883 
2884 	if (fd < 0) {
2885 		if (SOFT_ERROR(errno)) {
2886 			goto soft_error;
2887 		}
2888 		switch (errno) {
2889 		case ENFILE:
2890 		case EMFILE:
2891 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2892 				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2893 				      "%s: too many open file descriptors",
2894 				      err);
2895 			goto soft_error;
2896 
2897 		case ENOBUFS:
2898 		case ENOMEM:
2899 		case ECONNRESET:
2900 		case ECONNABORTED:
2901 		case EHOSTUNREACH:
2902 		case EHOSTDOWN:
2903 		case ENETUNREACH:
2904 		case ENETDOWN:
2905 		case ECONNREFUSED:
2906 #ifdef EPROTO
2907 		case EPROTO:
2908 #endif /* ifdef EPROTO */
2909 #ifdef ENONET
2910 		case ENONET:
2911 #endif /* ifdef ENONET */
2912 			goto soft_error;
2913 		default:
2914 			break;
2915 		}
2916 		strerror_r(errno, strbuf, sizeof(strbuf));
2917 		UNEXPECTED_ERROR(__FILE__, __LINE__,
2918 				 "internal_accept: %s() failed: %s", err,
2919 				 strbuf);
2920 		fd = -1;
2921 		result = ISC_R_UNEXPECTED;
2922 	} else {
2923 		if (addrlen == 0U) {
2924 			UNEXPECTED_ERROR(__FILE__, __LINE__,
2925 					 "internal_accept(): "
2926 					 "accept() failed to return "
2927 					 "remote address");
2928 
2929 			(void)close(fd);
2930 			goto soft_error;
2931 		} else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
2932 			   sock->pf)
2933 		{
2934 			UNEXPECTED_ERROR(
2935 				__FILE__, __LINE__,
2936 				"internal_accept(): "
2937 				"accept() returned peer address "
2938 				"family %u (expected %u)",
2939 				NEWCONNSOCK(dev)->peer_address.type.sa.sa_family,
2940 				sock->pf);
2941 			(void)close(fd);
2942 			goto soft_error;
2943 		} else if (fd >= (int)manager->maxsocks) {
2944 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
2945 				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
2946 				      "accept: file descriptor exceeds limit "
2947 				      "(%d/%u)",
2948 				      fd, manager->maxsocks);
2949 			(void)close(fd);
2950 			goto soft_error;
2951 		}
2952 	}
2953 
2954 	if (fd != -1) {
2955 		NEWCONNSOCK(dev)->peer_address.length = addrlen;
2956 		NEWCONNSOCK(dev)->pf = sock->pf;
2957 	}
2958 
2959 	/*
2960 	 * Pull off the done event.
2961 	 */
2962 	ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);
2963 
2964 	/*
2965 	 * Poke watcher if there are more pending accepts.
2966 	 */
2967 	if (ISC_LIST_EMPTY(sock->accept_list)) {
2968 		unwatch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
2969 	}
2970 
2971 	if (fd != -1) {
2972 		result = make_nonblock(fd);
2973 		if (result != ISC_R_SUCCESS) {
2974 			(void)close(fd);
2975 			fd = -1;
2976 		}
2977 	}
2978 
2979 	/*
2980 	 * We need to unlock sock->lock now to be able to lock manager->lock
2981 	 * without risking a deadlock with xmlstats.
2982 	 */
2983 	UNLOCK(&sock->lock);
2984 
2985 	/*
2986 	 * -1 means the new socket didn't happen.
2987 	 */
2988 	if (fd != -1) {
2989 		int lockid = FDLOCK_ID(fd);
2990 
2991 		NEWCONNSOCK(dev)->fd = fd;
2992 		NEWCONNSOCK(dev)->threadid = gen_threadid(NEWCONNSOCK(dev));
2993 		NEWCONNSOCK(dev)->bound = 1;
2994 		NEWCONNSOCK(dev)->connected = 1;
2995 		nthread = &manager->threads[NEWCONNSOCK(dev)->threadid];
2996 
2997 		/*
2998 		 * We already hold a lock on one fdlock in accepting thread,
2999 		 * we need to make sure that we don't double lock.
3000 		 */
3001 		bool same_bucket = (sock->threadid ==
3002 				    NEWCONNSOCK(dev)->threadid) &&
3003 				   (FDLOCK_ID(sock->fd) == lockid);
3004 
3005 		/*
3006 		 * Use minimum mtu if possible.
3007 		 */
3008 		use_min_mtu(NEWCONNSOCK(dev));
3009 		set_tcp_maxseg(NEWCONNSOCK(dev), 1280 - 20 - 40);
3010 
3011 		/*
3012 		 * Ensure DSCP settings are inherited across accept.
3013 		 */
3014 		setdscp(NEWCONNSOCK(dev), sock->dscp);
3015 
3016 		/*
3017 		 * Save away the remote address
3018 		 */
3019 		dev->address = NEWCONNSOCK(dev)->peer_address;
3020 
3021 		if (NEWCONNSOCK(dev)->active == 0) {
3022 			inc_stats(manager->stats,
3023 				  NEWCONNSOCK(dev)->statsindex[STATID_ACTIVE]);
3024 			NEWCONNSOCK(dev)->active = 1;
3025 		}
3026 
3027 		if (!same_bucket) {
3028 			LOCK(&nthread->fdlock[lockid]);
3029 		}
3030 		nthread->fds[fd] = NEWCONNSOCK(dev);
3031 		nthread->fdstate[fd] = MANAGED;
3032 #if defined(USE_EPOLL)
3033 		nthread->epoll_events[fd] = 0;
3034 #endif /* if defined(USE_EPOLL) */
3035 		if (!same_bucket) {
3036 			UNLOCK(&nthread->fdlock[lockid]);
3037 		}
3038 
3039 		LOCK(&manager->lock);
3040 
3041 #ifdef USE_SELECT
3042 		if (nthread->maxfd < fd) {
3043 			nthread->maxfd = fd;
3044 		}
3045 #endif /* ifdef USE_SELECT */
3046 
3047 		socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
3048 			   "accepted connection, new socket %p",
3049 			   dev->newsocket);
3050 
3051 		ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);
3052 
3053 		UNLOCK(&manager->lock);
3054 
3055 		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
3056 	} else {
3057 		inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3058 		isc_refcount_decrementz(&NEWCONNSOCK(dev)->references);
3059 		free_socket((isc_socket_t **)&dev->newsocket);
3060 	}
3061 
3062 	/*
3063 	 * Fill in the done event details and send it off.
3064 	 */
3065 	dev->result = result;
3066 	task = dev->ev_sender;
3067 	dev->ev_sender = sock;
3068 
3069 	isc_task_sendtoanddetach(&task, ISC_EVENT_PTR(&dev), sock->threadid);
3070 	return;
3071 
3072 soft_error:
3073 	watch_fd(thread, sock->fd, SELECT_POKE_ACCEPT);
3074 	UNLOCK(&sock->lock);
3075 
3076 	inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
3077 	return;
3078 }
3079 
3080 static void
internal_recv(isc_socket_t * sock)3081 internal_recv(isc_socket_t *sock) {
3082 	isc_socketevent_t *dev;
3083 
3084 	INSIST(VALID_SOCKET(sock));
3085 	REQUIRE(sock->fd >= 0);
3086 
3087 	dev = ISC_LIST_HEAD(sock->recv_list);
3088 	if (dev == NULL) {
3089 		goto finish;
3090 	}
3091 
3092 	socket_log(sock, NULL, IOEVENT, "internal_recv: event %p -> task %p",
3093 		   dev, dev->ev_sender);
3094 
3095 	/*
3096 	 * Try to do as much I/O as possible on this socket.  There are no
3097 	 * limits here, currently.
3098 	 */
3099 	while (dev != NULL) {
3100 		switch (doio_recv(sock, dev)) {
3101 		case DOIO_SOFT:
3102 			goto finish;
3103 
3104 		case DOIO_EOF:
3105 			/*
3106 			 * read of 0 means the remote end was closed.
3107 			 * Run through the event queue and dispatch all
3108 			 * the events with an EOF result code.
3109 			 */
3110 			do {
3111 				dev->result = ISC_R_EOF;
3112 				send_recvdone_event(sock, &dev);
3113 				dev = ISC_LIST_HEAD(sock->recv_list);
3114 			} while (dev != NULL);
3115 			goto finish;
3116 
3117 		case DOIO_SUCCESS:
3118 		case DOIO_HARD:
3119 			send_recvdone_event(sock, &dev);
3120 			break;
3121 		}
3122 
3123 		dev = ISC_LIST_HEAD(sock->recv_list);
3124 	}
3125 
3126 finish:
3127 	if (ISC_LIST_EMPTY(sock->recv_list)) {
3128 		unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
3129 			   SELECT_POKE_READ);
3130 	}
3131 }
3132 
3133 static void
internal_send(isc_socket_t * sock)3134 internal_send(isc_socket_t *sock) {
3135 	isc_socketevent_t *dev;
3136 
3137 	INSIST(VALID_SOCKET(sock));
3138 	REQUIRE(sock->fd >= 0);
3139 
3140 	dev = ISC_LIST_HEAD(sock->send_list);
3141 	if (dev == NULL) {
3142 		goto finish;
3143 	}
3144 	socket_log(sock, NULL, EVENT, "internal_send: event %p -> task %p", dev,
3145 		   dev->ev_sender);
3146 
3147 	/*
3148 	 * Try to do as much I/O as possible on this socket.  There are no
3149 	 * limits here, currently.
3150 	 */
3151 	while (dev != NULL) {
3152 		switch (doio_send(sock, dev)) {
3153 		case DOIO_SOFT:
3154 			goto finish;
3155 
3156 		case DOIO_HARD:
3157 		case DOIO_SUCCESS:
3158 			send_senddone_event(sock, &dev);
3159 			break;
3160 		}
3161 
3162 		dev = ISC_LIST_HEAD(sock->send_list);
3163 	}
3164 
3165 finish:
3166 	if (ISC_LIST_EMPTY(sock->send_list)) {
3167 		unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
3168 			   SELECT_POKE_WRITE);
3169 	}
3170 }
3171 
3172 static void
internal_fdwatch_write(isc_socket_t * sock)3173 internal_fdwatch_write(isc_socket_t *sock)
3174 {
3175 	int more_data;
3176 
3177 	INSIST(VALID_SOCKET(sock));
3178 
3179 	isc_refcount_increment(&sock->references);
3180 	UNLOCK(&sock->lock);
3181 
3182 	more_data = (sock->fdwatchcb)(sock->fdwatchtask, (isc_socket_t *)sock,
3183 				      sock->fdwatcharg, ISC_SOCKFDWATCH_WRITE);
3184 
3185 	LOCK(&sock->lock);
3186 
3187 	if (isc_refcount_decrement(&sock->references) == 0) {
3188 		UNLOCK(&sock->lock);
3189 		destroy(&sock);
3190 		return;
3191 	}
3192 
3193 	if (more_data)
3194 		select_poke(sock->manager, sock->threadid, sock->fd,
3195 		    SELECT_POKE_WRITE);
3196 }
3197 
3198 static void
internal_fdwatch_read(isc_socket_t * sock)3199 internal_fdwatch_read(isc_socket_t *sock)
3200 {
3201 	int more_data;
3202 
3203 	INSIST(VALID_SOCKET(sock));
3204 
3205 	isc_refcount_increment(&sock->references);
3206 	UNLOCK(&sock->lock);
3207 
3208 	more_data = (sock->fdwatchcb)(sock->fdwatchtask, (isc_socket_t *)sock,
3209 				      sock->fdwatcharg, ISC_SOCKFDWATCH_READ);
3210 
3211 	LOCK(&sock->lock);
3212 
3213 	if (isc_refcount_decrement(&sock->references) == 0) {
3214 		UNLOCK(&sock->lock);
3215 		destroy(&sock);
3216 		return;
3217 	}
3218 
3219 	if (more_data)
3220 		select_poke(sock->manager, sock->threadid, sock->fd,
3221 		    SELECT_POKE_READ);
3222 }
3223 
3224 /*
3225  * Process read/writes on each fd here.  Avoid locking
3226  * and unlocking twice if both reads and writes are possible.
3227  */
static void
process_fd(isc__socketthread_t *thread, int fd, bool readable, bool writeable) {
	isc_socket_t *sock;
	int lockid = FDLOCK_ID(fd);

	/*
	 * If the socket is going to be closed, don't do more I/O.
	 */
	LOCK(&thread->fdlock[lockid]);
	if (thread->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&thread->fdlock[lockid]);

		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
		return;
	}

	sock = thread->fds[fd];
	if (sock == NULL) {
		/* No socket registered for this fd; nothing to do. */
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	LOCK(&sock->lock);

	if (sock->fd < 0) {
		/*
		 * Sock is being closed - the final external reference
		 * is gone but it was not yet removed from event loop
		 * and fdstate[]/fds[] as destroy() is waiting on
		 * thread->fdlock[lockid] or sock->lock that we're holding.
		 * Just release the locks and bail.
		 */
		UNLOCK(&sock->lock);
		UNLOCK(&thread->fdlock[lockid]);
		return;
	}

	REQUIRE(readable || writeable);
	if (writeable) {
		if (sock->connecting) {
			/* Socket is mid-connect; writability signals the
			 * outcome of the connect attempt. */
			internal_connect(sock);
		} else {
			dispatch_send(sock);
		}
	}

	if (readable) {
		if (sock->listener) {
			internal_accept(sock); /* unlocks sock */
		} else {
			dispatch_recv(sock);
			UNLOCK(&sock->lock);
		}
	} else {
		/* Write-only event: nothing above released sock->lock. */
		UNLOCK(&sock->lock);
	}

	UNLOCK(&thread->fdlock[lockid]);

	/*
	 * Socket destruction might be pending, it will resume
	 * after releasing fdlock and sock->lock.
	 */
}
3293 
3294 /*
3295  * process_fds is different for different event loops
3296  * it takes the events from event loops and for each FD
3297  * launches process_fd
3298  */
3299 #ifdef USE_KQUEUE
3300 static bool
process_fds(isc__socketthread_t * thread,struct kevent * events,int nevents)3301 process_fds(isc__socketthread_t *thread, struct kevent *events, int nevents) {
3302 	int i;
3303 	bool readable, writable;
3304 	bool done = false;
3305 	bool have_ctlevent = false;
3306 	if (nevents == thread->nevents) {
3307 		/*
3308 		 * This is not an error, but something unexpected.  If this
3309 		 * happens, it may indicate the need for increasing
3310 		 * ISC_SOCKET_MAXEVENTS.
3311 		 */
3312 		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
3313 			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3314 			   "maximum number of FD events (%d) received",
3315 			   nevents);
3316 	}
3317 
3318 	for (i = 0; i < nevents; i++) {
3319 		REQUIRE(events[i].ident < thread->manager->maxsocks);
3320 		if (events[i].ident == (uintptr_t)thread->pipe_fds[0]) {
3321 			have_ctlevent = true;
3322 			continue;
3323 		}
3324 		readable = (events[i].filter == EVFILT_READ);
3325 		writable = (events[i].filter == EVFILT_WRITE);
3326 		process_fd(thread, events[i].ident, readable, writable);
3327 	}
3328 
3329 	if (have_ctlevent) {
3330 		done = process_ctlfd(thread);
3331 	}
3332 
3333 	return (done);
3334 }
3335 #elif defined(USE_EPOLL)
/*
 * epoll flavor: dispatch each reported fd to process_fd().
 * Returns true when a shutdown message arrived on the control pipe.
 */
static bool
process_fds(isc__socketthread_t *thread, struct epoll_event *events,
	    int nevents) {
	int i;
	bool done = false;
	bool have_ctlevent = false;

	if (nevents == thread->nevents) {
		/*
		 * Event buffer came back completely full; may indicate
		 * ISC_SOCKET_MAXEVENTS should be increased.
		 */
		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
			   "maximum number of FD events (%d) received",
			   nevents);
	}

	for (i = 0; i < nevents; i++) {
		REQUIRE(events[i].data.fd < (int)thread->manager->maxsocks);
		if (events[i].data.fd == thread->pipe_fds[0]) {
			/* Control pipe; handled after the socket events. */
			have_ctlevent = true;
			continue;
		}
		if ((events[i].events & EPOLLERR) != 0 ||
		    (events[i].events & EPOLLHUP) != 0)
		{
			/*
			 * epoll does not set IN/OUT bits on an erroneous
			 * condition, so we need to try both anyway.  This is a
			 * bit inefficient, but should be okay for such rare
			 * events.  Note also that the read or write attempt
			 * won't block because we use non-blocking sockets.
			 */
			int fd = events[i].data.fd;
			events[i].events |= thread->epoll_events[fd];
		}
		process_fd(thread, events[i].data.fd,
			   (events[i].events & EPOLLIN) != 0,
			   (events[i].events & EPOLLOUT) != 0);
	}

	if (have_ctlevent) {
		done = process_ctlfd(thread);
	}

	return (done);
}
3380 #elif defined(USE_DEVPOLL)
3381 static bool
process_fds(isc__socketthread_t * thread,struct pollfd * events,int nevents)3382 process_fds(isc__socketthread_t *thread, struct pollfd *events, int nevents) {
3383 	int i;
3384 	bool done = false;
3385 	bool have_ctlevent = false;
3386 
3387 	if (nevents == thread->nevents) {
3388 		thread_log(thread, ISC_LOGCATEGORY_GENERAL,
3389 			   ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
3390 			   "maximum number of FD events (%d) received",
3391 			   nevents);
3392 	}
3393 
3394 	for (i = 0; i < nevents; i++) {
3395 		REQUIRE(events[i].fd < (int)thread->manager->maxsocks);
3396 		if (events[i].fd == thread->pipe_fds[0]) {
3397 			have_ctlevent = true;
3398 			continue;
3399 		}
3400 		process_fd(thread, events[i].fd,
3401 			   (events[i].events & POLLIN) != 0,
3402 			   (events[i].events & POLLOUT) != 0);
3403 	}
3404 
3405 	if (have_ctlevent) {
3406 		done = process_ctlfd(thread);
3407 	}
3408 
3409 	return (done);
3410 }
3411 #elif defined(USE_SELECT)
3412 static void
process_fds(isc__socketthread_t * thread,int maxfd,fd_set * readfds,fd_set * writefds)3413 process_fds(isc__socketthread_t *thread, int maxfd, fd_set *readfds,
3414 	    fd_set *writefds) {
3415 	int i;
3416 
3417 	REQUIRE(maxfd <= (int)thread->manager->maxsocks);
3418 
3419 	for (i = 0; i < maxfd; i++) {
3420 		if (i == thread->pipe_fds[0] || i == thread->pipe_fds[1]) {
3421 			continue;
3422 		}
3423 		process_fd(thread, i, FD_ISSET(i, readfds),
3424 			   FD_ISSET(i, writefds));
3425 	}
3426 }
3427 #endif /* ifdef USE_KQUEUE */
3428 
3429 static bool
process_ctlfd(isc__socketthread_t * thread)3430 process_ctlfd(isc__socketthread_t *thread) {
3431 	int msg, fd;
3432 
3433 	for (;;) {
3434 		select_readmsg(thread, &fd, &msg);
3435 
3436 		thread_log(thread, IOEVENT,
3437 			   "watcher got message %d for socket %d", msg, fd);
3438 
3439 		/*
3440 		 * Nothing to read?
3441 		 */
3442 		if (msg == SELECT_POKE_NOTHING) {
3443 			break;
3444 		}
3445 
3446 		/*
3447 		 * Handle shutdown message.  We really should
3448 		 * jump out of this loop right away, but
3449 		 * it doesn't matter if we have to do a little
3450 		 * more work first.
3451 		 */
3452 		if (msg == SELECT_POKE_SHUTDOWN) {
3453 			return (true);
3454 		}
3455 
3456 		/*
3457 		 * This is a wakeup on a socket.  Look
3458 		 * at the event queue for both read and write,
3459 		 * and decide if we need to watch on it now
3460 		 * or not.
3461 		 */
3462 		wakeup_socket(thread, fd, msg);
3463 	}
3464 
3465 	return (false);
3466 }
3467 
3468 /*
3469  * This is the thread that will loop forever, always in a select or poll
3470  * call.
3471  *
3472  * When select returns something to do, do whatever's necessary and post
3473  * an event to the task that was requesting the action.
3474  */
static isc_threadresult_t
netthread(void *uap) {
	isc__socketthread_t *thread = uap;
	isc_socketmgr_t *manager = thread->manager;
	(void)manager;
	bool done;
	int cc;
#ifdef USE_KQUEUE
	const char *fnname = "kevent()";
#elif defined(USE_EPOLL)
	const char *fnname = "epoll_wait()";
#elif defined(USE_DEVPOLL)
	isc_result_t result;
	const char *fnname = "ioctl(DP_POLL)";
	struct dvpoll dvp;
	int pass;
#if defined(ISC_SOCKET_USE_POLLWATCH)
	pollstate_t pollstate = poll_idle;
#endif /* if defined(ISC_SOCKET_USE_POLLWATCH) */
#elif defined(USE_SELECT)
	const char *fnname = "select()";
	int maxfd;
	int ctlfd;
#endif /* ifdef USE_KQUEUE */
	char strbuf[ISC_STRERRORSIZE];

#if defined(USE_SELECT)
	/*
	 * Get the control fd here.  This will never change.
	 */
	ctlfd = thread->pipe_fds[0];
#endif /* if defined(USE_SELECT) */
	done = false;
	while (!done) {
		/* Block in the poll primitive until at least one event. */
		do {
#ifdef USE_KQUEUE
			cc = kevent(thread->kqueue_fd, NULL, 0, thread->events,
				    thread->nevents, NULL);
#elif defined(USE_EPOLL)
			cc = epoll_wait(thread->epoll_fd, thread->events,
					thread->nevents, -1);
#elif defined(USE_DEVPOLL)
			/*
			 * Re-probe every thousand calls.
			 */
			if (thread->calls++ > 1000U) {
				result = isc_resource_getcurlimit(
					isc_resource_openfiles,
					&thread->open_max);
				if (result != ISC_R_SUCCESS) {
					thread->open_max = 64;
				}
				thread->calls = 0;
			}
			for (pass = 0; pass < 2; pass++) {
				dvp.dp_fds = thread->events;
				dvp.dp_nfds = thread->nevents;
				if (dvp.dp_nfds >= thread->open_max) {
					dvp.dp_nfds = thread->open_max - 1;
				}
#ifndef ISC_SOCKET_USE_POLLWATCH
				dvp.dp_timeout = -1;
#else  /* ifndef ISC_SOCKET_USE_POLLWATCH */
				if (pollstate == poll_idle) {
					dvp.dp_timeout = -1;
				} else {
					dvp.dp_timeout =
						ISC_SOCKET_POLLWATCH_TIMEOUT;
				}
#endif /* ISC_SOCKET_USE_POLLWATCH */
				cc = ioctl(thread->devpoll_fd, DP_POLL, &dvp);
				if (cc == -1 && errno == EINVAL) {
					/*
					 * {OPEN_MAX} may have dropped.  Look
					 * up the current value and try again.
					 */
					result = isc_resource_getcurlimit(
						isc_resource_openfiles,
						&thread->open_max);
					if (result != ISC_R_SUCCESS) {
						thread->open_max = 64;
					}
				} else {
					break;
				}
			}
#elif defined(USE_SELECT)
			/*
			 * We will have only one thread anyway, we can lock
			 * manager lock and don't care
			 */
			LOCK(&manager->lock);
			memmove(thread->read_fds_copy, thread->read_fds,
				thread->fd_bufsize);
			memmove(thread->write_fds_copy, thread->write_fds,
				thread->fd_bufsize);
			maxfd = thread->maxfd + 1;
			UNLOCK(&manager->lock);

			cc = select(maxfd, thread->read_fds_copy,
				    thread->write_fds_copy, NULL, NULL);
#endif /* USE_KQUEUE */

			/* Any failure other than EINTR-style soft errors
			 * is fatal for the watcher thread. */
			if (cc < 0 && !SOFT_ERROR(errno)) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				FATAL_ERROR(__FILE__, __LINE__, "%s failed: %s",
					    fnname, strbuf);
			}

#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
			if (cc == 0) {
				if (pollstate == poll_active) {
					pollstate = poll_checking;
				} else if (pollstate == poll_checking) {
					pollstate = poll_idle;
				}
			} else if (cc > 0) {
				if (pollstate == poll_checking) {
					/*
					 * XXX: We'd like to use a more
					 * verbose log level as it's actually an
					 * unexpected event, but the kernel bug
					 * reportedly happens pretty frequently
					 * (and it can also be a false positive)
					 * so it would be just too noisy.
					 */
					thread_log(thread,
						   ISC_LOGCATEGORY_GENERAL,
						   ISC_LOGMODULE_SOCKET,
						   ISC_LOG_DEBUG(1),
						   "unexpected POLL timeout");
				}
				pollstate = poll_active;
			}
#endif /* if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) */
		} while (cc < 0);

#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
		done = process_fds(thread, thread->events, cc);
#elif defined(USE_SELECT)
		process_fds(thread, maxfd, thread->read_fds_copy,
			    thread->write_fds_copy);

		/*
		 * Process reads on internal, control fd.
		 */
		if (FD_ISSET(ctlfd, thread->read_fds_copy)) {
			done = process_ctlfd(thread);
		}
#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
	* */
	}

	thread_log(thread, TRACE, "watcher exiting");
	return ((isc_threadresult_t)0);
}
3631 
/*
 * Set the number of fds the manager keeps reserved (not available for
 * general socket use).  NOTE(review): no lock is taken here; presumably
 * this is called before the manager is in active use — confirm.
 */
void
isc_socketmgr_setreserved(isc_socketmgr_t *manager, uint32_t reserved) {
	REQUIRE(VALID_MANAGER(manager));

	manager->reserved = reserved;
}
3638 
/*
 * Set the manager's maximum UDP size knob.  NOTE(review): no lock is
 * taken here; presumably called before the manager is in active use —
 * confirm.
 */
void
isc_socketmgr_maxudp(isc_socketmgr_t *manager, unsigned int maxudp) {
	REQUIRE(VALID_MANAGER(manager));

	manager->maxudp = maxudp;
}
3645 
3646 /*
3647  * Setup socket thread, thread->manager and thread->threadid must be filled.
3648  */
3649 
3650 static isc_result_t
setup_thread(isc__socketthread_t * thread)3651 setup_thread(isc__socketthread_t *thread) {
3652 	isc_result_t result = ISC_R_SUCCESS;
3653 	int i;
3654 	char strbuf[ISC_STRERRORSIZE];
3655 
3656 	REQUIRE(thread != NULL);
3657 	REQUIRE(VALID_MANAGER(thread->manager));
3658 	REQUIRE(thread->threadid >= 0 &&
3659 		thread->threadid < thread->manager->nthreads);
3660 
3661 	thread->fds =
3662 		isc_mem_get(thread->manager->mctx,
3663 			    thread->manager->maxsocks * sizeof(isc_socket_t *));
3664 
3665 	memset(thread->fds, 0,
3666 	       thread->manager->maxsocks * sizeof(isc_socket_t *));
3667 
3668 	thread->fdstate = isc_mem_get(thread->manager->mctx,
3669 				      thread->manager->maxsocks * sizeof(int));
3670 
3671 	memset(thread->fdstate, 0, thread->manager->maxsocks * sizeof(int));
3672 
3673 	thread->fdlock = isc_mem_get(thread->manager->mctx,
3674 				     FDLOCK_COUNT * sizeof(isc_mutex_t));
3675 
3676 	for (i = 0; i < FDLOCK_COUNT; i++) {
3677 		isc_mutex_init(&thread->fdlock[i]);
3678 	}
3679 
3680 	if (pipe(thread->pipe_fds) != 0) {
3681 		strerror_r(errno, strbuf, sizeof(strbuf));
3682 		UNEXPECTED_ERROR(__FILE__, __LINE__, "pipe() failed: %s",
3683 				 strbuf);
3684 		return (ISC_R_UNEXPECTED);
3685 	}
3686 	RUNTIME_CHECK(make_nonblock(thread->pipe_fds[0]) == ISC_R_SUCCESS);
3687 
3688 #ifdef USE_KQUEUE
3689 	thread->nevents = ISC_SOCKET_MAXEVENTS;
3690 	thread->events = isc_mem_get(thread->manager->mctx,
3691 				     sizeof(struct kevent) * thread->nevents);
3692 
3693 	thread->kqueue_fd = kqueue();
3694 	if (thread->kqueue_fd == -1) {
3695 		result = isc__errno2result(errno);
3696 		strerror_r(errno, strbuf, sizeof(strbuf));
3697 		UNEXPECTED_ERROR(__FILE__, __LINE__, "kqueue failed: %s",
3698 				 strbuf);
3699 		isc_mem_put(thread->manager->mctx, thread->events,
3700 			    sizeof(struct kevent) * thread->nevents);
3701 		return (result);
3702 	}
3703 
3704 	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3705 	if (result != ISC_R_SUCCESS) {
3706 		close(thread->kqueue_fd);
3707 		isc_mem_put(thread->manager->mctx, thread->events,
3708 			    sizeof(struct kevent) * thread->nevents);
3709 	}
3710 	return (result);
3711 
3712 #elif defined(USE_EPOLL)
3713 	thread->nevents = ISC_SOCKET_MAXEVENTS;
3714 	thread->epoll_events =
3715 		isc_mem_get(thread->manager->mctx,
3716 			    (thread->manager->maxsocks * sizeof(uint32_t)));
3717 
3718 	memset(thread->epoll_events, 0,
3719 	       thread->manager->maxsocks * sizeof(uint32_t));
3720 
3721 	thread->events =
3722 		isc_mem_get(thread->manager->mctx,
3723 			    sizeof(struct epoll_event) * thread->nevents);
3724 
3725 	thread->epoll_fd = epoll_create(thread->nevents);
3726 	if (thread->epoll_fd == -1) {
3727 		result = isc__errno2result(errno);
3728 		strerror_r(errno, strbuf, sizeof(strbuf));
3729 		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_create failed: %s",
3730 				 strbuf);
3731 		return (result);
3732 	}
3733 
3734 	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3735 	return (result);
3736 
3737 #elif defined(USE_DEVPOLL)
3738 	thread->nevents = ISC_SOCKET_MAXEVENTS;
3739 	result = isc_resource_getcurlimit(isc_resource_openfiles,
3740 					  &thread->open_max);
3741 	if (result != ISC_R_SUCCESS) {
3742 		thread->open_max = 64;
3743 	}
3744 	thread->calls = 0;
3745 	thread->events = isc_mem_get(thread->manager->mctx,
3746 				     sizeof(struct pollfd) * thread->nevents);
3747 
3748 	/*
3749 	 * Note: fdpollinfo should be able to support all possible FDs, so
3750 	 * it must have maxsocks entries (not nevents).
3751 	 */
3752 	thread->fdpollinfo =
3753 		isc_mem_get(thread->manager->mctx,
3754 			    sizeof(pollinfo_t) * thread->manager->maxsocks);
3755 	memset(thread->fdpollinfo, 0,
3756 	       sizeof(pollinfo_t) * thread->manager->maxsocks);
3757 	thread->devpoll_fd = open("/dev/poll", O_RDWR);
3758 	if (thread->devpoll_fd == -1) {
3759 		result = isc__errno2result(errno);
3760 		strerror_r(errno, strbuf, sizeof(strbuf));
3761 		UNEXPECTED_ERROR(__FILE__, __LINE__,
3762 				 "open(/dev/poll) failed: %s", strbuf);
3763 		isc_mem_put(thread->manager->mctx, thread->events,
3764 			    sizeof(struct pollfd) * thread->nevents);
3765 		isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3766 			    sizeof(pollinfo_t) * thread->manager->maxsocks);
3767 		return (result);
3768 	}
3769 	result = watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3770 	if (result != ISC_R_SUCCESS) {
3771 		close(thread->devpoll_fd);
3772 		isc_mem_put(thread->manager->mctx, thread->events,
3773 			    sizeof(struct pollfd) * thread->nevents);
3774 		isc_mem_put(thread->manager->mctx, thread->fdpollinfo,
3775 			    sizeof(pollinfo_t) * thread->manager->maxsocks);
3776 		return (result);
3777 	}
3778 
3779 	return (ISC_R_SUCCESS);
3780 #elif defined(USE_SELECT)
3781 	UNUSED(result);
3782 
3783 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
3784 	/*
3785 	 * Note: this code should also cover the case of MAXSOCKETS <=
3786 	 * FD_SETSIZE, but we separate the cases to avoid possible portability
3787 	 * issues regarding howmany() and the actual representation of fd_set.
3788 	 */
3789 	thread->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
3790 			     sizeof(fd_mask);
3791 #else  /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3792 	thread->fd_bufsize = sizeof(fd_set);
3793 #endif /* if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
3794 
3795 	thread->read_fds = isc_mem_get(thread->manager->mctx,
3796 				       thread->fd_bufsize);
3797 	thread->read_fds_copy = isc_mem_get(thread->manager->mctx,
3798 					    thread->fd_bufsize);
3799 	thread->write_fds = isc_mem_get(thread->manager->mctx,
3800 					thread->fd_bufsize);
3801 	thread->write_fds_copy = isc_mem_get(thread->manager->mctx,
3802 					     thread->fd_bufsize);
3803 	memset(thread->read_fds, 0, thread->fd_bufsize);
3804 	memset(thread->write_fds, 0, thread->fd_bufsize);
3805 
3806 	(void)watch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
3807 	thread->maxfd = thread->pipe_fds[0];
3808 
3809 	return (ISC_R_SUCCESS);
3810 #endif /* USE_KQUEUE */
3811 }
3812 
/*
 * Tear down one watcher thread's state after its thread has exited:
 * unwatch the control pipe, close the polling object, free the event
 * buffers and fd tables, and destroy the fd locks.
 */
static void
cleanup_thread(isc_mem_t *mctx, isc__socketthread_t *thread) {
	isc_result_t result;
	int i;

	result = unwatch_fd(thread, thread->pipe_fds[0], SELECT_POKE_READ);
	if (result != ISC_R_SUCCESS) {
		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL) failed");
	}
#ifdef USE_KQUEUE
	close(thread->kqueue_fd);
	isc_mem_put(mctx, thread->events,
		    sizeof(struct kevent) * thread->nevents);
#elif defined(USE_EPOLL)
	close(thread->epoll_fd);

	isc_mem_put(mctx, thread->events,
		    sizeof(struct epoll_event) * thread->nevents);
#elif defined(USE_DEVPOLL)
	close(thread->devpoll_fd);
	isc_mem_put(mctx, thread->events,
		    sizeof(struct pollfd) * thread->nevents);
	isc_mem_put(mctx, thread->fdpollinfo,
		    sizeof(pollinfo_t) * thread->manager->maxsocks);
#elif defined(USE_SELECT)
	if (thread->read_fds != NULL) {
		isc_mem_put(mctx, thread->read_fds, thread->fd_bufsize);
	}
	if (thread->read_fds_copy != NULL) {
		isc_mem_put(mctx, thread->read_fds_copy, thread->fd_bufsize);
	}
	if (thread->write_fds != NULL) {
		isc_mem_put(mctx, thread->write_fds, thread->fd_bufsize);
	}
	if (thread->write_fds_copy != NULL) {
		isc_mem_put(mctx, thread->write_fds_copy, thread->fd_bufsize);
	}
#endif /* USE_KQUEUE */
	/* Close any fds still marked close-pending when the thread exited. */
	for (i = 0; i < (int)thread->manager->maxsocks; i++) {
		if (thread->fdstate[i] == CLOSE_PENDING) {
			/* no need to lock */
			(void)close(i);
		}
	}

#if defined(USE_EPOLL)
	isc_mem_put(thread->manager->mctx, thread->epoll_events,
		    thread->manager->maxsocks * sizeof(uint32_t));
#endif /* if defined(USE_EPOLL) */
	isc_mem_put(thread->manager->mctx, thread->fds,
		    thread->manager->maxsocks * sizeof(isc_socket_t *));
	isc_mem_put(thread->manager->mctx, thread->fdstate,
		    thread->manager->maxsocks * sizeof(int));

	for (i = 0; i < FDLOCK_COUNT; i++) {
		isc_mutex_destroy(&thread->fdlock[i]);
	}
	isc_mem_put(thread->manager->mctx, thread->fdlock,
		    FDLOCK_COUNT * sizeof(isc_mutex_t));
}
3873 
/*
 * Create a socket manager with default limits (maxsocks=0 selects the
 * default) and a single watcher thread.  See isc_socketmgr_create2().
 */
isc_result_t
isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
	return (isc_socketmgr_create2(mctx, managerp, 0, 1));
}
3878 
3879 isc_result_t
isc_socketmgr_create2(isc_mem_t * mctx,isc_socketmgr_t ** managerp,unsigned int maxsocks,int nthreads)3880 isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
3881 		      unsigned int maxsocks, int nthreads) {
3882 	int i;
3883 	isc_socketmgr_t *manager;
3884 
3885 	REQUIRE(managerp != NULL && *managerp == NULL);
3886 
3887 	if (maxsocks == 0) {
3888 		maxsocks = ISC_SOCKET_MAXSOCKETS;
3889 	}
3890 
3891 	manager = isc_mem_get(mctx, sizeof(*manager));
3892 
3893 	/* zero-clear so that necessary cleanup on failure will be easy */
3894 	memset(manager, 0, sizeof(*manager));
3895 	manager->maxsocks = maxsocks;
3896 	manager->reserved = 0;
3897 	manager->maxudp = 0;
3898 	manager->nthreads = nthreads;
3899 	manager->stats = NULL;
3900 
3901 	manager->magic = SOCKET_MANAGER_MAGIC;
3902 	manager->mctx = NULL;
3903 	ISC_LIST_INIT(manager->socklist);
3904 	isc_mutex_init(&manager->lock);
3905 	isc_condition_init(&manager->shutdown_ok);
3906 
3907 	/*
3908 	 * Start up the select/poll thread.
3909 	 */
3910 	manager->threads = isc_mem_get(mctx, sizeof(isc__socketthread_t) *
3911 						     manager->nthreads);
3912 	isc_mem_attach(mctx, &manager->mctx);
3913 
3914 	for (i = 0; i < manager->nthreads; i++) {
3915 		manager->threads[i].manager = manager;
3916 		manager->threads[i].threadid = i;
3917 		setup_thread(&manager->threads[i]);
3918 		isc_thread_create(netthread, &manager->threads[i],
3919 				  &manager->threads[i].thread);
3920 		char tname[1024];
3921 		sprintf(tname, "sock-%d", i);
3922 		isc_thread_setname(manager->threads[i].thread, tname);
3923 	}
3924 
3925 	*managerp = manager;
3926 
3927 	return (ISC_R_SUCCESS);
3928 }
3929 
/*
 * Report the manager's socket limit through '*nsockp'.
 * Always returns ISC_R_SUCCESS.
 */
isc_result_t
isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(nsockp != NULL);

	*nsockp = manager->maxsocks;

	return (ISC_R_SUCCESS);
}
3939 
/*
 * Attach a statistics counter set to the manager.  May only be called
 * once, before any sockets exist, with a set sized for socket stats.
 */
void
isc_socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
	REQUIRE(manager->stats == NULL);
	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);

	isc_stats_attach(stats, &manager->stats);
}
3949 
void
isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
	isc_socketmgr_t *manager;

	/*
	 * Destroy a socket manager.
	 *
	 * Blocks until every socket owned by the manager has been
	 * destroyed, then shuts down and joins all watcher threads and
	 * frees all manager resources.  '*managerp' is set to NULL.
	 * The ordering below is deliberate: drain sockets first, then
	 * poke threads, then join, then free thread state.
	 */

	REQUIRE(managerp != NULL);
	manager = *managerp;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&manager->lock);

	/*
	 * Wait for all sockets to be destroyed.  destroy() signals
	 * shutdown_ok each time the socket list shrinks to empty.
	 */
	while (!ISC_LIST_EMPTY(manager->socklist)) {
		manager_log(manager, CREATION, "sockets exist");
		WAIT(&manager->shutdown_ok, &manager->lock);
	}

	UNLOCK(&manager->lock);

	/*
	 * Here, poke our select/poll thread.  Do this by closing the write
	 * half of the pipe, which will send EOF to the read half.
	 * This is currently a no-op in the non-threaded case.
	 */
	for (int i = 0; i < manager->nthreads; i++) {
		select_poke(manager, i, 0, SELECT_POKE_SHUTDOWN);
	}

	/*
	 * Wait for thread to exit.
	 */
	for (int i = 0; i < manager->nthreads; i++) {
		isc_thread_join(manager->threads[i].thread, NULL);
		cleanup_thread(manager->mctx, &manager->threads[i]);
	}
	/*
	 * Clean up.
	 */
	isc_mem_put(manager->mctx, manager->threads,
		    sizeof(isc__socketthread_t) * manager->nthreads);
	(void)isc_condition_destroy(&manager->shutdown_ok);

	if (manager->stats != NULL) {
		isc_stats_detach(&manager->stats);
	}
	isc_mutex_destroy(&manager->lock);
	/* Invalidate the magic so stale pointers fail VALID_MANAGER(). */
	manager->magic = 0;
	isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));

	*managerp = NULL;
}
4006 
static isc_result_t
socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    unsigned int flags) {
	/*
	 * Common receive path: try an immediate read; if it cannot
	 * complete now (DOIO_SOFT), queue 'dev' on the socket's recv list
	 * and poke the watcher thread.  Otherwise deliver the completion
	 * event to 'task' (unless ISC_SOCKFLAG_IMMEDIATE, in which case
	 * the caller handles the already-filled 'dev' itself).
	 *
	 * Locking is intentionally conditional: UDP reads are attempted
	 * without the socket lock; stream sockets take it up front so the
	 * recv_list emptiness check and the read are atomic.  Do not
	 * reorder these sections.
	 */
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	if (sock->type == isc_sockettype_udp) {
		io_state = doio_recv(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		/* Preserve FIFO ordering: only read now if nothing queued. */
		if (ISC_LIST_EMPTY(sock->recv_list)) {
			io_state = doio_recv(sock, dev);
		} else {
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't read all or part of the request right now, so
		 * queue it.
		 *
		 * Attach to socket and to task
		 */
		isc_task_attach(task, &ntask);
		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}

		/*
		 * Enqueue the request.  If the socket was previously not being
		 * watched, poke the watcher to start paying attention to it.
		 */
		bool do_poke = ISC_LIST_EMPTY(sock->recv_list);
		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
		if (do_poke) {
			select_poke(sock->manager, sock->threadid, sock->fd,
				    SELECT_POKE_READ);
		}

		socket_log(sock, NULL, EVENT,
			   "socket_recv: event %p -> task %p", dev, ntask);

		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
			result = ISC_R_INPROGRESS;
		}
		break;

	case DOIO_EOF:
		dev->result = ISC_R_EOF;
		FALLTHROUGH;

	case DOIO_HARD:
	case DOIO_SUCCESS:
		/* Completed (or hard-failed) now; deliver unless the
		 * caller asked to consume the result synchronously. */
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			send_recvdone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}
4083 
4084 isc_result_t
isc_socket_recv(isc_socket_t * sock,isc_region_t * region,unsigned int minimum,isc_task_t * task,isc_taskaction_t action,void * arg)4085 isc_socket_recv(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
4086 		isc_task_t *task, isc_taskaction_t action, void *arg) {
4087 	isc_socketevent_t *dev;
4088 	isc_socketmgr_t *manager;
4089 
4090 	REQUIRE(VALID_SOCKET(sock));
4091 	REQUIRE(action != NULL);
4092 
4093 	manager = sock->manager;
4094 	REQUIRE(VALID_MANAGER(manager));
4095 
4096 	INSIST(sock->bound);
4097 
4098 	dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_RECVDONE,
4099 				   action, arg);
4100 	if (dev == NULL) {
4101 		return (ISC_R_NOMEMORY);
4102 	}
4103 
4104 	return (isc_socket_recv2(sock, region, minimum, task, dev, 0));
4105 }
4106 
4107 isc_result_t
isc_socket_recv2(isc_socket_t * sock,isc_region_t * region,unsigned int minimum,isc_task_t * task,isc_socketevent_t * event,unsigned int flags)4108 isc_socket_recv2(isc_socket_t *sock, isc_region_t *region, unsigned int minimum,
4109 		 isc_task_t *task, isc_socketevent_t *event,
4110 		 unsigned int flags) {
4111 	event->ev_sender = sock;
4112 	event->result = ISC_R_UNSET;
4113 	event->region = *region;
4114 	event->n = 0;
4115 	event->offset = 0;
4116 	event->attributes = 0;
4117 
4118 	/*
4119 	 * UDP sockets are always partial read.
4120 	 */
4121 	if (sock->type == isc_sockettype_udp) {
4122 		event->minimum = 1;
4123 	} else {
4124 		if (minimum == 0) {
4125 			event->minimum = region->length;
4126 		} else {
4127 			event->minimum = minimum;
4128 		}
4129 	}
4130 
4131 	return (socket_recv(sock, event, task, flags));
4132 }
4133 
static isc_result_t
socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
	    const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
	    unsigned int flags) {
	/*
	 * Common send path, mirroring socket_recv(): attempt an immediate
	 * write; if it would block (DOIO_SOFT), queue 'dev' on send_list
	 * and poke the watcher — unless ISC_SOCKFLAG_NORETRY asks us to
	 * give up and complete immediately.  'address' (optional for
	 * connected sockets) and 'pktinfo' (optional IPv6 source info)
	 * are captured into the event before any I/O is attempted.
	 */
	int io_state;
	bool have_lock = false;
	isc_task_t *ntask = NULL;
	isc_result_t result = ISC_R_SUCCESS;

	dev->ev_sender = task;

	set_dev_address(address, sock, dev);
	if (pktinfo != NULL) {
		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
		dev->pktinfo = *pktinfo;

		if (!isc_sockaddr_issitelocal(&dev->address) &&
		    !isc_sockaddr_islinklocal(&dev->address))
		{
			socket_log(sock, NULL, TRACE,
				   "pktinfo structure provided, ifindex %u "
				   "(set to 0)",
				   pktinfo->ipi6_ifindex);

			/*
			 * Set the pktinfo index to 0 here, to let the
			 * kernel decide what interface it should send on.
			 */
			dev->pktinfo.ipi6_ifindex = 0;
		}
	}

	/*
	 * As in socket_recv(): UDP writes go straight through; stream
	 * sockets lock so the queue check and the write are atomic.
	 */
	if (sock->type == isc_sockettype_udp) {
		io_state = doio_send(sock, dev);
	} else {
		LOCK(&sock->lock);
		have_lock = true;

		if (ISC_LIST_EMPTY(sock->send_list)) {
			io_state = doio_send(sock, dev);
		} else {
			io_state = DOIO_SOFT;
		}
	}

	switch (io_state) {
	case DOIO_SOFT:
		/*
		 * We couldn't send all or part of the request right now, so
		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
		 */
		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
			isc_task_attach(task, &ntask);
			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;

			if (!have_lock) {
				LOCK(&sock->lock);
				have_lock = true;
			}

			/*
			 * Enqueue the request.  If the socket was previously
			 * not being watched, poke the watcher to start
			 * paying attention to it.
			 */
			bool do_poke = ISC_LIST_EMPTY(sock->send_list);
			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
			if (do_poke) {
				select_poke(sock->manager, sock->threadid,
					    sock->fd, SELECT_POKE_WRITE);
			}
			socket_log(sock, NULL, EVENT,
				   "socket_send: event %p -> task %p", dev,
				   ntask);

			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) {
				result = ISC_R_INPROGRESS;
			}
			break;
		}

		/* NORETRY: treat "would block" as a completed attempt. */
		FALLTHROUGH;

	case DOIO_HARD:
	case DOIO_SUCCESS:
		if (!have_lock) {
			LOCK(&sock->lock);
			have_lock = true;
		}
		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) {
			send_senddone_event(sock, &dev);
		}
		break;
	}

	if (have_lock) {
		UNLOCK(&sock->lock);
	}

	return (result);
}
4235 
4236 isc_result_t
isc_socket_send(isc_socket_t * sock,isc_region_t * region,isc_task_t * task,isc_taskaction_t action,void * arg)4237 isc_socket_send(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
4238 		isc_taskaction_t action, void *arg) {
4239 	/*
4240 	 * REQUIRE() checking is performed in isc_socket_sendto().
4241 	 */
4242 	return (isc_socket_sendto(sock, region, task, action, arg, NULL, NULL));
4243 }
4244 
4245 isc_result_t
isc_socket_sendto(isc_socket_t * sock,isc_region_t * region,isc_task_t * task,isc_taskaction_t action,void * arg,const isc_sockaddr_t * address,struct in6_pktinfo * pktinfo)4246 isc_socket_sendto(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
4247 		  isc_taskaction_t action, void *arg,
4248 		  const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo) {
4249 	isc_socketevent_t *dev;
4250 	isc_socketmgr_t *manager;
4251 
4252 	REQUIRE(VALID_SOCKET(sock));
4253 	REQUIRE(region != NULL);
4254 	REQUIRE(task != NULL);
4255 	REQUIRE(action != NULL);
4256 
4257 	manager = sock->manager;
4258 	REQUIRE(VALID_MANAGER(manager));
4259 
4260 	INSIST(sock->bound);
4261 
4262 	dev = allocate_socketevent(manager->mctx, sock, ISC_SOCKEVENT_SENDDONE,
4263 				   action, arg);
4264 	if (dev == NULL) {
4265 		return (ISC_R_NOMEMORY);
4266 	}
4267 
4268 	dev->region = *region;
4269 
4270 	return (socket_send(sock, dev, task, address, pktinfo, 0));
4271 }
4272 
4273 isc_result_t
isc_socket_sendto2(isc_socket_t * sock,isc_region_t * region,isc_task_t * task,const isc_sockaddr_t * address,struct in6_pktinfo * pktinfo,isc_socketevent_t * event,unsigned int flags)4274 isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region, isc_task_t *task,
4275 		   const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
4276 		   isc_socketevent_t *event, unsigned int flags) {
4277 	REQUIRE(VALID_SOCKET(sock));
4278 	REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE | ISC_SOCKFLAG_NORETRY)) ==
4279 		0);
4280 	if ((flags & ISC_SOCKFLAG_NORETRY) != 0) {
4281 		REQUIRE(sock->type == isc_sockettype_udp);
4282 	}
4283 	event->ev_sender = sock;
4284 	event->result = ISC_R_UNSET;
4285 	event->region = *region;
4286 	event->n = 0;
4287 	event->offset = 0;
4288 	event->attributes &= ~ISC_SOCKEVENTATTR_ATTACHED;
4289 
4290 	return (socket_send(sock, event, task, address, pktinfo, flags));
4291 }
4292 
void
isc_socket_cleanunix(const isc_sockaddr_t *sockaddr, bool active) {
	/*
	 * Best-effort cleanup of a stale AF_UNIX socket path.
	 *
	 * active == true:  we own the path (shutdown); unlink it outright.
	 * active == false: pre-startup probe; only unlink the path if a
	 *                  test connect() shows no live server behind it.
	 * All failures are logged and otherwise ignored.
	 */
#ifdef ISC_PLATFORM_HAVESYSUNH
	int s;
	struct stat sb;
	char strbuf[ISC_STRERRORSIZE];

	if (sockaddr->type.sa.sa_family != AF_UNIX) {
		return;
	}

/*
 * Some platforms lack S_ISSOCK/S_ISFIFO; synthesize them from the
 * mode-mask macros when possible, else fall back to "never matches".
 */
#ifndef S_ISSOCK
#if defined(S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & S_IFMT) == S_IFSOCK)
#elif defined(_S_IFMT) && defined(S_IFSOCK)
#define S_ISSOCK(mode) ((mode & _S_IFMT) == S_IFSOCK)
#endif /* if defined(S_IFMT) && defined(S_IFSOCK) */
#endif /* ifndef S_ISSOCK */

#ifndef S_ISFIFO
#if defined(S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & S_IFMT) == S_IFIFO)
#elif defined(_S_IFMT) && defined(S_IFIFO)
#define S_ISFIFO(mode) ((mode & _S_IFMT) == S_IFIFO)
#endif /* if defined(S_IFMT) && defined(S_IFIFO) */
#endif /* ifndef S_ISFIFO */

#if !defined(S_ISFIFO) && !defined(S_ISSOCK)
/* cppcheck-suppress preprocessorErrorDirective */
#error \
	You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
#endif /* if !defined(S_ISFIFO) && !defined(S_ISSOCK) */

#ifndef S_ISFIFO
#define S_ISFIFO(mode) 0
#endif /* ifndef S_ISFIFO */

#ifndef S_ISSOCK
#define S_ISSOCK(mode) 0
#endif /* ifndef S_ISSOCK */

	if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
		switch (errno) {
		case ENOENT:
			/* Nothing at the path: clean shutdown case. */
			if (active) { /* We exited cleanly last time */
				break;
			}
			FALLTHROUGH;
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET,
				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
				      "isc_socket_cleanunix: stat(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			return;
		}
	} else {
		/* Refuse to touch anything that isn't a socket/FIFO. */
		if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET,
				      active ? ISC_LOG_ERROR : ISC_LOG_WARNING,
				      "isc_socket_cleanunix: %s: not a socket",
				      sockaddr->type.sunix.sun_path);
			return;
		}
	}

	if (active) {
		if (unlink(sockaddr->type.sunix.sun_path) < 0) {
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				      "isc_socket_cleanunix: unlink(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
		}
		return;
	}

	/*
	 * Inactive case: probe with a throwaway connect().  A refused or
	 * reset connection means the old server is gone and the path is
	 * stale, so it is safe to unlink.
	 */
	s = socket(AF_UNIX, SOCK_STREAM, 0);
	if (s < 0) {
		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
			      "isc_socket_cleanunix: socket(%s): %s",
			      sockaddr->type.sunix.sun_path, strbuf);
		return;
	}

	if (connect(s, (const struct sockaddr *)&sockaddr->type.sunix,
		    sizeof(sockaddr->type.sunix)) < 0)
	{
		switch (errno) {
		case ECONNREFUSED:
		case ECONNRESET:
			if (unlink(sockaddr->type.sunix.sun_path) < 0) {
				strerror_r(errno, strbuf, sizeof(strbuf));
				isc_log_write(
					isc_lctx, ISC_LOGCATEGORY_GENERAL,
					ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
					"isc_socket_cleanunix: "
					"unlink(%s): %s",
					sockaddr->type.sunix.sun_path, strbuf);
			}
			break;
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
				      "isc_socket_cleanunix: connect(%s): %s",
				      sockaddr->type.sunix.sun_path, strbuf);
			break;
		}
	}
	close(s);
#else  /* ifdef ISC_PLATFORM_HAVESYSUNH */
	UNUSED(sockaddr);
	UNUSED(active);
#endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
}
4413 
4414 isc_result_t
isc_socket_permunix(const isc_sockaddr_t * sockaddr,uint32_t perm,uint32_t owner,uint32_t group)4415 isc_socket_permunix(const isc_sockaddr_t *sockaddr, uint32_t perm,
4416 		    uint32_t owner, uint32_t group) {
4417 #ifdef ISC_PLATFORM_HAVESYSUNH
4418 	isc_result_t result = ISC_R_SUCCESS;
4419 	char strbuf[ISC_STRERRORSIZE];
4420 	char path[sizeof(sockaddr->type.sunix.sun_path)];
4421 #ifdef NEED_SECURE_DIRECTORY
4422 	char *slash;
4423 #endif /* ifdef NEED_SECURE_DIRECTORY */
4424 
4425 	REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
4426 	INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
4427 	strlcpy(path, sockaddr->type.sunix.sun_path, sizeof(path));
4428 
4429 #ifdef NEED_SECURE_DIRECTORY
4430 	slash = strrchr(path, '/');
4431 	if (slash != NULL) {
4432 		if (slash != path) {
4433 			*slash = '\0';
4434 		} else {
4435 			strlcpy(path, "/", sizeof(path));
4436 		}
4437 	} else {
4438 		strlcpy(path, ".", sizeof(path));
4439 	}
4440 #endif /* ifdef NEED_SECURE_DIRECTORY */
4441 
4442 	if (chmod(path, perm) < 0) {
4443 		strerror_r(errno, strbuf, sizeof(strbuf));
4444 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4445 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4446 			      "isc_socket_permunix: chmod(%s, %d): %s", path,
4447 			      perm, strbuf);
4448 		result = ISC_R_FAILURE;
4449 	}
4450 	if (chown(path, owner, group) < 0) {
4451 		strerror_r(errno, strbuf, sizeof(strbuf));
4452 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
4453 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
4454 			      "isc_socket_permunix: chown(%s, %d, %d): %s",
4455 			      path, owner, group, strbuf);
4456 		result = ISC_R_FAILURE;
4457 	}
4458 	return (result);
4459 #else  /* ifdef ISC_PLATFORM_HAVESYSUNH */
4460 	UNUSED(sockaddr);
4461 	UNUSED(perm);
4462 	UNUSED(owner);
4463 	UNUSED(group);
4464 	return (ISC_R_NOTIMPLEMENTED);
4465 #endif /* ifdef ISC_PLATFORM_HAVESYSUNH */
4466 }
4467 
isc_result_t
isc_socket_bind(isc_socket_t *sock, const isc_sockaddr_t *sockaddr,
		isc_socket_options_t options) {
	/*
	 * Bind 'sock' to 'sockaddr'.  The socket must not already be
	 * bound or duplicated, and the address family must match the
	 * socket's.  ISC_SOCKET_REUSEADDRESS in 'options' requests
	 * SO_REUSEADDR (and platform load-balancing reuse-port variants)
	 * when a specific, non-zero port is asked for.  bind(2) errno
	 * values are translated to ISC_R_* results.
	 */
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	REQUIRE(VALID_SOCKET(sock));

	LOCK(&sock->lock);

	INSIST(!sock->bound);
	INSIST(!sock->dupped);

	if (sock->pf != sockaddr->type.sa.sa_family) {
		UNLOCK(&sock->lock);
		return (ISC_R_FAMILYMISMATCH);
	}

	/*
	 * Only set SO_REUSEADDR when we want a specific port.
	 */
#ifdef AF_UNIX
	/* AF_UNIX paths have no ports; skip the socket-option dance. */
	if (sock->pf == AF_UNIX) {
		goto bind_socket;
	}
#endif /* ifdef AF_UNIX */
	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
	    isc_sockaddr_getport(sockaddr) != (in_port_t)0)
	{
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
			       sizeof(on)) < 0)
		{
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
		/* Load-balanced reuse-port where the kernel offers it;
		 * setsockopt failures here are logged but non-fatal. */
#if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT_LB,
			       (void *)&on, sizeof(on)) < 0)
		{
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#elif defined(__linux__) && defined(SO_REUSEPORT)
		if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, (void *)&on,
			       sizeof(on)) < 0)
		{
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d) failed", sock->fd);
		}
#endif		/* if defined(__FreeBSD_kernel__) && defined(SO_REUSEPORT_LB) */
		/* Press on... */
	}
#ifdef AF_UNIX
bind_socket:
#endif /* ifdef AF_UNIX */
	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_BINDFAIL]);

		UNLOCK(&sock->lock);
		/* Map common bind() errors onto ISC result codes. */
		switch (errno) {
		case EACCES:
			return (ISC_R_NOPERM);
		case EADDRNOTAVAIL:
			return (ISC_R_ADDRNOTAVAIL);
		case EADDRINUSE:
			return (ISC_R_ADDRINUSE);
		case EINVAL:
			return (ISC_R_BOUND);
		default:
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
					 strbuf);
			return (ISC_R_UNEXPECTED);
		}
	}

	socket_log(sock, sockaddr, TRACE, "bound");
	sock->bound = 1;

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4551 
4552 /*
4553  * Enable this only for specific OS versions, and only when they have repaired
 * their problems with it.  Until then, this is broken and needs to be
4555  * disabled by default.  See RT22589 for details.
4556  */
4557 #undef ENABLE_ACCEPTFILTER
4558 
isc_result_t
isc_socket_filter(isc_socket_t *sock, const char *filter) {
	/*
	 * Install a BSD accept filter named 'filter' on a listening
	 * socket via SO_ACCEPTFILTER.  Compiled out unless both
	 * SO_ACCEPTFILTER and ENABLE_ACCEPTFILTER are defined (the
	 * latter is force-undefined above — see RT22589), in which
	 * case this returns ISC_R_NOTIMPLEMENTED.
	 */
#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	char strbuf[ISC_STRERRORSIZE];
	struct accept_filter_arg afa;
#else  /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	UNUSED(sock);
	UNUSED(filter);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */

	REQUIRE(VALID_SOCKET(sock));

#if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
	bzero(&afa, sizeof(afa));
	strlcpy(afa.af_name, filter, sizeof(afa.af_name));
	if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER, &afa,
		       sizeof(afa)) == -1)
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		socket_log(sock, NULL, CREATION,
			   "setsockopt(SO_ACCEPTFILTER): %s", strbuf);
		return (ISC_R_FAILURE);
	}
	return (ISC_R_SUCCESS);
#else  /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
	return (ISC_R_NOTIMPLEMENTED);
#endif /* if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER) */
}
4587 
4588 /*
4589  * Try enabling TCP Fast Open for a given socket if the OS supports it.
4590  */
static void
set_tcp_fastopen(isc_socket_t *sock, unsigned int backlog) {
	/*
	 * Best-effort enabling of TCP Fast Open on a listening socket.
	 * No-op unless built with ENABLE_TCP_FASTOPEN on a platform
	 * defining TCP_FASTOPEN; setsockopt failures are logged but
	 * never fatal.
	 */
#if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN)
	char strbuf[ISC_STRERRORSIZE];

/*
 * FreeBSD, as of versions 10.3 and 11.0, defines TCP_FASTOPEN while also
 * shipping a default kernel without TFO support, so we special-case it by
 * performing an additional runtime check for TFO support using sysctl to
 * prevent setsockopt() errors from being logged.
 */
#if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME)
#define SYSCTL_TFO "net.inet.tcp.fastopen.enabled"
	unsigned int enabled;
	size_t enabledlen = sizeof(enabled);
	static bool tfo_notice_logged = false;

	if (sysctlbyname(SYSCTL_TFO, &enabled, &enabledlen, NULL, 0) < 0) {
		/*
		 * This kernel does not support TCP Fast Open.  There is
		 * nothing more we can do.
		 */
		return;
	} else if (enabled == 0) {
		/*
		 * This kernel does support TCP Fast Open, but it is disabled
		 * by sysctl.  Notify the user, but do not nag.
		 */
		if (!tfo_notice_logged) {
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
				      ISC_LOGMODULE_SOCKET, ISC_LOG_NOTICE,
				      "TCP_FASTOPEN support is disabled by "
				      "sysctl (" SYSCTL_TFO " = 0)");
			tfo_notice_logged = true;
		}
		return;
	}
#endif /* if defined(__FreeBSD__) && defined(HAVE_SYSCTLBYNAME) */

	/*
	 * The TCP_FASTOPEN option value is the pending-TFO queue length.
	 * On macOS it is treated as a boolean (use 1); elsewhere half the
	 * listen backlog is used, floored at 1.
	 */
#ifdef __APPLE__
	backlog = 1;
#else  /* ifdef __APPLE__ */
	backlog = backlog / 2;
	if (backlog == 0) {
		backlog = 1;
	}
#endif /* ifdef __APPLE__ */
	if (setsockopt(sock->fd, IPPROTO_TCP, TCP_FASTOPEN, (void *)&backlog,
		       sizeof(backlog)) < 0)
	{
		strerror_r(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, TCP_FASTOPEN) failed with %s",
				 sock->fd, strbuf);
		/* TCP_FASTOPEN is experimental so ignore failures */
	}
#else  /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
	UNUSED(sock);
	UNUSED(backlog);
#endif /* if defined(ENABLE_TCP_FASTOPEN) && defined(TCP_FASTOPEN) */
}
4652 
4653 /*
4654  * Set up to listen on a given socket.  We do this by creating an internal
4655  * event that will be dispatched when the socket has read activity.  The
4656  * watcher will send the internal event to the task when there is a new
4657  * connection.
4658  *
4659  * Unlike in read, we don't preallocate a done event here.  Every time there
4660  * is a new connection we'll have to allocate a new one anyway, so we might
4661  * as well keep things simple rather than having to track them.
4662  */
4663 isc_result_t
isc_socket_listen(isc_socket_t * sock,unsigned int backlog)4664 isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
4665 	char strbuf[ISC_STRERRORSIZE];
4666 
4667 	REQUIRE(VALID_SOCKET(sock));
4668 
4669 	LOCK(&sock->lock);
4670 
4671 	REQUIRE(!sock->listener);
4672 	REQUIRE(sock->bound);
4673 	REQUIRE(sock->type == isc_sockettype_tcp ||
4674 		sock->type == isc_sockettype_unix);
4675 
4676 	if (backlog == 0) {
4677 		backlog = SOMAXCONN;
4678 	}
4679 
4680 	if (listen(sock->fd, (int)backlog) < 0) {
4681 		UNLOCK(&sock->lock);
4682 		strerror_r(errno, strbuf, sizeof(strbuf));
4683 
4684 		UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
4685 
4686 		return (ISC_R_UNEXPECTED);
4687 	}
4688 
4689 	set_tcp_fastopen(sock, backlog);
4690 
4691 	sock->listener = 1;
4692 
4693 	UNLOCK(&sock->lock);
4694 	return (ISC_R_SUCCESS);
4695 }
4696 
4697 /*
4698  * This should try to do aggressive accept() XXXMLG
4699  */
isc_result_t
isc_socket_accept(isc_socket_t *sock, isc_task_t *task, isc_taskaction_t action,
		  void *arg) {
	/*
	 * Queue an accept on a listening socket.  Pre-allocates both the
	 * NEWCONN event and the socket object the new connection will
	 * use, enqueues the event on accept_list, and pokes the watcher
	 * if the list was previously empty.  The completion event is
	 * delivered to 'task' with action/arg.
	 */
	isc_socket_newconnev_t *dev;
	isc_socketmgr_t *manager;
	isc_task_t *ntask = NULL;
	isc_socket_t *nsock;
	isc_result_t result;
	bool do_poke = false;

	REQUIRE(VALID_SOCKET(sock));
	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));

	LOCK(&sock->lock);

	REQUIRE(sock->listener);

	/*
	 * Sender field is overloaded here with the task we will be sending
	 * this event to.  Just before the actual event is delivered the
	 * actual ev_sender will be touched up to be the socket.
	 */
	dev = (isc_socket_newconnev_t *)isc_event_allocate(
		manager->mctx, task, ISC_SOCKEVENT_NEWCONN, action, arg,
		sizeof(*dev));
	ISC_LINK_INIT(dev, ev_link);

	/* Pre-allocate the socket for the incoming connection now, so
	 * the accept path never has to handle allocation failure. */
	result = allocate_socket(manager, sock->type, &nsock);
	if (result != ISC_R_SUCCESS) {
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (result);
	}

	/*
	 * Attach to socket and to task.
	 */
	isc_task_attach(task, &ntask);
	if (isc_task_exiting(ntask)) {
		/* Task is shutting down: unwind everything acquired so
		 * far (socket, task ref, event) in reverse order. */
		free_socket(&nsock);
		isc_task_detach(&ntask);
		isc_event_free(ISC_EVENT_PTR(&dev));
		UNLOCK(&sock->lock);
		return (ISC_R_SHUTTINGDOWN);
	}
	isc_refcount_increment0(&nsock->references);
	nsock->statsindex = sock->statsindex;

	dev->ev_sender = ntask;
	dev->newsocket = nsock;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	do_poke = ISC_LIST_EMPTY(sock->accept_list);
	ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
	if (do_poke) {
		select_poke(manager, sock->threadid, sock->fd,
			    SELECT_POKE_ACCEPT);
	}
	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4766 
isc_result_t
isc_socket_connect(isc_socket_t *sock, const isc_sockaddr_t *addr,
		   isc_task_t *task, isc_taskaction_t action, void *arg) {
	/*
	 * Connect 'sock' to 'addr', delivering an ISC_SOCKEVENT_CONNECT
	 * event to 'task' when the attempt resolves.  Three outcomes:
	 *   - connect(2) completes immediately -> event sent now;
	 *   - it is in progress (soft error)   -> event queued on
	 *     connect_list and the watcher is poked;
	 *   - it fails hard -> mapped ISC_R_* result delivered via the
	 *     event (err_exit) or returned directly (unexpected errno).
	 * Repeat calls while already connecting/connected must use the
	 * same peer address (enforced with INSIST).
	 */
	isc_socket_connev_t *dev;
	isc_task_t *ntask = NULL;
	isc_socketmgr_t *manager;
	int cc;
	char strbuf[ISC_STRERRORSIZE];
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];

	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(addr != NULL);
	REQUIRE(task != NULL);
	REQUIRE(action != NULL);

	manager = sock->manager;
	REQUIRE(VALID_MANAGER(manager));
	/* NOTE(review): duplicate of the REQUIRE(addr != NULL) above;
	 * harmless, could be removed. */
	REQUIRE(addr != NULL);

	if (isc_sockaddr_ismulticast(addr)) {
		return (ISC_R_MULTICAST);
	}

	LOCK(&sock->lock);

	dev = (isc_socket_connev_t *)isc_event_allocate(
		manager->mctx, sock, ISC_SOCKEVENT_CONNECT, action, arg,
		sizeof(*dev));
	ISC_LINK_INIT(dev, ev_link);

	/* A connect is already pending: just queue another waiter. */
	if (sock->connecting) {
		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
		goto queue;
	}

	/* Already connected to this peer: complete immediately. */
	if (sock->connected) {
		INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
		dev->result = ISC_R_SUCCESS;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);

		return (ISC_R_SUCCESS);
	}

	/*
	 * Try to do the connect right away, as there can be only one
	 * outstanding, and it might happen to complete.
	 */
	sock->peer_address = *addr;
	cc = connect(sock->fd, &addr->type.sa, addr->length);
	if (cc < 0) {
		/*
		 * The socket is nonblocking and the connection cannot be
		 * completed immediately.  It is possible to select(2) or
		 * poll(2) for completion by selecting the socket for writing.
		 * After select(2) indicates writability, use getsockopt(2) to
		 * read the SO_ERROR option at level SOL_SOCKET to determine
		 * whether connect() completed successfully (SO_ERROR is zero)
		 * or unsuccessfully (SO_ERROR is one of the usual error codes
		 * listed here, explaining the reason for the failure).
		 */
		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
			cc = 0;
			goto success;
		}
		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
			goto queue;
		}

		/* Map well-known connect() errnos to ISC results and
		 * deliver them through the event (err_exit). */
		switch (errno) {
#define ERROR_MATCH(a, b)        \
	case a:                  \
		dev->result = b; \
		goto err_exit;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif /* ifdef EHOSTDOWN */
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		}

		/* Unrecognized errno: log it and fail the call itself. */
		sock->connected = 0;

		strerror_r(errno, strbuf, sizeof(strbuf));
		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
				 addrbuf, errno, strbuf);

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		isc_event_free(ISC_EVENT_PTR(&dev));
		return (ISC_R_UNEXPECTED);

	err_exit:
		/* Known failure: report via the event; call succeeds. */
		sock->connected = 0;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);
		return (ISC_R_SUCCESS);
	}

	/*
	 * If connect completed, fire off the done event.
	 */
success:
	if (cc == 0) {
		sock->connected = 1;
		sock->bound = 1;
		dev->result = ISC_R_SUCCESS;
		isc_task_sendto(task, ISC_EVENT_PTR(&dev), sock->threadid);

		UNLOCK(&sock->lock);

		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECT]);

		return (ISC_R_SUCCESS);
	}

queue:

	/*
	 * Attach to task.
	 */
	isc_task_attach(task, &ntask);

	dev->ev_sender = ntask;

	/*
	 * Poke watcher here.  We still have the socket locked, so there
	 * is no race condition.  We will keep the lock for such a short
	 * bit of time waking it up now or later won't matter all that much.
	 */
	bool do_poke = ISC_LIST_EMPTY(sock->connect_list);
	ISC_LIST_ENQUEUE(sock->connect_list, dev, ev_link);
	if (do_poke && !sock->connecting) {
		sock->connecting = 1;
		select_poke(manager, sock->threadid, sock->fd,
			    SELECT_POKE_CONNECT);
	}

	UNLOCK(&sock->lock);
	return (ISC_R_SUCCESS);
}
4925 
/*
 * Called when a socket with a pending connect() finishes.
 *
 * Invoked from the watcher thread once select/poll/epoll reports the fd
 * writable.  The definitive connect() outcome is fetched with
 * getsockopt(SO_ERROR); every event queued on sock->connect_list gets
 * that same result.
 *
 * NOTE(review): no LOCK/UNLOCK appears in this function, so the caller
 * presumably holds sock->lock (or equivalent serialization) — confirm
 * at the dispatch site.
 */
static void
internal_connect(isc_socket_t *sock) {
	isc_socket_connev_t *dev;
	int cc;
	isc_result_t result;
	socklen_t optlen;
	char strbuf[ISC_STRERRORSIZE];
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];

	INSIST(VALID_SOCKET(sock));
	REQUIRE(sock->fd >= 0);

	/*
	 * Get the first item off the connect list.
	 * If it is empty, stop watching the fd and return (see 'finish').
	 */
	dev = ISC_LIST_HEAD(sock->connect_list);
	if (dev == NULL) {
		INSIST(!sock->connecting);
		goto finish;
	}

	INSIST(sock->connecting);
	sock->connecting = 0;

	/*
	 * Get any possible error status here.  On getsockopt() failure,
	 * fall back to its own errno; otherwise propagate the fetched
	 * SO_ERROR value into errno so the code below has one place to
	 * look.
	 */
	optlen = sizeof(cc);
	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void *)&cc,
		       (void *)&optlen) != 0)
	{
		cc = errno;
	} else {
		errno = cc;
	}

	if (errno != 0) {
		/*
		 * If the error is EAGAIN, just re-select on this
		 * fd and pretend nothing strange happened.
		 * Note the early return: the fd stays watched (the
		 * unwatch at 'finish' is deliberately skipped) and the
		 * connecting flag is restored.
		 */
		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
			sock->connecting = 1;
			return;
		}

		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECTFAIL]);

		/*
		 * Translate other errors into ISC_R_* flavors.
		 */
		switch (errno) {
#define ERROR_MATCH(a, b)   \
	case a:             \
		result = b; \
		break;
			ERROR_MATCH(EACCES, ISC_R_NOPERM);
			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#ifdef EHOSTDOWN
			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
#endif /* ifdef EHOSTDOWN */
			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
#undef ERROR_MATCH
		default:
			result = ISC_R_UNEXPECTED;
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_connect: connect(%s) %s",
					 peerbuf, strbuf);
		}
	} else {
		inc_stats(sock->manager->stats,
			  sock->statsindex[STATID_CONNECT]);
		result = ISC_R_SUCCESS;
		sock->connected = 1;
		sock->bound = 1;
	}

	/*
	 * Deliver the same result to every queued connect event;
	 * send_connectdone_event() unlinks each event, so re-reading the
	 * list head drains it.
	 */
	do {
		dev->result = result;
		send_connectdone_event(sock, &dev);
		dev = ISC_LIST_HEAD(sock->connect_list);
	} while (dev != NULL);

finish:
	unwatch_fd(&sock->manager->threads[sock->threadid], sock->fd,
		   SELECT_POKE_CONNECT);
}
5029 
5030 isc_result_t
isc_socket_getpeername(isc_socket_t * sock,isc_sockaddr_t * addressp)5031 isc_socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5032 	isc_result_t result;
5033 
5034 	REQUIRE(VALID_SOCKET(sock));
5035 	REQUIRE(addressp != NULL);
5036 
5037 	LOCK(&sock->lock);
5038 
5039 	if (sock->connected) {
5040 		*addressp = sock->peer_address;
5041 		result = ISC_R_SUCCESS;
5042 	} else {
5043 		result = ISC_R_NOTCONNECTED;
5044 	}
5045 
5046 	UNLOCK(&sock->lock);
5047 
5048 	return (result);
5049 }
5050 
5051 isc_result_t
isc_socket_getsockname(isc_socket_t * sock,isc_sockaddr_t * addressp)5052 isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) {
5053 	socklen_t len;
5054 	isc_result_t result;
5055 	char strbuf[ISC_STRERRORSIZE];
5056 
5057 	REQUIRE(VALID_SOCKET(sock));
5058 	REQUIRE(addressp != NULL);
5059 
5060 	LOCK(&sock->lock);
5061 
5062 	if (!sock->bound) {
5063 		result = ISC_R_NOTBOUND;
5064 		goto out;
5065 	}
5066 
5067 	result = ISC_R_SUCCESS;
5068 
5069 	len = sizeof(addressp->type);
5070 	if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
5071 		strerror_r(errno, strbuf, sizeof(strbuf));
5072 		UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s", strbuf);
5073 		result = ISC_R_UNEXPECTED;
5074 		goto out;
5075 	}
5076 	addressp->length = (unsigned int)len;
5077 
5078 out:
5079 	UNLOCK(&sock->lock);
5080 
5081 	return (result);
5082 }
5083 
/*
 * Run through the list of events on this socket, and cancel the ones
 * queued for task "task" of type "how".  "how" is a bitmask of
 * ISC_SOCKCANCEL_* flags; a NULL "task" cancels matching events for
 * every task.
 */
void
isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) {
	REQUIRE(VALID_SOCKET(sock));

	/*
	 * Quick exit if there is nothing to do.  Don't even bother locking
	 * in this case.
	 */
	if (how == 0) {
		return;
	}

	LOCK(&sock->lock);

	/*
	 * All of these do the same thing, more or less.
	 * Each will:
	 *	o If the internal event is marked as "posted" try to
	 *	  remove it from the task's queue.  If this fails, mark it
	 *	  as canceled instead, and let the task clean it up later.
	 *	o For each I/O request for that task of that type, post
	 *	  its done event with status of "ISC_R_CANCELED".
	 *	o Reset any state needed.
	 */
	if (((how & ISC_SOCKCANCEL_RECV) != 0) &&
	    !ISC_LIST_EMPTY(sock->recv_list))
	{
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->recv_list);

		/*
		 * 'next' is captured before delivery because
		 * send_recvdone_event() unlinks 'dev' from the list.
		 */
		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_recvdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_SEND) != 0) &&
	    !ISC_LIST_EMPTY(sock->send_list))
	{
		isc_socketevent_t *dev;
		isc_socketevent_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->send_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_senddone_event(sock, &dev);
			}
			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_ACCEPT) != 0) &&
	    !ISC_LIST_EMPTY(sock->accept_list))
	{
		isc_socket_newconnev_t *dev;
		isc_socket_newconnev_t *next;
		isc_task_t *current_task;

		dev = ISC_LIST_HEAD(sock->accept_list);
		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				ISC_LIST_UNLINK(sock->accept_list, dev,
						ev_link);

				/*
				 * The pending child socket was never
				 * handed to the caller; drop its
				 * (expected-final) reference and free it
				 * before posting the canceled event.
				 */
				isc_refcount_decrementz(
					&NEWCONNSOCK(dev)->references);
				free_socket((isc_socket_t **)&dev->newsocket);

				dev->result = ISC_R_CANCELED;
				dev->ev_sender = sock;
				isc_task_sendtoanddetach(&current_task,
							 ISC_EVENT_PTR(&dev),
							 sock->threadid);
			}

			dev = next;
		}
	}

	if (((how & ISC_SOCKCANCEL_CONNECT) != 0) &&
	    !ISC_LIST_EMPTY(sock->connect_list))
	{
		isc_socket_connev_t *dev;
		isc_socket_connev_t *next;
		isc_task_t *current_task;

		/* A non-empty connect list implies a connect in flight. */
		INSIST(sock->connecting);
		sock->connecting = 0;

		dev = ISC_LIST_HEAD(sock->connect_list);

		while (dev != NULL) {
			current_task = dev->ev_sender;
			next = ISC_LIST_NEXT(dev, ev_link);

			if ((task == NULL) || (task == current_task)) {
				dev->result = ISC_R_CANCELED;
				send_connectdone_event(sock, &dev);
			}
			dev = next;
		}
	}

	UNLOCK(&sock->lock);
}
5211 
5212 isc_sockettype_t
isc_socket_gettype(isc_socket_t * sock)5213 isc_socket_gettype(isc_socket_t *sock) {
5214 	REQUIRE(VALID_SOCKET(sock));
5215 
5216 	return (sock->type);
5217 }
5218 
void
isc_socket_ipv6only(isc_socket_t *sock, bool yes) {
	/*
	 * Enable or disable the IPV6_V6ONLY option on an AF_INET6 socket.
	 * On platforms without IPV6_V6ONLY (and on OpenBSD, where the
	 * option cannot be changed), this is a no-op.  A setsockopt()
	 * failure is logged but not returned to the caller.
	 */
#if defined(IPV6_V6ONLY) && !defined(__OpenBSD__)
	int onoff = yes ? 1 : 0;
#else  /* if defined(IPV6_V6ONLY) */
	UNUSED(yes);
	UNUSED(sock);
#endif /* if defined(IPV6_V6ONLY) */

	REQUIRE(VALID_SOCKET(sock));
	/* Changing this option on a dup()'d socket is not supported. */
	INSIST(!sock->dupped);

#if defined(IPV6_V6ONLY) && !defined(__OpenBSD__)
	if (sock->pf == AF_INET6) {
		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
			       (void *)&onoff, sizeof(int)) < 0)
		{
			char strbuf[ISC_STRERRORSIZE];
			strerror_r(errno, strbuf, sizeof(strbuf));
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "setsockopt(%d, IPV6_V6ONLY) failed: "
					 "%s",
					 sock->fd, strbuf);
		}
	}
#endif /* ifdef IPV6_V6ONLY */
}
5246 
5247 static void
setdscp(isc_socket_t * sock,isc_dscp_t dscp)5248 setdscp(isc_socket_t *sock, isc_dscp_t dscp) {
5249 #if defined(IP_TOS) || defined(IPV6_TCLASS)
5250 	int value = dscp << 2;
5251 #endif /* if defined(IP_TOS) || defined(IPV6_TCLASS) */
5252 
5253 	sock->dscp = dscp;
5254 
5255 #ifdef IP_TOS
5256 	if (sock->pf == AF_INET) {
5257 		if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS, (void *)&value,
5258 			       sizeof(value)) < 0)
5259 		{
5260 			char strbuf[ISC_STRERRORSIZE];
5261 			strerror_r(errno, strbuf, sizeof(strbuf));
5262 			UNEXPECTED_ERROR(__FILE__, __LINE__,
5263 					 "setsockopt(%d, IP_TOS, %.02x) "
5264 					 "failed: %s",
5265 					 sock->fd, value >> 2, strbuf);
5266 		}
5267 	}
5268 #endif /* ifdef IP_TOS */
5269 #ifdef IPV6_TCLASS
5270 	if (sock->pf == AF_INET6) {
5271 		if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
5272 			       (void *)&value, sizeof(value)) < 0)
5273 		{
5274 			char strbuf[ISC_STRERRORSIZE];
5275 			strerror_r(errno, strbuf, sizeof(strbuf));
5276 			UNEXPECTED_ERROR(__FILE__, __LINE__,
5277 					 "setsockopt(%d, IPV6_TCLASS, %.02x) "
5278 					 "failed: %s",
5279 					 sock->fd, dscp >> 2, strbuf);
5280 		}
5281 	}
5282 #endif /* ifdef IPV6_TCLASS */
5283 }
5284 
void
isc_socket_dscp(isc_socket_t *sock, isc_dscp_t dscp) {
	/*
	 * Public entry point for setting the DSCP on 'sock'.  A negative
	 * 'dscp' means "unset" and is ignored; valid values are 0..63.
	 * On platforms with neither IP_TOS nor IPV6_TCLASS the value is
	 * still recorded via setdscp() but has no wire effect.
	 */
	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(dscp < 0x40); /* DSCP is a 6-bit field */

#if !defined(IP_TOS) && !defined(IPV6_TCLASS)
	UNUSED(dscp);
#else  /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */
	if (dscp < 0) {
		return;
	}

	/* The DSCP value must not be changed once it has been set. */
	if (isc_dscp_check_value != -1) {
		INSIST(dscp == isc_dscp_check_value);
	}
#endif /* if !defined(IP_TOS) && !defined(IPV6_TCLASS) */

#ifdef notyet
	REQUIRE(!sock->dupped);
#endif /* ifdef notyet */

	setdscp(sock, dscp);
}
5309 
5310 isc_socketevent_t *
isc_socket_socketevent(isc_mem_t * mctx,void * sender,isc_eventtype_t eventtype,isc_taskaction_t action,void * arg)5311 isc_socket_socketevent(isc_mem_t *mctx, void *sender, isc_eventtype_t eventtype,
5312 		       isc_taskaction_t action, void *arg) {
5313 	return (allocate_socketevent(mctx, sender, eventtype, action, arg));
5314 }
5315 
5316 void
isc_socket_setname(isc_socket_t * sock,const char * name,void * tag)5317 isc_socket_setname(isc_socket_t *sock, const char *name, void *tag) {
5318 	/*
5319 	 * Name 'sock'.
5320 	 */
5321 
5322 	REQUIRE(VALID_SOCKET(sock));
5323 
5324 	LOCK(&sock->lock);
5325 	strlcpy(sock->name, name, sizeof(sock->name));
5326 	sock->tag = tag;
5327 	UNLOCK(&sock->lock);
5328 }
5329 
5330 const char *
isc_socket_getname(isc_socket_t * sock)5331 isc_socket_getname(isc_socket_t *sock) {
5332 	return (sock->name);
5333 }
5334 
5335 void *
isc_socket_gettag(isc_socket_t * sock)5336 isc_socket_gettag(isc_socket_t *sock) {
5337 	return (sock->tag);
5338 }
5339 
5340 int
isc_socket_getfd(isc_socket_t * sock)5341 isc_socket_getfd(isc_socket_t *sock) {
5342 	return ((short)sock->fd);
5343 }
5344 
5345 static isc_once_t hasreuseport_once = ISC_ONCE_INIT;
5346 static bool hasreuseport = false;
5347 
static void
init_hasreuseport(void) {
/*
 * SO_REUSEPORT works very differently on *BSD and on Linux (because why not).
 * We only want to use it on Linux, if it's available. On BSD we want to dup()
 * sockets instead of re-binding them.
 *
 * Probe with a throwaway UDP socket: try IPv4 first, fall back to IPv6,
 * then attempt SO_REUSEADDR followed by SO_REUSEPORT (or FreeBSD's
 * load-balancing variant SO_REUSEPORT_LB).  'hasreuseport' becomes true
 * only if both setsockopt() calls succeed.  Runs once via isc_once_do().
 */
#if (defined(SO_REUSEPORT) && defined(__linux__)) || \
	(defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__))
	int sock, yes = 1;
	sock = socket(AF_INET, SOCK_DGRAM, 0);
	if (sock < 0) {
		sock = socket(AF_INET6, SOCK_DGRAM, 0);
		if (sock < 0) {
			/* No probe socket available: leave the default. */
			return;
		}
	}
	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void *)&yes,
		       sizeof(yes)) < 0)
	{
		close(sock);
		return;
#if defined(__FreeBSD_kernel__)
	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT_LB, (void *)&yes,
			      sizeof(yes)) < 0)
#else  /* if defined(__FreeBSD_kernel__) */
	} else if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, (void *)&yes,
			      sizeof(yes)) < 0)
#endif /* if defined(__FreeBSD_kernel__) */
	{
		close(sock);
		return;
	}
	hasreuseport = true;
	close(sock);
#endif /* if (defined(SO_REUSEPORT) && defined(__linux__)) || \
	* (defined(SO_REUSEPORT_LB) && defined(__FreeBSD_kernel__)) */
}
5386 
5387 bool
isc_socket_hasreuseport()5388 isc_socket_hasreuseport() {
5389 	RUNTIME_CHECK(isc_once_do(&hasreuseport_once, init_hasreuseport) ==
5390 		      ISC_R_SUCCESS);
5391 	return (hasreuseport);
5392 }
5393 
#if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C)
/*
 * Map a socket type to the label used in statistics rendering;
 * unrecognized values yield "not-initialized".
 */
static const char *
_socktype(isc_sockettype_t type) {
	if (type == isc_sockettype_udp) {
		return ("udp");
	} else if (type == isc_sockettype_tcp) {
		return ("tcp");
	} else if (type == isc_sockettype_unix) {
		return ("unix");
	} else if (type == isc_sockettype_fdwatch) {
		return ("fdwatch");
	} else {
		return ("not-initialized");
	}
}
#endif /* if defined(HAVE_LIBXML2) || defined(HAVE_JSON_C) */
5411 
#ifdef HAVE_LIBXML2
/*
 * Record the libxml2 return code in 'xmlrc' and bail out to the
 * enclosing function's 'error' label on failure (writer calls return
 * a negative value on error).
 */
#define TRY0(a)                     \
	do {                        \
		xmlrc = (a);        \
		if (xmlrc < 0)      \
			goto error; \
	} while (0)
/*
 * Render every socket owned by 'mgr' as XML into 'writer0' (an
 * xmlTextWriterPtr).  Returns the last libxml2 return code; negative on
 * failure.  mgr->lock is held for the whole walk and each socket's lock
 * while that socket is rendered; on the error path 'sock' being
 * non-NULL indicates its lock is still held and must be released.
 */
int
isc_socketmgr_renderxml(isc_socketmgr_t *mgr, void *writer0) {
	isc_socket_t *sock = NULL;
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	isc_sockaddr_t addr;
	socklen_t len;
	int xmlrc;
	xmlTextWriterPtr writer = (xmlTextWriterPtr)writer0;

	LOCK(&mgr->lock);

	TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"));
	sock = ISC_LIST_HEAD(mgr->socklist);
	while (sock != NULL) {
		LOCK(&sock->lock);
		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"));

		/* Use the pointer value as a unique id. */
		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"));
		TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock));
		TRY0(xmlTextWriterEndElement(writer));

		if (sock->name[0] != 0) {
			TRY0(xmlTextWriterStartElement(writer,
						       ISC_XMLCHAR "name"));
			TRY0(xmlTextWriterWriteFormatString(writer, "%s",
							    sock->name));
			TRY0(xmlTextWriterEndElement(writer)); /* name */
		}

		TRY0(xmlTextWriterStartElement(writer,
					       ISC_XMLCHAR "references"));
		TRY0(xmlTextWriterWriteFormatString(
			writer, "%d",
			(int)isc_refcount_current(&sock->references)));
		TRY0(xmlTextWriterEndElement(writer));

		TRY0(xmlTextWriterWriteElement(
			writer, ISC_XMLCHAR "type",
			ISC_XMLCHAR _socktype(sock->type)));

		if (sock->connected) {
			isc_sockaddr_format(&sock->peer_address, peerbuf,
					    sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "peer-address",
				ISC_XMLCHAR peerbuf));
		}

		/* Local address: silently omitted if getsockname fails. */
		len = sizeof(addr);
		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "local-address",
				ISC_XMLCHAR peerbuf));
		}

		TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"));
		if (sock->listener) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "listener"));
		}
		if (sock->connected) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connected"));
		}
		if (sock->connecting) {
			TRY0(xmlTextWriterWriteElement(
				writer, ISC_XMLCHAR "state",
				ISC_XMLCHAR "connecting"));
		}
		if (sock->bound) {
			TRY0(xmlTextWriterWriteElement(writer,
						       ISC_XMLCHAR "state",
						       ISC_XMLCHAR "bound"));
		}

		TRY0(xmlTextWriterEndElement(writer)); /* states */

		TRY0(xmlTextWriterEndElement(writer)); /* socket */

		UNLOCK(&sock->lock);
		sock = ISC_LIST_NEXT(sock, link);
	}
	TRY0(xmlTextWriterEndElement(writer)); /* sockets */

error:
	/* On the error path 'sock' is non-NULL iff its lock is held. */
	if (sock != NULL) {
		UNLOCK(&sock->lock);
	}

	UNLOCK(&mgr->lock);

	return (xmlrc);
}
#endif /* HAVE_LIBXML2 */
5516 
5517 #ifdef HAVE_JSON_C
5518 #define CHECKMEM(m)                              \
5519 	do {                                     \
5520 		if (m == NULL) {                 \
5521 			result = ISC_R_NOMEMORY; \
5522 			goto error;              \
5523 		}                                \
5524 	} while (0)
5525 
5526 isc_result_t
isc_socketmgr_renderjson(isc_socketmgr_t * mgr,void * stats0)5527 isc_socketmgr_renderjson(isc_socketmgr_t *mgr, void *stats0) {
5528 	isc_result_t result = ISC_R_SUCCESS;
5529 	isc_socket_t *sock = NULL;
5530 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
5531 	isc_sockaddr_t addr;
5532 	socklen_t len;
5533 	json_object *obj, *array = json_object_new_array();
5534 	json_object *stats = (json_object *)stats0;
5535 
5536 	CHECKMEM(array);
5537 
5538 	LOCK(&mgr->lock);
5539 
5540 	sock = ISC_LIST_HEAD(mgr->socklist);
5541 	while (sock != NULL) {
5542 		json_object *states, *entry = json_object_new_object();
5543 		char buf[255];
5544 
5545 		CHECKMEM(entry);
5546 		json_object_array_add(array, entry);
5547 
5548 		LOCK(&sock->lock);
5549 
5550 		snprintf(buf, sizeof(buf), "%p", sock);
5551 		obj = json_object_new_string(buf);
5552 		CHECKMEM(obj);
5553 		json_object_object_add(entry, "id", obj);
5554 
5555 		if (sock->name[0] != 0) {
5556 			obj = json_object_new_string(sock->name);
5557 			CHECKMEM(obj);
5558 			json_object_object_add(entry, "name", obj);
5559 		}
5560 
5561 		obj = json_object_new_int(
5562 			(int)isc_refcount_current(&sock->references));
5563 		CHECKMEM(obj);
5564 		json_object_object_add(entry, "references", obj);
5565 
5566 		obj = json_object_new_string(_socktype(sock->type));
5567 		CHECKMEM(obj);
5568 		json_object_object_add(entry, "type", obj);
5569 
5570 		if (sock->connected) {
5571 			isc_sockaddr_format(&sock->peer_address, peerbuf,
5572 					    sizeof(peerbuf));
5573 			obj = json_object_new_string(peerbuf);
5574 			CHECKMEM(obj);
5575 			json_object_object_add(entry, "peer-address", obj);
5576 		}
5577 
5578 		len = sizeof(addr);
5579 		if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
5580 			isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
5581 			obj = json_object_new_string(peerbuf);
5582 			CHECKMEM(obj);
5583 			json_object_object_add(entry, "local-address", obj);
5584 		}
5585 
5586 		states = json_object_new_array();
5587 		CHECKMEM(states);
5588 		json_object_object_add(entry, "states", states);
5589 
5590 		if (sock->listener) {
5591 			obj = json_object_new_string("listener");
5592 			CHECKMEM(obj);
5593 			json_object_array_add(states, obj);
5594 		}
5595 
5596 		if (sock->connected) {
5597 			obj = json_object_new_string("connected");
5598 			CHECKMEM(obj);
5599 			json_object_array_add(states, obj);
5600 		}
5601 
5602 		if (sock->connecting) {
5603 			obj = json_object_new_string("connecting");
5604 			CHECKMEM(obj);
5605 			json_object_array_add(states, obj);
5606 		}
5607 
5608 		if (sock->bound) {
5609 			obj = json_object_new_string("bound");
5610 			CHECKMEM(obj);
5611 			json_object_array_add(states, obj);
5612 		}
5613 
5614 		UNLOCK(&sock->lock);
5615 		sock = ISC_LIST_NEXT(sock, link);
5616 	}
5617 
5618 	json_object_object_add(stats, "sockets", array);
5619 	array = NULL;
5620 	result = ISC_R_SUCCESS;
5621 
5622 error:
5623 	if (array != NULL) {
5624 		json_object_put(array);
5625 	}
5626 
5627 	if (sock != NULL) {
5628 		UNLOCK(&sock->lock);
5629 	}
5630 
5631 	UNLOCK(&mgr->lock);
5632 
5633 	return (result);
5634 }
5635 #endif /* HAVE_JSON_C */
5636 
/*
 * Create an fdwatch socket wrapping the already-open descriptor 'fd',
 * managed by 'manager'.  When the descriptor becomes readable and/or
 * writable (per the ISC_SOCKFDWATCH_* bits in 'flags'), 'callback' is
 * invoked in the context of 'task' with 'cbarg'.  The new socket is
 * returned in '*socketp'.  Returns ISC_R_RANGE if 'fd' is outside the
 * manager's descriptor table.
 */
isc_result_t
isc_socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
			 isc_sockfdwatch_t callback, void *cbarg,
			 isc_task_t *task, isc_socket_t **socketp)
{
	isc_socket_t *sock = NULL;
	isc__socketthread_t *thread;
	isc_result_t result;
	int lockid;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);

	if (fd < 0 || (unsigned int)fd >= manager->maxsocks)
		return (ISC_R_RANGE);

	result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
	if (result != ISC_R_SUCCESS)
		return (result);

	sock->fd = fd;
	sock->fdwatcharg = cbarg;
	sock->fdwatchcb = callback;
	sock->fdwatchflags = flags;
	sock->fdwatchtask = task;

	/* Assign the socket to a watcher thread and give the caller
	 * the initial reference. */
	sock->threadid = gen_threadid(sock);
	isc_refcount_init(&sock->references, 1);
	thread = &manager->threads[sock->threadid];
	*socketp = (isc_socket_t *)sock;

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

	lockid = FDLOCK_ID(sock->fd);
	LOCK(&thread->fdlock[lockid]);
	thread->fds[sock->fd] = sock;
	thread->fdstate[sock->fd] = MANAGED;

#if defined(USE_EPOLL)
	manager->epoll_events[sock->fd] = 0;
#endif
#ifdef USE_DEVPOLL
	INSIST(thread->fdpollinfo[sock->fd].want_read == 0 &&
	       thread->fdpollinfo[sock->fd].want_write == 0);
#endif /* ifdef USE_DEVPOLL */
	UNLOCK(&thread->fdlock[lockid]);

	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->socklist, sock, link);
#ifdef USE_SELECT
	if (thread->maxfd < sock->fd)
		thread->maxfd = sock->fd;
#endif
	UNLOCK(&manager->lock);

	/* Start watching in the requested direction(s) immediately. */
	sock->active = 1;
	if (flags & ISC_SOCKFDWATCH_READ)
		select_poke(sock->manager, sock->threadid, sock->fd,
		    SELECT_POKE_READ);
	if (flags & ISC_SOCKFDWATCH_WRITE)
		select_poke(sock->manager, sock->threadid, sock->fd,
		    SELECT_POKE_WRITE);

	socket_log(sock, NULL, CREATION, "fdwatch-created");

	return (ISC_R_SUCCESS);
}
5713 
5714 /*
5715  * Indicate to the manager that it should watch the socket again.
5716  * This can be used to restart watching if the previous event handler
5717  * didn't indicate there was more data to be processed.  Primarily
5718  * it is for writing but could be used for reading if desired
5719  */
5720 
5721 isc_result_t
isc_socket_fdwatchpoke(isc_socket_t * sock,int flags)5722 isc_socket_fdwatchpoke(isc_socket_t *sock, int flags)
5723 {
5724 	REQUIRE(VALID_SOCKET(sock));
5725 
5726 	/*
5727 	 * We check both flags first to allow us to get the lock
5728 	 * once but only if we need it.
5729 	 */
5730 
5731 	if ((flags & (ISC_SOCKFDWATCH_READ | ISC_SOCKFDWATCH_WRITE)) != 0) {
5732 		LOCK(&sock->lock);
5733 		if ((flags & ISC_SOCKFDWATCH_READ) != 0)
5734 			select_poke(sock->manager, sock->threadid, sock->fd,
5735 				    SELECT_POKE_READ);
5736 		if ((flags & ISC_SOCKFDWATCH_WRITE) != 0)
5737 			select_poke(sock->manager, sock->threadid, sock->fd,
5738 				    SELECT_POKE_WRITE);
5739 		UNLOCK(&sock->lock);
5740 	}
5741 
5742 	socket_log(sock, NULL, TRACE, "fdwatch-poked flags: %d", flags);
5743 
5744 	return (ISC_R_SUCCESS);
5745 }
5746