xref: /openbsd-src/usr.bin/dig/lib/isc/unix/socket.c (revision 99fd087599a8791921855f21bd7e36130f39aadc)
1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
9  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
11  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
13  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14  * PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 /*! \file */
18 
19 #include <sys/socket.h>
20 #include <sys/time.h>
21 #include <sys/uio.h>
22 
23 #include <netinet/tcp.h>
24 
25 #include <errno.h>
26 #include <fcntl.h>
27 #include <stddef.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <unistd.h>
31 
32 #include <isc/buffer.h>
33 #include <isc/bufferlist.h>
34 
35 #include <isc/list.h>
36 #include <isc/log.h>
37 #include <isc/net.h>
38 #include <isc/region.h>
39 #include <isc/socket.h>
40 #include <isc/task.h>
41 #include <isc/util.h>
42 
43 #include "errno2result.h"
44 
45 #include "socket_p.h"
46 #include "../task_p.h"
47 
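/*
 * Scratch state for the select()-based watcher used by the private
 * wait/dispatch interface in socket_p.h: the read/write fd sets handed
 * to select(), together with its result and the highest descriptor to
 * scan.
 */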
48 struct isc_socketwait {
49 	fd_set *readset;
50 	fd_set *writeset;
51 	int nfds;
52 	int maxfd;
53 };
54 
55 /*
56  * Set by the -T dscp option on the command line. If set to a value
57  * other than -1, we check to make sure DSCP values match it, and
58  * assert if not.
59  */
60 int isc_dscp_check_value = -1;
61 
62 /*%
63  * Some systems define the socket length argument as an int, some as size_t,
64  * some as socklen_t.  This code simply uses socklen_t throughout.
65  */
66 
67 /*%
68  * Define what the possible "soft" errors can be.  These are non-fatal returns
69  * of various network related functions, like recv() and so on.
70  *
71  * For some reason, BSDI (and perhaps others) will sometimes return <0
72  * from recv() but will have errno==0.  This is broken, but we have to
73  * work around it here.
74  */
75 #define SOFT_ERROR(e)	((e) == EAGAIN || \
76 			 (e) == EWOULDBLOCK || \
77 			 (e) == EINTR || \
78 			 (e) == 0)
79 
80 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
81 
82 /*!<
83  * DLVL(90)  --  Function entry/exit and other tracing.
84  * DLVL(60)  --  Socket data send/receive
85  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
86  * DLVL(20)  --  Socket creation/destruction.
87  */
88 #define TRACE_LEVEL		90
89 #define IOEVENT_LEVEL		60
90 #define EVENT_LEVEL		50
91 #define CREATION_LEVEL		20
92 
93 #define TRACE		DLVL(TRACE_LEVEL)
94 #define IOEVENT		DLVL(IOEVENT_LEVEL)
95 #define EVENT		DLVL(EVENT_LEVEL)
96 #define CREATION	DLVL(CREATION_LEVEL)
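/*
 * Note that TRACE, IOEVENT, EVENT and CREATION each expand to the full
 * category/module/level argument triple, so for example
 *
 *	socket_log(sock, NULL, CREATION, "created");
 *
 * passes ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET and
 * ISC_LOG_DEBUG(20) in a single token.
 */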
97 
98 typedef isc_event_t intev_t;
99 
100 /*!
101  * IPv6 control information.  If the socket is an IPv6 socket we want
102  * to collect the destination address and interface so the client can
103  * set them on outgoing packets.
104  */
105 
106 /*%
107  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
108  * a setsockopt() like interface to request timestamps, and if the OS
109  * doesn't do it for us, call gettimeofday() on every UDP receive?
110  */
111 
112 /*%
113  * Instead of calculating the cmsg buffer lengths every time, we take a
114  * rule-of-thumb approach: the constants below are the x86_64 Linux values,
115  * and the buffer lengths double their sum, so everything should fit.
116  * These sizes are not large enough to cause any concern.
117  */
118 #define CMSG_SP_IN6PKT 40
119 
120 #define CMSG_SP_TIMESTAMP 32
121 
122 #define CMSG_SP_TCTOS 24
123 
124 #define CMSG_SP_INT 24
125 
126 #define RECVCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS)+1)
127 #define SENDCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS)+1)
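/*
 * Illustrative arithmetic, assuming x86_64 Linux where struct cmsghdr is
 * 16 bytes and CMSG_SPACE() pads to 8-byte alignment:
 *
 *	CMSG_SPACE(sizeof(struct in6_pktinfo)) = 16 + 24 = 40	(CMSG_SP_IN6PKT)
 *	CMSG_SPACE(sizeof(struct timeval))     = 16 + 16 = 32	(CMSG_SP_TIMESTAMP)
 *	CMSG_SPACE(sizeof(int))                = 16 +  8 = 24	(CMSG_SP_INT, CMSG_SP_TCTOS)
 *
 * RECVCMSGBUFLEN and SENDCMSGBUFLEN double the sum of the relevant
 * constants, so the real CMSG_SPACE() values on any supported platform
 * fit comfortably.
 */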
128 
129 /*%
130  * The number of times a send operation is repeated if the result is EINTR.
131  */
132 #define NRETRIES 10
133 
134 struct isc_socket {
135 	/* Not locked. */
136 	isc_socketmgr_t	*manager;
137 	isc_sockettype_t	type;
138 
139 	/* Locked by socket lock. */
140 	ISC_LINK(isc_socket_t)	link;
141 	unsigned int		references;
142 	int			fd;
143 	int			pf;
144 
145 	ISC_LIST(isc_socketevent_t)		send_list;
146 	ISC_LIST(isc_socketevent_t)		recv_list;
147 	isc_socket_connev_t		       *connect_ev;
148 
149 	/*
150 	 * Internal events.  Posted when a descriptor is readable or
151 	 * writable.  They are embedded in the socket structure, never freed
152 	 * separately, and are initialized as non-purgeable before use.
153 	 */
154 	intev_t			readable_ev;
155 	intev_t			writable_ev;
156 
157 	isc_sockaddr_t		peer_address;       /* remote address */
158 
159 	unsigned int		pending_recv : 1,
160 				pending_send : 1,
161 				connected : 1,
162 				connecting : 1,     /* connect pending */
163 				bound : 1,          /* bound to local addr */
164 				active : 1,         /* currently active */
165 				pktdscp : 1;	    /* per packet dscp */
166 	unsigned int		dscp;
167 };
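/*
 * 'references' counts both external isc_socket_attach() references and
 * pending internal readable/writable events; the socket is destroyed as
 * soon as it drops to zero.
 */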
168 
169 struct isc_socketmgr {
170 	/* Not locked. */
171 	int			fd_bufsize;
172 	unsigned int		maxsocks;
173 
174 	isc_socket_t	       **fds;
175 	int			*fdstate;
176 
177 	/* Locked by manager lock. */
178 	ISC_LIST(isc_socket_t)	socklist;
179 	fd_set			*read_fds;
180 	fd_set			*read_fds_copy;
181 	fd_set			*write_fds;
182 	fd_set			*write_fds_copy;
183 	int			maxfd;
184 	unsigned int		refs;
185 };
186 
187 static isc_socketmgr_t *socketmgr = NULL;
188 
189 #define CLOSED			0	/* this one must be zero */
190 #define MANAGED			1
191 #define CLOSE_PENDING		2
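/*
 * fdstate[] life cycle: descriptors start CLOSED, become MANAGED when a
 * socket is created in socket_create(), move to CLOSE_PENDING in
 * socketclose(), and return to CLOSED in wakeup_socket(), which also
 * close()s the descriptor.
 */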
192 
193 /*
194  * send() and recv() iovec counts
195  */
196 #define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
197 #define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
198 
199 static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
200 				  isc_sockettype_t type,
201 				  isc_socket_t **socketp);
202 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
203 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
204 static void free_socket(isc_socket_t **);
205 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
206 				    isc_socket_t **);
207 static void destroy(isc_socket_t **);
208 static void internal_connect(isc_task_t *, isc_event_t *);
209 static void internal_recv(isc_task_t *, isc_event_t *);
210 static void internal_send(isc_task_t *, isc_event_t *);
211 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
212 static void build_msghdr_send(isc_socket_t *, char *, isc_socketevent_t *,
213 			      struct msghdr *, struct iovec *, size_t *);
214 static void build_msghdr_recv(isc_socket_t *, char *, isc_socketevent_t *,
215 			      struct msghdr *, struct iovec *, size_t *);
216 
217 #define SELECT_POKE_SHUTDOWN		(-1)
218 #define SELECT_POKE_READ		(-3)
219 #define SELECT_POKE_WRITE		(-4)
220 #define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
221 #define SELECT_POKE_CLOSE		(-5)
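/*
 * These values are handed to select_poke(), which (via wakeup_socket())
 * turns READ/WRITE into FD_SET() updates on the manager's fd sets and
 * CLOSE into the actual close() of a CLOSE_PENDING descriptor.  SHUTDOWN
 * is ignored in this non-threaded implementation.
 */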
222 
223 #define SOCK_DEAD(s)			((s)->references == 0)
224 
225 /*%
226  * Shortcut indexes used to access statistics counters.
227  */
228 enum {
229 	STATID_OPEN = 0,
230 	STATID_OPENFAIL = 1,
231 	STATID_CLOSE = 2,
232 	STATID_BINDFAIL = 3,
233 	STATID_CONNECTFAIL = 4,
234 	STATID_CONNECT = 5,
235 	STATID_ACCEPTFAIL = 6,
236 	STATID_ACCEPT = 7,
237 	STATID_SENDFAIL = 8,
238 	STATID_RECVFAIL = 9,
239 	STATID_ACTIVE = 10
240 };
241 
242 static void
243 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
244 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
245 	   const char *fmt, ...) __attribute__((__format__(__printf__, 6, 7)));
246 static void
247 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
248 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
249 	   const char *fmt, ...)
250 {
251 	char msgbuf[2048];
252 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
253 	va_list ap;
254 
255 	if (! isc_log_wouldlog(isc_lctx, level))
256 		return;
257 
258 	va_start(ap, fmt);
259 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
260 	va_end(ap);
261 
262 	if (address == NULL) {
263 		isc_log_write(isc_lctx, category, module, level,
264 			       "socket %p: %s", sock, msgbuf);
265 	} else {
266 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
267 		isc_log_write(isc_lctx, category, module, level,
268 			       "socket %p %s: %s", sock, peerbuf, msgbuf);
269 	}
270 }
271 
272 static inline isc_result_t
273 watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
274 	isc_result_t result = ISC_R_SUCCESS;
275 
276 	if (msg == SELECT_POKE_READ)
277 		FD_SET(fd, manager->read_fds);
278 	if (msg == SELECT_POKE_WRITE)
279 		FD_SET(fd, manager->write_fds);
280 
281 	return (result);
282 }
283 
284 static inline isc_result_t
285 unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
286 	isc_result_t result = ISC_R_SUCCESS;
287 
288 	if (msg == SELECT_POKE_READ)
289 		FD_CLR(fd, manager->read_fds);
290 	else if (msg == SELECT_POKE_WRITE)
291 		FD_CLR(fd, manager->write_fds);
292 
293 	return (result);
294 }
295 
296 static void
297 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
298 	isc_result_t result;
299 
300 	/*
301 	 * This is a wakeup on a socket.  If the socket is not in the
302 	 * process of being closed, start watching it for either reads
303 	 * or writes.
304 	 */
305 
306 	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
307 
308 	if (msg == SELECT_POKE_CLOSE) {
309 		/* No one should be updating fdstate, so no need to lock it */
310 		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
311 		manager->fdstate[fd] = CLOSED;
312 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
313 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
314 		(void)close(fd);
315 		return;
316 	}
317 
318 	if (manager->fdstate[fd] == CLOSE_PENDING) {
319 
320 		/*
321 		 * We accept (and ignore) any error from unwatch_fd() as we are
322 		 * closing the socket, hoping it doesn't leave dangling state in
323 		 * the kernel.
324 		 */
325 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
326 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
327 		return;
328 	}
329 	if (manager->fdstate[fd] != MANAGED) {
330 		return;
331 	}
332 
333 	/*
334 	 * Set requested bit.
335 	 */
336 	result = watch_fd(manager, fd, msg);
337 	if (result != ISC_R_SUCCESS) {
338 		/*
339 		 * XXXJT: what should we do?  Ignoring the failure of watching
340 		 * a socket will make the application dysfunctional, but there
341 		 * seems to be no reasonable recovery process.
342 		 */
343 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
344 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
345 			      "failed to start watching FD (%d): %s",
346 			      fd, isc_result_totext(result));
347 	}
348 }
349 
350 /*
351  * Update the state of the socketmgr when something changes.
352  */
353 static void
354 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
355 	if (msg == SELECT_POKE_SHUTDOWN)
356 		return;
357 	else if (fd >= 0)
358 		wakeup_socket(manager, fd, msg);
359 	return;
360 }
361 
362 /*
363  * Make a fd non-blocking.
364  */
365 static isc_result_t
366 make_nonblock(int fd) {
367 	int ret;
368 	int flags;
369 
370 	flags = fcntl(fd, F_GETFL, 0);
371 	flags |= O_NONBLOCK;
372 	ret = fcntl(fd, F_SETFL, flags);
373 
374 	if (ret == -1) {
375 		UNEXPECTED_ERROR(__FILE__, __LINE__,
376 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
377 				 strerror(errno));
378 		return (ISC_R_UNEXPECTED);
379 	}
380 
381 	return (ISC_R_SUCCESS);
382 }
383 
384 /*
385  * Not all OSes support the advanced CMSG macros CMSG_LEN and CMSG_SPACE,
386  * so the portable ISC code wraps them in the functions below.  On this
387  * platform both macros exist, so the wrappers simply call them.
388  */
391 static inline socklen_t
392 cmsg_len(socklen_t len) {
393 	return (CMSG_LEN(len));
394 }
395 
396 static inline socklen_t
397 cmsg_space(socklen_t len) {
398 	return (CMSG_SPACE(len));
399 }
400 
401 /*
402  * Process control messages received on a socket.
403  */
404 static void
405 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
406 	struct cmsghdr *cmsgp;
407 	struct in6_pktinfo *pktinfop;
408 	void *timevalp;
409 
410 	/*
411 	/*
412 	 * In the original ISC code these arguments were referenced only under
413 	 * certain CPP conditionals, hence the UNUSED() calls below.  Here they
414 	 * are always used, so the calls are harmless no-ops.
415 	 */
417 	UNUSED(msg);
418 	UNUSED(dev);
419 
420 	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
421 		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
422 
423 	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
424 		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
425 
426 	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
427 		return;
428 
429 	timevalp = NULL;
430 	pktinfop = NULL;
431 
432 	cmsgp = CMSG_FIRSTHDR(msg);
433 	while (cmsgp != NULL) {
434 		socket_log(sock, NULL, TRACE,
435 			   "processing cmsg %p", cmsgp);
436 
437 		if (cmsgp->cmsg_level == IPPROTO_IPV6
438 		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
439 
440 			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
441 			memmove(&dev->pktinfo, pktinfop,
442 				sizeof(struct in6_pktinfo));
443 			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
444 			socket_log(sock, NULL, TRACE,
445 				   "interface received on ifindex %u",
446 				   dev->pktinfo.ipi6_ifindex);
447 			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
448 				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
449 			goto next;
450 		}
451 
452 		if (cmsgp->cmsg_level == SOL_SOCKET
453 		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
454 			struct timeval tv;
455 			timevalp = CMSG_DATA(cmsgp);
456 			memmove(&tv, timevalp, sizeof(tv));
457 			TIMEVAL_TO_TIMESPEC(&tv, &dev->timestamp);
458 			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
459 			goto next;
460 		}
461 
462 		if (cmsgp->cmsg_level == IPPROTO_IPV6
463 		    && cmsgp->cmsg_type == IPV6_TCLASS) {
464 			dev->dscp = *(int *)CMSG_DATA(cmsgp);
465 			dev->dscp >>= 2;
466 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
467 			goto next;
468 		}
469 
470 		if (cmsgp->cmsg_level == IPPROTO_IP
471 		    && (cmsgp->cmsg_type == IP_TOS)) {
472 			dev->dscp = (int) *(unsigned char *)CMSG_DATA(cmsgp);
473 			dev->dscp >>= 2;
474 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
475 			goto next;
476 		}
477 	next:
478 		cmsgp = CMSG_NXTHDR(msg, cmsgp);
479 	}
480 
481 }
482 
483 /*
484  * Construct an iov array and attach it to the msghdr passed in.  This is
485  * the SEND constructor, which will use the used region of the buffer
486  * (if using a buffer list) or will use the internal region (if a single
487  * buffer I/O is requested).
488  *
489  * Nothing can be NULL, and the done event must supply either a single
490  * region or at least one buffer on its buffer list to be meaningful.
491  *
492  * If write_countp != NULL, *write_countp will hold the number of bytes
493  * this transaction can send.
494  */
495 static void
496 build_msghdr_send(isc_socket_t *sock, char* cmsgbuf, isc_socketevent_t *dev,
497 		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
498 {
499 	unsigned int iovcount;
500 	isc_buffer_t *buffer;
501 	isc_region_t used;
502 	size_t write_count;
503 	size_t skip_count;
504 	struct cmsghdr *cmsgp;
505 
506 	memset(msg, 0, sizeof(*msg));
507 
508 	if (!sock->connected) {
509 		msg->msg_name = (void *)&dev->address.type.sa;
510 		msg->msg_namelen = dev->address.length;
511 	} else {
512 		msg->msg_name = NULL;
513 		msg->msg_namelen = 0;
514 	}
515 
516 	buffer = ISC_LIST_HEAD(dev->bufferlist);
517 	write_count = 0;
518 	iovcount = 0;
519 
520 	/*
521 	 * Single buffer I/O?  Skip what we've done so far in this region.
522 	 */
523 	if (buffer == NULL) {
524 		write_count = dev->region.length - dev->n;
525 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
526 		iov[0].iov_len = write_count;
527 		iovcount = 1;
528 
529 		goto config;
530 	}
531 
532 	/*
533 	 * Multibuffer I/O.
534 	 * Skip the data in the buffer list that we have already written.
535 	 */
536 	skip_count = dev->n;
537 	while (buffer != NULL) {
538 		if (skip_count < isc_buffer_usedlength(buffer))
539 			break;
540 		skip_count -= isc_buffer_usedlength(buffer);
541 		buffer = ISC_LIST_NEXT(buffer, link);
542 	}
543 
544 	while (buffer != NULL) {
545 		INSIST(iovcount < MAXSCATTERGATHER_SEND);
546 
547 		isc_buffer_usedregion(buffer, &used);
548 
549 		if (used.length > 0) {
550 			iov[iovcount].iov_base = (void *)(used.base
551 							  + skip_count);
552 			iov[iovcount].iov_len = used.length - skip_count;
553 			write_count += (used.length - skip_count);
554 			skip_count = 0;
555 			iovcount++;
556 		}
557 		buffer = ISC_LIST_NEXT(buffer, link);
558 	}
559 
560 	INSIST(skip_count == 0U);
561 
562  config:
563 	msg->msg_iov = iov;
564 	msg->msg_iovlen = iovcount;
565 
566 	msg->msg_control = NULL;
567 	msg->msg_controllen = 0;
568 	msg->msg_flags = 0;
569 
570 	if ((sock->type == isc_sockettype_udp) &&
571 	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
572 	{
573 		struct in6_pktinfo *pktinfop;
574 
575 		socket_log(sock, NULL, TRACE,
576 			   "sendto pktinfo data, ifindex %u",
577 			   dev->pktinfo.ipi6_ifindex);
578 
579 		msg->msg_control = (void *)cmsgbuf;
580 		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
581 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
582 
583 		cmsgp = (struct cmsghdr *)cmsgbuf;
584 		cmsgp->cmsg_level = IPPROTO_IPV6;
585 		cmsgp->cmsg_type = IPV6_PKTINFO;
586 		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
587 		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
588 		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
589 	}
590 
591 	if ((sock->type == isc_sockettype_udp) &&
592 	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
593 	{
594 		int use_min_mtu = 1;	/* -1, 0, 1 */
595 
596 		cmsgp = (struct cmsghdr *)(cmsgbuf +
597 					   msg->msg_controllen);
598 
599 		msg->msg_control = (void *)cmsgbuf;
600 		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
601 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
602 
603 		cmsgp->cmsg_level = IPPROTO_IPV6;
604 		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
605 		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
606 		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
607 	}
608 
609 	if (isc_dscp_check_value > -1) {
610 		if (sock->type == isc_sockettype_udp)
611 			INSIST((int)dev->dscp == isc_dscp_check_value);
612 		else if (sock->type == isc_sockettype_tcp)
613 			INSIST((int)sock->dscp == isc_dscp_check_value);
614 	}
615 
616 	if ((sock->type == isc_sockettype_udp) &&
617 	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
618 	{
619 		int dscp = (dev->dscp << 2) & 0xff;
620 
621 		INSIST(dev->dscp < 0x40);
622 
623 		if (sock->pf == AF_INET && sock->pktdscp) {
624 			cmsgp = (struct cmsghdr *)(cmsgbuf +
625 						   msg->msg_controllen);
626 			msg->msg_control = (void *)cmsgbuf;
627 			msg->msg_controllen += cmsg_space(sizeof(dscp));
628 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
629 
630 			cmsgp->cmsg_level = IPPROTO_IP;
631 			cmsgp->cmsg_type = IP_TOS;
632 			cmsgp->cmsg_len = cmsg_len(sizeof(char));
633 			*(unsigned char*)CMSG_DATA(cmsgp) = dscp;
634 		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
635 			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
636 			       (void *)&dscp, sizeof(int)) < 0)
637 			{
638 				UNEXPECTED_ERROR(__FILE__, __LINE__,
639 						 "setsockopt(%d, IP_TOS, %.02x)"
640 						 " %s: %s",
641 						 sock->fd, dscp >> 2,
642 						 "failed", strerror(errno));
643 			} else
644 				sock->dscp = dscp;
645 		}
646 
647 		if (sock->pf == AF_INET6 && sock->pktdscp) {
648 			cmsgp = (struct cmsghdr *)(cmsgbuf +
649 						   msg->msg_controllen);
650 			msg->msg_control = (void *)cmsgbuf;
651 			msg->msg_controllen += cmsg_space(sizeof(dscp));
652 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
653 
654 			cmsgp->cmsg_level = IPPROTO_IPV6;
655 			cmsgp->cmsg_type = IPV6_TCLASS;
656 			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
657 			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
658 		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
659 			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
660 				       (void *)&dscp, sizeof(int)) < 0) {
661 				UNEXPECTED_ERROR(__FILE__, __LINE__,
662 						 "setsockopt(%d, IPV6_TCLASS, "
663 						 "%.02x) %s: %s",
664 						 sock->fd, dscp >> 2,
665 						 "failed", strerror(errno));
666 			} else
667 				sock->dscp = dscp;
668 		}
669 
670 		if (msg->msg_controllen != 0 &&
671 		    msg->msg_controllen < SENDCMSGBUFLEN)
672 		{
673 			memset(cmsgbuf + msg->msg_controllen, 0,
674 			       SENDCMSGBUFLEN - msg->msg_controllen);
675 		}
676 	}
677 
678 	if (write_countp != NULL)
679 		*write_countp = write_count;
680 }
681 
682 /*
683  * Construct an iov array and attach it to the msghdr passed in.  This is
684  * the RECV constructor, which will use the available region of the buffer
685  * (if using a buffer list) or will use the internal region (if a single
686  * buffer I/O is requested).
687  *
688  * Nothing can be NULL, and the done event must supply either a single
689  * region or at least one buffer on its buffer list to be meaningful.
690  *
691  * If read_countp != NULL, *read_countp will hold the number of bytes
692  * this transaction can receive.
693  */
694 static void
695 build_msghdr_recv(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
696 		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
697 {
698 	unsigned int iovcount;
699 	isc_buffer_t *buffer;
700 	isc_region_t available;
701 	size_t read_count;
702 
703 	memset(msg, 0, sizeof(struct msghdr));
704 
705 	if (sock->type == isc_sockettype_udp) {
706 		memset(&dev->address, 0, sizeof(dev->address));
707 		msg->msg_name = (void *)&dev->address.type.sa;
708 		msg->msg_namelen = sizeof(dev->address.type);
709 	} else { /* TCP */
710 		msg->msg_name = NULL;
711 		msg->msg_namelen = 0;
712 		dev->address = sock->peer_address;
713 	}
714 
715 	buffer = ISC_LIST_HEAD(dev->bufferlist);
716 	read_count = 0;
717 
718 	/*
719 	 * Single buffer I/O?  Skip what we've done so far in this region.
720 	 */
721 	if (buffer == NULL) {
722 		read_count = dev->region.length - dev->n;
723 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
724 		iov[0].iov_len = read_count;
725 		iovcount = 1;
726 
727 		goto config;
728 	}
729 
730 	/*
731 	 * Multibuffer I/O.
732 	 * Skip empty buffers.
733 	 */
734 	while (buffer != NULL) {
735 		if (isc_buffer_availablelength(buffer) != 0)
736 			break;
737 		buffer = ISC_LIST_NEXT(buffer, link);
738 	}
739 
740 	iovcount = 0;
741 	while (buffer != NULL) {
742 		INSIST(iovcount < MAXSCATTERGATHER_RECV);
743 
744 		isc_buffer_availableregion(buffer, &available);
745 
746 		if (available.length > 0) {
747 			iov[iovcount].iov_base = (void *)(available.base);
748 			iov[iovcount].iov_len = available.length;
749 			read_count += available.length;
750 			iovcount++;
751 		}
752 		buffer = ISC_LIST_NEXT(buffer, link);
753 	}
754 
755  config:
756 
757 	/*
758 	 * Attach the iov array and the control-message buffer to the msghdr.
759 	 */
760 	msg->msg_iov = iov;
761 	msg->msg_iovlen = iovcount;
762 
763 	msg->msg_control = cmsgbuf;
764 	msg->msg_controllen = RECVCMSGBUFLEN;
765 	msg->msg_flags = 0;
766 
767 	if (read_countp != NULL)
768 		*read_countp = read_count;
769 }
770 
771 static void
772 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
773 		isc_socketevent_t *dev)
774 {
775 	if (sock->type == isc_sockettype_udp) {
776 		if (address != NULL)
777 			dev->address = *address;
778 		else
779 			dev->address = sock->peer_address;
780 	} else if (sock->type == isc_sockettype_tcp) {
781 		INSIST(address == NULL);
782 		dev->address = sock->peer_address;
783 	}
784 }
785 
786 static void
787 destroy_socketevent(isc_event_t *event) {
788 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
789 
790 	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
791 
792 	(ev->destroy)(event);
793 }
794 
795 static isc_socketevent_t *
796 allocate_socketevent(void *sender,
797 		     isc_eventtype_t eventtype, isc_taskaction_t action,
798 		     void *arg)
799 {
800 	isc_socketevent_t *ev;
801 
802 	ev = (isc_socketevent_t *)isc_event_allocate(sender,
803 						     eventtype, action, arg,
804 						     sizeof(*ev));
805 
806 	if (ev == NULL)
807 		return (NULL);
808 
809 	ev->result = ISC_R_UNSET;
810 	ISC_LINK_INIT(ev, ev_link);
811 	ISC_LIST_INIT(ev->bufferlist);
812 	ev->region.base = NULL;
813 	ev->n = 0;
814 	ev->offset = 0;
815 	ev->attributes = 0;
816 	ev->destroy = ev->ev_destroy;
817 	ev->ev_destroy = destroy_socketevent;
818 	ev->dscp = 0;
819 
820 	return (ev);
821 }
822 
823 #define DOIO_SUCCESS		0	/* i/o ok, event sent */
824 #define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
825 #define DOIO_HARD		2	/* i/o error, event sent */
826 #define DOIO_EOF		3	/* EOF, no event sent */
827 
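/*
 * doio_recv() returns:
 *
 *	DOIO_SUCCESS	The read completed; dev->result is ISC_R_SUCCESS and
 *			the completion event can be posted.
 *	DOIO_SOFT	A soft error or short read occurred; no event is
 *			sent and the operation should be retried later.
 *	DOIO_HARD	A hard I/O error occurred; dev->result holds the
 *			error and the completion event can be posted.
 *	DOIO_EOF	A zero-length read on a TCP socket; no event is
 *			sent, and the caller reports ISC_R_EOF.
 */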
828 static int
829 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
830 	int cc;
831 	struct iovec iov[MAXSCATTERGATHER_RECV];
832 	size_t read_count;
833 	size_t actual_count;
834 	struct msghdr msghdr;
835 	isc_buffer_t *buffer;
836 	int recv_errno;
837 	char cmsgbuf[RECVCMSGBUFLEN] = {0};
838 
839 	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);
840 
841 	cc = recvmsg(sock->fd, &msghdr, 0);
842 	recv_errno = errno;
843 
844 	if (cc < 0) {
845 		if (SOFT_ERROR(recv_errno))
846 			return (DOIO_SOFT);
847 
848 		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
849 			socket_log(sock, NULL, IOEVENT,
850 				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
851 				   sock->fd, cc, recv_errno,
852 				   strerror(recv_errno));
853 		}
854 
855 #define SOFT_OR_HARD(_system, _isc) \
856 	if (recv_errno == _system) { \
857 		if (sock->connected) { \
858 			dev->result = _isc; \
859 			return (DOIO_HARD); \
860 		} \
861 		return (DOIO_SOFT); \
862 	}
863 #define ALWAYS_HARD(_system, _isc) \
864 	if (recv_errno == _system) { \
865 		dev->result = _isc; \
866 		return (DOIO_HARD); \
867 	}
868 
869 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
870 		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
871 		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
872 		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
873 		/* HPUX 11.11 can return EADDRNOTAVAIL. */
874 		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
875 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
876 		/* Should never get this one but it was seen. */
877 		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
878 		/*
879 		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
880 		 * errors.
881 		 */
882 		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
883 		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
884 
885 #undef SOFT_OR_HARD
886 #undef ALWAYS_HARD
887 
888 		dev->result = isc__errno2result(recv_errno);
889 		return (DOIO_HARD);
890 	}
891 
892 	/*
893 	 * On TCP and UNIX sockets, zero length reads indicate EOF,
894 	 * while on UDP sockets, zero length reads are perfectly valid,
895 	 * although strange.
896 	 */
897 	switch (sock->type) {
898 	case isc_sockettype_tcp:
899 		if (cc == 0)
900 			return (DOIO_EOF);
901 		break;
902 	case isc_sockettype_udp:
903 		break;
904 	default:
905 		INSIST(0);
906 	}
907 
908 	if (sock->type == isc_sockettype_udp) {
909 		dev->address.length = msghdr.msg_namelen;
910 		if (isc_sockaddr_getport(&dev->address) == 0) {
911 			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
912 				socket_log(sock, &dev->address, IOEVENT,
913 					   "dropping source port zero packet");
914 			}
915 			return (DOIO_SOFT);
916 		}
917 	}
918 
919 	socket_log(sock, &dev->address, IOEVENT,
920 		   "packet received correctly");
921 
922 	/*
923 	 * Truncation is flagged via MSG_TRUNC in process_cmsg(); there is no
924 	 * separate overflow detection here.  If there are control messages
925 	 * attached, run through them and pull out the interesting bits.
926 	 */
931 	process_cmsg(sock, &msghdr, dev);
932 
933 	/*
934 	 * update the buffers (if any) and the i/o count
935 	 */
936 	dev->n += cc;
937 	actual_count = cc;
938 	buffer = ISC_LIST_HEAD(dev->bufferlist);
939 	while (buffer != NULL && actual_count > 0U) {
940 		if (isc_buffer_availablelength(buffer) <= actual_count) {
941 			actual_count -= isc_buffer_availablelength(buffer);
942 			isc_buffer_add(buffer,
943 				       isc_buffer_availablelength(buffer));
944 		} else {
945 			isc_buffer_add(buffer, actual_count);
946 			actual_count = 0;
947 			POST(actual_count);
948 			break;
949 		}
950 		buffer = ISC_LIST_NEXT(buffer, link);
951 		if (buffer == NULL) {
952 			INSIST(actual_count == 0U);
953 		}
954 	}
955 
956 	/*
957 	 * If we read less than we expected, update counters,
958 	 * and let the upper layer poke the descriptor.
959 	 */
960 	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
961 		return (DOIO_SOFT);
962 
963 	/*
964 	 * Full reads are posted, or partials if partials are ok.
965 	 */
966 	dev->result = ISC_R_SUCCESS;
967 	return (DOIO_SUCCESS);
968 }
969 
970 /*
971  * Returns:
972  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
973  *			ISC_R_SUCCESS.
974  *
975  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
976  *			dev->result contains the appropriate error.
977  *
978  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
979  *			event was sent.  The operation should be retried.
980  *
981  *	No other return values are possible.
982  */
983 static int
984 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
985 	int cc;
986 	struct iovec iov[MAXSCATTERGATHER_SEND];
987 	size_t write_count;
988 	struct msghdr msghdr;
989 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
990 	int attempts = 0;
991 	int send_errno;
992 	char cmsgbuf[SENDCMSGBUFLEN] = {0};
993 
994 	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
995 
996  resend:
997 	cc = sendmsg(sock->fd, &msghdr, 0);
998 	send_errno = errno;
999 
1000 	/*
1001 	 * Check for error or block condition.
1002 	 */
1003 	if (cc < 0) {
1004 		if (send_errno == EINTR && ++attempts < NRETRIES)
1005 			goto resend;
1006 
1007 		if (SOFT_ERROR(send_errno)) {
1008 			if (errno == EWOULDBLOCK || errno == EAGAIN)
1009 				dev->result = ISC_R_WOULDBLOCK;
1010 			return (DOIO_SOFT);
1011 		}
1012 
1013 #define SOFT_OR_HARD(_system, _isc) \
1014 	if (send_errno == _system) { \
1015 		if (sock->connected) { \
1016 			dev->result = _isc; \
1017 			return (DOIO_HARD); \
1018 		} \
1019 		return (DOIO_SOFT); \
1020 	}
1021 #define ALWAYS_HARD(_system, _isc) \
1022 	if (send_errno == _system) { \
1023 		dev->result = _isc; \
1024 		return (DOIO_HARD); \
1025 	}
1026 
1027 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1028 		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1029 		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1030 		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1031 		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1032 		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1033 		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1034 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1035 		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1036 		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1037 		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1038 
1039 #undef SOFT_OR_HARD
1040 #undef ALWAYS_HARD
1041 
1042 		/*
1043 		 * The other error types depend on whether or not the
1044 		 * socket is UDP or TCP.  If it is UDP, some errors
1045 		 * that we expect to be fatal under TCP are merely
1046 		 * annoying, and are really soft errors.
1047 		 *
1048 		 * However, these soft errors are still returned as
1049 		 * a status.
1050 		 */
1051 		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1052 		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1053 				 addrbuf, strerror(send_errno));
1054 		dev->result = isc__errno2result(send_errno);
1055 		return (DOIO_HARD);
1056 	}
1057 
1058 	if (cc == 0) {
1059 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1060 				 "doio_send: send() %s 0", "returned");
1061 	}
1062 
1063 	/*
1064 	 * If we write less than we expected, update counters, poke.
1065 	 */
1066 	dev->n += cc;
1067 	if ((size_t)cc != write_count)
1068 		return (DOIO_SOFT);
1069 
1070 	/*
1071 	 * Exactly what we wanted to write.  We're done with this
1072 	 * entry.  Post its completion event.
1073 	 */
1074 	dev->result = ISC_R_SUCCESS;
1075 	return (DOIO_SUCCESS);
1076 }
1077 
1078 /*
1079  * Kill.
1080  *
1081  * Caller must ensure that the socket is not locked and no external
1082  * references exist.
1083  */
1084 static void
1085 socketclose(isc_socketmgr_t *manager, isc_socket_t *sock, int fd) {
1086 	/*
1087 	 * No one has this socket open, so the watcher doesn't have to be
1088 	 * poked, and the socket doesn't have to be locked.
1089 	 */
1090 	manager->fds[fd] = NULL;
1091 	manager->fdstate[fd] = CLOSE_PENDING;
1092 	select_poke(manager, fd, SELECT_POKE_CLOSE);
1093 
1094 	if (sock->active == 1) {
1095 		sock->active = 0;
1096 	}
1097 
1098 	/*
1099 	 * update manager->maxfd here (XXX: this should be implemented more
1100 	 * efficiently)
1101 	 */
1102 	if (manager->maxfd == fd) {
1103 		int i;
1104 
1105 		manager->maxfd = 0;
1106 		for (i = fd - 1; i >= 0; i--) {
1107 			if (manager->fdstate[i] == MANAGED) {
1108 				manager->maxfd = i;
1109 				break;
1110 			}
1111 		}
1112 	}
1113 
1114 }
1115 
1116 static void
1117 destroy(isc_socket_t **sockp) {
1118 	int fd;
1119 	isc_socket_t *sock = *sockp;
1120 	isc_socketmgr_t *manager = sock->manager;
1121 
1122 	socket_log(sock, NULL, CREATION, "destroying");
1123 
1124 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1125 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1126 	INSIST(sock->connect_ev == NULL);
1127 	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1128 
1129 	if (sock->fd >= 0) {
1130 		fd = sock->fd;
1131 		sock->fd = -1;
1132 		socketclose(manager, sock, fd);
1133 	}
1134 
1135 	ISC_LIST_UNLINK(manager->socklist, sock, link);
1136 
1137 	/* can't unlock manager as its memory context is still used */
1138 	free_socket(sockp);
1139 }
1140 
1141 static isc_result_t
1142 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1143 		isc_socket_t **socketp)
1144 {
1145 	isc_socket_t *sock;
1146 
1147 	sock = malloc(sizeof(*sock));
1148 
1149 	if (sock == NULL)
1150 		return (ISC_R_NOMEMORY);
1151 
1152 	sock->references = 0;
1153 
1154 	sock->manager = manager;
1155 	sock->type = type;
1156 	sock->fd = -1;
1157 	sock->dscp = 0;		/* TOS/TCLASS is zero until set. */
1158 	sock->active = 0;
1159 
1160 	ISC_LINK_INIT(sock, link);
1161 
1162 	/*
1163 	 * Set up list of readers and writers to be initially empty.
1164 	 */
1165 	ISC_LIST_INIT(sock->recv_list);
1166 	ISC_LIST_INIT(sock->send_list);
1167 	sock->connect_ev = NULL;
1168 	sock->pending_recv = 0;
1169 	sock->pending_send = 0;
1170 	sock->connected = 0;
1171 	sock->connecting = 0;
1172 	sock->bound = 0;
1173 	sock->pktdscp = 0;
1174 
1175 	/*
1176 	 * Initialize readable and writable events.
1177 	 */
1178 	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1179 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1180 		       NULL, sock, sock, NULL);
1181 	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1182 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1183 		       NULL, sock, sock, NULL);
1184 
1185 	*socketp = sock;
1186 
1187 	return (ISC_R_SUCCESS);
1188 }
1189 
1190 /*
1191  * This routine requires that the various lists be empty and that the
1192  * reference count be zero.  The associated fd must already have been
1193  * closed and set to -1; this routine only frees the socket structure
1194  * and never closes the descriptor itself.
1195  */
1197 static void
1198 free_socket(isc_socket_t **socketp) {
1199 	isc_socket_t *sock = *socketp;
1200 
1201 	INSIST(sock->references == 0);
1202 	INSIST(!sock->connecting);
1203 	INSIST(!sock->pending_recv);
1204 	INSIST(!sock->pending_send);
1205 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1206 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1207 	INSIST(!ISC_LINK_LINKED(sock, link));
1208 
1209 	free(sock);
1210 
1211 	*socketp = NULL;
1212 }
1213 
1214 static void
1215 use_min_mtu(isc_socket_t *sock) {
1216 	/* Ask the kernel to use the IPv6 minimum MTU (RFC 3542, IPV6_USE_MIN_MTU). */
1217 	if (sock->pf == AF_INET6) {
1218 		int on = 1;
1219 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
1220 				(void *)&on, sizeof(on));
1221 	}
1222 }
1223 
1224 static void
1225 set_tcp_maxseg(isc_socket_t *sock, int size) {
1226 	if (sock->type == isc_sockettype_tcp)
1227 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
1228 				(void *)&size, sizeof(size));
1229 }
1230 
1231 static isc_result_t
1232 opensocket(isc_socket_t *sock)
1233 {
1234 	isc_result_t result;
1235 	const char *err = "socket";
1236 	int on = 1;
1237 
1238 	switch (sock->type) {
1239 	case isc_sockettype_udp:
1240 		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
1241 		break;
1242 	case isc_sockettype_tcp:
1243 		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
1244 		break;
1245 	}
1246 
1247 	if (sock->fd < 0) {
1248 		switch (errno) {
1249 		case EMFILE:
1250 		case ENFILE:
1251 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1252 				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1253 				       "%s: %s", err, strerror(errno));
1254 			/* fallthrough */
1255 		case ENOBUFS:
1256 			return (ISC_R_NORESOURCES);
1257 
1258 		case EPROTONOSUPPORT:
1259 		case EPFNOSUPPORT:
1260 		case EAFNOSUPPORT:
1261 		/*
1262 		 * Linux 2.2 (and maybe others) return EINVAL instead of
1263 		 * EAFNOSUPPORT.
1264 		 */
1265 		case EINVAL:
1266 			return (ISC_R_FAMILYNOSUPPORT);
1267 
1268 		default:
1269 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1270 					 "%s() %s: %s", err, "failed",
1271 					 strerror(errno));
1272 			return (ISC_R_UNEXPECTED);
1273 		}
1274 	}
1275 
1276 	result = make_nonblock(sock->fd);
1277 	if (result != ISC_R_SUCCESS) {
1278 		(void)close(sock->fd);
1279 		return (result);
1280 	}
1281 
1282 	/*
1283 	 * Use minimum mtu if possible.
1284 	 */
1285 	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
1286 		use_min_mtu(sock);
1287 		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
1288 	}
1289 
1290 	if (sock->type == isc_sockettype_udp) {
1291 
1292 		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1293 			       (void *)&on, sizeof(on)) < 0
1294 		    && errno != ENOPROTOOPT) {
1295 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1296 					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1297 					 sock->fd, "failed", strerror(errno));
1298 			/* Press on... */
1299 		}
1300 
1301 		/* RFC 3542 */
1302 		if ((sock->pf == AF_INET6)
1303 		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1304 				   (void *)&on, sizeof(on)) < 0)) {
1305 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1306 					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1307 					 "%s: %s", sock->fd, "failed",
1308 					 strerror(errno));
1309 		}
1310 	}
1311 
1312 	if (sock->active == 0) {
1313 		sock->active = 1;
1314 	}
1315 
1316 	return (ISC_R_SUCCESS);
1317 }
1318 
1319 /*
1320  * Create a 'type' socket in protocol family 'pf', managed by 'manager'.
1321  * The new socket is returned in 'socketp'.
1322  */
1325 static isc_result_t
1326 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1327 	      isc_socket_t **socketp)
1328 {
1329 	isc_socket_t *sock = NULL;
1330 	isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0;
1331 	isc_result_t result;
1332 
1333 	REQUIRE(socketp != NULL && *socketp == NULL);
1334 
1335 	result = allocate_socket(manager, type, &sock);
1336 	if (result != ISC_R_SUCCESS)
1337 		return (result);
1338 
1339 	switch (sock->type) {
1340 	case isc_sockettype_udp:
1341 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
1342 		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
1343 		break;
1344 	case isc_sockettype_tcp:
1345 		break;
1346 	default:
1347 		INSIST(0);
1348 	}
1349 
1350 	sock->pf = pf;
1351 
1352 	result = opensocket(sock);
1353 	if (result != ISC_R_SUCCESS) {
1354 		free_socket(&sock);
1355 		return (result);
1356 	}
1357 
1358 	sock->references = 1;
1359 	*socketp = (isc_socket_t *)sock;
1360 
1361 	/*
1362 	 * Note we don't have to lock the socket like we normally would because
1363 	 * there are no external references to it yet.
1364 	 */
1365 
1366 	manager->fds[sock->fd] = sock;
1367 	manager->fdstate[sock->fd] = MANAGED;
1368 
1369 	ISC_LIST_APPEND(manager->socklist, sock, link);
1370 	if (manager->maxfd < sock->fd)
1371 		manager->maxfd = sock->fd;
1372 
1373 	socket_log(sock, NULL, CREATION, "created");
1374 
1375 	return (ISC_R_SUCCESS);
1376 }
1377 
1378 /*%
1379  * Create a new 'type' socket in protocol family 'pf', managed by
1380  * 'manager'.  The new socket is returned in 'socketp'.
1381  */
1384 isc_result_t
1385 isc_socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1386 		   isc_socket_t **socketp)
1387 {
1388 	return (socket_create(manager0, pf, type, socketp));
1389 }
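/*
 * Minimal usage sketch (illustrative only; error handling omitted):
 *
 *	isc_socketmgr_t *mgr = NULL;
 *	isc_socket_t *sock = NULL;
 *
 *	isc_socketmgr_create(&mgr);
 *	isc_socket_create(mgr, AF_INET, isc_sockettype_udp, &sock);
 *	...
 *	isc_socket_detach(&sock);
 *	isc_socketmgr_destroy(&mgr);
 */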
1390 
1391 /*
1392  * Attach to a socket.  Caller must explicitly detach when it is done.
1393  */
1394 void
1395 isc_socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
1396 	isc_socket_t *sock = (isc_socket_t *)sock0;
1397 
1398 	REQUIRE(socketp != NULL && *socketp == NULL);
1399 
1400 	sock->references++;
1401 
1402 	*socketp = (isc_socket_t *)sock;
1403 }
1404 
1405 /*
1406  * Dereference a socket.  If this is the last reference to it, clean things
1407  * up by destroying the socket.
1408  */
1409 void
1410 isc_socket_detach(isc_socket_t **socketp) {
1411 	isc_socket_t *sock;
1412 	isc_boolean_t kill_socket = ISC_FALSE;
1413 
1414 	REQUIRE(socketp != NULL);
1415 	sock = (isc_socket_t *)*socketp;
1416 
1417 	REQUIRE(sock->references > 0);
1418 	sock->references--;
1419 	if (sock->references == 0)
1420 		kill_socket = ISC_TRUE;
1421 
1422 	if (kill_socket)
1423 		destroy(&sock);
1424 
1425 	*socketp = NULL;
1426 }
1427 
1428 /*
1429  * I/O is possible on a given socket.  Schedule an event to this task that
1430  * will call an internal function to do the I/O.  This will charge the
1431  * task with the I/O operation and let our select loop handler get back
1432  * to doing something real as fast as possible.
1433  *
1434  * The socket and manager must be locked before calling this function.
1435  */
1436 static void
1437 dispatch_recv(isc_socket_t *sock) {
1438 	intev_t *iev;
1439 	isc_socketevent_t *ev;
1440 	isc_task_t *sender;
1441 
1442 	INSIST(!sock->pending_recv);
1443 
1444 	ev = ISC_LIST_HEAD(sock->recv_list);
1445 	if (ev == NULL)
1446 		return;
1447 	socket_log(sock, NULL, EVENT,
1448 		   "dispatch_recv: event %p -> task %p",
1449 		   ev, ev->ev_sender);
1450 	sender = ev->ev_sender;
1451 
1452 	sock->pending_recv = 1;
1453 	iev = &sock->readable_ev;
1454 
1455 	sock->references++;
1456 	iev->ev_sender = sock;
1457 	iev->ev_action = internal_recv;
1458 	iev->ev_arg = sock;
1459 
1460 	isc_task_send(sender, (isc_event_t **)&iev);
1461 }
1462 
1463 static void
1464 dispatch_send(isc_socket_t *sock) {
1465 	intev_t *iev;
1466 	isc_socketevent_t *ev;
1467 	isc_task_t *sender;
1468 
1469 	INSIST(!sock->pending_send);
1470 
1471 	ev = ISC_LIST_HEAD(sock->send_list);
1472 	if (ev == NULL)
1473 		return;
1474 	socket_log(sock, NULL, EVENT,
1475 		   "dispatch_send: event %p -> task %p",
1476 		   ev, ev->ev_sender);
1477 	sender = ev->ev_sender;
1478 
1479 	sock->pending_send = 1;
1480 	iev = &sock->writable_ev;
1481 
1482 	sock->references++;
1483 	iev->ev_sender = sock;
1484 	iev->ev_action = internal_send;
1485 	iev->ev_arg = sock;
1486 
1487 	isc_task_send(sender, (isc_event_t **)&iev);
1488 }
1489 
1490 static void
1491 dispatch_connect(isc_socket_t *sock) {
1492 	intev_t *iev;
1493 	isc_socket_connev_t *ev;
1494 
1495 	iev = &sock->writable_ev;
1496 
1497 	ev = sock->connect_ev;
1498 	INSIST(ev != NULL); /* XXX */
1499 
1500 	INSIST(sock->connecting);
1501 
1502 	sock->references++;  /* keep socket around for this internal event */
1503 	iev->ev_sender = sock;
1504 	iev->ev_action = internal_connect;
1505 	iev->ev_arg = sock;
1506 
1507 	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1508 }
1509 
1510 /*
1511  * Dequeue an item off the given socket's read queue, set the result code
1512  * in the done event to the one provided, and send it to the task it was
1513  * destined for.
1514  *
1515  * If the event to be sent is on a list, remove it before sending.  If
1516  * asked to, send and detach from the socket as well.
1517  *
1518  * Caller must have the socket locked if the event is attached to the socket.
1519  */
1520 static void
1521 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1522 	isc_task_t *task;
1523 
1524 	task = (*dev)->ev_sender;
1525 
1526 	(*dev)->ev_sender = sock;
1527 
1528 	if (ISC_LINK_LINKED(*dev, ev_link))
1529 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1530 
1531 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1532 	    == ISC_SOCKEVENTATTR_ATTACHED)
1533 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1534 	else
1535 		isc_task_send(task, (isc_event_t **)dev);
1536 }
1537 
1538 /*
1539  * See comments for send_recvdone_event() above.
1540  *
1541  * Caller must have the socket locked if the event is attached to the socket.
1542  */
1543 static void
1544 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1545 	isc_task_t *task;
1546 
1547 	INSIST(dev != NULL && *dev != NULL);
1548 
1549 	task = (*dev)->ev_sender;
1550 	(*dev)->ev_sender = sock;
1551 
1552 	if (ISC_LINK_LINKED(*dev, ev_link))
1553 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1554 
1555 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1556 	    == ISC_SOCKEVENTATTR_ATTACHED)
1557 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1558 	else
1559 		isc_task_send(task, (isc_event_t **)dev);
1560 }
1561 
1562 static void
1563 internal_recv(isc_task_t *me, isc_event_t *ev) {
1564 	isc_socketevent_t *dev;
1565 	isc_socket_t *sock;
1566 
1567 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
1568 
1569 	sock = ev->ev_sender;
1570 
1571 	socket_log(sock, NULL, IOEVENT,
1572 		   "internal_recv: task %p got event %p", me, ev);
1573 
1574 	INSIST(sock->pending_recv == 1);
1575 	sock->pending_recv = 0;
1576 
1577 	INSIST(sock->references > 0);
1578 	sock->references--;  /* the internal event is done with this socket */
1579 	if (sock->references == 0) {
1580 		destroy(&sock);
1581 		return;
1582 	}
1583 
1584 	/*
1585 	 * Try to do as much I/O as possible on this socket.  There are no
1586 	 * limits here, currently.
1587 	 */
1588 	dev = ISC_LIST_HEAD(sock->recv_list);
1589 	while (dev != NULL) {
1590 		switch (doio_recv(sock, dev)) {
1591 		case DOIO_SOFT:
1592 			goto poke;
1593 
1594 		case DOIO_EOF:
1595 			/*
1596 			 * read of 0 means the remote end was closed.
1597 			 * Run through the event queue and dispatch all
1598 			 * the events with an EOF result code.
1599 			 */
1600 			do {
1601 				dev->result = ISC_R_EOF;
1602 				send_recvdone_event(sock, &dev);
1603 				dev = ISC_LIST_HEAD(sock->recv_list);
1604 			} while (dev != NULL);
1605 			goto poke;
1606 
1607 		case DOIO_SUCCESS:
1608 		case DOIO_HARD:
1609 			send_recvdone_event(sock, &dev);
1610 			break;
1611 		}
1612 
1613 		dev = ISC_LIST_HEAD(sock->recv_list);
1614 	}
1615 
1616  poke:
1617 	if (!ISC_LIST_EMPTY(sock->recv_list))
1618 		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1619 }
1620 
1621 static void
1622 internal_send(isc_task_t *me, isc_event_t *ev) {
1623 	isc_socketevent_t *dev;
1624 	isc_socket_t *sock;
1625 
1626 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
1627 
1628 	/*
1629 	 * Find out what socket this is and lock it.
1630 	 */
1631 	sock = (isc_socket_t *)ev->ev_sender;
1632 	socket_log(sock, NULL, IOEVENT,
1633 		   "internal_send: task %p got event %p", me, ev);
1634 
1635 	INSIST(sock->pending_send == 1);
1636 	sock->pending_send = 0;
1637 
1638 	INSIST(sock->references > 0);
1639 	sock->references--;  /* the internal event is done with this socket */
1640 	if (sock->references == 0) {
1641 		destroy(&sock);
1642 		return;
1643 	}
1644 
1645 	/*
1646 	 * Try to do as much I/O as possible on this socket.  There are no
1647 	 * limits here, currently.
1648 	 */
1649 	dev = ISC_LIST_HEAD(sock->send_list);
1650 	while (dev != NULL) {
1651 		switch (doio_send(sock, dev)) {
1652 		case DOIO_SOFT:
1653 			goto poke;
1654 
1655 		case DOIO_HARD:
1656 		case DOIO_SUCCESS:
1657 			send_senddone_event(sock, &dev);
1658 			break;
1659 		}
1660 
1661 		dev = ISC_LIST_HEAD(sock->send_list);
1662 	}
1663 
1664  poke:
1665 	if (!ISC_LIST_EMPTY(sock->send_list))
1666 		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
1667 }
1668 
1669 /*
1670  * Process read/writes on each fd here.  Avoid locking
1671  * and unlocking twice if both reads and writes are possible.
1672  */
1673 static void
1674 process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable,
1675 	   isc_boolean_t writeable)
1676 {
1677 	isc_socket_t *sock;
1678 	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
1679 
1680 	/*
1681 	 * If the socket is going to be closed, don't do more I/O.
1682 	 */
1683 	if (manager->fdstate[fd] == CLOSE_PENDING) {
1684 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1685 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1686 		return;
1687 	}
1688 
1689 	sock = manager->fds[fd];
1690 	if (readable) {
1691 		if (sock == NULL) {
1692 			unwatch_read = ISC_TRUE;
1693 			goto check_write;
1694 		}
1695 		if (!SOCK_DEAD(sock)) {
1696 			dispatch_recv(sock);
1697 		}
1698 		unwatch_read = ISC_TRUE;
1699 	}
1700 check_write:
1701 	if (writeable) {
1702 		if (sock == NULL) {
1703 			unwatch_write = ISC_TRUE;
1704 			goto unlock_fd;
1705 		}
1706 		if (!SOCK_DEAD(sock)) {
1707 			if (sock->connecting)
1708 				dispatch_connect(sock);
1709 			else
1710 				dispatch_send(sock);
1711 		}
1712 		unwatch_write = ISC_TRUE;
1713 	}
1714 
1715  unlock_fd:
1716 	if (unwatch_read)
1717 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1718 	if (unwatch_write)
1719 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1720 
1721 }
1722 
1723 static void
1724 process_fds(isc_socketmgr_t *manager, int maxfd, fd_set *readfds,
1725 	    fd_set *writefds)
1726 {
1727 	int i;
1728 
1729 	REQUIRE(maxfd <= (int)manager->maxsocks);
1730 
1731 	for (i = 0; i < maxfd; i++) {
1732 		process_fd(manager, i, FD_ISSET(i, readfds),
1733 			   FD_ISSET(i, writefds));
1734 	}
1735 }
1736 
1737 /*
1738  * Create a new socket manager.
1739  */
1740 
1741 static isc_result_t
1742 setup_watcher(isc_socketmgr_t *manager) {
1743 	isc_result_t result;
1744 
1745 	UNUSED(result);
1746 
1747 	manager->fd_bufsize = sizeof(fd_set);
1748 
1749 	manager->read_fds = NULL;
1750 	manager->read_fds_copy = NULL;
1751 	manager->write_fds = NULL;
1752 	manager->write_fds_copy = NULL;
1753 
1754 	manager->read_fds = malloc(manager->fd_bufsize);
1755 	if (manager->read_fds != NULL)
1756 		manager->read_fds_copy = malloc(manager->fd_bufsize);
1757 	if (manager->read_fds_copy != NULL)
1758 		manager->write_fds = malloc(manager->fd_bufsize);
1759 	if (manager->write_fds != NULL) {
1760 		manager->write_fds_copy = malloc(manager->fd_bufsize);
1761 	}
1762 	if (manager->write_fds_copy == NULL) {
1763 		if (manager->write_fds != NULL) {
1764 			free(manager->write_fds);
1765 		}
1766 		if (manager->read_fds_copy != NULL) {
1767 			free(manager->read_fds_copy);
1768 		}
1769 		if (manager->read_fds != NULL) {
1770 			free(manager->read_fds);
1771 		}
1772 		return (ISC_R_NOMEMORY);
1773 	}
1774 	memset(manager->read_fds, 0, manager->fd_bufsize);
1775 	memset(manager->write_fds, 0, manager->fd_bufsize);
1776 
1777 	manager->maxfd = 0;
1778 
1779 	return (ISC_R_SUCCESS);
1780 }
1781 
1782 static void
1783 cleanup_watcher(isc_socketmgr_t *manager) {
1784 
1785 	if (manager->read_fds != NULL)
1786 		free(manager->read_fds);
1787 	if (manager->read_fds_copy != NULL)
1788 		free(manager->read_fds_copy);
1789 	if (manager->write_fds != NULL)
1790 		free(manager->write_fds);
1791 	if (manager->write_fds_copy != NULL)
1792 		free(manager->write_fds_copy);
1793 }
1794 
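/*
 * Only one socket manager exists per process in this implementation: the
 * first call creates it, and subsequent isc_socketmgr_create() calls
 * simply bump its reference count (maxsocks may not change).
 */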
1795 static isc_result_t
1796 isc_socketmgr_create2(isc_socketmgr_t **managerp,
1797 		       unsigned int maxsocks)
1798 {
1799 	isc_socketmgr_t *manager;
1800 	isc_result_t result;
1801 
1802 	REQUIRE(managerp != NULL && *managerp == NULL);
1803 
1804 	if (socketmgr != NULL) {
1805 		/* Don't allow maxsocks to be updated */
1806 		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
1807 			return (ISC_R_EXISTS);
1808 
1809 		socketmgr->refs++;
1810 		*managerp = (isc_socketmgr_t *)socketmgr;
1811 		return (ISC_R_SUCCESS);
1812 	}
1813 
1814 	if (maxsocks == 0)
1815 		maxsocks = FD_SETSIZE;
1816 
1817 	manager = malloc(sizeof(*manager));
1818 	if (manager == NULL)
1819 		return (ISC_R_NOMEMORY);
1820 
1821 	/* zero-clear so that necessary cleanup on failure will be easy */
1822 	memset(manager, 0, sizeof(*manager));
1823 	manager->maxsocks = maxsocks;
1824 	manager->fds = reallocarray(NULL, manager->maxsocks, sizeof(isc_socket_t *));
1825 	if (manager->fds == NULL) {
1826 		result = ISC_R_NOMEMORY;
1827 		goto free_manager;
1828 	}
1829 	manager->fdstate = reallocarray(NULL, manager->maxsocks, sizeof(int));
1830 	if (manager->fdstate == NULL) {
1831 		result = ISC_R_NOMEMORY;
1832 		goto free_manager;
1833 	}
1834 
1835 	memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
1836 	ISC_LIST_INIT(manager->socklist);
1837 
1838 	manager->refs = 1;
1839 
1840 	/*
1841 	 * Set up initial state for the select loop
1842 	 */
1843 	result = setup_watcher(manager);
1844 	if (result != ISC_R_SUCCESS)
1845 		goto cleanup;
1846 
1847 	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
1848 
1849 	socketmgr = manager;
1850 	*managerp = (isc_socketmgr_t *)manager;
1851 
1852 	return (ISC_R_SUCCESS);
1853 
1854 cleanup:
1855 
1856 free_manager:
1857 	if (manager->fdstate != NULL) {
1858 		free(manager->fdstate);
1859 	}
1860 	if (manager->fds != NULL) {
1861 		free(manager->fds);
1862 	}
1863 	free(manager);
1864 
1865 	return (result);
1866 }
1867 
1868 isc_result_t
1869 isc_socketmgr_create(isc_socketmgr_t **managerp) {
1870 	return (isc_socketmgr_create2(managerp, 0));
1871 }
1872 
1873 void
1874 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
1875 	isc_socketmgr_t *manager;
1876 	int i;
1877 
1878 	/*
1879 	 * Destroy a socket manager.
1880 	 */
1881 
1882 	REQUIRE(managerp != NULL);
1883 	manager = (isc_socketmgr_t *)*managerp;
1884 
1885 	manager->refs--;
1886 	if (manager->refs > 0) {
1887 		*managerp = NULL;
1888 		return;
1889 	}
1890 	socketmgr = NULL;
1891 
1892 	/*
1893 	 * Wait for all sockets to be destroyed.
1894 	 */
1895 	while (!ISC_LIST_EMPTY(manager->socklist)) {
1896 		isc_taskmgr_dispatch(NULL);
1897 	}
1898 
1899 	/*
1900 	 * Poke the watcher to shut down.  This port is single-threaded and
1901 	 * has no watcher pipe, so this is currently a no-op.
1902 	 */
1904 	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
1905 
1906 	/*
1907 	 * Clean up.
1908 	 */
1909 	cleanup_watcher(manager);
1910 
1911 	for (i = 0; i < (int)manager->maxsocks; i++)
1912 		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
1913 			(void)close(i);
1914 
1915 	free(manager->fds);
1916 	free(manager->fdstate);
1917 
1918 	free(manager);
1919 
1920 	*managerp = NULL;
1921 
1922 	socketmgr = NULL;
1923 }
1924 
1925 static isc_result_t
1926 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
1927 	    unsigned int flags)
1928 {
1929 	int io_state;
1930 	isc_task_t *ntask = NULL;
1931 	isc_result_t result = ISC_R_SUCCESS;
1932 
1933 	dev->ev_sender = task;
1934 
1935 	if (sock->type == isc_sockettype_udp) {
1936 		io_state = doio_recv(sock, dev);
1937 	} else {
1938 		if (ISC_LIST_EMPTY(sock->recv_list))
1939 			io_state = doio_recv(sock, dev);
1940 		else
1941 			io_state = DOIO_SOFT;
1942 	}
1943 
1944 	switch (io_state) {
1945 	case DOIO_SOFT:
1946 		/*
1947 		 * We couldn't read all or part of the request right now, so
1948 		 * queue it.
1949 		 *
1950 		 * Attach to socket and to task
1951 		 */
1952 		isc_task_attach(task, &ntask);
1953 		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
1954 
1955 		/*
1956 		 * Enqueue the request.  If the socket was previously not being
1957 		 * watched, poke the watcher to start paying attention to it.
1958 		 */
1959 		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
1960 			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1961 		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
1962 
1963 		socket_log(sock, NULL, EVENT, NULL, 0, 0,
1964 			   "socket_recv: event %p -> task %p",
1965 			   dev, ntask);
1966 
1967 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
1968 			result = ISC_R_INPROGRESS;
1969 		break;
1970 
1971 	case DOIO_EOF:
1972 		dev->result = ISC_R_EOF;
1973 		/* fallthrough */
1974 
1975 	case DOIO_HARD:
1976 	case DOIO_SUCCESS:
1977 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
1978 			send_recvdone_event(sock, &dev);
1979 		break;
1980 	}
1981 
1982 	return (result);
1983 }
1984 
1985 isc_result_t
1986 isc_socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
1987 		  unsigned int minimum, isc_task_t *task,
1988 		  isc_taskaction_t action, void *arg)
1989 {
1990 	isc_socket_t *sock = (isc_socket_t *)sock0;
1991 	isc_socketevent_t *dev;
1992 	unsigned int iocount;
1993 	isc_buffer_t *buffer;
1994 
1995 	REQUIRE(buflist != NULL);
1996 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
1997 	REQUIRE(task != NULL);
1998 	REQUIRE(action != NULL);
1999 
2000 	iocount = isc_bufferlist_availablecount(buflist);
2001 	REQUIRE(iocount > 0);
2002 
2003 	INSIST(sock->bound);
2004 
2005 	dev = allocate_socketevent(sock,
2006 				   ISC_SOCKEVENT_RECVDONE, action, arg);
2007 	if (dev == NULL)
2008 		return (ISC_R_NOMEMORY);
2009 
2010 	/*
2011 	 * UDP sockets always use partial reads (minimum is one byte)
2012 	 */
2013 	if (sock->type == isc_sockettype_udp)
2014 		dev->minimum = 1;
2015 	else {
2016 		if (minimum == 0)
2017 			dev->minimum = iocount;
2018 		else
2019 			dev->minimum = minimum;
2020 	}
2021 
2022 	/*
2023 	 * Move each buffer from the passed-in list to our internal one.
2024 	 */
2025 	buffer = ISC_LIST_HEAD(*buflist);
2026 	while (buffer != NULL) {
2027 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2028 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2029 		buffer = ISC_LIST_HEAD(*buflist);
2030 	}
2031 
2032 	return (socket_recv(sock, dev, task, 0));
2033 }
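
/*
 * Usage sketch (illustrative only): the caller queues one or more
 * initialized buffers on an isc_bufferlist_t and hands the list to
 * isc_socket_recvv(); the buffers are moved onto the event, and the
 * action ("recv_done" here is a hypothetical isc_taskaction_t) later
 * receives the ISC_SOCKEVENT_RECVDONE completion.
 *
 *	static unsigned char space[512];
 *	isc_buffer_t b;
 *	isc_bufferlist_t bl;
 *
 *	isc_buffer_init(&b, space, sizeof(space));
 *	ISC_LIST_INIT(bl);
 *	ISC_LIST_ENQUEUE(bl, &b, link);
 *	result = isc_socket_recvv(sock, &bl, 0, task, recv_done, NULL);
 */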
2034 
2035 static isc_result_t
2036 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2037 	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2038 	    unsigned int flags)
2039 {
2040 	int io_state;
2041 	isc_task_t *ntask = NULL;
2042 	isc_result_t result = ISC_R_SUCCESS;
2043 
2044 	dev->ev_sender = task;
2045 
2046 	set_dev_address(address, sock, dev);
2047 	if (pktinfo != NULL) {
2048 		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2049 		dev->pktinfo = *pktinfo;
2050 
2051 		if (!isc_sockaddr_issitelocal(&dev->address) &&
2052 		    !isc_sockaddr_islinklocal(&dev->address)) {
2053 			socket_log(sock, NULL, TRACE,
2054 				   "pktinfo structure provided, ifindex %u "
2055 				   "(set to 0)", pktinfo->ipi6_ifindex);
2056 
2057 			/*
2058 			 * Set the pktinfo index to 0 here, to let the
2059 			 * kernel decide what interface it should send on.
2060 			 */
2061 			dev->pktinfo.ipi6_ifindex = 0;
2062 		}
2063 	}
2064 
2065 	if (sock->type == isc_sockettype_udp)
2066 		io_state = doio_send(sock, dev);
2067 	else {
2068 		if (ISC_LIST_EMPTY(sock->send_list))
2069 			io_state = doio_send(sock, dev);
2070 		else
2071 			io_state = DOIO_SOFT;
2072 	}
2073 
2074 	switch (io_state) {
2075 	case DOIO_SOFT:
2076 		/*
2077 		 * We couldn't send all or part of the request right now, so
2078 		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2079 		 */
2080 		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2081 			isc_task_attach(task, &ntask);
2082 			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2083 
2084 			/*
2085 			 * Enqueue the request.  If the socket was previously
2086 			 * not being watched, poke the watcher to start
2087 			 * paying attention to it.
2088 			 */
2089 			if (ISC_LIST_EMPTY(sock->send_list) &&
2090 			    !sock->pending_send)
2091 				select_poke(sock->manager, sock->fd,
2092 					    SELECT_POKE_WRITE);
2093 			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2094 
2095 			socket_log(sock, NULL, EVENT, NULL, 0, 0,
2096 				   "socket_send: event %p -> task %p",
2097 				   dev, ntask);
2098 
2099 			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2100 				result = ISC_R_INPROGRESS;
2101 			break;
2102 		}
2103 
2104 		/* FALLTHROUGH */
2105 
2106 	case DOIO_HARD:
2107 	case DOIO_SUCCESS:
2108 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2109 			send_senddone_event(sock, &dev);
2110 		break;
2111 	}
2112 
2113 	return (result);
2114 }
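
/*
 * Note on the flags handled in socket_recv() and socket_send() above:
 * ISC_SOCKFLAG_IMMEDIATE makes a request that had to be queued return
 * ISC_R_INPROGRESS and suppresses the completion event when the I/O
 * finished right away, while ISC_SOCKFLAG_NORETRY keeps a soft send
 * failure from being queued for a later retry.
 */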
2115 
2116 isc_result_t
2117 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2118 		  isc_task_t *task, isc_taskaction_t action, void *arg)
2119 {
2120 	return (isc_socket_sendtov2(sock, buflist, task, action, arg, NULL,
2121 				     NULL, 0));
2122 }
2123 
2124 isc_result_t
2125 isc_socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2126 		     isc_task_t *task, isc_taskaction_t action, void *arg,
2127 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2128 		     unsigned int flags)
2129 {
2130 	isc_socket_t *sock = (isc_socket_t *)sock0;
2131 	isc_socketevent_t *dev;
2132 	unsigned int iocount;
2133 	isc_buffer_t *buffer;
2134 
2135 	REQUIRE(buflist != NULL);
2136 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2137 	REQUIRE(task != NULL);
2138 	REQUIRE(action != NULL);
2139 
2140 	iocount = isc_bufferlist_usedcount(buflist);
2141 	REQUIRE(iocount > 0);
2142 
2143 	dev = allocate_socketevent(sock,
2144 				   ISC_SOCKEVENT_SENDDONE, action, arg);
2145 	if (dev == NULL)
2146 		return (ISC_R_NOMEMORY);
2147 
2148 	/*
2149 	 * Move each buffer from the passed-in list to our internal one.
2150 	 */
2151 	buffer = ISC_LIST_HEAD(*buflist);
2152 	while (buffer != NULL) {
2153 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2154 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2155 		buffer = ISC_LIST_HEAD(*buflist);
2156 	}
2157 
2158 	return (socket_send(sock, dev, task, address, pktinfo, flags));
2159 }
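
/*
 * Usage sketch (illustrative only): sending mirrors receiving.  The
 * buffers queued on the list must already contain data (their used
 * regions are what gets written); "send_done" is a hypothetical
 * isc_taskaction_t that receives the ISC_SOCKEVENT_SENDDONE event.
 *
 *	isc_bufferlist_t bl;
 *
 *	ISC_LIST_INIT(bl);
 *	ISC_LIST_ENQUEUE(bl, buf, link);	-- buf: isc_buffer_t * holding the message
 *	result = isc_socket_sendv(sock, &bl, task, send_done, NULL);
 */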
2160 
2161 isc_result_t
2162 isc_socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
2163 		 unsigned int options) {
2164 	isc_socket_t *sock = (isc_socket_t *)sock0;
2165 	int on = 1;
2166 
2167 	INSIST(!sock->bound);
2168 
2169 	if (sock->pf != sockaddr->type.sa.sa_family) {
2170 		return (ISC_R_FAMILYMISMATCH);
2171 	}
2172 
2173 	/*
2174 	 * Only set SO_REUSEADDR when we want a specific port.
2175 	 */
2176 	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
2177 	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
2178 	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
2179 		       sizeof(on)) < 0) {
2180 		UNEXPECTED_ERROR(__FILE__, __LINE__,
2181 				 "setsockopt(%d) %s", sock->fd, "failed");
2182 		/* Press on... */
2183 	}
2184 	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
2185 		switch (errno) {
2186 		case EACCES:
2187 			return (ISC_R_NOPERM);
2188 		case EADDRNOTAVAIL:
2189 			return (ISC_R_ADDRNOTAVAIL);
2190 		case EADDRINUSE:
2191 			return (ISC_R_ADDRINUSE);
2192 		case EINVAL:
2193 			return (ISC_R_BOUND);
2194 		default:
2195 			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
2196 					 strerror(errno));
2197 			return (ISC_R_UNEXPECTED);
2198 		}
2199 	}
2200 
2201 	socket_log(sock, sockaddr, TRACE, "bound");
2202 	sock->bound = 1;
2203 
2204 	return (ISC_R_SUCCESS);
2205 }
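
/*
 * Usage sketch (illustrative only, assuming the usual isc_sockaddr
 * helpers): binding to a fixed non-zero port, which is also what makes
 * the code above request SO_REUSEADDR.  "local" is a hypothetical
 * isc_sockaddr_t whose family matches the socket's.
 *
 *	isc_sockaddr_setport(&local, 53);
 *	result = isc_socket_bind(sock, &local, ISC_SOCKET_REUSEADDRESS);
 *	if (result == ISC_R_ADDRINUSE)
 *		... choose another port and try again ...
 */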
2206 
2207 isc_result_t
2208 isc_socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
2209 		   isc_task_t *task, isc_taskaction_t action, void *arg)
2210 {
2211 	isc_socket_t *sock = (isc_socket_t *)sock0;
2212 	isc_socket_connev_t *dev;
2213 	isc_task_t *ntask = NULL;
2214 	isc_socketmgr_t *manager;
2215 	int cc;
2216 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
2217 
2218 	REQUIRE(addr != NULL);
2219 	REQUIRE(task != NULL);
2220 	REQUIRE(action != NULL);
2221 
2222 	manager = sock->manager;
2224 
2225 	if (isc_sockaddr_ismulticast(addr))
2226 		return (ISC_R_MULTICAST);
2227 
2228 	REQUIRE(!sock->connecting);
2229 
2230 	dev = (isc_socket_connev_t *)isc_event_allocate(sock,
2231 							ISC_SOCKEVENT_CONNECT,
2232 							action,	arg,
2233 							sizeof(*dev));
2234 	if (dev == NULL) {
2235 		return (ISC_R_NOMEMORY);
2236 	}
2237 	ISC_LINK_INIT(dev, ev_link);
2238 
2239 	/*
2240 	 * Try to do the connect right away, as there can be only one
2241 	 * outstanding, and it might happen to complete.
2242 	 */
2243 	sock->peer_address = *addr;
2244 	cc = connect(sock->fd, &addr->type.sa, addr->length);
2245 	if (cc < 0) {
2246 		/*
2247 		 * HP-UX "fails" to connect a UDP socket and sets errno to
2248 	 * EINPROGRESS if it's non-blocking.  We'd rather treat this as
2249 	 * a success and let the caller detect the real error, if any,
2250 	 * when it actually sends a packet on the socket.
2251 		 */
2252 		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
2253 			cc = 0;
2254 			goto success;
2255 		}
2256 		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
2257 			goto queue;
2258 
2259 		switch (errno) {
2260 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
2261 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2262 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2263 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2264 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2265 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2266 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2267 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2268 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2269 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2270 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2271 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2272 #undef ERROR_MATCH
2273 		}
2274 
2275 		sock->connected = 0;
2276 
2277 		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
2278 		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
2279 				 addrbuf, errno, strerror(errno));
2280 
2281 		isc_event_free(ISC_EVENT_PTR(&dev));
2282 		return (ISC_R_UNEXPECTED);
2283 
2284 	err_exit:
2285 		sock->connected = 0;
2286 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2287 
2288 		return (ISC_R_SUCCESS);
2289 	}
2290 
2291 	/*
2292 	 * If connect completed, fire off the done event.
2293 	 */
2294  success:
2295 	if (cc == 0) {
2296 		sock->connected = 1;
2297 		sock->bound = 1;
2298 		dev->result = ISC_R_SUCCESS;
2299 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2300 
2301 		return (ISC_R_SUCCESS);
2302 	}
2303 
2304  queue:
2305 
2306 	/*
2307 	 * Attach to task.
2308 	 */
2309 	isc_task_attach(task, &ntask);
2310 
2311 	sock->connecting = 1;
2312 
2313 	dev->ev_sender = ntask;
2314 
2315 	/*
2316 	 * Poke watcher here.  We still have the socket locked, so there
2317 	 * is no race condition.  We will hold the lock for such a short
2318 	 * time that waking it up now or later won't matter all that much.
2319 	 */
2320 	if (sock->connect_ev == NULL)
2321 		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
2322 
2323 	sock->connect_ev = dev;
2324 
2325 	return (ISC_R_SUCCESS);
2326 }
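
/*
 * Usage sketch (illustrative only): the connect completion arrives as an
 * ISC_SOCKEVENT_CONNECT event.  "connect_done" is a hypothetical
 * isc_taskaction_t; it checks dev->result and frees the event.
 *
 *	static void
 *	connect_done(isc_task_t *task, isc_event_t *event) {
 *		isc_socket_connev_t *dev = (isc_socket_connev_t *)event;
 *
 *		if (dev->result == ISC_R_SUCCESS)
 *			... socket is now connected and bound ...
 *		isc_event_free(&event);
 *	}
 *
 *	result = isc_socket_connect(sock, &peer, task, connect_done, NULL);
 */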
2327 
2328 /*
2329  * Called when a socket with a pending connect() finishes.
2330  */
2331 static void
2332 internal_connect(isc_task_t *me, isc_event_t *ev) {
2333 	isc_socket_t *sock;
2334 	isc_socket_connev_t *dev;
2335 	isc_task_t *task;
2336 	int cc;
2337 	socklen_t optlen;
2338 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
2339 
2340 	UNUSED(me);
2341 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2342 
2343 	sock = ev->ev_sender;
2344 
2345 	/*
2346 	 * When the internal event was sent, the reference count was bumped
2347 	 * to keep the socket around for us.  Decrement the count here.
2348 	 */
2349 	INSIST(sock->references > 0);
2350 	sock->references--;
2351 	if (sock->references == 0) {
2352 		destroy(&sock);
2353 		return;
2354 	}
2355 
2356 	/*
2357 	 * Has this event been canceled?
2358 	 */
2359 	dev = sock->connect_ev;
2360 	if (dev == NULL) {
2361 		INSIST(!sock->connecting);
2362 		return;
2363 	}
2364 
2365 	INSIST(sock->connecting);
2366 	sock->connecting = 0;
2367 
2368 	/*
2369 	 * Get any possible error status here.
2370 	 */
2371 	optlen = sizeof(cc);
2372 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
2373 		       (void *)&cc, (void *)&optlen) < 0)
2374 		cc = errno;
2375 	else
2376 		errno = cc;
2377 
2378 	if (errno != 0) {
2379 		/*
2380 		 * If the error is EAGAIN, just re-select on this
2381 		 * fd and pretend nothing strange happened.
2382 		 */
2383 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
2384 			sock->connecting = 1;
2385 			select_poke(sock->manager, sock->fd,
2386 				    SELECT_POKE_CONNECT);
2387 			return;
2388 		}
2389 
2390 		/*
2391 		 * Translate other errors into ISC_R_* flavors.
2392 		 */
2393 		switch (errno) {
2394 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
2395 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2396 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2397 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2398 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2399 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2400 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2401 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2402 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2403 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2404 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2405 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
2406 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2407 #undef ERROR_MATCH
2408 		default:
2409 			dev->result = ISC_R_UNEXPECTED;
2410 			isc_sockaddr_format(&sock->peer_address, peerbuf,
2411 					    sizeof(peerbuf));
2412 			UNEXPECTED_ERROR(__FILE__, __LINE__,
2413 					 "internal_connect: connect(%s) %s",
2414 					 peerbuf, strerror(errno));
2415 		}
2416 	} else {
2417 		dev->result = ISC_R_SUCCESS;
2418 		sock->connected = 1;
2419 		sock->bound = 1;
2420 	}
2421 
2422 	sock->connect_ev = NULL;
2423 
2424 	task = dev->ev_sender;
2425 	dev->ev_sender = sock;
2426 	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2427 }
2428 
2429 /*
2430  * Run through the list of events on this socket, and cancel the ones
2431  * queued for task "task" of type "how".  "how" is a bitmask.
2432  */
2433 void
2434 isc_socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
2435 	isc_socket_t *sock = (isc_socket_t *)sock0;
2436 
2437 	/*
2438 	 * Quick exit if there is nothing to do.  Don't even bother locking
2439 	 * in this case.
2440 	 */
2441 	if (how == 0)
2442 		return;
2443 
2444 	/*
2445 	 * All of these do the same thing, more or less.
2446 	 * Each will:
2447 	 *	o If the internal event is marked as "posted" try to
2448 	 *	  remove it from the task's queue.  If this fails, mark it
2449 	 *	  as canceled instead, and let the task clean it up later.
2450 	 *	o For each I/O request for that task of that type, post
2451 	 *	  its done event with status of "ISC_R_CANCELED".
2452 	 *	o Reset any state needed.
2453 	 */
2454 	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
2455 	    && !ISC_LIST_EMPTY(sock->recv_list)) {
2456 		isc_socketevent_t      *dev;
2457 		isc_socketevent_t      *next;
2458 		isc_task_t	       *current_task;
2459 
2460 		dev = ISC_LIST_HEAD(sock->recv_list);
2461 
2462 		while (dev != NULL) {
2463 			current_task = dev->ev_sender;
2464 			next = ISC_LIST_NEXT(dev, ev_link);
2465 
2466 			if ((task == NULL) || (task == current_task)) {
2467 				dev->result = ISC_R_CANCELED;
2468 				send_recvdone_event(sock, &dev);
2469 			}
2470 			dev = next;
2471 		}
2472 	}
2473 
2474 	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
2475 	    && !ISC_LIST_EMPTY(sock->send_list)) {
2476 		isc_socketevent_t      *dev;
2477 		isc_socketevent_t      *next;
2478 		isc_task_t	       *current_task;
2479 
2480 		dev = ISC_LIST_HEAD(sock->send_list);
2481 
2482 		while (dev != NULL) {
2483 			current_task = dev->ev_sender;
2484 			next = ISC_LIST_NEXT(dev, ev_link);
2485 
2486 			if ((task == NULL) || (task == current_task)) {
2487 				dev->result = ISC_R_CANCELED;
2488 				send_senddone_event(sock, &dev);
2489 			}
2490 			dev = next;
2491 		}
2492 	}
2493 
2494 	/*
2495 	 * Connecting is not a list.
2496 	 */
2497 	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
2498 	    && sock->connect_ev != NULL) {
2499 		isc_socket_connev_t    *dev;
2500 		isc_task_t	       *current_task;
2501 
2502 		INSIST(sock->connecting);
2503 		sock->connecting = 0;
2504 
2505 		dev = sock->connect_ev;
2506 		current_task = dev->ev_sender;
2507 
2508 		if ((task == NULL) || (task == current_task)) {
2509 			sock->connect_ev = NULL;
2510 
2511 			dev->result = ISC_R_CANCELED;
2512 			dev->ev_sender = sock;
2513 			isc_task_sendanddetach(&current_task,
2514 					       ISC_EVENT_PTR(&dev));
2515 		}
2516 	}
2517 
2518 }
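
/*
 * Usage sketch (illustrative only): before tearing a socket down, a
 * caller can cancel everything still pending on it; each canceled
 * request is completed through its normal done event with a result of
 * ISC_R_CANCELED rather than being silently dropped.
 *
 *	isc_socket_cancel(sock, NULL,
 *			  ISC_SOCKCANCEL_RECV | ISC_SOCKCANCEL_SEND |
 *			  ISC_SOCKCANCEL_CONNECT);
 */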
2519 
2520 /*
2521  * In our assumed scenario, we can simply use a single static object.
2522  * XXX: this is not true if the application uses multiple threads with
2523  *      'multi-context' mode.  Fixing this is a future TODO item.
2524  */
2525 static isc_socketwait_t swait_private;
2526 
2527 int
2528 isc_socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
2529 			  isc_socketwait_t **swaitp)
2530 {
2531 	isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0;
2532 	int n;
2533 
2534 	REQUIRE(swaitp != NULL && *swaitp == NULL);
2535 
2536 	if (manager == NULL)
2537 		manager = socketmgr;
2538 	if (manager == NULL)
2539 		return (0);
2540 
2541 	memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize);
2542 	memmove(manager->write_fds_copy, manager->write_fds,
2543 		manager->fd_bufsize);
2544 
2545 	swait_private.readset = manager->read_fds_copy;
2546 	swait_private.writeset = manager->write_fds_copy;
2547 	swait_private.maxfd = manager->maxfd + 1;
2548 
2549 	n = select(swait_private.maxfd, swait_private.readset,
2550 		   swait_private.writeset, NULL, tvp);
2551 
2552 	*swaitp = &swait_private;
2553 	return (n);
2554 }
2555 
2556 isc_result_t
2557 isc_socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
2558 	isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0;
2559 
2560 	REQUIRE(swait == &swait_private);
2561 
2562 	if (manager == NULL)
2563 		manager = socketmgr;
2564 	if (manager == NULL)
2565 		return (ISC_R_NOTFOUND);
2566 
2567 	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
2568 	return (ISC_R_SUCCESS);
2569 }
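
/*
 * Usage sketch (illustrative only): the application's event loop pairs
 * the two calls above, waiting for readiness and then dispatching the
 * ready descriptors.  Passing NULL selects the shared manager; a return
 * of 0 means the timeout expired, and -1 with errno EINTR would simply
 * be retried.
 *
 *	isc_socketwait_t *swait = NULL;
 *	struct timeval tv = { 1, 0 };		-- a hypothetical one-second tick
 *	int n = isc_socketmgr_waitevents(NULL, &tv, &swait);
 *
 *	if (n > 0)
 *		(void)isc_socketmgr_dispatch(NULL, swait);
 */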
2570