xref: /openbsd-src/usr.bin/dig/lib/isc/unix/socket.c (revision 4354c957c4ec2632f18fdac86b9259a01d8daac6)
1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
9  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
11  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
13  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14  * PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 /*! \file */
18 
19 #include <sys/socket.h>
20 #include <sys/time.h>
21 #include <sys/uio.h>
22 
23 #include <netinet/tcp.h>
24 
25 #include <errno.h>
26 #include <fcntl.h>
27 #include <stddef.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <unistd.h>
31 
32 #include <isc/buffer.h>
33 #include <isc/bufferlist.h>
34 
35 #include <isc/list.h>
36 #include <isc/log.h>
37 #include <isc/net.h>
38 #include <isc/region.h>
39 #include <isc/socket.h>
40 #include <isc/task.h>
41 #include <isc/util.h>
42 
43 #include "errno2result.h"
44 
45 #include "socket_p.h"
46 #include "../task_p.h"
47 
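/*
 * Scratch state for one select() pass: the read/write fd sets and the
 * maxfd bound to hand to select(), plus (presumably) the count of ready
 * descriptors that select() reported (nfds).  It is filled in by the
 * waitevents/dispatch code elsewhere in this file.
 */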
48 struct isc_socketwait {
49 	fd_set *readset;
50 	fd_set *writeset;
51 	int nfds;
52 	int maxfd;
53 };
54 
55 /*
56  * Set by the -T dscp option on the command line. If set to a value
57  * other than -1, we check to make sure DSCP values match it, and
58  * assert if not.
59  */
60 int isc_dscp_check_value = -1;
61 
62 /*%
63  * Some systems define the socket length argument as an int, some as size_t,
64  * some as socklen_t.  This is here so it can be easily changed if needed.
65  */
66 
67 /*%
68  * Define what the possible "soft" errors can be.  These are non-fatal returns
69  * of various network related functions, like recv() and so on.
70  *
71  * For some reason, BSDI (and perhaps others) will sometimes return <0
72  * from recv() but will have errno==0.  This is broken, but we have to
73  * work around it here.
74  */
75 #define SOFT_ERROR(e)	((e) == EAGAIN || \
76 			 (e) == EWOULDBLOCK || \
77 			 (e) == EINTR || \
78 			 (e) == 0)
79 
80 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
81 
82 /*!<
83  * DLVL(90)  --  Function entry/exit and other tracing.
84  * DLVL(60)  --  Socket data send/receive
85  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
86  * DLVL(20)  --  Socket creation/destruction.
87  */
88 #define TRACE_LEVEL		90
89 #define IOEVENT_LEVEL		60
90 #define EVENT_LEVEL		50
91 #define CREATION_LEVEL		20
92 
93 #define TRACE		DLVL(TRACE_LEVEL)
94 #define IOEVENT		DLVL(IOEVENT_LEVEL)
95 #define EVENT		DLVL(EVENT_LEVEL)
96 #define CREATION	DLVL(CREATION_LEVEL)
97 
98 typedef isc_event_t intev_t;
99 
100 /*!
101  * IPv6 control information.  If the socket is an IPv6 socket we want
102  * to collect the destination address and interface so the client can
103  * set them on outgoing packets.
104  */
105 
106 /*%
107  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
108  * a setsockopt() like interface to request timestamps, and if the OS
109  * doesn't do it for us, call gettimeofday() on every UDP receive?
110  */
111 
112 /*%
113  * Instead of calculating the cmsgbuf lengths every time, we take a
114  * rule-of-thumb approach: the sizes are taken from x86_64 Linux and
115  * multiplied by 2, so everything should fit.  The resulting buffers
116  * are small enough not to be a concern.
117  */
118 #define CMSG_SP_IN6PKT 40
119 
120 #define CMSG_SP_TIMESTAMP 32
121 
122 #define CMSG_SP_TCTOS 24
123 
124 #define CMSG_SP_INT 24
125 
126 #define RECVCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS)+1)
127 #define SENDCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS)+1)
128 
129 /*%
130  * The number of times a send operation is repeated if the result is EINTR.
131  */
132 #define NRETRIES 10
133 
134 struct isc_socket {
135 	/* Not locked. */
136 	isc_socketmgr_t	*manager;
137 	isc_sockettype_t	type;
138 
139 	/* Locked by socket lock. */
140 	ISC_LINK(isc_socket_t)	link;
141 	unsigned int		references;
142 	int			fd;
143 	int			pf;
144 
145 	ISC_LIST(isc_socketevent_t)		send_list;
146 	ISC_LIST(isc_socketevent_t)		recv_list;
147 	isc_socket_connev_t		       *connect_ev;
148 
149 	/*
150 	 * Internal events.  Posted when a descriptor is readable or
151 	 * writable.  These are statically allocated and never freed.
152 	 * They will be set to non-purgable before use.
153 	 */
154 	intev_t			readable_ev;
155 	intev_t			writable_ev;
156 
157 	isc_sockaddr_t		peer_address;       /* remote address */
158 
159 	unsigned int		pending_recv : 1,
160 				pending_send : 1,
161 				connected : 1,
162 				connecting : 1,     /* connect pending */
163 				bound : 1,          /* bound to local addr */
164 				active : 1,         /* currently active */
165 				pktdscp : 1;	    /* per packet dscp */
166 	unsigned int		dscp;
167 };
168 
169 struct isc_socketmgr {
170 	/* Not locked. */
171 	int			fd_bufsize;
172 	unsigned int		maxsocks;
173 
174 	isc_socket_t	       **fds;
175 	int			*fdstate;
176 
177 	/* Locked by manager lock. */
178 	ISC_LIST(isc_socket_t)	socklist;
179 	fd_set			*read_fds;
180 	fd_set			*read_fds_copy;
181 	fd_set			*write_fds;
182 	fd_set			*write_fds_copy;
183 	int			maxfd;
184 	unsigned int		refs;
185 };
186 
187 static isc_socketmgr_t *socketmgr = NULL;
188 
189 #define CLOSED			0	/* this one must be zero */
190 #define MANAGED			1
191 #define CLOSE_PENDING		2
192 
193 /*
194  * send() and recv() iovec counts
195  */
196 #define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
197 #define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
198 
199 static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
200 				  isc_sockettype_t type,
201 				  isc_socket_t **socketp);
202 static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
203 static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
204 static void free_socket(isc_socket_t **);
205 static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
206 				    isc_socket_t **);
207 static void destroy(isc_socket_t **);
208 static void internal_connect(isc_task_t *, isc_event_t *);
209 static void internal_recv(isc_task_t *, isc_event_t *);
210 static void internal_send(isc_task_t *, isc_event_t *);
211 static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
212 static void build_msghdr_send(isc_socket_t *, char *, isc_socketevent_t *,
213 			      struct msghdr *, struct iovec *, size_t *);
214 static void build_msghdr_recv(isc_socket_t *, char *, isc_socketevent_t *,
215 			      struct msghdr *, struct iovec *, size_t *);
216 
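/*
 * Values passed as the 'msg' argument to select_poke()/wakeup_socket()
 * to say what should happen to a descriptor: start watching it for
 * reads or writes, or finish closing it.
 */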
217 #define SELECT_POKE_SHUTDOWN		(-1)
218 #define SELECT_POKE_READ		(-3)
219 #define SELECT_POKE_WRITE		(-4)
220 #define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
221 #define SELECT_POKE_CLOSE		(-5)
222 
223 #define SOCK_DEAD(s)			((s)->references == 0)
224 
225 /*%
226  * Shortcut indexes used to access statistics counters.
227  */
228 enum {
229 	STATID_OPEN = 0,
230 	STATID_OPENFAIL = 1,
231 	STATID_CLOSE = 2,
232 	STATID_BINDFAIL = 3,
233 	STATID_CONNECTFAIL = 4,
234 	STATID_CONNECT = 5,
235 	STATID_ACCEPTFAIL = 6,
236 	STATID_ACCEPT = 7,
237 	STATID_SENDFAIL = 8,
238 	STATID_RECVFAIL = 9,
239 	STATID_ACTIVE = 10
240 };
241 
242 
243 static void
244 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
245 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
246 	   const char *fmt, ...) __attribute__((__format__(__printf__, 6, 7)));
247 static void
248 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
249 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
250 	   const char *fmt, ...)
251 {
252 	char msgbuf[2048];
253 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
254 	va_list ap;
255 
256 	if (! isc_log_wouldlog(isc_lctx, level))
257 		return;
258 
259 	va_start(ap, fmt);
260 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
261 	va_end(ap);
262 
263 	if (address == NULL) {
264 		isc_log_write(isc_lctx, category, module, level,
265 			       "socket %p: %s", sock, msgbuf);
266 	} else {
267 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
268 		isc_log_write(isc_lctx, category, module, level,
269 			       "socket %p %s: %s", sock, peerbuf, msgbuf);
270 	}
271 }
272 
273 static inline isc_result_t
274 watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
275 	isc_result_t result = ISC_R_SUCCESS;
276 
277 	if (msg == SELECT_POKE_READ)
278 		FD_SET(fd, manager->read_fds);
279 	if (msg == SELECT_POKE_WRITE)
280 		FD_SET(fd, manager->write_fds);
281 
282 	return (result);
283 }
284 
285 static inline isc_result_t
286 unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
287 	isc_result_t result = ISC_R_SUCCESS;
288 
289 	if (msg == SELECT_POKE_READ)
290 		FD_CLR(fd, manager->read_fds);
291 	else if (msg == SELECT_POKE_WRITE)
292 		FD_CLR(fd, manager->write_fds);
293 
294 	return (result);
295 }
296 
297 static void
298 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
299 	isc_result_t result;
300 
301 	/*
302 	 * This is a wakeup on a socket.  If the socket is not in the
303 	 * process of being closed, start watching it for either reads
304 	 * or writes.
305 	 */
306 
307 	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
308 
309 	if (msg == SELECT_POKE_CLOSE) {
310 		/* No one should be updating fdstate, so no need to lock it */
311 		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
312 		manager->fdstate[fd] = CLOSED;
313 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
314 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
315 		(void)close(fd);
316 		return;
317 	}
318 
319 	if (manager->fdstate[fd] == CLOSE_PENDING) {
320 
321 		/*
322 		 * We accept (and ignore) any error from unwatch_fd() as we are
323 		 * closing the socket, hoping it doesn't leave dangling state in
324 		 * the kernel.
325 		 */
326 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
327 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
328 		return;
329 	}
330 	if (manager->fdstate[fd] != MANAGED) {
331 		return;
332 	}
333 
334 	/*
335 	 * Set requested bit.
336 	 */
337 	result = watch_fd(manager, fd, msg);
338 	if (result != ISC_R_SUCCESS) {
339 		/*
340 		 * XXXJT: what should we do?  Ignoring the failure of watching
341 		 * a socket will make the application dysfunctional, but there
342 		 * seems to be no reasonable recovery process.
343 		 */
344 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
345 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
346 			      "failed to start watching FD (%d): %s",
347 			      fd, isc_result_totext(result));
348 	}
349 }
350 
351 /*
352  * Update the state of the socketmgr when something changes.
353  */
354 static void
355 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
356 	if (msg == SELECT_POKE_SHUTDOWN)
357 		return;
358 	else if (fd >= 0)
359 		wakeup_socket(manager, fd, msg);
360 	return;
361 }
362 
363 /*
364  * Make an fd non-blocking.
365  */
366 static isc_result_t
367 make_nonblock(int fd) {
368 	int ret;
369 	int flags;
370 
371 	flags = fcntl(fd, F_GETFL, 0);
372 	flags |= O_NONBLOCK;
373 	ret = fcntl(fd, F_SETFL, flags);
374 
375 	if (ret == -1) {
376 		UNEXPECTED_ERROR(__FILE__, __LINE__,
377 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
378 				 strerror(errno));
379 		return (ISC_R_UNEXPECTED);
380 	}
381 
382 	return (ISC_R_SUCCESS);
383 }
384 
385 /*
386  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
387  * In order to ensure as much portability as possible, we provide wrapper
388  * functions of these macros.
389  * Note that cmsg_space() could run slow on OSes that do not have
390  * CMSG_SPACE.
391  */
392 static inline socklen_t
393 cmsg_len(socklen_t len) {
394 	return (CMSG_LEN(len));
395 }
396 
397 static inline socklen_t
398 cmsg_space(socklen_t len) {
399 	return (CMSG_SPACE(len));
400 }
401 
402 /*
403  * Process control messages received on a socket.
404  */
405 static void
406 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
407 	struct cmsghdr *cmsgp;
408 	struct in6_pktinfo *pktinfop;
409 	void *timevalp;
410 
411 	/*
412 	 * The UNUSED() markers below are vestigial: in the original ISC
413 	 * code, sock, msg and dev were referenced only under certain
414 	 * preprocessor conditions (ISC_NET_BSD44MSGHDR, USE_CMSG).  In
415 	 * this version all three are always used.
416 	 */
417 	UNUSED(sock);
418 	UNUSED(msg);
419 	UNUSED(dev);
420 
421 	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
422 		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
423 
424 	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
425 		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
426 
427 	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
428 		return;
429 
430 	timevalp = NULL;
431 	pktinfop = NULL;
432 
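	/*
	 * Walk the control-message chain, picking out IPV6_PKTINFO,
	 * SCM_TIMESTAMP and TOS/TCLASS (DSCP) information and recording
	 * it in the socket event.
	 */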
433 	cmsgp = CMSG_FIRSTHDR(msg);
434 	while (cmsgp != NULL) {
435 		socket_log(sock, NULL, TRACE,
436 			   "processing cmsg %p", cmsgp);
437 
438 		if (cmsgp->cmsg_level == IPPROTO_IPV6
439 		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
440 
441 			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
442 			memmove(&dev->pktinfo, pktinfop,
443 				sizeof(struct in6_pktinfo));
444 			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
445 			socket_log(sock, NULL, TRACE,
446 				   "interface received on ifindex %u",
447 				   dev->pktinfo.ipi6_ifindex);
448 			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
449 				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
450 			goto next;
451 		}
452 
453 		if (cmsgp->cmsg_level == SOL_SOCKET
454 		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
455 			struct timeval tv;
456 			timevalp = CMSG_DATA(cmsgp);
457 			memmove(&tv, timevalp, sizeof(tv));
458 			TIMEVAL_TO_TIMESPEC(&tv, &dev->timestamp);
459 			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
460 			goto next;
461 		}
462 
463 		if (cmsgp->cmsg_level == IPPROTO_IPV6
464 		    && cmsgp->cmsg_type == IPV6_TCLASS) {
465 			dev->dscp = *(int *)CMSG_DATA(cmsgp);
466 			dev->dscp >>= 2;
467 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
468 			goto next;
469 		}
470 
471 		if (cmsgp->cmsg_level == IPPROTO_IP
472 		    && (cmsgp->cmsg_type == IP_TOS)) {
473 			dev->dscp = (int) *(unsigned char *)CMSG_DATA(cmsgp);
474 			dev->dscp >>= 2;
475 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
476 			goto next;
477 		}
478 	next:
479 		cmsgp = CMSG_NXTHDR(msg, cmsgp);
480 	}
481 
482 }
483 
484 /*
485  * Construct an iov array and attach it to the msghdr passed in.  This is
486  * the SEND constructor, which will use the used region of the buffer
487  * (if using a buffer list) or will use the internal region (if a single
488  * buffer I/O is requested).
489  *
490  * Nothing can be NULL, and the done event must list at least one buffer
491  * on the buffer linked list for this function to be meaningful.
492  *
493  * If write_countp != NULL, *write_countp will hold the number of bytes
494  * this transaction can send.
495  */
496 static void
497 build_msghdr_send(isc_socket_t *sock, char* cmsgbuf, isc_socketevent_t *dev,
498 		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
499 {
500 	unsigned int iovcount;
501 	isc_buffer_t *buffer;
502 	isc_region_t used;
503 	size_t write_count;
504 	size_t skip_count;
505 	struct cmsghdr *cmsgp;
506 
507 	memset(msg, 0, sizeof(*msg));
508 
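	/*
	 * For an unconnected socket the destination address goes in
	 * msg_name; a connected socket must leave it unset.
	 */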
509 	if (!sock->connected) {
510 		msg->msg_name = (void *)&dev->address.type.sa;
511 		msg->msg_namelen = dev->address.length;
512 	} else {
513 		msg->msg_name = NULL;
514 		msg->msg_namelen = 0;
515 	}
516 
517 	buffer = ISC_LIST_HEAD(dev->bufferlist);
518 	write_count = 0;
519 	iovcount = 0;
520 
521 	/*
522 	 * Single buffer I/O?  Skip what we've done so far in this region.
523 	 */
524 	if (buffer == NULL) {
525 		write_count = dev->region.length - dev->n;
526 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
527 		iov[0].iov_len = write_count;
528 		iovcount = 1;
529 
530 		goto config;
531 	}
532 
533 	/*
534 	 * Multibuffer I/O.
535 	 * Skip the data in the buffer list that we have already written.
536 	 */
537 	skip_count = dev->n;
538 	while (buffer != NULL) {
539 		if (skip_count < isc_buffer_usedlength(buffer))
540 			break;
541 		skip_count -= isc_buffer_usedlength(buffer);
542 		buffer = ISC_LIST_NEXT(buffer, link);
543 	}
544 
545 	while (buffer != NULL) {
546 		INSIST(iovcount < MAXSCATTERGATHER_SEND);
547 
548 		isc_buffer_usedregion(buffer, &used);
549 
550 		if (used.length > 0) {
551 			iov[iovcount].iov_base = (void *)(used.base
552 							  + skip_count);
553 			iov[iovcount].iov_len = used.length - skip_count;
554 			write_count += (used.length - skip_count);
555 			skip_count = 0;
556 			iovcount++;
557 		}
558 		buffer = ISC_LIST_NEXT(buffer, link);
559 	}
560 
561 	INSIST(skip_count == 0U);
562 
563  config:
564 	msg->msg_iov = iov;
565 	msg->msg_iovlen = iovcount;
566 
567 	msg->msg_control = NULL;
568 	msg->msg_controllen = 0;
569 	msg->msg_flags = 0;
570 
571 	if ((sock->type == isc_sockettype_udp) &&
572 	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
573 	{
574 		struct in6_pktinfo *pktinfop;
575 
576 		socket_log(sock, NULL, TRACE,
577 			   "sendto pktinfo data, ifindex %u",
578 			   dev->pktinfo.ipi6_ifindex);
579 
580 		msg->msg_control = (void *)cmsgbuf;
581 		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
582 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
583 
584 		cmsgp = (struct cmsghdr *)cmsgbuf;
585 		cmsgp->cmsg_level = IPPROTO_IPV6;
586 		cmsgp->cmsg_type = IPV6_PKTINFO;
587 		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
588 		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
589 		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
590 	}
591 
592 	if ((sock->type == isc_sockettype_udp) &&
593 	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
594 	{
595 		int use_min_mtu = 1;	/* -1, 0, 1 */
596 
597 		cmsgp = (struct cmsghdr *)(cmsgbuf +
598 					   msg->msg_controllen);
599 
600 		msg->msg_control = (void *)cmsgbuf;
601 		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
602 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
603 
604 		cmsgp->cmsg_level = IPPROTO_IPV6;
605 		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
606 		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
607 		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
608 	}
609 
610 	if (isc_dscp_check_value > -1) {
611 		if (sock->type == isc_sockettype_udp)
612 			INSIST((int)dev->dscp == isc_dscp_check_value);
613 		else if (sock->type == isc_sockettype_tcp)
614 			INSIST((int)sock->dscp == isc_dscp_check_value);
615 	}
616 
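	/*
	 * Attach the DSCP value to this message.  On sockets that support
	 * per-packet DSCP (sock->pktdscp) it is carried as IP_TOS /
	 * IPV6_TCLASS ancillary data; otherwise it is set once on the
	 * socket with setsockopt() and cached in sock->dscp.
	 */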
617 	if ((sock->type == isc_sockettype_udp) &&
618 	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
619 	{
620 		int dscp = (dev->dscp << 2) & 0xff;
621 
622 		INSIST(dev->dscp < 0x40);
623 
624 		if (sock->pf == AF_INET && sock->pktdscp) {
625 			cmsgp = (struct cmsghdr *)(cmsgbuf +
626 						   msg->msg_controllen);
627 			msg->msg_control = (void *)cmsgbuf;
628 			msg->msg_controllen += cmsg_space(sizeof(dscp));
629 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
630 
631 			cmsgp->cmsg_level = IPPROTO_IP;
632 			cmsgp->cmsg_type = IP_TOS;
633 			cmsgp->cmsg_len = cmsg_len(sizeof(char));
634 			*(unsigned char*)CMSG_DATA(cmsgp) = dscp;
635 		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
636 			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
637 			       (void *)&dscp, sizeof(int)) < 0)
638 			{
639 				UNEXPECTED_ERROR(__FILE__, __LINE__,
640 						 "setsockopt(%d, IP_TOS, %.02x)"
641 						 " %s: %s",
642 						 sock->fd, dscp >> 2,
643 						 "failed", strerror(errno));
644 			} else
645 				sock->dscp = dscp;
646 		}
647 
648 		if (sock->pf == AF_INET6 && sock->pktdscp) {
649 			cmsgp = (struct cmsghdr *)(cmsgbuf +
650 						   msg->msg_controllen);
651 			msg->msg_control = (void *)cmsgbuf;
652 			msg->msg_controllen += cmsg_space(sizeof(dscp));
653 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
654 
655 			cmsgp->cmsg_level = IPPROTO_IPV6;
656 			cmsgp->cmsg_type = IPV6_TCLASS;
657 			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
658 			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
659 		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
660 			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
661 				       (void *)&dscp, sizeof(int)) < 0) {
662 				UNEXPECTED_ERROR(__FILE__, __LINE__,
663 						 "setsockopt(%d, IPV6_TCLASS, "
664 						 "%.02x) %s: %s",
665 						 sock->fd, dscp >> 2,
666 						 "failed", strerror(errno));
667 			} else
668 				sock->dscp = dscp;
669 		}
670 
671 		if (msg->msg_controllen != 0 &&
672 		    msg->msg_controllen < SENDCMSGBUFLEN)
673 		{
674 			memset(cmsgbuf + msg->msg_controllen, 0,
675 			       SENDCMSGBUFLEN - msg->msg_controllen);
676 		}
677 	}
678 
679 	if (write_countp != NULL)
680 		*write_countp = write_count;
681 }
682 
683 /*
684  * Construct an iov array and attach it to the msghdr passed in.  This is
685  * the RECV constructor, which will use the available region of the buffer
686  * (if using a buffer list) or will use the internal region (if a single
687  * buffer I/O is requested).
688  *
689  * Nothing can be NULL, and the done event must list at least one buffer
690  * on the buffer linked list for this function to be meaningful.
691  *
692  * If read_countp != NULL, *read_countp will hold the number of bytes
693  * this transaction can receive.
694  */
695 static void
696 build_msghdr_recv(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
697 		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
698 {
699 	unsigned int iovcount;
700 	isc_buffer_t *buffer;
701 	isc_region_t available;
702 	size_t read_count;
703 
704 	memset(msg, 0, sizeof(struct msghdr));
705 
706 	if (sock->type == isc_sockettype_udp) {
707 		memset(&dev->address, 0, sizeof(dev->address));
708 		msg->msg_name = (void *)&dev->address.type.sa;
709 		msg->msg_namelen = sizeof(dev->address.type);
710 	} else { /* TCP */
711 		msg->msg_name = NULL;
712 		msg->msg_namelen = 0;
713 		dev->address = sock->peer_address;
714 	}
715 
716 	buffer = ISC_LIST_HEAD(dev->bufferlist);
717 	read_count = 0;
718 
719 	/*
720 	 * Single buffer I/O?  Skip what we've done so far in this region.
721 	 */
722 	if (buffer == NULL) {
723 		read_count = dev->region.length - dev->n;
724 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
725 		iov[0].iov_len = read_count;
726 		iovcount = 1;
727 
728 		goto config;
729 	}
730 
731 	/*
732 	 * Multibuffer I/O.
733 	 * Skip empty buffers.
734 	 */
735 	while (buffer != NULL) {
736 		if (isc_buffer_availablelength(buffer) != 0)
737 			break;
738 		buffer = ISC_LIST_NEXT(buffer, link);
739 	}
740 
741 	iovcount = 0;
742 	while (buffer != NULL) {
743 		INSIST(iovcount < MAXSCATTERGATHER_RECV);
744 
745 		isc_buffer_availableregion(buffer, &available);
746 
747 		if (available.length > 0) {
748 			iov[iovcount].iov_base = (void *)(available.base);
749 			iov[iovcount].iov_len = available.length;
750 			read_count += available.length;
751 			iovcount++;
752 		}
753 		buffer = ISC_LIST_NEXT(buffer, link);
754 	}
755 
756  config:
757 
758 	/*
759 	 * Attach the iovec array and the control-message buffer.
760 	 */
761 	msg->msg_iov = iov;
762 	msg->msg_iovlen = iovcount;
763 
764 	msg->msg_control = cmsgbuf;
765 	msg->msg_controllen = RECVCMSGBUFLEN;
766 	msg->msg_flags = 0;
767 
768 	if (read_countp != NULL)
769 		*read_countp = read_count;
770 }
771 
772 static void
773 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
774 		isc_socketevent_t *dev)
775 {
776 	if (sock->type == isc_sockettype_udp) {
777 		if (address != NULL)
778 			dev->address = *address;
779 		else
780 			dev->address = sock->peer_address;
781 	} else if (sock->type == isc_sockettype_tcp) {
782 		INSIST(address == NULL);
783 		dev->address = sock->peer_address;
784 	}
785 }
786 
787 static void
788 destroy_socketevent(isc_event_t *event) {
789 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
790 
791 	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
792 
793 	(ev->destroy)(event);
794 }
795 
796 static isc_socketevent_t *
797 allocate_socketevent(void *sender,
798 		     isc_eventtype_t eventtype, isc_taskaction_t action,
799 		     void *arg)
800 {
801 	isc_socketevent_t *ev;
802 
803 	ev = (isc_socketevent_t *)isc_event_allocate(sender,
804 						     eventtype, action, arg,
805 						     sizeof(*ev));
806 
807 	if (ev == NULL)
808 		return (NULL);
809 
810 	ev->result = ISC_R_UNSET;
811 	ISC_LINK_INIT(ev, ev_link);
812 	ISC_LIST_INIT(ev->bufferlist);
813 	ev->region.base = NULL;
814 	ev->n = 0;
815 	ev->offset = 0;
816 	ev->attributes = 0;
817 	ev->destroy = ev->ev_destroy;
818 	ev->ev_destroy = destroy_socketevent;
819 	ev->dscp = 0;
820 
821 	return (ev);
822 }
823 
824 #define DOIO_SUCCESS		0	/* i/o ok, event sent */
825 #define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
826 #define DOIO_HARD		2	/* i/o error, event sent */
827 #define DOIO_EOF		3	/* EOF, no event sent */
828 
829 static int
830 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
831 	int cc;
832 	struct iovec iov[MAXSCATTERGATHER_RECV];
833 	size_t read_count;
834 	size_t actual_count;
835 	struct msghdr msghdr;
836 	isc_buffer_t *buffer;
837 	int recv_errno;
838 	char cmsgbuf[RECVCMSGBUFLEN] = {0};
839 
840 	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);
841 
842 	cc = recvmsg(sock->fd, &msghdr, 0);
843 	recv_errno = errno;
844 
845 	if (cc < 0) {
846 		if (SOFT_ERROR(recv_errno))
847 			return (DOIO_SOFT);
848 
849 		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
850 			socket_log(sock, NULL, IOEVENT,
851 				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
852 				   sock->fd, cc, recv_errno,
853 				   strerror(recv_errno));
854 		}
855 
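/*
 * Map specific errnos to ISC result codes.  SOFT_OR_HARD treats the
 * error as fatal only if the socket is connected; ALWAYS_HARD treats it
 * as fatal unconditionally.  Soft errors cause the read to be retried.
 */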
856 #define SOFT_OR_HARD(_system, _isc) \
857 	if (recv_errno == _system) { \
858 		if (sock->connected) { \
859 			dev->result = _isc; \
860 			return (DOIO_HARD); \
861 		} \
862 		return (DOIO_SOFT); \
863 	}
864 #define ALWAYS_HARD(_system, _isc) \
865 	if (recv_errno == _system) { \
866 		dev->result = _isc; \
867 		return (DOIO_HARD); \
868 	}
869 
870 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
871 		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
872 		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
873 		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
874 		/* HPUX 11.11 can return EADDRNOTAVAIL. */
875 		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
876 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
877 		/* Should never get this one but it was seen. */
878 		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
879 		/*
880 		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
881 		 * errors.
882 		 */
883 		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
884 		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
885 
886 #undef SOFT_OR_HARD
887 #undef ALWAYS_HARD
888 
889 		dev->result = isc__errno2result(recv_errno);
890 		return (DOIO_HARD);
891 	}
892 
893 	/*
894 	 * On TCP and UNIX sockets, zero length reads indicate EOF,
895 	 * while on UDP sockets, zero length reads are perfectly valid,
896 	 * although strange.
897 	 */
898 	switch (sock->type) {
899 	case isc_sockettype_tcp:
900 		if (cc == 0)
901 			return (DOIO_EOF);
902 		break;
903 	case isc_sockettype_udp:
904 		break;
905 	default:
906 		INSIST(0);
907 	}
908 
909 	if (sock->type == isc_sockettype_udp) {
910 		dev->address.length = msghdr.msg_namelen;
911 		if (isc_sockaddr_getport(&dev->address) == 0) {
912 			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
913 				socket_log(sock, &dev->address, IOEVENT,
914 					   "dropping source port zero packet");
915 			}
916 			return (DOIO_SOFT);
917 		}
918 	}
919 
920 	socket_log(sock, &dev->address, IOEVENT,
921 		   "packet received correctly");
922 
923 	/*
924 	 * Overflow detection.  If the datagram was larger than our buffers,
925 	 * the kernel sets MSG_TRUNC and process_cmsg() below flags the
926 	 * event with ISC_SOCKEVENTATTR_TRUNC.
927 	 */
928 	/*
929 	 * If there are control messages attached, run through them and pull
930 	 * out the interesting bits.
931 	 */
932 	process_cmsg(sock, &msghdr, dev);
933 
934 	/*
935 	 * update the buffers (if any) and the i/o count
936 	 */
937 	dev->n += cc;
938 	actual_count = cc;
939 	buffer = ISC_LIST_HEAD(dev->bufferlist);
940 	while (buffer != NULL && actual_count > 0U) {
941 		if (isc_buffer_availablelength(buffer) <= actual_count) {
942 			actual_count -= isc_buffer_availablelength(buffer);
943 			isc_buffer_add(buffer,
944 				       isc_buffer_availablelength(buffer));
945 		} else {
946 			isc_buffer_add(buffer, actual_count);
947 			actual_count = 0;
948 			POST(actual_count);
949 			break;
950 		}
951 		buffer = ISC_LIST_NEXT(buffer, link);
952 		if (buffer == NULL) {
953 			INSIST(actual_count == 0U);
954 		}
955 	}
956 
957 	/*
958 	 * If we read less than we expected, update counters,
959 	 * and let the upper layer poke the descriptor.
960 	 */
961 	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
962 		return (DOIO_SOFT);
963 
964 	/*
965 	 * Full reads are posted, or partials if partials are ok.
966 	 */
967 	dev->result = ISC_R_SUCCESS;
968 	return (DOIO_SUCCESS);
969 }
970 
971 /*
972  * Returns:
973  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
974  *			ISC_R_SUCCESS.
975  *
976  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
977  *			dev->result contains the appropriate error.
978  *
979  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
980  *			event was sent.  The operation should be retried.
981  *
982  *	No other return values are possible.
983  */
984 static int
985 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
986 	int cc;
987 	struct iovec iov[MAXSCATTERGATHER_SEND];
988 	size_t write_count;
989 	struct msghdr msghdr;
990 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
991 	int attempts = 0;
992 	int send_errno;
993 	char cmsgbuf[SENDCMSGBUFLEN] = {0};
994 
995 	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
996 
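	/*
	 * sendmsg() is retried up to NRETRIES times if it is interrupted
	 * by a signal (EINTR).
	 */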
997  resend:
998 	cc = sendmsg(sock->fd, &msghdr, 0);
999 	send_errno = errno;
1000 
1001 	/*
1002 	 * Check for error or block condition.
1003 	 */
1004 	if (cc < 0) {
1005 		if (send_errno == EINTR && ++attempts < NRETRIES)
1006 			goto resend;
1007 
1008 		if (SOFT_ERROR(send_errno)) {
1009 			if (errno == EWOULDBLOCK || errno == EAGAIN)
1010 				dev->result = ISC_R_WOULDBLOCK;
1011 			return (DOIO_SOFT);
1012 		}
1013 
1014 #define SOFT_OR_HARD(_system, _isc) \
1015 	if (send_errno == _system) { \
1016 		if (sock->connected) { \
1017 			dev->result = _isc; \
1018 			return (DOIO_HARD); \
1019 		} \
1020 		return (DOIO_SOFT); \
1021 	}
1022 #define ALWAYS_HARD(_system, _isc) \
1023 	if (send_errno == _system) { \
1024 		dev->result = _isc; \
1025 		return (DOIO_HARD); \
1026 	}
1027 
1028 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1029 		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1030 		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1031 		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1032 		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1033 		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1034 		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1035 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1036 		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1037 		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1038 		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1039 
1040 #undef SOFT_OR_HARD
1041 #undef ALWAYS_HARD
1042 
1043 		/*
1044 		 * The other error types depend on whether or not the
1045 		 * socket is UDP or TCP.  If it is UDP, some errors
1046 		 * that we expect to be fatal under TCP are merely
1047 		 * annoying, and are really soft errors.
1048 		 *
1049 		 * However, these soft errors are still returned as
1050 		 * a status.
1051 		 */
1052 		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1053 		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1054 				 addrbuf, strerror(send_errno));
1055 		dev->result = isc__errno2result(send_errno);
1056 		return (DOIO_HARD);
1057 	}
1058 
1059 	if (cc == 0) {
1060 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1061 				 "doio_send: send() %s 0", "returned");
1062 	}
1063 
1064 	/*
1065 	 * If we write less than we expected, update counters, poke.
1066 	 */
1067 	dev->n += cc;
1068 	if ((size_t)cc != write_count)
1069 		return (DOIO_SOFT);
1070 
1071 	/*
1072 	 * Exactly what we wanted to write.  We're done with this
1073 	 * entry.  Post its completion event.
1074 	 */
1075 	dev->result = ISC_R_SUCCESS;
1076 	return (DOIO_SUCCESS);
1077 }
1078 
1079 /*
1080  * Kill.
1081  *
1082  * Caller must ensure that the socket is not locked and no external
1083  * references exist.
1084  */
1085 static void
1086 socketclose(isc_socketmgr_t *manager, isc_socket_t *sock, int fd) {
1087 	/*
1088 	 * No one has this socket open, so the watcher doesn't have to be
1089 	 * poked, and the socket doesn't have to be locked.
1090 	 */
1091 	manager->fds[fd] = NULL;
1092 	manager->fdstate[fd] = CLOSE_PENDING;
1093 	select_poke(manager, fd, SELECT_POKE_CLOSE);
1094 
1095 	if (sock->active == 1) {
1096 		sock->active = 0;
1097 	}
1098 
1099 	/*
1100 	 * update manager->maxfd here (XXX: this should be implemented more
1101 	 * efficiently)
1102 	 */
1103 	if (manager->maxfd == fd) {
1104 		int i;
1105 
1106 		manager->maxfd = 0;
1107 		for (i = fd - 1; i >= 0; i--) {
1108 			if (manager->fdstate[i] == MANAGED) {
1109 				manager->maxfd = i;
1110 				break;
1111 			}
1112 		}
1113 	}
1114 
1115 }
1116 
1117 static void
1118 destroy(isc_socket_t **sockp) {
1119 	int fd;
1120 	isc_socket_t *sock = *sockp;
1121 	isc_socketmgr_t *manager = sock->manager;
1122 
1123 	socket_log(sock, NULL, CREATION, "destroying");
1124 
1125 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1126 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1127 	INSIST(sock->connect_ev == NULL);
1128 	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1129 
1130 	if (sock->fd >= 0) {
1131 		fd = sock->fd;
1132 		sock->fd = -1;
1133 		socketclose(manager, sock, fd);
1134 	}
1135 
1136 	ISC_LIST_UNLINK(manager->socklist, sock, link);
1137 
1138 	/* can't unlock manager as its memory context is still used */
1139 	free_socket(sockp);
1140 }
1141 
1142 static isc_result_t
1143 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1144 		isc_socket_t **socketp)
1145 {
1146 	isc_socket_t *sock;
1147 
1148 	sock = malloc(sizeof(*sock));
1149 
1150 	if (sock == NULL)
1151 		return (ISC_R_NOMEMORY);
1152 
1153 	sock->references = 0;
1154 
1155 	sock->manager = manager;
1156 	sock->type = type;
1157 	sock->fd = -1;
1158 	sock->dscp = 0;		/* TOS/TCLASS is zero until set. */
1159 	sock->active = 0;
1160 
1161 	ISC_LINK_INIT(sock, link);
1162 
1163 	/*
1164 	 * Set up list of readers and writers to be initially empty.
1165 	 */
1166 	ISC_LIST_INIT(sock->recv_list);
1167 	ISC_LIST_INIT(sock->send_list);
1168 	sock->connect_ev = NULL;
1169 	sock->pending_recv = 0;
1170 	sock->pending_send = 0;
1171 	sock->connected = 0;
1172 	sock->connecting = 0;
1173 	sock->bound = 0;
1174 	sock->pktdscp = 0;
1175 
1176 	/*
1177 	 * Initialize readable and writable events.
1178 	 */
1179 	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1180 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1181 		       NULL, sock, sock, NULL);
1182 	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1183 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1184 		       NULL, sock, sock, NULL);
1185 
1186 	*socketp = sock;
1187 
1188 	return (ISC_R_SUCCESS);
1189 }
1190 
1191 /*
1192  * This routine requires that the various lists be empty and that the
1193  * reference count be zero.  The other socket state must have been
1194  * cleaned up as well.  The associated fd must already have been
1195  * marked as closed by setting it to -1; free_socket() only releases
1196  * the socket structure itself.
1197  */
1198 static void
1199 free_socket(isc_socket_t **socketp) {
1200 	isc_socket_t *sock = *socketp;
1201 
1202 	INSIST(sock->references == 0);
1203 	INSIST(!sock->connecting);
1204 	INSIST(!sock->pending_recv);
1205 	INSIST(!sock->pending_send);
1206 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1207 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1208 	INSIST(!ISC_LINK_LINKED(sock, link));
1209 
1210 	free(sock);
1211 
1212 	*socketp = NULL;
1213 }
1214 
1215 static void
1216 use_min_mtu(isc_socket_t *sock) {
1217 	/* use minimum MTU */
1218 	if (sock->pf == AF_INET6) {
1219 		int on = 1;
1220 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
1221 				(void *)&on, sizeof(on));
1222 	}
1223 }
1224 
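/*
 * Clamp the TCP maximum segment size.  opensocket() uses this together
 * with IPV6_USE_MIN_MTU above so that TCP segments fit within the IPv6
 * minimum MTU of 1280 bytes.
 */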
1225 static void
1226 set_tcp_maxseg(isc_socket_t *sock, int size) {
1227 	if (sock->type == isc_sockettype_tcp)
1228 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
1229 				(void *)&size, sizeof(size));
1230 }
1231 
1232 static isc_result_t
1233 opensocket(isc_socket_t *sock)
1234 {
1235 	isc_result_t result;
1236 	const char *err = "socket";
1237 	int on = 1;
1238 
1239 	switch (sock->type) {
1240 	case isc_sockettype_udp:
1241 		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
1242 		break;
1243 	case isc_sockettype_tcp:
1244 		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
1245 		break;
1246 	}
1247 
1248 	if (sock->fd < 0) {
1249 		switch (errno) {
1250 		case EMFILE:
1251 		case ENFILE:
1252 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1253 				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1254 				       "%s: %s", err, strerror(errno));
1255 			/* fallthrough */
1256 		case ENOBUFS:
1257 			return (ISC_R_NORESOURCES);
1258 
1259 		case EPROTONOSUPPORT:
1260 		case EPFNOSUPPORT:
1261 		case EAFNOSUPPORT:
1262 		/*
1263 		 * Linux 2.2 (and maybe others) return EINVAL instead of
1264 		 * EAFNOSUPPORT.
1265 		 */
1266 		case EINVAL:
1267 			return (ISC_R_FAMILYNOSUPPORT);
1268 
1269 		default:
1270 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1271 					 "%s() %s: %s", err, "failed",
1272 					 strerror(errno));
1273 			return (ISC_R_UNEXPECTED);
1274 		}
1275 	}
1276 
1277 	result = make_nonblock(sock->fd);
1278 	if (result != ISC_R_SUCCESS) {
1279 		(void)close(sock->fd);
1280 		return (result);
1281 	}
1282 
1283 	/*
1284 	 * Use minimum mtu if possible.
1285 	 */
1286 	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
1287 		use_min_mtu(sock);
1288 		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
1289 	}
1290 
1291 	if (sock->type == isc_sockettype_udp) {
1292 
1293 		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1294 			       (void *)&on, sizeof(on)) < 0
1295 		    && errno != ENOPROTOOPT) {
1296 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1297 					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1298 					 sock->fd, "failed", strerror(errno));
1299 			/* Press on... */
1300 		}
1301 
1302 		/* RFC 3542 */
1303 		if ((sock->pf == AF_INET6)
1304 		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1305 				   (void *)&on, sizeof(on)) < 0)) {
1306 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1307 					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1308 					 "%s: %s", sock->fd, "failed",
1309 					 strerror(errno));
1310 		}
1311 	}
1312 
1313 	if (sock->active == 0) {
1314 		sock->active = 1;
1315 	}
1316 
1317 	return (ISC_R_SUCCESS);
1318 }
1319 
1320 /*
1321  * Create a 'type' socket managed by 'manager' and return it in
1322  * 'socketp'.  The task, action and arg for completion events are not
1323  * supplied here; they are given later, when individual I/O operations
1324  * are queued on the socket.
1325  */
1326 static isc_result_t
1327 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1328 	      isc_socket_t **socketp)
1329 {
1330 	isc_socket_t *sock = NULL;
1331 	isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0;
1332 	isc_result_t result;
1333 
1334 	REQUIRE(socketp != NULL && *socketp == NULL);
1335 
1336 	result = allocate_socket(manager, type, &sock);
1337 	if (result != ISC_R_SUCCESS)
1338 		return (result);
1339 
1340 	switch (sock->type) {
1341 	case isc_sockettype_udp:
1342 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
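		/*
		 * Record whether the OS lets us set DSCP per packet
		 * (via IP_TOS/IPV6_TCLASS ancillary data) for this
		 * address family, as probed by isc_net_probedscp().
		 */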
1343 		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
1344 		break;
1345 	case isc_sockettype_tcp:
1346 		break;
1347 	default:
1348 		INSIST(0);
1349 	}
1350 
1351 	sock->pf = pf;
1352 
1353 	result = opensocket(sock);
1354 	if (result != ISC_R_SUCCESS) {
1355 		free_socket(&sock);
1356 		return (result);
1357 	}
1358 
1359 	sock->references = 1;
1360 	*socketp = (isc_socket_t *)sock;
1361 
1362 	/*
1363 	 * Note we don't have to lock the socket like we normally would because
1364 	 * there are no external references to it yet.
1365 	 */
1366 
1367 	manager->fds[sock->fd] = sock;
1368 	manager->fdstate[sock->fd] = MANAGED;
1369 
1370 	ISC_LIST_APPEND(manager->socklist, sock, link);
1371 	if (manager->maxfd < sock->fd)
1372 		manager->maxfd = sock->fd;
1373 
1374 	socket_log(sock, NULL, CREATION, "created");
1375 
1376 	return (ISC_R_SUCCESS);
1377 }
1378 
1379 /*%
1380  * Create a new 'type' socket managed by 'manager'.  The new socket is
1381  * returned in 'socketp'.  Completion events are requested per I/O
1382  * operation (with their own task, action and arg) rather than at
1383  * creation time.
1384  */
1385 isc_result_t
1386 isc_socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1387 		   isc_socket_t **socketp)
1388 {
1389 	return (socket_create(manager0, pf, type, socketp));
1390 }
1391 
1392 /*
1393  * Attach to a socket.  Caller must explicitly detach when it is done.
1394  */
1395 void
1396 isc_socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
1397 	isc_socket_t *sock = (isc_socket_t *)sock0;
1398 
1399 	REQUIRE(socketp != NULL && *socketp == NULL);
1400 
1401 	sock->references++;
1402 
1403 	*socketp = (isc_socket_t *)sock;
1404 }
1405 
1406 /*
1407  * Dereference a socket.  If this is the last reference to it, clean things
1408  * up by destroying the socket.
1409  */
1410 void
1411 isc_socket_detach(isc_socket_t **socketp) {
1412 	isc_socket_t *sock;
1413 	isc_boolean_t kill_socket = ISC_FALSE;
1414 
1415 	REQUIRE(socketp != NULL);
1416 	sock = (isc_socket_t *)*socketp;
1417 
1418 	REQUIRE(sock->references > 0);
1419 	sock->references--;
1420 	if (sock->references == 0)
1421 		kill_socket = ISC_TRUE;
1422 
1423 	if (kill_socket)
1424 		destroy(&sock);
1425 
1426 	*socketp = NULL;
1427 }
1428 
1429 /*
1430  * I/O is possible on a given socket.  Schedule an event to this task that
1431  * will call an internal function to do the I/O.  This will charge the
1432  * task with the I/O operation and let our select loop handler get back
1433  * to doing something real as fast as possible.
1434  *
1435  * The socket and manager must be locked before calling this function.
1436  */
1437 static void
1438 dispatch_recv(isc_socket_t *sock) {
1439 	intev_t *iev;
1440 	isc_socketevent_t *ev;
1441 	isc_task_t *sender;
1442 
1443 	INSIST(!sock->pending_recv);
1444 
1445 	ev = ISC_LIST_HEAD(sock->recv_list);
1446 	if (ev == NULL)
1447 		return;
1448 	socket_log(sock, NULL, EVENT,
1449 		   "dispatch_recv:  event %p -> task %p",
1450 		   ev, ev->ev_sender);
1451 	sender = ev->ev_sender;
1452 
1453 	sock->pending_recv = 1;
1454 	iev = &sock->readable_ev;
1455 
1456 	sock->references++;
1457 	iev->ev_sender = sock;
1458 	iev->ev_action = internal_recv;
1459 	iev->ev_arg = sock;
1460 
1461 	isc_task_send(sender, (isc_event_t **)&iev);
1462 }
1463 
1464 static void
1465 dispatch_send(isc_socket_t *sock) {
1466 	intev_t *iev;
1467 	isc_socketevent_t *ev;
1468 	isc_task_t *sender;
1469 
1470 	INSIST(!sock->pending_send);
1471 
1472 	ev = ISC_LIST_HEAD(sock->send_list);
1473 	if (ev == NULL)
1474 		return;
1475 	socket_log(sock, NULL, EVENT,
1476 		   "dispatch_send:  event %p -> task %p",
1477 		   ev, ev->ev_sender);
1478 	sender = ev->ev_sender;
1479 
1480 	sock->pending_send = 1;
1481 	iev = &sock->writable_ev;
1482 
1483 	sock->references++;
1484 	iev->ev_sender = sock;
1485 	iev->ev_action = internal_send;
1486 	iev->ev_arg = sock;
1487 
1488 	isc_task_send(sender, (isc_event_t **)&iev);
1489 }
1490 
1491 static void
1492 dispatch_connect(isc_socket_t *sock) {
1493 	intev_t *iev;
1494 	isc_socket_connev_t *ev;
1495 
1496 	iev = &sock->writable_ev;
1497 
1498 	ev = sock->connect_ev;
1499 	INSIST(ev != NULL); /* XXX */
1500 
1501 	INSIST(sock->connecting);
1502 
1503 	sock->references++;  /* keep socket around for this internal event */
1504 	iev->ev_sender = sock;
1505 	iev->ev_action = internal_connect;
1506 	iev->ev_arg = sock;
1507 
1508 	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1509 }
1510 
1511 /*
1512  * Dequeue an item off the given socket's read queue, set the result code
1513  * in the done event to the one provided, and send it to the task it was
1514  * destined for.
1515  *
1516  * If the event to be sent is on a list, remove it before sending.  If
1517  * asked to, send and detach from the socket as well.
1518  *
1519  * Caller must have the socket locked if the event is attached to the socket.
1520  */
1521 static void
1522 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1523 	isc_task_t *task;
1524 
1525 	task = (*dev)->ev_sender;
1526 
1527 	(*dev)->ev_sender = sock;
1528 
1529 	if (ISC_LINK_LINKED(*dev, ev_link))
1530 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1531 
1532 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1533 	    == ISC_SOCKEVENTATTR_ATTACHED)
1534 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1535 	else
1536 		isc_task_send(task, (isc_event_t **)dev);
1537 }
1538 
1539 /*
1540  * See comments for send_recvdone_event() above.
1541  *
1542  * Caller must have the socket locked if the event is attached to the socket.
1543  */
1544 static void
1545 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1546 	isc_task_t *task;
1547 
1548 	INSIST(dev != NULL && *dev != NULL);
1549 
1550 	task = (*dev)->ev_sender;
1551 	(*dev)->ev_sender = sock;
1552 
1553 	if (ISC_LINK_LINKED(*dev, ev_link))
1554 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1555 
1556 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1557 	    == ISC_SOCKEVENTATTR_ATTACHED)
1558 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1559 	else
1560 		isc_task_send(task, (isc_event_t **)dev);
1561 }
1562 
1563 static void
1564 internal_recv(isc_task_t *me, isc_event_t *ev) {
1565 	isc_socketevent_t *dev;
1566 	isc_socket_t *sock;
1567 
1568 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
1569 
1570 	sock = ev->ev_sender;
1571 
1572 	socket_log(sock, NULL, IOEVENT,
1573 		   "internal_recv: task %p got event %p", me, ev);
1574 
1575 	INSIST(sock->pending_recv == 1);
1576 	sock->pending_recv = 0;
1577 
1578 	INSIST(sock->references > 0);
1579 	sock->references--;  /* the internal event is done with this socket */
1580 	if (sock->references == 0) {
1581 		destroy(&sock);
1582 		return;
1583 	}
1584 
1585 	/*
1586 	 * Try to do as much I/O as possible on this socket.  There are no
1587 	 * limits here, currently.
1588 	 */
1589 	dev = ISC_LIST_HEAD(sock->recv_list);
1590 	while (dev != NULL) {
1591 		switch (doio_recv(sock, dev)) {
1592 		case DOIO_SOFT:
1593 			goto poke;
1594 
1595 		case DOIO_EOF:
1596 			/*
1597 			 * read of 0 means the remote end was closed.
1598 			 * Run through the event queue and dispatch all
1599 			 * the events with an EOF result code.
1600 			 */
1601 			do {
1602 				dev->result = ISC_R_EOF;
1603 				send_recvdone_event(sock, &dev);
1604 				dev = ISC_LIST_HEAD(sock->recv_list);
1605 			} while (dev != NULL);
1606 			goto poke;
1607 
1608 		case DOIO_SUCCESS:
1609 		case DOIO_HARD:
1610 			send_recvdone_event(sock, &dev);
1611 			break;
1612 		}
1613 
1614 		dev = ISC_LIST_HEAD(sock->recv_list);
1615 	}
1616 
1617  poke:
1618 	if (!ISC_LIST_EMPTY(sock->recv_list))
1619 		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1620 }
1621 
1622 static void
1623 internal_send(isc_task_t *me, isc_event_t *ev) {
1624 	isc_socketevent_t *dev;
1625 	isc_socket_t *sock;
1626 
1627 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
1628 
1629 	/*
1630 	 * Find out what socket this is and lock it.
1631 	 */
1632 	sock = (isc_socket_t *)ev->ev_sender;
1633 	socket_log(sock, NULL, IOEVENT,
1634 		   "internal_send: task %p got event %p", me, ev);
1635 
1636 	INSIST(sock->pending_send == 1);
1637 	sock->pending_send = 0;
1638 
1639 	INSIST(sock->references > 0);
1640 	sock->references--;  /* the internal event is done with this socket */
1641 	if (sock->references == 0) {
1642 		destroy(&sock);
1643 		return;
1644 	}
1645 
1646 	/*
1647 	 * Try to do as much I/O as possible on this socket.  There are no
1648 	 * limits here, currently.
1649 	 */
1650 	dev = ISC_LIST_HEAD(sock->send_list);
1651 	while (dev != NULL) {
1652 		switch (doio_send(sock, dev)) {
1653 		case DOIO_SOFT:
1654 			goto poke;
1655 
1656 		case DOIO_HARD:
1657 		case DOIO_SUCCESS:
1658 			send_senddone_event(sock, &dev);
1659 			break;
1660 		}
1661 
1662 		dev = ISC_LIST_HEAD(sock->send_list);
1663 	}
1664 
1665  poke:
1666 	if (!ISC_LIST_EMPTY(sock->send_list))
1667 		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
1668 }
1669 
1670 /*
1671  * Process read/writes on each fd here.  Avoid locking
1672  * and unlocking twice if both reads and writes are possible.
1673  */
1674 static void
1675 process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable,
1676 	   isc_boolean_t writeable)
1677 {
1678 	isc_socket_t *sock;
1679 	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
1680 
1681 	/*
1682 	 * If the socket is going to be closed, don't do more I/O.
1683 	 */
1684 	if (manager->fdstate[fd] == CLOSE_PENDING) {
1685 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1686 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1687 		return;
1688 	}
1689 
1690 	sock = manager->fds[fd];
1691 	if (readable) {
1692 		if (sock == NULL) {
1693 			unwatch_read = ISC_TRUE;
1694 			goto check_write;
1695 		}
1696 		if (!SOCK_DEAD(sock)) {
1697 			dispatch_recv(sock);
1698 		}
1699 		unwatch_read = ISC_TRUE;
1700 	}
1701 check_write:
1702 	if (writeable) {
1703 		if (sock == NULL) {
1704 			unwatch_write = ISC_TRUE;
1705 			goto unlock_fd;
1706 		}
1707 		if (!SOCK_DEAD(sock)) {
1708 			if (sock->connecting)
1709 				dispatch_connect(sock);
1710 			else
1711 				dispatch_send(sock);
1712 		}
1713 		unwatch_write = ISC_TRUE;
1714 	}
1715 
1716  unlock_fd:
1717 	if (unwatch_read)
1718 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1719 	if (unwatch_write)
1720 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1721 
1722 }
1723 
1724 static void
1725 process_fds(isc_socketmgr_t *manager, int maxfd, fd_set *readfds,
1726 	    fd_set *writefds)
1727 {
1728 	int i;
1729 
1730 	REQUIRE(maxfd <= (int)manager->maxsocks);
1731 
1732 	for (i = 0; i < maxfd; i++) {
1733 		process_fd(manager, i, FD_ISSET(i, readfds),
1734 			   FD_ISSET(i, writefds));
1735 	}
1736 }
1737 
1738 /*
1739  * Create a new socket manager.
1740  */
1741 
1742 static isc_result_t
1743 setup_watcher(isc_socketmgr_t *manager) {
1744 	isc_result_t result;
1745 
1746 	UNUSED(result);
1747 
1748 	manager->fd_bufsize = sizeof(fd_set);
1749 
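	/*
	 * Allocate the select() interest sets.  The *_copy buffers are
	 * assumed to be scratch copies handed to select() itself (which
	 * modifies its fd_set arguments), so the master sets survive each
	 * call; that use lives in the wait/dispatch code.
	 */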
1750 	manager->read_fds = NULL;
1751 	manager->read_fds_copy = NULL;
1752 	manager->write_fds = NULL;
1753 	manager->write_fds_copy = NULL;
1754 
1755 	manager->read_fds = malloc(manager->fd_bufsize);
1756 	if (manager->read_fds != NULL)
1757 		manager->read_fds_copy = malloc(manager->fd_bufsize);
1758 	if (manager->read_fds_copy != NULL)
1759 		manager->write_fds = malloc(manager->fd_bufsize);
1760 	if (manager->write_fds != NULL) {
1761 		manager->write_fds_copy = malloc(manager->fd_bufsize);
1762 	}
1763 	if (manager->write_fds_copy == NULL) {
1764 		if (manager->write_fds != NULL) {
1765 			free(manager->write_fds);
1766 		}
1767 		if (manager->read_fds_copy != NULL) {
1768 			free(manager->read_fds_copy);
1769 		}
1770 		if (manager->read_fds != NULL) {
1771 			free(manager->read_fds);
1772 		}
1773 		return (ISC_R_NOMEMORY);
1774 	}
1775 	memset(manager->read_fds, 0, manager->fd_bufsize);
1776 	memset(manager->write_fds, 0, manager->fd_bufsize);
1777 
1778 	manager->maxfd = 0;
1779 
1780 	return (ISC_R_SUCCESS);
1781 }
1782 
1783 static void
1784 cleanup_watcher(isc_socketmgr_t *manager) {
1785 
1786 	if (manager->read_fds != NULL)
1787 		free(manager->read_fds);
1788 	if (manager->read_fds_copy != NULL)
1789 		free(manager->read_fds_copy);
1790 	if (manager->write_fds != NULL)
1791 		free(manager->write_fds);
1792 	if (manager->write_fds_copy != NULL)
1793 		free(manager->write_fds_copy);
1794 }
1795 
1796 static isc_result_t
1797 isc_socketmgr_create2(isc_socketmgr_t **managerp,
1798 		       unsigned int maxsocks)
1799 {
1800 	isc_socketmgr_t *manager;
1801 	isc_result_t result;
1802 
1803 	REQUIRE(managerp != NULL && *managerp == NULL);
1804 
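	/*
	 * The socket manager is a process-wide singleton: if one already
	 * exists, just take another reference to it.
	 */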
1805 	if (socketmgr != NULL) {
1806 		/* Don't allow maxsocks to be updated */
1807 		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
1808 			return (ISC_R_EXISTS);
1809 
1810 		socketmgr->refs++;
1811 		*managerp = (isc_socketmgr_t *)socketmgr;
1812 		return (ISC_R_SUCCESS);
1813 	}
1814 
1815 	if (maxsocks == 0)
1816 		maxsocks = FD_SETSIZE;
1817 
1818 	manager = malloc(sizeof(*manager));
1819 	if (manager == NULL)
1820 		return (ISC_R_NOMEMORY);
1821 
1822 	/* zero-clear so that necessary cleanup on failure will be easy */
1823 	memset(manager, 0, sizeof(*manager));
1824 	manager->maxsocks = maxsocks;
1825 	manager->fds = malloc(manager->maxsocks * sizeof(isc_socket_t *));
1826 	if (manager->fds == NULL) {
1827 		result = ISC_R_NOMEMORY;
1828 		goto free_manager;
1829 	}
1830 	manager->fdstate = malloc(manager->maxsocks * sizeof(int));
1831 	if (manager->fdstate == NULL) {
1832 		result = ISC_R_NOMEMORY;
1833 		goto free_manager;
1834 	}
1835 
1836 	memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
1837 	ISC_LIST_INIT(manager->socklist);
1838 
1839 	manager->refs = 1;
1840 
1841 	/*
1842 	 * Set up initial state for the select loop
1843 	 */
1844 	result = setup_watcher(manager);
1845 	if (result != ISC_R_SUCCESS)
1846 		goto cleanup;
1847 
1848 	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
1849 
1850 	socketmgr = manager;
1851 	*managerp = (isc_socketmgr_t *)manager;
1852 
1853 	return (ISC_R_SUCCESS);
1854 
1855 cleanup:
1856 
1857 free_manager:
1858 	if (manager->fdstate != NULL) {
1859 		free(manager->fdstate);
1860 	}
1861 	if (manager->fds != NULL) {
1862 		free(manager->fds);
1863 	}
1864 	free(manager);
1865 
1866 	return (result);
1867 }
1868 
1869 isc_result_t
1870 isc_socketmgr_create(isc_socketmgr_t **managerp) {
1871 	return (isc_socketmgr_create2(managerp, 0));
1872 }
1873 
1874 void
1875 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
1876 	isc_socketmgr_t *manager;
1877 	int i;
1878 
1879 	/*
1880 	 * Destroy a socket manager.
1881 	 */
1882 
1883 	REQUIRE(managerp != NULL);
1884 	manager = (isc_socketmgr_t *)*managerp;
1885 
1886 	manager->refs--;
1887 	if (manager->refs > 0) {
1888 		*managerp = NULL;
1889 		return;
1890 	}
1891 	socketmgr = NULL;
1892 
1893 	/*
1894 	 * Wait for all sockets to be destroyed.
1895 	 */
1896 	while (!ISC_LIST_EMPTY(manager->socklist)) {
1897 		isc_taskmgr_dispatch(NULL);
1898 	}
1899 
1900 	/*
1901 	 * Here, poke our select/poll thread.  Do this by closing the write
1902 	 * half of the pipe, which will send EOF to the read half.
1903 	 * This is currently a no-op in the non-threaded case.
1904 	 */
1905 	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
1906 
1907 	/*
1908 	 * Clean up.
1909 	 */
1910 	cleanup_watcher(manager);
1911 
1912 	for (i = 0; i < (int)manager->maxsocks; i++)
1913 		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
1914 			(void)close(i);
1915 
1916 	free(manager->fds);
1917 	free(manager->fdstate);
1918 
1919 	free(manager);
1920 
1921 	*managerp = NULL;
1922 
1923 	socketmgr = NULL;
1924 }
1925 
1926 static isc_result_t
1927 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
1928 	    unsigned int flags)
1929 {
1930 	int io_state;
1931 	isc_task_t *ntask = NULL;
1932 	isc_result_t result = ISC_R_SUCCESS;
1933 
1934 	dev->ev_sender = task;
1935 
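	/*
	 * For UDP, attempt the read immediately.  For TCP, read right away
	 * only when nothing else is queued, so that completions stay in
	 * order.
	 */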
1936 	if (sock->type == isc_sockettype_udp) {
1937 		io_state = doio_recv(sock, dev);
1938 	} else {
1939 		if (ISC_LIST_EMPTY(sock->recv_list))
1940 			io_state = doio_recv(sock, dev);
1941 		else
1942 			io_state = DOIO_SOFT;
1943 	}
1944 
1945 	switch (io_state) {
1946 	case DOIO_SOFT:
1947 		/*
1948 		 * We couldn't read all or part of the request right now, so
1949 		 * queue it.
1950 		 *
1951 		 * Attach to the task that will receive the completion event.
1952 		 */
1953 		isc_task_attach(task, &ntask);
1954 		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
1955 
1956 		/*
1957 		 * Enqueue the request.  If the socket was previously not being
1958 		 * watched, poke the watcher to start paying attention to it.
1959 		 */
1960 		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
1961 			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1962 		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
1963 
1964 		socket_log(sock, NULL, EVENT, NULL, 0, 0,
1965 			   "socket_recv: event %p -> task %p",
1966 			   dev, ntask);
1967 
1968 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
1969 			result = ISC_R_INPROGRESS;
1970 		break;
1971 
1972 	case DOIO_EOF:
1973 		dev->result = ISC_R_EOF;
1974 		/* fallthrough */
1975 
1976 	case DOIO_HARD:
1977 	case DOIO_SUCCESS:
1978 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
1979 			send_recvdone_event(sock, &dev);
1980 		break;
1981 	}
1982 
1983 	return (result);
1984 }
1985 
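/*
 * Scatter-read into a caller-supplied buffer list.  The buffers are
 * moved onto the event's internal list and handed back through the
 * RECVDONE event.  For UDP any amount of data completes the request;
 * for TCP at least "minimum" bytes (or, if zero, enough to fill all
 * of the buffers) must be read first.
 */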
1986 isc_result_t
1987 isc_socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
1988 		  unsigned int minimum, isc_task_t *task,
1989 		  isc_taskaction_t action, void *arg)
1990 {
1991 	isc_socket_t *sock = (isc_socket_t *)sock0;
1992 	isc_socketevent_t *dev;
1993 	unsigned int iocount;
1994 	isc_buffer_t *buffer;
1995 
1996 	REQUIRE(buflist != NULL);
1997 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
1998 	REQUIRE(task != NULL);
1999 	REQUIRE(action != NULL);
2000 
2001 	iocount = isc_bufferlist_availablecount(buflist);
2002 	REQUIRE(iocount > 0);
2003 
2004 	INSIST(sock->bound);
2005 
2006 	dev = allocate_socketevent(sock,
2007 				   ISC_SOCKEVENT_RECVDONE, action, arg);
2008 	if (dev == NULL)
2009 		return (ISC_R_NOMEMORY);
2010 
2011 	/*
2012 	 * UDP sockets always use partial reads.
2013 	 */
2014 	if (sock->type == isc_sockettype_udp)
2015 		dev->minimum = 1;
2016 	else {
2017 		if (minimum == 0)
2018 			dev->minimum = iocount;
2019 		else
2020 			dev->minimum = minimum;
2021 	}
2022 
2023 	/*
2024 	 * Move each buffer from the passed-in list to our internal one.
2025 	 */
2026 	buffer = ISC_LIST_HEAD(*buflist);
2027 	while (buffer != NULL) {
2028 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2029 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2030 		buffer = ISC_LIST_HEAD(*buflist);
2031 	}
2032 
2033 	return (socket_recv(sock, dev, task, 0));
2034 }
2035 
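/*
 * Common send path, mirroring socket_recv() above: the write is
 * attempted immediately for a UDP socket or an idle TCP send queue,
 * otherwise the event is queued and the watcher poked for write
 * readiness.  With ISC_SOCKFLAG_NORETRY a soft error is not queued;
 * the request falls through to normal completion handling instead.
 */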
2036 static isc_result_t
2037 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2038 	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2039 	    unsigned int flags)
2040 {
2041 	int io_state;
2042 	isc_task_t *ntask = NULL;
2043 	isc_result_t result = ISC_R_SUCCESS;
2044 
2045 	dev->ev_sender = task;
2046 
2047 	set_dev_address(address, sock, dev);
2048 	if (pktinfo != NULL) {
2049 		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2050 		dev->pktinfo = *pktinfo;
2051 
2052 		if (!isc_sockaddr_issitelocal(&dev->address) &&
2053 		    !isc_sockaddr_islinklocal(&dev->address)) {
2054 			socket_log(sock, NULL, TRACE,
2055 				   "pktinfo structure provided, ifindex %u "
2056 				   "(set to 0)", pktinfo->ipi6_ifindex);
2057 
2058 			/*
2059 			 * Set the pktinfo index to 0 here, to let the
2060 			 * kernel decide what interface it should send on.
2061 			 */
2062 			dev->pktinfo.ipi6_ifindex = 0;
2063 		}
2064 	}
2065 
2066 	if (sock->type == isc_sockettype_udp)
2067 		io_state = doio_send(sock, dev);
2068 	else {
2069 		if (ISC_LIST_EMPTY(sock->send_list))
2070 			io_state = doio_send(sock, dev);
2071 		else
2072 			io_state = DOIO_SOFT;
2073 	}
2074 
2075 	switch (io_state) {
2076 	case DOIO_SOFT:
2077 		/*
2078 		 * We couldn't send all or part of the request right now, so
2079 		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2080 		 */
2081 		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2082 			isc_task_attach(task, &ntask);
2083 			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2084 
2085 			/*
2086 			 * Enqueue the request.  If the socket was previously
2087 			 * not being watched, poke the watcher to start
2088 			 * paying attention to it.
2089 			 */
2090 			if (ISC_LIST_EMPTY(sock->send_list) &&
2091 			    !sock->pending_send)
2092 				select_poke(sock->manager, sock->fd,
2093 					    SELECT_POKE_WRITE);
2094 			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2095 
2096 			socket_log(sock, NULL, EVENT, NULL, 0, 0,
2097 				   "socket_send: event %p -> task %p",
2098 				   dev, ntask);
2099 
2100 			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2101 				result = ISC_R_INPROGRESS;
2102 			break;
2103 		}
2104 
2105 		/* FALLTHROUGH */
2106 
2107 	case DOIO_HARD:
2108 	case DOIO_SUCCESS:
2109 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2110 			send_senddone_event(sock, &dev);
2111 		break;
2112 	}
2113 
2114 	return (result);
2115 }
2116 
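/*
 * isc_socket_sendv() is a thin wrapper: it gathers from a buffer list
 * with no explicit destination or pktinfo, deferring all of the work
 * to isc_socket_sendtov2() below.
 */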
2117 isc_result_t
2118 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2119 		  isc_task_t *task, isc_taskaction_t action, void *arg)
2120 {
2121 	return (isc_socket_sendtov2(sock, buflist, task, action, arg, NULL,
2122 				     NULL, 0));
2123 }
2124 
2125 isc_result_t
2126 isc_socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2127 		     isc_task_t *task, isc_taskaction_t action, void *arg,
2128 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2129 		     unsigned int flags)
2130 {
2131 	isc_socket_t *sock = (isc_socket_t *)sock0;
2132 	isc_socketevent_t *dev;
2133 	unsigned int iocount;
2134 	isc_buffer_t *buffer;
2135 
2136 	REQUIRE(buflist != NULL);
2137 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2138 	REQUIRE(task != NULL);
2139 	REQUIRE(action != NULL);
2140 
2141 	iocount = isc_bufferlist_usedcount(buflist);
2142 	REQUIRE(iocount > 0);
2143 
2144 	dev = allocate_socketevent(sock,
2145 				   ISC_SOCKEVENT_SENDDONE, action, arg);
2146 	if (dev == NULL)
2147 		return (ISC_R_NOMEMORY);
2148 
2149 	/*
2150 	 * Move each buffer from the passed-in list to our internal one.
2151 	 */
2152 	buffer = ISC_LIST_HEAD(*buflist);
2153 	while (buffer != NULL) {
2154 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2155 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2156 		buffer = ISC_LIST_HEAD(*buflist);
2157 	}
2158 
2159 	return (socket_send(sock, dev, task, address, pktinfo, flags));
2160 }
2161 
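/*
 * Bind the socket to a local address.  A minimal sketch, assuming an
 * isc_sockaddr_t already filled in by the caller (for example with
 * isc_sockaddr_any() or isc_sockaddr_fromin()):
 *
 *	result = isc_socket_bind(sock, &localaddr,
 *				 ISC_SOCKET_REUSEADDRESS);
 *
 * SO_REUSEADDR is only requested when that option is given and the
 * address carries a non-zero port.
 */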
2162 isc_result_t
2163 isc_socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
2164 		 unsigned int options) {
2165 	isc_socket_t *sock = (isc_socket_t *)sock0;
2166 	int on = 1;
2167 
2168 	INSIST(!sock->bound);
2169 
2170 	if (sock->pf != sockaddr->type.sa.sa_family) {
2171 		return (ISC_R_FAMILYMISMATCH);
2172 	}
2173 
2174 	/*
2175 	 * Only set SO_REUSEADDR when we want a specific port.
2176 	 */
2177 	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
2178 	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
2179 	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
2180 		       sizeof(on)) < 0) {
2181 		UNEXPECTED_ERROR(__FILE__, __LINE__,
2182 				 "setsockopt(%d) %s", sock->fd, "failed");
2183 		/* Press on... */
2184 	}
2185 	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
2186 		switch (errno) {
2187 		case EACCES:
2188 			return (ISC_R_NOPERM);
2189 		case EADDRNOTAVAIL:
2190 			return (ISC_R_ADDRNOTAVAIL);
2191 		case EADDRINUSE:
2192 			return (ISC_R_ADDRINUSE);
2193 		case EINVAL:
2194 			return (ISC_R_BOUND);
2195 		default:
2196 			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
2197 					 strerror(errno));
2198 			return (ISC_R_UNEXPECTED);
2199 		}
2200 	}
2201 
2202 	socket_log(sock, sockaddr, TRACE, "bound");
2203 	sock->bound = 1;
2204 
2205 	return (ISC_R_SUCCESS);
2206 }
2207 
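/*
 * Start a connect.  connect(2) is attempted immediately: an attempt
 * that completes or fails with a recognized error posts the CONNECT
 * event right away, while a soft error or EINPROGRESS queues the
 * request so internal_connect() below can finish it once the fd
 * becomes writable.
 */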
2208 isc_result_t
2209 isc_socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
2210 		   isc_task_t *task, isc_taskaction_t action, void *arg)
2211 {
2212 	isc_socket_t *sock = (isc_socket_t *)sock0;
2213 	isc_socket_connev_t *dev;
2214 	isc_task_t *ntask = NULL;
2215 	isc_socketmgr_t *manager;
2216 	int cc;
2217 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
2218 
2219 	REQUIRE(addr != NULL);
2220 	REQUIRE(task != NULL);
2221 	REQUIRE(action != NULL);
2222 
2223 	manager = sock->manager;
2224 	REQUIRE(addr != NULL);
2225 
2226 	if (isc_sockaddr_ismulticast(addr))
2227 		return (ISC_R_MULTICAST);
2228 
2229 	REQUIRE(!sock->connecting);
2230 
2231 	dev = (isc_socket_connev_t *)isc_event_allocate(sock,
2232 							ISC_SOCKEVENT_CONNECT,
2233 							action,	arg,
2234 							sizeof(*dev));
2235 	if (dev == NULL) {
2236 		return (ISC_R_NOMEMORY);
2237 	}
2238 	ISC_LINK_INIT(dev, ev_link);
2239 
2240 	/*
2241 	 * Try to do the connect right away, as there can be only one
2242 	 * outstanding, and it might happen to complete.
2243 	 */
2244 	sock->peer_address = *addr;
2245 	cc = connect(sock->fd, &addr->type.sa, addr->length);
2246 	if (cc < 0) {
2247 		/*
2248 		 * HP-UX "fails" to connect a UDP socket and sets errno to
2249 		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
2250 		 * a success and let the user detect any real error when a
2251 		 * packet is actually sent on the socket.
2252 		 */
2253 		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
2254 			cc = 0;
2255 			goto success;
2256 		}
2257 		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
2258 			goto queue;
2259 
2260 		switch (errno) {
2261 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
2262 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2263 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2264 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2265 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2266 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2267 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2268 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2269 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2270 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2271 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2272 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2273 #undef ERROR_MATCH
2274 		}
2275 
2276 		sock->connected = 0;
2277 
2278 		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
2279 		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
2280 				 addrbuf, errno, strerror(errno));
2281 
2282 		isc_event_free(ISC_EVENT_PTR(&dev));
2283 		return (ISC_R_UNEXPECTED);
2284 
2285 	err_exit:
2286 		sock->connected = 0;
2287 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2288 
2289 		return (ISC_R_SUCCESS);
2290 	}
2291 
2292 	/*
2293 	 * If connect completed, fire off the done event.
2294 	 */
2295  success:
2296 	if (cc == 0) {
2297 		sock->connected = 1;
2298 		sock->bound = 1;
2299 		dev->result = ISC_R_SUCCESS;
2300 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2301 
2302 		return (ISC_R_SUCCESS);
2303 	}
2304 
2305  queue:
2306 
2307 	/*
2308 	 * Attach to task.
2309 	 */
2310 	isc_task_attach(task, &ntask);
2311 
2312 	sock->connecting = 1;
2313 
2314 	dev->ev_sender = ntask;
2315 
2316 	/*
2317 	 * Poke watcher here.  We still have the socket locked, so there
2318 	 * is no race condition.  We will hold the lock for such a short
2319 	 * time that waking it up now or later won't matter all that much.
2320 	 */
2321 	if (sock->connect_ev == NULL)
2322 		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
2323 
2324 	sock->connect_ev = dev;
2325 
2326 	return (ISC_R_SUCCESS);
2327 }
2328 
2329 /*
2330  * Called when a socket with a pending connect() finishes.
2331  */
2332 static void
2333 internal_connect(isc_task_t *me, isc_event_t *ev) {
2334 	isc_socket_t *sock;
2335 	isc_socket_connev_t *dev;
2336 	isc_task_t *task;
2337 	int cc;
2338 	socklen_t optlen;
2339 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
2340 
2341 	UNUSED(me);
2342 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2343 
2344 	sock = ev->ev_sender;
2345 
2346 	/*
2347 	 * When the internal event was sent, the reference count was bumped
2348 	 * to keep the socket around for us.  Decrement the count here.
2349 	 */
2350 	INSIST(sock->references > 0);
2351 	sock->references--;
2352 	if (sock->references == 0) {
2353 		destroy(&sock);
2354 		return;
2355 	}
2356 
2357 	/*
2358 	 * Has this event been canceled?
2359 	 */
2360 	dev = sock->connect_ev;
2361 	if (dev == NULL) {
2362 		INSIST(!sock->connecting);
2363 		return;
2364 	}
2365 
2366 	INSIST(sock->connecting);
2367 	sock->connecting = 0;
2368 
2369 	/*
2370 	 * Get any possible error status here.
2371 	 */
2372 	optlen = sizeof(cc);
2373 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
2374 		       (void *)&cc, (void *)&optlen) < 0)
2375 		cc = errno;
2376 	else
2377 		errno = cc;
2378 
2379 	if (errno != 0) {
2380 		/*
2381 		 * If the error is EAGAIN, just re-select on this
2382 		 * fd and pretend nothing strange happened.
2383 		 */
2384 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
2385 			sock->connecting = 1;
2386 			select_poke(sock->manager, sock->fd,
2387 				    SELECT_POKE_CONNECT);
2388 			return;
2389 		}
2390 
2391 
2392 		/*
2393 		 * Translate other errors into ISC_R_* flavors.
2394 		 */
2395 		switch (errno) {
2396 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
2397 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2398 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2399 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2400 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2401 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2402 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2403 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2404 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2405 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2406 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2407 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
2408 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2409 #undef ERROR_MATCH
2410 		default:
2411 			dev->result = ISC_R_UNEXPECTED;
2412 			isc_sockaddr_format(&sock->peer_address, peerbuf,
2413 					    sizeof(peerbuf));
2414 			UNEXPECTED_ERROR(__FILE__, __LINE__,
2415 					 "internal_connect: connect(%s) %s",
2416 					 peerbuf, strerror(errno));
2417 		}
2418 	} else {
2419 		dev->result = ISC_R_SUCCESS;
2420 		sock->connected = 1;
2421 		sock->bound = 1;
2422 	}
2423 
2424 	sock->connect_ev = NULL;
2425 
2426 	task = dev->ev_sender;
2427 	dev->ev_sender = sock;
2428 	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2429 }
2430 
2431 /*
2432  * Run through the list of events on this socket, and cancel the ones
2433  * queued for task "task" of type "how".  "how" is a bitmask.
2434  */
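/*
 * For example, a typical teardown cancels everything for every task
 * (a sketch; task == NULL matches all tasks):
 *
 *	isc_socket_cancel(sock, NULL,
 *			  ISC_SOCKCANCEL_RECV | ISC_SOCKCANCEL_SEND |
 *			  ISC_SOCKCANCEL_CONNECT);
 */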
2435 void
2436 isc_socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
2437 	isc_socket_t *sock = (isc_socket_t *)sock0;
2438 
2439 	/*
2440 	 * Quick exit if there is nothing to do.  Don't even bother locking
2441 	 * in this case.
2442 	 */
2443 	if (how == 0)
2444 		return;
2445 
2446 	/*
2447 	 * All of these do the same thing, more or less.
2448 	 * Each will:
2449 	 *	o If the internal event is marked as "posted" try to
2450 	 *	  remove it from the task's queue.  If this fails, mark it
2451 	 *	  as canceled instead, and let the task clean it up later.
2452 	 *	o For each I/O request for that task of that type, post
2453 	 *	  its done event with status of "ISC_R_CANCELED".
2454 	 *	o Reset any state needed.
2455 	 */
2456 	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
2457 	    && !ISC_LIST_EMPTY(sock->recv_list)) {
2458 		isc_socketevent_t      *dev;
2459 		isc_socketevent_t      *next;
2460 		isc_task_t	       *current_task;
2461 
2462 		dev = ISC_LIST_HEAD(sock->recv_list);
2463 
2464 		while (dev != NULL) {
2465 			current_task = dev->ev_sender;
2466 			next = ISC_LIST_NEXT(dev, ev_link);
2467 
2468 			if ((task == NULL) || (task == current_task)) {
2469 				dev->result = ISC_R_CANCELED;
2470 				send_recvdone_event(sock, &dev);
2471 			}
2472 			dev = next;
2473 		}
2474 	}
2475 
2476 	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
2477 	    && !ISC_LIST_EMPTY(sock->send_list)) {
2478 		isc_socketevent_t      *dev;
2479 		isc_socketevent_t      *next;
2480 		isc_task_t	       *current_task;
2481 
2482 		dev = ISC_LIST_HEAD(sock->send_list);
2483 
2484 		while (dev != NULL) {
2485 			current_task = dev->ev_sender;
2486 			next = ISC_LIST_NEXT(dev, ev_link);
2487 
2488 			if ((task == NULL) || (task == current_task)) {
2489 				dev->result = ISC_R_CANCELED;
2490 				send_senddone_event(sock, &dev);
2491 			}
2492 			dev = next;
2493 		}
2494 	}
2495 
2496 	/*
2497 	 * Connecting is not a list.
2498 	 */
2499 	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
2500 	    && sock->connect_ev != NULL) {
2501 		isc_socket_connev_t    *dev;
2502 		isc_task_t	       *current_task;
2503 
2504 		INSIST(sock->connecting);
2505 		sock->connecting = 0;
2506 
2507 		dev = sock->connect_ev;
2508 		current_task = dev->ev_sender;
2509 
2510 		if ((task == NULL) || (task == current_task)) {
2511 			sock->connect_ev = NULL;
2512 
2513 			dev->result = ISC_R_CANCELED;
2514 			dev->ev_sender = sock;
2515 			isc_task_sendanddetach(&current_task,
2516 					       ISC_EVENT_PTR(&dev));
2517 		}
2518 	}
2519 
2520 }
2521 
2522 /*
2523  * In our assumed scenario, we can simply use a single static object.
2524  * XXX: this is not true if the application uses multiple threads with
2525  *      'multi-context' mode.  Fixing this is a future TODO item.
2526  */
2527 static isc_socketwait_t swait_private;
2528 
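/*
 * The two functions below let an external loop drive the socket
 * manager.  A sketch of the expected calling pattern (the real driver
 * lives in the caller):
 *
 *	isc_socketwait_t *swait = NULL;
 *	struct timeval tv = { 5, 0 };
 *	int n;
 *
 *	n = isc_socketmgr_waitevents(NULL, &tv, &swait);
 *	if (n > 0)
 *		(void)isc_socketmgr_dispatch(NULL, swait);
 *
 * Passing NULL for the manager falls back to the global socketmgr;
 * waitevents() select()s on copies of the manager's fd sets and
 * dispatch() then processes whatever became ready.
 */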
2529 int
2530 isc_socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
2531 			  isc_socketwait_t **swaitp)
2532 {
2533 	isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0;
2534 	int n;
2535 
2536 	REQUIRE(swaitp != NULL && *swaitp == NULL);
2537 
2538 	if (manager == NULL)
2539 		manager = socketmgr;
2540 	if (manager == NULL)
2541 		return (0);
2542 
2543 	memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize);
2544 	memmove(manager->write_fds_copy, manager->write_fds,
2545 		manager->fd_bufsize);
2546 
2547 	swait_private.readset = manager->read_fds_copy;
2548 	swait_private.writeset = manager->write_fds_copy;
2549 	swait_private.maxfd = manager->maxfd + 1;
2550 
2551 	n = select(swait_private.maxfd, swait_private.readset,
2552 		   swait_private.writeset, NULL, tvp);
2553 
2554 	*swaitp = &swait_private;
2555 	return (n);
2556 }
2557 
2558 isc_result_t
2559 isc_socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
2560 	isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0;
2561 
2562 	REQUIRE(swait == &swait_private);
2563 
2564 	if (manager == NULL)
2565 		manager = socketmgr;
2566 	if (manager == NULL)
2567 		return (ISC_R_NOTFOUND);
2568 
2569 	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
2570 	return (ISC_R_SUCCESS);
2571 }
2572