xref: /openbsd-src/usr.bin/dig/lib/isc/unix/socket.c (revision e49c1783d6f57cc56e4b172757e18537f08f2de8)
1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
9  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
11  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
13  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14  * PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 /*! \file */
18 
19 #include <sys/socket.h>
20 #include <sys/time.h>
21 #include <sys/uio.h>
22 
23 #include <netinet/tcp.h>
24 
25 #include <errno.h>
26 #include <fcntl.h>
27 #include <stddef.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <unistd.h>
31 
32 #include <isc/buffer.h>
33 #include <isc/bufferlist.h>
34 #include <isc/formatcheck.h>
35 #include <isc/list.h>
36 #include <isc/log.h>
37 #include <isc/net.h>
38 #include <isc/region.h>
39 #include <isc/socket.h>
40 #include <isc/strerror.h>
41 #include <isc/task.h>
42 #include <isc/util.h>
43 
44 #include "errno2result.h"
45 
46 #include "socket_p.h"
47 #include "../task_p.h"
48 
49 struct isc_socketwait {
50 	fd_set *readset;
51 	fd_set *writeset;
52 	int nfds;
53 	int maxfd;
54 };
55 
56 /*
57  * Set by the -T dscp option on the command line. If set to a value
58  * other than -1, we check to make sure DSCP values match it, and
59  * assert if not.
60  */
61 int isc_dscp_check_value = -1;
62 
63 /*%
64  * Number of per-FD lock buckets.
65  */
66 #define FDLOCK_COUNT		1
67 #define FDLOCK_ID(fd)		0
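
/*
 * With FDLOCK_COUNT set to 1, FDLOCK_ID() always maps to bucket 0, so the
 * per-FD lock partitioning is effectively a no-op in this code.
 */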
68 
69 /*%
70  * Some systems define the socket length argument as an int, some as size_t,
71  * some as socklen_t.  This code uses socklen_t directly.
72  */
73 
74 /*%
75  * Define what the possible "soft" errors can be.  These are non-fatal returns
76  * of various network related functions, like recv() and so on.
77  *
78  * For some reason, BSDI (and perhaps others) will sometimes return <0
79  * from recv() but will have errno==0.  This is broken, but we have to
80  * work around it here.
81  */
82 #define SOFT_ERROR(e)	((e) == EAGAIN || \
83 			 (e) == EWOULDBLOCK || \
84 			 (e) == EINTR || \
85 			 (e) == 0)
86 
87 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
88 
89 /*!<
90  * DLVL(90)  --  Function entry/exit and other tracing.
91  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
92  * DLVL(60)  --  Socket data send/receive
93  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
94  * DLVL(20)  --  Socket creation/destruction.
95  */
96 #define TRACE_LEVEL		90
97 #define CORRECTNESS_LEVEL	70
98 #define IOEVENT_LEVEL		60
99 #define EVENT_LEVEL		50
100 #define CREATION_LEVEL		20
101 
102 #define TRACE		DLVL(TRACE_LEVEL)
103 #define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
104 #define IOEVENT		DLVL(IOEVENT_LEVEL)
105 #define EVENT		DLVL(EVENT_LEVEL)
106 #define CREATION	DLVL(CREATION_LEVEL)
107 
108 typedef isc_event_t intev_t;
109 
110 #define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
111 #define VALID_SOCKET(s)		ISC_MAGIC_VALID(s, SOCKET_MAGIC)
112 
113 /*!
114  * IPv6 control information.  If the socket is an IPv6 socket we want
115  * to collect the destination address and interface so the client can
116  * set them on outgoing packets.
117  */
118 
119 /*%
120  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
121  * a setsockopt() like interface to request timestamps, and if the OS
122  * doesn't do it for us, call gettimeofday() on every UDP receive?
123  */
124 
125 /*%
126  * The size to raise the receive buffer to (from BIND 8).
127  */
128 #define RCVBUFSIZE (32*1024)
129 
130 /*%
131  * Instead of calculating the cmsgbuf lengths every time, we take a
132  * rule-of-thumb approach: the sizes below were taken from x86_64 Linux
133  * and then doubled, so everything should fit.  The resulting buffers are
134  * small enough not to be a concern.
135  */
136 #define CMSG_SP_IN6PKT 40
137 
138 #define CMSG_SP_TIMESTAMP 32
139 
140 #define CMSG_SP_TCTOS 24
141 
142 #define CMSG_SP_INT 24
143 
144 #define RECVCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS)+1)
145 #define SENDCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS)+1)
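
/*%
 * For reference (informal, not used by the code): on x86_64 Linux the
 * constants above correspond roughly to
 *
 *	CMSG_SP_IN6PKT    ~ CMSG_SPACE(sizeof(struct in6_pktinfo))  (16 + 24)
 *	CMSG_SP_TIMESTAMP ~ CMSG_SPACE(sizeof(struct timeval))      (16 + 16)
 *	CMSG_SP_TCTOS     ~ CMSG_SPACE(sizeof(int))                 (16 + 8)
 *	CMSG_SP_INT       ~ CMSG_SPACE(sizeof(int))                 (16 + 8)
 *
 * so the doubled RECVCMSGBUFLEN/SENDCMSGBUFLEN values leave ample slack
 * on other platforms as well.
 */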
146 
147 /*%
148  * The number of times a send operation is repeated if the result is EINTR.
149  */
150 #define NRETRIES 10
151 
152 typedef struct isc__socket isc__socket_t;
153 typedef struct isc__socketmgr isc__socketmgr_t;
154 
155 #define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)
156 
157 struct isc__socket {
158 	/* Not locked. */
159 	isc_socket_t		common;
160 	isc__socketmgr_t	*manager;
161 	isc_sockettype_t	type;
162 
163 	/* Locked by socket lock. */
164 	ISC_LINK(isc__socket_t)	link;
165 	unsigned int		references;
166 	int			fd;
167 	int			pf;
168 
169 	ISC_LIST(isc_socketevent_t)		send_list;
170 	ISC_LIST(isc_socketevent_t)		recv_list;
171 	isc_socket_connev_t		       *connect_ev;
172 
173 	/*
174 	 * Internal events.  Posted when a descriptor is readable or
175 	 * writable.  These are statically allocated and never freed.
176 	 * They will be set to non-purgeable before use.
177 	 */
178 	intev_t			readable_ev;
179 	intev_t			writable_ev;
180 
181 	isc_sockaddr_t		peer_address;       /* remote address */
182 
183 	unsigned int		pending_recv : 1,
184 				pending_send : 1,
185 				connected : 1,
186 				connecting : 1,     /* connect pending */
187 				bound : 1,          /* bound to local addr */
188 				active : 1,         /* currently active */
189 				pktdscp : 1;	    /* per packet dscp */
190 	unsigned int		dscp;
191 };
192 
193 #define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
194 #define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
195 
196 struct isc__socketmgr {
197 	/* Not locked. */
198 	isc_socketmgr_t		common;
199 	int			fd_bufsize;
200 	unsigned int		maxsocks;
201 
202 	isc__socket_t	       **fds;
203 	int			*fdstate;
204 
205 	/* Locked by manager lock. */
206 	ISC_LIST(isc__socket_t)	socklist;
207 	fd_set			*read_fds;
208 	fd_set			*read_fds_copy;
209 	fd_set			*write_fds;
210 	fd_set			*write_fds_copy;
211 	int			maxfd;
212 	unsigned int		refs;
213 };
214 
215 static isc__socketmgr_t *socketmgr = NULL;
216 
217 #define CLOSED			0	/* this one must be zero */
218 #define MANAGED			1
219 #define CLOSE_PENDING		2
220 
221 /*
222  * send() and recv() iovec counts
223  */
224 #define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
225 #define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
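
/*
 * ISC_SOCKET_MAXSCATTERGATHER is defined in <isc/socket.h>; it bounds the
 * iovec arrays that build_msghdr_send()/build_msghdr_recv() fill in below.
 */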
226 
227 static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
228 				  isc_sockettype_t type,
229 				  isc_socket_t **socketp);
230 static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
231 static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
232 static void free_socket(isc__socket_t **);
233 static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
234 				    isc__socket_t **);
235 static void destroy(isc__socket_t **);
236 static void internal_connect(isc_task_t *, isc_event_t *);
237 static void internal_recv(isc_task_t *, isc_event_t *);
238 static void internal_send(isc_task_t *, isc_event_t *);
239 static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
240 static void build_msghdr_send(isc__socket_t *, char *, isc_socketevent_t *,
241 			      struct msghdr *, struct iovec *, size_t *);
242 static void build_msghdr_recv(isc__socket_t *, char *, isc_socketevent_t *,
243 			      struct msghdr *, struct iovec *, size_t *);
244 
245 /*%
246  * The following are intended for internal use (indicated by "isc__"
247  * prefix) but are not declared as static, allowing direct access from
248  * unit tests etc.
249  */
250 
251 isc_result_t
252 isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
253 		   isc_socket_t **socketp);
254 void
255 isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
256 void
257 isc__socket_detach(isc_socket_t **socketp);
258 isc_result_t
259 isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
260 		  unsigned int minimum, isc_task_t *task,
261 		  isc_taskaction_t action, void *arg);
262 isc_result_t
263 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
264 		  isc_task_t *task, isc_taskaction_t action, void *arg);
265 isc_result_t
266 isc__socket_sendtov2(isc_socket_t *sock, isc_bufferlist_t *buflist,
267 		     isc_task_t *task, isc_taskaction_t action, void *arg,
268 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
269 		     unsigned int flags);
270 isc_result_t
271 isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
272 		 unsigned int options);
273 isc_result_t
274 isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
275 		    isc_task_t *task, isc_taskaction_t action,
276 		    void *arg);
277 void
278 isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
279 
280 isc_result_t
281 isc__socketmgr_create(isc_socketmgr_t **managerp);
282 isc_result_t
283 isc__socketmgr_create2(isc_socketmgr_t **managerp,
284 		       unsigned int maxsocks);
285 isc_result_t
286 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp);
287 void
288 isc__socketmgr_destroy(isc_socketmgr_t **managerp);
289 
290 static struct {
291 	isc_socketmethods_t methods;
292 
293 	/*%
294 	 * The following exist only to avoid warnings about unused functions.
295 	 */
296 	void *recvv, *sendv;
297 } socketmethods = {
298 	{
299 		isc__socket_attach,
300 		isc__socket_detach,
301 		isc__socket_bind,
302 		isc__socket_connect,
303 		isc__socket_cancel,
304 	},
305 	(void *)isc__socket_recvv,
306 	(void *)isc__socket_sendv,
307 };
308 
309 static isc_socketmgrmethods_t socketmgrmethods = {
310 	isc__socketmgr_destroy,
311 	isc__socket_create
312 };
313 
314 #define SELECT_POKE_SHUTDOWN		(-1)
315 #define SELECT_POKE_NOTHING		(-2)
316 #define SELECT_POKE_READ		(-3)
317 #define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
318 #define SELECT_POKE_WRITE		(-4)
319 #define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
320 #define SELECT_POKE_CLOSE		(-5)
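
/*
 * Informal usage sketch (see select_poke() and wakeup_socket() below):
 * to start watching a descriptor for reading, the code does
 *
 *	select_poke(manager, fd, SELECT_POKE_READ);
 *
 * and to schedule a close it does
 *
 *	select_poke(manager, fd, SELECT_POKE_CLOSE);
 *
 * which wakeup_socket() turns into unwatch_fd() calls plus close(fd).
 */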
321 
322 #define SOCK_DEAD(s)			((s)->references == 0)
323 
324 /*%
325  * Shortcut index arrays to get access to statistics counters.
326  */
327 enum {
328 	STATID_OPEN = 0,
329 	STATID_OPENFAIL = 1,
330 	STATID_CLOSE = 2,
331 	STATID_BINDFAIL = 3,
332 	STATID_CONNECTFAIL = 4,
333 	STATID_CONNECT = 5,
334 	STATID_ACCEPTFAIL = 6,
335 	STATID_ACCEPT = 7,
336 	STATID_SENDFAIL = 8,
337 	STATID_RECVFAIL = 9,
338 	STATID_ACTIVE = 10
339 };
340 
341 
342 static void
343 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
344 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
345 	   const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);
346 static void
347 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
348 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
349 	   const char *fmt, ...)
350 {
351 	char msgbuf[2048];
352 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
353 	va_list ap;
354 
355 	if (! isc_log_wouldlog(isc_lctx, level))
356 		return;
357 
358 	va_start(ap, fmt);
359 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
360 	va_end(ap);
361 
362 	if (address == NULL) {
363 		isc_log_write(isc_lctx, category, module, level,
364 			       "socket %p: %s", sock, msgbuf);
365 	} else {
366 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
367 		isc_log_write(isc_lctx, category, module, level,
368 			       "socket %p %s: %s", sock, peerbuf, msgbuf);
369 	}
370 }
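
/*
 * Note: the category/module/level triple is normally supplied through the
 * TRACE/CORRECTNESS/IOEVENT/EVENT/CREATION macros above, e.g. (informal):
 *
 *	socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp);
 *
 * which expands the DLVL() triple into the three middle arguments.
 */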
371 
372 static inline isc_result_t
373 watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
374 	isc_result_t result = ISC_R_SUCCESS;
375 
376 	if (msg == SELECT_POKE_READ)
377 		FD_SET(fd, manager->read_fds);
378 	if (msg == SELECT_POKE_WRITE)
379 		FD_SET(fd, manager->write_fds);
380 
381 	return (result);
382 }
383 
384 static inline isc_result_t
385 unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
386 	isc_result_t result = ISC_R_SUCCESS;
387 
388 	if (msg == SELECT_POKE_READ)
389 		FD_CLR(fd, manager->read_fds);
390 	else if (msg == SELECT_POKE_WRITE)
391 		FD_CLR(fd, manager->write_fds);
392 
393 	return (result);
394 }
395 
396 static void
397 wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
398 	isc_result_t result;
399 
400 	/*
401 	 * This is a wakeup on a socket.  If the socket is not in the
402 	 * process of being closed, start watching it for either reads
403 	 * or writes.
404 	 */
405 
406 	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
407 
408 	if (msg == SELECT_POKE_CLOSE) {
409 		/* No one should be updating fdstate, so no need to lock it */
410 		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
411 		manager->fdstate[fd] = CLOSED;
412 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
413 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
414 		(void)close(fd);
415 		return;
416 	}
417 
418 	if (manager->fdstate[fd] == CLOSE_PENDING) {
419 
420 		/*
421 		 * We accept (and ignore) any error from unwatch_fd() as we are
422 		 * closing the socket, hoping it doesn't leave dangling state in
423 		 * the kernel.
424 		 */
425 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
426 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
427 		return;
428 	}
429 	if (manager->fdstate[fd] != MANAGED) {
430 		return;
431 	}
432 
433 	/*
434 	 * Set requested bit.
435 	 */
436 	result = watch_fd(manager, fd, msg);
437 	if (result != ISC_R_SUCCESS) {
438 		/*
439 		 * XXXJT: what should we do?  Ignoring the failure of watching
440 		 * a socket will make the application dysfunctional, but there
441 		 * seems to be no reasonable recovery process.
442 		 */
443 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
444 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
445 			      "failed to start watching FD (%d): %s",
446 			      fd, isc_result_totext(result));
447 	}
448 }
449 
450 /*
451  * Update the state of the socketmgr when something changes.
452  */
453 static void
454 select_poke(isc__socketmgr_t *manager, int fd, int msg) {
455 	if (msg == SELECT_POKE_SHUTDOWN)
456 		return;
457 	else if (fd >= 0)
458 		wakeup_socket(manager, fd, msg);
459 	return;
460 }
461 
462 /*
463  * Make a fd non-blocking.
464  */
465 static isc_result_t
466 make_nonblock(int fd) {
467 	int ret;
468 	char strbuf[ISC_STRERRORSIZE];
469 	int flags;
470 
471 	flags = fcntl(fd, F_GETFL, 0);
472 	flags |= O_NONBLOCK;
473 	ret = fcntl(fd, F_SETFL, flags);
474 
475 	if (ret == -1) {
476 		isc__strerror(errno, strbuf, sizeof(strbuf));
477 		UNEXPECTED_ERROR(__FILE__, __LINE__,
478 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
479 				 strbuf);
480 
481 		return (ISC_R_UNEXPECTED);
482 	}
483 
484 	return (ISC_R_SUCCESS);
485 }
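
/*
 * make_nonblock() is applied to every descriptor right after socket()
 * succeeds (see opensocket()); all I/O in this file therefore assumes
 * non-blocking semantics and treats EWOULDBLOCK/EAGAIN as soft errors
 * (see SOFT_ERROR()).
 */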
486 
487 /*
488  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
489  * To keep this code as portable as possible, we provide wrappers for
490  * these macros.
491  * Note that cmsg_space() can be slow on OSes that do not have
492  * CMSG_SPACE.
493  */
494 static inline socklen_t
495 cmsg_len(socklen_t len) {
496 	return (CMSG_LEN(len));
497 }
498 
499 static inline socklen_t
500 cmsg_space(socklen_t len) {
501 	return (CMSG_SPACE(len));
502 }
503 
504 /*
505  * Process control messages received on a socket.
506  */
507 static void
508 process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
509 	struct cmsghdr *cmsgp;
510 	struct in6_pktinfo *pktinfop;
511 	void *timevalp;
512 
513 	/*
514 	 * In the original ISC code, sock, msg and dev were referenced only
515 	 * under CPP conditionals (ISC_NET_BSD44MSGHDR, USE_CMSG).  Those
516 	 * conditionals are gone here, so all three arguments are used
517 	 * unconditionally and the UNUSED() calls below are harmless leftovers.
518 	 */
519 	UNUSED(sock);
520 	UNUSED(msg);
521 	UNUSED(dev);
522 
523 	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
524 		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
525 
526 	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
527 		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
528 
529 	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
530 		return;
531 
532 	timevalp = NULL;
533 	pktinfop = NULL;
534 
535 	cmsgp = CMSG_FIRSTHDR(msg);
536 	while (cmsgp != NULL) {
537 		socket_log(sock, NULL, TRACE,
538 			   "processing cmsg %p", cmsgp);
539 
540 		if (cmsgp->cmsg_level == IPPROTO_IPV6
541 		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
542 
543 			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
544 			memmove(&dev->pktinfo, pktinfop,
545 				sizeof(struct in6_pktinfo));
546 			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
547 			socket_log(sock, NULL, TRACE,
548 				   "interface received on ifindex %u",
549 				   dev->pktinfo.ipi6_ifindex);
550 			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
551 				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
552 			goto next;
553 		}
554 
555 		if (cmsgp->cmsg_level == SOL_SOCKET
556 		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
557 			struct timeval tv;
558 			timevalp = CMSG_DATA(cmsgp);
559 			memmove(&tv, timevalp, sizeof(tv));
560 			dev->timestamp.seconds = tv.tv_sec;
561 			dev->timestamp.nanoseconds = tv.tv_usec * 1000;
562 			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
563 			goto next;
564 		}
565 
566 		if (cmsgp->cmsg_level == IPPROTO_IPV6
567 		    && cmsgp->cmsg_type == IPV6_TCLASS) {
568 			dev->dscp = *(int *)CMSG_DATA(cmsgp);
569 			dev->dscp >>= 2;
570 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
571 			goto next;
572 		}
573 
574 		if (cmsgp->cmsg_level == IPPROTO_IP
575 		    && (cmsgp->cmsg_type == IP_TOS)) {
576 			dev->dscp = (int) *(unsigned char *)CMSG_DATA(cmsgp);
577 			dev->dscp >>= 2;
578 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
579 			goto next;
580 		}
581 	next:
582 		cmsgp = CMSG_NXTHDR(msg, cmsgp);
583 	}
584 
585 }
586 
587 /*
588  * Construct an iov array and attach it to the msghdr passed in.  This is
589  * the SEND constructor, which will use the used region of the buffer
590  * (if using a buffer list) or will use the internal region (if a single
591  * buffer I/O is requested).
592  *
593  * Nothing can be NULL, and the done event must list at least one buffer
594  * on the buffer linked list for this function to be meaningful.
595  *
596  * If write_countp != NULL, *write_countp will hold the number of bytes
597  * this transaction can send.
598  */
599 static void
600 build_msghdr_send(isc__socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
601 		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
602 {
603 	unsigned int iovcount;
604 	isc_buffer_t *buffer;
605 	isc_region_t used;
606 	size_t write_count;
607 	size_t skip_count;
608 	struct cmsghdr *cmsgp;
609 
610 	memset(msg, 0, sizeof(*msg));
611 
612 	if (!sock->connected) {
613 		msg->msg_name = (void *)&dev->address.type.sa;
614 		msg->msg_namelen = dev->address.length;
615 	} else {
616 		msg->msg_name = NULL;
617 		msg->msg_namelen = 0;
618 	}
619 
620 	buffer = ISC_LIST_HEAD(dev->bufferlist);
621 	write_count = 0;
622 	iovcount = 0;
623 
624 	/*
625 	 * Single buffer I/O?  Skip what we've done so far in this region.
626 	 */
627 	if (buffer == NULL) {
628 		write_count = dev->region.length - dev->n;
629 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
630 		iov[0].iov_len = write_count;
631 		iovcount = 1;
632 
633 		goto config;
634 	}
635 
636 	/*
637 	 * Multibuffer I/O.
638 	 * Skip the data in the buffer list that we have already written.
639 	 */
640 	skip_count = dev->n;
641 	while (buffer != NULL) {
642 		REQUIRE(ISC_BUFFER_VALID(buffer));
643 		if (skip_count < isc_buffer_usedlength(buffer))
644 			break;
645 		skip_count -= isc_buffer_usedlength(buffer);
646 		buffer = ISC_LIST_NEXT(buffer, link);
647 	}
648 
649 	while (buffer != NULL) {
650 		INSIST(iovcount < MAXSCATTERGATHER_SEND);
651 
652 		isc_buffer_usedregion(buffer, &used);
653 
654 		if (used.length > 0) {
655 			iov[iovcount].iov_base = (void *)(used.base
656 							  + skip_count);
657 			iov[iovcount].iov_len = used.length - skip_count;
658 			write_count += (used.length - skip_count);
659 			skip_count = 0;
660 			iovcount++;
661 		}
662 		buffer = ISC_LIST_NEXT(buffer, link);
663 	}
664 
665 	INSIST(skip_count == 0U);
666 
667  config:
668 	msg->msg_iov = iov;
669 	msg->msg_iovlen = iovcount;
670 
671 	msg->msg_control = NULL;
672 	msg->msg_controllen = 0;
673 	msg->msg_flags = 0;
674 
675 	if ((sock->type == isc_sockettype_udp) &&
676 	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
677 	{
678 		struct in6_pktinfo *pktinfop;
679 
680 		socket_log(sock, NULL, TRACE,
681 			   "sendto pktinfo data, ifindex %u",
682 			   dev->pktinfo.ipi6_ifindex);
683 
684 		msg->msg_control = (void *)cmsgbuf;
685 		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
686 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
687 
688 		cmsgp = (struct cmsghdr *)cmsgbuf;
689 		cmsgp->cmsg_level = IPPROTO_IPV6;
690 		cmsgp->cmsg_type = IPV6_PKTINFO;
691 		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
692 		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
693 		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
694 	}
695 
696 	if ((sock->type == isc_sockettype_udp) &&
697 	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
698 	{
699 		int use_min_mtu = 1;	/* -1, 0, 1 */
700 
701 		cmsgp = (struct cmsghdr *)(cmsgbuf +
702 					   msg->msg_controllen);
703 
704 		msg->msg_control = (void *)cmsgbuf;
705 		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
706 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
707 
708 		cmsgp->cmsg_level = IPPROTO_IPV6;
709 		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
710 		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
711 		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
712 	}
713 
714 	if (isc_dscp_check_value > -1) {
715 		if (sock->type == isc_sockettype_udp)
716 			INSIST((int)dev->dscp == isc_dscp_check_value);
717 		else if (sock->type == isc_sockettype_tcp)
718 			INSIST((int)sock->dscp == isc_dscp_check_value);
719 	}
720 
721 	if ((sock->type == isc_sockettype_udp) &&
722 	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
723 	{
724 		int dscp = (dev->dscp << 2) & 0xff;
725 
726 		INSIST(dev->dscp < 0x40);
727 
728 		if (sock->pf == AF_INET && sock->pktdscp) {
729 			cmsgp = (struct cmsghdr *)(cmsgbuf +
730 						   msg->msg_controllen);
731 			msg->msg_control = (void *)cmsgbuf;
732 			msg->msg_controllen += cmsg_space(sizeof(dscp));
733 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
734 
735 			cmsgp->cmsg_level = IPPROTO_IP;
736 			cmsgp->cmsg_type = IP_TOS;
737 			cmsgp->cmsg_len = cmsg_len(sizeof(char));
738 			*(unsigned char*)CMSG_DATA(cmsgp) = dscp;
739 		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
740 			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
741 			       (void *)&dscp, sizeof(int)) < 0)
742 			{
743 				char strbuf[ISC_STRERRORSIZE];
744 				isc__strerror(errno, strbuf, sizeof(strbuf));
745 				UNEXPECTED_ERROR(__FILE__, __LINE__,
746 						 "setsockopt(%d, IP_TOS, %.02x)"
747 						 " %s: %s",
748 						 sock->fd, dscp >> 2,
749 						 "failed", strbuf);
750 			} else
751 				sock->dscp = dscp;
752 		}
753 
754 		if (sock->pf == AF_INET6 && sock->pktdscp) {
755 			cmsgp = (struct cmsghdr *)(cmsgbuf +
756 						   msg->msg_controllen);
757 			msg->msg_control = (void *)cmsgbuf;
758 			msg->msg_controllen += cmsg_space(sizeof(dscp));
759 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
760 
761 			cmsgp->cmsg_level = IPPROTO_IPV6;
762 			cmsgp->cmsg_type = IPV6_TCLASS;
763 			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
764 			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
765 		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
766 			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
767 				       (void *)&dscp, sizeof(int)) < 0) {
768 				char strbuf[ISC_STRERRORSIZE];
769 				isc__strerror(errno, strbuf, sizeof(strbuf));
770 				UNEXPECTED_ERROR(__FILE__, __LINE__,
771 						 "setsockopt(%d, IPV6_TCLASS, "
772 						 "%.02x) %s: %s",
773 						 sock->fd, dscp >> 2,
774 						 "failed", strbuf);
775 			} else
776 				sock->dscp = dscp;
777 		}
778 
779 		if (msg->msg_controllen != 0 &&
780 		    msg->msg_controllen < SENDCMSGBUFLEN)
781 		{
782 			memset(cmsgbuf + msg->msg_controllen, 0,
783 			       SENDCMSGBUFLEN - msg->msg_controllen);
784 		}
785 	}
786 
787 	if (write_countp != NULL)
788 		*write_countp = write_count;
789 }
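
/*
 * Illustration (informal): for a UDP send with both PKTINFO and a
 * per-packet DSCP value, the control buffer built above ends up as two
 * consecutive cmsgs,
 *
 *	[cmsghdr IPV6_PKTINFO | in6_pktinfo][cmsghdr IPV6_TCLASS | int]
 *
 * with msg_controllen equal to the sum of the two cmsg_space() values,
 * which is asserted to fit within SENDCMSGBUFLEN.
 */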
790 
791 /*
792  * Construct an iov array and attach it to the msghdr passed in.  This is
793  * the RECV constructor, which will use the available region of the buffer
794  * (if using a buffer list) or will use the internal region (if a single
795  * buffer I/O is requested).
796  *
797  * Nothing can be NULL, and the done event must list at least one buffer
798  * on the buffer linked list for this function to be meaningful.
799  *
800  * If read_countp != NULL, *read_countp will hold the number of bytes
801  * this transaction can receive.
802  */
803 static void
804 build_msghdr_recv(isc__socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
805 		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
806 {
807 	unsigned int iovcount;
808 	isc_buffer_t *buffer;
809 	isc_region_t available;
810 	size_t read_count;
811 
812 	memset(msg, 0, sizeof(struct msghdr));
813 
814 	if (sock->type == isc_sockettype_udp) {
815 		memset(&dev->address, 0, sizeof(dev->address));
816 		msg->msg_name = (void *)&dev->address.type.sa;
817 		msg->msg_namelen = sizeof(dev->address.type);
818 	} else { /* TCP */
819 		msg->msg_name = NULL;
820 		msg->msg_namelen = 0;
821 		dev->address = sock->peer_address;
822 	}
823 
824 	buffer = ISC_LIST_HEAD(dev->bufferlist);
825 	read_count = 0;
826 
827 	/*
828 	 * Single buffer I/O?  Skip what we've done so far in this region.
829 	 */
830 	if (buffer == NULL) {
831 		read_count = dev->region.length - dev->n;
832 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
833 		iov[0].iov_len = read_count;
834 		iovcount = 1;
835 
836 		goto config;
837 	}
838 
839 	/*
840 	 * Multibuffer I/O.
841 	 * Skip empty buffers.
842 	 */
843 	while (buffer != NULL) {
844 		REQUIRE(ISC_BUFFER_VALID(buffer));
845 		if (isc_buffer_availablelength(buffer) != 0)
846 			break;
847 		buffer = ISC_LIST_NEXT(buffer, link);
848 	}
849 
850 	iovcount = 0;
851 	while (buffer != NULL) {
852 		INSIST(iovcount < MAXSCATTERGATHER_RECV);
853 
854 		isc_buffer_availableregion(buffer, &available);
855 
856 		if (available.length > 0) {
857 			iov[iovcount].iov_base = (void *)(available.base);
858 			iov[iovcount].iov_len = available.length;
859 			read_count += available.length;
860 			iovcount++;
861 		}
862 		buffer = ISC_LIST_NEXT(buffer, link);
863 	}
864 
865  config:
866 
867 	/*
868 	 * Attach the iovec array and the control-message buffer to the msghdr.
869 	 */
870 	msg->msg_iov = iov;
871 	msg->msg_iovlen = iovcount;
872 
873 	msg->msg_control = cmsgbuf;
874 	msg->msg_controllen = RECVCMSGBUFLEN;
875 	msg->msg_flags = 0;
876 
877 	if (read_countp != NULL)
878 		*read_countp = read_count;
879 }
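
/*
 * Note: the full RECVCMSGBUFLEN is always offered to recvmsg(); the kernel
 * sets msg_controllen to the space actually used, and MSG_CTRUNC indicates
 * that ancillary data did not fit (see process_cmsg()).
 */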
880 
881 static void
882 set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock,
883 		isc_socketevent_t *dev)
884 {
885 	if (sock->type == isc_sockettype_udp) {
886 		if (address != NULL)
887 			dev->address = *address;
888 		else
889 			dev->address = sock->peer_address;
890 	} else if (sock->type == isc_sockettype_tcp) {
891 		INSIST(address == NULL);
892 		dev->address = sock->peer_address;
893 	}
894 }
895 
896 static void
897 destroy_socketevent(isc_event_t *event) {
898 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
899 
900 	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
901 
902 	(ev->destroy)(event);
903 }
904 
905 static isc_socketevent_t *
906 allocate_socketevent(void *sender,
907 		     isc_eventtype_t eventtype, isc_taskaction_t action,
908 		     void *arg)
909 {
910 	isc_socketevent_t *ev;
911 
912 	ev = (isc_socketevent_t *)isc_event_allocate(sender,
913 						     eventtype, action, arg,
914 						     sizeof(*ev));
915 
916 	if (ev == NULL)
917 		return (NULL);
918 
919 	ev->result = ISC_R_UNSET;
920 	ISC_LINK_INIT(ev, ev_link);
921 	ISC_LIST_INIT(ev->bufferlist);
922 	ev->region.base = NULL;
923 	ev->n = 0;
924 	ev->offset = 0;
925 	ev->attributes = 0;
926 	ev->destroy = ev->ev_destroy;
927 	ev->ev_destroy = destroy_socketevent;
928 	ev->dscp = 0;
929 
930 	return (ev);
931 }
932 
933 #define DOIO_SUCCESS		0	/* i/o ok, event sent */
934 #define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
935 #define DOIO_HARD		2	/* i/o error, event sent */
936 #define DOIO_EOF		3	/* EOF, no event sent */
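
/*
 * How the callers react to these codes (see internal_recv() and
 * internal_send() below): DOIO_SOFT leaves the request queued and re-arms
 * the descriptor, DOIO_SUCCESS and DOIO_HARD complete the request via
 * send_recvdone_event()/send_senddone_event(), and DOIO_EOF completes
 * every queued receive with ISC_R_EOF.
 */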
937 
938 static int
939 doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
940 	int cc;
941 	struct iovec iov[MAXSCATTERGATHER_RECV];
942 	size_t read_count;
943 	size_t actual_count;
944 	struct msghdr msghdr;
945 	isc_buffer_t *buffer;
946 	int recv_errno;
947 	char strbuf[ISC_STRERRORSIZE];
948 	char cmsgbuf[RECVCMSGBUFLEN] = {0};
949 
950 	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);
951 
952 	cc = recvmsg(sock->fd, &msghdr, 0);
953 	recv_errno = errno;
954 
955 	if (cc < 0) {
956 		if (SOFT_ERROR(recv_errno))
957 			return (DOIO_SOFT);
958 
959 		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
960 			isc__strerror(recv_errno, strbuf, sizeof(strbuf));
961 			socket_log(sock, NULL, IOEVENT,
962 				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
963 				   sock->fd, cc, recv_errno, strbuf);
964 		}
965 
966 #define SOFT_OR_HARD(_system, _isc) \
967 	if (recv_errno == _system) { \
968 		if (sock->connected) { \
969 			dev->result = _isc; \
970 			return (DOIO_HARD); \
971 		} \
972 		return (DOIO_SOFT); \
973 	}
974 #define ALWAYS_HARD(_system, _isc) \
975 	if (recv_errno == _system) { \
976 		dev->result = _isc; \
977 		return (DOIO_HARD); \
978 	}
979 
980 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
981 		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
982 		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
983 		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
984 		/* HPUX 11.11 can return EADDRNOTAVAIL. */
985 		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
986 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
987 		/* Should never get this one but it was seen. */
988 		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
989 		/*
990 		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
991 		 * errors.
992 		 */
993 		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
994 		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
995 
996 #undef SOFT_OR_HARD
997 #undef ALWAYS_HARD
998 
999 		dev->result = isc__errno2result(recv_errno);
1000 		return (DOIO_HARD);
1001 	}
1002 
1003 	/*
1004 	 * On TCP and UNIX sockets, zero length reads indicate EOF,
1005 	 * while on UDP sockets, zero length reads are perfectly valid,
1006 	 * although strange.
1007 	 */
1008 	switch (sock->type) {
1009 	case isc_sockettype_tcp:
1010 		if (cc == 0)
1011 			return (DOIO_EOF);
1012 		break;
1013 	case isc_sockettype_udp:
1014 		break;
1015 	default:
1016 		INSIST(0);
1017 	}
1018 
1019 	if (sock->type == isc_sockettype_udp) {
1020 		dev->address.length = msghdr.msg_namelen;
1021 		if (isc_sockaddr_getport(&dev->address) == 0) {
1022 			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1023 				socket_log(sock, &dev->address, IOEVENT,
1024 					   "dropping source port zero packet");
1025 			}
1026 			return (DOIO_SOFT);
1027 		}
1028 	}
1029 
1030 	socket_log(sock, &dev->address, IOEVENT,
1031 		   "packet received correctly");
1032 
1033 	/*
1034 	 * Overflow (a datagram larger than the supplied buffers) is reported
1035 	 * by the kernel via MSG_TRUNC, which process_cmsg() below translates
1036 	 * into ISC_SOCKEVENTATTR_TRUNC on the event.
1037 	 */
1038 	/*
1039 	 * If there are control messages attached, run through them and pull
1040 	 * out the interesting bits.
1041 	 */
1042 	process_cmsg(sock, &msghdr, dev);
1043 
1044 	/*
1045 	 * update the buffers (if any) and the i/o count
1046 	 */
1047 	dev->n += cc;
1048 	actual_count = cc;
1049 	buffer = ISC_LIST_HEAD(dev->bufferlist);
1050 	while (buffer != NULL && actual_count > 0U) {
1051 		REQUIRE(ISC_BUFFER_VALID(buffer));
1052 		if (isc_buffer_availablelength(buffer) <= actual_count) {
1053 			actual_count -= isc_buffer_availablelength(buffer);
1054 			isc_buffer_add(buffer,
1055 				       isc_buffer_availablelength(buffer));
1056 		} else {
1057 			isc_buffer_add(buffer, actual_count);
1058 			actual_count = 0;
1059 			POST(actual_count);
1060 			break;
1061 		}
1062 		buffer = ISC_LIST_NEXT(buffer, link);
1063 		if (buffer == NULL) {
1064 			INSIST(actual_count == 0U);
1065 		}
1066 	}
1067 
1068 	/*
1069 	 * If we read less than we expected, update counters,
1070 	 * and let the upper layer poke the descriptor.
1071 	 */
1072 	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1073 		return (DOIO_SOFT);
1074 
1075 	/*
1076 	 * Full reads are posted, or partials if partials are ok.
1077 	 */
1078 	dev->result = ISC_R_SUCCESS;
1079 	return (DOIO_SUCCESS);
1080 }
1081 
1082 /*
1083  * Returns:
1084  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1085  *			ISC_R_SUCCESS.
1086  *
1087  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1088  *			dev->result contains the appropriate error.
1089  *
1090  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1091  *			event was sent.  The operation should be retried.
1092  *
1093  *	No other return values are possible.
1094  */
1095 static int
1096 doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
1097 	int cc;
1098 	struct iovec iov[MAXSCATTERGATHER_SEND];
1099 	size_t write_count;
1100 	struct msghdr msghdr;
1101 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1102 	int attempts = 0;
1103 	int send_errno;
1104 	char strbuf[ISC_STRERRORSIZE];
1105 	char cmsgbuf[SENDCMSGBUFLEN] = {0};
1106 
1107 	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
1108 
1109  resend:
1110 	cc = sendmsg(sock->fd, &msghdr, 0);
1111 	send_errno = errno;
1112 
1113 	/*
1114 	 * Check for error or block condition.
1115 	 */
1116 	if (cc < 0) {
1117 		if (send_errno == EINTR && ++attempts < NRETRIES)
1118 			goto resend;
1119 
1120 		if (SOFT_ERROR(send_errno)) {
1121 			if (errno == EWOULDBLOCK || errno == EAGAIN)
1122 				dev->result = ISC_R_WOULDBLOCK;
1123 			return (DOIO_SOFT);
1124 		}
1125 
1126 #define SOFT_OR_HARD(_system, _isc) \
1127 	if (send_errno == _system) { \
1128 		if (sock->connected) { \
1129 			dev->result = _isc; \
1130 			return (DOIO_HARD); \
1131 		} \
1132 		return (DOIO_SOFT); \
1133 	}
1134 #define ALWAYS_HARD(_system, _isc) \
1135 	if (send_errno == _system) { \
1136 		dev->result = _isc; \
1137 		return (DOIO_HARD); \
1138 	}
1139 
1140 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1141 		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1142 		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1143 		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1144 		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1145 		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1146 		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1147 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1148 		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1149 		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1150 		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1151 
1152 #undef SOFT_OR_HARD
1153 #undef ALWAYS_HARD
1154 
1155 		/*
1156 		 * The other error types depend on whether or not the
1157 		 * socket is UDP or TCP.  If it is UDP, some errors
1158 		 * that we expect to be fatal under TCP are merely
1159 		 * annoying, and are really soft errors.
1160 		 *
1161 		 * However, these soft errors are still returned as
1162 		 * a status.
1163 		 */
1164 		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1165 		isc__strerror(send_errno, strbuf, sizeof(strbuf));
1166 		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1167 				 addrbuf, strbuf);
1168 		dev->result = isc__errno2result(send_errno);
1169 		return (DOIO_HARD);
1170 	}
1171 
1172 	if (cc == 0) {
1173 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1174 				 "doio_send: send() returned 0");
1175 	}
1176 
1177 	/*
1178 	 * If we write less than we expected, update counters, poke.
1179 	 */
1180 	dev->n += cc;
1181 	if ((size_t)cc != write_count)
1182 		return (DOIO_SOFT);
1183 
1184 	/*
1185 	 * Exactly what we wanted to write.  We're done with this
1186 	 * entry.  Post its completion event.
1187 	 */
1188 	dev->result = ISC_R_SUCCESS;
1189 	return (DOIO_SUCCESS);
1190 }
1191 
1192 /*
1193  * Kill.
1194  *
1195  * Caller must ensure that the socket is not locked and no external
1196  * references exist.
1197  */
1198 static void
1199 socketclose(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) {
1200 	/*
1201 	 * No one has this socket open, so the watcher doesn't have to be
1202 	 * poked, and the socket doesn't have to be locked.
1203 	 */
1204 	manager->fds[fd] = NULL;
1205 	manager->fdstate[fd] = CLOSE_PENDING;
1206 	select_poke(manager, fd, SELECT_POKE_CLOSE);
1207 
1208 	if (sock->active == 1) {
1209 		sock->active = 0;
1210 	}
1211 
1212 	/*
1213 	 * update manager->maxfd here (XXX: this should be implemented more
1214 	 * efficiently)
1215 	 */
1216 	if (manager->maxfd == fd) {
1217 		int i;
1218 
1219 		manager->maxfd = 0;
1220 		for (i = fd - 1; i >= 0; i--) {
1221 			if (manager->fdstate[i] == MANAGED) {
1222 				manager->maxfd = i;
1223 				break;
1224 			}
1225 		}
1226 	}
1227 
1228 }
1229 
1230 static void
1231 destroy(isc__socket_t **sockp) {
1232 	int fd;
1233 	isc__socket_t *sock = *sockp;
1234 	isc__socketmgr_t *manager = sock->manager;
1235 
1236 	socket_log(sock, NULL, CREATION, "destroying");
1237 
1238 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1239 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1240 	INSIST(sock->connect_ev == NULL);
1241 	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1242 
1243 	if (sock->fd >= 0) {
1244 		fd = sock->fd;
1245 		sock->fd = -1;
1246 		socketclose(manager, sock, fd);
1247 	}
1248 
1249 	ISC_LIST_UNLINK(manager->socklist, sock, link);
1250 
1251 	/* can't unlock manager as its memory context is still used */
1252 	free_socket(sockp);
1253 }
1254 
1255 static isc_result_t
1256 allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
1257 		isc__socket_t **socketp)
1258 {
1259 	isc__socket_t *sock;
1260 
1261 	sock = malloc(sizeof(*sock));
1262 
1263 	if (sock == NULL)
1264 		return (ISC_R_NOMEMORY);
1265 
1266 	sock->common.magic = 0;
1267 	sock->common.impmagic = 0;
1268 	sock->references = 0;
1269 
1270 	sock->manager = manager;
1271 	sock->type = type;
1272 	sock->fd = -1;
1273 	sock->dscp = 0;		/* TOS/TCLASS is zero until set. */
1274 	sock->active = 0;
1275 
1276 	ISC_LINK_INIT(sock, link);
1277 
1278 	/*
1279 	 * Set up list of readers and writers to be initially empty.
1280 	 */
1281 	ISC_LIST_INIT(sock->recv_list);
1282 	ISC_LIST_INIT(sock->send_list);
1283 	sock->connect_ev = NULL;
1284 	sock->pending_recv = 0;
1285 	sock->pending_send = 0;
1286 	sock->connected = 0;
1287 	sock->connecting = 0;
1288 	sock->bound = 0;
1289 	sock->pktdscp = 0;
1290 
1291 	/*
1292 	 * Initialize readable and writable events.
1293 	 */
1294 	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1295 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1296 		       NULL, sock, sock, NULL);
1297 	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1298 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1299 		       NULL, sock, sock, NULL);
1300 
1301 	sock->common.magic = ISCAPI_SOCKET_MAGIC;
1302 	sock->common.impmagic = SOCKET_MAGIC;
1303 	*socketp = sock;
1304 
1305 	return (ISC_R_SUCCESS);
1306 }
1307 
1308 /*
1309  * This function requires that the various lists be empty, that the
1310  * reference count be zero, and that the magic number is valid.  The fd
1311  * associated with the socket must already have been closed by the
1312  * caller; this routine only frees the structure, it does not close
1313  * the descriptor.
1314  */
1315 static void
1316 free_socket(isc__socket_t **socketp) {
1317 	isc__socket_t *sock = *socketp;
1318 
1319 	INSIST(VALID_SOCKET(sock));
1320 	INSIST(sock->references == 0);
1321 	INSIST(!sock->connecting);
1322 	INSIST(!sock->pending_recv);
1323 	INSIST(!sock->pending_send);
1324 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1325 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1326 	INSIST(!ISC_LINK_LINKED(sock, link));
1327 
1328 	sock->common.magic = 0;
1329 	sock->common.impmagic = 0;
1330 
1331 	free(sock);
1332 
1333 	*socketp = NULL;
1334 }
1335 
1336 static void
1337 use_min_mtu(isc__socket_t *sock) {
1338 	/* use minimum MTU */
1339 	if (sock->pf == AF_INET6) {
1340 		int on = 1;
1341 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
1342 				(void *)&on, sizeof(on));
1343 	}
1344 }
1345 
1346 static void
1347 set_tcp_maxseg(isc__socket_t *sock, int size) {
1348 	if (sock->type == isc_sockettype_tcp)
1349 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
1350 				(void *)&size, sizeof(size));
1351 }
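
/*
 * For IPv6 TCP sockets, opensocket() below combines these two helpers and
 * clamps the MSS to 1280 - 20 - 40 = 1220 bytes, i.e. the IPv6 minimum MTU
 * minus the TCP (20) and IPv6 (40) header sizes, so every segment fits
 * within the IPv6 minimum MTU.
 */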
1352 
1353 static isc_result_t
1354 opensocket(isc__socket_t *sock)
1355 {
1356 	isc_result_t result;
1357 	char strbuf[ISC_STRERRORSIZE];
1358 	const char *err = "socket";
1359 	int on = 1;
1360 
1361 	switch (sock->type) {
1362 	case isc_sockettype_udp:
1363 		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
1364 		break;
1365 	case isc_sockettype_tcp:
1366 		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
1367 		break;
1368 	}
1369 
1370 	if (sock->fd < 0) {
1371 		switch (errno) {
1372 		case EMFILE:
1373 		case ENFILE:
1374 			isc__strerror(errno, strbuf, sizeof(strbuf));
1375 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1376 				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1377 				       "%s: %s", err, strbuf);
1378 			/* fallthrough */
1379 		case ENOBUFS:
1380 			return (ISC_R_NORESOURCES);
1381 
1382 		case EPROTONOSUPPORT:
1383 		case EPFNOSUPPORT:
1384 		case EAFNOSUPPORT:
1385 		/*
1386 		 * Linux 2.2 (and maybe others) return EINVAL instead of
1387 		 * EAFNOSUPPORT.
1388 		 */
1389 		case EINVAL:
1390 			return (ISC_R_FAMILYNOSUPPORT);
1391 
1392 		default:
1393 			isc__strerror(errno, strbuf, sizeof(strbuf));
1394 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1395 					 "%s() %s: %s", err, "failed",
1396 					 strbuf);
1397 			return (ISC_R_UNEXPECTED);
1398 		}
1399 	}
1400 
1401 	result = make_nonblock(sock->fd);
1402 	if (result != ISC_R_SUCCESS) {
1403 		(void)close(sock->fd);
1404 		return (result);
1405 	}
1406 
1407 	/*
1408 	 * Use minimum mtu if possible.
1409 	 */
1410 	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
1411 		use_min_mtu(sock);
1412 		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
1413 	}
1414 
1415 	if (sock->type == isc_sockettype_udp) {
1416 
1417 		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1418 			       (void *)&on, sizeof(on)) < 0
1419 		    && errno != ENOPROTOOPT) {
1420 			isc__strerror(errno, strbuf, sizeof(strbuf));
1421 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1422 					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1423 					 sock->fd, "failed", strbuf);
1424 			/* Press on... */
1425 		}
1426 
1427 		/* RFC 3542 */
1428 		if ((sock->pf == AF_INET6)
1429 		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1430 				   (void *)&on, sizeof(on)) < 0)) {
1431 			isc__strerror(errno, strbuf, sizeof(strbuf));
1432 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1433 					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1434 					 "%s: %s", sock->fd, "failed",
1435 					 strbuf);
1436 		}
1437 	}
1438 
1439 	if (sock->active == 0) {
1440 		sock->active = 1;
1441 	}
1442 
1443 	return (ISC_R_SUCCESS);
1444 }
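
/*
 * The UDP options set above pair with process_cmsg(): SO_TIMESTAMP makes
 * the kernel attach an SCM_TIMESTAMP cmsg to each received datagram, and
 * IPV6_RECVPKTINFO (RFC 3542) makes it attach the IPV6_PKTINFO cmsg that
 * carries the destination address and interface index.
 */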
1445 
1446 /*
1447  * Create a 'type' socket in protocol family 'pf', managed by 'manager'.
1448  * The socket is neither bound nor connected; the caller does that
1449  * separately via isc__socket_bind() and isc__socket_connect().  The new
1450  * socket is returned in 'socketp'.
1451  */
1452 static isc_result_t
1453 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1454 	      isc_socket_t **socketp)
1455 {
1456 	isc__socket_t *sock = NULL;
1457 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
1458 	isc_result_t result;
1459 	int lockid;
1460 
1461 	REQUIRE(VALID_MANAGER(manager));
1462 	REQUIRE(socketp != NULL && *socketp == NULL);
1463 
1464 	result = allocate_socket(manager, type, &sock);
1465 	if (result != ISC_R_SUCCESS)
1466 		return (result);
1467 
1468 	switch (sock->type) {
1469 	case isc_sockettype_udp:
1470 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
1471 		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
1472 		break;
1473 	case isc_sockettype_tcp:
1474 		break;
1475 	default:
1476 		INSIST(0);
1477 	}
1478 
1479 	sock->pf = pf;
1480 
1481 	result = opensocket(sock);
1482 	if (result != ISC_R_SUCCESS) {
1483 		free_socket(&sock);
1484 		return (result);
1485 	}
1486 
1487 	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
1488 	sock->references = 1;
1489 	*socketp = (isc_socket_t *)sock;
1490 
1491 	/*
1492 	 * Note we don't have to lock the socket like we normally would because
1493 	 * there are no external references to it yet.
1494 	 */
1495 
1496 	lockid = FDLOCK_ID(sock->fd);
1497 	manager->fds[sock->fd] = sock;
1498 	manager->fdstate[sock->fd] = MANAGED;
1499 
1500 	ISC_LIST_APPEND(manager->socklist, sock, link);
1501 	if (manager->maxfd < sock->fd)
1502 		manager->maxfd = sock->fd;
1503 
1504 	socket_log(sock, NULL, CREATION, "created");
1505 
1506 	return (ISC_R_SUCCESS);
1507 }
1508 
1509 /*%
1510  * Create a new 'type' socket managed by 'manager'.  This is a thin
1511  * public wrapper around socket_create() above.  The new socket is
1512  * returned in 'socketp'.
1514  */
1515 isc_result_t
1516 isc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1517 		   isc_socket_t **socketp)
1518 {
1519 	return (socket_create(manager0, pf, type, socketp));
1520 }
1521 
1522 /*
1523  * Attach to a socket.  Caller must explicitly detach when it is done.
1524  */
1525 void
1526 isc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
1527 	isc__socket_t *sock = (isc__socket_t *)sock0;
1528 
1529 	REQUIRE(VALID_SOCKET(sock));
1530 	REQUIRE(socketp != NULL && *socketp == NULL);
1531 
1532 	sock->references++;
1533 
1534 	*socketp = (isc_socket_t *)sock;
1535 }
1536 
1537 /*
1538  * Dereference a socket.  If this is the last reference to it, clean things
1539  * up by destroying the socket.
1540  */
1541 void
1542 isc__socket_detach(isc_socket_t **socketp) {
1543 	isc__socket_t *sock;
1544 	isc_boolean_t kill_socket = ISC_FALSE;
1545 
1546 	REQUIRE(socketp != NULL);
1547 	sock = (isc__socket_t *)*socketp;
1548 	REQUIRE(VALID_SOCKET(sock));
1549 
1550 	REQUIRE(sock->references > 0);
1551 	sock->references--;
1552 	if (sock->references == 0)
1553 		kill_socket = ISC_TRUE;
1554 
1555 	if (kill_socket)
1556 		destroy(&sock);
1557 
1558 	*socketp = NULL;
1559 }
1560 
1561 /*
1562  * I/O is possible on a given socket.  Schedule an event to this task that
1563  * will call an internal function to do the I/O.  This will charge the
1564  * task with the I/O operation and let our select loop handler get back
1565  * to doing something real as fast as possible.
1566  *
1567  * The socket and manager must be locked before calling this function.
1568  */
1569 static void
1570 dispatch_recv(isc__socket_t *sock) {
1571 	intev_t *iev;
1572 	isc_socketevent_t *ev;
1573 	isc_task_t *sender;
1574 
1575 	INSIST(!sock->pending_recv);
1576 
1577 	ev = ISC_LIST_HEAD(sock->recv_list);
1578 	if (ev == NULL)
1579 		return;
1580 	socket_log(sock, NULL, EVENT,
1581 		   "dispatch_recv:  event %p -> task %p",
1582 		   ev, ev->ev_sender);
1583 	sender = ev->ev_sender;
1584 
1585 	sock->pending_recv = 1;
1586 	iev = &sock->readable_ev;
1587 
1588 	sock->references++;
1589 	iev->ev_sender = sock;
1590 	iev->ev_action = internal_recv;
1591 	iev->ev_arg = sock;
1592 
1593 	isc_task_send(sender, (isc_event_t **)&iev);
1594 }
1595 
1596 static void
1597 dispatch_send(isc__socket_t *sock) {
1598 	intev_t *iev;
1599 	isc_socketevent_t *ev;
1600 	isc_task_t *sender;
1601 
1602 	INSIST(!sock->pending_send);
1603 
1604 	ev = ISC_LIST_HEAD(sock->send_list);
1605 	if (ev == NULL)
1606 		return;
1607 	socket_log(sock, NULL, EVENT,
1608 		   "dispatch_send:  event %p -> task %p",
1609 		   ev, ev->ev_sender);
1610 	sender = ev->ev_sender;
1611 
1612 	sock->pending_send = 1;
1613 	iev = &sock->writable_ev;
1614 
1615 	sock->references++;
1616 	iev->ev_sender = sock;
1617 	iev->ev_action = internal_send;
1618 	iev->ev_arg = sock;
1619 
1620 	isc_task_send(sender, (isc_event_t **)&iev);
1621 }
1622 
1623 static void
1624 dispatch_connect(isc__socket_t *sock) {
1625 	intev_t *iev;
1626 	isc_socket_connev_t *ev;
1627 
1628 	iev = &sock->writable_ev;
1629 
1630 	ev = sock->connect_ev;
1631 	INSIST(ev != NULL); /* XXX */
1632 
1633 	INSIST(sock->connecting);
1634 
1635 	sock->references++;  /* keep socket around for this internal event */
1636 	iev->ev_sender = sock;
1637 	iev->ev_action = internal_connect;
1638 	iev->ev_arg = sock;
1639 
1640 	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1641 }
1642 
1643 /*
1644  * Dequeue an item off the given socket's read queue, set the result code
1645  * in the done event to the one provided, and send it to the task it was
1646  * destined for.
1647  *
1648  * If the event to be sent is on a list, remove it before sending.  If
1649  * asked to, send and detach from the socket as well.
1650  *
1651  * Caller must have the socket locked if the event is attached to the socket.
1652  */
1653 static void
1654 send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
1655 	isc_task_t *task;
1656 
1657 	task = (*dev)->ev_sender;
1658 
1659 	(*dev)->ev_sender = sock;
1660 
1661 	if (ISC_LINK_LINKED(*dev, ev_link))
1662 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1663 
1664 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1665 	    == ISC_SOCKEVENTATTR_ATTACHED)
1666 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1667 	else
1668 		isc_task_send(task, (isc_event_t **)dev);
1669 }
1670 
1671 /*
1672  * See comments for send_recvdone_event() above.
1673  *
1674  * Caller must have the socket locked if the event is attached to the socket.
1675  */
1676 static void
1677 send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
1678 	isc_task_t *task;
1679 
1680 	INSIST(dev != NULL && *dev != NULL);
1681 
1682 	task = (*dev)->ev_sender;
1683 	(*dev)->ev_sender = sock;
1684 
1685 	if (ISC_LINK_LINKED(*dev, ev_link))
1686 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1687 
1688 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1689 	    == ISC_SOCKEVENTATTR_ATTACHED)
1690 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1691 	else
1692 		isc_task_send(task, (isc_event_t **)dev);
1693 }
1694 
1695 static void
1696 internal_recv(isc_task_t *me, isc_event_t *ev) {
1697 	isc_socketevent_t *dev;
1698 	isc__socket_t *sock;
1699 
1700 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
1701 
1702 	sock = (isc__socket_t *)ev->ev_sender;
1703 	INSIST(VALID_SOCKET(sock));
1704 
1705 	socket_log(sock, NULL, IOEVENT,
1706 		   "internal_recv: task %p got event %p", me, ev);
1707 
1708 	INSIST(sock->pending_recv == 1);
1709 	sock->pending_recv = 0;
1710 
1711 	INSIST(sock->references > 0);
1712 	sock->references--;  /* the internal event is done with this socket */
1713 	if (sock->references == 0) {
1714 		destroy(&sock);
1715 		return;
1716 	}
1717 
1718 	/*
1719 	 * Try to do as much I/O as possible on this socket.  There are no
1720 	 * limits here, currently.
1721 	 */
1722 	dev = ISC_LIST_HEAD(sock->recv_list);
1723 	while (dev != NULL) {
1724 		switch (doio_recv(sock, dev)) {
1725 		case DOIO_SOFT:
1726 			goto poke;
1727 
1728 		case DOIO_EOF:
1729 			/*
1730 			 * read of 0 means the remote end was closed.
1731 			 * Run through the event queue and dispatch all
1732 			 * the events with an EOF result code.
1733 			 */
1734 			do {
1735 				dev->result = ISC_R_EOF;
1736 				send_recvdone_event(sock, &dev);
1737 				dev = ISC_LIST_HEAD(sock->recv_list);
1738 			} while (dev != NULL);
1739 			goto poke;
1740 
1741 		case DOIO_SUCCESS:
1742 		case DOIO_HARD:
1743 			send_recvdone_event(sock, &dev);
1744 			break;
1745 		}
1746 
1747 		dev = ISC_LIST_HEAD(sock->recv_list);
1748 	}
1749 
1750  poke:
1751 	if (!ISC_LIST_EMPTY(sock->recv_list))
1752 		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1753 }
1754 
1755 static void
1756 internal_send(isc_task_t *me, isc_event_t *ev) {
1757 	isc_socketevent_t *dev;
1758 	isc__socket_t *sock;
1759 
1760 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
1761 
1762 	/*
1763 	 * Find out what socket this is and lock it.
1764 	 */
1765 	sock = (isc__socket_t *)ev->ev_sender;
1766 	INSIST(VALID_SOCKET(sock));
1767 	socket_log(sock, NULL, IOEVENT,
1768 		   "internal_send: task %p got event %p", me, ev);
1769 
1770 	INSIST(sock->pending_send == 1);
1771 	sock->pending_send = 0;
1772 
1773 	INSIST(sock->references > 0);
1774 	sock->references--;  /* the internal event is done with this socket */
1775 	if (sock->references == 0) {
1776 		destroy(&sock);
1777 		return;
1778 	}
1779 
1780 	/*
1781 	 * Try to do as much I/O as possible on this socket.  There are no
1782 	 * limits here, currently.
1783 	 */
1784 	dev = ISC_LIST_HEAD(sock->send_list);
1785 	while (dev != NULL) {
1786 		switch (doio_send(sock, dev)) {
1787 		case DOIO_SOFT:
1788 			goto poke;
1789 
1790 		case DOIO_HARD:
1791 		case DOIO_SUCCESS:
1792 			send_senddone_event(sock, &dev);
1793 			break;
1794 		}
1795 
1796 		dev = ISC_LIST_HEAD(sock->send_list);
1797 	}
1798 
1799  poke:
1800 	if (!ISC_LIST_EMPTY(sock->send_list))
1801 		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
1802 }
1803 
1804 /*
1805  * Process read/writes on each fd here.  Avoid locking
1806  * and unlocking twice if both reads and writes are possible.
1807  */
1808 static void
1809 process_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable,
1810 	   isc_boolean_t writeable)
1811 {
1812 	isc__socket_t *sock;
1813 	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
1814 
1815 	/*
1816 	 * If the socket is going to be closed, don't do more I/O.
1817 	 */
1818 	if (manager->fdstate[fd] == CLOSE_PENDING) {
1819 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1820 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1821 		return;
1822 	}
1823 
1824 	sock = manager->fds[fd];
1825 	if (readable) {
1826 		if (sock == NULL) {
1827 			unwatch_read = ISC_TRUE;
1828 			goto check_write;
1829 		}
1830 		if (!SOCK_DEAD(sock)) {
1831 			dispatch_recv(sock);
1832 		}
1833 		unwatch_read = ISC_TRUE;
1834 	}
1835 check_write:
1836 	if (writeable) {
1837 		if (sock == NULL) {
1838 			unwatch_write = ISC_TRUE;
1839 			goto unlock_fd;
1840 		}
1841 		if (!SOCK_DEAD(sock)) {
1842 			if (sock->connecting)
1843 				dispatch_connect(sock);
1844 			else
1845 				dispatch_send(sock);
1846 		}
1847 		unwatch_write = ISC_TRUE;
1848 	}
1849 
1850  unlock_fd:
1851 	if (unwatch_read)
1852 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1853 	if (unwatch_write)
1854 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1855 
1856 }
1857 
1858 static void
1859 process_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds,
1860 	    fd_set *writefds)
1861 {
1862 	int i;
1863 
1864 	REQUIRE(maxfd <= (int)manager->maxsocks);
1865 
1866 	for (i = 0; i < maxfd; i++) {
1867 		process_fd(manager, i, FD_ISSET(i, readfds),
1868 			   FD_ISSET(i, writefds));
1869 	}
1870 }
1871 
1872 /*
1873  * Create a new socket manager.
1874  */
1875 
1876 static isc_result_t
1877 setup_watcher(isc__socketmgr_t *manager) {
1878 	isc_result_t result;
1879 
1880 	UNUSED(result);
1881 
1882 	manager->fd_bufsize = sizeof(fd_set);
1883 
1884 	manager->read_fds = NULL;
1885 	manager->read_fds_copy = NULL;
1886 	manager->write_fds = NULL;
1887 	manager->write_fds_copy = NULL;
1888 
1889 	manager->read_fds = malloc(manager->fd_bufsize);
1890 	if (manager->read_fds != NULL)
1891 		manager->read_fds_copy = malloc(manager->fd_bufsize);
1892 	if (manager->read_fds_copy != NULL)
1893 		manager->write_fds = malloc(manager->fd_bufsize);
1894 	if (manager->write_fds != NULL) {
1895 		manager->write_fds_copy = malloc(manager->fd_bufsize);
1896 	}
1897 	if (manager->write_fds_copy == NULL) {
1898 		if (manager->write_fds != NULL) {
1899 			free(manager->write_fds);
1900 		}
1901 		if (manager->read_fds_copy != NULL) {
1902 			free(manager->read_fds_copy);
1903 		}
1904 		if (manager->read_fds != NULL) {
1905 			free(manager->read_fds);
1906 		}
1907 		return (ISC_R_NOMEMORY);
1908 	}
1909 	memset(manager->read_fds, 0, manager->fd_bufsize);
1910 	memset(manager->write_fds, 0, manager->fd_bufsize);
1911 
1912 	manager->maxfd = 0;
1913 
1914 	return (ISC_R_SUCCESS);
1915 }
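
/*
 * Illustrative sketch (not compiled): setup_watcher() above zeroes each
 * fd_set-sized buffer with memset(), which is equivalent to the plain
 * POSIX FD_ZERO() idiom shown below.  The helper name is hypothetical.
 */
#if 0
static fd_set *
example_alloc_fdset(void) {
	fd_set *set;

	set = malloc(sizeof(fd_set));
	if (set == NULL)
		return (NULL);
	FD_ZERO(set);	/* same effect as memset(set, 0, sizeof(fd_set)) */
	return (set);
}
#endif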
1916 
1917 static void
1918 cleanup_watcher(isc__socketmgr_t *manager) {
1919 
1920 	if (manager->read_fds != NULL)
1921 		free(manager->read_fds);
1922 	if (manager->read_fds_copy != NULL)
1923 		free(manager->read_fds_copy);
1924 	if (manager->write_fds != NULL)
1925 		free(manager->write_fds);
1926 	if (manager->write_fds_copy != NULL)
1927 		free(manager->write_fds_copy);
1928 }
1929 
1930 isc_result_t
1931 isc__socketmgr_create(isc_socketmgr_t **managerp) {
1932 	return (isc__socketmgr_create2(managerp, 0));
1933 }
1934 
1935 isc_result_t
1936 isc__socketmgr_create2(isc_socketmgr_t **managerp,
1937 		       unsigned int maxsocks)
1938 {
1939 	isc__socketmgr_t *manager;
1940 	isc_result_t result;
1941 
1942 	REQUIRE(managerp != NULL && *managerp == NULL);
1943 
1944 	if (socketmgr != NULL) {
1945 		/* Don't allow maxsocks to be updated */
1946 		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
1947 			return (ISC_R_EXISTS);
1948 
1949 		socketmgr->refs++;
1950 		*managerp = (isc_socketmgr_t *)socketmgr;
1951 		return (ISC_R_SUCCESS);
1952 	}
1953 
1954 	if (maxsocks == 0)
1955 		maxsocks = FD_SETSIZE;
1956 
1957 	manager = malloc(sizeof(*manager));
1958 	if (manager == NULL)
1959 		return (ISC_R_NOMEMORY);
1960 
1961 	/* Zero the structure so that cleanup on failure is simple. */
1962 	memset(manager, 0, sizeof(*manager));
1963 	manager->maxsocks = maxsocks;
1964 	manager->fds = malloc(manager->maxsocks * sizeof(isc__socket_t *));
1965 	if (manager->fds == NULL) {
1966 		result = ISC_R_NOMEMORY;
1967 		goto free_manager;
1968 	}
1969 	manager->fdstate = malloc(manager->maxsocks * sizeof(int));
1970 	if (manager->fdstate == NULL) {
1971 		result = ISC_R_NOMEMORY;
1972 		goto free_manager;
1973 	}
1974 
1975 	manager->common.methods = &socketmgrmethods;
1976 	manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
1977 	manager->common.impmagic = SOCKET_MANAGER_MAGIC;
1978 	memset(manager->fds, 0, manager->maxsocks * sizeof(isc__socket_t *));
1979 	ISC_LIST_INIT(manager->socklist);
1980 
1981 	manager->refs = 1;
1982 
1983 	/*
1984 	 * Set up initial state for the select loop
1985 	 */
1986 	result = setup_watcher(manager);
1987 	if (result != ISC_R_SUCCESS)
1988 		goto cleanup;
1989 
1990 	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
1991 
1992 	socketmgr = manager;
1993 	*managerp = (isc_socketmgr_t *)manager;
1994 
1995 	return (ISC_R_SUCCESS);
1996 
1997 cleanup:
1998 
1999 free_manager:
2000 	if (manager->fdstate != NULL) {
2001 		free(manager->fdstate);
2002 	}
2003 	if (manager->fds != NULL) {
2004 		free(manager->fds);
2005 	}
2006 	free(manager);
2007 
2008 	return (result);
2009 }
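
/*
 * Usage sketch (not compiled): the manager is a process-wide singleton,
 * so a second create call returns the same object with an extra
 * reference, and each successful create must be paired with a destroy.
 * This assumes the public isc_socketmgr_create()/isc_socketmgr_destroy()
 * wrappers forward to the functions in this file with the same
 * signatures; the helper name is hypothetical.
 */
#if 0
static isc_result_t
example_manager_refs(void) {
	isc_socketmgr_t *a = NULL, *b = NULL;
	isc_result_t result;

	result = isc_socketmgr_create(&a);	/* refs == 1 */
	if (result != ISC_R_SUCCESS)
		return (result);
	result = isc_socketmgr_create(&b);	/* same manager, refs == 2 */
	if (result == ISC_R_SUCCESS)
		isc_socketmgr_destroy(&b);	/* drops back to refs == 1 */
	isc_socketmgr_destroy(&a);		/* last reference: tears it down */
	return (ISC_R_SUCCESS);
}
#endif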
2010 
2011 void
2012 isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
2013 	isc__socketmgr_t *manager;
2014 	int i;
2015 
2016 	/*
2017 	 * Destroy a socket manager.
2018 	 */
2019 
2020 	REQUIRE(managerp != NULL);
2021 	manager = (isc__socketmgr_t *)*managerp;
2022 	REQUIRE(VALID_MANAGER(manager));
2023 
2024 	manager->refs--;
2025 	if (manager->refs > 0) {
2026 		*managerp = NULL;
2027 		return;
2028 	}
2029 	socketmgr = NULL;
2030 
2031 	/*
2032 	 * Wait for all sockets to be destroyed.
2033 	 */
2034 	while (!ISC_LIST_EMPTY(manager->socklist)) {
2035 		isc__taskmgr_dispatch(NULL);
2036 	}
2037 
2038 	/*
2039 	 * Poke the watcher to shut down.  In the threaded version this is
2040 	 * done by closing the write half of the internal pipe; in this
2041 	 * non-threaded build it is effectively a no-op.
2042 	 */
2043 	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
2044 
2045 	/*
2046 	 * Clean up.
2047 	 */
2048 	cleanup_watcher(manager);
2049 
2050 	for (i = 0; i < (int)manager->maxsocks; i++)
2051 		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
2052 			(void)close(i);
2053 
2054 	free(manager->fds);
2055 	free(manager->fdstate);
2056 
2057 	manager->common.magic = 0;
2058 	manager->common.impmagic = 0;
2059 	free(manager);
2060 
2061 	*managerp = NULL;
2062 
2063 	socketmgr = NULL;
2064 }
2065 
2066 static isc_result_t
2067 socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2068 	    unsigned int flags)
2069 {
2070 	int io_state;
2071 	isc_task_t *ntask = NULL;
2072 	isc_result_t result = ISC_R_SUCCESS;
2073 
2074 	dev->ev_sender = task;
2075 
2076 	if (sock->type == isc_sockettype_udp) {
2077 		io_state = doio_recv(sock, dev);
2078 	} else {
2079 		if (ISC_LIST_EMPTY(sock->recv_list))
2080 			io_state = doio_recv(sock, dev);
2081 		else
2082 			io_state = DOIO_SOFT;
2083 	}
2084 
2085 	switch (io_state) {
2086 	case DOIO_SOFT:
2087 		/*
2088 		 * We couldn't complete the read right now (or could only do
2089 		 * part of it), so queue the request.
2090 		 *
2091 		 * Attach to the task so the completion event can be posted.
2092 		 */
2093 		isc_task_attach(task, &ntask);
2094 		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2095 
2096 		/*
2097 		 * Enqueue the request.  If the socket was previously not being
2098 		 * watched, poke the watcher to start paying attention to it.
2099 		 */
2100 		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
2101 			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2102 		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2103 
2104 		socket_log(sock, NULL, EVENT,
2105 			   "socket_recv: event %p -> task %p",
2106 			   dev, ntask);
2107 
2108 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2109 			result = ISC_R_INPROGRESS;
2110 		break;
2111 
2112 	case DOIO_EOF:
2113 		dev->result = ISC_R_EOF;
2114 		/* fallthrough */
2115 
2116 	case DOIO_HARD:
2117 	case DOIO_SUCCESS:
2118 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2119 			send_recvdone_event(sock, &dev);
2120 		break;
2121 	}
2122 
2123 	return (result);
2124 }
2125 
2126 isc_result_t
2127 isc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2128 		  unsigned int minimum, isc_task_t *task,
2129 		  isc_taskaction_t action, void *arg)
2130 {
2131 	isc__socket_t *sock = (isc__socket_t *)sock0;
2132 	isc_socketevent_t *dev;
2133 	isc__socketmgr_t *manager;
2134 	unsigned int iocount;
2135 	isc_buffer_t *buffer;
2136 
2137 	REQUIRE(VALID_SOCKET(sock));
2138 	REQUIRE(buflist != NULL);
2139 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2140 	REQUIRE(task != NULL);
2141 	REQUIRE(action != NULL);
2142 
2143 	manager = sock->manager;
2144 	REQUIRE(VALID_MANAGER(manager));
2145 
2146 	iocount = isc_bufferlist_availablecount(buflist);
2147 	REQUIRE(iocount > 0);
2148 
2149 	INSIST(sock->bound);
2150 
2151 	dev = allocate_socketevent(sock,
2152 				   ISC_SOCKEVENT_RECVDONE, action, arg);
2153 	if (dev == NULL)
2154 		return (ISC_R_NOMEMORY);
2155 
2156 	/*
2157 	 * UDP sockets are always partial read
2158 	 */
2159 	if (sock->type == isc_sockettype_udp)
2160 		dev->minimum = 1;
2161 	else {
2162 		if (minimum == 0)
2163 			dev->minimum = iocount;
2164 		else
2165 			dev->minimum = minimum;
2166 	}
2167 
2168 	/*
2169 	 * Move each buffer from the passed in list to our internal one.
2170 	 */
2171 	buffer = ISC_LIST_HEAD(*buflist);
2172 	while (buffer != NULL) {
2173 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2174 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2175 		buffer = ISC_LIST_HEAD(*buflist);
2176 	}
2177 
2178 	return (socket_recv(sock, dev, task, 0));
2179 }
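
/*
 * Usage sketch (not compiled): queue a scatter read and pick up the
 * result in the RECVDONE action.  Assumes the public isc_socket_recvv()
 * wrapper mirrors the signature above and that a socket and task
 * already exist; helper names are hypothetical and error handling is
 * abbreviated.
 */
#if 0
static unsigned char example_space[512];

static void
example_recv_done(isc_task_t *task, isc_event_t *event) {
	isc_socketevent_t *sev = (isc_socketevent_t *)event;

	UNUSED(task);
	INSIST(event->ev_type == ISC_SOCKEVENT_RECVDONE);
	if (sev->result == ISC_R_SUCCESS) {
		/* sev->n bytes arrived in the buffers on sev->bufferlist */
	}
	isc_event_free(&event);
}

static isc_result_t
example_queue_recv(isc_socket_t *sock, isc_task_t *task) {
	static isc_buffer_t b;
	isc_bufferlist_t blist;

	isc_buffer_init(&b, example_space, sizeof(example_space));
	ISC_LIST_INIT(blist);
	ISC_LIST_ENQUEUE(blist, &b, link);
	return (isc_socket_recvv(sock, &blist, 1, task,
				 example_recv_done, NULL));
}
#endif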
2180 
2181 static isc_result_t
2182 socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2183 	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2184 	    unsigned int flags)
2185 {
2186 	int io_state;
2187 	isc_task_t *ntask = NULL;
2188 	isc_result_t result = ISC_R_SUCCESS;
2189 
2190 	dev->ev_sender = task;
2191 
2192 	set_dev_address(address, sock, dev);
2193 	if (pktinfo != NULL) {
2194 		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2195 		dev->pktinfo = *pktinfo;
2196 
2197 		if (!isc_sockaddr_issitelocal(&dev->address) &&
2198 		    !isc_sockaddr_islinklocal(&dev->address)) {
2199 			socket_log(sock, NULL, TRACE,
2200 				   "pktinfo structure provided, ifindex %u "
2201 				   "(set to 0)", pktinfo->ipi6_ifindex);
2202 
2203 			/*
2204 			 * Set the pktinfo index to 0 here, to let the
2205 			 * kernel decide what interface it should send on.
2206 			 */
2207 			dev->pktinfo.ipi6_ifindex = 0;
2208 		}
2209 	}
2210 
2211 	if (sock->type == isc_sockettype_udp)
2212 		io_state = doio_send(sock, dev);
2213 	else {
2214 		if (ISC_LIST_EMPTY(sock->send_list))
2215 			io_state = doio_send(sock, dev);
2216 		else
2217 			io_state = DOIO_SOFT;
2218 	}
2219 
2220 	switch (io_state) {
2221 	case DOIO_SOFT:
2222 		/*
2223 		 * We couldn't complete the send right now (or could only do
2224 		 * part of it), so queue it unless ISC_SOCKFLAG_NORETRY is set.
2225 		 */
2226 		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2227 			isc_task_attach(task, &ntask);
2228 			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2229 
2230 			/*
2231 			 * Enqueue the request.  If the socket was previously
2232 			 * not being watched, poke the watcher to start
2233 			 * paying attention to it.
2234 			 */
2235 			if (ISC_LIST_EMPTY(sock->send_list) &&
2236 			    !sock->pending_send)
2237 				select_poke(sock->manager, sock->fd,
2238 					    SELECT_POKE_WRITE);
2239 			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2240 
2241 			socket_log(sock, NULL, EVENT,
2242 				   "socket_send: event %p -> task %p",
2243 				   dev, ntask);
2244 
2245 			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2246 				result = ISC_R_INPROGRESS;
2247 			break;
2248 		}
2249 
2250 		/* FALLTHROUGH */
2251 
2252 	case DOIO_HARD:
2253 	case DOIO_SUCCESS:
2254 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2255 			send_senddone_event(sock, &dev);
2256 		break;
2257 	}
2258 
2259 	return (result);
2260 }
2261 
2262 isc_result_t
2263 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2264 		  isc_task_t *task, isc_taskaction_t action, void *arg)
2265 {
2266 	return (isc__socket_sendtov2(sock, buflist, task, action, arg, NULL,
2267 				     NULL, 0));
2268 }
2269 
2270 isc_result_t
2271 isc__socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2272 		     isc_task_t *task, isc_taskaction_t action, void *arg,
2273 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2274 		     unsigned int flags)
2275 {
2276 	isc__socket_t *sock = (isc__socket_t *)sock0;
2277 	isc_socketevent_t *dev;
2278 	isc__socketmgr_t *manager;
2279 	unsigned int iocount;
2280 	isc_buffer_t *buffer;
2281 
2282 	REQUIRE(VALID_SOCKET(sock));
2283 	REQUIRE(buflist != NULL);
2284 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2285 	REQUIRE(task != NULL);
2286 	REQUIRE(action != NULL);
2287 
2288 	manager = sock->manager;
2289 	REQUIRE(VALID_MANAGER(manager));
2290 
2291 	iocount = isc_bufferlist_usedcount(buflist);
2292 	REQUIRE(iocount > 0);
2293 
2294 	dev = allocate_socketevent(sock,
2295 				   ISC_SOCKEVENT_SENDDONE, action, arg);
2296 	if (dev == NULL)
2297 		return (ISC_R_NOMEMORY);
2298 
2299 	/*
2300 	 * Move each buffer from the passed in list to our internal one.
2301 	 */
2302 	buffer = ISC_LIST_HEAD(*buflist);
2303 	while (buffer != NULL) {
2304 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2305 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2306 		buffer = ISC_LIST_HEAD(*buflist);
2307 	}
2308 
2309 	return (socket_send(sock, dev, task, address, pktinfo, flags));
2310 }
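
/*
 * Usage sketch (not compiled): the send path mirrors the receive path;
 * buffers are moved onto a SENDDONE event, and address/pktinfo are only
 * needed for unconnected UDP sockets.  Assumes the public
 * isc_socket_sendv() wrapper mirrors isc__socket_sendv() above; the
 * helper name is hypothetical and the done action is analogous to the
 * receive example earlier.
 */
#if 0
static isc_result_t
example_queue_send(isc_socket_t *sock, isc_task_t *task,
		   isc_buffer_t *filled, isc_taskaction_t send_done)
{
	isc_bufferlist_t blist;

	ISC_LIST_INIT(blist);
	ISC_LIST_ENQUEUE(blist, filled, link);
	return (isc_socket_sendv(sock, &blist, task, send_done, NULL));
}
#endif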
2311 
2312 isc_result_t
2313 isc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
2314 		 unsigned int options) {
2315 	isc__socket_t *sock = (isc__socket_t *)sock0;
2316 	char strbuf[ISC_STRERRORSIZE];
2317 	int on = 1;
2318 
2319 	REQUIRE(VALID_SOCKET(sock));
2320 
2321 	INSIST(!sock->bound);
2322 
2323 	if (sock->pf != sockaddr->type.sa.sa_family) {
2324 		return (ISC_R_FAMILYMISMATCH);
2325 	}
2326 
2327 	/*
2328 	 * Only set SO_REUSEADDR when we want a specific port.
2329 	 */
2330 	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
2331 	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
2332 	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
2333 		       sizeof(on)) < 0) {
2334 		UNEXPECTED_ERROR(__FILE__, __LINE__,
2335 				 "setsockopt(%d) failed", sock->fd);
2336 		/* Press on... */
2337 	}
2338 	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
2339 		switch (errno) {
2340 		case EACCES:
2341 			return (ISC_R_NOPERM);
2342 		case EADDRNOTAVAIL:
2343 			return (ISC_R_ADDRNOTAVAIL);
2344 		case EADDRINUSE:
2345 			return (ISC_R_ADDRINUSE);
2346 		case EINVAL:
2347 			return (ISC_R_BOUND);
2348 		default:
2349 			isc__strerror(errno, strbuf, sizeof(strbuf));
2350 			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
2351 					 strbuf);
2352 			return (ISC_R_UNEXPECTED);
2353 		}
2354 	}
2355 
2356 	socket_log(sock, sockaddr, TRACE, "bound");
2357 	sock->bound = 1;
2358 
2359 	return (ISC_R_SUCCESS);
2360 }
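
/*
 * Illustrative sketch (not compiled): the same "SO_REUSEADDR only when a
 * specific port is requested, then bind" pattern in plain POSIX terms,
 * without the ISC_R_* errno mapping.  The helper name is hypothetical.
 */
#if 0
static int
example_bind(int fd, const struct sockaddr *sa, socklen_t salen,
	     in_port_t port)
{
	int on = 1;

	if (port != 0 &&
	    setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0) {
		/* non-fatal: press on, as isc__socket_bind() does */
	}
	return (bind(fd, sa, salen));	/* on failure the caller checks errno */
}
#endif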
2361 
2362 isc_result_t
2363 isc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
2364 		   isc_task_t *task, isc_taskaction_t action, void *arg)
2365 {
2366 	isc__socket_t *sock = (isc__socket_t *)sock0;
2367 	isc_socket_connev_t *dev;
2368 	isc_task_t *ntask = NULL;
2369 	isc__socketmgr_t *manager;
2370 	int cc;
2371 	char strbuf[ISC_STRERRORSIZE];
2372 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
2373 
2374 	REQUIRE(VALID_SOCKET(sock));
2375 	REQUIRE(addr != NULL);
2376 	REQUIRE(task != NULL);
2377 	REQUIRE(action != NULL);
2378 
2379 	manager = sock->manager;
2380 	REQUIRE(VALID_MANAGER(manager));
2381 	REQUIRE(addr != NULL);
2382 
2383 	if (isc_sockaddr_ismulticast(addr))
2384 		return (ISC_R_MULTICAST);
2385 
2386 	REQUIRE(!sock->connecting);
2387 
2388 	dev = (isc_socket_connev_t *)isc_event_allocate(sock,
2389 							ISC_SOCKEVENT_CONNECT,
2390 							action,	arg,
2391 							sizeof(*dev));
2392 	if (dev == NULL) {
2393 		return (ISC_R_NOMEMORY);
2394 	}
2395 	ISC_LINK_INIT(dev, ev_link);
2396 
2397 	/*
2398 	 * Try to do the connect right away, as there can be only one
2399 	 * outstanding, and it might happen to complete.
2400 	 */
2401 	sock->peer_address = *addr;
2402 	cc = connect(sock->fd, &addr->type.sa, addr->length);
2403 	if (cc < 0) {
2404 		/*
2405 		 * HP-UX "fails" to connect a UDP socket and sets errno to
2406 		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
2407 		 * a success and let the user detect it if it's really an error
2408 		 * at the time of sending a packet on the socket.
2409 		 */
2410 		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
2411 			cc = 0;
2412 			goto success;
2413 		}
2414 		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
2415 			goto queue;
2416 
2417 		switch (errno) {
2418 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
2419 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2420 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2421 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2422 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2423 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2424 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2425 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2426 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2427 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2428 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2429 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2430 #undef ERROR_MATCH
2431 		}
2432 
2433 		sock->connected = 0;
2434 
2435 		isc__strerror(errno, strbuf, sizeof(strbuf));
2436 		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
2437 		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
2438 				 addrbuf, errno, strbuf);
2439 
2440 		isc_event_free(ISC_EVENT_PTR(&dev));
2441 		return (ISC_R_UNEXPECTED);
2442 
2443 	err_exit:
2444 		sock->connected = 0;
2445 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2446 
2447 		return (ISC_R_SUCCESS);
2448 	}
2449 
2450 	/*
2451 	 * If connect completed, fire off the done event.
2452 	 */
2453  success:
2454 	if (cc == 0) {
2455 		sock->connected = 1;
2456 		sock->bound = 1;
2457 		dev->result = ISC_R_SUCCESS;
2458 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2459 
2460 		return (ISC_R_SUCCESS);
2461 	}
2462 
2463  queue:
2464 
2465 	/*
2466 	 * Attach to task.
2467 	 */
2468 	isc_task_attach(task, &ntask);
2469 
2470 	sock->connecting = 1;
2471 
2472 	dev->ev_sender = ntask;
2473 
2474 	/*
2475 	 * Poke the watcher here.  We still have the socket locked, so there
2476 	 * is no race condition.  We will hold the lock for such a short
2477 	 * time that waking it up now or later won't matter all that much.
2478 	 */
2479 	if (sock->connect_ev == NULL)
2480 		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
2481 
2482 	sock->connect_ev = dev;
2483 
2484 	return (ISC_R_SUCCESS);
2485 }
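
/*
 * Illustrative sketch (not compiled): starting a non-blocking connect in
 * plain POSIX terms.  EINPROGRESS (or a soft error) means "wait until the
 * descriptor becomes writable", which is what the SELECT_POKE_CONNECT
 * path above arranges.  The helper name is hypothetical.
 */
#if 0
static int
example_start_connect(int fd, const struct sockaddr *sa, socklen_t salen) {
	if (connect(fd, sa, salen) == 0)
		return (0);		/* completed immediately */
	if (errno == EINPROGRESS || errno == EINTR || errno == EAGAIN)
		return (1);		/* pending: select() for writability */
	return (-1);			/* hard failure, see errno */
}
#endif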
2486 
2487 /*
2488  * Called when a socket with a pending connect() finishes.
2489  */
2490 static void
2491 internal_connect(isc_task_t *me, isc_event_t *ev) {
2492 	isc__socket_t *sock;
2493 	isc_socket_connev_t *dev;
2494 	isc_task_t *task;
2495 	int cc;
2496 	socklen_t optlen;
2497 	char strbuf[ISC_STRERRORSIZE];
2498 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
2499 
2500 	UNUSED(me);
2501 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2502 
2503 	sock = ev->ev_sender;
2504 	INSIST(VALID_SOCKET(sock));
2505 
2506 	/*
2507 	 * When the internal event was sent the reference count was bumped
2508 	 * to keep the socket around for us.  Decrement the count here.
2509 	 */
2510 	INSIST(sock->references > 0);
2511 	sock->references--;
2512 	if (sock->references == 0) {
2513 		destroy(&sock);
2514 		return;
2515 	}
2516 
2517 	/*
2518 	 * Has this event been canceled?
2519 	 */
2520 	dev = sock->connect_ev;
2521 	if (dev == NULL) {
2522 		INSIST(!sock->connecting);
2523 		return;
2524 	}
2525 
2526 	INSIST(sock->connecting);
2527 	sock->connecting = 0;
2528 
2529 	/*
2530 	 * Get any possible error status here.
2531 	 */
2532 	optlen = sizeof(cc);
2533 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
2534 		       (void *)&cc, (void *)&optlen) < 0)
2535 		cc = errno;
2536 	else
2537 		errno = cc;
2538 
2539 	if (errno != 0) {
2540 		/*
2541 		 * If the error is EAGAIN, just re-select on this
2542 		 * fd and pretend nothing strange happened.
2543 		 */
2544 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
2545 			sock->connecting = 1;
2546 			select_poke(sock->manager, sock->fd,
2547 				    SELECT_POKE_CONNECT);
2548 			return;
2549 		}
2550 
2551 
2552 		/*
2553 		 * Translate other errors into ISC_R_* flavors.
2554 		 */
2555 		switch (errno) {
2556 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
2557 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2558 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2559 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2560 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2561 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2562 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2563 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2564 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2565 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2566 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2567 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
2568 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2569 #undef ERROR_MATCH
2570 		default:
2571 			dev->result = ISC_R_UNEXPECTED;
2572 			isc_sockaddr_format(&sock->peer_address, peerbuf,
2573 					    sizeof(peerbuf));
2574 			isc__strerror(errno, strbuf, sizeof(strbuf));
2575 			UNEXPECTED_ERROR(__FILE__, __LINE__,
2576 					 "internal_connect: connect(%s) %s",
2577 					 peerbuf, strbuf);
2578 		}
2579 	} else {
2580 		dev->result = ISC_R_SUCCESS;
2581 		sock->connected = 1;
2582 		sock->bound = 1;
2583 	}
2584 
2585 	sock->connect_ev = NULL;
2586 
2587 	task = dev->ev_sender;
2588 	dev->ev_sender = sock;
2589 	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2590 }
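
/*
 * Illustrative sketch (not compiled): once select() reports the
 * descriptor writable, the deferred connect() result is read with
 * getsockopt(SO_ERROR), exactly as internal_connect() does above; zero
 * means success, anything else is the stored errno.  The helper name is
 * hypothetical.
 */
#if 0
static int
example_finish_connect(int fd) {
	int err = 0;
	socklen_t optlen = sizeof(err);

	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &optlen) < 0)
		return (errno);		/* getsockopt itself failed */
	return (err);			/* 0 on success, else the connect errno */
}
#endif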
2591 
2592 /*
2593  * Run through the list of events on this socket, and cancel the ones
2594  * queued for task "task" of type "how".  "how" is a bitmask.
2595  */
2596 void
2597 isc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
2598 	isc__socket_t *sock = (isc__socket_t *)sock0;
2599 
2600 	REQUIRE(VALID_SOCKET(sock));
2601 
2602 	/*
2603 	 * Quick exit if there is nothing to do.  Don't even bother locking
2604 	 * in this case.
2605 	 */
2606 	if (how == 0)
2607 		return;
2608 
2609 	/*
2610 	 * All of these do the same thing, more or less.
2611 	 * Each will:
2612 	 *	o If the internal event is marked as "posted" try to
2613 	 *	  remove it from the task's queue.  If this fails, mark it
2614 	 *	  as canceled instead, and let the task clean it up later.
2615 	 *	o For each I/O request for that task of that type, post
2616 	 *	  its done event with status of "ISC_R_CANCELED".
2617 	 *	o Reset any state needed.
2618 	 */
2619 	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
2620 	    && !ISC_LIST_EMPTY(sock->recv_list)) {
2621 		isc_socketevent_t      *dev;
2622 		isc_socketevent_t      *next;
2623 		isc_task_t	       *current_task;
2624 
2625 		dev = ISC_LIST_HEAD(sock->recv_list);
2626 
2627 		while (dev != NULL) {
2628 			current_task = dev->ev_sender;
2629 			next = ISC_LIST_NEXT(dev, ev_link);
2630 
2631 			if ((task == NULL) || (task == current_task)) {
2632 				dev->result = ISC_R_CANCELED;
2633 				send_recvdone_event(sock, &dev);
2634 			}
2635 			dev = next;
2636 		}
2637 	}
2638 
2639 	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
2640 	    && !ISC_LIST_EMPTY(sock->send_list)) {
2641 		isc_socketevent_t      *dev;
2642 		isc_socketevent_t      *next;
2643 		isc_task_t	       *current_task;
2644 
2645 		dev = ISC_LIST_HEAD(sock->send_list);
2646 
2647 		while (dev != NULL) {
2648 			current_task = dev->ev_sender;
2649 			next = ISC_LIST_NEXT(dev, ev_link);
2650 
2651 			if ((task == NULL) || (task == current_task)) {
2652 				dev->result = ISC_R_CANCELED;
2653 				send_senddone_event(sock, &dev);
2654 			}
2655 			dev = next;
2656 		}
2657 	}
2658 
2659 	/*
2660 	 * Connecting is not a list.
2661 	 */
2662 	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
2663 	    && sock->connect_ev != NULL) {
2664 		isc_socket_connev_t    *dev;
2665 		isc_task_t	       *current_task;
2666 
2667 		INSIST(sock->connecting);
2668 		sock->connecting = 0;
2669 
2670 		dev = sock->connect_ev;
2671 		current_task = dev->ev_sender;
2672 
2673 		if ((task == NULL) || (task == current_task)) {
2674 			sock->connect_ev = NULL;
2675 
2676 			dev->result = ISC_R_CANCELED;
2677 			dev->ev_sender = sock;
2678 			isc_task_sendanddetach(&current_task,
2679 					       ISC_EVENT_PTR(&dev));
2680 		}
2681 	}
2682 
2683 }
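
/*
 * Usage sketch (not compiled): cancel every pending read and write for
 * one task.  The queued RECVDONE/SENDDONE actions still run, but with
 * result == ISC_R_CANCELED, so cleanup stays in one place.  Assumes the
 * public isc_socket_cancel() wrapper has the same signature as
 * isc__socket_cancel() above; the helper name is hypothetical.
 */
#if 0
static void
example_cancel_io(isc_socket_t *sock, isc_task_t *task) {
	isc_socket_cancel(sock, task,
			  ISC_SOCKCANCEL_RECV | ISC_SOCKCANCEL_SEND);
}
#endif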
2684 
2685 /*
2686  * In our assumed scenario, we can simply use a single static object.
2687  * XXX: this is not true if the application uses multiple threads with
2688  *      'multi-context' mode.  Fixing this is a future TODO item.
2689  */
2690 static isc_socketwait_t swait_private;
2691 
2692 int
2693 isc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
2694 			  isc_socketwait_t **swaitp)
2695 {
2696 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2697 	int n;
2698 
2699 	REQUIRE(swaitp != NULL && *swaitp == NULL);
2700 
2701 	if (manager == NULL)
2702 		manager = socketmgr;
2703 	if (manager == NULL)
2704 		return (0);
2705 
2706 	memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize);
2707 	memmove(manager->write_fds_copy, manager->write_fds,
2708 		manager->fd_bufsize);
2709 
2710 	swait_private.readset = manager->read_fds_copy;
2711 	swait_private.writeset = manager->write_fds_copy;
2712 	swait_private.maxfd = manager->maxfd + 1;
2713 
2714 	n = select(swait_private.maxfd, swait_private.readset,
2715 		   swait_private.writeset, NULL, tvp);
2716 
2717 	*swaitp = &swait_private;
2718 	return (n);
2719 }
2720 
2721 isc_result_t
2722 isc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
2723 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2724 
2725 	REQUIRE(swait == &swait_private);
2726 
2727 	if (manager == NULL)
2728 		manager = socketmgr;
2729 	if (manager == NULL)
2730 		return (ISC_R_NOTFOUND);
2731 
2732 	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
2733 	return (ISC_R_SUCCESS);
2734 }
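
/*
 * Usage sketch (not compiled): the wait/dispatch cycle a caller runs
 * around the two functions above.  waitevents() snapshots the fd sets
 * and calls select(); dispatch() then walks the ready descriptors.  How
 * often isc__taskmgr_dispatch() must be called to drain task events is
 * an assumption here, not something this file specifies; the helper
 * name is hypothetical.
 */
#if 0
static void
example_event_loop(isc_socketmgr_t *mgr) {
	isc_socketwait_t *swait;
	struct timeval tv;
	int n;

	for (;;) {
		/* Run task events that are already queued. */
		(void)isc__taskmgr_dispatch(NULL);

		tv.tv_sec = 1;
		tv.tv_usec = 0;
		swait = NULL;
		n = isc__socketmgr_waitevents(mgr, &tv, &swait);
		if (n > 0)
			(void)isc__socketmgr_dispatch(mgr, swait);
	}
}
#endif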
2735 
2736 #include "../socket_api.c"
2737