xref: /openbsd-src/usr.bin/dig/lib/isc/unix/socket.c (revision 77d843245b4a0a4806bdb972be24cbe48da36f94)
1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
9  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
11  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
13  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14  * PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 /*! \file */
18 
19 #include <sys/param.h>
20 #include <sys/types.h>
21 #include <sys/event.h>
22 #include <sys/socket.h>
23 #include <sys/stat.h>
24 #include <sys/time.h>
25 #include <sys/uio.h>
26 #include <sys/un.h>
27 
28 #include <netinet/tcp.h>
29 
30 #include <errno.h>
31 #include <fcntl.h>
32 #include <stddef.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <unistd.h>
36 #include <inttypes.h> /* uintptr_t */
37 
38 #include <isc/buffer.h>
39 #include <isc/bufferlist.h>
40 #include <isc/formatcheck.h>
41 #include <isc/list.h>
42 #include <isc/log.h>
43 #include <isc/msgs.h>
44 #include <isc/net.h>
45 #include <isc/region.h>
46 #include <isc/socket.h>
47 #include <isc/strerror.h>
48 #include <isc/task.h>
49 #include <isc/util.h>
50 
51 #include "errno2result.h"
52 
53 #include "socket_p.h"
54 #include "../task_p.h"
55 
56 struct isc_socketwait {
57 	fd_set *readset;
58 	fd_set *writeset;
59 	int nfds;
60 	int maxfd;
61 };
62 
63 /*
64  * Set by the -T dscp option on the command line. If set to a value
65  * other than -1, we check to make sure DSCP values match it, and
66  * assert if not.
67  */
68 int isc_dscp_check_value = -1;
69 
70 /*%
71  * Size of per-FD lock buckets.
72  */
73 #define FDLOCK_COUNT		1
74 #define FDLOCK_ID(fd)		0
75 
76 /*%
77  * Some systems define the socket length argument as an int, some as size_t,
78  * some as socklen_t.  This is here so it can be easily changed if needed.
79  */
80 
81 /*%
82  * Define what the possible "soft" errors can be.  These are non-fatal returns
83  * of various network related functions, like recv() and so on.
84  *
85  * For some reason, BSDI (and perhaps others) will sometimes return <0
86  * from recv() but will have errno==0.  This is broken, but we have to
87  * work around it here.
88  */
89 #define SOFT_ERROR(e)	((e) == EAGAIN || \
90 			 (e) == EWOULDBLOCK || \
91 			 (e) == EINTR || \
92 			 (e) == 0)
93 
94 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
95 
96 /*!<
97  * DLVL(90)  --  Function entry/exit and other tracing.
98  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
99  * DLVL(60)  --  Socket data send/receive
100  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
101  * DLVL(20)  --  Socket creation/destruction.
102  */
103 #define TRACE_LEVEL		90
104 #define CORRECTNESS_LEVEL	70
105 #define IOEVENT_LEVEL		60
106 #define EVENT_LEVEL		50
107 #define CREATION_LEVEL		20
108 
109 #define TRACE		DLVL(TRACE_LEVEL)
110 #define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
111 #define IOEVENT		DLVL(IOEVENT_LEVEL)
112 #define EVENT		DLVL(EVENT_LEVEL)
113 #define CREATION	DLVL(CREATION_LEVEL)
114 
115 typedef isc_event_t intev_t;
116 
117 #define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
118 #define VALID_SOCKET(s)		ISC_MAGIC_VALID(s, SOCKET_MAGIC)
119 
120 /*!
121  * IPv6 control information.  If the socket is an IPv6 socket we want
122  * to collect the destination address and interface so the client can
123  * set them on outgoing packets.
124  */
125 
126 /*%
127  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
128  * a setsockopt() like interface to request timestamps, and if the OS
129  * doesn't do it for us, call gettimeofday() on every UDP receive?
130  */
131 
132 /*%
133  * The size to raise the receive buffer to (from BIND 8).
134  */
135 #define RCVBUFSIZE (32*1024)
136 
137 /*%
138  * Instead of calculating the cmsg buffer lengths every time, we take
139  * a rule-of-thumb approach: the sizes are taken from x86_64 Linux and
140  * doubled, so everything should fit.  The resulting buffers are small
141  * enough not to be a concern.
142  */
143 #define CMSG_SP_IN6PKT 40
144 
145 #define CMSG_SP_TIMESTAMP 32
146 
147 #define CMSG_SP_TCTOS 24
148 
149 #define CMSG_SP_INT 24
150 
151 #define RECVCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS)+1)
152 #define SENDCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS)+1)
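/*
 * With the sizes above this works out to RECVCMSGBUFLEN =
 * 2*(40+32+24)+1 = 193 bytes and SENDCMSGBUFLEN = 2*(40+24+24)+1 = 177
 * bytes, comfortably larger than the ancillary data ever attached here.
 */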
153 
154 /*%
155  * The number of times a send operation is repeated if the result is EINTR.
156  */
157 #define NRETRIES 10
158 
159 typedef struct isc__socket isc__socket_t;
160 typedef struct isc__socketmgr isc__socketmgr_t;
161 
162 #define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)
163 
164 struct isc__socket {
165 	/* Not locked. */
166 	isc_socket_t		common;
167 	isc__socketmgr_t	*manager;
168 	isc_sockettype_t	type;
169 
170 	/* Locked by socket lock. */
171 	ISC_LINK(isc__socket_t)	link;
172 	unsigned int		references;
173 	int			fd;
174 	int			pf;
175 
176 	ISC_LIST(isc_socketevent_t)		send_list;
177 	ISC_LIST(isc_socketevent_t)		recv_list;
178 	isc_socket_connev_t		       *connect_ev;
179 
180 	/*
181 	 * Internal events.  Posted when a descriptor is readable or
182 	 * writable.  These are statically allocated and never freed.
183 	 * They will be set to non-purgeable before use.
184 	 */
185 	intev_t			readable_ev;
186 	intev_t			writable_ev;
187 
188 	isc_sockaddr_t		peer_address;       /* remote address */
189 
190 	unsigned int		pending_recv : 1,
191 				pending_send : 1,
192 				connected : 1,
193 				connecting : 1,     /* connect pending */
194 				bound : 1,          /* bound to local addr */
195 				active : 1,         /* currently active */
196 				pktdscp : 1;	    /* per packet dscp */
197 	unsigned int		dscp;
198 };
199 
200 #define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
201 #define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
202 
203 struct isc__socketmgr {
204 	/* Not locked. */
205 	isc_socketmgr_t		common;
206 	int			fd_bufsize;
207 	unsigned int		maxsocks;
208 
209 	isc__socket_t	       **fds;
210 	int			*fdstate;
211 
212 	/* Locked by manager lock. */
213 	ISC_LIST(isc__socket_t)	socklist;
214 	fd_set			*read_fds;
215 	fd_set			*read_fds_copy;
216 	fd_set			*write_fds;
217 	fd_set			*write_fds_copy;
218 	int			maxfd;
219 	unsigned int		refs;
220 };
221 
222 static isc__socketmgr_t *socketmgr = NULL;
223 
224 #define CLOSED			0	/* this one must be zero */
225 #define MANAGED			1
226 #define CLOSE_PENDING		2
227 
228 /*
229  * send() and recv() iovec counts
230  */
231 #define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
232 #define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
233 
234 static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
235 				  isc_sockettype_t type,
236 				  isc_socket_t **socketp);
237 static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
238 static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
239 static void free_socket(isc__socket_t **);
240 static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
241 				    isc__socket_t **);
242 static void destroy(isc__socket_t **);
243 static void internal_connect(isc_task_t *, isc_event_t *);
244 static void internal_recv(isc_task_t *, isc_event_t *);
245 static void internal_send(isc_task_t *, isc_event_t *);
246 static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
247 static void build_msghdr_send(isc__socket_t *, char *, isc_socketevent_t *,
248 			      struct msghdr *, struct iovec *, size_t *);
249 static void build_msghdr_recv(isc__socket_t *, char *, isc_socketevent_t *,
250 			      struct msghdr *, struct iovec *, size_t *);
251 
252 /*%
253  * The following are intended for internal use (indicated by "isc__"
254  * prefix) but are not declared as static, allowing direct access from
255  * unit tests etc.
256  */
257 
258 isc_result_t
259 isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
260 		   isc_socket_t **socketp);
261 void
262 isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
263 void
264 isc__socket_detach(isc_socket_t **socketp);
265 isc_result_t
266 isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
267 		 unsigned int minimum, isc_task_t *task,
268 		  isc_taskaction_t action, void *arg);
269 isc_result_t
270 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
271 		  isc_task_t *task, isc_taskaction_t action, void *arg);
272 isc_result_t
273 isc__socket_sendtov2(isc_socket_t *sock, isc_bufferlist_t *buflist,
274 		     isc_task_t *task, isc_taskaction_t action, void *arg,
275 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
276 		     unsigned int flags);
277 isc_result_t
278 isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
279 		 unsigned int options);
280 isc_result_t
281 isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
282 		    isc_task_t *task, isc_taskaction_t action,
283 		    void *arg);
284 void
285 isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
286 
287 isc_result_t
288 isc__socketmgr_create(isc_socketmgr_t **managerp);
289 isc_result_t
290 isc__socketmgr_create2(isc_socketmgr_t **managerp,
291 		       unsigned int maxsocks);
292 isc_result_t
293 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp);
294 void
295 isc__socketmgr_destroy(isc_socketmgr_t **managerp);
296 
297 static struct {
298 	isc_socketmethods_t methods;
299 
300 	/*%
301 	 * The following are defined just to keep references to otherwise unused functions.
302 	 */
303 	void *recvv, *sendv;
304 } socketmethods = {
305 	{
306 		isc__socket_attach,
307 		isc__socket_detach,
308 		isc__socket_bind,
309 		isc__socket_connect,
310 		isc__socket_cancel,
311 	},
312 	(void *)isc__socket_recvv,
313 	(void *)isc__socket_sendv,
314 };
315 
316 static isc_socketmgrmethods_t socketmgrmethods = {
317 	isc__socketmgr_destroy,
318 	isc__socket_create
319 };
320 
321 #define SELECT_POKE_SHUTDOWN		(-1)
322 #define SELECT_POKE_NOTHING		(-2)
323 #define SELECT_POKE_READ		(-3)
324 #define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
325 #define SELECT_POKE_WRITE		(-4)
326 #define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
327 #define SELECT_POKE_CLOSE		(-5)
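
/*
 * _ACCEPT shares the _READ value and _CONNECT shares the _WRITE value
 * because select() reports a pending accept as readability and a
 * completed connect as writability on the descriptor.
 */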
328 
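/*
 * A socket with no remaining references is "dead"; process_fd() uses
 * this check to avoid dispatching new work to a socket that is about
 * to be destroyed.
 */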
329 #define SOCK_DEAD(s)			((s)->references == 0)
330 
331 /*%
332  * Shortcut index arrays to get access to statistics counters.
333  */
334 enum {
335 	STATID_OPEN = 0,
336 	STATID_OPENFAIL = 1,
337 	STATID_CLOSE = 2,
338 	STATID_BINDFAIL = 3,
339 	STATID_CONNECTFAIL = 4,
340 	STATID_CONNECT = 5,
341 	STATID_ACCEPTFAIL = 6,
342 	STATID_ACCEPT = 7,
343 	STATID_SENDFAIL = 8,
344 	STATID_RECVFAIL = 9,
345 	STATID_ACTIVE = 10
346 };
347 
348 
349 static void
350 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
351 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
352 	   const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);
353 static void
354 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
355 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
356 	   const char *fmt, ...)
357 {
358 	char msgbuf[2048];
359 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
360 	va_list ap;
361 
362 	if (! isc_log_wouldlog(isc_lctx, level))
363 		return;
364 
365 	va_start(ap, fmt);
366 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
367 	va_end(ap);
368 
369 	if (address == NULL) {
370 		isc_log_write(isc_lctx, category, module, level,
371 			       "socket %p: %s", sock, msgbuf);
372 	} else {
373 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
374 		isc_log_write(isc_lctx, category, module, level,
375 			       "socket %p %s: %s", sock, peerbuf, msgbuf);
376 	}
377 }
378 
379 static inline isc_result_t
380 watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
381 	isc_result_t result = ISC_R_SUCCESS;
382 
383 	if (msg == SELECT_POKE_READ)
384 		FD_SET(fd, manager->read_fds);
385 	if (msg == SELECT_POKE_WRITE)
386 		FD_SET(fd, manager->write_fds);
387 
388 	return (result);
389 }
390 
391 static inline isc_result_t
392 unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
393 	isc_result_t result = ISC_R_SUCCESS;
394 
395 	if (msg == SELECT_POKE_READ)
396 		FD_CLR(fd, manager->read_fds);
397 	else if (msg == SELECT_POKE_WRITE)
398 		FD_CLR(fd, manager->write_fds);
399 
400 	return (result);
401 }
402 
403 static void
404 wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
405 	isc_result_t result;
406 
407 	/*
408 	 * This is a wakeup on a socket.  If the socket is not in the
409 	 * process of being closed, start watching it for either reads
410 	 * or writes.
411 	 */
412 
413 	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
414 
415 	if (msg == SELECT_POKE_CLOSE) {
416 		/* No one should be updating fdstate, so no need to lock it */
417 		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
418 		manager->fdstate[fd] = CLOSED;
419 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
420 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
421 		(void)close(fd);
422 		return;
423 	}
424 
425 	if (manager->fdstate[fd] == CLOSE_PENDING) {
426 
427 		/*
428 		 * We accept (and ignore) any error from unwatch_fd() as we are
429 		 * closing the socket, hoping it doesn't leave dangling state in
430 		 * the kernel.
431 		 */
432 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
433 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
434 		return;
435 	}
436 	if (manager->fdstate[fd] != MANAGED) {
437 		return;
438 	}
439 
440 	/*
441 	 * Set requested bit.
442 	 */
443 	result = watch_fd(manager, fd, msg);
444 	if (result != ISC_R_SUCCESS) {
445 		/*
446 		 * XXXJT: what should we do?  Ignoring the failure of watching
447 		 * a socket will make the application dysfunctional, but there
448 		 * seems to be no reasonable recovery process.
449 		 */
450 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
451 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
452 			      "failed to start watching FD (%d): %s",
453 			      fd, isc_result_totext(result));
454 	}
455 }
456 
457 /*
458  * Update the state of the socketmgr when something changes.
459  */
460 static void
461 select_poke(isc__socketmgr_t *manager, int fd, int msg) {
462 	if (msg == SELECT_POKE_SHUTDOWN)
463 		return;
464 	else if (fd >= 0)
465 		wakeup_socket(manager, fd, msg);
466 	return;
467 }
468 
469 /*
470  * Make a fd non-blocking.
471  */
472 static isc_result_t
473 make_nonblock(int fd) {
474 	int ret;
475 	char strbuf[ISC_STRERRORSIZE];
476 	int flags;
477 
478 	flags = fcntl(fd, F_GETFL, 0);
479 	flags |= O_NONBLOCK;
480 	ret = fcntl(fd, F_SETFL, flags);
481 
482 	if (ret == -1) {
483 		isc__strerror(errno, strbuf, sizeof(strbuf));
484 		UNEXPECTED_ERROR(__FILE__, __LINE__,
485 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
486 				 strbuf);
487 
488 		return (ISC_R_UNEXPECTED);
489 	}
490 
491 	return (ISC_R_SUCCESS);
492 }
493 
494 /*
495  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
496  * In order to ensure as much portability as possible, we provide wrapper
497  * functions for these macros.
498  * On this platform the wrappers simply expand to the system-provided
499  * CMSG_LEN and CMSG_SPACE macros.
500  */
501 static inline socklen_t
502 cmsg_len(socklen_t len) {
503 	return (CMSG_LEN(len));
504 }
505 
506 static inline socklen_t
507 cmsg_space(socklen_t len) {
508 	return (CMSG_SPACE(len));
509 }
510 
511 /*
512  * Process control messages received on a socket.
513  */
514 static void
515 process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
516 	struct cmsghdr *cmsgp;
517 	struct in6_pktinfo *pktinfop;
518 	void *timevalp;
519 
520 	/*
521 	 * In the upstream code, sock, msg and dev are referenced only under
522 	 * certain CPP conditionals, hence the UNUSED() markers below.  In
523 	 * this version all three are always used below, so the markers
524 	 * are harmless no-ops.
525 	 */
526 	UNUSED(sock);
527 	UNUSED(msg);
528 	UNUSED(dev);
529 
530 	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
531 		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
532 
533 	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
534 		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
535 
536 	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
537 		return;
538 
539 	timevalp = NULL;
540 	pktinfop = NULL;
541 
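	/*
	 * Walk the ancillary data.  We are interested in IPV6_PKTINFO
	 * (destination address and arrival interface), SCM_TIMESTAMP
	 * (kernel receive time) and IPV6_TCLASS / IP_TOS (the DSCP value
	 * of the received packet).
	 */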
542 	cmsgp = CMSG_FIRSTHDR(msg);
543 	while (cmsgp != NULL) {
544 		socket_log(sock, NULL, TRACE,
545 			   "processing cmsg %p", cmsgp);
546 
547 		if (cmsgp->cmsg_level == IPPROTO_IPV6
548 		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
549 
550 			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
551 			memmove(&dev->pktinfo, pktinfop,
552 				sizeof(struct in6_pktinfo));
553 			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
554 			socket_log(sock, NULL, TRACE,
555 				   "interface received on ifindex %u",
556 				   dev->pktinfo.ipi6_ifindex);
557 			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
558 				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
559 			goto next;
560 		}
561 
562 		if (cmsgp->cmsg_level == SOL_SOCKET
563 		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
564 			struct timeval tv;
565 			timevalp = CMSG_DATA(cmsgp);
566 			memmove(&tv, timevalp, sizeof(tv));
567 			dev->timestamp.seconds = tv.tv_sec;
568 			dev->timestamp.nanoseconds = tv.tv_usec * 1000;
569 			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
570 			goto next;
571 		}
572 
573 		if (cmsgp->cmsg_level == IPPROTO_IPV6
574 		    && cmsgp->cmsg_type == IPV6_TCLASS) {
575 			dev->dscp = *(int *)CMSG_DATA(cmsgp);
576 			dev->dscp >>= 2;
577 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
578 			goto next;
579 		}
580 
581 		if (cmsgp->cmsg_level == IPPROTO_IP
582 		    && (cmsgp->cmsg_type == IP_TOS)) {
583 			dev->dscp = (int) *(unsigned char *)CMSG_DATA(cmsgp);
584 			dev->dscp >>= 2;
585 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
586 			goto next;
587 		}
588 	next:
589 		cmsgp = CMSG_NXTHDR(msg, cmsgp);
590 	}
591 
592 }
593 
594 /*
595  * Construct an iov array and attach it to the msghdr passed in.  This is
596  * the SEND constructor, which will use the used region of the buffer
597  * (if using a buffer list) or will use the internal region (if a single
598  * buffer I/O is requested).
599  *
600  * Nothing can be NULL, and the done event must list at least one buffer
601  * on the buffer linked list (or describe a region) to be meaningful.
602  *
603  * If write_countp != NULL, *write_countp will hold the number of bytes
604  * this transaction can send.
605  */
606 static void
607 build_msghdr_send(isc__socket_t *sock, char* cmsgbuf, isc_socketevent_t *dev,
608 		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
609 {
610 	unsigned int iovcount;
611 	isc_buffer_t *buffer;
612 	isc_region_t used;
613 	size_t write_count;
614 	size_t skip_count;
615 	struct cmsghdr *cmsgp;
616 
617 	memset(msg, 0, sizeof(*msg));
618 
619 	if (!sock->connected) {
620 		msg->msg_name = (void *)&dev->address.type.sa;
621 		msg->msg_namelen = dev->address.length;
622 	} else {
623 		msg->msg_name = NULL;
624 		msg->msg_namelen = 0;
625 	}
626 
627 	buffer = ISC_LIST_HEAD(dev->bufferlist);
628 	write_count = 0;
629 	iovcount = 0;
630 
631 	/*
632 	 * Single buffer I/O?  Skip what we've done so far in this region.
633 	 */
634 	if (buffer == NULL) {
635 		write_count = dev->region.length - dev->n;
636 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
637 		iov[0].iov_len = write_count;
638 		iovcount = 1;
639 
640 		goto config;
641 	}
642 
643 	/*
644 	 * Multibuffer I/O.
645 	 * Skip the data in the buffer list that we have already written.
646 	 */
647 	skip_count = dev->n;
648 	while (buffer != NULL) {
649 		REQUIRE(ISC_BUFFER_VALID(buffer));
650 		if (skip_count < isc_buffer_usedlength(buffer))
651 			break;
652 		skip_count -= isc_buffer_usedlength(buffer);
653 		buffer = ISC_LIST_NEXT(buffer, link);
654 	}
655 
656 	while (buffer != NULL) {
657 		INSIST(iovcount < MAXSCATTERGATHER_SEND);
658 
659 		isc_buffer_usedregion(buffer, &used);
660 
661 		if (used.length > 0) {
662 			iov[iovcount].iov_base = (void *)(used.base
663 							  + skip_count);
664 			iov[iovcount].iov_len = used.length - skip_count;
665 			write_count += (used.length - skip_count);
666 			skip_count = 0;
667 			iovcount++;
668 		}
669 		buffer = ISC_LIST_NEXT(buffer, link);
670 	}
671 
672 	INSIST(skip_count == 0U);
673 
674  config:
675 	msg->msg_iov = iov;
676 	msg->msg_iovlen = iovcount;
677 
678 	msg->msg_control = NULL;
679 	msg->msg_controllen = 0;
680 	msg->msg_flags = 0;
681 
682 	if ((sock->type == isc_sockettype_udp) &&
683 	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
684 	{
685 		struct in6_pktinfo *pktinfop;
686 
687 		socket_log(sock, NULL, TRACE,
688 			   "sendto pktinfo data, ifindex %u",
689 			   dev->pktinfo.ipi6_ifindex);
690 
691 		msg->msg_control = (void *)cmsgbuf;
692 		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
693 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
694 
695 		cmsgp = (struct cmsghdr *)cmsgbuf;
696 		cmsgp->cmsg_level = IPPROTO_IPV6;
697 		cmsgp->cmsg_type = IPV6_PKTINFO;
698 		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
699 		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
700 		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
701 	}
702 
703 	if ((sock->type == isc_sockettype_udp) &&
704 	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
705 	{
706 		int use_min_mtu = 1;	/* -1, 0, 1 */
707 
708 		cmsgp = (struct cmsghdr *)(cmsgbuf +
709 					   msg->msg_controllen);
710 
711 		msg->msg_control = (void *)cmsgbuf;
712 		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
713 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
714 
715 		cmsgp->cmsg_level = IPPROTO_IPV6;
716 		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
717 		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
718 		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
719 	}
720 
721 	if (isc_dscp_check_value > -1) {
722 		if (sock->type == isc_sockettype_udp)
723 			INSIST((int)dev->dscp == isc_dscp_check_value);
724 		else if (sock->type == isc_sockettype_tcp)
725 			INSIST((int)sock->dscp == isc_dscp_check_value);
726 	}
727 
728 	if ((sock->type == isc_sockettype_udp) &&
729 	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
730 	{
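		/*
		 * dev->dscp holds the 6-bit DSCP code point; it occupies
		 * the upper six bits of the TOS/TCLASS octet (the low two
		 * bits are the ECN field), hence the shift by 2 below.
		 */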
731 		int dscp = (dev->dscp << 2) & 0xff;
732 
733 		INSIST(dev->dscp < 0x40);
734 
735 		if (sock->pf == AF_INET && sock->pktdscp) {
736 			cmsgp = (struct cmsghdr *)(cmsgbuf +
737 						   msg->msg_controllen);
738 			msg->msg_control = (void *)cmsgbuf;
739 			msg->msg_controllen += cmsg_space(sizeof(dscp));
740 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
741 
742 			cmsgp->cmsg_level = IPPROTO_IP;
743 			cmsgp->cmsg_type = IP_TOS;
744 			cmsgp->cmsg_len = cmsg_len(sizeof(char));
745 			*(unsigned char*)CMSG_DATA(cmsgp) = dscp;
746 		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
747 			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
748 			       (void *)&dscp, sizeof(int)) < 0)
749 			{
750 				char strbuf[ISC_STRERRORSIZE];
751 				isc__strerror(errno, strbuf, sizeof(strbuf));
752 				UNEXPECTED_ERROR(__FILE__, __LINE__,
753 						 "setsockopt(%d, IP_TOS, %.02x)"
754 						 " %s: %s",
755 						 sock->fd, dscp >> 2,
756 						 "failed", strbuf);
757 			} else
758 				sock->dscp = dscp;
759 		}
760 
761 		if (sock->pf == AF_INET6 && sock->pktdscp) {
762 			cmsgp = (struct cmsghdr *)(cmsgbuf +
763 						   msg->msg_controllen);
764 			msg->msg_control = (void *)cmsgbuf;
765 			msg->msg_controllen += cmsg_space(sizeof(dscp));
766 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
767 
768 			cmsgp->cmsg_level = IPPROTO_IPV6;
769 			cmsgp->cmsg_type = IPV6_TCLASS;
770 			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
771 			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
772 		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
773 			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
774 				       (void *)&dscp, sizeof(int)) < 0) {
775 				char strbuf[ISC_STRERRORSIZE];
776 				isc__strerror(errno, strbuf, sizeof(strbuf));
777 				UNEXPECTED_ERROR(__FILE__, __LINE__,
778 						 "setsockopt(%d, IPV6_TCLASS, "
779 						 "%.02x) %s: %s",
780 						 sock->fd, dscp >> 2,
781 						 "failed", strbuf);
782 			} else
783 				sock->dscp = dscp;
784 		}
785 
786 		if (msg->msg_controllen != 0 &&
787 		    msg->msg_controllen < SENDCMSGBUFLEN)
788 		{
789 			memset(cmsgbuf + msg->msg_controllen, 0,
790 			       SENDCMSGBUFLEN - msg->msg_controllen);
791 		}
792 	}
793 
794 	if (write_countp != NULL)
795 		*write_countp = write_count;
796 }
797 
798 /*
799  * Construct an iov array and attach it to the msghdr passed in.  This is
800  * the RECV constructor, which will use the available region of the buffer
801  * (if using a buffer list) or will use the internal region (if a single
802  * buffer I/O is requested).
803  *
804  * Nothing can be NULL, and the done event must list at least one buffer
805  * on the buffer linked list (or describe a region) to be meaningful.
806  *
807  * If read_countp != NULL, *read_countp will hold the number of bytes
808  * this transaction can receive.
809  */
810 static void
811 build_msghdr_recv(isc__socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
812 		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
813 {
814 	unsigned int iovcount;
815 	isc_buffer_t *buffer;
816 	isc_region_t available;
817 	size_t read_count;
818 
819 	memset(msg, 0, sizeof(struct msghdr));
820 
821 	if (sock->type == isc_sockettype_udp) {
822 		memset(&dev->address, 0, sizeof(dev->address));
823 		msg->msg_name = (void *)&dev->address.type.sa;
824 		msg->msg_namelen = sizeof(dev->address.type);
825 	} else { /* TCP */
826 		msg->msg_name = NULL;
827 		msg->msg_namelen = 0;
828 		dev->address = sock->peer_address;
829 	}
830 
831 	buffer = ISC_LIST_HEAD(dev->bufferlist);
832 	read_count = 0;
833 
834 	/*
835 	 * Single buffer I/O?  Skip what we've done so far in this region.
836 	 */
837 	if (buffer == NULL) {
838 		read_count = dev->region.length - dev->n;
839 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
840 		iov[0].iov_len = read_count;
841 		iovcount = 1;
842 
843 		goto config;
844 	}
845 
846 	/*
847 	 * Multibuffer I/O.
848 	 * Skip empty buffers.
849 	 */
850 	while (buffer != NULL) {
851 		REQUIRE(ISC_BUFFER_VALID(buffer));
852 		if (isc_buffer_availablelength(buffer) != 0)
853 			break;
854 		buffer = ISC_LIST_NEXT(buffer, link);
855 	}
856 
857 	iovcount = 0;
858 	while (buffer != NULL) {
859 		INSIST(iovcount < MAXSCATTERGATHER_RECV);
860 
861 		isc_buffer_availableregion(buffer, &available);
862 
863 		if (available.length > 0) {
864 			iov[iovcount].iov_base = (void *)(available.base);
865 			iov[iovcount].iov_len = available.length;
866 			read_count += available.length;
867 			iovcount++;
868 		}
869 		buffer = ISC_LIST_NEXT(buffer, link);
870 	}
871 
872  config:
873 
874 	/*
875 	 * Attach the iovec array and the ancillary-data buffer to the msghdr.
876 	 */
877 	msg->msg_iov = iov;
878 	msg->msg_iovlen = iovcount;
879 
880 	msg->msg_control = cmsgbuf;
881 	msg->msg_controllen = RECVCMSGBUFLEN;
882 	msg->msg_flags = 0;
883 
884 	if (read_countp != NULL)
885 		*read_countp = read_count;
886 }
887 
888 static void
889 set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock,
890 		isc_socketevent_t *dev)
891 {
892 	if (sock->type == isc_sockettype_udp) {
893 		if (address != NULL)
894 			dev->address = *address;
895 		else
896 			dev->address = sock->peer_address;
897 	} else if (sock->type == isc_sockettype_tcp) {
898 		INSIST(address == NULL);
899 		dev->address = sock->peer_address;
900 	}
901 }
902 
903 static void
904 destroy_socketevent(isc_event_t *event) {
905 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
906 
907 	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
908 
909 	(ev->destroy)(event);
910 }
911 
912 static isc_socketevent_t *
913 allocate_socketevent(void *sender,
914 		     isc_eventtype_t eventtype, isc_taskaction_t action,
915 		     void *arg)
916 {
917 	isc_socketevent_t *ev;
918 
919 	ev = (isc_socketevent_t *)isc_event_allocate(sender,
920 						     eventtype, action, arg,
921 						     sizeof(*ev));
922 
923 	if (ev == NULL)
924 		return (NULL);
925 
926 	ev->result = ISC_R_UNSET;
927 	ISC_LINK_INIT(ev, ev_link);
928 	ISC_LIST_INIT(ev->bufferlist);
929 	ev->region.base = NULL;
930 	ev->n = 0;
931 	ev->offset = 0;
932 	ev->attributes = 0;
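	/*
	 * Interpose destroy_socketevent() as the event destructor; it
	 * verifies that the buffer list is empty and then calls the
	 * original destructor saved in ev->destroy.
	 */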
933 	ev->destroy = ev->ev_destroy;
934 	ev->ev_destroy = destroy_socketevent;
935 	ev->dscp = 0;
936 
937 	return (ev);
938 }
939 
940 #define DOIO_SUCCESS		0	/* i/o ok, event sent */
941 #define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
942 #define DOIO_HARD		2	/* i/o error, event sent */
943 #define DOIO_EOF		3	/* EOF, no event sent */
944 
945 static int
946 doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
947 	int cc;
948 	struct iovec iov[MAXSCATTERGATHER_RECV];
949 	size_t read_count;
950 	size_t actual_count;
951 	struct msghdr msghdr;
952 	isc_buffer_t *buffer;
953 	int recv_errno;
954 	char strbuf[ISC_STRERRORSIZE];
955 	char cmsgbuf[RECVCMSGBUFLEN] = {0};
956 
957 	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);
958 
959 	cc = recvmsg(sock->fd, &msghdr, 0);
960 	recv_errno = errno;
961 
962 	if (cc < 0) {
963 		if (SOFT_ERROR(recv_errno))
964 			return (DOIO_SOFT);
965 
966 		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
967 			isc__strerror(recv_errno, strbuf, sizeof(strbuf));
968 			socket_log(sock, NULL, IOEVENT,
969 				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
970 				   sock->fd, cc, recv_errno, strbuf);
971 		}
972 
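/*
 * Classify the error: on a connected socket these errors are definitive
 * and terminate the operation (hard); on an unconnected UDP socket they
 * may be asynchronous ICMP errors relating to an earlier packet, so they
 * are treated as soft and the operation is retried.
 */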
973 #define SOFT_OR_HARD(_system, _isc) \
974 	if (recv_errno == _system) { \
975 		if (sock->connected) { \
976 			dev->result = _isc; \
977 			return (DOIO_HARD); \
978 		} \
979 		return (DOIO_SOFT); \
980 	}
981 #define ALWAYS_HARD(_system, _isc) \
982 	if (recv_errno == _system) { \
983 		dev->result = _isc; \
984 		return (DOIO_HARD); \
985 	}
986 
987 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
988 		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
989 		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
990 		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
991 		/* HPUX 11.11 can return EADDRNOTAVAIL. */
992 		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
993 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
994 		/* Should never get this one but it was seen. */
995 		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
996 		/*
997 		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
998 		 * errors.
999 		 */
1000 		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
1001 		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
1002 
1003 #undef SOFT_OR_HARD
1004 #undef ALWAYS_HARD
1005 
1006 		dev->result = isc__errno2result(recv_errno);
1007 		return (DOIO_HARD);
1008 	}
1009 
1010 	/*
1011 	 * On TCP and UNIX sockets, zero length reads indicate EOF,
1012 	 * while on UDP sockets, zero length reads are perfectly valid,
1013 	 * although strange.
1014 	 */
1015 	switch (sock->type) {
1016 	case isc_sockettype_tcp:
1017 		if (cc == 0)
1018 			return (DOIO_EOF);
1019 		break;
1020 	case isc_sockettype_udp:
1021 		break;
1022 	default:
1023 		INSIST(0);
1024 	}
1025 
1026 	if (sock->type == isc_sockettype_udp) {
1027 		dev->address.length = msghdr.msg_namelen;
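		/*
		 * Drop datagrams with a source port of zero: no legitimate
		 * peer sends from port 0, so the packet is treated as a
		 * soft error rather than delivered to the caller.
		 */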
1028 		if (isc_sockaddr_getport(&dev->address) == 0) {
1029 			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1030 				socket_log(sock, &dev->address, IOEVENT,
1031 					   "dropping source port zero packet");
1032 			}
1033 			return (DOIO_SOFT);
1034 		}
1035 	}
1036 
1037 	socket_log(sock, &dev->address, IOEVENT,
1038 		   "packet received correctly");
1039 
1040 	/*
1041 	 * Overflow detection: if the datagram was truncated, process_cmsg()
1042 	 * below will see MSG_TRUNC in msg_flags and set the
1043 	 * ISC_SOCKEVENTATTR_TRUNC flag on the dev entry.
1044 	 */
1045 	/*
1046 	 * If there are control messages attached, run through them and pull
1047 	 * out the interesting bits.
1048 	 */
1049 	process_cmsg(sock, &msghdr, dev);
1050 
1051 	/*
1052 	 * update the buffers (if any) and the i/o count
1053 	 */
1054 	dev->n += cc;
1055 	actual_count = cc;
1056 	buffer = ISC_LIST_HEAD(dev->bufferlist);
1057 	while (buffer != NULL && actual_count > 0U) {
1058 		REQUIRE(ISC_BUFFER_VALID(buffer));
1059 		if (isc_buffer_availablelength(buffer) <= actual_count) {
1060 			actual_count -= isc_buffer_availablelength(buffer);
1061 			isc_buffer_add(buffer,
1062 				       isc_buffer_availablelength(buffer));
1063 		} else {
1064 			isc_buffer_add(buffer, actual_count);
1065 			actual_count = 0;
1066 			POST(actual_count);
1067 			break;
1068 		}
1069 		buffer = ISC_LIST_NEXT(buffer, link);
1070 		if (buffer == NULL) {
1071 			INSIST(actual_count == 0U);
1072 		}
1073 	}
1074 
1075 	/*
1076 	 * If we read less than we expected, update counters,
1077 	 * and let the upper layer poke the descriptor.
1078 	 */
1079 	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1080 		return (DOIO_SOFT);
1081 
1082 	/*
1083 	 * Full reads are posted, or partials if partials are ok.
1084 	 */
1085 	dev->result = ISC_R_SUCCESS;
1086 	return (DOIO_SUCCESS);
1087 }
1088 
1089 /*
1090  * Returns:
1091  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1092  *			ISC_R_SUCCESS.
1093  *
1094  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1095  *			dev->result contains the appropriate error.
1096  *
1097  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1098  *			event was sent.  The operation should be retried.
1099  *
1100  *	No other return values are possible.
1101  */
1102 static int
1103 doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
1104 	int cc;
1105 	struct iovec iov[MAXSCATTERGATHER_SEND];
1106 	size_t write_count;
1107 	struct msghdr msghdr;
1108 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1109 	int attempts = 0;
1110 	int send_errno;
1111 	char strbuf[ISC_STRERRORSIZE];
1112 	char cmsgbuf[SENDCMSGBUFLEN] = {0};
1113 
1114 	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
1115 
1116  resend:
1117 	cc = sendmsg(sock->fd, &msghdr, 0);
1118 	send_errno = errno;
1119 
1120 	/*
1121 	 * Check for error or block condition.
1122 	 */
1123 	if (cc < 0) {
1124 		if (send_errno == EINTR && ++attempts < NRETRIES)
1125 			goto resend;
1126 
1127 		if (SOFT_ERROR(send_errno)) {
1128 			if (send_errno == EWOULDBLOCK || send_errno == EAGAIN)
1129 				dev->result = ISC_R_WOULDBLOCK;
1130 			return (DOIO_SOFT);
1131 		}
1132 
1133 #define SOFT_OR_HARD(_system, _isc) \
1134 	if (send_errno == _system) { \
1135 		if (sock->connected) { \
1136 			dev->result = _isc; \
1137 			return (DOIO_HARD); \
1138 		} \
1139 		return (DOIO_SOFT); \
1140 	}
1141 #define ALWAYS_HARD(_system, _isc) \
1142 	if (send_errno == _system) { \
1143 		dev->result = _isc; \
1144 		return (DOIO_HARD); \
1145 	}
1146 
1147 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1148 		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1149 		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1150 		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1151 		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1152 		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1153 		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1154 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1155 		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1156 		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1157 		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1158 
1159 #undef SOFT_OR_HARD
1160 #undef ALWAYS_HARD
1161 
1162 		/*
1163 		 * The other error types depend on whether or not the
1164 		 * socket is UDP or TCP.  If it is UDP, some errors
1165 		 * that we expect to be fatal under TCP are merely
1166 		 * annoying, and are really soft errors.
1167 		 *
1168 		 * However, these soft errors are still returned as
1169 		 * a status.
1170 		 */
1171 		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1172 		isc__strerror(send_errno, strbuf, sizeof(strbuf));
1173 		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1174 				 addrbuf, strbuf);
1175 		dev->result = isc__errno2result(send_errno);
1176 		return (DOIO_HARD);
1177 	}
1178 
1179 	if (cc == 0) {
1180 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1181 				 "doio_send: send() %s 0", "returned");
1182 	}
1183 
1184 	/*
1185 	 * If we write less than we expected, update counters, poke.
1186 	 */
1187 	dev->n += cc;
1188 	if ((size_t)cc != write_count)
1189 		return (DOIO_SOFT);
1190 
1191 	/*
1192 	 * Exactly what we wanted to write.  We're done with this
1193 	 * entry.  Post its completion event.
1194 	 */
1195 	dev->result = ISC_R_SUCCESS;
1196 	return (DOIO_SUCCESS);
1197 }
1198 
1199 /*
1200  * Kill.
1201  *
1202  * Caller must ensure that the socket is not locked and no external
1203  * references exist.
1204  */
1205 static void
1206 socketclose(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) {
1207 	/*
1208 	 * No one has this socket open, so the watcher doesn't have to be
1209 	 * poked, and the socket doesn't have to be locked.
1210 	 */
1211 	manager->fds[fd] = NULL;
1212 	manager->fdstate[fd] = CLOSE_PENDING;
1213 	select_poke(manager, fd, SELECT_POKE_CLOSE);
1214 
1215 	if (sock->active == 1) {
1216 		sock->active = 0;
1217 	}
1218 
1219 	/*
1220 	 * update manager->maxfd here (XXX: this should be implemented more
1221 	 * efficiently)
1222 	 */
1223 	if (manager->maxfd == fd) {
1224 		int i;
1225 
1226 		manager->maxfd = 0;
1227 		for (i = fd - 1; i >= 0; i--) {
1228 			if (manager->fdstate[i] == MANAGED) {
1229 				manager->maxfd = i;
1230 				break;
1231 			}
1232 		}
1233 	}
1234 
1235 }
1236 
1237 static void
1238 destroy(isc__socket_t **sockp) {
1239 	int fd;
1240 	isc__socket_t *sock = *sockp;
1241 	isc__socketmgr_t *manager = sock->manager;
1242 
1243 	socket_log(sock, NULL, CREATION, "destroying");
1244 
1245 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1246 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1247 	INSIST(sock->connect_ev == NULL);
1248 	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1249 
1250 	if (sock->fd >= 0) {
1251 		fd = sock->fd;
1252 		sock->fd = -1;
1253 		socketclose(manager, sock, fd);
1254 	}
1255 
1256 	ISC_LIST_UNLINK(manager->socklist, sock, link);
1257 
1258 	/* can't unlock manager as its memory context is still used */
1259 	free_socket(sockp);
1260 }
1261 
1262 static isc_result_t
1263 allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
1264 		isc__socket_t **socketp)
1265 {
1266 	isc__socket_t *sock;
1267 
1268 	sock = malloc(sizeof(*sock));
1269 
1270 	if (sock == NULL)
1271 		return (ISC_R_NOMEMORY);
1272 
1273 	sock->common.magic = 0;
1274 	sock->common.impmagic = 0;
1275 	sock->references = 0;
1276 
1277 	sock->manager = manager;
1278 	sock->type = type;
1279 	sock->fd = -1;
1280 	sock->dscp = 0;		/* TOS/TCLASS is zero until set. */
1281 	sock->active = 0;
1282 
1283 	ISC_LINK_INIT(sock, link);
1284 
1285 	/*
1286 	 * Set up list of readers and writers to be initially empty.
1287 	 */
1288 	ISC_LIST_INIT(sock->recv_list);
1289 	ISC_LIST_INIT(sock->send_list);
1290 	sock->connect_ev = NULL;
1291 	sock->pending_recv = 0;
1292 	sock->pending_send = 0;
1293 	sock->connected = 0;
1294 	sock->connecting = 0;
1295 	sock->bound = 0;
1296 	sock->pktdscp = 0;
1297 
1298 	/*
1299 	 * Initialize readable and writable events.
1300 	 */
1301 	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1302 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1303 		       NULL, sock, sock, NULL);
1304 	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1305 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1306 		       NULL, sock, sock, NULL);
1307 
1308 	sock->common.magic = ISCAPI_SOCKET_MAGIC;
1309 	sock->common.impmagic = SOCKET_MAGIC;
1310 	*socketp = sock;
1311 
1312 	return (ISC_R_SUCCESS);
1313 }
1314 
1315 /*
1316  * This routine requires that the various lists be empty, that the
1317  * reference count be 0, and that the magic number is valid.  The fd
1318  * associated with the socket must already have been closed and set
1319  * to -1; this routine only frees the socket structure, it does not
1320  * close the descriptor.
1321  */
1322 static void
1323 free_socket(isc__socket_t **socketp) {
1324 	isc__socket_t *sock = *socketp;
1325 
1326 	INSIST(VALID_SOCKET(sock));
1327 	INSIST(sock->references == 0);
1328 	INSIST(!sock->connecting);
1329 	INSIST(!sock->pending_recv);
1330 	INSIST(!sock->pending_send);
1331 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1332 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1333 	INSIST(!ISC_LINK_LINKED(sock, link));
1334 
1335 	sock->common.magic = 0;
1336 	sock->common.impmagic = 0;
1337 
1338 	free(sock);
1339 
1340 	*socketp = NULL;
1341 }
1342 
1343 static void
1344 use_min_mtu(isc__socket_t *sock) {
1345 	/* use minimum MTU */
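	/*
	 * IPV6_USE_MIN_MTU makes the kernel send at the IPv6 minimum MTU
	 * (1280 bytes), which avoids path-MTU discovery problems with
	 * large responses.
	 */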
1346 	if (sock->pf == AF_INET6) {
1347 		int on = 1;
1348 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
1349 				(void *)&on, sizeof(on));
1350 	}
1351 }
1352 
1353 static void
1354 set_tcp_maxseg(isc__socket_t *sock, int size) {
1355 	if (sock->type == isc_sockettype_tcp)
1356 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
1357 				(void *)&size, sizeof(size));
1358 }
1359 
1360 static isc_result_t
1361 opensocket(isc__socket_t *sock)
1362 {
1363 	isc_result_t result;
1364 	char strbuf[ISC_STRERRORSIZE];
1365 	const char *err = "socket";
1366 	int on = 1;
1367 
1368 	switch (sock->type) {
1369 	case isc_sockettype_udp:
1370 		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
1371 		break;
1372 	case isc_sockettype_tcp:
1373 		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
1374 		break;
1375 	}
1376 
1377 	if (sock->fd < 0) {
1378 		switch (errno) {
1379 		case EMFILE:
1380 		case ENFILE:
1381 			isc__strerror(errno, strbuf, sizeof(strbuf));
1382 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1383 				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1384 				       "%s: %s", err, strbuf);
1385 			/* fallthrough */
1386 		case ENOBUFS:
1387 			return (ISC_R_NORESOURCES);
1388 
1389 		case EPROTONOSUPPORT:
1390 		case EPFNOSUPPORT:
1391 		case EAFNOSUPPORT:
1392 		/*
1393 		 * Linux 2.2 (and maybe others) return EINVAL instead of
1394 		 * EAFNOSUPPORT.
1395 		 */
1396 		case EINVAL:
1397 			return (ISC_R_FAMILYNOSUPPORT);
1398 
1399 		default:
1400 			isc__strerror(errno, strbuf, sizeof(strbuf));
1401 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1402 					 "%s() %s: %s", err, "failed",
1403 					 strbuf);
1404 			return (ISC_R_UNEXPECTED);
1405 		}
1406 	}
1407 
1408 	result = make_nonblock(sock->fd);
1409 	if (result != ISC_R_SUCCESS) {
1410 		(void)close(sock->fd);
1411 		return (result);
1412 	}
1413 
1414 	/*
1415 	 * Use minimum mtu if possible.
1416 	 */
1417 	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
1418 		use_min_mtu(sock);
1419 		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
1420 	}
1421 
1422 	if (sock->type == isc_sockettype_udp) {
1423 
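		/*
		 * Request SCM_TIMESTAMP ancillary data so that process_cmsg()
		 * can attach a kernel receive timestamp to each datagram.
		 */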
1424 		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1425 			       (void *)&on, sizeof(on)) < 0
1426 		    && errno != ENOPROTOOPT) {
1427 			isc__strerror(errno, strbuf, sizeof(strbuf));
1428 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1429 					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1430 					 sock->fd, "failed", strbuf);
1431 			/* Press on... */
1432 		}
1433 
1434 		/* RFC 3542 */
1435 		if ((sock->pf == AF_INET6)
1436 		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1437 				   (void *)&on, sizeof(on)) < 0)) {
1438 			isc__strerror(errno, strbuf, sizeof(strbuf));
1439 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1440 					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1441 					 "%s: %s", sock->fd, "failed",
1442 					 strbuf);
1443 		}
1444 	}
1445 
1446 	if (sock->active == 0) {
1447 		sock->active = 1;
1448 	}
1449 
1450 	return (ISC_R_SUCCESS);
1451 }
1452 
1453 /*
1454  * Create a 'type' socket in protocol family 'pf', managed
1455  * by 'manager'.  No events are posted at creation time; I/O is
1456  * started later with the send/recv routines.  The new
1457  * socket is returned in 'socketp'.
1458  */
1459 static isc_result_t
1460 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1461 	      isc_socket_t **socketp)
1462 {
1463 	isc__socket_t *sock = NULL;
1464 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
1465 	isc_result_t result;
1466 	int lockid;
1467 
1468 	REQUIRE(VALID_MANAGER(manager));
1469 	REQUIRE(socketp != NULL && *socketp == NULL);
1470 
1471 	result = allocate_socket(manager, type, &sock);
1472 	if (result != ISC_R_SUCCESS)
1473 		return (result);
1474 
1475 	switch (sock->type) {
1476 	case isc_sockettype_udp:
1477 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
1478 		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
1479 		break;
1480 	case isc_sockettype_tcp:
1481 		break;
1482 	default:
1483 		INSIST(0);
1484 	}
1485 
1486 	sock->pf = pf;
1487 
1488 	result = opensocket(sock);
1489 	if (result != ISC_R_SUCCESS) {
1490 		free_socket(&sock);
1491 		return (result);
1492 	}
1493 
1494 	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
1495 	sock->references = 1;
1496 	*socketp = (isc_socket_t *)sock;
1497 
1498 	/*
1499 	 * Note we don't have to lock the socket like we normally would because
1500 	 * there are no external references to it yet.
1501 	 */
1502 
1503 	lockid = FDLOCK_ID(sock->fd);
1504 	manager->fds[sock->fd] = sock;
1505 	manager->fdstate[sock->fd] = MANAGED;
1506 
1507 	ISC_LIST_APPEND(manager->socklist, sock, link);
1508 	if (manager->maxfd < sock->fd)
1509 		manager->maxfd = sock->fd;
1510 
1511 	socket_log(sock, NULL, CREATION, "created");
1512 
1513 	return (ISC_R_SUCCESS);
1514 }
1515 
1516 /*%
1517  * Create a new 'type' socket in protocol family 'pf', managed by
1518  * 'manager'; this is the public wrapper around the internal
1519  * socket_create() above.  The new socket is returned
1520  * in 'socketp'.
1521  */
1522 isc_result_t
1523 isc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1524 		   isc_socket_t **socketp)
1525 {
1526 	return (socket_create(manager0, pf, type, socketp));
1527 }
1528 
1529 /*
1530  * Attach to a socket.  Caller must explicitly detach when it is done.
1531  */
1532 void
1533 isc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
1534 	isc__socket_t *sock = (isc__socket_t *)sock0;
1535 
1536 	REQUIRE(VALID_SOCKET(sock));
1537 	REQUIRE(socketp != NULL && *socketp == NULL);
1538 
1539 	sock->references++;
1540 
1541 	*socketp = (isc_socket_t *)sock;
1542 }
1543 
1544 /*
1545  * Dereference a socket.  If this is the last reference to it, clean things
1546  * up by destroying the socket.
1547  */
1548 void
1549 isc__socket_detach(isc_socket_t **socketp) {
1550 	isc__socket_t *sock;
1551 	isc_boolean_t kill_socket = ISC_FALSE;
1552 
1553 	REQUIRE(socketp != NULL);
1554 	sock = (isc__socket_t *)*socketp;
1555 	REQUIRE(VALID_SOCKET(sock));
1556 
1557 	REQUIRE(sock->references > 0);
1558 	sock->references--;
1559 	if (sock->references == 0)
1560 		kill_socket = ISC_TRUE;
1561 
1562 	if (kill_socket)
1563 		destroy(&sock);
1564 
1565 	*socketp = NULL;
1566 }
1567 
1568 /*
1569  * I/O is possible on a given socket.  Schedule an event to this task that
1570  * will call an internal function to do the I/O.  This will charge the
1571  * task with the I/O operation and let our select loop handler get back
1572  * to doing something real as fast as possible.
1573  *
1574  * The socket and manager must be locked before calling this function.
1575  */
1576 static void
1577 dispatch_recv(isc__socket_t *sock) {
1578 	intev_t *iev;
1579 	isc_socketevent_t *ev;
1580 	isc_task_t *sender;
1581 
1582 	INSIST(!sock->pending_recv);
1583 
1584 	ev = ISC_LIST_HEAD(sock->recv_list);
1585 	if (ev == NULL)
1586 		return;
1587 	socket_log(sock, NULL, EVENT,
1588 		   "dispatch_recv:  event %p -> task %p",
1589 		   ev, ev->ev_sender);
1590 	sender = ev->ev_sender;
1591 
1592 	sock->pending_recv = 1;
1593 	iev = &sock->readable_ev;
1594 
1595 	sock->references++;
1596 	iev->ev_sender = sock;
1597 	iev->ev_action = internal_recv;
1598 	iev->ev_arg = sock;
1599 
1600 	isc_task_send(sender, (isc_event_t **)&iev);
1601 }
1602 
1603 static void
1604 dispatch_send(isc__socket_t *sock) {
1605 	intev_t *iev;
1606 	isc_socketevent_t *ev;
1607 	isc_task_t *sender;
1608 
1609 	INSIST(!sock->pending_send);
1610 
1611 	ev = ISC_LIST_HEAD(sock->send_list);
1612 	if (ev == NULL)
1613 		return;
1614 	socket_log(sock, NULL, EVENT,
1615 		   "dispatch_send:  event %p -> task %p",
1616 		   ev, ev->ev_sender);
1617 	sender = ev->ev_sender;
1618 
1619 	sock->pending_send = 1;
1620 	iev = &sock->writable_ev;
1621 
1622 	sock->references++;
1623 	iev->ev_sender = sock;
1624 	iev->ev_action = internal_send;
1625 	iev->ev_arg = sock;
1626 
1627 	isc_task_send(sender, (isc_event_t **)&iev);
1628 }
1629 
1630 static void
1631 dispatch_connect(isc__socket_t *sock) {
1632 	intev_t *iev;
1633 	isc_socket_connev_t *ev;
1634 
1635 	iev = &sock->writable_ev;
1636 
1637 	ev = sock->connect_ev;
1638 	INSIST(ev != NULL); /* XXX */
1639 
1640 	INSIST(sock->connecting);
1641 
1642 	sock->references++;  /* keep socket around for this internal event */
1643 	iev->ev_sender = sock;
1644 	iev->ev_action = internal_connect;
1645 	iev->ev_arg = sock;
1646 
1647 	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1648 }
1649 
1650 /*
1651  * Dequeue an item off the given socket's read queue, set the result code
1652  * in the done event to the one provided, and send it to the task it was
1653  * destined for.
1654  *
1655  * If the event to be sent is on a list, remove it before sending.  If
1656  * asked to, send and detach from the socket as well.
1657  *
1658  * Caller must have the socket locked if the event is attached to the socket.
1659  */
1660 static void
1661 send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
1662 	isc_task_t *task;
1663 
1664 	task = (*dev)->ev_sender;
1665 
1666 	(*dev)->ev_sender = sock;
1667 
1668 	if (ISC_LINK_LINKED(*dev, ev_link))
1669 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1670 
1671 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1672 	    == ISC_SOCKEVENTATTR_ATTACHED)
1673 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1674 	else
1675 		isc_task_send(task, (isc_event_t **)dev);
1676 }
1677 
1678 /*
1679  * See comments for send_recvdone_event() above.
1680  *
1681  * Caller must have the socket locked if the event is attached to the socket.
1682  */
1683 static void
1684 send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
1685 	isc_task_t *task;
1686 
1687 	INSIST(dev != NULL && *dev != NULL);
1688 
1689 	task = (*dev)->ev_sender;
1690 	(*dev)->ev_sender = sock;
1691 
1692 	if (ISC_LINK_LINKED(*dev, ev_link))
1693 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1694 
1695 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1696 	    == ISC_SOCKEVENTATTR_ATTACHED)
1697 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1698 	else
1699 		isc_task_send(task, (isc_event_t **)dev);
1700 }
1701 
1702 static void
1703 internal_recv(isc_task_t *me, isc_event_t *ev) {
1704 	isc_socketevent_t *dev;
1705 	isc__socket_t *sock;
1706 
1707 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
1708 
1709 	sock = ev->ev_sender;
1710 	INSIST(VALID_SOCKET(sock));
1711 
1712 	socket_log(sock, NULL, IOEVENT,
1713 		   "internal_recv: task %p got event %p", me, ev);
1714 
1715 	INSIST(sock->pending_recv == 1);
1716 	sock->pending_recv = 0;
1717 
1718 	INSIST(sock->references > 0);
1719 	sock->references--;  /* the internal event is done with this socket */
1720 	if (sock->references == 0) {
1721 		destroy(&sock);
1722 		return;
1723 	}
1724 
1725 	/*
1726 	 * Try to do as much I/O as possible on this socket.  There are no
1727 	 * limits here, currently.
1728 	 */
1729 	dev = ISC_LIST_HEAD(sock->recv_list);
1730 	while (dev != NULL) {
1731 		switch (doio_recv(sock, dev)) {
1732 		case DOIO_SOFT:
1733 			goto poke;
1734 
1735 		case DOIO_EOF:
1736 			/*
1737 			 * read of 0 means the remote end was closed.
1738 			 * Run through the event queue and dispatch all
1739 			 * the events with an EOF result code.
1740 			 */
1741 			do {
1742 				dev->result = ISC_R_EOF;
1743 				send_recvdone_event(sock, &dev);
1744 				dev = ISC_LIST_HEAD(sock->recv_list);
1745 			} while (dev != NULL);
1746 			goto poke;
1747 
1748 		case DOIO_SUCCESS:
1749 		case DOIO_HARD:
1750 			send_recvdone_event(sock, &dev);
1751 			break;
1752 		}
1753 
1754 		dev = ISC_LIST_HEAD(sock->recv_list);
1755 	}
1756 
1757  poke:
1758 	if (!ISC_LIST_EMPTY(sock->recv_list))
1759 		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1760 }
1761 
1762 static void
1763 internal_send(isc_task_t *me, isc_event_t *ev) {
1764 	isc_socketevent_t *dev;
1765 	isc__socket_t *sock;
1766 
1767 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
1768 
1769 	/*
1770 	 * Find out what socket this is and lock it.
1771 	 */
1772 	sock = (isc__socket_t *)ev->ev_sender;
1773 	INSIST(VALID_SOCKET(sock));
1774 	socket_log(sock, NULL, IOEVENT,
1775 		   "internal_send: task %p got event %p", me, ev);
1776 
1777 	INSIST(sock->pending_send == 1);
1778 	sock->pending_send = 0;
1779 
1780 	INSIST(sock->references > 0);
1781 	sock->references--;  /* the internal event is done with this socket */
1782 	if (sock->references == 0) {
1783 		destroy(&sock);
1784 		return;
1785 	}
1786 
1787 	/*
1788 	 * Try to do as much I/O as possible on this socket.  There are no
1789 	 * limits here, currently.
1790 	 */
1791 	dev = ISC_LIST_HEAD(sock->send_list);
1792 	while (dev != NULL) {
1793 		switch (doio_send(sock, dev)) {
1794 		case DOIO_SOFT:
1795 			goto poke;
1796 
1797 		case DOIO_HARD:
1798 		case DOIO_SUCCESS:
1799 			send_senddone_event(sock, &dev);
1800 			break;
1801 		}
1802 
1803 		dev = ISC_LIST_HEAD(sock->send_list);
1804 	}
1805 
1806  poke:
1807 	if (!ISC_LIST_EMPTY(sock->send_list))
1808 		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
1809 }
1810 
1811 /*
1812  * Process read/writes on each fd here.  Avoid locking
1813  * and unlocking twice if both reads and writes are possible.
1814  */
1815 static void
1816 process_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable,
1817 	   isc_boolean_t writeable)
1818 {
1819 	isc__socket_t *sock;
1820 	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
1821 
1822 	/*
1823 	 * If the socket is going to be closed, don't do more I/O.
1824 	 */
1825 	if (manager->fdstate[fd] == CLOSE_PENDING) {
1826 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1827 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1828 		return;
1829 	}
1830 
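	/*
	 * Dispatch any ready work and then unwatch the descriptor.  The
	 * internal handlers re-poke the fd via select_poke() when queued
	 * operations remain, which keeps the level-triggered select()
	 * from spinning on a descriptor that already has work in flight.
	 */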
1831 	sock = manager->fds[fd];
1832 	if (readable) {
1833 		if (sock == NULL) {
1834 			unwatch_read = ISC_TRUE;
1835 			goto check_write;
1836 		}
1837 		if (!SOCK_DEAD(sock)) {
1838 			dispatch_recv(sock);
1839 		}
1840 		unwatch_read = ISC_TRUE;
1841 	}
1842 check_write:
1843 	if (writeable) {
1844 		if (sock == NULL) {
1845 			unwatch_write = ISC_TRUE;
1846 			goto unlock_fd;
1847 		}
1848 		if (!SOCK_DEAD(sock)) {
1849 			if (sock->connecting)
1850 				dispatch_connect(sock);
1851 			else
1852 				dispatch_send(sock);
1853 		}
1854 		unwatch_write = ISC_TRUE;
1855 	}
1856 
1857  unlock_fd:
1858 	if (unwatch_read)
1859 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1860 	if (unwatch_write)
1861 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1862 
1863 }
1864 
1865 static void
1866 process_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds,
1867 	    fd_set *writefds)
1868 {
1869 	int i;
1870 
1871 	REQUIRE(maxfd <= (int)manager->maxsocks);
1872 
1873 	for (i = 0; i < maxfd; i++) {
1874 		process_fd(manager, i, FD_ISSET(i, readfds),
1875 			   FD_ISSET(i, writefds));
1876 	}
1877 }
1878 
1879 /*
1880  * Create a new socket manager.
1881  */
1882 
1883 static isc_result_t
1884 setup_watcher(isc__socketmgr_t *manager) {
1885 	isc_result_t result;
1886 
1887 	UNUSED(result);
1888 
1889 	manager->fd_bufsize = sizeof(fd_set);
1890 
1891 	manager->read_fds = NULL;
1892 	manager->read_fds_copy = NULL;
1893 	manager->write_fds = NULL;
1894 	manager->write_fds_copy = NULL;
1895 
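	/*
	 * Allocate the four fd_set buffers.  Each malloc() is attempted
	 * only if the previous one succeeded, so checking write_fds_copy
	 * below is enough to detect any failure, and the error path frees
	 * whatever was actually allocated.
	 */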
1896 	manager->read_fds = malloc(manager->fd_bufsize);
1897 	if (manager->read_fds != NULL)
1898 		manager->read_fds_copy = malloc(manager->fd_bufsize);
1899 	if (manager->read_fds_copy != NULL)
1900 		manager->write_fds = malloc(manager->fd_bufsize);
1901 	if (manager->write_fds != NULL) {
1902 		manager->write_fds_copy = malloc(manager->fd_bufsize);
1903 	}
1904 	if (manager->write_fds_copy == NULL) {
1905 		if (manager->write_fds != NULL) {
1906 			free(manager->write_fds);
1907 		}
1908 		if (manager->read_fds_copy != NULL) {
1909 			free(manager->read_fds_copy);
1910 		}
1911 		if (manager->read_fds != NULL) {
1912 			free(manager->read_fds);
1913 		}
1914 		return (ISC_R_NOMEMORY);
1915 	}
1916 	memset(manager->read_fds, 0, manager->fd_bufsize);
1917 	memset(manager->write_fds, 0, manager->fd_bufsize);
1918 
1919 	manager->maxfd = 0;
1920 
1921 	return (ISC_R_SUCCESS);
1922 }
1923 
1924 static void
1925 cleanup_watcher(isc__socketmgr_t *manager) {
1926 
1927 	if (manager->read_fds != NULL)
1928 		free(manager->read_fds);
1929 	if (manager->read_fds_copy != NULL)
1930 		free(manager->read_fds_copy);
1931 	if (manager->write_fds != NULL)
1932 		free(manager->write_fds);
1933 	if (manager->write_fds_copy != NULL)
1934 		free(manager->write_fds_copy);
1935 }
1936 
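/*
 * The managers below are reference counted and backed by the single static
 * 'socketmgr'; a second create call simply bumps the count.  A minimal,
 * purely illustrative caller sketch (the real callers live outside this
 * file):
 *
 *	isc_socketmgr_t *mgr = NULL;
 *
 *	if (isc__socketmgr_create(&mgr) != ISC_R_SUCCESS)
 *		return;
 *	...
 *	isc__socketmgr_destroy(&mgr);
 */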
1937 isc_result_t
1938 isc__socketmgr_create(isc_socketmgr_t **managerp) {
1939 	return (isc__socketmgr_create2(managerp, 0));
1940 }
1941 
1942 isc_result_t
1943 isc__socketmgr_create2(isc_socketmgr_t **managerp,
1944 		       unsigned int maxsocks)
1945 {
1946 	isc__socketmgr_t *manager;
1947 	isc_result_t result;
1948 
1949 	REQUIRE(managerp != NULL && *managerp == NULL);
1950 
1951 	if (socketmgr != NULL) {
1952 		/* Don't allow maxsocks to be updated */
1953 		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
1954 			return (ISC_R_EXISTS);
1955 
1956 		socketmgr->refs++;
1957 		*managerp = (isc_socketmgr_t *)socketmgr;
1958 		return (ISC_R_SUCCESS);
1959 	}
1960 
1961 	if (maxsocks == 0)
1962 		maxsocks = FD_SETSIZE;
1963 
1964 	manager = malloc(sizeof(*manager));
1965 	if (manager == NULL)
1966 		return (ISC_R_NOMEMORY);
1967 
1968 	/* Zero the structure so that cleanup on any failure below is easy. */
1969 	memset(manager, 0, sizeof(*manager));
1970 	manager->maxsocks = maxsocks;
1971 	manager->fds = malloc(manager->maxsocks * sizeof(isc__socket_t *));
1972 	if (manager->fds == NULL) {
1973 		result = ISC_R_NOMEMORY;
1974 		goto free_manager;
1975 	}
1976 	manager->fdstate = malloc(manager->maxsocks * sizeof(int));
1977 	if (manager->fdstate == NULL) {
1978 		result = ISC_R_NOMEMORY;
1979 		goto free_manager;
1980 	}
1981 
1982 	manager->common.methods = &socketmgrmethods;
1983 	manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
1984 	manager->common.impmagic = SOCKET_MANAGER_MAGIC;
1985 	memset(manager->fds, 0, manager->maxsocks * sizeof(isc__socket_t *));
1986 	ISC_LIST_INIT(manager->socklist);
1987 
1988 	manager->refs = 1;
1989 
1990 	/*
1991 	 * Set up initial state for the select loop
1992 	 */
1993 	result = setup_watcher(manager);
1994 	if (result != ISC_R_SUCCESS)
1995 		goto cleanup;
1996 
1997 	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
1998 
1999 	socketmgr = manager;
2000 	*managerp = (isc_socketmgr_t *)manager;
2001 
2002 	return (ISC_R_SUCCESS);
2003 
2004 cleanup:
2005 
2006 free_manager:
2007 	if (manager->fdstate != NULL) {
2008 		free(manager->fdstate);
2009 	}
2010 	if (manager->fds != NULL) {
2011 		free(manager->fds);
2012 	}
2013 	free(manager);
2014 
2015 	return (result);
2016 }
2017 
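/*
 * Tear-down order matters here: the last reference first waits for every
 * socket on the manager's list to go away (driven by repeated task
 * dispatch), then tells the watcher to shut down, frees the fd_set
 * buffers, closes any descriptors still marked CLOSE_PENDING, and finally
 * frees the manager itself.
 */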
2018 void
2019 isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
2020 	isc__socketmgr_t *manager;
2021 	int i;
2022 
2023 	/*
2024 	 * Destroy a socket manager.
2025 	 */
2026 
2027 	REQUIRE(managerp != NULL);
2028 	manager = (isc__socketmgr_t *)*managerp;
2029 	REQUIRE(VALID_MANAGER(manager));
2030 
2031 	manager->refs--;
2032 	if (manager->refs > 0) {
2033 		*managerp = NULL;
2034 		return;
2035 	}
2036 	socketmgr = NULL;
2037 
2038 	/*
2039 	 * Wait for all sockets to be destroyed.
2040 	 */
2041 	while (!ISC_LIST_EMPTY(manager->socklist)) {
2042 		isc__taskmgr_dispatch(NULL);
2043 	}
2044 
2045 	/*
2046 	 * Here, poke our select/poll thread.  Do this by closing the write
2047 	 * half of the pipe, which will send EOF to the read half.
2048 	 * This is currently a no-op in the non-threaded case.
2049 	 */
2050 	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
2051 
2052 	/*
2053 	 * Clean up.
2054 	 */
2055 	cleanup_watcher(manager);
2056 
2057 	for (i = 0; i < (int)manager->maxsocks; i++)
2058 		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
2059 			(void)close(i);
2060 
2061 	free(manager->fds);
2062 	free(manager->fdstate);
2063 
2064 	manager->common.magic = 0;
2065 	manager->common.impmagic = 0;
2066 	free(manager);
2067 
2068 	*managerp = NULL;
2069 
2070 	socketmgr = NULL;
2071 }
2072 
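/*
 * Common receive path.  UDP requests are attempted immediately; for other
 * socket types a request is attempted only when nothing is already
 * queued, so completions stay in order.  A DOIO_SOFT result attaches the
 * task, queues the request and (if needed) pokes the watcher; any other
 * result posts the RECVDONE event at once unless ISC_SOCKFLAG_IMMEDIATE
 * was given.
 */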
2073 static isc_result_t
2074 socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2075 	    unsigned int flags)
2076 {
2077 	int io_state;
2078 	isc_task_t *ntask = NULL;
2079 	isc_result_t result = ISC_R_SUCCESS;
2080 
2081 	dev->ev_sender = task;
2082 
2083 	if (sock->type == isc_sockettype_udp) {
2084 		io_state = doio_recv(sock, dev);
2085 	} else {
2086 		if (ISC_LIST_EMPTY(sock->recv_list))
2087 			io_state = doio_recv(sock, dev);
2088 		else
2089 			io_state = DOIO_SOFT;
2090 	}
2091 
2092 	switch (io_state) {
2093 	case DOIO_SOFT:
2094 		/*
2095 		 * We couldn't read the whole request right now (possibly none
2096 		 * of it), so queue it.
2097 		 *
2098 		 * Attach to socket and to task
2099 		 */
2100 		isc_task_attach(task, &ntask);
2101 		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2102 
2103 		/*
2104 		 * Enqueue the request.  If the socket was previously not being
2105 		 * watched, poke the watcher to start paying attention to it.
2106 		 */
2107 		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
2108 			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2109 		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2110 
2111 		socket_log(sock, NULL, EVENT,
2112 			   "socket_recv: event %p -> task %p",
2113 			   dev, ntask);
2114 
2115 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2116 			result = ISC_R_INPROGRESS;
2117 		break;
2118 
2119 	case DOIO_EOF:
2120 		dev->result = ISC_R_EOF;
2121 		/* fallthrough */
2122 
2123 	case DOIO_HARD:
2124 	case DOIO_SUCCESS:
2125 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2126 			send_recvdone_event(sock, &dev);
2127 		break;
2128 	}
2129 
2130 	return (result);
2131 }
2132 
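/*
 * Scatter receive into a list of buffers.  Every buffer is moved off the
 * caller's list and onto the event, where it stays until the RECVDONE
 * action runs; for UDP the minimum is forced to 1 so that any datagram
 * completes the read.
 */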
2133 isc_result_t
2134 isc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2135 		  unsigned int minimum, isc_task_t *task,
2136 		  isc_taskaction_t action, void *arg)
2137 {
2138 	isc__socket_t *sock = (isc__socket_t *)sock0;
2139 	isc_socketevent_t *dev;
2140 	isc__socketmgr_t *manager;
2141 	unsigned int iocount;
2142 	isc_buffer_t *buffer;
2143 
2144 	REQUIRE(VALID_SOCKET(sock));
2145 	REQUIRE(buflist != NULL);
2146 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2147 	REQUIRE(task != NULL);
2148 	REQUIRE(action != NULL);
2149 
2150 	manager = sock->manager;
2151 	REQUIRE(VALID_MANAGER(manager));
2152 
2153 	iocount = isc_bufferlist_availablecount(buflist);
2154 	REQUIRE(iocount > 0);
2155 
2156 	INSIST(sock->bound);
2157 
2158 	dev = allocate_socketevent(sock,
2159 				   ISC_SOCKEVENT_RECVDONE, action, arg);
2160 	if (dev == NULL)
2161 		return (ISC_R_NOMEMORY);
2162 
2163 	/*
2164 	 * UDP sockets are always partial read
2165 	 */
2166 	if (sock->type == isc_sockettype_udp)
2167 		dev->minimum = 1;
2168 	else {
2169 		if (minimum == 0)
2170 			dev->minimum = iocount;
2171 		else
2172 			dev->minimum = minimum;
2173 	}
2174 
2175 	/*
2176 	 * Move each buffer from the passed-in list to our internal one.
2177 	 */
2178 	buffer = ISC_LIST_HEAD(*buflist);
2179 	while (buffer != NULL) {
2180 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2181 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2182 		buffer = ISC_LIST_HEAD(*buflist);
2183 	}
2184 
2185 	return (socket_recv(sock, dev, task, 0));
2186 }
2187 
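/*
 * Common send path, the mirror of socket_recv() above.  A caller-supplied
 * pktinfo is copied into the event, and for destinations that are neither
 * site-local nor link-local the interface index is cleared so the kernel
 * chooses the outgoing interface.  DOIO_SOFT results are queued unless
 * ISC_SOCKFLAG_NORETRY was given, in which case they fall through to the
 * completion path.
 */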
2188 static isc_result_t
2189 socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2190 	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2191 	    unsigned int flags)
2192 {
2193 	int io_state;
2194 	isc_task_t *ntask = NULL;
2195 	isc_result_t result = ISC_R_SUCCESS;
2196 
2197 	dev->ev_sender = task;
2198 
2199 	set_dev_address(address, sock, dev);
2200 	if (pktinfo != NULL) {
2201 		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2202 		dev->pktinfo = *pktinfo;
2203 
2204 		if (!isc_sockaddr_issitelocal(&dev->address) &&
2205 		    !isc_sockaddr_islinklocal(&dev->address)) {
2206 			socket_log(sock, NULL, TRACE,
2207 				   "pktinfo structure provided, ifindex %u "
2208 				   "(set to 0)", pktinfo->ipi6_ifindex);
2209 
2210 			/*
2211 			 * Set the pktinfo index to 0 here, to let the
2212 			 * kernel decide what interface it should send on.
2213 			 */
2214 			dev->pktinfo.ipi6_ifindex = 0;
2215 		}
2216 	}
2217 
2218 	if (sock->type == isc_sockettype_udp)
2219 		io_state = doio_send(sock, dev);
2220 	else {
2221 		if (ISC_LIST_EMPTY(sock->send_list))
2222 			io_state = doio_send(sock, dev);
2223 		else
2224 			io_state = DOIO_SOFT;
2225 	}
2226 
2227 	switch (io_state) {
2228 	case DOIO_SOFT:
2229 		/*
2230 		 * We couldn't send the whole request right now (possibly none
2231 		 * of it), so queue it unless ISC_SOCKFLAG_NORETRY is set.
2232 		 */
2233 		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2234 			isc_task_attach(task, &ntask);
2235 			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2236 
2237 			/*
2238 			 * Enqueue the request.  If the socket was previously
2239 			 * not being watched, poke the watcher to start
2240 			 * paying attention to it.
2241 			 */
2242 			if (ISC_LIST_EMPTY(sock->send_list) &&
2243 			    !sock->pending_send)
2244 				select_poke(sock->manager, sock->fd,
2245 					    SELECT_POKE_WRITE);
2246 			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2247 
2248 			socket_log(sock, NULL, EVENT,
2249 				   "socket_send: event %p -> task %p",
2250 				   dev, ntask);
2251 
2252 			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2253 				result = ISC_R_INPROGRESS;
2254 			break;
2255 		}
2256 
2257 		/* FALLTHROUGH */
2258 
2259 	case DOIO_HARD:
2260 	case DOIO_SUCCESS:
2261 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2262 			send_senddone_event(sock, &dev);
2263 		break;
2264 	}
2265 
2266 	return (result);
2267 }
2268 
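/*
 * isc__socket_sendv() is a thin wrapper around isc__socket_sendtov2(): it
 * forwards the buffer list with no destination address, no pktinfo and no
 * flags.
 */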
2269 isc_result_t
2270 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2271 		  isc_task_t *task, isc_taskaction_t action, void *arg)
2272 {
2273 	return (isc__socket_sendtov2(sock, buflist, task, action, arg, NULL,
2274 				     NULL, 0));
2275 }
2276 
2277 isc_result_t
2278 isc__socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2279 		     isc_task_t *task, isc_taskaction_t action, void *arg,
2280 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2281 		     unsigned int flags)
2282 {
2283 	isc__socket_t *sock = (isc__socket_t *)sock0;
2284 	isc_socketevent_t *dev;
2285 	isc__socketmgr_t *manager;
2286 	unsigned int iocount;
2287 	isc_buffer_t *buffer;
2288 
2289 	REQUIRE(VALID_SOCKET(sock));
2290 	REQUIRE(buflist != NULL);
2291 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2292 	REQUIRE(task != NULL);
2293 	REQUIRE(action != NULL);
2294 
2295 	manager = sock->manager;
2296 	REQUIRE(VALID_MANAGER(manager));
2297 
2298 	iocount = isc_bufferlist_usedcount(buflist);
2299 	REQUIRE(iocount > 0);
2300 
2301 	dev = allocate_socketevent(sock,
2302 				   ISC_SOCKEVENT_SENDDONE, action, arg);
2303 	if (dev == NULL)
2304 		return (ISC_R_NOMEMORY);
2305 
2306 	/*
2307 	 * Move each buffer from the passed-in list to our internal one.
2308 	 */
2309 	buffer = ISC_LIST_HEAD(*buflist);
2310 	while (buffer != NULL) {
2311 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2312 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2313 		buffer = ISC_LIST_HEAD(*buflist);
2314 	}
2315 
2316 	return (socket_send(sock, dev, task, address, pktinfo, flags));
2317 }
2318 
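/*
 * Bind the socket to 'sockaddr'.  SO_REUSEADDR is requested only when the
 * caller passes ISC_SOCKET_REUSEADDRESS and asks for a specific port; a
 * failure to set the option is logged and otherwise ignored, while bind()
 * errors are mapped onto the closest ISC_R_* result.
 *
 * A minimal, purely illustrative caller sketch ('sock' and 'localaddr'
 * stand for objects set up elsewhere):
 *
 *	isc_result_t result;
 *
 *	result = isc__socket_bind(sock, &localaddr, ISC_SOCKET_REUSEADDRESS);
 *	if (result != ISC_R_SUCCESS)
 *		goto cleanup;
 */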
2319 isc_result_t
2320 isc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
2321 		 unsigned int options) {
2322 	isc__socket_t *sock = (isc__socket_t *)sock0;
2323 	char strbuf[ISC_STRERRORSIZE];
2324 	int on = 1;
2325 
2326 	REQUIRE(VALID_SOCKET(sock));
2327 
2328 	INSIST(!sock->bound);
2329 
2330 	if (sock->pf != sockaddr->type.sa.sa_family) {
2331 		return (ISC_R_FAMILYMISMATCH);
2332 	}
2333 
2334 	/*
2335 	 * Only set SO_REUSEADDR when we want a specific port.
2336 	 */
2337 	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
2338 	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
2339 	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
2340 		       sizeof(on)) < 0) {
2341 		UNEXPECTED_ERROR(__FILE__, __LINE__,
2342 				 "setsockopt(%d) failed", sock->fd);
2343 		/* Press on... */
2344 	}
2345 	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
2346 		switch (errno) {
2347 		case EACCES:
2348 			return (ISC_R_NOPERM);
2349 		case EADDRNOTAVAIL:
2350 			return (ISC_R_ADDRNOTAVAIL);
2351 		case EADDRINUSE:
2352 			return (ISC_R_ADDRINUSE);
2353 		case EINVAL:
2354 			return (ISC_R_BOUND);
2355 		default:
2356 			isc__strerror(errno, strbuf, sizeof(strbuf));
2357 			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
2358 					 strbuf);
2359 			return (ISC_R_UNEXPECTED);
2360 		}
2361 	}
2362 
2363 	socket_log(sock, sockaddr, TRACE, "bound");
2364 	sock->bound = 1;
2365 
2366 	return (ISC_R_SUCCESS);
2367 }
2368 
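/*
 * Start a connect.  The connect() is attempted right away: if it
 * completes, the CONNECT event is posted with ISC_R_SUCCESS; if the kernel
 * reports EINPROGRESS or a soft error, the event is parked in
 * sock->connect_ev and the watcher is poked so internal_connect() can
 * finish the job later; recognizable hard errors are mapped to an ISC_R_*
 * result and posted, anything else becomes ISC_R_UNEXPECTED.  A UDP
 * "connect" returning EINPROGRESS is treated as success.
 */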
2369 isc_result_t
2370 isc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
2371 		   isc_task_t *task, isc_taskaction_t action, void *arg)
2372 {
2373 	isc__socket_t *sock = (isc__socket_t *)sock0;
2374 	isc_socket_connev_t *dev;
2375 	isc_task_t *ntask = NULL;
2376 	isc__socketmgr_t *manager;
2377 	int cc;
2378 	char strbuf[ISC_STRERRORSIZE];
2379 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
2380 
2381 	REQUIRE(VALID_SOCKET(sock));
2382 	REQUIRE(addr != NULL);
2383 	REQUIRE(task != NULL);
2384 	REQUIRE(action != NULL);
2385 
2386 	manager = sock->manager;
2387 	REQUIRE(VALID_MANAGER(manager));
2388 	REQUIRE(addr != NULL);
2389 
2390 	if (isc_sockaddr_ismulticast(addr))
2391 		return (ISC_R_MULTICAST);
2392 
2393 	REQUIRE(!sock->connecting);
2394 
2395 	dev = (isc_socket_connev_t *)isc_event_allocate(sock,
2396 							ISC_SOCKEVENT_CONNECT,
2397 							action,	arg,
2398 							sizeof(*dev));
2399 	if (dev == NULL) {
2400 		return (ISC_R_NOMEMORY);
2401 	}
2402 	ISC_LINK_INIT(dev, ev_link);
2403 
2404 	/*
2405 	 * Try to do the connect right away, as there can be only one
2406 	 * outstanding, and it might happen to complete.
2407 	 */
2408 	sock->peer_address = *addr;
2409 	cc = connect(sock->fd, &addr->type.sa, addr->length);
2410 	if (cc < 0) {
2411 		/*
2412 		 * HP-UX "fails" to connect a UDP socket and sets errno to
2413 		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
2414 		 * a success and let the user detect it if it's really an error
2415 		 * at the time of sending a packet on the socket.
2416 		 */
2417 		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
2418 			cc = 0;
2419 			goto success;
2420 		}
2421 		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
2422 			goto queue;
2423 
2424 		switch (errno) {
2425 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
2426 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2427 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2428 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2429 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2430 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2431 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2432 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2433 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2434 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2435 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2436 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2437 #undef ERROR_MATCH
2438 		}
2439 
2440 		sock->connected = 0;
2441 
2442 		isc__strerror(errno, strbuf, sizeof(strbuf));
2443 		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
2444 		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
2445 				 addrbuf, errno, strbuf);
2446 
2447 		isc_event_free(ISC_EVENT_PTR(&dev));
2448 		return (ISC_R_UNEXPECTED);
2449 
2450 	err_exit:
2451 		sock->connected = 0;
2452 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2453 
2454 		return (ISC_R_SUCCESS);
2455 	}
2456 
2457 	/*
2458 	 * If connect completed, fire off the done event.
2459 	 */
2460  success:
2461 	if (cc == 0) {
2462 		sock->connected = 1;
2463 		sock->bound = 1;
2464 		dev->result = ISC_R_SUCCESS;
2465 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2466 
2467 		return (ISC_R_SUCCESS);
2468 	}
2469 
2470  queue:
2471 
2472 	/*
2473 	 * Attach to task.
2474 	 */
2475 	isc_task_attach(task, &ntask);
2476 
2477 	sock->connecting = 1;
2478 
2479 	dev->ev_sender = ntask;
2480 
2481 	/*
2482 	 * Poke watcher here.  We still have the socket locked, so there
2483 	 * is no race condition.  We will hold the lock for such a short
2484 	 * time that waking it up now or later won't matter all that much.
2485 	 */
2486 	if (sock->connect_ev == NULL)
2487 		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
2488 
2489 	sock->connect_ev = dev;
2490 
2491 	return (ISC_R_SUCCESS);
2492 }
2493 
2494 /*
2495  * Called when a socket with a pending connect() finishes.
2496  */
2497 static void
2498 internal_connect(isc_task_t *me, isc_event_t *ev) {
2499 	isc__socket_t *sock;
2500 	isc_socket_connev_t *dev;
2501 	isc_task_t *task;
2502 	int cc;
2503 	socklen_t optlen;
2504 	char strbuf[ISC_STRERRORSIZE];
2505 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
2506 
2507 	UNUSED(me);
2508 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2509 
2510 	sock = ev->ev_sender;
2511 	INSIST(VALID_SOCKET(sock));
2512 
2513 	/*
2514 	 * When the internal event was sent the reference count was bumped
2515 	 * to keep the socket around for us.  Decrement the count here.
2516 	 */
2517 	INSIST(sock->references > 0);
2518 	sock->references--;
2519 	if (sock->references == 0) {
2520 		destroy(&sock);
2521 		return;
2522 	}
2523 
2524 	/*
2525 	 * Has this event been canceled?
2526 	 */
2527 	dev = sock->connect_ev;
2528 	if (dev == NULL) {
2529 		INSIST(!sock->connecting);
2530 		return;
2531 	}
2532 
2533 	INSIST(sock->connecting);
2534 	sock->connecting = 0;
2535 
2536 	/*
2537 	 * Get any possible error status here.
2538 	 */
2539 	optlen = sizeof(cc);
2540 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
2541 		       (void *)&cc, (void *)&optlen) < 0)
2542 		cc = errno;
2543 	else
2544 		errno = cc;
2545 
2546 	if (errno != 0) {
2547 		/*
2548 		 * If the error is EAGAIN, just re-select on this
2549 		 * fd and pretend nothing strange happened.
2550 		 */
2551 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
2552 			sock->connecting = 1;
2553 			select_poke(sock->manager, sock->fd,
2554 				    SELECT_POKE_CONNECT);
2555 			return;
2556 		}
2557 
2558 
2559 		/*
2560 		 * Translate other errors into ISC_R_* flavors.
2561 		 */
2562 		switch (errno) {
2563 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
2564 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2565 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2566 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2567 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2568 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2569 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2570 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2571 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2572 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2573 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2574 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
2575 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2576 #undef ERROR_MATCH
2577 		default:
2578 			dev->result = ISC_R_UNEXPECTED;
2579 			isc_sockaddr_format(&sock->peer_address, peerbuf,
2580 					    sizeof(peerbuf));
2581 			isc__strerror(errno, strbuf, sizeof(strbuf));
2582 			UNEXPECTED_ERROR(__FILE__, __LINE__,
2583 					 "internal_connect: connect(%s) %s",
2584 					 peerbuf, strbuf);
2585 		}
2586 	} else {
2587 		dev->result = ISC_R_SUCCESS;
2588 		sock->connected = 1;
2589 		sock->bound = 1;
2590 	}
2591 
2592 	sock->connect_ev = NULL;
2593 
2594 	task = dev->ev_sender;
2595 	dev->ev_sender = sock;
2596 	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2597 }
2598 
2599 /*
2600  * Run through the list of events on this socket, and cancel the ones
2601  * queued for task "task" of type "how".  "how" is a bitmask.
2602  */
2603 void
2604 isc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
2605 	isc__socket_t *sock = (isc__socket_t *)sock0;
2606 
2607 	REQUIRE(VALID_SOCKET(sock));
2608 
2609 	/*
2610 	 * Quick exit if there is nothing to do.  Don't even bother locking
2611 	 * in this case.
2612 	 */
2613 	if (how == 0)
2614 		return;
2615 
2616 	/*
2617 	 * All of these do the same thing, more or less.
2618 	 * Each will:
2619 	 *	o If the internal event is marked as "posted" try to
2620 	 *	  remove it from the task's queue.  If this fails, mark it
2621 	 *	  as canceled instead, and let the task clean it up later.
2622 	 *	o For each I/O request for that task of that type, post
2623 	 *	  its done event with status of "ISC_R_CANCELED".
2624 	 *	o Reset any state needed.
2625 	 */
2626 	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
2627 	    && !ISC_LIST_EMPTY(sock->recv_list)) {
2628 		isc_socketevent_t      *dev;
2629 		isc_socketevent_t      *next;
2630 		isc_task_t	       *current_task;
2631 
2632 		dev = ISC_LIST_HEAD(sock->recv_list);
2633 
2634 		while (dev != NULL) {
2635 			current_task = dev->ev_sender;
2636 			next = ISC_LIST_NEXT(dev, ev_link);
2637 
2638 			if ((task == NULL) || (task == current_task)) {
2639 				dev->result = ISC_R_CANCELED;
2640 				send_recvdone_event(sock, &dev);
2641 			}
2642 			dev = next;
2643 		}
2644 	}
2645 
2646 	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
2647 	    && !ISC_LIST_EMPTY(sock->send_list)) {
2648 		isc_socketevent_t      *dev;
2649 		isc_socketevent_t      *next;
2650 		isc_task_t	       *current_task;
2651 
2652 		dev = ISC_LIST_HEAD(sock->send_list);
2653 
2654 		while (dev != NULL) {
2655 			current_task = dev->ev_sender;
2656 			next = ISC_LIST_NEXT(dev, ev_link);
2657 
2658 			if ((task == NULL) || (task == current_task)) {
2659 				dev->result = ISC_R_CANCELED;
2660 				send_senddone_event(sock, &dev);
2661 			}
2662 			dev = next;
2663 		}
2664 	}
2665 
2666 	/*
2667 	 * Connecting is not a list.
2668 	 */
2669 	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
2670 	    && sock->connect_ev != NULL) {
2671 		isc_socket_connev_t    *dev;
2672 		isc_task_t	       *current_task;
2673 
2674 		INSIST(sock->connecting);
2675 		sock->connecting = 0;
2676 
2677 		dev = sock->connect_ev;
2678 		current_task = dev->ev_sender;
2679 
2680 		if ((task == NULL) || (task == current_task)) {
2681 			sock->connect_ev = NULL;
2682 
2683 			dev->result = ISC_R_CANCELED;
2684 			dev->ev_sender = sock;
2685 			isc_task_sendanddetach(&current_task,
2686 					       ISC_EVENT_PTR(&dev));
2687 		}
2688 	}
2689 
2690 }
2691 
2692 /*
2693  * In our assumed scenario, we can simply use a single static object.
2694  * XXX: this is not true if the application uses multiple threads with
2695  *      'multi-context' mode.  Fixing this is a future TODO item.
2696  */
2697 static isc_socketwait_t swait_private;
2698 
2699 int
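/*
 * How the two entry points below are meant to be paired, as a purely
 * illustrative sketch (the timeout is a stand-in value; passing NULL lets
 * both calls fall back to the static manager):
 *
 *	isc_socketwait_t *swait = NULL;
 *	struct timeval tv = { 10, 0 };
 *	int n;
 *
 *	n = isc__socketmgr_waitevents(NULL, &tv, &swait);
 *	if (n > 0)
 *		(void)isc__socketmgr_dispatch(NULL, swait);
 */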
2700 isc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
2701 			  isc_socketwait_t **swaitp)
2702 {
2703 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2704 	int n;
2705 
2706 	REQUIRE(swaitp != NULL && *swaitp == NULL);
2707 
2708 	if (manager == NULL)
2709 		manager = socketmgr;
2710 	if (manager == NULL)
2711 		return (0);
2712 
2713 	memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize);
2714 	memmove(manager->write_fds_copy, manager->write_fds,
2715 		manager->fd_bufsize);
2716 
2717 	swait_private.readset = manager->read_fds_copy;
2718 	swait_private.writeset = manager->write_fds_copy;
2719 	swait_private.maxfd = manager->maxfd + 1;
2720 
2721 	n = select(swait_private.maxfd, swait_private.readset,
2722 		   swait_private.writeset, NULL, tvp);
2723 
2724 	*swaitp = &swait_private;
2725 	return (n);
2726 }
2727 
2728 isc_result_t
2729 isc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
2730 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2731 
2732 	REQUIRE(swait == &swait_private);
2733 
2734 	if (manager == NULL)
2735 		manager = socketmgr;
2736 	if (manager == NULL)
2737 		return (ISC_R_NOTFOUND);
2738 
2739 	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
2740 	return (ISC_R_SUCCESS);
2741 }
2742 
2743 #include "../socket_api.c"
2744