xref: /openbsd-src/usr.bin/dig/lib/isc/unix/socket.c (revision 87f06ebfa2676f5f3be9c0bb649f609d4128e018)
1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
9  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
11  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
13  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14  * PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 /*! \file */
18 
19 #include <sys/socket.h>
20 #include <sys/time.h>
21 #include <sys/uio.h>
22 
23 #include <netinet/tcp.h>
24 
25 #include <errno.h>
26 #include <fcntl.h>
27 #include <stddef.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <unistd.h>
31 
32 #include <isc/buffer.h>
33 #include <isc/bufferlist.h>
34 
35 #include <isc/list.h>
36 #include <isc/log.h>
37 #include <isc/net.h>
38 #include <isc/region.h>
39 #include <isc/socket.h>
40 #include <isc/strerror.h>
41 #include <isc/task.h>
42 #include <isc/util.h>
43 
44 #include "errno2result.h"
45 
46 #include "socket_p.h"
47 #include "../task_p.h"
48 
49 struct isc_socketwait {
50 	fd_set *readset;
51 	fd_set *writeset;
52 	int nfds;
53 	int maxfd;
54 };
55 
56 /*
57  * Set by the -T dscp option on the command line. If set to a value
58  * other than -1, we check to make sure DSCP values match it, and
59  * assert if not.
60  */
61 int isc_dscp_check_value = -1;
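
/*
 * For reference: DSCP occupies the upper six bits of the IPv4 TOS /
 * IPv6 traffic class octet, hence the "<< 2" on send and the ">> 2" on
 * receive further down.  DSCP 46 (Expedited Forwarding), for example,
 * corresponds to a TOS/TCLASS byte of 0xb8.
 */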
62 
63 /*%
64  * Per-FD lock bucket ID; a single bucket is used here.
65  */
66 #define FDLOCK_ID(fd)		0
67 
68 /*%
69  * Some systems define the socket length argument as an int, some as size_t,
70  * some as socklen_t.  socklen_t is used directly here.
71  */
72 
73 /*%
74  * Define what the possible "soft" errors can be.  These are non-fatal returns
75  * of various network related functions, like recv() and so on.
76  *
77  * For some reason, BSDI (and perhaps others) will sometimes return <0
78  * from recv() but will have errno==0.  This is broken, but we have to
79  * work around it here.
80  */
81 #define SOFT_ERROR(e)	((e) == EAGAIN || \
82 			 (e) == EWOULDBLOCK || \
83 			 (e) == EINTR || \
84 			 (e) == 0)
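
/*
 * Example: a recvmsg() that would block fails with errno == EAGAIN.
 * SOFT_ERROR(EAGAIN) is true, so doio_recv() returns DOIO_SOFT and the
 * request stays queued until select() next reports the descriptor
 * readable.
 */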
85 
86 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
87 
88 /*!<
89  * DLVL(90)  --  Function entry/exit and other tracing.
90  * DLVL(60)  --  Socket data send/receive
91  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
92  * DLVL(20)  --  Socket creation/destruction.
93  */
94 #define TRACE_LEVEL		90
95 #define IOEVENT_LEVEL		60
96 #define EVENT_LEVEL		50
97 #define CREATION_LEVEL		20
98 
99 #define TRACE		DLVL(TRACE_LEVEL)
100 #define IOEVENT		DLVL(IOEVENT_LEVEL)
101 #define EVENT		DLVL(EVENT_LEVEL)
102 #define CREATION	DLVL(CREATION_LEVEL)
103 
104 typedef isc_event_t intev_t;
105 
106 #define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
107 #define VALID_SOCKET(s)		ISC_MAGIC_VALID(s, SOCKET_MAGIC)
108 
109 /*!
110  * IPv6 control information.  If the socket is an IPv6 socket we want
111  * to collect the destination address and interface so the client can
112  * set them on outgoing packets.
113  */
114 
115 /*%
116  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
117  * a setsockopt() like interface to request timestamps, and if the OS
118  * doesn't do it for us, call gettimeofday() on every UDP receive?
119  */
120 
121 /*%
122  * Instead of calculating the cmsgbuf lengths every time, we take a
123  * rule-of-thumb approach: the sizes below are taken from x86_64 Linux
124  * and multiplied by 2, so everything should fit.  Those sizes are not
125  * large enough to cause any concern; the totals are worked out below.
126  */
127 #define CMSG_SP_IN6PKT 40
128 
129 #define CMSG_SP_TIMESTAMP 32
130 
131 #define CMSG_SP_TCTOS 24
132 
133 #define CMSG_SP_INT 24
134 
135 #define RECVCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS)+1)
136 #define SENDCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS)+1)
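
/*
 * Worked totals for the rule of thumb above (illustrative only; the
 * compiler evaluates the macros, nothing uses these literals):
 *
 *	RECVCMSGBUFLEN = 2 * (40 + 32 + 24) + 1 = 193 bytes
 *			 (in6_pktinfo + SCM_TIMESTAMP + TCLASS/TOS)
 *	SENDCMSGBUFLEN = 2 * (40 + 24 + 24) + 1 = 177 bytes
 *			 (in6_pktinfo + int for IPV6_USE_MIN_MTU + TCLASS/TOS)
 */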
137 
138 /*%
139  * The number of times a send operation is repeated if the result is EINTR.
140  */
141 #define NRETRIES 10
142 
143 typedef struct isc__socket isc__socket_t;
144 typedef struct isc__socketmgr isc__socketmgr_t;
145 
146 struct isc__socket {
147 	/* Not locked. */
148 	isc_socket_t		common;
149 	isc__socketmgr_t	*manager;
150 	isc_sockettype_t	type;
151 
152 	/* Locked by socket lock. */
153 	ISC_LINK(isc__socket_t)	link;
154 	unsigned int		references;
155 	int			fd;
156 	int			pf;
157 
158 	ISC_LIST(isc_socketevent_t)		send_list;
159 	ISC_LIST(isc_socketevent_t)		recv_list;
160 	isc_socket_connev_t		       *connect_ev;
161 
162 	/*
163 	 * Internal events.  Posted when a descriptor is readable or
164 	 * writable.  These are statically allocated and never freed.
165 	 * They will be set to non-purgeable before use.
166 	 */
167 	intev_t			readable_ev;
168 	intev_t			writable_ev;
169 
170 	isc_sockaddr_t		peer_address;       /* remote address */
171 
172 	unsigned int		pending_recv : 1,
173 				pending_send : 1,
174 				connected : 1,
175 				connecting : 1,     /* connect pending */
176 				bound : 1,          /* bound to local addr */
177 				active : 1,         /* currently active */
178 				pktdscp : 1;	    /* per packet dscp */
179 	unsigned int		dscp;
180 };
181 
182 #define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
183 #define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
184 
185 struct isc__socketmgr {
186 	/* Not locked. */
187 	isc_socketmgr_t		common;
188 	int			fd_bufsize;
189 	unsigned int		maxsocks;
190 
191 	isc__socket_t	       **fds;
192 	int			*fdstate;
193 
194 	/* Locked by manager lock. */
195 	ISC_LIST(isc__socket_t)	socklist;
196 	fd_set			*read_fds;
197 	fd_set			*read_fds_copy;
198 	fd_set			*write_fds;
199 	fd_set			*write_fds_copy;
200 	int			maxfd;
201 	unsigned int		refs;
202 };
203 
204 static isc__socketmgr_t *socketmgr = NULL;
205 
206 #define CLOSED			0	/* this one must be zero */
207 #define MANAGED			1
208 #define CLOSE_PENDING		2
209 
210 /*
211  * send() and recv() iovec counts
212  */
213 #define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
214 #define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
215 
216 static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
217 				  isc_sockettype_t type,
218 				  isc_socket_t **socketp);
219 static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
220 static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
221 static void free_socket(isc__socket_t **);
222 static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
223 				    isc__socket_t **);
224 static void destroy(isc__socket_t **);
225 static void internal_connect(isc_task_t *, isc_event_t *);
226 static void internal_recv(isc_task_t *, isc_event_t *);
227 static void internal_send(isc_task_t *, isc_event_t *);
228 static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
229 static void build_msghdr_send(isc__socket_t *, char *, isc_socketevent_t *,
230 			      struct msghdr *, struct iovec *, size_t *);
231 static void build_msghdr_recv(isc__socket_t *, char *, isc_socketevent_t *,
232 			      struct msghdr *, struct iovec *, size_t *);
233 
234 /*%
235  * The following are intended for internal use (indicated by "isc__"
236  * prefix) but are not declared as static, allowing direct access from
237  * unit tests etc.
238  */
239 
240 isc_result_t
241 isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
242 		   isc_socket_t **socketp);
243 void
244 isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
245 void
246 isc__socket_detach(isc_socket_t **socketp);
247 isc_result_t
248 isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
249 		 unsigned int minimum, isc_task_t *task,
250 		  isc_taskaction_t action, void *arg);
251 isc_result_t
252 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
253 		  isc_task_t *task, isc_taskaction_t action, void *arg);
254 isc_result_t
255 isc__socket_sendtov2(isc_socket_t *sock, isc_bufferlist_t *buflist,
256 		     isc_task_t *task, isc_taskaction_t action, void *arg,
257 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
258 		     unsigned int flags);
259 isc_result_t
260 isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
261 		 unsigned int options);
262 isc_result_t
263 isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
264 		    isc_task_t *task, isc_taskaction_t action,
265 		    void *arg);
266 void
267 isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
268 
269 isc_result_t
270 isc__socketmgr_create(isc_socketmgr_t **managerp);
271 isc_result_t
272 isc__socketmgr_create2(isc_socketmgr_t **managerp,
273 		       unsigned int maxsocks);
274 isc_result_t
275 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp);
276 void
277 isc__socketmgr_destroy(isc_socketmgr_t **managerp);
278 
279 static struct {
280 	isc_socketmethods_t methods;
281 
282 	/*%
283 	 * The following exist just to keep otherwise-unused functions referenced.
284 	 */
285 	void *recvv, *sendv;
286 } socketmethods = {
287 	{
288 		isc__socket_attach,
289 		isc__socket_detach,
290 		isc__socket_bind,
291 		isc__socket_connect,
292 		isc__socket_cancel,
293 	},
294 	(void *)isc__socket_recvv,
295 	(void *)isc__socket_sendv,
296 };
297 
298 static isc_socketmgrmethods_t socketmgrmethods = {
299 	isc__socketmgr_destroy,
300 	isc__socket_create
301 };
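
/*
 * A minimal sketch of how the generic isc_socket_*() wrappers (defined
 * outside this file) are expected to dispatch through the method tables
 * above; the real glue code may differ in detail:
 *
 *	void
 *	isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
 *		sock->methods->attach(sock, socketp);
 *	}
 */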
302 
303 #define SELECT_POKE_SHUTDOWN		(-1)
304 #define SELECT_POKE_READ		(-3)
305 #define SELECT_POKE_WRITE		(-4)
306 #define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
307 #define SELECT_POKE_CLOSE		(-5)
308 
309 #define SOCK_DEAD(s)			((s)->references == 0)
310 
311 /*%
312  * Shortcut index arrays to get access to statistics counters.
313  */
314 enum {
315 	STATID_OPEN = 0,
316 	STATID_OPENFAIL = 1,
317 	STATID_CLOSE = 2,
318 	STATID_BINDFAIL = 3,
319 	STATID_CONNECTFAIL = 4,
320 	STATID_CONNECT = 5,
321 	STATID_ACCEPTFAIL = 6,
322 	STATID_ACCEPT = 7,
323 	STATID_SENDFAIL = 8,
324 	STATID_RECVFAIL = 9,
325 	STATID_ACTIVE = 10
326 };
327 
328 
329 static void
330 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
331 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
332 	   const char *fmt, ...) __attribute__((__format__(__printf__, 6, 7)));
333 static void
334 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
335 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
336 	   const char *fmt, ...)
337 {
338 	char msgbuf[2048];
339 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
340 	va_list ap;
341 
342 	if (! isc_log_wouldlog(isc_lctx, level))
343 		return;
344 
345 	va_start(ap, fmt);
346 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
347 	va_end(ap);
348 
349 	if (address == NULL) {
350 		isc_log_write(isc_lctx, category, module, level,
351 			       "socket %p: %s", sock, msgbuf);
352 	} else {
353 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
354 		isc_log_write(isc_lctx, category, module, level,
355 			       "socket %p %s: %s", sock, peerbuf, msgbuf);
356 	}
357 }
358 
359 static inline isc_result_t
360 watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
361 	isc_result_t result = ISC_R_SUCCESS;
362 
363 	if (msg == SELECT_POKE_READ)
364 		FD_SET(fd, manager->read_fds);
365 	if (msg == SELECT_POKE_WRITE)
366 		FD_SET(fd, manager->write_fds);
367 
368 	return (result);
369 }
370 
371 static inline isc_result_t
372 unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
373 	isc_result_t result = ISC_R_SUCCESS;
374 
375 	if (msg == SELECT_POKE_READ)
376 		FD_CLR(fd, manager->read_fds);
377 	else if (msg == SELECT_POKE_WRITE)
378 		FD_CLR(fd, manager->write_fds);
379 
380 	return (result);
381 }
382 
383 static void
384 wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
385 	isc_result_t result;
386 
387 	/*
388 	 * This is a wakeup on a socket.  If the socket is not in the
389 	 * process of being closed, start watching it for either reads
390 	 * or writes.
391 	 */
392 
393 	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
394 
395 	if (msg == SELECT_POKE_CLOSE) {
396 		/* No one should be updating fdstate, so no need to lock it */
397 		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
398 		manager->fdstate[fd] = CLOSED;
399 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
400 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
401 		(void)close(fd);
402 		return;
403 	}
404 
405 	if (manager->fdstate[fd] == CLOSE_PENDING) {
406 
407 		/*
408 		 * We accept (and ignore) any error from unwatch_fd() as we are
409 		 * closing the socket, hoping it doesn't leave dangling state in
410 		 * the kernel.
411 		 */
412 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
413 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
414 		return;
415 	}
416 	if (manager->fdstate[fd] != MANAGED) {
417 		return;
418 	}
419 
420 	/*
421 	 * Set requested bit.
422 	 */
423 	result = watch_fd(manager, fd, msg);
424 	if (result != ISC_R_SUCCESS) {
425 		/*
426 		 * XXXJT: what should we do?  Ignoring the failure of watching
427 		 * a socket will make the application dysfunctional, but there
428 		 * seems to be no reasonable recovery process.
429 		 */
430 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
431 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
432 			      "failed to start watching FD (%d): %s",
433 			      fd, isc_result_totext(result));
434 	}
435 }
436 
437 /*
438  * Update the state of the socketmgr when something changes.
439  */
440 static void
441 select_poke(isc__socketmgr_t *manager, int fd, int msg) {
442 	if (msg == SELECT_POKE_SHUTDOWN)
443 		return;
444 	else if (fd >= 0)
445 		wakeup_socket(manager, fd, msg);
446 	return;
447 }
448 
449 /*
450  * Make a fd non-blocking.
451  */
452 static isc_result_t
453 make_nonblock(int fd) {
454 	int ret;
455 	char strbuf[ISC_STRERRORSIZE];
456 	int flags;
457 
458 	flags = fcntl(fd, F_GETFL, 0);
459 	flags |= O_NONBLOCK;
460 	ret = fcntl(fd, F_SETFL, flags);
461 
462 	if (ret == -1) {
463 		isc__strerror(errno, strbuf, sizeof(strbuf));
464 		UNEXPECTED_ERROR(__FILE__, __LINE__,
465 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
466 				 strbuf);
467 
468 		return (ISC_R_UNEXPECTED);
469 	}
470 
471 	return (ISC_R_SUCCESS);
472 }
473 
474 /*
475  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
476  * In order to ensure as much portability as possible, we provide wrapper
477  * functions for these macros.
478  * Note that cmsg_space() could run slow on OSes that do not have
479  * CMSG_SPACE.
480  */
481 static inline socklen_t
482 cmsg_len(socklen_t len) {
483 	return (CMSG_LEN(len));
484 }
485 
486 static inline socklen_t
487 cmsg_space(socklen_t len) {
488 	return (CMSG_SPACE(len));
489 }
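
/*
 * Typical use of the wrappers above, mirroring build_msghdr_send()
 * below (a sketch only; the variable names are illustrative):
 *
 *	struct cmsghdr *cmsgp = (struct cmsghdr *)cmsgbuf;
 *	msg.msg_control = cmsgbuf;
 *	msg.msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
 *	cmsgp->cmsg_level = IPPROTO_IPV6;
 *	cmsgp->cmsg_type = IPV6_PKTINFO;
 *	cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
 *	memmove(CMSG_DATA(cmsgp), &pktinfo, sizeof(struct in6_pktinfo));
 */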
490 
491 /*
492  * Process control messages received on a socket.
493  */
494 static void
495 process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
496 	struct cmsghdr *cmsgp;
497 	struct in6_pktinfo *pktinfop;
498 	void *timevalp;
499 
500 	/*
501 	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
502 	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
503 	 * They are all here, outside of the CPP tests, because it is
504 	 * more consistent with the usual ISC coding style.
505 	 */
506 	UNUSED(sock);
507 	UNUSED(msg);
508 	UNUSED(dev);
509 
510 	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
511 		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
512 
513 	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
514 		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
515 
516 	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
517 		return;
518 
519 	timevalp = NULL;
520 	pktinfop = NULL;
521 
522 	cmsgp = CMSG_FIRSTHDR(msg);
523 	while (cmsgp != NULL) {
524 		socket_log(sock, NULL, TRACE,
525 			   "processing cmsg %p", cmsgp);
526 
527 		if (cmsgp->cmsg_level == IPPROTO_IPV6
528 		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
529 
530 			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
531 			memmove(&dev->pktinfo, pktinfop,
532 				sizeof(struct in6_pktinfo));
533 			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
534 			socket_log(sock, NULL, TRACE,
535 				   "interface received on ifindex %u",
536 				   dev->pktinfo.ipi6_ifindex);
537 			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
538 				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
539 			goto next;
540 		}
541 
542 		if (cmsgp->cmsg_level == SOL_SOCKET
543 		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
544 			struct timeval tv;
545 			timevalp = CMSG_DATA(cmsgp);
546 			memmove(&tv, timevalp, sizeof(tv));
547 			dev->timestamp.seconds = tv.tv_sec;
548 			dev->timestamp.nanoseconds = tv.tv_usec * 1000;
549 			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
550 			goto next;
551 		}
552 
553 		if (cmsgp->cmsg_level == IPPROTO_IPV6
554 		    && cmsgp->cmsg_type == IPV6_TCLASS) {
555 			dev->dscp = *(int *)CMSG_DATA(cmsgp);
556 			dev->dscp >>= 2;
557 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
558 			goto next;
559 		}
560 
561 		if (cmsgp->cmsg_level == IPPROTO_IP
562 		    && (cmsgp->cmsg_type == IP_TOS)) {
563 			dev->dscp = (int) *(unsigned char *)CMSG_DATA(cmsgp);
564 			dev->dscp >>= 2;
565 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
566 			goto next;
567 		}
568 	next:
569 		cmsgp = CMSG_NXTHDR(msg, cmsgp);
570 	}
571 
572 }
573 
574 /*
575  * Construct an iov array and attach it to the msghdr passed in.  This is
576  * the SEND constructor, which will use the used region of the buffer
577  * (if using a buffer list) or will use the internal region (if a single
578  * buffer I/O is requested).
579  *
580  * Nothing can be NULL, and the done event must list at least one buffer
581  * on the buffer linked list for this function to be meaningful.
582  *
583  * If write_countp != NULL, *write_countp will hold the number of bytes
584  * this transaction can send.
585  */
586 static void
587 build_msghdr_send(isc__socket_t *sock, char* cmsgbuf, isc_socketevent_t *dev,
588 		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
589 {
590 	unsigned int iovcount;
591 	isc_buffer_t *buffer;
592 	isc_region_t used;
593 	size_t write_count;
594 	size_t skip_count;
595 	struct cmsghdr *cmsgp;
596 
597 	memset(msg, 0, sizeof(*msg));
598 
599 	if (!sock->connected) {
600 		msg->msg_name = (void *)&dev->address.type.sa;
601 		msg->msg_namelen = dev->address.length;
602 	} else {
603 		msg->msg_name = NULL;
604 		msg->msg_namelen = 0;
605 	}
606 
607 	buffer = ISC_LIST_HEAD(dev->bufferlist);
608 	write_count = 0;
609 	iovcount = 0;
610 
611 	/*
612 	 * Single buffer I/O?  Skip what we've done so far in this region.
613 	 */
614 	if (buffer == NULL) {
615 		write_count = dev->region.length - dev->n;
616 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
617 		iov[0].iov_len = write_count;
618 		iovcount = 1;
619 
620 		goto config;
621 	}
622 
623 	/*
624 	 * Multibuffer I/O.
625 	 * Skip the data in the buffer list that we have already written.
626 	 */
627 	skip_count = dev->n;
628 	while (buffer != NULL) {
629 		REQUIRE(ISC_BUFFER_VALID(buffer));
630 		if (skip_count < isc_buffer_usedlength(buffer))
631 			break;
632 		skip_count -= isc_buffer_usedlength(buffer);
633 		buffer = ISC_LIST_NEXT(buffer, link);
634 	}
635 
636 	while (buffer != NULL) {
637 		INSIST(iovcount < MAXSCATTERGATHER_SEND);
638 
639 		isc_buffer_usedregion(buffer, &used);
640 
641 		if (used.length > 0) {
642 			iov[iovcount].iov_base = (void *)(used.base
643 							  + skip_count);
644 			iov[iovcount].iov_len = used.length - skip_count;
645 			write_count += (used.length - skip_count);
646 			skip_count = 0;
647 			iovcount++;
648 		}
649 		buffer = ISC_LIST_NEXT(buffer, link);
650 	}
651 
652 	INSIST(skip_count == 0U);
653 
654  config:
655 	msg->msg_iov = iov;
656 	msg->msg_iovlen = iovcount;
657 
658 	msg->msg_control = NULL;
659 	msg->msg_controllen = 0;
660 	msg->msg_flags = 0;
661 
662 	if ((sock->type == isc_sockettype_udp) &&
663 	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
664 	{
665 		struct in6_pktinfo *pktinfop;
666 
667 		socket_log(sock, NULL, TRACE,
668 			   "sendto pktinfo data, ifindex %u",
669 			   dev->pktinfo.ipi6_ifindex);
670 
671 		msg->msg_control = (void *)cmsgbuf;
672 		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
673 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
674 
675 		cmsgp = (struct cmsghdr *)cmsgbuf;
676 		cmsgp->cmsg_level = IPPROTO_IPV6;
677 		cmsgp->cmsg_type = IPV6_PKTINFO;
678 		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
679 		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
680 		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
681 	}
682 
683 	if ((sock->type == isc_sockettype_udp) &&
684 	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
685 	{
686 		int use_min_mtu = 1;	/* -1, 0, 1 */
687 
688 		cmsgp = (struct cmsghdr *)(cmsgbuf +
689 					   msg->msg_controllen);
690 
691 		msg->msg_control = (void *)cmsgbuf;
692 		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
693 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
694 
695 		cmsgp->cmsg_level = IPPROTO_IPV6;
696 		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
697 		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
698 		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
699 	}
700 
701 	if (isc_dscp_check_value > -1) {
702 		if (sock->type == isc_sockettype_udp)
703 			INSIST((int)dev->dscp == isc_dscp_check_value);
704 		else if (sock->type == isc_sockettype_tcp)
705 			INSIST((int)sock->dscp == isc_dscp_check_value);
706 	}
707 
708 	if ((sock->type == isc_sockettype_udp) &&
709 	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
710 	{
711 		int dscp = (dev->dscp << 2) & 0xff;
712 
713 		INSIST(dev->dscp < 0x40);
714 
715 		if (sock->pf == AF_INET && sock->pktdscp) {
716 			cmsgp = (struct cmsghdr *)(cmsgbuf +
717 						   msg->msg_controllen);
718 			msg->msg_control = (void *)cmsgbuf;
719 			msg->msg_controllen += cmsg_space(sizeof(dscp));
720 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
721 
722 			cmsgp->cmsg_level = IPPROTO_IP;
723 			cmsgp->cmsg_type = IP_TOS;
724 			cmsgp->cmsg_len = cmsg_len(sizeof(char));
725 			*(unsigned char*)CMSG_DATA(cmsgp) = dscp;
726 		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
727 			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
728 			       (void *)&dscp, sizeof(int)) < 0)
729 			{
730 				char strbuf[ISC_STRERRORSIZE];
731 				isc__strerror(errno, strbuf, sizeof(strbuf));
732 				UNEXPECTED_ERROR(__FILE__, __LINE__,
733 						 "setsockopt(%d, IP_TOS, %.02x)"
734 						 " %s: %s",
735 						 sock->fd, dscp >> 2,
736 						 "failed", strbuf);
737 			} else
738 				sock->dscp = dscp;
739 		}
740 
741 		if (sock->pf == AF_INET6 && sock->pktdscp) {
742 			cmsgp = (struct cmsghdr *)(cmsgbuf +
743 						   msg->msg_controllen);
744 			msg->msg_control = (void *)cmsgbuf;
745 			msg->msg_controllen += cmsg_space(sizeof(dscp));
746 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
747 
748 			cmsgp->cmsg_level = IPPROTO_IPV6;
749 			cmsgp->cmsg_type = IPV6_TCLASS;
750 			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
751 			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
752 		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
753 			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
754 				       (void *)&dscp, sizeof(int)) < 0) {
755 				char strbuf[ISC_STRERRORSIZE];
756 				isc__strerror(errno, strbuf, sizeof(strbuf));
757 				UNEXPECTED_ERROR(__FILE__, __LINE__,
758 						 "setsockopt(%d, IPV6_TCLASS, "
759 						 "%.02x) %s: %s",
760 						 sock->fd, dscp >> 2,
761 						 "failed", strbuf);
762 			} else
763 				sock->dscp = dscp;
764 		}
765 
766 		if (msg->msg_controllen != 0 &&
767 		    msg->msg_controllen < SENDCMSGBUFLEN)
768 		{
769 			memset(cmsgbuf + msg->msg_controllen, 0,
770 			       SENDCMSGBUFLEN - msg->msg_controllen);
771 		}
772 	}
773 
774 	if (write_countp != NULL)
775 		*write_countp = write_count;
776 }
777 
778 /*
779  * Construct an iov array and attach it to the msghdr passed in.  This is
780  * the RECV constructor, which will use the available region of the buffer
781  * (if using a buffer list) or will use the internal region (if a single
782  * buffer I/O is requested).
783  *
784  * Nothing can be NULL, and the done event must list at least one buffer
785  * on the buffer linked list for this function to be meaningful.
786  *
787  * If read_countp != NULL, *read_countp will hold the number of bytes
788  * this transaction can receive.
789  */
790 static void
791 build_msghdr_recv(isc__socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
792 		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
793 {
794 	unsigned int iovcount;
795 	isc_buffer_t *buffer;
796 	isc_region_t available;
797 	size_t read_count;
798 
799 	memset(msg, 0, sizeof(struct msghdr));
800 
801 	if (sock->type == isc_sockettype_udp) {
802 		memset(&dev->address, 0, sizeof(dev->address));
803 		msg->msg_name = (void *)&dev->address.type.sa;
804 		msg->msg_namelen = sizeof(dev->address.type);
805 	} else { /* TCP */
806 		msg->msg_name = NULL;
807 		msg->msg_namelen = 0;
808 		dev->address = sock->peer_address;
809 	}
810 
811 	buffer = ISC_LIST_HEAD(dev->bufferlist);
812 	read_count = 0;
813 
814 	/*
815 	 * Single buffer I/O?  Skip what we've done so far in this region.
816 	 */
817 	if (buffer == NULL) {
818 		read_count = dev->region.length - dev->n;
819 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
820 		iov[0].iov_len = read_count;
821 		iovcount = 1;
822 
823 		goto config;
824 	}
825 
826 	/*
827 	 * Multibuffer I/O.
828 	 * Skip empty buffers.
829 	 */
830 	while (buffer != NULL) {
831 		REQUIRE(ISC_BUFFER_VALID(buffer));
832 		if (isc_buffer_availablelength(buffer) != 0)
833 			break;
834 		buffer = ISC_LIST_NEXT(buffer, link);
835 	}
836 
837 	iovcount = 0;
838 	while (buffer != NULL) {
839 		INSIST(iovcount < MAXSCATTERGATHER_RECV);
840 
841 		isc_buffer_availableregion(buffer, &available);
842 
843 		if (available.length > 0) {
844 			iov[iovcount].iov_base = (void *)(available.base);
845 			iov[iovcount].iov_len = available.length;
846 			read_count += available.length;
847 			iovcount++;
848 		}
849 		buffer = ISC_LIST_NEXT(buffer, link);
850 	}
851 
852  config:
853 
854 	/*
855 	 * Attach the iovec array and the control message buffer to the msghdr.
856 	 */
857 	msg->msg_iov = iov;
858 	msg->msg_iovlen = iovcount;
859 
860 	msg->msg_control = cmsgbuf;
861 	msg->msg_controllen = RECVCMSGBUFLEN;
862 	msg->msg_flags = 0;
863 
864 	if (read_countp != NULL)
865 		*read_countp = read_count;
866 }
867 
868 static void
869 set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock,
870 		isc_socketevent_t *dev)
871 {
872 	if (sock->type == isc_sockettype_udp) {
873 		if (address != NULL)
874 			dev->address = *address;
875 		else
876 			dev->address = sock->peer_address;
877 	} else if (sock->type == isc_sockettype_tcp) {
878 		INSIST(address == NULL);
879 		dev->address = sock->peer_address;
880 	}
881 }
882 
883 static void
884 destroy_socketevent(isc_event_t *event) {
885 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
886 
887 	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
888 
889 	(ev->destroy)(event);
890 }
891 
892 static isc_socketevent_t *
893 allocate_socketevent(void *sender,
894 		     isc_eventtype_t eventtype, isc_taskaction_t action,
895 		     void *arg)
896 {
897 	isc_socketevent_t *ev;
898 
899 	ev = (isc_socketevent_t *)isc_event_allocate(sender,
900 						     eventtype, action, arg,
901 						     sizeof(*ev));
902 
903 	if (ev == NULL)
904 		return (NULL);
905 
906 	ev->result = ISC_R_UNSET;
907 	ISC_LINK_INIT(ev, ev_link);
908 	ISC_LIST_INIT(ev->bufferlist);
909 	ev->region.base = NULL;
910 	ev->n = 0;
911 	ev->offset = 0;
912 	ev->attributes = 0;
913 	ev->destroy = ev->ev_destroy;
914 	ev->ev_destroy = destroy_socketevent;
915 	ev->dscp = 0;
916 
917 	return (ev);
918 }
919 
920 #define DOIO_SUCCESS		0	/* i/o ok, event sent */
921 #define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
922 #define DOIO_HARD		2	/* i/o error, event sent */
923 #define DOIO_EOF		3	/* EOF, no event sent */
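
/*
 * How the callers react (see internal_recv() and internal_send()):
 * DOIO_SUCCESS and DOIO_HARD post the completion event, DOIO_SOFT leaves
 * the request queued and re-arms the descriptor via select_poke(), and
 * DOIO_EOF flushes the entire receive queue with ISC_R_EOF.
 */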
924 
925 static int
926 doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
927 	int cc;
928 	struct iovec iov[MAXSCATTERGATHER_RECV];
929 	size_t read_count;
930 	size_t actual_count;
931 	struct msghdr msghdr;
932 	isc_buffer_t *buffer;
933 	int recv_errno;
934 	char strbuf[ISC_STRERRORSIZE];
935 	char cmsgbuf[RECVCMSGBUFLEN] = {0};
936 
937 	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);
938 
939 	cc = recvmsg(sock->fd, &msghdr, 0);
940 	recv_errno = errno;
941 
942 	if (cc < 0) {
943 		if (SOFT_ERROR(recv_errno))
944 			return (DOIO_SOFT);
945 
946 		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
947 			isc__strerror(recv_errno, strbuf, sizeof(strbuf));
948 			socket_log(sock, NULL, IOEVENT,
949 				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
950 				   sock->fd, cc, recv_errno, strbuf);
951 		}
952 
953 #define SOFT_OR_HARD(_system, _isc) \
954 	if (recv_errno == _system) { \
955 		if (sock->connected) { \
956 			dev->result = _isc; \
957 			return (DOIO_HARD); \
958 		} \
959 		return (DOIO_SOFT); \
960 	}
961 #define ALWAYS_HARD(_system, _isc) \
962 	if (recv_errno == _system) { \
963 		dev->result = _isc; \
964 		return (DOIO_HARD); \
965 	}
966 
967 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
968 		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
969 		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
970 		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
971 		/* HPUX 11.11 can return EADDRNOTAVAIL. */
972 		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
973 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
974 		/* Should never get this one but it was seen. */
975 		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
976 		/*
977 		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
978 		 * errors.
979 		 */
980 		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
981 		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
982 
983 #undef SOFT_OR_HARD
984 #undef ALWAYS_HARD
985 
986 		dev->result = isc__errno2result(recv_errno);
987 		return (DOIO_HARD);
988 	}
989 
990 	/*
991 	 * On TCP and UNIX sockets, zero length reads indicate EOF,
992 	 * while on UDP sockets, zero length reads are perfectly valid,
993 	 * although strange.
994 	 */
995 	switch (sock->type) {
996 	case isc_sockettype_tcp:
997 		if (cc == 0)
998 			return (DOIO_EOF);
999 		break;
1000 	case isc_sockettype_udp:
1001 		break;
1002 	default:
1003 		INSIST(0);
1004 	}
1005 
1006 	if (sock->type == isc_sockettype_udp) {
1007 		dev->address.length = msghdr.msg_namelen;
1008 		if (isc_sockaddr_getport(&dev->address) == 0) {
1009 			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1010 				socket_log(sock, &dev->address, IOEVENT,
1011 					   "dropping source port zero packet");
1012 			}
1013 			return (DOIO_SOFT);
1014 		}
1015 	}
1016 
1017 	socket_log(sock, &dev->address, IOEVENT,
1018 		   "packet received correctly");
1019 
1020 	/*
1021 	 * Truncation (overflow) is detected in process_cmsg(), which sets
1022 	 * ISC_SOCKEVENTATTR_TRUNC in the dev entry when the kernel reports
1023 	 * MSG_TRUNC on the received message.
1024 	 */
1025 	/*
1026 	 * If there are control messages attached, run through them and pull
1027 	 * out the interesting bits.
1028 	 */
1029 	process_cmsg(sock, &msghdr, dev);
1030 
1031 	/*
1032 	 * update the buffers (if any) and the i/o count
1033 	 */
1034 	dev->n += cc;
1035 	actual_count = cc;
1036 	buffer = ISC_LIST_HEAD(dev->bufferlist);
1037 	while (buffer != NULL && actual_count > 0U) {
1038 		REQUIRE(ISC_BUFFER_VALID(buffer));
1039 		if (isc_buffer_availablelength(buffer) <= actual_count) {
1040 			actual_count -= isc_buffer_availablelength(buffer);
1041 			isc_buffer_add(buffer,
1042 				       isc_buffer_availablelength(buffer));
1043 		} else {
1044 			isc_buffer_add(buffer, actual_count);
1045 			actual_count = 0;
1046 			POST(actual_count);
1047 			break;
1048 		}
1049 		buffer = ISC_LIST_NEXT(buffer, link);
1050 		if (buffer == NULL) {
1051 			INSIST(actual_count == 0U);
1052 		}
1053 	}
1054 
1055 	/*
1056 	 * If we read less than we expected, update counters,
1057 	 * and let the upper layer poke the descriptor.
1058 	 */
1059 	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1060 		return (DOIO_SOFT);
1061 
1062 	/*
1063 	 * Full reads are posted, or partials if partials are ok.
1064 	 */
1065 	dev->result = ISC_R_SUCCESS;
1066 	return (DOIO_SUCCESS);
1067 }
1068 
1069 /*
1070  * Returns:
1071  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1072  *			ISC_R_SUCCESS.
1073  *
1074  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1075  *			dev->result contains the appropriate error.
1076  *
1077  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1078  *			event was sent.  The operation should be retried.
1079  *
1080  *	No other return values are possible.
1081  */
1082 static int
1083 doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
1084 	int cc;
1085 	struct iovec iov[MAXSCATTERGATHER_SEND];
1086 	size_t write_count;
1087 	struct msghdr msghdr;
1088 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1089 	int attempts = 0;
1090 	int send_errno;
1091 	char strbuf[ISC_STRERRORSIZE];
1092 	char cmsgbuf[SENDCMSGBUFLEN] = {0};
1093 
1094 	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
1095 
1096  resend:
1097 	cc = sendmsg(sock->fd, &msghdr, 0);
1098 	send_errno = errno;
1099 
1100 	/*
1101 	 * Check for error or block condition.
1102 	 */
1103 	if (cc < 0) {
1104 		if (send_errno == EINTR && ++attempts < NRETRIES)
1105 			goto resend;
1106 
1107 		if (SOFT_ERROR(send_errno)) {
1108 			if (errno == EWOULDBLOCK || errno == EAGAIN)
1109 				dev->result = ISC_R_WOULDBLOCK;
1110 			return (DOIO_SOFT);
1111 		}
1112 
1113 #define SOFT_OR_HARD(_system, _isc) \
1114 	if (send_errno == _system) { \
1115 		if (sock->connected) { \
1116 			dev->result = _isc; \
1117 			return (DOIO_HARD); \
1118 		} \
1119 		return (DOIO_SOFT); \
1120 	}
1121 #define ALWAYS_HARD(_system, _isc) \
1122 	if (send_errno == _system) { \
1123 		dev->result = _isc; \
1124 		return (DOIO_HARD); \
1125 	}
1126 
1127 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1128 		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1129 		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1130 		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1131 		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1132 		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1133 		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1134 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1135 		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1136 		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1137 		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1138 
1139 #undef SOFT_OR_HARD
1140 #undef ALWAYS_HARD
1141 
1142 		/*
1143 		 * The other error types depend on whether or not the
1144 		 * socket is UDP or TCP.  If it is UDP, some errors
1145 		 * that we expect to be fatal under TCP are merely
1146 		 * annoying, and are really soft errors.
1147 		 *
1148 		 * However, these soft errors are still returned as
1149 		 * a status.
1150 		 */
1151 		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1152 		isc__strerror(send_errno, strbuf, sizeof(strbuf));
1153 		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1154 				 addrbuf, strbuf);
1155 		dev->result = isc__errno2result(send_errno);
1156 		return (DOIO_HARD);
1157 	}
1158 
1159 	if (cc == 0) {
1160 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1161 				 "doio_send: send() %s 0", "returned");
1162 	}
1163 
1164 	/*
1165 	 * If we write less than we expected, update counters, poke.
1166 	 */
1167 	dev->n += cc;
1168 	if ((size_t)cc != write_count)
1169 		return (DOIO_SOFT);
1170 
1171 	/*
1172 	 * Exactly what we wanted to write.  We're done with this
1173 	 * entry.  Post its completion event.
1174 	 */
1175 	dev->result = ISC_R_SUCCESS;
1176 	return (DOIO_SUCCESS);
1177 }
1178 
1179 /*
1180  * Kill.
1181  *
1182  * Caller must ensure that the socket is not locked and no external
1183  * references exist.
1184  */
1185 static void
1186 socketclose(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) {
1187 	/*
1188 	 * No one has this socket open, so the watcher doesn't have to be
1189 	 * poked, and the socket doesn't have to be locked.
1190 	 */
1191 	manager->fds[fd] = NULL;
1192 	manager->fdstate[fd] = CLOSE_PENDING;
1193 	select_poke(manager, fd, SELECT_POKE_CLOSE);
1194 
1195 	if (sock->active == 1) {
1196 		sock->active = 0;
1197 	}
1198 
1199 	/*
1200 	 * update manager->maxfd here (XXX: this should be implemented more
1201 	 * efficiently)
1202 	 */
1203 	if (manager->maxfd == fd) {
1204 		int i;
1205 
1206 		manager->maxfd = 0;
1207 		for (i = fd - 1; i >= 0; i--) {
1208 			if (manager->fdstate[i] == MANAGED) {
1209 				manager->maxfd = i;
1210 				break;
1211 			}
1212 		}
1213 	}
1214 
1215 }
1216 
1217 static void
1218 destroy(isc__socket_t **sockp) {
1219 	int fd;
1220 	isc__socket_t *sock = *sockp;
1221 	isc__socketmgr_t *manager = sock->manager;
1222 
1223 	socket_log(sock, NULL, CREATION, "destroying");
1224 
1225 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1226 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1227 	INSIST(sock->connect_ev == NULL);
1228 	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1229 
1230 	if (sock->fd >= 0) {
1231 		fd = sock->fd;
1232 		sock->fd = -1;
1233 		socketclose(manager, sock, fd);
1234 	}
1235 
1236 	ISC_LIST_UNLINK(manager->socklist, sock, link);
1237 
1238 	/* can't unlock manager as its memory context is still used */
1239 	free_socket(sockp);
1240 }
1241 
1242 static isc_result_t
1243 allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
1244 		isc__socket_t **socketp)
1245 {
1246 	isc__socket_t *sock;
1247 
1248 	sock = malloc(sizeof(*sock));
1249 
1250 	if (sock == NULL)
1251 		return (ISC_R_NOMEMORY);
1252 
1253 	sock->common.magic = 0;
1254 	sock->common.impmagic = 0;
1255 	sock->references = 0;
1256 
1257 	sock->manager = manager;
1258 	sock->type = type;
1259 	sock->fd = -1;
1260 	sock->dscp = 0;		/* TOS/TCLASS is zero until set. */
1261 	sock->active = 0;
1262 
1263 	ISC_LINK_INIT(sock, link);
1264 
1265 	/*
1266 	 * Set up list of readers and writers to be initially empty.
1267 	 */
1268 	ISC_LIST_INIT(sock->recv_list);
1269 	ISC_LIST_INIT(sock->send_list);
1270 	sock->connect_ev = NULL;
1271 	sock->pending_recv = 0;
1272 	sock->pending_send = 0;
1273 	sock->connected = 0;
1274 	sock->connecting = 0;
1275 	sock->bound = 0;
1276 	sock->pktdscp = 0;
1277 
1278 	/*
1279 	 * Initialize readable and writable events.
1280 	 */
1281 	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1282 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1283 		       NULL, sock, sock, NULL);
1284 	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1285 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1286 		       NULL, sock, sock, NULL);
1287 
1288 	sock->common.magic = ISCAPI_SOCKET_MAGIC;
1289 	sock->common.impmagic = SOCKET_MAGIC;
1290 	*socketp = sock;
1291 
1292 	return (ISC_R_SUCCESS);
1293 }
1294 
1295 /*
1296  * This routine requires that the various lists be empty, that the
1297  * reference count be zero, and that the magic number be valid.  The
1298  * associated fd must already have been marked as closed by setting it
1299  * to -1; this routine frees the structure but does not close the
1300  * descriptor itself.
1301  */
1302 static void
1303 free_socket(isc__socket_t **socketp) {
1304 	isc__socket_t *sock = *socketp;
1305 
1306 	INSIST(VALID_SOCKET(sock));
1307 	INSIST(sock->references == 0);
1308 	INSIST(!sock->connecting);
1309 	INSIST(!sock->pending_recv);
1310 	INSIST(!sock->pending_send);
1311 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1312 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1313 	INSIST(!ISC_LINK_LINKED(sock, link));
1314 
1315 	sock->common.magic = 0;
1316 	sock->common.impmagic = 0;
1317 
1318 	free(sock);
1319 
1320 	*socketp = NULL;
1321 }
1322 
1323 static void
1324 use_min_mtu(isc__socket_t *sock) {
1325 	/* use minimum MTU */
1326 	if (sock->pf == AF_INET6) {
1327 		int on = 1;
1328 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
1329 				(void *)&on, sizeof(on));
1330 	}
1331 }
1332 
1333 static void
1334 set_tcp_maxseg(isc__socket_t *sock, int size) {
1335 	if (sock->type == isc_sockettype_tcp)
1336 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
1337 				(void *)&size, sizeof(size));
1338 }
1339 
1340 static isc_result_t
1341 opensocket(isc__socket_t *sock)
1342 {
1343 	isc_result_t result;
1344 	char strbuf[ISC_STRERRORSIZE];
1345 	const char *err = "socket";
1346 	int on = 1;
1347 
1348 	switch (sock->type) {
1349 	case isc_sockettype_udp:
1350 		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
1351 		break;
1352 	case isc_sockettype_tcp:
1353 		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
1354 		break;
1355 	}
1356 
1357 	if (sock->fd < 0) {
1358 		switch (errno) {
1359 		case EMFILE:
1360 		case ENFILE:
1361 			isc__strerror(errno, strbuf, sizeof(strbuf));
1362 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1363 				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1364 				       "%s: %s", err, strbuf);
1365 			/* fallthrough */
1366 		case ENOBUFS:
1367 			return (ISC_R_NORESOURCES);
1368 
1369 		case EPROTONOSUPPORT:
1370 		case EPFNOSUPPORT:
1371 		case EAFNOSUPPORT:
1372 		/*
1373 		 * Linux 2.2 (and maybe others) return EINVAL instead of
1374 		 * EAFNOSUPPORT.
1375 		 */
1376 		case EINVAL:
1377 			return (ISC_R_FAMILYNOSUPPORT);
1378 
1379 		default:
1380 			isc__strerror(errno, strbuf, sizeof(strbuf));
1381 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1382 					 "%s() %s: %s", err, "failed",
1383 					 strbuf);
1384 			return (ISC_R_UNEXPECTED);
1385 		}
1386 	}
1387 
1388 	result = make_nonblock(sock->fd);
1389 	if (result != ISC_R_SUCCESS) {
1390 		(void)close(sock->fd);
1391 		return (result);
1392 	}
1393 
1394 	/*
1395 	 * Use minimum mtu if possible.
1396 	 */
1397 	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
1398 		use_min_mtu(sock);
1399 		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
1400 	}
1401 
1402 	if (sock->type == isc_sockettype_udp) {
1403 
1404 		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1405 			       (void *)&on, sizeof(on)) < 0
1406 		    && errno != ENOPROTOOPT) {
1407 			isc__strerror(errno, strbuf, sizeof(strbuf));
1408 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1409 					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1410 					 sock->fd, "failed", strbuf);
1411 			/* Press on... */
1412 		}
1413 
1414 		/* RFC 3542 */
1415 		if ((sock->pf == AF_INET6)
1416 		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1417 				   (void *)&on, sizeof(on)) < 0)) {
1418 			isc__strerror(errno, strbuf, sizeof(strbuf));
1419 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1420 					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1421 					 "%s: %s", sock->fd, "failed",
1422 					 strbuf);
1423 		}
1424 	}
1425 
1426 	if (sock->active == 0) {
1427 		sock->active = 1;
1428 	}
1429 
1430 	return (ISC_R_SUCCESS);
1431 }
1432 
1433 /*
1434  * Create a 'type' socket in protocol family 'pf', managed by 'manager'.
1435  * The socket is allocated, opened and made non-blocking, and is
1436  * registered with the manager's descriptor tables.  The new
1437  * socket is returned in 'socketp'.
1438  */
1439 static isc_result_t
1440 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1441 	      isc_socket_t **socketp)
1442 {
1443 	isc__socket_t *sock = NULL;
1444 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
1445 	isc_result_t result;
1446 	int lockid;
1447 
1448 	REQUIRE(VALID_MANAGER(manager));
1449 	REQUIRE(socketp != NULL && *socketp == NULL);
1450 
1451 	result = allocate_socket(manager, type, &sock);
1452 	if (result != ISC_R_SUCCESS)
1453 		return (result);
1454 
1455 	switch (sock->type) {
1456 	case isc_sockettype_udp:
1457 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
1458 		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
1459 		break;
1460 	case isc_sockettype_tcp:
1461 		break;
1462 	default:
1463 		INSIST(0);
1464 	}
1465 
1466 	sock->pf = pf;
1467 
1468 	result = opensocket(sock);
1469 	if (result != ISC_R_SUCCESS) {
1470 		free_socket(&sock);
1471 		return (result);
1472 	}
1473 
1474 	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
1475 	sock->references = 1;
1476 	*socketp = (isc_socket_t *)sock;
1477 
1478 	/*
1479 	 * Note we don't have to lock the socket like we normally would because
1480 	 * there are no external references to it yet.
1481 	 */
1482 
1483 	lockid = FDLOCK_ID(sock->fd);
1484 	manager->fds[sock->fd] = sock;
1485 	manager->fdstate[sock->fd] = MANAGED;
1486 
1487 	ISC_LIST_APPEND(manager->socklist, sock, link);
1488 	if (manager->maxfd < sock->fd)
1489 		manager->maxfd = sock->fd;
1490 
1491 	socket_log(sock, NULL, CREATION, "created");
1492 
1493 	return (ISC_R_SUCCESS);
1494 }
1495 
1496 /*%
1497  * Create a new 'type' socket in protocol family 'pf', managed by
1498  * 'manager'.  This public entry point simply forwards to
1499  * socket_create() above.  The new socket is returned
1500  * in 'socketp'.
1501  */
1502 isc_result_t
1503 isc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1504 		   isc_socket_t **socketp)
1505 {
1506 	return (socket_create(manager0, pf, type, socketp));
1507 }
1508 
1509 /*
1510  * Attach to a socket.  Caller must explicitly detach when it is done.
1511  */
1512 void
1513 isc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
1514 	isc__socket_t *sock = (isc__socket_t *)sock0;
1515 
1516 	REQUIRE(VALID_SOCKET(sock));
1517 	REQUIRE(socketp != NULL && *socketp == NULL);
1518 
1519 	sock->references++;
1520 
1521 	*socketp = (isc_socket_t *)sock;
1522 }
1523 
1524 /*
1525  * Dereference a socket.  If this is the last reference to it, clean things
1526  * up by destroying the socket.
1527  */
1528 void
1529 isc__socket_detach(isc_socket_t **socketp) {
1530 	isc__socket_t *sock;
1531 	isc_boolean_t kill_socket = ISC_FALSE;
1532 
1533 	REQUIRE(socketp != NULL);
1534 	sock = (isc__socket_t *)*socketp;
1535 	REQUIRE(VALID_SOCKET(sock));
1536 
1537 	REQUIRE(sock->references > 0);
1538 	sock->references--;
1539 	if (sock->references == 0)
1540 		kill_socket = ISC_TRUE;
1541 
1542 	if (kill_socket)
1543 		destroy(&sock);
1544 
1545 	*socketp = NULL;
1546 }
1547 
1548 /*
1549  * I/O is possible on a given socket.  Schedule an event to this task that
1550  * will call an internal function to do the I/O.  This will charge the
1551  * task with the I/O operation and let our select loop handler get back
1552  * to doing something real as fast as possible.
1553  *
1554  * The socket and manager must be locked before calling this function.
1555  */
1556 static void
1557 dispatch_recv(isc__socket_t *sock) {
1558 	intev_t *iev;
1559 	isc_socketevent_t *ev;
1560 	isc_task_t *sender;
1561 
1562 	INSIST(!sock->pending_recv);
1563 
1564 	ev = ISC_LIST_HEAD(sock->recv_list);
1565 	if (ev == NULL)
1566 		return;
1567 	socket_log(sock, NULL, EVENT, NULL, 0, 0,
1568 		   "dispatch_recv:  event %p -> task %p",
1569 		   ev, ev->ev_sender);
1570 	sender = ev->ev_sender;
1571 
1572 	sock->pending_recv = 1;
1573 	iev = &sock->readable_ev;
1574 
1575 	sock->references++;
1576 	iev->ev_sender = sock;
1577 	iev->ev_action = internal_recv;
1578 	iev->ev_arg = sock;
1579 
1580 	isc_task_send(sender, (isc_event_t **)&iev);
1581 }
1582 
1583 static void
1584 dispatch_send(isc__socket_t *sock) {
1585 	intev_t *iev;
1586 	isc_socketevent_t *ev;
1587 	isc_task_t *sender;
1588 
1589 	INSIST(!sock->pending_send);
1590 
1591 	ev = ISC_LIST_HEAD(sock->send_list);
1592 	if (ev == NULL)
1593 		return;
1594 	socket_log(sock, NULL, EVENT, NULL, 0, 0,
1595 		   "dispatch_send:  event %p -> task %p",
1596 		   ev, ev->ev_sender);
1597 	sender = ev->ev_sender;
1598 
1599 	sock->pending_send = 1;
1600 	iev = &sock->writable_ev;
1601 
1602 	sock->references++;
1603 	iev->ev_sender = sock;
1604 	iev->ev_action = internal_send;
1605 	iev->ev_arg = sock;
1606 
1607 	isc_task_send(sender, (isc_event_t **)&iev);
1608 }
1609 
1610 static void
1611 dispatch_connect(isc__socket_t *sock) {
1612 	intev_t *iev;
1613 	isc_socket_connev_t *ev;
1614 
1615 	iev = &sock->writable_ev;
1616 
1617 	ev = sock->connect_ev;
1618 	INSIST(ev != NULL); /* XXX */
1619 
1620 	INSIST(sock->connecting);
1621 
1622 	sock->references++;  /* keep socket around for this internal event */
1623 	iev->ev_sender = sock;
1624 	iev->ev_action = internal_connect;
1625 	iev->ev_arg = sock;
1626 
1627 	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1628 }
1629 
1630 /*
1631  * Dequeue an item off the given socket's read queue, set the result code
1632  * in the done event to the one provided, and send it to the task it was
1633  * destined for.
1634  *
1635  * If the event to be sent is on a list, remove it before sending.  If
1636  * asked to, send and detach from the socket as well.
1637  *
1638  * Caller must have the socket locked if the event is attached to the socket.
1639  */
1640 static void
1641 send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
1642 	isc_task_t *task;
1643 
1644 	task = (*dev)->ev_sender;
1645 
1646 	(*dev)->ev_sender = sock;
1647 
1648 	if (ISC_LINK_LINKED(*dev, ev_link))
1649 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1650 
1651 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1652 	    == ISC_SOCKEVENTATTR_ATTACHED)
1653 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1654 	else
1655 		isc_task_send(task, (isc_event_t **)dev);
1656 }
1657 
1658 /*
1659  * See comments for send_recvdone_event() above.
1660  *
1661  * Caller must have the socket locked if the event is attached to the socket.
1662  */
1663 static void
1664 send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
1665 	isc_task_t *task;
1666 
1667 	INSIST(dev != NULL && *dev != NULL);
1668 
1669 	task = (*dev)->ev_sender;
1670 	(*dev)->ev_sender = sock;
1671 
1672 	if (ISC_LINK_LINKED(*dev, ev_link))
1673 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1674 
1675 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1676 	    == ISC_SOCKEVENTATTR_ATTACHED)
1677 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1678 	else
1679 		isc_task_send(task, (isc_event_t **)dev);
1680 }
1681 
1682 static void
1683 internal_recv(isc_task_t *me, isc_event_t *ev) {
1684 	isc_socketevent_t *dev;
1685 	isc__socket_t *sock;
1686 
1687 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
1688 
1689 	sock = ev->ev_sender;
1690 	INSIST(VALID_SOCKET(sock));
1691 
1692 	socket_log(sock, NULL, IOEVENT,
1693 		   "internal_recv: task %p got event %p", me, ev);
1694 
1695 	INSIST(sock->pending_recv == 1);
1696 	sock->pending_recv = 0;
1697 
1698 	INSIST(sock->references > 0);
1699 	sock->references--;  /* the internal event is done with this socket */
1700 	if (sock->references == 0) {
1701 		destroy(&sock);
1702 		return;
1703 	}
1704 
1705 	/*
1706 	 * Try to do as much I/O as possible on this socket.  There are no
1707 	 * limits here, currently.
1708 	 */
1709 	dev = ISC_LIST_HEAD(sock->recv_list);
1710 	while (dev != NULL) {
1711 		switch (doio_recv(sock, dev)) {
1712 		case DOIO_SOFT:
1713 			goto poke;
1714 
1715 		case DOIO_EOF:
1716 			/*
1717 			 * read of 0 means the remote end was closed.
1718 			 * Run through the event queue and dispatch all
1719 			 * the events with an EOF result code.
1720 			 */
1721 			do {
1722 				dev->result = ISC_R_EOF;
1723 				send_recvdone_event(sock, &dev);
1724 				dev = ISC_LIST_HEAD(sock->recv_list);
1725 			} while (dev != NULL);
1726 			goto poke;
1727 
1728 		case DOIO_SUCCESS:
1729 		case DOIO_HARD:
1730 			send_recvdone_event(sock, &dev);
1731 			break;
1732 		}
1733 
1734 		dev = ISC_LIST_HEAD(sock->recv_list);
1735 	}
1736 
1737  poke:
1738 	if (!ISC_LIST_EMPTY(sock->recv_list))
1739 		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1740 }
1741 
1742 static void
1743 internal_send(isc_task_t *me, isc_event_t *ev) {
1744 	isc_socketevent_t *dev;
1745 	isc__socket_t *sock;
1746 
1747 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
1748 
1749 	/*
1750 	 * Find out what socket this is and lock it.
1751 	 */
1752 	sock = (isc__socket_t *)ev->ev_sender;
1753 	INSIST(VALID_SOCKET(sock));
1754 	socket_log(sock, NULL, IOEVENT,
1755 		   "internal_send: task %p got event %p", me, ev);
1756 
1757 	INSIST(sock->pending_send == 1);
1758 	sock->pending_send = 0;
1759 
1760 	INSIST(sock->references > 0);
1761 	sock->references--;  /* the internal event is done with this socket */
1762 	if (sock->references == 0) {
1763 		destroy(&sock);
1764 		return;
1765 	}
1766 
1767 	/*
1768 	 * Try to do as much I/O as possible on this socket.  There are no
1769 	 * limits here, currently.
1770 	 */
1771 	dev = ISC_LIST_HEAD(sock->send_list);
1772 	while (dev != NULL) {
1773 		switch (doio_send(sock, dev)) {
1774 		case DOIO_SOFT:
1775 			goto poke;
1776 
1777 		case DOIO_HARD:
1778 		case DOIO_SUCCESS:
1779 			send_senddone_event(sock, &dev);
1780 			break;
1781 		}
1782 
1783 		dev = ISC_LIST_HEAD(sock->send_list);
1784 	}
1785 
1786  poke:
1787 	if (!ISC_LIST_EMPTY(sock->send_list))
1788 		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
1789 }
1790 
1791 /*
1792  * Process read/writes on each fd here.  Avoid locking
1793  * and unlocking twice if both reads and writes are possible.
1794  */
1795 static void
1796 process_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable,
1797 	   isc_boolean_t writeable)
1798 {
1799 	isc__socket_t *sock;
1800 	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
1801 
1802 	/*
1803 	 * If the socket is going to be closed, don't do more I/O.
1804 	 */
1805 	if (manager->fdstate[fd] == CLOSE_PENDING) {
1806 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1807 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1808 		return;
1809 	}
1810 
1811 	sock = manager->fds[fd];
1812 	if (readable) {
1813 		if (sock == NULL) {
1814 			unwatch_read = ISC_TRUE;
1815 			goto check_write;
1816 		}
1817 		if (!SOCK_DEAD(sock)) {
1818 			dispatch_recv(sock);
1819 		}
1820 		unwatch_read = ISC_TRUE;
1821 	}
1822 check_write:
1823 	if (writeable) {
1824 		if (sock == NULL) {
1825 			unwatch_write = ISC_TRUE;
1826 			goto unlock_fd;
1827 		}
1828 		if (!SOCK_DEAD(sock)) {
1829 			if (sock->connecting)
1830 				dispatch_connect(sock);
1831 			else
1832 				dispatch_send(sock);
1833 		}
1834 		unwatch_write = ISC_TRUE;
1835 	}
1836 
1837  unlock_fd:
1838 	if (unwatch_read)
1839 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1840 	if (unwatch_write)
1841 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1842 
1843 }
1844 
1845 static void
1846 process_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds,
1847 	    fd_set *writefds)
1848 {
1849 	int i;
1850 
1851 	REQUIRE(maxfd <= (int)manager->maxsocks);
1852 
1853 	for (i = 0; i < maxfd; i++) {
1854 		process_fd(manager, i, FD_ISSET(i, readfds),
1855 			   FD_ISSET(i, writefds));
1856 	}
1857 }
1858 
1859 /*
1860  * Create a new socket manager.
1861  */
1862 
1863 static isc_result_t
1864 setup_watcher(isc__socketmgr_t *manager) {
1865 	isc_result_t result;
1866 
1867 	UNUSED(result);
1868 
1869 	manager->fd_bufsize = sizeof(fd_set);
1870 
1871 	manager->read_fds = NULL;
1872 	manager->read_fds_copy = NULL;
1873 	manager->write_fds = NULL;
1874 	manager->write_fds_copy = NULL;
1875 
1876 	manager->read_fds = malloc(manager->fd_bufsize);
1877 	if (manager->read_fds != NULL)
1878 		manager->read_fds_copy = malloc(manager->fd_bufsize);
1879 	if (manager->read_fds_copy != NULL)
1880 		manager->write_fds = malloc(manager->fd_bufsize);
1881 	if (manager->write_fds != NULL) {
1882 		manager->write_fds_copy = malloc(manager->fd_bufsize);
1883 	}
1884 	if (manager->write_fds_copy == NULL) {
1885 		if (manager->write_fds != NULL) {
1886 			free(manager->write_fds);
1887 		}
1888 		if (manager->read_fds_copy != NULL) {
1889 			free(manager->read_fds_copy);
1890 		}
1891 		if (manager->read_fds != NULL) {
1892 			free(manager->read_fds);
1893 		}
1894 		return (ISC_R_NOMEMORY);
1895 	}
1896 	memset(manager->read_fds, 0, manager->fd_bufsize);
1897 	memset(manager->write_fds, 0, manager->fd_bufsize);
1898 
1899 	manager->maxfd = 0;
1900 
1901 	return (ISC_R_SUCCESS);
1902 }
1903 
1904 static void
1905 cleanup_watcher(isc__socketmgr_t *manager) {
1906 
1907 	if (manager->read_fds != NULL)
1908 		free(manager->read_fds);
1909 	if (manager->read_fds_copy != NULL)
1910 		free(manager->read_fds_copy);
1911 	if (manager->write_fds != NULL)
1912 		free(manager->write_fds);
1913 	if (manager->write_fds_copy != NULL)
1914 		free(manager->write_fds_copy);
1915 }
1916 
1917 isc_result_t
1918 isc__socketmgr_create(isc_socketmgr_t **managerp) {
1919 	return (isc__socketmgr_create2(managerp, 0));
1920 }
1921 
1922 isc_result_t
1923 isc__socketmgr_create2(isc_socketmgr_t **managerp,
1924 		       unsigned int maxsocks)
1925 {
1926 	isc__socketmgr_t *manager;
1927 	isc_result_t result;
1928 
1929 	REQUIRE(managerp != NULL && *managerp == NULL);
1930 
1931 	if (socketmgr != NULL) {
1932 		/* Don't allow maxsocks to be updated */
1933 		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
1934 			return (ISC_R_EXISTS);
1935 
1936 		socketmgr->refs++;
1937 		*managerp = (isc_socketmgr_t *)socketmgr;
1938 		return (ISC_R_SUCCESS);
1939 	}
1940 
1941 	if (maxsocks == 0)
1942 		maxsocks = FD_SETSIZE;
1943 
1944 	manager = malloc(sizeof(*manager));
1945 	if (manager == NULL)
1946 		return (ISC_R_NOMEMORY);
1947 
1948 	/* Zero the structure so that cleanup on failure is straightforward. */
1949 	memset(manager, 0, sizeof(*manager));
1950 	manager->maxsocks = maxsocks;
1951 	manager->fds = malloc(manager->maxsocks * sizeof(isc__socket_t *));
1952 	if (manager->fds == NULL) {
1953 		result = ISC_R_NOMEMORY;
1954 		goto free_manager;
1955 	}
1956 	manager->fdstate = malloc(manager->maxsocks * sizeof(int));
1957 	if (manager->fdstate == NULL) {
1958 		result = ISC_R_NOMEMORY;
1959 		goto free_manager;
1960 	}
1961 
1962 	manager->common.methods = &socketmgrmethods;
1963 	manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
1964 	manager->common.impmagic = SOCKET_MANAGER_MAGIC;
1965 	memset(manager->fds, 0, manager->maxsocks * sizeof(isc__socket_t *));
1966 	ISC_LIST_INIT(manager->socklist);
1967 
1968 	manager->refs = 1;
1969 
1970 	/*
1971 	 * Set up initial state for the select loop
1972 	 */
1973 	result = setup_watcher(manager);
1974 	if (result != ISC_R_SUCCESS)
1975 		goto cleanup;
1976 
1977 	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
1978 
1979 	socketmgr = manager;
1980 	*managerp = (isc_socketmgr_t *)manager;
1981 
1982 	return (ISC_R_SUCCESS);
1983 
1984 cleanup:
1985 
1986 free_manager:
1987 	if (manager->fdstate != NULL) {
1988 		free(manager->fdstate);
1989 	}
1990 	if (manager->fds != NULL) {
1991 		free(manager->fds);
1992 	}
1993 	free(manager);
1994 
1995 	return (result);
1996 }
1997 
1998 void
1999 isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
2000 	isc__socketmgr_t *manager;
2001 	int i;
2002 
2003 	/*
2004 	 * Destroy a socket manager.
2005 	 */
2006 
2007 	REQUIRE(managerp != NULL);
2008 	manager = (isc__socketmgr_t *)*managerp;
2009 	REQUIRE(VALID_MANAGER(manager));
2010 
2011 	manager->refs--;
2012 	if (manager->refs > 0) {
2013 		*managerp = NULL;
2014 		return;
2015 	}
2016 	socketmgr = NULL;
2017 
2018 	/*
2019 	 * Wait for all sockets to be destroyed.
2020 	 */
2021 	while (!ISC_LIST_EMPTY(manager->socklist)) {
2022 		isc__taskmgr_dispatch(NULL);
2023 	}
2024 
2025 	/*
2026 	 * Here, poke our select/poll thread.  Do this by closing the write
2027 	 * half of the pipe, which will send EOF to the read half.
2028 	 * This is currently a no-op in the non-threaded case.
2029 	 */
2030 	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
2031 
2032 	/*
2033 	 * Clean up.
2034 	 */
2035 	cleanup_watcher(manager);
2036 
2037 	for (i = 0; i < (int)manager->maxsocks; i++)
2038 		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
2039 			(void)close(i);
2040 
2041 	free(manager->fds);
2042 	free(manager->fdstate);
2043 
2044 	manager->common.magic = 0;
2045 	manager->common.impmagic = 0;
2046 	free(manager);
2047 
2048 	*managerp = NULL;
2049 
2050 	socketmgr = NULL;
2051 }
2052 
2053 static isc_result_t
2054 socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2055 	    unsigned int flags)
2056 {
2057 	int io_state;
2058 	isc_task_t *ntask = NULL;
2059 	isc_result_t result = ISC_R_SUCCESS;
2060 
2061 	dev->ev_sender = task;
2062 
2063 	if (sock->type == isc_sockettype_udp) {
2064 		io_state = doio_recv(sock, dev);
2065 	} else {
2066 		if (ISC_LIST_EMPTY(sock->recv_list))
2067 			io_state = doio_recv(sock, dev);
2068 		else
2069 			io_state = DOIO_SOFT;
2070 	}
2071 
2072 	switch (io_state) {
2073 	case DOIO_SOFT:
2074 		/*
2075 		 * We could not complete the read right now, so
2076 		 * queue it.
2077 		 *
2078 		 * Attach to socket and to task
2079 		 */
2080 		isc_task_attach(task, &ntask);
2081 		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2082 
2083 		/*
2084 		 * Enqueue the request.  If the socket was previously not being
2085 		 * watched, poke the watcher to start paying attention to it.
2086 		 */
2087 		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
2088 			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2089 		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2090 
2091 		socket_log(sock, NULL, EVENT, NULL, 0, 0,
2092 			   "socket_recv: event %p -> task %p",
2093 			   dev, ntask);
2094 
2095 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2096 			result = ISC_R_INPROGRESS;
2097 		break;
2098 
2099 	case DOIO_EOF:
2100 		dev->result = ISC_R_EOF;
2101 		/* fallthrough */
2102 
2103 	case DOIO_HARD:
2104 	case DOIO_SUCCESS:
2105 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2106 			send_recvdone_event(sock, &dev);
2107 		break;
2108 	}
2109 
2110 	return (result);
2111 }
2112 
2113 isc_result_t
2114 isc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2115 		  unsigned int minimum, isc_task_t *task,
2116 		  isc_taskaction_t action, void *arg)
2117 {
2118 	isc__socket_t *sock = (isc__socket_t *)sock0;
2119 	isc_socketevent_t *dev;
2120 	isc__socketmgr_t *manager;
2121 	unsigned int iocount;
2122 	isc_buffer_t *buffer;
2123 
2124 	REQUIRE(VALID_SOCKET(sock));
2125 	REQUIRE(buflist != NULL);
2126 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2127 	REQUIRE(task != NULL);
2128 	REQUIRE(action != NULL);
2129 
2130 	manager = sock->manager;
2131 	REQUIRE(VALID_MANAGER(manager));
2132 
2133 	iocount = isc_bufferlist_availablecount(buflist);
2134 	REQUIRE(iocount > 0);
2135 
2136 	INSIST(sock->bound);
2137 
2138 	dev = allocate_socketevent(sock,
2139 				   ISC_SOCKEVENT_RECVDONE, action, arg);
2140 	if (dev == NULL)
2141 		return (ISC_R_NOMEMORY);
2142 
2143 	/*
2144 	 * UDP sockets always use partial reads.
2145 	 */
2146 	if (sock->type == isc_sockettype_udp)
2147 		dev->minimum = 1;
2148 	else {
2149 		if (minimum == 0)
2150 			dev->minimum = iocount;
2151 		else
2152 			dev->minimum = minimum;
2153 	}
2154 
2155 	/*
2156 	 * Move each buffer from the passed-in list to our internal one; a usage sketch follows this function.
2157 	 */
2158 	buffer = ISC_LIST_HEAD(*buflist);
2159 	while (buffer != NULL) {
2160 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2161 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2162 		buffer = ISC_LIST_HEAD(*buflist);
2163 	}
2164 
2165 	return (socket_recv(sock, dev, task, 0));
2166 }
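
/*
 * Illustrative sketch (not compiled in): queueing a scatter read with
 * isc__socket_recvv().  The buffer and its storage must stay valid until
 * the RECVDONE event arrives; the completion handler gets the buffers
 * back in dev->bufferlist.  The example names are hypothetical, and
 * isc_buffer_init() plus the ISC_LIST/ISC_LINK macros are assumed to
 * behave as declared in <isc/buffer.h> and <isc/list.h>.
 */
#if 0
static unsigned char example_storage[512];
static isc_buffer_t example_buffer;

static void
example_recv_done(isc_task_t *task, isc_event_t *event) {
	isc_socketevent_t *dev = (isc_socketevent_t *)event;

	UNUSED(task);
	if (dev->result == ISC_R_SUCCESS) {
		/* dev->n bytes now sit in the buffers on dev->bufferlist. */
	}
	isc_event_free(&event);
}

static isc_result_t
example_queue_recv(isc_socket_t *sock, isc_task_t *task) {
	isc_bufferlist_t buflist;

	ISC_LIST_INIT(buflist);
	isc_buffer_init(&example_buffer, example_storage,
			sizeof(example_storage));
	ISC_LINK_INIT(&example_buffer, link);
	ISC_LIST_ENQUEUE(buflist, &example_buffer, link);

	/*
	 * minimum == 0: a TCP read completes only when the buffers are
	 * full; a UDP read always completes on the first datagram.
	 */
	return (isc__socket_recvv(sock, &buflist, 0, task,
				  example_recv_done, NULL));
}
#endif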
2167 
2168 static isc_result_t
2169 socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2170 	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2171 	    unsigned int flags)
2172 {
2173 	int io_state;
2174 	isc_task_t *ntask = NULL;
2175 	isc_result_t result = ISC_R_SUCCESS;
2176 
2177 	dev->ev_sender = task;
2178 
2179 	set_dev_address(address, sock, dev);
2180 	if (pktinfo != NULL) {
2181 		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2182 		dev->pktinfo = *pktinfo;
2183 
2184 		if (!isc_sockaddr_issitelocal(&dev->address) &&
2185 		    !isc_sockaddr_islinklocal(&dev->address)) {
2186 			socket_log(sock, NULL, TRACE,
2187 				   "pktinfo structure provided, ifindex %u "
2188 				   "(set to 0)", pktinfo->ipi6_ifindex);
2189 
2190 			/*
2191 			 * Set the pktinfo index to 0 here, to let the
2192 			 * kernel decide what interface it should send on.
2193 			 */
2194 			dev->pktinfo.ipi6_ifindex = 0;
2195 		}
2196 	}
2197 
2198 	if (sock->type == isc_sockettype_udp)
2199 		io_state = doio_send(sock, dev);
2200 	else {
2201 		if (ISC_LIST_EMPTY(sock->send_list))
2202 			io_state = doio_send(sock, dev);
2203 		else
2204 			io_state = DOIO_SOFT;
2205 	}
2206 
2207 	switch (io_state) {
2208 	case DOIO_SOFT:
2209 		/*
2210 		 * We could not complete the send right now, so queue it
2211 		 * unless ISC_SOCKFLAG_NORETRY is set (the flags are summarized below).
2212 		 */
2213 		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2214 			isc_task_attach(task, &ntask);
2215 			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2216 
2217 			/*
2218 			 * Enqueue the request.  If the socket was previously
2219 			 * not being watched, poke the watcher to start
2220 			 * paying attention to it.
2221 			 */
2222 			if (ISC_LIST_EMPTY(sock->send_list) &&
2223 			    !sock->pending_send)
2224 				select_poke(sock->manager, sock->fd,
2225 					    SELECT_POKE_WRITE);
2226 			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2227 
2228 			socket_log(sock, NULL, EVENT, NULL, 0, 0,
2229 				   "socket_send: event %p -> task %p",
2230 				   dev, ntask);
2231 
2232 			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2233 				result = ISC_R_INPROGRESS;
2234 			break;
2235 		}
2236 
2237 		/* FALLTHROUGH */
2238 
2239 	case DOIO_HARD:
2240 	case DOIO_SUCCESS:
2241 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2242 			send_senddone_event(sock, &dev);
2243 		break;
2244 	}
2245 
2246 	return (result);
2247 }
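
/*
 * Summary of the flag handling implemented above (not part of the
 * original source):
 *
 *   flags == 0               queue the request if it would block; the
 *                            caller always gets a SENDDONE event.
 *   ISC_SOCKFLAG_IMMEDIATE   if the send finishes (or hard-fails) right
 *                            away, no done event is posted; if it has to
 *                            be queued, ISC_R_INPROGRESS is returned and
 *                            the event is posted on completion.
 *   ISC_SOCKFLAG_NORETRY     a send that would block is not queued; it
 *                            falls through and is reported immediately.
 *
 * Illustrative call with default flags; the parameter names are
 * hypothetical.
 */
#if 0
static isc_result_t
example_send(isc_socket_t *sock, isc_bufferlist_t *buflist, isc_task_t *task,
	     isc_taskaction_t send_done, isc_sockaddr_t *dest)
{
	/* flags 0: send_done() will receive a SENDDONE event. */
	return (isc__socket_sendtov2(sock, buflist, task, send_done, NULL,
				     dest, NULL, 0));
}
#endif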
2248 
2249 isc_result_t
2250 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2251 		  isc_task_t *task, isc_taskaction_t action, void *arg)
2252 {
2253 	return (isc__socket_sendtov2(sock, buflist, task, action, arg, NULL,
2254 				     NULL, 0));
2255 }
2256 
2257 isc_result_t
2258 isc__socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2259 		     isc_task_t *task, isc_taskaction_t action, void *arg,
2260 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2261 		     unsigned int flags)
2262 {
2263 	isc__socket_t *sock = (isc__socket_t *)sock0;
2264 	isc_socketevent_t *dev;
2265 	isc__socketmgr_t *manager;
2266 	unsigned int iocount;
2267 	isc_buffer_t *buffer;
2268 
2269 	REQUIRE(VALID_SOCKET(sock));
2270 	REQUIRE(buflist != NULL);
2271 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2272 	REQUIRE(task != NULL);
2273 	REQUIRE(action != NULL);
2274 
2275 	manager = sock->manager;
2276 	REQUIRE(VALID_MANAGER(manager));
2277 
2278 	iocount = isc_bufferlist_usedcount(buflist);
2279 	REQUIRE(iocount > 0);
2280 
2281 	dev = allocate_socketevent(sock,
2282 				   ISC_SOCKEVENT_SENDDONE, action, arg);
2283 	if (dev == NULL)
2284 		return (ISC_R_NOMEMORY);
2285 
2286 	/*
2287 	 * Move each buffer from the passed-in list to our internal one.
2288 	 */
2289 	buffer = ISC_LIST_HEAD(*buflist);
2290 	while (buffer != NULL) {
2291 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2292 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2293 		buffer = ISC_LIST_HEAD(*buflist);
2294 	}
2295 
2296 	return (socket_send(sock, dev, task, address, pktinfo, flags));
2297 }
2298 
2299 isc_result_t
2300 isc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
2301 		 unsigned int options) {
2302 	isc__socket_t *sock = (isc__socket_t *)sock0;
2303 	char strbuf[ISC_STRERRORSIZE];
2304 	int on = 1;
2305 
2306 	REQUIRE(VALID_SOCKET(sock));
2307 
2308 	INSIST(!sock->bound);
2309 
2310 	if (sock->pf != sockaddr->type.sa.sa_family) {
2311 		return (ISC_R_FAMILYMISMATCH);
2312 	}
2313 
2314 	/*
2315 	 * Only set SO_REUSEADDR when binding to a specific (non-zero) port; a sketch follows this function.
2316 	 */
2317 	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
2318 	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
2319 	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
2320 		       sizeof(on)) < 0) {
2321 		UNEXPECTED_ERROR(__FILE__, __LINE__,
2322 				 "setsockopt(%d) failed", sock->fd);
2323 		/* Press on... */
2324 	}
2325 	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
2326 		switch (errno) {
2327 		case EACCES:
2328 			return (ISC_R_NOPERM);
2329 		case EADDRNOTAVAIL:
2330 			return (ISC_R_ADDRNOTAVAIL);
2331 		case EADDRINUSE:
2332 			return (ISC_R_ADDRINUSE);
2333 		case EINVAL:
2334 			return (ISC_R_BOUND);
2335 		default:
2336 			isc__strerror(errno, strbuf, sizeof(strbuf));
2337 			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
2338 					 strbuf);
2339 			return (ISC_R_UNEXPECTED);
2340 		}
2341 	}
2342 
2343 	socket_log(sock, sockaddr, TRACE, "bound");
2344 	sock->bound = 1;
2345 
2346 	return (ISC_R_SUCCESS);
2347 }
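
/*
 * Illustrative sketch (not compiled in): binding an AF_INET socket to a
 * fixed local port, the case in which SO_REUSEADDR is applied above.
 * isc_sockaddr_any() and isc_sockaddr_setport() are assumed to behave as
 * declared in <isc/sockaddr.h>; the function name is hypothetical.
 */
#if 0
static isc_result_t
example_bind_fixed_port(isc_socket_t *sock, in_port_t port) {
	isc_sockaddr_t local;

	isc_sockaddr_any(&local);		/* 0.0.0.0, port 0 */
	isc_sockaddr_setport(&local, port);	/* non-zero port */

	return (isc__socket_bind(sock, &local, ISC_SOCKET_REUSEADDRESS));
}
#endif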
2348 
2349 isc_result_t
2350 isc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
2351 		   isc_task_t *task, isc_taskaction_t action, void *arg)
2352 {
2353 	isc__socket_t *sock = (isc__socket_t *)sock0;
2354 	isc_socket_connev_t *dev;
2355 	isc_task_t *ntask = NULL;
2356 	isc__socketmgr_t *manager;
2357 	int cc;
2358 	char strbuf[ISC_STRERRORSIZE];
2359 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
2360 
2361 	REQUIRE(VALID_SOCKET(sock));
2362 	REQUIRE(addr != NULL);
2363 	REQUIRE(task != NULL);
2364 	REQUIRE(action != NULL);
2365 
2366 	manager = sock->manager;
2367 	REQUIRE(VALID_MANAGER(manager));
2368 	REQUIRE(addr != NULL);
2369 
2370 	if (isc_sockaddr_ismulticast(addr))
2371 		return (ISC_R_MULTICAST);
2372 
2373 	REQUIRE(!sock->connecting);
2374 
2375 	dev = (isc_socket_connev_t *)isc_event_allocate(sock,
2376 							ISC_SOCKEVENT_CONNECT,
2377 							action,	arg,
2378 							sizeof(*dev));
2379 	if (dev == NULL) {
2380 		return (ISC_R_NOMEMORY);
2381 	}
2382 	ISC_LINK_INIT(dev, ev_link);
2383 
2384 	/*
2385 	 * Try to do the connect right away, as there can be only one outstanding
2386 	 * and it might complete immediately (the pattern is sketched below).
2387 	 */
2388 	sock->peer_address = *addr;
2389 	cc = connect(sock->fd, &addr->type.sa, addr->length);
2390 	if (cc < 0) {
2391 		/*
2392 		 * HP-UX "fails" to connect a UDP socket and sets errno to
2393 		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
2394 		 * a success and let the user detect it if it's really an error
2395 		 * at the time of sending a packet on the socket.
2396 		 */
2397 		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
2398 			cc = 0;
2399 			goto success;
2400 		}
2401 		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
2402 			goto queue;
2403 
2404 		switch (errno) {
2405 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
2406 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2407 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2408 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2409 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2410 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2411 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2412 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2413 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2414 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2415 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2416 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2417 #undef ERROR_MATCH
2418 		}
2419 
2420 		sock->connected = 0;
2421 
2422 		isc__strerror(errno, strbuf, sizeof(strbuf));
2423 		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
2424 		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
2425 				 addrbuf, errno, strbuf);
2426 
2427 		isc_event_free(ISC_EVENT_PTR(&dev));
2428 		return (ISC_R_UNEXPECTED);
2429 
2430 	err_exit:
2431 		sock->connected = 0;
2432 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2433 
2434 		return (ISC_R_SUCCESS);
2435 	}
2436 
2437 	/*
2438 	 * If connect completed, fire off the done event.
2439 	 */
2440  success:
2441 	if (cc == 0) {
2442 		sock->connected = 1;
2443 		sock->bound = 1;
2444 		dev->result = ISC_R_SUCCESS;
2445 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2446 
2447 		return (ISC_R_SUCCESS);
2448 	}
2449 
2450  queue:
2451 
2452 	/*
2453 	 * Attach to task.
2454 	 */
2455 	isc_task_attach(task, &ntask);
2456 
2457 	sock->connecting = 1;
2458 
2459 	dev->ev_sender = ntask;
2460 
2461 	/*
2462 	 * Poke the watcher here.  We still have the socket locked, so there
2463 	 * is no race condition.  We will hold the lock for such a short
2464 	 * time that waking it up now or later won't matter much.
2465 	 */
2466 	if (sock->connect_ev == NULL)
2467 		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
2468 
2469 	sock->connect_ev = dev;
2470 
2471 	return (ISC_R_SUCCESS);
2472 }
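
/*
 * isc__socket_connect() above starts the connect and internal_connect()
 * below collects the result; together they implement the classic
 * non-blocking connect pattern.  A plain POSIX sketch of that pattern
 * (not compiled in, independent of the isc wrappers), using select() and
 * SO_ERROR the same way this file does:
 */
#if 0
static int
example_nonblocking_connect(int fd, const struct sockaddr *sa, socklen_t len)
{
	fd_set wfds;
	int err;
	socklen_t errlen = sizeof(err);

	if (connect(fd, sa, len) == 0)
		return (0);		/* completed immediately */
	if (errno != EINPROGRESS && errno != EINTR)
		return (-1);		/* hard failure */

	/* Wait (here, forever) until the socket becomes writable. */
	FD_ZERO(&wfds);
	FD_SET(fd, &wfds);
	if (select(fd + 1, NULL, &wfds, NULL, NULL) <= 0)
		return (-1);

	/* The pending connect's status is reported through SO_ERROR. */
	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &errlen) < 0)
		return (-1);
	if (err != 0) {
		errno = err;
		return (-1);
	}
	return (0);
}
#endif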
2473 
2474 /*
2475  * Called when a socket with a pending connect() finishes.
2476  */
2477 static void
2478 internal_connect(isc_task_t *me, isc_event_t *ev) {
2479 	isc__socket_t *sock;
2480 	isc_socket_connev_t *dev;
2481 	isc_task_t *task;
2482 	int cc;
2483 	socklen_t optlen;
2484 	char strbuf[ISC_STRERRORSIZE];
2485 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
2486 
2487 	UNUSED(me);
2488 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2489 
2490 	sock = ev->ev_sender;
2491 	INSIST(VALID_SOCKET(sock));
2492 
2493 	/*
2494 	 * When the internal event was sent the reference count was bumped
2495 	 * to keep the socket around for us.  Decrement the count here.
2496 	 */
2497 	INSIST(sock->references > 0);
2498 	sock->references--;
2499 	if (sock->references == 0) {
2500 		destroy(&sock);
2501 		return;
2502 	}
2503 
2504 	/*
2505 	 * Has this event been canceled?
2506 	 */
2507 	dev = sock->connect_ev;
2508 	if (dev == NULL) {
2509 		INSIST(!sock->connecting);
2510 		return;
2511 	}
2512 
2513 	INSIST(sock->connecting);
2514 	sock->connecting = 0;
2515 
2516 	/*
2517 	 * Get any possible error status here.
2518 	 */
2519 	optlen = sizeof(cc);
2520 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
2521 		       (void *)&cc, (void *)&optlen) < 0)
2522 		cc = errno;
2523 	else
2524 		errno = cc;
2525 
2526 	if (errno != 0) {
2527 		/*
2528 		 * If the error is EAGAIN, just re-select on this
2529 		 * fd and pretend nothing strange happened.
2530 		 */
2531 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
2532 			sock->connecting = 1;
2533 			select_poke(sock->manager, sock->fd,
2534 				    SELECT_POKE_CONNECT);
2535 			return;
2536 		}
2537 
2538 
2539 		/*
2540 		 * Translate other errors into ISC_R_* flavors.
2541 		 */
2542 		switch (errno) {
2543 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
2544 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2545 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2546 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2547 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2548 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2549 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2550 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2551 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2552 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2553 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2554 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
2555 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2556 #undef ERROR_MATCH
2557 		default:
2558 			dev->result = ISC_R_UNEXPECTED;
2559 			isc_sockaddr_format(&sock->peer_address, peerbuf,
2560 					    sizeof(peerbuf));
2561 			isc__strerror(errno, strbuf, sizeof(strbuf));
2562 			UNEXPECTED_ERROR(__FILE__, __LINE__,
2563 					 "internal_connect: connect(%s) %s",
2564 					 peerbuf, strbuf);
2565 		}
2566 	} else {
2567 		dev->result = ISC_R_SUCCESS;
2568 		sock->connected = 1;
2569 		sock->bound = 1;
2570 	}
2571 
2572 	sock->connect_ev = NULL;
2573 
2574 	task = dev->ev_sender;
2575 	dev->ev_sender = sock;
2576 	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2577 }
2578 
2579 /*
2580  * Run through the list of events on this socket, and cancel the ones
2581  * queued for task "task" of type "how".  "how" is a bitmask; a usage sketch follows.
2582  */
2583 void
2584 isc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
2585 	isc__socket_t *sock = (isc__socket_t *)sock0;
2586 
2587 	REQUIRE(VALID_SOCKET(sock));
2588 
2589 	/*
2590 	 * Quick exit if there is nothing to do.  Don't even bother locking
2591 	 * in this case.
2592 	 */
2593 	if (how == 0)
2594 		return;
2595 
2596 	/*
2597 	 * All of these do the same thing, more or less.
2598 	 * Each will:
2599 	 *	o If the internal event is marked as "posted" try to
2600 	 *	  remove it from the task's queue.  If this fails, mark it
2601 	 *	  as canceled instead, and let the task clean it up later.
2602 	 *	o For each I/O request for that task of that type, post
2603 	 *	  its done event with status of "ISC_R_CANCELED".
2604 	 *	o Reset any state needed.
2605 	 */
2606 	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
2607 	    && !ISC_LIST_EMPTY(sock->recv_list)) {
2608 		isc_socketevent_t      *dev;
2609 		isc_socketevent_t      *next;
2610 		isc_task_t	       *current_task;
2611 
2612 		dev = ISC_LIST_HEAD(sock->recv_list);
2613 
2614 		while (dev != NULL) {
2615 			current_task = dev->ev_sender;
2616 			next = ISC_LIST_NEXT(dev, ev_link);
2617 
2618 			if ((task == NULL) || (task == current_task)) {
2619 				dev->result = ISC_R_CANCELED;
2620 				send_recvdone_event(sock, &dev);
2621 			}
2622 			dev = next;
2623 		}
2624 	}
2625 
2626 	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
2627 	    && !ISC_LIST_EMPTY(sock->send_list)) {
2628 		isc_socketevent_t      *dev;
2629 		isc_socketevent_t      *next;
2630 		isc_task_t	       *current_task;
2631 
2632 		dev = ISC_LIST_HEAD(sock->send_list);
2633 
2634 		while (dev != NULL) {
2635 			current_task = dev->ev_sender;
2636 			next = ISC_LIST_NEXT(dev, ev_link);
2637 
2638 			if ((task == NULL) || (task == current_task)) {
2639 				dev->result = ISC_R_CANCELED;
2640 				send_senddone_event(sock, &dev);
2641 			}
2642 			dev = next;
2643 		}
2644 	}
2645 
2646 	/*
2647 	 * Connecting is not a list.
2648 	 */
2649 	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
2650 	    && sock->connect_ev != NULL) {
2651 		isc_socket_connev_t    *dev;
2652 		isc_task_t	       *current_task;
2653 
2654 		INSIST(sock->connecting);
2655 		sock->connecting = 0;
2656 
2657 		dev = sock->connect_ev;
2658 		current_task = dev->ev_sender;
2659 
2660 		if ((task == NULL) || (task == current_task)) {
2661 			sock->connect_ev = NULL;
2662 
2663 			dev->result = ISC_R_CANCELED;
2664 			dev->ev_sender = sock;
2665 			isc_task_sendanddetach(&current_task,
2666 					       ISC_EVENT_PTR(&dev));
2667 		}
2668 	}
2669 
2670 }
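
/*
 * Illustrative calls (not compiled in): each affected request's done
 * event is posted with ISC_R_CANCELED, as implemented above.  The
 * ISC_SOCKCANCEL_ALL mask is assumed to be the usual "everything"
 * shorthand from <isc/socket.h>; the function name is hypothetical.
 */
#if 0
static void
example_cancel_io(isc_socket_t *sock, isc_task_t *task) {
	/* Cancel this task's pending receives and sends. */
	isc__socket_cancel(sock, task,
			   ISC_SOCKCANCEL_RECV | ISC_SOCKCANCEL_SEND);

	/* Cancel everything on the socket, regardless of task. */
	isc__socket_cancel(sock, NULL, ISC_SOCKCANCEL_ALL);
}
#endif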
2671 
2672 /*
2673  * In our assumed scenario, we can simply use a single static object.
2674  * XXX: this is not true if the application uses multiple threads with
2675  *      'multi-context' mode.  Fixing this is a future TODO item.
2676  */
2677 static isc_socketwait_t swait_private;
2678 
2679 int
2680 isc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
2681 			  isc_socketwait_t **swaitp)
2682 {
2683 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2684 	int n;
2685 
2686 	REQUIRE(swaitp != NULL && *swaitp == NULL);
2687 
2688 	if (manager == NULL)
2689 		manager = socketmgr;
2690 	if (manager == NULL)
2691 		return (0);
2692 
2693 	memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize);
2694 	memmove(manager->write_fds_copy, manager->write_fds,
2695 		manager->fd_bufsize);
2696 
2697 	swait_private.readset = manager->read_fds_copy;
2698 	swait_private.writeset = manager->write_fds_copy;
2699 	swait_private.maxfd = manager->maxfd + 1;
2700 
2701 	n = select(swait_private.maxfd, swait_private.readset,
2702 		   swait_private.writeset, NULL, tvp);
2703 
2704 	*swaitp = &swait_private;
2705 	return (n);
2706 }
2707 
2708 isc_result_t
2709 isc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
2710 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2711 
2712 	REQUIRE(swait == &swait_private);
2713 
2714 	if (manager == NULL)
2715 		manager = socketmgr;
2716 	if (manager == NULL)
2717 		return (ISC_R_NOTFOUND);
2718 
2719 	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
2720 	return (ISC_R_SUCCESS);
2721 }
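
/*
 * Illustrative sketch (not compiled in): the select-driven loop a caller
 * is expected to run around isc__socketmgr_waitevents() and
 * isc__socketmgr_dispatch(), interleaved with the task manager.  A NULL
 * timeout blocks in select() until a descriptor is ready.
 * isc__taskmgr_ready() is assumed to be the "any runnable tasks?" test
 * declared in ../task_p.h; the function name is hypothetical.
 */
#if 0
static void
example_event_loop(isc_socketmgr_t *mgr) {
	isc_socketwait_t *swait;
	int n;

	for (;;) {
		/* Run every task that is already runnable. */
		while (isc__taskmgr_ready(NULL))
			(void)isc__taskmgr_dispatch(NULL);

		/* Block until some watched socket is readable/writable. */
		swait = NULL;
		n = isc__socketmgr_waitevents(mgr, NULL, &swait);
		if (n < 0 && errno != EINTR)
			break;		/* select() failed hard */
		if (n > 0)
			(void)isc__socketmgr_dispatch(mgr, swait);
	}
}
#endif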
2722 
2723 #include "../socket_api.c"
2724