xref: /openbsd-src/usr.bin/dig/lib/isc/unix/socket.c (revision 40adc7c53e27d127bc335a503ac956d7aa0cbf3a)
1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
9  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
11  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
13  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14  * PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 /*! \file */
18 
19 #include <sys/socket.h>
20 #include <sys/time.h>
21 #include <sys/uio.h>
22 
23 #include <netinet/tcp.h>
24 
25 #include <errno.h>
26 #include <fcntl.h>
27 #include <stddef.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <unistd.h>
31 
32 #include <isc/buffer.h>
33 #include <isc/bufferlist.h>
34 
35 #include <isc/list.h>
36 #include <isc/log.h>
37 #include <isc/net.h>
38 #include <isc/region.h>
39 #include <isc/socket.h>
40 #include <isc/task.h>
41 #include <isc/util.h>
42 
43 #include "errno2result.h"
44 
45 #include "socket_p.h"
46 #include "../task_p.h"
47 
48 struct isc_socketwait {
49 	fd_set *readset;
50 	fd_set *writeset;
51 	int nfds;
52 	int maxfd;
53 };
54 
55 /*
56  * Set by the -T dscp option on the command line. If set to a value
57  * other than -1, we check to make sure DSCP values match it, and
58  * assert if not.
59  */
60 int isc_dscp_check_value = -1;
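
/*
 * DSCP occupies the upper six bits of the IPv4 TOS / IPv6 Traffic Class
 * octet (the lower two bits are ECN bits), which is why the code below
 * converts with "dscp << 2" when sending and ">>= 2" when receiving.
 * For example, DSCP 46 (EF) corresponds to a TOS/TCLASS byte of
 * 46 << 2 == 0xb8.
 */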
61 
62 /*%
63  * Per-FD lock bucket ID; this implementation uses a single bucket.
64  */
65 #define FDLOCK_ID(fd)		0
66 
67 /*%
68  * Some systems define the socket length argument as an int, some as size_t,
69  * some as socklen_t.  This code simply uses socklen_t directly.
70  */
71 
72 /*%
73  * Define what the possible "soft" errors can be.  These are non-fatal returns
74  * of various network related functions, like recv() and so on.
75  *
76  * For some reason, BSDI (and perhaps others) will sometimes return <0
77  * from recv() but will have errno==0.  This is broken, but we have to
78  * work around it here.
79  */
80 #define SOFT_ERROR(e)	((e) == EAGAIN || \
81 			 (e) == EWOULDBLOCK || \
82 			 (e) == EINTR || \
83 			 (e) == 0)
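
/*
 * Typical use, as in doio_recv()/doio_send() below: a negative return
 * from recvmsg()/sendmsg() with SOFT_ERROR(errno) true is treated as
 * "try again later" (DOIO_SOFT) rather than as a hard failure.
 */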
84 
85 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
86 
87 /*!<
88  * DLVL(90)  --  Function entry/exit and other tracing.
89  * DLVL(60)  --  Socket data send/receive
90  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
91  * DLVL(20)  --  Socket creation/destruction.
92  */
93 #define TRACE_LEVEL		90
94 #define IOEVENT_LEVEL		60
95 #define EVENT_LEVEL		50
96 #define CREATION_LEVEL		20
97 
98 #define TRACE		DLVL(TRACE_LEVEL)
99 #define IOEVENT		DLVL(IOEVENT_LEVEL)
100 #define EVENT		DLVL(EVENT_LEVEL)
101 #define CREATION	DLVL(CREATION_LEVEL)
102 
103 typedef isc_event_t intev_t;
104 
105 #define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
106 #define VALID_SOCKET(s)		ISC_MAGIC_VALID(s, SOCKET_MAGIC)
107 
108 /*!
109  * IPv6 control information.  If the socket is an IPv6 socket we want
110  * to collect the destination address and interface so the client can
111  * set them on outgoing packets.
112  */
113 
114 /*%
115  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
116  * a setsockopt() like interface to request timestamps, and if the OS
117  * doesn't do it for us, call gettimeofday() on every UDP receive?
118  */
119 
120 /*%
121  * Instead of calculating the cmsgbuf lengths every time, we take
122  * a rule-of-thumb approach: the sizes are taken from x86_64 Linux
123  * and multiplied by 2, so everything should fit.  The resulting
124  * sizes are small enough not to cause any concern.
125  */
126 #define CMSG_SP_IN6PKT 40
127 
128 #define CMSG_SP_TIMESTAMP 32
129 
130 #define CMSG_SP_TCTOS 24
131 
132 #define CMSG_SP_INT 24
133 
134 #define RECVCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS)+1)
135 #define SENDCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS)+1)
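
/*
 * With the rule-of-thumb sizes above this works out to
 * RECVCMSGBUFLEN = 2 * (40 + 32 + 24) + 1 = 193 bytes and
 * SENDCMSGBUFLEN = 2 * (40 + 24 + 24) + 1 = 177 bytes.
 */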
136 
137 /*%
138  * The number of times a send operation is repeated if the result is EINTR.
139  */
140 #define NRETRIES 10
141 
142 typedef struct isc__socket isc__socket_t;
143 typedef struct isc__socketmgr isc__socketmgr_t;
144 
145 struct isc__socket {
146 	/* Not locked. */
147 	isc_socket_t		common;
148 	isc__socketmgr_t	*manager;
149 	isc_sockettype_t	type;
150 
151 	/* Locked by socket lock. */
152 	ISC_LINK(isc__socket_t)	link;
153 	unsigned int		references;
154 	int			fd;
155 	int			pf;
156 
157 	ISC_LIST(isc_socketevent_t)		send_list;
158 	ISC_LIST(isc_socketevent_t)		recv_list;
159 	isc_socket_connev_t		       *connect_ev;
160 
161 	/*
162 	 * Internal events.  Posted when a descriptor is readable or
163 	 * writable.  These are statically allocated and never freed.
164 	 * They will be set to non-purgeable before use.
165 	 */
166 	intev_t			readable_ev;
167 	intev_t			writable_ev;
168 
169 	isc_sockaddr_t		peer_address;       /* remote address */
170 
171 	unsigned int		pending_recv : 1,
172 				pending_send : 1,
173 				connected : 1,
174 				connecting : 1,     /* connect pending */
175 				bound : 1,          /* bound to local addr */
176 				active : 1,         /* currently active */
177 				pktdscp : 1;	    /* per packet dscp */
178 	unsigned int		dscp;
179 };
180 
181 #define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
182 #define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
183 
184 struct isc__socketmgr {
185 	/* Not locked. */
186 	isc_socketmgr_t		common;
187 	int			fd_bufsize;
188 	unsigned int		maxsocks;
189 
190 	isc__socket_t	       **fds;
191 	int			*fdstate;
192 
193 	/* Locked by manager lock. */
194 	ISC_LIST(isc__socket_t)	socklist;
195 	fd_set			*read_fds;
196 	fd_set			*read_fds_copy;
197 	fd_set			*write_fds;
198 	fd_set			*write_fds_copy;
199 	int			maxfd;
200 	unsigned int		refs;
201 };
202 
203 static isc__socketmgr_t *socketmgr = NULL;
204 
205 #define CLOSED			0	/* this one must be zero */
206 #define MANAGED			1
207 #define CLOSE_PENDING		2
208 
209 /*
210  * send() and recv() iovec counts
211  */
212 #define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
213 #define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
214 
215 static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
216 				  isc_sockettype_t type,
217 				  isc_socket_t **socketp);
218 static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
219 static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
220 static void free_socket(isc__socket_t **);
221 static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
222 				    isc__socket_t **);
223 static void destroy(isc__socket_t **);
224 static void internal_connect(isc_task_t *, isc_event_t *);
225 static void internal_recv(isc_task_t *, isc_event_t *);
226 static void internal_send(isc_task_t *, isc_event_t *);
227 static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
228 static void build_msghdr_send(isc__socket_t *, char *, isc_socketevent_t *,
229 			      struct msghdr *, struct iovec *, size_t *);
230 static void build_msghdr_recv(isc__socket_t *, char *, isc_socketevent_t *,
231 			      struct msghdr *, struct iovec *, size_t *);
232 
233 /*%
234  * The following are intended for internal use (indicated by "isc__"
235  * prefix) but are not declared as static, allowing direct access from
236  * unit tests etc.
237  */
238 
239 isc_result_t
240 isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
241 		   isc_socket_t **socketp);
242 void
243 isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
244 void
245 isc__socket_detach(isc_socket_t **socketp);
246 isc_result_t
247 isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
248 		 unsigned int minimum, isc_task_t *task,
249 		  isc_taskaction_t action, void *arg);
250 isc_result_t
251 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
252 		  isc_task_t *task, isc_taskaction_t action, void *arg);
253 isc_result_t
254 isc__socket_sendtov2(isc_socket_t *sock, isc_bufferlist_t *buflist,
255 		     isc_task_t *task, isc_taskaction_t action, void *arg,
256 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
257 		     unsigned int flags);
258 isc_result_t
259 isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
260 		 unsigned int options);
261 isc_result_t
262 isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
263 		    isc_task_t *task, isc_taskaction_t action,
264 		    void *arg);
265 void
266 isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
267 
268 isc_result_t
269 isc__socketmgr_create(isc_socketmgr_t **managerp);
270 isc_result_t
271 isc__socketmgr_create2(isc_socketmgr_t **managerp,
272 		       unsigned int maxsocks);
273 isc_result_t
274 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp);
275 void
276 isc__socketmgr_destroy(isc_socketmgr_t **managerp);
277 
278 static struct {
279 	isc_socketmethods_t methods;
280 
281 	/*%
282 	 * The following are defined only to avoid warnings about otherwise unused functions.
283 	 */
284 	void *recvv, *sendv;
285 } socketmethods = {
286 	{
287 		isc__socket_attach,
288 		isc__socket_detach,
289 		isc__socket_bind,
290 		isc__socket_connect,
291 		isc__socket_cancel,
292 	},
293 	(void *)isc__socket_recvv,
294 	(void *)isc__socket_sendv,
295 };
296 
297 static isc_socketmgrmethods_t socketmgrmethods = {
298 	isc__socketmgr_destroy,
299 	isc__socket_create
300 };
301 
302 #define SELECT_POKE_SHUTDOWN		(-1)
303 #define SELECT_POKE_READ		(-3)
304 #define SELECT_POKE_WRITE		(-4)
305 #define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
306 #define SELECT_POKE_CLOSE		(-5)
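
/*
 * In the original threaded ISC code these values were written down a
 * pipe to wake a separate watcher thread; in this single-threaded port
 * select_poke() dispatches them directly (see wakeup_socket() below).
 */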
307 
308 #define SOCK_DEAD(s)			((s)->references == 0)
309 
310 /*%
311  * Shortcut indexes for accessing statistics counters.
312  */
313 enum {
314 	STATID_OPEN = 0,
315 	STATID_OPENFAIL = 1,
316 	STATID_CLOSE = 2,
317 	STATID_BINDFAIL = 3,
318 	STATID_CONNECTFAIL = 4,
319 	STATID_CONNECT = 5,
320 	STATID_ACCEPTFAIL = 6,
321 	STATID_ACCEPT = 7,
322 	STATID_SENDFAIL = 8,
323 	STATID_RECVFAIL = 9,
324 	STATID_ACTIVE = 10
325 };
326 
327 
328 static void
329 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
330 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
331 	   const char *fmt, ...) __attribute__((__format__(__printf__, 6, 7)));
332 static void
333 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
334 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
335 	   const char *fmt, ...)
336 {
337 	char msgbuf[2048];
338 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
339 	va_list ap;
340 
341 	if (! isc_log_wouldlog(isc_lctx, level))
342 		return;
343 
344 	va_start(ap, fmt);
345 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
346 	va_end(ap);
347 
348 	if (address == NULL) {
349 		isc_log_write(isc_lctx, category, module, level,
350 			       "socket %p: %s", sock, msgbuf);
351 	} else {
352 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
353 		isc_log_write(isc_lctx, category, module, level,
354 			       "socket %p %s: %s", sock, peerbuf, msgbuf);
355 	}
356 }
357 
358 static inline isc_result_t
359 watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
360 	isc_result_t result = ISC_R_SUCCESS;
361 
362 	if (msg == SELECT_POKE_READ)
363 		FD_SET(fd, manager->read_fds);
364 	if (msg == SELECT_POKE_WRITE)
365 		FD_SET(fd, manager->write_fds);
366 
367 	return (result);
368 }
369 
370 static inline isc_result_t
371 unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
372 	isc_result_t result = ISC_R_SUCCESS;
373 
374 	if (msg == SELECT_POKE_READ)
375 		FD_CLR(fd, manager->read_fds);
376 	else if (msg == SELECT_POKE_WRITE)
377 		FD_CLR(fd, manager->write_fds);
378 
379 	return (result);
380 }
381 
382 static void
383 wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
384 	isc_result_t result;
385 
386 	/*
387 	 * This is a wakeup on a socket.  If the socket is not in the
388 	 * process of being closed, start watching it for either reads
389 	 * or writes.
390 	 */
391 
392 	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
393 
394 	if (msg == SELECT_POKE_CLOSE) {
395 		/* No one should be updating fdstate, so no need to lock it */
396 		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
397 		manager->fdstate[fd] = CLOSED;
398 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
399 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
400 		(void)close(fd);
401 		return;
402 	}
403 
404 	if (manager->fdstate[fd] == CLOSE_PENDING) {
405 
406 		/*
407 		 * We accept (and ignore) any error from unwatch_fd() as we are
408 		 * closing the socket, hoping it doesn't leave dangling state in
409 		 * the kernel.
410 		 */
411 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
412 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
413 		return;
414 	}
415 	if (manager->fdstate[fd] != MANAGED) {
416 		return;
417 	}
418 
419 	/*
420 	 * Set requested bit.
421 	 */
422 	result = watch_fd(manager, fd, msg);
423 	if (result != ISC_R_SUCCESS) {
424 		/*
425 		 * XXXJT: what should we do?  Ignoring the failure of watching
426 		 * a socket will make the application dysfunctional, but there
427 		 * seems to be no reasonable recovery process.
428 		 */
429 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
430 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
431 			      "failed to start watching FD (%d): %s",
432 			      fd, isc_result_totext(result));
433 	}
434 }
435 
436 /*
437  * Update the state of the socketmgr when something changes.
438  */
439 static void
440 select_poke(isc__socketmgr_t *manager, int fd, int msg) {
441 	if (msg == SELECT_POKE_SHUTDOWN)
442 		return;
443 	else if (fd >= 0)
444 		wakeup_socket(manager, fd, msg);
445 	return;
446 }
447 
448 /*
449  * Make an fd non-blocking.
450  */
451 static isc_result_t
452 make_nonblock(int fd) {
453 	int ret;
454 	int flags;
455 
456 	flags = fcntl(fd, F_GETFL, 0);
457 	flags |= O_NONBLOCK;
458 	ret = fcntl(fd, F_SETFL, flags);
459 
460 	if (ret == -1) {
461 		UNEXPECTED_ERROR(__FILE__, __LINE__,
462 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
463 				 strerror(errno));
464 		return (ISC_R_UNEXPECTED);
465 	}
466 
467 	return (ISC_R_SUCCESS);
468 }
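
/*
 * Every descriptor handled by the select() loop is made non-blocking so
 * that recvmsg()/sendmsg() never stall the event loop; when no data can
 * be moved they fail with EAGAIN/EWOULDBLOCK, which SOFT_ERROR() above
 * treats as retryable.
 */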
469 
470 /*
471  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
472  * In order to ensure as much portability as possible, we provide wrapper
473  * functions of these macros.
474  * Note that cmsg_space() could run slow on OSes that do not have
475  * CMSG_SPACE.
476  */
477 static inline socklen_t
478 cmsg_len(socklen_t len) {
479 	return (CMSG_LEN(len));
480 }
481 
482 static inline socklen_t
483 cmsg_space(socklen_t len) {
484 	return (CMSG_SPACE(len));
485 }
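
/*
 * CMSG_LEN(len) yields the value stored in cmsg_len (header plus data),
 * while CMSG_SPACE(len) additionally pads to the platform's alignment
 * and is what gets summed into msg_controllen.  The exact values are
 * platform dependent; as a rough illustration, on a typical LP64 system
 * CMSG_SPACE(sizeof(struct in6_pktinfo)) is 40 bytes, which is where
 * CMSG_SP_IN6PKT above comes from.
 */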
486 
487 /*
488  * Process control messages received on a socket.
489  */
490 static void
491 process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
492 	struct cmsghdr *cmsgp;
493 	struct in6_pktinfo *pktinfop;
494 	void *timevalp;
495 
496 	/*
497 	 * In the original ISC sources, sock, msg and dev were referenced
498 	 * only inside CPP conditionals (ISC_NET_BSD44MSGHDR, USE_CMSG).
499 	 * Those conditionals are gone from this version, so the UNUSED()
500 	 * calls below are merely harmless leftovers.
501 	 */
502 	UNUSED(sock);
503 	UNUSED(msg);
504 	UNUSED(dev);
505 
506 	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
507 		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
508 
509 	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
510 		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
511 
512 	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
513 		return;
514 
515 	timevalp = NULL;
516 	pktinfop = NULL;
517 
518 	cmsgp = CMSG_FIRSTHDR(msg);
519 	while (cmsgp != NULL) {
520 		socket_log(sock, NULL, TRACE,
521 			   "processing cmsg %p", cmsgp);
522 
523 		if (cmsgp->cmsg_level == IPPROTO_IPV6
524 		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
525 
526 			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
527 			memmove(&dev->pktinfo, pktinfop,
528 				sizeof(struct in6_pktinfo));
529 			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
530 			socket_log(sock, NULL, TRACE,
531 				   "interface received on ifindex %u",
532 				   dev->pktinfo.ipi6_ifindex);
533 			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
534 				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
535 			goto next;
536 		}
537 
538 		if (cmsgp->cmsg_level == SOL_SOCKET
539 		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
540 			struct timeval tv;
541 			timevalp = CMSG_DATA(cmsgp);
542 			memmove(&tv, timevalp, sizeof(tv));
543 			dev->timestamp.seconds = tv.tv_sec;
544 			dev->timestamp.nanoseconds = tv.tv_usec * 1000;
545 			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
546 			goto next;
547 		}
548 
549 		if (cmsgp->cmsg_level == IPPROTO_IPV6
550 		    && cmsgp->cmsg_type == IPV6_TCLASS) {
551 			dev->dscp = *(int *)CMSG_DATA(cmsgp);
552 			dev->dscp >>= 2;
553 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
554 			goto next;
555 		}
556 
557 		if (cmsgp->cmsg_level == IPPROTO_IP
558 		    && (cmsgp->cmsg_type == IP_TOS)) {
559 			dev->dscp = (int) *(unsigned char *)CMSG_DATA(cmsgp);
560 			dev->dscp >>= 2;
561 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
562 			goto next;
563 		}
564 	next:
565 		cmsgp = CMSG_NXTHDR(msg, cmsgp);
566 	}
567 
568 }
569 
570 /*
571  * Construct an iov array and attach it to the msghdr passed in.  This is
572  * the SEND constructor, which will use the used region of the buffer
573  * (if using a buffer list) or will use the internal region (if a single
574  * buffer I/O is requested).
575  *
576  * Nothing can be NULL, and the done event must list at least one buffer
577  * on the buffer linked list for this function to be meaningful.
578  *
579  * If write_countp != NULL, *write_countp will hold the number of bytes
580  * this transaction can send.
581  */
582 static void
583 build_msghdr_send(isc__socket_t *sock, char* cmsgbuf, isc_socketevent_t *dev,
584 		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
585 {
586 	unsigned int iovcount;
587 	isc_buffer_t *buffer;
588 	isc_region_t used;
589 	size_t write_count;
590 	size_t skip_count;
591 	struct cmsghdr *cmsgp;
592 
593 	memset(msg, 0, sizeof(*msg));
594 
595 	if (!sock->connected) {
596 		msg->msg_name = (void *)&dev->address.type.sa;
597 		msg->msg_namelen = dev->address.length;
598 	} else {
599 		msg->msg_name = NULL;
600 		msg->msg_namelen = 0;
601 	}
602 
603 	buffer = ISC_LIST_HEAD(dev->bufferlist);
604 	write_count = 0;
605 	iovcount = 0;
606 
607 	/*
608 	 * Single buffer I/O?  Skip what we've done so far in this region.
609 	 */
610 	if (buffer == NULL) {
611 		write_count = dev->region.length - dev->n;
612 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
613 		iov[0].iov_len = write_count;
614 		iovcount = 1;
615 
616 		goto config;
617 	}
618 
619 	/*
620 	 * Multibuffer I/O.
621 	 * Skip the data in the buffer list that we have already written.
622 	 */
623 	skip_count = dev->n;
624 	while (buffer != NULL) {
625 		REQUIRE(ISC_BUFFER_VALID(buffer));
626 		if (skip_count < isc_buffer_usedlength(buffer))
627 			break;
628 		skip_count -= isc_buffer_usedlength(buffer);
629 		buffer = ISC_LIST_NEXT(buffer, link);
630 	}
631 
632 	while (buffer != NULL) {
633 		INSIST(iovcount < MAXSCATTERGATHER_SEND);
634 
635 		isc_buffer_usedregion(buffer, &used);
636 
637 		if (used.length > 0) {
638 			iov[iovcount].iov_base = (void *)(used.base
639 							  + skip_count);
640 			iov[iovcount].iov_len = used.length - skip_count;
641 			write_count += (used.length - skip_count);
642 			skip_count = 0;
643 			iovcount++;
644 		}
645 		buffer = ISC_LIST_NEXT(buffer, link);
646 	}
647 
648 	INSIST(skip_count == 0U);
649 
650  config:
651 	msg->msg_iov = iov;
652 	msg->msg_iovlen = iovcount;
653 
654 	msg->msg_control = NULL;
655 	msg->msg_controllen = 0;
656 	msg->msg_flags = 0;
657 
658 	if ((sock->type == isc_sockettype_udp) &&
659 	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
660 	{
661 		struct in6_pktinfo *pktinfop;
662 
663 		socket_log(sock, NULL, TRACE,
664 			   "sendto pktinfo data, ifindex %u",
665 			   dev->pktinfo.ipi6_ifindex);
666 
667 		msg->msg_control = (void *)cmsgbuf;
668 		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
669 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
670 
671 		cmsgp = (struct cmsghdr *)cmsgbuf;
672 		cmsgp->cmsg_level = IPPROTO_IPV6;
673 		cmsgp->cmsg_type = IPV6_PKTINFO;
674 		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
675 		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
676 		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
677 	}
678 
679 	if ((sock->type == isc_sockettype_udp) &&
680 	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
681 	{
682 		int use_min_mtu = 1;	/* -1, 0, 1 */
683 
684 		cmsgp = (struct cmsghdr *)(cmsgbuf +
685 					   msg->msg_controllen);
686 
687 		msg->msg_control = (void *)cmsgbuf;
688 		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
689 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
690 
691 		cmsgp->cmsg_level = IPPROTO_IPV6;
692 		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
693 		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
694 		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
695 	}
696 
697 	if (isc_dscp_check_value > -1) {
698 		if (sock->type == isc_sockettype_udp)
699 			INSIST((int)dev->dscp == isc_dscp_check_value);
700 		else if (sock->type == isc_sockettype_tcp)
701 			INSIST((int)sock->dscp == isc_dscp_check_value);
702 	}
703 
704 	if ((sock->type == isc_sockettype_udp) &&
705 	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
706 	{
707 		int dscp = (dev->dscp << 2) & 0xff;
708 
709 		INSIST(dev->dscp < 0x40);
710 
711 		if (sock->pf == AF_INET && sock->pktdscp) {
712 			cmsgp = (struct cmsghdr *)(cmsgbuf +
713 						   msg->msg_controllen);
714 			msg->msg_control = (void *)cmsgbuf;
715 			msg->msg_controllen += cmsg_space(sizeof(dscp));
716 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
717 
718 			cmsgp->cmsg_level = IPPROTO_IP;
719 			cmsgp->cmsg_type = IP_TOS;
720 			cmsgp->cmsg_len = cmsg_len(sizeof(char));
721 			*(unsigned char*)CMSG_DATA(cmsgp) = dscp;
722 		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
723 			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
724 			       (void *)&dscp, sizeof(int)) < 0)
725 			{
726 				UNEXPECTED_ERROR(__FILE__, __LINE__,
727 						 "setsockopt(%d, IP_TOS, %.02x)"
728 						 " %s: %s",
729 						 sock->fd, dscp >> 2,
730 						 "failed", strerror(errno));
731 			} else
732 				sock->dscp = dscp;
733 		}
734 
735 		if (sock->pf == AF_INET6 && sock->pktdscp) {
736 			cmsgp = (struct cmsghdr *)(cmsgbuf +
737 						   msg->msg_controllen);
738 			msg->msg_control = (void *)cmsgbuf;
739 			msg->msg_controllen += cmsg_space(sizeof(dscp));
740 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
741 
742 			cmsgp->cmsg_level = IPPROTO_IPV6;
743 			cmsgp->cmsg_type = IPV6_TCLASS;
744 			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
745 			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
746 		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
747 			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
748 				       (void *)&dscp, sizeof(int)) < 0) {
749 				UNEXPECTED_ERROR(__FILE__, __LINE__,
750 						 "setsockopt(%d, IPV6_TCLASS, "
751 						 "%.02x) %s: %s",
752 						 sock->fd, dscp >> 2,
753 						 "failed", strerror(errno));
754 			} else
755 				sock->dscp = dscp;
756 		}
757 
758 		if (msg->msg_controllen != 0 &&
759 		    msg->msg_controllen < SENDCMSGBUFLEN)
760 		{
761 			memset(cmsgbuf + msg->msg_controllen, 0,
762 			       SENDCMSGBUFLEN - msg->msg_controllen);
763 		}
764 	}
765 
766 	if (write_countp != NULL)
767 		*write_countp = write_count;
768 }
769 
770 /*
771  * Construct an iov array and attach it to the msghdr passed in.  This is
772  * the RECV constructor, which will use the available region of the buffer
773  * (if using a buffer list) or will use the internal region (if a single
774  * buffer I/O is requested).
775  *
776  * Nothing can be NULL, and the done event must list at least one buffer
777  * on the buffer linked list for this function to be meaningful.
778  *
779  * If read_countp != NULL, *read_countp will hold the number of bytes
780  * this transaction can receive.
781  */
782 static void
783 build_msghdr_recv(isc__socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
784 		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
785 {
786 	unsigned int iovcount;
787 	isc_buffer_t *buffer;
788 	isc_region_t available;
789 	size_t read_count;
790 
791 	memset(msg, 0, sizeof(struct msghdr));
792 
793 	if (sock->type == isc_sockettype_udp) {
794 		memset(&dev->address, 0, sizeof(dev->address));
795 		msg->msg_name = (void *)&dev->address.type.sa;
796 		msg->msg_namelen = sizeof(dev->address.type);
797 	} else { /* TCP */
798 		msg->msg_name = NULL;
799 		msg->msg_namelen = 0;
800 		dev->address = sock->peer_address;
801 	}
802 
803 	buffer = ISC_LIST_HEAD(dev->bufferlist);
804 	read_count = 0;
805 
806 	/*
807 	 * Single buffer I/O?  Skip what we've done so far in this region.
808 	 */
809 	if (buffer == NULL) {
810 		read_count = dev->region.length - dev->n;
811 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
812 		iov[0].iov_len = read_count;
813 		iovcount = 1;
814 
815 		goto config;
816 	}
817 
818 	/*
819 	 * Multibuffer I/O.
820 	 * Skip empty buffers.
821 	 */
822 	while (buffer != NULL) {
823 		REQUIRE(ISC_BUFFER_VALID(buffer));
824 		if (isc_buffer_availablelength(buffer) != 0)
825 			break;
826 		buffer = ISC_LIST_NEXT(buffer, link);
827 	}
828 
829 	iovcount = 0;
830 	while (buffer != NULL) {
831 		INSIST(iovcount < MAXSCATTERGATHER_RECV);
832 
833 		isc_buffer_availableregion(buffer, &available);
834 
835 		if (available.length > 0) {
836 			iov[iovcount].iov_base = (void *)(available.base);
837 			iov[iovcount].iov_len = available.length;
838 			read_count += available.length;
839 			iovcount++;
840 		}
841 		buffer = ISC_LIST_NEXT(buffer, link);
842 	}
843 
844  config:
845 
846 	/*
847 	 * Attach the iov array and the control-message buffer to the msghdr.
848 	 */
849 	msg->msg_iov = iov;
850 	msg->msg_iovlen = iovcount;
851 
852 	msg->msg_control = cmsgbuf;
853 	msg->msg_controllen = RECVCMSGBUFLEN;
854 	msg->msg_flags = 0;
855 
856 	if (read_countp != NULL)
857 		*read_countp = read_count;
858 }
859 
860 static void
861 set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock,
862 		isc_socketevent_t *dev)
863 {
864 	if (sock->type == isc_sockettype_udp) {
865 		if (address != NULL)
866 			dev->address = *address;
867 		else
868 			dev->address = sock->peer_address;
869 	} else if (sock->type == isc_sockettype_tcp) {
870 		INSIST(address == NULL);
871 		dev->address = sock->peer_address;
872 	}
873 }
874 
875 static void
876 destroy_socketevent(isc_event_t *event) {
877 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
878 
879 	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
880 
881 	(ev->destroy)(event);
882 }
883 
884 static isc_socketevent_t *
885 allocate_socketevent(void *sender,
886 		     isc_eventtype_t eventtype, isc_taskaction_t action,
887 		     void *arg)
888 {
889 	isc_socketevent_t *ev;
890 
891 	ev = (isc_socketevent_t *)isc_event_allocate(sender,
892 						     eventtype, action, arg,
893 						     sizeof(*ev));
894 
895 	if (ev == NULL)
896 		return (NULL);
897 
898 	ev->result = ISC_R_UNSET;
899 	ISC_LINK_INIT(ev, ev_link);
900 	ISC_LIST_INIT(ev->bufferlist);
901 	ev->region.base = NULL;
902 	ev->n = 0;
903 	ev->offset = 0;
904 	ev->attributes = 0;
905 	ev->destroy = ev->ev_destroy;
906 	ev->ev_destroy = destroy_socketevent;
907 	ev->dscp = 0;
908 
909 	return (ev);
910 }
911 
912 #define DOIO_SUCCESS		0	/* i/o ok, event sent */
913 #define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
914 #define DOIO_HARD		2	/* i/o error, event sent */
915 #define DOIO_EOF		3	/* EOF, no event sent */
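
/*
 * How callers react (see internal_recv()/internal_send() below):
 * DOIO_SOFT     leave the request queued and poke select() again;
 * DOIO_EOF      complete every queued receive with ISC_R_EOF;
 * DOIO_SUCCESS and DOIO_HARD
 *               post the completion event and move on to the next
 *               queued request.
 */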
916 
917 static int
918 doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
919 	int cc;
920 	struct iovec iov[MAXSCATTERGATHER_RECV];
921 	size_t read_count;
922 	size_t actual_count;
923 	struct msghdr msghdr;
924 	isc_buffer_t *buffer;
925 	int recv_errno;
926 	char cmsgbuf[RECVCMSGBUFLEN] = {0};
927 
928 	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);
929 
930 	cc = recvmsg(sock->fd, &msghdr, 0);
931 	recv_errno = errno;
932 
933 	if (cc < 0) {
934 		if (SOFT_ERROR(recv_errno))
935 			return (DOIO_SOFT);
936 
937 		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
938 			socket_log(sock, NULL, IOEVENT,
939 				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
940 				   sock->fd, cc, recv_errno,
941 				   strerror(recv_errno));
942 		}
943 
944 #define SOFT_OR_HARD(_system, _isc) \
945 	if (recv_errno == _system) { \
946 		if (sock->connected) { \
947 			dev->result = _isc; \
948 			return (DOIO_HARD); \
949 		} \
950 		return (DOIO_SOFT); \
951 	}
952 #define ALWAYS_HARD(_system, _isc) \
953 	if (recv_errno == _system) { \
954 		dev->result = _isc; \
955 		return (DOIO_HARD); \
956 	}
957 
958 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
959 		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
960 		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
961 		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
962 		/* HPUX 11.11 can return EADDRNOTAVAIL. */
963 		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
964 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
965 		/* Should never get this one but it was seen. */
966 		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
967 		/*
968 		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
969 		 * errors.
970 		 */
971 		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
972 		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
973 
974 #undef SOFT_OR_HARD
975 #undef ALWAYS_HARD
976 
977 		dev->result = isc__errno2result(recv_errno);
978 		return (DOIO_HARD);
979 	}
980 
981 	/*
982 	 * On TCP and UNIX sockets, zero length reads indicate EOF,
983 	 * while on UDP sockets, zero length reads are perfectly valid,
984 	 * although strange.
985 	 */
986 	switch (sock->type) {
987 	case isc_sockettype_tcp:
988 		if (cc == 0)
989 			return (DOIO_EOF);
990 		break;
991 	case isc_sockettype_udp:
992 		break;
993 	default:
994 		INSIST(0);
995 	}
996 
997 	if (sock->type == isc_sockettype_udp) {
998 		dev->address.length = msghdr.msg_namelen;
999 		if (isc_sockaddr_getport(&dev->address) == 0) {
1000 			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1001 				socket_log(sock, &dev->address, IOEVENT,
1002 					   "dropping source port zero packet");
1003 			}
1004 			return (DOIO_SOFT);
1005 		}
1006 	}
1007 
1008 	socket_log(sock, &dev->address, IOEVENT,
1009 		   "packet received correctly");
1010 
1011 	/*
1012 	 * Overflow (truncation) detection is handled via the MSG_TRUNC flag
1013 	 * in msg_flags; process_cmsg() below records it in the dev entry as
1014 	 * ISC_SOCKEVENTATTR_TRUNC.
1015 	 */
1016 	/*
1017 	 * If there are control messages attached, run through them and pull
1018 	 * out the interesting bits.
1019 	 */
1020 	process_cmsg(sock, &msghdr, dev);
1021 
1022 	/*
1023 	 * update the buffers (if any) and the i/o count
1024 	 */
1025 	dev->n += cc;
1026 	actual_count = cc;
1027 	buffer = ISC_LIST_HEAD(dev->bufferlist);
1028 	while (buffer != NULL && actual_count > 0U) {
1029 		REQUIRE(ISC_BUFFER_VALID(buffer));
1030 		if (isc_buffer_availablelength(buffer) <= actual_count) {
1031 			actual_count -= isc_buffer_availablelength(buffer);
1032 			isc_buffer_add(buffer,
1033 				       isc_buffer_availablelength(buffer));
1034 		} else {
1035 			isc_buffer_add(buffer, actual_count);
1036 			actual_count = 0;
1037 			POST(actual_count);
1038 			break;
1039 		}
1040 		buffer = ISC_LIST_NEXT(buffer, link);
1041 		if (buffer == NULL) {
1042 			INSIST(actual_count == 0U);
1043 		}
1044 	}
1045 
1046 	/*
1047 	 * If we read less than we expected, update counters,
1048 	 * and let the upper layer poke the descriptor.
1049 	 */
1050 	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1051 		return (DOIO_SOFT);
1052 
1053 	/*
1054 	 * Full reads are posted, or partials if partials are ok.
1055 	 */
1056 	dev->result = ISC_R_SUCCESS;
1057 	return (DOIO_SUCCESS);
1058 }
1059 
1060 /*
1061  * Returns:
1062  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1063  *			ISC_R_SUCCESS.
1064  *
1065  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1066  *			dev->result contains the appropriate error.
1067  *
1068  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1069  *			event was sent.  The operation should be retried.
1070  *
1071  *	No other return values are possible.
1072  */
1073 static int
1074 doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
1075 	int cc;
1076 	struct iovec iov[MAXSCATTERGATHER_SEND];
1077 	size_t write_count;
1078 	struct msghdr msghdr;
1079 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1080 	int attempts = 0;
1081 	int send_errno;
1082 	char cmsgbuf[SENDCMSGBUFLEN] = {0};
1083 
1084 	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
1085 
1086  resend:
1087 	cc = sendmsg(sock->fd, &msghdr, 0);
1088 	send_errno = errno;
1089 
1090 	/*
1091 	 * Check for error or block condition.
1092 	 */
1093 	if (cc < 0) {
1094 		if (send_errno == EINTR && ++attempts < NRETRIES)
1095 			goto resend;
1096 
1097 		if (SOFT_ERROR(send_errno)) {
1098 			if (errno == EWOULDBLOCK || errno == EAGAIN)
1099 				dev->result = ISC_R_WOULDBLOCK;
1100 			return (DOIO_SOFT);
1101 		}
1102 
1103 #define SOFT_OR_HARD(_system, _isc) \
1104 	if (send_errno == _system) { \
1105 		if (sock->connected) { \
1106 			dev->result = _isc; \
1107 			return (DOIO_HARD); \
1108 		} \
1109 		return (DOIO_SOFT); \
1110 	}
1111 #define ALWAYS_HARD(_system, _isc) \
1112 	if (send_errno == _system) { \
1113 		dev->result = _isc; \
1114 		return (DOIO_HARD); \
1115 	}
1116 
1117 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1118 		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1119 		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1120 		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1121 		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1122 		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1123 		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1124 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1125 		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1126 		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1127 		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1128 
1129 #undef SOFT_OR_HARD
1130 #undef ALWAYS_HARD
1131 
1132 		/*
1133 		 * The other error types depend on whether or not the
1134 		 * socket is UDP or TCP.  If it is UDP, some errors
1135 		 * that we expect to be fatal under TCP are merely
1136 		 * annoying, and are really soft errors.
1137 		 *
1138 		 * However, these soft errors are still returned as
1139 		 * a status.
1140 		 */
1141 		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1142 		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1143 				 addrbuf, strerror(send_errno));
1144 		dev->result = isc__errno2result(send_errno);
1145 		return (DOIO_HARD);
1146 	}
1147 
1148 	if (cc == 0) {
1149 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1150 				 "doio_send: send() %s 0", "returned");
1151 	}
1152 
1153 	/*
1154 	 * If we write less than we expected, update counters, poke.
1155 	 */
1156 	dev->n += cc;
1157 	if ((size_t)cc != write_count)
1158 		return (DOIO_SOFT);
1159 
1160 	/*
1161 	 * Exactly what we wanted to write.  We're done with this
1162 	 * entry.  Post its completion event.
1163 	 */
1164 	dev->result = ISC_R_SUCCESS;
1165 	return (DOIO_SUCCESS);
1166 }
1167 
1168 /*
1169  * Kill.
1170  *
1171  * Caller must ensure that the socket is not locked and no external
1172  * references exist.
1173  */
1174 static void
1175 socketclose(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) {
1176 	/*
1177 	 * No one has this socket open, so the watcher doesn't have to be
1178 	 * poked, and the socket doesn't have to be locked.
1179 	 */
1180 	manager->fds[fd] = NULL;
1181 	manager->fdstate[fd] = CLOSE_PENDING;
1182 	select_poke(manager, fd, SELECT_POKE_CLOSE);
1183 
1184 	if (sock->active == 1) {
1185 		sock->active = 0;
1186 	}
1187 
1188 	/*
1189 	 * update manager->maxfd here (XXX: this should be implemented more
1190 	 * efficiently)
1191 	 */
1192 	if (manager->maxfd == fd) {
1193 		int i;
1194 
1195 		manager->maxfd = 0;
1196 		for (i = fd - 1; i >= 0; i--) {
1197 			if (manager->fdstate[i] == MANAGED) {
1198 				manager->maxfd = i;
1199 				break;
1200 			}
1201 		}
1202 	}
1203 
1204 }
1205 
1206 static void
1207 destroy(isc__socket_t **sockp) {
1208 	int fd;
1209 	isc__socket_t *sock = *sockp;
1210 	isc__socketmgr_t *manager = sock->manager;
1211 
1212 	socket_log(sock, NULL, CREATION, "destroying");
1213 
1214 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1215 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1216 	INSIST(sock->connect_ev == NULL);
1217 	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1218 
1219 	if (sock->fd >= 0) {
1220 		fd = sock->fd;
1221 		sock->fd = -1;
1222 		socketclose(manager, sock, fd);
1223 	}
1224 
1225 	ISC_LIST_UNLINK(manager->socklist, sock, link);
1226 
1227 	/* can't unlock manager as its memory context is still used */
1228 	free_socket(sockp);
1229 }
1230 
1231 static isc_result_t
1232 allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
1233 		isc__socket_t **socketp)
1234 {
1235 	isc__socket_t *sock;
1236 
1237 	sock = malloc(sizeof(*sock));
1238 
1239 	if (sock == NULL)
1240 		return (ISC_R_NOMEMORY);
1241 
1242 	sock->common.magic = 0;
1243 	sock->common.impmagic = 0;
1244 	sock->references = 0;
1245 
1246 	sock->manager = manager;
1247 	sock->type = type;
1248 	sock->fd = -1;
1249 	sock->dscp = 0;		/* TOS/TCLASS is zero until set. */
1250 	sock->active = 0;
1251 
1252 	ISC_LINK_INIT(sock, link);
1253 
1254 	/*
1255 	 * Set up list of readers and writers to be initially empty.
1256 	 */
1257 	ISC_LIST_INIT(sock->recv_list);
1258 	ISC_LIST_INIT(sock->send_list);
1259 	sock->connect_ev = NULL;
1260 	sock->pending_recv = 0;
1261 	sock->pending_send = 0;
1262 	sock->connected = 0;
1263 	sock->connecting = 0;
1264 	sock->bound = 0;
1265 	sock->pktdscp = 0;
1266 
1267 	/*
1268 	 * Initialize readable and writable events.
1269 	 */
1270 	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1271 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1272 		       NULL, sock, sock, NULL);
1273 	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1274 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1275 		       NULL, sock, sock, NULL);
1276 
1277 	sock->common.magic = ISCAPI_SOCKET_MAGIC;
1278 	sock->common.impmagic = SOCKET_MAGIC;
1279 	*socketp = sock;
1280 
1281 	return (ISC_R_SUCCESS);
1282 }
1283 
1284 /*
1285  * This routine requires that the various lists be empty, that the
1286  * reference count be zero, and that the magic number is valid.  The
1287  * associated fd must already have been closed and set to -1;
1288  * free_socket() only releases the memory, it does not close the
1289  * descriptor.
1290  */
1291 static void
1292 free_socket(isc__socket_t **socketp) {
1293 	isc__socket_t *sock = *socketp;
1294 
1295 	INSIST(VALID_SOCKET(sock));
1296 	INSIST(sock->references == 0);
1297 	INSIST(!sock->connecting);
1298 	INSIST(!sock->pending_recv);
1299 	INSIST(!sock->pending_send);
1300 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1301 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1302 	INSIST(!ISC_LINK_LINKED(sock, link));
1303 
1304 	sock->common.magic = 0;
1305 	sock->common.impmagic = 0;
1306 
1307 	free(sock);
1308 
1309 	*socketp = NULL;
1310 }
1311 
1312 static void
1313 use_min_mtu(isc__socket_t *sock) {
1314 	/* use minimum MTU */
1315 	if (sock->pf == AF_INET6) {
1316 		int on = 1;
1317 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
1318 				(void *)&on, sizeof(on));
1319 	}
1320 }
1321 
1322 static void
1323 set_tcp_maxseg(isc__socket_t *sock, int size) {
1324 	if (sock->type == isc_sockettype_tcp)
1325 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
1326 				(void *)&size, sizeof(size));
1327 }
1328 
1329 static isc_result_t
1330 opensocket(isc__socket_t *sock)
1331 {
1332 	isc_result_t result;
1333 	const char *err = "socket";
1334 	int on = 1;
1335 
1336 	switch (sock->type) {
1337 	case isc_sockettype_udp:
1338 		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
1339 		break;
1340 	case isc_sockettype_tcp:
1341 		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
1342 		break;
1343 	}
1344 
1345 	if (sock->fd < 0) {
1346 		switch (errno) {
1347 		case EMFILE:
1348 		case ENFILE:
1349 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1350 				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1351 				       "%s: %s", err, strerror(errno));
1352 			/* fallthrough */
1353 		case ENOBUFS:
1354 			return (ISC_R_NORESOURCES);
1355 
1356 		case EPROTONOSUPPORT:
1357 		case EPFNOSUPPORT:
1358 		case EAFNOSUPPORT:
1359 		/*
1360 		 * Linux 2.2 (and maybe others) return EINVAL instead of
1361 		 * EAFNOSUPPORT.
1362 		 */
1363 		case EINVAL:
1364 			return (ISC_R_FAMILYNOSUPPORT);
1365 
1366 		default:
1367 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1368 					 "%s() %s: %s", err, "failed",
1369 					 strerror(errno));
1370 			return (ISC_R_UNEXPECTED);
1371 		}
1372 	}
1373 
1374 	result = make_nonblock(sock->fd);
1375 	if (result != ISC_R_SUCCESS) {
1376 		(void)close(sock->fd);
1377 		return (result);
1378 	}
1379 
1380 	/*
1381 	 * Use minimum mtu if possible.
1382 	 */
1383 	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
1384 		use_min_mtu(sock);
1385 		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
1386 	}
1387 
1388 	if (sock->type == isc_sockettype_udp) {
1389 
1390 		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1391 			       (void *)&on, sizeof(on)) < 0
1392 		    && errno != ENOPROTOOPT) {
1393 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1394 					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1395 					 sock->fd, "failed", strerror(errno));
1396 			/* Press on... */
1397 		}
1398 
1399 		/* RFC 3542 */
1400 		if ((sock->pf == AF_INET6)
1401 		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1402 				   (void *)&on, sizeof(on)) < 0)) {
1403 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1404 					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1405 					 "%s: %s", sock->fd, "failed",
1406 					 strerror(errno));
1407 		}
1408 	}
1409 
1410 	if (sock->active == 0) {
1411 		sock->active = 1;
1412 	}
1413 
1414 	return (ISC_R_SUCCESS);
1415 }
1416 
1417 /*
1418  * Create a 'type' socket managed by 'manager'.  The new socket is
1419  * returned in 'socketp'.  The task, action and arg used for completion
1420  * events are supplied with each individual I/O request rather than at
1421  * creation time.
1422  */
1423 static isc_result_t
1424 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1425 	      isc_socket_t **socketp)
1426 {
1427 	isc__socket_t *sock = NULL;
1428 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
1429 	isc_result_t result;
1430 	int lockid;
1431 
1432 	REQUIRE(VALID_MANAGER(manager));
1433 	REQUIRE(socketp != NULL && *socketp == NULL);
1434 
1435 	result = allocate_socket(manager, type, &sock);
1436 	if (result != ISC_R_SUCCESS)
1437 		return (result);
1438 
1439 	switch (sock->type) {
1440 	case isc_sockettype_udp:
1441 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
1442 		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
1443 		break;
1444 	case isc_sockettype_tcp:
1445 		break;
1446 	default:
1447 		INSIST(0);
1448 	}
1449 
1450 	sock->pf = pf;
1451 
1452 	result = opensocket(sock);
1453 	if (result != ISC_R_SUCCESS) {
1454 		free_socket(&sock);
1455 		return (result);
1456 	}
1457 
1458 	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
1459 	sock->references = 1;
1460 	*socketp = (isc_socket_t *)sock;
1461 
1462 	/*
1463 	 * Note we don't have to lock the socket like we normally would because
1464 	 * there are no external references to it yet.
1465 	 */
1466 
1467 	lockid = FDLOCK_ID(sock->fd);
1468 	manager->fds[sock->fd] = sock;
1469 	manager->fdstate[sock->fd] = MANAGED;
1470 
1471 	ISC_LIST_APPEND(manager->socklist, sock, link);
1472 	if (manager->maxfd < sock->fd)
1473 		manager->maxfd = sock->fd;
1474 
1475 	socket_log(sock, NULL, CREATION, "created");
1476 
1477 	return (ISC_R_SUCCESS);
1478 }
1479 
1480 /*%
1481  * Create a new 'type' socket managed by 'manager'.  The task, action
1482  * and arg used for completion events are supplied with each individual
1483  * I/O request rather than at creation time.  The new socket is returned
1484  * in 'socketp'.
1485  */
1486 isc_result_t
1487 isc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1488 		   isc_socket_t **socketp)
1489 {
1490 	return (socket_create(manager0, pf, type, socketp));
1491 }
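
/*
 * A rough sketch of how these entry points fit together for a UDP
 * exchange.  Illustrative only: mgr, task, the addresses, the buffer
 * lists and the *_done callbacks are placeholders, and real callers go
 * through the public isc_socket_*() wrappers and check every result.
 *
 *	isc_socket_t *sock = NULL;
 *
 *	isc__socket_create(mgr, AF_INET, isc_sockettype_udp, &sock);
 *	isc__socket_bind(sock, &local_addr, 0);
 *	isc__socket_sendtov2(sock, &sendbufs, task, send_done, arg,
 *			     &server_addr, NULL, 0);
 *	isc__socket_recvv(sock, &recvbufs, 1, task, recv_done, arg);
 *	...
 *	isc__socket_detach(&sock);
 *
 * The send_done/recv_done actions receive an isc_socketevent_t whose
 * "result" and "n" fields are filled in by doio_send()/doio_recv()
 * above.
 */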
1492 
1493 /*
1494  * Attach to a socket.  Caller must explicitly detach when it is done.
1495  */
1496 void
1497 isc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
1498 	isc__socket_t *sock = (isc__socket_t *)sock0;
1499 
1500 	REQUIRE(VALID_SOCKET(sock));
1501 	REQUIRE(socketp != NULL && *socketp == NULL);
1502 
1503 	sock->references++;
1504 
1505 	*socketp = (isc_socket_t *)sock;
1506 }
1507 
1508 /*
1509  * Dereference a socket.  If this is the last reference to it, clean things
1510  * up by destroying the socket.
1511  */
1512 void
1513 isc__socket_detach(isc_socket_t **socketp) {
1514 	isc__socket_t *sock;
1515 	isc_boolean_t kill_socket = ISC_FALSE;
1516 
1517 	REQUIRE(socketp != NULL);
1518 	sock = (isc__socket_t *)*socketp;
1519 	REQUIRE(VALID_SOCKET(sock));
1520 
1521 	REQUIRE(sock->references > 0);
1522 	sock->references--;
1523 	if (sock->references == 0)
1524 		kill_socket = ISC_TRUE;
1525 
1526 	if (kill_socket)
1527 		destroy(&sock);
1528 
1529 	*socketp = NULL;
1530 }
1531 
1532 /*
1533  * I/O is possible on a given socket.  Schedule an event to this task that
1534  * will call an internal function to do the I/O.  This will charge the
1535  * task with the I/O operation and let our select loop handler get back
1536  * to doing something real as fast as possible.
1537  *
1538  * The socket and manager must be locked before calling this function.
1539  */
1540 static void
1541 dispatch_recv(isc__socket_t *sock) {
1542 	intev_t *iev;
1543 	isc_socketevent_t *ev;
1544 	isc_task_t *sender;
1545 
1546 	INSIST(!sock->pending_recv);
1547 
1548 	ev = ISC_LIST_HEAD(sock->recv_list);
1549 	if (ev == NULL)
1550 		return;
1551 	socket_log(sock, NULL, EVENT,
1552 		   "dispatch_recv:  event %p -> task %p",
1553 		   ev, ev->ev_sender);
1554 	sender = ev->ev_sender;
1555 
1556 	sock->pending_recv = 1;
1557 	iev = &sock->readable_ev;
1558 
1559 	sock->references++;
1560 	iev->ev_sender = sock;
1561 	iev->ev_action = internal_recv;
1562 	iev->ev_arg = sock;
1563 
1564 	isc_task_send(sender, (isc_event_t **)&iev);
1565 }
1566 
1567 static void
1568 dispatch_send(isc__socket_t *sock) {
1569 	intev_t *iev;
1570 	isc_socketevent_t *ev;
1571 	isc_task_t *sender;
1572 
1573 	INSIST(!sock->pending_send);
1574 
1575 	ev = ISC_LIST_HEAD(sock->send_list);
1576 	if (ev == NULL)
1577 		return;
1578 	socket_log(sock, NULL, EVENT,
1579 		   "dispatch_send:  event %p -> task %p",
1580 		   ev, ev->ev_sender);
1581 	sender = ev->ev_sender;
1582 
1583 	sock->pending_send = 1;
1584 	iev = &sock->writable_ev;
1585 
1586 	sock->references++;
1587 	iev->ev_sender = sock;
1588 	iev->ev_action = internal_send;
1589 	iev->ev_arg = sock;
1590 
1591 	isc_task_send(sender, (isc_event_t **)&iev);
1592 }
1593 
1594 static void
1595 dispatch_connect(isc__socket_t *sock) {
1596 	intev_t *iev;
1597 	isc_socket_connev_t *ev;
1598 
1599 	iev = &sock->writable_ev;
1600 
1601 	ev = sock->connect_ev;
1602 	INSIST(ev != NULL); /* XXX */
1603 
1604 	INSIST(sock->connecting);
1605 
1606 	sock->references++;  /* keep socket around for this internal event */
1607 	iev->ev_sender = sock;
1608 	iev->ev_action = internal_connect;
1609 	iev->ev_arg = sock;
1610 
1611 	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1612 }
1613 
1614 /*
1615  * Dequeue an item off the given socket's read queue, set the result code
1616  * in the done event to the one provided, and send it to the task it was
1617  * destined for.
1618  *
1619  * If the event to be sent is on a list, remove it before sending.  If
1620  * asked to, send and detach from the socket as well.
1621  *
1622  * Caller must have the socket locked if the event is attached to the socket.
1623  */
1624 static void
1625 send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
1626 	isc_task_t *task;
1627 
1628 	task = (*dev)->ev_sender;
1629 
1630 	(*dev)->ev_sender = sock;
1631 
1632 	if (ISC_LINK_LINKED(*dev, ev_link))
1633 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1634 
1635 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1636 	    == ISC_SOCKEVENTATTR_ATTACHED)
1637 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1638 	else
1639 		isc_task_send(task, (isc_event_t **)dev);
1640 }
1641 
1642 /*
1643  * See comments for send_recvdone_event() above.
1644  *
1645  * Caller must have the socket locked if the event is attached to the socket.
1646  */
1647 static void
1648 send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
1649 	isc_task_t *task;
1650 
1651 	INSIST(dev != NULL && *dev != NULL);
1652 
1653 	task = (*dev)->ev_sender;
1654 	(*dev)->ev_sender = sock;
1655 
1656 	if (ISC_LINK_LINKED(*dev, ev_link))
1657 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1658 
1659 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1660 	    == ISC_SOCKEVENTATTR_ATTACHED)
1661 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1662 	else
1663 		isc_task_send(task, (isc_event_t **)dev);
1664 }
1665 
1666 static void
1667 internal_recv(isc_task_t *me, isc_event_t *ev) {
1668 	isc_socketevent_t *dev;
1669 	isc__socket_t *sock;
1670 
1671 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
1672 
1673 	sock = ev->ev_sender;
1674 	INSIST(VALID_SOCKET(sock));
1675 
1676 	socket_log(sock, NULL, IOEVENT,
1677 		   "internal_recv: task %p got event %p", me, ev);
1678 
1679 	INSIST(sock->pending_recv == 1);
1680 	sock->pending_recv = 0;
1681 
1682 	INSIST(sock->references > 0);
1683 	sock->references--;  /* the internal event is done with this socket */
1684 	if (sock->references == 0) {
1685 		destroy(&sock);
1686 		return;
1687 	}
1688 
1689 	/*
1690 	 * Try to do as much I/O as possible on this socket.  There are no
1691 	 * limits here, currently.
1692 	 */
1693 	dev = ISC_LIST_HEAD(sock->recv_list);
1694 	while (dev != NULL) {
1695 		switch (doio_recv(sock, dev)) {
1696 		case DOIO_SOFT:
1697 			goto poke;
1698 
1699 		case DOIO_EOF:
1700 			/*
1701 			 * read of 0 means the remote end was closed.
1702 			 * Run through the event queue and dispatch all
1703 			 * the events with an EOF result code.
1704 			 */
1705 			do {
1706 				dev->result = ISC_R_EOF;
1707 				send_recvdone_event(sock, &dev);
1708 				dev = ISC_LIST_HEAD(sock->recv_list);
1709 			} while (dev != NULL);
1710 			goto poke;
1711 
1712 		case DOIO_SUCCESS:
1713 		case DOIO_HARD:
1714 			send_recvdone_event(sock, &dev);
1715 			break;
1716 		}
1717 
1718 		dev = ISC_LIST_HEAD(sock->recv_list);
1719 	}
1720 
1721  poke:
1722 	if (!ISC_LIST_EMPTY(sock->recv_list))
1723 		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1724 }
1725 
1726 static void
1727 internal_send(isc_task_t *me, isc_event_t *ev) {
1728 	isc_socketevent_t *dev;
1729 	isc__socket_t *sock;
1730 
1731 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
1732 
1733 	/*
1734 	 * Find out what socket this is and lock it.
1735 	 */
1736 	sock = (isc__socket_t *)ev->ev_sender;
1737 	INSIST(VALID_SOCKET(sock));
1738 	socket_log(sock, NULL, IOEVENT,
1739 		   "internal_send: task %p got event %p", me, ev);
1740 
1741 	INSIST(sock->pending_send == 1);
1742 	sock->pending_send = 0;
1743 
1744 	INSIST(sock->references > 0);
1745 	sock->references--;  /* the internal event is done with this socket */
1746 	if (sock->references == 0) {
1747 		destroy(&sock);
1748 		return;
1749 	}
1750 
1751 	/*
1752 	 * Try to do as much I/O as possible on this socket.  There are no
1753 	 * limits here, currently.
1754 	 */
1755 	dev = ISC_LIST_HEAD(sock->send_list);
1756 	while (dev != NULL) {
1757 		switch (doio_send(sock, dev)) {
1758 		case DOIO_SOFT:
1759 			goto poke;
1760 
1761 		case DOIO_HARD:
1762 		case DOIO_SUCCESS:
1763 			send_senddone_event(sock, &dev);
1764 			break;
1765 		}
1766 
1767 		dev = ISC_LIST_HEAD(sock->send_list);
1768 	}
1769 
1770  poke:
1771 	if (!ISC_LIST_EMPTY(sock->send_list))
1772 		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
1773 }
1774 
1775 /*
1776  * Process read/writes on each fd here.  Avoid locking
1777  * and unlocking twice if both reads and writes are possible.
1778  */
1779 static void
1780 process_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable,
1781 	   isc_boolean_t writeable)
1782 {
1783 	isc__socket_t *sock;
1784 	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
1785 
1786 	/*
1787 	 * If the socket is going to be closed, don't do more I/O.
1788 	 */
1789 	if (manager->fdstate[fd] == CLOSE_PENDING) {
1790 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1791 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1792 		return;
1793 	}
1794 
1795 	sock = manager->fds[fd];
1796 	if (readable) {
1797 		if (sock == NULL) {
1798 			unwatch_read = ISC_TRUE;
1799 			goto check_write;
1800 		}
1801 		if (!SOCK_DEAD(sock)) {
1802 			dispatch_recv(sock);
1803 		}
1804 		unwatch_read = ISC_TRUE;
1805 	}
1806 check_write:
1807 	if (writeable) {
1808 		if (sock == NULL) {
1809 			unwatch_write = ISC_TRUE;
1810 			goto unlock_fd;
1811 		}
1812 		if (!SOCK_DEAD(sock)) {
1813 			if (sock->connecting)
1814 				dispatch_connect(sock);
1815 			else
1816 				dispatch_send(sock);
1817 		}
1818 		unwatch_write = ISC_TRUE;
1819 	}
1820 
1821  unlock_fd:
1822 	if (unwatch_read)
1823 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1824 	if (unwatch_write)
1825 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1826 
1827 }
1828 
1829 static void
1830 process_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds,
1831 	    fd_set *writefds)
1832 {
1833 	int i;
1834 
1835 	REQUIRE(maxfd <= (int)manager->maxsocks);
1836 
1837 	for (i = 0; i < maxfd; i++) {
1838 		process_fd(manager, i, FD_ISSET(i, readfds),
1839 			   FD_ISSET(i, writefds));
1840 	}
1841 }
1842 
1843 /*
1844  * Create a new socket manager.
1845  */
1846 
1847 static isc_result_t
1848 setup_watcher(isc__socketmgr_t *manager) {
1849 	isc_result_t result;
1850 
1851 	UNUSED(result);
1852 
1853 	manager->fd_bufsize = sizeof(fd_set);
1854 
1855 	manager->read_fds = NULL;
1856 	manager->read_fds_copy = NULL;
1857 	manager->write_fds = NULL;
1858 	manager->write_fds_copy = NULL;
1859 
1860 	manager->read_fds = malloc(manager->fd_bufsize);
1861 	if (manager->read_fds != NULL)
1862 		manager->read_fds_copy = malloc(manager->fd_bufsize);
1863 	if (manager->read_fds_copy != NULL)
1864 		manager->write_fds = malloc(manager->fd_bufsize);
1865 	if (manager->write_fds != NULL) {
1866 		manager->write_fds_copy = malloc(manager->fd_bufsize);
1867 	}
1868 	if (manager->write_fds_copy == NULL) {
1869 		if (manager->write_fds != NULL) {
1870 			free(manager->write_fds);
1871 		}
1872 		if (manager->read_fds_copy != NULL) {
1873 			free(manager->read_fds_copy);
1874 		}
1875 		if (manager->read_fds != NULL) {
1876 			free(manager->read_fds);
1877 		}
1878 		return (ISC_R_NOMEMORY);
1879 	}
1880 	memset(manager->read_fds, 0, manager->fd_bufsize);
1881 	memset(manager->write_fds, 0, manager->fd_bufsize);
1882 
1883 	manager->maxfd = 0;
1884 
1885 	return (ISC_R_SUCCESS);
1886 }
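
/*
 * The *_fds_copy buffers allocated above are presumably used as scratch
 * snapshots of the master fd_sets when select() is actually called by
 * the wait routine declared in socket_p.h, so that select() cannot
 * clobber the sets maintained by watch_fd()/unwatch_fd().
 */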
1887 
1888 static void
1889 cleanup_watcher(isc__socketmgr_t *manager) {
1890 
1891 	if (manager->read_fds != NULL)
1892 		free(manager->read_fds);
1893 	if (manager->read_fds_copy != NULL)
1894 		free(manager->read_fds_copy);
1895 	if (manager->write_fds != NULL)
1896 		free(manager->write_fds);
1897 	if (manager->write_fds_copy != NULL)
1898 		free(manager->write_fds_copy);
1899 }
1900 
1901 isc_result_t
1902 isc__socketmgr_create(isc_socketmgr_t **managerp) {
1903 	return (isc__socketmgr_create2(managerp, 0));
1904 }
1905 
1906 isc_result_t
1907 isc__socketmgr_create2(isc_socketmgr_t **managerp,
1908 		       unsigned int maxsocks)
1909 {
1910 	isc__socketmgr_t *manager;
1911 	isc_result_t result;
1912 
1913 	REQUIRE(managerp != NULL && *managerp == NULL);
1914 
1915 	if (socketmgr != NULL) {
1916 		/* Don't allow maxsocks to be updated */
1917 		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
1918 			return (ISC_R_EXISTS);
1919 
1920 		socketmgr->refs++;
1921 		*managerp = (isc_socketmgr_t *)socketmgr;
1922 		return (ISC_R_SUCCESS);
1923 	}
1924 
1925 	if (maxsocks == 0)
1926 		maxsocks = FD_SETSIZE;
1927 
1928 	manager = malloc(sizeof(*manager));
1929 	if (manager == NULL)
1930 		return (ISC_R_NOMEMORY);
1931 
1932 	/* Zero the structure so that cleanup on failure is straightforward. */
1933 	memset(manager, 0, sizeof(*manager));
1934 	manager->maxsocks = maxsocks;
1935 	manager->fds = malloc(manager->maxsocks * sizeof(isc__socket_t *));
1936 	if (manager->fds == NULL) {
1937 		result = ISC_R_NOMEMORY;
1938 		goto free_manager;
1939 	}
1940 	manager->fdstate = malloc(manager->maxsocks * sizeof(int));
1941 	if (manager->fdstate == NULL) {
1942 		result = ISC_R_NOMEMORY;
1943 		goto free_manager;
1944 	}
1945 
1946 	manager->common.methods = &socketmgrmethods;
1947 	manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
1948 	manager->common.impmagic = SOCKET_MANAGER_MAGIC;
1949 	memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
1950 	ISC_LIST_INIT(manager->socklist);
1951 
1952 	manager->refs = 1;
1953 
1954 	/*
1955 	 * Set up initial state for the select loop
1956 	 */
1957 	result = setup_watcher(manager);
1958 	if (result != ISC_R_SUCCESS)
1959 		goto cleanup;
1960 
1961 	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
1962 
1963 	socketmgr = manager;
1964 	*managerp = (isc_socketmgr_t *)manager;
1965 
1966 	return (ISC_R_SUCCESS);
1967 
1968 cleanup:
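	/*
	 * setup_watcher() frees anything it allocated before failing,
	 * so just fall through and release the manager's own
	 * allocations.
	 */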
1969 
1970 free_manager:
1971 	if (manager->fdstate != NULL) {
1972 		free(manager->fdstate);
1973 	}
1974 	if (manager->fds != NULL) {
1975 		free(manager->fds);
1976 	}
1977 	free(manager);
1978 
1979 	return (result);
1980 }
1981 
1982 void
1983 isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
1984 	isc__socketmgr_t *manager;
1985 	int i;
1986 
1987 	/*
1988 	 * Destroy a socket manager.
1989 	 */
1990 
1991 	REQUIRE(managerp != NULL);
1992 	manager = (isc__socketmgr_t *)*managerp;
1993 	REQUIRE(VALID_MANAGER(manager));
1994 
1995 	manager->refs--;
1996 	if (manager->refs > 0) {
1997 		*managerp = NULL;
1998 		return;
1999 	}
2000 	socketmgr = NULL;
2001 
2002 	/*
2003 	 * Wait for all sockets to be destroyed.
2004 	 */
2005 	while (!ISC_LIST_EMPTY(manager->socklist)) {
2006 		isc__taskmgr_dispatch(NULL);
2007 	}
2008 
2009 	/*
2010 	 * Here, poke our select/poll thread.  Do this by closing the write
2011 	 * half of the pipe, which will send EOF to the read half.
2012 	 * This is currently a no-op in the non-threaded case.
2013 	 */
2014 	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
2015 
2016 	/*
2017 	 * Clean up.
2018 	 */
2019 	cleanup_watcher(manager);
2020 
2021 	for (i = 0; i < (int)manager->maxsocks; i++)
2022 		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
2023 			(void)close(i);
2024 
2025 	free(manager->fds);
2026 	free(manager->fdstate);
2027 
2028 	manager->common.magic = 0;
2029 	manager->common.impmagic = 0;
2030 	free(manager);
2031 
2032 	*managerp = NULL;
2033 
2034 	socketmgr = NULL;
2035 }
2036 
2037 static isc_result_t
2038 socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2039 	    unsigned int flags)
2040 {
2041 	int io_state;
2042 	isc_task_t *ntask = NULL;
2043 	isc_result_t result = ISC_R_SUCCESS;
2044 
2045 	dev->ev_sender = task;
2046 
2047 	if (sock->type == isc_sockettype_udp) {
2048 		io_state = doio_recv(sock, dev);
2049 	} else {
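		/*
		 * For stream sockets, attempt an immediate read only when
		 * no earlier requests are queued, so completions are
		 * delivered in request order.
		 */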
2050 		if (ISC_LIST_EMPTY(sock->recv_list))
2051 			io_state = doio_recv(sock, dev);
2052 		else
2053 			io_state = DOIO_SOFT;
2054 	}
2055 
2056 	switch (io_state) {
2057 	case DOIO_SOFT:
2058 		/*
2059 		 * We couldn't read all or part of the request right now, so
2060 		 * queue it.
2061 		 *
2062 		 * Attach to the task so the done event can be delivered later.
2063 		 */
2064 		isc_task_attach(task, &ntask);
2065 		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2066 
2067 		/*
2068 		 * Enqueue the request.  If the socket was previously not being
2069 		 * watched, poke the watcher to start paying attention to it.
2070 		 */
2071 		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
2072 			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2073 		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2074 
2075 		socket_log(sock, NULL, EVENT, NULL, 0, 0,
2076 			   "socket_recv: event %p -> task %p",
2077 			   dev, ntask);
2078 
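		/*
		 * With ISC_SOCKFLAG_IMMEDIATE the caller wanted the data
		 * right away; report that the request was queued instead.
		 */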
2079 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2080 			result = ISC_R_INPROGRESS;
2081 		break;
2082 
2083 	case DOIO_EOF:
2084 		dev->result = ISC_R_EOF;
2085 		/* fallthrough */
2086 
2087 	case DOIO_HARD:
2088 	case DOIO_SUCCESS:
2089 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2090 			send_recvdone_event(sock, &dev);
2091 		break;
2092 	}
2093 
2094 	return (result);
2095 }
2096 
2097 isc_result_t
2098 isc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2099 		  unsigned int minimum, isc_task_t *task,
2100 		  isc_taskaction_t action, void *arg)
2101 {
2102 	isc__socket_t *sock = (isc__socket_t *)sock0;
2103 	isc_socketevent_t *dev;
2104 	isc__socketmgr_t *manager;
2105 	unsigned int iocount;
2106 	isc_buffer_t *buffer;
2107 
2108 	REQUIRE(VALID_SOCKET(sock));
2109 	REQUIRE(buflist != NULL);
2110 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2111 	REQUIRE(task != NULL);
2112 	REQUIRE(action != NULL);
2113 
2114 	manager = sock->manager;
2115 	REQUIRE(VALID_MANAGER(manager));
2116 
2117 	iocount = isc_bufferlist_availablecount(buflist);
2118 	REQUIRE(iocount > 0);
2119 
2120 	INSIST(sock->bound);
2121 
2122 	dev = allocate_socketevent(sock,
2123 				   ISC_SOCKEVENT_RECVDONE, action, arg);
2124 	if (dev == NULL)
2125 		return (ISC_R_NOMEMORY);
2126 
2127 	/*
2128 	 * UDP sockets always use partial reads.
2129 	 */
2130 	if (sock->type == isc_sockettype_udp)
2131 		dev->minimum = 1;
2132 	else {
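		/*
		 * For stream sockets a minimum of 0 means the read
		 * completes only when every buffer is full; otherwise it
		 * completes once at least 'minimum' bytes have arrived.
		 */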
2133 		if (minimum == 0)
2134 			dev->minimum = iocount;
2135 		else
2136 			dev->minimum = minimum;
2137 	}
2138 
2139 	/*
2140 	 * Move each buffer from the passed-in list to our internal one.
2141 	 */
2142 	buffer = ISC_LIST_HEAD(*buflist);
2143 	while (buffer != NULL) {
2144 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2145 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2146 		buffer = ISC_LIST_HEAD(*buflist);
2147 	}
2148 
2149 	return (socket_recv(sock, dev, task, 0));
2150 }
2151 
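/*
 * A minimal usage sketch (illustrative only, not part of this file): a
 * caller queues a scatter read by linking one or more buffers onto an
 * isc_bufferlist_t and handing the list to the public isc_socket_recvv()
 * wrapper; the list is emptied here and the RECVDONE event is posted to
 * 'task' when the read completes.  The names 'sock', 'task', 'recv_done'
 * and 'arg' are placeholders supplied by the caller.
 *
 *	unsigned char space[512];
 *	isc_buffer_t b;
 *	isc_bufferlist_t bl;
 *	isc_result_t result;
 *
 *	isc_buffer_init(&b, space, sizeof(space));
 *	ISC_LIST_INIT(bl);
 *	ISC_LIST_ENQUEUE(bl, &b, link);
 *	result = isc_socket_recvv(sock, &bl, 1, task, recv_done, arg);
 */
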
2152 static isc_result_t
2153 socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2154 	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2155 	    unsigned int flags)
2156 {
2157 	int io_state;
2158 	isc_task_t *ntask = NULL;
2159 	isc_result_t result = ISC_R_SUCCESS;
2160 
2161 	dev->ev_sender = task;
2162 
2163 	set_dev_address(address, sock, dev);
2164 	if (pktinfo != NULL) {
2165 		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2166 		dev->pktinfo = *pktinfo;
2167 
2168 		if (!isc_sockaddr_issitelocal(&dev->address) &&
2169 		    !isc_sockaddr_islinklocal(&dev->address)) {
2170 			socket_log(sock, NULL, TRACE,
2171 				   "pktinfo structure provided, ifindex %u "
2172 				   "(set to 0)", pktinfo->ipi6_ifindex);
2173 
2174 			/*
2175 			 * Set the pktinfo index to 0 here, to let the
2176 			 * kernel decide what interface it should send on.
2177 			 */
2178 			dev->pktinfo.ipi6_ifindex = 0;
2179 		}
2180 	}
2181 
2182 	if (sock->type == isc_sockettype_udp)
2183 		io_state = doio_send(sock, dev);
2184 	else {
2185 		if (ISC_LIST_EMPTY(sock->send_list))
2186 			io_state = doio_send(sock, dev);
2187 		else
2188 			io_state = DOIO_SOFT;
2189 	}
2190 
2191 	switch (io_state) {
2192 	case DOIO_SOFT:
2193 		/*
2194 		 * We couldn't send all or part of the request right now, so
2195 		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2196 		 */
2197 		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2198 			isc_task_attach(task, &ntask);
2199 			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2200 
2201 			/*
2202 			 * Enqueue the request.  If the socket was previously
2203 			 * not being watched, poke the watcher to start
2204 			 * paying attention to it.
2205 			 */
2206 			if (ISC_LIST_EMPTY(sock->send_list) &&
2207 			    !sock->pending_send)
2208 				select_poke(sock->manager, sock->fd,
2209 					    SELECT_POKE_WRITE);
2210 			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2211 
2212 			socket_log(sock, NULL, EVENT, NULL, 0, 0,
2213 				   "socket_send: event %p -> task %p",
2214 				   dev, ntask);
2215 
2216 			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2217 				result = ISC_R_INPROGRESS;
2218 			break;
2219 		}
2220 
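		/*
		 * ISC_SOCKFLAG_NORETRY with a soft error: do not queue a
		 * retry; fall through and report the result as-is.
		 */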
2221 		/* FALLTHROUGH */
2222 
2223 	case DOIO_HARD:
2224 	case DOIO_SUCCESS:
2225 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2226 			send_senddone_event(sock, &dev);
2227 		break;
2228 	}
2229 
2230 	return (result);
2231 }
2232 
2233 isc_result_t
2234 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2235 		  isc_task_t *task, isc_taskaction_t action, void *arg)
2236 {
2237 	return (isc__socket_sendtov2(sock, buflist, task, action, arg, NULL,
2238 				     NULL, 0));
2239 }
2240 
2241 isc_result_t
2242 isc__socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2243 		     isc_task_t *task, isc_taskaction_t action, void *arg,
2244 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2245 		     unsigned int flags)
2246 {
2247 	isc__socket_t *sock = (isc__socket_t *)sock0;
2248 	isc_socketevent_t *dev;
2249 	isc__socketmgr_t *manager;
2250 	unsigned int iocount;
2251 	isc_buffer_t *buffer;
2252 
2253 	REQUIRE(VALID_SOCKET(sock));
2254 	REQUIRE(buflist != NULL);
2255 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2256 	REQUIRE(task != NULL);
2257 	REQUIRE(action != NULL);
2258 
2259 	manager = sock->manager;
2260 	REQUIRE(VALID_MANAGER(manager));
2261 
2262 	iocount = isc_bufferlist_usedcount(buflist);
2263 	REQUIRE(iocount > 0);
2264 
2265 	dev = allocate_socketevent(sock,
2266 				   ISC_SOCKEVENT_SENDDONE, action, arg);
2267 	if (dev == NULL)
2268 		return (ISC_R_NOMEMORY);
2269 
2270 	/*
2271 	 * Move each buffer from the passed-in list to our internal one.
2272 	 */
2273 	buffer = ISC_LIST_HEAD(*buflist);
2274 	while (buffer != NULL) {
2275 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2276 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2277 		buffer = ISC_LIST_HEAD(*buflist);
2278 	}
2279 
2280 	return (socket_send(sock, dev, task, address, pktinfo, flags));
2281 }
2282 
2283 isc_result_t
2284 isc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
2285 		 unsigned int options) {
2286 	isc__socket_t *sock = (isc__socket_t *)sock0;
2287 	int on = 1;
2288 
2289 	REQUIRE(VALID_SOCKET(sock));
2290 
2291 	INSIST(!sock->bound);
2292 
2293 	if (sock->pf != sockaddr->type.sa.sa_family) {
2294 		return (ISC_R_FAMILYMISMATCH);
2295 	}
2296 
2297 	/*
2298 	 * Only set SO_REUSEADDR when we want a specific port.
2299 	 */
2300 	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
2301 	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
2302 	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
2303 		       sizeof(on)) < 0) {
2304 		UNEXPECTED_ERROR(__FILE__, __LINE__,
2305 				 "setsockopt(%d) %s", sock->fd, "failed");
2306 		/* Press on... */
2307 	}
2308 	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
2309 		switch (errno) {
2310 		case EACCES:
2311 			return (ISC_R_NOPERM);
2312 		case EADDRNOTAVAIL:
2313 			return (ISC_R_ADDRNOTAVAIL);
2314 		case EADDRINUSE:
2315 			return (ISC_R_ADDRINUSE);
2316 		case EINVAL:
2317 			return (ISC_R_BOUND);
2318 		default:
2319 			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
2320 					 strerror(errno));
2321 			return (ISC_R_UNEXPECTED);
2322 		}
2323 	}
2324 
2325 	socket_log(sock, sockaddr, TRACE, "bound");
2326 	sock->bound = 1;
2327 
2328 	return (ISC_R_SUCCESS);
2329 }
2330 
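/*
 * A minimal usage sketch (illustrative only): binding a socket to the IPv4
 * wildcard address on a fixed port, requesting SO_REUSEADDR via
 * ISC_SOCKET_REUSEADDRESS.  The names 'sock' and 'port' are placeholders
 * supplied by the caller.
 *
 *	isc_sockaddr_t local;
 *	isc_result_t result;
 *
 *	isc_sockaddr_any(&local);
 *	isc_sockaddr_setport(&local, port);
 *	result = isc_socket_bind(sock, &local, ISC_SOCKET_REUSEADDRESS);
 */
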
2331 isc_result_t
2332 isc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
2333 		   isc_task_t *task, isc_taskaction_t action, void *arg)
2334 {
2335 	isc__socket_t *sock = (isc__socket_t *)sock0;
2336 	isc_socket_connev_t *dev;
2337 	isc_task_t *ntask = NULL;
2338 	isc__socketmgr_t *manager;
2339 	int cc;
2340 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
2341 
2342 	REQUIRE(VALID_SOCKET(sock));
2343 	REQUIRE(addr != NULL);
2344 	REQUIRE(task != NULL);
2345 	REQUIRE(action != NULL);
2346 
2347 	manager = sock->manager;
2348 	REQUIRE(VALID_MANAGER(manager));
2349 	REQUIRE(addr != NULL);
2350 
2351 	if (isc_sockaddr_ismulticast(addr))
2352 		return (ISC_R_MULTICAST);
2353 
2354 	REQUIRE(!sock->connecting);
2355 
2356 	dev = (isc_socket_connev_t *)isc_event_allocate(sock,
2357 							ISC_SOCKEVENT_CONNECT,
2358 							action,	arg,
2359 							sizeof(*dev));
2360 	if (dev == NULL) {
2361 		return (ISC_R_NOMEMORY);
2362 	}
2363 	ISC_LINK_INIT(dev, ev_link);
2364 
2365 	/*
2366 	 * Try to do the connect right away, as there can be only one
2367 	 * outstanding, and it might happen to complete.
2368 	 */
2369 	sock->peer_address = *addr;
2370 	cc = connect(sock->fd, &addr->type.sa, addr->length);
2371 	if (cc < 0) {
2372 		/*
2373 		 * HP-UX "fails" to connect a UDP socket and sets errno to
2374 		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
2375 		 * a success and let the user detect it if it's really an error
2376 		 * at the time of sending a packet on the socket.
2377 		 */
2378 		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
2379 			cc = 0;
2380 			goto success;
2381 		}
2382 		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
2383 			goto queue;
2384 
2385 		switch (errno) {
2386 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
2387 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2388 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2389 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2390 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2391 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2392 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2393 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2394 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2395 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2396 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2397 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2398 #undef ERROR_MATCH
2399 		}
2400 
2401 		sock->connected = 0;
2402 
2403 		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
2404 		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
2405 				 addrbuf, errno, strerror(errno));
2406 
2407 		isc_event_free(ISC_EVENT_PTR(&dev));
2408 		return (ISC_R_UNEXPECTED);
2409 
2410 	err_exit:
2411 		sock->connected = 0;
2412 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2413 
2414 		return (ISC_R_SUCCESS);
2415 	}
2416 
2417 	/*
2418 	 * If connect completed, fire off the done event.
2419 	 */
2420  success:
2421 	if (cc == 0) {
2422 		sock->connected = 1;
2423 		sock->bound = 1;
2424 		dev->result = ISC_R_SUCCESS;
2425 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2426 
2427 		return (ISC_R_SUCCESS);
2428 	}
2429 
2430  queue:
2431 
2432 	/*
2433 	 * Attach to task.
2434 	 */
2435 	isc_task_attach(task, &ntask);
2436 
2437 	sock->connecting = 1;
2438 
2439 	dev->ev_sender = ntask;
2440 
2441 	/*
2442 	 * Poke watcher here.  We still have the socket locked, so there
2443 	 * is no race condition.  We will keep the lock for such a short
2444 	 * time that waking it up now or later won't matter all that much.
2445 	 */
2446 	if (sock->connect_ev == NULL)
2447 		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
2448 
2449 	sock->connect_ev = dev;
2450 
2451 	return (ISC_R_SUCCESS);
2452 }
2453 
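/*
 * A minimal sketch of the completion side (illustrative only): the action
 * passed to isc_socket_connect() runs in the given task's context and
 * receives an ISC_SOCKEVENT_CONNECT event whose 'result' field carries the
 * outcome.  The handler name and its error handling are placeholders.
 *
 *	static void
 *	connect_done(isc_task_t *task, isc_event_t *event) {
 *		isc_socket_connev_t *cev = (isc_socket_connev_t *)event;
 *
 *		UNUSED(task);
 *		if (cev->result != ISC_R_SUCCESS) {
 *			// handle the failure, e.g. retry or report it
 *		}
 *		isc_event_free(&event);
 *	}
 */
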
2454 /*
2455  * Called when a socket with a pending connect() finishes.
2456  */
2457 static void
2458 internal_connect(isc_task_t *me, isc_event_t *ev) {
2459 	isc__socket_t *sock;
2460 	isc_socket_connev_t *dev;
2461 	isc_task_t *task;
2462 	int cc;
2463 	socklen_t optlen;
2464 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
2465 
2466 	UNUSED(me);
2467 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2468 
2469 	sock = ev->ev_sender;
2470 	INSIST(VALID_SOCKET(sock));
2471 
2472 	/*
2473 	 * When the internal event was sent the reference count was bumped
2474 	 * to keep the socket around for us.  Decrement the count here.
2475 	 */
2476 	INSIST(sock->references > 0);
2477 	sock->references--;
2478 	if (sock->references == 0) {
2479 		destroy(&sock);
2480 		return;
2481 	}
2482 
2483 	/*
2484 	 * Has this event been canceled?
2485 	 */
2486 	dev = sock->connect_ev;
2487 	if (dev == NULL) {
2488 		INSIST(!sock->connecting);
2489 		return;
2490 	}
2491 
2492 	INSIST(sock->connecting);
2493 	sock->connecting = 0;
2494 
2495 	/*
2496 	 * Get any possible error status here.
2497 	 */
2498 	optlen = sizeof(cc);
2499 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
2500 		       (void *)&cc, (void *)&optlen) < 0)
2501 		cc = errno;
2502 	else
2503 		errno = cc;
2504 
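	/*
	 * Either way, errno now holds the deferred connect(2) status:
	 * zero on success, otherwise the error the kernel recorded.
	 */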
2505 	if (errno != 0) {
2506 		/*
2507 		 * If the error is EAGAIN, just re-select on this
2508 		 * fd and pretend nothing strange happened.
2509 		 */
2510 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
2511 			sock->connecting = 1;
2512 			select_poke(sock->manager, sock->fd,
2513 				    SELECT_POKE_CONNECT);
2514 			return;
2515 		}
2516 
2518 		/*
2519 		 * Translate other errors into ISC_R_* flavors.
2520 		 */
2521 		switch (errno) {
2522 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
2523 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2524 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2525 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2526 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2527 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2528 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2529 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2530 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2531 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2532 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2533 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
2534 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2535 #undef ERROR_MATCH
2536 		default:
2537 			dev->result = ISC_R_UNEXPECTED;
2538 			isc_sockaddr_format(&sock->peer_address, peerbuf,
2539 					    sizeof(peerbuf));
2540 			UNEXPECTED_ERROR(__FILE__, __LINE__,
2541 					 "internal_connect: connect(%s) %s",
2542 					 peerbuf, strerror(errno));
2543 		}
2544 	} else {
2545 		dev->result = ISC_R_SUCCESS;
2546 		sock->connected = 1;
2547 		sock->bound = 1;
2548 	}
2549 
2550 	sock->connect_ev = NULL;
2551 
2552 	task = dev->ev_sender;
2553 	dev->ev_sender = sock;
2554 	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2555 }
2556 
2557 /*
2558  * Run through the list of events on this socket, and cancel the ones
2559  * queued for task "task" of type "how".  "how" is a bitmask.
2560  */
2561 void
2562 isc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
2563 	isc__socket_t *sock = (isc__socket_t *)sock0;
2564 
2565 	REQUIRE(VALID_SOCKET(sock));
2566 
2567 	/*
2568 	 * Quick exit if there is nothing to do.  Don't even bother locking
2569 	 * in this case.
2570 	 */
2571 	if (how == 0)
2572 		return;
2573 
2574 	/*
2575 	 * All of these do the same thing, more or less.
2576 	 * Each will:
2577 	 *	o If the internal event is marked as "posted" try to
2578 	 *	  remove it from the task's queue.  If this fails, mark it
2579 	 *	  as canceled instead, and let the task clean it up later.
2580 	 *	o For each I/O request for that task of that type, post
2581 	 *	  its done event with status of "ISC_R_CANCELED".
2582 	 *	o Reset any state needed.
2583 	 */
2584 	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
2585 	    && !ISC_LIST_EMPTY(sock->recv_list)) {
2586 		isc_socketevent_t      *dev;
2587 		isc_socketevent_t      *next;
2588 		isc_task_t	       *current_task;
2589 
2590 		dev = ISC_LIST_HEAD(sock->recv_list);
2591 
2592 		while (dev != NULL) {
2593 			current_task = dev->ev_sender;
2594 			next = ISC_LIST_NEXT(dev, ev_link);
2595 
2596 			if ((task == NULL) || (task == current_task)) {
2597 				dev->result = ISC_R_CANCELED;
2598 				send_recvdone_event(sock, &dev);
2599 			}
2600 			dev = next;
2601 		}
2602 	}
2603 
2604 	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
2605 	    && !ISC_LIST_EMPTY(sock->send_list)) {
2606 		isc_socketevent_t      *dev;
2607 		isc_socketevent_t      *next;
2608 		isc_task_t	       *current_task;
2609 
2610 		dev = ISC_LIST_HEAD(sock->send_list);
2611 
2612 		while (dev != NULL) {
2613 			current_task = dev->ev_sender;
2614 			next = ISC_LIST_NEXT(dev, ev_link);
2615 
2616 			if ((task == NULL) || (task == current_task)) {
2617 				dev->result = ISC_R_CANCELED;
2618 				send_senddone_event(sock, &dev);
2619 			}
2620 			dev = next;
2621 		}
2622 	}
2623 
2624 	/*
2625 	 * Connecting is not a list.
2626 	 */
2627 	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
2628 	    && sock->connect_ev != NULL) {
2629 		isc_socket_connev_t    *dev;
2630 		isc_task_t	       *current_task;
2631 
2632 		INSIST(sock->connecting);
2633 		sock->connecting = 0;
2634 
2635 		dev = sock->connect_ev;
2636 		current_task = dev->ev_sender;
2637 
2638 		if ((task == NULL) || (task == current_task)) {
2639 			sock->connect_ev = NULL;
2640 
2641 			dev->result = ISC_R_CANCELED;
2642 			dev->ev_sender = sock;
2643 			isc_task_sendanddetach(&current_task,
2644 					       ISC_EVENT_PTR(&dev));
2645 		}
2646 	}
2647 
2648 }
2649 
2650 /*
2651  * In our assumed scenario, we can simply use a single static object.
2652  * XXX: this is not true if the application uses multiple threads with
2653  *      'multi-context' mode.  Fixing this is a future TODO item.
2654  */
2655 static isc_socketwait_t swait_private;
2656 
2657 int
2658 isc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
2659 			  isc_socketwait_t **swaitp)
2660 {
2661 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2662 	int n;
2663 
2664 	REQUIRE(swaitp != NULL && *swaitp == NULL);
2665 
2666 	if (manager == NULL)
2667 		manager = socketmgr;
2668 	if (manager == NULL)
2669 		return (0);
2670 
2671 	memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize);
2672 	memmove(manager->write_fds_copy, manager->write_fds,
2673 		manager->fd_bufsize);
2674 
2675 	swait_private.readset = manager->read_fds_copy;
2676 	swait_private.writeset = manager->write_fds_copy;
2677 	swait_private.maxfd = manager->maxfd + 1;
2678 
2679 	n = select(swait_private.maxfd, swait_private.readset,
2680 		   swait_private.writeset, NULL, tvp);
2681 
2682 	*swaitp = &swait_private;
2683 	return (n);
2684 }
2685 
2686 isc_result_t
2687 isc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
2688 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2689 
2690 	REQUIRE(swait == &swait_private);
2691 
2692 	if (manager == NULL)
2693 		manager = socketmgr;
2694 	if (manager == NULL)
2695 		return (ISC_R_NOTFOUND);
2696 
2697 	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
2698 	return (ISC_R_SUCCESS);
2699 }
2700 
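/*
 * A minimal sketch of the surrounding event loop (illustrative only, based
 * on how dig drives this manager): block in select() via
 * isc__socketmgr_waitevents(), hand any ready descriptors back through
 * isc__socketmgr_dispatch(), and then run the task manager (for example
 * with isc__taskmgr_dispatch(NULL), as isc__socketmgr_destroy() above
 * does) so the posted completion events reach their callbacks.  The
 * one-second timeout is an arbitrary placeholder.
 *
 *	isc_socketwait_t *swait = NULL;
 *	struct timeval tv = { 1, 0 };
 *	int n;
 *
 *	n = isc__socketmgr_waitevents(NULL, &tv, &swait);
 *	if (n > 0)
 *		(void)isc__socketmgr_dispatch(NULL, swait);
 */
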
2701 #include "../socket_api.c"
2702