xref: /openbsd-src/usr.bin/dig/lib/isc/unix/socket.c (revision 5cbe9ad1122054ffaa734c0282e6961ef40b5000)
1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
9  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
11  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
13  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14  * PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 /*! \file */
18 
19 #include <sys/socket.h>
20 #include <sys/time.h>
21 #include <sys/uio.h>
22 
23 #include <netinet/tcp.h>
24 
25 #include <errno.h>
26 #include <fcntl.h>
27 #include <stddef.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <unistd.h>
31 
32 #include <isc/buffer.h>
33 #include <isc/bufferlist.h>
34 
35 #include <isc/list.h>
36 #include <isc/log.h>
37 #include <isc/net.h>
38 #include <isc/region.h>
39 #include <isc/socket.h>
40 #include <isc/task.h>
41 #include <isc/util.h>
42 
43 #include "errno2result.h"
44 
45 #include "socket_p.h"
46 #include "../task_p.h"
47 
48 struct isc_socketwait {
49 	fd_set *readset;
50 	fd_set *writeset;
51 	int nfds;
52 	int maxfd;
53 };
54 
55 /*
56  * Set by the -T dscp option on the command line. If set to a value
57  * other than -1, we check to make sure DSCP values match it, and
58  * assert if not.
59  */
60 int isc_dscp_check_value = -1;
61 
62 /*%
63  * Some systems define the socket length argument as an int, some as size_t,
64  * some as socklen_t.  This is here so it can be easily changed if needed.
65  */
66 
67 /*%
68  * Define what the possible "soft" errors can be.  These are non-fatal returns
69  * of various network related functions, like recv() and so on.
70  *
71  * For some reason, BSDI (and perhaps others) will sometimes return <0
72  * from recv() but will have errno==0.  This is broken, but we have to
73  * work around it here.
74  */
75 #define SOFT_ERROR(e)	((e) == EAGAIN || \
76 			 (e) == EWOULDBLOCK || \
77 			 (e) == EINTR || \
78 			 (e) == 0)
79 
80 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
81 
82 /*!<
83  * DLVL(90)  --  Function entry/exit and other tracing.
84  * DLVL(60)  --  Socket data send/receive
85  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
86  * DLVL(20)  --  Socket creation/destruction.
87  */
88 #define TRACE_LEVEL		90
89 #define IOEVENT_LEVEL		60
90 #define EVENT_LEVEL		50
91 #define CREATION_LEVEL		20
92 
93 #define TRACE		DLVL(TRACE_LEVEL)
94 #define IOEVENT		DLVL(IOEVENT_LEVEL)
95 #define EVENT		DLVL(EVENT_LEVEL)
96 #define CREATION	DLVL(CREATION_LEVEL)
97 
98 typedef isc_event_t intev_t;
99 
100 #define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
101 #define VALID_SOCKET(s)		ISC_MAGIC_VALID(s, SOCKET_MAGIC)
102 
103 /*!
104  * IPv6 control information.  If the socket is an IPv6 socket we want
105  * to collect the destination address and interface so the client can
106  * set them on outgoing packets.
107  */
108 
109 /*%
110  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
111  * a setsockopt() like interface to request timestamps, and if the OS
112  * doesn't do it for us, call gettimeofday() on every UDP receive?
113  */
114 
115 /*%
116  * Instead of calculating the cmsgbuf lengths at run time, we take a
117  * rule-of-thumb approach: the sizes below are taken from x86_64 Linux
118  * and multiplied by two, so everything should fit.  The resulting
119  * buffers are small enough not to be a concern.
120  */
121 #define CMSG_SP_IN6PKT 40
122 
123 #define CMSG_SP_TIMESTAMP 32
124 
125 #define CMSG_SP_TCTOS 24
126 
127 #define CMSG_SP_INT 24
128 
129 #define RECVCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS)+1)
130 #define SENDCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS)+1)
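/*
 * For reference: with the rule-of-thumb sizes above, RECVCMSGBUFLEN works
 * out to 2*(40+32+24)+1 = 193 bytes and SENDCMSGBUFLEN to
 * 2*(40+24+24)+1 = 177 bytes, which should comfortably cover the
 * CMSG_SPACE() totals actually needed.
 */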
131 
132 /*%
133  * The number of times a send operation is repeated if the result is EINTR.
134  */
135 #define NRETRIES 10
136 
137 typedef struct isc__socket isc__socket_t;
138 typedef struct isc__socketmgr isc__socketmgr_t;
139 
140 struct isc__socket {
141 	/* Not locked. */
142 	isc_socket_t		common;
143 	isc__socketmgr_t	*manager;
144 	isc_sockettype_t	type;
145 
146 	/* Locked by socket lock. */
147 	ISC_LINK(isc__socket_t)	link;
148 	unsigned int		references;
149 	int			fd;
150 	int			pf;
151 
152 	ISC_LIST(isc_socketevent_t)		send_list;
153 	ISC_LIST(isc_socketevent_t)		recv_list;
154 	isc_socket_connev_t		       *connect_ev;
155 
156 	/*
157 	 * Internal events.  Posted when a descriptor is readable or
158 	 * writable.  These are statically allocated and never freed.
159 	 * They will be set to non-purgable before use.
160 	 */
161 	intev_t			readable_ev;
162 	intev_t			writable_ev;
163 
164 	isc_sockaddr_t		peer_address;       /* remote address */
165 
166 	unsigned int		pending_recv : 1,
167 				pending_send : 1,
168 				connected : 1,
169 				connecting : 1,     /* connect pending */
170 				bound : 1,          /* bound to local addr */
171 				active : 1,         /* currently active */
172 				pktdscp : 1;	    /* per packet dscp */
173 	unsigned int		dscp;
174 };
175 
176 #define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
177 #define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
178 
179 struct isc__socketmgr {
180 	/* Not locked. */
181 	isc_socketmgr_t		common;
182 	int			fd_bufsize;
183 	unsigned int		maxsocks;
184 
185 	isc__socket_t	       **fds;
186 	int			*fdstate;
187 
188 	/* Locked by manager lock. */
189 	ISC_LIST(isc__socket_t)	socklist;
190 	fd_set			*read_fds;
191 	fd_set			*read_fds_copy;
192 	fd_set			*write_fds;
193 	fd_set			*write_fds_copy;
194 	int			maxfd;
195 	unsigned int		refs;
196 };
197 
198 static isc__socketmgr_t *socketmgr = NULL;
199 
200 #define CLOSED			0	/* this one must be zero */
201 #define MANAGED			1
202 #define CLOSE_PENDING		2
203 
204 /*
205  * send() and recv() iovec counts
206  */
207 #define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
208 #define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
209 
210 static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
211 				  isc_sockettype_t type,
212 				  isc_socket_t **socketp);
213 static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
214 static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
215 static void free_socket(isc__socket_t **);
216 static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
217 				    isc__socket_t **);
218 static void destroy(isc__socket_t **);
219 static void internal_connect(isc_task_t *, isc_event_t *);
220 static void internal_recv(isc_task_t *, isc_event_t *);
221 static void internal_send(isc_task_t *, isc_event_t *);
222 static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
223 static void build_msghdr_send(isc__socket_t *, char *, isc_socketevent_t *,
224 			      struct msghdr *, struct iovec *, size_t *);
225 static void build_msghdr_recv(isc__socket_t *, char *, isc_socketevent_t *,
226 			      struct msghdr *, struct iovec *, size_t *);
227 
228 /*%
229  * The following are intended for internal use (indicated by "isc__"
230  * prefix) but are not declared as static, allowing direct access from
231  * unit tests etc.
232  */
233 
234 isc_result_t
235 isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
236 		   isc_socket_t **socketp);
237 void
238 isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
239 void
240 isc__socket_detach(isc_socket_t **socketp);
241 isc_result_t
242 isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
243 		 unsigned int minimum, isc_task_t *task,
244 		  isc_taskaction_t action, void *arg);
245 isc_result_t
246 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
247 		  isc_task_t *task, isc_taskaction_t action, void *arg);
248 isc_result_t
249 isc__socket_sendtov2(isc_socket_t *sock, isc_bufferlist_t *buflist,
250 		     isc_task_t *task, isc_taskaction_t action, void *arg,
251 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
252 		     unsigned int flags);
253 isc_result_t
254 isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
255 		 unsigned int options);
256 isc_result_t
257 isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
258 		    isc_task_t *task, isc_taskaction_t action,
259 		    void *arg);
260 void
261 isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
262 
263 isc_result_t
264 isc__socketmgr_create(isc_socketmgr_t **managerp);
265 isc_result_t
266 isc__socketmgr_create2(isc_socketmgr_t **managerp,
267 		       unsigned int maxsocks);
268 isc_result_t
269 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp);
270 void
271 isc__socketmgr_destroy(isc_socketmgr_t **managerp);
272 
273 static struct {
274 	isc_socketmethods_t methods;
275 
276 	/*%
277 	 * The following are defined just for avoiding unused static functions.
278 	 */
279 	void *recvv, *sendv;
280 } socketmethods = {
281 	{
282 		isc__socket_attach,
283 		isc__socket_detach,
284 		isc__socket_bind,
285 		isc__socket_connect,
286 		isc__socket_cancel,
287 	},
288 	(void *)isc__socket_recvv,
289 	(void *)isc__socket_sendv,
290 };
291 
292 static isc_socketmgrmethods_t socketmgrmethods = {
293 	isc__socketmgr_destroy,
294 	isc__socket_create
295 };
296 
297 #define SELECT_POKE_SHUTDOWN		(-1)
298 #define SELECT_POKE_READ		(-3)
299 #define SELECT_POKE_WRITE		(-4)
300 #define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
301 #define SELECT_POKE_CLOSE		(-5)
302 
303 #define SOCK_DEAD(s)			((s)->references == 0)
304 
305 /*%
306  * Shortcut index arrays to get access to statistics counters.
307  */
308 enum {
309 	STATID_OPEN = 0,
310 	STATID_OPENFAIL = 1,
311 	STATID_CLOSE = 2,
312 	STATID_BINDFAIL = 3,
313 	STATID_CONNECTFAIL = 4,
314 	STATID_CONNECT = 5,
315 	STATID_ACCEPTFAIL = 6,
316 	STATID_ACCEPT = 7,
317 	STATID_SENDFAIL = 8,
318 	STATID_RECVFAIL = 9,
319 	STATID_ACTIVE = 10
320 };
321 
322 
323 static void
324 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
325 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
326 	   const char *fmt, ...) __attribute__((__format__(__printf__, 6, 7)));
327 static void
328 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
329 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
330 	   const char *fmt, ...)
331 {
332 	char msgbuf[2048];
333 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
334 	va_list ap;
335 
336 	if (! isc_log_wouldlog(isc_lctx, level))
337 		return;
338 
339 	va_start(ap, fmt);
340 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
341 	va_end(ap);
342 
343 	if (address == NULL) {
344 		isc_log_write(isc_lctx, category, module, level,
345 			       "socket %p: %s", sock, msgbuf);
346 	} else {
347 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
348 		isc_log_write(isc_lctx, category, module, level,
349 			       "socket %p %s: %s", sock, peerbuf, msgbuf);
350 	}
351 }
352 
353 static inline isc_result_t
354 watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
355 	isc_result_t result = ISC_R_SUCCESS;
356 
357 	if (msg == SELECT_POKE_READ)
358 		FD_SET(fd, manager->read_fds);
359 	if (msg == SELECT_POKE_WRITE)
360 		FD_SET(fd, manager->write_fds);
361 
362 	return (result);
363 }
364 
365 static inline isc_result_t
366 unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
367 	isc_result_t result = ISC_R_SUCCESS;
368 
369 	if (msg == SELECT_POKE_READ)
370 		FD_CLR(fd, manager->read_fds);
371 	else if (msg == SELECT_POKE_WRITE)
372 		FD_CLR(fd, manager->write_fds);
373 
374 	return (result);
375 }
376 
377 static void
378 wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
379 	isc_result_t result;
380 
381 	/*
382 	 * This is a wakeup on a socket.  If the socket is not in the
383 	 * process of being closed, start watching it for either reads
384 	 * or writes.
385 	 */
386 
387 	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
388 
389 	if (msg == SELECT_POKE_CLOSE) {
390 		/* No one should be updating fdstate, so no need to lock it */
391 		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
392 		manager->fdstate[fd] = CLOSED;
393 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
394 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
395 		(void)close(fd);
396 		return;
397 	}
398 
399 	if (manager->fdstate[fd] == CLOSE_PENDING) {
400 
401 		/*
402 		 * We accept (and ignore) any error from unwatch_fd() as we are
403 		 * closing the socket, hoping it doesn't leave dangling state in
404 		 * the kernel.
405 		 */
406 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
407 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
408 		return;
409 	}
410 	if (manager->fdstate[fd] != MANAGED) {
411 		return;
412 	}
413 
414 	/*
415 	 * Set requested bit.
416 	 */
417 	result = watch_fd(manager, fd, msg);
418 	if (result != ISC_R_SUCCESS) {
419 		/*
420 		 * XXXJT: what should we do?  Ignoring the failure of watching
421 		 * a socket will make the application dysfunctional, but there
422 		 * seems to be no reasonable recovery process.
423 		 */
424 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
425 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
426 			      "failed to start watching FD (%d): %s",
427 			      fd, isc_result_totext(result));
428 	}
429 }
430 
431 /*
432  * Update the state of the socketmgr when something changes.
433  */
434 static void
435 select_poke(isc__socketmgr_t *manager, int fd, int msg) {
436 	if (msg == SELECT_POKE_SHUTDOWN)
437 		return;
438 	else if (fd >= 0)
439 		wakeup_socket(manager, fd, msg);
440 	return;
441 }
442 
443 /*
444  * Make an fd non-blocking.
445  */
446 static isc_result_t
447 make_nonblock(int fd) {
448 	int ret;
449 	int flags;
450 
451 	flags = fcntl(fd, F_GETFL, 0);
452 	flags |= O_NONBLOCK;
453 	ret = fcntl(fd, F_SETFL, flags);
454 
455 	if (ret == -1) {
456 		UNEXPECTED_ERROR(__FILE__, __LINE__,
457 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
458 				 strerror(errno));
459 		return (ISC_R_UNEXPECTED);
460 	}
461 
462 	return (ISC_R_SUCCESS);
463 }
464 
465 /*
466  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
467  * In order to ensure as much portability as possible, we provide wrapper
468  * functions for these macros.
469  * Note that cmsg_space() can be slow on OSes that do not have
470  * CMSG_SPACE.
471  */
472 static inline socklen_t
473 cmsg_len(socklen_t len) {
474 	return (CMSG_LEN(len));
475 }
476 
477 static inline socklen_t
478 cmsg_space(socklen_t len) {
479 	return (CMSG_SPACE(len));
480 }
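/*
 * Illustrative use of the wrappers (a sketch of the pattern used by
 * build_msghdr_send() below): msg_controllen is sized with cmsg_space(),
 * while each individual control message header gets cmsg_len().
 *
 *	msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
 *	cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
 */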
481 
482 /*
483  * Process control messages received on a socket.
484  */
485 static void
486 process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
487 	struct cmsghdr *cmsgp;
488 	struct in6_pktinfo *pktinfop;
489 	void *timevalp;
490 
491 	/*
492 	 * Historically, sock, msg and dev were referenced only under
493 	 * ISC_NET_BSD44MSGHDR/USE_CMSG conditionals; the UNUSED() calls
494 	 * below are left over from that era even though all three arguments
495 	 * are now used unconditionally.
496 	 */
497 	UNUSED(sock);
498 	UNUSED(msg);
499 	UNUSED(dev);
500 
501 	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
502 		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
503 
504 	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
505 		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
506 
507 	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
508 		return;
509 
510 	timevalp = NULL;
511 	pktinfop = NULL;
512 
513 	cmsgp = CMSG_FIRSTHDR(msg);
514 	while (cmsgp != NULL) {
515 		socket_log(sock, NULL, TRACE,
516 			   "processing cmsg %p", cmsgp);
517 
518 		if (cmsgp->cmsg_level == IPPROTO_IPV6
519 		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
520 
521 			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
522 			memmove(&dev->pktinfo, pktinfop,
523 				sizeof(struct in6_pktinfo));
524 			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
525 			socket_log(sock, NULL, TRACE,
526 				   "interface received on ifindex %u",
527 				   dev->pktinfo.ipi6_ifindex);
528 			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
529 				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
530 			goto next;
531 		}
532 
533 		if (cmsgp->cmsg_level == SOL_SOCKET
534 		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
535 			struct timeval tv;
536 			timevalp = CMSG_DATA(cmsgp);
537 			memmove(&tv, timevalp, sizeof(tv));
538 			dev->timestamp.seconds = tv.tv_sec;
539 			dev->timestamp.nanoseconds = tv.tv_usec * 1000;
540 			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
541 			goto next;
542 		}
543 
544 		if (cmsgp->cmsg_level == IPPROTO_IPV6
545 		    && cmsgp->cmsg_type == IPV6_TCLASS) {
546 			dev->dscp = *(int *)CMSG_DATA(cmsgp);
547 			dev->dscp >>= 2;
548 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
549 			goto next;
550 		}
551 
552 		if (cmsgp->cmsg_level == IPPROTO_IP
553 		    && (cmsgp->cmsg_type == IP_TOS)) {
554 			dev->dscp = (int) *(unsigned char *)CMSG_DATA(cmsgp);
555 			dev->dscp >>= 2;
556 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
557 			goto next;
558 		}
559 	next:
560 		cmsgp = CMSG_NXTHDR(msg, cmsgp);
561 	}
562 
563 }
564 
565 /*
566  * Construct an iov array and attach it to the msghdr passed in.  This is
567  * the SEND constructor, which will use the used region of the buffer
568  * (if using a buffer list) or will use the internal region (if a single
569  * buffer I/O is requested).
570  *
571  * Nothing can be NULL, and the done event must list at least one buffer
572  * on the buffer linked list for this function to be meaningful.
573  *
574  * If write_countp != NULL, *write_countp will hold the number of bytes
575  * this transaction can send.
576  */
577 static void
578 build_msghdr_send(isc__socket_t *sock, char* cmsgbuf, isc_socketevent_t *dev,
579 		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
580 {
581 	unsigned int iovcount;
582 	isc_buffer_t *buffer;
583 	isc_region_t used;
584 	size_t write_count;
585 	size_t skip_count;
586 	struct cmsghdr *cmsgp;
587 
588 	memset(msg, 0, sizeof(*msg));
589 
590 	if (!sock->connected) {
591 		msg->msg_name = (void *)&dev->address.type.sa;
592 		msg->msg_namelen = dev->address.length;
593 	} else {
594 		msg->msg_name = NULL;
595 		msg->msg_namelen = 0;
596 	}
597 
598 	buffer = ISC_LIST_HEAD(dev->bufferlist);
599 	write_count = 0;
600 	iovcount = 0;
601 
602 	/*
603 	 * Single buffer I/O?  Skip what we've done so far in this region.
604 	 */
605 	if (buffer == NULL) {
606 		write_count = dev->region.length - dev->n;
607 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
608 		iov[0].iov_len = write_count;
609 		iovcount = 1;
610 
611 		goto config;
612 	}
613 
614 	/*
615 	 * Multibuffer I/O.
616 	 * Skip the data in the buffer list that we have already written.
617 	 */
618 	skip_count = dev->n;
619 	while (buffer != NULL) {
620 		REQUIRE(ISC_BUFFER_VALID(buffer));
621 		if (skip_count < isc_buffer_usedlength(buffer))
622 			break;
623 		skip_count -= isc_buffer_usedlength(buffer);
624 		buffer = ISC_LIST_NEXT(buffer, link);
625 	}
626 
627 	while (buffer != NULL) {
628 		INSIST(iovcount < MAXSCATTERGATHER_SEND);
629 
630 		isc_buffer_usedregion(buffer, &used);
631 
632 		if (used.length > 0) {
633 			iov[iovcount].iov_base = (void *)(used.base
634 							  + skip_count);
635 			iov[iovcount].iov_len = used.length - skip_count;
636 			write_count += (used.length - skip_count);
637 			skip_count = 0;
638 			iovcount++;
639 		}
640 		buffer = ISC_LIST_NEXT(buffer, link);
641 	}
642 
643 	INSIST(skip_count == 0U);
644 
645  config:
646 	msg->msg_iov = iov;
647 	msg->msg_iovlen = iovcount;
648 
649 	msg->msg_control = NULL;
650 	msg->msg_controllen = 0;
651 	msg->msg_flags = 0;
652 
653 	if ((sock->type == isc_sockettype_udp) &&
654 	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
655 	{
656 		struct in6_pktinfo *pktinfop;
657 
658 		socket_log(sock, NULL, TRACE,
659 			   "sendto pktinfo data, ifindex %u",
660 			   dev->pktinfo.ipi6_ifindex);
661 
662 		msg->msg_control = (void *)cmsgbuf;
663 		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
664 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
665 
666 		cmsgp = (struct cmsghdr *)cmsgbuf;
667 		cmsgp->cmsg_level = IPPROTO_IPV6;
668 		cmsgp->cmsg_type = IPV6_PKTINFO;
669 		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
670 		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
671 		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
672 	}
673 
674 	if ((sock->type == isc_sockettype_udp) &&
675 	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
676 	{
677 		int use_min_mtu = 1;	/* -1, 0, 1 */
678 
679 		cmsgp = (struct cmsghdr *)(cmsgbuf +
680 					   msg->msg_controllen);
681 
682 		msg->msg_control = (void *)cmsgbuf;
683 		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
684 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
685 
686 		cmsgp->cmsg_level = IPPROTO_IPV6;
687 		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
688 		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
689 		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
690 	}
691 
692 	if (isc_dscp_check_value > -1) {
693 		if (sock->type == isc_sockettype_udp)
694 			INSIST((int)dev->dscp == isc_dscp_check_value);
695 		else if (sock->type == isc_sockettype_tcp)
696 			INSIST((int)sock->dscp == isc_dscp_check_value);
697 	}
698 
699 	if ((sock->type == isc_sockettype_udp) &&
700 	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
701 	{
702 		int dscp = (dev->dscp << 2) & 0xff;
703 
704 		INSIST(dev->dscp < 0x40);
705 
706 		if (sock->pf == AF_INET && sock->pktdscp) {
707 			cmsgp = (struct cmsghdr *)(cmsgbuf +
708 						   msg->msg_controllen);
709 			msg->msg_control = (void *)cmsgbuf;
710 			msg->msg_controllen += cmsg_space(sizeof(dscp));
711 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
712 
713 			cmsgp->cmsg_level = IPPROTO_IP;
714 			cmsgp->cmsg_type = IP_TOS;
715 			cmsgp->cmsg_len = cmsg_len(sizeof(char));
716 			*(unsigned char*)CMSG_DATA(cmsgp) = dscp;
717 		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
718 			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
719 			       (void *)&dscp, sizeof(int)) < 0)
720 			{
721 				UNEXPECTED_ERROR(__FILE__, __LINE__,
722 						 "setsockopt(%d, IP_TOS, %.02x)"
723 						 " %s: %s",
724 						 sock->fd, dscp >> 2,
725 						 "failed", strerror(errno));
726 			} else
727 				sock->dscp = dscp;
728 		}
729 
730 		if (sock->pf == AF_INET6 && sock->pktdscp) {
731 			cmsgp = (struct cmsghdr *)(cmsgbuf +
732 						   msg->msg_controllen);
733 			msg->msg_control = (void *)cmsgbuf;
734 			msg->msg_controllen += cmsg_space(sizeof(dscp));
735 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
736 
737 			cmsgp->cmsg_level = IPPROTO_IPV6;
738 			cmsgp->cmsg_type = IPV6_TCLASS;
739 			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
740 			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
741 		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
742 			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
743 				       (void *)&dscp, sizeof(int)) < 0) {
744 				UNEXPECTED_ERROR(__FILE__, __LINE__,
745 						 "setsockopt(%d, IPV6_TCLASS, "
746 						 "%.02x) %s: %s",
747 						 sock->fd, dscp >> 2,
748 						 "failed", strerror(errno));
749 			} else
750 				sock->dscp = dscp;
751 		}
752 
753 		if (msg->msg_controllen != 0 &&
754 		    msg->msg_controllen < SENDCMSGBUFLEN)
755 		{
756 			memset(cmsgbuf + msg->msg_controllen, 0,
757 			       SENDCMSGBUFLEN - msg->msg_controllen);
758 		}
759 	}
760 
761 	if (write_countp != NULL)
762 		*write_countp = write_count;
763 }
764 
765 /*
766  * Construct an iov array and attach it to the msghdr passed in.  This is
767  * the RECV constructor, which will use the available region of the buffer
768  * (if using a buffer list) or will use the internal region (if a single
769  * buffer I/O is requested).
770  *
771  * Nothing can be NULL, and the done event must list at least one buffer
772  * on the buffer linked list for this function to be meaningful.
773  *
774  * If read_countp != NULL, *read_countp will hold the number of bytes
775  * this transaction can receive.
776  */
777 static void
778 build_msghdr_recv(isc__socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
779 		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
780 {
781 	unsigned int iovcount;
782 	isc_buffer_t *buffer;
783 	isc_region_t available;
784 	size_t read_count;
785 
786 	memset(msg, 0, sizeof(struct msghdr));
787 
788 	if (sock->type == isc_sockettype_udp) {
789 		memset(&dev->address, 0, sizeof(dev->address));
790 		msg->msg_name = (void *)&dev->address.type.sa;
791 		msg->msg_namelen = sizeof(dev->address.type);
792 	} else { /* TCP */
793 		msg->msg_name = NULL;
794 		msg->msg_namelen = 0;
795 		dev->address = sock->peer_address;
796 	}
797 
798 	buffer = ISC_LIST_HEAD(dev->bufferlist);
799 	read_count = 0;
800 
801 	/*
802 	 * Single buffer I/O?  Skip what we've done so far in this region.
803 	 */
804 	if (buffer == NULL) {
805 		read_count = dev->region.length - dev->n;
806 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
807 		iov[0].iov_len = read_count;
808 		iovcount = 1;
809 
810 		goto config;
811 	}
812 
813 	/*
814 	 * Multibuffer I/O.
815 	 * Skip empty buffers.
816 	 */
817 	while (buffer != NULL) {
818 		REQUIRE(ISC_BUFFER_VALID(buffer));
819 		if (isc_buffer_availablelength(buffer) != 0)
820 			break;
821 		buffer = ISC_LIST_NEXT(buffer, link);
822 	}
823 
824 	iovcount = 0;
825 	while (buffer != NULL) {
826 		INSIST(iovcount < MAXSCATTERGATHER_RECV);
827 
828 		isc_buffer_availableregion(buffer, &available);
829 
830 		if (available.length > 0) {
831 			iov[iovcount].iov_base = (void *)(available.base);
832 			iov[iovcount].iov_len = available.length;
833 			read_count += available.length;
834 			iovcount++;
835 		}
836 		buffer = ISC_LIST_NEXT(buffer, link);
837 	}
838 
839  config:
840 
841 	/*
842 	 * Attach the iovec array and the control-message buffer to the msghdr.
843 	 */
844 	msg->msg_iov = iov;
845 	msg->msg_iovlen = iovcount;
846 
847 	msg->msg_control = cmsgbuf;
848 	msg->msg_controllen = RECVCMSGBUFLEN;
849 	msg->msg_flags = 0;
850 
851 	if (read_countp != NULL)
852 		*read_countp = read_count;
853 }
854 
855 static void
856 set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock,
857 		isc_socketevent_t *dev)
858 {
859 	if (sock->type == isc_sockettype_udp) {
860 		if (address != NULL)
861 			dev->address = *address;
862 		else
863 			dev->address = sock->peer_address;
864 	} else if (sock->type == isc_sockettype_tcp) {
865 		INSIST(address == NULL);
866 		dev->address = sock->peer_address;
867 	}
868 }
869 
870 static void
871 destroy_socketevent(isc_event_t *event) {
872 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
873 
874 	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
875 
876 	(ev->destroy)(event);
877 }
878 
879 static isc_socketevent_t *
880 allocate_socketevent(void *sender,
881 		     isc_eventtype_t eventtype, isc_taskaction_t action,
882 		     void *arg)
883 {
884 	isc_socketevent_t *ev;
885 
886 	ev = (isc_socketevent_t *)isc_event_allocate(sender,
887 						     eventtype, action, arg,
888 						     sizeof(*ev));
889 
890 	if (ev == NULL)
891 		return (NULL);
892 
893 	ev->result = ISC_R_UNSET;
894 	ISC_LINK_INIT(ev, ev_link);
895 	ISC_LIST_INIT(ev->bufferlist);
896 	ev->region.base = NULL;
897 	ev->n = 0;
898 	ev->offset = 0;
899 	ev->attributes = 0;
900 	ev->destroy = ev->ev_destroy;
901 	ev->ev_destroy = destroy_socketevent;
902 	ev->dscp = 0;
903 
904 	return (ev);
905 }
906 
907 #define DOIO_SUCCESS		0	/* i/o ok, event sent */
908 #define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
909 #define DOIO_HARD		2	/* i/o error, event sent */
910 #define DOIO_EOF		3	/* EOF, no event sent */
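/*
 * A rough sketch of how internal_recv() (below) reacts to these codes;
 * internal_send() is analogous but has no EOF case:
 *
 *	switch (doio_recv(sock, dev)) {
 *	case DOIO_SOFT:		// incomplete; poll the fd again later
 *		break;
 *	case DOIO_EOF:		// flag ISC_R_EOF and post the queued events
 *	case DOIO_SUCCESS:	// dev->result == ISC_R_SUCCESS
 *	case DOIO_HARD:		// dev->result holds the error
 *		send_recvdone_event(sock, &dev);
 *		break;
 *	}
 */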
911 
912 static int
913 doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
914 	int cc;
915 	struct iovec iov[MAXSCATTERGATHER_RECV];
916 	size_t read_count;
917 	size_t actual_count;
918 	struct msghdr msghdr;
919 	isc_buffer_t *buffer;
920 	int recv_errno;
921 	char cmsgbuf[RECVCMSGBUFLEN] = {0};
922 
923 	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);
924 
925 	cc = recvmsg(sock->fd, &msghdr, 0);
926 	recv_errno = errno;
927 
928 	if (cc < 0) {
929 		if (SOFT_ERROR(recv_errno))
930 			return (DOIO_SOFT);
931 
932 		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
933 			socket_log(sock, NULL, IOEVENT,
934 				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
935 				   sock->fd, cc, recv_errno,
936 				   strerror(recv_errno));
937 		}
938 
939 #define SOFT_OR_HARD(_system, _isc) \
940 	if (recv_errno == _system) { \
941 		if (sock->connected) { \
942 			dev->result = _isc; \
943 			return (DOIO_HARD); \
944 		} \
945 		return (DOIO_SOFT); \
946 	}
947 #define ALWAYS_HARD(_system, _isc) \
948 	if (recv_errno == _system) { \
949 		dev->result = _isc; \
950 		return (DOIO_HARD); \
951 	}
952 
953 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
954 		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
955 		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
956 		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
957 		/* HPUX 11.11 can return EADDRNOTAVAIL. */
958 		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
959 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
960 		/* Should never get this one but it was seen. */
961 		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
962 		/*
963 		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
964 		 * errors.
965 		 */
966 		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
967 		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
968 
969 #undef SOFT_OR_HARD
970 #undef ALWAYS_HARD
971 
972 		dev->result = isc__errno2result(recv_errno);
973 		return (DOIO_HARD);
974 	}
975 
976 	/*
977 	 * On TCP and UNIX sockets, zero length reads indicate EOF,
978 	 * while on UDP sockets, zero length reads are perfectly valid,
979 	 * although strange.
980 	 */
981 	switch (sock->type) {
982 	case isc_sockettype_tcp:
983 		if (cc == 0)
984 			return (DOIO_EOF);
985 		break;
986 	case isc_sockettype_udp:
987 		break;
988 	default:
989 		INSIST(0);
990 	}
991 
992 	if (sock->type == isc_sockettype_udp) {
993 		dev->address.length = msghdr.msg_namelen;
994 		if (isc_sockaddr_getport(&dev->address) == 0) {
995 			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
996 				socket_log(sock, &dev->address, IOEVENT,
997 					   "dropping source port zero packet");
998 			}
999 			return (DOIO_SOFT);
1000 		}
1001 	}
1002 
1003 	socket_log(sock, &dev->address, IOEVENT,
1004 		   "packet received correctly");
1005 
1006 	/*
1007 	 * Overflow detection: if the datagram was larger than the space we
1008 	 * supplied, process_cmsg() flags it for us via the MSG_TRUNC flag
1009 	 * (ISC_SOCKEVENTATTR_TRUNC).
1010 	 */
1011 	/*
1012 	 * If there are control messages attached, run through them and pull
1013 	 * out the interesting bits.
1014 	 */
1015 	process_cmsg(sock, &msghdr, dev);
1016 
1017 	/*
1018 	 * update the buffers (if any) and the i/o count
1019 	 */
1020 	dev->n += cc;
1021 	actual_count = cc;
1022 	buffer = ISC_LIST_HEAD(dev->bufferlist);
1023 	while (buffer != NULL && actual_count > 0U) {
1024 		REQUIRE(ISC_BUFFER_VALID(buffer));
1025 		if (isc_buffer_availablelength(buffer) <= actual_count) {
1026 			actual_count -= isc_buffer_availablelength(buffer);
1027 			isc_buffer_add(buffer,
1028 				       isc_buffer_availablelength(buffer));
1029 		} else {
1030 			isc_buffer_add(buffer, actual_count);
1031 			actual_count = 0;
1032 			POST(actual_count);
1033 			break;
1034 		}
1035 		buffer = ISC_LIST_NEXT(buffer, link);
1036 		if (buffer == NULL) {
1037 			INSIST(actual_count == 0U);
1038 		}
1039 	}
1040 
1041 	/*
1042 	 * If we read less than we expected, update counters,
1043 	 * and let the upper layer poke the descriptor.
1044 	 */
1045 	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1046 		return (DOIO_SOFT);
1047 
1048 	/*
1049 	 * Full reads are posted, or partials if partials are ok.
1050 	 */
1051 	dev->result = ISC_R_SUCCESS;
1052 	return (DOIO_SUCCESS);
1053 }
1054 
1055 /*
1056  * Returns:
1057  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1058  *			ISC_R_SUCCESS.
1059  *
1060  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1061  *			dev->result contains the appropriate error.
1062  *
1063  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1064  *			event was sent.  The operation should be retried.
1065  *
1066  *	No other return values are possible.
1067  */
1068 static int
1069 doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
1070 	int cc;
1071 	struct iovec iov[MAXSCATTERGATHER_SEND];
1072 	size_t write_count;
1073 	struct msghdr msghdr;
1074 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1075 	int attempts = 0;
1076 	int send_errno;
1077 	char cmsgbuf[SENDCMSGBUFLEN] = {0};
1078 
1079 	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
1080 
1081  resend:
1082 	cc = sendmsg(sock->fd, &msghdr, 0);
1083 	send_errno = errno;
1084 
1085 	/*
1086 	 * Check for error or block condition.
1087 	 */
1088 	if (cc < 0) {
1089 		if (send_errno == EINTR && ++attempts < NRETRIES)
1090 			goto resend;
1091 
1092 		if (SOFT_ERROR(send_errno)) {
1093 			if (errno == EWOULDBLOCK || errno == EAGAIN)
1094 				dev->result = ISC_R_WOULDBLOCK;
1095 			return (DOIO_SOFT);
1096 		}
1097 
1098 #define SOFT_OR_HARD(_system, _isc) \
1099 	if (send_errno == _system) { \
1100 		if (sock->connected) { \
1101 			dev->result = _isc; \
1102 			return (DOIO_HARD); \
1103 		} \
1104 		return (DOIO_SOFT); \
1105 	}
1106 #define ALWAYS_HARD(_system, _isc) \
1107 	if (send_errno == _system) { \
1108 		dev->result = _isc; \
1109 		return (DOIO_HARD); \
1110 	}
1111 
1112 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1113 		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1114 		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1115 		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1116 		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1117 		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1118 		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1119 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1120 		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1121 		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1122 		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1123 
1124 #undef SOFT_OR_HARD
1125 #undef ALWAYS_HARD
1126 
1127 		/*
1128 		 * The other error types depend on whether or not the
1129 		 * socket is UDP or TCP.  If it is UDP, some errors
1130 		 * that we expect to be fatal under TCP are merely
1131 		 * annoying, and are really soft errors.
1132 		 *
1133 		 * However, these soft errors are still returned as
1134 		 * a status.
1135 		 */
1136 		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1137 		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1138 				 addrbuf, strerror(send_errno));
1139 		dev->result = isc__errno2result(send_errno);
1140 		return (DOIO_HARD);
1141 	}
1142 
1143 	if (cc == 0) {
1144 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1145 				 "doio_send: send() %s 0", "returned");
1146 	}
1147 
1148 	/*
1149 	 * If we write less than we expected, update counters, poke.
1150 	 */
1151 	dev->n += cc;
1152 	if ((size_t)cc != write_count)
1153 		return (DOIO_SOFT);
1154 
1155 	/*
1156 	 * Exactly what we wanted to write.  We're done with this
1157 	 * entry.  Post its completion event.
1158 	 */
1159 	dev->result = ISC_R_SUCCESS;
1160 	return (DOIO_SUCCESS);
1161 }
1162 
1163 /*
1164  * Kill.
1165  *
1166  * Caller must ensure that the socket is not locked and no external
1167  * references exist.
1168  */
1169 static void
1170 socketclose(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) {
1171 	/*
1172 	 * No one has this socket open, so the watcher doesn't have to be
1173 	 * poked, and the socket doesn't have to be locked.
1174 	 */
1175 	manager->fds[fd] = NULL;
1176 	manager->fdstate[fd] = CLOSE_PENDING;
1177 	select_poke(manager, fd, SELECT_POKE_CLOSE);
1178 
1179 	if (sock->active == 1) {
1180 		sock->active = 0;
1181 	}
1182 
1183 	/*
1184 	 * update manager->maxfd here (XXX: this should be implemented more
1185 	 * efficiently)
1186 	 */
1187 	if (manager->maxfd == fd) {
1188 		int i;
1189 
1190 		manager->maxfd = 0;
1191 		for (i = fd - 1; i >= 0; i--) {
1192 			if (manager->fdstate[i] == MANAGED) {
1193 				manager->maxfd = i;
1194 				break;
1195 			}
1196 		}
1197 	}
1198 
1199 }
1200 
1201 static void
1202 destroy(isc__socket_t **sockp) {
1203 	int fd;
1204 	isc__socket_t *sock = *sockp;
1205 	isc__socketmgr_t *manager = sock->manager;
1206 
1207 	socket_log(sock, NULL, CREATION, "destroying");
1208 
1209 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1210 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1211 	INSIST(sock->connect_ev == NULL);
1212 	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1213 
1214 	if (sock->fd >= 0) {
1215 		fd = sock->fd;
1216 		sock->fd = -1;
1217 		socketclose(manager, sock, fd);
1218 	}
1219 
1220 	ISC_LIST_UNLINK(manager->socklist, sock, link);
1221 
1222 	/* can't unlock manager as its memory context is still used */
1223 	free_socket(sockp);
1224 }
1225 
1226 static isc_result_t
1227 allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
1228 		isc__socket_t **socketp)
1229 {
1230 	isc__socket_t *sock;
1231 
1232 	sock = malloc(sizeof(*sock));
1233 
1234 	if (sock == NULL)
1235 		return (ISC_R_NOMEMORY);
1236 
1237 	sock->common.magic = 0;
1238 	sock->common.impmagic = 0;
1239 	sock->references = 0;
1240 
1241 	sock->manager = manager;
1242 	sock->type = type;
1243 	sock->fd = -1;
1244 	sock->dscp = 0;		/* TOS/TCLASS is zero until set. */
1245 	sock->active = 0;
1246 
1247 	ISC_LINK_INIT(sock, link);
1248 
1249 	/*
1250 	 * Set up list of readers and writers to be initially empty.
1251 	 */
1252 	ISC_LIST_INIT(sock->recv_list);
1253 	ISC_LIST_INIT(sock->send_list);
1254 	sock->connect_ev = NULL;
1255 	sock->pending_recv = 0;
1256 	sock->pending_send = 0;
1257 	sock->connected = 0;
1258 	sock->connecting = 0;
1259 	sock->bound = 0;
1260 	sock->pktdscp = 0;
1261 
1262 	/*
1263 	 * Initialize readable and writable events.
1264 	 */
1265 	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1266 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1267 		       NULL, sock, sock, NULL);
1268 	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1269 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1270 		       NULL, sock, sock, NULL);
1271 
1272 	sock->common.magic = ISCAPI_SOCKET_MAGIC;
1273 	sock->common.impmagic = SOCKET_MAGIC;
1274 	*socketp = sock;
1275 
1276 	return (ISC_R_SUCCESS);
1277 }
1278 
1279 /*
1280  * This routine requires that the various lists be empty, that the reference
1281  * count be zero, and that the magic number is valid.  The other socket bits,
1282  * like the lock, must be initialized as well.  The fd associated with the
1283  * socket must already have been closed and set to -1; this routine only
1284  * frees the structure.
1285  */
1286 static void
1287 free_socket(isc__socket_t **socketp) {
1288 	isc__socket_t *sock = *socketp;
1289 
1290 	INSIST(VALID_SOCKET(sock));
1291 	INSIST(sock->references == 0);
1292 	INSIST(!sock->connecting);
1293 	INSIST(!sock->pending_recv);
1294 	INSIST(!sock->pending_send);
1295 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1296 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1297 	INSIST(!ISC_LINK_LINKED(sock, link));
1298 
1299 	sock->common.magic = 0;
1300 	sock->common.impmagic = 0;
1301 
1302 	free(sock);
1303 
1304 	*socketp = NULL;
1305 }
1306 
1307 static void
1308 use_min_mtu(isc__socket_t *sock) {
1309 	/* use minimum MTU */
1310 	if (sock->pf == AF_INET6) {
1311 		int on = 1;
1312 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
1313 				(void *)&on, sizeof(on));
1314 	}
1315 }
1316 
1317 static void
1318 set_tcp_maxseg(isc__socket_t *sock, int size) {
1319 	if (sock->type == isc_sockettype_tcp)
1320 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
1321 				(void *)&size, sizeof(size));
1322 }
1323 
1324 static isc_result_t
1325 opensocket(isc__socket_t *sock)
1326 {
1327 	isc_result_t result;
1328 	const char *err = "socket";
1329 	int on = 1;
1330 
1331 	switch (sock->type) {
1332 	case isc_sockettype_udp:
1333 		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
1334 		break;
1335 	case isc_sockettype_tcp:
1336 		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
1337 		break;
1338 	}
1339 
1340 	if (sock->fd < 0) {
1341 		switch (errno) {
1342 		case EMFILE:
1343 		case ENFILE:
1344 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1345 				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1346 				       "%s: %s", err, strerror(errno));
1347 			/* fallthrough */
1348 		case ENOBUFS:
1349 			return (ISC_R_NORESOURCES);
1350 
1351 		case EPROTONOSUPPORT:
1352 		case EPFNOSUPPORT:
1353 		case EAFNOSUPPORT:
1354 		/*
1355 		 * Linux 2.2 (and maybe others) return EINVAL instead of
1356 		 * EAFNOSUPPORT.
1357 		 */
1358 		case EINVAL:
1359 			return (ISC_R_FAMILYNOSUPPORT);
1360 
1361 		default:
1362 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1363 					 "%s() %s: %s", err, "failed",
1364 					 strerror(errno));
1365 			return (ISC_R_UNEXPECTED);
1366 		}
1367 	}
1368 
1369 	result = make_nonblock(sock->fd);
1370 	if (result != ISC_R_SUCCESS) {
1371 		(void)close(sock->fd);
1372 		return (result);
1373 	}
1374 
1375 	/*
1376 	 * Use minimum mtu if possible.
1377 	 */
1378 	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
1379 		use_min_mtu(sock);
1380 		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
1381 	}
1382 
1383 	if (sock->type == isc_sockettype_udp) {
1384 
1385 		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1386 			       (void *)&on, sizeof(on)) < 0
1387 		    && errno != ENOPROTOOPT) {
1388 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1389 					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1390 					 sock->fd, "failed", strerror(errno));
1391 			/* Press on... */
1392 		}
1393 
1394 		/* RFC 3542 */
1395 		if ((sock->pf == AF_INET6)
1396 		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1397 				   (void *)&on, sizeof(on)) < 0)) {
1398 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1399 					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1400 					 "%s: %s", sock->fd, "failed",
1401 					 strerror(errno));
1402 		}
1403 	}
1404 
1405 	if (sock->active == 0) {
1406 		sock->active = 1;
1407 	}
1408 
1409 	return (ISC_R_SUCCESS);
1410 }
1411 
1412 /*
1413  * Create a socket of protocol family 'pf' and type 'type', managed
1414  * by 'manager'.  The task, action and arg for completion events are
1415  * supplied per I/O operation (see isc__socket_recvv() and friends)
1416  * rather than here.  The new socket is returned in 'socketp'.
1417  */
1418 static isc_result_t
1419 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1420 	      isc_socket_t **socketp)
1421 {
1422 	isc__socket_t *sock = NULL;
1423 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
1424 	isc_result_t result;
1425 
1426 	REQUIRE(VALID_MANAGER(manager));
1427 	REQUIRE(socketp != NULL && *socketp == NULL);
1428 
1429 	result = allocate_socket(manager, type, &sock);
1430 	if (result != ISC_R_SUCCESS)
1431 		return (result);
1432 
1433 	switch (sock->type) {
1434 	case isc_sockettype_udp:
1435 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
1436 		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
1437 		break;
1438 	case isc_sockettype_tcp:
1439 		break;
1440 	default:
1441 		INSIST(0);
1442 	}
1443 
1444 	sock->pf = pf;
1445 
1446 	result = opensocket(sock);
1447 	if (result != ISC_R_SUCCESS) {
1448 		free_socket(&sock);
1449 		return (result);
1450 	}
1451 
1452 	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
1453 	sock->references = 1;
1454 	*socketp = (isc_socket_t *)sock;
1455 
1456 	/*
1457 	 * Note we don't have to lock the socket like we normally would because
1458 	 * there are no external references to it yet.
1459 	 */
1460 
1461 	manager->fds[sock->fd] = sock;
1462 	manager->fdstate[sock->fd] = MANAGED;
1463 
1464 	ISC_LIST_APPEND(manager->socklist, sock, link);
1465 	if (manager->maxfd < sock->fd)
1466 		manager->maxfd = sock->fd;
1467 
1468 	socket_log(sock, NULL, CREATION, "created");
1469 
1470 	return (ISC_R_SUCCESS);
1471 }
1472 
1473 /*%
1474  * Create a new 'pf'/'type' socket managed by 'manager'.  The task,
1475  * action and arg for completion events are supplied with each
1476  * individual I/O request rather than here.  The new socket is
1477  * returned in 'socketp'.
1478  */
1479 isc_result_t
1480 isc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1481 		   isc_socket_t **socketp)
1482 {
1483 	return (socket_create(manager0, pf, type, socketp));
1484 }
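/*
 * Typical caller lifecycle, as a sketch only (error handling elided;
 * 'manager' and 'localaddr' are assumed to have been set up by the
 * caller):
 *
 *	isc_socket_t *sock = NULL;
 *
 *	result = isc__socket_create(manager, PF_INET, isc_sockettype_udp,
 *				    &sock);
 *	if (result == ISC_R_SUCCESS) {
 *		result = isc__socket_bind(sock, &localaddr, 0);
 *		...
 *		isc__socket_detach(&sock);
 *	}
 */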
1485 
1486 /*
1487  * Attach to a socket.  Caller must explicitly detach when it is done.
1488  */
1489 void
1490 isc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
1491 	isc__socket_t *sock = (isc__socket_t *)sock0;
1492 
1493 	REQUIRE(VALID_SOCKET(sock));
1494 	REQUIRE(socketp != NULL && *socketp == NULL);
1495 
1496 	sock->references++;
1497 
1498 	*socketp = (isc_socket_t *)sock;
1499 }
1500 
1501 /*
1502  * Dereference a socket.  If this is the last reference to it, clean things
1503  * up by destroying the socket.
1504  */
1505 void
1506 isc__socket_detach(isc_socket_t **socketp) {
1507 	isc__socket_t *sock;
1508 	isc_boolean_t kill_socket = ISC_FALSE;
1509 
1510 	REQUIRE(socketp != NULL);
1511 	sock = (isc__socket_t *)*socketp;
1512 	REQUIRE(VALID_SOCKET(sock));
1513 
1514 	REQUIRE(sock->references > 0);
1515 	sock->references--;
1516 	if (sock->references == 0)
1517 		kill_socket = ISC_TRUE;
1518 
1519 	if (kill_socket)
1520 		destroy(&sock);
1521 
1522 	*socketp = NULL;
1523 }
1524 
1525 /*
1526  * I/O is possible on a given socket.  Schedule an event to this task that
1527  * will call an internal function to do the I/O.  This will charge the
1528  * task with the I/O operation and let our select loop handler get back
1529  * to doing something real as fast as possible.
1530  *
1531  * The socket and manager must be locked before calling this function.
1532  */
1533 static void
1534 dispatch_recv(isc__socket_t *sock) {
1535 	intev_t *iev;
1536 	isc_socketevent_t *ev;
1537 	isc_task_t *sender;
1538 
1539 	INSIST(!sock->pending_recv);
1540 
1541 	ev = ISC_LIST_HEAD(sock->recv_list);
1542 	if (ev == NULL)
1543 		return;
1544 	socket_log(sock, NULL, EVENT,
1545 		   "dispatch_recv:  event %p -> task %p",
1546 		   ev, ev->ev_sender);
1547 	sender = ev->ev_sender;
1548 
1549 	sock->pending_recv = 1;
1550 	iev = &sock->readable_ev;
1551 
1552 	sock->references++;
1553 	iev->ev_sender = sock;
1554 	iev->ev_action = internal_recv;
1555 	iev->ev_arg = sock;
1556 
1557 	isc_task_send(sender, (isc_event_t **)&iev);
1558 }
1559 
1560 static void
1561 dispatch_send(isc__socket_t *sock) {
1562 	intev_t *iev;
1563 	isc_socketevent_t *ev;
1564 	isc_task_t *sender;
1565 
1566 	INSIST(!sock->pending_send);
1567 
1568 	ev = ISC_LIST_HEAD(sock->send_list);
1569 	if (ev == NULL)
1570 		return;
1571 	socket_log(sock, NULL, EVENT,
1572 		   "dispatch_send:  event %p -> task %p",
1573 		   ev, ev->ev_sender);
1574 	sender = ev->ev_sender;
1575 
1576 	sock->pending_send = 1;
1577 	iev = &sock->writable_ev;
1578 
1579 	sock->references++;
1580 	iev->ev_sender = sock;
1581 	iev->ev_action = internal_send;
1582 	iev->ev_arg = sock;
1583 
1584 	isc_task_send(sender, (isc_event_t **)&iev);
1585 }
1586 
1587 static void
1588 dispatch_connect(isc__socket_t *sock) {
1589 	intev_t *iev;
1590 	isc_socket_connev_t *ev;
1591 
1592 	iev = &sock->writable_ev;
1593 
1594 	ev = sock->connect_ev;
1595 	INSIST(ev != NULL); /* XXX */
1596 
1597 	INSIST(sock->connecting);
1598 
1599 	sock->references++;  /* keep socket around for this internal event */
1600 	iev->ev_sender = sock;
1601 	iev->ev_action = internal_connect;
1602 	iev->ev_arg = sock;
1603 
1604 	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1605 }
1606 
1607 /*
1608  * Dequeue an item off the given socket's read queue, set the result code
1609  * in the done event to the one provided, and send it to the task it was
1610  * destined for.
1611  *
1612  * If the event to be sent is on a list, remove it before sending.  If
1613  * asked to, send and detach from the socket as well.
1614  *
1615  * Caller must have the socket locked if the event is attached to the socket.
1616  */
1617 static void
1618 send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
1619 	isc_task_t *task;
1620 
1621 	task = (*dev)->ev_sender;
1622 
1623 	(*dev)->ev_sender = sock;
1624 
1625 	if (ISC_LINK_LINKED(*dev, ev_link))
1626 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1627 
1628 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1629 	    == ISC_SOCKEVENTATTR_ATTACHED)
1630 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1631 	else
1632 		isc_task_send(task, (isc_event_t **)dev);
1633 }
1634 
1635 /*
1636  * See comments for send_recvdone_event() above.
1637  *
1638  * Caller must have the socket locked if the event is attached to the socket.
1639  */
1640 static void
1641 send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
1642 	isc_task_t *task;
1643 
1644 	INSIST(dev != NULL && *dev != NULL);
1645 
1646 	task = (*dev)->ev_sender;
1647 	(*dev)->ev_sender = sock;
1648 
1649 	if (ISC_LINK_LINKED(*dev, ev_link))
1650 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1651 
1652 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1653 	    == ISC_SOCKEVENTATTR_ATTACHED)
1654 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1655 	else
1656 		isc_task_send(task, (isc_event_t **)dev);
1657 }
1658 
1659 static void
1660 internal_recv(isc_task_t *me, isc_event_t *ev) {
1661 	isc_socketevent_t *dev;
1662 	isc__socket_t *sock;
1663 
1664 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
1665 
1666 	sock = ev->ev_sender;
1667 	INSIST(VALID_SOCKET(sock));
1668 
1669 	socket_log(sock, NULL, IOEVENT,
1670 		   "internal_recv: task %p got event %p", me, ev);
1671 
1672 	INSIST(sock->pending_recv == 1);
1673 	sock->pending_recv = 0;
1674 
1675 	INSIST(sock->references > 0);
1676 	sock->references--;  /* the internal event is done with this socket */
1677 	if (sock->references == 0) {
1678 		destroy(&sock);
1679 		return;
1680 	}
1681 
1682 	/*
1683 	 * Try to do as much I/O as possible on this socket.  There are no
1684 	 * limits here, currently.
1685 	 */
1686 	dev = ISC_LIST_HEAD(sock->recv_list);
1687 	while (dev != NULL) {
1688 		switch (doio_recv(sock, dev)) {
1689 		case DOIO_SOFT:
1690 			goto poke;
1691 
1692 		case DOIO_EOF:
1693 			/*
1694 			 * read of 0 means the remote end was closed.
1695 			 * Run through the event queue and dispatch all
1696 			 * the events with an EOF result code.
1697 			 */
1698 			do {
1699 				dev->result = ISC_R_EOF;
1700 				send_recvdone_event(sock, &dev);
1701 				dev = ISC_LIST_HEAD(sock->recv_list);
1702 			} while (dev != NULL);
1703 			goto poke;
1704 
1705 		case DOIO_SUCCESS:
1706 		case DOIO_HARD:
1707 			send_recvdone_event(sock, &dev);
1708 			break;
1709 		}
1710 
1711 		dev = ISC_LIST_HEAD(sock->recv_list);
1712 	}
1713 
1714  poke:
1715 	if (!ISC_LIST_EMPTY(sock->recv_list))
1716 		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1717 }
1718 
1719 static void
1720 internal_send(isc_task_t *me, isc_event_t *ev) {
1721 	isc_socketevent_t *dev;
1722 	isc__socket_t *sock;
1723 
1724 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
1725 
1726 	/*
1727 	 * Find out what socket this is and lock it.
1728 	 */
1729 	sock = (isc__socket_t *)ev->ev_sender;
1730 	INSIST(VALID_SOCKET(sock));
1731 	socket_log(sock, NULL, IOEVENT,
1732 		   "internal_send: task %p got event %p", me, ev);
1733 
1734 	INSIST(sock->pending_send == 1);
1735 	sock->pending_send = 0;
1736 
1737 	INSIST(sock->references > 0);
1738 	sock->references--;  /* the internal event is done with this socket */
1739 	if (sock->references == 0) {
1740 		destroy(&sock);
1741 		return;
1742 	}
1743 
1744 	/*
1745 	 * Try to do as much I/O as possible on this socket.  There are no
1746 	 * limits here, currently.
1747 	 */
1748 	dev = ISC_LIST_HEAD(sock->send_list);
1749 	while (dev != NULL) {
1750 		switch (doio_send(sock, dev)) {
1751 		case DOIO_SOFT:
1752 			goto poke;
1753 
1754 		case DOIO_HARD:
1755 		case DOIO_SUCCESS:
1756 			send_senddone_event(sock, &dev);
1757 			break;
1758 		}
1759 
1760 		dev = ISC_LIST_HEAD(sock->send_list);
1761 	}
1762 
1763  poke:
1764 	if (!ISC_LIST_EMPTY(sock->send_list))
1765 		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
1766 }
1767 
1768 /*
1769  * Process reads and writes on each fd here.  Avoid locking
1770  * and unlocking twice if both reads and writes are possible.
1771  */
1772 static void
1773 process_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable,
1774 	   isc_boolean_t writeable)
1775 {
1776 	isc__socket_t *sock;
1777 	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
1778 
1779 	/*
1780 	 * If the socket is going to be closed, don't do more I/O.
1781 	 */
1782 	if (manager->fdstate[fd] == CLOSE_PENDING) {
1783 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1784 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1785 		return;
1786 	}
1787 
1788 	sock = manager->fds[fd];
1789 	if (readable) {
1790 		if (sock == NULL) {
1791 			unwatch_read = ISC_TRUE;
1792 			goto check_write;
1793 		}
1794 		if (!SOCK_DEAD(sock)) {
1795 			dispatch_recv(sock);
1796 		}
1797 		unwatch_read = ISC_TRUE;
1798 	}
1799 check_write:
1800 	if (writeable) {
1801 		if (sock == NULL) {
1802 			unwatch_write = ISC_TRUE;
1803 			goto unlock_fd;
1804 		}
1805 		if (!SOCK_DEAD(sock)) {
1806 			if (sock->connecting)
1807 				dispatch_connect(sock);
1808 			else
1809 				dispatch_send(sock);
1810 		}
1811 		unwatch_write = ISC_TRUE;
1812 	}
1813 
1814  unlock_fd:
1815 	if (unwatch_read)
1816 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1817 	if (unwatch_write)
1818 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1819 
1820 }
1821 
1822 static void
1823 process_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds,
1824 	    fd_set *writefds)
1825 {
1826 	int i;
1827 
1828 	REQUIRE(maxfd <= (int)manager->maxsocks);
1829 
1830 	for (i = 0; i < maxfd; i++) {
1831 		process_fd(manager, i, FD_ISSET(i, readfds),
1832 			   FD_ISSET(i, writefds));
1833 	}
1834 }
1835 
1836 /*
1837  * Create a new socket manager.
1838  */
1839 
1840 static isc_result_t
1841 setup_watcher(isc__socketmgr_t *manager) {
1842 	isc_result_t result;
1843 
1844 	UNUSED(result);
1845 
1846 	manager->fd_bufsize = sizeof(fd_set);
1847 
1848 	manager->read_fds = NULL;
1849 	manager->read_fds_copy = NULL;
1850 	manager->write_fds = NULL;
1851 	manager->write_fds_copy = NULL;
1852 
1853 	manager->read_fds = malloc(manager->fd_bufsize);
1854 	if (manager->read_fds != NULL)
1855 		manager->read_fds_copy = malloc(manager->fd_bufsize);
1856 	if (manager->read_fds_copy != NULL)
1857 		manager->write_fds = malloc(manager->fd_bufsize);
1858 	if (manager->write_fds != NULL) {
1859 		manager->write_fds_copy = malloc(manager->fd_bufsize);
1860 	}
1861 	if (manager->write_fds_copy == NULL) {
1862 		if (manager->write_fds != NULL) {
1863 			free(manager->write_fds);
1864 		}
1865 		if (manager->read_fds_copy != NULL) {
1866 			free(manager->read_fds_copy);
1867 		}
1868 		if (manager->read_fds != NULL) {
1869 			free(manager->read_fds);
1870 		}
1871 		return (ISC_R_NOMEMORY);
1872 	}
1873 	memset(manager->read_fds, 0, manager->fd_bufsize);
1874 	memset(manager->write_fds, 0, manager->fd_bufsize);
1875 
1876 	manager->maxfd = 0;
1877 
1878 	return (ISC_R_SUCCESS);
1879 }
1880 
1881 static void
1882 cleanup_watcher(isc__socketmgr_t *manager) {
1883 
1884 	if (manager->read_fds != NULL)
1885 		free(manager->read_fds);
1886 	if (manager->read_fds_copy != NULL)
1887 		free(manager->read_fds_copy);
1888 	if (manager->write_fds != NULL)
1889 		free(manager->write_fds);
1890 	if (manager->write_fds_copy != NULL)
1891 		free(manager->write_fds_copy);
1892 }
1893 
1894 isc_result_t
1895 isc__socketmgr_create(isc_socketmgr_t **managerp) {
1896 	return (isc__socketmgr_create2(managerp, 0));
1897 }
1898 
1899 isc_result_t
1900 isc__socketmgr_create2(isc_socketmgr_t **managerp,
1901 		       unsigned int maxsocks)
1902 {
1903 	isc__socketmgr_t *manager;
1904 	isc_result_t result;
1905 
1906 	REQUIRE(managerp != NULL && *managerp == NULL);
1907 
1908 	if (socketmgr != NULL) {
1909 		/* Don't allow maxsocks to be updated */
1910 		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
1911 			return (ISC_R_EXISTS);
1912 
1913 		socketmgr->refs++;
1914 		*managerp = (isc_socketmgr_t *)socketmgr;
1915 		return (ISC_R_SUCCESS);
1916 	}
1917 
1918 	if (maxsocks == 0)
1919 		maxsocks = FD_SETSIZE;
1920 
1921 	manager = malloc(sizeof(*manager));
1922 	if (manager == NULL)
1923 		return (ISC_R_NOMEMORY);
1924 
1925 	/* Zero the structure so that any cleanup needed on failure is easy. */
1926 	memset(manager, 0, sizeof(*manager));
1927 	manager->maxsocks = maxsocks;
1928 	manager->fds = malloc(manager->maxsocks * sizeof(isc__socket_t *));
1929 	if (manager->fds == NULL) {
1930 		result = ISC_R_NOMEMORY;
1931 		goto free_manager;
1932 	}
1933 	manager->fdstate = malloc(manager->maxsocks * sizeof(int));
1934 	if (manager->fdstate == NULL) {
1935 		result = ISC_R_NOMEMORY;
1936 		goto free_manager;
1937 	}
1938 
1939 	manager->common.methods = &socketmgrmethods;
1940 	manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
1941 	manager->common.impmagic = SOCKET_MANAGER_MAGIC;
1942 	memset(manager->fds, 0, manager->maxsocks * sizeof(isc__socket_t *));
1943 	ISC_LIST_INIT(manager->socklist);
1944 
1945 	manager->refs = 1;
1946 
1947 	/*
1948 	 * Set up initial state for the select loop
1949 	 */
1950 	result = setup_watcher(manager);
1951 	if (result != ISC_R_SUCCESS)
1952 		goto cleanup;
1953 
1954 	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
1955 
1956 	socketmgr = manager;
1957 	*managerp = (isc_socketmgr_t *)manager;
1958 
1959 	return (ISC_R_SUCCESS);
1960 
1961 cleanup:
1962 
1963 free_manager:
1964 	if (manager->fdstate != NULL) {
1965 		free(manager->fdstate);
1966 	}
1967 	if (manager->fds != NULL) {
1968 		free(manager->fds);
1969 	}
1970 	free(manager);
1971 
1972 	return (result);
1973 }
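
/*
 * Illustrative sketch (not part of this file): typical creation and
 * teardown of the singleton manager through the public wrappers that
 * ../socket_api.c layers over isc__socketmgr_create()/_destroy(),
 * assuming the single-argument create used throughout this port.
 * Error handling is abbreviated.
 *
 *	isc_socketmgr_t *mgr = NULL;
 *
 *	if (isc_socketmgr_create(&mgr) != ISC_R_SUCCESS)
 *		return (1);
 *	... create sockets, queue I/O, run the event loop ...
 *	isc_socketmgr_destroy(&mgr);	(mgr is NULL again afterwards)
 */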
1974 
1975 void
1976 isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
1977 	isc__socketmgr_t *manager;
1978 	int i;
1979 
1980 	/*
1981 	 * Destroy a socket manager.
1982 	 */
1983 
1984 	REQUIRE(managerp != NULL);
1985 	manager = (isc__socketmgr_t *)*managerp;
1986 	REQUIRE(VALID_MANAGER(manager));
1987 
1988 	manager->refs--;
1989 	if (manager->refs > 0) {
1990 		*managerp = NULL;
1991 		return;
1992 	}
1993 	socketmgr = NULL;
1994 
1995 	/*
1996 	 * Wait for all sockets to be destroyed.
1997 	 */
1998 	while (!ISC_LIST_EMPTY(manager->socklist)) {
1999 		isc__taskmgr_dispatch(NULL);
2000 	}
2001 
2002 	/*
2003 	 * Here, poke our select/poll thread.  Do this by closing the write
2004 	 * half of the pipe, which will send EOF to the read half.
2005 	 * This is currently a no-op in the non-threaded case.
2006 	 */
2007 	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
2008 
2009 	/*
2010 	 * Clean up.
2011 	 */
2012 	cleanup_watcher(manager);
2013 
2014 	for (i = 0; i < (int)manager->maxsocks; i++)
2015 		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
2016 			(void)close(i);
2017 
2018 	free(manager->fds);
2019 	free(manager->fdstate);
2020 
2021 	manager->common.magic = 0;
2022 	manager->common.impmagic = 0;
2023 	free(manager);
2024 
2025 	*managerp = NULL;
2026 
2027 	socketmgr = NULL;
2028 }
2029 
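/*
 * A note on the DOIO_* codes as socket_recv() handles them below
 * (summarized from this function; the constants are defined earlier
 * in this file): DOIO_SOFT means the request could not be completed
 * now and is queued for the watcher, DOIO_EOF marks end of stream,
 * and DOIO_HARD/DOIO_SUCCESS mean the request finished (with or
 * without error), so the done event is posted unless the caller
 * asked for ISC_SOCKFLAG_IMMEDIATE delivery.
 */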
2030 static isc_result_t
2031 socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2032 	    unsigned int flags)
2033 {
2034 	int io_state;
2035 	isc_task_t *ntask = NULL;
2036 	isc_result_t result = ISC_R_SUCCESS;
2037 
2038 	dev->ev_sender = task;
2039 
2040 	if (sock->type == isc_sockettype_udp) {
2041 		io_state = doio_recv(sock, dev);
2042 	} else {
2043 		if (ISC_LIST_EMPTY(sock->recv_list))
2044 			io_state = doio_recv(sock, dev);
2045 		else
2046 			io_state = DOIO_SOFT;
2047 	}
2048 
2049 	switch (io_state) {
2050 	case DOIO_SOFT:
2051 		/*
2052 		 * We couldn't read all or part of the request right now, so
2053 		 * queue it.
2054 		 *
2055 		 * Attach to socket and to task
2056 		 */
2057 		isc_task_attach(task, &ntask);
2058 		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2059 
2060 		/*
2061 		 * Enqueue the request.  If the socket was previously not being
2062 		 * watched, poke the watcher to start paying attention to it.
2063 		 */
2064 		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
2065 			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2066 		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2067 
2068 		socket_log(sock, NULL, EVENT, NULL, 0, 0,
2069 			   "socket_recv: event %p -> task %p",
2070 			   dev, ntask);
2071 
2072 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2073 			result = ISC_R_INPROGRESS;
2074 		break;
2075 
2076 	case DOIO_EOF:
2077 		dev->result = ISC_R_EOF;
2078 		/* fallthrough */
2079 
2080 	case DOIO_HARD:
2081 	case DOIO_SUCCESS:
2082 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2083 			send_recvdone_event(sock, &dev);
2084 		break;
2085 	}
2086 
2087 	return (result);
2088 }
2089 
2090 isc_result_t
2091 isc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2092 		  unsigned int minimum, isc_task_t *task,
2093 		  isc_taskaction_t action, void *arg)
2094 {
2095 	isc__socket_t *sock = (isc__socket_t *)sock0;
2096 	isc_socketevent_t *dev;
2097 	isc__socketmgr_t *manager;
2098 	unsigned int iocount;
2099 	isc_buffer_t *buffer;
2100 
2101 	REQUIRE(VALID_SOCKET(sock));
2102 	REQUIRE(buflist != NULL);
2103 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2104 	REQUIRE(task != NULL);
2105 	REQUIRE(action != NULL);
2106 
2107 	manager = sock->manager;
2108 	REQUIRE(VALID_MANAGER(manager));
2109 
2110 	iocount = isc_bufferlist_availablecount(buflist);
2111 	REQUIRE(iocount > 0);
2112 
2113 	INSIST(sock->bound);
2114 
2115 	dev = allocate_socketevent(sock,
2116 				   ISC_SOCKEVENT_RECVDONE, action, arg);
2117 	if (dev == NULL)
2118 		return (ISC_R_NOMEMORY);
2119 
2120 	/*
2121 	 * UDP sockets are always partial read
2122 	 */
2123 	if (sock->type == isc_sockettype_udp)
2124 		dev->minimum = 1;
2125 	else {
2126 		if (minimum == 0)
2127 			dev->minimum = iocount;
2128 		else
2129 			dev->minimum = minimum;
2130 	}
2131 
2132 	/*
2133 	 * Move each buffer from the passed-in list to our internal one.
2134 	 */
2135 	buffer = ISC_LIST_HEAD(*buflist);
2136 	while (buffer != NULL) {
2137 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2138 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2139 		buffer = ISC_LIST_HEAD(*buflist);
2140 	}
2141 
2142 	return (socket_recv(sock, dev, task, 0));
2143 }
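
/*
 * Illustrative sketch (not part of this file): queueing a scatter read
 * through the public isc_socket_recvv() wrapper.  "buf" is assumed to
 * be an isc_buffer_t the caller has already set up over its own
 * storage, and "recv_done" a task action that will receive the
 * ISC_SOCKEVENT_RECVDONE event allocated above.
 *
 *	isc_bufferlist_t bl;
 *
 *	ISC_LIST_INIT(bl);
 *	ISC_LIST_ENQUEUE(bl, buf, link);
 *	result = isc_socket_recvv(sock, &bl, 1, task, recv_done, arg);
 *
 * On success the buffer now belongs to the event and is handed back on
 * the event's bufferlist when recv_done runs.
 */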
2144 
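/*
 * socket_send() below follows the same DOIO_* handling as
 * socket_recv(), with the addition that ISC_SOCKFLAG_NORETRY
 * suppresses queueing when the send cannot complete immediately.
 */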
2145 static isc_result_t
2146 socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2147 	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2148 	    unsigned int flags)
2149 {
2150 	int io_state;
2151 	isc_task_t *ntask = NULL;
2152 	isc_result_t result = ISC_R_SUCCESS;
2153 
2154 	dev->ev_sender = task;
2155 
2156 	set_dev_address(address, sock, dev);
2157 	if (pktinfo != NULL) {
2158 		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2159 		dev->pktinfo = *pktinfo;
2160 
2161 		if (!isc_sockaddr_issitelocal(&dev->address) &&
2162 		    !isc_sockaddr_islinklocal(&dev->address)) {
2163 			socket_log(sock, NULL, TRACE,
2164 				   "pktinfo structure provided, ifindex %u "
2165 				   "(set to 0)", pktinfo->ipi6_ifindex);
2166 
2167 			/*
2168 			 * Set the pktinfo index to 0 here, to let the
2169 			 * kernel decide what interface it should send on.
2170 			 */
2171 			dev->pktinfo.ipi6_ifindex = 0;
2172 		}
2173 	}
2174 
2175 	if (sock->type == isc_sockettype_udp)
2176 		io_state = doio_send(sock, dev);
2177 	else {
2178 		if (ISC_LIST_EMPTY(sock->send_list))
2179 			io_state = doio_send(sock, dev);
2180 		else
2181 			io_state = DOIO_SOFT;
2182 	}
2183 
2184 	switch (io_state) {
2185 	case DOIO_SOFT:
2186 		/*
2187 		 * We couldn't send all or part of the request right now, so
2188 		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2189 		 */
2190 		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2191 			isc_task_attach(task, &ntask);
2192 			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2193 
2194 			/*
2195 			 * Enqueue the request.  If the socket was previously
2196 			 * not being watched, poke the watcher to start
2197 			 * paying attention to it.
2198 			 */
2199 			if (ISC_LIST_EMPTY(sock->send_list) &&
2200 			    !sock->pending_send)
2201 				select_poke(sock->manager, sock->fd,
2202 					    SELECT_POKE_WRITE);
2203 			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2204 
2205 			socket_log(sock, NULL, EVENT, NULL, 0, 0,
2206 				   "socket_send: event %p -> task %p",
2207 				   dev, ntask);
2208 
2209 			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2210 				result = ISC_R_INPROGRESS;
2211 			break;
2212 		}
2213 
2214 		/* FALLTHROUGH */
2215 
2216 	case DOIO_HARD:
2217 	case DOIO_SUCCESS:
2218 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2219 			send_senddone_event(sock, &dev);
2220 		break;
2221 	}
2222 
2223 	return (result);
2224 }
2225 
2226 isc_result_t
2227 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2228 		  isc_task_t *task, isc_taskaction_t action, void *arg)
2229 {
2230 	return (isc__socket_sendtov2(sock, buflist, task, action, arg, NULL,
2231 				     NULL, 0));
2232 }
2233 
2234 isc_result_t
2235 isc__socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2236 		     isc_task_t *task, isc_taskaction_t action, void *arg,
2237 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2238 		     unsigned int flags)
2239 {
2240 	isc__socket_t *sock = (isc__socket_t *)sock0;
2241 	isc_socketevent_t *dev;
2242 	isc__socketmgr_t *manager;
2243 	unsigned int iocount;
2244 	isc_buffer_t *buffer;
2245 
2246 	REQUIRE(VALID_SOCKET(sock));
2247 	REQUIRE(buflist != NULL);
2248 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2249 	REQUIRE(task != NULL);
2250 	REQUIRE(action != NULL);
2251 
2252 	manager = sock->manager;
2253 	REQUIRE(VALID_MANAGER(manager));
2254 
2255 	iocount = isc_bufferlist_usedcount(buflist);
2256 	REQUIRE(iocount > 0);
2257 
2258 	dev = allocate_socketevent(sock,
2259 				   ISC_SOCKEVENT_SENDDONE, action, arg);
2260 	if (dev == NULL)
2261 		return (ISC_R_NOMEMORY);
2262 
2263 	/*
2264 	 * Move each buffer from the passed-in list to our internal one.
2265 	 */
2266 	buffer = ISC_LIST_HEAD(*buflist);
2267 	while (buffer != NULL) {
2268 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2269 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2270 		buffer = ISC_LIST_HEAD(*buflist);
2271 	}
2272 
2273 	return (socket_send(sock, dev, task, address, pktinfo, flags));
2274 }
2275 
2276 isc_result_t
2277 isc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
2278 		 unsigned int options) {
2279 	isc__socket_t *sock = (isc__socket_t *)sock0;
2280 	int on = 1;
2281 
2282 	REQUIRE(VALID_SOCKET(sock));
2283 
2284 	INSIST(!sock->bound);
2285 
2286 	if (sock->pf != sockaddr->type.sa.sa_family) {
2287 		return (ISC_R_FAMILYMISMATCH);
2288 	}
2289 
2290 	/*
2291 	 * Only set SO_REUSEADDR when we want a specific port.
2292 	 */
2293 	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
2294 	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
2295 	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
2296 		       sizeof(on)) < 0) {
2297 		UNEXPECTED_ERROR(__FILE__, __LINE__,
2298 				 "setsockopt(%d) failed", sock->fd);
2299 		/* Press on... */
2300 	}
2301 	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
2302 		switch (errno) {
2303 		case EACCES:
2304 			return (ISC_R_NOPERM);
2305 		case EADDRNOTAVAIL:
2306 			return (ISC_R_ADDRNOTAVAIL);
2307 		case EADDRINUSE:
2308 			return (ISC_R_ADDRINUSE);
2309 		case EINVAL:
2310 			return (ISC_R_BOUND);
2311 		default:
2312 			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
2313 					 strerror(errno));
2314 			return (ISC_R_UNEXPECTED);
2315 		}
2316 	}
2317 
2318 	socket_log(sock, sockaddr, TRACE, "bound");
2319 	sock->bound = 1;
2320 
2321 	return (ISC_R_SUCCESS);
2322 }
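
/*
 * Illustrative sketch (not part of this file): binding to a specific
 * local port and reacting to the result codes mapped above.  "local"
 * is assumed to be an isc_sockaddr_t the caller has filled in with a
 * non-zero port, so SO_REUSEADDR is requested as well.
 *
 *	result = isc_socket_bind(sock, &local, ISC_SOCKET_REUSEADDRESS);
 *	if (result == ISC_R_ADDRINUSE)
 *		... pick another port and retry ...
 *	else if (result != ISC_R_SUCCESS)
 *		... fatal, e.g. ISC_R_NOPERM or ISC_R_ADDRNOTAVAIL ...
 */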
2323 
2324 isc_result_t
2325 isc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
2326 		   isc_task_t *task, isc_taskaction_t action, void *arg)
2327 {
2328 	isc__socket_t *sock = (isc__socket_t *)sock0;
2329 	isc_socket_connev_t *dev;
2330 	isc_task_t *ntask = NULL;
2331 	isc__socketmgr_t *manager;
2332 	int cc;
2333 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
2334 
2335 	REQUIRE(VALID_SOCKET(sock));
2336 	REQUIRE(addr != NULL);
2337 	REQUIRE(task != NULL);
2338 	REQUIRE(action != NULL);
2339 
2340 	manager = sock->manager;
2341 	REQUIRE(VALID_MANAGER(manager));
2342 	REQUIRE(addr != NULL);
2343 
2344 	if (isc_sockaddr_ismulticast(addr))
2345 		return (ISC_R_MULTICAST);
2346 
2347 	REQUIRE(!sock->connecting);
2348 
2349 	dev = (isc_socket_connev_t *)isc_event_allocate(sock,
2350 							ISC_SOCKEVENT_CONNECT,
2351 							action,	arg,
2352 							sizeof(*dev));
2353 	if (dev == NULL) {
2354 		return (ISC_R_NOMEMORY);
2355 	}
2356 	ISC_LINK_INIT(dev, ev_link);
2357 
2358 	/*
2359 	 * Try to do the connect right away, as there can be only one
2360 	 * outstanding, and it might happen to complete.
2361 	 */
2362 	sock->peer_address = *addr;
2363 	cc = connect(sock->fd, &addr->type.sa, addr->length);
2364 	if (cc < 0) {
2365 		/*
2366 		 * HP-UX "fails" to connect a UDP socket and sets errno to
2367 		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
2368 		 * a success and let the user detect it if it's really an error
2369 		 * at the time of sending a packet on the socket.
2370 		 */
2371 		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
2372 			cc = 0;
2373 			goto success;
2374 		}
2375 		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
2376 			goto queue;
2377 
2378 		switch (errno) {
2379 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
2380 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2381 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2382 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2383 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2384 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2385 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2386 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2387 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2388 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2389 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2390 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2391 #undef ERROR_MATCH
2392 		}
2393 
2394 		sock->connected = 0;
2395 
2396 		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
2397 		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
2398 				 addrbuf, errno, strerror(errno));
2399 
2400 		isc_event_free(ISC_EVENT_PTR(&dev));
2401 		return (ISC_R_UNEXPECTED);
2402 
2403 	err_exit:
2404 		sock->connected = 0;
2405 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2406 
2407 		return (ISC_R_SUCCESS);
2408 	}
2409 
2410 	/*
2411 	 * If connect completed, fire off the done event.
2412 	 */
2413  success:
2414 	if (cc == 0) {
2415 		sock->connected = 1;
2416 		sock->bound = 1;
2417 		dev->result = ISC_R_SUCCESS;
2418 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2419 
2420 		return (ISC_R_SUCCESS);
2421 	}
2422 
2423  queue:
2424 
2425 	/*
2426 	 * Attach to task.
2427 	 */
2428 	isc_task_attach(task, &ntask);
2429 
2430 	sock->connecting = 1;
2431 
2432 	dev->ev_sender = ntask;
2433 
2434 	/*
2435 	 * Poke watcher here.  We still have the socket locked, so there
2436 	 * is no race condition.  We will hold the lock for such a short
2437 	 * time that waking it up now or later won't matter much.
2438 	 */
2439 	if (sock->connect_ev == NULL)
2440 		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
2441 
2442 	sock->connect_ev = dev;
2443 
2444 	return (ISC_R_SUCCESS);
2445 }
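
/*
 * Illustrative sketch (not part of this file): a connect completion
 * action such as a caller might pass to isc_socket_connect().  The
 * event delivered is the isc_socket_connev_t allocated above;
 * "connect_done" and its surrounding wiring are assumptions, not part
 * of this API.
 *
 *	static void
 *	connect_done(isc_task_t *task, isc_event_t *event) {
 *		isc_socket_connev_t *cev = (isc_socket_connev_t *)event;
 *
 *		UNUSED(task);
 *		if (cev->result == ISC_R_SUCCESS) {
 *			... the socket is now connected and bound ...
 *		} else {
 *			... e.g. ISC_R_CONNREFUSED or ISC_R_TIMEDOUT ...
 *		}
 *		isc_event_free(&event);
 *	}
 */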
2446 
2447 /*
2448  * Called when a socket with a pending connect() finishes.
2449  */
2450 static void
2451 internal_connect(isc_task_t *me, isc_event_t *ev) {
2452 	isc__socket_t *sock;
2453 	isc_socket_connev_t *dev;
2454 	isc_task_t *task;
2455 	int cc;
2456 	socklen_t optlen;
2457 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
2458 
2459 	UNUSED(me);
2460 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2461 
2462 	sock = ev->ev_sender;
2463 	INSIST(VALID_SOCKET(sock));
2464 
2465 	/*
2466 	 * When the internal event was sent, the reference count was bumped
2467 	 * to keep the socket around for us.  Decrement the count here.
2468 	 */
2469 	INSIST(sock->references > 0);
2470 	sock->references--;
2471 	if (sock->references == 0) {
2472 		destroy(&sock);
2473 		return;
2474 	}
2475 
2476 	/*
2477 	 * Has this event been canceled?
2478 	 */
2479 	dev = sock->connect_ev;
2480 	if (dev == NULL) {
2481 		INSIST(!sock->connecting);
2482 		return;
2483 	}
2484 
2485 	INSIST(sock->connecting);
2486 	sock->connecting = 0;
2487 
2488 	/*
2489 	 * Get any possible error status here.
2490 	 */
2491 	optlen = sizeof(cc);
2492 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
2493 		       (void *)&cc, (void *)&optlen) < 0)
2494 		cc = errno;
2495 	else
2496 		errno = cc;
2497 
2498 	if (errno != 0) {
2499 		/*
2500 		 * If the error is EAGAIN, just re-select on this
2501 		 * fd and pretend nothing strange happened.
2502 		 */
2503 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
2504 			sock->connecting = 1;
2505 			select_poke(sock->manager, sock->fd,
2506 				    SELECT_POKE_CONNECT);
2507 			return;
2508 		}
2509 
2510 
2511 		/*
2512 		 * Translate other errors into ISC_R_* flavors.
2513 		 */
2514 		switch (errno) {
2515 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
2516 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2517 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2518 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2519 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2520 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2521 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2522 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2523 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2524 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2525 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2526 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
2527 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2528 #undef ERROR_MATCH
2529 		default:
2530 			dev->result = ISC_R_UNEXPECTED;
2531 			isc_sockaddr_format(&sock->peer_address, peerbuf,
2532 					    sizeof(peerbuf));
2533 			UNEXPECTED_ERROR(__FILE__, __LINE__,
2534 					 "internal_connect: connect(%s) %s",
2535 					 peerbuf, strerror(errno));
2536 		}
2537 	} else {
2538 		dev->result = ISC_R_SUCCESS;
2539 		sock->connected = 1;
2540 		sock->bound = 1;
2541 	}
2542 
2543 	sock->connect_ev = NULL;
2544 
2545 	task = dev->ev_sender;
2546 	dev->ev_sender = sock;
2547 	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2548 }
2549 
2550 /*
2551  * Run through the list of events on this socket, and cancel the ones
2552  * queued for task "task" of type "how".  "how" is a bitmask.
2553  */
2554 void
2555 isc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
2556 	isc__socket_t *sock = (isc__socket_t *)sock0;
2557 
2558 	REQUIRE(VALID_SOCKET(sock));
2559 
2560 	/*
2561 	 * Quick exit if there is nothing to do.  Don't even bother locking
2562 	 * in this case.
2563 	 */
2564 	if (how == 0)
2565 		return;
2566 
2567 	/*
2568 	 * All of these do the same thing, more or less.
2569 	 * Each will:
2570 	 *	o If the internal event is marked as "posted", try to
2571 	 *	  remove it from the task's queue.  If this fails, mark it
2572 	 *	  as canceled instead, and let the task clean it up later.
2573 	 *	o For each I/O request for that task of that type, post
2574 	 *	  its done event with status of "ISC_R_CANCELED".
2575 	 *	o Reset any state needed.
2576 	 */
2577 	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
2578 	    && !ISC_LIST_EMPTY(sock->recv_list)) {
2579 		isc_socketevent_t      *dev;
2580 		isc_socketevent_t      *next;
2581 		isc_task_t	       *current_task;
2582 
2583 		dev = ISC_LIST_HEAD(sock->recv_list);
2584 
2585 		while (dev != NULL) {
2586 			current_task = dev->ev_sender;
2587 			next = ISC_LIST_NEXT(dev, ev_link);
2588 
2589 			if ((task == NULL) || (task == current_task)) {
2590 				dev->result = ISC_R_CANCELED;
2591 				send_recvdone_event(sock, &dev);
2592 			}
2593 			dev = next;
2594 		}
2595 	}
2596 
2597 	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
2598 	    && !ISC_LIST_EMPTY(sock->send_list)) {
2599 		isc_socketevent_t      *dev;
2600 		isc_socketevent_t      *next;
2601 		isc_task_t	       *current_task;
2602 
2603 		dev = ISC_LIST_HEAD(sock->send_list);
2604 
2605 		while (dev != NULL) {
2606 			current_task = dev->ev_sender;
2607 			next = ISC_LIST_NEXT(dev, ev_link);
2608 
2609 			if ((task == NULL) || (task == current_task)) {
2610 				dev->result = ISC_R_CANCELED;
2611 				send_senddone_event(sock, &dev);
2612 			}
2613 			dev = next;
2614 		}
2615 	}
2616 
2617 	/*
2618 	 * Connecting is not a list.
2619 	 */
2620 	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
2621 	    && sock->connect_ev != NULL) {
2622 		isc_socket_connev_t    *dev;
2623 		isc_task_t	       *current_task;
2624 
2625 		INSIST(sock->connecting);
2626 		sock->connecting = 0;
2627 
2628 		dev = sock->connect_ev;
2629 		current_task = dev->ev_sender;
2630 
2631 		if ((task == NULL) || (task == current_task)) {
2632 			sock->connect_ev = NULL;
2633 
2634 			dev->result = ISC_R_CANCELED;
2635 			dev->ev_sender = sock;
2636 			isc_task_sendanddetach(&current_task,
2637 					       ISC_EVENT_PTR(&dev));
2638 		}
2639 	}
2640 
2641 }
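
/*
 * Illustrative sketch (not part of this file): cancelling everything
 * still queued on a socket before shutting it down.  A NULL task
 * matches events queued by any task; pending recv/send requests are
 * posted with ISC_R_CANCELED and a pending connect completes the same
 * way, as implemented above.
 *
 *	isc_socket_cancel(sock, NULL,
 *			  ISC_SOCKCANCEL_RECV | ISC_SOCKCANCEL_SEND |
 *			  ISC_SOCKCANCEL_CONNECT);
 */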
2642 
2643 /*
2644  * In our assumed scenario, we can simply use a single static object.
2645  * XXX: this is not true if the application uses multiple threads with
2646  *      'multi-context' mode.  Fixing this is a future TODO item.
2647  */
2648 static isc_socketwait_t swait_private;
2649 
2650 int
2651 isc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
2652 			  isc_socketwait_t **swaitp)
2653 {
2654 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2655 	int n;
2656 
2657 	REQUIRE(swaitp != NULL && *swaitp == NULL);
2658 
2659 	if (manager == NULL)
2660 		manager = socketmgr;
2661 	if (manager == NULL)
2662 		return (0);
2663 
2664 	memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize);
2665 	memmove(manager->write_fds_copy, manager->write_fds,
2666 		manager->fd_bufsize);
2667 
2668 	swait_private.readset = manager->read_fds_copy;
2669 	swait_private.writeset = manager->write_fds_copy;
2670 	swait_private.maxfd = manager->maxfd + 1;
2671 
2672 	n = select(swait_private.maxfd, swait_private.readset,
2673 		   swait_private.writeset, NULL, tvp);
2674 
2675 	*swaitp = &swait_private;
2676 	return (n);
2677 }
2678 
2679 isc_result_t
2680 isc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
2681 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2682 
2683 	REQUIRE(swait == &swait_private);
2684 
2685 	if (manager == NULL)
2686 		manager = socketmgr;
2687 	if (manager == NULL)
2688 		return (ISC_R_NOTFOUND);
2689 
2690 	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
2691 	return (ISC_R_SUCCESS);
2692 }
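
/*
 * Illustrative sketch (not part of this file): how a single-threaded
 * caller can drive the two entry points above, alternating socket
 * readiness processing with task events (isc__taskmgr_dispatch() is
 * the same hook used in isc__socketmgr_destroy()).  The timeout is an
 * arbitrary example value.
 *
 *	struct timeval tv = { 5, 0 };
 *	isc_socketwait_t *swait = NULL;
 *	int n;
 *
 *	n = isc__socketmgr_waitevents(NULL, &tv, &swait);
 *	if (n > 0)
 *		(void)isc__socketmgr_dispatch(NULL, swait);
 *	(void)isc__taskmgr_dispatch(NULL);
 */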
2693 
2694 #include "../socket_api.c"
2695