xref: /openbsd-src/usr.bin/dig/lib/isc/unix/socket.c (revision a61b9c11d1cfbd02350b4fbc5b6cf37b75fec5ff)
1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
9  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
11  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
13  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14  * PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 /*! \file */
18 
19 #include <sys/socket.h>
20 #include <sys/time.h>
21 #include <sys/uio.h>
22 
23 #include <netinet/tcp.h>
24 
25 #include <errno.h>
26 #include <fcntl.h>
27 #include <stddef.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <unistd.h>
31 
32 #include <isc/buffer.h>
33 #include <isc/bufferlist.h>
34 
35 #include <isc/list.h>
36 #include <isc/log.h>
37 #include <isc/net.h>
38 #include <isc/region.h>
39 #include <isc/socket.h>
40 #include <isc/task.h>
41 #include <isc/util.h>
42 
43 #include "errno2result.h"
44 
45 #include "socket_p.h"
46 #include "../task_p.h"
47 
/*
 * Bundle of select()-style descriptor sets and counters.
 *
 * NOTE(review): nothing in this part of the file reads or writes these
 * fields; the per-field notes below are inferred from the names only and
 * should be confirmed against the code that fills the structure in.
 */
struct isc_socketwait {
	fd_set *readset;	/* presumably: descriptors checked for reading */
	fd_set *writeset;	/* presumably: descriptors checked for writing */
	int nfds;		/* presumably: count of ready descriptors */
	int maxfd;		/* presumably: highest descriptor of interest */
};
54 
55 /*
56  * Set by the -T dscp option on the command line. If set to a value
57  * other than -1, we check to make sure DSCP values match it, and
58  * assert if not.
59  */
60 int isc_dscp_check_value = -1;
61 
62 /*%
63  * Some systems define the socket length argument as an int, some as size_t,
64  * some as socklen_t.  This is here so it can be easily changed if needed.
65  */
66 
67 /*%
68  * Define what the possible "soft" errors can be.  These are non-fatal returns
69  * of various network related functions, like recv() and so on.
70  *
71  * For some reason, BSDI (and perhaps others) will sometimes return <0
72  * from recv() but will have errno==0.  This is broken, but we have to
73  * work around it here.
74  */
75 #define SOFT_ERROR(e)	((e) == EAGAIN || \
76 			 (e) == EWOULDBLOCK || \
77 			 (e) == EINTR || \
78 			 (e) == 0)
79 
80 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
81 
82 /*!<
83  * DLVL(90)  --  Function entry/exit and other tracing.
84  * DLVL(60)  --  Socket data send/receive
85  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
86  * DLVL(20)  --  Socket creation/destruction.
87  */
88 #define TRACE_LEVEL		90
89 #define IOEVENT_LEVEL		60
90 #define EVENT_LEVEL		50
91 #define CREATION_LEVEL		20
92 
93 #define TRACE		DLVL(TRACE_LEVEL)
94 #define IOEVENT		DLVL(IOEVENT_LEVEL)
95 #define EVENT		DLVL(EVENT_LEVEL)
96 #define CREATION	DLVL(CREATION_LEVEL)
97 
98 typedef isc_event_t intev_t;
99 
100 #define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
101 #define VALID_SOCKET(s)		ISC_MAGIC_VALID(s, SOCKET_MAGIC)
102 
103 /*!
104  * IPv6 control information.  If the socket is an IPv6 socket we want
105  * to collect the destination address and interface so the client can
106  * set them on outgoing packets.
107  */
108 
109 /*%
110  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
111  * a setsockopt() like interface to request timestamps, and if the OS
112  * doesn't do it for us, call gettimeofday() on every UDP receive?
113  */
114 
115 /*%
116  * Instead of calculating the cmsgbuf lengths every time we take
117  * a rule of thumb approach - sizes are taken from x86_64 linux,
118  * multiplied by 2, everything should fit. Those sizes are not
119  * large enough to cause any concern.
120  */
121 #define CMSG_SP_IN6PKT 40
122 
123 #define CMSG_SP_TIMESTAMP 32
124 
125 #define CMSG_SP_TCTOS 24
126 
127 #define CMSG_SP_INT 24
128 
129 #define RECVCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS)+1)
130 #define SENDCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS)+1)
131 
132 /*%
133  * The number of times a send operation is repeated if the result is EINTR.
134  */
135 #define NRETRIES 10
136 
137 typedef struct isc__socket isc__socket_t;
138 typedef struct isc__socketmgr isc__socketmgr_t;
139 
/*
 * Per-socket state.
 *
 * The "locked"/"not locked" headings below are inherited from the original
 * layout; no locking calls are visible in this part of the file.
 */
struct isc__socket {
	/* Not locked. */
	isc_socket_t		common;
	isc__socketmgr_t	*manager;
	isc_sockettype_t	type;		/* isc_sockettype_udp or _tcp */

	/* Locked by socket lock. */
	ISC_LINK(isc__socket_t)	link;
	unsigned int		references;	/* zero means dead (SOCK_DEAD) */
	int			fd;		/* descriptor used for recvmsg()/sendmsg() */
	int			pf;		/* compared against AF_INET / AF_INET6 */

	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	isc_socket_connev_t		       *connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgeable before use.
	 */
	intev_t			readable_ev;
	intev_t			writable_ev;

	isc_sockaddr_t		peer_address;       /* remote address */

	unsigned int		pending_recv : 1,
				pending_send : 1,
				connected : 1,      /* if set, sends carry no address */
				connecting : 1,     /* connect pending */
				bound : 1,          /* bound to local addr */
				active : 1,         /* currently active */
				pktdscp : 1;	    /* per packet dscp */
	unsigned int		dscp;		    /* compared with the event's dscp on send */
};
175 
176 #define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
177 #define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
178 
/*
 * Socket manager state: the descriptor sets that are watched and a
 * per-descriptor state table.
 */
struct isc__socketmgr {
	/* Not locked. */
	isc_socketmgr_t		common;
	int			fd_bufsize;	/* presumably the byte size of each fd_set below -- TODO confirm */
	unsigned int		maxsocks;	/* descriptors must be < maxsocks (see wakeup_socket()) */

	isc__socket_t	       **fds;		/* presumably indexed by descriptor -- TODO confirm */
	int			*fdstate;	/* per descriptor: CLOSED, MANAGED or CLOSE_PENDING */

	/* Locked by manager lock. */
	ISC_LIST(isc__socket_t)	socklist;
	fd_set			*read_fds;	/* updated by watch_fd()/unwatch_fd() */
	fd_set			*read_fds_copy;
	fd_set			*write_fds;	/* updated by watch_fd()/unwatch_fd() */
	fd_set			*write_fds_copy;
	int			maxfd;
	unsigned int		refs;
};
197 
198 static isc__socketmgr_t *socketmgr = NULL;
199 
200 #define CLOSED			0	/* this one must be zero */
201 #define MANAGED			1
202 #define CLOSE_PENDING		2
203 
204 /*
205  * send() and recv() iovec counts
206  */
207 #define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
208 #define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
209 
210 static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
211 				  isc_sockettype_t type,
212 				  isc_socket_t **socketp);
213 static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
214 static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
215 static void free_socket(isc__socket_t **);
216 static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
217 				    isc__socket_t **);
218 static void destroy(isc__socket_t **);
219 static void internal_connect(isc_task_t *, isc_event_t *);
220 static void internal_recv(isc_task_t *, isc_event_t *);
221 static void internal_send(isc_task_t *, isc_event_t *);
222 static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
223 static void build_msghdr_send(isc__socket_t *, char *, isc_socketevent_t *,
224 			      struct msghdr *, struct iovec *, size_t *);
225 static void build_msghdr_recv(isc__socket_t *, char *, isc_socketevent_t *,
226 			      struct msghdr *, struct iovec *, size_t *);
227 
228 /*%
229  * The following are intended for internal use (indicated by "isc__"
230  * prefix) but are not declared as static, allowing direct access from
231  * unit tests etc.
232  */
233 
234 isc_result_t
235 isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
236 		   isc_socket_t **socketp);
237 void
238 isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
239 void
240 isc__socket_detach(isc_socket_t **socketp);
241 isc_result_t
242 isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
243 		 unsigned int minimum, isc_task_t *task,
244 		  isc_taskaction_t action, void *arg);
245 isc_result_t
246 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
247 		  isc_task_t *task, isc_taskaction_t action, void *arg);
248 isc_result_t
249 isc__socket_sendtov2(isc_socket_t *sock, isc_bufferlist_t *buflist,
250 		     isc_task_t *task, isc_taskaction_t action, void *arg,
251 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
252 		     unsigned int flags);
253 isc_result_t
254 isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
255 		 unsigned int options);
256 isc_result_t
257 isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
258 		    isc_task_t *task, isc_taskaction_t action,
259 		    void *arg);
260 void
261 isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
262 
263 isc_result_t
264 isc__socketmgr_create(isc_socketmgr_t **managerp);
265 isc_result_t
266 isc__socketmgr_create2(isc_socketmgr_t **managerp,
267 		       unsigned int maxsocks);
268 void
269 isc__socketmgr_destroy(isc_socketmgr_t **managerp);
270 
/*
 * Socket method table: attach, detach, bind, connect and cancel, in the
 * order the isc_socketmethods_t initializer expects.
 */
static struct {
	isc_socketmethods_t methods;

	/*%
	 * Extra references to the recvv/sendv entry points, which have no
	 * slot in the method table above; the original note says they are
	 * kept here only so the functions do not appear unused.
	 */
	void *recvv, *sendv;
} socketmethods = {
	{
		isc__socket_attach,
		isc__socket_detach,
		isc__socket_bind,
		isc__socket_connect,
		isc__socket_cancel,
	},
	(void *)isc__socket_recvv,
	(void *)isc__socket_sendv,
};
289 
/*
 * Socket manager method table: manager destruction followed by socket
 * creation.
 */
static isc_socketmgrmethods_t socketmgrmethods = {
	isc__socketmgr_destroy,
	isc__socket_create
};
294 
295 #define SELECT_POKE_SHUTDOWN		(-1)
296 #define SELECT_POKE_READ		(-3)
297 #define SELECT_POKE_WRITE		(-4)
298 #define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
299 #define SELECT_POKE_CLOSE		(-5)
300 
301 #define SOCK_DEAD(s)			((s)->references == 0)
302 
/*%
 * Shortcut indexes used to reach the statistics counters.
 *
 * NOTE(review): none of these identifiers is referenced in the part of the
 * file visible here -- confirm they are still used before relying on them.
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9,
	STATID_ACTIVE = 10
};
319 
320 
321 static void
322 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
323 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
324 	   const char *fmt, ...) __attribute__((__format__(__printf__, 6, 7)));
325 static void
326 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
327 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
328 	   const char *fmt, ...)
329 {
330 	char msgbuf[2048];
331 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
332 	va_list ap;
333 
334 	if (! isc_log_wouldlog(isc_lctx, level))
335 		return;
336 
337 	va_start(ap, fmt);
338 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
339 	va_end(ap);
340 
341 	if (address == NULL) {
342 		isc_log_write(isc_lctx, category, module, level,
343 			       "socket %p: %s", sock, msgbuf);
344 	} else {
345 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
346 		isc_log_write(isc_lctx, category, module, level,
347 			       "socket %p %s: %s", sock, peerbuf, msgbuf);
348 	}
349 }
350 
351 static inline isc_result_t
352 watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
353 	isc_result_t result = ISC_R_SUCCESS;
354 
355 	if (msg == SELECT_POKE_READ)
356 		FD_SET(fd, manager->read_fds);
357 	if (msg == SELECT_POKE_WRITE)
358 		FD_SET(fd, manager->write_fds);
359 
360 	return (result);
361 }
362 
363 static inline isc_result_t
364 unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
365 	isc_result_t result = ISC_R_SUCCESS;
366 
367 	if (msg == SELECT_POKE_READ)
368 		FD_CLR(fd, manager->read_fds);
369 	else if (msg == SELECT_POKE_WRITE)
370 		FD_CLR(fd, manager->write_fds);
371 
372 	return (result);
373 }
374 
375 static void
376 wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
377 	isc_result_t result;
378 
379 	/*
380 	 * This is a wakeup on a socket.  If the socket is not in the
381 	 * process of being closed, start watching it for either reads
382 	 * or writes.
383 	 */
384 
385 	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
386 
387 	if (msg == SELECT_POKE_CLOSE) {
388 		/* No one should be updating fdstate, so no need to lock it */
389 		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
390 		manager->fdstate[fd] = CLOSED;
391 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
392 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
393 		(void)close(fd);
394 		return;
395 	}
396 
397 	if (manager->fdstate[fd] == CLOSE_PENDING) {
398 
399 		/*
400 		 * We accept (and ignore) any error from unwatch_fd() as we are
401 		 * closing the socket, hoping it doesn't leave dangling state in
402 		 * the kernel.
403 		 */
404 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
405 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
406 		return;
407 	}
408 	if (manager->fdstate[fd] != MANAGED) {
409 		return;
410 	}
411 
412 	/*
413 	 * Set requested bit.
414 	 */
415 	result = watch_fd(manager, fd, msg);
416 	if (result != ISC_R_SUCCESS) {
417 		/*
418 		 * XXXJT: what should we do?  Ignoring the failure of watching
419 		 * a socket will make the application dysfunctional, but there
420 		 * seems to be no reasonable recovery process.
421 		 */
422 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
423 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
424 			      "failed to start watching FD (%d): %s",
425 			      fd, isc_result_totext(result));
426 	}
427 }
428 
429 /*
430  * Update the state of the socketmgr when something changes.
431  */
432 static void
433 select_poke(isc__socketmgr_t *manager, int fd, int msg) {
434 	if (msg == SELECT_POKE_SHUTDOWN)
435 		return;
436 	else if (fd >= 0)
437 		wakeup_socket(manager, fd, msg);
438 	return;
439 }
440 
441 /*
442  * Make a fd non-blocking.
443  */
444 static isc_result_t
445 make_nonblock(int fd) {
446 	int ret;
447 	int flags;
448 
449 	flags = fcntl(fd, F_GETFL, 0);
450 	flags |= O_NONBLOCK;
451 	ret = fcntl(fd, F_SETFL, flags);
452 
453 	if (ret == -1) {
454 		UNEXPECTED_ERROR(__FILE__, __LINE__,
455 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
456 				 strerror(errno));
457 		return (ISC_R_UNEXPECTED);
458 	}
459 
460 	return (ISC_R_SUCCESS);
461 }
462 
/*
 * Thin wrappers around the CMSG_LEN and CMSG_SPACE macros, so callers get
 * a socklen_t result from a function call.
 *
 * cmsg_len() returns CMSG_LEN(len): the value to store in cmsg_len for a
 * control message carrying 'len' bytes of data.
 */
static inline socklen_t
cmsg_len(socklen_t len) {
	const socklen_t header_and_data = CMSG_LEN(len);

	return (header_and_data);
}
474 
/*
 * Return CMSG_SPACE(len): the number of buffer bytes a control message
 * carrying 'len' bytes of data occupies, padding included.
 */
static inline socklen_t
cmsg_space(socklen_t len) {
	const socklen_t padded_size = CMSG_SPACE(len);

	return (padded_size);
}
479 
480 /*
481  * Process control messages received on a socket.
482  */
483 static void
484 process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
485 	struct cmsghdr *cmsgp;
486 	struct in6_pktinfo *pktinfop;
487 	void *timevalp;
488 
489 	/*
490 	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
491 	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
492 	 * They are all here, outside of the CPP tests, because it is
493 	 * more consistent with the usual ISC coding style.
494 	 */
495 	UNUSED(sock);
496 	UNUSED(msg);
497 	UNUSED(dev);
498 
499 	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
500 		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
501 
502 	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
503 		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
504 
505 	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
506 		return;
507 
508 	timevalp = NULL;
509 	pktinfop = NULL;
510 
511 	cmsgp = CMSG_FIRSTHDR(msg);
512 	while (cmsgp != NULL) {
513 		socket_log(sock, NULL, TRACE,
514 			   "processing cmsg %p", cmsgp);
515 
516 		if (cmsgp->cmsg_level == IPPROTO_IPV6
517 		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
518 
519 			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
520 			memmove(&dev->pktinfo, pktinfop,
521 				sizeof(struct in6_pktinfo));
522 			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
523 			socket_log(sock, NULL, TRACE,
524 				   "interface received on ifindex %u",
525 				   dev->pktinfo.ipi6_ifindex);
526 			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
527 				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
528 			goto next;
529 		}
530 
531 		if (cmsgp->cmsg_level == SOL_SOCKET
532 		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
533 			struct timeval tv;
534 			timevalp = CMSG_DATA(cmsgp);
535 			memmove(&tv, timevalp, sizeof(tv));
536 			TIMEVAL_TO_TIMESPEC(&tv, &dev->timestamp);
537 			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
538 			goto next;
539 		}
540 
541 		if (cmsgp->cmsg_level == IPPROTO_IPV6
542 		    && cmsgp->cmsg_type == IPV6_TCLASS) {
543 			dev->dscp = *(int *)CMSG_DATA(cmsgp);
544 			dev->dscp >>= 2;
545 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
546 			goto next;
547 		}
548 
549 		if (cmsgp->cmsg_level == IPPROTO_IP
550 		    && (cmsgp->cmsg_type == IP_TOS)) {
551 			dev->dscp = (int) *(unsigned char *)CMSG_DATA(cmsgp);
552 			dev->dscp >>= 2;
553 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
554 			goto next;
555 		}
556 	next:
557 		cmsgp = CMSG_NXTHDR(msg, cmsgp);
558 	}
559 
560 }
561 
562 /*
563  * Construct an iov array and attach it to the msghdr passed in.  This is
564  * the SEND constructor, which will use the used region of the buffer
565  * (if using a buffer list) or will use the internal region (if a single
566  * buffer I/O is requested).
567  *
568  * Nothing can be NULL, and the done event must list at least one buffer
569  * on the buffer linked list for this function to be meaningful.
570  *
571  * If write_countp != NULL, *write_countp will hold the number of bytes
572  * this transaction can send.
573  */
574 static void
575 build_msghdr_send(isc__socket_t *sock, char* cmsgbuf, isc_socketevent_t *dev,
576 		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
577 {
578 	unsigned int iovcount;
579 	isc_buffer_t *buffer;
580 	isc_region_t used;
581 	size_t write_count;
582 	size_t skip_count;
583 	struct cmsghdr *cmsgp;
584 
585 	memset(msg, 0, sizeof(*msg));
586 
587 	if (!sock->connected) {
588 		msg->msg_name = (void *)&dev->address.type.sa;
589 		msg->msg_namelen = dev->address.length;
590 	} else {
591 		msg->msg_name = NULL;
592 		msg->msg_namelen = 0;
593 	}
594 
595 	buffer = ISC_LIST_HEAD(dev->bufferlist);
596 	write_count = 0;
597 	iovcount = 0;
598 
599 	/*
600 	 * Single buffer I/O?  Skip what we've done so far in this region.
601 	 */
602 	if (buffer == NULL) {
603 		write_count = dev->region.length - dev->n;
604 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
605 		iov[0].iov_len = write_count;
606 		iovcount = 1;
607 
608 		goto config;
609 	}
610 
611 	/*
612 	 * Multibuffer I/O.
613 	 * Skip the data in the buffer list that we have already written.
614 	 */
615 	skip_count = dev->n;
616 	while (buffer != NULL) {
617 		REQUIRE(ISC_BUFFER_VALID(buffer));
618 		if (skip_count < isc_buffer_usedlength(buffer))
619 			break;
620 		skip_count -= isc_buffer_usedlength(buffer);
621 		buffer = ISC_LIST_NEXT(buffer, link);
622 	}
623 
624 	while (buffer != NULL) {
625 		INSIST(iovcount < MAXSCATTERGATHER_SEND);
626 
627 		isc_buffer_usedregion(buffer, &used);
628 
629 		if (used.length > 0) {
630 			iov[iovcount].iov_base = (void *)(used.base
631 							  + skip_count);
632 			iov[iovcount].iov_len = used.length - skip_count;
633 			write_count += (used.length - skip_count);
634 			skip_count = 0;
635 			iovcount++;
636 		}
637 		buffer = ISC_LIST_NEXT(buffer, link);
638 	}
639 
640 	INSIST(skip_count == 0U);
641 
642  config:
643 	msg->msg_iov = iov;
644 	msg->msg_iovlen = iovcount;
645 
646 	msg->msg_control = NULL;
647 	msg->msg_controllen = 0;
648 	msg->msg_flags = 0;
649 
650 	if ((sock->type == isc_sockettype_udp) &&
651 	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
652 	{
653 		struct in6_pktinfo *pktinfop;
654 
655 		socket_log(sock, NULL, TRACE,
656 			   "sendto pktinfo data, ifindex %u",
657 			   dev->pktinfo.ipi6_ifindex);
658 
659 		msg->msg_control = (void *)cmsgbuf;
660 		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
661 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
662 
663 		cmsgp = (struct cmsghdr *)cmsgbuf;
664 		cmsgp->cmsg_level = IPPROTO_IPV6;
665 		cmsgp->cmsg_type = IPV6_PKTINFO;
666 		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
667 		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
668 		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
669 	}
670 
671 	if ((sock->type == isc_sockettype_udp) &&
672 	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
673 	{
674 		int use_min_mtu = 1;	/* -1, 0, 1 */
675 
676 		cmsgp = (struct cmsghdr *)(cmsgbuf +
677 					   msg->msg_controllen);
678 
679 		msg->msg_control = (void *)cmsgbuf;
680 		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
681 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
682 
683 		cmsgp->cmsg_level = IPPROTO_IPV6;
684 		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
685 		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
686 		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
687 	}
688 
689 	if (isc_dscp_check_value > -1) {
690 		if (sock->type == isc_sockettype_udp)
691 			INSIST((int)dev->dscp == isc_dscp_check_value);
692 		else if (sock->type == isc_sockettype_tcp)
693 			INSIST((int)sock->dscp == isc_dscp_check_value);
694 	}
695 
696 	if ((sock->type == isc_sockettype_udp) &&
697 	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
698 	{
699 		int dscp = (dev->dscp << 2) & 0xff;
700 
701 		INSIST(dev->dscp < 0x40);
702 
703 		if (sock->pf == AF_INET && sock->pktdscp) {
704 			cmsgp = (struct cmsghdr *)(cmsgbuf +
705 						   msg->msg_controllen);
706 			msg->msg_control = (void *)cmsgbuf;
707 			msg->msg_controllen += cmsg_space(sizeof(dscp));
708 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
709 
710 			cmsgp->cmsg_level = IPPROTO_IP;
711 			cmsgp->cmsg_type = IP_TOS;
712 			cmsgp->cmsg_len = cmsg_len(sizeof(char));
713 			*(unsigned char*)CMSG_DATA(cmsgp) = dscp;
714 		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
715 			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
716 			       (void *)&dscp, sizeof(int)) < 0)
717 			{
718 				UNEXPECTED_ERROR(__FILE__, __LINE__,
719 						 "setsockopt(%d, IP_TOS, %.02x)"
720 						 " %s: %s",
721 						 sock->fd, dscp >> 2,
722 						 "failed", strerror(errno));
723 			} else
724 				sock->dscp = dscp;
725 		}
726 
727 		if (sock->pf == AF_INET6 && sock->pktdscp) {
728 			cmsgp = (struct cmsghdr *)(cmsgbuf +
729 						   msg->msg_controllen);
730 			msg->msg_control = (void *)cmsgbuf;
731 			msg->msg_controllen += cmsg_space(sizeof(dscp));
732 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
733 
734 			cmsgp->cmsg_level = IPPROTO_IPV6;
735 			cmsgp->cmsg_type = IPV6_TCLASS;
736 			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
737 			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
738 		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
739 			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
740 				       (void *)&dscp, sizeof(int)) < 0) {
741 				UNEXPECTED_ERROR(__FILE__, __LINE__,
742 						 "setsockopt(%d, IPV6_TCLASS, "
743 						 "%.02x) %s: %s",
744 						 sock->fd, dscp >> 2,
745 						 "failed", strerror(errno));
746 			} else
747 				sock->dscp = dscp;
748 		}
749 
750 		if (msg->msg_controllen != 0 &&
751 		    msg->msg_controllen < SENDCMSGBUFLEN)
752 		{
753 			memset(cmsgbuf + msg->msg_controllen, 0,
754 			       SENDCMSGBUFLEN - msg->msg_controllen);
755 		}
756 	}
757 
758 	if (write_countp != NULL)
759 		*write_countp = write_count;
760 }
761 
762 /*
763  * Construct an iov array and attach it to the msghdr passed in.  This is
764  * the RECV constructor, which will use the available region of the buffer
765  * (if using a buffer list) or will use the internal region (if a single
766  * buffer I/O is requested).
767  *
768  * Nothing can be NULL, and the done event must list at least one buffer
769  * on the buffer linked list for this function to be meaningful.
770  *
771  * If read_countp != NULL, *read_countp will hold the number of bytes
772  * this transaction can receive.
773  */
774 static void
775 build_msghdr_recv(isc__socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
776 		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
777 {
778 	unsigned int iovcount;
779 	isc_buffer_t *buffer;
780 	isc_region_t available;
781 	size_t read_count;
782 
783 	memset(msg, 0, sizeof(struct msghdr));
784 
785 	if (sock->type == isc_sockettype_udp) {
786 		memset(&dev->address, 0, sizeof(dev->address));
787 		msg->msg_name = (void *)&dev->address.type.sa;
788 		msg->msg_namelen = sizeof(dev->address.type);
789 	} else { /* TCP */
790 		msg->msg_name = NULL;
791 		msg->msg_namelen = 0;
792 		dev->address = sock->peer_address;
793 	}
794 
795 	buffer = ISC_LIST_HEAD(dev->bufferlist);
796 	read_count = 0;
797 
798 	/*
799 	 * Single buffer I/O?  Skip what we've done so far in this region.
800 	 */
801 	if (buffer == NULL) {
802 		read_count = dev->region.length - dev->n;
803 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
804 		iov[0].iov_len = read_count;
805 		iovcount = 1;
806 
807 		goto config;
808 	}
809 
810 	/*
811 	 * Multibuffer I/O.
812 	 * Skip empty buffers.
813 	 */
814 	while (buffer != NULL) {
815 		REQUIRE(ISC_BUFFER_VALID(buffer));
816 		if (isc_buffer_availablelength(buffer) != 0)
817 			break;
818 		buffer = ISC_LIST_NEXT(buffer, link);
819 	}
820 
821 	iovcount = 0;
822 	while (buffer != NULL) {
823 		INSIST(iovcount < MAXSCATTERGATHER_RECV);
824 
825 		isc_buffer_availableregion(buffer, &available);
826 
827 		if (available.length > 0) {
828 			iov[iovcount].iov_base = (void *)(available.base);
829 			iov[iovcount].iov_len = available.length;
830 			read_count += available.length;
831 			iovcount++;
832 		}
833 		buffer = ISC_LIST_NEXT(buffer, link);
834 	}
835 
836  config:
837 
838 	/*
839 	 * If needed, set up to receive that one extra byte.
840 	 */
841 	msg->msg_iov = iov;
842 	msg->msg_iovlen = iovcount;
843 
844 	msg->msg_control = cmsgbuf;
845 	msg->msg_controllen = RECVCMSGBUFLEN;
846 	msg->msg_flags = 0;
847 
848 	if (read_countp != NULL)
849 		*read_countp = read_count;
850 }
851 
852 static void
853 set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock,
854 		isc_socketevent_t *dev)
855 {
856 	if (sock->type == isc_sockettype_udp) {
857 		if (address != NULL)
858 			dev->address = *address;
859 		else
860 			dev->address = sock->peer_address;
861 	} else if (sock->type == isc_sockettype_tcp) {
862 		INSIST(address == NULL);
863 		dev->address = sock->peer_address;
864 	}
865 }
866 
867 static void
868 destroy_socketevent(isc_event_t *event) {
869 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
870 
871 	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
872 
873 	(ev->destroy)(event);
874 }
875 
876 static isc_socketevent_t *
877 allocate_socketevent(void *sender,
878 		     isc_eventtype_t eventtype, isc_taskaction_t action,
879 		     void *arg)
880 {
881 	isc_socketevent_t *ev;
882 
883 	ev = (isc_socketevent_t *)isc_event_allocate(sender,
884 						     eventtype, action, arg,
885 						     sizeof(*ev));
886 
887 	if (ev == NULL)
888 		return (NULL);
889 
890 	ev->result = ISC_R_UNSET;
891 	ISC_LINK_INIT(ev, ev_link);
892 	ISC_LIST_INIT(ev->bufferlist);
893 	ev->region.base = NULL;
894 	ev->n = 0;
895 	ev->offset = 0;
896 	ev->attributes = 0;
897 	ev->destroy = ev->ev_destroy;
898 	ev->ev_destroy = destroy_socketevent;
899 	ev->dscp = 0;
900 
901 	return (ev);
902 }
903 
904 #define DOIO_SUCCESS		0	/* i/o ok, event sent */
905 #define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
906 #define DOIO_HARD		2	/* i/o error, event sent */
907 #define DOIO_EOF		3	/* EOF, no event sent */
908 
909 static int
910 doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
911 	int cc;
912 	struct iovec iov[MAXSCATTERGATHER_RECV];
913 	size_t read_count;
914 	size_t actual_count;
915 	struct msghdr msghdr;
916 	isc_buffer_t *buffer;
917 	int recv_errno;
918 	char cmsgbuf[RECVCMSGBUFLEN] = {0};
919 
920 	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);
921 
922 	cc = recvmsg(sock->fd, &msghdr, 0);
923 	recv_errno = errno;
924 
925 	if (cc < 0) {
926 		if (SOFT_ERROR(recv_errno))
927 			return (DOIO_SOFT);
928 
929 		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
930 			socket_log(sock, NULL, IOEVENT,
931 				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
932 				   sock->fd, cc, recv_errno,
933 				   strerror(recv_errno));
934 		}
935 
936 #define SOFT_OR_HARD(_system, _isc) \
937 	if (recv_errno == _system) { \
938 		if (sock->connected) { \
939 			dev->result = _isc; \
940 			return (DOIO_HARD); \
941 		} \
942 		return (DOIO_SOFT); \
943 	}
944 #define ALWAYS_HARD(_system, _isc) \
945 	if (recv_errno == _system) { \
946 		dev->result = _isc; \
947 		return (DOIO_HARD); \
948 	}
949 
950 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
951 		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
952 		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
953 		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
954 		/* HPUX 11.11 can return EADDRNOTAVAIL. */
955 		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
956 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
957 		/* Should never get this one but it was seen. */
958 		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
959 		/*
960 		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
961 		 * errors.
962 		 */
963 		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
964 		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
965 
966 #undef SOFT_OR_HARD
967 #undef ALWAYS_HARD
968 
969 		dev->result = isc__errno2result(recv_errno);
970 		return (DOIO_HARD);
971 	}
972 
973 	/*
974 	 * On TCP and UNIX sockets, zero length reads indicate EOF,
975 	 * while on UDP sockets, zero length reads are perfectly valid,
976 	 * although strange.
977 	 */
978 	switch (sock->type) {
979 	case isc_sockettype_tcp:
980 		if (cc == 0)
981 			return (DOIO_EOF);
982 		break;
983 	case isc_sockettype_udp:
984 		break;
985 	default:
986 		INSIST(0);
987 	}
988 
989 	if (sock->type == isc_sockettype_udp) {
990 		dev->address.length = msghdr.msg_namelen;
991 		if (isc_sockaddr_getport(&dev->address) == 0) {
992 			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
993 				socket_log(sock, &dev->address, IOEVENT,
994 					   "dropping source port zero packet");
995 			}
996 			return (DOIO_SOFT);
997 		}
998 	}
999 
1000 	socket_log(sock, &dev->address, IOEVENT,
1001 		   "packet received correctly");
1002 
1003 	/*
1004 	 * Overflow bit detection.  If we received MORE bytes than we should,
1005 	 * this indicates an overflow situation.  Set the flag in the
1006 	 * dev entry and adjust how much we read by one.
1007 	 */
1008 	/*
1009 	 * If there are control messages attached, run through them and pull
1010 	 * out the interesting bits.
1011 	 */
1012 	process_cmsg(sock, &msghdr, dev);
1013 
1014 	/*
1015 	 * update the buffers (if any) and the i/o count
1016 	 */
1017 	dev->n += cc;
1018 	actual_count = cc;
1019 	buffer = ISC_LIST_HEAD(dev->bufferlist);
1020 	while (buffer != NULL && actual_count > 0U) {
1021 		REQUIRE(ISC_BUFFER_VALID(buffer));
1022 		if (isc_buffer_availablelength(buffer) <= actual_count) {
1023 			actual_count -= isc_buffer_availablelength(buffer);
1024 			isc_buffer_add(buffer,
1025 				       isc_buffer_availablelength(buffer));
1026 		} else {
1027 			isc_buffer_add(buffer, actual_count);
1028 			actual_count = 0;
1029 			POST(actual_count);
1030 			break;
1031 		}
1032 		buffer = ISC_LIST_NEXT(buffer, link);
1033 		if (buffer == NULL) {
1034 			INSIST(actual_count == 0U);
1035 		}
1036 	}
1037 
1038 	/*
1039 	 * If we read less than we expected, update counters,
1040 	 * and let the upper layer poke the descriptor.
1041 	 */
1042 	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1043 		return (DOIO_SOFT);
1044 
1045 	/*
1046 	 * Full reads are posted, or partials if partials are ok.
1047 	 */
1048 	dev->result = ISC_R_SUCCESS;
1049 	return (DOIO_SUCCESS);
1050 }
1051 
1052 /*
1053  * Returns:
1054  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1055  *			ISC_R_SUCCESS.
1056  *
1057  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1058  *			dev->result contains the appropriate error.
1059  *
1060  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1061  *			event was sent.  The operation should be retried.
1062  *
1063  *	No other return values are possible.
1064  */
1065 static int
1066 doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
1067 	int cc;
1068 	struct iovec iov[MAXSCATTERGATHER_SEND];
1069 	size_t write_count;
1070 	struct msghdr msghdr;
1071 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1072 	int attempts = 0;
1073 	int send_errno;
1074 	char cmsgbuf[SENDCMSGBUFLEN] = {0};
1075 
1076 	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
1077 
1078  resend:
1079 	cc = sendmsg(sock->fd, &msghdr, 0);
1080 	send_errno = errno;
1081 
1082 	/*
1083 	 * Check for error or block condition.
1084 	 */
1085 	if (cc < 0) {
1086 		if (send_errno == EINTR && ++attempts < NRETRIES)
1087 			goto resend;
1088 
1089 		if (SOFT_ERROR(send_errno)) {
1090 			if (errno == EWOULDBLOCK || errno == EAGAIN)
1091 				dev->result = ISC_R_WOULDBLOCK;
1092 			return (DOIO_SOFT);
1093 		}
1094 
1095 #define SOFT_OR_HARD(_system, _isc) \
1096 	if (send_errno == _system) { \
1097 		if (sock->connected) { \
1098 			dev->result = _isc; \
1099 			return (DOIO_HARD); \
1100 		} \
1101 		return (DOIO_SOFT); \
1102 	}
1103 #define ALWAYS_HARD(_system, _isc) \
1104 	if (send_errno == _system) { \
1105 		dev->result = _isc; \
1106 		return (DOIO_HARD); \
1107 	}
1108 
1109 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1110 		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1111 		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1112 		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1113 		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1114 		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1115 		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1116 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1117 		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1118 		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1119 		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1120 
1121 #undef SOFT_OR_HARD
1122 #undef ALWAYS_HARD
1123 
1124 		/*
1125 		 * The other error types depend on whether or not the
1126 		 * socket is UDP or TCP.  If it is UDP, some errors
1127 		 * that we expect to be fatal under TCP are merely
1128 		 * annoying, and are really soft errors.
1129 		 *
1130 		 * However, these soft errors are still returned as
1131 		 * a status.
1132 		 */
1133 		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1134 		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1135 				 addrbuf, strerror(send_errno));
1136 		dev->result = isc__errno2result(send_errno);
1137 		return (DOIO_HARD);
1138 	}
1139 
1140 	if (cc == 0) {
1141 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1142 				 "doio_send: send() %s 0", "returned");
1143 	}
1144 
1145 	/*
1146 	 * If we write less than we expected, update counters, poke.
1147 	 */
1148 	dev->n += cc;
1149 	if ((size_t)cc != write_count)
1150 		return (DOIO_SOFT);
1151 
1152 	/*
1153 	 * Exactly what we wanted to write.  We're done with this
1154 	 * entry.  Post its completion event.
1155 	 */
1156 	dev->result = ISC_R_SUCCESS;
1157 	return (DOIO_SUCCESS);
1158 }
1159 
1160 /*
1161  * Kill.
1162  *
1163  * Caller must ensure that the socket is not locked and no external
1164  * references exist.
1165  */
1166 static void
1167 socketclose(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) {
1168 	/*
1169 	 * No one has this socket open, so the watcher doesn't have to be
1170 	 * poked, and the socket doesn't have to be locked.
1171 	 */
1172 	manager->fds[fd] = NULL;
1173 	manager->fdstate[fd] = CLOSE_PENDING;
1174 	select_poke(manager, fd, SELECT_POKE_CLOSE);
1175 
1176 	if (sock->active == 1) {
1177 		sock->active = 0;
1178 	}
1179 
1180 	/*
1181 	 * update manager->maxfd here (XXX: this should be implemented more
1182 	 * efficiently)
1183 	 */
1184 	if (manager->maxfd == fd) {
1185 		int i;
1186 
1187 		manager->maxfd = 0;
1188 		for (i = fd - 1; i >= 0; i--) {
1189 			if (manager->fdstate[i] == MANAGED) {
1190 				manager->maxfd = i;
1191 				break;
1192 			}
1193 		}
1194 	}
1195 
1196 }
1197 
1198 static void
1199 destroy(isc__socket_t **sockp) {
1200 	int fd;
1201 	isc__socket_t *sock = *sockp;
1202 	isc__socketmgr_t *manager = sock->manager;
1203 
1204 	socket_log(sock, NULL, CREATION, "destroying");
1205 
1206 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1207 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1208 	INSIST(sock->connect_ev == NULL);
1209 	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1210 
1211 	if (sock->fd >= 0) {
1212 		fd = sock->fd;
1213 		sock->fd = -1;
1214 		socketclose(manager, sock, fd);
1215 	}
1216 
1217 	ISC_LIST_UNLINK(manager->socklist, sock, link);
1218 
1219 	/* can't unlock manager as its memory context is still used */
1220 	free_socket(sockp);
1221 }
1222 
1223 static isc_result_t
1224 allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
1225 		isc__socket_t **socketp)
1226 {
1227 	isc__socket_t *sock;
1228 
1229 	sock = malloc(sizeof(*sock));
1230 
1231 	if (sock == NULL)
1232 		return (ISC_R_NOMEMORY);
1233 
1234 	sock->common.magic = 0;
1235 	sock->common.impmagic = 0;
1236 	sock->references = 0;
1237 
1238 	sock->manager = manager;
1239 	sock->type = type;
1240 	sock->fd = -1;
1241 	sock->dscp = 0;		/* TOS/TCLASS is zero until set. */
1242 	sock->active = 0;
1243 
1244 	ISC_LINK_INIT(sock, link);
1245 
1246 	/*
1247 	 * Set up list of readers and writers to be initially empty.
1248 	 */
1249 	ISC_LIST_INIT(sock->recv_list);
1250 	ISC_LIST_INIT(sock->send_list);
1251 	sock->connect_ev = NULL;
1252 	sock->pending_recv = 0;
1253 	sock->pending_send = 0;
1254 	sock->connected = 0;
1255 	sock->connecting = 0;
1256 	sock->bound = 0;
1257 	sock->pktdscp = 0;
1258 
1259 	/*
1260 	 * Initialize readable and writable events.
1261 	 */
1262 	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1263 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1264 		       NULL, sock, sock, NULL);
1265 	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1266 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1267 		       NULL, sock, sock, NULL);
1268 
1269 	sock->common.magic = ISCAPI_SOCKET_MAGIC;
1270 	sock->common.impmagic = SOCKET_MAGIC;
1271 	*socketp = sock;
1272 
1273 	return (ISC_R_SUCCESS);
1274 }
1275 
1276 /*
1277  * This event requires that the various lists be empty, that the reference
1278  * count be 1, and that the magic number is valid.  The other socket bits,
1279  * like the lock, must be initialized as well.  The fd associated must be
1280  * marked as closed, by setting it to -1 on close, or this routine will
1281  * also close the socket.
1282  */
1283 static void
1284 free_socket(isc__socket_t **socketp) {
1285 	isc__socket_t *sock = *socketp;
1286 
1287 	INSIST(VALID_SOCKET(sock));
1288 	INSIST(sock->references == 0);
1289 	INSIST(!sock->connecting);
1290 	INSIST(!sock->pending_recv);
1291 	INSIST(!sock->pending_send);
1292 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1293 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1294 	INSIST(!ISC_LINK_LINKED(sock, link));
1295 
1296 	sock->common.magic = 0;
1297 	sock->common.impmagic = 0;
1298 
1299 	free(sock);
1300 
1301 	*socketp = NULL;
1302 }
1303 
1304 static void
1305 use_min_mtu(isc__socket_t *sock) {
1306 	/* use minimum MTU */
1307 	if (sock->pf == AF_INET6) {
1308 		int on = 1;
1309 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
1310 				(void *)&on, sizeof(on));
1311 	}
1312 }
1313 
1314 static void
1315 set_tcp_maxseg(isc__socket_t *sock, int size) {
1316 	if (sock->type == isc_sockettype_tcp)
1317 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
1318 				(void *)&size, sizeof(size));
1319 }
1320 
1321 static isc_result_t
1322 opensocket(isc__socket_t *sock)
1323 {
1324 	isc_result_t result;
1325 	const char *err = "socket";
1326 	int on = 1;
1327 
1328 	switch (sock->type) {
1329 	case isc_sockettype_udp:
1330 		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
1331 		break;
1332 	case isc_sockettype_tcp:
1333 		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
1334 		break;
1335 	}
1336 
1337 	if (sock->fd < 0) {
1338 		switch (errno) {
1339 		case EMFILE:
1340 		case ENFILE:
1341 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1342 				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1343 				       "%s: %s", err, strerror(errno));
1344 			/* fallthrough */
1345 		case ENOBUFS:
1346 			return (ISC_R_NORESOURCES);
1347 
1348 		case EPROTONOSUPPORT:
1349 		case EPFNOSUPPORT:
1350 		case EAFNOSUPPORT:
1351 		/*
1352 		 * Linux 2.2 (and maybe others) return EINVAL instead of
1353 		 * EAFNOSUPPORT.
1354 		 */
1355 		case EINVAL:
1356 			return (ISC_R_FAMILYNOSUPPORT);
1357 
1358 		default:
1359 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1360 					 "%s() %s: %s", err, "failed",
1361 					 strerror(errno));
1362 			return (ISC_R_UNEXPECTED);
1363 		}
1364 	}
1365 
1366 	result = make_nonblock(sock->fd);
1367 	if (result != ISC_R_SUCCESS) {
1368 		(void)close(sock->fd);
1369 		return (result);
1370 	}
1371 
1372 	/*
1373 	 * Use minimum mtu if possible.
1374 	 */
1375 	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
1376 		use_min_mtu(sock);
1377 		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
1378 	}
1379 
1380 	if (sock->type == isc_sockettype_udp) {
1381 
1382 		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1383 			       (void *)&on, sizeof(on)) < 0
1384 		    && errno != ENOPROTOOPT) {
1385 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1386 					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1387 					 sock->fd, "failed", strerror(errno));
1388 			/* Press on... */
1389 		}
1390 
1391 		/* RFC 3542 */
1392 		if ((sock->pf == AF_INET6)
1393 		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1394 				   (void *)&on, sizeof(on)) < 0)) {
1395 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1396 					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1397 					 "%s: %s", sock->fd, "failed",
1398 					 strerror(errno));
1399 		}
1400 	}
1401 
1402 	if (sock->active == 0) {
1403 		sock->active = 1;
1404 	}
1405 
1406 	return (ISC_R_SUCCESS);
1407 }
1408 
1409 /*
1410  * Create a 'type' socket managed
1411  * by 'manager'.  Events will be posted to 'task' and when dispatched
1412  * 'action' will be called with 'arg' as the arg value.  The new
1413  * socket is returned in 'socketp'.
1414  */
1415 static isc_result_t
1416 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1417 	      isc_socket_t **socketp)
1418 {
1419 	isc__socket_t *sock = NULL;
1420 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
1421 	isc_result_t result;
1422 
1423 	REQUIRE(VALID_MANAGER(manager));
1424 	REQUIRE(socketp != NULL && *socketp == NULL);
1425 
1426 	result = allocate_socket(manager, type, &sock);
1427 	if (result != ISC_R_SUCCESS)
1428 		return (result);
1429 
1430 	switch (sock->type) {
1431 	case isc_sockettype_udp:
1432 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
1433 		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
1434 		break;
1435 	case isc_sockettype_tcp:
1436 		break;
1437 	default:
1438 		INSIST(0);
1439 	}
1440 
1441 	sock->pf = pf;
1442 
1443 	result = opensocket(sock);
1444 	if (result != ISC_R_SUCCESS) {
1445 		free_socket(&sock);
1446 		return (result);
1447 	}
1448 
1449 	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
1450 	sock->references = 1;
1451 	*socketp = (isc_socket_t *)sock;
1452 
1453 	/*
1454 	 * Note we don't have to lock the socket like we normally would because
1455 	 * there are no external references to it yet.
1456 	 */
1457 
1458 	manager->fds[sock->fd] = sock;
1459 	manager->fdstate[sock->fd] = MANAGED;
1460 
1461 	ISC_LIST_APPEND(manager->socklist, sock, link);
1462 	if (manager->maxfd < sock->fd)
1463 		manager->maxfd = sock->fd;
1464 
1465 	socket_log(sock, NULL, CREATION, "created");
1466 
1467 	return (ISC_R_SUCCESS);
1468 }
1469 
1470 /*%
1471  * Create a new 'type' socket managed by 'manager'.  Events
1472  * will be posted to 'task' and when dispatched 'action' will be
1473  * called with 'arg' as the arg value.  The new socket is returned
1474  * in 'socketp'.
1475  */
1476 isc_result_t
1477 isc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1478 		   isc_socket_t **socketp)
1479 {
1480 	return (socket_create(manager0, pf, type, socketp));
1481 }
1482 
1483 /*
1484  * Attach to a socket.  Caller must explicitly detach when it is done.
1485  */
1486 void
1487 isc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
1488 	isc__socket_t *sock = (isc__socket_t *)sock0;
1489 
1490 	REQUIRE(VALID_SOCKET(sock));
1491 	REQUIRE(socketp != NULL && *socketp == NULL);
1492 
1493 	sock->references++;
1494 
1495 	*socketp = (isc_socket_t *)sock;
1496 }
1497 
1498 /*
1499  * Dereference a socket.  If this is the last reference to it, clean things
1500  * up by destroying the socket.
1501  */
1502 void
1503 isc__socket_detach(isc_socket_t **socketp) {
1504 	isc__socket_t *sock;
1505 	isc_boolean_t kill_socket = ISC_FALSE;
1506 
1507 	REQUIRE(socketp != NULL);
1508 	sock = (isc__socket_t *)*socketp;
1509 	REQUIRE(VALID_SOCKET(sock));
1510 
1511 	REQUIRE(sock->references > 0);
1512 	sock->references--;
1513 	if (sock->references == 0)
1514 		kill_socket = ISC_TRUE;
1515 
1516 	if (kill_socket)
1517 		destroy(&sock);
1518 
1519 	*socketp = NULL;
1520 }
1521 
1522 /*
1523  * I/O is possible on a given socket.  Schedule an event to this task that
1524  * will call an internal function to do the I/O.  This will charge the
1525  * task with the I/O operation and let our select loop handler get back
1526  * to doing something real as fast as possible.
1527  *
1528  * The socket and manager must be locked before calling this function.
1529  */
1530 static void
1531 dispatch_recv(isc__socket_t *sock) {
1532 	intev_t *iev;
1533 	isc_socketevent_t *ev;
1534 	isc_task_t *sender;
1535 
1536 	INSIST(!sock->pending_recv);
1537 
1538 	ev = ISC_LIST_HEAD(sock->recv_list);
1539 	if (ev == NULL)
1540 		return;
1541 	socket_log(sock, NULL, EVENT, NULL, 0, 0,
1542 		   "dispatch_recv:  event %p -> task %p",
1543 		   ev, ev->ev_sender);
1544 	sender = ev->ev_sender;
1545 
1546 	sock->pending_recv = 1;
1547 	iev = &sock->readable_ev;
1548 
1549 	sock->references++;
1550 	iev->ev_sender = sock;
1551 	iev->ev_action = internal_recv;
1552 	iev->ev_arg = sock;
1553 
1554 	isc_task_send(sender, (isc_event_t **)&iev);
1555 }
1556 
1557 static void
1558 dispatch_send(isc__socket_t *sock) {
1559 	intev_t *iev;
1560 	isc_socketevent_t *ev;
1561 	isc_task_t *sender;
1562 
1563 	INSIST(!sock->pending_send);
1564 
1565 	ev = ISC_LIST_HEAD(sock->send_list);
1566 	if (ev == NULL)
1567 		return;
1568 	socket_log(sock, NULL, EVENT, NULL, 0, 0,
1569 		   "dispatch_send:  event %p -> task %p",
1570 		   ev, ev->ev_sender);
1571 	sender = ev->ev_sender;
1572 
1573 	sock->pending_send = 1;
1574 	iev = &sock->writable_ev;
1575 
1576 	sock->references++;
1577 	iev->ev_sender = sock;
1578 	iev->ev_action = internal_send;
1579 	iev->ev_arg = sock;
1580 
1581 	isc_task_send(sender, (isc_event_t **)&iev);
1582 }
1583 
1584 static void
1585 dispatch_connect(isc__socket_t *sock) {
1586 	intev_t *iev;
1587 	isc_socket_connev_t *ev;
1588 
1589 	iev = &sock->writable_ev;
1590 
1591 	ev = sock->connect_ev;
1592 	INSIST(ev != NULL); /* XXX */
1593 
1594 	INSIST(sock->connecting);
1595 
1596 	sock->references++;  /* keep socket around for this internal event */
1597 	iev->ev_sender = sock;
1598 	iev->ev_action = internal_connect;
1599 	iev->ev_arg = sock;
1600 
1601 	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1602 }
1603 
1604 /*
1605  * Dequeue an item off the given socket's read queue, set the result code
1606  * in the done event to the one provided, and send it to the task it was
1607  * destined for.
1608  *
1609  * If the event to be sent is on a list, remove it before sending.  If
1610  * asked to, send and detach from the socket as well.
1611  *
1612  * Caller must have the socket locked if the event is attached to the socket.
1613  */
1614 static void
1615 send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
1616 	isc_task_t *task;
1617 
1618 	task = (*dev)->ev_sender;
1619 
1620 	(*dev)->ev_sender = sock;
1621 
1622 	if (ISC_LINK_LINKED(*dev, ev_link))
1623 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1624 
1625 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1626 	    == ISC_SOCKEVENTATTR_ATTACHED)
1627 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1628 	else
1629 		isc_task_send(task, (isc_event_t **)dev);
1630 }
1631 
1632 /*
1633  * See comments for send_recvdone_event() above.
1634  *
1635  * Caller must have the socket locked if the event is attached to the socket.
1636  */
1637 static void
1638 send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
1639 	isc_task_t *task;
1640 
1641 	INSIST(dev != NULL && *dev != NULL);
1642 
1643 	task = (*dev)->ev_sender;
1644 	(*dev)->ev_sender = sock;
1645 
1646 	if (ISC_LINK_LINKED(*dev, ev_link))
1647 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1648 
1649 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1650 	    == ISC_SOCKEVENTATTR_ATTACHED)
1651 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1652 	else
1653 		isc_task_send(task, (isc_event_t **)dev);
1654 }
1655 
1656 static void
1657 internal_recv(isc_task_t *me, isc_event_t *ev) {
1658 	isc_socketevent_t *dev;
1659 	isc__socket_t *sock;
1660 
1661 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
1662 
1663 	sock = ev->ev_sender;
1664 	INSIST(VALID_SOCKET(sock));
1665 
1666 	socket_log(sock, NULL, IOEVENT,
1667 		   "internal_recv: task %p got event %p", me, ev);
1668 
1669 	INSIST(sock->pending_recv == 1);
1670 	sock->pending_recv = 0;
1671 
1672 	INSIST(sock->references > 0);
1673 	sock->references--;  /* the internal event is done with this socket */
1674 	if (sock->references == 0) {
1675 		destroy(&sock);
1676 		return;
1677 	}
1678 
1679 	/*
1680 	 * Try to do as much I/O as possible on this socket.  There are no
1681 	 * limits here, currently.
1682 	 */
1683 	dev = ISC_LIST_HEAD(sock->recv_list);
1684 	while (dev != NULL) {
1685 		switch (doio_recv(sock, dev)) {
1686 		case DOIO_SOFT:
1687 			goto poke;
1688 
1689 		case DOIO_EOF:
1690 			/*
1691 			 * read of 0 means the remote end was closed.
1692 			 * Run through the event queue and dispatch all
1693 			 * the events with an EOF result code.
1694 			 */
1695 			do {
1696 				dev->result = ISC_R_EOF;
1697 				send_recvdone_event(sock, &dev);
1698 				dev = ISC_LIST_HEAD(sock->recv_list);
1699 			} while (dev != NULL);
1700 			goto poke;
1701 
1702 		case DOIO_SUCCESS:
1703 		case DOIO_HARD:
1704 			send_recvdone_event(sock, &dev);
1705 			break;
1706 		}
1707 
1708 		dev = ISC_LIST_HEAD(sock->recv_list);
1709 	}
1710 
1711  poke:
1712 	if (!ISC_LIST_EMPTY(sock->recv_list))
1713 		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1714 }
1715 
1716 static void
1717 internal_send(isc_task_t *me, isc_event_t *ev) {
1718 	isc_socketevent_t *dev;
1719 	isc__socket_t *sock;
1720 
1721 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
1722 
1723 	/*
1724 	 * Find out what socket this is and lock it.
1725 	 */
1726 	sock = (isc__socket_t *)ev->ev_sender;
1727 	INSIST(VALID_SOCKET(sock));
1728 	socket_log(sock, NULL, IOEVENT,
1729 		   "internal_send: task %p got event %p", me, ev);
1730 
1731 	INSIST(sock->pending_send == 1);
1732 	sock->pending_send = 0;
1733 
1734 	INSIST(sock->references > 0);
1735 	sock->references--;  /* the internal event is done with this socket */
1736 	if (sock->references == 0) {
1737 		destroy(&sock);
1738 		return;
1739 	}
1740 
1741 	/*
1742 	 * Try to do as much I/O as possible on this socket.  There are no
1743 	 * limits here, currently.
1744 	 */
1745 	dev = ISC_LIST_HEAD(sock->send_list);
1746 	while (dev != NULL) {
1747 		switch (doio_send(sock, dev)) {
1748 		case DOIO_SOFT:
1749 			goto poke;
1750 
1751 		case DOIO_HARD:
1752 		case DOIO_SUCCESS:
1753 			send_senddone_event(sock, &dev);
1754 			break;
1755 		}
1756 
1757 		dev = ISC_LIST_HEAD(sock->send_list);
1758 	}
1759 
1760  poke:
1761 	if (!ISC_LIST_EMPTY(sock->send_list))
1762 		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
1763 }
1764 
1765 /*
1766  * Process read/writes on each fd here.  Avoid locking
1767  * and unlocking twice if both reads and writes are possible.
1768  */
1769 static void
1770 process_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable,
1771 	   isc_boolean_t writeable)
1772 {
1773 	isc__socket_t *sock;
1774 	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
1775 
1776 	/*
1777 	 * If the socket is going to be closed, don't do more I/O.
1778 	 */
1779 	if (manager->fdstate[fd] == CLOSE_PENDING) {
1780 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1781 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1782 		return;
1783 	}
1784 
1785 	sock = manager->fds[fd];
1786 	if (readable) {
1787 		if (sock == NULL) {
1788 			unwatch_read = ISC_TRUE;
1789 			goto check_write;
1790 		}
1791 		if (!SOCK_DEAD(sock)) {
1792 			dispatch_recv(sock);
1793 		}
1794 		unwatch_read = ISC_TRUE;
1795 	}
1796 check_write:
1797 	if (writeable) {
1798 		if (sock == NULL) {
1799 			unwatch_write = ISC_TRUE;
1800 			goto unlock_fd;
1801 		}
1802 		if (!SOCK_DEAD(sock)) {
1803 			if (sock->connecting)
1804 				dispatch_connect(sock);
1805 			else
1806 				dispatch_send(sock);
1807 		}
1808 		unwatch_write = ISC_TRUE;
1809 	}
1810 
1811  unlock_fd:
1812 	if (unwatch_read)
1813 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1814 	if (unwatch_write)
1815 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1816 
1817 }
1818 
1819 static void
1820 process_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds,
1821 	    fd_set *writefds)
1822 {
1823 	int i;
1824 
1825 	REQUIRE(maxfd <= (int)manager->maxsocks);
1826 
1827 	for (i = 0; i < maxfd; i++) {
1828 		process_fd(manager, i, FD_ISSET(i, readfds),
1829 			   FD_ISSET(i, writefds));
1830 	}
1831 }
1832 
1833 /*
1834  * Create a new socket manager.
1835  */
1836 
1837 static isc_result_t
1838 setup_watcher(isc__socketmgr_t *manager) {
1839 	isc_result_t result;
1840 
1841 	UNUSED(result);
1842 
1843 	manager->fd_bufsize = sizeof(fd_set);
1844 
1845 	manager->read_fds = NULL;
1846 	manager->read_fds_copy = NULL;
1847 	manager->write_fds = NULL;
1848 	manager->write_fds_copy = NULL;
1849 
1850 	manager->read_fds = malloc(manager->fd_bufsize);
1851 	if (manager->read_fds != NULL)
1852 		manager->read_fds_copy = malloc(manager->fd_bufsize);
1853 	if (manager->read_fds_copy != NULL)
1854 		manager->write_fds = malloc(manager->fd_bufsize);
1855 	if (manager->write_fds != NULL) {
1856 		manager->write_fds_copy = malloc(manager->fd_bufsize);
1857 	}
1858 	if (manager->write_fds_copy == NULL) {
1859 		if (manager->write_fds != NULL) {
1860 			free(manager->write_fds);
1861 		}
1862 		if (manager->read_fds_copy != NULL) {
1863 			free(manager->read_fds_copy);
1864 		}
1865 		if (manager->read_fds != NULL) {
1866 			free(manager->read_fds);
1867 		}
1868 		return (ISC_R_NOMEMORY);
1869 	}
1870 	memset(manager->read_fds, 0, manager->fd_bufsize);
1871 	memset(manager->write_fds, 0, manager->fd_bufsize);
1872 
1873 	manager->maxfd = 0;
1874 
1875 	return (ISC_R_SUCCESS);
1876 }
1877 
1878 static void
1879 cleanup_watcher(isc__socketmgr_t *manager) {
1880 
1881 	if (manager->read_fds != NULL)
1882 		free(manager->read_fds);
1883 	if (manager->read_fds_copy != NULL)
1884 		free(manager->read_fds_copy);
1885 	if (manager->write_fds != NULL)
1886 		free(manager->write_fds);
1887 	if (manager->write_fds_copy != NULL)
1888 		free(manager->write_fds_copy);
1889 }
1890 
1891 isc_result_t
1892 isc__socketmgr_create(isc_socketmgr_t **managerp) {
1893 	return (isc__socketmgr_create2(managerp, 0));
1894 }
1895 
1896 isc_result_t
1897 isc__socketmgr_create2(isc_socketmgr_t **managerp,
1898 		       unsigned int maxsocks)
1899 {
1900 	isc__socketmgr_t *manager;
1901 	isc_result_t result;
1902 
1903 	REQUIRE(managerp != NULL && *managerp == NULL);
1904 
1905 	if (socketmgr != NULL) {
1906 		/* Don't allow maxsocks to be updated */
1907 		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
1908 			return (ISC_R_EXISTS);
1909 
1910 		socketmgr->refs++;
1911 		*managerp = (isc_socketmgr_t *)socketmgr;
1912 		return (ISC_R_SUCCESS);
1913 	}
1914 
1915 	if (maxsocks == 0)
1916 		maxsocks = FD_SETSIZE;
1917 
1918 	manager = malloc(sizeof(*manager));
1919 	if (manager == NULL)
1920 		return (ISC_R_NOMEMORY);
1921 
1922 	/* zero-clear so that necessary cleanup on failure will be easy */
1923 	memset(manager, 0, sizeof(*manager));
1924 	manager->maxsocks = maxsocks;
1925 	manager->fds = malloc(manager->maxsocks * sizeof(isc__socket_t *));
1926 	if (manager->fds == NULL) {
1927 		result = ISC_R_NOMEMORY;
1928 		goto free_manager;
1929 	}
1930 	manager->fdstate = malloc(manager->maxsocks * sizeof(int));
1931 	if (manager->fdstate == NULL) {
1932 		result = ISC_R_NOMEMORY;
1933 		goto free_manager;
1934 	}
1935 
1936 	manager->common.methods = &socketmgrmethods;
1937 	manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
1938 	manager->common.impmagic = SOCKET_MANAGER_MAGIC;
1939 	memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
1940 	ISC_LIST_INIT(manager->socklist);
1941 
1942 	manager->refs = 1;
1943 
1944 	/*
1945 	 * Set up initial state for the select loop
1946 	 */
1947 	result = setup_watcher(manager);
1948 	if (result != ISC_R_SUCCESS)
1949 		goto cleanup;
1950 
1951 	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
1952 
1953 	socketmgr = manager;
1954 	*managerp = (isc_socketmgr_t *)manager;
1955 
1956 	return (ISC_R_SUCCESS);
1957 
1958 cleanup:
1959 
1960 free_manager:
1961 	if (manager->fdstate != NULL) {
1962 		free(manager->fdstate);
1963 	}
1964 	if (manager->fds != NULL) {
1965 		free(manager->fds);
1966 	}
1967 	free(manager);
1968 
1969 	return (result);
1970 }
1971 
1972 void
1973 isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
1974 	isc__socketmgr_t *manager;
1975 	int i;
1976 
1977 	/*
1978 	 * Destroy a socket manager.
1979 	 */
1980 
1981 	REQUIRE(managerp != NULL);
1982 	manager = (isc__socketmgr_t *)*managerp;
1983 	REQUIRE(VALID_MANAGER(manager));
1984 
1985 	manager->refs--;
1986 	if (manager->refs > 0) {
1987 		*managerp = NULL;
1988 		return;
1989 	}
1990 	socketmgr = NULL;
1991 
1992 	/*
1993 	 * Wait for all sockets to be destroyed.
1994 	 */
1995 	while (!ISC_LIST_EMPTY(manager->socklist)) {
1996 		isc__taskmgr_dispatch(NULL);
1997 	}
1998 
1999 	/*
2000 	 * Here, poke our select/poll thread.  Do this by closing the write
2001 	 * half of the pipe, which will send EOF to the read half.
2002 	 * This is currently a no-op in the non-threaded case.
2003 	 */
2004 	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
2005 
2006 	/*
2007 	 * Clean up.
2008 	 */
2009 	cleanup_watcher(manager);
2010 
2011 	for (i = 0; i < (int)manager->maxsocks; i++)
2012 		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
2013 			(void)close(i);
2014 
2015 	free(manager->fds);
2016 	free(manager->fdstate);
2017 
2018 	manager->common.magic = 0;
2019 	manager->common.impmagic = 0;
2020 	free(manager);
2021 
2022 	*managerp = NULL;
2023 
2024 	socketmgr = NULL;
2025 }
2026 
2027 static isc_result_t
2028 socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2029 	    unsigned int flags)
2030 {
2031 	int io_state;
2032 	isc_task_t *ntask = NULL;
2033 	isc_result_t result = ISC_R_SUCCESS;
2034 
2035 	dev->ev_sender = task;
2036 
2037 	if (sock->type == isc_sockettype_udp) {
2038 		io_state = doio_recv(sock, dev);
2039 	} else {
2040 		if (ISC_LIST_EMPTY(sock->recv_list))
2041 			io_state = doio_recv(sock, dev);
2042 		else
2043 			io_state = DOIO_SOFT;
2044 	}
2045 
2046 	switch (io_state) {
2047 	case DOIO_SOFT:
2048 		/*
2049 		 * We couldn't read all or part of the request right now, so
2050 		 * queue it.
2051 		 *
2052 		 * Attach to socket and to task
2053 		 */
2054 		isc_task_attach(task, &ntask);
2055 		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2056 
2057 		/*
2058 		 * Enqueue the request.  If the socket was previously not being
2059 		 * watched, poke the watcher to start paying attention to it.
2060 		 */
2061 		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
2062 			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2063 		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2064 
2065 		socket_log(sock, NULL, EVENT, NULL, 0, 0,
2066 			   "socket_recv: event %p -> task %p",
2067 			   dev, ntask);
2068 
2069 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2070 			result = ISC_R_INPROGRESS;
2071 		break;
2072 
2073 	case DOIO_EOF:
2074 		dev->result = ISC_R_EOF;
2075 		/* fallthrough */
2076 
2077 	case DOIO_HARD:
2078 	case DOIO_SUCCESS:
2079 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2080 			send_recvdone_event(sock, &dev);
2081 		break;
2082 	}
2083 
2084 	return (result);
2085 }
2086 
2087 isc_result_t
2088 isc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2089 		  unsigned int minimum, isc_task_t *task,
2090 		  isc_taskaction_t action, void *arg)
2091 {
2092 	isc__socket_t *sock = (isc__socket_t *)sock0;
2093 	isc_socketevent_t *dev;
2094 	isc__socketmgr_t *manager;
2095 	unsigned int iocount;
2096 	isc_buffer_t *buffer;
2097 
2098 	REQUIRE(VALID_SOCKET(sock));
2099 	REQUIRE(buflist != NULL);
2100 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2101 	REQUIRE(task != NULL);
2102 	REQUIRE(action != NULL);
2103 
2104 	manager = sock->manager;
2105 	REQUIRE(VALID_MANAGER(manager));
2106 
2107 	iocount = isc_bufferlist_availablecount(buflist);
2108 	REQUIRE(iocount > 0);
2109 
2110 	INSIST(sock->bound);
2111 
2112 	dev = allocate_socketevent(sock,
2113 				   ISC_SOCKEVENT_RECVDONE, action, arg);
2114 	if (dev == NULL)
2115 		return (ISC_R_NOMEMORY);
2116 
2117 	/*
2118 	 * UDP sockets are always partial read
2119 	 */
2120 	if (sock->type == isc_sockettype_udp)
2121 		dev->minimum = 1;
2122 	else {
2123 		if (minimum == 0)
2124 			dev->minimum = iocount;
2125 		else
2126 			dev->minimum = minimum;
2127 	}
2128 
2129 	/*
2130 	 * Move each buffer from the passed in list to our internal one.
2131 	 */
2132 	buffer = ISC_LIST_HEAD(*buflist);
2133 	while (buffer != NULL) {
2134 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2135 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2136 		buffer = ISC_LIST_HEAD(*buflist);
2137 	}
2138 
2139 	return (socket_recv(sock, dev, task, 0));
2140 }
2141 
2142 static isc_result_t
2143 socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2144 	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2145 	    unsigned int flags)
2146 {
2147 	int io_state;
2148 	isc_task_t *ntask = NULL;
2149 	isc_result_t result = ISC_R_SUCCESS;
2150 
2151 	dev->ev_sender = task;
2152 
2153 	set_dev_address(address, sock, dev);
2154 	if (pktinfo != NULL) {
2155 		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2156 		dev->pktinfo = *pktinfo;
2157 
2158 		if (!isc_sockaddr_issitelocal(&dev->address) &&
2159 		    !isc_sockaddr_islinklocal(&dev->address)) {
2160 			socket_log(sock, NULL, TRACE,
2161 				   "pktinfo structure provided, ifindex %u "
2162 				   "(set to 0)", pktinfo->ipi6_ifindex);
2163 
2164 			/*
2165 			 * Set the pktinfo index to 0 here, to let the
2166 			 * kernel decide what interface it should send on.
2167 			 */
2168 			dev->pktinfo.ipi6_ifindex = 0;
2169 		}
2170 	}
2171 
2172 	if (sock->type == isc_sockettype_udp)
2173 		io_state = doio_send(sock, dev);
2174 	else {
2175 		if (ISC_LIST_EMPTY(sock->send_list))
2176 			io_state = doio_send(sock, dev);
2177 		else
2178 			io_state = DOIO_SOFT;
2179 	}
2180 
2181 	switch (io_state) {
2182 	case DOIO_SOFT:
2183 		/*
2184 		 * We couldn't send all or part of the request right now, so
2185 		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2186 		 */
2187 		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2188 			isc_task_attach(task, &ntask);
2189 			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2190 
2191 			/*
2192 			 * Enqueue the request.  If the socket was previously
2193 			 * not being watched, poke the watcher to start
2194 			 * paying attention to it.
2195 			 */
2196 			if (ISC_LIST_EMPTY(sock->send_list) &&
2197 			    !sock->pending_send)
2198 				select_poke(sock->manager, sock->fd,
2199 					    SELECT_POKE_WRITE);
2200 			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2201 
2202 			socket_log(sock, NULL, EVENT, NULL, 0, 0,
2203 				   "socket_send: event %p -> task %p",
2204 				   dev, ntask);
2205 
2206 			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2207 				result = ISC_R_INPROGRESS;
2208 			break;
2209 		}
2210 
2211 		/* FALLTHROUGH */
2212 
2213 	case DOIO_HARD:
2214 	case DOIO_SUCCESS:
2215 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2216 			send_senddone_event(sock, &dev);
2217 		break;
2218 	}
2219 
2220 	return (result);
2221 }
2222 
2223 isc_result_t
2224 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2225 		  isc_task_t *task, isc_taskaction_t action, void *arg)
2226 {
2227 	return (isc__socket_sendtov2(sock, buflist, task, action, arg, NULL,
2228 				     NULL, 0));
2229 }
2230 
2231 isc_result_t
2232 isc__socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2233 		     isc_task_t *task, isc_taskaction_t action, void *arg,
2234 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2235 		     unsigned int flags)
2236 {
2237 	isc__socket_t *sock = (isc__socket_t *)sock0;
2238 	isc_socketevent_t *dev;
2239 	isc__socketmgr_t *manager;
2240 	unsigned int iocount;
2241 	isc_buffer_t *buffer;
2242 
2243 	REQUIRE(VALID_SOCKET(sock));
2244 	REQUIRE(buflist != NULL);
2245 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2246 	REQUIRE(task != NULL);
2247 	REQUIRE(action != NULL);
2248 
2249 	manager = sock->manager;
2250 	REQUIRE(VALID_MANAGER(manager));
2251 
2252 	iocount = isc_bufferlist_usedcount(buflist);
2253 	REQUIRE(iocount > 0);
2254 
2255 	dev = allocate_socketevent(sock,
2256 				   ISC_SOCKEVENT_SENDDONE, action, arg);
2257 	if (dev == NULL)
2258 		return (ISC_R_NOMEMORY);
2259 
2260 	/*
2261 	 * Move each buffer from the passed in list to our internal one.
2262 	 */
2263 	buffer = ISC_LIST_HEAD(*buflist);
2264 	while (buffer != NULL) {
2265 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2266 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2267 		buffer = ISC_LIST_HEAD(*buflist);
2268 	}
2269 
2270 	return (socket_send(sock, dev, task, address, pktinfo, flags));
2271 }
2272 
2273 isc_result_t
2274 isc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
2275 		 unsigned int options) {
2276 	isc__socket_t *sock = (isc__socket_t *)sock0;
2277 	int on = 1;
2278 
2279 	REQUIRE(VALID_SOCKET(sock));
2280 
2281 	INSIST(!sock->bound);
2282 
2283 	if (sock->pf != sockaddr->type.sa.sa_family) {
2284 		return (ISC_R_FAMILYMISMATCH);
2285 	}
2286 
2287 	/*
2288 	 * Only set SO_REUSEADDR when we want a specific port.
2289 	 */
2290 	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
2291 	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
2292 	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
2293 		       sizeof(on)) < 0) {
2294 		UNEXPECTED_ERROR(__FILE__, __LINE__,
2295 				 "setsockopt(%d) %s", sock->fd, "failed");
2296 		/* Press on... */
2297 	}
2298 	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
2299 		switch (errno) {
2300 		case EACCES:
2301 			return (ISC_R_NOPERM);
2302 		case EADDRNOTAVAIL:
2303 			return (ISC_R_ADDRNOTAVAIL);
2304 		case EADDRINUSE:
2305 			return (ISC_R_ADDRINUSE);
2306 		case EINVAL:
2307 			return (ISC_R_BOUND);
2308 		default:
2309 			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
2310 					 strerror(errno));
2311 			return (ISC_R_UNEXPECTED);
2312 		}
2313 	}
2314 
2315 	socket_log(sock, sockaddr, TRACE, "bound");
2316 	sock->bound = 1;
2317 
2318 	return (ISC_R_SUCCESS);
2319 }
2320 
2321 isc_result_t
2322 isc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
2323 		   isc_task_t *task, isc_taskaction_t action, void *arg)
2324 {
2325 	isc__socket_t *sock = (isc__socket_t *)sock0;
2326 	isc_socket_connev_t *dev;
2327 	isc_task_t *ntask = NULL;
2328 	isc__socketmgr_t *manager;
2329 	int cc;
2330 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
2331 
2332 	REQUIRE(VALID_SOCKET(sock));
2333 	REQUIRE(addr != NULL);
2334 	REQUIRE(task != NULL);
2335 	REQUIRE(action != NULL);
2336 
2337 	manager = sock->manager;
2338 	REQUIRE(VALID_MANAGER(manager));
2339 	REQUIRE(addr != NULL);
2340 
2341 	if (isc_sockaddr_ismulticast(addr))
2342 		return (ISC_R_MULTICAST);
2343 
2344 	REQUIRE(!sock->connecting);
2345 
2346 	dev = (isc_socket_connev_t *)isc_event_allocate(sock,
2347 							ISC_SOCKEVENT_CONNECT,
2348 							action,	arg,
2349 							sizeof(*dev));
2350 	if (dev == NULL) {
2351 		return (ISC_R_NOMEMORY);
2352 	}
2353 	ISC_LINK_INIT(dev, ev_link);
2354 
2355 	/*
2356 	 * Try to do the connect right away, as there can be only one
2357 	 * outstanding, and it might happen to complete.
2358 	 */
2359 	sock->peer_address = *addr;
2360 	cc = connect(sock->fd, &addr->type.sa, addr->length);
2361 	if (cc < 0) {
2362 		/*
2363 		 * HP-UX "fails" to connect a UDP socket and sets errno to
2364 		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
2365 		 * a success and let the user detect it if it's really an error
2366 		 * at the time of sending a packet on the socket.
2367 		 */
2368 		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
2369 			cc = 0;
2370 			goto success;
2371 		}
2372 		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
2373 			goto queue;
2374 
2375 		switch (errno) {
2376 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
2377 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2378 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2379 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2380 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2381 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2382 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2383 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2384 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2385 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2386 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2387 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2388 #undef ERROR_MATCH
2389 		}
2390 
2391 		sock->connected = 0;
2392 
2393 		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
2394 		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
2395 				 addrbuf, errno, strerror(errno));
2396 
2397 		isc_event_free(ISC_EVENT_PTR(&dev));
2398 		return (ISC_R_UNEXPECTED);
2399 
2400 	err_exit:
2401 		sock->connected = 0;
2402 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2403 
2404 		return (ISC_R_SUCCESS);
2405 	}
2406 
2407 	/*
2408 	 * If connect completed, fire off the done event.
2409 	 */
2410  success:
2411 	if (cc == 0) {
2412 		sock->connected = 1;
2413 		sock->bound = 1;
2414 		dev->result = ISC_R_SUCCESS;
2415 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2416 
2417 		return (ISC_R_SUCCESS);
2418 	}
2419 
2420  queue:
2421 
2422 	/*
2423 	 * Attach to task.
2424 	 */
2425 	isc_task_attach(task, &ntask);
2426 
2427 	sock->connecting = 1;
2428 
2429 	dev->ev_sender = ntask;
2430 
2431 	/*
2432 	 * Poke watcher here.  We still have the socket locked, so there
2433 	 * is no race condition.  We will keep the lock for such a short
2434 	 * bit of time waking it up now or later won't matter all that much.
2435 	 */
2436 	if (sock->connect_ev == NULL)
2437 		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
2438 
2439 	sock->connect_ev = dev;
2440 
2441 	return (ISC_R_SUCCESS);
2442 }
2443 
2444 /*
2445  * Called when a socket with a pending connect() finishes.
2446  */
2447 static void
2448 internal_connect(isc_task_t *me, isc_event_t *ev) {
2449 	isc__socket_t *sock;
2450 	isc_socket_connev_t *dev;
2451 	isc_task_t *task;
2452 	int cc;
2453 	socklen_t optlen;
2454 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
2455 
2456 	UNUSED(me);
2457 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2458 
2459 	sock = ev->ev_sender;
2460 	INSIST(VALID_SOCKET(sock));
2461 
2462 	/*
2463 	 * When the internal event was sent the reference count was bumped
2464 	 * to keep the socket around for us.  Decrement the count here.
2465 	 */
2466 	INSIST(sock->references > 0);
2467 	sock->references--;
2468 	if (sock->references == 0) {
2469 		destroy(&sock);
2470 		return;
2471 	}
2472 
2473 	/*
2474 	 * Has this event been canceled?
2475 	 */
2476 	dev = sock->connect_ev;
2477 	if (dev == NULL) {
2478 		INSIST(!sock->connecting);
2479 		return;
2480 	}
2481 
2482 	INSIST(sock->connecting);
2483 	sock->connecting = 0;
2484 
2485 	/*
2486 	 * Get any possible error status here.
2487 	 */
2488 	optlen = sizeof(cc);
2489 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
2490 		       (void *)&cc, (void *)&optlen) < 0)
2491 		cc = errno;
2492 	else
2493 		errno = cc;
2494 
2495 	if (errno != 0) {
2496 		/*
2497 		 * If the error is EAGAIN, just re-select on this
2498 		 * fd and pretend nothing strange happened.
2499 		 */
2500 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
2501 			sock->connecting = 1;
2502 			select_poke(sock->manager, sock->fd,
2503 				    SELECT_POKE_CONNECT);
2504 			return;
2505 		}
2506 
2507 
2508 		/*
2509 		 * Translate other errors into ISC_R_* flavors.
2510 		 */
2511 		switch (errno) {
2512 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
2513 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2514 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2515 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2516 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2517 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2518 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2519 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2520 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2521 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2522 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2523 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
2524 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2525 #undef ERROR_MATCH
2526 		default:
2527 			dev->result = ISC_R_UNEXPECTED;
2528 			isc_sockaddr_format(&sock->peer_address, peerbuf,
2529 					    sizeof(peerbuf));
2530 			UNEXPECTED_ERROR(__FILE__, __LINE__,
2531 					 "internal_connect: connect(%s) %s",
2532 					 peerbuf, strerror(errno));
2533 		}
2534 	} else {
2535 		dev->result = ISC_R_SUCCESS;
2536 		sock->connected = 1;
2537 		sock->bound = 1;
2538 	}
2539 
2540 	sock->connect_ev = NULL;
2541 
2542 	task = dev->ev_sender;
2543 	dev->ev_sender = sock;
2544 	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2545 }
2546 
2547 /*
2548  * Run through the list of events on this socket, and cancel the ones
2549  * queued for task "task" of type "how".  "how" is a bitmask.
2550  */
2551 void
2552 isc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
2553 	isc__socket_t *sock = (isc__socket_t *)sock0;
2554 
2555 	REQUIRE(VALID_SOCKET(sock));
2556 
2557 	/*
2558 	 * Quick exit if there is nothing to do.  Don't even bother locking
2559 	 * in this case.
2560 	 */
2561 	if (how == 0)
2562 		return;
2563 
2564 	/*
2565 	 * All of these do the same thing, more or less.
2566 	 * Each will:
2567 	 *	o If the internal event is marked as "posted" try to
2568 	 *	  remove it from the task's queue.  If this fails, mark it
2569 	 *	  as canceled instead, and let the task clean it up later.
2570 	 *	o For each I/O request for that task of that type, post
2571 	 *	  its done event with status of "ISC_R_CANCELED".
2572 	 *	o Reset any state needed.
2573 	 */
2574 	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
2575 	    && !ISC_LIST_EMPTY(sock->recv_list)) {
2576 		isc_socketevent_t      *dev;
2577 		isc_socketevent_t      *next;
2578 		isc_task_t	       *current_task;
2579 
2580 		dev = ISC_LIST_HEAD(sock->recv_list);
2581 
2582 		while (dev != NULL) {
2583 			current_task = dev->ev_sender;
2584 			next = ISC_LIST_NEXT(dev, ev_link);
2585 
2586 			if ((task == NULL) || (task == current_task)) {
2587 				dev->result = ISC_R_CANCELED;
2588 				send_recvdone_event(sock, &dev);
2589 			}
2590 			dev = next;
2591 		}
2592 	}
2593 
2594 	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
2595 	    && !ISC_LIST_EMPTY(sock->send_list)) {
2596 		isc_socketevent_t      *dev;
2597 		isc_socketevent_t      *next;
2598 		isc_task_t	       *current_task;
2599 
2600 		dev = ISC_LIST_HEAD(sock->send_list);
2601 
2602 		while (dev != NULL) {
2603 			current_task = dev->ev_sender;
2604 			next = ISC_LIST_NEXT(dev, ev_link);
2605 
2606 			if ((task == NULL) || (task == current_task)) {
2607 				dev->result = ISC_R_CANCELED;
2608 				send_senddone_event(sock, &dev);
2609 			}
2610 			dev = next;
2611 		}
2612 	}
2613 
2614 	/*
2615 	 * Connecting is not a list.
2616 	 */
2617 	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
2618 	    && sock->connect_ev != NULL) {
2619 		isc_socket_connev_t    *dev;
2620 		isc_task_t	       *current_task;
2621 
2622 		INSIST(sock->connecting);
2623 		sock->connecting = 0;
2624 
2625 		dev = sock->connect_ev;
2626 		current_task = dev->ev_sender;
2627 
2628 		if ((task == NULL) || (task == current_task)) {
2629 			sock->connect_ev = NULL;
2630 
2631 			dev->result = ISC_R_CANCELED;
2632 			dev->ev_sender = sock;
2633 			isc_task_sendanddetach(&current_task,
2634 					       ISC_EVENT_PTR(&dev));
2635 		}
2636 	}
2637 
2638 }
2639 
2640 /*
2641  * In our assumed scenario, we can simply use a single static object.
2642  * XXX: this is not true if the application uses multiple threads with
2643  *      'multi-context' mode.  Fixing this is a future TODO item.
2644  */
2645 static isc_socketwait_t swait_private;
2646 
2647 int
2648 isc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
2649 			  isc_socketwait_t **swaitp)
2650 {
2651 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2652 	int n;
2653 
2654 	REQUIRE(swaitp != NULL && *swaitp == NULL);
2655 
2656 	if (manager == NULL)
2657 		manager = socketmgr;
2658 	if (manager == NULL)
2659 		return (0);
2660 
2661 	memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize);
2662 	memmove(manager->write_fds_copy, manager->write_fds,
2663 		manager->fd_bufsize);
2664 
2665 	swait_private.readset = manager->read_fds_copy;
2666 	swait_private.writeset = manager->write_fds_copy;
2667 	swait_private.maxfd = manager->maxfd + 1;
2668 
2669 	n = select(swait_private.maxfd, swait_private.readset,
2670 		   swait_private.writeset, NULL, tvp);
2671 
2672 	*swaitp = &swait_private;
2673 	return (n);
2674 }
2675 
2676 isc_result_t
2677 isc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
2678 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2679 
2680 	REQUIRE(swait == &swait_private);
2681 
2682 	if (manager == NULL)
2683 		manager = socketmgr;
2684 	if (manager == NULL)
2685 		return (ISC_R_NOTFOUND);
2686 
2687 	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
2688 	return (ISC_R_SUCCESS);
2689 }
2690 
2691 #include "../socket_api.c"
2692