xref: /openbsd-src/usr.bin/dig/lib/isc/unix/socket.c (revision 5185a7002a54c85c252cec8ac1712898b2c4d364)
1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
9  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
11  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
13  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14  * PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 /*! \file */
18 
19 #include <sys/param.h>
20 #include <sys/types.h>
21 #include <sys/event.h>
22 #include <sys/socket.h>
23 #include <sys/stat.h>
24 #include <sys/time.h>
25 #include <sys/uio.h>
26 #include <sys/un.h>
27 
28 #include <netinet/tcp.h>
29 
30 #include <errno.h>
31 #include <fcntl.h>
32 #include <stddef.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <unistd.h>
36 #include <inttypes.h> /* uintptr_t */
37 
38 #include <isc/buffer.h>
39 #include <isc/bufferlist.h>
40 #include <isc/formatcheck.h>
41 #include <isc/list.h>
42 #include <isc/log.h>
43 #include <isc/msgs.h>
44 #include <isc/net.h>
45 #include <isc/region.h>
46 #include <isc/socket.h>
47 #include <isc/strerror.h>
48 #include <isc/task.h>
49 #include <isc/util.h>
50 
51 #include "errno2result.h"
52 
53 #include "socket_p.h"
54 #include "../task_p.h"
55 
/*
 * Bundle of descriptor sets and counters describing one wait on the
 * managed descriptors.
 *
 * NOTE(review): the code that fills in and consumes this structure lies
 * outside this section; the per-field notes are inferred from the field
 * names and types only -- confirm against the wait/dispatch routines.
 */
struct isc_socketwait {
	fd_set *readset;	/* presumably the descriptors reported readable */
	fd_set *writeset;	/* presumably the descriptors reported writable */
	int nfds;		/* presumably the ready-descriptor count */
	int maxfd;		/* presumably the highest descriptor examined */
};
62 
63 /*
64  * Set by the -T dscp option on the command line. If set to a value
65  * other than -1, we check to make sure DSCP values match it, and
66  * assert if not.
67  */
68 int isc_dscp_check_value = -1;
69 
70 /*%
71  * Size of per-FD lock buckets.
72  */
73 #define FDLOCK_COUNT		1
74 #define FDLOCK_ID(fd)		0
75 
76 /*%
77  * Some systems define the socket length argument as an int, some as size_t,
78  * some as socklen_t.  This is here so it can be easily changed if needed.
79  */
80 
81 /*%
82  * Define what the possible "soft" errors can be.  These are non-fatal returns
83  * of various network related functions, like recv() and so on.
84  *
85  * For some reason, BSDI (and perhaps others) will sometimes return <0
86  * from recv() but will have errno==0.  This is broken, but we have to
87  * work around it here.
88  */
89 #define SOFT_ERROR(e)	((e) == EAGAIN || \
90 			 (e) == EWOULDBLOCK || \
91 			 (e) == EINTR || \
92 			 (e) == 0)
93 
94 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
95 
96 /*!<
97  * DLVL(90)  --  Function entry/exit and other tracing.
98  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
99  * DLVL(60)  --  Socket data send/receive
100  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
101  * DLVL(20)  --  Socket creation/destruction.
102  */
103 #define TRACE_LEVEL		90
104 #define CORRECTNESS_LEVEL	70
105 #define IOEVENT_LEVEL		60
106 #define EVENT_LEVEL		50
107 #define CREATION_LEVEL		20
108 
109 #define TRACE		DLVL(TRACE_LEVEL)
110 #define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
111 #define IOEVENT		DLVL(IOEVENT_LEVEL)
112 #define EVENT		DLVL(EVENT_LEVEL)
113 #define CREATION	DLVL(CREATION_LEVEL)
114 
115 typedef isc_event_t intev_t;
116 
117 #define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
118 #define VALID_SOCKET(s)		ISC_MAGIC_VALID(s, SOCKET_MAGIC)
119 
120 /*!
121  * IPv6 control information.  If the socket is an IPv6 socket we want
122  * to collect the destination address and interface so the client can
123  * set them on outgoing packets.
124  */
125 
126 /*%
127  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
128  * a setsockopt() like interface to request timestamps, and if the OS
129  * doesn't do it for us, call gettimeofday() on every UDP receive?
130  */
131 
132 /*%
133  * The size to raise the receive buffer to (from BIND 8).
134  */
135 #define RCVBUFSIZE (32*1024)
136 
/*%
 * Rather than computing the exact control-message buffer lengths on every
 * call, we use a rule of thumb: the sizes below were taken from x86_64
 * Linux and are multiplied by 2 where the buffer lengths are derived, so
 * everything should fit.  The resulting sizes are not large enough to
 * cause any concern.
 */
143 #define CMSG_SP_IN6PKT 40
144 
145 #define CMSG_SP_TIMESTAMP 32
146 
147 #define CMSG_SP_TCTOS 24
148 
149 #define CMSG_SP_INT 24
150 
151 #define RECVCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS)+1)
152 #define SENDCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS)+1)
153 
154 /*%
155  * The number of times a send operation is repeated if the result is EINTR.
156  */
157 #define NRETRIES 10
158 
159 typedef struct isc__socket isc__socket_t;
160 typedef struct isc__socketmgr isc__socketmgr_t;
161 
162 #define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)
163 
/*
 * Private per-socket state behind the public isc_socket_t handle.
 *
 * A socket whose 'references' count is zero is considered dead (see
 * SOCK_DEAD()).  'fd' is the descriptor handed to recvmsg()/setsockopt(),
 * 'pf' is compared against AF_INET/AF_INET6 when building outgoing
 * control messages, and 'peer_address' is the address copied into events
 * for TCP sockets.
 */
struct isc__socket {
	/* Not locked. */
	isc_socket_t		common;
	isc__socketmgr_t	*manager;
	isc_sockettype_t	type;

	/* Locked by socket lock. */
	ISC_LINK(isc__socket_t)	link;
	unsigned int		references;
	int			fd;
	int			pf;

	/* Queued requests; NOTE(review): queue handling is outside this section. */
	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	isc_socket_connev_t		       *connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
	intev_t			readable_ev;
	intev_t			writable_ev;

	isc_sockaddr_t		peer_address;       /* remote address */

	unsigned int		pending_recv : 1,
				pending_send : 1,
				connected : 1,
				connecting : 1,     /* connect pending */
				bound : 1,          /* bound to local addr */
				active : 1,         /* currently active */
				pktdscp : 1;	    /* per packet dscp */
	/* DSCP value compared against the per-event dscp when sending. */
	unsigned int		dscp;
};
199 
200 #define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
201 #define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
202 
/*
 * Private socket manager state.
 *
 * 'fdstate' is indexed by descriptor number and holds one of CLOSED,
 * MANAGED or CLOSE_PENDING; valid indexes are 0 .. maxsocks - 1 (see the
 * range check in wakeup_socket()).  'read_fds' and 'write_fds' are the
 * descriptor sets updated by watch_fd()/unwatch_fd().
 *
 * NOTE(review): the uses of 'fds', the '*_copy' sets, 'maxfd',
 * 'fd_bufsize' and 'refs' are outside this section.
 */
struct isc__socketmgr {
	/* Not locked. */
	isc_socketmgr_t		common;
	int			fd_bufsize;
	unsigned int		maxsocks;

	isc__socket_t	       **fds;
	int			*fdstate;

	/* Locked by manager lock. */
	ISC_LIST(isc__socket_t)	socklist;
	fd_set			*read_fds;
	fd_set			*read_fds_copy;
	fd_set			*write_fds;
	fd_set			*write_fds_copy;
	int			maxfd;
	unsigned int		refs;
};
221 
222 static isc__socketmgr_t *socketmgr = NULL;
223 
224 #define CLOSED			0	/* this one must be zero */
225 #define MANAGED			1
226 #define CLOSE_PENDING		2
227 
228 /*
229  * send() and recv() iovec counts
230  */
231 #define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
232 #define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
233 
234 static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
235 				  isc_sockettype_t type,
236 				  isc_socket_t **socketp);
237 static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
238 static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
239 static void free_socket(isc__socket_t **);
240 static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
241 				    isc__socket_t **);
242 static void destroy(isc__socket_t **);
243 static void internal_connect(isc_task_t *, isc_event_t *);
244 static void internal_recv(isc_task_t *, isc_event_t *);
245 static void internal_send(isc_task_t *, isc_event_t *);
246 static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
247 static void build_msghdr_send(isc__socket_t *, char *, isc_socketevent_t *,
248 			      struct msghdr *, struct iovec *, size_t *);
249 static void build_msghdr_recv(isc__socket_t *, char *, isc_socketevent_t *,
250 			      struct msghdr *, struct iovec *, size_t *);
251 
252 /*%
253  * The following are intended for internal use (indicated by "isc__"
254  * prefix) but are not declared as static, allowing direct access from
255  * unit tests etc.
256  */
257 
258 isc_result_t
259 isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
260 		   isc_socket_t **socketp);
261 void
262 isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
263 void
264 isc__socket_detach(isc_socket_t **socketp);
265 isc_result_t
266 isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
267 		 unsigned int minimum, isc_task_t *task,
268 		  isc_taskaction_t action, void *arg);
269 isc_result_t
270 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
271 		  isc_task_t *task, isc_taskaction_t action, void *arg);
272 isc_result_t
273 isc__socket_sendtov2(isc_socket_t *sock, isc_bufferlist_t *buflist,
274 		     isc_task_t *task, isc_taskaction_t action, void *arg,
275 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
276 		     unsigned int flags);
277 isc_result_t
278 isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
279 		 unsigned int options);
280 isc_result_t
281 isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
282 		    isc_task_t *task, isc_taskaction_t action,
283 		    void *arg);
284 void
285 isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
286 
287 isc_result_t
288 isc__socketmgr_create(isc_socketmgr_t **managerp);
289 isc_result_t
290 isc__socketmgr_create2(isc_socketmgr_t **managerp,
291 		       unsigned int maxsocks);
292 isc_result_t
293 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp);
294 void
295 isc__socketmgr_destroy(isc_socketmgr_t **managerp);
296 
/*
 * Table of the socket entry points implemented in this file.  The leading
 * isc_socketmethods_t member is initialised positionally with the attach,
 * detach, bind, connect and cancel functions, in that order.
 *
 * NOTE(review): where this table gets attached to a socket object is not
 * visible in this section.
 */
static struct {
	isc_socketmethods_t methods;

	/*%
	 * The following are defined just for avoiding unused static functions.
	 */
	void *recvv, *sendv;
} socketmethods = {
	{
		isc__socket_attach,
		isc__socket_detach,
		isc__socket_bind,
		isc__socket_connect,
		isc__socket_cancel,
	},
	(void *)isc__socket_recvv,
	(void *)isc__socket_sendv,
};
315 
316 static isc_socketmgrmethods_t socketmgrmethods = {
317 	isc__socketmgr_destroy,
318 	isc__socket_create
319 };
320 
321 #define SELECT_POKE_SHUTDOWN		(-1)
322 #define SELECT_POKE_NOTHING		(-2)
323 #define SELECT_POKE_READ		(-3)
324 #define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
325 #define SELECT_POKE_WRITE		(-4)
326 #define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
327 #define SELECT_POKE_CLOSE		(-5)
328 
329 #define SOCK_DEAD(s)			((s)->references == 0)
330 
331 /*%
332  * Shortcut index arrays to get access to statistics counters.
333  */
/*
 * Symbolic names for statistics counter slots; the values are consecutive
 * from 0 to 10.
 *
 * NOTE(review): no use of these identifiers is visible in this section.
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9,
	STATID_ACTIVE = 10
};
347 
348 
349 static void
350 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
351 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
352 	   const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);
353 static void
354 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
355 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
356 	   const char *fmt, ...)
357 {
358 	char msgbuf[2048];
359 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
360 	va_list ap;
361 
362 	if (! isc_log_wouldlog(isc_lctx, level))
363 		return;
364 
365 	va_start(ap, fmt);
366 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
367 	va_end(ap);
368 
369 	if (address == NULL) {
370 		isc_log_write(isc_lctx, category, module, level,
371 			       "socket %p: %s", sock, msgbuf);
372 	} else {
373 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
374 		isc_log_write(isc_lctx, category, module, level,
375 			       "socket %p %s: %s", sock, peerbuf, msgbuf);
376 	}
377 }
378 
379 static inline isc_result_t
380 watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
381 	isc_result_t result = ISC_R_SUCCESS;
382 
383 	if (msg == SELECT_POKE_READ)
384 		FD_SET(fd, manager->read_fds);
385 	if (msg == SELECT_POKE_WRITE)
386 		FD_SET(fd, manager->write_fds);
387 
388 	return (result);
389 }
390 
391 static inline isc_result_t
392 unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
393 	isc_result_t result = ISC_R_SUCCESS;
394 
395 	if (msg == SELECT_POKE_READ)
396 		FD_CLR(fd, manager->read_fds);
397 	else if (msg == SELECT_POKE_WRITE)
398 		FD_CLR(fd, manager->write_fds);
399 
400 	return (result);
401 }
402 
403 static void
404 wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
405 	isc_result_t result;
406 
407 	/*
408 	 * This is a wakeup on a socket.  If the socket is not in the
409 	 * process of being closed, start watching it for either reads
410 	 * or writes.
411 	 */
412 
413 	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
414 
415 	if (msg == SELECT_POKE_CLOSE) {
416 		/* No one should be updating fdstate, so no need to lock it */
417 		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
418 		manager->fdstate[fd] = CLOSED;
419 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
420 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
421 		(void)close(fd);
422 		return;
423 	}
424 
425 	if (manager->fdstate[fd] == CLOSE_PENDING) {
426 
427 		/*
428 		 * We accept (and ignore) any error from unwatch_fd() as we are
429 		 * closing the socket, hoping it doesn't leave dangling state in
430 		 * the kernel.
431 		 */
432 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
433 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
434 		return;
435 	}
436 	if (manager->fdstate[fd] != MANAGED) {
437 		return;
438 	}
439 
440 	/*
441 	 * Set requested bit.
442 	 */
443 	result = watch_fd(manager, fd, msg);
444 	if (result != ISC_R_SUCCESS) {
445 		/*
446 		 * XXXJT: what should we do?  Ignoring the failure of watching
447 		 * a socket will make the application dysfunctional, but there
448 		 * seems to be no reasonable recovery process.
449 		 */
450 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
451 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
452 			      "failed to start watching FD (%d): %s",
453 			      fd, isc_result_totext(result));
454 	}
455 }
456 
457 /*
458  * Update the state of the socketmgr when something changes.
459  */
460 static void
461 select_poke(isc__socketmgr_t *manager, int fd, int msg) {
462 	if (msg == SELECT_POKE_SHUTDOWN)
463 		return;
464 	else if (fd >= 0)
465 		wakeup_socket(manager, fd, msg);
466 	return;
467 }
468 
469 /*
470  * Make a fd non-blocking.
471  */
472 static isc_result_t
473 make_nonblock(int fd) {
474 	int ret;
475 	char strbuf[ISC_STRERRORSIZE];
476 	int flags;
477 
478 	flags = fcntl(fd, F_GETFL, 0);
479 	flags |= O_NONBLOCK;
480 	ret = fcntl(fd, F_SETFL, flags);
481 
482 	if (ret == -1) {
483 		isc__strerror(errno, strbuf, sizeof(strbuf));
484 		UNEXPECTED_ERROR(__FILE__, __LINE__,
485 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
486 				 strbuf);
487 
488 		return (ISC_R_UNEXPECTED);
489 	}
490 
491 	return (ISC_R_SUCCESS);
492 }
493 
494 /*
495  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
496  * In order to ensure as much portability as possible, we provide wrapper
497  * functions of these macros.
498  * Note that cmsg_space() could run slow on OSes that do not have
499  * CMSG_SPACE.
500  */
/* Wrapper returning CMSG_LEN() for 'len' bytes of control message data. */
static inline socklen_t
cmsg_len(socklen_t len) {
	socklen_t total = CMSG_LEN(len);

	return (total);
}
505 
/* Wrapper returning CMSG_SPACE() for 'len' bytes of control message data. */
static inline socklen_t
cmsg_space(socklen_t len) {
	socklen_t total = CMSG_SPACE(len);

	return (total);
}
510 
511 /*
512  * Process control messages received on a socket.
513  */
514 static void
515 process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
516 	struct cmsghdr *cmsgp;
517 	struct in6_pktinfo *pktinfop;
518 	void *timevalp;
519 
520 	/*
521 	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
522 	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
523 	 * They are all here, outside of the CPP tests, because it is
524 	 * more consistent with the usual ISC coding style.
525 	 */
526 	UNUSED(sock);
527 	UNUSED(msg);
528 	UNUSED(dev);
529 
530 	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
531 		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
532 
533 	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
534 		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
535 
536 	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
537 		return;
538 
539 	timevalp = NULL;
540 	pktinfop = NULL;
541 
542 	cmsgp = CMSG_FIRSTHDR(msg);
543 	while (cmsgp != NULL) {
544 		socket_log(sock, NULL, TRACE,
545 			   "processing cmsg %p", cmsgp);
546 
547 		if (cmsgp->cmsg_level == IPPROTO_IPV6
548 		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
549 
550 			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
551 			memmove(&dev->pktinfo, pktinfop,
552 				sizeof(struct in6_pktinfo));
553 			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
554 			socket_log(sock, NULL, TRACE,
555 				   "interface received on ifindex %u",
556 				   dev->pktinfo.ipi6_ifindex);
557 			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
558 				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
559 			goto next;
560 		}
561 
562 		if (cmsgp->cmsg_level == SOL_SOCKET
563 		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
564 			struct timeval tv;
565 			timevalp = CMSG_DATA(cmsgp);
566 			memmove(&tv, timevalp, sizeof(tv));
567 			dev->timestamp.seconds = tv.tv_sec;
568 			dev->timestamp.nanoseconds = tv.tv_usec * 1000;
569 			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
570 			goto next;
571 		}
572 
573 		if (cmsgp->cmsg_level == IPPROTO_IPV6
574 		    && cmsgp->cmsg_type == IPV6_TCLASS) {
575 			dev->dscp = *(int *)CMSG_DATA(cmsgp);
576 			dev->dscp >>= 2;
577 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
578 			goto next;
579 		}
580 
581 		if (cmsgp->cmsg_level == IPPROTO_IP
582 		    && (cmsgp->cmsg_type == IP_TOS)) {
583 			dev->dscp = (int) *(unsigned char *)CMSG_DATA(cmsgp);
584 			dev->dscp >>= 2;
585 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
586 			goto next;
587 		}
588 	next:
589 		cmsgp = CMSG_NXTHDR(msg, cmsgp);
590 	}
591 
592 }
593 
594 /*
595  * Construct an iov array and attach it to the msghdr passed in.  This is
596  * the SEND constructor, which will use the used region of the buffer
597  * (if using a buffer list) or will use the internal region (if a single
598  * buffer I/O is requested).
599  *
600  * Nothing can be NULL, and the done event must list at least one buffer
601  * on the buffer linked list for this function to be meaningful.
602  *
603  * If write_countp != NULL, *write_countp will hold the number of bytes
604  * this transaction can send.
605  */
606 static void
607 build_msghdr_send(isc__socket_t *sock, char* cmsgbuf, isc_socketevent_t *dev,
608 		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
609 {
610 	unsigned int iovcount;
611 	isc_buffer_t *buffer;
612 	isc_region_t used;
613 	size_t write_count;
614 	size_t skip_count;
615 	struct cmsghdr *cmsgp;
616 
617 	memset(msg, 0, sizeof(*msg));
618 
619 	if (!sock->connected) {
620 		msg->msg_name = (void *)&dev->address.type.sa;
621 		msg->msg_namelen = dev->address.length;
622 	} else {
623 		msg->msg_name = NULL;
624 		msg->msg_namelen = 0;
625 	}
626 
627 	buffer = ISC_LIST_HEAD(dev->bufferlist);
628 	write_count = 0;
629 	iovcount = 0;
630 
631 	/*
632 	 * Single buffer I/O?  Skip what we've done so far in this region.
633 	 */
634 	if (buffer == NULL) {
635 		write_count = dev->region.length - dev->n;
636 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
637 		iov[0].iov_len = write_count;
638 		iovcount = 1;
639 
640 		goto config;
641 	}
642 
643 	/*
644 	 * Multibuffer I/O.
645 	 * Skip the data in the buffer list that we have already written.
646 	 */
647 	skip_count = dev->n;
648 	while (buffer != NULL) {
649 		REQUIRE(ISC_BUFFER_VALID(buffer));
650 		if (skip_count < isc_buffer_usedlength(buffer))
651 			break;
652 		skip_count -= isc_buffer_usedlength(buffer);
653 		buffer = ISC_LIST_NEXT(buffer, link);
654 	}
655 
656 	while (buffer != NULL) {
657 		INSIST(iovcount < MAXSCATTERGATHER_SEND);
658 
659 		isc_buffer_usedregion(buffer, &used);
660 
661 		if (used.length > 0) {
662 			iov[iovcount].iov_base = (void *)(used.base
663 							  + skip_count);
664 			iov[iovcount].iov_len = used.length - skip_count;
665 			write_count += (used.length - skip_count);
666 			skip_count = 0;
667 			iovcount++;
668 		}
669 		buffer = ISC_LIST_NEXT(buffer, link);
670 	}
671 
672 	INSIST(skip_count == 0U);
673 
674  config:
675 	msg->msg_iov = iov;
676 	msg->msg_iovlen = iovcount;
677 
678 	msg->msg_control = NULL;
679 	msg->msg_controllen = 0;
680 	msg->msg_flags = 0;
681 
682 	if ((sock->type == isc_sockettype_udp) &&
683 	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
684 	{
685 		struct in6_pktinfo *pktinfop;
686 
687 		socket_log(sock, NULL, TRACE,
688 			   "sendto pktinfo data, ifindex %u",
689 			   dev->pktinfo.ipi6_ifindex);
690 
691 		msg->msg_control = (void *)cmsgbuf;
692 		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
693 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
694 
695 		cmsgp = (struct cmsghdr *)cmsgbuf;
696 		cmsgp->cmsg_level = IPPROTO_IPV6;
697 		cmsgp->cmsg_type = IPV6_PKTINFO;
698 		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
699 		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
700 		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
701 	}
702 
703 	if ((sock->type == isc_sockettype_udp) &&
704 	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
705 	{
706 		int use_min_mtu = 1;	/* -1, 0, 1 */
707 
708 		cmsgp = (struct cmsghdr *)(cmsgbuf +
709 					   msg->msg_controllen);
710 
711 		msg->msg_control = (void *)cmsgbuf;
712 		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
713 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
714 
715 		cmsgp->cmsg_level = IPPROTO_IPV6;
716 		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
717 		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
718 		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
719 	}
720 
721 	if (isc_dscp_check_value > -1) {
722 		if (sock->type == isc_sockettype_udp)
723 			INSIST((int)dev->dscp == isc_dscp_check_value);
724 		else if (sock->type == isc_sockettype_tcp)
725 			INSIST((int)sock->dscp == isc_dscp_check_value);
726 	}
727 
728 	if ((sock->type == isc_sockettype_udp) &&
729 	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
730 	{
731 		int dscp = (dev->dscp << 2) & 0xff;
732 
733 		INSIST(dev->dscp < 0x40);
734 
735 		if (sock->pf == AF_INET && sock->pktdscp) {
736 			cmsgp = (struct cmsghdr *)(cmsgbuf +
737 						   msg->msg_controllen);
738 			msg->msg_control = (void *)cmsgbuf;
739 			msg->msg_controllen += cmsg_space(sizeof(dscp));
740 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
741 
742 			cmsgp->cmsg_level = IPPROTO_IP;
743 			cmsgp->cmsg_type = IP_TOS;
744 			cmsgp->cmsg_len = cmsg_len(sizeof(char));
745 			*(unsigned char*)CMSG_DATA(cmsgp) = dscp;
746 		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
747 			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
748 			       (void *)&dscp, sizeof(int)) < 0)
749 			{
750 				char strbuf[ISC_STRERRORSIZE];
751 				isc__strerror(errno, strbuf, sizeof(strbuf));
752 				UNEXPECTED_ERROR(__FILE__, __LINE__,
753 						 "setsockopt(%d, IP_TOS, %.02x)"
754 						 " %s: %s",
755 						 sock->fd, dscp >> 2,
756 						 "failed", strbuf);
757 			} else
758 				sock->dscp = dscp;
759 		}
760 
761 		if (sock->pf == AF_INET6 && sock->pktdscp) {
762 			cmsgp = (struct cmsghdr *)(cmsgbuf +
763 						   msg->msg_controllen);
764 			msg->msg_control = (void *)cmsgbuf;
765 			msg->msg_controllen += cmsg_space(sizeof(dscp));
766 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
767 
768 			cmsgp->cmsg_level = IPPROTO_IPV6;
769 			cmsgp->cmsg_type = IPV6_TCLASS;
770 			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
771 			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
772 		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
773 			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
774 				       (void *)&dscp, sizeof(int)) < 0) {
775 				char strbuf[ISC_STRERRORSIZE];
776 				isc__strerror(errno, strbuf, sizeof(strbuf));
777 				UNEXPECTED_ERROR(__FILE__, __LINE__,
778 						 "setsockopt(%d, IPV6_TCLASS, "
779 						 "%.02x) %s: %s",
780 						 sock->fd, dscp >> 2,
781 						 "failed", strbuf);
782 			} else
783 				sock->dscp = dscp;
784 		}
785 
786 		if (msg->msg_controllen != 0 &&
787 		    msg->msg_controllen < SENDCMSGBUFLEN)
788 		{
789 			memset(cmsgbuf + msg->msg_controllen, 0,
790 			       SENDCMSGBUFLEN - msg->msg_controllen);
791 		}
792 	}
793 
794 	if (write_countp != NULL)
795 		*write_countp = write_count;
796 }
797 
798 /*
799  * Construct an iov array and attach it to the msghdr passed in.  This is
800  * the RECV constructor, which will use the available region of the buffer
801  * (if using a buffer list) or will use the internal region (if a single
802  * buffer I/O is requested).
803  *
804  * Nothing can be NULL, and the done event must list at least one buffer
805  * on the buffer linked list for this function to be meaningful.
806  *
807  * If read_countp != NULL, *read_countp will hold the number of bytes
808  * this transaction can receive.
809  */
810 static void
811 build_msghdr_recv(isc__socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
812 		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
813 {
814 	unsigned int iovcount;
815 	isc_buffer_t *buffer;
816 	isc_region_t available;
817 	size_t read_count;
818 
819 	memset(msg, 0, sizeof(struct msghdr));
820 
821 	if (sock->type == isc_sockettype_udp) {
822 		memset(&dev->address, 0, sizeof(dev->address));
823 		msg->msg_name = (void *)&dev->address.type.sa;
824 		msg->msg_namelen = sizeof(dev->address.type);
825 	} else { /* TCP */
826 		msg->msg_name = NULL;
827 		msg->msg_namelen = 0;
828 		dev->address = sock->peer_address;
829 	}
830 
831 	buffer = ISC_LIST_HEAD(dev->bufferlist);
832 	read_count = 0;
833 
834 	/*
835 	 * Single buffer I/O?  Skip what we've done so far in this region.
836 	 */
837 	if (buffer == NULL) {
838 		read_count = dev->region.length - dev->n;
839 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
840 		iov[0].iov_len = read_count;
841 		iovcount = 1;
842 
843 		goto config;
844 	}
845 
846 	/*
847 	 * Multibuffer I/O.
848 	 * Skip empty buffers.
849 	 */
850 	while (buffer != NULL) {
851 		REQUIRE(ISC_BUFFER_VALID(buffer));
852 		if (isc_buffer_availablelength(buffer) != 0)
853 			break;
854 		buffer = ISC_LIST_NEXT(buffer, link);
855 	}
856 
857 	iovcount = 0;
858 	while (buffer != NULL) {
859 		INSIST(iovcount < MAXSCATTERGATHER_RECV);
860 
861 		isc_buffer_availableregion(buffer, &available);
862 
863 		if (available.length > 0) {
864 			iov[iovcount].iov_base = (void *)(available.base);
865 			iov[iovcount].iov_len = available.length;
866 			read_count += available.length;
867 			iovcount++;
868 		}
869 		buffer = ISC_LIST_NEXT(buffer, link);
870 	}
871 
872  config:
873 
874 	/*
875 	 * If needed, set up to receive that one extra byte.
876 	 */
877 	msg->msg_iov = iov;
878 	msg->msg_iovlen = iovcount;
879 
880 	msg->msg_control = cmsgbuf;
881 	msg->msg_controllen = RECVCMSGBUFLEN;
882 	msg->msg_flags = 0;
883 
884 	if (read_countp != NULL)
885 		*read_countp = read_count;
886 }
887 
888 static void
889 set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock,
890 		isc_socketevent_t *dev)
891 {
892 	if (sock->type == isc_sockettype_udp) {
893 		if (address != NULL)
894 			dev->address = *address;
895 		else
896 			dev->address = sock->peer_address;
897 	} else if (sock->type == isc_sockettype_tcp) {
898 		INSIST(address == NULL);
899 		dev->address = sock->peer_address;
900 	}
901 }
902 
903 static void
904 destroy_socketevent(isc_event_t *event) {
905 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
906 
907 	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
908 
909 	(ev->destroy)(event);
910 }
911 
912 static isc_socketevent_t *
913 allocate_socketevent(void *sender,
914 		     isc_eventtype_t eventtype, isc_taskaction_t action,
915 		     void *arg)
916 {
917 	isc_socketevent_t *ev;
918 
919 	ev = (isc_socketevent_t *)isc_event_allocate(sender,
920 						     eventtype, action, arg,
921 						     sizeof(*ev));
922 
923 	if (ev == NULL)
924 		return (NULL);
925 
926 	ev->result = ISC_R_UNSET;
927 	ISC_LINK_INIT(ev, ev_link);
928 	ISC_LIST_INIT(ev->bufferlist);
929 	ev->region.base = NULL;
930 	ev->n = 0;
931 	ev->offset = 0;
932 	ev->attributes = 0;
933 	ev->destroy = ev->ev_destroy;
934 	ev->ev_destroy = destroy_socketevent;
935 	ev->dscp = 0;
936 
937 	return (ev);
938 }
939 
940 #define DOIO_SUCCESS		0	/* i/o ok, event sent */
941 #define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
942 #define DOIO_HARD		2	/* i/o error, event sent */
943 #define DOIO_EOF		3	/* EOF, no event sent */
944 
945 static int
946 doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
947 	int cc;
948 	struct iovec iov[MAXSCATTERGATHER_RECV];
949 	size_t read_count;
950 	size_t actual_count;
951 	struct msghdr msghdr;
952 	isc_buffer_t *buffer;
953 	int recv_errno;
954 	char strbuf[ISC_STRERRORSIZE];
955 	char cmsgbuf[RECVCMSGBUFLEN] = {0};
956 
957 	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);
958 
959 	cc = recvmsg(sock->fd, &msghdr, 0);
960 	recv_errno = errno;
961 
962 	if (cc < 0) {
963 		if (SOFT_ERROR(recv_errno))
964 			return (DOIO_SOFT);
965 
966 		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
967 			isc__strerror(recv_errno, strbuf, sizeof(strbuf));
968 			socket_log(sock, NULL, IOEVENT,
969 				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
970 				   sock->fd, cc, recv_errno, strbuf);
971 		}
972 
973 #define SOFT_OR_HARD(_system, _isc) \
974 	if (recv_errno == _system) { \
975 		if (sock->connected) { \
976 			dev->result = _isc; \
977 			return (DOIO_HARD); \
978 		} \
979 		return (DOIO_SOFT); \
980 	}
981 #define ALWAYS_HARD(_system, _isc) \
982 	if (recv_errno == _system) { \
983 		dev->result = _isc; \
984 		return (DOIO_HARD); \
985 	}
986 
987 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
988 		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
989 		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
990 		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
991 		/* HPUX 11.11 can return EADDRNOTAVAIL. */
992 		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
993 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
994 		/* Should never get this one but it was seen. */
995 #ifdef ENOPROTOOPT
996 		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
997 #endif
998 		/*
999 		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
1000 		 * errors.
1001 		 */
1002 #ifdef EPROTO
1003 		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
1004 #endif
1005 		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
1006 
1007 #undef SOFT_OR_HARD
1008 #undef ALWAYS_HARD
1009 
1010 		dev->result = isc__errno2result(recv_errno);
1011 		return (DOIO_HARD);
1012 	}
1013 
1014 	/*
1015 	 * On TCP and UNIX sockets, zero length reads indicate EOF,
1016 	 * while on UDP sockets, zero length reads are perfectly valid,
1017 	 * although strange.
1018 	 */
1019 	switch (sock->type) {
1020 	case isc_sockettype_tcp:
1021 		if (cc == 0)
1022 			return (DOIO_EOF);
1023 		break;
1024 	case isc_sockettype_udp:
1025 		break;
1026 	default:
1027 		INSIST(0);
1028 	}
1029 
1030 	if (sock->type == isc_sockettype_udp) {
1031 		dev->address.length = msghdr.msg_namelen;
1032 		if (isc_sockaddr_getport(&dev->address) == 0) {
1033 			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1034 				socket_log(sock, &dev->address, IOEVENT,
1035 					   "dropping source port zero packet");
1036 			}
1037 			return (DOIO_SOFT);
1038 		}
1039 	}
1040 
1041 	socket_log(sock, &dev->address, IOEVENT,
1042 		   "packet received correctly");
1043 
1044 	/*
1045 	 * Overflow bit detection.  If we received MORE bytes than we should,
1046 	 * this indicates an overflow situation.  Set the flag in the
1047 	 * dev entry and adjust how much we read by one.
1048 	 */
1049 	/*
1050 	 * If there are control messages attached, run through them and pull
1051 	 * out the interesting bits.
1052 	 */
1053 	process_cmsg(sock, &msghdr, dev);
1054 
1055 	/*
1056 	 * update the buffers (if any) and the i/o count
1057 	 */
1058 	dev->n += cc;
1059 	actual_count = cc;
1060 	buffer = ISC_LIST_HEAD(dev->bufferlist);
1061 	while (buffer != NULL && actual_count > 0U) {
1062 		REQUIRE(ISC_BUFFER_VALID(buffer));
1063 		if (isc_buffer_availablelength(buffer) <= actual_count) {
1064 			actual_count -= isc_buffer_availablelength(buffer);
1065 			isc_buffer_add(buffer,
1066 				       isc_buffer_availablelength(buffer));
1067 		} else {
1068 			isc_buffer_add(buffer, actual_count);
1069 			actual_count = 0;
1070 			POST(actual_count);
1071 			break;
1072 		}
1073 		buffer = ISC_LIST_NEXT(buffer, link);
1074 		if (buffer == NULL) {
1075 			INSIST(actual_count == 0U);
1076 		}
1077 	}
1078 
1079 	/*
1080 	 * If we read less than we expected, update counters,
1081 	 * and let the upper layer poke the descriptor.
1082 	 */
1083 	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
1084 		return (DOIO_SOFT);
1085 
1086 	/*
1087 	 * Full reads are posted, or partials if partials are ok.
1088 	 */
1089 	dev->result = ISC_R_SUCCESS;
1090 	return (DOIO_SUCCESS);
1091 }
1092 
1093 /*
1094  * Returns:
1095  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1096  *			ISC_R_SUCCESS.
1097  *
1098  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1099  *			dev->result contains the appropriate error.
1100  *
1101  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1102  *			event was sent.  The operation should be retried.
1103  *
1104  *	No other return values are possible.
1105  */
1106 static int
1107 doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
1108 	int cc;
1109 	struct iovec iov[MAXSCATTERGATHER_SEND];
1110 	size_t write_count;
1111 	struct msghdr msghdr;
1112 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1113 	int attempts = 0;
1114 	int send_errno;
1115 	char strbuf[ISC_STRERRORSIZE];
1116 	char cmsgbuf[SENDCMSGBUFLEN] = {0};
1117 
1118 	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
1119 
1120  resend:
1121 	cc = sendmsg(sock->fd, &msghdr, 0);
1122 	send_errno = errno;
1123 
1124 	/*
1125 	 * Check for error or block condition.
1126 	 */
1127 	if (cc < 0) {
1128 		if (send_errno == EINTR && ++attempts < NRETRIES)
1129 			goto resend;
1130 
1131 		if (SOFT_ERROR(send_errno)) {
1132 			if (errno == EWOULDBLOCK || errno == EAGAIN)
1133 				dev->result = ISC_R_WOULDBLOCK;
1134 			return (DOIO_SOFT);
1135 		}
1136 
1137 #define SOFT_OR_HARD(_system, _isc) \
1138 	if (send_errno == _system) { \
1139 		if (sock->connected) { \
1140 			dev->result = _isc; \
1141 			return (DOIO_HARD); \
1142 		} \
1143 		return (DOIO_SOFT); \
1144 	}
1145 #define ALWAYS_HARD(_system, _isc) \
1146 	if (send_errno == _system) { \
1147 		dev->result = _isc; \
1148 		return (DOIO_HARD); \
1149 	}
1150 
1151 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1152 		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1153 		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1154 		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1155 		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1156 #ifdef EHOSTDOWN
1157 		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1158 #endif
1159 		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1160 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1161 		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1162 		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1163 		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1164 
1165 #undef SOFT_OR_HARD
1166 #undef ALWAYS_HARD
1167 
1168 		/*
1169 		 * The other error types depend on whether or not the
1170 		 * socket is UDP or TCP.  If it is UDP, some errors
1171 		 * that we expect to be fatal under TCP are merely
1172 		 * annoying, and are really soft errors.
1173 		 *
1174 		 * However, these soft errors are still returned as
1175 		 * a status.
1176 		 */
1177 		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1178 		isc__strerror(send_errno, strbuf, sizeof(strbuf));
1179 		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1180 				 addrbuf, strbuf);
1181 		dev->result = isc__errno2result(send_errno);
1182 		return (DOIO_HARD);
1183 	}
1184 
1185 	if (cc == 0) {
1186 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1187 				 "doio_send: send() %s 0", "returned");
1188 	}
1189 
1190 	/*
1191 	 * If we write less than we expected, update counters, poke.
1192 	 */
1193 	dev->n += cc;
1194 	if ((size_t)cc != write_count)
1195 		return (DOIO_SOFT);
1196 
1197 	/*
1198 	 * Exactly what we wanted to write.  We're done with this
1199 	 * entry.  Post its completion event.
1200 	 */
1201 	dev->result = ISC_R_SUCCESS;
1202 	return (DOIO_SUCCESS);
1203 }
1204 
1205 /*
1206  * Kill.
1207  *
1208  * Caller must ensure that the socket is not locked and no external
1209  * references exist.
1210  */
1211 static void
1212 socketclose(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) {
1213 	/*
1214 	 * No one has this socket open, so the watcher doesn't have to be
1215 	 * poked, and the socket doesn't have to be locked.
1216 	 */
1217 	manager->fds[fd] = NULL;
1218 	manager->fdstate[fd] = CLOSE_PENDING;
1219 	select_poke(manager, fd, SELECT_POKE_CLOSE);
1220 
1221 	if (sock->active == 1) {
1222 		sock->active = 0;
1223 	}
1224 
1225 	/*
1226 	 * update manager->maxfd here (XXX: this should be implemented more
1227 	 * efficiently)
1228 	 */
1229 	if (manager->maxfd == fd) {
1230 		int i;
1231 
1232 		manager->maxfd = 0;
1233 		for (i = fd - 1; i >= 0; i--) {
1234 			if (manager->fdstate[i] == MANAGED) {
1235 				manager->maxfd = i;
1236 				break;
1237 			}
1238 		}
1239 	}
1240 
1241 }
1242 
1243 static void
1244 destroy(isc__socket_t **sockp) {
1245 	int fd;
1246 	isc__socket_t *sock = *sockp;
1247 	isc__socketmgr_t *manager = sock->manager;
1248 
1249 	socket_log(sock, NULL, CREATION, "destroying");
1250 
1251 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1252 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1253 	INSIST(sock->connect_ev == NULL);
1254 	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1255 
1256 	if (sock->fd >= 0) {
1257 		fd = sock->fd;
1258 		sock->fd = -1;
1259 		socketclose(manager, sock, fd);
1260 	}
1261 
1262 	ISC_LIST_UNLINK(manager->socklist, sock, link);
1263 
1264 	/* can't unlock manager as its memory context is still used */
1265 	free_socket(sockp);
1266 }
1267 
1268 static isc_result_t
1269 allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
1270 		isc__socket_t **socketp)
1271 {
1272 	isc__socket_t *sock;
1273 
1274 	sock = malloc(sizeof(*sock));
1275 
1276 	if (sock == NULL)
1277 		return (ISC_R_NOMEMORY);
1278 
1279 	sock->common.magic = 0;
1280 	sock->common.impmagic = 0;
1281 	sock->references = 0;
1282 
1283 	sock->manager = manager;
1284 	sock->type = type;
1285 	sock->fd = -1;
1286 	sock->dscp = 0;		/* TOS/TCLASS is zero until set. */
1287 	sock->active = 0;
1288 
1289 	ISC_LINK_INIT(sock, link);
1290 
1291 	/*
1292 	 * Set up list of readers and writers to be initially empty.
1293 	 */
1294 	ISC_LIST_INIT(sock->recv_list);
1295 	ISC_LIST_INIT(sock->send_list);
1296 	sock->connect_ev = NULL;
1297 	sock->pending_recv = 0;
1298 	sock->pending_send = 0;
1299 	sock->connected = 0;
1300 	sock->connecting = 0;
1301 	sock->bound = 0;
1302 	sock->pktdscp = 0;
1303 
1304 	/*
1305 	 * Initialize readable and writable events.
1306 	 */
1307 	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1308 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1309 		       NULL, sock, sock, NULL);
1310 	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1311 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1312 		       NULL, sock, sock, NULL);
1313 
1314 	sock->common.magic = ISCAPI_SOCKET_MAGIC;
1315 	sock->common.impmagic = SOCKET_MAGIC;
1316 	*socketp = sock;
1317 
1318 	return (ISC_R_SUCCESS);
1319 }
1320 
1321 /*
1322  * This event requires that the various lists be empty, that the reference
1323  * count be 1, and that the magic number is valid.  The other socket bits,
1324  * like the lock, must be initialized as well.  The fd associated must be
1325  * marked as closed, by setting it to -1 on close, or this routine will
1326  * also close the socket.
1327  */
1328 static void
1329 free_socket(isc__socket_t **socketp) {
1330 	isc__socket_t *sock = *socketp;
1331 
1332 	INSIST(VALID_SOCKET(sock));
1333 	INSIST(sock->references == 0);
1334 	INSIST(!sock->connecting);
1335 	INSIST(!sock->pending_recv);
1336 	INSIST(!sock->pending_send);
1337 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1338 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1339 	INSIST(!ISC_LINK_LINKED(sock, link));
1340 
1341 	sock->common.magic = 0;
1342 	sock->common.impmagic = 0;
1343 
1344 	free(sock);
1345 
1346 	*socketp = NULL;
1347 }
1348 
1349 static void
1350 use_min_mtu(isc__socket_t *sock) {
1351 	/* use minimum MTU */
1352 	if (sock->pf == AF_INET6) {
1353 		int on = 1;
1354 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
1355 				(void *)&on, sizeof(on));
1356 	}
1357 }
1358 
1359 static void
1360 set_tcp_maxseg(isc__socket_t *sock, int size) {
1361 	if (sock->type == isc_sockettype_tcp)
1362 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
1363 				(void *)&size, sizeof(size));
1364 }
1365 
1366 static isc_result_t
1367 opensocket(isc__socket_t *sock)
1368 {
1369 	isc_result_t result;
1370 	char strbuf[ISC_STRERRORSIZE];
1371 	const char *err = "socket";
1372 	int on = 1;
1373 
1374 	switch (sock->type) {
1375 	case isc_sockettype_udp:
1376 		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
1377 		break;
1378 	case isc_sockettype_tcp:
1379 		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
1380 		break;
1381 	}
1382 
1383 	if (sock->fd < 0) {
1384 		switch (errno) {
1385 		case EMFILE:
1386 		case ENFILE:
1387 			isc__strerror(errno, strbuf, sizeof(strbuf));
1388 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1389 				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1390 				       "%s: %s", err, strbuf);
1391 			/* fallthrough */
1392 		case ENOBUFS:
1393 			return (ISC_R_NORESOURCES);
1394 
1395 		case EPROTONOSUPPORT:
1396 		case EPFNOSUPPORT:
1397 		case EAFNOSUPPORT:
1398 		/*
1399 		 * Linux 2.2 (and maybe others) return EINVAL instead of
1400 		 * EAFNOSUPPORT.
1401 		 */
1402 		case EINVAL:
1403 			return (ISC_R_FAMILYNOSUPPORT);
1404 
1405 		default:
1406 			isc__strerror(errno, strbuf, sizeof(strbuf));
1407 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1408 					 "%s() %s: %s", err, "failed",
1409 					 strbuf);
1410 			return (ISC_R_UNEXPECTED);
1411 		}
1412 	}
1413 
1414 	result = make_nonblock(sock->fd);
1415 	if (result != ISC_R_SUCCESS) {
1416 		(void)close(sock->fd);
1417 		return (result);
1418 	}
1419 
1420 	/*
1421 	 * Use minimum mtu if possible.
1422 	 */
1423 	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
1424 		use_min_mtu(sock);
1425 		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
1426 	}
1427 
1428 	if (sock->type == isc_sockettype_udp) {
1429 
1430 		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1431 			       (void *)&on, sizeof(on)) < 0
1432 		    && errno != ENOPROTOOPT) {
1433 			isc__strerror(errno, strbuf, sizeof(strbuf));
1434 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1435 					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1436 					 sock->fd, "failed", strbuf);
1437 			/* Press on... */
1438 		}
1439 
1440 		/* RFC 3542 */
1441 		if ((sock->pf == AF_INET6)
1442 		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1443 				   (void *)&on, sizeof(on)) < 0)) {
1444 			isc__strerror(errno, strbuf, sizeof(strbuf));
1445 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1446 					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1447 					 "%s: %s", sock->fd, "failed",
1448 					 strbuf);
1449 		}
1450 	}
1451 
1452 	if (sock->active == 0) {
1453 		sock->active = 1;
1454 	}
1455 
1456 	return (ISC_R_SUCCESS);
1457 }
1458 
1459 /*
1460  * Create a 'type' socket managed
1461  * by 'manager'.  Events will be posted to 'task' and when dispatched
1462  * 'action' will be called with 'arg' as the arg value.  The new
1463  * socket is returned in 'socketp'.
1464  */
1465 static isc_result_t
1466 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1467 	      isc_socket_t **socketp)
1468 {
1469 	isc__socket_t *sock = NULL;
1470 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
1471 	isc_result_t result;
1472 	int lockid;
1473 
1474 	REQUIRE(VALID_MANAGER(manager));
1475 	REQUIRE(socketp != NULL && *socketp == NULL);
1476 
1477 	result = allocate_socket(manager, type, &sock);
1478 	if (result != ISC_R_SUCCESS)
1479 		return (result);
1480 
1481 	switch (sock->type) {
1482 	case isc_sockettype_udp:
1483 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
1484 		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
1485 		break;
1486 	case isc_sockettype_tcp:
1487 		break;
1488 	default:
1489 		INSIST(0);
1490 	}
1491 
1492 	sock->pf = pf;
1493 
1494 	result = opensocket(sock);
1495 	if (result != ISC_R_SUCCESS) {
1496 		free_socket(&sock);
1497 		return (result);
1498 	}
1499 
1500 	sock->common.methods = (isc_socketmethods_t *)&socketmethods;
1501 	sock->references = 1;
1502 	*socketp = (isc_socket_t *)sock;
1503 
1504 	/*
1505 	 * Note we don't have to lock the socket like we normally would because
1506 	 * there are no external references to it yet.
1507 	 */
1508 
1509 	lockid = FDLOCK_ID(sock->fd);
1510 	manager->fds[sock->fd] = sock;
1511 	manager->fdstate[sock->fd] = MANAGED;
1512 
1513 	ISC_LIST_APPEND(manager->socklist, sock, link);
1514 	if (manager->maxfd < sock->fd)
1515 		manager->maxfd = sock->fd;
1516 
1517 	socket_log(sock, NULL, CREATION, "created");
1518 
1519 	return (ISC_R_SUCCESS);
1520 }
1521 
1522 /*%
1523  * Create a new 'type' socket managed by 'manager'.  Events
1524  * will be posted to 'task' and when dispatched 'action' will be
1525  * called with 'arg' as the arg value.  The new socket is returned
1526  * in 'socketp'.
1527  */
1528 isc_result_t
1529 isc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1530 		   isc_socket_t **socketp)
1531 {
1532 	return (socket_create(manager0, pf, type, socketp));
1533 }
1534 
1535 /*
1536  * Attach to a socket.  Caller must explicitly detach when it is done.
1537  */
1538 void
1539 isc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
1540 	isc__socket_t *sock = (isc__socket_t *)sock0;
1541 
1542 	REQUIRE(VALID_SOCKET(sock));
1543 	REQUIRE(socketp != NULL && *socketp == NULL);
1544 
1545 	sock->references++;
1546 
1547 	*socketp = (isc_socket_t *)sock;
1548 }
1549 
1550 /*
1551  * Dereference a socket.  If this is the last reference to it, clean things
1552  * up by destroying the socket.
1553  */
1554 void
1555 isc__socket_detach(isc_socket_t **socketp) {
1556 	isc__socket_t *sock;
1557 	isc_boolean_t kill_socket = ISC_FALSE;
1558 
1559 	REQUIRE(socketp != NULL);
1560 	sock = (isc__socket_t *)*socketp;
1561 	REQUIRE(VALID_SOCKET(sock));
1562 
1563 	REQUIRE(sock->references > 0);
1564 	sock->references--;
1565 	if (sock->references == 0)
1566 		kill_socket = ISC_TRUE;
1567 
1568 	if (kill_socket)
1569 		destroy(&sock);
1570 
1571 	*socketp = NULL;
1572 }
1573 
1574 /*
1575  * I/O is possible on a given socket.  Schedule an event to this task that
1576  * will call an internal function to do the I/O.  This will charge the
1577  * task with the I/O operation and let our select loop handler get back
1578  * to doing something real as fast as possible.
1579  *
1580  * The socket and manager must be locked before calling this function.
1581  */
1582 static void
1583 dispatch_recv(isc__socket_t *sock) {
1584 	intev_t *iev;
1585 	isc_socketevent_t *ev;
1586 	isc_task_t *sender;
1587 
1588 	INSIST(!sock->pending_recv);
1589 
1590 	ev = ISC_LIST_HEAD(sock->recv_list);
1591 	if (ev == NULL)
1592 		return;
1593 	socket_log(sock, NULL, EVENT, NULL, 0, 0,
1594 		   "dispatch_recv:  event %p -> task %p",
1595 		   ev, ev->ev_sender);
1596 	sender = ev->ev_sender;
1597 
1598 	sock->pending_recv = 1;
1599 	iev = &sock->readable_ev;
1600 
1601 	sock->references++;
1602 	iev->ev_sender = sock;
1603 	iev->ev_action = internal_recv;
1604 	iev->ev_arg = sock;
1605 
1606 	isc_task_send(sender, (isc_event_t **)&iev);
1607 }
1608 
1609 static void
1610 dispatch_send(isc__socket_t *sock) {
1611 	intev_t *iev;
1612 	isc_socketevent_t *ev;
1613 	isc_task_t *sender;
1614 
1615 	INSIST(!sock->pending_send);
1616 
1617 	ev = ISC_LIST_HEAD(sock->send_list);
1618 	if (ev == NULL)
1619 		return;
1620 	socket_log(sock, NULL, EVENT, NULL, 0, 0,
1621 		   "dispatch_send:  event %p -> task %p",
1622 		   ev, ev->ev_sender);
1623 	sender = ev->ev_sender;
1624 
1625 	sock->pending_send = 1;
1626 	iev = &sock->writable_ev;
1627 
1628 	sock->references++;
1629 	iev->ev_sender = sock;
1630 	iev->ev_action = internal_send;
1631 	iev->ev_arg = sock;
1632 
1633 	isc_task_send(sender, (isc_event_t **)&iev);
1634 }
1635 
1636 static void
1637 dispatch_connect(isc__socket_t *sock) {
1638 	intev_t *iev;
1639 	isc_socket_connev_t *ev;
1640 
1641 	iev = &sock->writable_ev;
1642 
1643 	ev = sock->connect_ev;
1644 	INSIST(ev != NULL); /* XXX */
1645 
1646 	INSIST(sock->connecting);
1647 
1648 	sock->references++;  /* keep socket around for this internal event */
1649 	iev->ev_sender = sock;
1650 	iev->ev_action = internal_connect;
1651 	iev->ev_arg = sock;
1652 
1653 	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1654 }
1655 
1656 /*
1657  * Dequeue an item off the given socket's read queue, set the result code
1658  * in the done event to the one provided, and send it to the task it was
1659  * destined for.
1660  *
1661  * If the event to be sent is on a list, remove it before sending.  If
1662  * asked to, send and detach from the socket as well.
1663  *
1664  * Caller must have the socket locked if the event is attached to the socket.
1665  */
1666 static void
1667 send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
1668 	isc_task_t *task;
1669 
1670 	task = (*dev)->ev_sender;
1671 
1672 	(*dev)->ev_sender = sock;
1673 
1674 	if (ISC_LINK_LINKED(*dev, ev_link))
1675 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1676 
1677 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1678 	    == ISC_SOCKEVENTATTR_ATTACHED)
1679 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1680 	else
1681 		isc_task_send(task, (isc_event_t **)dev);
1682 }
1683 
1684 /*
1685  * See comments for send_recvdone_event() above.
1686  *
1687  * Caller must have the socket locked if the event is attached to the socket.
1688  */
1689 static void
1690 send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
1691 	isc_task_t *task;
1692 
1693 	INSIST(dev != NULL && *dev != NULL);
1694 
1695 	task = (*dev)->ev_sender;
1696 	(*dev)->ev_sender = sock;
1697 
1698 	if (ISC_LINK_LINKED(*dev, ev_link))
1699 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1700 
1701 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1702 	    == ISC_SOCKEVENTATTR_ATTACHED)
1703 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1704 	else
1705 		isc_task_send(task, (isc_event_t **)dev);
1706 }
1707 
1708 static void
1709 internal_recv(isc_task_t *me, isc_event_t *ev) {
1710 	isc_socketevent_t *dev;
1711 	isc__socket_t *sock;
1712 
1713 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
1714 
1715 	sock = ev->ev_sender;
1716 	INSIST(VALID_SOCKET(sock));
1717 
1718 	socket_log(sock, NULL, IOEVENT,
1719 		   "internal_recv: task %p got event %p", me, ev);
1720 
1721 	INSIST(sock->pending_recv == 1);
1722 	sock->pending_recv = 0;
1723 
1724 	INSIST(sock->references > 0);
1725 	sock->references--;  /* the internal event is done with this socket */
1726 	if (sock->references == 0) {
1727 		destroy(&sock);
1728 		return;
1729 	}
1730 
1731 	/*
1732 	 * Try to do as much I/O as possible on this socket.  There are no
1733 	 * limits here, currently.
1734 	 */
1735 	dev = ISC_LIST_HEAD(sock->recv_list);
1736 	while (dev != NULL) {
1737 		switch (doio_recv(sock, dev)) {
1738 		case DOIO_SOFT:
1739 			goto poke;
1740 
1741 		case DOIO_EOF:
1742 			/*
1743 			 * read of 0 means the remote end was closed.
1744 			 * Run through the event queue and dispatch all
1745 			 * the events with an EOF result code.
1746 			 */
1747 			do {
1748 				dev->result = ISC_R_EOF;
1749 				send_recvdone_event(sock, &dev);
1750 				dev = ISC_LIST_HEAD(sock->recv_list);
1751 			} while (dev != NULL);
1752 			goto poke;
1753 
1754 		case DOIO_SUCCESS:
1755 		case DOIO_HARD:
1756 			send_recvdone_event(sock, &dev);
1757 			break;
1758 		}
1759 
1760 		dev = ISC_LIST_HEAD(sock->recv_list);
1761 	}
1762 
1763  poke:
1764 	if (!ISC_LIST_EMPTY(sock->recv_list))
1765 		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1766 }
1767 
1768 static void
1769 internal_send(isc_task_t *me, isc_event_t *ev) {
1770 	isc_socketevent_t *dev;
1771 	isc__socket_t *sock;
1772 
1773 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
1774 
1775 	/*
1776 	 * Find out what socket this is and lock it.
1777 	 */
1778 	sock = (isc__socket_t *)ev->ev_sender;
1779 	INSIST(VALID_SOCKET(sock));
1780 	socket_log(sock, NULL, IOEVENT,
1781 		   "internal_send: task %p got event %p", me, ev);
1782 
1783 	INSIST(sock->pending_send == 1);
1784 	sock->pending_send = 0;
1785 
1786 	INSIST(sock->references > 0);
1787 	sock->references--;  /* the internal event is done with this socket */
1788 	if (sock->references == 0) {
1789 		destroy(&sock);
1790 		return;
1791 	}
1792 
1793 	/*
1794 	 * Try to do as much I/O as possible on this socket.  There are no
1795 	 * limits here, currently.
1796 	 */
1797 	dev = ISC_LIST_HEAD(sock->send_list);
1798 	while (dev != NULL) {
1799 		switch (doio_send(sock, dev)) {
1800 		case DOIO_SOFT:
1801 			goto poke;
1802 
1803 		case DOIO_HARD:
1804 		case DOIO_SUCCESS:
1805 			send_senddone_event(sock, &dev);
1806 			break;
1807 		}
1808 
1809 		dev = ISC_LIST_HEAD(sock->send_list);
1810 	}
1811 
1812  poke:
1813 	if (!ISC_LIST_EMPTY(sock->send_list))
1814 		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
1815 }
1816 
1817 /*
1818  * Process read/writes on each fd here.  Avoid locking
1819  * and unlocking twice if both reads and writes are possible.
1820  */
1821 static void
1822 process_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable,
1823 	   isc_boolean_t writeable)
1824 {
1825 	isc__socket_t *sock;
1826 	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
1827 
1828 	/*
1829 	 * If the socket is going to be closed, don't do more I/O.
1830 	 */
1831 	if (manager->fdstate[fd] == CLOSE_PENDING) {
1832 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1833 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1834 		return;
1835 	}
1836 
1837 	sock = manager->fds[fd];
1838 	if (readable) {
1839 		if (sock == NULL) {
1840 			unwatch_read = ISC_TRUE;
1841 			goto check_write;
1842 		}
1843 		if (!SOCK_DEAD(sock)) {
1844 			dispatch_recv(sock);
1845 		}
1846 		unwatch_read = ISC_TRUE;
1847 	}
1848 check_write:
1849 	if (writeable) {
1850 		if (sock == NULL) {
1851 			unwatch_write = ISC_TRUE;
1852 			goto unlock_fd;
1853 		}
1854 		if (!SOCK_DEAD(sock)) {
1855 			if (sock->connecting)
1856 				dispatch_connect(sock);
1857 			else
1858 				dispatch_send(sock);
1859 		}
1860 		unwatch_write = ISC_TRUE;
1861 	}
1862 
1863  unlock_fd:
1864 	if (unwatch_read)
1865 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1866 	if (unwatch_write)
1867 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1868 
1869 }
1870 
1871 static void
1872 process_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds,
1873 	    fd_set *writefds)
1874 {
1875 	int i;
1876 
1877 	REQUIRE(maxfd <= (int)manager->maxsocks);
1878 
1879 	for (i = 0; i < maxfd; i++) {
1880 		process_fd(manager, i, FD_ISSET(i, readfds),
1881 			   FD_ISSET(i, writefds));
1882 	}
1883 }
1884 
1885 /*
1886  * Create a new socket manager.
1887  */
1888 
1889 static isc_result_t
1890 setup_watcher(isc__socketmgr_t *manager) {
1891 	isc_result_t result;
1892 
1893 	UNUSED(result);
1894 
1895 	manager->fd_bufsize = sizeof(fd_set);
1896 
1897 	manager->read_fds = NULL;
1898 	manager->read_fds_copy = NULL;
1899 	manager->write_fds = NULL;
1900 	manager->write_fds_copy = NULL;
1901 
1902 	manager->read_fds = malloc(manager->fd_bufsize);
1903 	if (manager->read_fds != NULL)
1904 		manager->read_fds_copy = malloc(manager->fd_bufsize);
1905 	if (manager->read_fds_copy != NULL)
1906 		manager->write_fds = malloc(manager->fd_bufsize);
1907 	if (manager->write_fds != NULL) {
1908 		manager->write_fds_copy = malloc(manager->fd_bufsize);
1909 	}
1910 	if (manager->write_fds_copy == NULL) {
1911 		if (manager->write_fds != NULL) {
1912 			free(manager->write_fds);
1913 		}
1914 		if (manager->read_fds_copy != NULL) {
1915 			free(manager->read_fds_copy);
1916 		}
1917 		if (manager->read_fds != NULL) {
1918 			free(manager->read_fds);
1919 		}
1920 		return (ISC_R_NOMEMORY);
1921 	}
1922 	memset(manager->read_fds, 0, manager->fd_bufsize);
1923 	memset(manager->write_fds, 0, manager->fd_bufsize);
1924 
1925 	manager->maxfd = 0;
1926 
1927 	return (ISC_R_SUCCESS);
1928 }
1929 
1930 static void
1931 cleanup_watcher(isc__socketmgr_t *manager) {
1932 
1933 	if (manager->read_fds != NULL)
1934 		free(manager->read_fds);
1935 	if (manager->read_fds_copy != NULL)
1936 		free(manager->read_fds_copy);
1937 	if (manager->write_fds != NULL)
1938 		free(manager->write_fds);
1939 	if (manager->write_fds_copy != NULL)
1940 		free(manager->write_fds_copy);
1941 }
1942 
1943 isc_result_t
1944 isc__socketmgr_create(isc_socketmgr_t **managerp) {
1945 	return (isc__socketmgr_create2(managerp, 0));
1946 }
1947 
1948 isc_result_t
1949 isc__socketmgr_create2(isc_socketmgr_t **managerp,
1950 		       unsigned int maxsocks)
1951 {
1952 	isc__socketmgr_t *manager;
1953 	isc_result_t result;
1954 
1955 	REQUIRE(managerp != NULL && *managerp == NULL);
1956 
1957 	if (socketmgr != NULL) {
1958 		/* Don't allow maxsocks to be updated */
1959 		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
1960 			return (ISC_R_EXISTS);
1961 
1962 		socketmgr->refs++;
1963 		*managerp = (isc_socketmgr_t *)socketmgr;
1964 		return (ISC_R_SUCCESS);
1965 	}
1966 
1967 	if (maxsocks == 0)
1968 		maxsocks = FD_SETSIZE;
1969 
1970 	manager = malloc(sizeof(*manager));
1971 	if (manager == NULL)
1972 		return (ISC_R_NOMEMORY);
1973 
1974 	/* zero-clear so that necessary cleanup on failure will be easy */
1975 	memset(manager, 0, sizeof(*manager));
1976 	manager->maxsocks = maxsocks;
1977 	manager->fds = malloc(manager->maxsocks * sizeof(isc__socket_t *));
1978 	if (manager->fds == NULL) {
1979 		result = ISC_R_NOMEMORY;
1980 		goto free_manager;
1981 	}
1982 	manager->fdstate = malloc(manager->maxsocks * sizeof(int));
1983 	if (manager->fdstate == NULL) {
1984 		result = ISC_R_NOMEMORY;
1985 		goto free_manager;
1986 	}
1987 
1988 	manager->common.methods = &socketmgrmethods;
1989 	manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
1990 	manager->common.impmagic = SOCKET_MANAGER_MAGIC;
1991 	memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
1992 	ISC_LIST_INIT(manager->socklist);
1993 
1994 	manager->refs = 1;
1995 
1996 	/*
1997 	 * Set up initial state for the select loop
1998 	 */
1999 	result = setup_watcher(manager);
2000 	if (result != ISC_R_SUCCESS)
2001 		goto cleanup;
2002 
2003 	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
2004 
2005 	socketmgr = manager;
2006 	*managerp = (isc_socketmgr_t *)manager;
2007 
2008 	return (ISC_R_SUCCESS);
2009 
2010 cleanup:
2011 
2012 free_manager:
2013 	if (manager->fdstate != NULL) {
2014 		free(manager->fdstate);
2015 	}
2016 	if (manager->fds != NULL) {
2017 		free(manager->fds);
2018 	}
2019 	free(manager);
2020 
2021 	return (result);
2022 }
2023 
2024 void
2025 isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
2026 	isc__socketmgr_t *manager;
2027 	int i;
2028 
2029 	/*
2030 	 * Destroy a socket manager.
2031 	 */
2032 
2033 	REQUIRE(managerp != NULL);
2034 	manager = (isc__socketmgr_t *)*managerp;
2035 	REQUIRE(VALID_MANAGER(manager));
2036 
2037 	manager->refs--;
2038 	if (manager->refs > 0) {
2039 		*managerp = NULL;
2040 		return;
2041 	}
2042 	socketmgr = NULL;
2043 
2044 	/*
2045 	 * Wait for all sockets to be destroyed.
2046 	 */
2047 	while (!ISC_LIST_EMPTY(manager->socklist)) {
2048 		isc__taskmgr_dispatch(NULL);
2049 	}
2050 
2051 	/*
2052 	 * Here, poke our select/poll thread.  Do this by closing the write
2053 	 * half of the pipe, which will send EOF to the read half.
2054 	 * This is currently a no-op in the non-threaded case.
2055 	 */
2056 	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
2057 
2058 	/*
2059 	 * Clean up.
2060 	 */
2061 	cleanup_watcher(manager);
2062 
2063 	for (i = 0; i < (int)manager->maxsocks; i++)
2064 		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
2065 			(void)close(i);
2066 
2067 	free(manager->fds);
2068 	free(manager->fdstate);
2069 
2070 	manager->common.magic = 0;
2071 	manager->common.impmagic = 0;
2072 	free(manager);
2073 
2074 	*managerp = NULL;
2075 
2076 	socketmgr = NULL;
2077 }
2078 
2079 static isc_result_t
2080 socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2081 	    unsigned int flags)
2082 {
2083 	int io_state;
2084 	isc_task_t *ntask = NULL;
2085 	isc_result_t result = ISC_R_SUCCESS;
2086 
2087 	dev->ev_sender = task;
2088 
2089 	if (sock->type == isc_sockettype_udp) {
2090 		io_state = doio_recv(sock, dev);
2091 	} else {
2092 		if (ISC_LIST_EMPTY(sock->recv_list))
2093 			io_state = doio_recv(sock, dev);
2094 		else
2095 			io_state = DOIO_SOFT;
2096 	}
2097 
2098 	switch (io_state) {
2099 	case DOIO_SOFT:
2100 		/*
2101 		 * We couldn't read all or part of the request right now, so
2102 		 * queue it.
2103 		 *
2104 		 * Attach to socket and to task
2105 		 */
2106 		isc_task_attach(task, &ntask);
2107 		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2108 
2109 		/*
2110 		 * Enqueue the request.  If the socket was previously not being
2111 		 * watched, poke the watcher to start paying attention to it.
2112 		 */
2113 		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
2114 			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2115 		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2116 
2117 		socket_log(sock, NULL, EVENT, NULL, 0, 0,
2118 			   "socket_recv: event %p -> task %p",
2119 			   dev, ntask);
2120 
2121 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2122 			result = ISC_R_INPROGRESS;
2123 		break;
2124 
2125 	case DOIO_EOF:
2126 		dev->result = ISC_R_EOF;
2127 		/* fallthrough */
2128 
2129 	case DOIO_HARD:
2130 	case DOIO_SUCCESS:
2131 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2132 			send_recvdone_event(sock, &dev);
2133 		break;
2134 	}
2135 
2136 	return (result);
2137 }
2138 
2139 isc_result_t
2140 isc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2141 		  unsigned int minimum, isc_task_t *task,
2142 		  isc_taskaction_t action, void *arg)
2143 {
2144 	isc__socket_t *sock = (isc__socket_t *)sock0;
2145 	isc_socketevent_t *dev;
2146 	isc__socketmgr_t *manager;
2147 	unsigned int iocount;
2148 	isc_buffer_t *buffer;
2149 
2150 	REQUIRE(VALID_SOCKET(sock));
2151 	REQUIRE(buflist != NULL);
2152 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2153 	REQUIRE(task != NULL);
2154 	REQUIRE(action != NULL);
2155 
2156 	manager = sock->manager;
2157 	REQUIRE(VALID_MANAGER(manager));
2158 
2159 	iocount = isc_bufferlist_availablecount(buflist);
2160 	REQUIRE(iocount > 0);
2161 
2162 	INSIST(sock->bound);
2163 
2164 	dev = allocate_socketevent(sock,
2165 				   ISC_SOCKEVENT_RECVDONE, action, arg);
2166 	if (dev == NULL)
2167 		return (ISC_R_NOMEMORY);
2168 
2169 	/*
2170 	 * UDP sockets are always partial read
2171 	 */
2172 	if (sock->type == isc_sockettype_udp)
2173 		dev->minimum = 1;
2174 	else {
2175 		if (minimum == 0)
2176 			dev->minimum = iocount;
2177 		else
2178 			dev->minimum = minimum;
2179 	}
2180 
2181 	/*
2182 	 * Move each buffer from the passed in list to our internal one.
2183 	 */
2184 	buffer = ISC_LIST_HEAD(*buflist);
2185 	while (buffer != NULL) {
2186 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2187 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2188 		buffer = ISC_LIST_HEAD(*buflist);
2189 	}
2190 
2191 	return (socket_recv(sock, dev, task, 0));
2192 }
2193 
2194 static isc_result_t
2195 socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2196 	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2197 	    unsigned int flags)
2198 {
2199 	int io_state;
2200 	isc_task_t *ntask = NULL;
2201 	isc_result_t result = ISC_R_SUCCESS;
2202 
2203 	dev->ev_sender = task;
2204 
2205 	set_dev_address(address, sock, dev);
2206 	if (pktinfo != NULL) {
2207 		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2208 		dev->pktinfo = *pktinfo;
2209 
2210 		if (!isc_sockaddr_issitelocal(&dev->address) &&
2211 		    !isc_sockaddr_islinklocal(&dev->address)) {
2212 			socket_log(sock, NULL, TRACE,
2213 				   "pktinfo structure provided, ifindex %u "
2214 				   "(set to 0)", pktinfo->ipi6_ifindex);
2215 
2216 			/*
2217 			 * Set the pktinfo index to 0 here, to let the
2218 			 * kernel decide what interface it should send on.
2219 			 */
2220 			dev->pktinfo.ipi6_ifindex = 0;
2221 		}
2222 	}
2223 
2224 	if (sock->type == isc_sockettype_udp)
2225 		io_state = doio_send(sock, dev);
2226 	else {
2227 		if (ISC_LIST_EMPTY(sock->send_list))
2228 			io_state = doio_send(sock, dev);
2229 		else
2230 			io_state = DOIO_SOFT;
2231 	}
2232 
2233 	switch (io_state) {
2234 	case DOIO_SOFT:
2235 		/*
2236 		 * We couldn't send all or part of the request right now, so
2237 		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2238 		 */
2239 		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2240 			isc_task_attach(task, &ntask);
2241 			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2242 
2243 			/*
2244 			 * Enqueue the request.  If the socket was previously
2245 			 * not being watched, poke the watcher to start
2246 			 * paying attention to it.
2247 			 */
2248 			if (ISC_LIST_EMPTY(sock->send_list) &&
2249 			    !sock->pending_send)
2250 				select_poke(sock->manager, sock->fd,
2251 					    SELECT_POKE_WRITE);
2252 			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2253 
2254 			socket_log(sock, NULL, EVENT, NULL, 0, 0,
2255 				   "socket_send: event %p -> task %p",
2256 				   dev, ntask);
2257 
2258 			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2259 				result = ISC_R_INPROGRESS;
2260 			break;
2261 		}
2262 
2263 		/* FALLTHROUGH */
2264 
2265 	case DOIO_HARD:
2266 	case DOIO_SUCCESS:
2267 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2268 			send_senddone_event(sock, &dev);
2269 		break;
2270 	}
2271 
2272 	return (result);
2273 }
2274 
2275 isc_result_t
2276 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2277 		  isc_task_t *task, isc_taskaction_t action, void *arg)
2278 {
2279 	return (isc__socket_sendtov2(sock, buflist, task, action, arg, NULL,
2280 				     NULL, 0));
2281 }
2282 
2283 isc_result_t
2284 isc__socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2285 		     isc_task_t *task, isc_taskaction_t action, void *arg,
2286 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2287 		     unsigned int flags)
2288 {
2289 	isc__socket_t *sock = (isc__socket_t *)sock0;
2290 	isc_socketevent_t *dev;
2291 	isc__socketmgr_t *manager;
2292 	unsigned int iocount;
2293 	isc_buffer_t *buffer;
2294 
2295 	REQUIRE(VALID_SOCKET(sock));
2296 	REQUIRE(buflist != NULL);
2297 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2298 	REQUIRE(task != NULL);
2299 	REQUIRE(action != NULL);
2300 
2301 	manager = sock->manager;
2302 	REQUIRE(VALID_MANAGER(manager));
2303 
2304 	iocount = isc_bufferlist_usedcount(buflist);
2305 	REQUIRE(iocount > 0);
2306 
2307 	dev = allocate_socketevent(sock,
2308 				   ISC_SOCKEVENT_SENDDONE, action, arg);
2309 	if (dev == NULL)
2310 		return (ISC_R_NOMEMORY);
2311 
2312 	/*
2313 	 * Move each buffer from the passed in list to our internal one.
2314 	 */
2315 	buffer = ISC_LIST_HEAD(*buflist);
2316 	while (buffer != NULL) {
2317 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2318 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2319 		buffer = ISC_LIST_HEAD(*buflist);
2320 	}
2321 
2322 	return (socket_send(sock, dev, task, address, pktinfo, flags));
2323 }
2324 
2325 isc_result_t
2326 isc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
2327 		 unsigned int options) {
2328 	isc__socket_t *sock = (isc__socket_t *)sock0;
2329 	char strbuf[ISC_STRERRORSIZE];
2330 	int on = 1;
2331 
2332 	REQUIRE(VALID_SOCKET(sock));
2333 
2334 	INSIST(!sock->bound);
2335 
2336 	if (sock->pf != sockaddr->type.sa.sa_family) {
2337 		return (ISC_R_FAMILYMISMATCH);
2338 	}
2339 
2340 	/*
2341 	 * Only set SO_REUSEADDR when we want a specific port.
2342 	 */
2343 	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
2344 	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
2345 	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
2346 		       sizeof(on)) < 0) {
2347 		UNEXPECTED_ERROR(__FILE__, __LINE__,
2348 				 "setsockopt(%d) %s", sock->fd, "failed");
2349 		/* Press on... */
2350 	}
2351 	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
2352 		switch (errno) {
2353 		case EACCES:
2354 			return (ISC_R_NOPERM);
2355 		case EADDRNOTAVAIL:
2356 			return (ISC_R_ADDRNOTAVAIL);
2357 		case EADDRINUSE:
2358 			return (ISC_R_ADDRINUSE);
2359 		case EINVAL:
2360 			return (ISC_R_BOUND);
2361 		default:
2362 			isc__strerror(errno, strbuf, sizeof(strbuf));
2363 			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
2364 					 strbuf);
2365 			return (ISC_R_UNEXPECTED);
2366 		}
2367 	}
2368 
2369 	socket_log(sock, sockaddr, TRACE, "bound");
2370 	sock->bound = 1;
2371 
2372 	return (ISC_R_SUCCESS);
2373 }
2374 
2375 isc_result_t
2376 isc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
2377 		   isc_task_t *task, isc_taskaction_t action, void *arg)
2378 {
2379 	isc__socket_t *sock = (isc__socket_t *)sock0;
2380 	isc_socket_connev_t *dev;
2381 	isc_task_t *ntask = NULL;
2382 	isc__socketmgr_t *manager;
2383 	int cc;
2384 	char strbuf[ISC_STRERRORSIZE];
2385 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
2386 
2387 	REQUIRE(VALID_SOCKET(sock));
2388 	REQUIRE(addr != NULL);
2389 	REQUIRE(task != NULL);
2390 	REQUIRE(action != NULL);
2391 
2392 	manager = sock->manager;
2393 	REQUIRE(VALID_MANAGER(manager));
2394 	REQUIRE(addr != NULL);
2395 
2396 	if (isc_sockaddr_ismulticast(addr))
2397 		return (ISC_R_MULTICAST);
2398 
2399 	REQUIRE(!sock->connecting);
2400 
2401 	dev = (isc_socket_connev_t *)isc_event_allocate(sock,
2402 							ISC_SOCKEVENT_CONNECT,
2403 							action,	arg,
2404 							sizeof(*dev));
2405 	if (dev == NULL) {
2406 		return (ISC_R_NOMEMORY);
2407 	}
2408 	ISC_LINK_INIT(dev, ev_link);
2409 
2410 	/*
2411 	 * Try to do the connect right away, as there can be only one
2412 	 * outstanding, and it might happen to complete.
2413 	 */
2414 	sock->peer_address = *addr;
2415 	cc = connect(sock->fd, &addr->type.sa, addr->length);
2416 	if (cc < 0) {
2417 		/*
2418 		 * HP-UX "fails" to connect a UDP socket and sets errno to
2419 		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
2420 		 * a success and let the user detect it if it's really an error
2421 		 * at the time of sending a packet on the socket.
2422 		 */
2423 		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
2424 			cc = 0;
2425 			goto success;
2426 		}
2427 		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
2428 			goto queue;
2429 
2430 		switch (errno) {
2431 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
2432 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2433 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2434 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2435 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2436 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2437 #ifdef EHOSTDOWN
2438 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2439 #endif
2440 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2441 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2442 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2443 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2444 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2445 #undef ERROR_MATCH
2446 		}
2447 
2448 		sock->connected = 0;
2449 
2450 		isc__strerror(errno, strbuf, sizeof(strbuf));
2451 		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
2452 		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
2453 				 addrbuf, errno, strbuf);
2454 
2455 		isc_event_free(ISC_EVENT_PTR(&dev));
2456 		return (ISC_R_UNEXPECTED);
2457 
2458 	err_exit:
2459 		sock->connected = 0;
2460 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2461 
2462 		return (ISC_R_SUCCESS);
2463 	}
2464 
2465 	/*
2466 	 * If connect completed, fire off the done event.
2467 	 */
2468  success:
2469 	if (cc == 0) {
2470 		sock->connected = 1;
2471 		sock->bound = 1;
2472 		dev->result = ISC_R_SUCCESS;
2473 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2474 
2475 		return (ISC_R_SUCCESS);
2476 	}
2477 
2478  queue:
2479 
2480 	/*
2481 	 * Attach to task.
2482 	 */
2483 	isc_task_attach(task, &ntask);
2484 
2485 	sock->connecting = 1;
2486 
2487 	dev->ev_sender = ntask;
2488 
2489 	/*
2490 	 * Poke watcher here.  We still have the socket locked, so there
2491 	 * is no race condition.  We will keep the lock for such a short
2492 	 * bit of time waking it up now or later won't matter all that much.
2493 	 */
2494 	if (sock->connect_ev == NULL)
2495 		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
2496 
2497 	sock->connect_ev = dev;
2498 
2499 	return (ISC_R_SUCCESS);
2500 }
2501 
2502 /*
2503  * Called when a socket with a pending connect() finishes.
2504  */
2505 static void
2506 internal_connect(isc_task_t *me, isc_event_t *ev) {
2507 	isc__socket_t *sock;
2508 	isc_socket_connev_t *dev;
2509 	isc_task_t *task;
2510 	int cc;
2511 	socklen_t optlen;
2512 	char strbuf[ISC_STRERRORSIZE];
2513 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
2514 
2515 	UNUSED(me);
2516 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2517 
2518 	sock = ev->ev_sender;
2519 	INSIST(VALID_SOCKET(sock));
2520 
2521 	/*
2522 	 * When the internal event was sent the reference count was bumped
2523 	 * to keep the socket around for us.  Decrement the count here.
2524 	 */
2525 	INSIST(sock->references > 0);
2526 	sock->references--;
2527 	if (sock->references == 0) {
2528 		destroy(&sock);
2529 		return;
2530 	}
2531 
2532 	/*
2533 	 * Has this event been canceled?
2534 	 */
2535 	dev = sock->connect_ev;
2536 	if (dev == NULL) {
2537 		INSIST(!sock->connecting);
2538 		return;
2539 	}
2540 
2541 	INSIST(sock->connecting);
2542 	sock->connecting = 0;
2543 
2544 	/*
2545 	 * Get any possible error status here.
2546 	 */
2547 	optlen = sizeof(cc);
2548 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
2549 		       (void *)&cc, (void *)&optlen) < 0)
2550 		cc = errno;
2551 	else
2552 		errno = cc;
2553 
2554 	if (errno != 0) {
2555 		/*
2556 		 * If the error is EAGAIN, just re-select on this
2557 		 * fd and pretend nothing strange happened.
2558 		 */
2559 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
2560 			sock->connecting = 1;
2561 			select_poke(sock->manager, sock->fd,
2562 				    SELECT_POKE_CONNECT);
2563 			return;
2564 		}
2565 
2566 
2567 		/*
2568 		 * Translate other errors into ISC_R_* flavors.
2569 		 */
2570 		switch (errno) {
2571 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
2572 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2573 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2574 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2575 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2576 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2577 #ifdef EHOSTDOWN
2578 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2579 #endif
2580 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2581 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2582 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2583 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2584 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
2585 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2586 #undef ERROR_MATCH
2587 		default:
2588 			dev->result = ISC_R_UNEXPECTED;
2589 			isc_sockaddr_format(&sock->peer_address, peerbuf,
2590 					    sizeof(peerbuf));
2591 			isc__strerror(errno, strbuf, sizeof(strbuf));
2592 			UNEXPECTED_ERROR(__FILE__, __LINE__,
2593 					 "internal_connect: connect(%s) %s",
2594 					 peerbuf, strbuf);
2595 		}
2596 	} else {
2597 		dev->result = ISC_R_SUCCESS;
2598 		sock->connected = 1;
2599 		sock->bound = 1;
2600 	}
2601 
2602 	sock->connect_ev = NULL;
2603 
2604 	task = dev->ev_sender;
2605 	dev->ev_sender = sock;
2606 	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2607 }
2608 
2609 /*
2610  * Run through the list of events on this socket, and cancel the ones
2611  * queued for task "task" of type "how".  "how" is a bitmask.
2612  */
2613 void
2614 isc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
2615 	isc__socket_t *sock = (isc__socket_t *)sock0;
2616 
2617 	REQUIRE(VALID_SOCKET(sock));
2618 
2619 	/*
2620 	 * Quick exit if there is nothing to do.  Don't even bother locking
2621 	 * in this case.
2622 	 */
2623 	if (how == 0)
2624 		return;
2625 
2626 	/*
2627 	 * All of these do the same thing, more or less.
2628 	 * Each will:
2629 	 *	o If the internal event is marked as "posted" try to
2630 	 *	  remove it from the task's queue.  If this fails, mark it
2631 	 *	  as canceled instead, and let the task clean it up later.
2632 	 *	o For each I/O request for that task of that type, post
2633 	 *	  its done event with status of "ISC_R_CANCELED".
2634 	 *	o Reset any state needed.
2635 	 */
2636 	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
2637 	    && !ISC_LIST_EMPTY(sock->recv_list)) {
2638 		isc_socketevent_t      *dev;
2639 		isc_socketevent_t      *next;
2640 		isc_task_t	       *current_task;
2641 
2642 		dev = ISC_LIST_HEAD(sock->recv_list);
2643 
2644 		while (dev != NULL) {
2645 			current_task = dev->ev_sender;
2646 			next = ISC_LIST_NEXT(dev, ev_link);
2647 
2648 			if ((task == NULL) || (task == current_task)) {
2649 				dev->result = ISC_R_CANCELED;
2650 				send_recvdone_event(sock, &dev);
2651 			}
2652 			dev = next;
2653 		}
2654 	}
2655 
2656 	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
2657 	    && !ISC_LIST_EMPTY(sock->send_list)) {
2658 		isc_socketevent_t      *dev;
2659 		isc_socketevent_t      *next;
2660 		isc_task_t	       *current_task;
2661 
2662 		dev = ISC_LIST_HEAD(sock->send_list);
2663 
2664 		while (dev != NULL) {
2665 			current_task = dev->ev_sender;
2666 			next = ISC_LIST_NEXT(dev, ev_link);
2667 
2668 			if ((task == NULL) || (task == current_task)) {
2669 				dev->result = ISC_R_CANCELED;
2670 				send_senddone_event(sock, &dev);
2671 			}
2672 			dev = next;
2673 		}
2674 	}
2675 
2676 	/*
2677 	 * Connecting is not a list.
2678 	 */
2679 	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
2680 	    && sock->connect_ev != NULL) {
2681 		isc_socket_connev_t    *dev;
2682 		isc_task_t	       *current_task;
2683 
2684 		INSIST(sock->connecting);
2685 		sock->connecting = 0;
2686 
2687 		dev = sock->connect_ev;
2688 		current_task = dev->ev_sender;
2689 
2690 		if ((task == NULL) || (task == current_task)) {
2691 			sock->connect_ev = NULL;
2692 
2693 			dev->result = ISC_R_CANCELED;
2694 			dev->ev_sender = sock;
2695 			isc_task_sendanddetach(&current_task,
2696 					       ISC_EVENT_PTR(&dev));
2697 		}
2698 	}
2699 
2700 }
2701 
2702 /*
2703  * In our assumed scenario, we can simply use a single static object.
2704  * XXX: this is not true if the application uses multiple threads with
2705  *      'multi-context' mode.  Fixing this is a future TODO item.
2706  */
2707 static isc_socketwait_t swait_private;
2708 
2709 int
2710 isc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
2711 			  isc_socketwait_t **swaitp)
2712 {
2713 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2714 	int n;
2715 
2716 	REQUIRE(swaitp != NULL && *swaitp == NULL);
2717 
2718 	if (manager == NULL)
2719 		manager = socketmgr;
2720 	if (manager == NULL)
2721 		return (0);
2722 
2723 	memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize);
2724 	memmove(manager->write_fds_copy, manager->write_fds,
2725 		manager->fd_bufsize);
2726 
2727 	swait_private.readset = manager->read_fds_copy;
2728 	swait_private.writeset = manager->write_fds_copy;
2729 	swait_private.maxfd = manager->maxfd + 1;
2730 
2731 	n = select(swait_private.maxfd, swait_private.readset,
2732 		   swait_private.writeset, NULL, tvp);
2733 
2734 	*swaitp = &swait_private;
2735 	return (n);
2736 }
2737 
2738 isc_result_t
2739 isc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
2740 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2741 
2742 	REQUIRE(swait == &swait_private);
2743 
2744 	if (manager == NULL)
2745 		manager = socketmgr;
2746 	if (manager == NULL)
2747 		return (ISC_R_NOTFOUND);
2748 
2749 	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
2750 	return (ISC_R_SUCCESS);
2751 }
2752 
2753 #include "../socket_api.c"
2754