xref: /openbsd-src/usr.bin/dig/lib/isc/unix/socket.c (revision 960186aafd858ccd22145c215a7cef64418baf3b)
1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
9  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
11  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
13  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14  * PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 /*! \file */
18 
19 #include <sys/socket.h>
20 #include <sys/time.h>
21 #include <sys/uio.h>
22 
23 #include <netinet/tcp.h>
24 
25 #include <errno.h>
26 #include <fcntl.h>
27 #include <stddef.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <unistd.h>
31 
32 #include <isc/buffer.h>
33 #include <isc/bufferlist.h>
34 
35 #include <isc/list.h>
36 #include <isc/log.h>
37 #include <isc/net.h>
38 #include <isc/region.h>
39 #include <isc/socket.h>
40 #include <isc/task.h>
41 #include <isc/util.h>
42 
43 #include "errno2result.h"
44 
45 #include "socket_p.h"
46 #include "../task_p.h"
47 
/*
 * Bundle of two descriptor sets and two counters.
 * NOTE(review): presumably this carries the outcome of a select() call
 * (ready sets, number of ready descriptors, highest descriptor) from the
 * wait step to the dispatch step -- the users of this struct are not
 * visible in this part of the file; confirm.
 */
struct isc_socketwait {
	fd_set *readset;
	fd_set *writeset;
	int nfds;
	int maxfd;
};
54 
55 /*
56  * Set by the -T dscp option on the command line. If set to a value
57  * other than -1, we check to make sure DSCP values match it, and
58  * assert if not.
59  */
60 int isc_dscp_check_value = -1;
61 
62 /*%
63  * Some systems define the socket length argument as an int, some as size_t,
64  * some as socklen_t.  This is here so it can be easily changed if needed.
65  */
66 
67 /*%
68  * Define what the possible "soft" errors can be.  These are non-fatal returns
69  * of various network related functions, like recv() and so on.
70  *
71  * For some reason, BSDI (and perhaps others) will sometimes return <0
72  * from recv() but will have errno==0.  This is broken, but we have to
73  * work around it here.
74  */
75 #define SOFT_ERROR(e)	((e) == EAGAIN || \
76 			 (e) == EWOULDBLOCK || \
77 			 (e) == EINTR || \
78 			 (e) == 0)
79 
/*
 * Expands to three comma-separated arguments: log category, log module
 * and a debug level.  It is meant to be dropped into the
 * category/module/level positions of a socket_log() call.
 */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*
 * Debug levels used by this file:
 *
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
#define TRACE_LEVEL		90
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

/* Ready-made category/module/level triples for each of the levels above. */
#define TRACE		DLVL(TRACE_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)
97 
98 typedef isc_event_t intev_t;
99 
100 #define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
101 #define VALID_SOCKET(s)		ISC_MAGIC_VALID(s, SOCKET_MAGIC)
102 
103 /*!
104  * IPv6 control information.  If the socket is an IPv6 socket we want
105  * to collect the destination address and interface so the client can
106  * set them on outgoing packets.
107  */
108 
109 /*%
110  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
111  * a setsockopt() like interface to request timestamps, and if the OS
112  * doesn't do it for us, call gettimeofday() on every UDP receive?
113  */
114 
115 /*%
116  * Instead of calculating the cmsgbuf lengths every time we take
117  * a rule of thumb approach - sizes are taken from x86_64 linux,
118  * multiplied by 2, everything should fit. Those sizes are not
119  * large enough to cause any concern.
120  */
/* Space budgeted for one IPV6_PKTINFO control message. */
#define CMSG_SP_IN6PKT 40

/* Space budgeted for one SCM_TIMESTAMP control message. */
#define CMSG_SP_TIMESTAMP 32

/* Space budgeted for one IP_TOS / IPV6_TCLASS control message. */
#define CMSG_SP_TCTOS 24

/*
 * Space budgeted for one int-valued control message.
 * NOTE(review): on the send side this presumably covers IPV6_USE_MIN_MTU.
 */
#define CMSG_SP_INT 24

/*
 * Sizes of the on-stack control-message buffers used for recvmsg() and
 * sendmsg(): twice the sum of the individual budgets, plus one byte.
 */
#define RECVCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS)+1)
#define SENDCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS)+1)
131 
132 /*%
133  * The number of times a send operation is repeated if the result is EINTR.
134  */
135 #define NRETRIES 10
136 
137 typedef struct isc__socket isc__socket_t;
138 typedef struct isc__socketmgr isc__socketmgr_t;
139 
/*
 * Per-socket state.  "common" is the public isc_socket_t part; the rest
 * is private to this file.
 */
struct isc__socket {
	/* Not locked. */
	isc_socket_t		common;
	isc__socketmgr_t	*manager;
	isc_sockettype_t	type;	/* e.g. isc_sockettype_udp / _tcp */

	/* Locked by socket lock. */
	ISC_LINK(isc__socket_t)	link;
	unsigned int		references;	/* zero means dead, see SOCK_DEAD() */
	int			fd;		/* descriptor used for sendmsg()/recvmsg() */
	int			pf;		/* compared against AF_INET / AF_INET6 */

	/* Queued send/receive requests and the pending connect request. */
	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	isc_socket_connev_t		       *connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
	intev_t			readable_ev;
	intev_t			writable_ev;

	isc_sockaddr_t		peer_address;       /* remote address */

	unsigned int		pending_recv : 1,
				pending_send : 1,
				connected : 1,
				connecting : 1,     /* connect pending */
				bound : 1,          /* bound to local addr */
				active : 1,         /* currently active */
				pktdscp : 1;	    /* per packet dscp */
	/*
	 * DSCP value associated with the socket; the send path compares it
	 * with the per-event dscp to decide whether setsockopt() is needed.
	 */
	unsigned int		dscp;
};
175 
176 #define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
177 #define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
178 
/*
 * Socket manager state.  "common" is the public isc_socketmgr_t part.
 */
struct isc__socketmgr {
	/* Not locked. */
	isc_socketmgr_t		common;
	int			fd_bufsize;
	unsigned int		maxsocks;	/* descriptors must be < maxsocks */

	isc__socket_t	       **fds;
	int			*fdstate;	/* indexed by fd: CLOSED, MANAGED or CLOSE_PENDING */

	/* Locked by manager lock. */
	ISC_LIST(isc__socket_t)	socklist;
	fd_set			*read_fds;	/* descriptors watched for reading */
	fd_set			*read_fds_copy;
	fd_set			*write_fds;	/* descriptors watched for writing */
	fd_set			*write_fds_copy;
	int			maxfd;
	unsigned int		refs;
};
197 
198 static isc__socketmgr_t *socketmgr = NULL;
199 
/*
 * Values stored in isc__socketmgr.fdstate[fd].
 */
#define CLOSED			0	/* this one must be zero */
#define MANAGED			1	/* fd may be added to the watch sets */
#define CLOSE_PENDING		2	/* fd is waiting for SELECT_POKE_CLOSE */
203 
204 /*
205  * send() and recv() iovec counts
206  */
207 #define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
208 #define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
209 
210 static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
211 				  isc_sockettype_t type,
212 				  isc_socket_t **socketp);
213 static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
214 static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
215 static void free_socket(isc__socket_t **);
216 static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
217 				    isc__socket_t **);
218 static void destroy(isc__socket_t **);
219 static void internal_connect(isc_task_t *, isc_event_t *);
220 static void internal_recv(isc_task_t *, isc_event_t *);
221 static void internal_send(isc_task_t *, isc_event_t *);
222 static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
223 static void build_msghdr_send(isc__socket_t *, char *, isc_socketevent_t *,
224 			      struct msghdr *, struct iovec *, size_t *);
225 static void build_msghdr_recv(isc__socket_t *, char *, isc_socketevent_t *,
226 			      struct msghdr *, struct iovec *, size_t *);
227 
228 /*%
229  * The following are intended for internal use (indicated by "isc__"
230  * prefix) but are not declared as static, allowing direct access from
231  * unit tests etc.
232  */
233 
234 isc_result_t
235 isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
236 		   isc_socket_t **socketp);
237 void
238 isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
239 void
240 isc__socket_detach(isc_socket_t **socketp);
241 isc_result_t
242 isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
243 		 unsigned int minimum, isc_task_t *task,
244 		  isc_taskaction_t action, void *arg);
245 isc_result_t
246 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
247 		  isc_task_t *task, isc_taskaction_t action, void *arg);
248 isc_result_t
249 isc__socket_sendtov2(isc_socket_t *sock, isc_bufferlist_t *buflist,
250 		     isc_task_t *task, isc_taskaction_t action, void *arg,
251 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
252 		     unsigned int flags);
253 isc_result_t
254 isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
255 		 unsigned int options);
256 isc_result_t
257 isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
258 		    isc_task_t *task, isc_taskaction_t action,
259 		    void *arg);
260 void
261 isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
262 
263 isc_result_t
264 isc__socketmgr_create(isc_socketmgr_t **managerp);
265 isc_result_t
266 isc__socketmgr_create2(isc_socketmgr_t **managerp,
267 		       unsigned int maxsocks);
268 void
269 isc__socketmgr_destroy(isc_socketmgr_t **managerp);
270 
/*
 * Message codes passed to select_poke() / wakeup_socket() together with
 * a file descriptor.
 */
#define SELECT_POKE_SHUTDOWN		(-1)	/* ignored by select_poke() */
#define SELECT_POKE_READ		(-3)	/* start watching fd for reading */
#define SELECT_POKE_WRITE		(-4)	/* start watching fd for writing */
#define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
#define SELECT_POKE_CLOSE		(-5)	/* stop watching fd and close() it */

/* True when no references to the socket remain. */
#define SOCK_DEAD(s)			((s)->references == 0)
278 
279 /*%
280  * Shortcut index arrays to get access to statistics counters.
281  */
282 enum {
283 	STATID_OPEN = 0,
284 	STATID_OPENFAIL = 1,
285 	STATID_CLOSE = 2,
286 	STATID_BINDFAIL = 3,
287 	STATID_CONNECTFAIL = 4,
288 	STATID_CONNECT = 5,
289 	STATID_ACCEPTFAIL = 6,
290 	STATID_ACCEPT = 7,
291 	STATID_SENDFAIL = 8,
292 	STATID_RECVFAIL = 9,
293 	STATID_ACTIVE = 10
294 };
295 
296 
297 static void
298 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
299 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
300 	   const char *fmt, ...) __attribute__((__format__(__printf__, 6, 7)));
301 static void
302 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
303 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
304 	   const char *fmt, ...)
305 {
306 	char msgbuf[2048];
307 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
308 	va_list ap;
309 
310 	if (! isc_log_wouldlog(isc_lctx, level))
311 		return;
312 
313 	va_start(ap, fmt);
314 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
315 	va_end(ap);
316 
317 	if (address == NULL) {
318 		isc_log_write(isc_lctx, category, module, level,
319 			       "socket %p: %s", sock, msgbuf);
320 	} else {
321 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
322 		isc_log_write(isc_lctx, category, module, level,
323 			       "socket %p %s: %s", sock, peerbuf, msgbuf);
324 	}
325 }
326 
327 static inline isc_result_t
328 watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
329 	isc_result_t result = ISC_R_SUCCESS;
330 
331 	if (msg == SELECT_POKE_READ)
332 		FD_SET(fd, manager->read_fds);
333 	if (msg == SELECT_POKE_WRITE)
334 		FD_SET(fd, manager->write_fds);
335 
336 	return (result);
337 }
338 
339 static inline isc_result_t
340 unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
341 	isc_result_t result = ISC_R_SUCCESS;
342 
343 	if (msg == SELECT_POKE_READ)
344 		FD_CLR(fd, manager->read_fds);
345 	else if (msg == SELECT_POKE_WRITE)
346 		FD_CLR(fd, manager->write_fds);
347 
348 	return (result);
349 }
350 
351 static void
352 wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
353 	isc_result_t result;
354 
355 	/*
356 	 * This is a wakeup on a socket.  If the socket is not in the
357 	 * process of being closed, start watching it for either reads
358 	 * or writes.
359 	 */
360 
361 	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
362 
363 	if (msg == SELECT_POKE_CLOSE) {
364 		/* No one should be updating fdstate, so no need to lock it */
365 		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
366 		manager->fdstate[fd] = CLOSED;
367 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
368 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
369 		(void)close(fd);
370 		return;
371 	}
372 
373 	if (manager->fdstate[fd] == CLOSE_PENDING) {
374 
375 		/*
376 		 * We accept (and ignore) any error from unwatch_fd() as we are
377 		 * closing the socket, hoping it doesn't leave dangling state in
378 		 * the kernel.
379 		 */
380 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
381 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
382 		return;
383 	}
384 	if (manager->fdstate[fd] != MANAGED) {
385 		return;
386 	}
387 
388 	/*
389 	 * Set requested bit.
390 	 */
391 	result = watch_fd(manager, fd, msg);
392 	if (result != ISC_R_SUCCESS) {
393 		/*
394 		 * XXXJT: what should we do?  Ignoring the failure of watching
395 		 * a socket will make the application dysfunctional, but there
396 		 * seems to be no reasonable recovery process.
397 		 */
398 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
399 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
400 			      "failed to start watching FD (%d): %s",
401 			      fd, isc_result_totext(result));
402 	}
403 }
404 
405 /*
406  * Update the state of the socketmgr when something changes.
407  */
408 static void
409 select_poke(isc__socketmgr_t *manager, int fd, int msg) {
410 	if (msg == SELECT_POKE_SHUTDOWN)
411 		return;
412 	else if (fd >= 0)
413 		wakeup_socket(manager, fd, msg);
414 	return;
415 }
416 
417 /*
418  * Make a fd non-blocking.
419  */
420 static isc_result_t
421 make_nonblock(int fd) {
422 	int ret;
423 	int flags;
424 
425 	flags = fcntl(fd, F_GETFL, 0);
426 	flags |= O_NONBLOCK;
427 	ret = fcntl(fd, F_SETFL, flags);
428 
429 	if (ret == -1) {
430 		UNEXPECTED_ERROR(__FILE__, __LINE__,
431 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
432 				 strerror(errno));
433 		return (ISC_R_UNEXPECTED);
434 	}
435 
436 	return (ISC_R_SUCCESS);
437 }
438 
/*
 * Thin wrappers around the CMSG_LEN and CMSG_SPACE macros, so callers
 * get a socklen_t result from an ordinary function call.
 */

/*
 * Value to store in cmsg_len for a control message carrying 'len'
 * bytes of data.
 */
static inline socklen_t
cmsg_len(socklen_t len) {
	socklen_t total = CMSG_LEN(len);

	return (total);
}
450 
/*
 * Number of bytes a control message carrying 'len' bytes of data
 * occupies in a control buffer, as computed by CMSG_SPACE.
 */
static inline socklen_t
cmsg_space(socklen_t len) {
	socklen_t total = CMSG_SPACE(len);

	return (total);
}
455 
456 /*
457  * Process control messages received on a socket.
458  */
459 static void
460 process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
461 	struct cmsghdr *cmsgp;
462 	struct in6_pktinfo *pktinfop;
463 	void *timevalp;
464 
465 	/*
466 	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
467 	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
468 	 * They are all here, outside of the CPP tests, because it is
469 	 * more consistent with the usual ISC coding style.
470 	 */
471 	UNUSED(sock);
472 	UNUSED(msg);
473 	UNUSED(dev);
474 
475 	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
476 		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
477 
478 	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
479 		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
480 
481 	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
482 		return;
483 
484 	timevalp = NULL;
485 	pktinfop = NULL;
486 
487 	cmsgp = CMSG_FIRSTHDR(msg);
488 	while (cmsgp != NULL) {
489 		socket_log(sock, NULL, TRACE,
490 			   "processing cmsg %p", cmsgp);
491 
492 		if (cmsgp->cmsg_level == IPPROTO_IPV6
493 		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
494 
495 			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
496 			memmove(&dev->pktinfo, pktinfop,
497 				sizeof(struct in6_pktinfo));
498 			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
499 			socket_log(sock, NULL, TRACE,
500 				   "interface received on ifindex %u",
501 				   dev->pktinfo.ipi6_ifindex);
502 			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
503 				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
504 			goto next;
505 		}
506 
507 		if (cmsgp->cmsg_level == SOL_SOCKET
508 		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
509 			struct timeval tv;
510 			timevalp = CMSG_DATA(cmsgp);
511 			memmove(&tv, timevalp, sizeof(tv));
512 			TIMEVAL_TO_TIMESPEC(&tv, &dev->timestamp);
513 			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
514 			goto next;
515 		}
516 
517 		if (cmsgp->cmsg_level == IPPROTO_IPV6
518 		    && cmsgp->cmsg_type == IPV6_TCLASS) {
519 			dev->dscp = *(int *)CMSG_DATA(cmsgp);
520 			dev->dscp >>= 2;
521 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
522 			goto next;
523 		}
524 
525 		if (cmsgp->cmsg_level == IPPROTO_IP
526 		    && (cmsgp->cmsg_type == IP_TOS)) {
527 			dev->dscp = (int) *(unsigned char *)CMSG_DATA(cmsgp);
528 			dev->dscp >>= 2;
529 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
530 			goto next;
531 		}
532 	next:
533 		cmsgp = CMSG_NXTHDR(msg, cmsgp);
534 	}
535 
536 }
537 
538 /*
539  * Construct an iov array and attach it to the msghdr passed in.  This is
540  * the SEND constructor, which will use the used region of the buffer
541  * (if using a buffer list) or will use the internal region (if a single
542  * buffer I/O is requested).
543  *
544  * Nothing can be NULL, and the done event must list at least one buffer
545  * on the buffer linked list for this function to be meaningful.
546  *
547  * If write_countp != NULL, *write_countp will hold the number of bytes
548  * this transaction can send.
549  */
550 static void
551 build_msghdr_send(isc__socket_t *sock, char* cmsgbuf, isc_socketevent_t *dev,
552 		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
553 {
554 	unsigned int iovcount;
555 	isc_buffer_t *buffer;
556 	isc_region_t used;
557 	size_t write_count;
558 	size_t skip_count;
559 	struct cmsghdr *cmsgp;
560 
561 	memset(msg, 0, sizeof(*msg));
562 
563 	if (!sock->connected) {
564 		msg->msg_name = (void *)&dev->address.type.sa;
565 		msg->msg_namelen = dev->address.length;
566 	} else {
567 		msg->msg_name = NULL;
568 		msg->msg_namelen = 0;
569 	}
570 
571 	buffer = ISC_LIST_HEAD(dev->bufferlist);
572 	write_count = 0;
573 	iovcount = 0;
574 
575 	/*
576 	 * Single buffer I/O?  Skip what we've done so far in this region.
577 	 */
578 	if (buffer == NULL) {
579 		write_count = dev->region.length - dev->n;
580 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
581 		iov[0].iov_len = write_count;
582 		iovcount = 1;
583 
584 		goto config;
585 	}
586 
587 	/*
588 	 * Multibuffer I/O.
589 	 * Skip the data in the buffer list that we have already written.
590 	 */
591 	skip_count = dev->n;
592 	while (buffer != NULL) {
593 		REQUIRE(ISC_BUFFER_VALID(buffer));
594 		if (skip_count < isc_buffer_usedlength(buffer))
595 			break;
596 		skip_count -= isc_buffer_usedlength(buffer);
597 		buffer = ISC_LIST_NEXT(buffer, link);
598 	}
599 
600 	while (buffer != NULL) {
601 		INSIST(iovcount < MAXSCATTERGATHER_SEND);
602 
603 		isc_buffer_usedregion(buffer, &used);
604 
605 		if (used.length > 0) {
606 			iov[iovcount].iov_base = (void *)(used.base
607 							  + skip_count);
608 			iov[iovcount].iov_len = used.length - skip_count;
609 			write_count += (used.length - skip_count);
610 			skip_count = 0;
611 			iovcount++;
612 		}
613 		buffer = ISC_LIST_NEXT(buffer, link);
614 	}
615 
616 	INSIST(skip_count == 0U);
617 
618  config:
619 	msg->msg_iov = iov;
620 	msg->msg_iovlen = iovcount;
621 
622 	msg->msg_control = NULL;
623 	msg->msg_controllen = 0;
624 	msg->msg_flags = 0;
625 
626 	if ((sock->type == isc_sockettype_udp) &&
627 	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
628 	{
629 		struct in6_pktinfo *pktinfop;
630 
631 		socket_log(sock, NULL, TRACE,
632 			   "sendto pktinfo data, ifindex %u",
633 			   dev->pktinfo.ipi6_ifindex);
634 
635 		msg->msg_control = (void *)cmsgbuf;
636 		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
637 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
638 
639 		cmsgp = (struct cmsghdr *)cmsgbuf;
640 		cmsgp->cmsg_level = IPPROTO_IPV6;
641 		cmsgp->cmsg_type = IPV6_PKTINFO;
642 		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
643 		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
644 		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
645 	}
646 
647 	if ((sock->type == isc_sockettype_udp) &&
648 	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
649 	{
650 		int use_min_mtu = 1;	/* -1, 0, 1 */
651 
652 		cmsgp = (struct cmsghdr *)(cmsgbuf +
653 					   msg->msg_controllen);
654 
655 		msg->msg_control = (void *)cmsgbuf;
656 		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
657 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
658 
659 		cmsgp->cmsg_level = IPPROTO_IPV6;
660 		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
661 		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
662 		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
663 	}
664 
665 	if (isc_dscp_check_value > -1) {
666 		if (sock->type == isc_sockettype_udp)
667 			INSIST((int)dev->dscp == isc_dscp_check_value);
668 		else if (sock->type == isc_sockettype_tcp)
669 			INSIST((int)sock->dscp == isc_dscp_check_value);
670 	}
671 
672 	if ((sock->type == isc_sockettype_udp) &&
673 	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
674 	{
675 		int dscp = (dev->dscp << 2) & 0xff;
676 
677 		INSIST(dev->dscp < 0x40);
678 
679 		if (sock->pf == AF_INET && sock->pktdscp) {
680 			cmsgp = (struct cmsghdr *)(cmsgbuf +
681 						   msg->msg_controllen);
682 			msg->msg_control = (void *)cmsgbuf;
683 			msg->msg_controllen += cmsg_space(sizeof(dscp));
684 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
685 
686 			cmsgp->cmsg_level = IPPROTO_IP;
687 			cmsgp->cmsg_type = IP_TOS;
688 			cmsgp->cmsg_len = cmsg_len(sizeof(char));
689 			*(unsigned char*)CMSG_DATA(cmsgp) = dscp;
690 		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
691 			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
692 			       (void *)&dscp, sizeof(int)) < 0)
693 			{
694 				UNEXPECTED_ERROR(__FILE__, __LINE__,
695 						 "setsockopt(%d, IP_TOS, %.02x)"
696 						 " %s: %s",
697 						 sock->fd, dscp >> 2,
698 						 "failed", strerror(errno));
699 			} else
700 				sock->dscp = dscp;
701 		}
702 
703 		if (sock->pf == AF_INET6 && sock->pktdscp) {
704 			cmsgp = (struct cmsghdr *)(cmsgbuf +
705 						   msg->msg_controllen);
706 			msg->msg_control = (void *)cmsgbuf;
707 			msg->msg_controllen += cmsg_space(sizeof(dscp));
708 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
709 
710 			cmsgp->cmsg_level = IPPROTO_IPV6;
711 			cmsgp->cmsg_type = IPV6_TCLASS;
712 			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
713 			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
714 		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
715 			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
716 				       (void *)&dscp, sizeof(int)) < 0) {
717 				UNEXPECTED_ERROR(__FILE__, __LINE__,
718 						 "setsockopt(%d, IPV6_TCLASS, "
719 						 "%.02x) %s: %s",
720 						 sock->fd, dscp >> 2,
721 						 "failed", strerror(errno));
722 			} else
723 				sock->dscp = dscp;
724 		}
725 
726 		if (msg->msg_controllen != 0 &&
727 		    msg->msg_controllen < SENDCMSGBUFLEN)
728 		{
729 			memset(cmsgbuf + msg->msg_controllen, 0,
730 			       SENDCMSGBUFLEN - msg->msg_controllen);
731 		}
732 	}
733 
734 	if (write_countp != NULL)
735 		*write_countp = write_count;
736 }
737 
738 /*
739  * Construct an iov array and attach it to the msghdr passed in.  This is
740  * the RECV constructor, which will use the available region of the buffer
741  * (if using a buffer list) or will use the internal region (if a single
742  * buffer I/O is requested).
743  *
744  * Nothing can be NULL, and the done event must list at least one buffer
745  * on the buffer linked list for this function to be meaningful.
746  *
747  * If read_countp != NULL, *read_countp will hold the number of bytes
748  * this transaction can receive.
749  */
750 static void
751 build_msghdr_recv(isc__socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
752 		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
753 {
754 	unsigned int iovcount;
755 	isc_buffer_t *buffer;
756 	isc_region_t available;
757 	size_t read_count;
758 
759 	memset(msg, 0, sizeof(struct msghdr));
760 
761 	if (sock->type == isc_sockettype_udp) {
762 		memset(&dev->address, 0, sizeof(dev->address));
763 		msg->msg_name = (void *)&dev->address.type.sa;
764 		msg->msg_namelen = sizeof(dev->address.type);
765 	} else { /* TCP */
766 		msg->msg_name = NULL;
767 		msg->msg_namelen = 0;
768 		dev->address = sock->peer_address;
769 	}
770 
771 	buffer = ISC_LIST_HEAD(dev->bufferlist);
772 	read_count = 0;
773 
774 	/*
775 	 * Single buffer I/O?  Skip what we've done so far in this region.
776 	 */
777 	if (buffer == NULL) {
778 		read_count = dev->region.length - dev->n;
779 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
780 		iov[0].iov_len = read_count;
781 		iovcount = 1;
782 
783 		goto config;
784 	}
785 
786 	/*
787 	 * Multibuffer I/O.
788 	 * Skip empty buffers.
789 	 */
790 	while (buffer != NULL) {
791 		REQUIRE(ISC_BUFFER_VALID(buffer));
792 		if (isc_buffer_availablelength(buffer) != 0)
793 			break;
794 		buffer = ISC_LIST_NEXT(buffer, link);
795 	}
796 
797 	iovcount = 0;
798 	while (buffer != NULL) {
799 		INSIST(iovcount < MAXSCATTERGATHER_RECV);
800 
801 		isc_buffer_availableregion(buffer, &available);
802 
803 		if (available.length > 0) {
804 			iov[iovcount].iov_base = (void *)(available.base);
805 			iov[iovcount].iov_len = available.length;
806 			read_count += available.length;
807 			iovcount++;
808 		}
809 		buffer = ISC_LIST_NEXT(buffer, link);
810 	}
811 
812  config:
813 
814 	/*
815 	 * If needed, set up to receive that one extra byte.
816 	 */
817 	msg->msg_iov = iov;
818 	msg->msg_iovlen = iovcount;
819 
820 	msg->msg_control = cmsgbuf;
821 	msg->msg_controllen = RECVCMSGBUFLEN;
822 	msg->msg_flags = 0;
823 
824 	if (read_countp != NULL)
825 		*read_countp = read_count;
826 }
827 
828 static void
829 set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock,
830 		isc_socketevent_t *dev)
831 {
832 	if (sock->type == isc_sockettype_udp) {
833 		if (address != NULL)
834 			dev->address = *address;
835 		else
836 			dev->address = sock->peer_address;
837 	} else if (sock->type == isc_sockettype_tcp) {
838 		INSIST(address == NULL);
839 		dev->address = sock->peer_address;
840 	}
841 }
842 
843 static void
844 destroy_socketevent(isc_event_t *event) {
845 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
846 
847 	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
848 
849 	(ev->destroy)(event);
850 }
851 
852 static isc_socketevent_t *
853 allocate_socketevent(void *sender,
854 		     isc_eventtype_t eventtype, isc_taskaction_t action,
855 		     void *arg)
856 {
857 	isc_socketevent_t *ev;
858 
859 	ev = (isc_socketevent_t *)isc_event_allocate(sender,
860 						     eventtype, action, arg,
861 						     sizeof(*ev));
862 
863 	if (ev == NULL)
864 		return (NULL);
865 
866 	ev->result = ISC_R_UNSET;
867 	ISC_LINK_INIT(ev, ev_link);
868 	ISC_LIST_INIT(ev->bufferlist);
869 	ev->region.base = NULL;
870 	ev->n = 0;
871 	ev->offset = 0;
872 	ev->attributes = 0;
873 	ev->destroy = ev->ev_destroy;
874 	ev->ev_destroy = destroy_socketevent;
875 	ev->dscp = 0;
876 
877 	return (ev);
878 }
879 
/*
 * Result codes returned by the doio_*() functions.  doio_recv() sets
 * dev->result itself for DOIO_SUCCESS and DOIO_HARD; it returns
 * DOIO_SOFT for soft errors and short reads, and DOIO_EOF for a
 * zero-length read on a TCP socket.
 */
#define DOIO_SUCCESS		0	/* i/o ok, event sent */
#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD		2	/* i/o error, event sent */
#define DOIO_EOF		3	/* EOF, no event sent */
884 
/*
 * Perform one recvmsg() on 'sock' for the receive request 'dev'.
 *
 * Returns:
 *	DOIO_SUCCESS	Data was received and the request is satisfied;
 *			dev->result is ISC_R_SUCCESS.
 *
 *	DOIO_SOFT	Nothing final happened: a soft errno, an ignorable
 *			error on an unconnected socket, a UDP packet with
 *			source port zero, or fewer bytes than dev->minimum
 *			so far.
 *
 *	DOIO_HARD	recvmsg() failed; dev->result holds the error.
 *
 *	DOIO_EOF	Zero-length read on a TCP socket.
 */
static int
doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
	int cc;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	size_t actual_count;
	struct msghdr msghdr;
	isc_buffer_t *buffer;
	int recv_errno;
	char cmsgbuf[RECVCMSGBUFLEN] = {0};

	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);

	cc = recvmsg(sock->fd, &msghdr, 0);
	/* Save errno right away; the logging calls below may change it. */
	recv_errno = errno;

	if (cc < 0) {
		if (SOFT_ERROR(recv_errno))
			return (DOIO_SOFT);

		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
			socket_log(sock, NULL, IOEVENT,
				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, recv_errno,
				   strerror(recv_errno));
		}

/*
 * SOFT_OR_HARD: the errno is a hard error only on a connected socket;
 * otherwise it is treated as soft.  ALWAYS_HARD: always a hard error.
 * Both return from doio_recv() when the errno matches.
 */
#define SOFT_OR_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		if (sock->connected) { \
			dev->result = _isc; \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
#define ALWAYS_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		dev->result = _isc; \
		return (DOIO_HARD); \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
		/* HPUX 11.11 can return EADDRNOTAVAIL. */
		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
		/* Should never get this one but it was seen. */
		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
		/*
		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
		 * errors.
		 */
		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/* Anything else: translate the errno and fail hard. */
		dev->result = isc__errno2result(recv_errno);
		return (DOIO_HARD);
	}

	/*
	 * On TCP and UNIX sockets, zero length reads indicate EOF,
	 * while on UDP sockets, zero length reads are perfectly valid,
	 * although strange.
	 */
	switch (sock->type) {
	case isc_sockettype_tcp:
		if (cc == 0)
			return (DOIO_EOF);
		break;
	case isc_sockettype_udp:
		break;
	default:
		INSIST(0);
	}

	if (sock->type == isc_sockettype_udp) {
		/* recvmsg() updated msg_namelen to the sender address size. */
		dev->address.length = msghdr.msg_namelen;
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
	}

	socket_log(sock, &dev->address, IOEVENT,
		   "packet received correctly");

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.  This also records MSG_TRUNC and
	 * MSG_CTRUNC in dev->attributes.
	 */
	process_cmsg(sock, &msghdr, dev);

	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;
	actual_count = cc;
	buffer = ISC_LIST_HEAD(dev->bufferlist);
	while (buffer != NULL && actual_count > 0U) {
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (isc_buffer_availablelength(buffer) <= actual_count) {
			/* This buffer is filled completely. */
			actual_count -= isc_buffer_availablelength(buffer);
			isc_buffer_add(buffer,
				       isc_buffer_availablelength(buffer));
		} else {
			/* The received data ends inside this buffer. */
			isc_buffer_add(buffer, actual_count);
			actual_count = 0;
			POST(actual_count);
			break;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
		if (buffer == NULL) {
			/* All received bytes must have been accounted for. */
			INSIST(actual_count == 0U);
		}
	}

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
		return (DOIO_SOFT);

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}
1027 
1028 /*
1029  * Returns:
1030  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1031  *			ISC_R_SUCCESS.
1032  *
1033  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1034  *			dev->result contains the appropriate error.
1035  *
1036  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1037  *			event was sent.  The operation should be retried.
1038  *
1039  *	No other return values are possible.
1040  */
1041 static int
1042 doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
1043 	int cc;
1044 	struct iovec iov[MAXSCATTERGATHER_SEND];
1045 	size_t write_count;
1046 	struct msghdr msghdr;
1047 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1048 	int attempts = 0;
1049 	int send_errno;
1050 	char cmsgbuf[SENDCMSGBUFLEN] = {0};
1051 
1052 	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
1053 
1054  resend:
1055 	cc = sendmsg(sock->fd, &msghdr, 0);
1056 	send_errno = errno;
1057 
1058 	/*
1059 	 * Check for error or block condition.
1060 	 */
1061 	if (cc < 0) {
1062 		if (send_errno == EINTR && ++attempts < NRETRIES)
1063 			goto resend;
1064 
1065 		if (SOFT_ERROR(send_errno)) {
1066 			if (errno == EWOULDBLOCK || errno == EAGAIN)
1067 				dev->result = ISC_R_WOULDBLOCK;
1068 			return (DOIO_SOFT);
1069 		}
1070 
1071 #define SOFT_OR_HARD(_system, _isc) \
1072 	if (send_errno == _system) { \
1073 		if (sock->connected) { \
1074 			dev->result = _isc; \
1075 			return (DOIO_HARD); \
1076 		} \
1077 		return (DOIO_SOFT); \
1078 	}
1079 #define ALWAYS_HARD(_system, _isc) \
1080 	if (send_errno == _system) { \
1081 		dev->result = _isc; \
1082 		return (DOIO_HARD); \
1083 	}
1084 
1085 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1086 		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1087 		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1088 		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1089 		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1090 		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1091 		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1092 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1093 		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1094 		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1095 		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1096 
1097 #undef SOFT_OR_HARD
1098 #undef ALWAYS_HARD
1099 
1100 		/*
1101 		 * The other error types depend on whether or not the
1102 		 * socket is UDP or TCP.  If it is UDP, some errors
1103 		 * that we expect to be fatal under TCP are merely
1104 		 * annoying, and are really soft errors.
1105 		 *
1106 		 * However, these soft errors are still returned as
1107 		 * a status.
1108 		 */
1109 		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1110 		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1111 				 addrbuf, strerror(send_errno));
1112 		dev->result = isc__errno2result(send_errno);
1113 		return (DOIO_HARD);
1114 	}
1115 
1116 	if (cc == 0) {
1117 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1118 				 "doio_send: send() %s 0", "returned");
1119 	}
1120 
1121 	/*
1122 	 * If we write less than we expected, update counters, poke.
1123 	 */
1124 	dev->n += cc;
1125 	if ((size_t)cc != write_count)
1126 		return (DOIO_SOFT);
1127 
1128 	/*
1129 	 * Exactly what we wanted to write.  We're done with this
1130 	 * entry.  Post its completion event.
1131 	 */
1132 	dev->result = ISC_R_SUCCESS;
1133 	return (DOIO_SUCCESS);
1134 }
1135 
1136 /*
1137  * Kill.
1138  *
1139  * Caller must ensure that the socket is not locked and no external
1140  * references exist.
1141  */
1142 static void
1143 socketclose(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) {
1144 	/*
1145 	 * No one has this socket open, so the watcher doesn't have to be
1146 	 * poked, and the socket doesn't have to be locked.
1147 	 */
1148 	manager->fds[fd] = NULL;
1149 	manager->fdstate[fd] = CLOSE_PENDING;
1150 	select_poke(manager, fd, SELECT_POKE_CLOSE);
1151 
1152 	if (sock->active == 1) {
1153 		sock->active = 0;
1154 	}
1155 
1156 	/*
1157 	 * update manager->maxfd here (XXX: this should be implemented more
1158 	 * efficiently)
1159 	 */
1160 	if (manager->maxfd == fd) {
1161 		int i;
1162 
1163 		manager->maxfd = 0;
1164 		for (i = fd - 1; i >= 0; i--) {
1165 			if (manager->fdstate[i] == MANAGED) {
1166 				manager->maxfd = i;
1167 				break;
1168 			}
1169 		}
1170 	}
1171 
1172 }
1173 
1174 static void
1175 destroy(isc__socket_t **sockp) {
1176 	int fd;
1177 	isc__socket_t *sock = *sockp;
1178 	isc__socketmgr_t *manager = sock->manager;
1179 
1180 	socket_log(sock, NULL, CREATION, "destroying");
1181 
1182 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1183 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1184 	INSIST(sock->connect_ev == NULL);
1185 	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1186 
1187 	if (sock->fd >= 0) {
1188 		fd = sock->fd;
1189 		sock->fd = -1;
1190 		socketclose(manager, sock, fd);
1191 	}
1192 
1193 	ISC_LIST_UNLINK(manager->socklist, sock, link);
1194 
1195 	/* can't unlock manager as its memory context is still used */
1196 	free_socket(sockp);
1197 }
1198 
1199 static isc_result_t
1200 allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
1201 		isc__socket_t **socketp)
1202 {
1203 	isc__socket_t *sock;
1204 
1205 	sock = malloc(sizeof(*sock));
1206 
1207 	if (sock == NULL)
1208 		return (ISC_R_NOMEMORY);
1209 
1210 	sock->common.magic = 0;
1211 	sock->common.impmagic = 0;
1212 	sock->references = 0;
1213 
1214 	sock->manager = manager;
1215 	sock->type = type;
1216 	sock->fd = -1;
1217 	sock->dscp = 0;		/* TOS/TCLASS is zero until set. */
1218 	sock->active = 0;
1219 
1220 	ISC_LINK_INIT(sock, link);
1221 
1222 	/*
1223 	 * Set up list of readers and writers to be initially empty.
1224 	 */
1225 	ISC_LIST_INIT(sock->recv_list);
1226 	ISC_LIST_INIT(sock->send_list);
1227 	sock->connect_ev = NULL;
1228 	sock->pending_recv = 0;
1229 	sock->pending_send = 0;
1230 	sock->connected = 0;
1231 	sock->connecting = 0;
1232 	sock->bound = 0;
1233 	sock->pktdscp = 0;
1234 
1235 	/*
1236 	 * Initialize readable and writable events.
1237 	 */
1238 	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1239 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1240 		       NULL, sock, sock, NULL);
1241 	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1242 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1243 		       NULL, sock, sock, NULL);
1244 
1245 	sock->common.magic = ISCAPI_SOCKET_MAGIC;
1246 	sock->common.impmagic = SOCKET_MAGIC;
1247 	*socketp = sock;
1248 
1249 	return (ISC_R_SUCCESS);
1250 }
1251 
1252 /*
1253  * This event requires that the various lists be empty, that the reference
1254  * count be 1, and that the magic number is valid.  The other socket bits,
1255  * like the lock, must be initialized as well.  The fd associated must be
1256  * marked as closed, by setting it to -1 on close, or this routine will
1257  * also close the socket.
1258  */
1259 static void
1260 free_socket(isc__socket_t **socketp) {
1261 	isc__socket_t *sock = *socketp;
1262 
1263 	INSIST(VALID_SOCKET(sock));
1264 	INSIST(sock->references == 0);
1265 	INSIST(!sock->connecting);
1266 	INSIST(!sock->pending_recv);
1267 	INSIST(!sock->pending_send);
1268 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1269 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1270 	INSIST(!ISC_LINK_LINKED(sock, link));
1271 
1272 	sock->common.magic = 0;
1273 	sock->common.impmagic = 0;
1274 
1275 	free(sock);
1276 
1277 	*socketp = NULL;
1278 }
1279 
1280 static void
1281 use_min_mtu(isc__socket_t *sock) {
1282 	/* use minimum MTU */
1283 	if (sock->pf == AF_INET6) {
1284 		int on = 1;
1285 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
1286 				(void *)&on, sizeof(on));
1287 	}
1288 }
1289 
1290 static void
1291 set_tcp_maxseg(isc__socket_t *sock, int size) {
1292 	if (sock->type == isc_sockettype_tcp)
1293 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
1294 				(void *)&size, sizeof(size));
1295 }
1296 
1297 static isc_result_t
1298 opensocket(isc__socket_t *sock)
1299 {
1300 	isc_result_t result;
1301 	const char *err = "socket";
1302 	int on = 1;
1303 
1304 	switch (sock->type) {
1305 	case isc_sockettype_udp:
1306 		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
1307 		break;
1308 	case isc_sockettype_tcp:
1309 		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
1310 		break;
1311 	}
1312 
1313 	if (sock->fd < 0) {
1314 		switch (errno) {
1315 		case EMFILE:
1316 		case ENFILE:
1317 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1318 				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1319 				       "%s: %s", err, strerror(errno));
1320 			/* fallthrough */
1321 		case ENOBUFS:
1322 			return (ISC_R_NORESOURCES);
1323 
1324 		case EPROTONOSUPPORT:
1325 		case EPFNOSUPPORT:
1326 		case EAFNOSUPPORT:
1327 		/*
1328 		 * Linux 2.2 (and maybe others) return EINVAL instead of
1329 		 * EAFNOSUPPORT.
1330 		 */
1331 		case EINVAL:
1332 			return (ISC_R_FAMILYNOSUPPORT);
1333 
1334 		default:
1335 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1336 					 "%s() %s: %s", err, "failed",
1337 					 strerror(errno));
1338 			return (ISC_R_UNEXPECTED);
1339 		}
1340 	}
1341 
1342 	result = make_nonblock(sock->fd);
1343 	if (result != ISC_R_SUCCESS) {
1344 		(void)close(sock->fd);
1345 		return (result);
1346 	}
1347 
1348 	/*
1349 	 * Use minimum mtu if possible.
1350 	 */
1351 	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
1352 		use_min_mtu(sock);
1353 		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
1354 	}
1355 
1356 	if (sock->type == isc_sockettype_udp) {
1357 
1358 		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1359 			       (void *)&on, sizeof(on)) < 0
1360 		    && errno != ENOPROTOOPT) {
1361 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1362 					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1363 					 sock->fd, "failed", strerror(errno));
1364 			/* Press on... */
1365 		}
1366 
1367 		/* RFC 3542 */
1368 		if ((sock->pf == AF_INET6)
1369 		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1370 				   (void *)&on, sizeof(on)) < 0)) {
1371 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1372 					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1373 					 "%s: %s", sock->fd, "failed",
1374 					 strerror(errno));
1375 		}
1376 	}
1377 
1378 	if (sock->active == 0) {
1379 		sock->active = 1;
1380 	}
1381 
1382 	return (ISC_R_SUCCESS);
1383 }
1384 
1385 /*
1386  * Create a 'type' socket managed
1387  * by 'manager'.  Events will be posted to 'task' and when dispatched
1388  * 'action' will be called with 'arg' as the arg value.  The new
1389  * socket is returned in 'socketp'.
1390  */
1391 static isc_result_t
1392 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1393 	      isc_socket_t **socketp)
1394 {
1395 	isc__socket_t *sock = NULL;
1396 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
1397 	isc_result_t result;
1398 
1399 	REQUIRE(VALID_MANAGER(manager));
1400 	REQUIRE(socketp != NULL && *socketp == NULL);
1401 
1402 	result = allocate_socket(manager, type, &sock);
1403 	if (result != ISC_R_SUCCESS)
1404 		return (result);
1405 
1406 	switch (sock->type) {
1407 	case isc_sockettype_udp:
1408 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
1409 		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
1410 		break;
1411 	case isc_sockettype_tcp:
1412 		break;
1413 	default:
1414 		INSIST(0);
1415 	}
1416 
1417 	sock->pf = pf;
1418 
1419 	result = opensocket(sock);
1420 	if (result != ISC_R_SUCCESS) {
1421 		free_socket(&sock);
1422 		return (result);
1423 	}
1424 
1425 	sock->references = 1;
1426 	*socketp = (isc_socket_t *)sock;
1427 
1428 	/*
1429 	 * Note we don't have to lock the socket like we normally would because
1430 	 * there are no external references to it yet.
1431 	 */
1432 
1433 	manager->fds[sock->fd] = sock;
1434 	manager->fdstate[sock->fd] = MANAGED;
1435 
1436 	ISC_LIST_APPEND(manager->socklist, sock, link);
1437 	if (manager->maxfd < sock->fd)
1438 		manager->maxfd = sock->fd;
1439 
1440 	socket_log(sock, NULL, CREATION, "created");
1441 
1442 	return (ISC_R_SUCCESS);
1443 }
1444 
1445 /*%
1446  * Create a new 'type' socket managed by 'manager'.  Events
1447  * will be posted to 'task' and when dispatched 'action' will be
1448  * called with 'arg' as the arg value.  The new socket is returned
1449  * in 'socketp'.
1450  */
1451 isc_result_t
1452 isc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1453 		   isc_socket_t **socketp)
1454 {
1455 	return (socket_create(manager0, pf, type, socketp));
1456 }
1457 
1458 /*
1459  * Attach to a socket.  Caller must explicitly detach when it is done.
1460  */
1461 void
1462 isc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
1463 	isc__socket_t *sock = (isc__socket_t *)sock0;
1464 
1465 	REQUIRE(VALID_SOCKET(sock));
1466 	REQUIRE(socketp != NULL && *socketp == NULL);
1467 
1468 	sock->references++;
1469 
1470 	*socketp = (isc_socket_t *)sock;
1471 }
1472 
1473 /*
1474  * Dereference a socket.  If this is the last reference to it, clean things
1475  * up by destroying the socket.
1476  */
1477 void
1478 isc__socket_detach(isc_socket_t **socketp) {
1479 	isc__socket_t *sock;
1480 	isc_boolean_t kill_socket = ISC_FALSE;
1481 
1482 	REQUIRE(socketp != NULL);
1483 	sock = (isc__socket_t *)*socketp;
1484 	REQUIRE(VALID_SOCKET(sock));
1485 
1486 	REQUIRE(sock->references > 0);
1487 	sock->references--;
1488 	if (sock->references == 0)
1489 		kill_socket = ISC_TRUE;
1490 
1491 	if (kill_socket)
1492 		destroy(&sock);
1493 
1494 	*socketp = NULL;
1495 }
1496 
1497 /*
1498  * I/O is possible on a given socket.  Schedule an event to this task that
1499  * will call an internal function to do the I/O.  This will charge the
1500  * task with the I/O operation and let our select loop handler get back
1501  * to doing something real as fast as possible.
1502  *
1503  * The socket and manager must be locked before calling this function.
1504  */
1505 static void
1506 dispatch_recv(isc__socket_t *sock) {
1507 	intev_t *iev;
1508 	isc_socketevent_t *ev;
1509 	isc_task_t *sender;
1510 
1511 	INSIST(!sock->pending_recv);
1512 
1513 	ev = ISC_LIST_HEAD(sock->recv_list);
1514 	if (ev == NULL)
1515 		return;
1516 	socket_log(sock, NULL, EVENT, NULL, 0, 0,
1517 		   "dispatch_recv:  event %p -> task %p",
1518 		   ev, ev->ev_sender);
1519 	sender = ev->ev_sender;
1520 
1521 	sock->pending_recv = 1;
1522 	iev = &sock->readable_ev;
1523 
1524 	sock->references++;
1525 	iev->ev_sender = sock;
1526 	iev->ev_action = internal_recv;
1527 	iev->ev_arg = sock;
1528 
1529 	isc_task_send(sender, (isc_event_t **)&iev);
1530 }
1531 
1532 static void
1533 dispatch_send(isc__socket_t *sock) {
1534 	intev_t *iev;
1535 	isc_socketevent_t *ev;
1536 	isc_task_t *sender;
1537 
1538 	INSIST(!sock->pending_send);
1539 
1540 	ev = ISC_LIST_HEAD(sock->send_list);
1541 	if (ev == NULL)
1542 		return;
1543 	socket_log(sock, NULL, EVENT, NULL, 0, 0,
1544 		   "dispatch_send:  event %p -> task %p",
1545 		   ev, ev->ev_sender);
1546 	sender = ev->ev_sender;
1547 
1548 	sock->pending_send = 1;
1549 	iev = &sock->writable_ev;
1550 
1551 	sock->references++;
1552 	iev->ev_sender = sock;
1553 	iev->ev_action = internal_send;
1554 	iev->ev_arg = sock;
1555 
1556 	isc_task_send(sender, (isc_event_t **)&iev);
1557 }
1558 
1559 static void
1560 dispatch_connect(isc__socket_t *sock) {
1561 	intev_t *iev;
1562 	isc_socket_connev_t *ev;
1563 
1564 	iev = &sock->writable_ev;
1565 
1566 	ev = sock->connect_ev;
1567 	INSIST(ev != NULL); /* XXX */
1568 
1569 	INSIST(sock->connecting);
1570 
1571 	sock->references++;  /* keep socket around for this internal event */
1572 	iev->ev_sender = sock;
1573 	iev->ev_action = internal_connect;
1574 	iev->ev_arg = sock;
1575 
1576 	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1577 }
1578 
1579 /*
1580  * Dequeue an item off the given socket's read queue, set the result code
1581  * in the done event to the one provided, and send it to the task it was
1582  * destined for.
1583  *
1584  * If the event to be sent is on a list, remove it before sending.  If
1585  * asked to, send and detach from the socket as well.
1586  *
1587  * Caller must have the socket locked if the event is attached to the socket.
1588  */
1589 static void
1590 send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
1591 	isc_task_t *task;
1592 
1593 	task = (*dev)->ev_sender;
1594 
1595 	(*dev)->ev_sender = sock;
1596 
1597 	if (ISC_LINK_LINKED(*dev, ev_link))
1598 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1599 
1600 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1601 	    == ISC_SOCKEVENTATTR_ATTACHED)
1602 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1603 	else
1604 		isc_task_send(task, (isc_event_t **)dev);
1605 }
1606 
1607 /*
1608  * See comments for send_recvdone_event() above.
1609  *
1610  * Caller must have the socket locked if the event is attached to the socket.
1611  */
1612 static void
1613 send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
1614 	isc_task_t *task;
1615 
1616 	INSIST(dev != NULL && *dev != NULL);
1617 
1618 	task = (*dev)->ev_sender;
1619 	(*dev)->ev_sender = sock;
1620 
1621 	if (ISC_LINK_LINKED(*dev, ev_link))
1622 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1623 
1624 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1625 	    == ISC_SOCKEVENTATTR_ATTACHED)
1626 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1627 	else
1628 		isc_task_send(task, (isc_event_t **)dev);
1629 }
1630 
1631 static void
1632 internal_recv(isc_task_t *me, isc_event_t *ev) {
1633 	isc_socketevent_t *dev;
1634 	isc__socket_t *sock;
1635 
1636 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
1637 
1638 	sock = ev->ev_sender;
1639 	INSIST(VALID_SOCKET(sock));
1640 
1641 	socket_log(sock, NULL, IOEVENT,
1642 		   "internal_recv: task %p got event %p", me, ev);
1643 
1644 	INSIST(sock->pending_recv == 1);
1645 	sock->pending_recv = 0;
1646 
1647 	INSIST(sock->references > 0);
1648 	sock->references--;  /* the internal event is done with this socket */
1649 	if (sock->references == 0) {
1650 		destroy(&sock);
1651 		return;
1652 	}
1653 
1654 	/*
1655 	 * Try to do as much I/O as possible on this socket.  There are no
1656 	 * limits here, currently.
1657 	 */
1658 	dev = ISC_LIST_HEAD(sock->recv_list);
1659 	while (dev != NULL) {
1660 		switch (doio_recv(sock, dev)) {
1661 		case DOIO_SOFT:
1662 			goto poke;
1663 
1664 		case DOIO_EOF:
1665 			/*
1666 			 * read of 0 means the remote end was closed.
1667 			 * Run through the event queue and dispatch all
1668 			 * the events with an EOF result code.
1669 			 */
1670 			do {
1671 				dev->result = ISC_R_EOF;
1672 				send_recvdone_event(sock, &dev);
1673 				dev = ISC_LIST_HEAD(sock->recv_list);
1674 			} while (dev != NULL);
1675 			goto poke;
1676 
1677 		case DOIO_SUCCESS:
1678 		case DOIO_HARD:
1679 			send_recvdone_event(sock, &dev);
1680 			break;
1681 		}
1682 
1683 		dev = ISC_LIST_HEAD(sock->recv_list);
1684 	}
1685 
1686  poke:
1687 	if (!ISC_LIST_EMPTY(sock->recv_list))
1688 		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1689 }
1690 
1691 static void
1692 internal_send(isc_task_t *me, isc_event_t *ev) {
1693 	isc_socketevent_t *dev;
1694 	isc__socket_t *sock;
1695 
1696 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
1697 
1698 	/*
1699 	 * Find out what socket this is and lock it.
1700 	 */
1701 	sock = (isc__socket_t *)ev->ev_sender;
1702 	INSIST(VALID_SOCKET(sock));
1703 	socket_log(sock, NULL, IOEVENT,
1704 		   "internal_send: task %p got event %p", me, ev);
1705 
1706 	INSIST(sock->pending_send == 1);
1707 	sock->pending_send = 0;
1708 
1709 	INSIST(sock->references > 0);
1710 	sock->references--;  /* the internal event is done with this socket */
1711 	if (sock->references == 0) {
1712 		destroy(&sock);
1713 		return;
1714 	}
1715 
1716 	/*
1717 	 * Try to do as much I/O as possible on this socket.  There are no
1718 	 * limits here, currently.
1719 	 */
1720 	dev = ISC_LIST_HEAD(sock->send_list);
1721 	while (dev != NULL) {
1722 		switch (doio_send(sock, dev)) {
1723 		case DOIO_SOFT:
1724 			goto poke;
1725 
1726 		case DOIO_HARD:
1727 		case DOIO_SUCCESS:
1728 			send_senddone_event(sock, &dev);
1729 			break;
1730 		}
1731 
1732 		dev = ISC_LIST_HEAD(sock->send_list);
1733 	}
1734 
1735  poke:
1736 	if (!ISC_LIST_EMPTY(sock->send_list))
1737 		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
1738 }
1739 
1740 /*
1741  * Process read/writes on each fd here.  Avoid locking
1742  * and unlocking twice if both reads and writes are possible.
1743  */
1744 static void
1745 process_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable,
1746 	   isc_boolean_t writeable)
1747 {
1748 	isc__socket_t *sock;
1749 	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
1750 
1751 	/*
1752 	 * If the socket is going to be closed, don't do more I/O.
1753 	 */
1754 	if (manager->fdstate[fd] == CLOSE_PENDING) {
1755 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1756 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1757 		return;
1758 	}
1759 
1760 	sock = manager->fds[fd];
1761 	if (readable) {
1762 		if (sock == NULL) {
1763 			unwatch_read = ISC_TRUE;
1764 			goto check_write;
1765 		}
1766 		if (!SOCK_DEAD(sock)) {
1767 			dispatch_recv(sock);
1768 		}
1769 		unwatch_read = ISC_TRUE;
1770 	}
1771 check_write:
1772 	if (writeable) {
1773 		if (sock == NULL) {
1774 			unwatch_write = ISC_TRUE;
1775 			goto unlock_fd;
1776 		}
1777 		if (!SOCK_DEAD(sock)) {
1778 			if (sock->connecting)
1779 				dispatch_connect(sock);
1780 			else
1781 				dispatch_send(sock);
1782 		}
1783 		unwatch_write = ISC_TRUE;
1784 	}
1785 
1786  unlock_fd:
1787 	if (unwatch_read)
1788 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1789 	if (unwatch_write)
1790 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1791 
1792 }
1793 
1794 static void
1795 process_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds,
1796 	    fd_set *writefds)
1797 {
1798 	int i;
1799 
1800 	REQUIRE(maxfd <= (int)manager->maxsocks);
1801 
1802 	for (i = 0; i < maxfd; i++) {
1803 		process_fd(manager, i, FD_ISSET(i, readfds),
1804 			   FD_ISSET(i, writefds));
1805 	}
1806 }
1807 
1808 /*
1809  * Create a new socket manager.
1810  */
1811 
1812 static isc_result_t
1813 setup_watcher(isc__socketmgr_t *manager) {
1814 	isc_result_t result;
1815 
1816 	UNUSED(result);
1817 
1818 	manager->fd_bufsize = sizeof(fd_set);
1819 
1820 	manager->read_fds = NULL;
1821 	manager->read_fds_copy = NULL;
1822 	manager->write_fds = NULL;
1823 	manager->write_fds_copy = NULL;
1824 
1825 	manager->read_fds = malloc(manager->fd_bufsize);
1826 	if (manager->read_fds != NULL)
1827 		manager->read_fds_copy = malloc(manager->fd_bufsize);
1828 	if (manager->read_fds_copy != NULL)
1829 		manager->write_fds = malloc(manager->fd_bufsize);
1830 	if (manager->write_fds != NULL) {
1831 		manager->write_fds_copy = malloc(manager->fd_bufsize);
1832 	}
1833 	if (manager->write_fds_copy == NULL) {
1834 		if (manager->write_fds != NULL) {
1835 			free(manager->write_fds);
1836 		}
1837 		if (manager->read_fds_copy != NULL) {
1838 			free(manager->read_fds_copy);
1839 		}
1840 		if (manager->read_fds != NULL) {
1841 			free(manager->read_fds);
1842 		}
1843 		return (ISC_R_NOMEMORY);
1844 	}
1845 	memset(manager->read_fds, 0, manager->fd_bufsize);
1846 	memset(manager->write_fds, 0, manager->fd_bufsize);
1847 
1848 	manager->maxfd = 0;
1849 
1850 	return (ISC_R_SUCCESS);
1851 }
1852 
1853 static void
1854 cleanup_watcher(isc__socketmgr_t *manager) {
1855 
1856 	if (manager->read_fds != NULL)
1857 		free(manager->read_fds);
1858 	if (manager->read_fds_copy != NULL)
1859 		free(manager->read_fds_copy);
1860 	if (manager->write_fds != NULL)
1861 		free(manager->write_fds);
1862 	if (manager->write_fds_copy != NULL)
1863 		free(manager->write_fds_copy);
1864 }
1865 
1866 isc_result_t
1867 isc__socketmgr_create(isc_socketmgr_t **managerp) {
1868 	return (isc__socketmgr_create2(managerp, 0));
1869 }
1870 
1871 isc_result_t
1872 isc__socketmgr_create2(isc_socketmgr_t **managerp,
1873 		       unsigned int maxsocks)
1874 {
1875 	isc__socketmgr_t *manager;
1876 	isc_result_t result;
1877 
1878 	REQUIRE(managerp != NULL && *managerp == NULL);
1879 
1880 	if (socketmgr != NULL) {
1881 		/* Don't allow maxsocks to be updated */
1882 		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
1883 			return (ISC_R_EXISTS);
1884 
1885 		socketmgr->refs++;
1886 		*managerp = (isc_socketmgr_t *)socketmgr;
1887 		return (ISC_R_SUCCESS);
1888 	}
1889 
1890 	if (maxsocks == 0)
1891 		maxsocks = FD_SETSIZE;
1892 
1893 	manager = malloc(sizeof(*manager));
1894 	if (manager == NULL)
1895 		return (ISC_R_NOMEMORY);
1896 
1897 	/* zero-clear so that necessary cleanup on failure will be easy */
1898 	memset(manager, 0, sizeof(*manager));
1899 	manager->maxsocks = maxsocks;
1900 	manager->fds = malloc(manager->maxsocks * sizeof(isc__socket_t *));
1901 	if (manager->fds == NULL) {
1902 		result = ISC_R_NOMEMORY;
1903 		goto free_manager;
1904 	}
1905 	manager->fdstate = malloc(manager->maxsocks * sizeof(int));
1906 	if (manager->fdstate == NULL) {
1907 		result = ISC_R_NOMEMORY;
1908 		goto free_manager;
1909 	}
1910 
1911 	manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
1912 	manager->common.impmagic = SOCKET_MANAGER_MAGIC;
1913 	memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
1914 	ISC_LIST_INIT(manager->socklist);
1915 
1916 	manager->refs = 1;
1917 
1918 	/*
1919 	 * Set up initial state for the select loop
1920 	 */
1921 	result = setup_watcher(manager);
1922 	if (result != ISC_R_SUCCESS)
1923 		goto cleanup;
1924 
1925 	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
1926 
1927 	socketmgr = manager;
1928 	*managerp = (isc_socketmgr_t *)manager;
1929 
1930 	return (ISC_R_SUCCESS);
1931 
1932 cleanup:
1933 
1934 free_manager:
1935 	if (manager->fdstate != NULL) {
1936 		free(manager->fdstate);
1937 	}
1938 	if (manager->fds != NULL) {
1939 		free(manager->fds);
1940 	}
1941 	free(manager);
1942 
1943 	return (result);
1944 }
1945 
1946 void
1947 isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
1948 	isc__socketmgr_t *manager;
1949 	int i;
1950 
1951 	/*
1952 	 * Destroy a socket manager.
1953 	 */
1954 
1955 	REQUIRE(managerp != NULL);
1956 	manager = (isc__socketmgr_t *)*managerp;
1957 	REQUIRE(VALID_MANAGER(manager));
1958 
1959 	manager->refs--;
1960 	if (manager->refs > 0) {
1961 		*managerp = NULL;
1962 		return;
1963 	}
1964 	socketmgr = NULL;
1965 
1966 	/*
1967 	 * Wait for all sockets to be destroyed.
1968 	 */
1969 	while (!ISC_LIST_EMPTY(manager->socklist)) {
1970 		isc__taskmgr_dispatch(NULL);
1971 	}
1972 
1973 	/*
1974 	 * Here, poke our select/poll thread.  Do this by closing the write
1975 	 * half of the pipe, which will send EOF to the read half.
1976 	 * This is currently a no-op in the non-threaded case.
1977 	 */
1978 	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
1979 
1980 	/*
1981 	 * Clean up.
1982 	 */
1983 	cleanup_watcher(manager);
1984 
1985 	for (i = 0; i < (int)manager->maxsocks; i++)
1986 		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
1987 			(void)close(i);
1988 
1989 	free(manager->fds);
1990 	free(manager->fdstate);
1991 
1992 	manager->common.magic = 0;
1993 	manager->common.impmagic = 0;
1994 	free(manager);
1995 
1996 	*managerp = NULL;
1997 
1998 	socketmgr = NULL;
1999 }
2000 
2001 static isc_result_t
2002 socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2003 	    unsigned int flags)
2004 {
2005 	int io_state;
2006 	isc_task_t *ntask = NULL;
2007 	isc_result_t result = ISC_R_SUCCESS;
2008 
2009 	dev->ev_sender = task;
2010 
2011 	if (sock->type == isc_sockettype_udp) {
2012 		io_state = doio_recv(sock, dev);
2013 	} else {
2014 		if (ISC_LIST_EMPTY(sock->recv_list))
2015 			io_state = doio_recv(sock, dev);
2016 		else
2017 			io_state = DOIO_SOFT;
2018 	}
2019 
2020 	switch (io_state) {
2021 	case DOIO_SOFT:
2022 		/*
2023 		 * We couldn't read all or part of the request right now, so
2024 		 * queue it.
2025 		 *
2026 		 * Attach to socket and to task
2027 		 */
2028 		isc_task_attach(task, &ntask);
2029 		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2030 
2031 		/*
2032 		 * Enqueue the request.  If the socket was previously not being
2033 		 * watched, poke the watcher to start paying attention to it.
2034 		 */
2035 		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
2036 			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
2037 		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
2038 
2039 		socket_log(sock, NULL, EVENT, NULL, 0, 0,
2040 			   "socket_recv: event %p -> task %p",
2041 			   dev, ntask);
2042 
2043 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2044 			result = ISC_R_INPROGRESS;
2045 		break;
2046 
2047 	case DOIO_EOF:
2048 		dev->result = ISC_R_EOF;
2049 		/* fallthrough */
2050 
2051 	case DOIO_HARD:
2052 	case DOIO_SUCCESS:
2053 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2054 			send_recvdone_event(sock, &dev);
2055 		break;
2056 	}
2057 
2058 	return (result);
2059 }
2060 
2061 isc_result_t
2062 isc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2063 		  unsigned int minimum, isc_task_t *task,
2064 		  isc_taskaction_t action, void *arg)
2065 {
2066 	isc__socket_t *sock = (isc__socket_t *)sock0;
2067 	isc_socketevent_t *dev;
2068 	isc__socketmgr_t *manager;
2069 	unsigned int iocount;
2070 	isc_buffer_t *buffer;
2071 
2072 	REQUIRE(VALID_SOCKET(sock));
2073 	REQUIRE(buflist != NULL);
2074 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2075 	REQUIRE(task != NULL);
2076 	REQUIRE(action != NULL);
2077 
2078 	manager = sock->manager;
2079 	REQUIRE(VALID_MANAGER(manager));
2080 
2081 	iocount = isc_bufferlist_availablecount(buflist);
2082 	REQUIRE(iocount > 0);
2083 
2084 	INSIST(sock->bound);
2085 
2086 	dev = allocate_socketevent(sock,
2087 				   ISC_SOCKEVENT_RECVDONE, action, arg);
2088 	if (dev == NULL)
2089 		return (ISC_R_NOMEMORY);
2090 
2091 	/*
2092 	 * UDP sockets are always partial read
2093 	 */
2094 	if (sock->type == isc_sockettype_udp)
2095 		dev->minimum = 1;
2096 	else {
2097 		if (minimum == 0)
2098 			dev->minimum = iocount;
2099 		else
2100 			dev->minimum = minimum;
2101 	}
2102 
2103 	/*
2104 	 * Move each buffer from the passed in list to our internal one.
2105 	 */
2106 	buffer = ISC_LIST_HEAD(*buflist);
2107 	while (buffer != NULL) {
2108 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2109 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2110 		buffer = ISC_LIST_HEAD(*buflist);
2111 	}
2112 
2113 	return (socket_recv(sock, dev, task, 0));
2114 }
2115 
2116 static isc_result_t
2117 socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2118 	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2119 	    unsigned int flags)
2120 {
2121 	int io_state;
2122 	isc_task_t *ntask = NULL;
2123 	isc_result_t result = ISC_R_SUCCESS;
2124 
2125 	dev->ev_sender = task;
2126 
2127 	set_dev_address(address, sock, dev);
2128 	if (pktinfo != NULL) {
2129 		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2130 		dev->pktinfo = *pktinfo;
2131 
2132 		if (!isc_sockaddr_issitelocal(&dev->address) &&
2133 		    !isc_sockaddr_islinklocal(&dev->address)) {
2134 			socket_log(sock, NULL, TRACE,
2135 				   "pktinfo structure provided, ifindex %u "
2136 				   "(set to 0)", pktinfo->ipi6_ifindex);
2137 
2138 			/*
2139 			 * Set the pktinfo index to 0 here, to let the
2140 			 * kernel decide what interface it should send on.
2141 			 */
2142 			dev->pktinfo.ipi6_ifindex = 0;
2143 		}
2144 	}
2145 
2146 	if (sock->type == isc_sockettype_udp)
2147 		io_state = doio_send(sock, dev);
2148 	else {
2149 		if (ISC_LIST_EMPTY(sock->send_list))
2150 			io_state = doio_send(sock, dev);
2151 		else
2152 			io_state = DOIO_SOFT;
2153 	}
2154 
2155 	switch (io_state) {
2156 	case DOIO_SOFT:
2157 		/*
2158 		 * We couldn't send all or part of the request right now, so
2159 		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2160 		 */
2161 		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2162 			isc_task_attach(task, &ntask);
2163 			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2164 
2165 			/*
2166 			 * Enqueue the request.  If the socket was previously
2167 			 * not being watched, poke the watcher to start
2168 			 * paying attention to it.
2169 			 */
2170 			if (ISC_LIST_EMPTY(sock->send_list) &&
2171 			    !sock->pending_send)
2172 				select_poke(sock->manager, sock->fd,
2173 					    SELECT_POKE_WRITE);
2174 			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2175 
2176 			socket_log(sock, NULL, EVENT, NULL, 0, 0,
2177 				   "socket_send: event %p -> task %p",
2178 				   dev, ntask);
2179 
2180 			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2181 				result = ISC_R_INPROGRESS;
2182 			break;
2183 		}
2184 
2185 		/* FALLTHROUGH */
2186 
2187 	case DOIO_HARD:
2188 	case DOIO_SUCCESS:
2189 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2190 			send_senddone_event(sock, &dev);
2191 		break;
2192 	}
2193 
2194 	return (result);
2195 }
2196 
2197 isc_result_t
2198 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2199 		  isc_task_t *task, isc_taskaction_t action, void *arg)
2200 {
2201 	return (isc__socket_sendtov2(sock, buflist, task, action, arg, NULL,
2202 				     NULL, 0));
2203 }
2204 
2205 isc_result_t
2206 isc__socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2207 		     isc_task_t *task, isc_taskaction_t action, void *arg,
2208 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2209 		     unsigned int flags)
2210 {
2211 	isc__socket_t *sock = (isc__socket_t *)sock0;
2212 	isc_socketevent_t *dev;
2213 	isc__socketmgr_t *manager;
2214 	unsigned int iocount;
2215 	isc_buffer_t *buffer;
2216 
2217 	REQUIRE(VALID_SOCKET(sock));
2218 	REQUIRE(buflist != NULL);
2219 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2220 	REQUIRE(task != NULL);
2221 	REQUIRE(action != NULL);
2222 
2223 	manager = sock->manager;
2224 	REQUIRE(VALID_MANAGER(manager));
2225 
2226 	iocount = isc_bufferlist_usedcount(buflist);
2227 	REQUIRE(iocount > 0);
2228 
2229 	dev = allocate_socketevent(sock,
2230 				   ISC_SOCKEVENT_SENDDONE, action, arg);
2231 	if (dev == NULL)
2232 		return (ISC_R_NOMEMORY);
2233 
2234 	/*
2235 	 * Move each buffer from the passed in list to our internal one.
2236 	 */
2237 	buffer = ISC_LIST_HEAD(*buflist);
2238 	while (buffer != NULL) {
2239 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2240 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2241 		buffer = ISC_LIST_HEAD(*buflist);
2242 	}
2243 
2244 	return (socket_send(sock, dev, task, address, pktinfo, flags));
2245 }
2246 
2247 isc_result_t
2248 isc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
2249 		 unsigned int options) {
2250 	isc__socket_t *sock = (isc__socket_t *)sock0;
2251 	int on = 1;
2252 
2253 	REQUIRE(VALID_SOCKET(sock));
2254 
2255 	INSIST(!sock->bound);
2256 
2257 	if (sock->pf != sockaddr->type.sa.sa_family) {
2258 		return (ISC_R_FAMILYMISMATCH);
2259 	}
2260 
2261 	/*
2262 	 * Only set SO_REUSEADDR when we want a specific port.
2263 	 */
2264 	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
2265 	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
2266 	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
2267 		       sizeof(on)) < 0) {
2268 		UNEXPECTED_ERROR(__FILE__, __LINE__,
2269 				 "setsockopt(%d) %s", sock->fd, "failed");
2270 		/* Press on... */
2271 	}
2272 	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
2273 		switch (errno) {
2274 		case EACCES:
2275 			return (ISC_R_NOPERM);
2276 		case EADDRNOTAVAIL:
2277 			return (ISC_R_ADDRNOTAVAIL);
2278 		case EADDRINUSE:
2279 			return (ISC_R_ADDRINUSE);
2280 		case EINVAL:
2281 			return (ISC_R_BOUND);
2282 		default:
2283 			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
2284 					 strerror(errno));
2285 			return (ISC_R_UNEXPECTED);
2286 		}
2287 	}
2288 
2289 	socket_log(sock, sockaddr, TRACE, "bound");
2290 	sock->bound = 1;
2291 
2292 	return (ISC_R_SUCCESS);
2293 }
2294 
2295 isc_result_t
2296 isc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
2297 		   isc_task_t *task, isc_taskaction_t action, void *arg)
2298 {
2299 	isc__socket_t *sock = (isc__socket_t *)sock0;
2300 	isc_socket_connev_t *dev;
2301 	isc_task_t *ntask = NULL;
2302 	isc__socketmgr_t *manager;
2303 	int cc;
2304 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
2305 
2306 	REQUIRE(VALID_SOCKET(sock));
2307 	REQUIRE(addr != NULL);
2308 	REQUIRE(task != NULL);
2309 	REQUIRE(action != NULL);
2310 
2311 	manager = sock->manager;
2312 	REQUIRE(VALID_MANAGER(manager));
2313 	REQUIRE(addr != NULL);
2314 
2315 	if (isc_sockaddr_ismulticast(addr))
2316 		return (ISC_R_MULTICAST);
2317 
2318 	REQUIRE(!sock->connecting);
2319 
2320 	dev = (isc_socket_connev_t *)isc_event_allocate(sock,
2321 							ISC_SOCKEVENT_CONNECT,
2322 							action,	arg,
2323 							sizeof(*dev));
2324 	if (dev == NULL) {
2325 		return (ISC_R_NOMEMORY);
2326 	}
2327 	ISC_LINK_INIT(dev, ev_link);
2328 
2329 	/*
2330 	 * Try to do the connect right away, as there can be only one
2331 	 * outstanding, and it might happen to complete.
2332 	 */
2333 	sock->peer_address = *addr;
2334 	cc = connect(sock->fd, &addr->type.sa, addr->length);
2335 	if (cc < 0) {
2336 		/*
2337 		 * HP-UX "fails" to connect a UDP socket and sets errno to
2338 		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
2339 		 * a success and let the user detect it if it's really an error
2340 		 * at the time of sending a packet on the socket.
2341 		 */
2342 		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
2343 			cc = 0;
2344 			goto success;
2345 		}
2346 		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
2347 			goto queue;
2348 
2349 		switch (errno) {
2350 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
2351 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2352 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2353 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2354 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2355 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2356 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2357 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2358 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2359 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2360 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2361 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2362 #undef ERROR_MATCH
2363 		}
2364 
2365 		sock->connected = 0;
2366 
2367 		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
2368 		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
2369 				 addrbuf, errno, strerror(errno));
2370 
2371 		isc_event_free(ISC_EVENT_PTR(&dev));
2372 		return (ISC_R_UNEXPECTED);
2373 
2374 	err_exit:
2375 		sock->connected = 0;
2376 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2377 
2378 		return (ISC_R_SUCCESS);
2379 	}
2380 
2381 	/*
2382 	 * If connect completed, fire off the done event.
2383 	 */
2384  success:
2385 	if (cc == 0) {
2386 		sock->connected = 1;
2387 		sock->bound = 1;
2388 		dev->result = ISC_R_SUCCESS;
2389 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2390 
2391 		return (ISC_R_SUCCESS);
2392 	}
2393 
2394  queue:
2395 
2396 	/*
2397 	 * Attach to task.
2398 	 */
2399 	isc_task_attach(task, &ntask);
2400 
2401 	sock->connecting = 1;
2402 
2403 	dev->ev_sender = ntask;
2404 
2405 	/*
2406 	 * Poke watcher here.  We still have the socket locked, so there
2407 	 * is no race condition.  We will keep the lock for such a short
2408 	 * bit of time waking it up now or later won't matter all that much.
2409 	 */
2410 	if (sock->connect_ev == NULL)
2411 		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
2412 
2413 	sock->connect_ev = dev;
2414 
2415 	return (ISC_R_SUCCESS);
2416 }
2417 
2418 /*
2419  * Called when a socket with a pending connect() finishes.
2420  */
2421 static void
2422 internal_connect(isc_task_t *me, isc_event_t *ev) {
2423 	isc__socket_t *sock;
2424 	isc_socket_connev_t *dev;
2425 	isc_task_t *task;
2426 	int cc;
2427 	socklen_t optlen;
2428 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
2429 
2430 	UNUSED(me);
2431 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2432 
2433 	sock = ev->ev_sender;
2434 	INSIST(VALID_SOCKET(sock));
2435 
2436 	/*
2437 	 * When the internal event was sent the reference count was bumped
2438 	 * to keep the socket around for us.  Decrement the count here.
2439 	 */
2440 	INSIST(sock->references > 0);
2441 	sock->references--;
2442 	if (sock->references == 0) {
2443 		destroy(&sock);
2444 		return;
2445 	}
2446 
2447 	/*
2448 	 * Has this event been canceled?
2449 	 */
2450 	dev = sock->connect_ev;
2451 	if (dev == NULL) {
2452 		INSIST(!sock->connecting);
2453 		return;
2454 	}
2455 
2456 	INSIST(sock->connecting);
2457 	sock->connecting = 0;
2458 
2459 	/*
2460 	 * Get any possible error status here.
2461 	 */
2462 	optlen = sizeof(cc);
2463 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
2464 		       (void *)&cc, (void *)&optlen) < 0)
2465 		cc = errno;
2466 	else
2467 		errno = cc;
2468 
2469 	if (errno != 0) {
2470 		/*
2471 		 * If the error is EAGAIN, just re-select on this
2472 		 * fd and pretend nothing strange happened.
2473 		 */
2474 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
2475 			sock->connecting = 1;
2476 			select_poke(sock->manager, sock->fd,
2477 				    SELECT_POKE_CONNECT);
2478 			return;
2479 		}
2480 
2481 
2482 		/*
2483 		 * Translate other errors into ISC_R_* flavors.
2484 		 */
2485 		switch (errno) {
2486 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
2487 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2488 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2489 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2490 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2491 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2492 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2493 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2494 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2495 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2496 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2497 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
2498 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2499 #undef ERROR_MATCH
2500 		default:
2501 			dev->result = ISC_R_UNEXPECTED;
2502 			isc_sockaddr_format(&sock->peer_address, peerbuf,
2503 					    sizeof(peerbuf));
2504 			UNEXPECTED_ERROR(__FILE__, __LINE__,
2505 					 "internal_connect: connect(%s) %s",
2506 					 peerbuf, strerror(errno));
2507 		}
2508 	} else {
2509 		dev->result = ISC_R_SUCCESS;
2510 		sock->connected = 1;
2511 		sock->bound = 1;
2512 	}
2513 
2514 	sock->connect_ev = NULL;
2515 
2516 	task = dev->ev_sender;
2517 	dev->ev_sender = sock;
2518 	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2519 }
2520 
2521 /*
2522  * Run through the list of events on this socket, and cancel the ones
2523  * queued for task "task" of type "how".  "how" is a bitmask.
2524  */
2525 void
2526 isc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
2527 	isc__socket_t *sock = (isc__socket_t *)sock0;
2528 
2529 	REQUIRE(VALID_SOCKET(sock));
2530 
2531 	/*
2532 	 * Quick exit if there is nothing to do.  Don't even bother locking
2533 	 * in this case.
2534 	 */
2535 	if (how == 0)
2536 		return;
2537 
2538 	/*
2539 	 * All of these do the same thing, more or less.
2540 	 * Each will:
2541 	 *	o If the internal event is marked as "posted" try to
2542 	 *	  remove it from the task's queue.  If this fails, mark it
2543 	 *	  as canceled instead, and let the task clean it up later.
2544 	 *	o For each I/O request for that task of that type, post
2545 	 *	  its done event with status of "ISC_R_CANCELED".
2546 	 *	o Reset any state needed.
2547 	 */
2548 	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
2549 	    && !ISC_LIST_EMPTY(sock->recv_list)) {
2550 		isc_socketevent_t      *dev;
2551 		isc_socketevent_t      *next;
2552 		isc_task_t	       *current_task;
2553 
2554 		dev = ISC_LIST_HEAD(sock->recv_list);
2555 
2556 		while (dev != NULL) {
2557 			current_task = dev->ev_sender;
2558 			next = ISC_LIST_NEXT(dev, ev_link);
2559 
2560 			if ((task == NULL) || (task == current_task)) {
2561 				dev->result = ISC_R_CANCELED;
2562 				send_recvdone_event(sock, &dev);
2563 			}
2564 			dev = next;
2565 		}
2566 	}
2567 
2568 	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
2569 	    && !ISC_LIST_EMPTY(sock->send_list)) {
2570 		isc_socketevent_t      *dev;
2571 		isc_socketevent_t      *next;
2572 		isc_task_t	       *current_task;
2573 
2574 		dev = ISC_LIST_HEAD(sock->send_list);
2575 
2576 		while (dev != NULL) {
2577 			current_task = dev->ev_sender;
2578 			next = ISC_LIST_NEXT(dev, ev_link);
2579 
2580 			if ((task == NULL) || (task == current_task)) {
2581 				dev->result = ISC_R_CANCELED;
2582 				send_senddone_event(sock, &dev);
2583 			}
2584 			dev = next;
2585 		}
2586 	}
2587 
2588 	/*
2589 	 * Connecting is not a list.
2590 	 */
2591 	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
2592 	    && sock->connect_ev != NULL) {
2593 		isc_socket_connev_t    *dev;
2594 		isc_task_t	       *current_task;
2595 
2596 		INSIST(sock->connecting);
2597 		sock->connecting = 0;
2598 
2599 		dev = sock->connect_ev;
2600 		current_task = dev->ev_sender;
2601 
2602 		if ((task == NULL) || (task == current_task)) {
2603 			sock->connect_ev = NULL;
2604 
2605 			dev->result = ISC_R_CANCELED;
2606 			dev->ev_sender = sock;
2607 			isc_task_sendanddetach(&current_task,
2608 					       ISC_EVENT_PTR(&dev));
2609 		}
2610 	}
2611 
2612 }
2613 
/*
 * In our assumed scenario, we can simply use a single static object.
 * XXX: this is not true if the application uses multiple threads with
 *      'multi-context' mode.  Fixing this is a future TODO item.
 *
 * isc__socketmgr_waitevents() fills this object in and hands its address
 * back to the caller; isc__socketmgr_dispatch() requires that same address.
 */
static isc_socketwait_t swait_private;
2620 
2621 int
2622 isc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
2623 			  isc_socketwait_t **swaitp)
2624 {
2625 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2626 	int n;
2627 
2628 	REQUIRE(swaitp != NULL && *swaitp == NULL);
2629 
2630 	if (manager == NULL)
2631 		manager = socketmgr;
2632 	if (manager == NULL)
2633 		return (0);
2634 
2635 	memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize);
2636 	memmove(manager->write_fds_copy, manager->write_fds,
2637 		manager->fd_bufsize);
2638 
2639 	swait_private.readset = manager->read_fds_copy;
2640 	swait_private.writeset = manager->write_fds_copy;
2641 	swait_private.maxfd = manager->maxfd + 1;
2642 
2643 	n = select(swait_private.maxfd, swait_private.readset,
2644 		   swait_private.writeset, NULL, tvp);
2645 
2646 	*swaitp = &swait_private;
2647 	return (n);
2648 }
2649 
2650 isc_result_t
2651 isc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
2652 	isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
2653 
2654 	REQUIRE(swait == &swait_private);
2655 
2656 	if (manager == NULL)
2657 		manager = socketmgr;
2658 	if (manager == NULL)
2659 		return (ISC_R_NOTFOUND);
2660 
2661 	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
2662 	return (ISC_R_SUCCESS);
2663 }
2664 
2665 #include "../socket_api.c"
2666