xref: /openbsd-src/usr.bin/dig/lib/isc/unix/socket.c (revision 8b5538545d486ecceb041780b03e8ef5e76cedd6)
1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
9  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
11  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
13  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14  * PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 /*! \file */
18 
19 #include <sys/socket.h>
20 #include <sys/time.h>
21 #include <sys/uio.h>
22 
23 #include <netinet/tcp.h>
24 
25 #include <errno.h>
26 #include <fcntl.h>
27 #include <stddef.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <unistd.h>
31 
32 #include <isc/buffer.h>
33 #include <isc/bufferlist.h>
34 
35 #include <isc/list.h>
36 #include <isc/log.h>
37 #include <isc/net.h>
38 #include <isc/region.h>
39 #include <isc/socket.h>
40 #include <isc/task.h>
41 #include <isc/util.h>
42 
43 #include "errno2result.h"
44 
45 #include "socket_p.h"
46 #include "../task_p.h"
47 
/*
 * Descriptor-set bundle for the socket manager.
 *
 * NOTE(review): nothing in the visible part of this file reads or writes
 * this structure; the field notes below are inferred from the names and
 * types only and should be confirmed against the wait/dispatch code.
 */
struct isc_socketwait {
	fd_set *readset;	/* presumably the set tested for readability */
	fd_set *writeset;	/* presumably the set tested for writability */
	int nfds;		/* presumably a descriptor count -- TODO confirm */
	int maxfd;		/* presumably the highest descriptor in the sets */
};
54 
/*
 * Set by the -T dscp option on the command line. If set to a value
 * other than -1, we check to make sure DSCP values match it, and
 * assert if not: build_msghdr_send() INSISTs that the DSCP of every UDP
 * send event, or of the socket itself for TCP, equals this value.
 */
int isc_dscp_check_value = -1;
61 
62 /*%
63  * Some systems define the socket length argument as an int, some as size_t,
64  * some as socklen_t.  This is here so it can be easily changed if needed.
65  */
66 
/*%
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.  doio_recv()
 * and doio_send() answer DOIO_SOFT when the saved errno matches.
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here.
 */
#define SOFT_ERROR(e)	((e) == EAGAIN || \
			 (e) == EWOULDBLOCK || \
			 (e) == EINTR || \
			 (e) == 0)
79 
/*
 * Expands to the three consecutive (category, module, level) arguments
 * that socket_log() takes, for a debug message at level x.
 */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*!<
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
#define TRACE_LEVEL		90
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

/* Ready-made socket_log() argument triples for each of the levels above. */
#define TRACE		DLVL(TRACE_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)

/* Type of the per-socket readable_ev / writable_ev internal events. */
typedef isc_event_t intev_t;
99 
100 /*!
101  * IPv6 control information.  If the socket is an IPv6 socket we want
102  * to collect the destination address and interface so the client can
103  * set them on outgoing packets.
104  */
105 
106 /*%
107  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
108  * a setsockopt() like interface to request timestamps, and if the OS
109  * doesn't do it for us, call gettimeofday() on every UDP receive?
110  */
111 
/*%
 * Instead of calculating the cmsgbuf lengths every time we take
 * a rule of thumb approach - sizes are taken from x86_64 linux,
 * multiplied by 2, everything should fit. Those sizes are not
 * large enough to cause any concern.
 */
#define CMSG_SP_IN6PKT 40

#define CMSG_SP_TIMESTAMP 32

#define CMSG_SP_TCTOS 24

#define CMSG_SP_INT 24

/*
 * Sizes of the on-stack control-message buffers: RECVCMSGBUFLEN is used by
 * doio_recv() and handed to recvmsg() as msg_controllen; SENDCMSGBUFLEN is
 * used by doio_send() and is the bound build_msghdr_send() INSISTs on.
 */
#define RECVCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS)+1)
#define SENDCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS)+1)

/*%
 * The number of times a send operation is repeated if the result is EINTR.
 * doio_send() stops retrying once sendmsg() has failed with EINTR this
 * many times in a row.
 */
#define NRETRIES 10
133 
typedef struct isc_socket isc_socket_t;
typedef struct isc_socketmgr isc_socketmgr_t;

/*
 * One managed socket: its descriptor, queued I/O requests and state bits.
 */
struct isc_socket {
	/* Not locked. */
	isc_socketmgr_t	*manager;
	isc_sockettype_t	type;	/* isc_sockettype_udp or isc_sockettype_tcp */

	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t)	link;
	unsigned int		references;	/* zero means dead, see SOCK_DEAD() */
	int			fd;	/* descriptor used for recvmsg()/sendmsg() */
	int			pf;	/* compared against AF_INET / AF_INET6 */

	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	isc_socket_connev_t		       *connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
	intev_t			readable_ev;
	intev_t			writable_ev;

	isc_sockaddr_t		peer_address;       /* remote address */

	unsigned int		pending_recv : 1,
				pending_send : 1,
				connected : 1,
				connecting : 1,     /* connect pending */
				bound : 1,          /* bound to local addr */
				active : 1,         /* currently active */
				pktdscp : 1;	    /* per packet dscp */
	/*
	 * DSCP last applied to the socket; build_msghdr_send() compares it
	 * with the per-event dev->dscp to decide whether to call setsockopt().
	 */
	unsigned int		dscp;
};
171 
/*
 * Socket manager: per-descriptor bookkeeping plus the descriptor sets
 * that watch_fd() and unwatch_fd() maintain.
 */
struct isc_socketmgr {
	/* Not locked. */
	int			fd_bufsize;
	unsigned int		maxsocks;	/* descriptors must be < maxsocks */

	isc_socket_t	       **fds;		/* indexed by descriptor */
	int			*fdstate;	/* CLOSED/MANAGED/CLOSE_PENDING per fd */

	/* Locked by manager lock. */
	ISC_LIST(isc_socket_t)	socklist;
	fd_set			*read_fds;	/* descriptors watched for reading */
	fd_set			*read_fds_copy;
	fd_set			*write_fds;	/* descriptors watched for writing */
	fd_set			*write_fds_copy;
	int			maxfd;
	unsigned int		refs;
};

/* The file-local socket manager pointer; NULL until one is assigned. */
static isc_socketmgr_t *socketmgr = NULL;
191 
/*
 * Per-descriptor states kept in manager->fdstate[].
 */
#define CLOSED			0	/* this one must be zero */
#define MANAGED			1
#define CLOSE_PENDING		2

/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)

/*
 * Forward declarations of file-local functions.
 */
static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
				  isc_sockettype_t type,
				  isc_socket_t **socketp);
static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
static void free_socket(isc_socket_t **);
static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
				    isc_socket_t **);
static void destroy(isc_socket_t **);
static void internal_connect(isc_task_t *, isc_event_t *);
static void internal_recv(isc_task_t *, isc_event_t *);
static void internal_send(isc_task_t *, isc_event_t *);
static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
static void build_msghdr_send(isc_socket_t *, char *, isc_socketevent_t *,
			      struct msghdr *, struct iovec *, size_t *);
static void build_msghdr_recv(isc_socket_t *, char *, isc_socketevent_t *,
			      struct msghdr *, struct iovec *, size_t *);

/*
 * Message codes accepted by select_poke() and forwarded to
 * wakeup_socket(), watch_fd() and unwatch_fd().
 */
#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_READ		(-3)
#define SELECT_POKE_WRITE		(-4)
#define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
#define SELECT_POKE_CLOSE		(-5)

/* True when socket s has no references left. */
#define SOCK_DEAD(s)			((s)->references == 0)
227 
/*%
 * Shortcut index arrays to get access to statistics counters.
 *
 * NOTE(review): none of these identifiers is referenced in the visible
 * part of this file -- confirm they are still used before relying on them.
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9,
	STATID_ACTIVE = 10
};
244 
245 
246 static void
247 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
248 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
249 	   const char *fmt, ...) __attribute__((__format__(__printf__, 6, 7)));
250 static void
251 socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
252 	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
253 	   const char *fmt, ...)
254 {
255 	char msgbuf[2048];
256 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
257 	va_list ap;
258 
259 	if (! isc_log_wouldlog(isc_lctx, level))
260 		return;
261 
262 	va_start(ap, fmt);
263 	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
264 	va_end(ap);
265 
266 	if (address == NULL) {
267 		isc_log_write(isc_lctx, category, module, level,
268 			       "socket %p: %s", sock, msgbuf);
269 	} else {
270 		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
271 		isc_log_write(isc_lctx, category, module, level,
272 			       "socket %p %s: %s", sock, peerbuf, msgbuf);
273 	}
274 }
275 
276 static inline isc_result_t
277 watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
278 	isc_result_t result = ISC_R_SUCCESS;
279 
280 	if (msg == SELECT_POKE_READ)
281 		FD_SET(fd, manager->read_fds);
282 	if (msg == SELECT_POKE_WRITE)
283 		FD_SET(fd, manager->write_fds);
284 
285 	return (result);
286 }
287 
288 static inline isc_result_t
289 unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
290 	isc_result_t result = ISC_R_SUCCESS;
291 
292 	if (msg == SELECT_POKE_READ)
293 		FD_CLR(fd, manager->read_fds);
294 	else if (msg == SELECT_POKE_WRITE)
295 		FD_CLR(fd, manager->write_fds);
296 
297 	return (result);
298 }
299 
300 static void
301 wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
302 	isc_result_t result;
303 
304 	/*
305 	 * This is a wakeup on a socket.  If the socket is not in the
306 	 * process of being closed, start watching it for either reads
307 	 * or writes.
308 	 */
309 
310 	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
311 
312 	if (msg == SELECT_POKE_CLOSE) {
313 		/* No one should be updating fdstate, so no need to lock it */
314 		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
315 		manager->fdstate[fd] = CLOSED;
316 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
317 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
318 		(void)close(fd);
319 		return;
320 	}
321 
322 	if (manager->fdstate[fd] == CLOSE_PENDING) {
323 
324 		/*
325 		 * We accept (and ignore) any error from unwatch_fd() as we are
326 		 * closing the socket, hoping it doesn't leave dangling state in
327 		 * the kernel.
328 		 */
329 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
330 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
331 		return;
332 	}
333 	if (manager->fdstate[fd] != MANAGED) {
334 		return;
335 	}
336 
337 	/*
338 	 * Set requested bit.
339 	 */
340 	result = watch_fd(manager, fd, msg);
341 	if (result != ISC_R_SUCCESS) {
342 		/*
343 		 * XXXJT: what should we do?  Ignoring the failure of watching
344 		 * a socket will make the application dysfunctional, but there
345 		 * seems to be no reasonable recovery process.
346 		 */
347 		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
348 			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
349 			      "failed to start watching FD (%d): %s",
350 			      fd, isc_result_totext(result));
351 	}
352 }
353 
354 /*
355  * Update the state of the socketmgr when something changes.
356  */
357 static void
358 select_poke(isc_socketmgr_t *manager, int fd, int msg) {
359 	if (msg == SELECT_POKE_SHUTDOWN)
360 		return;
361 	else if (fd >= 0)
362 		wakeup_socket(manager, fd, msg);
363 	return;
364 }
365 
366 /*
367  * Make a fd non-blocking.
368  */
369 static isc_result_t
370 make_nonblock(int fd) {
371 	int ret;
372 	int flags;
373 
374 	flags = fcntl(fd, F_GETFL, 0);
375 	flags |= O_NONBLOCK;
376 	ret = fcntl(fd, F_SETFL, flags);
377 
378 	if (ret == -1) {
379 		UNEXPECTED_ERROR(__FILE__, __LINE__,
380 				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
381 				 strerror(errno));
382 		return (ISC_R_UNEXPECTED);
383 	}
384 
385 	return (ISC_R_SUCCESS);
386 }
387 
/*
 * cmsg_len() and cmsg_space() are function wrappers around the CMSG_LEN
 * and CMSG_SPACE macros, so the rest of the file calls them with ordinary
 * socklen_t arguments and results.
 */

/*
 * Return CMSG_LEN(len): the value to store in cmsg_len for a control
 * message carrying 'len' bytes of data.
 */
static inline socklen_t
cmsg_len(socklen_t len) {
	socklen_t total = CMSG_LEN(len);

	return (total);
}
399 
/*
 * Return CMSG_SPACE(len): the buffer space taken by a control message
 * carrying 'len' bytes of data, padding included.
 */
static inline socklen_t
cmsg_space(socklen_t len) {
	socklen_t total = CMSG_SPACE(len);

	return (total);
}
404 
405 /*
406  * Process control messages received on a socket.
407  */
408 static void
409 process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
410 	struct cmsghdr *cmsgp;
411 	struct in6_pktinfo *pktinfop;
412 	void *timevalp;
413 
414 	/*
415 	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
416 	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
417 	 * They are all here, outside of the CPP tests, because it is
418 	 * more consistent with the usual ISC coding style.
419 	 */
420 	UNUSED(sock);
421 	UNUSED(msg);
422 	UNUSED(dev);
423 
424 	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
425 		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
426 
427 	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
428 		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
429 
430 	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
431 		return;
432 
433 	timevalp = NULL;
434 	pktinfop = NULL;
435 
436 	cmsgp = CMSG_FIRSTHDR(msg);
437 	while (cmsgp != NULL) {
438 		socket_log(sock, NULL, TRACE,
439 			   "processing cmsg %p", cmsgp);
440 
441 		if (cmsgp->cmsg_level == IPPROTO_IPV6
442 		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
443 
444 			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
445 			memmove(&dev->pktinfo, pktinfop,
446 				sizeof(struct in6_pktinfo));
447 			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
448 			socket_log(sock, NULL, TRACE,
449 				   "interface received on ifindex %u",
450 				   dev->pktinfo.ipi6_ifindex);
451 			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
452 				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
453 			goto next;
454 		}
455 
456 		if (cmsgp->cmsg_level == SOL_SOCKET
457 		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
458 			struct timeval tv;
459 			timevalp = CMSG_DATA(cmsgp);
460 			memmove(&tv, timevalp, sizeof(tv));
461 			TIMEVAL_TO_TIMESPEC(&tv, &dev->timestamp);
462 			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
463 			goto next;
464 		}
465 
466 		if (cmsgp->cmsg_level == IPPROTO_IPV6
467 		    && cmsgp->cmsg_type == IPV6_TCLASS) {
468 			dev->dscp = *(int *)CMSG_DATA(cmsgp);
469 			dev->dscp >>= 2;
470 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
471 			goto next;
472 		}
473 
474 		if (cmsgp->cmsg_level == IPPROTO_IP
475 		    && (cmsgp->cmsg_type == IP_TOS)) {
476 			dev->dscp = (int) *(unsigned char *)CMSG_DATA(cmsgp);
477 			dev->dscp >>= 2;
478 			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
479 			goto next;
480 		}
481 	next:
482 		cmsgp = CMSG_NXTHDR(msg, cmsgp);
483 	}
484 
485 }
486 
487 /*
488  * Construct an iov array and attach it to the msghdr passed in.  This is
489  * the SEND constructor, which will use the used region of the buffer
490  * (if using a buffer list) or will use the internal region (if a single
491  * buffer I/O is requested).
492  *
493  * Nothing can be NULL, and the done event must list at least one buffer
494  * on the buffer linked list for this function to be meaningful.
495  *
496  * If write_countp != NULL, *write_countp will hold the number of bytes
497  * this transaction can send.
498  */
499 static void
500 build_msghdr_send(isc_socket_t *sock, char* cmsgbuf, isc_socketevent_t *dev,
501 		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
502 {
503 	unsigned int iovcount;
504 	isc_buffer_t *buffer;
505 	isc_region_t used;
506 	size_t write_count;
507 	size_t skip_count;
508 	struct cmsghdr *cmsgp;
509 
510 	memset(msg, 0, sizeof(*msg));
511 
512 	if (!sock->connected) {
513 		msg->msg_name = (void *)&dev->address.type.sa;
514 		msg->msg_namelen = dev->address.length;
515 	} else {
516 		msg->msg_name = NULL;
517 		msg->msg_namelen = 0;
518 	}
519 
520 	buffer = ISC_LIST_HEAD(dev->bufferlist);
521 	write_count = 0;
522 	iovcount = 0;
523 
524 	/*
525 	 * Single buffer I/O?  Skip what we've done so far in this region.
526 	 */
527 	if (buffer == NULL) {
528 		write_count = dev->region.length - dev->n;
529 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
530 		iov[0].iov_len = write_count;
531 		iovcount = 1;
532 
533 		goto config;
534 	}
535 
536 	/*
537 	 * Multibuffer I/O.
538 	 * Skip the data in the buffer list that we have already written.
539 	 */
540 	skip_count = dev->n;
541 	while (buffer != NULL) {
542 		if (skip_count < isc_buffer_usedlength(buffer))
543 			break;
544 		skip_count -= isc_buffer_usedlength(buffer);
545 		buffer = ISC_LIST_NEXT(buffer, link);
546 	}
547 
548 	while (buffer != NULL) {
549 		INSIST(iovcount < MAXSCATTERGATHER_SEND);
550 
551 		isc_buffer_usedregion(buffer, &used);
552 
553 		if (used.length > 0) {
554 			iov[iovcount].iov_base = (void *)(used.base
555 							  + skip_count);
556 			iov[iovcount].iov_len = used.length - skip_count;
557 			write_count += (used.length - skip_count);
558 			skip_count = 0;
559 			iovcount++;
560 		}
561 		buffer = ISC_LIST_NEXT(buffer, link);
562 	}
563 
564 	INSIST(skip_count == 0U);
565 
566  config:
567 	msg->msg_iov = iov;
568 	msg->msg_iovlen = iovcount;
569 
570 	msg->msg_control = NULL;
571 	msg->msg_controllen = 0;
572 	msg->msg_flags = 0;
573 
574 	if ((sock->type == isc_sockettype_udp) &&
575 	    ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0))
576 	{
577 		struct in6_pktinfo *pktinfop;
578 
579 		socket_log(sock, NULL, TRACE,
580 			   "sendto pktinfo data, ifindex %u",
581 			   dev->pktinfo.ipi6_ifindex);
582 
583 		msg->msg_control = (void *)cmsgbuf;
584 		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
585 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
586 
587 		cmsgp = (struct cmsghdr *)cmsgbuf;
588 		cmsgp->cmsg_level = IPPROTO_IPV6;
589 		cmsgp->cmsg_type = IPV6_PKTINFO;
590 		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
591 		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
592 		memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
593 	}
594 
595 	if ((sock->type == isc_sockettype_udp) &&
596 	    ((dev->attributes & ISC_SOCKEVENTATTR_USEMINMTU) != 0))
597 	{
598 		int use_min_mtu = 1;	/* -1, 0, 1 */
599 
600 		cmsgp = (struct cmsghdr *)(cmsgbuf +
601 					   msg->msg_controllen);
602 
603 		msg->msg_control = (void *)cmsgbuf;
604 		msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
605 		INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
606 
607 		cmsgp->cmsg_level = IPPROTO_IPV6;
608 		cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
609 		cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
610 		memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
611 	}
612 
613 	if (isc_dscp_check_value > -1) {
614 		if (sock->type == isc_sockettype_udp)
615 			INSIST((int)dev->dscp == isc_dscp_check_value);
616 		else if (sock->type == isc_sockettype_tcp)
617 			INSIST((int)sock->dscp == isc_dscp_check_value);
618 	}
619 
620 	if ((sock->type == isc_sockettype_udp) &&
621 	    ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
622 	{
623 		int dscp = (dev->dscp << 2) & 0xff;
624 
625 		INSIST(dev->dscp < 0x40);
626 
627 		if (sock->pf == AF_INET && sock->pktdscp) {
628 			cmsgp = (struct cmsghdr *)(cmsgbuf +
629 						   msg->msg_controllen);
630 			msg->msg_control = (void *)cmsgbuf;
631 			msg->msg_controllen += cmsg_space(sizeof(dscp));
632 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
633 
634 			cmsgp->cmsg_level = IPPROTO_IP;
635 			cmsgp->cmsg_type = IP_TOS;
636 			cmsgp->cmsg_len = cmsg_len(sizeof(char));
637 			*(unsigned char*)CMSG_DATA(cmsgp) = dscp;
638 		} else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
639 			if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
640 			       (void *)&dscp, sizeof(int)) < 0)
641 			{
642 				UNEXPECTED_ERROR(__FILE__, __LINE__,
643 						 "setsockopt(%d, IP_TOS, %.02x)"
644 						 " %s: %s",
645 						 sock->fd, dscp >> 2,
646 						 "failed", strerror(errno));
647 			} else
648 				sock->dscp = dscp;
649 		}
650 
651 		if (sock->pf == AF_INET6 && sock->pktdscp) {
652 			cmsgp = (struct cmsghdr *)(cmsgbuf +
653 						   msg->msg_controllen);
654 			msg->msg_control = (void *)cmsgbuf;
655 			msg->msg_controllen += cmsg_space(sizeof(dscp));
656 			INSIST(msg->msg_controllen <= SENDCMSGBUFLEN);
657 
658 			cmsgp->cmsg_level = IPPROTO_IPV6;
659 			cmsgp->cmsg_type = IPV6_TCLASS;
660 			cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
661 			memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
662 		} else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
663 			if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
664 				       (void *)&dscp, sizeof(int)) < 0) {
665 				UNEXPECTED_ERROR(__FILE__, __LINE__,
666 						 "setsockopt(%d, IPV6_TCLASS, "
667 						 "%.02x) %s: %s",
668 						 sock->fd, dscp >> 2,
669 						 "failed", strerror(errno));
670 			} else
671 				sock->dscp = dscp;
672 		}
673 
674 		if (msg->msg_controllen != 0 &&
675 		    msg->msg_controllen < SENDCMSGBUFLEN)
676 		{
677 			memset(cmsgbuf + msg->msg_controllen, 0,
678 			       SENDCMSGBUFLEN - msg->msg_controllen);
679 		}
680 	}
681 
682 	if (write_countp != NULL)
683 		*write_countp = write_count;
684 }
685 
686 /*
687  * Construct an iov array and attach it to the msghdr passed in.  This is
688  * the RECV constructor, which will use the available region of the buffer
689  * (if using a buffer list) or will use the internal region (if a single
690  * buffer I/O is requested).
691  *
692  * Nothing can be NULL, and the done event must list at least one buffer
693  * on the buffer linked list for this function to be meaningful.
694  *
695  * If read_countp != NULL, *read_countp will hold the number of bytes
696  * this transaction can receive.
697  */
698 static void
699 build_msghdr_recv(isc_socket_t *sock, char *cmsgbuf, isc_socketevent_t *dev,
700 		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
701 {
702 	unsigned int iovcount;
703 	isc_buffer_t *buffer;
704 	isc_region_t available;
705 	size_t read_count;
706 
707 	memset(msg, 0, sizeof(struct msghdr));
708 
709 	if (sock->type == isc_sockettype_udp) {
710 		memset(&dev->address, 0, sizeof(dev->address));
711 		msg->msg_name = (void *)&dev->address.type.sa;
712 		msg->msg_namelen = sizeof(dev->address.type);
713 	} else { /* TCP */
714 		msg->msg_name = NULL;
715 		msg->msg_namelen = 0;
716 		dev->address = sock->peer_address;
717 	}
718 
719 	buffer = ISC_LIST_HEAD(dev->bufferlist);
720 	read_count = 0;
721 
722 	/*
723 	 * Single buffer I/O?  Skip what we've done so far in this region.
724 	 */
725 	if (buffer == NULL) {
726 		read_count = dev->region.length - dev->n;
727 		iov[0].iov_base = (void *)(dev->region.base + dev->n);
728 		iov[0].iov_len = read_count;
729 		iovcount = 1;
730 
731 		goto config;
732 	}
733 
734 	/*
735 	 * Multibuffer I/O.
736 	 * Skip empty buffers.
737 	 */
738 	while (buffer != NULL) {
739 		if (isc_buffer_availablelength(buffer) != 0)
740 			break;
741 		buffer = ISC_LIST_NEXT(buffer, link);
742 	}
743 
744 	iovcount = 0;
745 	while (buffer != NULL) {
746 		INSIST(iovcount < MAXSCATTERGATHER_RECV);
747 
748 		isc_buffer_availableregion(buffer, &available);
749 
750 		if (available.length > 0) {
751 			iov[iovcount].iov_base = (void *)(available.base);
752 			iov[iovcount].iov_len = available.length;
753 			read_count += available.length;
754 			iovcount++;
755 		}
756 		buffer = ISC_LIST_NEXT(buffer, link);
757 	}
758 
759  config:
760 
761 	/*
762 	 * If needed, set up to receive that one extra byte.
763 	 */
764 	msg->msg_iov = iov;
765 	msg->msg_iovlen = iovcount;
766 
767 	msg->msg_control = cmsgbuf;
768 	msg->msg_controllen = RECVCMSGBUFLEN;
769 	msg->msg_flags = 0;
770 
771 	if (read_countp != NULL)
772 		*read_countp = read_count;
773 }
774 
775 static void
776 set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
777 		isc_socketevent_t *dev)
778 {
779 	if (sock->type == isc_sockettype_udp) {
780 		if (address != NULL)
781 			dev->address = *address;
782 		else
783 			dev->address = sock->peer_address;
784 	} else if (sock->type == isc_sockettype_tcp) {
785 		INSIST(address == NULL);
786 		dev->address = sock->peer_address;
787 	}
788 }
789 
790 static void
791 destroy_socketevent(isc_event_t *event) {
792 	isc_socketevent_t *ev = (isc_socketevent_t *)event;
793 
794 	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
795 
796 	(ev->destroy)(event);
797 }
798 
799 static isc_socketevent_t *
800 allocate_socketevent(void *sender,
801 		     isc_eventtype_t eventtype, isc_taskaction_t action,
802 		     void *arg)
803 {
804 	isc_socketevent_t *ev;
805 
806 	ev = (isc_socketevent_t *)isc_event_allocate(sender,
807 						     eventtype, action, arg,
808 						     sizeof(*ev));
809 
810 	if (ev == NULL)
811 		return (NULL);
812 
813 	ev->result = ISC_R_UNSET;
814 	ISC_LINK_INIT(ev, ev_link);
815 	ISC_LIST_INIT(ev->bufferlist);
816 	ev->region.base = NULL;
817 	ev->n = 0;
818 	ev->offset = 0;
819 	ev->attributes = 0;
820 	ev->destroy = ev->ev_destroy;
821 	ev->ev_destroy = destroy_socketevent;
822 	ev->dscp = 0;
823 
824 	return (ev);
825 }
826 
/*
 * Return values of doio_recv() and doio_send().
 */
#define DOIO_SUCCESS		0	/* i/o ok, event sent */
#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD		2	/* i/o error, event sent */
#define DOIO_EOF		3	/* EOF, no event sent */
831 
832 static int
833 doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
834 	int cc;
835 	struct iovec iov[MAXSCATTERGATHER_RECV];
836 	size_t read_count;
837 	size_t actual_count;
838 	struct msghdr msghdr;
839 	isc_buffer_t *buffer;
840 	int recv_errno;
841 	char cmsgbuf[RECVCMSGBUFLEN] = {0};
842 
843 	build_msghdr_recv(sock, cmsgbuf, dev, &msghdr, iov, &read_count);
844 
845 	cc = recvmsg(sock->fd, &msghdr, 0);
846 	recv_errno = errno;
847 
848 	if (cc < 0) {
849 		if (SOFT_ERROR(recv_errno))
850 			return (DOIO_SOFT);
851 
852 		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
853 			socket_log(sock, NULL, IOEVENT,
854 				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
855 				   sock->fd, cc, recv_errno,
856 				   strerror(recv_errno));
857 		}
858 
859 #define SOFT_OR_HARD(_system, _isc) \
860 	if (recv_errno == _system) { \
861 		if (sock->connected) { \
862 			dev->result = _isc; \
863 			return (DOIO_HARD); \
864 		} \
865 		return (DOIO_SOFT); \
866 	}
867 #define ALWAYS_HARD(_system, _isc) \
868 	if (recv_errno == _system) { \
869 		dev->result = _isc; \
870 		return (DOIO_HARD); \
871 	}
872 
873 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
874 		SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
875 		SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
876 		SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
877 		/* HPUX 11.11 can return EADDRNOTAVAIL. */
878 		SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
879 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
880 		/* Should never get this one but it was seen. */
881 		SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
882 		/*
883 		 * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
884 		 * errors.
885 		 */
886 		SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
887 		SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
888 
889 #undef SOFT_OR_HARD
890 #undef ALWAYS_HARD
891 
892 		dev->result = isc__errno2result(recv_errno);
893 		return (DOIO_HARD);
894 	}
895 
896 	/*
897 	 * On TCP and UNIX sockets, zero length reads indicate EOF,
898 	 * while on UDP sockets, zero length reads are perfectly valid,
899 	 * although strange.
900 	 */
901 	switch (sock->type) {
902 	case isc_sockettype_tcp:
903 		if (cc == 0)
904 			return (DOIO_EOF);
905 		break;
906 	case isc_sockettype_udp:
907 		break;
908 	default:
909 		INSIST(0);
910 	}
911 
912 	if (sock->type == isc_sockettype_udp) {
913 		dev->address.length = msghdr.msg_namelen;
914 		if (isc_sockaddr_getport(&dev->address) == 0) {
915 			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
916 				socket_log(sock, &dev->address, IOEVENT,
917 					   "dropping source port zero packet");
918 			}
919 			return (DOIO_SOFT);
920 		}
921 	}
922 
923 	socket_log(sock, &dev->address, IOEVENT,
924 		   "packet received correctly");
925 
926 	/*
927 	 * Overflow bit detection.  If we received MORE bytes than we should,
928 	 * this indicates an overflow situation.  Set the flag in the
929 	 * dev entry and adjust how much we read by one.
930 	 */
931 	/*
932 	 * If there are control messages attached, run through them and pull
933 	 * out the interesting bits.
934 	 */
935 	process_cmsg(sock, &msghdr, dev);
936 
937 	/*
938 	 * update the buffers (if any) and the i/o count
939 	 */
940 	dev->n += cc;
941 	actual_count = cc;
942 	buffer = ISC_LIST_HEAD(dev->bufferlist);
943 	while (buffer != NULL && actual_count > 0U) {
944 		if (isc_buffer_availablelength(buffer) <= actual_count) {
945 			actual_count -= isc_buffer_availablelength(buffer);
946 			isc_buffer_add(buffer,
947 				       isc_buffer_availablelength(buffer));
948 		} else {
949 			isc_buffer_add(buffer, actual_count);
950 			actual_count = 0;
951 			POST(actual_count);
952 			break;
953 		}
954 		buffer = ISC_LIST_NEXT(buffer, link);
955 		if (buffer == NULL) {
956 			INSIST(actual_count == 0U);
957 		}
958 	}
959 
960 	/*
961 	 * If we read less than we expected, update counters,
962 	 * and let the upper layer poke the descriptor.
963 	 */
964 	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
965 		return (DOIO_SOFT);
966 
967 	/*
968 	 * Full reads are posted, or partials if partials are ok.
969 	 */
970 	dev->result = ISC_R_SUCCESS;
971 	return (DOIO_SUCCESS);
972 }
973 
974 /*
975  * Returns:
976  *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
977  *			ISC_R_SUCCESS.
978  *
979  *	DOIO_HARD	A hard or unexpected I/O error was encountered.
980  *			dev->result contains the appropriate error.
981  *
982  *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
983  *			event was sent.  The operation should be retried.
984  *
985  *	No other return values are possible.
986  */
987 static int
988 doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
989 	int cc;
990 	struct iovec iov[MAXSCATTERGATHER_SEND];
991 	size_t write_count;
992 	struct msghdr msghdr;
993 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
994 	int attempts = 0;
995 	int send_errno;
996 	char cmsgbuf[SENDCMSGBUFLEN] = {0};
997 
998 	build_msghdr_send(sock, cmsgbuf, dev, &msghdr, iov, &write_count);
999 
1000  resend:
1001 	cc = sendmsg(sock->fd, &msghdr, 0);
1002 	send_errno = errno;
1003 
1004 	/*
1005 	 * Check for error or block condition.
1006 	 */
1007 	if (cc < 0) {
1008 		if (send_errno == EINTR && ++attempts < NRETRIES)
1009 			goto resend;
1010 
1011 		if (SOFT_ERROR(send_errno)) {
1012 			if (errno == EWOULDBLOCK || errno == EAGAIN)
1013 				dev->result = ISC_R_WOULDBLOCK;
1014 			return (DOIO_SOFT);
1015 		}
1016 
1017 #define SOFT_OR_HARD(_system, _isc) \
1018 	if (send_errno == _system) { \
1019 		if (sock->connected) { \
1020 			dev->result = _isc; \
1021 			return (DOIO_HARD); \
1022 		} \
1023 		return (DOIO_SOFT); \
1024 	}
1025 #define ALWAYS_HARD(_system, _isc) \
1026 	if (send_errno == _system) { \
1027 		dev->result = _isc; \
1028 		return (DOIO_HARD); \
1029 	}
1030 
1031 		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
1032 		ALWAYS_HARD(EACCES, ISC_R_NOPERM);
1033 		ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
1034 		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
1035 		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
1036 		ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
1037 		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
1038 		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
1039 		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
1040 		ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
1041 		ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
1042 
1043 #undef SOFT_OR_HARD
1044 #undef ALWAYS_HARD
1045 
1046 		/*
1047 		 * The other error types depend on whether or not the
1048 		 * socket is UDP or TCP.  If it is UDP, some errors
1049 		 * that we expect to be fatal under TCP are merely
1050 		 * annoying, and are really soft errors.
1051 		 *
1052 		 * However, these soft errors are still returned as
1053 		 * a status.
1054 		 */
1055 		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1056 		UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
1057 				 addrbuf, strerror(send_errno));
1058 		dev->result = isc__errno2result(send_errno);
1059 		return (DOIO_HARD);
1060 	}
1061 
1062 	if (cc == 0) {
1063 		UNEXPECTED_ERROR(__FILE__, __LINE__,
1064 				 "doio_send: send() %s 0", "returned");
1065 	}
1066 
1067 	/*
1068 	 * If we write less than we expected, update counters, poke.
1069 	 */
1070 	dev->n += cc;
1071 	if ((size_t)cc != write_count)
1072 		return (DOIO_SOFT);
1073 
1074 	/*
1075 	 * Exactly what we wanted to write.  We're done with this
1076 	 * entry.  Post its completion event.
1077 	 */
1078 	dev->result = ISC_R_SUCCESS;
1079 	return (DOIO_SUCCESS);
1080 }
1081 
1082 /*
1083  * Kill.
1084  *
1085  * Caller must ensure that the socket is not locked and no external
1086  * references exist.
1087  */
1088 static void
1089 socketclose(isc_socketmgr_t *manager, isc_socket_t *sock, int fd) {
1090 	/*
1091 	 * No one has this socket open, so the watcher doesn't have to be
1092 	 * poked, and the socket doesn't have to be locked.
1093 	 */
1094 	manager->fds[fd] = NULL;
1095 	manager->fdstate[fd] = CLOSE_PENDING;
1096 	select_poke(manager, fd, SELECT_POKE_CLOSE);
1097 
1098 	if (sock->active == 1) {
1099 		sock->active = 0;
1100 	}
1101 
1102 	/*
1103 	 * update manager->maxfd here (XXX: this should be implemented more
1104 	 * efficiently)
1105 	 */
1106 	if (manager->maxfd == fd) {
1107 		int i;
1108 
1109 		manager->maxfd = 0;
1110 		for (i = fd - 1; i >= 0; i--) {
1111 			if (manager->fdstate[i] == MANAGED) {
1112 				manager->maxfd = i;
1113 				break;
1114 			}
1115 		}
1116 	}
1117 
1118 }
1119 
1120 static void
1121 destroy(isc_socket_t **sockp) {
1122 	int fd;
1123 	isc_socket_t *sock = *sockp;
1124 	isc_socketmgr_t *manager = sock->manager;
1125 
1126 	socket_log(sock, NULL, CREATION, "destroying");
1127 
1128 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1129 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1130 	INSIST(sock->connect_ev == NULL);
1131 	INSIST(sock->fd >= -1 && sock->fd < (int)manager->maxsocks);
1132 
1133 	if (sock->fd >= 0) {
1134 		fd = sock->fd;
1135 		sock->fd = -1;
1136 		socketclose(manager, sock, fd);
1137 	}
1138 
1139 	ISC_LIST_UNLINK(manager->socklist, sock, link);
1140 
1141 	/* can't unlock manager as its memory context is still used */
1142 	free_socket(sockp);
1143 }
1144 
1145 static isc_result_t
1146 allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1147 		isc_socket_t **socketp)
1148 {
1149 	isc_socket_t *sock;
1150 
1151 	sock = malloc(sizeof(*sock));
1152 
1153 	if (sock == NULL)
1154 		return (ISC_R_NOMEMORY);
1155 
1156 	sock->references = 0;
1157 
1158 	sock->manager = manager;
1159 	sock->type = type;
1160 	sock->fd = -1;
1161 	sock->dscp = 0;		/* TOS/TCLASS is zero until set. */
1162 	sock->active = 0;
1163 
1164 	ISC_LINK_INIT(sock, link);
1165 
1166 	/*
1167 	 * Set up list of readers and writers to be initially empty.
1168 	 */
1169 	ISC_LIST_INIT(sock->recv_list);
1170 	ISC_LIST_INIT(sock->send_list);
1171 	sock->connect_ev = NULL;
1172 	sock->pending_recv = 0;
1173 	sock->pending_send = 0;
1174 	sock->connected = 0;
1175 	sock->connecting = 0;
1176 	sock->bound = 0;
1177 	sock->pktdscp = 0;
1178 
1179 	/*
1180 	 * Initialize readable and writable events.
1181 	 */
1182 	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
1183 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
1184 		       NULL, sock, sock, NULL);
1185 	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
1186 		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
1187 		       NULL, sock, sock, NULL);
1188 
1189 	*socketp = sock;
1190 
1191 	return (ISC_R_SUCCESS);
1192 }
1193 
1194 /*
1195  * This event requires that the various lists be empty, that the reference
1196  * count be 1.  The other socket bits,
1197  * like the lock, must be initialized as well.  The fd associated must be
1198  * marked as closed, by setting it to -1 on close, or this routine will
1199  * also close the socket.
1200  */
1201 static void
1202 free_socket(isc_socket_t **socketp) {
1203 	isc_socket_t *sock = *socketp;
1204 
1205 	INSIST(sock->references == 0);
1206 	INSIST(!sock->connecting);
1207 	INSIST(!sock->pending_recv);
1208 	INSIST(!sock->pending_send);
1209 	INSIST(ISC_LIST_EMPTY(sock->recv_list));
1210 	INSIST(ISC_LIST_EMPTY(sock->send_list));
1211 	INSIST(!ISC_LINK_LINKED(sock, link));
1212 
1213 	free(sock);
1214 
1215 	*socketp = NULL;
1216 }
1217 
1218 static void
1219 use_min_mtu(isc_socket_t *sock) {
1220 	/* use minimum MTU */
1221 	if (sock->pf == AF_INET6) {
1222 		int on = 1;
1223 		(void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
1224 				(void *)&on, sizeof(on));
1225 	}
1226 }
1227 
1228 static void
1229 set_tcp_maxseg(isc_socket_t *sock, int size) {
1230 	if (sock->type == isc_sockettype_tcp)
1231 		(void)setsockopt(sock->fd, IPPROTO_TCP, TCP_MAXSEG,
1232 				(void *)&size, sizeof(size));
1233 }
1234 
1235 static isc_result_t
1236 opensocket(isc_socket_t *sock)
1237 {
1238 	isc_result_t result;
1239 	const char *err = "socket";
1240 	int on = 1;
1241 
1242 	switch (sock->type) {
1243 	case isc_sockettype_udp:
1244 		sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
1245 		break;
1246 	case isc_sockettype_tcp:
1247 		sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
1248 		break;
1249 	}
1250 
1251 	if (sock->fd < 0) {
1252 		switch (errno) {
1253 		case EMFILE:
1254 		case ENFILE:
1255 			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1256 				       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
1257 				       "%s: %s", err, strerror(errno));
1258 			/* fallthrough */
1259 		case ENOBUFS:
1260 			return (ISC_R_NORESOURCES);
1261 
1262 		case EPROTONOSUPPORT:
1263 		case EPFNOSUPPORT:
1264 		case EAFNOSUPPORT:
1265 		/*
1266 		 * Linux 2.2 (and maybe others) return EINVAL instead of
1267 		 * EAFNOSUPPORT.
1268 		 */
1269 		case EINVAL:
1270 			return (ISC_R_FAMILYNOSUPPORT);
1271 
1272 		default:
1273 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1274 					 "%s() %s: %s", err, "failed",
1275 					 strerror(errno));
1276 			return (ISC_R_UNEXPECTED);
1277 		}
1278 	}
1279 
1280 	result = make_nonblock(sock->fd);
1281 	if (result != ISC_R_SUCCESS) {
1282 		(void)close(sock->fd);
1283 		return (result);
1284 	}
1285 
1286 	/*
1287 	 * Use minimum mtu if possible.
1288 	 */
1289 	if (sock->type == isc_sockettype_tcp && sock->pf == AF_INET6) {
1290 		use_min_mtu(sock);
1291 		set_tcp_maxseg(sock, 1280 - 20 - 40); /* 1280 - TCP - IPV6 */
1292 	}
1293 
1294 	if (sock->type == isc_sockettype_udp) {
1295 
1296 		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
1297 			       (void *)&on, sizeof(on)) < 0
1298 		    && errno != ENOPROTOOPT) {
1299 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1300 					 "setsockopt(%d, SO_TIMESTAMP) %s: %s",
1301 					 sock->fd, "failed", strerror(errno));
1302 			/* Press on... */
1303 		}
1304 
1305 		/* RFC 3542 */
1306 		if ((sock->pf == AF_INET6)
1307 		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1308 				   (void *)&on, sizeof(on)) < 0)) {
1309 			UNEXPECTED_ERROR(__FILE__, __LINE__,
1310 					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1311 					 "%s: %s", sock->fd, "failed",
1312 					 strerror(errno));
1313 		}
1314 	}
1315 
1316 	if (sock->active == 0) {
1317 		sock->active = 1;
1318 	}
1319 
1320 	return (ISC_R_SUCCESS);
1321 }
1322 
1323 /*
1324  * Create a 'type' socket managed
1325  * by 'manager'.  Events will be posted to 'task' and when dispatched
1326  * 'action' will be called with 'arg' as the arg value.  The new
1327  * socket is returned in 'socketp'.
1328  */
1329 static isc_result_t
1330 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1331 	      isc_socket_t **socketp)
1332 {
1333 	isc_socket_t *sock = NULL;
1334 	isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0;
1335 	isc_result_t result;
1336 
1337 	REQUIRE(socketp != NULL && *socketp == NULL);
1338 
1339 	result = allocate_socket(manager, type, &sock);
1340 	if (result != ISC_R_SUCCESS)
1341 		return (result);
1342 
1343 	switch (sock->type) {
1344 	case isc_sockettype_udp:
1345 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
1346 		sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
1347 		break;
1348 	case isc_sockettype_tcp:
1349 		break;
1350 	default:
1351 		INSIST(0);
1352 	}
1353 
1354 	sock->pf = pf;
1355 
1356 	result = opensocket(sock);
1357 	if (result != ISC_R_SUCCESS) {
1358 		free_socket(&sock);
1359 		return (result);
1360 	}
1361 
1362 	sock->references = 1;
1363 	*socketp = (isc_socket_t *)sock;
1364 
1365 	/*
1366 	 * Note we don't have to lock the socket like we normally would because
1367 	 * there are no external references to it yet.
1368 	 */
1369 
1370 	manager->fds[sock->fd] = sock;
1371 	manager->fdstate[sock->fd] = MANAGED;
1372 
1373 	ISC_LIST_APPEND(manager->socklist, sock, link);
1374 	if (manager->maxfd < sock->fd)
1375 		manager->maxfd = sock->fd;
1376 
1377 	socket_log(sock, NULL, CREATION, "created");
1378 
1379 	return (ISC_R_SUCCESS);
1380 }
1381 
1382 /*%
1383  * Create a new 'type' socket managed by 'manager'.  Events
1384  * will be posted to 'task' and when dispatched 'action' will be
1385  * called with 'arg' as the arg value.  The new socket is returned
1386  * in 'socketp'.
1387  */
1388 isc_result_t
1389 isc_socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
1390 		   isc_socket_t **socketp)
1391 {
1392 	return (socket_create(manager0, pf, type, socketp));
1393 }
1394 
1395 /*
1396  * Attach to a socket.  Caller must explicitly detach when it is done.
1397  */
1398 void
1399 isc_socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
1400 	isc_socket_t *sock = (isc_socket_t *)sock0;
1401 
1402 	REQUIRE(socketp != NULL && *socketp == NULL);
1403 
1404 	sock->references++;
1405 
1406 	*socketp = (isc_socket_t *)sock;
1407 }
1408 
1409 /*
1410  * Dereference a socket.  If this is the last reference to it, clean things
1411  * up by destroying the socket.
1412  */
1413 void
1414 isc_socket_detach(isc_socket_t **socketp) {
1415 	isc_socket_t *sock;
1416 	isc_boolean_t kill_socket = ISC_FALSE;
1417 
1418 	REQUIRE(socketp != NULL);
1419 	sock = (isc_socket_t *)*socketp;
1420 
1421 	REQUIRE(sock->references > 0);
1422 	sock->references--;
1423 	if (sock->references == 0)
1424 		kill_socket = ISC_TRUE;
1425 
1426 	if (kill_socket)
1427 		destroy(&sock);
1428 
1429 	*socketp = NULL;
1430 }
1431 
1432 /*
1433  * I/O is possible on a given socket.  Schedule an event to this task that
1434  * will call an internal function to do the I/O.  This will charge the
1435  * task with the I/O operation and let our select loop handler get back
1436  * to doing something real as fast as possible.
1437  *
1438  * The socket and manager must be locked before calling this function.
1439  */
1440 static void
1441 dispatch_recv(isc_socket_t *sock) {
1442 	intev_t *iev;
1443 	isc_socketevent_t *ev;
1444 	isc_task_t *sender;
1445 
1446 	INSIST(!sock->pending_recv);
1447 
1448 	ev = ISC_LIST_HEAD(sock->recv_list);
1449 	if (ev == NULL)
1450 		return;
1451 	socket_log(sock, NULL, EVENT, NULL, 0, 0,
1452 		   "dispatch_recv:  event %p -> task %p",
1453 		   ev, ev->ev_sender);
1454 	sender = ev->ev_sender;
1455 
1456 	sock->pending_recv = 1;
1457 	iev = &sock->readable_ev;
1458 
1459 	sock->references++;
1460 	iev->ev_sender = sock;
1461 	iev->ev_action = internal_recv;
1462 	iev->ev_arg = sock;
1463 
1464 	isc_task_send(sender, (isc_event_t **)&iev);
1465 }
1466 
1467 static void
1468 dispatch_send(isc_socket_t *sock) {
1469 	intev_t *iev;
1470 	isc_socketevent_t *ev;
1471 	isc_task_t *sender;
1472 
1473 	INSIST(!sock->pending_send);
1474 
1475 	ev = ISC_LIST_HEAD(sock->send_list);
1476 	if (ev == NULL)
1477 		return;
1478 	socket_log(sock, NULL, EVENT, NULL, 0, 0,
1479 		   "dispatch_send:  event %p -> task %p",
1480 		   ev, ev->ev_sender);
1481 	sender = ev->ev_sender;
1482 
1483 	sock->pending_send = 1;
1484 	iev = &sock->writable_ev;
1485 
1486 	sock->references++;
1487 	iev->ev_sender = sock;
1488 	iev->ev_action = internal_send;
1489 	iev->ev_arg = sock;
1490 
1491 	isc_task_send(sender, (isc_event_t **)&iev);
1492 }
1493 
1494 static void
1495 dispatch_connect(isc_socket_t *sock) {
1496 	intev_t *iev;
1497 	isc_socket_connev_t *ev;
1498 
1499 	iev = &sock->writable_ev;
1500 
1501 	ev = sock->connect_ev;
1502 	INSIST(ev != NULL); /* XXX */
1503 
1504 	INSIST(sock->connecting);
1505 
1506 	sock->references++;  /* keep socket around for this internal event */
1507 	iev->ev_sender = sock;
1508 	iev->ev_action = internal_connect;
1509 	iev->ev_arg = sock;
1510 
1511 	isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
1512 }
1513 
1514 /*
1515  * Dequeue an item off the given socket's read queue, set the result code
1516  * in the done event to the one provided, and send it to the task it was
1517  * destined for.
1518  *
1519  * If the event to be sent is on a list, remove it before sending.  If
1520  * asked to, send and detach from the socket as well.
1521  *
1522  * Caller must have the socket locked if the event is attached to the socket.
1523  */
1524 static void
1525 send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1526 	isc_task_t *task;
1527 
1528 	task = (*dev)->ev_sender;
1529 
1530 	(*dev)->ev_sender = sock;
1531 
1532 	if (ISC_LINK_LINKED(*dev, ev_link))
1533 		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1534 
1535 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1536 	    == ISC_SOCKEVENTATTR_ATTACHED)
1537 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1538 	else
1539 		isc_task_send(task, (isc_event_t **)dev);
1540 }
1541 
1542 /*
1543  * See comments for send_recvdone_event() above.
1544  *
1545  * Caller must have the socket locked if the event is attached to the socket.
1546  */
1547 static void
1548 send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1549 	isc_task_t *task;
1550 
1551 	INSIST(dev != NULL && *dev != NULL);
1552 
1553 	task = (*dev)->ev_sender;
1554 	(*dev)->ev_sender = sock;
1555 
1556 	if (ISC_LINK_LINKED(*dev, ev_link))
1557 		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1558 
1559 	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1560 	    == ISC_SOCKEVENTATTR_ATTACHED)
1561 		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1562 	else
1563 		isc_task_send(task, (isc_event_t **)dev);
1564 }
1565 
1566 static void
1567 internal_recv(isc_task_t *me, isc_event_t *ev) {
1568 	isc_socketevent_t *dev;
1569 	isc_socket_t *sock;
1570 
1571 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
1572 
1573 	sock = ev->ev_sender;
1574 
1575 	socket_log(sock, NULL, IOEVENT,
1576 		   "internal_recv: task %p got event %p", me, ev);
1577 
1578 	INSIST(sock->pending_recv == 1);
1579 	sock->pending_recv = 0;
1580 
1581 	INSIST(sock->references > 0);
1582 	sock->references--;  /* the internal event is done with this socket */
1583 	if (sock->references == 0) {
1584 		destroy(&sock);
1585 		return;
1586 	}
1587 
1588 	/*
1589 	 * Try to do as much I/O as possible on this socket.  There are no
1590 	 * limits here, currently.
1591 	 */
1592 	dev = ISC_LIST_HEAD(sock->recv_list);
1593 	while (dev != NULL) {
1594 		switch (doio_recv(sock, dev)) {
1595 		case DOIO_SOFT:
1596 			goto poke;
1597 
1598 		case DOIO_EOF:
1599 			/*
1600 			 * read of 0 means the remote end was closed.
1601 			 * Run through the event queue and dispatch all
1602 			 * the events with an EOF result code.
1603 			 */
1604 			do {
1605 				dev->result = ISC_R_EOF;
1606 				send_recvdone_event(sock, &dev);
1607 				dev = ISC_LIST_HEAD(sock->recv_list);
1608 			} while (dev != NULL);
1609 			goto poke;
1610 
1611 		case DOIO_SUCCESS:
1612 		case DOIO_HARD:
1613 			send_recvdone_event(sock, &dev);
1614 			break;
1615 		}
1616 
1617 		dev = ISC_LIST_HEAD(sock->recv_list);
1618 	}
1619 
1620  poke:
1621 	if (!ISC_LIST_EMPTY(sock->recv_list))
1622 		select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1623 }
1624 
1625 static void
1626 internal_send(isc_task_t *me, isc_event_t *ev) {
1627 	isc_socketevent_t *dev;
1628 	isc_socket_t *sock;
1629 
1630 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
1631 
1632 	/*
1633 	 * Find out what socket this is and lock it.
1634 	 */
1635 	sock = (isc_socket_t *)ev->ev_sender;
1636 	socket_log(sock, NULL, IOEVENT,
1637 		   "internal_send: task %p got event %p", me, ev);
1638 
1639 	INSIST(sock->pending_send == 1);
1640 	sock->pending_send = 0;
1641 
1642 	INSIST(sock->references > 0);
1643 	sock->references--;  /* the internal event is done with this socket */
1644 	if (sock->references == 0) {
1645 		destroy(&sock);
1646 		return;
1647 	}
1648 
1649 	/*
1650 	 * Try to do as much I/O as possible on this socket.  There are no
1651 	 * limits here, currently.
1652 	 */
1653 	dev = ISC_LIST_HEAD(sock->send_list);
1654 	while (dev != NULL) {
1655 		switch (doio_send(sock, dev)) {
1656 		case DOIO_SOFT:
1657 			goto poke;
1658 
1659 		case DOIO_HARD:
1660 		case DOIO_SUCCESS:
1661 			send_senddone_event(sock, &dev);
1662 			break;
1663 		}
1664 
1665 		dev = ISC_LIST_HEAD(sock->send_list);
1666 	}
1667 
1668  poke:
1669 	if (!ISC_LIST_EMPTY(sock->send_list))
1670 		select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
1671 }
1672 
1673 /*
1674  * Process read/writes on each fd here.  Avoid locking
1675  * and unlocking twice if both reads and writes are possible.
1676  */
1677 static void
1678 process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable,
1679 	   isc_boolean_t writeable)
1680 {
1681 	isc_socket_t *sock;
1682 	isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
1683 
1684 	/*
1685 	 * If the socket is going to be closed, don't do more I/O.
1686 	 */
1687 	if (manager->fdstate[fd] == CLOSE_PENDING) {
1688 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1689 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1690 		return;
1691 	}
1692 
1693 	sock = manager->fds[fd];
1694 	if (readable) {
1695 		if (sock == NULL) {
1696 			unwatch_read = ISC_TRUE;
1697 			goto check_write;
1698 		}
1699 		if (!SOCK_DEAD(sock)) {
1700 			dispatch_recv(sock);
1701 		}
1702 		unwatch_read = ISC_TRUE;
1703 	}
1704 check_write:
1705 	if (writeable) {
1706 		if (sock == NULL) {
1707 			unwatch_write = ISC_TRUE;
1708 			goto unlock_fd;
1709 		}
1710 		if (!SOCK_DEAD(sock)) {
1711 			if (sock->connecting)
1712 				dispatch_connect(sock);
1713 			else
1714 				dispatch_send(sock);
1715 		}
1716 		unwatch_write = ISC_TRUE;
1717 	}
1718 
1719  unlock_fd:
1720 	if (unwatch_read)
1721 		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
1722 	if (unwatch_write)
1723 		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1724 
1725 }
1726 
1727 static void
1728 process_fds(isc_socketmgr_t *manager, int maxfd, fd_set *readfds,
1729 	    fd_set *writefds)
1730 {
1731 	int i;
1732 
1733 	REQUIRE(maxfd <= (int)manager->maxsocks);
1734 
1735 	for (i = 0; i < maxfd; i++) {
1736 		process_fd(manager, i, FD_ISSET(i, readfds),
1737 			   FD_ISSET(i, writefds));
1738 	}
1739 }
1740 
1741 /*
1742  * Create a new socket manager.
1743  */
1744 
1745 static isc_result_t
1746 setup_watcher(isc_socketmgr_t *manager) {
1747 	isc_result_t result;
1748 
1749 	UNUSED(result);
1750 
1751 	manager->fd_bufsize = sizeof(fd_set);
1752 
1753 	manager->read_fds = NULL;
1754 	manager->read_fds_copy = NULL;
1755 	manager->write_fds = NULL;
1756 	manager->write_fds_copy = NULL;
1757 
1758 	manager->read_fds = malloc(manager->fd_bufsize);
1759 	if (manager->read_fds != NULL)
1760 		manager->read_fds_copy = malloc(manager->fd_bufsize);
1761 	if (manager->read_fds_copy != NULL)
1762 		manager->write_fds = malloc(manager->fd_bufsize);
1763 	if (manager->write_fds != NULL) {
1764 		manager->write_fds_copy = malloc(manager->fd_bufsize);
1765 	}
1766 	if (manager->write_fds_copy == NULL) {
1767 		if (manager->write_fds != NULL) {
1768 			free(manager->write_fds);
1769 		}
1770 		if (manager->read_fds_copy != NULL) {
1771 			free(manager->read_fds_copy);
1772 		}
1773 		if (manager->read_fds != NULL) {
1774 			free(manager->read_fds);
1775 		}
1776 		return (ISC_R_NOMEMORY);
1777 	}
1778 	memset(manager->read_fds, 0, manager->fd_bufsize);
1779 	memset(manager->write_fds, 0, manager->fd_bufsize);
1780 
1781 	manager->maxfd = 0;
1782 
1783 	return (ISC_R_SUCCESS);
1784 }
1785 
1786 static void
1787 cleanup_watcher(isc_socketmgr_t *manager) {
1788 
1789 	if (manager->read_fds != NULL)
1790 		free(manager->read_fds);
1791 	if (manager->read_fds_copy != NULL)
1792 		free(manager->read_fds_copy);
1793 	if (manager->write_fds != NULL)
1794 		free(manager->write_fds);
1795 	if (manager->write_fds_copy != NULL)
1796 		free(manager->write_fds_copy);
1797 }
1798 
1799 static isc_result_t
1800 isc_socketmgr_create2(isc_socketmgr_t **managerp,
1801 		       unsigned int maxsocks)
1802 {
1803 	isc_socketmgr_t *manager;
1804 	isc_result_t result;
1805 
1806 	REQUIRE(managerp != NULL && *managerp == NULL);
1807 
1808 	if (socketmgr != NULL) {
1809 		/* Don't allow maxsocks to be updated */
1810 		if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
1811 			return (ISC_R_EXISTS);
1812 
1813 		socketmgr->refs++;
1814 		*managerp = (isc_socketmgr_t *)socketmgr;
1815 		return (ISC_R_SUCCESS);
1816 	}
1817 
1818 	if (maxsocks == 0)
1819 		maxsocks = FD_SETSIZE;
1820 
1821 	manager = malloc(sizeof(*manager));
1822 	if (manager == NULL)
1823 		return (ISC_R_NOMEMORY);
1824 
1825 	/* zero-clear so that necessary cleanup on failure will be easy */
1826 	memset(manager, 0, sizeof(*manager));
1827 	manager->maxsocks = maxsocks;
1828 	manager->fds = malloc(manager->maxsocks * sizeof(isc_socket_t *));
1829 	if (manager->fds == NULL) {
1830 		result = ISC_R_NOMEMORY;
1831 		goto free_manager;
1832 	}
1833 	manager->fdstate = malloc(manager->maxsocks * sizeof(int));
1834 	if (manager->fdstate == NULL) {
1835 		result = ISC_R_NOMEMORY;
1836 		goto free_manager;
1837 	}
1838 
1839 	memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
1840 	ISC_LIST_INIT(manager->socklist);
1841 
1842 	manager->refs = 1;
1843 
1844 	/*
1845 	 * Set up initial state for the select loop
1846 	 */
1847 	result = setup_watcher(manager);
1848 	if (result != ISC_R_SUCCESS)
1849 		goto cleanup;
1850 
1851 	memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
1852 
1853 	socketmgr = manager;
1854 	*managerp = (isc_socketmgr_t *)manager;
1855 
1856 	return (ISC_R_SUCCESS);
1857 
1858 cleanup:
1859 
1860 free_manager:
1861 	if (manager->fdstate != NULL) {
1862 		free(manager->fdstate);
1863 	}
1864 	if (manager->fds != NULL) {
1865 		free(manager->fds);
1866 	}
1867 	free(manager);
1868 
1869 	return (result);
1870 }
1871 
1872 isc_result_t
1873 isc_socketmgr_create(isc_socketmgr_t **managerp) {
1874 	return (isc_socketmgr_create2(managerp, 0));
1875 }
1876 
1877 void
1878 isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
1879 	isc_socketmgr_t *manager;
1880 	int i;
1881 
1882 	/*
1883 	 * Destroy a socket manager.
1884 	 */
1885 
1886 	REQUIRE(managerp != NULL);
1887 	manager = (isc_socketmgr_t *)*managerp;
1888 
1889 	manager->refs--;
1890 	if (manager->refs > 0) {
1891 		*managerp = NULL;
1892 		return;
1893 	}
1894 	socketmgr = NULL;
1895 
1896 	/*
1897 	 * Wait for all sockets to be destroyed.
1898 	 */
1899 	while (!ISC_LIST_EMPTY(manager->socklist)) {
1900 		isc_taskmgr_dispatch(NULL);
1901 	}
1902 
1903 	/*
1904 	 * Here, poke our select/poll thread.  Do this by closing the write
1905 	 * half of the pipe, which will send EOF to the read half.
1906 	 * This is currently a no-op in the non-threaded case.
1907 	 */
1908 	select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
1909 
1910 	/*
1911 	 * Clean up.
1912 	 */
1913 	cleanup_watcher(manager);
1914 
1915 	for (i = 0; i < (int)manager->maxsocks; i++)
1916 		if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
1917 			(void)close(i);
1918 
1919 	free(manager->fds);
1920 	free(manager->fdstate);
1921 
1922 	free(manager);
1923 
1924 	*managerp = NULL;
1925 
1926 	socketmgr = NULL;
1927 }
1928 
1929 static isc_result_t
1930 socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
1931 	    unsigned int flags)
1932 {
1933 	int io_state;
1934 	isc_task_t *ntask = NULL;
1935 	isc_result_t result = ISC_R_SUCCESS;
1936 
1937 	dev->ev_sender = task;
1938 
1939 	if (sock->type == isc_sockettype_udp) {
1940 		io_state = doio_recv(sock, dev);
1941 	} else {
1942 		if (ISC_LIST_EMPTY(sock->recv_list))
1943 			io_state = doio_recv(sock, dev);
1944 		else
1945 			io_state = DOIO_SOFT;
1946 	}
1947 
1948 	switch (io_state) {
1949 	case DOIO_SOFT:
1950 		/*
1951 		 * We couldn't read all or part of the request right now, so
1952 		 * queue it.
1953 		 *
1954 		 * Attach to socket and to task
1955 		 */
1956 		isc_task_attach(task, &ntask);
1957 		dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
1958 
1959 		/*
1960 		 * Enqueue the request.  If the socket was previously not being
1961 		 * watched, poke the watcher to start paying attention to it.
1962 		 */
1963 		if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
1964 			select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
1965 		ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
1966 
1967 		socket_log(sock, NULL, EVENT, NULL, 0, 0,
1968 			   "socket_recv: event %p -> task %p",
1969 			   dev, ntask);
1970 
1971 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
1972 			result = ISC_R_INPROGRESS;
1973 		break;
1974 
1975 	case DOIO_EOF:
1976 		dev->result = ISC_R_EOF;
1977 		/* fallthrough */
1978 
1979 	case DOIO_HARD:
1980 	case DOIO_SUCCESS:
1981 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
1982 			send_recvdone_event(sock, &dev);
1983 		break;
1984 	}
1985 
1986 	return (result);
1987 }
1988 
1989 isc_result_t
1990 isc_socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
1991 		  unsigned int minimum, isc_task_t *task,
1992 		  isc_taskaction_t action, void *arg)
1993 {
1994 	isc_socket_t *sock = (isc_socket_t *)sock0;
1995 	isc_socketevent_t *dev;
1996 	isc_socketmgr_t *manager;
1997 	unsigned int iocount;
1998 	isc_buffer_t *buffer;
1999 
2000 	REQUIRE(buflist != NULL);
2001 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2002 	REQUIRE(task != NULL);
2003 	REQUIRE(action != NULL);
2004 
2005 	manager = sock->manager;
2006 
2007 	iocount = isc_bufferlist_availablecount(buflist);
2008 	REQUIRE(iocount > 0);
2009 
2010 	INSIST(sock->bound);
2011 
2012 	dev = allocate_socketevent(sock,
2013 				   ISC_SOCKEVENT_RECVDONE, action, arg);
2014 	if (dev == NULL)
2015 		return (ISC_R_NOMEMORY);
2016 
2017 	/*
2018 	 * UDP sockets are always partial read
2019 	 */
2020 	if (sock->type == isc_sockettype_udp)
2021 		dev->minimum = 1;
2022 	else {
2023 		if (minimum == 0)
2024 			dev->minimum = iocount;
2025 		else
2026 			dev->minimum = minimum;
2027 	}
2028 
2029 	/*
2030 	 * Move each buffer from the passed in list to our internal one.
2031 	 */
2032 	buffer = ISC_LIST_HEAD(*buflist);
2033 	while (buffer != NULL) {
2034 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2035 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2036 		buffer = ISC_LIST_HEAD(*buflist);
2037 	}
2038 
2039 	return (socket_recv(sock, dev, task, 0));
2040 }
2041 
2042 static isc_result_t
2043 socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
2044 	    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2045 	    unsigned int flags)
2046 {
2047 	int io_state;
2048 	isc_task_t *ntask = NULL;
2049 	isc_result_t result = ISC_R_SUCCESS;
2050 
2051 	dev->ev_sender = task;
2052 
2053 	set_dev_address(address, sock, dev);
2054 	if (pktinfo != NULL) {
2055 		dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
2056 		dev->pktinfo = *pktinfo;
2057 
2058 		if (!isc_sockaddr_issitelocal(&dev->address) &&
2059 		    !isc_sockaddr_islinklocal(&dev->address)) {
2060 			socket_log(sock, NULL, TRACE,
2061 				   "pktinfo structure provided, ifindex %u "
2062 				   "(set to 0)", pktinfo->ipi6_ifindex);
2063 
2064 			/*
2065 			 * Set the pktinfo index to 0 here, to let the
2066 			 * kernel decide what interface it should send on.
2067 			 */
2068 			dev->pktinfo.ipi6_ifindex = 0;
2069 		}
2070 	}
2071 
2072 	if (sock->type == isc_sockettype_udp)
2073 		io_state = doio_send(sock, dev);
2074 	else {
2075 		if (ISC_LIST_EMPTY(sock->send_list))
2076 			io_state = doio_send(sock, dev);
2077 		else
2078 			io_state = DOIO_SOFT;
2079 	}
2080 
2081 	switch (io_state) {
2082 	case DOIO_SOFT:
2083 		/*
2084 		 * We couldn't send all or part of the request right now, so
2085 		 * queue it unless ISC_SOCKFLAG_NORETRY is set.
2086 		 */
2087 		if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
2088 			isc_task_attach(task, &ntask);
2089 			dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
2090 
2091 			/*
2092 			 * Enqueue the request.  If the socket was previously
2093 			 * not being watched, poke the watcher to start
2094 			 * paying attention to it.
2095 			 */
2096 			if (ISC_LIST_EMPTY(sock->send_list) &&
2097 			    !sock->pending_send)
2098 				select_poke(sock->manager, sock->fd,
2099 					    SELECT_POKE_WRITE);
2100 			ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
2101 
2102 			socket_log(sock, NULL, EVENT, NULL, 0, 0,
2103 				   "socket_send: event %p -> task %p",
2104 				   dev, ntask);
2105 
2106 			if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
2107 				result = ISC_R_INPROGRESS;
2108 			break;
2109 		}
2110 
2111 		/* FALLTHROUGH */
2112 
2113 	case DOIO_HARD:
2114 	case DOIO_SUCCESS:
2115 		if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
2116 			send_senddone_event(sock, &dev);
2117 		break;
2118 	}
2119 
2120 	return (result);
2121 }
2122 
2123 isc_result_t
2124 isc_socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
2125 		  isc_task_t *task, isc_taskaction_t action, void *arg)
2126 {
2127 	return (isc_socket_sendtov2(sock, buflist, task, action, arg, NULL,
2128 				     NULL, 0));
2129 }
2130 
2131 isc_result_t
2132 isc_socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist,
2133 		     isc_task_t *task, isc_taskaction_t action, void *arg,
2134 		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
2135 		     unsigned int flags)
2136 {
2137 	isc_socket_t *sock = (isc_socket_t *)sock0;
2138 	isc_socketevent_t *dev;
2139 	isc_socketmgr_t *manager;
2140 	unsigned int iocount;
2141 	isc_buffer_t *buffer;
2142 
2143 	REQUIRE(buflist != NULL);
2144 	REQUIRE(!ISC_LIST_EMPTY(*buflist));
2145 	REQUIRE(task != NULL);
2146 	REQUIRE(action != NULL);
2147 
2148 	manager = sock->manager;
2149 
2150 	iocount = isc_bufferlist_usedcount(buflist);
2151 	REQUIRE(iocount > 0);
2152 
2153 	dev = allocate_socketevent(sock,
2154 				   ISC_SOCKEVENT_SENDDONE, action, arg);
2155 	if (dev == NULL)
2156 		return (ISC_R_NOMEMORY);
2157 
2158 	/*
2159 	 * Move each buffer from the passed in list to our internal one.
2160 	 */
2161 	buffer = ISC_LIST_HEAD(*buflist);
2162 	while (buffer != NULL) {
2163 		ISC_LIST_DEQUEUE(*buflist, buffer, link);
2164 		ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
2165 		buffer = ISC_LIST_HEAD(*buflist);
2166 	}
2167 
2168 	return (socket_send(sock, dev, task, address, pktinfo, flags));
2169 }
2170 
2171 isc_result_t
2172 isc_socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
2173 		 unsigned int options) {
2174 	isc_socket_t *sock = (isc_socket_t *)sock0;
2175 	int on = 1;
2176 
2177 	INSIST(!sock->bound);
2178 
2179 	if (sock->pf != sockaddr->type.sa.sa_family) {
2180 		return (ISC_R_FAMILYMISMATCH);
2181 	}
2182 
2183 	/*
2184 	 * Only set SO_REUSEADDR when we want a specific port.
2185 	 */
2186 	if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
2187 	    isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
2188 	    setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
2189 		       sizeof(on)) < 0) {
2190 		UNEXPECTED_ERROR(__FILE__, __LINE__,
2191 				 "setsockopt(%d) %s", sock->fd, "failed");
2192 		/* Press on... */
2193 	}
2194 	if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
2195 		switch (errno) {
2196 		case EACCES:
2197 			return (ISC_R_NOPERM);
2198 		case EADDRNOTAVAIL:
2199 			return (ISC_R_ADDRNOTAVAIL);
2200 		case EADDRINUSE:
2201 			return (ISC_R_ADDRINUSE);
2202 		case EINVAL:
2203 			return (ISC_R_BOUND);
2204 		default:
2205 			UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
2206 					 strerror(errno));
2207 			return (ISC_R_UNEXPECTED);
2208 		}
2209 	}
2210 
2211 	socket_log(sock, sockaddr, TRACE, "bound");
2212 	sock->bound = 1;
2213 
2214 	return (ISC_R_SUCCESS);
2215 }
2216 
2217 isc_result_t
2218 isc_socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
2219 		   isc_task_t *task, isc_taskaction_t action, void *arg)
2220 {
2221 	isc_socket_t *sock = (isc_socket_t *)sock0;
2222 	isc_socket_connev_t *dev;
2223 	isc_task_t *ntask = NULL;
2224 	isc_socketmgr_t *manager;
2225 	int cc;
2226 	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
2227 
2228 	REQUIRE(addr != NULL);
2229 	REQUIRE(task != NULL);
2230 	REQUIRE(action != NULL);
2231 
2232 	manager = sock->manager;
2233 	REQUIRE(addr != NULL);
2234 
2235 	if (isc_sockaddr_ismulticast(addr))
2236 		return (ISC_R_MULTICAST);
2237 
2238 	REQUIRE(!sock->connecting);
2239 
2240 	dev = (isc_socket_connev_t *)isc_event_allocate(sock,
2241 							ISC_SOCKEVENT_CONNECT,
2242 							action,	arg,
2243 							sizeof(*dev));
2244 	if (dev == NULL) {
2245 		return (ISC_R_NOMEMORY);
2246 	}
2247 	ISC_LINK_INIT(dev, ev_link);
2248 
2249 	/*
2250 	 * Try to do the connect right away, as there can be only one
2251 	 * outstanding, and it might happen to complete.
2252 	 */
2253 	sock->peer_address = *addr;
2254 	cc = connect(sock->fd, &addr->type.sa, addr->length);
2255 	if (cc < 0) {
2256 		/*
2257 		 * HP-UX "fails" to connect a UDP socket and sets errno to
2258 		 * EINPROGRESS if it's non-blocking.  We'd rather regard this as
2259 		 * a success and let the user detect it if it's really an error
2260 		 * at the time of sending a packet on the socket.
2261 		 */
2262 		if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
2263 			cc = 0;
2264 			goto success;
2265 		}
2266 		if (SOFT_ERROR(errno) || errno == EINPROGRESS)
2267 			goto queue;
2268 
2269 		switch (errno) {
2270 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
2271 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2272 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2273 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2274 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2275 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2276 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2277 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2278 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2279 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2280 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2281 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2282 #undef ERROR_MATCH
2283 		}
2284 
2285 		sock->connected = 0;
2286 
2287 		isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
2288 		UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
2289 				 addrbuf, errno, strerror(errno));
2290 
2291 		isc_event_free(ISC_EVENT_PTR(&dev));
2292 		return (ISC_R_UNEXPECTED);
2293 
2294 	err_exit:
2295 		sock->connected = 0;
2296 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2297 
2298 		return (ISC_R_SUCCESS);
2299 	}
2300 
2301 	/*
2302 	 * If connect completed, fire off the done event.
2303 	 */
2304  success:
2305 	if (cc == 0) {
2306 		sock->connected = 1;
2307 		sock->bound = 1;
2308 		dev->result = ISC_R_SUCCESS;
2309 		isc_task_send(task, ISC_EVENT_PTR(&dev));
2310 
2311 		return (ISC_R_SUCCESS);
2312 	}
2313 
2314  queue:
2315 
2316 	/*
2317 	 * Attach to task.
2318 	 */
2319 	isc_task_attach(task, &ntask);
2320 
2321 	sock->connecting = 1;
2322 
2323 	dev->ev_sender = ntask;
2324 
2325 	/*
2326 	 * Poke watcher here.  We still have the socket locked, so there
2327 	 * is no race condition.  We will keep the lock for such a short
2328 	 * bit of time waking it up now or later won't matter all that much.
2329 	 */
2330 	if (sock->connect_ev == NULL)
2331 		select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
2332 
2333 	sock->connect_ev = dev;
2334 
2335 	return (ISC_R_SUCCESS);
2336 }
2337 
2338 /*
2339  * Called when a socket with a pending connect() finishes.
2340  */
2341 static void
2342 internal_connect(isc_task_t *me, isc_event_t *ev) {
2343 	isc_socket_t *sock;
2344 	isc_socket_connev_t *dev;
2345 	isc_task_t *task;
2346 	int cc;
2347 	socklen_t optlen;
2348 	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
2349 
2350 	UNUSED(me);
2351 	INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
2352 
2353 	sock = ev->ev_sender;
2354 
2355 	/*
2356 	 * When the internal event was sent the reference count was bumped
2357 	 * to keep the socket around for us.  Decrement the count here.
2358 	 */
2359 	INSIST(sock->references > 0);
2360 	sock->references--;
2361 	if (sock->references == 0) {
2362 		destroy(&sock);
2363 		return;
2364 	}
2365 
2366 	/*
2367 	 * Has this event been canceled?
2368 	 */
2369 	dev = sock->connect_ev;
2370 	if (dev == NULL) {
2371 		INSIST(!sock->connecting);
2372 		return;
2373 	}
2374 
2375 	INSIST(sock->connecting);
2376 	sock->connecting = 0;
2377 
2378 	/*
2379 	 * Get any possible error status here.
2380 	 */
2381 	optlen = sizeof(cc);
2382 	if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
2383 		       (void *)&cc, (void *)&optlen) < 0)
2384 		cc = errno;
2385 	else
2386 		errno = cc;
2387 
2388 	if (errno != 0) {
2389 		/*
2390 		 * If the error is EAGAIN, just re-select on this
2391 		 * fd and pretend nothing strange happened.
2392 		 */
2393 		if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
2394 			sock->connecting = 1;
2395 			select_poke(sock->manager, sock->fd,
2396 				    SELECT_POKE_CONNECT);
2397 			return;
2398 		}
2399 
2400 
2401 		/*
2402 		 * Translate other errors into ISC_R_* flavors.
2403 		 */
2404 		switch (errno) {
2405 #define ERROR_MATCH(a, b) case a: dev->result = b; break;
2406 			ERROR_MATCH(EACCES, ISC_R_NOPERM);
2407 			ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2408 			ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2409 			ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
2410 			ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
2411 			ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
2412 			ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
2413 			ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
2414 			ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
2415 			ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
2416 			ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
2417 			ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
2418 #undef ERROR_MATCH
2419 		default:
2420 			dev->result = ISC_R_UNEXPECTED;
2421 			isc_sockaddr_format(&sock->peer_address, peerbuf,
2422 					    sizeof(peerbuf));
2423 			UNEXPECTED_ERROR(__FILE__, __LINE__,
2424 					 "internal_connect: connect(%s) %s",
2425 					 peerbuf, strerror(errno));
2426 		}
2427 	} else {
2428 		dev->result = ISC_R_SUCCESS;
2429 		sock->connected = 1;
2430 		sock->bound = 1;
2431 	}
2432 
2433 	sock->connect_ev = NULL;
2434 
2435 	task = dev->ev_sender;
2436 	dev->ev_sender = sock;
2437 	isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
2438 }
2439 
2440 /*
2441  * Run through the list of events on this socket, and cancel the ones
2442  * queued for task "task" of type "how".  "how" is a bitmask.
2443  */
2444 void
2445 isc_socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
2446 	isc_socket_t *sock = (isc_socket_t *)sock0;
2447 
2448 	/*
2449 	 * Quick exit if there is nothing to do.  Don't even bother locking
2450 	 * in this case.
2451 	 */
2452 	if (how == 0)
2453 		return;
2454 
2455 	/*
2456 	 * All of these do the same thing, more or less.
2457 	 * Each will:
2458 	 *	o If the internal event is marked as "posted" try to
2459 	 *	  remove it from the task's queue.  If this fails, mark it
2460 	 *	  as canceled instead, and let the task clean it up later.
2461 	 *	o For each I/O request for that task of that type, post
2462 	 *	  its done event with status of "ISC_R_CANCELED".
2463 	 *	o Reset any state needed.
2464 	 */
2465 	if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
2466 	    && !ISC_LIST_EMPTY(sock->recv_list)) {
2467 		isc_socketevent_t      *dev;
2468 		isc_socketevent_t      *next;
2469 		isc_task_t	       *current_task;
2470 
2471 		dev = ISC_LIST_HEAD(sock->recv_list);
2472 
2473 		while (dev != NULL) {
2474 			current_task = dev->ev_sender;
2475 			next = ISC_LIST_NEXT(dev, ev_link);
2476 
2477 			if ((task == NULL) || (task == current_task)) {
2478 				dev->result = ISC_R_CANCELED;
2479 				send_recvdone_event(sock, &dev);
2480 			}
2481 			dev = next;
2482 		}
2483 	}
2484 
2485 	if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
2486 	    && !ISC_LIST_EMPTY(sock->send_list)) {
2487 		isc_socketevent_t      *dev;
2488 		isc_socketevent_t      *next;
2489 		isc_task_t	       *current_task;
2490 
2491 		dev = ISC_LIST_HEAD(sock->send_list);
2492 
2493 		while (dev != NULL) {
2494 			current_task = dev->ev_sender;
2495 			next = ISC_LIST_NEXT(dev, ev_link);
2496 
2497 			if ((task == NULL) || (task == current_task)) {
2498 				dev->result = ISC_R_CANCELED;
2499 				send_senddone_event(sock, &dev);
2500 			}
2501 			dev = next;
2502 		}
2503 	}
2504 
2505 	/*
2506 	 * Connecting is not a list.
2507 	 */
2508 	if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
2509 	    && sock->connect_ev != NULL) {
2510 		isc_socket_connev_t    *dev;
2511 		isc_task_t	       *current_task;
2512 
2513 		INSIST(sock->connecting);
2514 		sock->connecting = 0;
2515 
2516 		dev = sock->connect_ev;
2517 		current_task = dev->ev_sender;
2518 
2519 		if ((task == NULL) || (task == current_task)) {
2520 			sock->connect_ev = NULL;
2521 
2522 			dev->result = ISC_R_CANCELED;
2523 			dev->ev_sender = sock;
2524 			isc_task_sendanddetach(&current_task,
2525 					       ISC_EVENT_PTR(&dev));
2526 		}
2527 	}
2528 
2529 }
2530 
2531 /*
2532  * In our assumed scenario, we can simply use a single static object.
2533  * XXX: this is not true if the application uses multiple threads with
2534  *      'multi-context' mode.  Fixing this is a future TODO item.
2535  */
2536 static isc_socketwait_t swait_private;
2537 
2538 int
2539 isc_socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
2540 			  isc_socketwait_t **swaitp)
2541 {
2542 	isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0;
2543 	int n;
2544 
2545 	REQUIRE(swaitp != NULL && *swaitp == NULL);
2546 
2547 	if (manager == NULL)
2548 		manager = socketmgr;
2549 	if (manager == NULL)
2550 		return (0);
2551 
2552 	memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize);
2553 	memmove(manager->write_fds_copy, manager->write_fds,
2554 		manager->fd_bufsize);
2555 
2556 	swait_private.readset = manager->read_fds_copy;
2557 	swait_private.writeset = manager->write_fds_copy;
2558 	swait_private.maxfd = manager->maxfd + 1;
2559 
2560 	n = select(swait_private.maxfd, swait_private.readset,
2561 		   swait_private.writeset, NULL, tvp);
2562 
2563 	*swaitp = &swait_private;
2564 	return (n);
2565 }
2566 
2567 isc_result_t
2568 isc_socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
2569 	isc_socketmgr_t *manager = (isc_socketmgr_t *)manager0;
2570 
2571 	REQUIRE(swait == &swait_private);
2572 
2573 	if (manager == NULL)
2574 		manager = socketmgr;
2575 	if (manager == NULL)
2576 		return (ISC_R_NOTFOUND);
2577 
2578 	process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
2579 	return (ISC_R_SUCCESS);
2580 }
2581