1 /*
2  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 /*
35  * Copyright (c) 1982, 1986, 1988, 1990, 1993
36  *	The Regents of the University of California.  All rights reserved.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
63  * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.24 2003/11/11 17:18:18 silby Exp $
64  */
65 
66 #include "opt_inet.h"
67 #include "opt_sctp.h"
68 
69 #include <sys/param.h>
70 #include <sys/systm.h>
71 #include <sys/fcntl.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #include <sys/domain.h>
75 #include <sys/file.h>			/* for struct knote */
76 #include <sys/kernel.h>
77 #include <sys/event.h>
78 #include <sys/proc.h>
79 #include <sys/protosw.h>
80 #include <sys/socket.h>
81 #include <sys/socketvar.h>
82 #include <sys/socketops.h>
83 #include <sys/resourcevar.h>
84 #include <sys/signalvar.h>
85 #include <sys/sysctl.h>
86 #include <sys/uio.h>
87 #include <sys/jail.h>
88 #include <vm/vm_zone.h>
89 #include <vm/pmap.h>
90 #include <net/netmsg2.h>
91 #include <net/netisr2.h>
92 
93 #include <sys/thread2.h>
94 #include <sys/socketvar2.h>
95 #include <sys/spinlock2.h>
96 
97 #include <machine/limits.h>
98 
99 #ifdef INET
100 extern int tcp_sosend_agglim;
101 extern int tcp_sosend_async;
102 extern int udp_sosend_async;
103 extern int udp_sosend_prepend;
104 
105 static int	 do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
106 #endif /* INET */
107 
108 static void 	filt_sordetach(struct knote *kn);
109 static int 	filt_soread(struct knote *kn, long hint);
110 static void 	filt_sowdetach(struct knote *kn);
111 static int	filt_sowrite(struct knote *kn, long hint);
112 static int	filt_solisten(struct knote *kn, long hint);
113 
114 static void	sodiscard(struct socket *so);
115 static int	soclose_sync(struct socket *so, int fflag);
116 static void	soclose_fast(struct socket *so);
117 
118 static struct filterops solisten_filtops =
119 	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_solisten };
120 static struct filterops soread_filtops =
121 	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };
122 static struct filterops sowrite_filtops =
123 	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sowdetach, filt_sowrite };
124 static struct filterops soexcept_filtops =
125 	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };
126 
127 MALLOC_DEFINE(M_SOCKET, "socket", "socket struct");
128 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
129 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
130 
131 
132 static int somaxconn = SOMAXCONN;
133 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
134     &somaxconn, 0, "Maximum pending socket connection queue size");
135 
136 static int use_soclose_fast = 1;
137 SYSCTL_INT(_kern_ipc, OID_AUTO, soclose_fast, CTLFLAG_RW,
138     &use_soclose_fast, 0, "Fast socket close");
139 
140 int use_soaccept_pred_fast = 1;
141 SYSCTL_INT(_kern_ipc, OID_AUTO, soaccept_pred_fast, CTLFLAG_RW,
142     &use_soaccept_pred_fast, 0, "Fast socket accept prediction");
143 
144 int use_sendfile_async = 1;
145 SYSCTL_INT(_kern_ipc, OID_AUTO, sendfile_async, CTLFLAG_RW,
146     &use_sendfile_async, 0, "sendfile uses asynchronous pru_send");
147 
148 int use_soconnect_async = 1;
149 SYSCTL_INT(_kern_ipc, OID_AUTO, soconnect_async, CTLFLAG_RW,
150     &use_soconnect_async, 0, "soconnect uses asynchronous pru_connect");
151 
152 /*
153  * Socket operation routines.
154  * These routines are called by the routines in
155  * sys_socket.c or from a system process, and
156  * implement the semantics of socket operations by
157  * switching out to the protocol specific routines.
158  */
159 
160 /*
161  * Get a socket structure, and initialize it.
162  * Note that it would probably be better to allocate socket
163  * and PCB at the same time, but I'm not convinced that all
164  * the protocols can be easily modified to do this.
165  */
166 struct socket *
167 soalloc(int waitok, struct protosw *pr)
168 {
169 	struct socket *so;
170 	unsigned waitmask;
171 
172 	waitmask = waitok ? M_WAITOK : M_NOWAIT;
173 	so = kmalloc(sizeof(struct socket), M_SOCKET, M_ZERO|waitmask);
174 	if (so) {
175 		/* XXX race condition for reentrant kernel */
176 		so->so_proto = pr;
177 		TAILQ_INIT(&so->so_aiojobq);
178 		TAILQ_INIT(&so->so_rcv.ssb_kq.ki_mlist);
179 		TAILQ_INIT(&so->so_snd.ssb_kq.ki_mlist);
180 		lwkt_token_init(&so->so_rcv.ssb_token, "rcvtok");
181 		lwkt_token_init(&so->so_snd.ssb_token, "sndtok");
182 		spin_init(&so->so_rcvd_spin);
183 		netmsg_init(&so->so_rcvd_msg.base, so, &netisr_adone_rport,
184 		    MSGF_DROPABLE | MSGF_PRIORITY,
185 		    so->so_proto->pr_usrreqs->pru_rcvd);
186 		so->so_rcvd_msg.nm_pru_flags |= PRUR_ASYNC;
187 		so->so_state = SS_NOFDREF;
188 		so->so_refs = 1;
189 	}
190 	return so;
191 }
192 
193 int
194 socreate(int dom, struct socket **aso, int type,
195 	int proto, struct thread *td)
196 {
197 	struct proc *p = td->td_proc;
198 	struct protosw *prp;
199 	struct socket *so;
200 	struct pru_attach_info ai;
201 	int error;
202 
203 	if (proto)
204 		prp = pffindproto(dom, proto, type);
205 	else
206 		prp = pffindtype(dom, type);
207 
208 	if (prp == NULL || prp->pr_usrreqs->pru_attach == 0)
209 		return (EPROTONOSUPPORT);
210 
211 	if (p->p_ucred->cr_prison && jail_socket_unixiproute_only &&
212 	    prp->pr_domain->dom_family != PF_LOCAL &&
213 	    prp->pr_domain->dom_family != PF_INET &&
214 	    prp->pr_domain->dom_family != PF_INET6 &&
215 	    prp->pr_domain->dom_family != PF_ROUTE) {
216 		return (EPROTONOSUPPORT);
217 	}
218 
219 	if (prp->pr_type != type)
220 		return (EPROTOTYPE);
221 	so = soalloc(p != NULL, prp);
222 	if (so == NULL)
223 		return (ENOBUFS);
224 
225 	/*
226 	 * Callers of socreate() presumably will connect up a descriptor
227 	 * and call soclose() if they cannot.  This represents our so_refs
228 	 * (which should be 1) from soalloc().
229 	 */
230 	soclrstate(so, SS_NOFDREF);
231 
232 	/*
233 	 * Set a default port for protocol processing.  No action will occur
234 	 * on the socket on this port until an inpcb is attached to it and
235 	 * is able to match incoming packets, or until the socket becomes
236 	 * available to userland.
237 	 *
238 	 * We normally default the socket to the protocol thread on cpu 0.
239 	 * If PR_SYNC_PORT is set (unix domain sockets) there is no protocol
240 	 * thread and all pr_*()/pru_*() calls are executed synchronously.
241 	 */
242 	if (prp->pr_flags & PR_SYNC_PORT)
243 		so->so_port = &netisr_sync_port;
244 	else
245 		so->so_port = netisr_cpuport(0);
246 
247 	TAILQ_INIT(&so->so_incomp);
248 	TAILQ_INIT(&so->so_comp);
249 	so->so_type = type;
250 	so->so_cred = crhold(p->p_ucred);
251 	ai.sb_rlimit = &p->p_rlimit[RLIMIT_SBSIZE];
252 	ai.p_ucred = p->p_ucred;
253 	ai.fd_rdir = p->p_fd->fd_rdir;
254 
255 	/*
256 	 * Auto-sizing of socket buffers is managed by the protocols and
257 	 * the appropriate flags must be set in the pru_attach function.
258 	 */
259 	error = so_pru_attach(so, proto, &ai);
260 	if (error) {
261 		sosetstate(so, SS_NOFDREF);
262 		sofree(so);	/* from soalloc */
263 		return error;
264 	}
265 
266 	/*
267 	 * NOTE: Returns referenced socket.
268 	 */
269 	*aso = so;
270 	return (0);
271 }
272 
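/*
 * Example (sketch, not part of this file): how a syscall-layer caller
 * might drive socreate()/solisten()/soclose() to set up a listening
 * TCP stream socket.  The helper name is hypothetical and error
 * unwinding is abbreviated.
 */
#if 0
static int
example_listen_socket(struct thread *td, struct socket **sop)
{
	struct socket *so;
	int error;

	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP, td);
	if (error)
		return (error);
	/* solisten() clips a negative or over-large backlog to somaxconn */
	error = solisten(so, 128, td);
	if (error) {
		soclose(so, 0);
		return (error);
	}
	*sop = so;
	return (0);
}
#endif
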
273 int
274 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
275 {
276 	int error;
277 
278 	error = so_pru_bind(so, nam, td);
279 	return (error);
280 }
281 
282 static void
283 sodealloc(struct socket *so)
284 {
285 	if (so->so_rcv.ssb_hiwat)
286 		(void)chgsbsize(so->so_cred->cr_uidinfo,
287 		    &so->so_rcv.ssb_hiwat, 0, RLIM_INFINITY);
288 	if (so->so_snd.ssb_hiwat)
289 		(void)chgsbsize(so->so_cred->cr_uidinfo,
290 		    &so->so_snd.ssb_hiwat, 0, RLIM_INFINITY);
291 #ifdef INET
292 	/* remove accept filter if present */
293 	if (so->so_accf != NULL)
294 		do_setopt_accept_filter(so, NULL);
295 #endif /* INET */
296 	crfree(so->so_cred);
297 	if (so->so_faddr != NULL)
298 		kfree(so->so_faddr, M_SONAME);
299 	kfree(so, M_SOCKET);
300 }
301 
302 int
303 solisten(struct socket *so, int backlog, struct thread *td)
304 {
305 	int error;
306 #ifdef SCTP
307 	short oldopt, oldqlimit;
308 #endif /* SCTP */
309 
310 	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))
311 		return (EINVAL);
312 
313 #ifdef SCTP
314 	oldopt = so->so_options;
315 	oldqlimit = so->so_qlimit;
316 #endif /* SCTP */
317 
318 	lwkt_gettoken(&so->so_rcv.ssb_token);
319 	if (TAILQ_EMPTY(&so->so_comp))
320 		so->so_options |= SO_ACCEPTCONN;
321 	lwkt_reltoken(&so->so_rcv.ssb_token);
322 	if (backlog < 0 || backlog > somaxconn)
323 		backlog = somaxconn;
324 	so->so_qlimit = backlog;
325 	/* SCTP needs to tweak both the inbound backlog parameter AND
326 	 * the so_options (in the UDP model a socket both connects and
327 	 * accepts inbound connections .. implicitly).
328 	 */
329 	error = so_pru_listen(so, td);
330 	if (error) {
331 #ifdef SCTP
332 		/* Restore the params */
333 		so->so_options = oldopt;
334 		so->so_qlimit = oldqlimit;
335 #endif /* SCTP */
336 		return (error);
337 	}
338 	return (0);
339 }
340 
341 /*
342  * Destroy a disconnected socket.  This routine is a NOP if entities
343  * still have a reference on the socket:
344  *
345  *	so_pcb -	The protocol stack still has a reference
346  *	SS_NOFDREF -	There is no longer a file pointer reference
347  */
348 void
349 sofree(struct socket *so)
350 {
351 	struct socket *head;
352 
353 	/*
354 	 * This is a bit hackish at the moment.  We need to interlock
355 	 * any accept queue we are on before we potentially lose the
356 	 * last reference to avoid races against a re-reference from
357 	 * someone operating on the queue.
358 	 */
359 	while ((head = so->so_head) != NULL) {
360 		lwkt_getpooltoken(head);
361 		if (so->so_head == head)
362 			break;
363 		lwkt_relpooltoken(head);
364 	}
365 
366 	/*
367 	 * Arbitrate the last free.
368 	 */
369 	KKASSERT(so->so_refs > 0);
370 	if (atomic_fetchadd_int(&so->so_refs, -1) != 1) {
371 		if (head)
372 			lwkt_relpooltoken(head);
373 		return;
374 	}
375 
376 	KKASSERT(so->so_pcb == NULL && (so->so_state & SS_NOFDREF));
377 	KKASSERT((so->so_state & SS_ASSERTINPROG) == 0);
378 
379 	/*
380 	 * We're done, remove ourselves from the accept queue we are
381 	 * on, if we are on one.
382 	 */
383 	if (head != NULL) {
384 		if (so->so_state & SS_INCOMP) {
385 			TAILQ_REMOVE(&head->so_incomp, so, so_list);
386 			head->so_incqlen--;
387 		} else if (so->so_state & SS_COMP) {
388 			/*
389 			 * We must not decommission a socket that's
390 			 * on the accept(2) queue.  If we do, then
391 			 * accept(2) may hang after select(2) indicated
392 			 * that the listening socket was ready.
393 			 */
394 			lwkt_relpooltoken(head);
395 			return;
396 		} else {
397 			panic("sofree: not queued");
398 		}
399 		soclrstate(so, SS_INCOMP);
400 		so->so_head = NULL;
401 		lwkt_relpooltoken(head);
402 	}
403 	ssb_release(&so->so_snd, so);
404 	sorflush(so);
405 	sodealloc(so);
406 }
407 
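/*
 * Minimal sketch (hypothetical helper, not part of this file) of the
 * hold/drop pattern the reference count above supports.  soreference()
 * is the increment side, used elsewhere in this file by soabort().
 */
#if 0
static void
example_hold_and_drop(struct socket *so)
{
	soreference(so);	/* +1 so_refs; so cannot be destroyed */
	/* ... blocking or asynchronous work on so ... */
	sofree(so);		/* drop ref; destruction only on the last */
}
#endif
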
408 /*
409  * Close a socket on last file table reference removal.
410  * Initiate disconnect if connected.
411  * Free socket when disconnect complete.
412  */
413 int
414 soclose(struct socket *so, int fflag)
415 {
416 	int error;
417 
418 	funsetown(&so->so_sigio);
419 	if (!use_soclose_fast ||
420 	    (so->so_proto->pr_flags & PR_SYNC_PORT) ||
421 	    ((so->so_state & SS_ISCONNECTED) &&
422 	     (so->so_options & SO_LINGER))) {
423 		error = soclose_sync(so, fflag);
424 	} else {
425 		soclose_fast(so);
426 		error = 0;
427 	}
428 	return error;
429 }
430 
431 static void
432 sodiscard(struct socket *so)
433 {
434 	lwkt_getpooltoken(so);
435 	if (so->so_options & SO_ACCEPTCONN) {
436 		struct socket *sp;
437 
438 		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
439 			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
440 			soclrstate(sp, SS_INCOMP);
441 			sp->so_head = NULL;
442 			so->so_incqlen--;
443 			soaborta(sp);
444 		}
445 		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
446 			TAILQ_REMOVE(&so->so_comp, sp, so_list);
447 			soclrstate(sp, SS_COMP);
448 			sp->so_head = NULL;
449 			so->so_qlen--;
450 			soaborta(sp);
451 		}
452 	}
453 	lwkt_relpooltoken(so);
454 
455 	if (so->so_state & SS_NOFDREF)
456 		panic("soclose: NOFDREF");
457 	sosetstate(so, SS_NOFDREF);	/* take ref */
458 }
459 
460 void
461 soinherit(struct socket *so, struct socket *so_inh)
462 {
463 	TAILQ_HEAD(, socket) comp, incomp;
464 	struct socket *sp;
465 	int qlen, incqlen;
466 
467 	KASSERT(so->so_options & SO_ACCEPTCONN,
468 	    ("so does not accept connection"));
469 	KASSERT(so_inh->so_options & SO_ACCEPTCONN,
470 	    ("so_inh does not accept connection"));
471 
472 	TAILQ_INIT(&comp);
473 	TAILQ_INIT(&incomp);
474 
475 	lwkt_getpooltoken(so);
476 	lwkt_getpooltoken(so_inh);
477 
478 	/*
479 	 * Save the completed and incomplete queues
480 	 */
481 	TAILQ_CONCAT(&comp, &so->so_comp, so_list);
482 	qlen = so->so_qlen;
483 	so->so_qlen = 0;
484 
485 	TAILQ_CONCAT(&incomp, &so->so_incomp, so_list);
486 	incqlen = so->so_incqlen;
487 	so->so_incqlen = 0;
488 
489 	/*
490 	 * Append the saved completed and incomplete queues to
491 	 * the socket that inherits them.
492 	 *
493 	 * XXX
494 	 * This may temporarily break the inheriting socket's
495 	 * so_qlimit.
496 	 */
497 	TAILQ_FOREACH(sp, &comp, so_list) {
498 		sp->so_head = so_inh;
499 		crfree(sp->so_cred);
500 		sp->so_cred = crhold(so_inh->so_cred);
501 	}
502 
503 	TAILQ_FOREACH(sp, &incomp, so_list) {
504 		sp->so_head = so_inh;
505 		crfree(sp->so_cred);
506 		sp->so_cred = crhold(so_inh->so_cred);
507 	}
508 
509 	TAILQ_CONCAT(&so_inh->so_comp, &comp, so_list);
510 	so_inh->so_qlen += qlen;
511 
512 	TAILQ_CONCAT(&so_inh->so_incomp, &incomp, so_list);
513 	so_inh->so_incqlen += incqlen;
514 
515 	lwkt_relpooltoken(so_inh);
516 	lwkt_relpooltoken(so);
517 
518 	if (qlen) {
519 		/*
520 		 * "New" connections have arrived
521 		 */
522 		sorwakeup(so_inh);
523 		wakeup(&so_inh->so_timeo);
524 	}
525 }
526 
527 static int
528 soclose_sync(struct socket *so, int fflag)
529 {
530 	int error = 0;
531 
532 	if (so->so_pcb == NULL)
533 		goto discard;
534 	if (so->so_state & SS_ISCONNECTED) {
535 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
536 			error = sodisconnect(so);
537 			if (error)
538 				goto drop;
539 		}
540 		if (so->so_options & SO_LINGER) {
541 			if ((so->so_state & SS_ISDISCONNECTING) &&
542 			    (fflag & FNONBLOCK))
543 				goto drop;
544 			while (so->so_state & SS_ISCONNECTED) {
545 				error = tsleep(&so->so_timeo, PCATCH,
546 					       "soclos", so->so_linger * hz);
547 				if (error)
548 					break;
549 			}
550 		}
551 	}
552 drop:
553 	if (so->so_pcb) {
554 		int error2;
555 
556 		error2 = so_pru_detach(so);
557 		if (error == 0)
558 			error = error2;
559 	}
560 discard:
561 	sodiscard(so);
562 	so_pru_sync(so);	/* unpend async sending */
563 	sofree(so);		/* dispose of ref */
564 
565 	return (error);
566 }
567 
568 static void
569 soclose_sofree_async_handler(netmsg_t msg)
570 {
571 	sofree(msg->base.nm_so);
572 }
573 
574 static void
575 soclose_sofree_async(struct socket *so)
576 {
577 	struct netmsg_base *base = &so->so_clomsg;
578 
579 	netmsg_init(base, so, &netisr_apanic_rport, 0,
580 	    soclose_sofree_async_handler);
581 	lwkt_sendmsg(so->so_port, &base->lmsg);
582 }
583 
584 static void
585 soclose_disconn_async_handler(netmsg_t msg)
586 {
587 	struct socket *so = msg->base.nm_so;
588 
589 	if ((so->so_state & SS_ISCONNECTED) &&
590 	    (so->so_state & SS_ISDISCONNECTING) == 0)
591 		so_pru_disconnect_direct(so);
592 
593 	if (so->so_pcb)
594 		so_pru_detach_direct(so);
595 
596 	sodiscard(so);
597 	sofree(so);
598 }
599 
600 static void
601 soclose_disconn_async(struct socket *so)
602 {
603 	struct netmsg_base *base = &so->so_clomsg;
604 
605 	netmsg_init(base, so, &netisr_apanic_rport, 0,
606 	    soclose_disconn_async_handler);
607 	lwkt_sendmsg(so->so_port, &base->lmsg);
608 }
609 
610 static void
611 soclose_detach_async_handler(netmsg_t msg)
612 {
613 	struct socket *so = msg->base.nm_so;
614 
615 	if (so->so_pcb)
616 		so_pru_detach_direct(so);
617 
618 	sodiscard(so);
619 	sofree(so);
620 }
621 
622 static void
623 soclose_detach_async(struct socket *so)
624 {
625 	struct netmsg_base *base = &so->so_clomsg;
626 
627 	netmsg_init(base, so, &netisr_apanic_rport, 0,
628 	    soclose_detach_async_handler);
629 	lwkt_sendmsg(so->so_port, &base->lmsg);
630 }
631 
632 static void
633 soclose_fast(struct socket *so)
634 {
635 	if (so->so_pcb == NULL)
636 		goto discard;
637 
638 	if ((so->so_state & SS_ISCONNECTED) &&
639 	    (so->so_state & SS_ISDISCONNECTING) == 0) {
640 		soclose_disconn_async(so);
641 		return;
642 	}
643 
644 	if (so->so_pcb) {
645 		soclose_detach_async(so);
646 		return;
647 	}
648 
649 discard:
650 	sodiscard(so);
651 	soclose_sofree_async(so);
652 }
653 
654 /*
655  * Abort and destroy a socket.  Only one abort can be in progress
656  * at any given moment.
657  */
658 void
659 soabort(struct socket *so)
660 {
661 	soreference(so);
662 	so_pru_abort(so);
663 }
664 
665 void
666 soaborta(struct socket *so)
667 {
668 	soreference(so);
669 	so_pru_aborta(so);
670 }
671 
672 void
673 soabort_oncpu(struct socket *so)
674 {
675 	soreference(so);
676 	so_pru_abort_oncpu(so);
677 }
678 
679 /*
680  * The socket is passed in referenced; that reference becomes
681  * owned by the cleared SS_NOFDREF flag.
682  */
683 void
684 soaccept_generic(struct socket *so)
685 {
686 	if ((so->so_state & SS_NOFDREF) == 0)
687 		panic("soaccept: !NOFDREF");
688 	soclrstate(so, SS_NOFDREF);	/* owned by lack of SS_NOFDREF */
689 }
690 
691 int
692 soaccept(struct socket *so, struct sockaddr **nam)
693 {
694 	int error;
695 
696 	soaccept_generic(so);
697 	error = so_pru_accept(so, nam);
698 	return (error);
699 }
700 
701 int
702 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td,
703     boolean_t sync)
704 {
705 	int error;
706 
707 	if (so->so_options & SO_ACCEPTCONN)
708 		return (EOPNOTSUPP);
709 	/*
710 	 * If protocol is connection-based, can only connect once.
711 	 * Otherwise, if connected, try to disconnect first.
712 	 * This allows user to disconnect by connecting to, e.g.,
713 	 * a null address.
714 	 */
715 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
716 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
717 	    (error = sodisconnect(so)))) {
718 		error = EISCONN;
719 	} else {
720 		/*
721 		 * Prevent accumulated error from previous connection
722 		 * from biting us.
723 		 */
724 		so->so_error = 0;
725 		if (!sync && so->so_proto->pr_usrreqs->pru_preconnect)
726 			error = so_pru_connect_async(so, nam, td);
727 		else
728 			error = so_pru_connect(so, nam, td);
729 	}
730 	return (error);
731 }
732 
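/*
 * Sketch (hypothetical caller, not part of this file): the sync
 * argument selects between the connect paths; the asynchronous
 * so_pru_connect_async() path is only taken when the protocol
 * provides pru_preconnect.
 */
#if 0
static int
example_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	/* sync == FALSE requests the async path when it is supported */
	return (soconnect(so, nam, td,
	    use_soconnect_async ? FALSE : TRUE));
}
#endif
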
733 int
734 soconnect2(struct socket *so1, struct socket *so2)
735 {
736 	int error;
737 
738 	error = so_pru_connect2(so1, so2);
739 	return (error);
740 }
741 
742 int
743 sodisconnect(struct socket *so)
744 {
745 	int error;
746 
747 	if ((so->so_state & SS_ISCONNECTED) == 0) {
748 		error = ENOTCONN;
749 		goto bad;
750 	}
751 	if (so->so_state & SS_ISDISCONNECTING) {
752 		error = EALREADY;
753 		goto bad;
754 	}
755 	error = so_pru_disconnect(so);
756 bad:
757 	return (error);
758 }
759 
760 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
761 /*
762  * Send on a socket.
763  * If send must go all at once and message is larger than
764  * send buffering, then hard error.
765  * Lock against other senders.
766  * If must go all at once and not enough room now, then
767  * inform user that this would block and do nothing.
768  * Otherwise, if nonblocking, send as much as possible.
769  * The data to be sent is described by "uio" if nonzero,
770  * otherwise by the mbuf chain "top" (which must be null
771  * if uio is not).  Data provided in mbuf chain must be small
772  * enough to send all at once.
773  *
774  * Returns nonzero on error, timeout or signal; callers
775  * must check for short counts if EINTR/ERESTART are returned.
776  * Data and control buffers are freed on return.
777  */
778 int
779 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
780 	struct mbuf *top, struct mbuf *control, int flags,
781 	struct thread *td)
782 {
783 	struct mbuf **mp;
784 	struct mbuf *m;
785 	size_t resid;
786 	int space, len;
787 	int clen = 0, error, dontroute, mlen;
788 	int atomic = sosendallatonce(so) || top;
789 	int pru_flags;
790 
791 	if (uio) {
792 		resid = uio->uio_resid;
793 	} else {
794 		resid = (size_t)top->m_pkthdr.len;
795 #ifdef INVARIANTS
796 		len = 0;
797 		for (m = top; m; m = m->m_next)
798 			len += m->m_len;
799 		KKASSERT(top->m_pkthdr.len == len);
800 #endif
801 	}
802 
803 	/*
804 	 * WARNING!  resid is unsigned, space and len are signed.  space
805 	 * 	     can wind up negative if the sockbuf is overcommitted.
806 	 *
807 	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
808 	 * type sockets since that's an error.
809 	 */
810 	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
811 		error = EINVAL;
812 		goto out;
813 	}
814 
815 	dontroute =
816 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
817 	    (so->so_proto->pr_flags & PR_ATOMIC);
818 	if (td->td_lwp != NULL)
819 		td->td_lwp->lwp_ru.ru_msgsnd++;
820 	if (control)
821 		clen = control->m_len;
822 #define	gotoerr(errcode)	{ error = errcode; goto release; }
823 
824 restart:
825 	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
826 	if (error)
827 		goto out;
828 
829 	do {
830 		if (so->so_state & SS_CANTSENDMORE)
831 			gotoerr(EPIPE);
832 		if (so->so_error) {
833 			error = so->so_error;
834 			so->so_error = 0;
835 			goto release;
836 		}
837 		if ((so->so_state & SS_ISCONNECTED) == 0) {
838 			/*
839 			 * `sendto' and `sendmsg' are allowed on a connection-
840 			 * based socket if it supports implied connect.
841 			 * Return ENOTCONN if not connected and no address is
842 			 * supplied.
843 			 */
844 			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
845 			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
846 				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
847 				    !(resid == 0 && clen != 0))
848 					gotoerr(ENOTCONN);
849 			} else if (addr == NULL)
850 			    gotoerr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
851 				   ENOTCONN : EDESTADDRREQ);
852 		}
853 		if ((atomic && resid > so->so_snd.ssb_hiwat) ||
854 		    clen > so->so_snd.ssb_hiwat) {
855 			gotoerr(EMSGSIZE);
856 		}
857 		space = ssb_space(&so->so_snd);
858 		if (flags & MSG_OOB)
859 			space += 1024;
860 		if ((space < 0 || (size_t)space < resid + clen) && uio &&
861 		    (atomic || space < so->so_snd.ssb_lowat || space < clen)) {
862 			if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
863 				gotoerr(EWOULDBLOCK);
864 			ssb_unlock(&so->so_snd);
865 			error = ssb_wait(&so->so_snd);
866 			if (error)
867 				goto out;
868 			goto restart;
869 		}
870 		mp = &top;
871 		space -= clen;
872 		do {
873 		    if (uio == NULL) {
874 			/*
875 			 * Data is prepackaged in "top".
876 			 */
877 			resid = 0;
878 			if (flags & MSG_EOR)
879 				top->m_flags |= M_EOR;
880 		    } else do {
881 			if (resid > INT_MAX)
882 				resid = INT_MAX;
883 			m = m_getl((int)resid, MB_WAIT, MT_DATA,
884 				   top == NULL ? M_PKTHDR : 0, &mlen);
885 			if (top == NULL) {
886 				m->m_pkthdr.len = 0;
887 				m->m_pkthdr.rcvif = NULL;
888 			}
889 			len = imin((int)szmin(mlen, resid), space);
890 			if (resid < MINCLSIZE) {
891 				/*
892 				 * For datagram protocols, leave room
893 				 * for protocol headers in first mbuf.
894 				 */
895 				if (atomic && top == NULL && len < mlen)
896 					MH_ALIGN(m, len);
897 			}
898 			space -= len;
899 			error = uiomove(mtod(m, caddr_t), (size_t)len, uio);
900 			resid = uio->uio_resid;
901 			m->m_len = len;
902 			*mp = m;
903 			top->m_pkthdr.len += len;
904 			if (error)
905 				goto release;
906 			mp = &m->m_next;
907 			if (resid == 0) {
908 				if (flags & MSG_EOR)
909 					top->m_flags |= M_EOR;
910 				break;
911 			}
912 		    } while (space > 0 && atomic);
913 		    if (dontroute)
914 			    so->so_options |= SO_DONTROUTE;
915 		    if (flags & MSG_OOB) {
916 		    	    pru_flags = PRUS_OOB;
917 		    } else if ((flags & MSG_EOF) &&
918 		    	       (so->so_proto->pr_flags & PR_IMPLOPCL) &&
919 			       (resid == 0)) {
920 			    /*
921 			     * If the user set MSG_EOF, the protocol
922 			     * understands this flag, and there is nothing left
923 			     * to send, then use PRU_SEND_EOF instead of PRU_SEND.
924 			     */
925 		    	    pru_flags = PRUS_EOF;
926 		    } else if (resid > 0 && space > 0) {
927 			    /* If there is more to send, set PRUS_MORETOCOME */
928 		    	    pru_flags = PRUS_MORETOCOME;
929 		    } else {
930 		    	    pru_flags = 0;
931 		    }
932 		    /*
933 		     * XXX all the SS_CANTSENDMORE checks previously
934 		     * done could be out of date.  We could have received
935 		     * a reset packet in an interrupt or maybe we slept
936 		     * while doing page faults in uiomove() etc. We could
937 		     * probably recheck again inside the splnet() protection
938 		     * here, but there are probably other places that this
939 		     * also happens.  We must rethink this.
940 		     */
941 		    error = so_pru_send(so, pru_flags, top, addr, control, td);
942 		    if (dontroute)
943 			    so->so_options &= ~SO_DONTROUTE;
944 		    clen = 0;
945 		    control = NULL;
946 		    top = NULL;
947 		    mp = &top;
948 		    if (error)
949 			    goto release;
950 		} while (resid && space > 0);
951 	} while (resid);
952 
953 release:
954 	ssb_unlock(&so->so_snd);
955 out:
956 	if (top)
957 		m_freem(top);
958 	if (control)
959 		m_freem(control);
960 	return (error);
961 }
962 
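/*
 * Sketch (hypothetical helper, not part of this file): a kernel-space
 * caller typically builds a single-segment uio and hands it to
 * sosend(), much as the syscall layer does.
 */
#if 0
static int
example_kernel_send(struct socket *so, void *data, size_t len,
		    struct thread *td)
{
	struct uio auio;
	struct iovec aiov;

	aiov.iov_base = data;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;

	/* addr/control NULL: connected socket, no ancillary data */
	return (sosend(so, NULL, &auio, NULL, NULL, 0, td));
}
#endif
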
963 #ifdef INET
964 /*
965  * A specialization of sosend() for UDP based on protocol-specific knowledge:
966  *   so->so_proto->pr_flags has the PR_ATOMIC field set.  This means that
967  *	sosendallatonce() returns true,
968  *	the "atomic" variable is true,
969  *	and sosendudp() blocks until space is available for the entire send.
970  *   so->so_proto->pr_flags does not have the PR_CONNREQUIRED or
971  *	PR_IMPLOPCL flags set.
972  *   UDP has no out-of-band data.
973  *   UDP has no control data.
974  *   UDP does not support MSG_EOR.
975  */
976 int
977 sosendudp(struct socket *so, struct sockaddr *addr, struct uio *uio,
978 	  struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
979 {
980 	size_t resid;
981 	int error, pru_flags = 0;
982 	int space;
983 
984 	if (td->td_lwp != NULL)
985 		td->td_lwp->lwp_ru.ru_msgsnd++;
986 	if (control)
987 		m_freem(control);
988 
989 	KASSERT((uio && !top) || (top && !uio), ("bad arguments to sosendudp"));
990 	resid = uio ? uio->uio_resid : (size_t)top->m_pkthdr.len;
991 
992 restart:
993 	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
994 	if (error)
995 		goto out;
996 
997 	if (so->so_state & SS_CANTSENDMORE)
998 		gotoerr(EPIPE);
999 	if (so->so_error) {
1000 		error = so->so_error;
1001 		so->so_error = 0;
1002 		goto release;
1003 	}
1004 	if (!(so->so_state & SS_ISCONNECTED) && addr == NULL)
1005 		gotoerr(EDESTADDRREQ);
1006 	if (resid > so->so_snd.ssb_hiwat)
1007 		gotoerr(EMSGSIZE);
1008 	space = ssb_space(&so->so_snd);
1009 	if (uio && (space < 0 || (size_t)space < resid)) {
1010 		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
1011 			gotoerr(EWOULDBLOCK);
1012 		ssb_unlock(&so->so_snd);
1013 		error = ssb_wait(&so->so_snd);
1014 		if (error)
1015 			goto out;
1016 		goto restart;
1017 	}
1018 
1019 	if (uio) {
1020 		int hdrlen = max_hdr;
1021 
1022 		/*
1023 		 * We try to optimize out the additional mbuf
1024 		 * allocations in M_PREPEND() on the output path, e.g.
1025 		 * - udp_output(), when it tries to prepend protocol
1026 		 *   headers.
1027 		 * - Link layer output function, when it tries to
1028 		 *   prepend link layer header.
1029 		 *
1030 		 * This probably will not benefit any data that will
1031 		 * be fragmented, so this optimization is only performed
1032 		 * when the size of data and max size of protocol+link
1033 		 * headers fit into one mbuf cluster.
1034 		 */
1035 		if (uio->uio_resid > MCLBYTES - hdrlen ||
1036 		    !udp_sosend_prepend) {
1037 			top = m_uiomove(uio);
1038 			if (top == NULL)
1039 				goto release;
1040 		} else {
1041 			int nsize;
1042 
1043 			top = m_getl(uio->uio_resid + hdrlen, MB_WAIT,
1044 			    MT_DATA, M_PKTHDR, &nsize);
1045 			KASSERT(nsize >= uio->uio_resid + hdrlen,
1046 			    ("sosendudp invalid nsize %d, "
1047 			     "resid %zu, hdrlen %d",
1048 			     nsize, uio->uio_resid, hdrlen));
1049 
1050 			top->m_len = uio->uio_resid;
1051 			top->m_pkthdr.len = uio->uio_resid;
1052 			top->m_data += hdrlen;
1053 
1054 			error = uiomove(mtod(top, caddr_t), top->m_len, uio);
1055 			if (error)
1056 				goto out;
1057 		}
1058 	}
1059 
1060 	if (flags & MSG_DONTROUTE)
1061 		pru_flags |= PRUS_DONTROUTE;
1062 
1063 	if (udp_sosend_async && (flags & MSG_SYNC) == 0) {
1064 		so_pru_send_async(so, pru_flags, top, addr, NULL, td);
1065 		error = 0;
1066 	} else {
1067 		error = so_pru_send(so, pru_flags, top, addr, NULL, td);
1068 	}
1069 	top = NULL;		/* sent or freed in lower layer */
1070 
1071 release:
1072 	ssb_unlock(&so->so_snd);
1073 out:
1074 	if (top)
1075 		m_freem(top);
1076 	return (error);
1077 }
1078 
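/*
 * Sketch (hypothetical helper, not part of this file): with
 * udp_sosend_async enabled, sosendudp() fires so_pru_send_async() and
 * returns immediately; a caller that needs the synchronous error can
 * force the so_pru_send() path per-call with MSG_SYNC.
 */
#if 0
static int
example_udp_send_sync(struct socket *so, struct sockaddr *addr,
		      struct uio *uio, struct thread *td)
{
	return (sosendudp(so, addr, uio, NULL, NULL, MSG_SYNC, td));
}
#endif
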
1079 int
1080 sosendtcp(struct socket *so, struct sockaddr *addr, struct uio *uio,
1081 	struct mbuf *top, struct mbuf *control, int flags,
1082 	struct thread *td)
1083 {
1084 	struct mbuf **mp;
1085 	struct mbuf *m;
1086 	size_t resid;
1087 	int space, len;
1088 	int error, mlen;
1089 	int allatonce;
1090 	int pru_flags;
1091 
1092 	if (uio) {
1093 		KKASSERT(top == NULL);
1094 		allatonce = 0;
1095 		resid = uio->uio_resid;
1096 	} else {
1097 		allatonce = 1;
1098 		resid = (size_t)top->m_pkthdr.len;
1099 #ifdef INVARIANTS
1100 		len = 0;
1101 		for (m = top; m; m = m->m_next)
1102 			len += m->m_len;
1103 		KKASSERT(top->m_pkthdr.len == len);
1104 #endif
1105 	}
1106 
1107 	/*
1108 	 * WARNING!  resid is unsigned, space and len are signed.  space
1109 	 * 	     can wind up negative if the sockbuf is overcommitted.
1110 	 *
1111 	 * Also check to make sure that MSG_EOR isn't used on TCP
1112 	 */
1113 	if (flags & MSG_EOR) {
1114 		error = EINVAL;
1115 		goto out;
1116 	}
1117 
1118 	if (control) {
1119 		/* TCP doesn't do control messages (rights, creds, etc) */
1120 		if (control->m_len) {
1121 			error = EINVAL;
1122 			goto out;
1123 		}
1124 		m_freem(control);	/* empty control, just free it */
1125 		control = NULL;
1126 	}
1127 
1128 	if (td->td_lwp != NULL)
1129 		td->td_lwp->lwp_ru.ru_msgsnd++;
1130 
1131 #define	gotoerr(errcode)	{ error = errcode; goto release; }
1132 
1133 restart:
1134 	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
1135 	if (error)
1136 		goto out;
1137 
1138 	do {
1139 		if (so->so_state & SS_CANTSENDMORE)
1140 			gotoerr(EPIPE);
1141 		if (so->so_error) {
1142 			error = so->so_error;
1143 			so->so_error = 0;
1144 			goto release;
1145 		}
1146 		if ((so->so_state & SS_ISCONNECTED) == 0 &&
1147 		    (so->so_state & SS_ISCONFIRMING) == 0)
1148 			gotoerr(ENOTCONN);
1149 		if (allatonce && resid > so->so_snd.ssb_hiwat)
1150 			gotoerr(EMSGSIZE);
1151 
1152 		space = ssb_space_prealloc(&so->so_snd);
1153 		if (flags & MSG_OOB)
1154 			space += 1024;
1155 		if ((space < 0 || (size_t)space < resid) && !allatonce &&
1156 		    space < so->so_snd.ssb_lowat) {
1157 			if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
1158 				gotoerr(EWOULDBLOCK);
1159 			ssb_unlock(&so->so_snd);
1160 			error = ssb_wait(&so->so_snd);
1161 			if (error)
1162 				goto out;
1163 			goto restart;
1164 		}
1165 		mp = &top;
1166 		do {
1167 		    int cnt = 0, async = 0;
1168 
1169 		    if (uio == NULL) {
1170 			/*
1171 			 * Data is prepackaged in "top".
1172 			 */
1173 			resid = 0;
1174 		    } else do {
1175 			if (resid > INT_MAX)
1176 				resid = INT_MAX;
1177 			m = m_getl((int)resid, MB_WAIT, MT_DATA,
1178 				   top == NULL ? M_PKTHDR : 0, &mlen);
1179 			if (top == NULL) {
1180 				m->m_pkthdr.len = 0;
1181 				m->m_pkthdr.rcvif = NULL;
1182 			}
1183 			len = imin((int)szmin(mlen, resid), space);
1184 			space -= len;
1185 			error = uiomove(mtod(m, caddr_t), (size_t)len, uio);
1186 			resid = uio->uio_resid;
1187 			m->m_len = len;
1188 			*mp = m;
1189 			top->m_pkthdr.len += len;
1190 			if (error)
1191 				goto release;
1192 			mp = &m->m_next;
1193 			if (resid == 0)
1194 				break;
1195 			++cnt;
1196 		    } while (space > 0 && cnt < tcp_sosend_agglim);
1197 
1198 		    if (tcp_sosend_async)
1199 			    async = 1;
1200 
1201 		    if (flags & MSG_OOB) {
1202 		    	    pru_flags = PRUS_OOB;
1203 			    async = 0;
1204 		    } else if ((flags & MSG_EOF) && resid == 0) {
1205 			    pru_flags = PRUS_EOF;
1206 		    } else if (resid > 0 && space > 0) {
1207 			    /* If there is more to send, set PRUS_MORETOCOME */
1208 		    	    pru_flags = PRUS_MORETOCOME;
1209 			    async = 1;
1210 		    } else {
1211 		    	    pru_flags = 0;
1212 		    }
1213 
1214 		    if (flags & MSG_SYNC)
1215 			    async = 0;
1216 
1217 		    /*
1218 		     * XXX all the SS_CANTSENDMORE checks previously
1219 		     * done could be out of date.  We could have received
1220 		     * a reset packet in an interrupt or maybe we slept
1221 		     * while doing page faults in uiomove() etc. We could
1222 		     * probably recheck again inside the splnet() protection
1223 		     * here, but there are probably other places that this
1224 		     * also happens.  We must rethink this.
1225 		     */
1226 		    for (m = top; m; m = m->m_next)
1227 			    ssb_preallocstream(&so->so_snd, m);
1228 		    if (!async) {
1229 			    error = so_pru_send(so, pru_flags, top,
1230 			        NULL, NULL, td);
1231 		    } else {
1232 			    so_pru_send_async(so, pru_flags, top,
1233 			        NULL, NULL, td);
1234 			    error = 0;
1235 		    }
1236 
1237 		    top = NULL;
1238 		    mp = &top;
1239 		    if (error)
1240 			    goto release;
1241 		} while (resid && space > 0);
1242 	} while (resid);
1243 
1244 release:
1245 	ssb_unlock(&so->so_snd);
1246 out:
1247 	if (top)
1248 		m_freem(top);
1249 	if (control)
1250 		m_freem(control);
1251 	return (error);
1252 }
1253 #endif
1254 
1255 /*
1256  * Implement receive operations on a socket.
1257  *
1258  * We depend on the way that records are added to the signalsockbuf
1259  * by sbappend*.  In particular, each record (mbufs linked through m_next)
1260  * must begin with an address if the protocol so specifies,
1261  * followed by an optional mbuf or mbufs containing ancillary data,
1262  * and then zero or more mbufs of data.
1263  *
1264  * Although the signalsockbuf is locked, new data may still be appended.
1265  * A token inside the ssb_lock deals with MP issues and still allows
1266  * the network to access the socket if we block in a uio.
1267  *
1268  * The caller may receive the data as a single mbuf chain by supplying
1269  * an mbuf **mp0 for use in returning the chain.  The uio is then used
1270  * only for the count in uio_resid.
1271  */
1272 int
1273 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
1274 	  struct sockbuf *sio, struct mbuf **controlp, int *flagsp)
1275 {
1276 	struct mbuf *m, *n;
1277 	struct mbuf *free_chain = NULL;
1278 	int flags, len, error, offset;
1279 	struct protosw *pr = so->so_proto;
1280 	int moff, type = 0;
1281 	size_t resid, orig_resid;
1282 
1283 	if (uio)
1284 		resid = uio->uio_resid;
1285 	else
1286 		resid = (size_t)(sio->sb_climit - sio->sb_cc);
1287 	orig_resid = resid;
1288 
1289 	if (psa)
1290 		*psa = NULL;
1291 	if (controlp)
1292 		*controlp = NULL;
1293 	if (flagsp)
1294 		flags = *flagsp &~ MSG_EOR;
1295 	else
1296 		flags = 0;
1297 	if (flags & MSG_OOB) {
1298 		m = m_get(MB_WAIT, MT_DATA);
1299 		if (m == NULL)
1300 			return (ENOBUFS);
1301 		error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
1302 		if (error)
1303 			goto bad;
1304 		if (sio) {
1305 			do {
1306 				sbappend(sio, m);
1307 				KKASSERT(resid >= (size_t)m->m_len);
1308 				resid -= (size_t)m->m_len;
1309 			} while (resid > 0 && m);
1310 		} else {
1311 			do {
1312 				uio->uio_resid = resid;
1313 				error = uiomove(mtod(m, caddr_t),
1314 						(int)szmin(resid, m->m_len),
1315 						uio);
1316 				resid = uio->uio_resid;
1317 				m = m_free(m);
1318 			} while (uio->uio_resid && error == 0 && m);
1319 		}
1320 bad:
1321 		if (m)
1322 			m_freem(m);
1323 		return (error);
1324 	}
1325 	if ((so->so_state & SS_ISCONFIRMING) && resid)
1326 		so_pru_rcvd(so, 0);
1327 
1328 	/*
1329 	 * The token interlocks against the protocol thread while
1330 	 * ssb_lock is a blocking lock against other userland entities.
1331 	 */
1332 	lwkt_gettoken(&so->so_rcv.ssb_token);
1333 restart:
1334 	error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags));
1335 	if (error)
1336 		goto done;
1337 
1338 	m = so->so_rcv.ssb_mb;
1339 	/*
1340 	 * If we have less data than requested, block awaiting more
1341 	 * (subject to any timeout) if:
1342 	 *   1. the current count is less than the low water mark, or
1343 	 *   2. MSG_WAITALL is set, and it is possible to do the entire
1344 	 *	receive operation at once if we block (resid <= hiwat), and
1345 	 *   3. MSG_DONTWAIT is not set.
1346 	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1347 	 * we have to do the receive in sections, and thus risk returning
1348 	 * a short count if a timeout or signal occurs after we start.
1349 	 */
1350 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1351 	    (size_t)so->so_rcv.ssb_cc < resid) &&
1352 	    (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat ||
1353 	    ((flags & MSG_WAITALL) && resid <= (size_t)so->so_rcv.ssb_hiwat)) &&
1354 	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
1355 		KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1"));
1356 		if (so->so_error) {
1357 			if (m)
1358 				goto dontblock;
1359 			error = so->so_error;
1360 			if ((flags & MSG_PEEK) == 0)
1361 				so->so_error = 0;
1362 			goto release;
1363 		}
1364 		if (so->so_state & SS_CANTRCVMORE) {
1365 			if (m)
1366 				goto dontblock;
1367 			else
1368 				goto release;
1369 		}
1370 		for (; m; m = m->m_next) {
1371 			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1372 				m = so->so_rcv.ssb_mb;
1373 				goto dontblock;
1374 			}
1375 		}
1376 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1377 		    (pr->pr_flags & PR_CONNREQUIRED)) {
1378 			error = ENOTCONN;
1379 			goto release;
1380 		}
1381 		if (resid == 0)
1382 			goto release;
1383 		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) {
1384 			error = EWOULDBLOCK;
1385 			goto release;
1386 		}
1387 		ssb_unlock(&so->so_rcv);
1388 		error = ssb_wait(&so->so_rcv);
1389 		if (error)
1390 			goto done;
1391 		goto restart;
1392 	}
1393 dontblock:
1394 	if (uio && uio->uio_td && uio->uio_td->td_proc)
1395 		uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++;
1396 
1397 	/*
1398 	 * note: m should be == sb_mb here.  Cache the next record while
1399 	 * cleaning up.  Note that calling m_free*() will break out of the
1400 	 * critical section.
1401 	 */
1402 	KKASSERT(m == so->so_rcv.ssb_mb);
1403 
1404 	/*
1405 	 * Skip any address mbufs prepending the record.
1406 	 */
1407 	if (pr->pr_flags & PR_ADDR) {
1408 		KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
1409 		orig_resid = 0;
1410 		if (psa)
1411 			*psa = dup_sockaddr(mtod(m, struct sockaddr *));
1412 		if (flags & MSG_PEEK)
1413 			m = m->m_next;
1414 		else
1415 			m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
1416 	}
1417 
1418 	/*
1419 	 * Skip any control mbufs prepending the record.
1420 	 */
1421 #ifdef SCTP
1422 	if (pr->pr_flags & PR_ADDR_OPT) {
1423 		/*
1424 		 * For SCTP we may be getting a
1425 		 * whole message OR a partial delivery.
1426 		 */
1427 		if (m && m->m_type == MT_SONAME) {
1428 			orig_resid = 0;
1429 			if (psa)
1430 				*psa = dup_sockaddr(mtod(m, struct sockaddr *));
1431 			if (flags & MSG_PEEK)
1432 				m = m->m_next;
1433 			else
1434 				m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
1435 		}
1436 	}
1437 #endif /* SCTP */
1438 	while (m && m->m_type == MT_CONTROL && error == 0) {
1439 		if (flags & MSG_PEEK) {
1440 			if (controlp)
1441 				*controlp = m_copy(m, 0, m->m_len);
1442 			m = m->m_next;	/* XXX race */
1443 		} else {
1444 			if (controlp) {
1445 				n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
1446 				if (pr->pr_domain->dom_externalize &&
1447 				    mtod(m, struct cmsghdr *)->cmsg_type ==
1448 				    SCM_RIGHTS)
1449 				   error = (*pr->pr_domain->dom_externalize)(m);
1450 				*controlp = m;
1451 				m = n;
1452 			} else {
1453 				m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
1454 			}
1455 		}
1456 		if (controlp && *controlp) {
1457 			orig_resid = 0;
1458 			controlp = &(*controlp)->m_next;
1459 		}
1460 	}
1461 
1462 	/*
1463 	 * flag OOB data.
1464 	 */
1465 	if (m) {
1466 		type = m->m_type;
1467 		if (type == MT_OOBDATA)
1468 			flags |= MSG_OOB;
1469 	}
1470 
1471 	/*
1472 	 * Copy to the UIO or mbuf return chain (*mp).
1473 	 */
1474 	moff = 0;
1475 	offset = 0;
1476 	while (m && resid > 0 && error == 0) {
1477 		if (m->m_type == MT_OOBDATA) {
1478 			if (type != MT_OOBDATA)
1479 				break;
1480 		} else if (type == MT_OOBDATA)
1481 			break;
1482 		else
1483 		    KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
1484 			("receive 3"));
1485 		soclrstate(so, SS_RCVATMARK);
1486 		len = (resid > INT_MAX) ? INT_MAX : resid;
1487 		if (so->so_oobmark && len > so->so_oobmark - offset)
1488 			len = so->so_oobmark - offset;
1489 		if (len > m->m_len - moff)
1490 			len = m->m_len - moff;
1491 
1492 		/*
1493 		 * Copy out to the UIO or pass the mbufs back to the SIO.
1494 		 * The SIO is dealt with when we eat the mbuf, but deal
1495 		 * with the resid here either way.
1496 		 */
1497 		if (uio) {
1498 			uio->uio_resid = resid;
1499 			error = uiomove(mtod(m, caddr_t) + moff, len, uio);
1500 			resid = uio->uio_resid;
1501 			if (error)
1502 				goto release;
1503 		} else {
1504 			resid -= (size_t)len;
1505 		}
1506 
1507 		/*
1508 		 * Eat the entire mbuf or just a piece of it
1509 		 */
1510 		if (len == m->m_len - moff) {
1511 			if (m->m_flags & M_EOR)
1512 				flags |= MSG_EOR;
1513 #ifdef SCTP
1514 			if (m->m_flags & M_NOTIFICATION)
1515 				flags |= MSG_NOTIFICATION;
1516 #endif /* SCTP */
1517 			if (flags & MSG_PEEK) {
1518 				m = m->m_next;
1519 				moff = 0;
1520 			} else {
1521 				if (sio) {
1522 					n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
1523 					sbappend(sio, m);
1524 					m = n;
1525 				} else {
1526 					m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
1527 				}
1528 			}
1529 		} else {
1530 			if (flags & MSG_PEEK) {
1531 				moff += len;
1532 			} else {
1533 				if (sio) {
1534 					n = m_copym(m, 0, len, MB_WAIT);
1535 					if (n)
1536 						sbappend(sio, n);
1537 				}
1538 				m->m_data += len;
1539 				m->m_len -= len;
1540 				so->so_rcv.ssb_cc -= len;
1541 			}
1542 		}
1543 		if (so->so_oobmark) {
1544 			if ((flags & MSG_PEEK) == 0) {
1545 				so->so_oobmark -= len;
1546 				if (so->so_oobmark == 0) {
1547 					sosetstate(so, SS_RCVATMARK);
1548 					break;
1549 				}
1550 			} else {
1551 				offset += len;
1552 				if (offset == so->so_oobmark)
1553 					break;
1554 			}
1555 		}
1556 		if (flags & MSG_EOR)
1557 			break;
1558 		/*
1559 		 * If the MSG_WAITALL flag is set (for a non-atomic socket),
1560 		 * we must not quit until resid == 0 or an error
1561 		 * termination.  If a signal/timeout occurs, return
1562 		 * with a short count but without error.
1563 		 * Keep signalsockbuf locked against other readers.
1564 		 */
1565 		while ((flags & MSG_WAITALL) && m == NULL &&
1566 		       resid > 0 && !sosendallatonce(so) &&
1567 		       so->so_rcv.ssb_mb == NULL) {
1568 			if (so->so_error || so->so_state & SS_CANTRCVMORE)
1569 				break;
1570 			/*
1571 			 * The window might have closed to zero, make
1572 			 * sure we send an ack now that we've drained
1573 			 * the buffer or we might end up blocking until
1574 			 * the idle timer takes over (5 seconds).
1575 			 */
1576 			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
1577 				so_pru_rcvd(so, flags);
1578 			error = ssb_wait(&so->so_rcv);
1579 			if (error) {
1580 				ssb_unlock(&so->so_rcv);
1581 				error = 0;
1582 				goto done;
1583 			}
1584 			m = so->so_rcv.ssb_mb;
1585 		}
1586 	}
1587 
1588 	/*
1589 	 * If an atomic read was requested but unread data still remains
1590 	 * in the record, set MSG_TRUNC.
1591 	 */
1592 	if (m && pr->pr_flags & PR_ATOMIC)
1593 		flags |= MSG_TRUNC;
1594 
1595 	/*
1596 	 * Cleanup.  If an atomic read was requested drop any unread data.
1597 	 */
1598 	if ((flags & MSG_PEEK) == 0) {
1599 		if (m && (pr->pr_flags & PR_ATOMIC))
1600 			sbdroprecord(&so->so_rcv.sb);
1601 		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
1602 			so_pru_rcvd(so, flags);
1603 	}
1604 
1605 	if (orig_resid == resid && orig_resid &&
1606 	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
1607 		ssb_unlock(&so->so_rcv);
1608 		goto restart;
1609 	}
1610 
1611 	if (flagsp)
1612 		*flagsp |= flags;
1613 release:
1614 	ssb_unlock(&so->so_rcv);
1615 done:
1616 	lwkt_reltoken(&so->so_rcv.ssb_token);
1617 	if (free_chain)
1618 		m_freem(free_chain);
1619 	return (error);
1620 }
1621 
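/*
 * Sketch (hypothetical helper, not part of this file): the receive
 * mirror of the sosend() example above.  Passing a struct sockbuf *sio
 * instead of a uio makes soreceive() hand back the mbuf chain
 * directly, letting kernel consumers take the data without a copy.
 */
#if 0
static int
example_kernel_recv(struct socket *so, void *data, size_t len,
		    struct thread *td)
{
	struct uio auio;
	struct iovec aiov;

	aiov.iov_base = data;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;

	return (soreceive(so, NULL, &auio, NULL, NULL, NULL));
}
#endif
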
1622 int
1623 sorecvtcp(struct socket *so, struct sockaddr **psa, struct uio *uio,
1624 	  struct sockbuf *sio, struct mbuf **controlp, int *flagsp)
1625 {
1626 	struct mbuf *m, *n;
1627 	struct mbuf *free_chain = NULL;
1628 	int flags, len, error, offset;
1629 	struct protosw *pr = so->so_proto;
1630 	int moff;
1631 	size_t resid, orig_resid;
1632 
1633 	if (uio)
1634 		resid = uio->uio_resid;
1635 	else
1636 		resid = (size_t)(sio->sb_climit - sio->sb_cc);
1637 	orig_resid = resid;
1638 
1639 	if (psa)
1640 		*psa = NULL;
1641 	if (controlp)
1642 		*controlp = NULL;
1643 	if (flagsp)
1644 		flags = *flagsp &~ MSG_EOR;
1645 	else
1646 		flags = 0;
1647 	if (flags & MSG_OOB) {
1648 		m = m_get(MB_WAIT, MT_DATA);
1649 		if (m == NULL)
1650 			return (ENOBUFS);
1651 		error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
1652 		if (error)
1653 			goto bad;
1654 		if (sio) {
1655 			do {
1656 				sbappend(sio, m);
1657 				KKASSERT(resid >= (size_t)m->m_len);
1658 				resid -= (size_t)m->m_len;
1659 			} while (resid > 0 && m);
1660 		} else {
1661 			do {
1662 				uio->uio_resid = resid;
1663 				error = uiomove(mtod(m, caddr_t),
1664 						(int)szmin(resid, m->m_len),
1665 						uio);
1666 				resid = uio->uio_resid;
1667 				m = m_free(m);
1668 			} while (uio->uio_resid && error == 0 && m);
1669 		}
1670 bad:
1671 		if (m)
1672 			m_freem(m);
1673 		return (error);
1674 	}
1675 
1676 	/*
1677 	 * The token interlocks against the protocol thread while
1678 	 * ssb_lock is a blocking lock against other userland entities.
1679 	 */
1680 	lwkt_gettoken(&so->so_rcv.ssb_token);
1681 restart:
1682 	error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags));
1683 	if (error)
1684 		goto done;
1685 
1686 	m = so->so_rcv.ssb_mb;
1687 	/*
1688 	 * If we have less data than requested, block awaiting more
1689 	 * (subject to any timeout) if:
1690 	 *   1. the current count is less than the low water mark, or
1691 	 *   2. MSG_WAITALL is set, and it is possible to do the entire
1692 	 *	receive operation at once if we block (resid <= hiwat), and
1693 	 *   3. MSG_DONTWAIT is not set.
1694 	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1695 	 * we have to do the receive in sections, and thus risk returning
1696 	 * a short count if a timeout or signal occurs after we start.
1697 	 */
1698 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1699 	    (size_t)so->so_rcv.ssb_cc < resid) &&
1700 	    (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat ||
1701 	   ((flags & MSG_WAITALL) && resid <= (size_t)so->so_rcv.ssb_hiwat)))) {
1702 		KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1"));
1703 		if (so->so_error) {
1704 			if (m)
1705 				goto dontblock;
1706 			error = so->so_error;
1707 			if ((flags & MSG_PEEK) == 0)
1708 				so->so_error = 0;
1709 			goto release;
1710 		}
1711 		if (so->so_state & SS_CANTRCVMORE) {
1712 			if (m)
1713 				goto dontblock;
1714 			else
1715 				goto release;
1716 		}
1717 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1718 		    (pr->pr_flags & PR_CONNREQUIRED)) {
1719 			error = ENOTCONN;
1720 			goto release;
1721 		}
1722 		if (resid == 0)
1723 			goto release;
1724 		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) {
1725 			error = EWOULDBLOCK;
1726 			goto release;
1727 		}
1728 		ssb_unlock(&so->so_rcv);
1729 		error = ssb_wait(&so->so_rcv);
1730 		if (error)
1731 			goto done;
1732 		goto restart;
1733 	}
1734 dontblock:
1735 	if (uio && uio->uio_td && uio->uio_td->td_proc)
1736 		uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++;
1737 
1738 	/*
1739 	 * note: m should be == sb_mb here.  Cache the next record while
1740 	 * cleaning up.  Note that calling m_free*() will break out of the
1741 	 * critical section.
1742 	 */
1743 	KKASSERT(m == so->so_rcv.ssb_mb);
1744 
1745 	/*
1746 	 * Copy to the UIO or mbuf return chain (*mp).
1747 	 */
1748 	moff = 0;
1749 	offset = 0;
1750 	while (m && resid > 0 && error == 0) {
1751 		KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
1752 		    ("receive 3"));
1753 
1754 		soclrstate(so, SS_RCVATMARK);
1755 		len = (resid > INT_MAX) ? INT_MAX : resid;
1756 		if (so->so_oobmark && len > so->so_oobmark - offset)
1757 			len = so->so_oobmark - offset;
1758 		if (len > m->m_len - moff)
1759 			len = m->m_len - moff;
1760 
1761 		/*
1762 		 * Copy out to the UIO or pass the mbufs back to the SIO.
1763 		 * The SIO is dealt with when we eat the mbuf, but deal
1764 		 * with the resid here either way.
1765 		 */
1766 		if (uio) {
1767 			uio->uio_resid = resid;
1768 			error = uiomove(mtod(m, caddr_t) + moff, len, uio);
1769 			resid = uio->uio_resid;
1770 			if (error)
1771 				goto release;
1772 		} else {
1773 			resid -= (size_t)len;
1774 		}
1775 
1776 		/*
1777 		 * Eat the entire mbuf or just a piece of it
1778 		 */
1779 		if (len == m->m_len - moff) {
1780 			if (flags & MSG_PEEK) {
1781 				m = m->m_next;
1782 				moff = 0;
1783 			} else {
1784 				if (sio) {
1785 					n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
1786 					sbappend(sio, m);
1787 					m = n;
1788 				} else {
1789 					m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
1790 				}
1791 			}
1792 		} else {
1793 			if (flags & MSG_PEEK) {
1794 				moff += len;
1795 			} else {
1796 				if (sio) {
1797 					n = m_copym(m, 0, len, MB_WAIT);
1798 					if (n)
1799 						sbappend(sio, n);
1800 				}
1801 				m->m_data += len;
1802 				m->m_len -= len;
1803 				so->so_rcv.ssb_cc -= len;
1804 			}
1805 		}
1806 		if (so->so_oobmark) {
1807 			if ((flags & MSG_PEEK) == 0) {
1808 				so->so_oobmark -= len;
1809 				if (so->so_oobmark == 0) {
1810 					sosetstate(so, SS_RCVATMARK);
1811 					break;
1812 				}
1813 			} else {
1814 				offset += len;
1815 				if (offset == so->so_oobmark)
1816 					break;
1817 			}
1818 		}
1819 		/*
1820 		 * If the MSG_WAITALL flag is set (for a non-atomic socket),
1821 		 * we must not quit until resid == 0 or an error
1822 		 * termination.  If a signal/timeout occurs, return
1823 		 * with a short count but without error.
1824 		 * Keep signalsockbuf locked against other readers.
1825 		 */
1826 		while ((flags & MSG_WAITALL) && m == NULL &&
1827 		       resid > 0 && !sosendallatonce(so) &&
1828 		       so->so_rcv.ssb_mb == NULL) {
1829 			if (so->so_error || so->so_state & SS_CANTRCVMORE)
1830 				break;
1831 			/*
1832 			 * The window might have closed to zero, make
1833 			 * sure we send an ack now that we've drained
1834 			 * the buffer or we might end up blocking until
1835 			 * the idle timer takes over (5 seconds).
1836 			 */
1837 			if (so->so_pcb)
1838 				so_pru_rcvd_async(so);
1839 			error = ssb_wait(&so->so_rcv);
1840 			if (error) {
1841 				ssb_unlock(&so->so_rcv);
1842 				error = 0;
1843 				goto done;
1844 			}
1845 			m = so->so_rcv.ssb_mb;
1846 		}
1847 	}
1848 
1849 	/*
1850 	 * Cleanup.  If an atomic read was requested drop any unread data.
1851 	 */
1852 	if ((flags & MSG_PEEK) == 0) {
1853 		if (so->so_pcb)
1854 			so_pru_rcvd_async(so);
1855 	}
1856 
1857 	if (orig_resid == resid && orig_resid &&
1858 	    (so->so_state & SS_CANTRCVMORE) == 0) {
1859 		ssb_unlock(&so->so_rcv);
1860 		goto restart;
1861 	}
1862 
1863 	if (flagsp)
1864 		*flagsp |= flags;
1865 release:
1866 	ssb_unlock(&so->so_rcv);
1867 done:
1868 	lwkt_reltoken(&so->so_rcv.ssb_token);
1869 	if (free_chain)
1870 		m_freem(free_chain);
1871 	return (error);
1872 }
1873 
1874 /*
1875  * Shut a socket down.  Note that we do not get a frontend lock as we
1876  * want to be able to shut the socket down even if another thread is
1877  * blocked in a read(), thus waking it up.
1878  */
1879 int
1880 soshutdown(struct socket *so, int how)
1881 {
1882 	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1883 		return (EINVAL);
1884 
1885 	if (how != SHUT_WR) {
1886 		/*ssb_lock(&so->so_rcv, M_WAITOK);*/
1887 		sorflush(so);
1888 		/*ssb_unlock(&so->so_rcv);*/
1889 	}
1890 	if (how != SHUT_RD)
1891 		return (so_pru_shutdown(so));
1892 	return (0);
1893 }
1894 
1895 void
1896 sorflush(struct socket *so)
1897 {
1898 	struct signalsockbuf *ssb = &so->so_rcv;
1899 	struct protosw *pr = so->so_proto;
1900 	struct signalsockbuf asb;
1901 
1902 	atomic_set_int(&ssb->ssb_flags, SSB_NOINTR);
1903 
1904 	lwkt_gettoken(&ssb->ssb_token);
1905 	socantrcvmore(so);
1906 	asb = *ssb;
1907 
1908 	/*
1909 	 * Can't just blow up the ssb structure here
1910 	 */
1911 	bzero(&ssb->sb, sizeof(ssb->sb));
1912 	ssb->ssb_timeo = 0;
1913 	ssb->ssb_lowat = 0;
1914 	ssb->ssb_hiwat = 0;
1915 	ssb->ssb_mbmax = 0;
1916 	atomic_clear_int(&ssb->ssb_flags, SSB_CLEAR_MASK);
1917 
1918 	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
1919 		(*pr->pr_domain->dom_dispose)(asb.ssb_mb);
1920 	ssb_release(&asb, so);
1921 
1922 	lwkt_reltoken(&ssb->ssb_token);
1923 }
1924 
1925 #ifdef INET
1926 static int
1927 do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
1928 {
1929 	struct accept_filter_arg	*afap = NULL;
1930 	struct accept_filter	*afp;
1931 	struct so_accf	*af = so->so_accf;
1932 	int	error = 0;
1933 
1934 	/* do not set/remove accept filters on non-listening sockets */
1935 	if ((so->so_options & SO_ACCEPTCONN) == 0) {
1936 		error = EINVAL;
1937 		goto out;
1938 	}
1939 
1940 	/* removing the filter */
1941 	if (sopt == NULL) {
1942 		if (af != NULL) {
1943 			if (af->so_accept_filter != NULL &&
1944 				af->so_accept_filter->accf_destroy != NULL) {
1945 				af->so_accept_filter->accf_destroy(so);
1946 			}
1947 			if (af->so_accept_filter_str != NULL) {
1948 				kfree(af->so_accept_filter_str, M_ACCF);
1949 			}
1950 			kfree(af, M_ACCF);
1951 			so->so_accf = NULL;
1952 		}
1953 		so->so_options &= ~SO_ACCEPTFILTER;
1954 		return (0);
1955 	}
1956 	/* adding a filter */
1957 	/* must remove previous filter first */
1958 	if (af != NULL) {
1959 		error = EINVAL;
1960 		goto out;
1961 	}
1962 	/* don't put large objects on the kernel stack */
1963 	afap = kmalloc(sizeof(*afap), M_TEMP, M_WAITOK);
1964 	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
1965 	afap->af_name[sizeof(afap->af_name)-1] = '\0';
1966 	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
1967 	if (error)
1968 		goto out;
1969 	afp = accept_filt_get(afap->af_name);
1970 	if (afp == NULL) {
1971 		error = ENOENT;
1972 		goto out;
1973 	}
1974 	af = kmalloc(sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
1975 	if (afp->accf_create != NULL) {
1976 		if (afap->af_name[0] != '\0') {
1977 			int len = strlen(afap->af_name) + 1;
1978 
1979 			af->so_accept_filter_str = kmalloc(len, M_ACCF,
1980 							   M_WAITOK);
1981 			strcpy(af->so_accept_filter_str, afap->af_name);
1982 		}
1983 		af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
1984 		if (af->so_accept_filter_arg == NULL) {
1985 			kfree(af->so_accept_filter_str, M_ACCF);
1986 			kfree(af, M_ACCF);
1987 			so->so_accf = NULL;
1988 			error = EINVAL;
1989 			goto out;
1990 		}
1991 	}
1992 	af->so_accept_filter = afp;
1993 	so->so_accf = af;
1994 	so->so_options |= SO_ACCEPTFILTER;
1995 out:
1996 	if (afap != NULL)
1997 		kfree(afap, M_TEMP);
1998 	return (error);
1999 }
2000 #endif /* INET */
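
/*
 * Editor's sketch, not part of the original source: attaching an accept
 * filter from userland.  The socket must already be listening or the
 * code above returns EINVAL; the "dataready" name assumes the accf_data
 * module is present.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <string.h>

static int
attach_dataready(int s)
{
	struct accept_filter_arg afa;

	memset(&afa, 0, sizeof(afa));
	strcpy(afa.af_name, "dataready");
	return (setsockopt(s, SOL_SOCKET, SO_ACCEPTFILTER,
			   &afa, sizeof(afa)));
}
#endif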
2001 
2002 /*
2003  * Perhaps this routine, and sooptcopyout(), below, ought to come in
2004  * an additional variant to handle the case where the option value needs
2005  * to be some kind of integer, but not a specific size.
2006  * In addition to their use here, these functions are also called by the
2007  * protocol-level pr_ctloutput() routines.
2008  */
2009 int
2010 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2011 {
2012 	return soopt_to_kbuf(sopt, buf, len, minlen);
2013 }
2014 
2015 int
2016 soopt_to_kbuf(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2017 {
2018 	size_t	valsize;
2019 
2020 	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
2021 	KKASSERT(kva_p(buf));
2022 
2023 	/*
2024 	 * If the user gives us more than we wanted, we ignore it,
2025 	 * but if we don't get the minimum length the caller
2026 	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
2027 	 * is set to however much we actually retrieved.
2028 	 */
2029 	if ((valsize = sopt->sopt_valsize) < minlen)
2030 		return EINVAL;
2031 	if (valsize > len)
2032 		sopt->sopt_valsize = valsize = len;
2033 
2034 	bcopy(sopt->sopt_val, buf, valsize);
2035 	return 0;
2036 }
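
/*
 * Editor's sketch, not part of the original source: the typical shape of
 * a pr_ctloutput() consumer of sooptcopyin()/sooptcopyout().  The option
 * handling is hypothetical; only the copy pattern is real: require at
 * least sizeof(optval) bytes and silently ignore any excess supplied by
 * the user.
 */
#if 0
static int
example_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int error, optval;

	if (sopt->sopt_dir == SOPT_SET) {
		error = sooptcopyin(sopt, &optval, sizeof(optval),
				    sizeof(optval));
		if (error)
			return (error);
		/* ... apply optval to the protocol's pcb here ... */
		return (0);
	}
	optval = 0;	/* ... fetch the value from the protocol's pcb ... */
	return (sooptcopyout(sopt, &optval, sizeof(optval)));
}
#endif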
2037 
2038 
2039 int
2040 sosetopt(struct socket *so, struct sockopt *sopt)
2041 {
2042 	int	error, optval;
2043 	struct	linger l;
2044 	struct	timeval tv;
2045 	u_long  val;
2046 	struct signalsockbuf *sotmp;
2047 
2048 	error = 0;
2049 	sopt->sopt_dir = SOPT_SET;
2050 	if (sopt->sopt_level != SOL_SOCKET) {
2051 		if (so->so_proto && so->so_proto->pr_ctloutput) {
2052 			return (so_pr_ctloutput(so, sopt));
2053 		}
2054 		error = ENOPROTOOPT;
2055 	} else {
2056 		switch (sopt->sopt_name) {
2057 #ifdef INET
2058 		case SO_ACCEPTFILTER:
2059 			error = do_setopt_accept_filter(so, sopt);
2060 			if (error)
2061 				goto bad;
2062 			break;
2063 #endif /* INET */
2064 		case SO_LINGER:
2065 			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2066 			if (error)
2067 				goto bad;
2068 
2069 			so->so_linger = l.l_linger;
2070 			if (l.l_onoff)
2071 				so->so_options |= SO_LINGER;
2072 			else
2073 				so->so_options &= ~SO_LINGER;
2074 			break;
2075 
2076 		case SO_DEBUG:
2077 		case SO_KEEPALIVE:
2078 		case SO_DONTROUTE:
2079 		case SO_USELOOPBACK:
2080 		case SO_BROADCAST:
2081 		case SO_REUSEADDR:
2082 		case SO_REUSEPORT:
2083 		case SO_OOBINLINE:
2084 		case SO_TIMESTAMP:
2085 		case SO_NOSIGPIPE:
2086 			error = sooptcopyin(sopt, &optval, sizeof optval,
2087 					    sizeof optval);
2088 			if (error)
2089 				goto bad;
2090 			if (optval)
2091 				so->so_options |= sopt->sopt_name;
2092 			else
2093 				so->so_options &= ~sopt->sopt_name;
2094 			break;
2095 
2096 		case SO_SNDBUF:
2097 		case SO_RCVBUF:
2098 		case SO_SNDLOWAT:
2099 		case SO_RCVLOWAT:
2100 			error = sooptcopyin(sopt, &optval, sizeof optval,
2101 					    sizeof optval);
2102 			if (error)
2103 				goto bad;
2104 
2105 			/*
2106 			 * Values < 1 make no sense for any of these
2107 			 * options, so disallow them.
2108 			 */
2109 			if (optval < 1) {
2110 				error = EINVAL;
2111 				goto bad;
2112 			}
2113 
2114 			switch (sopt->sopt_name) {
2115 			case SO_SNDBUF:
2116 			case SO_RCVBUF:
2117 				if (ssb_reserve(sopt->sopt_name == SO_SNDBUF ?
2118 				    &so->so_snd : &so->so_rcv, (u_long)optval,
2119 				    so,
2120 				    &curproc->p_rlimit[RLIMIT_SBSIZE]) == 0) {
2121 					error = ENOBUFS;
2122 					goto bad;
2123 				}
2124 				sotmp = (sopt->sopt_name == SO_SNDBUF) ?
2125 						&so->so_snd : &so->so_rcv;
2126 				atomic_clear_int(&sotmp->ssb_flags,
2127 						 SSB_AUTOSIZE);
2128 				break;
2129 
2130 			/*
2131 			 * Make sure the low-water is never greater than
2132 			 * the high-water.
2133 			 */
2134 			case SO_SNDLOWAT:
2135 				so->so_snd.ssb_lowat =
2136 				    (optval > so->so_snd.ssb_hiwat) ?
2137 				    so->so_snd.ssb_hiwat : optval;
2138 				atomic_clear_int(&so->so_snd.ssb_flags,
2139 						 SSB_AUTOLOWAT);
2140 				break;
2141 			case SO_RCVLOWAT:
2142 				so->so_rcv.ssb_lowat =
2143 				    (optval > so->so_rcv.ssb_hiwat) ?
2144 				    so->so_rcv.ssb_hiwat : optval;
2145 				atomic_clear_int(&so->so_rcv.ssb_flags,
2146 						 SSB_AUTOLOWAT);
2147 				break;
2148 			}
2149 			break;
2150 
2151 		case SO_SNDTIMEO:
2152 		case SO_RCVTIMEO:
2153 			error = sooptcopyin(sopt, &tv, sizeof tv,
2154 					    sizeof tv);
2155 			if (error)
2156 				goto bad;
2157 
2158 			/* assert(hz > 0); */
2159 			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2160 			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2161 				error = EDOM;
2162 				goto bad;
2163 			}
2164 			/* assert(ustick > 0); */
2165 			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
2166 			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / ustick;
2167 			if (val > INT_MAX) {
2168 				error = EDOM;
2169 				goto bad;
2170 			}
2171 			if (val == 0 && tv.tv_usec != 0)
2172 				val = 1;
2173 
2174 			switch (sopt->sopt_name) {
2175 			case SO_SNDTIMEO:
2176 				so->so_snd.ssb_timeo = val;
2177 				break;
2178 			case SO_RCVTIMEO:
2179 				so->so_rcv.ssb_timeo = val;
2180 				break;
2181 			}
2182 			break;
2183 		default:
2184 			error = ENOPROTOOPT;
2185 			break;
2186 		}
2187 		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
2188 			(void) so_pr_ctloutput(so, sopt);
2189 		}
2190 	}
2191 bad:
2192 	return (error);
2193 }
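
/*
 * Editor's sketch, not part of the original source: setting a receive
 * timeout from userland.  Per the bounds checks above, tv_sec must lie
 * in [0, INT_MAX / hz] and tv_usec in [0, 1000000), otherwise
 * setsockopt() fails with EDOM; a nonzero sub-tick timeout is rounded
 * up to one tick.
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>

static int
set_rcv_timeout(int s, long sec, long usec)
{
	struct timeval tv;

	tv.tv_sec = sec;
	tv.tv_usec = usec;
	return (setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)));
}
#endif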
2194 
2195 /* Helper routine for getsockopt */
2196 int
2197 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2198 {
2199 	soopt_from_kbuf(sopt, buf, len);
2200 	return 0;
2201 }
2202 
2203 void
2204 soopt_from_kbuf(struct sockopt *sopt, const void *buf, size_t len)
2205 {
2206 	size_t	valsize;
2207 
2208 	if (len == 0) {
2209 		sopt->sopt_valsize = 0;
2210 		return;
2211 	}
2212 
2213 	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
2214 	KKASSERT(kva_p(buf));
2215 
2216 	/*
2217 	 * Documented get behavior is that we always return a value,
2218 	 * possibly truncated to fit in the user's buffer.
2219 	 * Traditional behavior is that we always tell the user
2220 	 * precisely how much we copied, rather than something useful
2221 	 * like the total amount we had available for her.
2222 	 * Note that this interface is not idempotent; the entire answer must
2223 	 * be generated ahead of time.
2224 	 */
2225 	valsize = szmin(len, sopt->sopt_valsize);
2226 	sopt->sopt_valsize = valsize;
2227 	if (sopt->sopt_val != NULL) {
2228 		bcopy(buf, sopt->sopt_val, valsize);
2229 	}
2230 }
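
/*
 * Editor's sketch, not part of the original source: the truncation
 * behavior documented above as seen from userland.  getsockopt()
 * succeeds even when the caller's buffer is smaller than the full
 * option value, and optlen comes back as the number of bytes actually
 * copied rather than the size that was available.
 */
#if 0
#include <sys/socket.h>

static int
get_type_truncated(int s)
{
	char byte;
	socklen_t len = sizeof(byte);	/* deliberately too small */

	/* Returns 0; only the first byte of the int-sized value arrives. */
	return (getsockopt(s, SOL_SOCKET, SO_TYPE, &byte, &len));
}
#endif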
2231 
2232 int
2233 sogetopt(struct socket *so, struct sockopt *sopt)
2234 {
2235 	int	error, optval;
2236 	long	optval_l;
2237 	struct	linger l;
2238 	struct	timeval tv;
2239 #ifdef INET
2240 	struct accept_filter_arg *afap;
2241 #endif
2242 
2243 	error = 0;
2244 	sopt->sopt_dir = SOPT_GET;
2245 	if (sopt->sopt_level != SOL_SOCKET) {
2246 		if (so->so_proto && so->so_proto->pr_ctloutput) {
2247 			return (so_pr_ctloutput(so, sopt));
2248 		} else
2249 			return (ENOPROTOOPT);
2250 	} else {
2251 		switch (sopt->sopt_name) {
2252 #ifdef INET
2253 		case SO_ACCEPTFILTER:
2254 			if ((so->so_options & SO_ACCEPTCONN) == 0)
2255 				return (EINVAL);
2256 			afap = kmalloc(sizeof(*afap), M_TEMP,
2257 				       M_WAITOK | M_ZERO);
2258 			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
2259 				strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
2260 				if (so->so_accf->so_accept_filter_str != NULL)
2261 					strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
2262 			}
2263 			error = sooptcopyout(sopt, afap, sizeof(*afap));
2264 			kfree(afap, M_TEMP);
2265 			break;
2266 #endif /* INET */
2267 
2268 		case SO_LINGER:
2269 			l.l_onoff = so->so_options & SO_LINGER;
2270 			l.l_linger = so->so_linger;
2271 			error = sooptcopyout(sopt, &l, sizeof l);
2272 			break;
2273 
2274 		case SO_USELOOPBACK:
2275 		case SO_DONTROUTE:
2276 		case SO_DEBUG:
2277 		case SO_KEEPALIVE:
2278 		case SO_REUSEADDR:
2279 		case SO_REUSEPORT:
2280 		case SO_BROADCAST:
2281 		case SO_OOBINLINE:
2282 		case SO_TIMESTAMP:
2283 		case SO_NOSIGPIPE:
2284 			optval = so->so_options & sopt->sopt_name;
2285 integer:
2286 			error = sooptcopyout(sopt, &optval, sizeof optval);
2287 			break;
2288 
2289 		case SO_TYPE:
2290 			optval = so->so_type;
2291 			goto integer;
2292 
2293 		case SO_ERROR:
2294 			optval = so->so_error;
2295 			so->so_error = 0;
2296 			goto integer;
2297 
2298 		case SO_SNDBUF:
2299 			optval = so->so_snd.ssb_hiwat;
2300 			goto integer;
2301 
2302 		case SO_RCVBUF:
2303 			optval = so->so_rcv.ssb_hiwat;
2304 			goto integer;
2305 
2306 		case SO_SNDLOWAT:
2307 			optval = so->so_snd.ssb_lowat;
2308 			goto integer;
2309 
2310 		case SO_RCVLOWAT:
2311 			optval = so->so_rcv.ssb_lowat;
2312 			goto integer;
2313 
2314 		case SO_SNDTIMEO:
2315 		case SO_RCVTIMEO:
2316 			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2317 				  so->so_snd.ssb_timeo : so->so_rcv.ssb_timeo);
2318 
2319 			tv.tv_sec = optval / hz;
2320 			tv.tv_usec = (optval % hz) * ustick;
2321 			error = sooptcopyout(sopt, &tv, sizeof tv);
2322 			break;
2323 
2324 		case SO_SNDSPACE:
2325 			optval_l = ssb_space(&so->so_snd);
2326 			error = sooptcopyout(sopt, &optval_l, sizeof(optval_l));
2327 			break;
2328 
2329 		default:
2330 			error = ENOPROTOOPT;
2331 			break;
2332 		}
2333 		return (error);
2334 	}
2335 }
2336 
2337 /* XXX: prepare mbuf for (__FreeBSD__ < 3) routines. */
2338 int
2339 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2340 {
2341 	struct mbuf *m, *m_prev;
2342 	int sopt_size = sopt->sopt_valsize, msize;
2343 
2344 	m = m_getl(sopt_size, sopt->sopt_td ? MB_WAIT : MB_DONTWAIT, MT_DATA,
2345 		   0, &msize);
2346 	if (m == NULL)
2347 		return (ENOBUFS);
2348 	m->m_len = min(msize, sopt_size);
2349 	sopt_size -= m->m_len;
2350 	*mp = m;
2351 	m_prev = m;
2352 
2353 	while (sopt_size > 0) {
2354 		m = m_getl(sopt_size, sopt->sopt_td ? MB_WAIT : MB_DONTWAIT,
2355 			   MT_DATA, 0, &msize);
2356 		if (m == NULL) {
2357 			m_freem(*mp);
2358 			return (ENOBUFS);
2359 		}
2360 		m->m_len = min(msize, sopt_size);
2361 		sopt_size -= m->m_len;
2362 		m_prev->m_next = m;
2363 		m_prev = m;
2364 	}
2365 	return (0);
2366 }
2367 
2368 /* XXX: copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2369 int
2370 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2371 {
2372 	soopt_to_mbuf(sopt, m);
2373 	return 0;
2374 }
2375 
2376 void
2377 soopt_to_mbuf(struct sockopt *sopt, struct mbuf *m)
2378 {
2379 	size_t valsize;
2380 	void *val;
2381 
2382 	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
2383 	KKASSERT(kva_p(m));
2384 	if (sopt->sopt_val == NULL)
2385 		return;
2386 	val = sopt->sopt_val;
2387 	valsize = sopt->sopt_valsize;
2388 	while (m != NULL && valsize >= m->m_len) {
2389 		bcopy(val, mtod(m, char *), m->m_len);
2390 		valsize -= m->m_len;
2391 		val = (caddr_t)val + m->m_len;
2392 		m = m->m_next;
2393 	}
2394 	if (m != NULL) /* should be allocated large enough at ip6_sooptmcopyin() */
2395 		panic("ip6_sooptmcopyin");
2396 }
2397 
2398 /* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2399 int
2400 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2401 {
2402 	return soopt_from_mbuf(sopt, m);
2403 }
2404 
2405 int
2406 soopt_from_mbuf(struct sockopt *sopt, struct mbuf *m)
2407 {
2408 	struct mbuf *m0 = m;
2409 	size_t valsize = 0;
2410 	size_t maxsize;
2411 	void *val;
2412 
2413 	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
2414 	KKASSERT(kva_p(m));
2415 	if (sopt->sopt_val == NULL)
2416 		return 0;
2417 	val = sopt->sopt_val;
2418 	maxsize = sopt->sopt_valsize;
2419 	while (m != NULL && maxsize >= m->m_len) {
2420 		bcopy(mtod(m, char *), val, m->m_len);
2421 		maxsize -= m->m_len;
2422 		val = (caddr_t)val + m->m_len;
2423 		valsize += m->m_len;
2424 		m = m->m_next;
2425 	}
2426 	if (m != NULL) {
2427 		/* the user-land buffer should have been large enough */
2428 		m_freem(m0);
2429 		return (EINVAL);
2430 	}
2431 	sopt->sopt_valsize = valsize;
2432 	return 0;
2433 }
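
/*
 * Editor's sketch, not part of the original source: how the legacy
 * mbuf-based helpers above chain together.  A caller such as the IPv6
 * option code sizes a chain to the option, copies the user value in,
 * lets the protocol rewrite the chain, and copies the result back out.
 */
#if 0
static int
example_mbuf_roundtrip(struct sockopt *sopt)
{
	struct mbuf *m;
	int error;

	error = soopt_getm(sopt, &m);	/* allocate sopt_valsize bytes */
	if (error)
		return (error);
	soopt_to_mbuf(sopt, m);		/* option value -> mbuf chain */
	/* ... protocol examines or rewrites the chain here ... */
	error = soopt_from_mbuf(sopt, m); /* chain -> option value */
	if (error == 0)			/* on EINVAL the chain was freed */
		m_freem(m);
	return (error);
}
#endif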
2434 
2435 void
2436 sohasoutofband(struct socket *so)
2437 {
2438 	if (so->so_sigio != NULL)
2439 		pgsigio(so->so_sigio, SIGURG, 0);
2440 	KNOTE(&so->so_rcv.ssb_kq.ki_note, NOTE_OOB);
2441 }
2442 
2443 int
2444 sokqfilter(struct file *fp, struct knote *kn)
2445 {
2446 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
2447 	struct signalsockbuf *ssb;
2448 
2449 	switch (kn->kn_filter) {
2450 	case EVFILT_READ:
2451 		if (so->so_options & SO_ACCEPTCONN)
2452 			kn->kn_fop = &solisten_filtops;
2453 		else
2454 			kn->kn_fop = &soread_filtops;
2455 		ssb = &so->so_rcv;
2456 		break;
2457 	case EVFILT_WRITE:
2458 		kn->kn_fop = &sowrite_filtops;
2459 		ssb = &so->so_snd;
2460 		break;
2461 	case EVFILT_EXCEPT:
2462 		kn->kn_fop = &soexcept_filtops;
2463 		ssb = &so->so_rcv;
2464 		break;
2465 	default:
2466 		return (EOPNOTSUPP);
2467 	}
2468 
2469 	knote_insert(&ssb->ssb_kq.ki_note, kn);
2470 	atomic_set_int(&ssb->ssb_flags, SSB_KNOTE);
2471 	return (0);
2472 }
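
/*
 * Editor's sketch, not part of the original source: registering the read
 * filter above from userland.  NOTE_LOWAT in fflags, honored by
 * filt_soread() below, makes the event fire only once at least 'lowat'
 * bytes are buffered, instead of the socket's receive low-water mark.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
wait_for_bytes(int kq, int s, int lowat)
{
	struct kevent kev;

	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);
	return (kevent(kq, &kev, 1, &kev, 1, NULL));
}
#endif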
2473 
2474 static void
2475 filt_sordetach(struct knote *kn)
2476 {
2477 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
2478 
2479 	knote_remove(&so->so_rcv.ssb_kq.ki_note, kn);
2480 	if (SLIST_EMPTY(&so->so_rcv.ssb_kq.ki_note))
2481 		atomic_clear_int(&so->so_rcv.ssb_flags, SSB_KNOTE);
2482 }
2483 
2484 /*ARGSUSED*/
2485 static int
2486 filt_soread(struct knote *kn, long hint)
2487 {
2488 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
2489 
2490 	if (kn->kn_sfflags & NOTE_OOB) {
2491 	if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
2492 			kn->kn_fflags |= NOTE_OOB;
2493 			return (1);
2494 		}
2495 		return (0);
2496 	}
2497 	kn->kn_data = so->so_rcv.ssb_cc;
2498 
2499 	if (so->so_state & SS_CANTRCVMORE) {
2500 		/*
2501 		 * Only set NODATA if all data has been exhausted.
2502 		 */
2503 		if (kn->kn_data == 0)
2504 			kn->kn_flags |= EV_NODATA;
2505 		kn->kn_flags |= EV_EOF;
2506 		kn->kn_fflags = so->so_error;
2507 		return (1);
2508 	}
2509 	if (so->so_error)	/* temporary udp error */
2510 		return (1);
2511 	if (kn->kn_sfflags & NOTE_LOWAT)
2512 		return (kn->kn_data >= kn->kn_sdata);
2513 	return ((kn->kn_data >= so->so_rcv.ssb_lowat) ||
2514 		!TAILQ_EMPTY(&so->so_comp));
2515 }
2516 
2517 static void
2518 filt_sowdetach(struct knote *kn)
2519 {
2520 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
2521 
2522 	knote_remove(&so->so_snd.ssb_kq.ki_note, kn);
2523 	if (SLIST_EMPTY(&so->so_snd.ssb_kq.ki_note))
2524 		atomic_clear_int(&so->so_snd.ssb_flags, SSB_KNOTE);
2525 }
2526 
2527 /*ARGSUSED*/
2528 static int
2529 filt_sowrite(struct knote *kn, long hint)
2530 {
2531 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
2532 
2533 	kn->kn_data = ssb_space(&so->so_snd);
2534 	if (so->so_state & SS_CANTSENDMORE) {
2535 		kn->kn_flags |= (EV_EOF | EV_NODATA);
2536 		kn->kn_fflags = so->so_error;
2537 		return (1);
2538 	}
2539 	if (so->so_error)	/* temporary udp error */
2540 		return (1);
2541 	if (((so->so_state & SS_ISCONNECTED) == 0) &&
2542 	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
2543 		return (0);
2544 	if (kn->kn_sfflags & NOTE_LOWAT)
2545 		return (kn->kn_data >= kn->kn_sdata);
2546 	return (kn->kn_data >= so->so_snd.ssb_lowat);
2547 }
2548 
2549 /*ARGSUSED*/
2550 static int
2551 filt_solisten(struct knote *kn, long hint)
2552 {
2553 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
2554 
2555 	kn->kn_data = so->so_qlen;
2556 	return (!TAILQ_EMPTY(&so->so_comp));
2557 }
2558