xref: /dflybsd-src/sys/kern/uipc_socket.c (revision d63cf9941d2a18200388107ed8db010f572a492b)
1 /*
2  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 /*
35  * Copyright (c) 1982, 1986, 1988, 1990, 1993
36  *	The Regents of the University of California.  All rights reserved.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
63  * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.24 2003/11/11 17:18:18 silby Exp $
64  */
65 
66 #include "opt_inet.h"
67 #include "opt_sctp.h"
68 
69 #include <sys/param.h>
70 #include <sys/systm.h>
71 #include <sys/fcntl.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #include <sys/domain.h>
75 #include <sys/file.h>			/* for struct knote */
76 #include <sys/kernel.h>
77 #include <sys/event.h>
78 #include <sys/proc.h>
79 #include <sys/protosw.h>
80 #include <sys/socket.h>
81 #include <sys/socketvar.h>
82 #include <sys/socketops.h>
83 #include <sys/resourcevar.h>
84 #include <sys/signalvar.h>
85 #include <sys/sysctl.h>
86 #include <sys/uio.h>
87 #include <sys/jail.h>
88 #include <vm/vm_zone.h>
89 #include <vm/pmap.h>
90 #include <net/netmsg2.h>
91 #include <net/netisr2.h>
92 
93 #include <sys/thread2.h>
94 #include <sys/socketvar2.h>
95 #include <sys/spinlock2.h>
96 
97 #include <machine/limits.h>
98 
99 #ifdef INET
100 extern int tcp_sosend_agglim;
101 extern int tcp_sosend_async;
102 extern int udp_sosend_async;
103 extern int udp_sosend_prepend;
104 
105 static int	 do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
106 #endif /* INET */
107 
108 static void 	filt_sordetach(struct knote *kn);
109 static int 	filt_soread(struct knote *kn, long hint);
110 static void 	filt_sowdetach(struct knote *kn);
111 static int	filt_sowrite(struct knote *kn, long hint);
112 static int	filt_solisten(struct knote *kn, long hint);
113 
114 static void	sodiscard(struct socket *so);
115 static int	soclose_sync(struct socket *so, int fflag);
116 static void	soclose_fast(struct socket *so);
117 
118 static struct filterops solisten_filtops =
119 	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_solisten };
120 static struct filterops soread_filtops =
121 	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };
122 static struct filterops sowrite_filtops =
123 	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sowdetach, filt_sowrite };
124 static struct filterops soexcept_filtops =
125 	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };
126 
127 MALLOC_DEFINE(M_SOCKET, "socket", "socket struct");
128 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
129 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
130 
131 
132 static int somaxconn = SOMAXCONN;
133 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
134     &somaxconn, 0, "Maximum pending socket connection queue size");
135 
136 static int use_soclose_fast = 1;
137 SYSCTL_INT(_kern_ipc, OID_AUTO, soclose_fast, CTLFLAG_RW,
138     &use_soclose_fast, 0, "Fast socket close");
139 
140 int use_soaccept_pred_fast = 1;
141 SYSCTL_INT(_kern_ipc, OID_AUTO, soaccept_pred_fast, CTLFLAG_RW,
142     &use_soaccept_pred_fast, 0, "Fast socket accept prediction");
143 
144 int use_sendfile_async = 1;
145 SYSCTL_INT(_kern_ipc, OID_AUTO, sendfile_async, CTLFLAG_RW,
146     &use_sendfile_async, 0, "sendfile uses asynchronous pru_send");
147 
148 int use_soconnect_async = 1;
149 SYSCTL_INT(_kern_ipc, OID_AUTO, soconnect_async, CTLFLAG_RW,
150     &use_soconnect_async, 0, "soconnect uses asynchronous pru_connect");
151 
152 int use_rand_initport = 1;
153 SYSCTL_INT(_kern_ipc, OID_AUTO, rand_initport, CTLFLAG_RW,
154     &use_rand_initport, 0, "socket uses random initial msgport");
155 
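/*
 * Illustrative sketch (editor's addition, not part of the original
 * source): the knobs above are ordinary sysctls, so userland can read
 * or flip them with sysctlbyname(3).  The MIB names follow from the
 * SYSCTL_INT() declarations, e.g. kern.ipc.soclose_fast.  The helper
 * function below is hypothetical.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

static int
toggle_soclose_fast(int enable)
{
	int old;
	size_t oldlen = sizeof(old);

	/* Fetch the old value and install the new one in a single call. */
	if (sysctlbyname("kern.ipc.soclose_fast", &old, &oldlen,
	    &enable, sizeof(enable)) == -1)
		return (-1);
	printf("kern.ipc.soclose_fast: %d -> %d\n", old, enable);
	return (0);
}
#endif
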
156 /*
157  * Socket operation routines.
158  * These routines are called by the routines in
159  * sys_socket.c or from a system process, and
160  * implement the semantics of socket operations by
161  * switching out to the protocol specific routines.
162  */
163 
164 /*
165  * Get a socket structure, and initialize it.
166  * Note that it would probably be better to allocate socket
167  * and PCB at the same time, but I'm not convinced that all
168  * the protocols can be easily modified to do this.
169  */
170 struct socket *
171 soalloc(int waitok, struct protosw *pr)
172 {
173 	struct socket *so;
174 	unsigned waitmask;
175 
176 	waitmask = waitok ? M_WAITOK : M_NOWAIT;
177 	so = kmalloc(sizeof(struct socket), M_SOCKET, M_ZERO|waitmask);
178 	if (so) {
179 		/* XXX race condition for reentrant kernel */
180 		so->so_proto = pr;
181 		TAILQ_INIT(&so->so_aiojobq);
182 		TAILQ_INIT(&so->so_rcv.ssb_kq.ki_mlist);
183 		TAILQ_INIT(&so->so_snd.ssb_kq.ki_mlist);
184 		lwkt_token_init(&so->so_rcv.ssb_token, "rcvtok");
185 		lwkt_token_init(&so->so_snd.ssb_token, "sndtok");
186 		spin_init(&so->so_rcvd_spin);
187 		netmsg_init(&so->so_rcvd_msg.base, so, &netisr_adone_rport,
188 		    MSGF_DROPABLE | MSGF_PRIORITY,
189 		    so->so_proto->pr_usrreqs->pru_rcvd);
190 		so->so_rcvd_msg.nm_pru_flags |= PRUR_ASYNC;
191 		so->so_state = SS_NOFDREF;
192 		so->so_refs = 1;
193 	}
194 	return so;
195 }
196 
197 int
198 socreate(int dom, struct socket **aso, int type,
199 	int proto, struct thread *td)
200 {
201 	struct proc *p = td->td_proc;
202 	struct protosw *prp;
203 	struct socket *so;
204 	struct pru_attach_info ai;
205 	int error;
206 
207 	if (proto)
208 		prp = pffindproto(dom, proto, type);
209 	else
210 		prp = pffindtype(dom, type);
211 
212 	if (prp == NULL || prp->pr_usrreqs->pru_attach == 0)
213 		return (EPROTONOSUPPORT);
214 
215 	if (p->p_ucred->cr_prison && jail_socket_unixiproute_only &&
216 	    prp->pr_domain->dom_family != PF_LOCAL &&
217 	    prp->pr_domain->dom_family != PF_INET &&
218 	    prp->pr_domain->dom_family != PF_INET6 &&
219 	    prp->pr_domain->dom_family != PF_ROUTE) {
220 		return (EPROTONOSUPPORT);
221 	}
222 
223 	if (prp->pr_type != type)
224 		return (EPROTOTYPE);
225 	so = soalloc(p != NULL, prp);
226 	if (so == NULL)
227 		return (ENOBUFS);
228 
229 	/*
230 	 * Callers of socreate() presumably will hook the socket up to a
231 	 * file descriptor, calling soclose() if they cannot.  This
232 	 * represents our so_refs (which should be 1) from soalloc().
233 	 */
234 	soclrstate(so, SS_NOFDREF);
235 
236 	/*
237 	 * Set a default port for protocol processing.  No action will occur
238 	 * on the socket on this port until an inpcb is attached to it and
239 	 * is able to match incoming packets, or until the socket becomes
240 	 * available to userland.
241 	 *
242 	 * We normally default the socket to the protocol thread on cpu 0.
243 	 * If PR_SYNC_PORT is set (unix domain sockets) there is no protocol
244 	 * thread and all pr_*()/pru_*() calls are executed synchronously.
245 	 */
246 	if (prp->pr_flags & PR_SYNC_PORT) {
247 		so->so_port = &netisr_sync_port;
248 	} else if (prp->pr_flags & PR_RAND_INITPORT) {
249 		if (use_rand_initport)
250 			so->so_port = netisr_cpuport(mycpuid & ncpus2_mask);
251 		else
252 			so->so_port = netisr_cpuport(0);
253 	} else {
254 		so->so_port = netisr_cpuport(0);
255 	}
256 
257 	TAILQ_INIT(&so->so_incomp);
258 	TAILQ_INIT(&so->so_comp);
259 	so->so_type = type;
260 	so->so_cred = crhold(p->p_ucred);
261 	ai.sb_rlimit = &p->p_rlimit[RLIMIT_SBSIZE];
262 	ai.p_ucred = p->p_ucred;
263 	ai.fd_rdir = p->p_fd->fd_rdir;
264 
265 	/*
266 	 * Auto-sizing of socket buffers is managed by the protocols and
267 	 * the appropriate flags must be set in the pru_attach function.
268 	 */
269 	error = so_pru_attach(so, proto, &ai);
270 	if (error) {
271 		sosetstate(so, SS_NOFDREF);
272 		sofree(so);	/* from soalloc */
273 		return error;
274 	}
275 
276 	/*
277 	 * NOTE: Returns referenced socket.
278 	 */
279 	*aso = so;
280 	return (0);
281 }
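
/*
 * Illustrative sketch (editor's addition): a typical in-kernel
 * consumer of socreate().  Per the NOTE above, socreate() returns a
 * referenced socket; the caller owns that reference and must dispose
 * of it with soclose() if it cannot hook the socket up.  The function
 * and its arguments are hypothetical.
 */
#if 0
static int
example_kernel_socket(struct thread *td, struct sockaddr *nam,
    struct socket **sop)
{
	struct socket *so;
	int error;

	error = socreate(AF_INET, &so, SOCK_DGRAM, IPPROTO_UDP, td);
	if (error)
		return (error);

	error = sobind(so, nam, td);
	if (error) {
		/* Could not hook the socket up; drop the socreate() ref. */
		soclose(so, FNONBLOCK);
		return (error);
	}
	*sop = so;		/* caller now owns the reference */
	return (0);
}
#endif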
282 
283 int
284 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
285 {
286 	int error;
287 
288 	error = so_pru_bind(so, nam, td);
289 	return (error);
290 }
291 
292 static void
293 sodealloc(struct socket *so)
294 {
295 	if (so->so_rcv.ssb_hiwat)
296 		(void)chgsbsize(so->so_cred->cr_uidinfo,
297 		    &so->so_rcv.ssb_hiwat, 0, RLIM_INFINITY);
298 	if (so->so_snd.ssb_hiwat)
299 		(void)chgsbsize(so->so_cred->cr_uidinfo,
300 		    &so->so_snd.ssb_hiwat, 0, RLIM_INFINITY);
301 #ifdef INET
302 	/* remove accept filter if present */
303 	if (so->so_accf != NULL)
304 		do_setopt_accept_filter(so, NULL);
305 #endif /* INET */
306 	crfree(so->so_cred);
307 	if (so->so_faddr != NULL)
308 		kfree(so->so_faddr, M_SONAME);
309 	kfree(so, M_SOCKET);
310 }
311 
312 int
313 solisten(struct socket *so, int backlog, struct thread *td)
314 {
315 	int error;
316 #ifdef SCTP
317 	short oldopt, oldqlimit;
318 #endif /* SCTP */
319 
320 	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))
321 		return (EINVAL);
322 
323 #ifdef SCTP
324 	oldopt = so->so_options;
325 	oldqlimit = so->so_qlimit;
326 #endif /* SCTP */
327 
328 	lwkt_gettoken(&so->so_rcv.ssb_token);
329 	if (TAILQ_EMPTY(&so->so_comp))
330 		so->so_options |= SO_ACCEPTCONN;
331 	lwkt_reltoken(&so->so_rcv.ssb_token);
332 	if (backlog < 0 || backlog > somaxconn)
333 		backlog = somaxconn;
334 	so->so_qlimit = backlog;
335 	/* SCTP needs to tweak both the inbound backlog parameter AND
336 	 * the so_options (the UDP model both connects and accepts
337 	 * inbound connections implicitly).
338 	 */
339 	error = so_pru_listen(so, td);
340 	if (error) {
341 #ifdef SCTP
342 		/* Restore the params */
343 		so->so_options = oldopt;
344 		so->so_qlimit = oldqlimit;
345 #endif /* SCTP */
346 		return (error);
347 	}
348 	return (0);
349 }
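
/*
 * Illustrative sketch (editor's addition): as seen from userland, the
 * clamp above means a listen(2) backlog outside [0, somaxconn] is
 * silently limited to kern.ipc.somaxconn rather than rejected.
 */
#if 0
#include <sys/socket.h>

static int
example_listen(int s)
{
	/* A backlog of -1 (or anything huge) is clamped, not rejected. */
	return (listen(s, -1));
}
#endif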
350 
351 /*
352  * Destroy a disconnected socket.  This routine is a NOP if entities
353  * still have a reference on the socket:
354  *
355  *	so_pcb -	The protocol stack still has a reference
356  *	SS_NOFDREF -	There is no longer a file pointer reference
357  */
358 void
359 sofree(struct socket *so)
360 {
361 	struct socket *head;
362 
363 	/*
364 	 * This is a bit hackish at the moment.  We need to interlock
365 	 * any accept queue we are on before we potentially lose the
366 	 * last reference to avoid races against a re-reference from
367 	 * someone operating on the queue.
368 	 */
369 	while ((head = so->so_head) != NULL) {
370 		lwkt_getpooltoken(head);
371 		if (so->so_head == head)
372 			break;
373 		lwkt_relpooltoken(head);
374 	}
375 
376 	/*
377 	 * Arbitrate the last free.
378 	 */
379 	KKASSERT(so->so_refs > 0);
380 	if (atomic_fetchadd_int(&so->so_refs, -1) != 1) {
381 		if (head)
382 			lwkt_relpooltoken(head);
383 		return;
384 	}
385 
386 	KKASSERT(so->so_pcb == NULL && (so->so_state & SS_NOFDREF));
387 	KKASSERT((so->so_state & SS_ASSERTINPROG) == 0);
388 
389 	/*
390 	 * We're done, remove ourselves from the accept queue we are
391 	 * on, if we are on one.
392 	 */
393 	if (head != NULL) {
394 		if (so->so_state & SS_INCOMP) {
395 			TAILQ_REMOVE(&head->so_incomp, so, so_list);
396 			head->so_incqlen--;
397 		} else if (so->so_state & SS_COMP) {
398 			/*
399 			 * We must not decommission a socket that's
400 			 * on the accept(2) queue.  If we do, then
401 			 * accept(2) may hang after select(2) indicated
402 			 * that the listening socket was ready.
403 			 */
404 			lwkt_relpooltoken(head);
405 			return;
406 		} else {
407 			panic("sofree: not queued");
408 		}
409 		soclrstate(so, SS_INCOMP);
410 		so->so_head = NULL;
411 		lwkt_relpooltoken(head);
412 	}
413 	ssb_release(&so->so_snd, so);
414 	sorflush(so);
415 	sodealloc(so);
416 }
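
/*
 * Illustrative sketch (editor's addition): the last-reference
 * arbitration idiom used by sofree() above.  atomic_fetchadd_int()
 * returns the counter's value from before the decrement, so exactly
 * one of several racing droppers -- the one that observed 1 -- tears
 * the object down.  All names below are hypothetical.
 */
#if 0
static void
example_drop_ref(struct example_obj *obj)
{
	KKASSERT(obj->refs > 0);
	if (atomic_fetchadd_int(&obj->refs, -1) != 1)
		return;		/* other holders remain */

	/* We dropped the last reference; tear the object down. */
	example_destroy(obj);
}
#endif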
417 
418 /*
419  * Close a socket on last file table reference removal.
420  * Initiate disconnect if connected.
421  * Free socket when disconnect complete.
422  */
423 int
424 soclose(struct socket *so, int fflag)
425 {
426 	int error;
427 
428 	funsetown(&so->so_sigio);
429 	if (!use_soclose_fast ||
430 	    (so->so_proto->pr_flags & PR_SYNC_PORT) ||
431 	    ((so->so_state & SS_ISCONNECTED) &&
432 	     (so->so_options & SO_LINGER))) {
433 		error = soclose_sync(so, fflag);
434 	} else {
435 		soclose_fast(so);
436 		error = 0;
437 	}
438 	return error;
439 }
440 
441 static void
442 sodiscard(struct socket *so)
443 {
444 	lwkt_getpooltoken(so);
445 	if (so->so_options & SO_ACCEPTCONN) {
446 		struct socket *sp;
447 
448 		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
449 			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
450 			soclrstate(sp, SS_INCOMP);
451 			sp->so_head = NULL;
452 			so->so_incqlen--;
453 			soaborta(sp);
454 		}
455 		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
456 			TAILQ_REMOVE(&so->so_comp, sp, so_list);
457 			soclrstate(sp, SS_COMP);
458 			sp->so_head = NULL;
459 			so->so_qlen--;
460 			soaborta(sp);
461 		}
462 	}
463 	lwkt_relpooltoken(so);
464 
465 	if (so->so_state & SS_NOFDREF)
466 		panic("soclose: NOFDREF");
467 	sosetstate(so, SS_NOFDREF);	/* take ref */
468 }
469 
470 void
471 soinherit(struct socket *so, struct socket *so_inh)
472 {
473 	TAILQ_HEAD(, socket) comp, incomp;
474 	struct socket *sp;
475 	int qlen, incqlen;
476 
477 	KASSERT(so->so_options & SO_ACCEPTCONN,
478 	    ("so does not accept connection"));
479 	KASSERT(so_inh->so_options & SO_ACCEPTCONN,
480 	    ("so_inh does not accept connection"));
481 
482 	TAILQ_INIT(&comp);
483 	TAILQ_INIT(&incomp);
484 
485 	lwkt_getpooltoken(so);
486 	lwkt_getpooltoken(so_inh);
487 
488 	/*
489 	 * Save the completed and incomplete queues.
490 	 */
491 	TAILQ_CONCAT(&comp, &so->so_comp, so_list);
492 	qlen = so->so_qlen;
493 	so->so_qlen = 0;
494 
495 	TAILQ_CONCAT(&incomp, &so->so_incomp, so_list);
496 	incqlen = so->so_incqlen;
497 	so->so_incqlen = 0;
498 
499 	/*
500 	 * Append the saved completed and incomplete queues to
501 	 * the socket that inherits them.
502 	 *
503 	 * XXX
504 	 * This may temporarily break the inheriting socket's
505 	 * so_qlimit.
506 	 */
507 	TAILQ_FOREACH(sp, &comp, so_list) {
508 		sp->so_head = so_inh;
509 		crfree(sp->so_cred);
510 		sp->so_cred = crhold(so_inh->so_cred);
511 	}
512 
513 	TAILQ_FOREACH(sp, &incomp, so_list) {
514 		sp->so_head = so_inh;
515 		crfree(sp->so_cred);
516 		sp->so_cred = crhold(so_inh->so_cred);
517 	}
518 
519 	TAILQ_CONCAT(&so_inh->so_comp, &comp, so_list);
520 	so_inh->so_qlen += qlen;
521 
522 	TAILQ_CONCAT(&so_inh->so_incomp, &incomp, so_list);
523 	so_inh->so_incqlen += incqlen;
524 
525 	lwkt_relpooltoken(so_inh);
526 	lwkt_relpooltoken(so);
527 
528 	if (qlen) {
529 		/*
530 		 * "New" connections have arrived
531 		 */
532 		sorwakeup(so_inh);
533 		wakeup(&so_inh->so_timeo);
534 	}
535 }
536 
537 static int
538 soclose_sync(struct socket *so, int fflag)
539 {
540 	int error = 0;
541 
542 	if (so->so_pcb == NULL)
543 		goto discard;
544 	if (so->so_state & SS_ISCONNECTED) {
545 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
546 			error = sodisconnect(so);
547 			if (error)
548 				goto drop;
549 		}
550 		if (so->so_options & SO_LINGER) {
551 			if ((so->so_state & SS_ISDISCONNECTING) &&
552 			    (fflag & FNONBLOCK))
553 				goto drop;
554 			while (so->so_state & SS_ISCONNECTED) {
555 				error = tsleep(&so->so_timeo, PCATCH,
556 					       "soclos", so->so_linger * hz);
557 				if (error)
558 					break;
559 			}
560 		}
561 	}
562 drop:
563 	if (so->so_pcb) {
564 		int error2;
565 
566 		error2 = so_pru_detach(so);
567 		if (error == 0)
568 			error = error2;
569 	}
570 discard:
571 	sodiscard(so);
572 	so_pru_sync(so);	/* unpend async sending */
573 	sofree(so);		/* dispose of ref */
574 
575 	return (error);
576 }
577 
578 static void
579 soclose_sofree_async_handler(netmsg_t msg)
580 {
581 	sofree(msg->base.nm_so);
582 }
583 
584 static void
585 soclose_sofree_async(struct socket *so)
586 {
587 	struct netmsg_base *base = &so->so_clomsg;
588 
589 	netmsg_init(base, so, &netisr_apanic_rport, 0,
590 	    soclose_sofree_async_handler);
591 	lwkt_sendmsg(so->so_port, &base->lmsg);
592 }
593 
594 static void
595 soclose_disconn_async_handler(netmsg_t msg)
596 {
597 	struct socket *so = msg->base.nm_so;
598 
599 	if ((so->so_state & SS_ISCONNECTED) &&
600 	    (so->so_state & SS_ISDISCONNECTING) == 0)
601 		so_pru_disconnect_direct(so);
602 
603 	if (so->so_pcb)
604 		so_pru_detach_direct(so);
605 
606 	sodiscard(so);
607 	sofree(so);
608 }
609 
610 static void
611 soclose_disconn_async(struct socket *so)
612 {
613 	struct netmsg_base *base = &so->so_clomsg;
614 
615 	netmsg_init(base, so, &netisr_apanic_rport, 0,
616 	    soclose_disconn_async_handler);
617 	lwkt_sendmsg(so->so_port, &base->lmsg);
618 }
619 
620 static void
621 soclose_detach_async_handler(netmsg_t msg)
622 {
623 	struct socket *so = msg->base.nm_so;
624 
625 	if (so->so_pcb)
626 		so_pru_detach_direct(so);
627 
628 	sodiscard(so);
629 	sofree(so);
630 }
631 
632 static void
633 soclose_detach_async(struct socket *so)
634 {
635 	struct netmsg_base *base = &so->so_clomsg;
636 
637 	netmsg_init(base, so, &netisr_apanic_rport, 0,
638 	    soclose_detach_async_handler);
639 	lwkt_sendmsg(so->so_port, &base->lmsg);
640 }
641 
642 static void
643 soclose_fast(struct socket *so)
644 {
645 	if (so->so_pcb == NULL)
646 		goto discard;
647 
648 	if ((so->so_state & SS_ISCONNECTED) &&
649 	    (so->so_state & SS_ISDISCONNECTING) == 0) {
650 		soclose_disconn_async(so);
651 		return;
652 	}
653 
654 	if (so->so_pcb) {
655 		soclose_detach_async(so);
656 		return;
657 	}
658 
659 discard:
660 	sodiscard(so);
661 	soclose_sofree_async(so);
662 }
663 
664 /*
665  * Abort and destroy a socket.  Only one abort can be in progress
666  * at any given moment.
667  */
668 void
669 soabort(struct socket *so)
670 {
671 	soreference(so);
672 	so_pru_abort(so);
673 }
674 
675 void
676 soaborta(struct socket *so)
677 {
678 	soreference(so);
679 	so_pru_aborta(so);
680 }
681 
682 void
683 soabort_oncpu(struct socket *so)
684 {
685 	soreference(so);
686 	so_pru_abort_oncpu(so);
687 }
688 
689 /*
690  * The socket is passed in with a reference held; that reference
691  * becomes owned by the cleared SS_NOFDREF flag.
692  */
693 void
694 soaccept_generic(struct socket *so)
695 {
696 	if ((so->so_state & SS_NOFDREF) == 0)
697 		panic("soaccept: !NOFDREF");
698 	soclrstate(so, SS_NOFDREF);	/* owned by lack of SS_NOFDREF */
699 }
700 
701 int
702 soaccept(struct socket *so, struct sockaddr **nam)
703 {
704 	int error;
705 
706 	soaccept_generic(so);
707 	error = so_pru_accept(so, nam);
708 	return (error);
709 }
710 
711 int
712 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td,
713     boolean_t sync)
714 {
715 	int error;
716 
717 	if (so->so_options & SO_ACCEPTCONN)
718 		return (EOPNOTSUPP);
719 	/*
720 	 * If protocol is connection-based, can only connect once.
721 	 * Otherwise, if connected, try to disconnect first.
722 	 * This allows user to disconnect by connecting to, e.g.,
723 	 * a null address.
724 	 */
725 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
726 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
727 	    (error = sodisconnect(so)))) {
728 		error = EISCONN;
729 	} else {
730 		/*
731 		 * Prevent accumulated error from previous connection
732 		 * from biting us.
733 		 */
734 		so->so_error = 0;
735 		if (!sync && so->so_proto->pr_usrreqs->pru_preconnect)
736 			error = so_pru_connect_async(so, nam, td);
737 		else
738 			error = so_pru_connect(so, nam, td);
739 	}
740 	return (error);
741 }
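
/*
 * Illustrative sketch (editor's addition): the "disconnect by
 * connecting to a null address" path mentioned above, as driven from
 * userland.  Connecting a datagram socket to an AF_UNSPEC address
 * dissolves the existing association via the sodisconnect() branch
 * instead of failing with EISCONN.
 */
#if 0
#include <sys/socket.h>
#include <string.h>

static int
example_dissolve(int s)
{
	struct sockaddr sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_family = AF_UNSPEC;
	return (connect(s, &sa, sizeof(sa)));
}
#endif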
742 
743 int
744 soconnect2(struct socket *so1, struct socket *so2)
745 {
746 	int error;
747 
748 	error = so_pru_connect2(so1, so2);
749 	return (error);
750 }
751 
752 int
753 sodisconnect(struct socket *so)
754 {
755 	int error;
756 
757 	if ((so->so_state & SS_ISCONNECTED) == 0) {
758 		error = ENOTCONN;
759 		goto bad;
760 	}
761 	if (so->so_state & SS_ISDISCONNECTING) {
762 		error = EALREADY;
763 		goto bad;
764 	}
765 	error = so_pru_disconnect(so);
766 bad:
767 	return (error);
768 }
769 
770 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
771 /*
772  * Send on a socket.
773  * If send must go all at once and message is larger than
774  * send buffering, then hard error.
775  * Lock against other senders.
776  * If must go all at once and not enough room now, then
777  * inform user that this would block and do nothing.
778  * Otherwise, if nonblocking, send as much as possible.
779  * The data to be sent is described by "uio" if nonzero,
780  * otherwise by the mbuf chain "top" (which must be null
781  * if uio is not).  Data provided in mbuf chain must be small
782  * enough to send all at once.
783  *
784  * Returns nonzero on error, timeout or signal; callers
785  * must check for short counts if EINTR/ERESTART are returned.
786  * Data and control buffers are freed on return.
787  */
788 int
789 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
790 	struct mbuf *top, struct mbuf *control, int flags,
791 	struct thread *td)
792 {
793 	struct mbuf **mp;
794 	struct mbuf *m;
795 	size_t resid;
796 	int space, len;
797 	int clen = 0, error, dontroute, mlen;
798 	int atomic = sosendallatonce(so) || top;
799 	int pru_flags;
800 
801 	if (uio) {
802 		resid = uio->uio_resid;
803 	} else {
804 		resid = (size_t)top->m_pkthdr.len;
805 #ifdef INVARIANTS
806 		len = 0;
807 		for (m = top; m; m = m->m_next)
808 			len += m->m_len;
809 		KKASSERT(top->m_pkthdr.len == len);
810 #endif
811 	}
812 
813 	/*
814 	 * WARNING!  resid is unsigned, space and len are signed.  space
815 	 * 	     can wind up negative if the sockbuf is overcommitted.
816 	 *
817 	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
818 	 * type sockets since that's an error.
819 	 */
820 	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
821 		error = EINVAL;
822 		goto out;
823 	}
824 
825 	dontroute =
826 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
827 	    (so->so_proto->pr_flags & PR_ATOMIC);
828 	if (td->td_lwp != NULL)
829 		td->td_lwp->lwp_ru.ru_msgsnd++;
830 	if (control)
831 		clen = control->m_len;
832 #define	gotoerr(errcode)	{ error = errcode; goto release; }
833 
834 restart:
835 	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
836 	if (error)
837 		goto out;
838 
839 	do {
840 		if (so->so_state & SS_CANTSENDMORE)
841 			gotoerr(EPIPE);
842 		if (so->so_error) {
843 			error = so->so_error;
844 			so->so_error = 0;
845 			goto release;
846 		}
847 		if ((so->so_state & SS_ISCONNECTED) == 0) {
848 			/*
849 			 * `sendto' and `sendmsg' are allowed on a connection-
850 			 * based socket if it supports implied connect.
851 			 * Return ENOTCONN if not connected and no address is
852 			 * supplied.
853 			 */
854 			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
855 			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
856 				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
857 				    !(resid == 0 && clen != 0))
858 					gotoerr(ENOTCONN);
859 			} else if (addr == NULL)
860 			    gotoerr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
861 				   ENOTCONN : EDESTADDRREQ);
862 		}
863 		if ((atomic && resid > so->so_snd.ssb_hiwat) ||
864 		    clen > so->so_snd.ssb_hiwat) {
865 			gotoerr(EMSGSIZE);
866 		}
867 		space = ssb_space(&so->so_snd);
868 		if (flags & MSG_OOB)
869 			space += 1024;
870 		if ((space < 0 || (size_t)space < resid + clen) && uio &&
871 		    (atomic || space < so->so_snd.ssb_lowat || space < clen)) {
872 			if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
873 				gotoerr(EWOULDBLOCK);
874 			ssb_unlock(&so->so_snd);
875 			error = ssb_wait(&so->so_snd);
876 			if (error)
877 				goto out;
878 			goto restart;
879 		}
880 		mp = &top;
881 		space -= clen;
882 		do {
883 		    if (uio == NULL) {
884 			/*
885 			 * Data is prepackaged in "top".
886 			 */
887 			resid = 0;
888 			if (flags & MSG_EOR)
889 				top->m_flags |= M_EOR;
890 		    } else do {
891 			if (resid > INT_MAX)
892 				resid = INT_MAX;
893 			m = m_getl((int)resid, MB_WAIT, MT_DATA,
894 				   top == NULL ? M_PKTHDR : 0, &mlen);
895 			if (top == NULL) {
896 				m->m_pkthdr.len = 0;
897 				m->m_pkthdr.rcvif = NULL;
898 			}
899 			len = imin((int)szmin(mlen, resid), space);
900 			if (resid < MINCLSIZE) {
901 				/*
902 				 * For datagram protocols, leave room
903 				 * for protocol headers in first mbuf.
904 				 */
905 				if (atomic && top == NULL && len < mlen)
906 					MH_ALIGN(m, len);
907 			}
908 			space -= len;
909 			error = uiomove(mtod(m, caddr_t), (size_t)len, uio);
910 			resid = uio->uio_resid;
911 			m->m_len = len;
912 			*mp = m;
913 			top->m_pkthdr.len += len;
914 			if (error)
915 				goto release;
916 			mp = &m->m_next;
917 			if (resid == 0) {
918 				if (flags & MSG_EOR)
919 					top->m_flags |= M_EOR;
920 				break;
921 			}
922 		    } while (space > 0 && atomic);
923 		    if (dontroute)
924 			    so->so_options |= SO_DONTROUTE;
925 		    if (flags & MSG_OOB) {
926 		    	    pru_flags = PRUS_OOB;
927 		    } else if ((flags & MSG_EOF) &&
928 		    	       (so->so_proto->pr_flags & PR_IMPLOPCL) &&
929 			       (resid == 0)) {
930 			    /*
931 			     * If the user set MSG_EOF, the protocol
932 			     * understands this flag, and nothing is left to
933 			     * send, then use PRU_SEND_EOF instead of PRU_SEND.
934 			     */
935 		    	    pru_flags = PRUS_EOF;
936 		    } else if (resid > 0 && space > 0) {
937 			    /* If there is more to send, set PRUS_MORETOCOME */
938 		    	    pru_flags = PRUS_MORETOCOME;
939 		    } else {
940 		    	    pru_flags = 0;
941 		    }
942 		    /*
943 		     * XXX all the SS_CANTSENDMORE checks previously
944 		     * done could be out of date.  We could have received
945 		     * a reset packet in an interrupt or maybe we slept
946 		     * while doing page faults in uiomove() etc. We could
947 		     * probably recheck again inside the splnet() protection
948 		     * here, but there are probably other places that this
949 		     * also happens.  We must rethink this.
950 		     */
951 		    error = so_pru_send(so, pru_flags, top, addr, control, td);
952 		    if (dontroute)
953 			    so->so_options &= ~SO_DONTROUTE;
954 		    clen = 0;
955 		    control = NULL;
956 		    top = NULL;
957 		    mp = &top;
958 		    if (error)
959 			    goto release;
960 		} while (resid && space > 0);
961 	} while (resid);
962 
963 release:
964 	ssb_unlock(&so->so_snd);
965 out:
966 	if (top)
967 		m_freem(top);
968 	if (control)
969 		m_freem(control);
970 	return (error);
971 }
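
/*
 * Illustrative sketch (editor's addition): the short-count caveat in
 * the comment above, from the caller's side of send(2).  A signal may
 * interrupt a stream send after some bytes were queued, so a robust
 * caller resumes from the short count instead of treating EINTR as a
 * hard failure.  The helper below is hypothetical.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>

static ssize_t
example_send_all(int s, const char *buf, size_t len)
{
	size_t off = 0;

	while (off < len) {
		ssize_t n = send(s, buf + off, len - off, 0);

		if (n >= 0) {
			off += n;	/* short count: resume from here */
			continue;
		}
		if (errno == EINTR)
			continue;	/* interrupted before any transfer */
		return (-1);
	}
	return ((ssize_t)off);
}
#endif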
972 
973 #ifdef INET
974 /*
975  * A specialization of sosend() for UDP based on protocol-specific knowledge:
976  *   so->so_proto->pr_flags has the PR_ATOMIC field set.  This means that
977  *	sosendallatonce() returns true,
978  *	the "atomic" variable is true,
979  *	and sosendudp() blocks until space is available for the entire send.
980  *   so->so_proto->pr_flags does not have the PR_CONNREQUIRED or
981  *	PR_IMPLOPCL flags set.
982  *   UDP has no out-of-band data.
983  *   UDP has no control data.
984  *   UDP does not support MSG_EOR.
985  */
986 int
987 sosendudp(struct socket *so, struct sockaddr *addr, struct uio *uio,
988 	  struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
989 {
990 	size_t resid;
991 	int error, pru_flags = 0;
992 	int space;
993 
994 	if (td->td_lwp != NULL)
995 		td->td_lwp->lwp_ru.ru_msgsnd++;
996 	if (control)
997 		m_freem(control);
998 
999 	KASSERT((uio && !top) || (top && !uio), ("bad arguments to sosendudp"));
1000 	resid = uio ? uio->uio_resid : (size_t)top->m_pkthdr.len;
1001 
1002 restart:
1003 	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
1004 	if (error)
1005 		goto out;
1006 
1007 	if (so->so_state & SS_CANTSENDMORE)
1008 		gotoerr(EPIPE);
1009 	if (so->so_error) {
1010 		error = so->so_error;
1011 		so->so_error = 0;
1012 		goto release;
1013 	}
1014 	if (!(so->so_state & SS_ISCONNECTED) && addr == NULL)
1015 		gotoerr(EDESTADDRREQ);
1016 	if (resid > so->so_snd.ssb_hiwat)
1017 		gotoerr(EMSGSIZE);
1018 	space = ssb_space(&so->so_snd);
1019 	if (uio && (space < 0 || (size_t)space < resid)) {
1020 		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
1021 			gotoerr(EWOULDBLOCK);
1022 		ssb_unlock(&so->so_snd);
1023 		error = ssb_wait(&so->so_snd);
1024 		if (error)
1025 			goto out;
1026 		goto restart;
1027 	}
1028 
1029 	if (uio) {
1030 		int hdrlen = max_hdr;
1031 
1032 		/*
1033 		 * We try to optimize out the additional mbuf
1034 		 * allocations in M_PREPEND() on output path, e.g.
1035 		 * - udp_output(), when it tries to prepend protocol
1036 		 *   headers.
1037 		 * - Link layer output function, when it tries to
1038 		 *   prepend link layer header.
1039 		 *
1040 		 * This probably will not benefit any data that will
1041 		 * be fragmented, so this optimization is only performed
1042 		 * when the size of data and max size of protocol+link
1043 		 * headers fit into one mbuf cluster.
1044 		 */
1045 		if (uio->uio_resid > MCLBYTES - hdrlen ||
1046 		    !udp_sosend_prepend) {
1047 			top = m_uiomove(uio);
1048 			if (top == NULL)
1049 				goto release;
1050 		} else {
1051 			int nsize;
1052 
1053 			top = m_getl(uio->uio_resid + hdrlen, MB_WAIT,
1054 			    MT_DATA, M_PKTHDR, &nsize);
1055 			KASSERT(nsize >= uio->uio_resid + hdrlen,
1056 			    ("sosendudp invalid nsize %d, "
1057 			     "resid %zu, hdrlen %d",
1058 			     nsize, uio->uio_resid, hdrlen));
1059 
1060 			top->m_len = uio->uio_resid;
1061 			top->m_pkthdr.len = uio->uio_resid;
1062 			top->m_data += hdrlen;
1063 
1064 			error = uiomove(mtod(top, caddr_t), top->m_len, uio);
1065 			if (error)
1066 				goto out;
1067 		}
1068 	}
1069 
1070 	if (flags & MSG_DONTROUTE)
1071 		pru_flags |= PRUS_DONTROUTE;
1072 
1073 	if (udp_sosend_async && (flags & MSG_SYNC) == 0) {
1074 		so_pru_send_async(so, pru_flags, top, addr, NULL, td);
1075 		error = 0;
1076 	} else {
1077 		error = so_pru_send(so, pru_flags, top, addr, NULL, td);
1078 	}
1079 	top = NULL;		/* sent or freed in lower layer */
1080 
1081 release:
1082 	ssb_unlock(&so->so_snd);
1083 out:
1084 	if (top)
1085 		m_freem(top);
1086 	return (error);
1087 }
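
/*
 * Illustrative sketch (editor's addition): the headroom optimization
 * described in the comment above.  Reserving max_hdr bytes in front
 * of the payload lets udp_output() and the link layer prepend their
 * headers into the same cluster, so M_PREPEND() need not allocate
 * another mbuf.  payload_len below is a hypothetical variable.
 */
#if 0
	struct mbuf *m;
	int nsize;

	m = m_getl(payload_len + max_hdr, MB_WAIT, MT_DATA, M_PKTHDR, &nsize);
	m->m_len = payload_len;
	m->m_pkthdr.len = payload_len;
	m->m_data += max_hdr;	/* headroom for protocol + link headers */
#endif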
1088 
1089 int
1090 sosendtcp(struct socket *so, struct sockaddr *addr, struct uio *uio,
1091 	struct mbuf *top, struct mbuf *control, int flags,
1092 	struct thread *td)
1093 {
1094 	struct mbuf **mp;
1095 	struct mbuf *m;
1096 	size_t resid;
1097 	int space, len;
1098 	int error, mlen;
1099 	int allatonce;
1100 	int pru_flags;
1101 
1102 	if (uio) {
1103 		KKASSERT(top == NULL);
1104 		allatonce = 0;
1105 		resid = uio->uio_resid;
1106 	} else {
1107 		allatonce = 1;
1108 		resid = (size_t)top->m_pkthdr.len;
1109 #ifdef INVARIANTS
1110 		len = 0;
1111 		for (m = top; m; m = m->m_next)
1112 			len += m->m_len;
1113 		KKASSERT(top->m_pkthdr.len == len);
1114 #endif
1115 	}
1116 
1117 	/*
1118 	 * WARNING!  resid is unsigned, space and len are signed.  space
1119 	 * 	     can wind up negative if the sockbuf is overcommitted.
1120 	 *
1121 	 * Also check to make sure that MSG_EOR isn't used on TCP
1122 	 */
1123 	if (flags & MSG_EOR) {
1124 		error = EINVAL;
1125 		goto out;
1126 	}
1127 
1128 	if (control) {
1129 		/* TCP doesn't do control messages (rights, creds, etc) */
1130 		if (control->m_len) {
1131 			error = EINVAL;
1132 			goto out;
1133 		}
1134 		m_freem(control);	/* empty control, just free it */
1135 		control = NULL;
1136 	}
1137 
1138 	if (td->td_lwp != NULL)
1139 		td->td_lwp->lwp_ru.ru_msgsnd++;
1140 
1141 #define	gotoerr(errcode)	{ error = errcode; goto release; }
1142 
1143 restart:
1144 	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
1145 	if (error)
1146 		goto out;
1147 
1148 	do {
1149 		if (so->so_state & SS_CANTSENDMORE)
1150 			gotoerr(EPIPE);
1151 		if (so->so_error) {
1152 			error = so->so_error;
1153 			so->so_error = 0;
1154 			goto release;
1155 		}
1156 		if ((so->so_state & SS_ISCONNECTED) == 0 &&
1157 		    (so->so_state & SS_ISCONFIRMING) == 0)
1158 			gotoerr(ENOTCONN);
1159 		if (allatonce && resid > so->so_snd.ssb_hiwat)
1160 			gotoerr(EMSGSIZE);
1161 
1162 		space = ssb_space_prealloc(&so->so_snd);
1163 		if (flags & MSG_OOB)
1164 			space += 1024;
1165 		if ((space < 0 || (size_t)space < resid) && !allatonce &&
1166 		    space < so->so_snd.ssb_lowat) {
1167 			if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
1168 				gotoerr(EWOULDBLOCK);
1169 			ssb_unlock(&so->so_snd);
1170 			error = ssb_wait(&so->so_snd);
1171 			if (error)
1172 				goto out;
1173 			goto restart;
1174 		}
1175 		mp = &top;
1176 		do {
1177 		    int cnt = 0, async = 0;
1178 
1179 		    if (uio == NULL) {
1180 			/*
1181 			 * Data is prepackaged in "top".
1182 			 */
1183 			resid = 0;
1184 		    } else do {
1185 			if (resid > INT_MAX)
1186 				resid = INT_MAX;
1187 			m = m_getl((int)resid, MB_WAIT, MT_DATA,
1188 				   top == NULL ? M_PKTHDR : 0, &mlen);
1189 			if (top == NULL) {
1190 				m->m_pkthdr.len = 0;
1191 				m->m_pkthdr.rcvif = NULL;
1192 			}
1193 			len = imin((int)szmin(mlen, resid), space);
1194 			space -= len;
1195 			error = uiomove(mtod(m, caddr_t), (size_t)len, uio);
1196 			resid = uio->uio_resid;
1197 			m->m_len = len;
1198 			*mp = m;
1199 			top->m_pkthdr.len += len;
1200 			if (error)
1201 				goto release;
1202 			mp = &m->m_next;
1203 			if (resid == 0)
1204 				break;
1205 			++cnt;
1206 		    } while (space > 0 && cnt < tcp_sosend_agglim);
1207 
1208 		    if (tcp_sosend_async)
1209 			    async = 1;
1210 
1211 		    if (flags & MSG_OOB) {
1212 		    	    pru_flags = PRUS_OOB;
1213 			    async = 0;
1214 		    } else if ((flags & MSG_EOF) && resid == 0) {
1215 			    pru_flags = PRUS_EOF;
1216 		    } else if (resid > 0 && space > 0) {
1217 			    /* If there is more to send, set PRUS_MORETOCOME */
1218 		    	    pru_flags = PRUS_MORETOCOME;
1219 			    async = 1;
1220 		    } else {
1221 		    	    pru_flags = 0;
1222 		    }
1223 
1224 		    if (flags & MSG_SYNC)
1225 			    async = 0;
1226 
1227 		    /*
1228 		     * XXX all the SS_CANTSENDMORE checks previously
1229 		     * done could be out of date.  We could have received
1230 		     * a reset packet in an interrupt or maybe we slept
1231 		     * while doing page faults in uiomove() etc. We could
1232 		     * probably recheck again inside the splnet() protection
1233 		     * here, but there are probably other places that this
1234 		     * also happens.  We must rethink this.
1235 		     */
1236 		    for (m = top; m; m = m->m_next)
1237 			    ssb_preallocstream(&so->so_snd, m);
1238 		    if (!async) {
1239 			    error = so_pru_send(so, pru_flags, top,
1240 			        NULL, NULL, td);
1241 		    } else {
1242 			    so_pru_send_async(so, pru_flags, top,
1243 			        NULL, NULL, td);
1244 			    error = 0;
1245 		    }
1246 
1247 		    top = NULL;
1248 		    mp = &top;
1249 		    if (error)
1250 			    goto release;
1251 		} while (resid && space > 0);
1252 	} while (resid);
1253 
1254 release:
1255 	ssb_unlock(&so->so_snd);
1256 out:
1257 	if (top)
1258 		m_freem(top);
1259 	if (control)
1260 		m_freem(control);
1261 	return (error);
1262 }
1263 #endif
1264 
1265 /*
1266  * Implement receive operations on a socket.
1267  *
1268  * We depend on the way that records are added to the signalsockbuf
1269  * by sbappend*.  In particular, each record (mbufs linked through m_next)
1270  * must begin with an address if the protocol so specifies,
1271  * followed by an optional mbuf or mbufs containing ancillary data,
1272  * and then zero or more mbufs of data.
1273  *
1274  * Although the signalsockbuf is locked, new data may still be appended.
1275  * A token inside the ssb_lock deals with MP issues and still allows
1276  * the network to access the socket if we block in a uio.
1277  *
1278  * The caller may receive the data as a single mbuf chain by supplying
1279  * an mbuf **mp0 for use in returning the chain.  The uio is then used
1280  * only for the count in uio_resid.
1281  */
1282 int
1283 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
1284 	  struct sockbuf *sio, struct mbuf **controlp, int *flagsp)
1285 {
1286 	struct mbuf *m, *n;
1287 	struct mbuf *free_chain = NULL;
1288 	int flags, len, error, offset;
1289 	struct protosw *pr = so->so_proto;
1290 	int moff, type = 0;
1291 	size_t resid, orig_resid;
1292 
1293 	if (uio)
1294 		resid = uio->uio_resid;
1295 	else
1296 		resid = (size_t)(sio->sb_climit - sio->sb_cc);
1297 	orig_resid = resid;
1298 
1299 	if (psa)
1300 		*psa = NULL;
1301 	if (controlp)
1302 		*controlp = NULL;
1303 	if (flagsp)
1304 		flags = *flagsp &~ MSG_EOR;
1305 	else
1306 		flags = 0;
1307 	if (flags & MSG_OOB) {
1308 		m = m_get(MB_WAIT, MT_DATA);
1309 		if (m == NULL)
1310 			return (ENOBUFS);
1311 		error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
1312 		if (error)
1313 			goto bad;
1314 		if (sio) {
1315 			do {
1316 				sbappend(sio, m);
1317 				KKASSERT(resid >= (size_t)m->m_len);
1318 				resid -= (size_t)m->m_len;
1319 			} while (resid > 0 && m);
1320 		} else {
1321 			do {
1322 				uio->uio_resid = resid;
1323 				error = uiomove(mtod(m, caddr_t),
1324 						(int)szmin(resid, m->m_len),
1325 						uio);
1326 				resid = uio->uio_resid;
1327 				m = m_free(m);
1328 			} while (uio->uio_resid && error == 0 && m);
1329 		}
1330 bad:
1331 		if (m)
1332 			m_freem(m);
1333 		return (error);
1334 	}
1335 	if ((so->so_state & SS_ISCONFIRMING) && resid)
1336 		so_pru_rcvd(so, 0);
1337 
1338 	/*
1339 	 * The token interlocks against the protocol thread while
1340 	 * ssb_lock is a blocking lock against other userland entities.
1341 	 */
1342 	lwkt_gettoken(&so->so_rcv.ssb_token);
1343 restart:
1344 	error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags));
1345 	if (error)
1346 		goto done;
1347 
1348 	m = so->so_rcv.ssb_mb;
1349 	/*
1350 	 * If we have less data than requested, block awaiting more
1351 	 * (subject to any timeout) if:
1352 	 *   1. the current count is less than the low water mark, or
1353 	 *   2. MSG_WAITALL is set, and it is possible to do the entire
1354 	 *	receive operation at once if we block (resid <= hiwat).
1355 	 *	receive operation at once if we block (resid <= hiwat), and
1356 	 *   3. MSG_DONTWAIT is not set.
1357 	 * we have to do the receive in sections, and thus risk returning
1358 	 * a short count if a timeout or signal occurs after we start.
1359 	 */
1360 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1361 	    (size_t)so->so_rcv.ssb_cc < resid) &&
1362 	    (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat ||
1363 	    ((flags & MSG_WAITALL) && resid <= (size_t)so->so_rcv.ssb_hiwat)) &&
1364 	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
1365 		KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1"));
1366 		if (so->so_error) {
1367 			if (m)
1368 				goto dontblock;
1369 			error = so->so_error;
1370 			if ((flags & MSG_PEEK) == 0)
1371 				so->so_error = 0;
1372 			goto release;
1373 		}
1374 		if (so->so_state & SS_CANTRCVMORE) {
1375 			if (m)
1376 				goto dontblock;
1377 			else
1378 				goto release;
1379 		}
1380 		for (; m; m = m->m_next) {
1381 			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1382 				m = so->so_rcv.ssb_mb;
1383 				goto dontblock;
1384 			}
1385 		}
1386 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1387 		    (pr->pr_flags & PR_CONNREQUIRED)) {
1388 			error = ENOTCONN;
1389 			goto release;
1390 		}
1391 		if (resid == 0)
1392 			goto release;
1393 		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) {
1394 			error = EWOULDBLOCK;
1395 			goto release;
1396 		}
1397 		ssb_unlock(&so->so_rcv);
1398 		error = ssb_wait(&so->so_rcv);
1399 		if (error)
1400 			goto done;
1401 		goto restart;
1402 	}
1403 dontblock:
1404 	if (uio && uio->uio_td && uio->uio_td->td_proc)
1405 		uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++;
1406 
1407 	/*
1408 	 * note: m should be == sb_mb here.  Cache the next record while
1409 	 * cleaning up.  Note that calling m_free*() will break out of the
1410 	 * critical section.
1411 	 */
1412 	KKASSERT(m == so->so_rcv.ssb_mb);
1413 
1414 	/*
1415 	 * Skip any address mbufs prepending the record.
1416 	 */
1417 	if (pr->pr_flags & PR_ADDR) {
1418 		KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
1419 		orig_resid = 0;
1420 		if (psa)
1421 			*psa = dup_sockaddr(mtod(m, struct sockaddr *));
1422 		if (flags & MSG_PEEK)
1423 			m = m->m_next;
1424 		else
1425 			m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
1426 	}
1427 
1428 	/*
1429 	 * Skip any control mbufs prepending the record.
1430 	 */
1431 #ifdef SCTP
1432 	if (pr->pr_flags & PR_ADDR_OPT) {
1433 		/*
1434 		 * For SCTP we may be getting a
1435 		 * whole message OR a partial delivery.
1436 		 */
1437 		if (m && m->m_type == MT_SONAME) {
1438 			orig_resid = 0;
1439 			if (psa)
1440 				*psa = dup_sockaddr(mtod(m, struct sockaddr *));
1441 			if (flags & MSG_PEEK)
1442 				m = m->m_next;
1443 			else
1444 				m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
1445 		}
1446 	}
1447 #endif /* SCTP */
1448 	while (m && m->m_type == MT_CONTROL && error == 0) {
1449 		if (flags & MSG_PEEK) {
1450 			if (controlp)
1451 				*controlp = m_copy(m, 0, m->m_len);
1452 			m = m->m_next;	/* XXX race */
1453 		} else {
1454 			if (controlp) {
1455 				n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
1456 				if (pr->pr_domain->dom_externalize &&
1457 				    mtod(m, struct cmsghdr *)->cmsg_type ==
1458 				    SCM_RIGHTS)
1459 				   error = (*pr->pr_domain->dom_externalize)(m);
1460 				*controlp = m;
1461 				m = n;
1462 			} else {
1463 				m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
1464 			}
1465 		}
1466 		if (controlp && *controlp) {
1467 			orig_resid = 0;
1468 			controlp = &(*controlp)->m_next;
1469 		}
1470 	}
1471 
1472 	/*
1473 	 * flag OOB data.
1474 	 */
1475 	if (m) {
1476 		type = m->m_type;
1477 		if (type == MT_OOBDATA)
1478 			flags |= MSG_OOB;
1479 	}
1480 
1481 	/*
1482 	 * Copy to the UIO or mbuf return chain (*mp).
1483 	 */
1484 	moff = 0;
1485 	offset = 0;
1486 	while (m && resid > 0 && error == 0) {
1487 		if (m->m_type == MT_OOBDATA) {
1488 			if (type != MT_OOBDATA)
1489 				break;
1490 		} else if (type == MT_OOBDATA)
1491 			break;
1492 		else
1493 		    KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
1494 			("receive 3"));
1495 		soclrstate(so, SS_RCVATMARK);
1496 		len = (resid > INT_MAX) ? INT_MAX : resid;
1497 		if (so->so_oobmark && len > so->so_oobmark - offset)
1498 			len = so->so_oobmark - offset;
1499 		if (len > m->m_len - moff)
1500 			len = m->m_len - moff;
1501 
1502 		/*
1503 		 * Copy out to the UIO or pass the mbufs back to the SIO.
1504 		 * The SIO is dealt with when we eat the mbuf, but deal
1505 		 * with the resid here either way.
1506 		 */
1507 		if (uio) {
1508 			uio->uio_resid = resid;
1509 			error = uiomove(mtod(m, caddr_t) + moff, len, uio);
1510 			resid = uio->uio_resid;
1511 			if (error)
1512 				goto release;
1513 		} else {
1514 			resid -= (size_t)len;
1515 		}
1516 
1517 		/*
1518 		 * Eat the entire mbuf or just a piece of it
1519 		 */
1520 		if (len == m->m_len - moff) {
1521 			if (m->m_flags & M_EOR)
1522 				flags |= MSG_EOR;
1523 #ifdef SCTP
1524 			if (m->m_flags & M_NOTIFICATION)
1525 				flags |= MSG_NOTIFICATION;
1526 #endif /* SCTP */
1527 			if (flags & MSG_PEEK) {
1528 				m = m->m_next;
1529 				moff = 0;
1530 			} else {
1531 				if (sio) {
1532 					n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
1533 					sbappend(sio, m);
1534 					m = n;
1535 				} else {
1536 					m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
1537 				}
1538 			}
1539 		} else {
1540 			if (flags & MSG_PEEK) {
1541 				moff += len;
1542 			} else {
1543 				if (sio) {
1544 					n = m_copym(m, 0, len, MB_WAIT);
1545 					if (n)
1546 						sbappend(sio, n);
1547 				}
1548 				m->m_data += len;
1549 				m->m_len -= len;
1550 				so->so_rcv.ssb_cc -= len;
1551 			}
1552 		}
1553 		if (so->so_oobmark) {
1554 			if ((flags & MSG_PEEK) == 0) {
1555 				so->so_oobmark -= len;
1556 				if (so->so_oobmark == 0) {
1557 					sosetstate(so, SS_RCVATMARK);
1558 					break;
1559 				}
1560 			} else {
1561 				offset += len;
1562 				if (offset == so->so_oobmark)
1563 					break;
1564 			}
1565 		}
1566 		if (flags & MSG_EOR)
1567 			break;
1568 		/*
1569 		 * If the MSG_WAITALL flag is set (for non-atomic socket),
1570 		 * we must not quit until resid == 0 or an error
1571 		 * termination.  If a signal/timeout occurs, return
1572 		 * with a short count but without error.
1573 		 * Keep signalsockbuf locked against other readers.
1574 		 */
1575 		while ((flags & MSG_WAITALL) && m == NULL &&
1576 		       resid > 0 && !sosendallatonce(so) &&
1577 		       so->so_rcv.ssb_mb == NULL) {
1578 			if (so->so_error || so->so_state & SS_CANTRCVMORE)
1579 				break;
1580 			/*
1581 			 * The window might have closed to zero, make
1582 			 * sure we send an ack now that we've drained
1583 			 * the buffer or we might end up blocking until
1584 			 * the idle takes over (5 seconds).
1585 			 */
1586 			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
1587 				so_pru_rcvd(so, flags);
1588 			error = ssb_wait(&so->so_rcv);
1589 			if (error) {
1590 				ssb_unlock(&so->so_rcv);
1591 				error = 0;
1592 				goto done;
1593 			}
1594 			m = so->so_rcv.ssb_mb;
1595 		}
1596 	}
1597 
1598 	/*
1599 	 * If an atomic read was requested but unread data still remains
1600 	 * in the record, set MSG_TRUNC.
1601 	 */
1602 	if (m && pr->pr_flags & PR_ATOMIC)
1603 		flags |= MSG_TRUNC;
1604 
1605 	/*
1606 	 * Cleanup.  If an atomic read was requested drop any unread data.
1607 	 */
1608 	if ((flags & MSG_PEEK) == 0) {
1609 		if (m && (pr->pr_flags & PR_ATOMIC))
1610 			sbdroprecord(&so->so_rcv.sb);
1611 		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
1612 			so_pru_rcvd(so, flags);
1613 	}
1614 
1615 	if (orig_resid == resid && orig_resid &&
1616 	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
1617 		ssb_unlock(&so->so_rcv);
1618 		goto restart;
1619 	}
1620 
1621 	if (flagsp)
1622 		*flagsp |= flags;
1623 release:
1624 	ssb_unlock(&so->so_rcv);
1625 done:
1626 	lwkt_reltoken(&so->so_rcv.ssb_token);
1627 	if (free_chain)
1628 		m_freem(free_chain);
1629 	return (error);
1630 }
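
/*
 * Illustrative sketch (editor's addition): the record layout that
 * soreceive() walks above -- an optional address mbuf, optional
 * control mbufs, then data -- is what recvmsg(2) surfaces to userland
 * as msg_name, msg_control and the iovec data.  The helper below is
 * hypothetical.
 */
#if 0
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static ssize_t
example_recv_record(int s, void *buf, size_t len,
    struct sockaddr_storage *from, char *cbuf, size_t clen)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg;

	memset(&msg, 0, sizeof(msg));
	msg.msg_name = from;		/* filled from the MT_SONAME mbuf */
	msg.msg_namelen = sizeof(*from);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;		/* filled from MT_CONTROL mbufs */
	msg.msg_controllen = clen;
	return (recvmsg(s, &msg, 0));
}
#endif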
1631 
1632 int
1633 sorecvtcp(struct socket *so, struct sockaddr **psa, struct uio *uio,
1634 	  struct sockbuf *sio, struct mbuf **controlp, int *flagsp)
1635 {
1636 	struct mbuf *m, *n;
1637 	struct mbuf *free_chain = NULL;
1638 	int flags, len, error, offset;
1639 	struct protosw *pr = so->so_proto;
1640 	int moff;
1641 	size_t resid, orig_resid;
1642 
1643 	if (uio)
1644 		resid = uio->uio_resid;
1645 	else
1646 		resid = (size_t)(sio->sb_climit - sio->sb_cc);
1647 	orig_resid = resid;
1648 
1649 	if (psa)
1650 		*psa = NULL;
1651 	if (controlp)
1652 		*controlp = NULL;
1653 	if (flagsp)
1654 		flags = *flagsp &~ MSG_EOR;
1655 	else
1656 		flags = 0;
1657 	if (flags & MSG_OOB) {
1658 		m = m_get(MB_WAIT, MT_DATA);
1659 		if (m == NULL)
1660 			return (ENOBUFS);
1661 		error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
1662 		if (error)
1663 			goto bad;
1664 		if (sio) {
1665 			do {
1666 				sbappend(sio, m);
1667 				KKASSERT(resid >= (size_t)m->m_len);
1668 				resid -= (size_t)m->m_len;
1669 			} while (resid > 0 && m);
1670 		} else {
1671 			do {
1672 				uio->uio_resid = resid;
1673 				error = uiomove(mtod(m, caddr_t),
1674 						(int)szmin(resid, m->m_len),
1675 						uio);
1676 				resid = uio->uio_resid;
1677 				m = m_free(m);
1678 			} while (uio->uio_resid && error == 0 && m);
1679 		}
1680 bad:
1681 		if (m)
1682 			m_freem(m);
1683 		return (error);
1684 	}
1685 
1686 	/*
1687 	 * The token interlocks against the protocol thread while
1688 	 * ssb_lock is a blocking lock against other userland entities.
1689 	 */
1690 	lwkt_gettoken(&so->so_rcv.ssb_token);
1691 restart:
1692 	error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags));
1693 	if (error)
1694 		goto done;
1695 
1696 	m = so->so_rcv.ssb_mb;
1697 	/*
1698 	 * If we have less data than requested, block awaiting more
1699 	 * (subject to any timeout) if:
1700 	 *   1. the current count is less than the low water mark, or
1701 	 *   2. MSG_WAITALL is set, and it is possible to do the entire
1702 	 *	receive operation at once if we block (resid <= hiwat), and
1703 	 *   3. MSG_DONTWAIT is not set.
1704 	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1705 	 * we have to do the receive in sections, and thus risk returning
1706 	 * a short count if a timeout or signal occurs after we start.
1707 	 */
1708 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1709 	    (size_t)so->so_rcv.ssb_cc < resid) &&
1710 	    (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat ||
1711 	   ((flags & MSG_WAITALL) && resid <= (size_t)so->so_rcv.ssb_hiwat)))) {
1712 		KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1"));
1713 		if (so->so_error) {
1714 			if (m)
1715 				goto dontblock;
1716 			error = so->so_error;
1717 			if ((flags & MSG_PEEK) == 0)
1718 				so->so_error = 0;
1719 			goto release;
1720 		}
1721 		if (so->so_state & SS_CANTRCVMORE) {
1722 			if (m)
1723 				goto dontblock;
1724 			else
1725 				goto release;
1726 		}
1727 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1728 		    (pr->pr_flags & PR_CONNREQUIRED)) {
1729 			error = ENOTCONN;
1730 			goto release;
1731 		}
1732 		if (resid == 0)
1733 			goto release;
1734 		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) {
1735 			error = EWOULDBLOCK;
1736 			goto release;
1737 		}
1738 		ssb_unlock(&so->so_rcv);
1739 		error = ssb_wait(&so->so_rcv);
1740 		if (error)
1741 			goto done;
1742 		goto restart;
1743 	}
1744 dontblock:
1745 	if (uio && uio->uio_td && uio->uio_td->td_proc)
1746 		uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++;
1747 
1748 	/*
1749 	 * note: m should be == sb_mb here.  Cache the next record while
1750 	 * cleaning up.  Note that calling m_free*() will break out of the
1751 	 * critical section.
1752 	 */
1753 	KKASSERT(m == so->so_rcv.ssb_mb);
1754 
1755 	/*
1756 	 * Copy to the UIO or mbuf return chain (*mp).
1757 	 */
1758 	moff = 0;
1759 	offset = 0;
1760 	while (m && resid > 0 && error == 0) {
1761 		KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
1762 		    ("receive 3"));
1763 
1764 		soclrstate(so, SS_RCVATMARK);
1765 		len = (resid > INT_MAX) ? INT_MAX : resid;
1766 		if (so->so_oobmark && len > so->so_oobmark - offset)
1767 			len = so->so_oobmark - offset;
1768 		if (len > m->m_len - moff)
1769 			len = m->m_len - moff;
1770 
1771 		/*
1772 		 * Copy out to the UIO or pass the mbufs back to the SIO.
1773 		 * The SIO is dealt with when we eat the mbuf, but deal
1774 		 * with the resid here either way.
1775 		 */
1776 		if (uio) {
1777 			uio->uio_resid = resid;
1778 			error = uiomove(mtod(m, caddr_t) + moff, len, uio);
1779 			resid = uio->uio_resid;
1780 			if (error)
1781 				goto release;
1782 		} else {
1783 			resid -= (size_t)len;
1784 		}
1785 
1786 		/*
1787 		 * Eat the entire mbuf or just a piece of it
1788 		 */
1789 		if (len == m->m_len - moff) {
1790 			if (flags & MSG_PEEK) {
1791 				m = m->m_next;
1792 				moff = 0;
1793 			} else {
1794 				if (sio) {
1795 					n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
1796 					sbappend(sio, m);
1797 					m = n;
1798 				} else {
1799 					m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
1800 				}
1801 			}
1802 		} else {
1803 			if (flags & MSG_PEEK) {
1804 				moff += len;
1805 			} else {
1806 				if (sio) {
1807 					n = m_copym(m, 0, len, MB_WAIT);
1808 					if (n)
1809 						sbappend(sio, n);
1810 				}
1811 				m->m_data += len;
1812 				m->m_len -= len;
1813 				so->so_rcv.ssb_cc -= len;
1814 			}
1815 		}
1816 		if (so->so_oobmark) {
1817 			if ((flags & MSG_PEEK) == 0) {
1818 				so->so_oobmark -= len;
1819 				if (so->so_oobmark == 0) {
1820 					sosetstate(so, SS_RCVATMARK);
1821 					break;
1822 				}
1823 			} else {
1824 				offset += len;
1825 				if (offset == so->so_oobmark)
1826 					break;
1827 			}
1828 		}
1829 		/*
1830 		 * If the MSG_WAITALL flag is set (for non-atomic socket),
1831 		 * we must not quit until resid == 0 or an error
1832 		 * termination.  If a signal/timeout occurs, return
1833 		 * with a short count but without error.
1834 		 * Keep signalsockbuf locked against other readers.
1835 		 */
1836 		while ((flags & MSG_WAITALL) && m == NULL &&
1837 		       resid > 0 && !sosendallatonce(so) &&
1838 		       so->so_rcv.ssb_mb == NULL) {
1839 			if (so->so_error || so->so_state & SS_CANTRCVMORE)
1840 				break;
1841 			/*
1842 			 * The window might have closed to zero, make
1843 			 * sure we send an ack now that we've drained
1844 			 * the buffer or we might end up blocking until
1845 			 * the idle takes over (5 seconds).
1846 			 */
1847 			if (so->so_pcb)
1848 				so_pru_rcvd_async(so);
1849 			error = ssb_wait(&so->so_rcv);
1850 			if (error) {
1851 				ssb_unlock(&so->so_rcv);
1852 				error = 0;
1853 				goto done;
1854 			}
1855 			m = so->so_rcv.ssb_mb;
1856 		}
1857 	}
1858 
1859 	/*
1860 	 * Cleanup.  If an atomic read was requested drop any unread data.
1861 	 */
1862 	if ((flags & MSG_PEEK) == 0) {
1863 		if (so->so_pcb)
1864 			so_pru_rcvd_async(so);
1865 	}
1866 
1867 	if (orig_resid == resid && orig_resid &&
1868 	    (so->so_state & SS_CANTRCVMORE) == 0) {
1869 		ssb_unlock(&so->so_rcv);
1870 		goto restart;
1871 	}
1872 
1873 	if (flagsp)
1874 		*flagsp |= flags;
1875 release:
1876 	ssb_unlock(&so->so_rcv);
1877 done:
1878 	lwkt_reltoken(&so->so_rcv.ssb_token);
1879 	if (free_chain)
1880 		m_freem(free_chain);
1881 	return (error);
1882 }
1883 
1884 /*
1885  * Shut a socket down.  Note that we do not get a frontend lock as we
1886  * want to be able to shut the socket down even if another thread is
1887  * blocked in a read(), thus waking it up.
1888  */
1889 int
1890 soshutdown(struct socket *so, int how)
1891 {
1892 	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1893 		return (EINVAL);
1894 
1895 	if (how != SHUT_WR) {
1896 		/*ssb_lock(&so->so_rcv, M_WAITOK);*/
1897 		sorflush(so);
1898 		/*ssb_unlock(&so->so_rcv);*/
1899 	}
1900 	if (how != SHUT_RD)
1901 		return (so_pru_shutdown(so));
1902 	return (0);
1903 }
1904 
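/*
 * Illustrative sketch, not part of the original source: soshutdown() is
 * reached via shutdown(2).  half_close() is a hypothetical name.
 */
#if 0
#include <sys/socket.h>

static void
half_close(int s)
{
	/* Send a FIN but keep the receive side open for the reply. */
	shutdown(s, SHUT_WR);

	/*
	 * SHUT_RD would instead sorflush() the receive buffer and wake
	 * any thread blocked in read() on this socket.
	 */
}
#endif
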
1905 void
1906 sorflush(struct socket *so)
1907 {
1908 	struct signalsockbuf *ssb = &so->so_rcv;
1909 	struct protosw *pr = so->so_proto;
1910 	struct signalsockbuf asb;
1911 
1912 	atomic_set_int(&ssb->ssb_flags, SSB_NOINTR);
1913 
1914 	lwkt_gettoken(&ssb->ssb_token);
1915 	socantrcvmore(so);
1916 	asb = *ssb;
1917 
1918 	/*
1919 	 * Can't just blow up the ssb structure here
1920 	 */
1921 	bzero(&ssb->sb, sizeof(ssb->sb));
1922 	ssb->ssb_timeo = 0;
1923 	ssb->ssb_lowat = 0;
1924 	ssb->ssb_hiwat = 0;
1925 	ssb->ssb_mbmax = 0;
1926 	atomic_clear_int(&ssb->ssb_flags, SSB_CLEAR_MASK);
1927 
1928 	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
1929 		(*pr->pr_domain->dom_dispose)(asb.ssb_mb);
1930 	ssb_release(&asb, so);
1931 
1932 	lwkt_reltoken(&ssb->ssb_token);
1933 }
1934 
1935 #ifdef INET
1936 static int
1937 do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
1938 {
1939 	struct accept_filter_arg	*afap = NULL;
1940 	struct accept_filter	*afp;
1941 	struct so_accf	*af = so->so_accf;
1942 	int	error = 0;
1943 
1944 	/* do not set/remove accept filters on non-listen sockets */
1945 	if ((so->so_options & SO_ACCEPTCONN) == 0) {
1946 		error = EINVAL;
1947 		goto out;
1948 	}
1949 
1950 	/* removing the filter */
1951 	if (sopt == NULL) {
1952 		if (af != NULL) {
1953 			if (af->so_accept_filter != NULL &&
1954 				af->so_accept_filter->accf_destroy != NULL) {
1955 				af->so_accept_filter->accf_destroy(so);
1956 			}
1957 			if (af->so_accept_filter_str != NULL) {
1958 				kfree(af->so_accept_filter_str, M_ACCF);
1959 			}
1960 			kfree(af, M_ACCF);
1961 			so->so_accf = NULL;
1962 		}
1963 		so->so_options &= ~SO_ACCEPTFILTER;
1964 		return (0);
1965 	}
1966 	/* adding a filter */
1967 	/* must remove previous filter first */
1968 	if (af != NULL) {
1969 		error = EINVAL;
1970 		goto out;
1971 	}
1972 	/* don't put large objects on the kernel stack */
1973 	afap = kmalloc(sizeof(*afap), M_TEMP, M_WAITOK);
1974 	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
1975 	afap->af_name[sizeof(afap->af_name)-1] = '\0';
1976 	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
1977 	if (error)
1978 		goto out;
1979 	afp = accept_filt_get(afap->af_name);
1980 	if (afp == NULL) {
1981 		error = ENOENT;
1982 		goto out;
1983 	}
1984 	af = kmalloc(sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
1985 	if (afp->accf_create != NULL) {
1986 		if (afap->af_name[0] != '\0') {
1987 			int len = strlen(afap->af_name) + 1;
1988 
1989 			af->so_accept_filter_str = kmalloc(len, M_ACCF,
1990 							   M_WAITOK);
1991 			strcpy(af->so_accept_filter_str, afap->af_name);
1992 		}
1993 		af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
1994 		if (af->so_accept_filter_arg == NULL) {
1995 			kfree(af->so_accept_filter_str, M_ACCF);
1996 			kfree(af, M_ACCF);
1997 			so->so_accf = NULL;
1998 			error = EINVAL;
1999 			goto out;
2000 		}
2001 	}
2002 	af->so_accept_filter = afp;
2003 	so->so_accf = af;
2004 	so->so_options |= SO_ACCEPTFILTER;
2005 out:
2006 	if (afap != NULL)
2007 		kfree(afap, M_TEMP);
2008 	return (error);
2009 }
2010 #endif /* INET */
2011 
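/*
 * Illustrative sketch, not part of the original source: installing an
 * accept filter from userland, which is what lands in
 * do_setopt_accept_filter() above.  "dataready" is the name registered
 * by the accf_data(9) module; the socket must already be listening.
 */
#if 0
#include <sys/socket.h>
#include <string.h>

static int
set_dataready_filter(int s)
{
	struct accept_filter_arg afa;

	memset(&afa, 0, sizeof(afa));
	strcpy(afa.af_name, "dataready");
	return (setsockopt(s, SOL_SOCKET, SO_ACCEPTFILTER,
	    &afa, sizeof(afa)));
}
#endif
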
2012 /*
2013  * Perhaps this routine, and sooptcopyout(), below, ought to come in
2014  * an additional variant to handle the case where the option value needs
2015  * to be some kind of integer, but not a specific size.
2016  * In addition to their use here, these functions are also called by the
2017  * protocol-level pr_ctloutput() routines.
2018  */
2019 int
2020 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2021 {
2022 	return soopt_to_kbuf(sopt, buf, len, minlen);
2023 }
2024 
2025 int
2026 soopt_to_kbuf(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2027 {
2028 	size_t	valsize;
2029 
2030 	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
2031 	KKASSERT(kva_p(buf));
2032 
2033 	/*
2034 	 * If the user gives us more than we wanted, we ignore it,
2035 	 * but if we don't get the minimum length the caller
2036 	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
2037 	 * is set to however much we actually retrieved.
2038 	 */
2039 	if ((valsize = sopt->sopt_valsize) < minlen)
2040 		return EINVAL;
2041 	if (valsize > len)
2042 		sopt->sopt_valsize = valsize = len;
2043 
2044 	bcopy(sopt->sopt_val, buf, valsize);
2045 	return 0;
2046 }
2047 
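/*
 * Illustrative sketch, not part of the original source: the minlen check
 * in soopt_to_kbuf() as seen from userland.  short_optval() is a
 * hypothetical name.
 */
#if 0
#include <sys/socket.h>

static int
short_optval(int s)
{
	char c = 1;

	/*
	 * sosetopt() requests sizeof(int) as the minimum for
	 * SO_REUSEADDR, so a one-byte value fails with EINVAL, while
	 * anything larger than sizeof(int) is silently truncated.
	 */
	return (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &c, sizeof(c)));
}
#endif
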
2048 
2049 int
2050 sosetopt(struct socket *so, struct sockopt *sopt)
2051 {
2052 	int	error, optval;
2053 	struct	linger l;
2054 	struct	timeval tv;
2055 	u_long  val;
2056 	struct signalsockbuf *sotmp;
2057 
2058 	error = 0;
2059 	sopt->sopt_dir = SOPT_SET;
2060 	if (sopt->sopt_level != SOL_SOCKET) {
2061 		if (so->so_proto && so->so_proto->pr_ctloutput) {
2062 			return (so_pr_ctloutput(so, sopt));
2063 		}
2064 		error = ENOPROTOOPT;
2065 	} else {
2066 		switch (sopt->sopt_name) {
2067 #ifdef INET
2068 		case SO_ACCEPTFILTER:
2069 			error = do_setopt_accept_filter(so, sopt);
2070 			if (error)
2071 				goto bad;
2072 			break;
2073 #endif /* INET */
2074 		case SO_LINGER:
2075 			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
2076 			if (error)
2077 				goto bad;
2078 
2079 			so->so_linger = l.l_linger;
2080 			if (l.l_onoff)
2081 				so->so_options |= SO_LINGER;
2082 			else
2083 				so->so_options &= ~SO_LINGER;
2084 			break;
2085 
2086 		case SO_DEBUG:
2087 		case SO_KEEPALIVE:
2088 		case SO_DONTROUTE:
2089 		case SO_USELOOPBACK:
2090 		case SO_BROADCAST:
2091 		case SO_REUSEADDR:
2092 		case SO_REUSEPORT:
2093 		case SO_OOBINLINE:
2094 		case SO_TIMESTAMP:
2095 		case SO_NOSIGPIPE:
2096 			error = sooptcopyin(sopt, &optval, sizeof optval,
2097 					    sizeof optval);
2098 			if (error)
2099 				goto bad;
2100 			if (optval)
2101 				so->so_options |= sopt->sopt_name;
2102 			else
2103 				so->so_options &= ~sopt->sopt_name;
2104 			break;
2105 
2106 		case SO_SNDBUF:
2107 		case SO_RCVBUF:
2108 		case SO_SNDLOWAT:
2109 		case SO_RCVLOWAT:
2110 			error = sooptcopyin(sopt, &optval, sizeof optval,
2111 					    sizeof optval);
2112 			if (error)
2113 				goto bad;
2114 
2115 			/*
2116 			 * Values < 1 make no sense for any of these
2117 			 * options, so disallow them.
2118 			 */
2119 			if (optval < 1) {
2120 				error = EINVAL;
2121 				goto bad;
2122 			}
2123 
2124 			switch (sopt->sopt_name) {
2125 			case SO_SNDBUF:
2126 			case SO_RCVBUF:
2127 				if (ssb_reserve(sopt->sopt_name == SO_SNDBUF ?
2128 				    &so->so_snd : &so->so_rcv, (u_long)optval,
2129 				    so,
2130 				    &curproc->p_rlimit[RLIMIT_SBSIZE]) == 0) {
2131 					error = ENOBUFS;
2132 					goto bad;
2133 				}
2134 				sotmp = (sopt->sopt_name == SO_SNDBUF) ?
2135 						&so->so_snd : &so->so_rcv;
2136 				atomic_clear_int(&sotmp->ssb_flags,
2137 						 SSB_AUTOSIZE);
2138 				break;
2139 
2140 			/*
2141 			 * Make sure the low-water is never greater than
2142 			 * the high-water.
2143 			 */
2144 			case SO_SNDLOWAT:
2145 				so->so_snd.ssb_lowat =
2146 				    (optval > so->so_snd.ssb_hiwat) ?
2147 				    so->so_snd.ssb_hiwat : optval;
2148 				atomic_clear_int(&so->so_snd.ssb_flags,
2149 						 SSB_AUTOLOWAT);
2150 				break;
2151 			case SO_RCVLOWAT:
2152 				so->so_rcv.ssb_lowat =
2153 				    (optval > so->so_rcv.ssb_hiwat) ?
2154 				    so->so_rcv.ssb_hiwat : optval;
2155 				atomic_clear_int(&so->so_rcv.ssb_flags,
2156 						 SSB_AUTOLOWAT);
2157 				break;
2158 			}
2159 			break;
2160 
2161 		case SO_SNDTIMEO:
2162 		case SO_RCVTIMEO:
2163 			error = sooptcopyin(sopt, &tv, sizeof tv,
2164 					    sizeof tv);
2165 			if (error)
2166 				goto bad;
2167 
2168 			/* assert(hz > 0); */
2169 			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2170 			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2171 				error = EDOM;
2172 				goto bad;
2173 			}
2174 			/* assert(tick > 0); */
2175 			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
2176 			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / ustick;
2177 			if (val > INT_MAX) {
2178 				error = EDOM;
2179 				goto bad;
2180 			}
2181 			if (val == 0 && tv.tv_usec != 0)
2182 				val = 1;
2183 
2184 			switch (sopt->sopt_name) {
2185 			case SO_SNDTIMEO:
2186 				so->so_snd.ssb_timeo = val;
2187 				break;
2188 			case SO_RCVTIMEO:
2189 				so->so_rcv.ssb_timeo = val;
2190 				break;
2191 			}
2192 			break;
2193 		default:
2194 			error = ENOPROTOOPT;
2195 			break;
2196 		}
2197 		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
2198 			(void) so_pr_ctloutput(so, sopt);
2199 		}
2200 	}
2201 bad:
2202 	return (error);
2203 }
2204 
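/*
 * Illustrative sketch, not part of the original source: SO_RCVTIMEO as
 * handled by sosetopt() above.  The timeval is converted to ticks, a
 * value overflowing INT_MAX ticks fails with EDOM, and a non-zero
 * timeout below one tick is rounded up rather than treated as "forever".
 * set_recv_timeout() is a hypothetical name.
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>

static int
set_recv_timeout(int s, long secs)
{
	struct timeval tv = { .tv_sec = secs, .tv_usec = 0 };

	return (setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)));
}
#endif
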
2205 /* Helper routine for getsockopt */
2206 int
2207 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2208 {
2209 	soopt_from_kbuf(sopt, buf, len);
2210 	return 0;
2211 }
2212 
2213 void
2214 soopt_from_kbuf(struct sockopt *sopt, const void *buf, size_t len)
2215 {
2216 	size_t	valsize;
2217 
2218 	if (len == 0) {
2219 		sopt->sopt_valsize = 0;
2220 		return;
2221 	}
2222 
2223 	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
2224 	KKASSERT(kva_p(buf));
2225 
2226 	/*
2227 	 * Documented get behavior is that we always return a value,
2228 	 * possibly truncated to fit in the user's buffer.
2229 	 * Traditional behavior is that we always tell the user
2230 	 * precisely how much we copied, rather than something useful
2231 	 * like the total amount we had available to the caller.
2232 	 * Note that this interface is not idempotent; the entire answer
2233 	 * must be generated ahead of time.
2234 	 */
2235 	valsize = szmin(len, sopt->sopt_valsize);
2236 	sopt->sopt_valsize = valsize;
2237 	if (sopt->sopt_val != NULL) {
2238 		bcopy(buf, sopt->sopt_val, valsize);
2239 	}
2240 }
2241 
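/*
 * Illustrative sketch, not part of the original source: the truncation
 * behavior documented in soopt_from_kbuf() above, seen from userland.
 * truncated_get() is a hypothetical name.
 */
#if 0
#include <sys/socket.h>

static void
truncated_get(int s)
{
	char one;
	socklen_t len = sizeof(one);

	/*
	 * Succeeds, but only one byte of the int-sized option is
	 * copied out and len comes back as 1, not as the size that
	 * was actually available.
	 */
	getsockopt(s, SOL_SOCKET, SO_TYPE, &one, &len);
}
#endif
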
2242 int
2243 sogetopt(struct socket *so, struct sockopt *sopt)
2244 {
2245 	int	error, optval;
2246 	long	optval_l;
2247 	struct	linger l;
2248 	struct	timeval tv;
2249 #ifdef INET
2250 	struct accept_filter_arg *afap;
2251 #endif
2252 
2253 	error = 0;
2254 	sopt->sopt_dir = SOPT_GET;
2255 	if (sopt->sopt_level != SOL_SOCKET) {
2256 		if (so->so_proto && so->so_proto->pr_ctloutput) {
2257 			return (so_pr_ctloutput(so, sopt));
2258 		} else
2259 			return (ENOPROTOOPT);
2260 	} else {
2261 		switch (sopt->sopt_name) {
2262 #ifdef INET
2263 		case SO_ACCEPTFILTER:
2264 			if ((so->so_options & SO_ACCEPTCONN) == 0)
2265 				return (EINVAL);
2266 			afap = kmalloc(sizeof(*afap), M_TEMP,
2267 				       M_WAITOK | M_ZERO);
2268 			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
2269 				strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
2270 				if (so->so_accf->so_accept_filter_str != NULL)
2271 					strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
2272 			}
2273 			error = sooptcopyout(sopt, afap, sizeof(*afap));
2274 			kfree(afap, M_TEMP);
2275 			break;
2276 #endif /* INET */
2277 
2278 		case SO_LINGER:
2279 			l.l_onoff = so->so_options & SO_LINGER;
2280 			l.l_linger = so->so_linger;
2281 			error = sooptcopyout(sopt, &l, sizeof l);
2282 			break;
2283 
2284 		case SO_USELOOPBACK:
2285 		case SO_DONTROUTE:
2286 		case SO_DEBUG:
2287 		case SO_KEEPALIVE:
2288 		case SO_REUSEADDR:
2289 		case SO_REUSEPORT:
2290 		case SO_BROADCAST:
2291 		case SO_OOBINLINE:
2292 		case SO_TIMESTAMP:
2293 		case SO_NOSIGPIPE:
2294 			optval = so->so_options & sopt->sopt_name;
2295 integer:
2296 			error = sooptcopyout(sopt, &optval, sizeof optval);
2297 			break;
2298 
2299 		case SO_TYPE:
2300 			optval = so->so_type;
2301 			goto integer;
2302 
2303 		case SO_ERROR:
2304 			optval = so->so_error;
2305 			so->so_error = 0;
2306 			goto integer;
2307 
2308 		case SO_SNDBUF:
2309 			optval = so->so_snd.ssb_hiwat;
2310 			goto integer;
2311 
2312 		case SO_RCVBUF:
2313 			optval = so->so_rcv.ssb_hiwat;
2314 			goto integer;
2315 
2316 		case SO_SNDLOWAT:
2317 			optval = so->so_snd.ssb_lowat;
2318 			goto integer;
2319 
2320 		case SO_RCVLOWAT:
2321 			optval = so->so_rcv.ssb_lowat;
2322 			goto integer;
2323 
2324 		case SO_SNDTIMEO:
2325 		case SO_RCVTIMEO:
2326 			optval = (sopt->sopt_name == SO_SNDTIMEO ?
2327 				  so->so_snd.ssb_timeo : so->so_rcv.ssb_timeo);
2328 
2329 			tv.tv_sec = optval / hz;
2330 			tv.tv_usec = (optval % hz) * ustick;
2331 			error = sooptcopyout(sopt, &tv, sizeof tv);
2332 			break;
2333 
2334 		case SO_SNDSPACE:
2335 			optval_l = ssb_space(&so->so_snd);
2336 			error = sooptcopyout(sopt, &optval_l, sizeof(optval_l));
2337 			break;
2338 
2339 		default:
2340 			error = ENOPROTOOPT;
2341 			break;
2342 		}
2343 		return (error);
2344 	}
2345 }
2346 
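/*
 * Illustrative sketch, not part of the original source: SO_ERROR in
 * sogetopt() above is read-and-clear, the usual way to collect the
 * result of a non-blocking connect(2).  pending_error() is a
 * hypothetical name.
 */
#if 0
#include <sys/socket.h>

static int
pending_error(int s)
{
	int err = 0;
	socklen_t len = sizeof(err);

	getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &len);
	return (err);	/* so_error is now zero in the kernel */
}
#endif
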
2347 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
2348 int
2349 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2350 {
2351 	struct mbuf *m, *m_prev;
2352 	int sopt_size = sopt->sopt_valsize, msize;
2353 
2354 	m = m_getl(sopt_size, sopt->sopt_td ? MB_WAIT : MB_DONTWAIT, MT_DATA,
2355 		   0, &msize);
2356 	if (m == NULL)
2357 		return (ENOBUFS);
2358 	m->m_len = min(msize, sopt_size);
2359 	sopt_size -= m->m_len;
2360 	*mp = m;
2361 	m_prev = m;
2362 
2363 	while (sopt_size > 0) {
2364 		m = m_getl(sopt_size, sopt->sopt_td ? MB_WAIT : MB_DONTWAIT,
2365 			   MT_DATA, 0, &msize);
2366 		if (m == NULL) {
2367 			m_freem(*mp);
2368 			return (ENOBUFS);
2369 		}
2370 		m->m_len = min(msize, sopt_size);
2371 		sopt_size -= m->m_len;
2372 		m_prev->m_next = m;
2373 		m_prev = m;
2374 	}
2375 	return (0);
2376 }
2377 
2378 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2379 int
2380 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2381 {
2382 	soopt_to_mbuf(sopt, m);
2383 	return 0;
2384 }
2385 
2386 void
2387 soopt_to_mbuf(struct sockopt *sopt, struct mbuf *m)
2388 {
2389 	size_t valsize;
2390 	void *val;
2391 
2392 	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
2393 	KKASSERT(kva_p(m));
2394 	if (sopt->sopt_val == NULL)
2395 		return;
2396 	val = sopt->sopt_val;
2397 	valsize = sopt->sopt_valsize;
2398 	while (m != NULL && valsize >= m->m_len) {
2399 		bcopy(val, mtod(m, char *), m->m_len);
2400 		valsize -= m->m_len;
2401 		val = (caddr_t)val + m->m_len;
2402 		m = m->m_next;
2403 	}
2404 	if (m != NULL) /* should have been allocated large enough at ip6_sooptmcopyin() */
2405 		panic("ip6_sooptmcopyin");
2406 }
2407 
2408 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2409 int
2410 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2411 {
2412 	return soopt_from_mbuf(sopt, m);
2413 }
2414 
2415 int
2416 soopt_from_mbuf(struct sockopt *sopt, struct mbuf *m)
2417 {
2418 	struct mbuf *m0 = m;
2419 	size_t valsize = 0;
2420 	size_t maxsize;
2421 	void *val;
2422 
2423 	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
2424 	KKASSERT(kva_p(m));
2425 	if (sopt->sopt_val == NULL)
2426 		return 0;
2427 	val = sopt->sopt_val;
2428 	maxsize = sopt->sopt_valsize;
2429 	while (m != NULL && maxsize >= m->m_len) {
2430 		bcopy(mtod(m, char *), val, m->m_len);
2431 	       maxsize -= m->m_len;
2432 		maxsize -= m->m_len;
2433 		val = (caddr_t)val + m->m_len;
2434 		valsize += m->m_len;
2435 		m = m->m_next;
2436 	if (m != NULL) {
2437 		/* a large enough soopt buffer should be supplied from userland */
2438 		m_freem(m0);
2439 		return (EINVAL);
2440 	}
2441 	sopt->sopt_valsize = valsize;
2442 	return 0;
2443 }
2444 
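/*
 * Illustrative kernel-side sketch, not part of the original source: the
 * intended calling sequence for the three mbuf shims above, as used by
 * an old-style ctloutput path.  legacy_ctloutput() is hypothetical.
 */
#if 0
static int
legacy_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct mbuf *m = NULL;
	int error;

	error = soopt_getm(sopt, &m);	/* size a chain to sopt_valsize */
	if (error)
		return (error);
	soopt_mcopyin(sopt, m);		/* fill it from sopt_val */
	/* ... hand 'm' to the protocol, which replaces it ... */
	return (soopt_mcopyout(sopt, m)); /* copy the reply back out */
}
#endif
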
2445 void
2446 sohasoutofband(struct socket *so)
2447 {
2448 	if (so->so_sigio != NULL)
2449 		pgsigio(so->so_sigio, SIGURG, 0);
2450 	KNOTE(&so->so_rcv.ssb_kq.ki_note, NOTE_OOB);
2451 }
2452 
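/*
 * Illustrative sketch, not part of the original source: consuming the
 * notification raised by sohasoutofband().  A process must first claim
 * socket signal ownership to receive SIGURG; on_urg() and watch_oob()
 * are hypothetical names.
 */
#if 0
#include <sys/socket.h>
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static void
on_urg(int sig)
{
	/* a real handler would recv(s, &c, 1, MSG_OOB) here */
}

static void
watch_oob(int s)
{
	signal(SIGURG, on_urg);
	fcntl(s, F_SETOWN, getpid());	/* route SIGURG to this process */
}
#endif
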
2453 int
2454 sokqfilter(struct file *fp, struct knote *kn)
2455 {
2456 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
2457 	struct signalsockbuf *ssb;
2458 
2459 	switch (kn->kn_filter) {
2460 	case EVFILT_READ:
2461 		if (so->so_options & SO_ACCEPTCONN)
2462 			kn->kn_fop = &solisten_filtops;
2463 		else
2464 			kn->kn_fop = &soread_filtops;
2465 		ssb = &so->so_rcv;
2466 		break;
2467 	case EVFILT_WRITE:
2468 		kn->kn_fop = &sowrite_filtops;
2469 		ssb = &so->so_snd;
2470 		break;
2471 	case EVFILT_EXCEPT:
2472 		kn->kn_fop = &soexcept_filtops;
2473 		ssb = &so->so_rcv;
2474 		break;
2475 	default:
2476 		return (EOPNOTSUPP);
2477 	}
2478 
2479 	knote_insert(&ssb->ssb_kq.ki_note, kn);
2480 	atomic_set_int(&ssb->ssb_flags, SSB_KNOTE);
2481 	return (0);
2482 }
2483 
2484 static void
2485 filt_sordetach(struct knote *kn)
2486 {
2487 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
2488 
2489 	knote_remove(&so->so_rcv.ssb_kq.ki_note, kn);
2490 	if (SLIST_EMPTY(&so->so_rcv.ssb_kq.ki_note))
2491 		atomic_clear_int(&so->so_rcv.ssb_flags, SSB_KNOTE);
2492 }
2493 
2494 /*ARGSUSED*/
2495 static int
2496 filt_soread(struct knote *kn, long hint)
2497 {
2498 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
2499 
2500 	if (kn->kn_sfflags & NOTE_OOB) {
2501 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
2502 			kn->kn_fflags |= NOTE_OOB;
2503 			return (1);
2504 		}
2505 		return (0);
2506 	}
2507 	kn->kn_data = so->so_rcv.ssb_cc;
2508 
2509 	if (so->so_state & SS_CANTRCVMORE) {
2510 		/*
2511 		 * Only set NODATA if all data has been exhausted.
2512 		 */
2513 		if (kn->kn_data == 0)
2514 			kn->kn_flags |= EV_NODATA;
2515 		kn->kn_flags |= EV_EOF;
2516 		kn->kn_fflags = so->so_error;
2517 		return (1);
2518 	}
2519 	if (so->so_error)	/* temporary udp error */
2520 		return (1);
2521 	if (kn->kn_sfflags & NOTE_LOWAT)
2522 		return (kn->kn_data >= kn->kn_sdata);
2523 	return ((kn->kn_data >= so->so_rcv.ssb_lowat) ||
2524 		!TAILQ_EMPTY(&so->so_comp));
2525 }
2526 
2527 static void
2528 filt_sowdetach(struct knote *kn)
2529 {
2530 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
2531 
2532 	knote_remove(&so->so_snd.ssb_kq.ki_note, kn);
2533 	if (SLIST_EMPTY(&so->so_snd.ssb_kq.ki_note))
2534 		atomic_clear_int(&so->so_snd.ssb_flags, SSB_KNOTE);
2535 }
2536 
2537 /*ARGSUSED*/
2538 static int
2539 filt_sowrite(struct knote *kn, long hint)
2540 {
2541 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
2542 
2543 	kn->kn_data = ssb_space(&so->so_snd);
2544 	if (so->so_state & SS_CANTSENDMORE) {
2545 		kn->kn_flags |= (EV_EOF | EV_NODATA);
2546 		kn->kn_fflags = so->so_error;
2547 		return (1);
2548 	}
2549 	if (so->so_error)	/* temporary udp error */
2550 		return (1);
2551 	if (((so->so_state & SS_ISCONNECTED) == 0) &&
2552 	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
2553 		return (0);
2554 	if (kn->kn_sfflags & NOTE_LOWAT)
2555 		return (kn->kn_data >= kn->kn_sdata);
2556 	return (kn->kn_data >= so->so_snd.ssb_lowat);
2557 }
2558 
2559 /*ARGSUSED*/
2560 static int
2561 filt_solisten(struct knote *kn, long hint)
2562 {
2563 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
2564 
2565 	kn->kn_data = so->so_qlen;
2566 	return (! TAILQ_EMPTY(&so->so_comp));
2567 }
2568
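/*
 * Illustrative sketch, not part of the original source: the filters
 * above as driven through kqueue(2).  On a listen socket EVFILT_READ
 * selects filt_solisten() and kn_data reports the completed-connection
 * backlog; on a data socket NOTE_LOWAT overrides ssb_lowat in
 * filt_soread().  wait_lowat() is a hypothetical name.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
wait_lowat(int kq, int s)
{
	struct kevent kev;

	/* fire only once at least 64 bytes are buffered */
	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 64, NULL);
	return (kevent(kq, &kev, 1, &kev, 1, NULL));
}
#endif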