1 /*	$OpenBSD: uipc_socket.c,v 1.346 2024/12/15 11:00:05 dlg Exp $	*/
2 /*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1990, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 #include <sys/file.h>
39 #include <sys/filedesc.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/domain.h>
43 #include <sys/event.h>
44 #include <sys/protosw.h>
45 #include <sys/socket.h>
46 #include <sys/unpcb.h>
47 #include <sys/socketvar.h>
48 #include <sys/signalvar.h>
49 #include <sys/pool.h>
50 #include <sys/atomic.h>
51 #include <sys/rwlock.h>
52 #include <sys/time.h>
53 #include <sys/refcnt.h>
54 
55 #ifdef DDB
56 #include <machine/db_machdep.h>
57 #endif
58 
59 void	sbsync(struct sockbuf *, struct mbuf *);
60 
61 int	sosplice(struct socket *, int, off_t, struct timeval *);
62 void	sounsplice(struct socket *, struct socket *, int);
63 void	soidle(void *);
64 void	sotask(void *);
65 void	soreaper(void *);
66 void	soput(void *);
67 int	somove(struct socket *, int);
68 void	sorflush(struct socket *);
69 
70 void	filt_sordetach(struct knote *kn);
71 int	filt_soread(struct knote *kn, long hint);
72 void	filt_sowdetach(struct knote *kn);
73 int	filt_sowrite(struct knote *kn, long hint);
74 int	filt_soexcept(struct knote *kn, long hint);
75 
76 int	filt_sowmodify(struct kevent *kev, struct knote *kn);
77 int	filt_sowprocess(struct knote *kn, struct kevent *kev);
78 
79 int	filt_sormodify(struct kevent *kev, struct knote *kn);
80 int	filt_sorprocess(struct knote *kn, struct kevent *kev);
81 
82 const struct filterops soread_filtops = {
83 	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
84 	.f_attach	= NULL,
85 	.f_detach	= filt_sordetach,
86 	.f_event	= filt_soread,
87 	.f_modify	= filt_sormodify,
88 	.f_process	= filt_sorprocess,
89 };
90 
91 const struct filterops sowrite_filtops = {
92 	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
93 	.f_attach	= NULL,
94 	.f_detach	= filt_sowdetach,
95 	.f_event	= filt_sowrite,
96 	.f_modify	= filt_sowmodify,
97 	.f_process	= filt_sowprocess,
98 };
99 
100 const struct filterops soexcept_filtops = {
101 	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
102 	.f_attach	= NULL,
103 	.f_detach	= filt_sordetach,
104 	.f_event	= filt_soexcept,
105 	.f_modify	= filt_sormodify,
106 	.f_process	= filt_sorprocess,
107 };
108 
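#if 0
/*
 * Illustrative sketch (userland, not part of this file): the filterops
 * tables above are what EVFILT_READ, EVFILT_WRITE and EVFILT_EXCEPT on
 * a socket descriptor resolve to.  Registering the read filter from a
 * process looks like this; the function name is hypothetical.
 */
#include <sys/types.h>
#include <sys/event.h>

int
example_watch_socket(int kq, int fd)
{
	struct kevent kev;

	/* Arm soread_filtops for this descriptor. */
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	return (kevent(kq, &kev, 1, NULL, 0, NULL));
}
#endif
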
109 #ifndef SOMINCONN
110 #define SOMINCONN 80
111 #endif /* SOMINCONN */
112 
113 int	somaxconn = SOMAXCONN;
114 int	sominconn = SOMINCONN;
115 
116 struct pool socket_pool;
117 #ifdef SOCKET_SPLICE
118 struct pool sosplice_pool;
119 struct taskq *sosplice_taskq;
120 struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
121 #endif
122 
123 void
124 soinit(void)
125 {
126 	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
127 	    "sockpl", NULL);
128 #ifdef SOCKET_SPLICE
129 	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
130 	    "sosppl", NULL);
131 #endif
132 }
133 
134 struct socket *
135 soalloc(const struct protosw *prp, int wait)
136 {
137 	const struct domain *dp = prp->pr_domain;
138 	struct socket *so;
139 
140 	so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
141 	    PR_ZERO);
142 	if (so == NULL)
143 		return (NULL);
144 	rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK);
145 	refcnt_init(&so->so_refcnt);
146 	rw_init(&so->so_rcv.sb_lock, "sbufrcv");
147 	rw_init(&so->so_snd.sb_lock, "sbufsnd");
148 	mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", 0);
149 	mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", 0);
150 	klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
151 	klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx);
152 	sigio_init(&so->so_sigio);
153 	TAILQ_INIT(&so->so_q0);
154 	TAILQ_INIT(&so->so_q);
155 
156 	switch (dp->dom_family) {
157 	case AF_INET:
158 	case AF_INET6:
159 		switch (prp->pr_type) {
160 		case SOCK_RAW:
161 		case SOCK_DGRAM:
162 			so->so_snd.sb_flags |= SB_MTXLOCK;
163 			so->so_rcv.sb_flags |= SB_MTXLOCK;
164 			break;
165 		}
166 		break;
167 	case AF_KEY:
168 	case AF_ROUTE:
169 	case AF_UNIX:
170 	case AF_FRAME:
171 		so->so_snd.sb_flags |= SB_MTXLOCK;
172 		so->so_rcv.sb_flags |= SB_MTXLOCK;
173 		break;
174 	}
175 
176 	return (so);
177 }
178 
179 /*
180  * Socket operation routines.
181  * These routines are called by the routines in
182  * sys_socket.c or from a system process, and
183  * implement the semantics of socket operations by
184  * switching out to the protocol-specific routines.
185  */
186 int
187 socreate(int dom, struct socket **aso, int type, int proto)
188 {
189 	struct proc *p = curproc;		/* XXX */
190 	const struct protosw *prp;
191 	struct socket *so;
192 	int error;
193 
194 	if (proto)
195 		prp = pffindproto(dom, proto, type);
196 	else
197 		prp = pffindtype(dom, type);
198 	if (prp == NULL || prp->pr_usrreqs == NULL)
199 		return (EPROTONOSUPPORT);
200 	if (prp->pr_type != type)
201 		return (EPROTOTYPE);
202 	so = soalloc(prp, M_WAIT);
203 	so->so_type = type;
204 	if (suser(p) == 0)
205 		so->so_state = SS_PRIV;
206 	so->so_ruid = p->p_ucred->cr_ruid;
207 	so->so_euid = p->p_ucred->cr_uid;
208 	so->so_rgid = p->p_ucred->cr_rgid;
209 	so->so_egid = p->p_ucred->cr_gid;
210 	so->so_cpid = p->p_p->ps_pid;
211 	so->so_proto = prp;
212 	so->so_snd.sb_timeo_nsecs = INFSLP;
213 	so->so_rcv.sb_timeo_nsecs = INFSLP;
214 
215 	solock(so);
216 	error = pru_attach(so, proto, M_WAIT);
217 	if (error) {
218 		so->so_state |= SS_NOFDREF;
219 		/* sofree() calls sounlock(). */
220 		sofree(so, 0);
221 		return (error);
222 	}
223 	sounlock(so);
224 	*aso = so;
225 	return (0);
226 }
227 
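#if 0
/*
 * Illustrative sketch (not compiled): a kernel consumer driving
 * socreate() and soclose() above to set up and tear down a TCP
 * socket, much like the socket(2) and close(2) paths do.  The
 * function name is hypothetical; error handling is abbreviated.
 */
int
example_kernel_tcp_socket(void)
{
	struct socket *so;
	int error;

	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
	if (error)
		return (error);
	/* ... sobind(), solisten() or soconnect() would go here ... */
	return (soclose(so, MSG_DONTWAIT));
}
#endif
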
228 int
229 sobind(struct socket *so, struct mbuf *nam, struct proc *p)
230 {
231 	soassertlocked(so);
232 	return pru_bind(so, nam, p);
233 }
234 
235 int
236 solisten(struct socket *so, int backlog)
237 {
238 	int somaxconn_local = atomic_load_int(&somaxconn);
239 	int sominconn_local = atomic_load_int(&sominconn);
240 	int error;
241 
242 	switch (so->so_type) {
243 	case SOCK_STREAM:
244 	case SOCK_SEQPACKET:
245 		break;
246 	default:
247 		return (EOPNOTSUPP);
248 	}
249 
250 	soassertlocked(so);
251 
252 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
253 		return (EINVAL);
254 #ifdef SOCKET_SPLICE
255 	if (isspliced(so) || issplicedback(so))
256 		return (EOPNOTSUPP);
257 #endif /* SOCKET_SPLICE */
258 	error = pru_listen(so);
259 	if (error)
260 		return (error);
261 	if (TAILQ_FIRST(&so->so_q) == NULL)
262 		so->so_options |= SO_ACCEPTCONN;
263 	if (backlog < 0 || backlog > somaxconn_local)
264 		backlog = somaxconn_local;
265 	if (backlog < sominconn_local)
266 		backlog = sominconn_local;
267 	so->so_qlimit = backlog;
268 	return (0);
269 }
270 
271 #define SOSP_FREEING_READ	1
272 #define SOSP_FREEING_WRITE	2
273 void
274 sofree(struct socket *so, int keep_lock)
275 {
276 	int persocket = solock_persocket(so);
277 
278 	soassertlocked(so);
279 
280 	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
281 		if (!keep_lock)
282 			sounlock(so);
283 		return;
284 	}
285 	if (so->so_head) {
286 		struct socket *head = so->so_head;
287 
288 		/*
289 		 * We must not decommission a socket that's on the accept(2)
290 		 * queue.  If we do, then accept(2) may hang after select(2)
291 		 * indicated that the listening socket was ready.
292 		 */
293 		if (so->so_onq == &head->so_q) {
294 			if (!keep_lock)
295 				sounlock(so);
296 			return;
297 		}
298 
299 		if (persocket) {
300 			/*
301 			 * Concurrent close of `head' could
302 			 * abort `so' due to re-lock.
303 			 */
304 			soref(so);
305 			soref(head);
306 			sounlock(so);
307 			solock(head);
308 			solock(so);
309 
310 			if (so->so_onq != &head->so_q0) {
311 				sounlock(head);
312 				sounlock(so);
313 				sorele(head);
314 				sorele(so);
315 				return;
316 			}
317 
318 			sorele(head);
319 			sorele(so);
320 		}
321 
322 		soqremque(so, 0);
323 
324 		if (persocket)
325 			sounlock(head);
326 	}
327 
328 	switch (so->so_proto->pr_domain->dom_family) {
329 	case AF_INET:
330 	case AF_INET6:
331 		if (so->so_proto->pr_type == SOCK_STREAM)
332 			break;
333 		/* FALLTHROUGH */
334 	default:
335 		sounlock(so);
336 		refcnt_finalize(&so->so_refcnt, "sofinal");
337 		solock(so);
338 		break;
339 	}
340 
341 	sigio_free(&so->so_sigio);
342 	klist_free(&so->so_rcv.sb_klist);
343 	klist_free(&so->so_snd.sb_klist);
344 
345 	mtx_enter(&so->so_snd.sb_mtx);
346 	sbrelease(so, &so->so_snd);
347 	mtx_leave(&so->so_snd.sb_mtx);
348 
349 	/*
350 	 * Unlocked dispose and cleanup are safe.  The socket is unlinked
351 	 * from everywhere; even a concurrent sotask() thread will not
352 	 * call somove().
353 	 */
354 	if (so->so_proto->pr_flags & PR_RIGHTS &&
355 	    so->so_proto->pr_domain->dom_dispose)
356 		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
357 	m_purge(so->so_rcv.sb_mb);
358 
359 	if (!keep_lock)
360 		sounlock(so);
361 
362 #ifdef SOCKET_SPLICE
363 	if (so->so_sp) {
364 		/* Reuse splice idle timeout, sounsplice() was called before. */
365 		timeout_set_flags(&so->so_sp->ssp_idleto, soreaper, so,
366 		    KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE);
367 		timeout_add(&so->so_sp->ssp_idleto, 0);
368 	} else
369 #endif /* SOCKET_SPLICE */
370 	{
371 		pool_put(&socket_pool, so);
372 	}
373 }
374 
375 static inline uint64_t
376 solinger_nsec(struct socket *so)
377 {
378 	if (so->so_linger == 0)
379 		return INFSLP;
380 
381 	return SEC_TO_NSEC(so->so_linger);
382 }
383 
384 /*
385  * Close a socket on last file table reference removal.
386  * Initiate disconnect if connected.
387  * Free socket when disconnect complete.
388  */
389 int
390 soclose(struct socket *so, int flags)
391 {
392 	struct socket *so2;
393 	int error = 0;
394 
395 	solock(so);
396 	/* Revoke async IO early. There is a final revocation in sofree(). */
397 	sigio_free(&so->so_sigio);
398 	if (so->so_state & SS_ISCONNECTED) {
399 		if (so->so_pcb == NULL)
400 			goto discard;
401 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
402 			error = sodisconnect(so);
403 			if (error)
404 				goto drop;
405 		}
406 		if (so->so_options & SO_LINGER) {
407 			if ((so->so_state & SS_ISDISCONNECTING) &&
408 			    (flags & MSG_DONTWAIT))
409 				goto drop;
410 			while (so->so_state & SS_ISCONNECTED) {
411 				error = sosleep_nsec(so, &so->so_timeo,
412 				    PSOCK | PCATCH, "netcls",
413 				    solinger_nsec(so));
414 				if (error)
415 					break;
416 			}
417 		}
418 	}
419 drop:
420 	if (so->so_pcb) {
421 		int error2;
422 		error2 = pru_detach(so);
423 		if (error == 0)
424 			error = error2;
425 	}
426 	if (so->so_options & SO_ACCEPTCONN) {
427 		int persocket = solock_persocket(so);
428 
429 		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
430 			if (persocket)
431 				solock(so2);
432 			(void) soqremque(so2, 0);
433 			if (persocket)
434 				sounlock(so);
435 			soabort(so2);
436 			if (persocket)
437 				solock(so);
438 		}
439 		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
440 			if (persocket)
441 				solock(so2);
442 			(void) soqremque(so2, 1);
443 			if (persocket)
444 				sounlock(so);
445 			soabort(so2);
446 			if (persocket)
447 				solock(so);
448 		}
449 	}
450 discard:
451 	if (so->so_state & SS_NOFDREF)
452 		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
453 	so->so_state |= SS_NOFDREF;
454 
455 #ifdef SOCKET_SPLICE
456 	if (so->so_sp) {
457 		struct socket *soback;
458 
459 		if (so->so_proto->pr_flags & PR_WANTRCVD) {
460 			/*
461 			 * This duplicates the code below, but we can't
462 			 * relock and sleep in sofree() in the tcp(4) case.
463 			 * That's why tcp(4) still relies on solock() for
464 			 * splicing and unsplicing.
465 			 */
466 
467 			if (issplicedback(so)) {
468 				int freeing = SOSP_FREEING_WRITE;
469 
470 				if (so->so_sp->ssp_soback == so)
471 					freeing |= SOSP_FREEING_READ;
472 				sounsplice(so->so_sp->ssp_soback, so, freeing);
473 			}
474 			if (isspliced(so)) {
475 				int freeing = SOSP_FREEING_READ;
476 
477 				if (so == so->so_sp->ssp_socket)
478 					freeing |= SOSP_FREEING_WRITE;
479 				sounsplice(so, so->so_sp->ssp_socket, freeing);
480 			}
481 			goto free;
482 		}
483 
484 		sounlock(so);
485 		mtx_enter(&so->so_snd.sb_mtx);
486 		/*
487 		 * Concurrent sounsplice() locks `sb_mtx' mutexes on
488 		 * both `so_snd' and `so_rcv' before unsplice sockets.
489 		 * both `so_snd' and `so_rcv' before unsplicing sockets.
490 		if ((soback = so->so_sp->ssp_soback) == NULL) {
491 			mtx_leave(&so->so_snd.sb_mtx);
492 			goto notsplicedback;
493 		}
494 		soref(soback);
495 		mtx_leave(&so->so_snd.sb_mtx);
496 
497 		/*
498 		 * `so' can only be unspliced, and never spliced again.
499 		 * Thus if the issplicedback(so) check is positive, the
500 		 * socket is still spliced and `ssp_soback' points to the
501 		 * same socket as `soback'.
502 		 */
503 		sblock(&soback->so_rcv, SBL_WAIT | SBL_NOINTR);
504 		if (issplicedback(so)) {
505 			int freeing = SOSP_FREEING_WRITE;
506 
507 			if (so->so_sp->ssp_soback == so)
508 				freeing |= SOSP_FREEING_READ;
509 			solock(soback);
510 			sounsplice(so->so_sp->ssp_soback, so, freeing);
511 			sounlock(soback);
512 		}
513 		sbunlock(&soback->so_rcv);
514 		sorele(soback);
515 
516 notsplicedback:
517 		sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
518 		if (isspliced(so)) {
519 			int freeing = SOSP_FREEING_READ;
520 
521 			if (so == so->so_sp->ssp_socket)
522 				freeing |= SOSP_FREEING_WRITE;
523 			solock(so);
524 			sounsplice(so, so->so_sp->ssp_socket, freeing);
525 			sounlock(so);
526 		}
527 		sbunlock(&so->so_rcv);
528 
529 		solock(so);
530 	}
531 free:
532 #endif /* SOCKET_SPLICE */
533 	/* sofree() calls sounlock(). */
534 	sofree(so, 0);
535 	return (error);
536 }
537 
538 void
539 soabort(struct socket *so)
540 {
541 	soassertlocked(so);
542 	pru_abort(so);
543 }
544 
545 int
546 soaccept(struct socket *so, struct mbuf *nam)
547 {
548 	int error = 0;
549 
550 	soassertlocked(so);
551 
552 	if ((so->so_state & SS_NOFDREF) == 0)
553 		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
554 	so->so_state &= ~SS_NOFDREF;
555 	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
556 	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
557 		error = pru_accept(so, nam);
558 	else
559 		error = ECONNABORTED;
560 	return (error);
561 }
562 
563 int
564 soconnect(struct socket *so, struct mbuf *nam)
565 {
566 	int error;
567 
568 	soassertlocked(so);
569 
570 	if (so->so_options & SO_ACCEPTCONN)
571 		return (EOPNOTSUPP);
572 	/*
573 	 * If protocol is connection-based, can only connect once.
574 	 * Otherwise, if connected, try to disconnect first.
575 	 * This allows user to disconnect by connecting to, e.g.,
576 	 * a null address.
577 	 */
578 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
579 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
580 	    (error = sodisconnect(so))))
581 		error = EISCONN;
582 	else
583 		error = pru_connect(so, nam);
584 	return (error);
585 }
586 
587 int
588 soconnect2(struct socket *so1, struct socket *so2)
589 {
590 	int persocket, error;
591 
592 	if ((persocket = solock_persocket(so1)))
593 		solock_pair(so1, so2);
594 	else
595 		solock(so1);
596 
597 	error = pru_connect2(so1, so2);
598 
599 	if (persocket)
600 		sounlock(so2);
601 	sounlock(so1);
602 	return (error);
603 }
604 
605 int
606 sodisconnect(struct socket *so)
607 {
608 	int error;
609 
610 	soassertlocked(so);
611 
612 	if ((so->so_state & SS_ISCONNECTED) == 0)
613 		return (ENOTCONN);
614 	if (so->so_state & SS_ISDISCONNECTING)
615 		return (EALREADY);
616 	error = pru_disconnect(so);
617 	return (error);
618 }
619 
620 int m_getuio(struct mbuf **, int, long, struct uio *);
621 
622 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
623 /*
624  * Send on a socket.
625  * If send must go all at once and message is larger than
626  * send buffering, then hard error.
627  * Lock against other senders.
628  * If must go all at once and not enough room now, then
629  * inform user that this would block and do nothing.
630  * Otherwise, if nonblocking, send as much as possible.
631  * The data to be sent is described by "uio" if nonzero,
632  * otherwise by the mbuf chain "top" (which must be null
633  * if uio is not).  Data provided in mbuf chain must be small
634  * enough to send all at once.
635  *
636  * Returns nonzero on error, timeout or signal; callers
637  * must check for short counts if EINTR/ERESTART are returned.
638  * Data and control buffers are freed on return.
639  */
640 int
641 sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
642     struct mbuf *control, int flags)
643 {
644 	long space, clen = 0;
645 	size_t resid;
646 	int error;
647 	int atomic = sosendallatonce(so) || top;
648 	int dosolock = ((so->so_snd.sb_flags & SB_MTXLOCK) == 0);
649 
650 	if (uio)
651 		resid = uio->uio_resid;
652 	else
653 		resid = top->m_pkthdr.len;
654 	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
655 	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
656 		m_freem(top);
657 		m_freem(control);
658 		return (EINVAL);
659 	}
660 	if (uio && uio->uio_procp)
661 		uio->uio_procp->p_ru.ru_msgsnd++;
662 	if (control) {
663 		/*
664 		 * In theory clen should be unsigned (since control->m_len is).
665 		 * However, space must be signed, as it might be less than 0
666 		 * if we over-committed, and we must use a signed comparison
667 		 * of space and clen.
668 		 */
669 		clen = control->m_len;
670 		/* reserve extra space for AF_UNIX's internalize */
671 		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
672 		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
673 		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
674 			clen = CMSG_SPACE(
675 			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
676 			    (sizeof(struct fdpass) / sizeof(int)));
677 	}
678 
679 #define	snderr(errno)	{ error = errno; goto release; }
680 
681 restart:
682 	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
683 		goto out;
684 	if (dosolock)
685 		solock_shared(so);
686 	sb_mtx_lock(&so->so_snd);
687 	so->so_snd.sb_state |= SS_ISSENDING;
688 	do {
689 		if (so->so_snd.sb_state & SS_CANTSENDMORE)
690 			snderr(EPIPE);
691 		if ((error = READ_ONCE(so->so_error))) {
692 			so->so_error = 0;
693 			snderr(error);
694 		}
695 		if ((so->so_state & SS_ISCONNECTED) == 0) {
696 			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
697 				if (!(resid == 0 && clen != 0))
698 					snderr(ENOTCONN);
699 			} else if (addr == NULL)
700 				snderr(EDESTADDRREQ);
701 		}
702 		space = sbspace_locked(so, &so->so_snd);
703 		if (flags & MSG_OOB)
704 			space += 1024;
705 		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
706 			if (atomic && resid > so->so_snd.sb_hiwat)
707 				snderr(EMSGSIZE);
708 		} else {
709 			if (clen > so->so_snd.sb_hiwat ||
710 			    (atomic && resid > so->so_snd.sb_hiwat - clen))
711 				snderr(EMSGSIZE);
712 		}
713 		if (space < clen ||
714 		    (space - clen < resid &&
715 		    (atomic || space < so->so_snd.sb_lowat))) {
716 			if (flags & MSG_DONTWAIT)
717 				snderr(EWOULDBLOCK);
718 			sbunlock(&so->so_snd);
719 			error = sbwait(so, &so->so_snd);
720 			so->so_snd.sb_state &= ~SS_ISSENDING;
721 			sb_mtx_unlock(&so->so_snd);
722 			if (dosolock)
723 				sounlock_shared(so);
724 			if (error)
725 				goto out;
726 			goto restart;
727 		}
728 		space -= clen;
729 		do {
730 			if (uio == NULL) {
731 				/*
732 				 * Data is prepackaged in "top".
733 				 */
734 				resid = 0;
735 				if (flags & MSG_EOR)
736 					top->m_flags |= M_EOR;
737 			} else {
738 				sb_mtx_unlock(&so->so_snd);
739 				if (dosolock)
740 					sounlock_shared(so);
741 				error = m_getuio(&top, atomic, space, uio);
742 				if (dosolock)
743 					solock_shared(so);
744 				sb_mtx_lock(&so->so_snd);
745 				if (error)
746 					goto release;
747 				space -= top->m_pkthdr.len;
748 				resid = uio->uio_resid;
749 				if (flags & MSG_EOR)
750 					top->m_flags |= M_EOR;
751 			}
752 			if (resid == 0)
753 				so->so_snd.sb_state &= ~SS_ISSENDING;
754 			if (top && so->so_options & SO_ZEROIZE)
755 				top->m_flags |= M_ZEROIZE;
756 			sb_mtx_unlock(&so->so_snd);
757 			if (!dosolock)
758 				solock_shared(so);
759 			if (flags & MSG_OOB)
760 				error = pru_sendoob(so, top, addr, control);
761 			else
762 				error = pru_send(so, top, addr, control);
763 			if (!dosolock)
764 				sounlock_shared(so);
765 			sb_mtx_lock(&so->so_snd);
766 			clen = 0;
767 			control = NULL;
768 			top = NULL;
769 			if (error)
770 				goto release;
771 		} while (resid && space > 0);
772 	} while (resid);
773 
774 release:
775 	so->so_snd.sb_state &= ~SS_ISSENDING;
776 	sb_mtx_unlock(&so->so_snd);
777 	if (dosolock)
778 		sounlock_shared(so);
779 	sbunlock(&so->so_snd);
780 out:
781 	m_freem(top);
782 	m_freem(control);
783 	return (error);
784 }
785 
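#if 0
/*
 * Illustrative sketch (not compiled): sending a kernel buffer with
 * sosend() above.  The uio setup mirrors what the file-ops layer
 * does for write(2); the function name is hypothetical.
 */
int
example_sosend_buffer(struct socket *so, void *buf, size_t len)
{
	struct iovec iov;
	struct uio uio;

	iov.iov_base = buf;
	iov.iov_len = len;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_resid = len;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_WRITE;
	uio.uio_procp = NULL;

	return (sosend(so, NULL, &uio, NULL, NULL, 0));
}
#endif
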
786 int
787 m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
788 {
789 	struct mbuf *m, *top = NULL;
790 	struct mbuf **nextp = &top;
791 	u_long len, mlen;
792 	size_t resid = uio->uio_resid;
793 	int error;
794 
795 	do {
796 		if (top == NULL) {
797 			MGETHDR(m, M_WAIT, MT_DATA);
798 			mlen = MHLEN;
799 		} else {
800 			MGET(m, M_WAIT, MT_DATA);
801 			mlen = MLEN;
802 		}
803 		/* chain mbufs together */
804 		*nextp = m;
805 		nextp = &m->m_next;
806 
807 		resid = ulmin(resid, space);
808 		if (resid >= MINCLSIZE) {
809 			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
810 			if ((m->m_flags & M_EXT) == 0)
811 				MCLGETL(m, M_NOWAIT, MCLBYTES);
812 			if ((m->m_flags & M_EXT) == 0)
813 				goto nopages;
814 			mlen = m->m_ext.ext_size;
815 			len = ulmin(mlen, resid);
816 			/*
817 			 * For datagram protocols, leave room
818 			 * for protocol headers in first mbuf.
819 			 */
820 			if (atomic && m == top && len < mlen - max_hdr)
821 				m->m_data += max_hdr;
822 		} else {
823 nopages:
824 			len = ulmin(mlen, resid);
825 			/*
826 			 * For datagram protocols, leave room
827 			 * for protocol headers in first mbuf.
828 			 */
829 			if (atomic && m == top && len < mlen - max_hdr)
830 				m_align(m, len);
831 		}
832 
833 		error = uiomove(mtod(m, caddr_t), len, uio);
834 		if (error) {
835 			m_freem(top);
836 			return (error);
837 		}
838 
839 		/* adjust counters */
840 		resid = uio->uio_resid;
841 		space -= len;
842 		m->m_len = len;
843 		top->m_pkthdr.len += len;
844 
845 		/* Is there more space and more data? */
846 	} while (space > 0 && resid > 0);
847 
848 	*mp = top;
849 	return 0;
850 }
851 
852 /*
853  * Following replacement or removal of the first mbuf on the first
854  * mbuf chain of a socket buffer, push necessary state changes back
855  * into the socket buffer so that other consumers see the values
856  * consistently.  'nextrecord' is the caller's locally stored value of
857  * the original value of sb->sb_mb->m_nextpkt which must be restored
858  * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
859  */
860 void
861 sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
862 {
863 
864 	/*
865 	 * First, update for the new value of nextrecord.  If necessary,
866 	 * make it the first record.
867 	 */
868 	if (sb->sb_mb != NULL)
869 		sb->sb_mb->m_nextpkt = nextrecord;
870 	else
871 		sb->sb_mb = nextrecord;
872 
873 	/*
874 	 * Now update any dependent socket buffer fields to reflect
875 	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
876 	 * the addition of a second clause that takes care of the
877 	 * case where sb_mb has been updated, but remains the last
878 	 * record.
879 	 */
880 	if (sb->sb_mb == NULL) {
881 		sb->sb_mbtail = NULL;
882 		sb->sb_lastrecord = NULL;
883 	} else if (sb->sb_mb->m_nextpkt == NULL)
884 		sb->sb_lastrecord = sb->sb_mb;
885 }
886 
887 /*
888  * Implement receive operations on a socket.
889  * We depend on the way that records are added to the sockbuf
890  * by sbappend*.  In particular, each record (mbufs linked through m_next)
891  * must begin with an address if the protocol so specifies,
892  * followed by an optional mbuf or mbufs containing ancillary data,
893  * and then zero or more mbufs of data.
894  * In order to avoid blocking the network stack for the entire time here,
895  * we release the solock() while doing the actual copy to user space.
896  * Although the sockbuf is locked, new data may still be appended,
897  * and thus we must maintain consistency of the sockbuf during that time.
898  *
899  * The caller may receive the data as a single mbuf chain by supplying
900  * an mbuf **mp0 for use in returning the chain.  The uio is then used
901  * only for the count in uio_resid.
902  */
903 int
904 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
905     struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
906     socklen_t controllen)
907 {
908 	struct mbuf *m, **mp;
909 	struct mbuf *cm;
910 	u_long len, offset, moff;
911 	int flags, error, error2, type, uio_error = 0;
912 	const struct protosw *pr = so->so_proto;
913 	struct mbuf *nextrecord;
914 	size_t resid, orig_resid = uio->uio_resid;
915 	int dosolock = ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0);
916 
917 	mp = mp0;
918 	if (paddr)
919 		*paddr = NULL;
920 	if (controlp)
921 		*controlp = NULL;
922 	if (flagsp)
923 		flags = *flagsp &~ MSG_EOR;
924 	else
925 		flags = 0;
926 	if (flags & MSG_OOB) {
927 		m = m_get(M_WAIT, MT_DATA);
928 		solock(so);
929 		error = pru_rcvoob(so, m, flags & MSG_PEEK);
930 		sounlock(so);
931 		if (error)
932 			goto bad;
933 		do {
934 			error = uiomove(mtod(m, caddr_t),
935 			    ulmin(uio->uio_resid, m->m_len), uio);
936 			m = m_free(m);
937 		} while (uio->uio_resid && error == 0 && m);
938 bad:
939 		m_freem(m);
940 		return (error);
941 	}
942 	if (mp)
943 		*mp = NULL;
944 
945 restart:
946 	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
947 		return (error);
948 	if (dosolock)
949 		solock_shared(so);
950 	sb_mtx_lock(&so->so_rcv);
951 
952 	m = so->so_rcv.sb_mb;
953 #ifdef SOCKET_SPLICE
954 	if (isspliced(so))
955 		m = NULL;
956 #endif /* SOCKET_SPLICE */
957 	/*
958 	 * If we have less data than requested, block awaiting more
959 	 * (subject to any timeout) if:
960 	 *   1. the current count is less than the low water mark,
961 	 *   2. MSG_WAITALL is set, and it is possible to do the entire
962 	 *	receive operation at once if we block (resid <= hiwat), or
963 	 *   3. MSG_DONTWAIT is not set.
964 	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
965 	 * we have to do the receive in sections, and thus risk returning
966 	 * a short count if a timeout or signal occurs after we start.
967 	 */
968 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
969 	    so->so_rcv.sb_cc < uio->uio_resid) &&
970 	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
971 	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
972 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
973 #ifdef DIAGNOSTIC
974 		if (m == NULL && so->so_rcv.sb_cc)
975 #ifdef SOCKET_SPLICE
976 		    if (!isspliced(so))
977 #endif /* SOCKET_SPLICE */
978 			panic("receive 1: so %p, so_type %d, sb_cc %lu",
979 			    so, so->so_type, so->so_rcv.sb_cc);
980 #endif
981 		if ((error2 = READ_ONCE(so->so_error))) {
982 			if (m)
983 				goto dontblock;
984 			error = error2;
985 			if ((flags & MSG_PEEK) == 0)
986 				so->so_error = 0;
987 			goto release;
988 		}
989 		if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
990 			if (m)
991 				goto dontblock;
992 			else if (so->so_rcv.sb_cc == 0)
993 				goto release;
994 		}
995 		for (; m; m = m->m_next)
996 			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
997 				m = so->so_rcv.sb_mb;
998 				goto dontblock;
999 			}
1000 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1001 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1002 			error = ENOTCONN;
1003 			goto release;
1004 		}
1005 		if (uio->uio_resid == 0 && controlp == NULL)
1006 			goto release;
1007 		if (flags & MSG_DONTWAIT) {
1008 			error = EWOULDBLOCK;
1009 			goto release;
1010 		}
1011 		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
1012 		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
1013 
1014 		sbunlock(&so->so_rcv);
1015 		error = sbwait(so, &so->so_rcv);
1016 		sb_mtx_unlock(&so->so_rcv);
1017 		if (dosolock)
1018 			sounlock_shared(so);
1019 		if (error)
1020 			return (error);
1021 		goto restart;
1022 	}
1023 dontblock:
1024 	/*
1025 	 * On entry here, m points to the first record of the socket buffer.
1026 	 * From this point onward, we maintain 'nextrecord' as a cache of the
1027 	 * pointer to the next record in the socket buffer.  We must keep the
1028 	 * various socket buffer pointers and local stack versions of the
1029 	 * pointers in sync, pushing out modifications before operations that
1030 	 * may sleep, and re-reading them afterwards.
1031 	 *
1032 	 * Otherwise, we will race with the network stack appending new data
1033 	 * or records onto the socket buffer by using inconsistent/stale
1034 	 * versions of the field, possibly resulting in socket buffer
1035 	 * corruption.
1036 	 */
1037 	if (uio->uio_procp)
1038 		uio->uio_procp->p_ru.ru_msgrcv++;
1039 	KASSERT(m == so->so_rcv.sb_mb);
1040 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
1041 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
1042 	nextrecord = m->m_nextpkt;
1043 	if (pr->pr_flags & PR_ADDR) {
1044 #ifdef DIAGNOSTIC
1045 		if (m->m_type != MT_SONAME)
1046 			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
1047 			    so, so->so_type, m, m->m_type);
1048 #endif
1049 		orig_resid = 0;
1050 		if (flags & MSG_PEEK) {
1051 			if (paddr)
1052 				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
1053 			m = m->m_next;
1054 		} else {
1055 			sbfree(so, &so->so_rcv, m);
1056 			if (paddr) {
1057 				*paddr = m;
1058 				so->so_rcv.sb_mb = m->m_next;
1059 				m->m_next = NULL;
1060 				m = so->so_rcv.sb_mb;
1061 			} else {
1062 				so->so_rcv.sb_mb = m_free(m);
1063 				m = so->so_rcv.sb_mb;
1064 			}
1065 			sbsync(&so->so_rcv, nextrecord);
1066 		}
1067 	}
1068 	while (m && m->m_type == MT_CONTROL && error == 0) {
1069 		int skip = 0;
1070 		if (flags & MSG_PEEK) {
1071 			if (mtod(m, struct cmsghdr *)->cmsg_type ==
1072 			    SCM_RIGHTS) {
1073 				/* don't leak internalized SCM_RIGHTS msgs */
1074 				skip = 1;
1075 			} else if (controlp)
1076 				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
1077 			m = m->m_next;
1078 		} else {
1079 			sbfree(so, &so->so_rcv, m);
1080 			so->so_rcv.sb_mb = m->m_next;
1081 			m->m_nextpkt = m->m_next = NULL;
1082 			cm = m;
1083 			m = so->so_rcv.sb_mb;
1084 			sbsync(&so->so_rcv, nextrecord);
1085 			if (controlp) {
1086 				if (pr->pr_domain->dom_externalize) {
1087 					sb_mtx_unlock(&so->so_rcv);
1088 					if (dosolock)
1089 						sounlock_shared(so);
1090 					error =
1091 					    (*pr->pr_domain->dom_externalize)
1092 					    (cm, controllen, flags);
1093 					if (dosolock)
1094 						solock_shared(so);
1095 					sb_mtx_lock(&so->so_rcv);
1096 				}
1097 				*controlp = cm;
1098 			} else {
1099 				/*
1100 				 * Dispose of any SCM_RIGHTS message that went
1101 				 * through the read path rather than recv.
1102 				 */
1103 				if (pr->pr_domain->dom_dispose) {
1104 					sb_mtx_unlock(&so->so_rcv);
1105 					pr->pr_domain->dom_dispose(cm);
1106 					sb_mtx_lock(&so->so_rcv);
1107 				}
1108 				m_free(cm);
1109 			}
1110 		}
1111 		if (m != NULL)
1112 			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1113 		else
1114 			nextrecord = so->so_rcv.sb_mb;
1115 		if (controlp && !skip)
1116 			controlp = &(*controlp)->m_next;
1117 		orig_resid = 0;
1118 	}
1119 
1120 	/* If m is non-NULL, we have some data to read. */
1121 	if (m) {
1122 		type = m->m_type;
1123 		if (type == MT_OOBDATA)
1124 			flags |= MSG_OOB;
1125 		if (m->m_flags & M_BCAST)
1126 			flags |= MSG_BCAST;
1127 		if (m->m_flags & M_MCAST)
1128 			flags |= MSG_MCAST;
1129 	}
1130 	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
1131 	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
1132 
1133 	moff = 0;
1134 	offset = 0;
1135 	while (m && uio->uio_resid > 0 && error == 0) {
1136 		if (m->m_type == MT_OOBDATA) {
1137 			if (type != MT_OOBDATA)
1138 				break;
1139 		} else if (type == MT_OOBDATA) {
1140 			break;
1141 		} else if (m->m_type == MT_CONTROL) {
1142 			/*
1143 			 * If there is more than one control message in the
1144 			 * stream, we do a short read.  The next one can be
1145 			 * received or disposed of by another system call.
1146 			 */
1147 			break;
1148 #ifdef DIAGNOSTIC
1149 		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
1150 			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
1151 			    so, so->so_type, m, m->m_type);
1152 #endif
1153 		}
1154 		so->so_rcv.sb_state &= ~SS_RCVATMARK;
1155 		len = uio->uio_resid;
1156 		if (so->so_oobmark && len > so->so_oobmark - offset)
1157 			len = so->so_oobmark - offset;
1158 		if (len > m->m_len - moff)
1159 			len = m->m_len - moff;
1160 		/*
1161 		 * If mp is set, just pass back the mbufs.
1162 		 * Otherwise copy them out via the uio, then free.
1163 		 * The sockbuf must be consistent here (sb_mb points to the
1164 		 * current mbuf, m_nextpkt to the next record) when we drop priority;
1165 		 * we must note any additions to the sockbuf when we
1166 		 * block interrupts again.
1167 		 */
1168 		if (mp == NULL && uio_error == 0) {
1169 			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
1170 			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
1171 			resid = uio->uio_resid;
1172 			sb_mtx_unlock(&so->so_rcv);
1173 			if (dosolock)
1174 				sounlock_shared(so);
1175 			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
1176 			if (dosolock)
1177 				solock_shared(so);
1178 			sb_mtx_lock(&so->so_rcv);
1179 			if (uio_error)
1180 				uio->uio_resid = resid - len;
1181 		} else
1182 			uio->uio_resid -= len;
1183 		if (len == m->m_len - moff) {
1184 			if (m->m_flags & M_EOR)
1185 				flags |= MSG_EOR;
1186 			if (flags & MSG_PEEK) {
1187 				m = m->m_next;
1188 				moff = 0;
1189 				orig_resid = 0;
1190 			} else {
1191 				nextrecord = m->m_nextpkt;
1192 				sbfree(so, &so->so_rcv, m);
1193 				if (mp) {
1194 					*mp = m;
1195 					mp = &m->m_next;
1196 					so->so_rcv.sb_mb = m = m->m_next;
1197 					*mp = NULL;
1198 				} else {
1199 					so->so_rcv.sb_mb = m_free(m);
1200 					m = so->so_rcv.sb_mb;
1201 				}
1202 				/*
1203 				 * If m != NULL, we also know that
1204 				 * so->so_rcv.sb_mb != NULL.
1205 				 */
1206 				KASSERT(so->so_rcv.sb_mb == m);
1207 				if (m) {
1208 					m->m_nextpkt = nextrecord;
1209 					if (nextrecord == NULL)
1210 						so->so_rcv.sb_lastrecord = m;
1211 				} else {
1212 					so->so_rcv.sb_mb = nextrecord;
1213 					SB_EMPTY_FIXUP(&so->so_rcv);
1214 				}
1215 				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
1216 				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
1217 			}
1218 		} else {
1219 			if (flags & MSG_PEEK) {
1220 				moff += len;
1221 				orig_resid = 0;
1222 			} else {
1223 				if (mp)
1224 					*mp = m_copym(m, 0, len, M_WAIT);
1225 				m->m_data += len;
1226 				m->m_len -= len;
1227 				so->so_rcv.sb_cc -= len;
1228 				so->so_rcv.sb_datacc -= len;
1229 			}
1230 		}
1231 		if (so->so_oobmark) {
1232 			if ((flags & MSG_PEEK) == 0) {
1233 				so->so_oobmark -= len;
1234 				if (so->so_oobmark == 0) {
1235 					so->so_rcv.sb_state |= SS_RCVATMARK;
1236 					break;
1237 				}
1238 			} else {
1239 				offset += len;
1240 				if (offset == so->so_oobmark)
1241 					break;
1242 			}
1243 		}
1244 		if (flags & MSG_EOR)
1245 			break;
1246 		/*
1247 		 * If the MSG_WAITALL flag is set (for non-atomic socket),
1248 		 * we must not quit until "uio->uio_resid == 0" or an error
1249 		 * termination.  If a signal/timeout occurs, return
1250 		 * with a short count but without error.
1251 		 * Keep sockbuf locked against other readers.
1252 		 */
1253 		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1254 		    !sosendallatonce(so) && !nextrecord) {
1255 			if (so->so_rcv.sb_state & SS_CANTRCVMORE ||
1256 			    so->so_error)
1257 				break;
1258 			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
1259 			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
1260 			if (sbwait(so, &so->so_rcv)) {
1261 				sb_mtx_unlock(&so->so_rcv);
1262 				if (dosolock)
1263 					sounlock_shared(so);
1264 				sbunlock(&so->so_rcv);
1265 				return (0);
1266 			}
1267 			if ((m = so->so_rcv.sb_mb) != NULL)
1268 				nextrecord = m->m_nextpkt;
1269 		}
1270 	}
1271 
1272 	if (m && pr->pr_flags & PR_ATOMIC) {
1273 		flags |= MSG_TRUNC;
1274 		if ((flags & MSG_PEEK) == 0)
1275 			(void) sbdroprecord(so, &so->so_rcv);
1276 	}
1277 	if ((flags & MSG_PEEK) == 0) {
1278 		if (m == NULL) {
1279 			/*
1280 			 * First part is an inline SB_EMPTY_FIXUP().  Second
1281 			 * part makes sure sb_lastrecord is up-to-date if
1282 			 * there is still data in the socket buffer.
1283 			 */
1284 			so->so_rcv.sb_mb = nextrecord;
1285 			if (so->so_rcv.sb_mb == NULL) {
1286 				so->so_rcv.sb_mbtail = NULL;
1287 				so->so_rcv.sb_lastrecord = NULL;
1288 			} else if (nextrecord->m_nextpkt == NULL)
1289 				so->so_rcv.sb_lastrecord = nextrecord;
1290 		}
1291 		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
1292 		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
1293 		if (pr->pr_flags & PR_WANTRCVD) {
1294 			sb_mtx_unlock(&so->so_rcv);
1295 			if (!dosolock)
1296 				solock_shared(so);
1297 			pru_rcvd(so);
1298 			if (!dosolock)
1299 				sounlock_shared(so);
1300 			sb_mtx_lock(&so->so_rcv);
1301 		}
1302 	}
1303 	if (orig_resid == uio->uio_resid && orig_resid &&
1304 	    (flags & MSG_EOR) == 0 &&
1305 	    (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
1306 		sb_mtx_unlock(&so->so_rcv);
1307 		sbunlock(&so->so_rcv);
1308 		goto restart;
1309 	}
1310 
1311 	if (uio_error)
1312 		error = uio_error;
1313 
1314 	if (flagsp)
1315 		*flagsp |= flags;
1316 release:
1317 	sb_mtx_unlock(&so->so_rcv);
1318 	if (dosolock)
1319 		sounlock_shared(so);
1320 	sbunlock(&so->so_rcv);
1321 	return (error);
1322 }
1323 
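#if 0
/*
 * Illustrative sketch (not compiled): receiving into a kernel buffer
 * with soreceive() above.  On success, *donep tells how many bytes
 * were actually copied; the function name is hypothetical.
 */
int
example_soreceive_buffer(struct socket *so, void *buf, size_t len,
    size_t *donep)
{
	struct iovec iov;
	struct uio uio;
	int error;

	iov.iov_base = buf;
	iov.iov_len = len;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_resid = len;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_procp = NULL;

	error = soreceive(so, NULL, &uio, NULL, NULL, NULL, 0);
	if (error == 0)
		*donep = len - uio.uio_resid;
	return (error);
}
#endif
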
1324 int
1325 soshutdown(struct socket *so, int how)
1326 {
1327 	int error = 0;
1328 
1329 	switch (how) {
1330 	case SHUT_RD:
1331 		sorflush(so);
1332 		break;
1333 	case SHUT_RDWR:
1334 		sorflush(so);
1335 		/* FALLTHROUGH */
1336 	case SHUT_WR:
1337 		solock(so);
1338 		error = pru_shutdown(so);
1339 		sounlock(so);
1340 		break;
1341 	default:
1342 		error = EINVAL;
1343 		break;
1344 	}
1345 
1346 	return (error);
1347 }
1348 
1349 void
1350 sorflush(struct socket *so)
1351 {
1352 	struct sockbuf *sb = &so->so_rcv;
1353 	struct mbuf *m;
1354 	const struct protosw *pr = so->so_proto;
1355 	int error;
1356 
1357 	error = sblock(sb, SBL_WAIT | SBL_NOINTR);
1358 	/* with SBL_WAIT and SBL_NOINTR sblock() must not fail */
1359 	KASSERT(error == 0);
1360 
1361 	solock_shared(so);
1362 	socantrcvmore(so);
1363 	mtx_enter(&sb->sb_mtx);
1364 	m = sb->sb_mb;
1365 	memset(&sb->sb_startzero, 0,
1366 	     (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
1367 	sb->sb_timeo_nsecs = INFSLP;
1368 	mtx_leave(&sb->sb_mtx);
1369 	sounlock_shared(so);
1370 	sbunlock(sb);
1371 
1372 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
1373 		(*pr->pr_domain->dom_dispose)(m);
1374 	m_purge(m);
1375 }
1376 
1377 #ifdef SOCKET_SPLICE
1378 
1379 #define so_splicelen	so_sp->ssp_len
1380 #define so_splicemax	so_sp->ssp_max
1381 #define so_idletv	so_sp->ssp_idletv
1382 #define so_idleto	so_sp->ssp_idleto
1383 #define so_splicetask	so_sp->ssp_task
1384 
1385 int
1386 sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
1387 {
1388 	struct file	*fp;
1389 	struct socket	*sosp;
1390 	struct taskq	*tq;
1391 	int		 error = 0;
1392 
1393 	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
1394 		return (EPROTONOSUPPORT);
1395 	if (max && max < 0)
1396 		return (EINVAL);
1397 	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
1398 		return (EINVAL);
1399 
1400 	/* If no fd is given, unsplice by removing existing link. */
1401 	if (fd < 0) {
1402 		if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
1403 			return (error);
1404 		solock(so);
1405 		if (so->so_options & SO_ACCEPTCONN) {
1406 			error = EOPNOTSUPP;
1407 			goto out;
1408 		}
1409 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1410 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1411 			error = ENOTCONN;
1412 			goto out;
1413 		}
1414 
1415 		if (so->so_sp && so->so_sp->ssp_socket)
1416 			sounsplice(so, so->so_sp->ssp_socket, 0);
1417  out:
1418 		sounlock(so);
1419 		sbunlock(&so->so_rcv);
1420 		return (error);
1421 	}
1422 
1423 	if (sosplice_taskq == NULL) {
1424 		rw_enter_write(&sosplice_lock);
1425 		if (sosplice_taskq == NULL) {
1426 			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
1427 			    TASKQ_MPSAFE);
1428 			if (tq == NULL) {
1429 				rw_exit_write(&sosplice_lock);
1430 				return (ENOMEM);
1431 			}
1432 			/* Ensure the taskq is fully visible to other CPUs. */
1433 			membar_producer();
1434 			sosplice_taskq = tq;
1435 		}
1436 		rw_exit_write(&sosplice_lock);
1437 	} else {
1438 		/* Ensure the taskq is fully visible on this CPU. */
1439 		membar_consumer();
1440 	}
1441 
1442 	/* Find sosp, the drain socket into which data will be spliced. */
1443 	if ((error = getsock(curproc, fd, &fp)) != 0)
1444 		return (error);
1445 	sosp = fp->f_data;
1446 
1447 	if (sosp->so_proto->pr_usrreqs->pru_send !=
1448 	    so->so_proto->pr_usrreqs->pru_send) {
1449 		error = EPROTONOSUPPORT;
1450 		goto frele;
1451 	}
1452 
1453 	if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
1454 		goto frele;
1455 	if ((error = sblock(&sosp->so_snd, SBL_WAIT)) != 0) {
1456 		sbunlock(&so->so_rcv);
1457 		goto frele;
1458 	}
1459 	solock(so);
1460 
1461 	if ((so->so_options & SO_ACCEPTCONN) ||
1462 	    (sosp->so_options & SO_ACCEPTCONN)) {
1463 		error = EOPNOTSUPP;
1464 		goto release;
1465 	}
1466 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1467 	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1468 		error = ENOTCONN;
1469 		goto release;
1470 	}
1471 	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
1472 		error = ENOTCONN;
1473 		goto release;
1474 	}
1475 	if (so->so_sp == NULL)
1476 		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
1477 	if (sosp->so_sp == NULL)
1478 		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
1479 	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
1480 		error = EBUSY;
1481 		goto release;
1482 	}
1483 
1484 	so->so_splicelen = 0;
1485 	so->so_splicemax = max;
1486 	if (tv)
1487 		so->so_idletv = *tv;
1488 	else
1489 		timerclear(&so->so_idletv);
1490 	timeout_set_flags(&so->so_idleto, soidle, so,
1491 	    KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE);
1492 	task_set(&so->so_splicetask, sotask, so);
1493 
1494 	/*
1495 	 * To prevent sorwakeup() from calling somove() before this somove()
1496 	 * has finished, the socket buffers are not marked as spliced yet.
1497 	 */
1498 
1499 	/* Splice so and sosp together. */
1500 	mtx_enter(&so->so_rcv.sb_mtx);
1501 	mtx_enter(&sosp->so_snd.sb_mtx);
1502 	so->so_sp->ssp_socket = sosp;
1503 	sosp->so_sp->ssp_soback = so;
1504 	mtx_leave(&sosp->so_snd.sb_mtx);
1505 	mtx_leave(&so->so_rcv.sb_mtx);
1506 
1507 	if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
1508 		sounlock(so);
1509 	if (somove(so, M_WAIT)) {
1510 		mtx_enter(&so->so_rcv.sb_mtx);
1511 		mtx_enter(&sosp->so_snd.sb_mtx);
1512 		so->so_rcv.sb_flags |= SB_SPLICE;
1513 		sosp->so_snd.sb_flags |= SB_SPLICE;
1514 		mtx_leave(&sosp->so_snd.sb_mtx);
1515 		mtx_leave(&so->so_rcv.sb_mtx);
1516 	}
1517 	if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
1518 		solock(so);
1519 
1520  release:
1521 	sounlock(so);
1522 	sbunlock(&sosp->so_snd);
1523 	sbunlock(&so->so_rcv);
1524  frele:
1525 	FRELE(fp, curproc);
1526 
1527 	return (error);
1528 }
1529 
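#if 0
/*
 * Illustrative sketch (userland, not part of this file): the
 * setsockopt(2) interface that ends up in sosplice() above.  Data
 * arriving on fd_in is spliced into fd_out until max bytes have been
 * moved (0 means no limit); splicing to fd -1 would unsplice.  The
 * function name is hypothetical.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <string.h>

int
example_so_splice(int fd_in, int fd_out, off_t max)
{
	struct splice sp;

	memset(&sp, 0, sizeof(sp));
	sp.sp_fd = fd_out;
	sp.sp_max = max;
	timerclear(&sp.sp_idle);	/* no idle timeout */
	return (setsockopt(fd_in, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp)));
}
#endif
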
1530 void
1531 sounsplice(struct socket *so, struct socket *sosp, int freeing)
1532 {
1533 	if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
1534 		sbassertlocked(&so->so_rcv);
1535 	soassertlocked(so);
1536 
1537 	task_del(sosplice_taskq, &so->so_splicetask);
1538 	timeout_del(&so->so_idleto);
1539 
1540 	mtx_enter(&so->so_rcv.sb_mtx);
1541 	mtx_enter(&sosp->so_snd.sb_mtx);
1542 	so->so_rcv.sb_flags &= ~SB_SPLICE;
1543 	sosp->so_snd.sb_flags &= ~SB_SPLICE;
1544 	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
1545 	mtx_leave(&sosp->so_snd.sb_mtx);
1546 	mtx_leave(&so->so_rcv.sb_mtx);
1547 
1548 	/* Do not wake up a socket that is about to be freed. */
1549 	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
1550 		sorwakeup(so);
1551 	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
1552 		sowwakeup(sosp);
1553 }
1554 
1555 void
1556 soidle(void *arg)
1557 {
1558 	struct socket *so = arg;
1559 
1560 	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
1561 	solock(so);
1562 	/*
1563 	 * Depending on socket type, sblock(&so->so_rcv) or solock()
1564 	 * is always held while modifying SB_SPLICE and
1565 	 * so->so_sp->ssp_socket.
1566 	 */
1567 	if (so->so_rcv.sb_flags & SB_SPLICE) {
1568 		so->so_error = ETIMEDOUT;
1569 		sounsplice(so, so->so_sp->ssp_socket, 0);
1570 	}
1571 	sounlock(so);
1572 	sbunlock(&so->so_rcv);
1573 }
1574 
1575 void
1576 sotask(void *arg)
1577 {
1578 	struct socket *so = arg;
1579 	int doyield = 0;
1580 	int sockstream = (so->so_proto->pr_flags & PR_WANTRCVD);
1581 
1582 	/*
1583 	 * sblock() on `so_rcv' protects sockets from being unspliced
1584 	 * in the UDP case.  TCP sockets still rely on solock().
1585 	 */
1586 
1587 	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
1588 	if (sockstream)
1589 		solock(so);
1590 
1591 	if (so->so_rcv.sb_flags & SB_SPLICE) {
1592 		if (sockstream)
1593 			doyield = 1;
1594 		somove(so, M_DONTWAIT);
1595 	}
1596 
1597 	if (sockstream)
1598 		sounlock(so);
1599 	sbunlock(&so->so_rcv);
1600 
1601 	if (doyield) {
1602 		/* Avoid userland starvation. */
1603 		yield();
1604 	}
1605 }
1606 
1607 /*
1608  * The socket splicing task or idle timeout may sleep while grabbing the net
1609  * lock.  As sofree() can be called anytime, sotask() or soidle() could access
1610  * the socket memory of a freed socket after wakeup.  So delay the pool_put()
1611  * until all pending socket splicing tasks or timeouts have finished.  Do this
1612  * by scheduling it on the same threads.
1613  */
1614 void
1615 soreaper(void *arg)
1616 {
1617 	struct socket *so = arg;
1618 
1619 	/* Reuse splice task, sounsplice() has been called before. */
1620 	task_set(&so->so_sp->ssp_task, soput, so);
1621 	task_add(sosplice_taskq, &so->so_sp->ssp_task);
1622 }
1623 
1624 void
1625 soput(void *arg)
1626 {
1627 	struct socket *so = arg;
1628 
1629 	pool_put(&sosplice_pool, so->so_sp);
1630 	pool_put(&socket_pool, so);
1631 }
1632 
1633 /*
1634  * Move data from receive buffer of spliced source socket to send
1635  * buffer of drain socket.  Try to move as much as possible in one
1636  * big chunk.  It is a TCP only implementation.
1637  * A return value of 0 means splicing has finished, 1 means continue.
1638  */
1639 int
1640 somove(struct socket *so, int wait)
1641 {
1642 	struct socket	*sosp = so->so_sp->ssp_socket;
1643 	struct mbuf	*m, **mp, *nextrecord;
1644 	u_long		 len, off, oobmark;
1645 	long		 space;
1646 	int		 error = 0, maxreached = 0, unsplice = 0;
1647 	unsigned int	 rcvstate;
1648 	int		 sockdgram = ((so->so_proto->pr_flags &
1649 			     PR_WANTRCVD) == 0);
1650 
1651 	if (sockdgram)
1652 		sbassertlocked(&so->so_rcv);
1653 	else
1654 		soassertlocked(so);
1655 
1656 	mtx_enter(&so->so_rcv.sb_mtx);
1657 	mtx_enter(&sosp->so_snd.sb_mtx);
1658 
1659  nextpkt:
1660 	if ((error = READ_ONCE(so->so_error)))
1661 		goto release;
1662 	if (sosp->so_snd.sb_state & SS_CANTSENDMORE) {
1663 		error = EPIPE;
1664 		goto release;
1665 	}
1666 
1667 	error = READ_ONCE(sosp->so_error);
1668 	if (error) {
1669 		if (error != ETIMEDOUT && error != EFBIG && error != ELOOP)
1670 			goto release;
1671 		error = 0;
1672 	}
1673 	if ((sosp->so_state & SS_ISCONNECTED) == 0)
1674 		goto release;
1675 
1676 	/* Calculate how many bytes can be copied now. */
1677 	len = so->so_rcv.sb_datacc;
1678 	if (so->so_splicemax) {
1679 		KASSERT(so->so_splicelen < so->so_splicemax);
1680 		if (so->so_splicemax <= so->so_splicelen + len) {
1681 			len = so->so_splicemax - so->so_splicelen;
1682 			maxreached = 1;
1683 		}
1684 	}
1685 	space = sbspace_locked(sosp, &sosp->so_snd);
1686 	if (so->so_oobmark && so->so_oobmark < len &&
1687 	    so->so_oobmark < space + 1024)
1688 		space += 1024;
1689 	if (space <= 0) {
1690 		maxreached = 0;
1691 		goto release;
1692 	}
1693 	if (space < len) {
1694 		maxreached = 0;
1695 		if (space < sosp->so_snd.sb_lowat)
1696 			goto release;
1697 		len = space;
1698 	}
1699 	sosp->so_snd.sb_state |= SS_ISSENDING;
1700 
1701 	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
1702 	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
1703 	m = so->so_rcv.sb_mb;
1704 	if (m == NULL)
1705 		goto release;
1706 	nextrecord = m->m_nextpkt;
1707 
1708 	/* Drop address and control information not used with splicing. */
1709 	if (so->so_proto->pr_flags & PR_ADDR) {
1710 #ifdef DIAGNOSTIC
1711 		if (m->m_type != MT_SONAME)
1712 			panic("somove soname: so %p, so_type %d, m %p, "
1713 			    "m_type %d", so, so->so_type, m, m->m_type);
1714 #endif
1715 		m = m->m_next;
1716 	}
1717 	while (m && m->m_type == MT_CONTROL)
1718 		m = m->m_next;
1719 	if (m == NULL) {
1720 		sbdroprecord(so, &so->so_rcv);
1721 		if (so->so_proto->pr_flags & PR_WANTRCVD) {
1722 			mtx_leave(&sosp->so_snd.sb_mtx);
1723 			mtx_leave(&so->so_rcv.sb_mtx);
1724 			pru_rcvd(so);
1725 			mtx_enter(&so->so_rcv.sb_mtx);
1726 			mtx_enter(&sosp->so_snd.sb_mtx);
1727 		}
1728 		goto nextpkt;
1729 	}
1730 
1731 	/*
1732 	 * By splicing sockets connected to localhost, userland might create a
1733 	 * loop.  Dissolve the splice with an error if a loop is detected
1734 	 * by the counter.
1735 	 *
1736 	 * If we deal with a looped broadcast/multicast packet, we bail out
1737 	 * with no error to suppress splice termination.
1737 	 */
1738 	if ((m->m_flags & M_PKTHDR) &&
1739 	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
1740 	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
1741 		error = ELOOP;
1742 		goto release;
1743 	}
1744 
1745 	if (so->so_proto->pr_flags & PR_ATOMIC) {
1746 		if ((m->m_flags & M_PKTHDR) == 0)
1747 			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
1748 			    "m_type %d", so, so->so_type, m, m->m_type);
1749 		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
1750 			error = EMSGSIZE;
1751 			goto release;
1752 		}
1753 		if (len < m->m_pkthdr.len)
1754 			goto release;
1755 		if (m->m_pkthdr.len < len) {
1756 			maxreached = 0;
1757 			len = m->m_pkthdr.len;
1758 		}
1759 		/*
1760 		 * Throw away the name mbuf after it has been assured
1761 		 * that the whole first record can be processed.
1762 		 */
1763 		m = so->so_rcv.sb_mb;
1764 		sbfree(so, &so->so_rcv, m);
1765 		so->so_rcv.sb_mb = m_free(m);
1766 		sbsync(&so->so_rcv, nextrecord);
1767 	}
1768 	/*
1769 	 * Throw away the control mbufs after it has been assured
1770 	 * that the whole first record can be processed.
1771 	 */
1772 	m = so->so_rcv.sb_mb;
1773 	while (m && m->m_type == MT_CONTROL) {
1774 		sbfree(so, &so->so_rcv, m);
1775 		so->so_rcv.sb_mb = m_free(m);
1776 		m = so->so_rcv.sb_mb;
1777 		sbsync(&so->so_rcv, nextrecord);
1778 	}
1779 
1780 	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
1781 	SBLASTMBUFCHK(&so->so_rcv, "somove 2");
1782 
1783 	/* Take mbufs with at most len bytes out of the receive buffer. */
1784 	for (off = 0, mp = &m; off <= len && *mp;
1785 	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
1786 		u_long size = len - off;
1787 
1788 #ifdef DIAGNOSTIC
1789 		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
1790 			panic("somove type: so %p, so_type %d, m %p, "
1791 			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
1792 #endif
1793 		if ((*mp)->m_len > size) {
1794 			/*
1795 			 * Move only a partial mbuf at maximum splice length or
1796 			 * if the drain buffer is too small for this large mbuf.
1797 			 */
1798 			if (!maxreached && sosp->so_snd.sb_datacc > 0) {
1799 				len -= size;
1800 				break;
1801 			}
1802 			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
1803 			if (*mp == NULL) {
1804 				len -= size;
1805 				break;
1806 			}
1807 			so->so_rcv.sb_mb->m_data += size;
1808 			so->so_rcv.sb_mb->m_len -= size;
1809 			so->so_rcv.sb_cc -= size;
1810 			so->so_rcv.sb_datacc -= size;
1811 		} else {
1812 			*mp = so->so_rcv.sb_mb;
1813 			sbfree(so, &so->so_rcv, *mp);
1814 			so->so_rcv.sb_mb = (*mp)->m_next;
1815 			sbsync(&so->so_rcv, nextrecord);
1816 		}
1817 	}
1818 	*mp = NULL;
1819 
1820 	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
1821 	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
1822 	SBCHECK(so, &so->so_rcv);
1823 	if (m == NULL)
1824 		goto release;
1825 	m->m_nextpkt = NULL;
1826 	if (m->m_flags & M_PKTHDR) {
1827 		m_resethdr(m);
1828 		m->m_pkthdr.len = len;
1829 	}
1830 
1831 	/* Send window update to source peer as receive buffer has changed. */
1832 	if (so->so_proto->pr_flags & PR_WANTRCVD) {
1833 		mtx_leave(&sosp->so_snd.sb_mtx);
1834 		mtx_leave(&so->so_rcv.sb_mtx);
1835 		pru_rcvd(so);
1836 		mtx_enter(&so->so_rcv.sb_mtx);
1837 		mtx_enter(&sosp->so_snd.sb_mtx);
1838 	}
1839 
1840 	/* The receive buffer shrank by len bytes; adjust the oob mark. */
1841 	rcvstate = so->so_rcv.sb_state;
1842 	so->so_rcv.sb_state &= ~SS_RCVATMARK;
1843 	oobmark = so->so_oobmark;
1844 	so->so_oobmark = oobmark > len ? oobmark - len : 0;
1845 	if (oobmark) {
1846 		if (oobmark == len)
1847 			so->so_rcv.sb_state |= SS_RCVATMARK;
1848 		if (oobmark >= len)
1849 			oobmark = 0;
1850 	}
1851 
1852 	/*
1853 	 * Handle oob data.  If any malloc fails, ignore error.
1854 	 * TCP urgent data is not very reliable anyway.
1855 	 */
1856 	while (((rcvstate & SS_RCVATMARK) || oobmark) &&
1857 	    (so->so_options & SO_OOBINLINE)) {
1858 		struct mbuf *o = NULL;
1859 
1860 		if (rcvstate & SS_RCVATMARK) {
1861 			o = m_get(wait, MT_DATA);
1862 			rcvstate &= ~SS_RCVATMARK;
1863 		} else if (oobmark) {
1864 			o = m_split(m, oobmark, wait);
1865 			if (o) {
1866 				mtx_leave(&sosp->so_snd.sb_mtx);
1867 				mtx_leave(&so->so_rcv.sb_mtx);
1868 				error = pru_send(sosp, m, NULL, NULL);
1869 				mtx_enter(&so->so_rcv.sb_mtx);
1870 				mtx_enter(&sosp->so_snd.sb_mtx);
1871 
1872 				if (error) {
1873 					if (sosp->so_snd.sb_state &
1874 					    SS_CANTSENDMORE)
1875 						error = EPIPE;
1876 					m_freem(o);
1877 					goto release;
1878 				}
1879 				len -= oobmark;
1880 				so->so_splicelen += oobmark;
1881 				m = o;
1882 				o = m_get(wait, MT_DATA);
1883 			}
1884 			oobmark = 0;
1885 		}
1886 		if (o) {
1887 			o->m_len = 1;
1888 			*mtod(o, caddr_t) = *mtod(m, caddr_t);
1889 
1890 			mtx_leave(&sosp->so_snd.sb_mtx);
1891 			mtx_leave(&so->so_rcv.sb_mtx);
1892 			error = pru_sendoob(sosp, o, NULL, NULL);
1893 			mtx_enter(&so->so_rcv.sb_mtx);
1894 			mtx_enter(&sosp->so_snd.sb_mtx);
1895 
1896 			if (error) {
1897 				if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
1898 					error = EPIPE;
1899 				m_freem(m);
1900 				goto release;
1901 			}
1902 			len -= 1;
1903 			so->so_splicelen += 1;
1904 			if (oobmark) {
1905 				oobmark -= 1;
1906 				if (oobmark == 0)
1907 					rcvstate |= SS_RCVATMARK;
1908 			}
1909 			m_adj(m, 1);
1910 		}
1911 	}
1912 
1913 	/* Append all remaining data to drain socket. */
1914 	if (so->so_rcv.sb_cc == 0 || maxreached)
1915 		sosp->so_snd.sb_state &= ~SS_ISSENDING;
1916 
1917 	mtx_leave(&sosp->so_snd.sb_mtx);
1918 	mtx_leave(&so->so_rcv.sb_mtx);
1919 
1920 	if (sockdgram)
1921 		solock_shared(sosp);
1922 	error = pru_send(sosp, m, NULL, NULL);
1923 	if (sockdgram)
1924 		sounlock_shared(sosp);
1925 
1926 	mtx_enter(&so->so_rcv.sb_mtx);
1927 	mtx_enter(&sosp->so_snd.sb_mtx);
1928 
1929 	if (error) {
1930 		if (sosp->so_snd.sb_state & SS_CANTSENDMORE ||
1931 		    sosp->so_pcb == NULL)
1932 			error = EPIPE;
1933 		goto release;
1934 	}
1935 	so->so_splicelen += len;
1936 
1937 	/* Move several packets if possible. */
1938 	if (!maxreached && nextrecord)
1939 		goto nextpkt;
1940 
1941  release:
1942 	sosp->so_snd.sb_state &= ~SS_ISSENDING;
1943 
1944 	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
1945 		error = EFBIG;
1946 	if (error)
1947 		WRITE_ONCE(so->so_error, error);
1948 
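	/*
	 * Unsplice once no more data can flow: the source cannot
	 * receive more and is drained, the drain cannot send more,
	 * the splice maximum was reached, or an error occurred.
	 */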
1949 	if (((so->so_rcv.sb_state & SS_CANTRCVMORE) &&
1950 	    so->so_rcv.sb_cc == 0) ||
1951 	    (sosp->so_snd.sb_state & SS_CANTSENDMORE) ||
1952 	    maxreached || error)
1953 		unsplice = 1;
1954 
1955 	mtx_leave(&sosp->so_snd.sb_mtx);
1956 	mtx_leave(&so->so_rcv.sb_mtx);
1957 
1958 	if (unsplice) {
1959 		if (sockdgram)
1960 			solock(so);
1961 		sounsplice(so, sosp, 0);
1962 		if (sockdgram)
1963 			sounlock(so);
1964 
1965 		return (0);
1966 	}
1967 	if (timerisset(&so->so_idletv))
1968 		timeout_add_tv(&so->so_idleto, &so->so_idletv);
1969 	return (1);
1970 }
1971 #endif /* SOCKET_SPLICE */
1972 
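/*
 * Wake up processes waiting on the receive buffer and run the socket's
 * upcall, if any.  On a spliced socket the splice task consumes the
 * data itself, so schedule it and suppress the userland wakeup.
 */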
1973 void
1974 sorwakeup(struct socket *so)
1975 {
1976 	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
1977 		soassertlocked_readonly(so);
1978 
1979 #ifdef SOCKET_SPLICE
1980 	if (so->so_proto->pr_flags & PR_SPLICE) {
1981 		sb_mtx_lock(&so->so_rcv);
1982 		if (so->so_rcv.sb_flags & SB_SPLICE)
1983 			task_add(sosplice_taskq, &so->so_splicetask);
1984 		if (isspliced(so)) {
1985 			sb_mtx_unlock(&so->so_rcv);
1986 			return;
1987 		}
1988 		sb_mtx_unlock(&so->so_rcv);
1989 	}
1990 #endif
1991 	sowakeup(so, &so->so_rcv);
1992 	if (so->so_upcall)
1993 		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
1994 }
1995 
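/*
 * Wake up processes waiting for space in the send buffer.  If another
 * socket is spliced into this one, schedule the source socket's splice
 * task instead of waking userland writers.
 */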
1996 void
1997 sowwakeup(struct socket *so)
1998 {
1999 	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
2000 		soassertlocked_readonly(so);
2001 
2002 #ifdef SOCKET_SPLICE
2003 	if (so->so_proto->pr_flags & PR_SPLICE) {
2004 		sb_mtx_lock(&so->so_snd);
2005 		if (so->so_snd.sb_flags & SB_SPLICE)
2006 			task_add(sosplice_taskq,
2007 			    &so->so_sp->ssp_soback->so_splicetask);
2008 		if (issplicedback(so)) {
2009 			sb_mtx_unlock(&so->so_snd);
2010 			return;
2011 		}
2012 		sb_mtx_unlock(&so->so_snd);
2013 	}
2014 #endif
2015 	sowakeup(so, &so->so_snd);
2016 }
2017 
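/*
 * Set a socket option.  Levels other than SOL_SOCKET are handed to the
 * protocol's ctloutput routine; SOL_SOCKET options are handled here,
 * with the option value supplied in the mbuf m.
 */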
2018 int
2019 sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
2020 {
2021 	int error = 0;
2022 
2023 	if (level != SOL_SOCKET) {
2024 		if (so->so_proto->pr_ctloutput) {
2025 			solock(so);
2026 			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
2027 			    level, optname, m);
2028 			sounlock(so);
2029 			return (error);
2030 		}
2031 		error = ENOPROTOOPT;
2032 	} else {
2033 		switch (optname) {
2034 
2035 		case SO_LINGER:
2036 			if (m == NULL || m->m_len != sizeof (struct linger) ||
2037 			    mtod(m, struct linger *)->l_linger < 0 ||
2038 			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
2039 				return (EINVAL);
2040 
2041 			solock(so);
2042 			so->so_linger = mtod(m, struct linger *)->l_linger;
2043 			if (*mtod(m, int *))
2044 				so->so_options |= optname;
2045 			else
2046 				so->so_options &= ~optname;
2047 			sounlock(so);
2048 
2049 			break;
2050 		case SO_BINDANY:
2051 			if ((error = suser(curproc)) != 0)	/* XXX */
2052 				return (error);
2053 			/* FALLTHROUGH */
2054 
2055 		case SO_DEBUG:
2056 		case SO_KEEPALIVE:
2057 		case SO_USELOOPBACK:
2058 		case SO_BROADCAST:
2059 		case SO_REUSEADDR:
2060 		case SO_REUSEPORT:
2061 		case SO_OOBINLINE:
2062 		case SO_TIMESTAMP:
2063 		case SO_ZEROIZE:
2064 			if (m == NULL || m->m_len < sizeof (int))
2065 				return (EINVAL);
2066 
2067 			solock(so);
2068 			if (*mtod(m, int *))
2069 				so->so_options |= optname;
2070 			else
2071 				so->so_options &= ~optname;
2072 			sounlock(so);
2073 
2074 			break;
2075 		case SO_DONTROUTE:
2076 			if (m == NULL || m->m_len < sizeof (int))
2077 				return (EINVAL);
2078 			if (*mtod(m, int *))
2079 				error = EOPNOTSUPP;
2080 			break;
2081 
2082 		case SO_SNDBUF:
2083 		case SO_RCVBUF:
2084 		case SO_SNDLOWAT:
2085 		case SO_RCVLOWAT:
2086 		    {
2087 			struct sockbuf *sb = (optname == SO_SNDBUF ||
2088 			    optname == SO_SNDLOWAT ?
2089 			    &so->so_snd : &so->so_rcv);
2090 			u_long cnt;
2091 
2092 			if (m == NULL || m->m_len < sizeof (int))
2093 				return (EINVAL);
2094 			cnt = *mtod(m, int *);
2095 			if ((long)cnt <= 0)
2096 				cnt = 1;
2097 
2098 			if ((sb->sb_flags & SB_MTXLOCK) == 0)
2099 				solock(so);
2100 			mtx_enter(&sb->sb_mtx);
2101 
2102 			switch (optname) {
2103 			case SO_SNDBUF:
2104 			case SO_RCVBUF:
2105 				if (sb->sb_state &
2106 				    (SS_CANTSENDMORE | SS_CANTRCVMORE)) {
2107 					error = EINVAL;
2108 					break;
2109 				}
2110 				if (sbcheckreserve(cnt, sb->sb_wat) ||
2111 				    sbreserve(so, sb, cnt)) {
2112 					error = ENOBUFS;
2113 					break;
2114 				}
2115 				sb->sb_wat = cnt;
2116 				break;
2117 			case SO_SNDLOWAT:
2118 			case SO_RCVLOWAT:
2119 				sb->sb_lowat = (cnt > sb->sb_hiwat) ?
2120 				    sb->sb_hiwat : cnt;
2121 				break;
2122 			}
2123 
2124 			mtx_leave(&sb->sb_mtx);
2125 			if ((sb->sb_flags & SB_MTXLOCK) == 0)
2126 				sounlock(so);
2127 
2128 			break;
2129 		    }
2130 
2131 		case SO_SNDTIMEO:
2132 		case SO_RCVTIMEO:
2133 		    {
2134 			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
2135 			    &so->so_snd : &so->so_rcv);
2136 			struct timeval tv;
2137 			uint64_t nsecs;
2138 
2139 			if (m == NULL || m->m_len < sizeof (tv))
2140 				return (EINVAL);
2141 			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
2142 			if (!timerisvalid(&tv))
2143 				return (EINVAL);
2144 			nsecs = TIMEVAL_TO_NSEC(&tv);
2145 			if (nsecs == UINT64_MAX)
2146 				return (EDOM);
2147 			if (nsecs == 0)
2148 				nsecs = INFSLP;
2149 
2150 			mtx_enter(&sb->sb_mtx);
2151 			sb->sb_timeo_nsecs = nsecs;
2152 			mtx_leave(&sb->sb_mtx);
2153 			break;
2154 		    }
2155 
2156 		case SO_RTABLE:
2157 			if (so->so_proto->pr_domain &&
2158 			    so->so_proto->pr_domain->dom_protosw &&
2159 			    so->so_proto->pr_ctloutput) {
2160 				const struct domain *dom =
2161 				    so->so_proto->pr_domain;
2162 
2163 				level = dom->dom_protosw->pr_protocol;
2164 				solock(so);
2165 				error = (*so->so_proto->pr_ctloutput)
2166 				    (PRCO_SETOPT, so, level, optname, m);
2167 				sounlock(so);
2168 			} else
2169 				error = ENOPROTOOPT;
2170 			break;
2171 #ifdef SOCKET_SPLICE
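		/*
		 * SO_SPLICE takes three argument forms: no mbuf at all
		 * dissolves an existing splice, a plain int supplies
		 * just the drain fd, and a full struct splice adds a
		 * maximum byte count and an idle timeout.  A
		 * hypothetical userland call of the long form, with
		 * "src" and "drain" standing in for real descriptors
		 * (requires <sys/types.h> and <sys/socket.h>):
		 *
		 *	struct splice sp = { .sp_fd = drain };
		 *	setsockopt(src, SOL_SOCKET, SO_SPLICE,
		 *	    &sp, sizeof(sp));
		 */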
2172 		case SO_SPLICE:
2173 			if (m == NULL) {
2174 				error = sosplice(so, -1, 0, NULL);
2175 			} else if (m->m_len < sizeof(int)) {
2176 				error = EINVAL;
2177 			} else if (m->m_len < sizeof(struct splice)) {
2178 				error = sosplice(so, *mtod(m, int *), 0, NULL);
2179 			} else {
2180 				error = sosplice(so,
2181 				    mtod(m, struct splice *)->sp_fd,
2182 				    mtod(m, struct splice *)->sp_max,
2183 				    &mtod(m, struct splice *)->sp_idle);
2184 			}
2185 			break;
2186 #endif /* SOCKET_SPLICE */
2187 
2188 		default:
2189 			error = ENOPROTOOPT;
2190 			break;
2191 		}
2192 	}
2193 
2194 	return (error);
2195 }
2196 
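/*
 * Get a socket option.  Levels other than SOL_SOCKET are handed to the
 * protocol's ctloutput routine; SOL_SOCKET options are answered here by
 * filling in the caller-provided mbuf m.
 */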
2197 int
2198 sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
2199 {
2200 	int error = 0;
2201 
2202 	if (level != SOL_SOCKET) {
2203 		if (so->so_proto->pr_ctloutput) {
2204 			m->m_len = 0;
2205 
2206 			solock(so);
2207 			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
2208 			    level, optname, m);
2209 			sounlock(so);
2210 			return (error);
2211 		} else
2212 			return (ENOPROTOOPT);
2213 	} else {
2214 		m->m_len = sizeof (int);
2215 
2216 		switch (optname) {
2217 
2218 		case SO_LINGER:
2219 			m->m_len = sizeof (struct linger);
2220 			solock_shared(so);
2221 			mtod(m, struct linger *)->l_onoff =
2222 				so->so_options & SO_LINGER;
2223 			mtod(m, struct linger *)->l_linger = so->so_linger;
2224 			sounlock_shared(so);
2225 			break;
2226 
2227 		case SO_BINDANY:
2228 		case SO_USELOOPBACK:
2229 		case SO_DEBUG:
2230 		case SO_KEEPALIVE:
2231 		case SO_REUSEADDR:
2232 		case SO_REUSEPORT:
2233 		case SO_BROADCAST:
2234 		case SO_OOBINLINE:
2235 		case SO_ACCEPTCONN:
2236 		case SO_TIMESTAMP:
2237 		case SO_ZEROIZE:
2238 			*mtod(m, int *) = so->so_options & optname;
2239 			break;
2240 
2241 		case SO_DONTROUTE:
2242 			*mtod(m, int *) = 0;
2243 			break;
2244 
2245 		case SO_TYPE:
2246 			*mtod(m, int *) = so->so_type;
2247 			break;
2248 
2249 		case SO_ERROR:
2250 			solock(so);
2251 			*mtod(m, int *) = so->so_error;
2252 			so->so_error = 0;
2253 			sounlock(so);
2254 
2255 			break;
2256 
2257 		case SO_DOMAIN:
2258 			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
2259 			break;
2260 
2261 		case SO_PROTOCOL:
2262 			*mtod(m, int *) = so->so_proto->pr_protocol;
2263 			break;
2264 
2265 		case SO_SNDBUF:
2266 			*mtod(m, int *) = so->so_snd.sb_hiwat;
2267 			break;
2268 
2269 		case SO_RCVBUF:
2270 			*mtod(m, int *) = so->so_rcv.sb_hiwat;
2271 			break;
2272 
2273 		case SO_SNDLOWAT:
2274 			*mtod(m, int *) = so->so_snd.sb_lowat;
2275 			break;
2276 
2277 		case SO_RCVLOWAT:
2278 			*mtod(m, int *) = so->so_rcv.sb_lowat;
2279 			break;
2280 
2281 		case SO_SNDTIMEO:
2282 		case SO_RCVTIMEO:
2283 		    {
2284 			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
2285 			    &so->so_snd : &so->so_rcv);
2286 			struct timeval tv;
2287 			uint64_t nsecs;
2288 
2289 			mtx_enter(&sb->sb_mtx);
2290 			nsecs = sb->sb_timeo_nsecs;
2291 			mtx_leave(&sb->sb_mtx);
2292 
2293 			m->m_len = sizeof(struct timeval);
2294 			memset(&tv, 0, sizeof(tv));
2295 			if (nsecs != INFSLP)
2296 				NSEC_TO_TIMEVAL(nsecs, &tv);
2297 			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
2298 			break;
2299 		    }
2300 
2301 		case SO_RTABLE:
2302 			if (so->so_proto->pr_domain &&
2303 			    so->so_proto->pr_domain->dom_protosw &&
2304 			    so->so_proto->pr_ctloutput) {
2305 				const struct domain *dom =
2306 				    so->so_proto->pr_domain;
2307 
2308 				level = dom->dom_protosw->pr_protocol;
2309 				solock(so);
2310 				error = (*so->so_proto->pr_ctloutput)
2311 				    (PRCO_GETOPT, so, level, optname, m);
2312 				sounlock(so);
2313 				if (error)
2314 					return (error);
2315 				break;
2316 			}
2317 			return (ENOPROTOOPT);
2318 
2319 #ifdef SOCKET_SPLICE
2320 		case SO_SPLICE:
2321 		    {
2322 			off_t len;
2323 
2324 			m->m_len = sizeof(off_t);
2325 			solock_shared(so);
2326 			len = so->so_sp ? so->so_sp->ssp_len : 0;
2327 			sounlock_shared(so);
2328 			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
2329 			break;
2330 		    }
2331 #endif /* SOCKET_SPLICE */
2332 
2333 		case SO_PEERCRED:
2334 			if (so->so_proto->pr_protocol == AF_UNIX) {
2335 				struct unpcb *unp = sotounpcb(so);
2336 
2337 				solock(so);
2338 				if (unp->unp_flags & UNP_FEIDS) {
2339 					m->m_len = sizeof(unp->unp_connid);
2340 					memcpy(mtod(m, caddr_t),
2341 					    &(unp->unp_connid), m->m_len);
2342 					sounlock(so);
2343 					break;
2344 				}
2345 				sounlock(so);
2346 
2347 				return (ENOTCONN);
2348 			}
2349 			return (EOPNOTSUPP);
2350 
2351 		default:
2352 			return (ENOPROTOOPT);
2353 		}
2354 		return (0);
2355 	}
2356 }
2357 
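/*
 * Out-of-band data has arrived: post SIGURG to the socket's process
 * group and notify the knotes attached to the receive buffer.
 */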
2358 void
2359 sohasoutofband(struct socket *so)
2360 {
2361 	pgsigio(&so->so_sigio, SIGURG, 0);
2362 	knote(&so->so_rcv.sb_klist, 0);
2363 }
2364 
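/*
 * Lock order for the socket event filters: first the socket lock (the
 * shared netlock for inet sockets, the per-socket rwlock otherwise),
 * then the sockbuf mutex.  sofilt_unlock() releases in reverse order.
 */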
2365 void
2366 sofilt_lock(struct socket *so, struct sockbuf *sb)
2367 {
2368 	switch (so->so_proto->pr_domain->dom_family) {
2369 	case PF_INET:
2370 	case PF_INET6:
2371 		NET_LOCK_SHARED();
2372 		break;
2373 	default:
2374 		rw_enter_write(&so->so_lock);
2375 		break;
2376 	}
2377 
2378 	mtx_enter(&sb->sb_mtx);
2379 }
2380 
2381 void
2382 sofilt_unlock(struct socket *so, struct sockbuf *sb)
2383 {
2384 	mtx_leave(&sb->sb_mtx);
2385 
2386 	switch (so->so_proto->pr_domain->dom_family) {
2387 	case PF_INET:
2388 	case PF_INET6:
2389 		NET_UNLOCK_SHARED();
2390 		break;
2391 	default:
2392 		rw_exit_write(&so->so_lock);
2393 		break;
2394 	}
2395 }
2396 
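/*
 * kqueue(2) attach routine for sockets: select the filter ops and the
 * sockbuf matching the requested filter and hook the knote onto that
 * buffer's klist.  Read and except filters share the receive buffer.
 */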
2397 int
2398 soo_kqfilter(struct file *fp, struct knote *kn)
2399 {
2400 	struct socket *so = kn->kn_fp->f_data;
2401 	struct sockbuf *sb;
2402 
2403 	switch (kn->kn_filter) {
2404 	case EVFILT_READ:
2405 		kn->kn_fop = &soread_filtops;
2406 		sb = &so->so_rcv;
2407 		break;
2408 	case EVFILT_WRITE:
2409 		kn->kn_fop = &sowrite_filtops;
2410 		sb = &so->so_snd;
2411 		break;
2412 	case EVFILT_EXCEPT:
2413 		kn->kn_fop = &soexcept_filtops;
2414 		sb = &so->so_rcv;
2415 		break;
2416 	default:
2417 		return (EINVAL);
2418 	}
2419 
2420 	klist_insert(&sb->sb_klist, kn);
2421 
2422 	return (0);
2423 }
2424 
2425 void
2426 filt_sordetach(struct knote *kn)
2427 {
2428 	struct socket *so = kn->kn_fp->f_data;
2429 
2430 	klist_remove(&so->so_rcv.sb_klist, kn);
2431 }
2432 
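/*
 * Readability filter.  A listening socket reports the number of
 * completed connections; any other socket reports the bytes in its
 * receive buffer and raises EV_EOF once the peer can send no more.
 * A spliced socket never reports readable, its data is consumed by
 * the splice.
 */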
2433 int
2434 filt_soread(struct knote *kn, long hint)
2435 {
2436 	struct socket *so = kn->kn_fp->f_data;
2437 	u_int state = READ_ONCE(so->so_state);
2438 	u_int error = READ_ONCE(so->so_error);
2439 	int rv = 0;
2440 
2441 	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
2442 	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
2443 		soassertlocked_readonly(so);
2444 
2445 	if (so->so_options & SO_ACCEPTCONN) {
2446 		short qlen = READ_ONCE(so->so_qlen);
2447 
2448 		if (so->so_rcv.sb_flags & SB_MTXLOCK)
2449 			soassertlocked_readonly(so);
2450 
2451 		kn->kn_data = qlen;
2452 		rv = (kn->kn_data != 0);
2453 
2454 		if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
2455 			if (state & SS_ISDISCONNECTED) {
2456 				kn->kn_flags |= __EV_HUP;
2457 				rv = 1;
2458 			} else {
2459 				rv = qlen || soreadable(so);
2460 			}
2461 		}
2462 
2463 		return rv;
2464 	}
2465 
2466 	kn->kn_data = so->so_rcv.sb_cc;
2467 #ifdef SOCKET_SPLICE
2468 	if (isspliced(so)) {
2469 		rv = 0;
2470 	} else
2471 #endif /* SOCKET_SPLICE */
2472 	if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
2473 		kn->kn_flags |= EV_EOF;
2474 		if (kn->kn_flags & __EV_POLL) {
2475 			if (state & SS_ISDISCONNECTED)
2476 				kn->kn_flags |= __EV_HUP;
2477 		}
2478 		kn->kn_fflags = error;
2479 		rv = 1;
2480 	} else if (error) {
2481 		rv = 1;
2482 	} else if (kn->kn_sfflags & NOTE_LOWAT) {
2483 		rv = (kn->kn_data >= kn->kn_sdata);
2484 	} else {
2485 		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
2486 	}
2487 
2488 	return rv;
2489 }
2490 
2491 void
2492 filt_sowdetach(struct knote *kn)
2493 {
2494 	struct socket *so = kn->kn_fp->f_data;
2495 
2496 	klist_remove(&so->so_snd.sb_klist, kn);
2497 }
2498 
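/*
 * Writability filter: report the space left in the send buffer and
 * raise EV_EOF when no more data can be sent.  A connection-oriented
 * socket is not writable until it is connected.
 */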
2499 int
2500 filt_sowrite(struct knote *kn, long hint)
2501 {
2502 	struct socket *so = kn->kn_fp->f_data;
2503 	u_int state = READ_ONCE(so->so_state);
2504 	u_int error = READ_ONCE(so->so_error);
2505 	int rv;
2506 
2507 	MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
2508 	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
2509 		soassertlocked_readonly(so);
2510 
2511 	kn->kn_data = sbspace_locked(so, &so->so_snd);
2512 	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
2513 		kn->kn_flags |= EV_EOF;
2514 		if (kn->kn_flags & __EV_POLL) {
2515 			if (state & SS_ISDISCONNECTED)
2516 				kn->kn_flags |= __EV_HUP;
2517 		}
2518 		kn->kn_fflags = error;
2519 		rv = 1;
2520 	} else if (error) {
2521 		rv = 1;
2522 	} else if (((state & SS_ISCONNECTED) == 0) &&
2523 	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2524 		rv = 0;
2525 	} else if (kn->kn_sfflags & NOTE_LOWAT) {
2526 		rv = (kn->kn_data >= kn->kn_sdata);
2527 	} else {
2528 		rv = (kn->kn_data >= so->so_snd.sb_lowat);
2529 	}
2530 
2531 	return (rv);
2532 }
2533 
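/*
 * Exceptional-condition filter: report pending out-of-band data
 * (NOTE_OOB) and, for poll(2) and select(2), disconnection.  OOB is
 * not reported on a spliced socket.
 */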
2534 int
2535 filt_soexcept(struct knote *kn, long hint)
2536 {
2537 	struct socket *so = kn->kn_fp->f_data;
2538 	int rv = 0;
2539 
2540 	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
2541 	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
2542 		soassertlocked_readonly(so);
2543 
2544 #ifdef SOCKET_SPLICE
2545 	if (isspliced(so)) {
2546 		rv = 0;
2547 	} else
2548 #endif /* SOCKET_SPLICE */
2549 	if (kn->kn_sfflags & NOTE_OOB) {
2550 		if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) {
2551 			kn->kn_fflags |= NOTE_OOB;
2552 			kn->kn_data -= so->so_oobmark;
2553 			rv = 1;
2554 		}
2555 	}
2556 
2557 	if (kn->kn_flags & __EV_POLL) {
2558 		u_int state = READ_ONCE(so->so_state);
2559 
2560 		if (state & SS_ISDISCONNECTED) {
2561 			kn->kn_flags |= __EV_HUP;
2562 			rv = 1;
2563 		}
2564 	}
2565 
2566 	return rv;
2567 }
2568 
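/*
 * The following modify/process wrappers run the generic knote_modify()
 * and knote_process() with the locks that the f_event routines above
 * assert, taken via sofilt_lock().
 */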
2569 int
2570 filt_sowmodify(struct kevent *kev, struct knote *kn)
2571 {
2572 	struct socket *so = kn->kn_fp->f_data;
2573 	int rv;
2574 
2575 	sofilt_lock(so, &so->so_snd);
2576 	rv = knote_modify(kev, kn);
2577 	sofilt_unlock(so, &so->so_snd);
2578 
2579 	return (rv);
2580 }
2581 
2582 int
2583 filt_sowprocess(struct knote *kn, struct kevent *kev)
2584 {
2585 	struct socket *so = kn->kn_fp->f_data;
2586 	int rv;
2587 
2588 	sofilt_lock(so, &so->so_snd);
2589 	rv = knote_process(kn, kev);
2590 	sofilt_unlock(so, &so->so_snd);
2591 
2592 	return (rv);
2593 }
2594 
2595 int
2596 filt_sormodify(struct kevent *kev, struct knote *kn)
2597 {
2598 	struct socket *so = kn->kn_fp->f_data;
2599 	int rv;
2600 
2601 	sofilt_lock(so, &so->so_rcv);
2602 	rv = knote_modify(kev, kn);
2603 	sofilt_unlock(so, &so->so_rcv);
2604 
2605 	return (rv);
2606 }
2607 
2608 int
2609 filt_sorprocess(struct knote *kn, struct kevent *kev)
2610 {
2611 	struct socket *so = kn->kn_fp->f_data;
2612 	int rv;
2613 
2614 	sofilt_lock(so, &so->so_rcv);
2615 	rv = knote_process(kn, kev);
2616 	sofilt_unlock(so, &so->so_rcv);
2617 
2618 	return (rv);
2619 }
2620 
2621 #ifdef DDB
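/* Pretty-printers for socket state, usable from ddb(4). */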
2622 void
2623 sobuf_print(struct sockbuf *,
2624     int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));
2625 
2626 void
2627 sobuf_print(struct sockbuf *sb,
2628     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2629 {
2630 	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
2631 	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
2632 	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
2633 	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
2634 	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
2635 	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
2636 	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
2637 	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
2638 	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
2639 	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
2640 	(*pr)("\tsb_flags: %04x\n", sb->sb_flags);
2641 	(*pr)("\tsb_state: %04x\n", sb->sb_state);
2642 	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
2643 }
2644 
2645 void
2646 so_print(void *v,
2647     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2648 {
2649 	struct socket *so = v;
2650 
2651 	(*pr)("socket %p\n", so);
2652 	(*pr)("so_type: %i\n", so->so_type);
2653 	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
2654 	(*pr)("so_linger: %i\n", so->so_linger);
2655 	(*pr)("so_state: 0x%04x\n", so->so_state);
2656 	(*pr)("so_pcb: %p\n", so->so_pcb);
2657 	(*pr)("so_proto: %p\n", so->so_proto);
2658 	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);
2659 
2660 	(*pr)("so_head: %p\n", so->so_head);
2661 	(*pr)("so_onq: %p\n", so->so_onq);
2662 	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
2663 	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
2664 	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
2665 	(*pr)("so_q0len: %i\n", so->so_q0len);
2666 	(*pr)("so_qlen: %i\n", so->so_qlen);
2667 	(*pr)("so_qlimit: %i\n", so->so_qlimit);
2668 	(*pr)("so_timeo: %i\n", so->so_timeo);
2669 	(*pr)("so_obmark: %lu\n", so->so_oobmark);
2670 
2671 	(*pr)("so_sp: %p\n", so->so_sp);
2672 	if (so->so_sp != NULL) {
2673 		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
2674 		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
2675 		(*pr)("\tssp_len: %lld\n",
2676 		    (unsigned long long)so->so_sp->ssp_len);
2677 		(*pr)("\tssp_max: %lld\n",
2678 		    (unsigned long long)so->so_sp->ssp_max);
2679 		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
2680 		    so->so_sp->ssp_idletv.tv_usec);
2681 		(*pr)("\tssp_idleto: %spending (@%i)\n",
2682 		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
2683 		    so->so_sp->ssp_idleto.to_time);
2684 	}
2685 
2686 	(*pr)("so_rcv:\n");
2687 	sobuf_print(&so->so_rcv, pr);
2688 	(*pr)("so_snd:\n");
2689 	sobuf_print(&so->so_snd, pr);
2690 
2691 	(*pr)("so_upcall: %p so_upcallarg: %p\n",
2692 	    so->so_upcall, so->so_upcallarg);
2693 
2694 	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
2695 	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
2696 	(*pr)("so_cpid: %d\n", so->so_cpid);
2697 }
2698 #endif
2699