/*	$OpenBSD: uipc_socket.c,v 1.249 2020/09/29 11:48:54 claudio Exp $	*/
/*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <net/if.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
void	soreaper(void *);
void	soput(void *);
int	somove(struct socket *, int);

void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_solisten(struct knote *kn, long hint);

const struct filterops solisten_filtops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_solisten,
};

const struct filterops soread_filtops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soread,
};

const struct filterops sowrite_filtops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_sowdetach,
	.f_event	= filt_sowrite,
};

const struct filterops soexcept_filtops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soread,
};

#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
struct taskq *sosplice_taskq;
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif

void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol-specific routines.
 */
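
/*
 * Illustrative lifecycle sketch (not part of the original file): how
 * the syscall glue in uipc_syscalls.c typically drives these routines
 * for a listening socket, assuming "nam" holds the local address:
 *
 *	struct socket *so;
 *	int error, s;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, 0);
 *	s = solock(so);			(sobind()/solisten() assert it)
 *	error = sobind(so, nam, curproc);
 *	error = solisten(so, backlog);
 *	sounlock(so, s);
 *	...
 *	error = soclose(so, 0);		(locks and frees internally)
 */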
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error, s;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_attach == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = pool_get(&socket_pool, PR_WAITOK | PR_ZERO);
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	s = solock(so);
	error = (*prp->pr_attach)(so, proto);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, s);
		return (error);
	}
	sounlock(so, s);
	*aso = so;
	return (0);
}

int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	int error;

	soassertlocked(so);

	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, p);
	return (error);
}

int
solisten(struct socket *so, int backlog)
{
	int error;

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL,
	    curproc);
	if (error)
		return (error);
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	if (backlog < sominconn)
		backlog = sominconn;
	so->so_qlimit = backlog;
	return (0);
}
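
/*
 * Illustrative backlog clamping (not part of the original file),
 * assuming the defaults somaxconn == SOMAXCONN (typically 128) and
 * sominconn == SOMINCONN (80):
 *
 *	listen(fd, -1)      -> so_qlimit = somaxconn
 *	listen(fd, 5)       -> so_qlimit = 80, raised to sominconn
 *	listen(fd, 1000000) -> so_qlimit = somaxconn, capped
 */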

#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2
void
sofree(struct socket *so, int s)
{
	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		sounlock(so, s);
		return;
	}
	if (so->so_head) {
		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (!soqremque(so, 0)) {
			sounlock(so, s);
			return;
		}
	}
	sigio_free(&so->so_sigio);
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		if (issplicedback(so)) {
			int freeing = SOSP_FREEING_WRITE;

			if (so->so_sp->ssp_soback == so)
				freeing |= SOSP_FREEING_READ;
			sounsplice(so->so_sp->ssp_soback, so, freeing);
		}
		if (isspliced(so)) {
			int freeing = SOSP_FREEING_READ;

			if (so == so->so_sp->ssp_socket)
				freeing |= SOSP_FREEING_WRITE;
			sounsplice(so, so->so_sp->ssp_socket, freeing);
		}
	}
#endif /* SOCKET_SPLICE */
	sbrelease(so, &so->so_snd);
	sorflush(so);
	sounlock(so, s);
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		/* Reuse splice idle, sounsplice() has been called before. */
		timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so);
		timeout_add(&so->so_sp->ssp_idleto, 0);
	} else
#endif /* SOCKET_SPLICE */
	{
		pool_put(&socket_pool, so);
	}
}

static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int s, error = 0;

	s = solock(so);
	/* Revoke async IO early. There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			(void) soqremque(so2, 0);
			(void) soabort(so2);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			(void) soqremque(so2, 1);
			(void) soabort(so2);
		}
	}
	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		KASSERT(so->so_proto->pr_detach);
		error2 = (*so->so_proto->pr_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;
	/* sofree() calls sounlock(). */
	sofree(so, s);
	return (error);
}

int
soabort(struct socket *so)
{
	soassertlocked(so);

	return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL, NULL, NULL,
	   curproc);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, NULL,
		    nam, NULL, curproc);
	else
		error = ECONNABORTED;
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    NULL, nam, NULL, curproc);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int s, error;

	s = solock(so1);
	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL,
	    (struct mbuf *)so2, NULL, curproc);
	sounlock(so1, s);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, NULL, NULL,
	    NULL, curproc);
	return (error);
}

int m_getuio(struct mbuf **, int, long, struct uio *);

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
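
/*
 * Illustrative walk-through of the flow-control test in sosend()
 * (not part of the original file): for an atomic protocol with no
 * control data (clen == 0), resid == 3000 and sbspace() == 2048,
 *
 *	space < clen || (space - clen < resid &&
 *	    (atomic || space < so->so_snd.sb_lowat))
 *
 * evaluates to true, so the whole message waits in sbwait() or, with
 * MSG_DONTWAIT, fails with EWOULDBLOCK; atomic sends are never split.
 */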
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error, s;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; goto release; }

	s = solock(so);
restart:
	if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	so->so_state |= SS_ISSENDING;
	do {
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(so, &so->so_snd);
			error = sbwait(so, &so->so_snd);
			so->so_state &= ~SS_ISSENDING;
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				sounlock(so, s);
				error = m_getuio(&top, atomic, space, uio);
				s = solock(so);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, addr, control, curproc);
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_state &= ~SS_ISSENDING;
	sbunlock(so, &so->so_snd);
out:
	sounlock(so, s);
	m_freem(top);
	m_freem(control);
	return (error);
}

int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
			m->m_pkthdr.len = 0;
			m->m_pkthdr.ph_ifidx = 0;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbufs together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			MCLGETI(m, M_NOWAIT, NULL, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETI(m, M_NOWAIT, NULL, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}
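
/*
 * Illustrative cluster sizing in m_getuio() (not part of the original
 * file): copying 16000 bytes when space allows first asks MCLGETI()
 * for a cluster large enough for the whole remainder, ulmin(16000,
 * MAXMCLBYTES); if that fails it falls back to a plain MCLBYTES
 * cluster, and if even that fails the copy proceeds through the small
 * internal buffer (MHLEN or MLEN bytes) at the "nopages" label.
 */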

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the caller's locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}
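
/*
 * Illustrative picture of what sbsync() repairs (not part of the
 * original file).  Records hang off sb_mb via m_nextpkt, the mbufs of
 * one record via m_next:
 *
 *	sb_mb -> m0 -> m1 -> m2		(first record)
 *	  |
 *	  | m_nextpkt
 *	  v
 *	  r1 -> r2			(nextrecord, later records)
 *
 * After the caller frees or detaches m0, the new head must have its
 * m_nextpkt restored to nextrecord, and sb_lastrecord/sb_mbtail must
 * be cleared or updated if the buffer became empty or shorter.
 */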

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking the network for the entire time here, we
 * release the solock() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
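
/*
 * Illustrative record layout consumed below (not part of the original
 * file), e.g. a UDP datagram with ancillary data:
 *
 *	[MT_SONAME] -> [MT_CONTROL] -> [MT_DATA] -> [MT_DATA]
 *
 * The MT_SONAME mbuf is returned via *paddr, MT_CONTROL mbufs via
 * *controlp, and the MT_DATA mbufs are copied out through the uio or
 * passed back whole via *mp0.
 */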
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	u_long len, offset, moff;
	int flags, error, s, type, uio_error = 0;
	const struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	size_t resid, orig_resid = uio->uio_resid;

	mp = mp0;
	if (paddr)
		*paddr = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		s = solock(so);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK), NULL, curproc);
		sounlock(so, s);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    ulmin(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

	s = solock(so);
restart:
	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
		sounlock(so, s);
		return (error);
	}

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
		    if (!isspliced(so))
#endif /* SOCKET_SPLICE */
			panic("receive 1: so %p, so_type %d, sb_cc %lu",
			    so, so->so_type, so->so_rcv.sb_cc);
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if (flags & MSG_DONTWAIT) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(so, &so->so_rcv);
		error = sbwait(so, &so->so_rcv);
		if (error) {
			sounlock(so, s);
			return (error);
		}
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				so->so_rcv.sb_mb = m_free(m);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		int skip = 0;
		if (flags & MSG_PEEK) {
			if (mtod(m, struct cmsghdr *)->cmsg_type ==
			    SCM_RIGHTS) {
				/* don't leak internalized SCM_RIGHTS msgs */
				skip = 1;
			} else if (controlp)
				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize) {
					error =
					    (*pr->pr_domain->dom_externalize)
					    (cm, controllen, flags);
				}
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose)
					pr->pr_domain->dom_dispose(cm);
				m_free(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp && !skip) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else if (m->m_type == MT_CONTROL) {
			/*
			 * If there is more than one control message in the
			 * stream, we do a short read.  The next one can be
			 * received or disposed of by another system call.
			 */
			break;
#ifdef DIAGNOSTIC
		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * The sockbuf must be consistent here (sb_mb points to
		 * the current mbuf, which points to the next record)
		 * when we release the socket lock; we must note any
		 * additions to the sockbuf when we reacquire it.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			sounlock(so, s);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			s = solock(so);
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			error = sbwait(so, &so->so_rcv);
			if (error) {
				sbunlock(so, &so->so_rcv);
				sounlock(so, s);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, NULL,
			    (struct mbuf *)(long)flags, NULL, curproc);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(so, &so->so_rcv);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(so, &so->so_rcv);
	sounlock(so, s);
	return (error);
}

int
soshutdown(struct socket *so, int how)
{
	const struct protosw *pr = so->so_proto;
	int s, error = 0;

	s = solock(so);
	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		error = (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, NULL, NULL,
		    curproc);
		break;
	default:
		error = EINVAL;
		break;
	}
	sounlock(so, s);

	return (error);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	const struct protosw *pr = so->so_proto;
	struct socket aso;
	int error;

	sb->sb_flags |= SB_NOINTR;
	error = sblock(so, sb, M_WAITOK);
	/* with SB_NOINTR and M_WAITOK sblock() must not fail */
	KASSERT(error == 0);
	socantrcvmore(so);
	sbunlock(so, sb);
	aso.so_proto = pr;
	aso.so_rcv = *sb;
	memset(&sb->sb_startzero, 0,
	     (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(aso.so_rcv.sb_mb);
	sbrelease(&aso, &aso.so_rcv);
}

#ifdef SOCKET_SPLICE

#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task

int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file	*fp;
	struct socket	*sosp;
	struct sosplice	*sp;
	struct taskq	*tq;
	int		 error = 0;

	soassertlocked(so);

	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	}
	if (sosplice_taskq == NULL)
		return (ENOMEM);

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (ENOTCONN);
	if (so->so_sp == NULL) {
		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		if (so->so_sp == NULL)
			so->so_sp = sp;
		else
			pool_put(&sosplice_pool, sp);
	}

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		/* Lock receive buffer. */
		if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) {
			return (error);
		}
		if (so->so_sp->ssp_socket)
			sounsplice(so, so->so_sp->ssp_socket, 0);
		sbunlock(so, &so->so_rcv);
		return (0);
	}

	if (max && max < 0)
		return (EINVAL);

	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* Find sosp, the drain socket into which data will be spliced. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;
	if (sosp->so_proto->pr_usrreq != so->so_proto->pr_usrreq) {
		error = EPROTONOSUPPORT;
		goto frele;
	}
	if (sosp->so_sp == NULL) {
		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		if (sosp->so_sp == NULL)
			sosp->so_sp = sp;
		else
			pool_put(&sosplice_pool, sp);
	}

	/* Lock both receive and send buffer. */
	if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) {
		goto frele;
	}
	if ((error = sblock(so, &sosp->so_snd, M_WAITOK)) != 0) {
		sbunlock(so, &so->so_rcv);
		goto frele;
	}

	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}
	if (sosp->so_options & SO_ACCEPTCONN) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}

	/* Splice so and sosp together. */
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set_proc(&so->so_idleto, soidle, so);
	task_set(&so->so_splicetask, sotask, so);

	/*
	 * To prevent softnet interrupt from calling somove() while
	 * we sleep, the socket buffers are not marked as spliced yet.
	 */
	if (somove(so, M_WAIT)) {
		so->so_rcv.sb_flags |= SB_SPLICE;
		sosp->so_snd.sb_flags |= SB_SPLICE;
	}

 release:
	sbunlock(sosp, &sosp->so_snd);
	sbunlock(so, &so->so_rcv);
 frele:
	/*
	 * FRELE() must not be called with the socket lock held. It is safe to
	 * release the lock here as long as no other operation happens on the
	 * socket when sosplice() returns. The dance could be avoided by
	 * grabbing the socket lock inside this function.
	 */
	sounlock(so, SL_LOCKED);
	FRELE(fp, curproc);
	solock(so);
	return (error);
}
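
/*
 * Userland-side sketch of establishing a splice (not part of the
 * original file; the SO_SPLICE case in sosetopt() below unpacks the
 * struct splice and calls sosplice()):
 *
 *	struct splice sp;
 *
 *	sp.sp_fd = drain_fd;		destination socket
 *	sp.sp_max = 0;			0 means no byte limit
 *	timerclear(&sp.sp_idle);	0 means no idle timeout
 *	setsockopt(source_fd, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp));
 *
 * Passing a plain int of -1 instead dissolves an existing splice,
 * taking the fd < 0 path above.
 */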

void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	soassertlocked(so);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);
	sosp->so_snd.sb_flags &= ~SB_SPLICE;
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	/* Do not wake up a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
		sorwakeup(so);
	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
		sowwakeup(sosp);
}

void
soidle(void *arg)
{
	struct socket *so = arg;
	int s;

	s = solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 0);
	}
	sounlock(so, s);
}

void
sotask(void *arg)
{
	struct socket *so = arg;
	int s;

	s = solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * We may not sleep here as sofree() and unsplice() may be
		 * called from softnet interrupt context.  This would remove
		 * the socket during somove().
		 */
		somove(so, M_DONTWAIT);
	}
	sounlock(so, s);

	/* Avoid user land starvation. */
	yield();
}

/*
 * The socket splicing task or idle timeout may sleep while grabbing the net
 * lock.  As sofree() can be called anytime, sotask() or soidle() could access
 * the socket memory of a freed socket after wakeup.  So delay the pool_put()
 * until all pending socket splicing tasks or timeouts have finished.  Do this
 * by scheduling it on the same threads.
 */
void
soreaper(void *arg)
{
	struct socket *so = arg;

	/* Reuse splice task, sounsplice() has been called before. */
	task_set(&so->so_sp->ssp_task, soput, so);
	task_add(sosplice_taskq, &so->so_sp->ssp_task);
}

void
soput(void *arg)
{
	struct socket *so = arg;

	pool_put(&sosplice_pool, so->so_sp);
	pool_put(&socket_pool, so);
}

/*
 * Move data from the receive buffer of the spliced source socket to
 * the send buffer of the drain socket.  Try to move as much as
 * possible in one big chunk.  It is a TCP-only implementation.
 * A return value of 0 means splicing has finished, 1 means it should
 * continue.
 */
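
/*
 * Illustrative data flow of somove() (not part of the original file):
 *
 *	source socket so		drain socket sosp
 *	so->so_rcv  ==============>	sosp->so_snd
 *	(SB_SPLICE: mbufs detached	(spliced data injected with
 *	 here, window updated with	 PRU_SEND/PRU_SENDOOB)
 *	 PRU_RCVD)
 *
 * so_splicelen counts the bytes moved so far; a nonzero so_splicemax
 * caps the total, ending the splice with EFBIG once it is reached.
 */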
int
somove(struct socket *so, int wait)
{
	struct socket	*sosp = so->so_sp->ssp_socket;
	struct mbuf	*m, **mp, *nextrecord;
	u_long		 len, off, oobmark;
	long		 space;
	int		 error = 0, maxreached = 0;
	unsigned int	 state;

	soassertlocked(so);

 nextpkt:
	if (so->so_error) {
		error = so->so_error;
		goto release;
	}
	if (sosp->so_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}
	if (sosp->so_error && sosp->so_error != ETIMEDOUT &&
	    sosp->so_error != EFBIG && sosp->so_error != ELOOP) {
		error = sosp->so_error;
		goto release;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace(sosp, &sosp->so_snd);
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		sbdroprecord(&so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb)
			(so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
			    NULL, NULL, NULL);
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve the splice with an error if a loop is detected by
	 * the counter.
	 *
	 * If we deal with a looped broadcast/multicast packet, we bail out
	 * with no error to suppress splice termination.
	 */
	if ((m->m_flags & M_PKTHDR) &&
	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
		if (m->m_pkthdr.ph_loopcnt >= M_MAXLOOP)
			error = ELOOP;
		goto release;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(&so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(&so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 */
			if (!maxreached && so->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			*mp = so->so_rcv.sb_mb;
			sbfree(&so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(&so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb)
		(so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
		    NULL, NULL, NULL);

	/* Receive buffer has shrunk by len bytes; adjust the oob mark. */
	state = so->so_state;
	so->so_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}
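
	/*
	 * Illustrative oob arithmetic for the block above (not part of
	 * the original file): with so_oobmark == 40 and len == 100 the
	 * stored mark becomes 0 while the local oobmark stays 40, so
	 * the loop below splits the chunk at byte 40 and sends the
	 * marked byte with PRU_SENDOOB; with so_oobmark == 150 the
	 * stored mark becomes 50 and the local oobmark is cleared, as
	 * the mark has not been reached within this chunk.
	 */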

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((state & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (state & SS_RCVATMARK) {
			o = m_get(wait, MT_DATA);
			state &= ~SS_RCVATMARK;
		} else if (oobmark) {
			o = m_split(m, oobmark, wait);
			if (o) {
				error = (*sosp->so_proto->pr_usrreq)(sosp,
				    PRU_SEND, m, NULL, NULL, NULL);
				if (error) {
					if (sosp->so_state & SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);
			error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SENDOOB,
			    o, NULL, NULL, NULL);
			if (error) {
				if (sosp->so_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					state |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_state &= ~SS_ISSENDING;
	error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SEND, m, NULL, NULL,
	    NULL);
	if (error) {
		if (sosp->so_state & SS_CANTSENDMORE)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

 release:
	sosp->so_state &= ~SS_ISSENDING;
	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		so->so_error = error;
	if (((so->so_state & SS_CANTRCVMORE) && so->so_rcv.sb_cc == 0) ||
	    (sosp->so_state & SS_CANTSENDMORE) || maxreached || error) {
		sounsplice(so, sosp, 0);
		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}

#endif /* SOCKET_SPLICE */

void
sorwakeup(struct socket *so)
{
	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * TCP has a send buffer that can handle multiple packets
		 * at once.  So queue the stream a bit to accumulate data.
		 * The sosplice thread will call somove() later and send
		 * the packets calling tcp_output() only once.
		 * In the UDP case, send out the packets immediately.
		 * Using a thread would make things slower.
		 */
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			task_add(sosplice_taskq, &so->so_splicetask);
		else
			somove(so, M_DONTWAIT);
	}
	if (isspliced(so))
		return;
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}

void
sowwakeup(struct socket *so)
{
	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (so->so_snd.sb_flags & SB_SPLICE)
		task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask);
	if (issplicedback(so))
		return;
#endif
	sowakeup(so, &so->so_snd);
}

int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	soassertlocked(so);

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {
		case SO_BINDANY:
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			break;
		}

		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* FALLTHROUGH */

		case SO_BINDANY:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_DONTROUTE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;
			switch (optname) {

			case SO_SNDBUF:
				if (so->so_state & SS_CANTSENDMORE)
					return (EINVAL);
				if (sbcheckreserve(cnt, so->so_snd.sb_wat) ||
				    sbreserve(so, &so->so_snd, cnt))
					return (ENOBUFS);
				so->so_snd.sb_wat = cnt;
				break;

			case SO_RCVBUF:
				if (so->so_state & SS_CANTRCVMORE)
					return (EINVAL);
				if (sbcheckreserve(cnt, so->so_rcv.sb_wat) ||
				    sbreserve(so, &so->so_rcv, cnt))
					return (ENOBUFS);
				so->so_rcv.sb_wat = cnt;
				break;

			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (cnt > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : cnt;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (cnt > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : cnt;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			if (nsecs == 0)
				nsecs = INFSLP;
			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo_nsecs = nsecs;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo_nsecs = nsecs;
				break;
			}
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				struct domain *dom = so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				return (error);
			}
			error = ENOPROTOOPT;
			break;

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				return (EINVAL);
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				   &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto->pr_ctloutput) {
			(*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
		}
	}

	return (error);
}
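
/*
 * Userland-side sketch of the SO_SNDTIMEO/SO_RCVTIMEO handling above
 * (not part of the original file): sosetopt() converts the timeval
 * with TIMEVAL_TO_NSEC() and stores zero as INFSLP, i.e. no timeout:
 *
 *	struct timeval tv = { 5, 0 };	five second receive timeout
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */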
1862 
1863 int
1864 sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
1865 {
1866 	int error = 0;
1867 
1868 	soassertlocked(so);
1869 
1870 	if (level != SOL_SOCKET) {
1871 		if (so->so_proto->pr_ctloutput) {
1872 			m->m_len = 0;
1873 
1874 			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
1875 			    level, optname, m);
1876 			if (error)
1877 				return (error);
1878 			return (0);
1879 		} else
1880 			return (ENOPROTOOPT);
1881 	} else {
1882 		m->m_len = sizeof (int);
1883 
1884 		switch (optname) {
1885 
1886 		case SO_LINGER:
1887 			m->m_len = sizeof (struct linger);
1888 			mtod(m, struct linger *)->l_onoff =
1889 				so->so_options & SO_LINGER;
1890 			mtod(m, struct linger *)->l_linger = so->so_linger;
1891 			break;
1892 
1893 		case SO_BINDANY:
1894 		case SO_USELOOPBACK:
1895 		case SO_DEBUG:
1896 		case SO_KEEPALIVE:
1897 		case SO_REUSEADDR:
1898 		case SO_REUSEPORT:
1899 		case SO_BROADCAST:
1900 		case SO_OOBINLINE:
1901 		case SO_TIMESTAMP:
1902 		case SO_ZEROIZE:
1903 			*mtod(m, int *) = so->so_options & optname;
1904 			break;
1905 
1906 		case SO_DONTROUTE:
1907 			*mtod(m, int *) = 0;
1908 			break;
1909 
1910 		case SO_TYPE:
1911 			*mtod(m, int *) = so->so_type;
1912 			break;
1913 
1914 		case SO_ERROR:
1915 			*mtod(m, int *) = so->so_error;
1916 			so->so_error = 0;
1917 			break;
1918 
1919 		case SO_DOMAIN:
1920 			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
1921 			break;
1922 
1923 		case SO_PROTOCOL:
1924 			*mtod(m, int *) = so->so_proto->pr_protocol;
1925 			break;
1926 
1927 		case SO_SNDBUF:
1928 			*mtod(m, int *) = so->so_snd.sb_hiwat;
1929 			break;
1930 
1931 		case SO_RCVBUF:
1932 			*mtod(m, int *) = so->so_rcv.sb_hiwat;
1933 			break;
1934 
1935 		case SO_SNDLOWAT:
1936 			*mtod(m, int *) = so->so_snd.sb_lowat;
1937 			break;
1938 
1939 		case SO_RCVLOWAT:
1940 			*mtod(m, int *) = so->so_rcv.sb_lowat;
1941 			break;
1942 
1943 		case SO_SNDTIMEO:
1944 		case SO_RCVTIMEO:
1945 		    {
1946 			struct timeval tv;
1947 			uint64_t nsecs = (optname == SO_SNDTIMEO ?
1948 			    so->so_snd.sb_timeo_nsecs :
1949 			    so->so_rcv.sb_timeo_nsecs);
1950 
1951 			m->m_len = sizeof(struct timeval);
1952 			memset(&tv, 0, sizeof(tv));
1953 			if (nsecs != INFSLP)
1954 				NSEC_TO_TIMEVAL(nsecs, &tv);
1955 			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
1956 			break;
1957 		    }
1958 
1959 		case SO_RTABLE:
1960 			if (so->so_proto->pr_domain &&
1961 			    so->so_proto->pr_domain->dom_protosw &&
1962 			    so->so_proto->pr_ctloutput) {
1963 				struct domain *dom = so->so_proto->pr_domain;
1964 
1965 				level = dom->dom_protosw->pr_protocol;
1966 				error = (*so->so_proto->pr_ctloutput)
1967 				    (PRCO_GETOPT, so, level, optname, m);
1968 				if (error)
1969 					return (error);
1970 				break;
1971 			}
1972 			return (ENOPROTOOPT);
1973 
1974 #ifdef SOCKET_SPLICE
1975 		case SO_SPLICE:
1976 		    {
1977 			off_t len;
1978 
1979 			m->m_len = sizeof(off_t);
1980 			len = so->so_sp ? so->so_sp->ssp_len : 0;
1981 			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
1982 			break;
1983 		    }
1984 #endif /* SOCKET_SPLICE */
1985 
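		/*
		 * Peer credentials exist only for unix(4) sockets, and
		 * only once the remote ids have been recorded
		 * (UNP_FEIDS).  Illustrative userland counterpart (a
		 * sketch, not kernel code; "s" is a placeholder
		 * descriptor):
		 *
		 *	struct sockpeercred cred;
		 *	socklen_t len = sizeof(cred);
		 *
		 *	if (getsockopt(s, SOL_SOCKET, SO_PEERCRED,
		 *	    &cred, &len) == -1)
		 *		err(1, "getsockopt");
		 */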
1986 		case SO_PEERCRED:
1987 			if (so->so_proto->pr_protocol == AF_UNIX) {
1988 				struct unpcb *unp = sotounpcb(so);
1989 
1990 				if (unp->unp_flags & UNP_FEIDS) {
1991 					m->m_len = sizeof(unp->unp_connid);
1992 					memcpy(mtod(m, caddr_t),
1993 					    &(unp->unp_connid), m->m_len);
1994 					break;
1995 				}
1996 				return (ENOTCONN);
1997 			}
1998 			return (EOPNOTSUPP);
1999 
2000 		default:
2001 			return (ENOPROTOOPT);
2002 		}
2003 		return (0);
2004 	}
2005 }
2006 
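/*
 * Urgent data has arrived: post SIGURG to the registered process or
 * process group and wake anyone selecting or polling on the receive
 * side.
 */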
2007 void
2008 sohasoutofband(struct socket *so)
2009 {
2010 	pgsigio(&so->so_sigio, SIGURG, 0);
2011 	selwakeup(&so->so_rcv.sb_sel);
2012 }
2013 
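/*
 * Attach a kevent filter to a socket.  EVFILT_READ uses the listen
 * filter on listening sockets and the read filter otherwise;
 * EVFILT_WRITE watches the send buffer and EVFILT_EXCEPT the receive
 * buffer (out-of-band data).  The knote is hung off the chosen
 * buffer's klist.
 */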
2014 int
2015 soo_kqfilter(struct file *fp, struct knote *kn)
2016 {
2017 	struct socket *so = kn->kn_fp->f_data;
2018 	struct sockbuf *sb;
2019 
2020 	KERNEL_ASSERT_LOCKED();
2021 
2022 	switch (kn->kn_filter) {
2023 	case EVFILT_READ:
2024 		if (so->so_options & SO_ACCEPTCONN)
2025 			kn->kn_fop = &solisten_filtops;
2026 		else
2027 			kn->kn_fop = &soread_filtops;
2028 		sb = &so->so_rcv;
2029 		break;
2030 	case EVFILT_WRITE:
2031 		kn->kn_fop = &sowrite_filtops;
2032 		sb = &so->so_snd;
2033 		break;
2034 	case EVFILT_EXCEPT:
2035 		kn->kn_fop = &soexcept_filtops;
2036 		sb = &so->so_rcv;
2037 		break;
2038 	default:
2039 		return (EINVAL);
2040 	}
2041 
2042 	klist_insert(&sb->sb_sel.si_note, kn);
2043 	sb->sb_flagsintr |= SB_KNOTE;
2044 
2045 	return (0);
2046 }
2047 
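/*
 * Detach a receive side knote; the listen filter shares this
 * routine.  SB_KNOTE is cleared with the last knote.
 */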
2048 void
2049 filt_sordetach(struct knote *kn)
2050 {
2051 	struct socket *so = kn->kn_fp->f_data;
2052 
2053 	KERNEL_ASSERT_LOCKED();
2054 
2055 	klist_remove(&so->so_rcv.sb_sel.si_note, kn);
2056 	if (klist_empty(&so->so_rcv.sb_sel.si_note))
2057 		so->so_rcv.sb_flagsintr &= ~SB_KNOTE;
2058 }
2059 
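/*
 * Read filter: kn_data is the amount of buffered data; the event
 * fires on EOF, on a pending error, or once the low water mark
 * (overridable with NOTE_LOWAT) is reached.  NOTE_SUBMIT in the hint
 * means the caller already holds the socket lock.
 */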
2060 int
2061 filt_soread(struct knote *kn, long hint)
2062 {
2063 	struct socket *so = kn->kn_fp->f_data;
2064 	int s, rv = 0;
2065 
2066 	if ((hint & NOTE_SUBMIT) == 0)
2067 		s = solock(so);
2068 	kn->kn_data = so->so_rcv.sb_cc;
2069 #ifdef SOCKET_SPLICE
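	/* A spliced socket drains in the kernel; never report it readable. */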
2070 	if (isspliced(so)) {
2071 		rv = 0;
2072 	} else
2073 #endif /* SOCKET_SPLICE */
2074 	if (kn->kn_sfflags & NOTE_OOB) {
2075 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
2076 			kn->kn_fflags |= NOTE_OOB;
2077 			kn->kn_data -= so->so_oobmark;
2078 			rv = 1;
2079 		}
2080 	} else if (so->so_state & SS_CANTRCVMORE) {
2081 		kn->kn_flags |= EV_EOF;
2082 		if (kn->kn_flags & __EV_POLL) {
2083 			if (so->so_state & SS_ISDISCONNECTED)
2084 				kn->kn_flags |= __EV_HUP;
2085 		}
2086 		kn->kn_fflags = so->so_error;
2087 		rv = 1;
2088 	} else if (so->so_error) {	/* temporary udp error */
2089 		rv = 1;
2090 	} else if (kn->kn_sfflags & NOTE_LOWAT) {
2091 		rv = (kn->kn_data >= kn->kn_sdata);
2092 	} else {
2093 		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
2094 	}
2095 	if ((hint & NOTE_SUBMIT) == 0)
2096 		sounlock(so, s);
2097 
2098 	return rv;
2099 }
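/*
 * Illustrative userland counterpart (a sketch, not kernel code; "kq"
 * and "s" are placeholder descriptors): only wake up once at least
 * 512 bytes are buffered.
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 512, NULL);
 *	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 */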
2100 
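/*
 * Detach a send side knote, clearing SB_KNOTE with the last one.
 */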
2101 void
2102 filt_sowdetach(struct knote *kn)
2103 {
2104 	struct socket *so = kn->kn_fp->f_data;
2105 
2106 	KERNEL_ASSERT_LOCKED();
2107 
2108 	klist_remove(&so->so_snd.sb_sel.si_note, kn);
2109 	if (klist_empty(&so->so_snd.sb_sel.si_note))
2110 		so->so_snd.sb_flagsintr &= ~SB_KNOTE;
2111 }
2112 
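/*
 * Write filter: kn_data is the free space in the send buffer; the
 * event fires on EOF, on a pending error, or once the low water mark
 * (overridable with NOTE_LOWAT) fits.  Connection oriented sockets
 * are never writable before they are connected.
 */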
2113 int
2114 filt_sowrite(struct knote *kn, long hint)
2115 {
2116 	struct socket *so = kn->kn_fp->f_data;
2117 	int s, rv;
2118 
2119 	if ((hint & NOTE_SUBMIT) == 0)
2120 		s = solock(so);
2121 	kn->kn_data = sbspace(so, &so->so_snd);
2122 	if (so->so_state & SS_CANTSENDMORE) {
2123 		kn->kn_flags |= EV_EOF;
2124 		if (kn->kn_flags & __EV_POLL) {
2125 			if (so->so_state & SS_ISDISCONNECTED)
2126 				kn->kn_flags |= __EV_HUP;
2127 		}
2128 		kn->kn_fflags = so->so_error;
2129 		rv = 1;
2130 	} else if (so->so_error) {	/* temporary udp error */
2131 		rv = 1;
2132 	} else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2133 	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2134 		rv = 0;
2135 	} else if (kn->kn_sfflags & NOTE_LOWAT) {
2136 		rv = (kn->kn_data >= kn->kn_sdata);
2137 	} else {
2138 		rv = (kn->kn_data >= so->so_snd.sb_lowat);
2139 	}
2140 	if ((hint & NOTE_SUBMIT) == 0)
2141 		sounlock(so, s);
2142 
2143 	return (rv);
2144 }
2145 
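/*
 * Listen filter: kn_data is the number of completed connections
 * ready to be accepted; any nonzero count triggers the event.
 */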
2146 int
2147 filt_solisten(struct knote *kn, long hint)
2148 {
2149 	struct socket *so = kn->kn_fp->f_data;
2150 	int s;
2151 
2152 	if ((hint & NOTE_SUBMIT) == 0)
2153 		s = solock(so);
2154 	kn->kn_data = so->so_qlen;
2155 	if ((hint & NOTE_SUBMIT) == 0)
2156 		sounlock(so, s);
2157 
2158 	return (kn->kn_data != 0);
2159 }
2160 
2161 #ifdef DDB
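/* Helpers for dumping socket state from the kernel debugger. */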
2162 void
2163 sobuf_print(struct sockbuf *,
2164     int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));
2165 
2166 void
2167 sobuf_print(struct sockbuf *sb,
2168     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2169 {
2170 	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
2171 	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
2172 	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
2173 	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
2174 	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
2175 	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
2176 	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
2177 	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
2178 	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
2179 	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
2180 	(*pr)("\tsb_sel: ...\n");
2181 	(*pr)("\tsb_flagsintr: %d\n", sb->sb_flagsintr);
2182 	(*pr)("\tsb_flags: %i\n", sb->sb_flags);
2183 	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
2184 }
2185 
2186 void
2187 so_print(void *v,
2188     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2189 {
2190 	struct socket *so = v;
2191 
2192 	(*pr)("socket %p\n", so);
2193 	(*pr)("so_type: %i\n", so->so_type);
2194 	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
2195 	(*pr)("so_linger: %i\n", so->so_linger);
2196 	(*pr)("so_state: 0x%04x\n", so->so_state);
2197 	(*pr)("so_pcb: %p\n", so->so_pcb);
2198 	(*pr)("so_proto: %p\n", so->so_proto);
2199 	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);
2200 
2201 	(*pr)("so_head: %p\n", so->so_head);
2202 	(*pr)("so_onq: %p\n", so->so_onq);
2203 	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
2204 	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
2205 	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
2206 	(*pr)("so_q0len: %i\n", so->so_q0len);
2207 	(*pr)("so_qlen: %i\n", so->so_qlen);
2208 	(*pr)("so_qlimit: %i\n", so->so_qlimit);
2209 	(*pr)("so_timeo: %i\n", so->so_timeo);
2210 	(*pr)("so_oobmark: %lu\n", so->so_oobmark);
2211 
2212 	(*pr)("so_sp: %p\n", so->so_sp);
2213 	if (so->so_sp != NULL) {
2214 		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
2215 		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
2216 		(*pr)("\tssp_len: %lld\n",
2217 		    (unsigned long long)so->so_sp->ssp_len);
2218 		(*pr)("\tssp_max: %lld\n",
2219 		    (unsigned long long)so->so_sp->ssp_max);
2220 		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
2221 		    so->so_sp->ssp_idletv.tv_usec);
2222 		(*pr)("\tssp_idleto: %spending (@%i)\n",
2223 		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
2224 		    so->so_sp->ssp_idleto.to_time);
2225 	}
2226 
2227 	(*pr)("so_rcv:\n");
2228 	sobuf_print(&so->so_rcv, pr);
2229 	(*pr)("so_snd:\n");
2230 	sobuf_print(&so->so_snd, pr);
2231 
2232 	(*pr)("so_upcall: %p so_upcallarg: %p\n",
2233 	    so->so_upcall, so->so_upcallarg);
2234 
2235 	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
2236 	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
2237 	(*pr)("so_cpid: %d\n", so->so_cpid);
2238 }
2239 #endif
2240