/*	$OpenBSD: uipc_socket.c,v 1.289 2022/09/05 14:56:08 bluhm Exp $	*/
/*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>
#include <sys/refcnt.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
void	soreaper(void *);
void	soput(void *);
int	somove(struct socket *, int);
void	sorflush(struct socket *);

void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_soexcept(struct knote *kn, long hint);
int	filt_solisten(struct knote *kn, long hint);
int	filt_somodify(struct kevent *kev, struct knote *kn);
int	filt_soprocess(struct knote *kn, struct kevent *kev);

const struct filterops solisten_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_solisten,
	.f_modify	= filt_somodify,
	.f_process	= filt_soprocess,
};

const struct filterops soread_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soread,
	.f_modify	= filt_somodify,
	.f_process	= filt_soprocess,
};

const struct filterops sowrite_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sowdetach,
	.f_event	= filt_sowrite,
	.f_modify	= filt_somodify,
	.f_process	= filt_soprocess,
};

const struct filterops soexcept_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soexcept,
	.f_modify	= filt_somodify,
	.f_process	= filt_soprocess,
};

#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
struct taskq *sosplice_taskq;
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif

void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}

struct socket *
soalloc(int prflags)
{
	struct socket *so;

	so = pool_get(&socket_pool, prflags);
	if (so == NULL)
		return (NULL);
	rw_init_flags(&so->so_lock, "solock", RWL_DUPOK);
	refcnt_init(&so->so_refcnt);

	return (so);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreqs == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(PR_WAITOK | PR_ZERO);
	klist_init(&so->so_rcv.sb_sel.si_note, &socket_klistops, so);
	klist_init(&so->so_snd.sb_sel.si_note, &socket_klistops, so);
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	solock(so);
	error = pru_attach(so, proto);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, 0);
		return (error);
	}
	sounlock(so);
	*aso = so;
	return (0);
}
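
/*
 * Example (sketch, not part of this file): kernel-internal creation and
 * teardown of a TCP socket, roughly as a caller like sys_socket() would
 * do it.
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error)
 *		return (error);
 *	...
 *	soclose(so, 0);
 */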

int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	soassertlocked(so);
	return pru_bind(so, nam, p);
}

int
solisten(struct socket *so, int backlog)
{
	int error;

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = pru_listen(so);
	if (error)
		return (error);
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	if (backlog < sominconn)
		backlog = sominconn;
	so->so_qlimit = backlog;
	return (0);
}
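
/*
 * Worked example of the backlog clamping above, assuming the default
 * limits somaxconn = SOMAXCONN (128) and sominconn = SOMINCONN (80):
 * listen(s, -1) and listen(s, 1000) both yield so_qlimit = 128, while
 * listen(s, 10) is raised to so_qlimit = 80.
 */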

#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2
void
sofree(struct socket *so, int keep_lock)
{
	int persocket = solock_persocket(so);

	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		if (!keep_lock)
			sounlock(so);
		return;
	}
	if (so->so_head) {
		struct socket *head = so->so_head;

		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (so->so_onq == &head->so_q) {
			if (!keep_lock)
				sounlock(so);
			return;
		}

		if (persocket) {
			/*
			 * Concurrent close of `head' could
			 * abort `so' due to re-lock.
			 */
			soref(so);
			soref(head);
			sounlock(so);
			solock(head);
			solock(so);

			if (so->so_onq != &head->so_q0) {
				sounlock(head);
				sounlock(so);
				sorele(head);
				sorele(so);
				return;
			}

			sorele(head);
			sorele(so);
		}

		soqremque(so, 0);

		if (persocket)
			sounlock(head);
	}

	if (persocket) {
		sounlock(so);
		refcnt_finalize(&so->so_refcnt, "sofinal");
		solock(so);
	}

	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_sel.si_note);
	klist_free(&so->so_snd.sb_sel.si_note);
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		if (issplicedback(so)) {
			int freeing = SOSP_FREEING_WRITE;

			if (so->so_sp->ssp_soback == so)
				freeing |= SOSP_FREEING_READ;
			sounsplice(so->so_sp->ssp_soback, so, freeing);
		}
		if (isspliced(so)) {
			int freeing = SOSP_FREEING_READ;

			if (so == so->so_sp->ssp_socket)
				freeing |= SOSP_FREEING_WRITE;
			sounsplice(so, so->so_sp->ssp_socket, freeing);
		}
	}
#endif /* SOCKET_SPLICE */
	sbrelease(so, &so->so_snd);
	sorflush(so);
	if (!keep_lock)
		sounlock(so);
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		/* Reuse splice idle, sounsplice() has been called before. */
		timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so);
		timeout_add(&so->so_sp->ssp_idleto, 0);
	} else
#endif /* SOCKET_SPLICE */
	{
		pool_put(&socket_pool, so);
	}
}

static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int error = 0;

	solock(so);
	/* Revoke async IO early. There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if (so->so_pcb == NULL)
			goto discard;
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		error2 = pru_detach(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		int persocket = solock_persocket(so);

		if (persocket) {
			/* Wait for concurrent sonewconn() threads. */
			while (so->so_newconn > 0) {
				so->so_state |= SS_NEWCONN_WAIT;
				sosleep_nsec(so, &so->so_newconn, PSOCK,
					"netlck", INFSLP);
			}
		}

		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 0);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 1);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;
	/* sofree() calls sounlock(). */
	sofree(so, 0);
	return (error);
}

void
soabort(struct socket *so)
{
	soassertlocked(so);
	pru_abort(so);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = pru_accept(so, nam);
	else
		error = ECONNABORTED;
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = pru_connect(so, nam);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int persocket, error;

	if ((persocket = solock_persocket(so1)))
		solock_pair(so1, so2);
	else
		solock(so1);

	error = pru_connect2(so1, so2);

	if (persocket)
		sounlock(so2);
	sounlock(so1);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = pru_disconnect(so);
	return (error);
}

int m_getuio(struct mbuf **, int, long, struct uio *);

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; goto release; }

	solock(so);
restart:
	if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	so->so_state |= SS_ISSENDING;
	do {
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(so, &so->so_snd);
			error = sbwait(so, &so->so_snd);
			so->so_state &= ~SS_ISSENDING;
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				sounlock(so);
				error = m_getuio(&top, atomic, space, uio);
				solock(so);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			if (flags & MSG_OOB)
				error = pru_sendoob(so, top, addr, control);
			else
				error = pru_send(so, top, addr, control);
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_state &= ~SS_ISSENDING;
	sbunlock(so, &so->so_snd);
out:
	sounlock(so);
	m_freem(top);
	m_freem(control);
	return (error);
}
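
/*
 * Example (sketch, not part of this file): a typical kernel caller such
 * as soo_write() passes the data as a uio with no address or control
 * mbufs:
 *
 *	error = sosend(so, NULL, uio, NULL, NULL, flags);
 *
 * Both top and control, when given, are consumed even on error.
 */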

int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
			m->m_pkthdr.len = 0;
			m->m_pkthdr.ph_ifidx = 0;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbufs together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the caller's locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking the network stack for the entire time here,
 * we release the solock() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	u_long len, offset, moff;
	int flags, error, type, uio_error = 0;
	const struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	size_t resid, orig_resid = uio->uio_resid;

	mp = mp0;
	if (paddr)
		*paddr = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		solock(so);
		error = pru_rcvoob(so, m, flags & MSG_PEEK);
		sounlock(so);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    ulmin(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

	solock_shared(so);
restart:
	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
		sounlock_shared(so);
		return (error);
	}

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
		    if (!isspliced(so))
#endif /* SOCKET_SPLICE */
			panic("receive 1: so %p, so_type %d, sb_cc %lu",
			    so, so->so_type, so->so_rcv.sb_cc);
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if (flags & MSG_DONTWAIT) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(so, &so->so_rcv);
		error = sbwait(so, &so->so_rcv);
		if (error) {
			sounlock_shared(so);
			return (error);
		}
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				so->so_rcv.sb_mb = m_free(m);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		int skip = 0;
		if (flags & MSG_PEEK) {
			if (mtod(m, struct cmsghdr *)->cmsg_type ==
			    SCM_RIGHTS) {
				/* don't leak internalized SCM_RIGHTS msgs */
				skip = 1;
			} else if (controlp)
				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize) {
					sounlock_shared(so);
					error =
					    (*pr->pr_domain->dom_externalize)
					    (cm, controllen, flags);
					solock_shared(so);
				}
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose)
					pr->pr_domain->dom_dispose(cm);
				m_free(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp && !skip)
			controlp = &(*controlp)->m_next;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else if (m->m_type == MT_CONTROL) {
			/*
			 * If there is more than one control message in the
			 * stream, we do a short read.  Next can be received
			 * or disposed by another system call.
			 */
			break;
#ifdef DIAGNOSTIC
		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * The sockbuf must be consistent here (sb_mb points to the
		 * current mbuf, nextrecord to the next record) when we
		 * release the lock; we must note any additions to the
		 * sockbuf when we reacquire it.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			sounlock_shared(so);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			solock_shared(so);
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
				orig_resid = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(so, &so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
				orig_resid = 0;
			} else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			error = sbwait(so, &so->so_rcv);
			if (error) {
				sbunlock(so, &so->so_rcv);
				sounlock_shared(so);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(so, &so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD)
			pru_rcvd(so);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(so, &so->so_rcv);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(so, &so->so_rcv);
	sounlock_shared(so);
	return (error);
}
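
/*
 * Example (sketch, not part of this file): a typical kernel caller such
 * as soo_read() receives into a uio and ignores address, control and
 * flags:
 *
 *	error = soreceive(so, NULL, uio, NULL, NULL, NULL, 0);
 */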

int
soshutdown(struct socket *so, int how)
{
	int error = 0;

	solock(so);
	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		error = pru_shutdown(so);
		break;
	default:
		error = EINVAL;
		break;
	}
	sounlock(so);

	return (error);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	sb->sb_flags |= SB_NOINTR;
	error = sblock(so, sb, M_WAITOK);
	/* with SB_NOINTR and M_WAITOK sblock() must not fail */
	KASSERT(error == 0);
	socantrcvmore(so);
	m = sb->sb_mb;
	memset(&sb->sb_startzero, 0,
	     (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	sbunlock(so, sb);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}

#ifdef SOCKET_SPLICE

#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task

int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file	*fp;
	struct socket	*sosp;
	struct sosplice	*sp;
	struct taskq	*tq;
	int		 error = 0;

	soassertlocked(so);

	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	}
	if (sosplice_taskq == NULL)
		return (ENOMEM);

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (ENOTCONN);
	if (so->so_sp == NULL) {
		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		if (so->so_sp == NULL)
			so->so_sp = sp;
		else
			pool_put(&sosplice_pool, sp);
	}

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		/* Lock receive buffer. */
		if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) {
			return (error);
		}
		if (so->so_sp->ssp_socket)
			sounsplice(so, so->so_sp->ssp_socket, 0);
		sbunlock(so, &so->so_rcv);
		return (0);
	}

	if (max && max < 0)
		return (EINVAL);

	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* Find sosp, the drain socket into which data will be spliced. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;
	if (sosp->so_proto->pr_usrreqs->pru_send !=
	    so->so_proto->pr_usrreqs->pru_send) {
		error = EPROTONOSUPPORT;
		goto frele;
	}
	if (sosp->so_sp == NULL) {
		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		if (sosp->so_sp == NULL)
			sosp->so_sp = sp;
		else
			pool_put(&sosplice_pool, sp);
	}

	/* Lock both receive and send buffer. */
	if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) {
		goto frele;
	}
	if ((error = sblock(so, &sosp->so_snd, M_WAITOK)) != 0) {
		sbunlock(so, &so->so_rcv);
		goto frele;
	}

	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}
	if (sosp->so_options & SO_ACCEPTCONN) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}

	/* Splice so and sosp together. */
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set_proc(&so->so_idleto, soidle, so);
	task_set(&so->so_splicetask, sotask, so);

	/*
	 * To prevent softnet interrupt from calling somove() while
	 * we sleep, the socket buffers are not marked as spliced yet.
	 */
	if (somove(so, M_WAIT)) {
		so->so_rcv.sb_flags |= SB_SPLICE;
		sosp->so_snd.sb_flags |= SB_SPLICE;
	}

 release:
	sbunlock(sosp, &sosp->so_snd);
	sbunlock(so, &so->so_rcv);
 frele:
	/*
	 * FRELE() must not be called with the socket lock held. It is safe to
	 * release the lock here as long as no other operation happens on the
	 * socket when sosplice() returns. The dance could be avoided by
	 * grabbing the socket lock inside this function.
	 */
	sounlock(so);
	FRELE(fp, curproc);
	solock(so);
	return (error);
}

void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	soassertlocked(so);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);
	sosp->so_snd.sb_flags &= ~SB_SPLICE;
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
		sorwakeup(so);
	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
		sowwakeup(sosp);
}

void
soidle(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 0);
	}
	sounlock(so);
}

void
sotask(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * We may not sleep here as sofree() and unsplice() may be
		 * called from softnet interrupt context.  This would remove
		 * the socket during somove().
		 */
		somove(so, M_DONTWAIT);
	}
	sounlock(so);

	/* Avoid user land starvation. */
	yield();
}

/*
 * The socket splicing task or idle timeout may sleep while grabbing the net
 * lock.  As sofree() can be called anytime, sotask() or soidle() could access
 * the socket memory of a freed socket after wakeup.  So delay the pool_put()
 * until all pending socket splicing tasks or timeouts have finished.  Do this
 * by scheduling it on the same threads.
 */
void
soreaper(void *arg)
{
	struct socket *so = arg;

	/* Reuse splice task, sounsplice() has been called before. */
	task_set(&so->so_sp->ssp_task, soput, so);
	task_add(sosplice_taskq, &so->so_sp->ssp_task);
}

void
soput(void *arg)
{
	struct socket *so = arg;

	pool_put(&sosplice_pool, so->so_sp);
	pool_put(&socket_pool, so);
}

/*
 * Move data from the receive buffer of a spliced source socket to the
 * send buffer of the drain socket.  Try to move as much as possible in
 * one big chunk.  It is a TCP-only implementation.
 * A return value of 0 means splicing has finished, 1 that it should
 * continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket	*sosp = so->so_sp->ssp_socket;
	struct mbuf	*m, **mp, *nextrecord;
	u_long		 len, off, oobmark;
	long		 space;
	int		 error = 0, maxreached = 0;
	unsigned int	 state;

	soassertlocked(so);

 nextpkt:
	if (so->so_error) {
		error = so->so_error;
		goto release;
	}
	if (sosp->so_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}
	if (sosp->so_error && sosp->so_error != ETIMEDOUT &&
	    sosp->so_error != EFBIG && sosp->so_error != ELOOP) {
		error = sosp->so_error;
		goto release;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace(sosp, &sosp->so_snd);
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		sbdroprecord(so, &so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			pru_rcvd(so);
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve the splice with an error if a loop is detected by
	 * the counter.
	 *
	 * If we deal with a looped broadcast/multicast packet, we bail out
	 * with no error to suppress splice termination.
	 */
	if ((m->m_flags & M_PKTHDR) &&
	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
		error = ELOOP;
		goto release;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 */
			if (!maxreached && so->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			*mp = so->so_rcv.sb_mb;
			sbfree(so, &so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(so, &so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD)
		pru_rcvd(so);

	/* Receive buffer did shrink by len bytes, adjust oob. */
	state = so->so_state;
	so->so_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((state & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (state & SS_RCVATMARK) {
			o = m_get(wait, MT_DATA);
			state &= ~SS_RCVATMARK;
		} else if (oobmark) {
			o = m_split(m, oobmark, wait);
			if (o) {
				error = pru_send(sosp, m, NULL, NULL);
				if (error) {
					if (sosp->so_state & SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);
			error = pru_sendoob(sosp, o, NULL, NULL);
			if (error) {
				if (sosp->so_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					state |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_state &= ~SS_ISSENDING;
	error = pru_send(sosp, m, NULL, NULL);
	if (error) {
		if (sosp->so_state & SS_CANTSENDMORE)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

 release:
	sosp->so_state &= ~SS_ISSENDING;
	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		so->so_error = error;
	if (((so->so_state & SS_CANTRCVMORE) && so->so_rcv.sb_cc == 0) ||
	    (sosp->so_state & SS_CANTSENDMORE) || maxreached || error) {
		sounsplice(so, sosp, 0);
		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}

#endif /* SOCKET_SPLICE */

void
sorwakeup(struct socket *so)
{
	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * TCP has a send buffer that can handle multiple packets
		 * at once.  So queue the stream a bit to accumulate data.
		 * The sosplice thread will call somove() later and send
		 * the packets calling tcp_output() only once.
		 * In the UDP case, send out the packets immediately.
		 * Using a thread would make things slower.
		 */
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			task_add(sosplice_taskq, &so->so_splicetask);
		else
			somove(so, M_DONTWAIT);
	}
	if (isspliced(so))
		return;
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}

void
sowwakeup(struct socket *so)
{
	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (so->so_snd.sb_flags & SB_SPLICE)
		task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask);
	if (issplicedback(so))
		return;
#endif
	sowakeup(so, &so->so_snd);
}

int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	soassertlocked(so);

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {
		case SO_BINDANY:
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			break;
		}

		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* FALLTHROUGH */

		case SO_BINDANY:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_DONTROUTE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;
			switch (optname) {

			case SO_SNDBUF:
				if (so->so_state & SS_CANTSENDMORE)
					return (EINVAL);
				if (sbcheckreserve(cnt, so->so_snd.sb_wat) ||
				    sbreserve(so, &so->so_snd, cnt))
					return (ENOBUFS);
				so->so_snd.sb_wat = cnt;
				break;

			case SO_RCVBUF:
				if (so->so_state & SS_CANTRCVMORE)
					return (EINVAL);
				if (sbcheckreserve(cnt, so->so_rcv.sb_wat) ||
				    sbreserve(so, &so->so_rcv, cnt))
					return (ENOBUFS);
				so->so_rcv.sb_wat = cnt;
				break;

			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (cnt > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : cnt;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (cnt > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : cnt;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			if (nsecs == 0)
				nsecs = INFSLP;
			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo_nsecs = nsecs;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo_nsecs = nsecs;
				break;
			}
			break;
		    }
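
		/*
		 * Worked example for the case above: a SO_SNDTIMEO of
		 * { 1, 500000 } (1.5 s) is stored as 1500000000 nsecs,
		 * while a zeroed timeval disables the timeout by
		 * storing INFSLP.
		 */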

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				return (error);
			}
			error = ENOPROTOOPT;
			break;

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				return (EINVAL);
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				   &mtod(m, struct splice *)->sp_idle);
			}
			break;
1930 #endif /* SOCKET_SPLICE */
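		/*
		 * Illustrative userland sketch, not code from this file
		 * (s and d are assumed socket descriptors): splice at
		 * most one megabyte from s to d inside the kernel.
		 * Passing a plain int instead of a struct splice selects
		 * the drain socket without a byte limit; an int of -1
		 * dissolves an existing splice.
		 *
		 *	struct splice sp = {
		 *		.sp_fd = d,
		 *		.sp_max = 1024 * 1024,
		 *	};
		 *	if (setsockopt(s, SOL_SOCKET, SO_SPLICE,
		 *	    &sp, sizeof(sp)) == -1)
		 *		err(1, "SO_SPLICE");
		 */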
1931 
1932 		default:
1933 			error = ENOPROTOOPT;
1934 			break;
1935 		}
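		/*
		 * Give the protocol layer a chance to act on the option
		 * as well; its return value is ignored and the socket
		 * level result is what the caller sees.
		 */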
1936 		if (error == 0 && so->so_proto->pr_ctloutput) {
1937 			(*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
1938 			    level, optname, m);
1939 		}
1940 	}
1941 
1942 	return (error);
1943 }
1944 
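/*
 * Read back a socket option.  Levels other than SOL_SOCKET are handed
 * to the protocol's ctloutput handler; SOL_SOCKET options are answered
 * from the socket itself into the caller-supplied mbuf.  The socket
 * must be locked.
 */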
1945 int
1946 sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
1947 {
1948 	int error = 0;
1949 
1950 	soassertlocked(so);
1951 
1952 	if (level != SOL_SOCKET) {
1953 		if (so->so_proto->pr_ctloutput) {
1954 			m->m_len = 0;
1955 
1956 			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
1957 			    level, optname, m);
1958 			if (error)
1959 				return (error);
1960 			return (0);
1961 		} else
1962 			return (ENOPROTOOPT);
1963 	} else {
1964 		m->m_len = sizeof (int);
1965 
1966 		switch (optname) {
1967 
1968 		case SO_LINGER:
1969 			m->m_len = sizeof (struct linger);
1970 			mtod(m, struct linger *)->l_onoff =
1971 			    so->so_options & SO_LINGER;
1972 			mtod(m, struct linger *)->l_linger = so->so_linger;
1973 			break;
1974 
1975 		case SO_BINDANY:
1976 		case SO_USELOOPBACK:
1977 		case SO_DEBUG:
1978 		case SO_KEEPALIVE:
1979 		case SO_REUSEADDR:
1980 		case SO_REUSEPORT:
1981 		case SO_BROADCAST:
1982 		case SO_OOBINLINE:
1983 		case SO_TIMESTAMP:
1984 		case SO_ZEROIZE:
1985 			*mtod(m, int *) = so->so_options & optname;
1986 			break;
1987 
1988 		case SO_DONTROUTE:
1989 			*mtod(m, int *) = 0;
1990 			break;
1991 
1992 		case SO_TYPE:
1993 			*mtod(m, int *) = so->so_type;
1994 			break;
1995 
1996 		case SO_ERROR:
1997 			*mtod(m, int *) = so->so_error;
1998 			so->so_error = 0;
1999 			break;
2000 
2001 		case SO_DOMAIN:
2002 			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
2003 			break;
2004 
2005 		case SO_PROTOCOL:
2006 			*mtod(m, int *) = so->so_proto->pr_protocol;
2007 			break;
2008 
2009 		case SO_SNDBUF:
2010 			*mtod(m, int *) = so->so_snd.sb_hiwat;
2011 			break;
2012 
2013 		case SO_RCVBUF:
2014 			*mtod(m, int *) = so->so_rcv.sb_hiwat;
2015 			break;
2016 
2017 		case SO_SNDLOWAT:
2018 			*mtod(m, int *) = so->so_snd.sb_lowat;
2019 			break;
2020 
2021 		case SO_RCVLOWAT:
2022 			*mtod(m, int *) = so->so_rcv.sb_lowat;
2023 			break;
2024 
2025 		case SO_SNDTIMEO:
2026 		case SO_RCVTIMEO:
2027 		    {
2028 			struct timeval tv;
2029 			uint64_t nsecs = (optname == SO_SNDTIMEO ?
2030 			    so->so_snd.sb_timeo_nsecs :
2031 			    so->so_rcv.sb_timeo_nsecs);
2032 
2033 			m->m_len = sizeof(struct timeval);
2034 			memset(&tv, 0, sizeof(tv));
2035 			if (nsecs != INFSLP)
2036 				NSEC_TO_TIMEVAL(nsecs, &tv);
2037 			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
2038 			break;
2039 		    }
2040 
2041 		case SO_RTABLE:
2042 			if (so->so_proto->pr_domain &&
2043 			    so->so_proto->pr_domain->dom_protosw &&
2044 			    so->so_proto->pr_ctloutput) {
2045 				const struct domain *dom =
2046 				    so->so_proto->pr_domain;
2047 
2048 				level = dom->dom_protosw->pr_protocol;
2049 				error = (*so->so_proto->pr_ctloutput)
2050 				    (PRCO_GETOPT, so, level, optname, m);
2051 				if (error)
2052 					return (error);
2053 				break;
2054 			}
2055 			return (ENOPROTOOPT);
2056 
2057 #ifdef SOCKET_SPLICE
2058 		case SO_SPLICE:
2059 		    {
2060 			off_t len;
2061 
2062 			m->m_len = sizeof(off_t);
2063 			len = so->so_sp ? so->so_sp->ssp_len : 0;
2064 			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
2065 			break;
2066 		    }
2067 #endif /* SOCKET_SPLICE */
2068 
2069 		case SO_PEERCRED:
2070 			if (so->so_proto->pr_protocol == AF_UNIX) {
2071 				struct unpcb *unp = sotounpcb(so);
2072 
2073 				if (unp->unp_flags & UNP_FEIDS) {
2074 					m->m_len = sizeof(unp->unp_connid);
2075 					memcpy(mtod(m, caddr_t),
2076 					    &(unp->unp_connid), m->m_len);
2077 					break;
2078 				}
2079 				return (ENOTCONN);
2080 			}
2081 			return (EOPNOTSUPP);
2082 
2083 		default:
2084 			return (ENOPROTOOPT);
2085 		}
2086 		return (0);
2087 	}
2088 }
2089 
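/*
 * Out-of-band data has arrived: post SIGURG to the registered owner
 * and wake any kqueue/poll waiters on the receive side.
 */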
2090 void
2091 sohasoutofband(struct socket *so)
2092 {
2093 	pgsigio(&so->so_sigio, SIGURG, 0);
2094 	KNOTE(&so->so_rcv.sb_sel.si_note, 0);
2095 }
2096 
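/*
 * Attach a knote to a socket.  Listening sockets get the listen filter
 * for EVFILT_READ; otherwise the filter and socket buffer follow the
 * requested event type.  Illustrative userland sketch, not code from
 * this file (s and kq are assumed descriptors), arming a low-water
 * read filter that filt_soread() below honours via NOTE_LOWAT:
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 128, NULL);
 *	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 */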
2097 int
2098 soo_kqfilter(struct file *fp, struct knote *kn)
2099 {
2100 	struct socket *so = kn->kn_fp->f_data;
2101 	struct sockbuf *sb;
2102 
2103 	solock(so);
2104 	switch (kn->kn_filter) {
2105 	case EVFILT_READ:
2106 		if (so->so_options & SO_ACCEPTCONN)
2107 			kn->kn_fop = &solisten_filtops;
2108 		else
2109 			kn->kn_fop = &soread_filtops;
2110 		sb = &so->so_rcv;
2111 		break;
2112 	case EVFILT_WRITE:
2113 		kn->kn_fop = &sowrite_filtops;
2114 		sb = &so->so_snd;
2115 		break;
2116 	case EVFILT_EXCEPT:
2117 		kn->kn_fop = &soexcept_filtops;
2118 		sb = &so->so_rcv;
2119 		break;
2120 	default:
2121 		sounlock(so);
2122 		return (EINVAL);
2123 	}
2124 
2125 	klist_insert_locked(&sb->sb_sel.si_note, kn);
2126 	sounlock(so);
2127 
2128 	return (0);
2129 }
2130 
2131 void
2132 filt_sordetach(struct knote *kn)
2133 {
2134 	struct socket *so = kn->kn_fp->f_data;
2135 
2136 	klist_remove(&so->so_rcv.sb_sel.si_note, kn);
2137 }
2138 
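/*
 * Report readability: EOF on the receive side (with hangup detection
 * for poll), a pending socket error, or buffered data at or above the
 * low-water mark.  With NOTE_LOWAT the caller-supplied threshold is
 * used instead.  A spliced socket is never reported readable; its
 * data is drained inside the kernel.
 */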
2139 int
2140 filt_soread(struct knote *kn, long hint)
2141 {
2142 	struct socket *so = kn->kn_fp->f_data;
2143 	int rv = 0;
2144 
2145 	soassertlocked(so);
2146 
2147 	kn->kn_data = so->so_rcv.sb_cc;
2148 #ifdef SOCKET_SPLICE
2149 	if (isspliced(so)) {
2150 		rv = 0;
2151 	} else
2152 #endif /* SOCKET_SPLICE */
2153 	if (so->so_state & SS_CANTRCVMORE) {
2154 		kn->kn_flags |= EV_EOF;
2155 		if (kn->kn_flags & __EV_POLL) {
2156 			if (so->so_state & SS_ISDISCONNECTED)
2157 				kn->kn_flags |= __EV_HUP;
2158 		}
2159 		kn->kn_fflags = so->so_error;
2160 		rv = 1;
2161 	} else if (so->so_error) {	/* temporary udp error */
2162 		rv = 1;
2163 	} else if (kn->kn_sfflags & NOTE_LOWAT) {
2164 		rv = (kn->kn_data >= kn->kn_sdata);
2165 	} else {
2166 		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
2167 	}
2168 
2169 	return rv;
2170 }
2171 
2172 void
2173 filt_sowdetach(struct knote *kn)
2174 {
2175 	struct socket *so = kn->kn_fp->f_data;
2176 
2177 	klist_remove(&so->so_snd.sb_sel.si_note, kn);
2178 }
2179 
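/*
 * Report writability: EOF on the send side, a pending socket error,
 * or enough buffer space at or above the low-water mark.  A
 * connection-mode socket that is not yet connected is never writable.
 */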
2180 int
2181 filt_sowrite(struct knote *kn, long hint)
2182 {
2183 	struct socket *so = kn->kn_fp->f_data;
2184 	int rv;
2185 
2186 	soassertlocked(so);
2187 
2188 	kn->kn_data = sbspace(so, &so->so_snd);
2189 	if (so->so_state & SS_CANTSENDMORE) {
2190 		kn->kn_flags |= EV_EOF;
2191 		if (kn->kn_flags & __EV_POLL) {
2192 			if (so->so_state & SS_ISDISCONNECTED)
2193 				kn->kn_flags |= __EV_HUP;
2194 		}
2195 		kn->kn_fflags = so->so_error;
2196 		rv = 1;
2197 	} else if (so->so_error) {	/* temporary udp error */
2198 		rv = 1;
2199 	} else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2200 	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2201 		rv = 0;
2202 	} else if (kn->kn_sfflags & NOTE_LOWAT) {
2203 		rv = (kn->kn_data >= kn->kn_sdata);
2204 	} else {
2205 		rv = (kn->kn_data >= so->so_snd.sb_lowat);
2206 	}
2207 
2208 	return (rv);
2209 }
2210 
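/*
 * Report exceptional conditions: pending out-of-band data when
 * NOTE_OOB was requested, and hangup for poll(2) emulation.  A spliced
 * socket suppresses the out-of-band report, but hangup is still
 * delivered.
 */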
2211 int
2212 filt_soexcept(struct knote *kn, long hint)
2213 {
2214 	struct socket *so = kn->kn_fp->f_data;
2215 	int rv = 0;
2216 
2217 	soassertlocked(so);
2218 
2219 #ifdef SOCKET_SPLICE
2220 	if (isspliced(so)) {
2221 		rv = 0;
2222 	} else
2223 #endif /* SOCKET_SPLICE */
2224 	if (kn->kn_sfflags & NOTE_OOB) {
2225 		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
2226 			kn->kn_fflags |= NOTE_OOB;
2227 			kn->kn_data -= so->so_oobmark;
2228 			rv = 1;
2229 		}
2230 	}
2231 
2232 	if (kn->kn_flags & __EV_POLL) {
2233 		if (so->so_state & SS_ISDISCONNECTED) {
2234 			kn->kn_flags |= __EV_HUP;
2235 			rv = 1;
2236 		}
2237 	}
2238 
2239 	return rv;
2240 }
2241 
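/*
 * Report a listening socket as ready when completed connections are
 * queued for accept(2).  For poll(2) and select(2), disconnection is
 * reported as hangup.
 */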
2242 int
2243 filt_solisten(struct knote *kn, long hint)
2244 {
2245 	struct socket *so = kn->kn_fp->f_data;
2246 	int active;
2247 
2248 	soassertlocked(so);
2249 
2250 	kn->kn_data = so->so_qlen;
2251 	active = (kn->kn_data != 0);
2252 
2253 	if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
2254 		if (so->so_state & SS_ISDISCONNECTED) {
2255 			kn->kn_flags |= __EV_HUP;
2256 			active = 1;
2257 		} else {
2258 			active = soreadable(so);
2259 		}
2260 	}
2261 
2262 	return (active);
2263 }
2264 
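/*
 * The modify and process callbacks below wrap the generic knote
 * handlers with the socket lock so the filters above always run with
 * the socket state stable.
 */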
2265 int
2266 filt_somodify(struct kevent *kev, struct knote *kn)
2267 {
2268 	struct socket *so = kn->kn_fp->f_data;
2269 	int rv;
2270 
2271 	solock(so);
2272 	rv = knote_modify(kev, kn);
2273 	sounlock(so);
2274 
2275 	return (rv);
2276 }
2277 
2278 int
2279 filt_soprocess(struct knote *kn, struct kevent *kev)
2280 {
2281 	struct socket *so = kn->kn_fp->f_data;
2282 	int rv;
2283 
2284 	solock(so);
2285 	rv = knote_process(kn, kev);
2286 	sounlock(so);
2287 
2288 	return (rv);
2289 }
2290 
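/*
 * klist glue: socket klists are serialized by the socket lock,
 * exported to the event subsystem through socket_klistops.
 */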
2291 void
2292 klist_soassertlk(void *arg)
2293 {
2294 	struct socket *so = arg;
2295 
2296 	soassertlocked(so);
2297 }
2298 
2299 int
2300 klist_solock(void *arg)
2301 {
2302 	struct socket *so = arg;
2303 
2304 	solock(so);
2305 	return (1);
2306 }
2307 
2308 void
2309 klist_sounlock(void *arg, int ls)
2310 {
2311 	struct socket *so = arg;
2312 
2313 	sounlock(so);
2314 }
2315 
2316 const struct klistops socket_klistops = {
2317 	.klo_assertlk	= klist_soassertlk,
2318 	.klo_lock	= klist_solock,
2319 	.klo_unlock	= klist_sounlock,
2320 };
2321 
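/*
 * ddb(4) helpers for dumping a socket and its buffers from the
 * kernel debugger.
 */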
2322 #ifdef DDB
2323 void
2324 sobuf_print(struct sockbuf *,
2325     int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));
2326 
2327 void
2328 sobuf_print(struct sockbuf *sb,
2329     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2330 {
2331 	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
2332 	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
2333 	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
2334 	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
2335 	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
2336 	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
2337 	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
2338 	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
2339 	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
2340 	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
2341 	(*pr)("\tsb_sel: ...\n");
2342 	(*pr)("\tsb_flags: %i\n", sb->sb_flags);
2343 	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
2344 }
2345 
2346 void
2347 so_print(void *v,
2348     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2349 {
2350 	struct socket *so = v;
2351 
2352 	(*pr)("socket %p\n", so);
2353 	(*pr)("so_type: %i\n", so->so_type);
2354 	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
2355 	(*pr)("so_linger: %i\n", so->so_linger);
2356 	(*pr)("so_state: 0x%04x\n", so->so_state);
2357 	(*pr)("so_pcb: %p\n", so->so_pcb);
2358 	(*pr)("so_proto: %p\n", so->so_proto);
2359 	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);
2360 
2361 	(*pr)("so_head: %p\n", so->so_head);
2362 	(*pr)("so_onq: %p\n", so->so_onq);
2363 	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
2364 	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
2365 	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
2366 	(*pr)("so_q0len: %i\n", so->so_q0len);
2367 	(*pr)("so_qlen: %i\n", so->so_qlen);
2368 	(*pr)("so_qlimit: %i\n", so->so_qlimit);
2369 	(*pr)("so_timeo: %i\n", so->so_timeo);
2370 	(*pr)("so_oobmark: %lu\n", so->so_oobmark);
2371 
2372 	(*pr)("so_sp: %p\n", so->so_sp);
2373 	if (so->so_sp != NULL) {
2374 		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
2375 		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
2376 		(*pr)("\tssp_len: %lld\n",
2377 		    (unsigned long long)so->so_sp->ssp_len);
2378 		(*pr)("\tssp_max: %lld\n",
2379 		    (unsigned long long)so->so_sp->ssp_max);
2380 		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
2381 		    so->so_sp->ssp_idletv.tv_usec);
2382 		(*pr)("\tssp_idleto: %spending (@%i)\n",
2383 		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
2384 		    so->so_sp->ssp_idleto.to_time);
2385 	}
2386 
2387 	(*pr)("so_rcv:\n");
2388 	sobuf_print(&so->so_rcv, pr);
2389 	(*pr)("so_snd:\n");
2390 	sobuf_print(&so->so_snd, pr);
2391 
2392 	(*pr)("so_upcall: %p so_upcallarg: %p\n",
2393 	    so->so_upcall, so->so_upcallarg);
2394 
2395 	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
2396 	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
2397 	(*pr)("so_cpid: %d\n", so->so_cpid);
2398 }
2399 #endif
2400