xref: /openbsd-src/sys/kern/uipc_socket.c (revision e5157e49389faebcb42b7237d55fbf096d9c2523)
1 /*	$OpenBSD: uipc_socket.c,v 1.134 2014/11/03 17:20:46 bluhm Exp $	*/
2 /*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1990, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 #include <sys/file.h>
39 #include <sys/filedesc.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/domain.h>
43 #include <sys/kernel.h>
44 #include <sys/event.h>
45 #include <sys/protosw.h>
46 #include <sys/socket.h>
47 #include <sys/unpcb.h>
48 #include <sys/socketvar.h>
49 #include <sys/signalvar.h>
50 #include <sys/resourcevar.h>
51 #include <net/if.h>
52 #include <sys/pool.h>
53 
54 void	sbsync(struct sockbuf *, struct mbuf *);
55 
56 int	sosplice(struct socket *, int, off_t, struct timeval *);
57 void	sounsplice(struct socket *, struct socket *, int);
58 void	soidle(void *);
59 int	somove(struct socket *, int);
60 
61 void	filt_sordetach(struct knote *kn);
62 int	filt_soread(struct knote *kn, long hint);
63 void	filt_sowdetach(struct knote *kn);
64 int	filt_sowrite(struct knote *kn, long hint);
65 int	filt_solisten(struct knote *kn, long hint);
66 
/*
 * Kevent filter operations for sockets.  The leading 1 marks each
 * filter as attached to a file descriptor (f_isfd); no attach routine
 * is used here, only per-filter detach and event callbacks.
 */
struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };
73 
74 
#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

/* Tunable bounds applied to the listen(2) backlog; see solisten(). */
int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

/* Backing pools for socket and (optional) splice state structures. */
struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
#endif
86 
/*
 * Initialize the pools from which socket (and splice) structures are
 * allocated.  Called once during kernel startup, before any socket
 * can be created.
 */
void
soinit(void)
{

	pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, 0, 0, "sosppl",
	    NULL);
#endif
}
97 
98 /*
99  * Socket operation routines.
100  * These routines are called by the routines in
101  * sys_socket.c or from a system process, and
102  * implement the semantics of socket operations by
103  * switching out to the protocol specific routines.
104  */
/*ARGSUSED*/
/*
 * Create a new socket of the given domain/type/protocol and attach it
 * to its protocol via PRU_ATTACH.  On success *aso holds the new
 * socket; on failure it is freed and an errno value is returned.
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	struct protosw *prp;
	struct socket *so;
	int error, s;

	/* Look up the protocol switch entry; an explicit proto wins. */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreq == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	s = splsoftnet();
	so = pool_get(&socket_pool, PR_WAITOK | PR_ZERO);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	/* Remember superuser status for later privilege checks. */
	if (suser(p, 0) == 0)
		so->so_state = SS_PRIV;
	/* Record creator credentials and pid for netstat/accounting. */
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	error = (*prp->pr_usrreq)(so, PRU_ATTACH, NULL,
	    (struct mbuf *)(long)proto, NULL, p);
	if (error) {
		/* Mark unreferenced so sofree() will actually free it. */
		so->so_state |= SS_NOFDREF;
		sofree(so);
		splx(s);
		return (error);
	}
	splx(s);
	*aso = so;
	return (0);
}
147 
148 int
149 sobind(struct socket *so, struct mbuf *nam, struct proc *p)
150 {
151 	int s = splsoftnet();
152 	int error;
153 
154 	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, p);
155 	splx(s);
156 	return (error);
157 }
158 
159 int
160 solisten(struct socket *so, int backlog)
161 {
162 	int s, error;
163 
164 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
165 		return (EOPNOTSUPP);
166 #ifdef SOCKET_SPLICE
167 	if (isspliced(so) || issplicedback(so))
168 		return (EOPNOTSUPP);
169 #endif /* SOCKET_SPLICE */
170 	s = splsoftnet();
171 	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL,
172 	    curproc);
173 	if (error) {
174 		splx(s);
175 		return (error);
176 	}
177 	if (TAILQ_FIRST(&so->so_q) == NULL)
178 		so->so_options |= SO_ACCEPTCONN;
179 	if (backlog < 0 || backlog > somaxconn)
180 		backlog = somaxconn;
181 	if (backlog < sominconn)
182 		backlog = sominconn;
183 	so->so_qlimit = backlog;
184 	splx(s);
185 	return (0);
186 }
187 
/*
 * Free a socket once it has neither a protocol control block nor a
 * file descriptor reference.  Must be called at splsoftnet().
 */
void
sofree(struct socket *so)
{
	splsoftassert(IPL_SOFTNET);

	/* Still attached to a pcb, or still referenced by a file: keep. */
	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (so->so_head) {
		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (!soqremque(so, 0))
			return;
	}
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		/* Tear down any splice links before releasing the state. */
		if (issplicedback(so))
			sounsplice(so->so_sp->ssp_soback, so,
			    so->so_sp->ssp_soback != so);
		if (isspliced(so))
			sounsplice(so, so->so_sp->ssp_socket, 0);
		pool_put(&sosplice_pool, so->so_sp);
		so->so_sp = NULL;
	}
#endif /* SOCKET_SPLICE */
	/* Release both buffers, then the socket itself. */
	sbrelease(&so->so_snd);
	sorflush(so);
	pool_put(&socket_pool, so);
}
223 
/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so)
{
	struct socket *so2;
	int s = splsoftnet();		/* conservative */
	int error = 0;

	/* A listening socket first aborts every queued connection. */
	if (so->so_options & SO_ACCEPTCONN) {
		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			(void) soqremque(so2, 0);
			(void) soabort(so2);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			(void) soqremque(so2, 1);
			(void) soabort(so2);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking linger: do not wait for the drain. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/* Sleep up to so_linger seconds for disconnect. */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		/* Detach the pcb; preserve the first error seen. */
		int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, NULL,
		    NULL, NULL, curproc);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}
282 
283 /*
284  * Must be called at splsoftnet.
285  */
286 int
287 soabort(struct socket *so)
288 {
289 	splsoftassert(IPL_SOFTNET);
290 
291 	return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL, NULL, NULL,
292 	   curproc);
293 }
294 
295 int
296 soaccept(struct socket *so, struct mbuf *nam)
297 {
298 	int s = splsoftnet();
299 	int error = 0;
300 
301 	if ((so->so_state & SS_NOFDREF) == 0)
302 		panic("soaccept: !NOFDREF");
303 	so->so_state &= ~SS_NOFDREF;
304 	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
305 	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
306 		error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, NULL,
307 		    nam, NULL, curproc);
308 	else
309 		error = ECONNABORTED;
310 	splx(s);
311 	return (error);
312 }
313 
314 int
315 soconnect(struct socket *so, struct mbuf *nam)
316 {
317 	int s;
318 	int error;
319 
320 	if (so->so_options & SO_ACCEPTCONN)
321 		return (EOPNOTSUPP);
322 	s = splsoftnet();
323 	/*
324 	 * If protocol is connection-based, can only connect once.
325 	 * Otherwise, if connected, try to disconnect first.
326 	 * This allows user to disconnect by connecting to, e.g.,
327 	 * a null address.
328 	 */
329 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
330 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
331 	    (error = sodisconnect(so))))
332 		error = EISCONN;
333 	else
334 		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
335 		    NULL, nam, NULL, curproc);
336 	splx(s);
337 	return (error);
338 }
339 
340 int
341 soconnect2(struct socket *so1, struct socket *so2)
342 {
343 	int s = splsoftnet();
344 	int error;
345 
346 	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL,
347 	    (struct mbuf *)so2, NULL, curproc);
348 	splx(s);
349 	return (error);
350 }
351 
352 int
353 sodisconnect(struct socket *so)
354 {
355 	int s = splsoftnet();
356 	int error;
357 
358 	if ((so->so_state & SS_ISCONNECTED) == 0) {
359 		error = ENOTCONN;
360 		goto bad;
361 	}
362 	if (so->so_state & SS_ISDISCONNECTING) {
363 		error = EALREADY;
364 		goto bad;
365 	}
366 	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, NULL, NULL,
367 	    NULL, curproc);
368 bad:
369 	splx(s);
370 	return (error);
371 }
372 
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	struct mbuf **mp;
	struct mbuf *m;
	long space, len, mlen, clen = 0;
	quad_t resid;
	int error, s;
	/* atomic: the whole send must be delivered as one record */
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned (since uio->uio_resid is).
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 * MSG_EOR on a SOCK_STREAM socket is also invalid.
	 */
	if (resid < 0 ||
	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		clen = control->m_len;
		/* reserve extra space for AF_LOCAL's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_LOCAL &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct file *) / sizeof(int)));
	}

/* Error exit used while at splsoftnet: restore spl before release. */
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	/* Serialize against other senders on this socket. */
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	so->so_state |= SS_ISSENDING;
	do {
		s = splsoftnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				/* Control-only sends may go unconnected. */
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    (so->so_proto->pr_domain->dom_family != AF_LOCAL &&
		    clen > so->so_snd.sb_hiwat))
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			/* Not enough room now: fail or wait for space. */
			if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT))
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			so->so_state &= ~SS_ISSENDING;
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				/* Build the chain from user data. */
				if (top == 0) {
					MGETHDR(m, M_WAIT, MT_DATA);
					mlen = MHLEN;
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else {
					MGET(m, M_WAIT, MT_DATA);
					mlen = MLEN;
				}
				if (resid >= MINCLSIZE && space >= MCLBYTES) {
					MCLGET(m, M_NOWAIT);
					if ((m->m_flags & M_EXT) == 0)
						goto nopages;
					if (atomic && top == 0) {
						len = lmin(MCLBYTES - max_hdr,
						    resid);
						m->m_data += max_hdr;
					} else
						len = lmin(MCLBYTES, resid);
					space -= len;
				} else {
nopages:
					len = lmin(lmin(mlen, resid), space);
					space -= len;
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == 0 && len < mlen)
						MH_ALIGN(m, len);
				}
				error = uiomove(mtod(m, caddr_t), (int)len,
				    uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);
			s = splsoftnet();		/* XXX */
			if (resid <= 0)
				so->so_state &= ~SS_ISSENDING;
			/* Hand the assembled chain to the protocol. */
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, addr, control, curproc);
			splx(s);
			/* Ownership of top/control passed to the protocol. */
			clen = 0;
			control = 0;
			top = 0;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_state &= ~SS_ISSENDING;
	sbunlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
558 
559 /*
560  * Following replacement or removal of the first mbuf on the first
561  * mbuf chain of a socket buffer, push necessary state changes back
562  * into the socket buffer so that other consumers see the values
563  * consistently.  'nextrecord' is the callers locally stored value of
564  * the original value of sb->sb_mb->m_nextpkt which must be restored
565  * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
566  */
567 void
568 sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
569 {
570 
571 	/*
572 	 * First, update for the new value of nextrecord.  If necessary,
573 	 * make it the first record.
574 	 */
575 	if (sb->sb_mb != NULL)
576 		sb->sb_mb->m_nextpkt = nextrecord;
577 	else
578 		sb->sb_mb = nextrecord;
579 
580 	/*
581 	 * Now update any dependent socket buffer fields to reflect
582 	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
583 	 * the addition of a second clause that takes care of the
584 	 * case where sb_mb has been updated, but remains the last
585 	 * record.
586 	 */
587 	if (sb->sb_mb == NULL) {
588 		sb->sb_mbtail = NULL;
589 		sb->sb_lastrecord = NULL;
590 	} else if (sb->sb_mb->m_nextpkt == NULL)
591 		sb->sb_lastrecord = sb->sb_mb;
592 }
593 
/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	size_t orig_resid = uio->uio_resid;
	int uio_error = 0;
	int resid;

	mp = mp0;
	if (paddr)
		*paddr = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (so->so_state & SS_NBIO)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_OOB) {
		/* Out-of-band data comes straight from the protocol. */
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK), NULL, curproc);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	s = splsoftnet();

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	/* A spliced socket's data is consumed by somove(), not here. */
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
		    if (!isspliced(so))
#endif /* SOCKET_SPLICE */
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		/* Records holding OOB data or EOR must be delivered now. */
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		/* First mbuf of the record carries the sender's address. */
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	/* Next come any MT_CONTROL mbufs (ancillary data). */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(cm, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
				   error = (*pr->pr_domain->dom_externalize)(cm,
				       controllen, flags);
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose &&
				    mtod(cm, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
					pr->pr_domain->dom_dispose(cm);
				m_free(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		/* Never mix OOB and normal data in one pass. */
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			splx(s);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			s = splsoftnet();
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			/* Entire mbuf consumed. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			/* Partial mbuf consumed. */
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		/* Whole record could not be consumed: mark truncation. */
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, NULL,
			    (struct mbuf *)(long)flags, NULL, curproc);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		/* Nothing was consumed: retry rather than return 0. */
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
988 
989 int
990 soshutdown(struct socket *so, int how)
991 {
992 	struct protosw *pr = so->so_proto;
993 
994 	switch (how) {
995 	case SHUT_RD:
996 	case SHUT_RDWR:
997 		sorflush(so);
998 		if (how == SHUT_RD)
999 			return (0);
1000 		/* FALLTHROUGH */
1001 	case SHUT_WR:
1002 		return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, NULL, NULL,
1003 		    curproc);
1004 	default:
1005 		return (EINVAL);
1006 	}
1007 }
1008 
/*
 * Flush and discard everything queued on the receive buffer,
 * disposing of any in-flight descriptors (SCM_RIGHTS) via the
 * domain's dom_dispose hook.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	int s;
	struct sockbuf asb;

	/* The lock acquisition must not be interruptible by signals. */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splnet();
	socantrcvmore(so);
	sbunlock(sb);
	/* Snapshot the buffer, then wipe the live one. */
	asb = *sb;
	memset(sb, 0, sizeof (*sb));
	/* XXX - the memset stomps all over so_rcv */
	if (asb.sb_flags & SB_KNOTE) {
		/* Preserve registered knotes across the wipe. */
		sb->sb_sel.si_note = asb.sb_sel.si_note;
		sb->sb_flags = SB_KNOTE;
	}
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	/* Release the mbufs captured in the snapshot. */
	sbrelease(&asb);
}
1034 
1035 #ifdef SOCKET_SPLICE
1036 
1037 #define so_splicelen	so_sp->ssp_len
1038 #define so_splicemax	so_sp->ssp_max
1039 #define so_idletv	so_sp->ssp_idletv
1040 #define so_idleto	so_sp->ssp_idleto
1041 
1042 int
1043 sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
1044 {
1045 	struct file	*fp;
1046 	struct socket	*sosp;
1047 	int		 s, error = 0;
1048 
1049 	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
1050 		return (EPROTONOSUPPORT);
1051 	if (so->so_options & SO_ACCEPTCONN)
1052 		return (EOPNOTSUPP);
1053 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1054 	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
1055 		return (ENOTCONN);
1056 	if (so->so_sp == NULL)
1057 		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
1058 
1059 	/* If no fd is given, unsplice by removing existing link. */
1060 	if (fd < 0) {
1061 		/* Lock receive buffer. */
1062 		if ((error = sblock(&so->so_rcv,
1063 		    (so->so_state & SS_NBIO) ? M_NOWAIT : M_WAITOK)) != 0)
1064 			return (error);
1065 		s = splsoftnet();
1066 		if (so->so_sp->ssp_socket)
1067 			sounsplice(so, so->so_sp->ssp_socket, 1);
1068 		splx(s);
1069 		sbunlock(&so->so_rcv);
1070 		return (0);
1071 	}
1072 
1073 	if (max && max < 0)
1074 		return (EINVAL);
1075 
1076 	if (tv && (tv->tv_sec < 0 || tv->tv_usec < 0))
1077 		return (EINVAL);
1078 
1079 	/* Find sosp, the drain socket where data will be spliced into. */
1080 	if ((error = getsock(curproc->p_fd, fd, &fp)) != 0)
1081 		return (error);
1082 	sosp = fp->f_data;
1083 	if (sosp->so_sp == NULL)
1084 		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
1085 
1086 	/* Lock both receive and send buffer. */
1087 	if ((error = sblock(&so->so_rcv,
1088 	    (so->so_state & SS_NBIO) ? M_NOWAIT : M_WAITOK)) != 0) {
1089 		FRELE(fp, curproc);
1090 		return (error);
1091 	}
1092 	if ((error = sblock(&sosp->so_snd, M_WAITOK)) != 0) {
1093 		sbunlock(&so->so_rcv);
1094 		FRELE(fp, curproc);
1095 		return (error);
1096 	}
1097 	s = splsoftnet();
1098 
1099 	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
1100 		error = EBUSY;
1101 		goto release;
1102 	}
1103 	if (sosp->so_proto->pr_usrreq != so->so_proto->pr_usrreq) {
1104 		error = EPROTONOSUPPORT;
1105 		goto release;
1106 	}
1107 	if (sosp->so_options & SO_ACCEPTCONN) {
1108 		error = EOPNOTSUPP;
1109 		goto release;
1110 	}
1111 	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
1112 		error = ENOTCONN;
1113 		goto release;
1114 	}
1115 
1116 	/* Splice so and sosp together. */
1117 	so->so_sp->ssp_socket = sosp;
1118 	sosp->so_sp->ssp_soback = so;
1119 	so->so_splicelen = 0;
1120 	so->so_splicemax = max;
1121 	if (tv)
1122 		so->so_idletv = *tv;
1123 	else
1124 		timerclear(&so->so_idletv);
1125 	timeout_set(&so->so_idleto, soidle, so);
1126 
1127 	/*
1128 	 * To prevent softnet interrupt from calling somove() while
1129 	 * we sleep, the socket buffers are not marked as spliced yet.
1130 	 */
1131 	if (somove(so, M_WAIT)) {
1132 		so->so_rcv.sb_flagsintr |= SB_SPLICE;
1133 		sosp->so_snd.sb_flagsintr |= SB_SPLICE;
1134 	}
1135 
1136  release:
1137 	splx(s);
1138 	sbunlock(&sosp->so_snd);
1139 	sbunlock(&so->so_rcv);
1140 	FRELE(fp, curproc);
1141 	return (error);
1142 }
1143 
/*
 * Dissolve the splice between source socket so and drain socket sosp.
 * Must be called at splsoftnet.  If wakeup is set, pending readers of
 * so are notified when data is available, as it is now up to them to
 * receive it.
 */
void
sounsplice(struct socket *so, struct socket *sosp, int wakeup)
{
	splsoftassert(IPL_SOFTNET);

	timeout_del(&so->so_idleto);
	/* Clear the interrupt-level flags so somove() is not invoked anymore. */
	sosp->so_snd.sb_flagsintr &= ~SB_SPLICE;
	so->so_rcv.sb_flagsintr &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	if (wakeup && soreadable(so))
		sorwakeup(so);
}
1156 
/*
 * Timeout handler for a spliced socket that was idle too long:
 * record ETIMEDOUT on the source and dissolve the splice.
 */
void
soidle(void *arg)
{
	struct socket *so = arg;
	int s;

	s = splsoftnet();
	/* Re-check under splsoftnet; the splice may have gone away already. */
	if (so->so_rcv.sb_flagsintr & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 1);
	}
	splx(s);
}
1170 
/*
 * Move data from receive buffer of spliced source socket to send
 * buffer of drain socket.  Try to move as much as possible in one
 * big chunk.  It is a TCP only implementation.
 * Return value 0 means splicing has been finished, 1 continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket	*sosp = so->so_sp->ssp_socket;
	struct mbuf	*m, **mp, *nextrecord;
	u_long		 len, off, oobmark;
	long		 space;
	int		 error = 0, maxreached = 0;
	short		 state;

	splsoftassert(IPL_SOFTNET);

 nextpkt:
	/* Any pending error on the source socket terminates the splice. */
	if (so->so_error) {
		error = so->so_error;
		goto release;
	}
	if (sosp->so_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}
	/*
	 * ETIMEDOUT and EFBIG are set by the splice code itself and do
	 * not indicate a broken drain socket, so they are ignored here.
	 */
	if (sosp->so_error && sosp->so_error != ETIMEDOUT &&
	    sosp->so_error != EFBIG) {
		error = sosp->so_error;
		goto release;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			/* Reaching the limit ends the splice in release. */
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace(&sosp->so_snd);
	/* Grant some extra room if it helps to move urgent data out. */
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname");
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		/* Record held only address/control mbufs; drop it whole. */
		sbdroprecord(&so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb)
			(so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
			    (struct mbuf *)0L, NULL, NULL);
		goto nextpkt;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove pkthdr");
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			/* The datagram can never fit into the drain buffer. */
			error = EMSGSIZE;
			goto release;
		}
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			/* Atomic records are moved one datagram at a time. */
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(&so->so_rcv, m);
		MFREE(m, so->so_rcv.sb_mb);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(&so->so_rcv, m);
		MFREE(m, so->so_rcv.sb_mb);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type");
#endif
		if ((*mp)->m_len > size) {
			/*
			 * The last mbuf is consumed only partially.  If
			 * the splice limit was hit, copy the head out of
			 * it; otherwise leave the partial mbuf behind.
			 */
			if (!maxreached || (*mp = m_copym(
			    so->so_rcv.sb_mb, 0, size, wait)) == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			/* Unlink the whole mbuf from the receive buffer. */
			*mp = so->so_rcv.sb_mb;
			sbfree(&so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(&so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		/* Rebuild a clean packet header for the drain side. */
		m_tag_delete_chain(m);
		memset(&m->m_pkthdr, 0, sizeof(m->m_pkthdr));
		m->m_pkthdr.len = len;
		m->m_pkthdr.pf.prio = IFQ_DEFPRIO;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb)
		(so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
		    NULL, NULL, NULL);

	/* Receive buffer did shrink by len bytes, adjust oob. */
	state = so->so_state;
	so->so_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((state & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (state & SS_RCVATMARK) {
			/* The first byte of m is the urgent byte. */
			o = m_get(wait, MT_DATA);
			state &= ~SS_RCVATMARK;
		} else if (oobmark) {
			/* Flush the in-band data up to the mark first. */
			o = m_split(m, oobmark, wait);
			if (o) {
				error = (*sosp->so_proto->pr_usrreq)(sosp,
				    PRU_SEND, m, NULL, NULL, NULL);
				if (error) {
					if (sosp->so_state & SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			/* Forward the single urgent byte via PRU_SENDOOB. */
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);
			error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SENDOOB,
			    o, NULL, NULL, NULL);
			if (error) {
				if (sosp->so_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					state |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_state &= ~SS_ISSENDING;
	error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SEND, m, NULL, NULL,
	    NULL);
	if (error) {
		if (sosp->so_state & SS_CANTSENDMORE)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

 release:
	sosp->so_state &= ~SS_ISSENDING;
	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		so->so_error = error;
	/* Dissolve the splice on EOF, on error, or when the limit was hit. */
	if (((so->so_state & SS_CANTRCVMORE) && so->so_rcv.sb_cc == 0) ||
	    (sosp->so_state & SS_CANTSENDMORE) || maxreached || error) {
		sounsplice(so, sosp, 1);
		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}
1433 
1434 #undef so_splicelen
1435 #undef so_splicemax
1436 #undef so_idletv
1437 #undef so_idleto
1438 
1439 #endif /* SOCKET_SPLICE */
1440 
/*
 * Wake up consumers of a socket's receive buffer.  If the socket is
 * spliced, received data is forwarded to the drain via somove() and
 * local readers are not woken at all.
 */
void
sorwakeup(struct socket *so)
{
#ifdef SOCKET_SPLICE
	if (so->so_rcv.sb_flagsintr & SB_SPLICE)
		(void) somove(so, M_DONTWAIT);
	/* somove() may have unspliced the socket; re-check before returning. */
	if (isspliced(so))
		return;
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}
1454 
/*
 * Wake up consumers of a socket's send buffer.  If this socket is the
 * drain of a splice, freed send space may allow the splice source
 * (ssp_soback) to move more data.
 */
void
sowwakeup(struct socket *so)
{
#ifdef SOCKET_SPLICE
	if (so->so_snd.sb_flagsintr & SB_SPLICE)
		(void) somove(so->so_sp->ssp_soback, M_DONTWAIT);
#endif
	sowakeup(so, &so->so_snd);
}
1464 
/*
 * Set a socket option.  Options at a level other than SOL_SOCKET are
 * handed to the protocol's ctloutput routine.  The option mbuf m0 is
 * always consumed: freed here via the bad label, or by the protocol.
 */
int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m0)
{
	int error = 0;
	struct mbuf *m = m0;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
		error = ENOPROTOOPT;
	} else {
		/* SO_BINDANY requires privilege. */
		switch (optname) {
		case SO_BINDANY:
			if ((error = suser(curproc, 0)) != 0)	/* XXX */
				goto bad;
			break;
		}

		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* FALLTHROUGH */

		case SO_BINDANY:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			/* Boolean options map directly to so_options bits. */
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_DONTROUTE:
			/* Accepted for compatibility; only "off" is supported. */
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			cnt = *mtod(m, int *);
			/* Clamp non-positive requests to the minimum of 1. */
			if ((long)cnt <= 0)
				cnt = 1;
			switch (optname) {

			case SO_SNDBUF:
				if (so->so_state & SS_CANTSENDMORE) {
					error = EINVAL;
					goto bad;
				}
				if (sbcheckreserve(cnt, so->so_snd.sb_wat) ||
				    sbreserve(&so->so_snd, cnt)) {
					error = ENOBUFS;
					goto bad;
				}
				so->so_snd.sb_wat = cnt;
				break;

			case SO_RCVBUF:
				if (so->so_state & SS_CANTRCVMORE) {
					error = EINVAL;
					goto bad;
				}
				if (sbcheckreserve(cnt, so->so_rcv.sb_wat) ||
				    sbreserve(&so->so_rcv, cnt)) {
					error = ENOBUFS;
					goto bad;
				}
				so->so_rcv.sb_wat = cnt;
				break;

			case SO_SNDLOWAT:
				/* The low water mark may not exceed the high one. */
				so->so_snd.sb_lowat =
				    (cnt > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : cnt;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (cnt > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : cnt;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval tv;
			int val;

			if (m == NULL || m->m_len < sizeof (tv)) {
				error = EINVAL;
				goto bad;
			}
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			/* Convert to ticks; reject values exceeding USHRT_MAX. */
			val = tvtohz(&tv);
			if (val > USHRT_MAX) {
				error = EDOM;
				goto bad;
			}

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		case SO_RTABLE:
			/* Routing table selection is handled by the protocol. */
			if (so->so_proto && so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				struct domain *dom = so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				return ((*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, &m0));
			}
			error = ENOPROTOOPT;
			break;

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			/*
			 * Three input formats are accepted: no mbuf to
			 * unsplice, a plain int fd, or a struct splice
			 * carrying fd, byte limit and idle timeout.
			 */
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				error = EINVAL;
				goto bad;
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				   &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			/* Give the protocol a chance to see the option too. */
			(void) ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
			m = NULL;	/* freed by protocol */
		}
	}
bad:
	if (m)
		(void) m_free(m);
	return (error);
}
1655 
1656 int
1657 sogetopt(struct socket *so, int level, int optname, struct mbuf **mp)
1658 {
1659 	struct mbuf *m;
1660 
1661 	if (level != SOL_SOCKET) {
1662 		if (so->so_proto && so->so_proto->pr_ctloutput) {
1663 			return ((*so->so_proto->pr_ctloutput)
1664 				  (PRCO_GETOPT, so, level, optname, mp));
1665 		} else
1666 			return (ENOPROTOOPT);
1667 	} else {
1668 		m = m_get(M_WAIT, MT_SOOPTS);
1669 		m->m_len = sizeof (int);
1670 
1671 		switch (optname) {
1672 
1673 		case SO_LINGER:
1674 			m->m_len = sizeof (struct linger);
1675 			mtod(m, struct linger *)->l_onoff =
1676 				so->so_options & SO_LINGER;
1677 			mtod(m, struct linger *)->l_linger = so->so_linger;
1678 			break;
1679 
1680 		case SO_BINDANY:
1681 		case SO_USELOOPBACK:
1682 		case SO_DEBUG:
1683 		case SO_KEEPALIVE:
1684 		case SO_REUSEADDR:
1685 		case SO_REUSEPORT:
1686 		case SO_BROADCAST:
1687 		case SO_OOBINLINE:
1688 		case SO_TIMESTAMP:
1689 			*mtod(m, int *) = so->so_options & optname;
1690 			break;
1691 
1692 		case SO_DONTROUTE:
1693 			*mtod(m, int *) = 0;
1694 			break;
1695 
1696 		case SO_TYPE:
1697 			*mtod(m, int *) = so->so_type;
1698 			break;
1699 
1700 		case SO_ERROR:
1701 			*mtod(m, int *) = so->so_error;
1702 			so->so_error = 0;
1703 			break;
1704 
1705 		case SO_SNDBUF:
1706 			*mtod(m, int *) = so->so_snd.sb_hiwat;
1707 			break;
1708 
1709 		case SO_RCVBUF:
1710 			*mtod(m, int *) = so->so_rcv.sb_hiwat;
1711 			break;
1712 
1713 		case SO_SNDLOWAT:
1714 			*mtod(m, int *) = so->so_snd.sb_lowat;
1715 			break;
1716 
1717 		case SO_RCVLOWAT:
1718 			*mtod(m, int *) = so->so_rcv.sb_lowat;
1719 			break;
1720 
1721 		case SO_SNDTIMEO:
1722 		case SO_RCVTIMEO:
1723 		    {
1724 			struct timeval tv;
1725 			int val = (optname == SO_SNDTIMEO ?
1726 			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1727 
1728 			m->m_len = sizeof(struct timeval);
1729 			memset(&tv, 0, sizeof(tv));
1730 			tv.tv_sec = val / hz;
1731 			tv.tv_usec = (val % hz) * tick;
1732 			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
1733 			break;
1734 		    }
1735 
1736 		case SO_RTABLE:
1737 			(void)m_free(m);
1738 			if (so->so_proto && so->so_proto->pr_domain &&
1739 			    so->so_proto->pr_domain->dom_protosw &&
1740 			    so->so_proto->pr_ctloutput) {
1741 				struct domain *dom = so->so_proto->pr_domain;
1742 
1743 				level = dom->dom_protosw->pr_protocol;
1744 				return ((*so->so_proto->pr_ctloutput)
1745 				    (PRCO_GETOPT, so, level, optname, mp));
1746 			}
1747 			return (ENOPROTOOPT);
1748 			break;
1749 
1750 #ifdef SOCKET_SPLICE
1751 		case SO_SPLICE:
1752 		    {
1753 			off_t len;
1754 			int s = splsoftnet();
1755 
1756 			m->m_len = sizeof(off_t);
1757 			len = so->so_sp ? so->so_sp->ssp_len : 0;
1758 			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
1759 			splx(s);
1760 			break;
1761 		    }
1762 #endif /* SOCKET_SPLICE */
1763 
1764 		case SO_PEERCRED:
1765 			if (so->so_proto->pr_protocol == AF_UNIX) {
1766 				struct unpcb *unp = sotounpcb(so);
1767 
1768 				if (unp->unp_flags & UNP_FEIDS) {
1769 					m->m_len = sizeof(unp->unp_connid);
1770 					bcopy(&(unp->unp_connid),
1771 					    mtod(m, caddr_t), m->m_len);
1772 					break;
1773 				}
1774 				(void)m_free(m);
1775 				return (ENOTCONN);
1776 			}
1777 			(void)m_free(m);
1778 			return (EOPNOTSUPP);
1779 			break;
1780 
1781 		default:
1782 			(void)m_free(m);
1783 			return (ENOPROTOOPT);
1784 		}
1785 		*mp = m;
1786 		return (0);
1787 	}
1788 }
1789 
/*
 * Notify the socket's owner of out-of-band data: deliver SIGURG to
 * the registered process (group) and wake up select/poll waiters on
 * the receive side.
 */
void
sohasoutofband(struct socket *so)
{
	csignal(so->so_pgid, SIGURG, so->so_siguid, so->so_sigeuid);
	selwakeup(&so->so_rcv.sb_sel);
}
1796 
1797 int
1798 soo_kqfilter(struct file *fp, struct knote *kn)
1799 {
1800 	struct socket *so = kn->kn_fp->f_data;
1801 	struct sockbuf *sb;
1802 	int s;
1803 
1804 	switch (kn->kn_filter) {
1805 	case EVFILT_READ:
1806 		if (so->so_options & SO_ACCEPTCONN)
1807 			kn->kn_fop = &solisten_filtops;
1808 		else
1809 			kn->kn_fop = &soread_filtops;
1810 		sb = &so->so_rcv;
1811 		break;
1812 	case EVFILT_WRITE:
1813 		kn->kn_fop = &sowrite_filtops;
1814 		sb = &so->so_snd;
1815 		break;
1816 	default:
1817 		return (EINVAL);
1818 	}
1819 
1820 	s = splnet();
1821 	SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
1822 	sb->sb_flags |= SB_KNOTE;
1823 	splx(s);
1824 	return (0);
1825 }
1826 
1827 void
1828 filt_sordetach(struct knote *kn)
1829 {
1830 	struct socket *so = kn->kn_fp->f_data;
1831 	int s = splnet();
1832 
1833 	SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
1834 	if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
1835 		so->so_rcv.sb_flags &= ~SB_KNOTE;
1836 	splx(s);
1837 }
1838 
1839 /*ARGSUSED*/
1840 int
1841 filt_soread(struct knote *kn, long hint)
1842 {
1843 	struct socket *so = kn->kn_fp->f_data;
1844 
1845 	kn->kn_data = so->so_rcv.sb_cc;
1846 #ifdef SOCKET_SPLICE
1847 	if (isspliced(so))
1848 		return (0);
1849 #endif /* SOCKET_SPLICE */
1850 	if (so->so_state & SS_CANTRCVMORE) {
1851 		kn->kn_flags |= EV_EOF;
1852 		kn->kn_fflags = so->so_error;
1853 		return (1);
1854 	}
1855 	if (so->so_error)	/* temporary udp error */
1856 		return (1);
1857 	if (kn->kn_sfflags & NOTE_LOWAT)
1858 		return (kn->kn_data >= kn->kn_sdata);
1859 	return (kn->kn_data >= so->so_rcv.sb_lowat);
1860 }
1861 
1862 void
1863 filt_sowdetach(struct knote *kn)
1864 {
1865 	struct socket *so = kn->kn_fp->f_data;
1866 	int s = splnet();
1867 
1868 	SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
1869 	if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
1870 		so->so_snd.sb_flags &= ~SB_KNOTE;
1871 	splx(s);
1872 }
1873 
1874 /*ARGSUSED*/
1875 int
1876 filt_sowrite(struct knote *kn, long hint)
1877 {
1878 	struct socket *so = kn->kn_fp->f_data;
1879 
1880 	kn->kn_data = sbspace(&so->so_snd);
1881 	if (so->so_state & SS_CANTSENDMORE) {
1882 		kn->kn_flags |= EV_EOF;
1883 		kn->kn_fflags = so->so_error;
1884 		return (1);
1885 	}
1886 	if (so->so_error)	/* temporary udp error */
1887 		return (1);
1888 	if (((so->so_state & SS_ISCONNECTED) == 0) &&
1889 	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
1890 		return (0);
1891 	if (kn->kn_sfflags & NOTE_LOWAT)
1892 		return (kn->kn_data >= kn->kn_sdata);
1893 	return (kn->kn_data >= so->so_snd.sb_lowat);
1894 }
1895 
1896 /*ARGSUSED*/
1897 int
1898 filt_solisten(struct knote *kn, long hint)
1899 {
1900 	struct socket *so = kn->kn_fp->f_data;
1901 
1902 	kn->kn_data = so->so_qlen;
1903 	return (so->so_qlen != 0);
1904 }
1905