xref: /openbsd-src/sys/kern/uipc_socket.c (revision 33b792a3c1c87b47219fdf9a73548c4003214de3)
1 /*	$OpenBSD: uipc_socket.c,v 1.41 2002/02/05 22:04:43 nordin Exp $	*/
2 /*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1990, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
37  */
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/proc.h>
42 #include <sys/file.h>
43 #include <sys/malloc.h>
44 #include <sys/mbuf.h>
45 #include <sys/domain.h>
46 #include <sys/kernel.h>
47 #include <sys/event.h>
48 #include <sys/protosw.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/signalvar.h>
52 #include <sys/resourcevar.h>
53 #include <sys/pool.h>
54 
/* kqueue filter routines for sockets, defined at the bottom of this file. */
void 	filt_sordetach(struct knote *kn);
int 	filt_soread(struct knote *kn, long hint);
void 	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_solisten(struct knote *kn, long hint);

/* kqueue filterops for listening, reading and writing sockets. */
struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };


/* Default floor applied to listen(2) backlogs; may be set at build time. */
#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

/* Tunable upper and lower bounds applied to listen(2) backlog requests. */
int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

/* Pool from which all struct socket allocations are made (see soinit()). */
struct pool socket_pool;
77 
78 void
79 soinit(void)
80 {
81 
82 	pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL);
83 }
84 
85 /*
86  * Socket operation routines.
87  * These routines are called by the routines in
88  * sys_socket.c or from a system process, and
89  * implement the semantics of socket operations by
90  * switching out to the protocol specific routines.
91  */
/*ARGSUSED*/
int
socreate(dom, aso, type, proto)
	int dom;
	struct socket **aso;
	register int type;
	int proto;
{
	struct proc *p = curproc;		/* XXX */
	struct protosw *prp;
	struct socket *so;
	int error, s;

	/*
	 * Find the protocol switch entry: by explicit protocol number
	 * if one was requested, otherwise by socket type alone.
	 */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreq == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	s = splsoftnet();
	so = pool_get(&socket_pool, PR_WAITOK);
	bzero((caddr_t)so, sizeof(*so));
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	/* Record the creator's privilege and uids for later checks. */
	if (p->p_ucred->cr_uid == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_cred->p_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_proto = prp;
	/* Let the protocol attach its per-socket state. */
	error = (*prp->pr_usrreq)(so, PRU_ATTACH, NULL,
	    (struct mbuf *)(long)proto, NULL);
	if (error) {
		/*
		 * Set SS_NOFDREF so sofree() will actually dispose of
		 * the half-constructed socket.
		 */
		so->so_state |= SS_NOFDREF;
		sofree(so);
		splx(s);
		return (error);
	}
#ifdef COMPAT_SUNOS
	{
		extern struct emul emul_sunos;
		if (p->p_emul == &emul_sunos && type == SOCK_DGRAM)
			so->so_options |= SO_BROADCAST;
	}
#endif
	splx(s);
	*aso = so;
	return (0);
}
143 
144 int
145 sobind(so, nam)
146 	struct socket *so;
147 	struct mbuf *nam;
148 {
149 	int s = splsoftnet();
150 	int error;
151 
152 	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL);
153 	splx(s);
154 	return (error);
155 }
156 
157 int
158 solisten(so, backlog)
159 	register struct socket *so;
160 	int backlog;
161 {
162 	int s = splsoftnet(), error;
163 
164 	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL);
165 	if (error) {
166 		splx(s);
167 		return (error);
168 	}
169 	if (TAILQ_FIRST(&so->so_q) == NULL)
170 		so->so_options |= SO_ACCEPTCONN;
171 	if (backlog < 0 || backlog > somaxconn)
172 		backlog = somaxconn;
173 	if (backlog < sominconn)
174 		backlog = sominconn;
175 	so->so_qlimit = backlog;
176 	splx(s);
177 	return (0);
178 }
179 
180 /*
181  *  Must be called at splsoftnet()
182  */
183 
184 void
185 sofree(so)
186 	register struct socket *so;
187 {
188 
189 	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
190 		return;
191 	if (so->so_head) {
192 		/*
193 		 * We must not decommission a socket that's on the accept(2)
194 		 * queue.  If we do, then accept(2) may hang after select(2)
195 		 * indicated that the listening socket was ready.
196 		 */
197 		if (!soqremque(so, 0))
198 			return;
199 	}
200 	sbrelease(&so->so_snd);
201 	sorflush(so);
202 	pool_put(&socket_pool, so);
203 }
204 
205 /*
206  * Close a socket on last file table reference removal.
207  * Initiate disconnect if connected.
208  * Free socket when disconnect complete.
209  */
210 int
211 soclose(so)
212 	register struct socket *so;
213 {
214 	struct socket *so2;
215 	int s = splsoftnet();		/* conservative */
216 	int error = 0;
217 
218 	if (so->so_options & SO_ACCEPTCONN) {
219 		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
220 			(void) soqremque(so2, 0);
221 			(void) soabort(so2);
222 		}
223 		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
224 			(void) soqremque(so2, 1);
225 			(void) soabort(so2);
226 		}
227 	}
228 	if (so->so_pcb == 0)
229 		goto discard;
230 	if (so->so_state & SS_ISCONNECTED) {
231 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
232 			error = sodisconnect(so);
233 			if (error)
234 				goto drop;
235 		}
236 		if (so->so_options & SO_LINGER) {
237 			if ((so->so_state & SS_ISDISCONNECTING) &&
238 			    (so->so_state & SS_NBIO))
239 				goto drop;
240 			while (so->so_state & SS_ISCONNECTED) {
241 				error = tsleep((caddr_t)&so->so_timeo,
242 				    PSOCK | PCATCH, netcls,
243 				    so->so_linger * hz);
244 				if (error)
245 					break;
246 			}
247 		}
248 	}
249 drop:
250 	if (so->so_pcb) {
251 		int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, NULL,
252 							NULL, NULL);
253 		if (error == 0)
254 			error = error2;
255 	}
256 discard:
257 	if (so->so_state & SS_NOFDREF)
258 		panic("soclose: NOFDREF");
259 	so->so_state |= SS_NOFDREF;
260 	sofree(so);
261 	splx(s);
262 	return (error);
263 }
264 
265 /*
266  * Must be called at splsoftnet...
267  */
268 int
269 soabort(so)
270 	struct socket *so;
271 {
272 
273 	return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL, NULL, NULL);
274 }
275 
276 int
277 soaccept(so, nam)
278 	register struct socket *so;
279 	struct mbuf *nam;
280 {
281 	int s = splsoftnet();
282 	int error = 0;
283 
284 	if ((so->so_state & SS_NOFDREF) == 0)
285 		panic("soaccept: !NOFDREF");
286 	so->so_state &= ~SS_NOFDREF;
287 	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
288 	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
289 		error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, NULL,
290 		    nam, NULL);
291 	else
292 		error = ECONNABORTED;
293 	splx(s);
294 	return (error);
295 }
296 
297 int
298 soconnect(so, nam)
299 	register struct socket *so;
300 	struct mbuf *nam;
301 {
302 	int s;
303 	int error;
304 
305 	if (so->so_options & SO_ACCEPTCONN)
306 		return (EOPNOTSUPP);
307 	s = splsoftnet();
308 	/*
309 	 * If protocol is connection-based, can only connect once.
310 	 * Otherwise, if connected, try to disconnect first.
311 	 * This allows user to disconnect by connecting to, e.g.,
312 	 * a null address.
313 	 */
314 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
315 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
316 	    (error = sodisconnect(so))))
317 		error = EISCONN;
318 	else
319 		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
320 						   NULL, nam, NULL);
321 	splx(s);
322 	return (error);
323 }
324 
325 int
326 soconnect2(so1, so2)
327 	register struct socket *so1;
328 	struct socket *so2;
329 {
330 	int s = splsoftnet();
331 	int error;
332 
333 	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL,
334 					    (struct mbuf *)so2, NULL);
335 	splx(s);
336 	return (error);
337 }
338 
339 int
340 sodisconnect(so)
341 	register struct socket *so;
342 {
343 	int s = splsoftnet();
344 	int error;
345 
346 	if ((so->so_state & SS_ISCONNECTED) == 0) {
347 		error = ENOTCONN;
348 		goto bad;
349 	}
350 	if (so->so_state & SS_ISDISCONNECTING) {
351 		error = EALREADY;
352 		goto bad;
353 	}
354 	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, NULL, NULL,
355 					   NULL);
356 bad:
357 	splx(s);
358 	return (error);
359 }
360 
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(so, addr, uio, top, control, flags)
	register struct socket *so;
	struct mbuf *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
{
	struct proc *p = curproc;		/* XXX */
	struct mbuf **mp;
	struct mbuf *m;
	long space, len, mlen, clen = 0;
	quad_t resid;
	int error, s, dontroute;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned (since uio->uio_resid is).
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 * MSG_EOR on a SOCK_STREAM socket is also invalid.
	 */
	if (resid < 0 ||
	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	/*
	 * MSG_DONTROUTE is implemented by toggling SO_DONTROUTE around
	 * each protocol send; only sensible for atomic protocols.
	 */
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	/* Lock the send buffer against other senders. */
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	do {
		s = splsoftnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		/* Not enough room now: wait for space or fail if NBIO. */
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
				/* Build an mbuf chain from the uio. */
				if (top == 0) {
					MGETHDR(m, M_WAIT, MT_DATA);
					mlen = MHLEN;
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else {
					MGET(m, M_WAIT, MT_DATA);
					mlen = MLEN;
				}
				if (resid >= MINCLSIZE && space >= MCLBYTES) {
					MCLGET(m, M_WAIT);
					if ((m->m_flags & M_EXT) == 0)
						goto nopages;
					mlen = MCLBYTES;
					if (atomic && top == 0) {
						len = lmin(MCLBYTES - max_hdr, resid);
						m->m_data += max_hdr;
					} else
						len = lmin(MCLBYTES, resid);
					space -= len;
				} else {
nopages:
					len = lmin(lmin(mlen, resid), space);
					space -= len;
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == 0 && len < mlen)
						MH_ALIGN(m, len);
				}
				error = uiomove(mtod(m, caddr_t), (int)len,
				    uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);
			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			s = splsoftnet();		/* XXX */
			/* Hand the assembled chain to the protocol. */
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, addr, control);
			splx(s);
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			/* The protocol consumed top and control. */
			clen = 0;
			control = 0;
			top = 0;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
544 
545 /*
546  * Implement receive operations on a socket.
547  * We depend on the way that records are added to the sockbuf
548  * by sbappend*.  In particular, each record (mbufs linked through m_next)
549  * must begin with an address if the protocol so specifies,
550  * followed by an optional mbuf or mbufs containing ancillary data,
551  * and then zero or more mbufs of data.
552  * In order to avoid blocking network interrupts for the entire time here,
553  * we splx() while doing the actual copy to user space.
554  * Although the sockbuf is locked, new data may still be appended,
555  * and thus we must maintain consistency of the sockbuf during that time.
556  *
557  * The caller may receive the data as a single mbuf chain by supplying
558  * an mbuf **mp0 for use in returning the chain.  The uio is then used
559  * only for the count in uio_resid.
560  */
561 int
562 soreceive(so, paddr, uio, mp0, controlp, flagsp)
563 	register struct socket *so;
564 	struct mbuf **paddr;
565 	struct uio *uio;
566 	struct mbuf **mp0;
567 	struct mbuf **controlp;
568 	int *flagsp;
569 {
570 	register struct mbuf *m, **mp;
571 	register int flags, len, error, s, offset;
572 	struct protosw *pr = so->so_proto;
573 	struct mbuf *nextrecord;
574 	int moff, type = 0;
575 	size_t orig_resid = uio->uio_resid;
576 	int uio_error = 0;
577 	int resid;
578 
579 	mp = mp0;
580 	if (paddr)
581 		*paddr = 0;
582 	if (controlp)
583 		*controlp = 0;
584 	if (flagsp)
585 		flags = *flagsp &~ MSG_EOR;
586 	else
587 		flags = 0;
588 	if (so->so_state & SS_NBIO)
589 		flags |= MSG_DONTWAIT;
590 	if (flags & MSG_OOB) {
591 		m = m_get(M_WAIT, MT_DATA);
592 		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
593 		    (struct mbuf *)(long)(flags & MSG_PEEK), NULL);
594 		if (error)
595 			goto bad;
596 		do {
597 			error = uiomove(mtod(m, caddr_t),
598 			    (int) min(uio->uio_resid, m->m_len), uio);
599 			m = m_free(m);
600 		} while (uio->uio_resid && error == 0 && m);
601 bad:
602 		if (m)
603 			m_freem(m);
604 		return (error);
605 	}
606 	if (mp)
607 		*mp = (struct mbuf *)0;
608 	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
609 		(*pr->pr_usrreq)(so, PRU_RCVD, NULL, NULL, NULL);
610 
611 restart:
612 	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
613 		return (error);
614 	s = splsoftnet();
615 
616 	m = so->so_rcv.sb_mb;
617 	/*
618 	 * If we have less data than requested, block awaiting more
619 	 * (subject to any timeout) if:
620 	 *   1. the current count is less than the low water mark,
621 	 *   2. MSG_WAITALL is set, and it is possible to do the entire
622 	 *	receive operation at once if we block (resid <= hiwat), or
623 	 *   3. MSG_DONTWAIT is not set.
624 	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
625 	 * we have to do the receive in sections, and thus risk returning
626 	 * a short count if a timeout or signal occurs after we start.
627 	 */
628 	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
629 	    so->so_rcv.sb_cc < uio->uio_resid) &&
630 	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
631 	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
632 	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
633 #ifdef DIAGNOSTIC
634 		if (m == 0 && so->so_rcv.sb_cc)
635 			panic("receive 1");
636 #endif
637 		if (so->so_error) {
638 			if (m)
639 				goto dontblock;
640 			error = so->so_error;
641 			if ((flags & MSG_PEEK) == 0)
642 				so->so_error = 0;
643 			goto release;
644 		}
645 		if (so->so_state & SS_CANTRCVMORE) {
646 			if (m)
647 				goto dontblock;
648 			else
649 				goto release;
650 		}
651 		for (; m; m = m->m_next)
652 			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
653 				m = so->so_rcv.sb_mb;
654 				goto dontblock;
655 			}
656 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
657 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
658 			error = ENOTCONN;
659 			goto release;
660 		}
661 		if (uio->uio_resid == 0 && controlp == NULL)
662 			goto release;
663 		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
664 			error = EWOULDBLOCK;
665 			goto release;
666 		}
667 		sbunlock(&so->so_rcv);
668 		error = sbwait(&so->so_rcv);
669 		splx(s);
670 		if (error)
671 			return (error);
672 		goto restart;
673 	}
674 dontblock:
675 #ifdef notyet /* XXXX */
676 	if (uio->uio_procp)
677 		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
678 #endif
679 	nextrecord = m->m_nextpkt;
680 	if (pr->pr_flags & PR_ADDR) {
681 #ifdef DIAGNOSTIC
682 		if (m->m_type != MT_SONAME)
683 			panic("receive 1a");
684 #endif
685 		orig_resid = 0;
686 		if (flags & MSG_PEEK) {
687 			if (paddr)
688 				*paddr = m_copy(m, 0, m->m_len);
689 			m = m->m_next;
690 		} else {
691 			sbfree(&so->so_rcv, m);
692 			if (paddr) {
693 				*paddr = m;
694 				so->so_rcv.sb_mb = m->m_next;
695 				m->m_next = 0;
696 				m = so->so_rcv.sb_mb;
697 			} else {
698 				MFREE(m, so->so_rcv.sb_mb);
699 				m = so->so_rcv.sb_mb;
700 			}
701 		}
702 	}
703 	while (m && m->m_type == MT_CONTROL && error == 0) {
704 		if (flags & MSG_PEEK) {
705 			if (controlp)
706 				*controlp = m_copy(m, 0, m->m_len);
707 			m = m->m_next;
708 		} else {
709 			sbfree(&so->so_rcv, m);
710 			if (controlp) {
711 				if (pr->pr_domain->dom_externalize &&
712 				    mtod(m, struct cmsghdr *)->cmsg_type ==
713 				    SCM_RIGHTS)
714 				   error = (*pr->pr_domain->dom_externalize)(m);
715 				*controlp = m;
716 				so->so_rcv.sb_mb = m->m_next;
717 				m->m_next = 0;
718 				m = so->so_rcv.sb_mb;
719 			} else {
720 				MFREE(m, so->so_rcv.sb_mb);
721 				m = so->so_rcv.sb_mb;
722 			}
723 		}
724 		if (controlp) {
725 			orig_resid = 0;
726 			controlp = &(*controlp)->m_next;
727 		}
728 	}
729 	if (m) {
730 		if ((flags & MSG_PEEK) == 0)
731 			m->m_nextpkt = nextrecord;
732 		type = m->m_type;
733 		if (type == MT_OOBDATA)
734 			flags |= MSG_OOB;
735 		if (m->m_flags & M_BCAST)
736 			flags |= MSG_BCAST;
737 		if (m->m_flags & M_MCAST)
738 			flags |= MSG_MCAST;
739 	}
740 	moff = 0;
741 	offset = 0;
742 	while (m && uio->uio_resid > 0 && error == 0) {
743 		if (m->m_type == MT_OOBDATA) {
744 			if (type != MT_OOBDATA)
745 				break;
746 		} else if (type == MT_OOBDATA)
747 			break;
748 #ifdef DIAGNOSTIC
749 		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
750 			panic("receive 3");
751 #endif
752 		so->so_state &= ~SS_RCVATMARK;
753 		len = uio->uio_resid;
754 		if (so->so_oobmark && len > so->so_oobmark - offset)
755 			len = so->so_oobmark - offset;
756 		if (len > m->m_len - moff)
757 			len = m->m_len - moff;
758 		/*
759 		 * If mp is set, just pass back the mbufs.
760 		 * Otherwise copy them out via the uio, then free.
761 		 * Sockbuf must be consistent here (points to current mbuf,
762 		 * it points to next record) when we drop priority;
763 		 * we must note any additions to the sockbuf when we
764 		 * block interrupts again.
765 		 */
766 		if (mp == 0 && uio_error == 0) {
767 			resid = uio->uio_resid;
768 			splx(s);
769 			uio_error =
770 				uiomove(mtod(m, caddr_t) + moff, (int)len,
771 					uio);
772 			s = splsoftnet();
773 			if (uio_error)
774 				uio->uio_resid = resid - len;
775 		} else
776 			uio->uio_resid -= len;
777 		if (len == m->m_len - moff) {
778 			if (m->m_flags & M_EOR)
779 				flags |= MSG_EOR;
780 			if (flags & MSG_PEEK) {
781 				m = m->m_next;
782 				moff = 0;
783 			} else {
784 				nextrecord = m->m_nextpkt;
785 				sbfree(&so->so_rcv, m);
786 				if (mp) {
787 					*mp = m;
788 					mp = &m->m_next;
789 					so->so_rcv.sb_mb = m = m->m_next;
790 					*mp = (struct mbuf *)0;
791 				} else {
792 					MFREE(m, so->so_rcv.sb_mb);
793 					m = so->so_rcv.sb_mb;
794 				}
795 				if (m)
796 					m->m_nextpkt = nextrecord;
797 			}
798 		} else {
799 			if (flags & MSG_PEEK)
800 				moff += len;
801 			else {
802 				if (mp)
803 					*mp = m_copym(m, 0, len, M_WAIT);
804 				m->m_data += len;
805 				m->m_len -= len;
806 				so->so_rcv.sb_cc -= len;
807 			}
808 		}
809 		if (so->so_oobmark) {
810 			if ((flags & MSG_PEEK) == 0) {
811 				so->so_oobmark -= len;
812 				if (so->so_oobmark == 0) {
813 					so->so_state |= SS_RCVATMARK;
814 					break;
815 				}
816 			} else {
817 				offset += len;
818 				if (offset == so->so_oobmark)
819 					break;
820 			}
821 		}
822 		if (flags & MSG_EOR)
823 			break;
824 		/*
825 		 * If the MSG_WAITALL flag is set (for non-atomic socket),
826 		 * we must not quit until "uio->uio_resid == 0" or an error
827 		 * termination.  If a signal/timeout occurs, return
828 		 * with a short count but without error.
829 		 * Keep sockbuf locked against other readers.
830 		 */
831 		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
832 		    !sosendallatonce(so) && !nextrecord) {
833 			if (so->so_error || so->so_state & SS_CANTRCVMORE)
834 				break;
835 			error = sbwait(&so->so_rcv);
836 			if (error) {
837 				sbunlock(&so->so_rcv);
838 				splx(s);
839 				return (0);
840 			}
841 			if ((m = so->so_rcv.sb_mb) != NULL)
842 				nextrecord = m->m_nextpkt;
843 		}
844 	}
845 
846 	if (m && pr->pr_flags & PR_ATOMIC) {
847 		flags |= MSG_TRUNC;
848 		if ((flags & MSG_PEEK) == 0)
849 			(void) sbdroprecord(&so->so_rcv);
850 	}
851 	if ((flags & MSG_PEEK) == 0) {
852 		if (m == 0)
853 			so->so_rcv.sb_mb = nextrecord;
854 		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
855 			(*pr->pr_usrreq)(so, PRU_RCVD, NULL,
856 					 (struct mbuf *)(long)flags, NULL);
857 	}
858 	if (orig_resid == uio->uio_resid && orig_resid &&
859 	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
860 		sbunlock(&so->so_rcv);
861 		splx(s);
862 		goto restart;
863 	}
864 
865 	if (uio_error)
866 		error = uio_error;
867 
868 	if (flagsp)
869 		*flagsp |= flags;
870 release:
871 	sbunlock(&so->so_rcv);
872 	splx(s);
873 	return (error);
874 }
875 
876 int
877 soshutdown(so, how)
878 	register struct socket *so;
879 	register int how;
880 {
881 	register struct protosw *pr = so->so_proto;
882 
883 	how++;
884 	if (how & ~(FREAD|FWRITE))
885 		return (EINVAL);
886 	if (how & FREAD)
887 		sorflush(so);
888 	if (how & FWRITE)
889 		return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, NULL, NULL);
890 	return (0);
891 }
892 
/*
 * Discard everything queued on a socket's receive buffer and mark the
 * socket unable to receive more.
 */
void
sorflush(so)
	register struct socket *so;
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	register int s;
	struct sockbuf asb;

	/*
	 * Lock the buffer uninterruptibly, then steal its contents into
	 * a local copy so they can be released after dropping back out
	 * of splimp.
	 */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	asb = *sb;
	bzero((caddr_t)sb, sizeof (*sb));
	/* XXX - the bzero stumps all over so_rcv */
	if (asb.sb_flags & SB_KNOTE) {
		/* Preserve registered knotes across the bzero above. */
		sb->sb_sel.si_note = asb.sb_sel.si_note;
		sb->sb_flags = SB_KNOTE;
	}
	splx(s);
	/* Let the protocol dispose of any rights (passed fds) first. */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}
919 
/*
 * Set a socket option.  Consumes m0 (freed here or by the protocol).
 */
int
sosetopt(so, level, optname, m0)
	register struct socket *so;
	int level, optname;
	struct mbuf *m0;
{
	int error = 0;
	register struct mbuf *m = m0;

	/* Options above the socket level are handed to the protocol. */
	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* fall thru... */

		/* Boolean options stored directly in so_options. */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			/* Clamp non-positive requests to 1. */
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;
			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    cnt) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/* Low-water marks are capped at the high-water mark. */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat = (cnt > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : cnt;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat = (cnt > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : cnt;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			short val;

			if (m == NULL || m->m_len < sizeof (*tv)) {
				error = EINVAL;
				goto bad;
			}
			/* Convert to ticks; sb_timeo can only hold a short. */
			tv = mtod(m, struct timeval *);
			if (tv->tv_sec * hz + tv->tv_usec / tick > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Give the protocol a look at socket-level options too. */
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
			m = NULL;	/* freed by protocol */
		}
	}
bad:
	if (m)
		(void) m_free(m);
	return (error);
}
1047 
1048 int
1049 sogetopt(so, level, optname, mp)
1050 	register struct socket *so;
1051 	int level, optname;
1052 	struct mbuf **mp;
1053 {
1054 	register struct mbuf *m;
1055 
1056 	if (level != SOL_SOCKET) {
1057 		if (so->so_proto && so->so_proto->pr_ctloutput) {
1058 			return ((*so->so_proto->pr_ctloutput)
1059 				  (PRCO_GETOPT, so, level, optname, mp));
1060 		} else
1061 			return (ENOPROTOOPT);
1062 	} else {
1063 		m = m_get(M_WAIT, MT_SOOPTS);
1064 		m->m_len = sizeof (int);
1065 
1066 		switch (optname) {
1067 
1068 		case SO_LINGER:
1069 			m->m_len = sizeof (struct linger);
1070 			mtod(m, struct linger *)->l_onoff =
1071 				so->so_options & SO_LINGER;
1072 			mtod(m, struct linger *)->l_linger = so->so_linger;
1073 			break;
1074 
1075 		case SO_USELOOPBACK:
1076 		case SO_DONTROUTE:
1077 		case SO_DEBUG:
1078 		case SO_KEEPALIVE:
1079 		case SO_REUSEADDR:
1080 		case SO_REUSEPORT:
1081 		case SO_BROADCAST:
1082 		case SO_OOBINLINE:
1083 			*mtod(m, int *) = so->so_options & optname;
1084 			break;
1085 
1086 		case SO_TYPE:
1087 			*mtod(m, int *) = so->so_type;
1088 			break;
1089 
1090 		case SO_ERROR:
1091 			*mtod(m, int *) = so->so_error;
1092 			so->so_error = 0;
1093 			break;
1094 
1095 		case SO_SNDBUF:
1096 			*mtod(m, int *) = so->so_snd.sb_hiwat;
1097 			break;
1098 
1099 		case SO_RCVBUF:
1100 			*mtod(m, int *) = so->so_rcv.sb_hiwat;
1101 			break;
1102 
1103 		case SO_SNDLOWAT:
1104 			*mtod(m, int *) = so->so_snd.sb_lowat;
1105 			break;
1106 
1107 		case SO_RCVLOWAT:
1108 			*mtod(m, int *) = so->so_rcv.sb_lowat;
1109 			break;
1110 
1111 		case SO_SNDTIMEO:
1112 		case SO_RCVTIMEO:
1113 		    {
1114 			int val = (optname == SO_SNDTIMEO ?
1115 			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1116 
1117 			m->m_len = sizeof(struct timeval);
1118 			mtod(m, struct timeval *)->tv_sec = val / hz;
1119 			mtod(m, struct timeval *)->tv_usec =
1120 			    (val % hz) * tick;
1121 			break;
1122 		    }
1123 
1124 		default:
1125 			(void)m_free(m);
1126 			return (ENOPROTOOPT);
1127 		}
1128 		*mp = m;
1129 		return (0);
1130 	}
1131 }
1132 
/*
 * Called when out-of-band data arrives: send SIGURG to the owning
 * process or process group and wake up any select(2)-style waiters
 * on the receive buffer.
 */
void
sohasoutofband(so)
	register struct socket *so;
{
	csignal(so->so_pgid, SIGURG, so->so_siguid, so->so_sigeuid);
	selwakeup(&so->so_rcv.sb_sel);
}
1140 
1141 int
1142 soo_kqfilter(struct file *fp, struct knote *kn)
1143 {
1144 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
1145 	struct sockbuf *sb;
1146 	int s;
1147 
1148 	switch (kn->kn_filter) {
1149 	case EVFILT_READ:
1150 		if (so->so_options & SO_ACCEPTCONN)
1151 			kn->kn_fop = &solisten_filtops;
1152 		else
1153 			kn->kn_fop = &soread_filtops;
1154 		sb = &so->so_rcv;
1155 		break;
1156 	case EVFILT_WRITE:
1157 		kn->kn_fop = &sowrite_filtops;
1158 		sb = &so->so_snd;
1159 		break;
1160 	default:
1161 		return (1);
1162 	}
1163 
1164 	s = splnet();
1165 	SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
1166 	sb->sb_flags |= SB_KNOTE;
1167 	splx(s);
1168 	return (0);
1169 }
1170 
1171 void
1172 filt_sordetach(struct knote *kn)
1173 {
1174 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
1175 	int s = splnet();
1176 
1177 	SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
1178 	if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
1179 		so->so_rcv.sb_flags &= ~SB_KNOTE;
1180 	splx(s);
1181 }
1182 
1183 /*ARGSUSED*/
1184 int
1185 filt_soread(struct knote *kn, long hint)
1186 {
1187 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
1188 
1189 	kn->kn_data = so->so_rcv.sb_cc;
1190 	if (so->so_state & SS_CANTRCVMORE) {
1191 		kn->kn_flags |= EV_EOF;
1192 		kn->kn_fflags = so->so_error;
1193 		return (1);
1194 	}
1195 	if (so->so_error)	/* temporary udp error */
1196 		return (1);
1197 	if (kn->kn_sfflags & NOTE_LOWAT)
1198 		return (kn->kn_data >= kn->kn_sdata);
1199 	return (kn->kn_data >= so->so_rcv.sb_lowat);
1200 }
1201 
1202 void
1203 filt_sowdetach(struct knote *kn)
1204 {
1205 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
1206 	int s = splnet();
1207 
1208 	SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
1209 	if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
1210 		so->so_snd.sb_flags &= ~SB_KNOTE;
1211 	splx(s);
1212 }
1213 
1214 /*ARGSUSED*/
1215 int
1216 filt_sowrite(struct knote *kn, long hint)
1217 {
1218 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
1219 
1220 	kn->kn_data = sbspace(&so->so_snd);
1221 	if (so->so_state & SS_CANTSENDMORE) {
1222 		kn->kn_flags |= EV_EOF;
1223 		kn->kn_fflags = so->so_error;
1224 		return (1);
1225 	}
1226 	if (so->so_error)	/* temporary udp error */
1227 		return (1);
1228 	if (((so->so_state & SS_ISCONNECTED) == 0) &&
1229 	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
1230 		return (0);
1231 	if (kn->kn_sfflags & NOTE_LOWAT)
1232 		return (kn->kn_data >= kn->kn_sdata);
1233 	return (kn->kn_data >= so->so_snd.sb_lowat);
1234 }
1235 
1236 /*ARGSUSED*/
1237 int
1238 filt_solisten(struct knote *kn, long hint)
1239 {
1240 	struct socket *so = (struct socket *)kn->kn_fp->f_data;
1241 
1242 	kn->kn_data = so->so_qlen;
1243 	return (so->so_qlen != 0);
1244 }
1245