xref: /netbsd-src/sys/kern/uipc_socket.c (revision 2a399c6883d870daece976daec6ffa7bb7f934ce)
1 /*	$NetBSD: uipc_socket.c,v 1.31 1998/01/07 23:47:08 thorpej Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1988, 1990, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
36  */
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/proc.h>
41 #include <sys/file.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/domain.h>
45 #include <sys/kernel.h>
46 #include <sys/protosw.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/signalvar.h>
50 #include <sys/resourcevar.h>
51 
52 /*
53  * Socket operation routines.
54  * These routines are called by the routines in
55  * sys_socket.c or from a system process, and
56  * implement the semantics of socket operations by
57  * switching out to the protocol specific routines.
58  */
/*
 * Create a new socket of the given domain, type and protocol and
 * return it via *aso.  The protocol's PRU_ATTACH request allocates
 * the per-protocol control block; if attach fails, the half-built
 * socket is torn down with sofree().
 */
/*ARGSUSED*/
int
socreate(dom, aso, type, proto)
	int dom;		/* address family (AF_*) */
	struct socket **aso;	/* out: the new socket on success */
	register int type;	/* SOCK_STREAM, SOCK_DGRAM, ... */
	int proto;		/* specific protocol, or 0 for the default */
{
	struct proc *p = curproc;		/* XXX */
	register struct protosw *prp;
	register struct socket *so;
	register int error;

	/*
	 * Find the protocol switch entry: by protocol number if one was
	 * given, otherwise by socket type alone.
	 */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreq == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT);
	bzero((caddr_t)so, sizeof(*so));
	TAILQ_INIT(&so->so_q0);		/* incomplete connections */
	TAILQ_INIT(&so->so_q);		/* connections ready to accept */
	so->so_type = type;
	so->so_proto = prp;
	/* Attach the protocol; proto rides in the "nam" argument slot. */
	error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0,
	    (struct mbuf *)(long)proto, (struct mbuf *)0, p);
	if (error) {
		/* No file descriptor ever referenced this socket, so
		 * setting SS_NOFDREF lets sofree() really free it. */
		so->so_state |= SS_NOFDREF;
		sofree(so);
		return (error);
	}
#ifdef COMPAT_SUNOS
	{
		/* SunOS emulation: datagram sockets may broadcast. */
		extern struct emul emul_sunos;
		if (p->p_emul == &emul_sunos && type == SOCK_DGRAM)
			so->so_options |= SO_BROADCAST;
	}
#endif
	*aso = so;
	return (0);
}
103 
104 int
105 sobind(so, nam)
106 	struct socket *so;
107 	struct mbuf *nam;
108 {
109 	struct proc *p = curproc;		/* XXX */
110 	int s = splsoftnet();
111 	int error;
112 
113 	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0,
114 	    nam, (struct mbuf *)0, p);
115 	splx(s);
116 	return (error);
117 }
118 
119 int
120 solisten(so, backlog)
121 	register struct socket *so;
122 	int backlog;
123 {
124 	int s = splsoftnet(), error;
125 
126 	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, (struct mbuf *)0,
127 	    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
128 	if (error) {
129 		splx(s);
130 		return (error);
131 	}
132 	if (so->so_q.tqh_first == NULL)
133 		so->so_options |= SO_ACCEPTCONN;
134 	if (backlog < 0)
135 		backlog = 0;
136 	so->so_qlimit = min(backlog, SOMAXCONN);
137 	splx(s);
138 	return (0);
139 }
140 
/*
 * Free a socket structure.  This is a no-op unless the socket has
 * both no protocol control block and no remaining file reference
 * (SS_NOFDREF set) -- callers set SS_NOFDREF before calling here
 * when dropping the last reference.  A socket still queued on a
 * listening parent is dequeued first; then both socket buffers are
 * released and the memory is freed.
 */
void
sofree(so)
	register struct socket *so;
{

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (so->so_head) {
		/*
		 * Must be on exactly one of the parent's queues:
		 * q0 (incomplete) or q (complete).
		 */
		if (!soqremque(so, 0) && !soqremque(so, 1))
			panic("sofree dq");
		so->so_head = 0;
	}
	sbrelease(&so->so_snd);
	sorflush(so);		/* flushes and releases so_rcv too */
	FREE(so, M_SOCKET);
}
157 
158 /*
159  * Close a socket on last file table reference removal.
160  * Initiate disconnect if connected.
161  * Free socket when disconnect complete.
162  */
int
soclose(so)
	register struct socket *so;
{
	int s = splsoftnet();		/* conservative */
	int error = 0;

	/* A listening socket first aborts everything still queued. */
	if (so->so_options & SO_ACCEPTCONN) {
		while (so->so_q0.tqh_first)
			(void) soabort(so->so_q0.tqh_first);
		while (so->so_q.tqh_first)
			(void) soabort(so->so_q.tqh_first);
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking socket: don't wait for the
			 * disconnect to complete. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/* Linger: sleep until disconnected, interrupted
			 * by a signal, or the linger timeout expires. */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
					       PSOCK | PCATCH, netcls,
					       so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		/* Detach the protocol; preserve the first error seen. */
		int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
		    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
		    (struct proc *)0);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;	/* last file reference is gone */
	sofree(so);
	splx(s);
	return (error);
}
213 
214 /*
215  * Must be called at splsoftnet...
216  */
217 int
218 soabort(so)
219 	struct socket *so;
220 {
221 
222 	return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, (struct mbuf *)0,
223 	    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
224 }
225 
226 int
227 soaccept(so, nam)
228 	register struct socket *so;
229 	struct mbuf *nam;
230 {
231 	int s = splsoftnet();
232 	int error;
233 
234 	if ((so->so_state & SS_NOFDREF) == 0)
235 		panic("soaccept: !NOFDREF");
236 	so->so_state &= ~SS_NOFDREF;
237 	error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, (struct mbuf *)0,
238 	    nam, (struct mbuf *)0, (struct proc *)0);
239 	splx(s);
240 	return (error);
241 }
242 
/*
 * Initiate a connection to the address held in "nam".  A listening
 * socket may not connect.  For connectionless protocols an existing
 * association is dissolved first, which is what lets a user
 * "disconnect" by connecting to a null address.
 */
int
soconnect(so, nam)
	register struct socket *so;
	struct mbuf *nam;
{
	struct proc *p = curproc;		/* XXX */
	int s;
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splsoftnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;	/* NB: masks any sodisconnect() error */
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    (struct mbuf *)0, nam, (struct mbuf *)0, p);
	splx(s);
	return (error);
}
271 
272 int
273 soconnect2(so1, so2)
274 	register struct socket *so1;
275 	struct socket *so2;
276 {
277 	int s = splsoftnet();
278 	int error;
279 
280 	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
281 	    (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0,
282 	    (struct proc *)0);
283 	splx(s);
284 	return (error);
285 }
286 
287 int
288 sodisconnect(so)
289 	register struct socket *so;
290 {
291 	int s = splsoftnet();
292 	int error;
293 
294 	if ((so->so_state & SS_ISCONNECTED) == 0) {
295 		error = ENOTCONN;
296 		goto bad;
297 	}
298 	if (so->so_state & SS_ISDISCONNECTING) {
299 		error = EALREADY;
300 		goto bad;
301 	}
302 	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
303 	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
304 	    (struct proc *)0);
305 bad:
306 	splx(s);
307 	return (error);
308 }
309 
310 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
311 /*
312  * Send on a socket.
313  * If send must go all at once and message is larger than
314  * send buffering, then hard error.
315  * Lock against other senders.
316  * If must go all at once and not enough room now, then
317  * inform user that this would block and do nothing.
318  * Otherwise, if nonblocking, send as much as possible.
319  * The data to be sent is described by "uio" if nonzero,
320  * otherwise by the mbuf chain "top" (which must be null
321  * if uio is not).  Data provided in mbuf chain must be small
322  * enough to send all at once.
323  *
324  * Returns nonzero on error, timeout or signal; callers
325  * must check for short counts if EINTR/ERESTART are returned.
326  * Data and control buffers are freed on return.
327  */
int
sosend(so, addr, uio, top, control, flags)
	register struct socket *so;
	struct mbuf *addr;	/* destination, or null if connected */
	struct uio *uio;	/* user data source, or null if "top" used */
	struct mbuf *top;	/* prepackaged data chain, or null */
	struct mbuf *control;	/* ancillary data, or null */
	int flags;		/* MSG_OOB, MSG_DONTROUTE, MSG_EOR, ... */
{
	struct proc *p = curproc;		/* XXX */
	struct mbuf **mp;
	register struct mbuf *m;
	register long space, len, resid;
	int clen = 0, error, s, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}
	/* Per-message SO_DONTROUTE only makes sense for atomic protocols. */
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
/* Error exit from inside the spl-raised region below. */
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	/* Lock the send buffer against other senders. */
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	do {
		s = splsoftnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error)
			snderr(so->so_error);
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				/* Control-only sends are permitted while
				 * a connection is being confirmed. */
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;	/* allow OOB to squeeze past limits */
		/* An atomic message must fit in the buffer in one piece. */
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		/* Not enough room: block (or fail if non-blocking). */
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		/* Build an mbuf chain from the uio (or use "top" as-is),
		 * then pass it to the protocol. */
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
			if (top == 0) {
				/* First mbuf of the chain gets a pkthdr. */
				MGETHDR(m, M_WAIT, MT_DATA);
				mlen = MHLEN;
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = (struct ifnet *)0;
			} else {
				MGET(m, M_WAIT, MT_DATA);
				mlen = MLEN;
			}
			/* Use a cluster when there is enough data/space. */
			if (resid >= MINCLSIZE && space >= MCLBYTES) {
				MCLGET(m, M_WAIT);
				if ((m->m_flags & M_EXT) == 0)
					goto nopages;
				mlen = MCLBYTES;
#ifdef	MAPPED_MBUFS
				len = min(MCLBYTES, resid);
#else
				if (atomic && top == 0) {
					/* Leave header room up front. */
					len = min(MCLBYTES - max_hdr, resid);
					m->m_data += max_hdr;
				} else
					len = min(MCLBYTES, resid);
#endif
				space -= len;
			} else {
nopages:
				len = min(min(mlen, resid), space);
				space -= len;
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && top == 0 && len < mlen)
					MH_ALIGN(m, len);
			}
			error = uiomove(mtod(m, caddr_t), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error)
				goto release;
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute)
			    so->so_options |= SO_DONTROUTE;
		    s = splsoftnet();				/* XXX */
		    error = (*so->so_proto->pr_usrreq)(so,
			(flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			top, addr, control, p);
		    splx(s);
		    if (dontroute)
			    so->so_options &= ~SO_DONTROUTE;
		    /* The protocol consumed top and control. */
		    clen = 0;
		    control = 0;
		    top = 0;
		    mp = &top;
		    if (error)
			goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	/* Free whatever the protocol did not consume. */
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
489 
490 /*
491  * Implement receive operations on a socket.
492  * We depend on the way that records are added to the sockbuf
493  * by sbappend*.  In particular, each record (mbufs linked through m_next)
494  * must begin with an address if the protocol so specifies,
495  * followed by an optional mbuf or mbufs containing ancillary data,
496  * and then zero or more mbufs of data.
497  * In order to avoid blocking network interrupts for the entire time here,
498  * we splx() while doing the actual copy to user space.
499  * Although the sockbuf is locked, new data may still be appended,
500  * and thus we must maintain consistency of the sockbuf during that time.
501  *
502  * The caller may receive the data as a single mbuf chain by supplying
503  * an mbuf **mp0 for use in returning the chain.  The uio is then used
504  * only for the count in uio_resid.
505  */
int
soreceive(so, paddr, uio, mp0, controlp, flagsp)
	register struct socket *so;
	struct mbuf **paddr;	/* out: sender address, or null */
	struct uio *uio;	/* destination; uio_resid bounds the read */
	struct mbuf **mp0;	/* out: raw mbuf chain instead of uiomove */
	struct mbuf **controlp;	/* out: ancillary data, or null */
	int *flagsp;		/* in/out: MSG_* flags */
{
	register struct mbuf *m, **mp;
	register int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (paddr)
		*paddr = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	/* Out-of-band data bypasses the receive buffer entirely. */
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK), (struct mbuf *)0,
		    (struct proc *)0);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	/* Give a confirming protocol a chance to complete the handshake. */
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
		    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);

restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	s = splsoftnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		/* Deliver pending data before reporting an error or EOF. */
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		/* OOB data or a record boundary is deliverable now. */
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		/* Wait for data, then retry from the top. */
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
#ifdef notyet /* XXXX */
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
#endif
	nextrecord = m->m_nextpkt;
	/* Records begin with an address mbuf for PR_ADDR protocols. */
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (paddr) {
				/* Hand the address mbuf to the caller. */
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
	}
	/* Then zero or more control (ancillary data) mbufs. */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				/* Passed descriptors must be converted
				 * into this process's file table. */
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
				   error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0)
			m->m_nextpkt = nextrecord;
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	/* Now the data mbufs.  moff is the offset within the current
	 * mbuf (for MSG_PEEK); offset tracks distance to the OOB mark. */
	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		/* Never mix OOB and normal data in one receive. */
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		/* Don't read past the out-of-band mark. */
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splsoftnet();
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			/* Consumed the whole mbuf. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				if (m)
					m->m_nextpkt = nextrecord;
			}
		} else {
			/* Partial mbuf: advance within it. */
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		/* Stop exactly at the out-of-band mark. */
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			error = sbwait(&so->so_rcv);
			if (error) {
				/* Short count, no error (see above). */
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	/* Atomic protocols discard the unread remainder of the record. */
	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0)
			so->so_rcv.sb_mb = nextrecord;
		/* Tell the protocol we consumed data (window update etc). */
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
			    (struct mbuf *)(long)flags, (struct mbuf *)0,
			    (struct proc *)0);
	}
	/* Got nothing but wanted something: try again. */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
807 
808 int
809 soshutdown(so, how)
810 	register struct socket *so;
811 	register int how;
812 {
813 	register struct protosw *pr = so->so_proto;
814 
815 	how++;
816 	if (how & FREAD)
817 		sorflush(so);
818 	if (how & FWRITE)
819 		return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, (struct mbuf *)0,
820 		    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
821 	return (0);
822 }
823 
/*
 * Flush and discard everything in a socket's receive buffer.  The
 * buffer is locked uninterruptibly (SB_NOINTR) and marked unable to
 * receive more; a snapshot of the sockbuf is then taken and the live
 * one zeroed so that data arriving later cannot race with the
 * release below.  Any rights (passed descriptors) queued in the data
 * are disposed of by the protocol's domain before the mbufs go.
 */
void
sorflush(so)
	register struct socket *so;
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	register int s;
	struct sockbuf asb;

	sb->sb_flags |= SB_NOINTR;	/* don't let a signal break the lock */
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	asb = *sb;			/* snapshot, then reset the original */
	bzero((caddr_t)sb, sizeof (*sb));
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}
845 
/*
 * Set a socket option.  Non-SOL_SOCKET levels are passed straight to
 * the protocol's ctloutput routine (which then owns m0); SOL_SOCKET
 * options are handled here, with successful settings also echoed to
 * the protocol.  The option mbuf is freed before returning unless a
 * protocol routine consumed it.
 */
int
sosetopt(so, level, optname, m0)
	register struct socket *so;
	int level, optname;
	struct mbuf *m0;	/* option value; consumed by this call */
{
	int error = 0;
	register struct mbuf *m = m0;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger)) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* fall thru...  the boolean case below reads
			 * l_onoff (first int of struct linger) to set or
			 * clear the SO_LINGER option bit. */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			/* Boolean options: nonzero enables, zero disables. */
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			int optval;

			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			optval = *mtod(m, int *);
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) optval) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			short val;

			if (m == NULL || m->m_len < sizeof (*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			/* Timeouts are stored as short tick counts. */
			if (tv->tv_sec * hz + tv->tv_usec / tick > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Let the protocol see successful SOL_SOCKET settings. */
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
			m = NULL;	/* freed by protocol */
		}
	}
bad:
	if (m)
		(void) m_free(m);
	return (error);
}
986 
/*
 * Get a socket option.  Non-SOL_SOCKET levels are delegated to the
 * protocol's ctloutput routine; SOL_SOCKET options are answered here
 * in a freshly allocated mbuf handed back through *mp (caller frees).
 */
int
sogetopt(so, level, optname, mp)
	register struct socket *so;
	int level, optname;
	struct mbuf **mp;	/* out: mbuf holding the option value */
{
	register struct mbuf *m;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_GETOPT, so, level, optname, mp));
		} else
			return (ENOPROTOOPT);
	} else {
		m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof (int);	/* default; overridden below */

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			/* Boolean options: return the raw option bit. */
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			/* Reading the error clears it. */
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			/* Convert the stored tick count back to a timeval. */
			int val = (optname == SO_SNDTIMEO ?
			     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}
1072 
1073 void
1074 sohasoutofband(so)
1075 	register struct socket *so;
1076 {
1077 	struct proc *p;
1078 
1079 	if (so->so_pgid < 0)
1080 		gsignal(-so->so_pgid, SIGURG);
1081 	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
1082 		psignal(p, SIGURG);
1083 	selwakeup(&so->so_rcv.sb_sel);
1084 }
1085