xref: /netbsd-src/sys/kern/uipc_socket.c (revision ce63d6c20fc4ec8ddc95c84bb229e3c4ecf82b69)
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1990 Regents of the University of California.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	from: @(#)uipc_socket.c	7.28 (Berkeley) 5/4/91
34  *	$Id: uipc_socket.c,v 1.2 1993/05/18 18:19:36 cgd Exp $
35  */
36 
37 #include "param.h"
38 #include "proc.h"
39 #include "file.h"
40 #include "malloc.h"
41 #include "mbuf.h"
42 #include "domain.h"
43 #include "kernel.h"
44 #include "select.h"
45 #include "protosw.h"
46 #include "socket.h"
47 #include "socketvar.h"
48 #include "resourcevar.h"
49 
50 /*
51  * Socket operation routines.
52  * These routines are called by the routines in
53  * sys_socket.c or from a system process, and
54  * implement the semantics of socket operations by
55  * switching out to the protocol specific routines.
56  */
57 /*ARGSUSED*/
58 socreate(dom, aso, type, proto)
59 	struct socket **aso;
60 	register int type;
61 	int proto;
62 {
63 	struct proc *p = curproc;		/* XXX */
64 	register struct protosw *prp;
65 	register struct socket *so;
66 	register int error;
67 
68 	if (proto)
69 		prp = pffindproto(dom, proto, type);
70 	else
71 		prp = pffindtype(dom, type);
72 	if (prp == 0)
73 		return (EPROTONOSUPPORT);
74 	if (prp->pr_type != type)
75 		return (EPROTOTYPE);
76 	MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT);
77 	bzero((caddr_t)so, sizeof(*so));
78 	so->so_type = type;
79 	if (p->p_ucred->cr_uid == 0)
80 		so->so_state = SS_PRIV;
81 	so->so_proto = prp;
82 	error =
83 	    (*prp->pr_usrreq)(so, PRU_ATTACH,
84 		(struct mbuf *)0, (struct mbuf *)proto, (struct mbuf *)0);
85 	if (error) {
86 		so->so_state |= SS_NOFDREF;
87 		sofree(so);
88 		return (error);
89 	}
90 	*aso = so;
91 	return (0);
92 }
93 
94 sobind(so, nam)
95 	struct socket *so;
96 	struct mbuf *nam;
97 {
98 	int s = splnet();
99 	int error;
100 
101 	error =
102 	    (*so->so_proto->pr_usrreq)(so, PRU_BIND,
103 		(struct mbuf *)0, nam, (struct mbuf *)0);
104 	splx(s);
105 	return (error);
106 }
107 
108 solisten(so, backlog)
109 	register struct socket *so;
110 	int backlog;
111 {
112 	int s = splnet(), error;
113 
114 	error =
115 	    (*so->so_proto->pr_usrreq)(so, PRU_LISTEN,
116 		(struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
117 	if (error) {
118 		splx(s);
119 		return (error);
120 	}
121 	if (so->so_q == 0)
122 		so->so_options |= SO_ACCEPTCONN;
123 	if (backlog < 0)
124 		backlog = 0;
125 	so->so_qlimit = min(backlog, SOMAXCONN);
126 	splx(s);
127 	return (0);
128 }
129 
130 sofree(so)
131 	register struct socket *so;
132 {
133 
134 	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
135 		return;
136 	if (so->so_head) {
137 		if (!soqremque(so, 0) && !soqremque(so, 1))
138 			panic("sofree dq");
139 		so->so_head = 0;
140 	}
141 	sbrelease(&so->so_snd);
142 	sorflush(so);
143 	FREE(so, M_SOCKET);
144 }
145 
146 /*
147  * Close a socket on last file table reference removal.
148  * Initiate disconnect if connected.
149  * Free socket when disconnect complete.
150  */
151 soclose(so)
152 	register struct socket *so;
153 {
154 	int s = splnet();		/* conservative */
155 	int error = 0;
156 
157 	if (so->so_options & SO_ACCEPTCONN) {
158 		while (so->so_q0)
159 			(void) soabort(so->so_q0);
160 		while (so->so_q)
161 			(void) soabort(so->so_q);
162 	}
163 	if (so->so_pcb == 0)
164 		goto discard;
165 	if (so->so_state & SS_ISCONNECTED) {
166 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
167 			error = sodisconnect(so);
168 			if (error)
169 				goto drop;
170 		}
171 		if (so->so_options & SO_LINGER) {
172 			if ((so->so_state & SS_ISDISCONNECTING) &&
173 			    (so->so_state & SS_NBIO))
174 				goto drop;
175 			while (so->so_state & SS_ISCONNECTED)
176 				if (error = tsleep((caddr_t)&so->so_timeo,
177 				    PSOCK | PCATCH, netcls, so->so_linger))
178 					break;
179 		}
180 	}
181 drop:
182 	if (so->so_pcb) {
183 		int error2 =
184 		    (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
185 			(struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
186 		if (error == 0)
187 			error = error2;
188 	}
189 discard:
190 	if (so->so_state & SS_NOFDREF)
191 		panic("soclose: NOFDREF");
192 	so->so_state |= SS_NOFDREF;
193 	sofree(so);
194 	splx(s);
195 	return (error);
196 }
197 
198 /*
199  * Must be called at splnet...
200  */
201 soabort(so)
202 	struct socket *so;
203 {
204 
205 	return (
206 	    (*so->so_proto->pr_usrreq)(so, PRU_ABORT,
207 		(struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0));
208 }
209 
210 soaccept(so, nam)
211 	register struct socket *so;
212 	struct mbuf *nam;
213 {
214 	int s = splnet();
215 	int error;
216 
217 	if ((so->so_state & SS_NOFDREF) == 0)
218 		panic("soaccept: !NOFDREF");
219 	so->so_state &= ~SS_NOFDREF;
220 	error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
221 	    (struct mbuf *)0, nam, (struct mbuf *)0);
222 	splx(s);
223 	return (error);
224 }
225 
226 soconnect(so, nam)
227 	register struct socket *so;
228 	struct mbuf *nam;
229 {
230 	int s;
231 	int error;
232 
233 	if (so->so_options & SO_ACCEPTCONN)
234 		return (EOPNOTSUPP);
235 	s = splnet();
236 	/*
237 	 * If protocol is connection-based, can only connect once.
238 	 * Otherwise, if connected, try to disconnect first.
239 	 * This allows user to disconnect by connecting to, e.g.,
240 	 * a null address.
241 	 */
242 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
243 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
244 	    (error = sodisconnect(so))))
245 		error = EISCONN;
246 	else
247 		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
248 		    (struct mbuf *)0, nam, (struct mbuf *)0);
249 	splx(s);
250 	return (error);
251 }
252 
253 soconnect2(so1, so2)
254 	register struct socket *so1;
255 	struct socket *so2;
256 {
257 	int s = splnet();
258 	int error;
259 
260 	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
261 	    (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0);
262 	splx(s);
263 	return (error);
264 }
265 
266 sodisconnect(so)
267 	register struct socket *so;
268 {
269 	int s = splnet();
270 	int error;
271 
272 	if ((so->so_state & SS_ISCONNECTED) == 0) {
273 		error = ENOTCONN;
274 		goto bad;
275 	}
276 	if (so->so_state & SS_ISDISCONNECTING) {
277 		error = EALREADY;
278 		goto bad;
279 	}
280 	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
281 	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
282 bad:
283 	splx(s);
284 	return (error);
285 }
286 
287 /*
288  * Send on a socket.
289  * If send must go all at once and message is larger than
290  * send buffering, then hard error.
291  * Lock against other senders.
292  * If must go all at once and not enough room now, then
293  * inform user that this would block and do nothing.
294  * Otherwise, if nonblocking, send as much as possible.
295  * The data to be sent is described by "uio" if nonzero,
296  * otherwise by the mbuf chain "top" (which must be null
297  * if uio is not).  Data provided in mbuf chain must be small
298  * enough to send all at once.
299  *
300  * Returns nonzero on error, timeout or signal; callers
301  * must check for short counts if EINTR/ERESTART are returned.
302  * Data and control buffers are freed on return.
303  */
304 sosend(so, addr, uio, top, control, flags)
305 	register struct socket *so;
306 	struct mbuf *addr;
307 	struct uio *uio;
308 	struct mbuf *top;
309 	struct mbuf *control;
310 	int flags;
311 {
312 	struct proc *p = curproc;		/* XXX */
313 	struct mbuf **mp;
314 	register struct mbuf *m;
315 	register long space, len, resid;
316 	int clen = 0, error, s, dontroute, mlen;
317 	int atomic = sosendallatonce(so) || top;
318 
319 	if (uio)
320 		resid = uio->uio_resid;
321 	else
322 		resid = top->m_pkthdr.len;
323 	dontroute =
324 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
325 	    (so->so_proto->pr_flags & PR_ATOMIC);
326 	p->p_stats->p_ru.ru_msgsnd++;
327 	if (control)
328 		clen = control->m_len;
329 #define	snderr(errno)	{ error = errno; splx(s); goto release; }
330 
331 restart:
332 	if (error = sblock(&so->so_snd))
333 		goto out;
334 	do {
335 		s = splnet();
336 		if (so->so_state & SS_CANTSENDMORE)
337 			snderr(EPIPE);
338 		if (so->so_error)
339 			snderr(so->so_error);
340 		if ((so->so_state & SS_ISCONNECTED) == 0) {
341 			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
342 				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
343 				    !(resid == 0 && clen != 0))
344 					snderr(ENOTCONN);
345 			} else if (addr == 0)
346 				snderr(EDESTADDRREQ);
347 		}
348 		space = sbspace(&so->so_snd);
349 		if (flags & MSG_OOB)
350 			space += 1024;
351 		if (space < resid + clen &&
352 		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
353 			if (atomic && resid > so->so_snd.sb_hiwat ||
354 			    clen > so->so_snd.sb_hiwat)
355 				snderr(EMSGSIZE);
356 			if (so->so_state & SS_NBIO)
357 				snderr(EWOULDBLOCK);
358 			sbunlock(&so->so_snd);
359 			error = sbwait(&so->so_snd);
360 			splx(s);
361 			if (error)
362 				goto out;
363 			goto restart;
364 		}
365 		splx(s);
366 		mp = &top;
367 		space -= clen;
368 		do {
369 		    if (uio == NULL) {
370 			/*
371 			 * Data is prepackaged in "top".
372 			 */
373 			resid = 0;
374 			if (flags & MSG_EOR)
375 				top->m_flags |= M_EOR;
376 		    } else do {
377 			if (top == 0) {
378 				MGETHDR(m, M_WAIT, MT_DATA);
379 				mlen = MHLEN;
380 				m->m_pkthdr.len = 0;
381 				m->m_pkthdr.rcvif = (struct ifnet *)0;
382 			} else {
383 				MGET(m, M_WAIT, MT_DATA);
384 				mlen = MLEN;
385 			}
386 			if (resid >= MINCLSIZE && space >= MCLBYTES) {
387 				MCLGET(m, M_WAIT);
388 				if ((m->m_flags & M_EXT) == 0)
389 					goto nopages;
390 				mlen = MCLBYTES;
391 #ifdef	MAPPED_MBUFS
392 				len = min(MCLBYTES, resid);
393 #else
394 				if (top == 0) {
395 					len = min(MCLBYTES - max_hdr, resid);
396 					m->m_data += max_hdr;
397 				} else
398 					len = min(MCLBYTES, resid);
399 #endif
400 				space -= MCLBYTES;
401 			} else {
402 nopages:
403 				len = min(min(mlen, resid), space);
404 				space -= len;
405 				/*
406 				 * For datagram protocols, leave room
407 				 * for protocol headers in first mbuf.
408 				 */
409 				if (atomic && top == 0 && len < mlen)
410 					MH_ALIGN(m, len);
411 			}
412 			error = uiomove(mtod(m, caddr_t), (int)len, uio);
413 			resid = uio->uio_resid;
414 			m->m_len = len;
415 			*mp = m;
416 			top->m_pkthdr.len += len;
417 			if (error)
418 				goto release;
419 			mp = &m->m_next;
420 			if (resid <= 0) {
421 				if (flags & MSG_EOR)
422 					top->m_flags |= M_EOR;
423 				break;
424 			}
425 		    } while (space > 0 && atomic);
426 		    if (dontroute)
427 			    so->so_options |= SO_DONTROUTE;
428 		    s = splnet();				/* XXX */
429 		    error = (*so->so_proto->pr_usrreq)(so,
430 			(flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
431 			top, addr, control);
432 		    splx(s);
433 		    if (dontroute)
434 			    so->so_options &= ~SO_DONTROUTE;
435 		    clen = 0;
436 		    control = 0;
437 		    top = 0;
438 		    mp = &top;
439 		    if (error)
440 			goto release;
441 		} while (resid && space > 0);
442 	} while (resid);
443 
444 release:
445 	sbunlock(&so->so_snd);
446 out:
447 	if (top)
448 		m_freem(top);
449 	if (control)
450 		m_freem(control);
451 	return (error);
452 }
453 
454 /*
455  * Implement receive operations on a socket.
456  * We depend on the way that records are added to the sockbuf
457  * by sbappend*.  In particular, each record (mbufs linked through m_next)
458  * must begin with an address if the protocol so specifies,
459  * followed by an optional mbuf or mbufs containing ancillary data,
460  * and then zero or more mbufs of data.
461  * In order to avoid blocking network interrupts for the entire time here,
462  * we splx() while doing the actual copy to user space.
463  * Although the sockbuf is locked, new data may still be appended,
464  * and thus we must maintain consistency of the sockbuf during that time.
465  *
466  * The caller may receive the data as a single mbuf chain by supplying
467  * an mbuf **mp0 for use in returning the chain.  The uio is then used
468  * only for the count in uio_resid.
469  */
470 soreceive(so, paddr, uio, mp0, controlp, flagsp)
471 	register struct socket *so;
472 	struct mbuf **paddr;
473 	struct uio *uio;
474 	struct mbuf **mp0;
475 	struct mbuf **controlp;
476 	int *flagsp;
477 {
478 	struct proc *p = curproc;		/* XXX */
479 	register struct mbuf *m, **mp;
480 	register int flags, len, error, s, offset;
481 	struct protosw *pr = so->so_proto;
482 	struct mbuf *nextrecord;
483 	int moff, type;
484 
485 	mp = mp0;
486 	if (paddr)
487 		*paddr = 0;
488 	if (controlp)
489 		*controlp = 0;
490 	if (flagsp)
491 		flags = *flagsp &~ MSG_EOR;
492 	else
493 		flags = 0;
494 	if (flags & MSG_OOB) {
495 		m = m_get(M_WAIT, MT_DATA);
496 		error = (*pr->pr_usrreq)(so, PRU_RCVOOB,
497 		    m, (struct mbuf *)(flags & MSG_PEEK), (struct mbuf *)0);
498 		if (error)
499 			goto bad;
500 		do {
501 			error = uiomove(mtod(m, caddr_t),
502 			    (int) min(uio->uio_resid, m->m_len), uio);
503 			m = m_free(m);
504 		} while (uio->uio_resid && error == 0 && m);
505 bad:
506 		if (m)
507 			m_freem(m);
508 		return (error);
509 	}
510 	if (mp)
511 		*mp = (struct mbuf *)0;
512 	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
513 		(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
514 		    (struct mbuf *)0, (struct mbuf *)0);
515 
516 restart:
517 	if (error = sblock(&so->so_rcv))
518 		return (error);
519 	s = splnet();
520 
521 	m = so->so_rcv.sb_mb;
522 	/*
523 	 * If we have less data than requested, block awaiting more
524 	 * (subject to any timeout) if:
525 	 *   1. the current count is less than the low water mark, or
526 	 *   2. MSG_WAITALL is set, and it is possible to do the entire
527 	 *	receive operation at once if we block (resid <= hiwat).
528 	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
529 	 * we have to do the receive in sections, and thus risk returning
530 	 * a short count if a timeout or signal occurs after we start.
531 	 */
532 	while (m == 0 || so->so_rcv.sb_cc < uio->uio_resid &&
533 	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
534 	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
535 	    m->m_nextpkt == 0) {
536 #ifdef DIAGNOSTIC
537 		if (m == 0 && so->so_rcv.sb_cc)
538 			panic("receive 1");
539 #endif
540 		if (so->so_error) {
541 			if (m)
542 				break;
543 			error = so->so_error;
544 			if ((flags & MSG_PEEK) == 0)
545 				so->so_error = 0;
546 			goto release;
547 		}
548 		if (so->so_state & SS_CANTRCVMORE) {
549 			if (m)
550 				break;
551 			else
552 				goto release;
553 		}
554 		for (; m; m = m->m_next)
555 			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
556 				m = so->so_rcv.sb_mb;
557 				goto dontblock;
558 			}
559 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
560 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
561 			error = ENOTCONN;
562 			goto release;
563 		}
564 		if (uio->uio_resid == 0)
565 			goto release;
566 		if (so->so_state & SS_NBIO) {
567 			error = EWOULDBLOCK;
568 			goto release;
569 		}
570 		sbunlock(&so->so_rcv);
571 		error = sbwait(&so->so_rcv);
572 		splx(s);
573 		if (error)
574 			return (error);
575 		goto restart;
576 	}
577 dontblock:
578 	p->p_stats->p_ru.ru_msgrcv++;
579 	nextrecord = m->m_nextpkt;
580 	if (pr->pr_flags & PR_ADDR) {
581 #ifdef DIAGNOSTIC
582 		if (m->m_type != MT_SONAME)
583 			panic("receive 1a");
584 #endif
585 		if (flags & MSG_PEEK) {
586 			if (paddr)
587 				*paddr = m_copy(m, 0, m->m_len);
588 			m = m->m_next;
589 		} else {
590 			sbfree(&so->so_rcv, m);
591 			if (paddr) {
592 				*paddr = m;
593 				so->so_rcv.sb_mb = m->m_next;
594 				m->m_next = 0;
595 				m = so->so_rcv.sb_mb;
596 			} else {
597 				MFREE(m, so->so_rcv.sb_mb);
598 				m = so->so_rcv.sb_mb;
599 			}
600 		}
601 	}
602 	while (m && m->m_type == MT_CONTROL && error == 0) {
603 		if (flags & MSG_PEEK) {
604 			if (controlp)
605 				*controlp = m_copy(m, 0, m->m_len);
606 			m = m->m_next;
607 		} else {
608 			sbfree(&so->so_rcv, m);
609 			if (controlp) {
610 				if (pr->pr_domain->dom_externalize &&
611 				    mtod(m, struct cmsghdr *)->cmsg_type ==
612 				    SCM_RIGHTS)
613 				   error = (*pr->pr_domain->dom_externalize)(m);
614 				*controlp = m;
615 				so->so_rcv.sb_mb = m->m_next;
616 				m->m_next = 0;
617 				m = so->so_rcv.sb_mb;
618 			} else {
619 				MFREE(m, so->so_rcv.sb_mb);
620 				m = so->so_rcv.sb_mb;
621 			}
622 		}
623 		if (controlp)
624 			controlp = &(*controlp)->m_next;
625 	}
626 	if (m) {
627 		if ((flags & MSG_PEEK) == 0)
628 			m->m_nextpkt = nextrecord;
629 		type = m->m_type;
630 		if (type == MT_OOBDATA)
631 			flags |= MSG_OOB;
632 	}
633 	moff = 0;
634 	offset = 0;
635 	while (m && uio->uio_resid > 0 && error == 0) {
636 		if (m->m_type == MT_OOBDATA) {
637 			if (type != MT_OOBDATA)
638 				break;
639 		} else if (type == MT_OOBDATA)
640 			break;
641 #ifdef DIAGNOSTIC
642 		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
643 			panic("receive 3");
644 #endif
645 		so->so_state &= ~SS_RCVATMARK;
646 		len = uio->uio_resid;
647 		if (so->so_oobmark && len > so->so_oobmark - offset)
648 			len = so->so_oobmark - offset;
649 		if (len > m->m_len - moff)
650 			len = m->m_len - moff;
651 		/*
652 		 * If mp is set, just pass back the mbufs.
653 		 * Otherwise copy them out via the uio, then free.
654 		 * Sockbuf must be consistent here (points to current mbuf,
655 		 * it points to next record) when we drop priority;
656 		 * we must note any additions to the sockbuf when we
657 		 * block interrupts again.
658 		 */
659 		if (mp == 0) {
660 			splx(s);
661 			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
662 			s = splnet();
663 		} else
664 			uio->uio_resid -= len;
665 		if (len == m->m_len - moff) {
666 			if (m->m_flags & M_EOR)
667 				flags |= MSG_EOR;
668 			if (flags & MSG_PEEK) {
669 				m = m->m_next;
670 				moff = 0;
671 			} else {
672 				nextrecord = m->m_nextpkt;
673 				sbfree(&so->so_rcv, m);
674 				if (mp) {
675 					*mp = m;
676 					mp = &m->m_next;
677 					so->so_rcv.sb_mb = m = m->m_next;
678 					*mp = (struct mbuf *)0;
679 				} else {
680 					MFREE(m, so->so_rcv.sb_mb);
681 					m = so->so_rcv.sb_mb;
682 				}
683 				if (m)
684 					m->m_nextpkt = nextrecord;
685 			}
686 		} else {
687 			if (flags & MSG_PEEK)
688 				moff += len;
689 			else {
690 				if (mp)
691 					*mp = m_copym(m, 0, len, M_WAIT);
692 				m->m_data += len;
693 				m->m_len -= len;
694 				so->so_rcv.sb_cc -= len;
695 			}
696 		}
697 		if (so->so_oobmark) {
698 			if ((flags & MSG_PEEK) == 0) {
699 				so->so_oobmark -= len;
700 				if (so->so_oobmark == 0) {
701 					so->so_state |= SS_RCVATMARK;
702 					break;
703 				}
704 			} else
705 				offset += len;
706 		}
707 		if (flags & MSG_EOR)
708 			break;
709 		/*
710 		 * If the MSG_WAITALL flag is set (for non-atomic socket),
711 		 * we must not quit until "uio->uio_resid == 0" or an error
712 		 * termination.  If a signal/timeout occurs, return
713 		 * with a short count but without error.
714 		 * Keep sockbuf locked against other readers.
715 		 */
716 		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
717 		    !sosendallatonce(so)) {
718 			if (so->so_error || so->so_state & SS_CANTRCVMORE)
719 				break;
720 			error = sbwait(&so->so_rcv);
721 			if (error) {
722 				sbunlock(&so->so_rcv);
723 				splx(s);
724 				return (0);
725 			}
726 			if (m = so->so_rcv.sb_mb)
727 				nextrecord = m->m_nextpkt;
728 		}
729 	}
730 	if ((flags & MSG_PEEK) == 0) {
731 		if (m == 0)
732 			so->so_rcv.sb_mb = nextrecord;
733 		else if (pr->pr_flags & PR_ATOMIC) {
734 			flags |= MSG_TRUNC;
735 			(void) sbdroprecord(&so->so_rcv);
736 		}
737 		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
738 			(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
739 			    (struct mbuf *)flags, (struct mbuf *)0,
740 			    (struct mbuf *)0);
741 	}
742 	if (flagsp)
743 		*flagsp |= flags;
744 release:
745 	sbunlock(&so->so_rcv);
746 	splx(s);
747 	return (error);
748 }
749 
750 soshutdown(so, how)
751 	register struct socket *so;
752 	register int how;
753 {
754 	register struct protosw *pr = so->so_proto;
755 
756 	how++;
757 	if (how & FREAD)
758 		sorflush(so);
759 	if (how & FWRITE)
760 		return ((*pr->pr_usrreq)(so, PRU_SHUTDOWN,
761 		    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0));
762 	return (0);
763 }
764 
765 sorflush(so)
766 	register struct socket *so;
767 {
768 	register struct sockbuf *sb = &so->so_rcv;
769 	register struct protosw *pr = so->so_proto;
770 	register int s;
771 	struct sockbuf asb;
772 
773 	sb->sb_flags |= SB_NOINTR;
774 	(void) sblock(sb);
775 	s = splimp();
776 	socantrcvmore(so);
777 	sbunlock(sb);
778 	asb = *sb;
779 	bzero((caddr_t)sb, sizeof (*sb));
780 	splx(s);
781 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
782 		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
783 	sbrelease(&asb);
784 }
785 
786 sosetopt(so, level, optname, m0)
787 	register struct socket *so;
788 	int level, optname;
789 	struct mbuf *m0;
790 {
791 	int error = 0;
792 	register struct mbuf *m = m0;
793 
794 	if (level != SOL_SOCKET) {
795 		if (so->so_proto && so->so_proto->pr_ctloutput)
796 			return ((*so->so_proto->pr_ctloutput)
797 				  (PRCO_SETOPT, so, level, optname, &m0));
798 		error = ENOPROTOOPT;
799 	} else {
800 		switch (optname) {
801 
802 		case SO_LINGER:
803 			if (m == NULL || m->m_len != sizeof (struct linger)) {
804 				error = EINVAL;
805 				goto bad;
806 			}
807 			so->so_linger = mtod(m, struct linger *)->l_linger;
808 			/* fall thru... */
809 
810 		case SO_DEBUG:
811 		case SO_KEEPALIVE:
812 		case SO_DONTROUTE:
813 		case SO_USELOOPBACK:
814 		case SO_BROADCAST:
815 		case SO_REUSEADDR:
816 		case SO_OOBINLINE:
817 			if (m == NULL || m->m_len < sizeof (int)) {
818 				error = EINVAL;
819 				goto bad;
820 			}
821 			if (*mtod(m, int *))
822 				so->so_options |= optname;
823 			else
824 				so->so_options &= ~optname;
825 			break;
826 
827 		case SO_SNDBUF:
828 		case SO_RCVBUF:
829 		case SO_SNDLOWAT:
830 		case SO_RCVLOWAT:
831 			if (m == NULL || m->m_len < sizeof (int)) {
832 				error = EINVAL;
833 				goto bad;
834 			}
835 			switch (optname) {
836 
837 			case SO_SNDBUF:
838 			case SO_RCVBUF:
839 				if (sbreserve(optname == SO_SNDBUF ?
840 				    &so->so_snd : &so->so_rcv,
841 				    (u_long) *mtod(m, int *)) == 0) {
842 					error = ENOBUFS;
843 					goto bad;
844 				}
845 				break;
846 
847 			case SO_SNDLOWAT:
848 				so->so_snd.sb_lowat = *mtod(m, int *);
849 				break;
850 			case SO_RCVLOWAT:
851 				so->so_rcv.sb_lowat = *mtod(m, int *);
852 				break;
853 			}
854 			break;
855 
856 		case SO_SNDTIMEO:
857 		case SO_RCVTIMEO:
858 		    {
859 			struct timeval *tv;
860 			short val;
861 
862 			if (m == NULL || m->m_len < sizeof (*tv)) {
863 				error = EINVAL;
864 				goto bad;
865 			}
866 			tv = mtod(m, struct timeval *);
867 			if (tv->tv_sec > SHRT_MAX / hz - hz) {
868 				error = EDOM;
869 				goto bad;
870 			}
871 			val = tv->tv_sec * hz + tv->tv_usec / tick;
872 
873 			switch (optname) {
874 
875 			case SO_SNDTIMEO:
876 				so->so_snd.sb_timeo = val;
877 				break;
878 			case SO_RCVTIMEO:
879 				so->so_rcv.sb_timeo = val;
880 				break;
881 			}
882 			break;
883 		    }
884 
885 		default:
886 			error = ENOPROTOOPT;
887 			break;
888 		}
889 	}
890 bad:
891 	if (m)
892 		(void) m_free(m);
893 	return (error);
894 }
895 
896 sogetopt(so, level, optname, mp)
897 	register struct socket *so;
898 	int level, optname;
899 	struct mbuf **mp;
900 {
901 	register struct mbuf *m;
902 
903 	if (level != SOL_SOCKET) {
904 		if (so->so_proto && so->so_proto->pr_ctloutput) {
905 			return ((*so->so_proto->pr_ctloutput)
906 				  (PRCO_GETOPT, so, level, optname, mp));
907 		} else
908 			return (ENOPROTOOPT);
909 	} else {
910 		m = m_get(M_WAIT, MT_SOOPTS);
911 		m->m_len = sizeof (int);
912 
913 		switch (optname) {
914 
915 		case SO_LINGER:
916 			m->m_len = sizeof (struct linger);
917 			mtod(m, struct linger *)->l_onoff =
918 				so->so_options & SO_LINGER;
919 			mtod(m, struct linger *)->l_linger = so->so_linger;
920 			break;
921 
922 		case SO_USELOOPBACK:
923 		case SO_DONTROUTE:
924 		case SO_DEBUG:
925 		case SO_KEEPALIVE:
926 		case SO_REUSEADDR:
927 		case SO_BROADCAST:
928 		case SO_OOBINLINE:
929 			*mtod(m, int *) = so->so_options & optname;
930 			break;
931 
932 		case SO_TYPE:
933 			*mtod(m, int *) = so->so_type;
934 			break;
935 
936 		case SO_ERROR:
937 			*mtod(m, int *) = so->so_error;
938 			so->so_error = 0;
939 			break;
940 
941 		case SO_SNDBUF:
942 			*mtod(m, int *) = so->so_snd.sb_hiwat;
943 			break;
944 
945 		case SO_RCVBUF:
946 			*mtod(m, int *) = so->so_rcv.sb_hiwat;
947 			break;
948 
949 		case SO_SNDLOWAT:
950 			*mtod(m, int *) = so->so_snd.sb_lowat;
951 			break;
952 
953 		case SO_RCVLOWAT:
954 			*mtod(m, int *) = so->so_rcv.sb_lowat;
955 			break;
956 
957 		case SO_SNDTIMEO:
958 		case SO_RCVTIMEO:
959 		    {
960 			int val = (optname == SO_SNDTIMEO ?
961 			     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
962 
963 			m->m_len = sizeof(struct timeval);
964 			mtod(m, struct timeval *)->tv_sec = val / hz;
965 			mtod(m, struct timeval *)->tv_usec =
966 			    (val % hz) / tick;
967 			break;
968 		    }
969 
970 		default:
971 			(void)m_free(m);
972 			return (ENOPROTOOPT);
973 		}
974 		*mp = m;
975 		return (0);
976 	}
977 }
978 
979 sohasoutofband(so)
980 	register struct socket *so;
981 {
982 	struct proc *p;
983 
984 	if (so->so_pgid < 0)
985 		gsignal(-so->so_pgid, SIGURG);
986 	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
987 		psignal(p, SIGURG);
988 	selwakeup(&so->so_rcv.sb_sel);
989 }
990