xref: /netbsd-src/sys/kern/uipc_socket.c (revision 9573504567626934c7ee01c7dce0c4bb1dfe7403)
1 /*	$NetBSD: uipc_socket.c,v 1.20 1995/08/12 23:59:11 mycroft Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1988, 1990, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
36  */
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/proc.h>
41 #include <sys/file.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/domain.h>
45 #include <sys/kernel.h>
46 #include <sys/protosw.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/resourcevar.h>
50 
51 /*
52  * Socket operation routines.
53  * These routines are called by the routines in
54  * sys_socket.c or from a system process, and
55  * implement the semantics of socket operations by
56  * switching out to the protocol specific routines.
57  */
58 /*ARGSUSED*/
59 int
60 socreate(dom, aso, type, proto)
61 	int dom;
62 	struct socket **aso;
63 	register int type;
64 	int proto;
65 {
66 	struct proc *p = curproc;		/* XXX */
67 	register struct protosw *prp;
68 	register struct socket *so;
69 	register int error;
70 
71 	if (proto)
72 		prp = pffindproto(dom, proto, type);
73 	else
74 		prp = pffindtype(dom, type);
75 	if (prp == 0 || prp->pr_usrreq == 0)
76 		return (EPROTONOSUPPORT);
77 	if (prp->pr_type != type)
78 		return (EPROTOTYPE);
79 	MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT);
80 	bzero((caddr_t)so, sizeof(*so));
81 	so->so_type = type;
82 	if (p->p_ucred->cr_uid == 0)
83 		so->so_state = SS_PRIV;
84 	so->so_proto = prp;
85 	error =
86 	    (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0,
87 		(struct mbuf *)(long)proto, (struct mbuf *)0);
88 	if (error) {
89 		so->so_state |= SS_NOFDREF;
90 		sofree(so);
91 		return (error);
92 	}
93 #ifdef COMPAT_SUNOS
94 	{
95 		extern struct emul emul_sunos;
96 		if (p->p_emul == &emul_sunos && type == SOCK_DGRAM)
97 			so->so_options |= SO_BROADCAST;
98 	}
99 #endif
100 	*aso = so;
101 	return (0);
102 }
103 
104 int
105 sobind(so, nam)
106 	struct socket *so;
107 	struct mbuf *nam;
108 {
109 	int s = splsoftnet();
110 	int error;
111 
112 	error =
113 	    (*so->so_proto->pr_usrreq)(so, PRU_BIND,
114 		(struct mbuf *)0, nam, (struct mbuf *)0);
115 	splx(s);
116 	return (error);
117 }
118 
119 int
120 solisten(so, backlog)
121 	register struct socket *so;
122 	int backlog;
123 {
124 	int s = splsoftnet(), error;
125 
126 	error =
127 	    (*so->so_proto->pr_usrreq)(so, PRU_LISTEN,
128 		(struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
129 	if (error) {
130 		splx(s);
131 		return (error);
132 	}
133 	if (so->so_q == 0)
134 		so->so_options |= SO_ACCEPTCONN;
135 	if (backlog < 0)
136 		backlog = 0;
137 	so->so_qlimit = min(backlog, SOMAXCONN);
138 	splx(s);
139 	return (0);
140 }
141 
/*
 * Tear down and free a socket once it has no remaining references:
 * the file-descriptor reference must be gone (SS_NOFDREF set) and the
 * protocol must already have detached its pcb.
 *
 * NOTE(review): declared int but never produces a value (bare
 * "return;" and fall-through at the end); callers invoke it as a
 * statement.  It should be declared void — confirm against the
 * prototype in socketvar.h before changing.
 */
int
sofree(so)
	register struct socket *so;
{

	/* Still owned by a pcb or a file descriptor: not ours to free. */
	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (so->so_head) {
		/*
		 * Remove from the listening socket's queues; must be on
		 * either the incomplete (q0) or complete (q) queue.
		 */
		if (!soqremque(so, 0) && !soqremque(so, 1))
			panic("sofree dq");
		so->so_head = 0;
	}
	/* Release buffered send data, flush receive side, free socket. */
	sbrelease(&so->so_snd);
	sorflush(so);
	FREE(so, M_SOCKET);
}
158 
159 /*
160  * Close a socket on last file table reference removal.
161  * Initiate disconnect if connected.
162  * Free socket when disconnect complete.
163  */
int
soclose(so)
	register struct socket *so;
{
	int s = splsoftnet();		/* conservative */
	int error = 0;

	/* A listening socket aborts every queued connection first. */
	if (so->so_options & SO_ACCEPTCONN) {
		while (so->so_q0)
			(void) soabort(so->so_q0);
		while (so->so_q)
			(void) soabort(so->so_q);
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking socket: don't wait for the drain. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/*
			 * Linger: sleep until fully disconnected, a
			 * signal arrives, or the so_linger timeout fires.
			 */
			while (so->so_state & SS_ISCONNECTED)
				if (error = tsleep((caddr_t)&so->so_timeo,
				    PSOCK | PCATCH, netcls, so->so_linger))
					break;
		}
	}
drop:
	if (so->so_pcb) {
		/* Detach the protocol; report the first error seen. */
		int error2 =
		    (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
			(struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}
211 
212 /*
213  * Must be called at splsoftnet...
214  */
215 int
216 soabort(so)
217 	struct socket *so;
218 {
219 
220 	return (
221 	    (*so->so_proto->pr_usrreq)(so, PRU_ABORT,
222 		(struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0));
223 }
224 
225 int
226 soaccept(so, nam)
227 	register struct socket *so;
228 	struct mbuf *nam;
229 {
230 	int s = splsoftnet();
231 	int error;
232 
233 	if ((so->so_state & SS_NOFDREF) == 0)
234 		panic("soaccept: !NOFDREF");
235 	so->so_state &= ~SS_NOFDREF;
236 	error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
237 	    (struct mbuf *)0, nam, (struct mbuf *)0);
238 	splx(s);
239 	return (error);
240 }
241 
242 int
243 soconnect(so, nam)
244 	register struct socket *so;
245 	struct mbuf *nam;
246 {
247 	int s;
248 	int error;
249 
250 	if (so->so_options & SO_ACCEPTCONN)
251 		return (EOPNOTSUPP);
252 	s = splsoftnet();
253 	/*
254 	 * If protocol is connection-based, can only connect once.
255 	 * Otherwise, if connected, try to disconnect first.
256 	 * This allows user to disconnect by connecting to, e.g.,
257 	 * a null address.
258 	 */
259 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
260 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
261 	    (error = sodisconnect(so))))
262 		error = EISCONN;
263 	else
264 		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
265 		    (struct mbuf *)0, nam, (struct mbuf *)0);
266 	splx(s);
267 	return (error);
268 }
269 
270 int
271 soconnect2(so1, so2)
272 	register struct socket *so1;
273 	struct socket *so2;
274 {
275 	int s = splsoftnet();
276 	int error;
277 
278 	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
279 	    (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0);
280 	splx(s);
281 	return (error);
282 }
283 
284 int
285 sodisconnect(so)
286 	register struct socket *so;
287 {
288 	int s = splsoftnet();
289 	int error;
290 
291 	if ((so->so_state & SS_ISCONNECTED) == 0) {
292 		error = ENOTCONN;
293 		goto bad;
294 	}
295 	if (so->so_state & SS_ISDISCONNECTING) {
296 		error = EALREADY;
297 		goto bad;
298 	}
299 	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
300 	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
301 bad:
302 	splx(s);
303 	return (error);
304 }
305 
306 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
307 /*
308  * Send on a socket.
309  * If send must go all at once and message is larger than
310  * send buffering, then hard error.
311  * Lock against other senders.
312  * If must go all at once and not enough room now, then
313  * inform user that this would block and do nothing.
314  * Otherwise, if nonblocking, send as much as possible.
315  * The data to be sent is described by "uio" if nonzero,
316  * otherwise by the mbuf chain "top" (which must be null
317  * if uio is not).  Data provided in mbuf chain must be small
318  * enough to send all at once.
319  *
320  * Returns nonzero on error, timeout or signal; callers
321  * must check for short counts if EINTR/ERESTART are returned.
322  * Data and control buffers are freed on return.
323  */
int
sosend(so, addr, uio, top, control, flags)
	register struct socket *so;
	struct mbuf *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
{
	struct proc *p = curproc;		/* XXX */
	struct mbuf **mp;
	register struct mbuf *m;
	register long space, len, resid;
	int clen = 0, error, s, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0)
		return (EINVAL);
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
/* Bail out of the locked region: record error, drop spl, clean up. */
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	if (error = sblock(&so->so_snd, SBLOCKWAIT(flags)))
		goto out;
	do {
		s = splsoftnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error)
			snderr(so->so_error);
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		/* Out-of-band data gets a little extra slack. */
		if (flags & MSG_OOB)
			space += 1024;
		if (atomic && resid > so->so_snd.sb_hiwat ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		/* Not enough room now: block (or fail if non-blocking). */
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
			/* Copy user data into a fresh mbuf chain. */
			if (top == 0) {
				MGETHDR(m, M_WAIT, MT_DATA);
				mlen = MHLEN;
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = (struct ifnet *)0;
			} else {
				MGET(m, M_WAIT, MT_DATA);
				mlen = MLEN;
			}
			if (resid >= MINCLSIZE && space >= MCLBYTES) {
				MCLGET(m, M_WAIT);
				if ((m->m_flags & M_EXT) == 0)
					goto nopages;
				mlen = MCLBYTES;
#ifdef	MAPPED_MBUFS
				len = min(MCLBYTES, resid);
#else
				if (atomic && top == 0) {
					len = min(MCLBYTES - max_hdr, resid);
					m->m_data += max_hdr;
				} else
					len = min(MCLBYTES, resid);
#endif
				space -= MCLBYTES;
			} else {
nopages:
				len = min(min(mlen, resid), space);
				space -= len;
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && top == 0 && len < mlen)
					MH_ALIGN(m, len);
			}
			error = uiomove(mtod(m, caddr_t), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error)
				goto release;
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute)
			    so->so_options |= SO_DONTROUTE;
		    /* Pass the chain to the protocol at splsoftnet. */
		    s = splsoftnet();				/* XXX */
		    error = (*so->so_proto->pr_usrreq)(so,
			(flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			top, addr, control);
		    splx(s);
		    if (dontroute)
			    so->so_options &= ~SO_DONTROUTE;
		    /* Protocol now owns top and control. */
		    clen = 0;
		    control = 0;
		    top = 0;
		    mp = &top;
		    if (error)
			goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
483 
484 /*
485  * Implement receive operations on a socket.
486  * We depend on the way that records are added to the sockbuf
487  * by sbappend*.  In particular, each record (mbufs linked through m_next)
488  * must begin with an address if the protocol so specifies,
489  * followed by an optional mbuf or mbufs containing ancillary data,
490  * and then zero or more mbufs of data.
491  * In order to avoid blocking network interrupts for the entire time here,
492  * we splx() while doing the actual copy to user space.
493  * Although the sockbuf is locked, new data may still be appended,
494  * and thus we must maintain consistency of the sockbuf during that time.
495  *
496  * The caller may receive the data as a single mbuf chain by supplying
497  * an mbuf **mp0 for use in returning the chain.  The uio is then used
498  * only for the count in uio_resid.
499  */
int
soreceive(so, paddr, uio, mp0, controlp, flagsp)
	register struct socket *so;
	struct mbuf **paddr;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	register struct mbuf *m, **mp;
	register int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (paddr)
		*paddr = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	/* Out-of-band data is fetched from the protocol, not sb_mb. */
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK), (struct mbuf *)0);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
		    (struct mbuf *)0, (struct mbuf *)0);

restart:
	if (error = sblock(&so->so_rcv, SBLOCKWAIT(flags)))
		return (error);
	s = splsoftnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || ((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			/* Deliver what data we have before reporting error. */
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		/* A record boundary or OOB mark can be delivered now. */
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		/* Wait for data, then re-evaluate from the top. */
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
#ifdef notyet /* XXXX */
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
#endif
	nextrecord = m->m_nextpkt;
	/* Address-bearing protocols put the peer address first. */
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
	}
	/* Then any control (ancillary) mbufs, e.g. SCM_RIGHTS. */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
				   error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0)
			m->m_nextpkt = nextrecord;
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	moff = 0;
	offset = 0;
	/* Main copy loop: move data mbufs out to the uio (or to *mp). */
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		/* Never read past the out-of-band mark. */
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splsoftnet();
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			/* Consumed the whole mbuf. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				if (m)
					m->m_nextpkt = nextrecord;
			}
		} else {
			/* Partial mbuf: adjust in place (or copy for *mp). */
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				/* Deliberate: short count, no error. */
				return (0);
			}
			if (m = so->so_rcv.sb_mb)
				nextrecord = m->m_nextpkt;
		}
	}

	/* Atomic protocols: anything left over is truncated. */
	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0)
			so->so_rcv.sb_mb = nextrecord;
		/*
		 * NOTE(review): this call passes one more trailing null
		 * mbuf argument than the other pr_usrreq call sites in
		 * this file (six args vs five); harmless under K&R
		 * calling convention, but should be trimmed to match.
		 */
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
			    (struct mbuf *)(long)flags, (struct mbuf *)0,
			    (struct mbuf *)0);
	}
	/* Got nothing and can still get more: go around again. */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
800 
801 int
802 soshutdown(so, how)
803 	register struct socket *so;
804 	register int how;
805 {
806 	register struct protosw *pr = so->so_proto;
807 
808 	how++;
809 	if (how & FREAD)
810 		sorflush(so);
811 	if (how & FWRITE)
812 		return ((*pr->pr_usrreq)(so, PRU_SHUTDOWN,
813 		    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0));
814 	return (0);
815 }
816 
void
sorflush(so)
	register struct socket *so;
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	register int s;
	struct sockbuf asb;

	/* Take the lock uninterruptibly; the flush must happen. */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	/*
	 * Snapshot the sockbuf and zero the original while interrupts
	 * are blocked, so the mbuf chain can be disposed of afterwards
	 * without anyone seeing stale contents.
	 */
	asb = *sb;
	bzero((caddr_t)sb, sizeof (*sb));
	splx(s);
	/* Let the domain dispose of any rights (e.g. passed descriptors). */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}
838 
/*
 * Set a socket option.  Consumes m0 (freed here or by the protocol).
 */
int
sosetopt(so, level, optname, m0)
	register struct socket *so;
	int level, optname;
	struct mbuf *m0;
{
	int error = 0;
	register struct mbuf *m = m0;

	if (level != SOL_SOCKET) {
		/* Not a socket-level option: hand off to the protocol. */
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger)) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* fall thru... */
			/*
			 * The toggle below reads the first int in the mbuf,
			 * which for SO_LINGER is the l_onoff field (first
			 * member of struct linger).
			 */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			/* Boolean options map directly onto so_options bits. */
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) *mtod(m, int *)) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			case SO_SNDLOWAT:
				so->so_snd.sb_lowat = *mtod(m, int *);
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat = *mtod(m, int *);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			short val;

			if (m == NULL || m->m_len < sizeof (*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			/* Timeouts are stored in ticks in a short. */
			if (tv->tv_sec * hz + tv->tv_usec / tick > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Give the protocol a chance to see socket-level options too. */
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
			m = NULL;	/* freed by protocol */
		}
	}
bad:
	if (m)
		(void) m_free(m);
	return (error);
}
955 
/*
 * Get a socket option.  On success *mp points to a freshly allocated
 * mbuf holding the value; the caller owns and must free it.
 */
int
sogetopt(so, level, optname, mp)
	register struct socket *so;
	int level, optname;
	struct mbuf **mp;
{
	register struct mbuf *m;

	if (level != SOL_SOCKET) {
		/* Not a socket-level option: hand off to the protocol. */
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_GETOPT, so, level, optname, mp));
		} else
			return (ENOPROTOOPT);
	} else {
		m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		/* Boolean options read directly from so_options bits. */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			/* Reading the error clears it. */
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			/* Convert stored ticks back to a timeval. */
			int val = (optname == SO_SNDTIMEO ?
			     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) / tick;
			break;
		    }

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}
1040 
1041 void
1042 sohasoutofband(so)
1043 	register struct socket *so;
1044 {
1045 	struct proc *p;
1046 
1047 	if (so->so_pgid < 0)
1048 		gsignal(-so->so_pgid, SIGURG);
1049 	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
1050 		psignal(p, SIGURG);
1051 	selwakeup(&so->so_rcv.sb_sel);
1052 }
1053