/*	$NetBSD: uipc_socket.c,v 1.61 2002/01/03 01:16:02 mrg Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.6 (Berkeley) 5/2/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.61 2002/01/03 01:16:02 mrg Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/pool.h>

struct pool	socket_pool;

extern int	somaxconn;			/* patchable (XXX sysctl) */
int		somaxconn = SOMAXCONN;

void
soinit(void)
{

	pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0,
	    "sockpl", 0, NULL, NULL, M_SOCKET);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
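/*
 * Dispatch sketch (illustrative, not itself part of the protocol API):
 * each routine below funnels into the protocol's pr_usrreq entry with a
 * PRU_* request code, in the style of
 *
 *	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND,
 *	    (struct mbuf *)0, nam, (struct mbuf *)0, p);
 *
 * so->so_proto is looked up once, at socket creation time, via
 * pffindproto()/pffindtype().
 */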
/*ARGSUSED*/
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc	*p;
	struct protosw	*prp;
	struct socket	*so;
	int		error, s;

	p = curproc;		/* XXX */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreq == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	s = splsoftnet();
	so = pool_get(&socket_pool, PR_WAITOK);
	memset((caddr_t)so, 0, sizeof(*so));
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	so->so_proto = prp;
	so->so_send = sosend;
	so->so_receive = soreceive;
	if (p != 0)
		so->so_uid = p->p_ucred->cr_uid;
	error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0,
	    (struct mbuf *)(long)proto, (struct mbuf *)0, p);
	if (error) {
		so->so_state |= SS_NOFDREF;
		sofree(so);
		splx(s);
		return (error);
	}
	splx(s);
	*aso = so;
	return (0);
}
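/*
 * Hypothetical caller's view of socreate(), roughly what a system call
 * handler such as sys_socket() would do (error handling abbreviated):
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, 0);
 *	if (error)
 *		return (error);
 *	(so is now attached to its protocol; sobind() etc. may follow)
 */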

int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	int	s, error;

	s = splsoftnet();
	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0,
	    nam, (struct mbuf *)0, p);
	splx(s);
	return (error);
}

int
solisten(struct socket *so, int backlog)
{
	int	s, error;

	s = splsoftnet();
	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, (struct mbuf *)0,
	    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
	if (error) {
		splx(s);
		return (error);
	}
	if (so->so_q.tqh_first == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0)
		backlog = 0;
	so->so_qlimit = min(backlog, somaxconn);
	splx(s);
	return (0);
}
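/*
 * Backlog clamp, by example (assuming the default value of somaxconn):
 * an over-large request is quietly limited rather than rejected, and a
 * negative backlog is treated as zero:
 *
 *	solisten(so, 4096);	==> so->so_qlimit = min(4096, somaxconn)
 *	solisten(so, -1);	==> so->so_qlimit = 0
 */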

void
sofree(struct socket *so)
{

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (so->so_head) {
		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (!soqremque(so, 0))
			return;
	}
	sbrelease(&so->so_snd);
	sorflush(so);
	pool_put(&socket_pool, so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so)
{
	struct socket	*so2;
	int		s, error;

	error = 0;
	s = splsoftnet();		/* conservative */
	if (so->so_options & SO_ACCEPTCONN) {
		while ((so2 = so->so_q0.tqh_first) != 0) {
			(void) soqremque(so2, 0);
			(void) soabort(so2);
		}
		while ((so2 = so->so_q.tqh_first) != 0) {
			(void) soqremque(so2, 1);
			(void) soabort(so2);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
					       PSOCK | PCATCH, netcls,
					       so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
 drop:
	if (so->so_pcb) {
		int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
		    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
		    (struct proc *)0);
		if (error == 0)
			error = error2;
	}
 discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}
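/*
 * The SO_LINGER path above makes close(2) wait for the disconnect.
 * A hypothetical userland sequence that reaches the tsleep() loop:
 *
 *	struct linger l;
 *
 *	l.l_onoff = 1;
 *	l.l_linger = 5;		(block in soclose() for up to 5 seconds)
 *	setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *	close(s);
 */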

/*
 * Must be called at splsoftnet...
 */
int
soabort(struct socket *so)
{

	return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, (struct mbuf *)0,
	    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int	s, error;

	error = 0;
	s = splsoftnet();
	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
		    (struct mbuf *)0, nam, (struct mbuf *)0, (struct proc *)0);
	else
		error = ECONNABORTED;

	splx(s);
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	struct proc	*p;
	int		s, error;

	p = curproc;		/* XXX */
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splsoftnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    (struct mbuf *)0, nam, (struct mbuf *)0, p);
	splx(s);
	return (error);
}
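/*
 * The "null address" disconnect mentioned above is only reachable for
 * connectionless protocols (PR_CONNREQUIRED clear): a second connect
 * first runs sodisconnect() to dissolve the existing association, then
 * issues PRU_CONNECT for the new address.  Sketch:
 *
 *	soconnect(so, nam1);	(set default peer)
 *	soconnect(so, nam2);	(implicit disconnect, then new peer)
 */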

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int	s, error;

	s = splsoftnet();
	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
	    (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0,
	    (struct proc *)0);
	splx(s);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int	s, error;

	s = splsoftnet();
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}
	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
	    (struct proc *)0);
 bad:
	splx(s);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If the send must go all at once and the message is larger than the
 * send buffer, then fail with a hard error (EMSGSIZE).
 * Lock against other senders.
 * If the send must go all at once and there is not enough room now,
 * inform the user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero, otherwise by
 * the mbuf chain "top" (which must be null if uio is not).  Data
 * provided in an mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for
 * short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
	struct mbuf *control, int flags)
{
	struct proc	*p;
	struct mbuf	**mp, *m;
	long		space, len, resid, clen, mlen;
	int		error, s, dontroute, atomic;

	p = curproc;		/* XXX */
	clen = 0;
	atomic = sosendallatonce(so) || top;
	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

 restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	do {
		s = splsoftnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				if (top == 0) {
					MGETHDR(m, M_WAIT, MT_DATA);
					mlen = MHLEN;
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else {
					MGET(m, M_WAIT, MT_DATA);
					mlen = MLEN;
				}
				if (resid >= MINCLSIZE && space >= MCLBYTES) {
					MCLGET(m, M_WAIT);
					if ((m->m_flags & M_EXT) == 0)
						goto nopages;
					mlen = MCLBYTES;
#ifdef	MAPPED_MBUFS
					len = lmin(MCLBYTES, resid);
#else
					if (atomic && top == 0) {
						len = lmin(MCLBYTES - max_hdr,
						    resid);
						m->m_data += max_hdr;
					} else
						len = lmin(MCLBYTES, resid);
#endif
					space -= len;
				} else {
nopages:
					len = lmin(lmin(mlen, resid), space);
					space -= len;
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == 0 && len < mlen)
						MH_ALIGN(m, len);
				}
				error = uiomove(mtod(m, caddr_t), (int)len,
				    uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);

			s = splsoftnet();

			if (so->so_state & SS_CANTSENDMORE)
				snderr(EPIPE);

			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			if (resid > 0)
				so->so_state |= SS_MORETOCOME;
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, addr, control, p);
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			if (resid > 0)
				so->so_state &= ~SS_MORETOCOME;
			splx(s);

			clen = 0;
			control = 0;
			top = 0;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

 release:
	sbunlock(&so->so_snd);
 out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
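/*
 * Callers normally reach sosend() through the so_send hook installed
 * by socreate(); a sendmsg-style path would look roughly like:
 *
 *	error = (*so->so_send)(so, addr, &auio, (struct mbuf *)0,
 *	    control, flags);
 *
 * where addr and control may be nil and auio (a hypothetical caller's
 * variable) describes the user's buffers.
 */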

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
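/*
 * Sketch of the record layout described above; mbufs within a record
 * are linked through m_next, records through m_nextpkt of the first
 * mbuf:
 *
 *	sb_mb -> MT_SONAME -> MT_CONTROL -> MT_DATA -> MT_DATA
 *	            |
 *	        (m_nextpkt)
 *	            v
 *	         MT_SONAME -> MT_DATA -> ...
 */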
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
	struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf	*m, **mp;
	int		flags, len, error, s, offset, moff, type, orig_resid;
	struct protosw	*pr;
	struct mbuf	*nextrecord;

	pr = so->so_proto;
	mp = mp0;
	type = 0;
	orig_resid = uio->uio_resid;
	if (paddr)
		*paddr = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK), (struct mbuf *)0,
		    (struct proc *)0);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
 bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
		    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);

 restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	s = splsoftnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
 dontblock:
#ifdef notyet /* XXXX */
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
#endif
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
					error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0)
			m->m_nextpkt = nextrecord;
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * The sockbuf must be consistent here (sb_mb points to
		 * the current mbuf, whose m_nextpkt points to the next
		 * record) when we drop priority; we must note any
		 * additions to the sockbuf when we block interrupts
		 * again.
		 */
		if (mp == 0) {
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splsoftnet();
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				if (m)
					m->m_nextpkt = nextrecord;
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0)
			so->so_rcv.sb_mb = nextrecord;
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
			    (struct mbuf *)(long)flags, (struct mbuf *)0,
			    (struct proc *)0);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
 release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
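/*
 * As with sosend(), callers go through the hook set in socreate(); a
 * recvmsg-style path would look roughly like (auio, paddr, control and
 * flags being the hypothetical caller's variables):
 *
 *	error = (*so->so_receive)(so, &paddr, &auio,
 *	    (struct mbuf **)0, &control, &flags);
 *
 * with nil pointers passed for the pieces the caller does not want.
 */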

int
soshutdown(struct socket *so, int how)
{
	struct protosw	*pr;

	pr = so->so_proto;
	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how == SHUT_RD || how == SHUT_RDWR)
		sorflush(so);
	if (how == SHUT_WR || how == SHUT_RDWR)
		return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, (struct mbuf *)0,
		    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
	return (0);
}
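/*
 * shutdown(2) mapping, as implemented above: SHUT_RD flushes the
 * receive side via sorflush(), SHUT_WR issues PRU_SHUTDOWN to the
 * protocol, SHUT_RDWR does both.  E.g. a typical userland half-close:
 *
 *	shutdown(s, SHUT_WR);	(no more sends; reads still possible)
 */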

void
sorflush(struct socket *so)
{
	struct sockbuf	*sb, asb;
	struct protosw	*pr;
	int		s;

	sb = &so->so_rcv;
	pr = so->so_proto;
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splnet();
	socantrcvmore(so);
	sbunlock(sb);
	asb = *sb;
	memset((caddr_t)sb, 0, sizeof(*sb));
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}

int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m0)
{
	int		error;
	struct mbuf	*m;

	error = 0;
	m = m0;
	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof(struct linger)) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* fall thru... */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			if (m == NULL || m->m_len < sizeof(int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			int optval;

			if (m == NULL || m->m_len < sizeof(int)) {
				error = EINVAL;
				goto bad;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			optval = *mtod(m, int *);
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) optval) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			short val;

			if (m == NULL || m->m_len < sizeof(*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			/* Reject out-of-range values without overflowing. */
			if (tv->tv_sec > (SHRT_MAX - tv->tv_usec / tick) / hz) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
			m = NULL;	/* freed by protocol */
		}
	}
 bad:
	if (m)
		(void) m_free(m);
	return (error);
}
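/*
 * Kernel-level usage sketch for sosetopt() (hypothetical caller): the
 * option value travels in an mbuf, whose ownership passes to sosetopt()
 * in all cases:
 *
 *	struct mbuf *m;
 *
 *	m = m_get(M_WAIT, MT_SOOPTS);
 *	m->m_len = sizeof(int);
 *	*mtod(m, int *) = 1;
 *	error = sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m);
 */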

int
sogetopt(struct socket *so, int level, int optname, struct mbuf **mp)
{
	struct mbuf	*m;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_GETOPT, so, level, optname, mp));
		} else
			return (ENOPROTOOPT);
	} else {
		m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof(int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof(struct linger);
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			int val = (optname == SO_SNDTIMEO ?
			     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}
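/*
 * Matching sketch for sogetopt() (hypothetical caller): on success the
 * routine hands back a freshly allocated mbuf which the caller frees:
 *
 *	struct mbuf *m;
 *	int type, error;
 *
 *	error = sogetopt(so, SOL_SOCKET, SO_TYPE, &m);
 *	if (error == 0) {
 *		type = *mtod(m, int *);
 *		m_freem(m);
 *	}
 */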

void
sohasoutofband(struct socket *so)
{
	struct proc *p;

	if (so->so_pgid < 0)
		gsignal(-so->so_pgid, SIGURG);
	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
		psignal(p, SIGURG);
	selwakeup(&so->so_rcv.sb_sel);
}
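/*
 * Urgent-data notification, sketched from the process's side: a process
 * (or process group) registered in so_pgid, e.g. via fcntl(F_SETOWN),
 * is sent SIGURG by the code above, and typically reacts like:
 *
 *	fcntl(s, F_SETOWN, getpid());
 *	signal(SIGURG, urg_handler);	(handler may recv(..., MSG_OOB))
 */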