xref: /openbsd-src/sys/kern/uipc_usrreq.c (revision 4c1e55dc91edd6e69ccc60ce855900fbc12cf34f)
1 /*	$OpenBSD: uipc_usrreq.c,v 1.66 2012/04/26 17:18:17 matthew Exp $	*/
2 /*	$NetBSD: uipc_usrreq.c,v 1.18 1996/02/09 19:00:50 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1989, 1991, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 #include <sys/filedesc.h>
39 #include <sys/domain.h>
40 #include <sys/protosw.h>
41 #include <sys/socket.h>
42 #include <sys/socketvar.h>
43 #include <sys/unpcb.h>
44 #include <sys/un.h>
45 #include <sys/namei.h>
46 #include <sys/vnode.h>
47 #include <sys/file.h>
48 #include <sys/stat.h>
49 #include <sys/mbuf.h>
50 
51 /*
52  * Unix communications domain.
53  *
54  * TODO:
55  *	RDM
56  *	rethink name space problems
57  *	need a proper out-of-band
58  */
59 struct	sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };
60 ino_t	unp_ino;			/* prototype for fake inode numbers */
61 
62 /*ARGSUSED*/
63 int
64 uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
65     struct mbuf *control, struct proc *p)
66 {
67 	struct unpcb *unp = sotounpcb(so);
68 	struct socket *so2;
69 	int error = 0;
70 
71 	if (req == PRU_CONTROL)
72 		return (EOPNOTSUPP);
73 	if (req != PRU_SEND && control && control->m_len) {
74 		error = EOPNOTSUPP;
75 		goto release;
76 	}
77 	if (unp == NULL && req != PRU_ATTACH) {
78 		error = EINVAL;
79 		goto release;
80 	}
81 	switch (req) {
82 
83 	case PRU_ATTACH:
84 		if (unp) {
85 			error = EISCONN;
86 			break;
87 		}
88 		error = unp_attach(so);
89 		break;
90 
91 	case PRU_DETACH:
92 		unp_detach(unp);
93 		break;
94 
95 	case PRU_BIND:
96 		error = unp_bind(unp, nam, p);
97 		break;
98 
99 	case PRU_LISTEN:
100 		if (unp->unp_vnode == NULL)
101 			error = EINVAL;
102 		break;
103 
104 	case PRU_CONNECT:
105 		error = unp_connect(so, nam, p);
106 		break;
107 
108 	case PRU_CONNECT2:
109 		error = unp_connect2(so, (struct socket *)nam);
110 		break;
111 
112 	case PRU_DISCONNECT:
113 		unp_disconnect(unp);
114 		break;
115 
116 	case PRU_ACCEPT:
117 		/*
118 		 * Pass back name of connected socket,
119 		 * if it was bound and we are still connected
120 		 * (our peer may have closed already!).
121 		 */
122 		if (unp->unp_conn && unp->unp_conn->unp_addr) {
123 			nam->m_len = unp->unp_conn->unp_addr->m_len;
124 			bcopy(mtod(unp->unp_conn->unp_addr, caddr_t),
125 			    mtod(nam, caddr_t), nam->m_len);
126 		} else {
127 			nam->m_len = sizeof(sun_noname);
128 			*(mtod(nam, struct sockaddr *)) = sun_noname;
129 		}
130 		break;
131 
132 	case PRU_SHUTDOWN:
133 		socantsendmore(so);
134 		unp_shutdown(unp);
135 		break;
136 
137 	case PRU_RCVD:
138 		switch (so->so_type) {
139 
140 		case SOCK_DGRAM:
141 			panic("uipc 1");
142 			/*NOTREACHED*/
143 
144 		case SOCK_STREAM:
145 		case SOCK_SEQPACKET:
146 #define	rcv (&so->so_rcv)
147 #define snd (&so2->so_snd)
148 			if (unp->unp_conn == NULL)
149 				break;
150 			so2 = unp->unp_conn->unp_socket;
151 			/*
152 			 * Adjust backpressure on sender
153 			 * and wakeup any waiting to write.
154 			 */
155 			snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt;
156 			unp->unp_mbcnt = rcv->sb_mbcnt;
157 			snd->sb_hiwat += unp->unp_cc - rcv->sb_cc;
158 			unp->unp_cc = rcv->sb_cc;
159 			sowwakeup(so2);
160 #undef snd
161 #undef rcv
162 			break;
163 
164 		default:
165 			panic("uipc 2");
166 		}
167 		break;
168 
169 	case PRU_SEND:
170 		if (control && (error = unp_internalize(control, p)))
171 			break;
172 		switch (so->so_type) {
173 
174 		case SOCK_DGRAM: {
175 			struct sockaddr *from;
176 
177 			if (nam) {
178 				if (unp->unp_conn) {
179 					error = EISCONN;
180 					break;
181 				}
182 				error = unp_connect(so, nam, p);
183 				if (error)
184 					break;
185 			} else {
186 				if (unp->unp_conn == NULL) {
187 					error = ENOTCONN;
188 					break;
189 				}
190 			}
191 			so2 = unp->unp_conn->unp_socket;
192 			if (unp->unp_addr)
193 				from = mtod(unp->unp_addr, struct sockaddr *);
194 			else
195 				from = &sun_noname;
196 			if (sbappendaddr(&so2->so_rcv, from, m, control)) {
197 				sorwakeup(so2);
198 				m = NULL;
199 				control = NULL;
200 			} else
201 				error = ENOBUFS;
202 			if (nam)
203 				unp_disconnect(unp);
204 			break;
205 		}
206 
207 		case SOCK_STREAM:
208 		case SOCK_SEQPACKET:
209 #define	rcv (&so2->so_rcv)
210 #define	snd (&so->so_snd)
211 			if (so->so_state & SS_CANTSENDMORE) {
212 				error = EPIPE;
213 				break;
214 			}
215 			if (unp->unp_conn == NULL) {
216 				error = ENOTCONN;
217 				break;
218 			}
219 			so2 = unp->unp_conn->unp_socket;
220 			/*
221 			 * Send to paired receive port, and then reduce
222 			 * send buffer hiwater marks to maintain backpressure.
223 			 * Wake up readers.
224 			 */
225 			if (control) {
226 				if (sbappendcontrol(rcv, m, control))
227 					control = NULL;
228 			} else if (so->so_type == SOCK_SEQPACKET)
229 				sbappendrecord(rcv, m);
230 			else
231 				sbappend(rcv, m);
232 			snd->sb_mbmax -=
233 			    rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
234 			unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
235 			snd->sb_hiwat -= rcv->sb_cc - unp->unp_conn->unp_cc;
236 			unp->unp_conn->unp_cc = rcv->sb_cc;
237 			sorwakeup(so2);
238 			m = NULL;
239 #undef snd
240 #undef rcv
241 			break;
242 
243 		default:
244 			panic("uipc 4");
245 		}
246 		/* we need to undo unp_internalize in case of errors */
247 		if (control && error)
248 			unp_dispose(control);
249 		break;
250 
251 	case PRU_ABORT:
252 		unp_drop(unp, ECONNABORTED);
253 		break;
254 
255 	case PRU_SENSE:
256 		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
257 		switch (so->so_type) {
258 		case SOCK_STREAM:
259 		case SOCK_SEQPACKET:
260 			if (unp->unp_conn != NULL) {
261 				so2 = unp->unp_conn->unp_socket;
262 				((struct stat *) m)->st_blksize +=
263 				    so2->so_rcv.sb_cc;
264 			}
265 			break;
266 		default:
267 			break;
268 		}
269 		((struct stat *) m)->st_dev = NODEV;
270 		if (unp->unp_ino == 0)
271 			unp->unp_ino = unp_ino++;
272 		((struct stat *) m)->st_atim =
273 		    ((struct stat *) m)->st_mtim =
274 		    ((struct stat *) m)->st_ctim = unp->unp_ctime;
275 		((struct stat *) m)->st_ino = unp->unp_ino;
276 		return (0);
277 
278 	case PRU_RCVOOB:
279 		return (EOPNOTSUPP);
280 
281 	case PRU_SENDOOB:
282 		error = EOPNOTSUPP;
283 		break;
284 
285 	case PRU_SOCKADDR:
286 		if (unp->unp_addr) {
287 			nam->m_len = unp->unp_addr->m_len;
288 			bcopy(mtod(unp->unp_addr, caddr_t),
289 			    mtod(nam, caddr_t), nam->m_len);
290 		} else
291 			nam->m_len = 0;
292 		break;
293 
294 	case PRU_PEERADDR:
295 		if (unp->unp_conn && unp->unp_conn->unp_addr) {
296 			nam->m_len = unp->unp_conn->unp_addr->m_len;
297 			bcopy(mtod(unp->unp_conn->unp_addr, caddr_t),
298 			    mtod(nam, caddr_t), nam->m_len);
299 		} else
300 			nam->m_len = 0;
301 		break;
302 
303 	case PRU_SLOWTIMO:
304 		break;
305 
306 	default:
307 		panic("piusrreq");
308 	}
309 release:
310 	if (control)
311 		m_freem(control);
312 	if (m)
313 		m_freem(m);
314 	return (error);
315 }
316 
317 /*
318  * Both send and receive buffers are allocated PIPSIZ bytes of buffering
319  * for stream sockets, although the total for sender and receiver is
320  * actually only PIPSIZ.
321  * Datagram sockets really use the sendspace as the maximum datagram size,
322  * and don't really want to reserve the sendspace.  Their recvspace should
323  * be large enough for at least one max-size datagram plus address.
324  */
325 #define	PIPSIZ	4096
326 u_long	unpst_sendspace = PIPSIZ;
327 u_long	unpst_recvspace = PIPSIZ;
328 u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
329 u_long	unpdg_recvspace = 4*1024;
330 
331 int	unp_rights;			/* file descriptors in flight */
332 
333 int
334 unp_attach(struct socket *so)
335 {
336 	struct unpcb *unp;
337 	int error;
338 
339 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
340 		switch (so->so_type) {
341 
342 		case SOCK_STREAM:
343 		case SOCK_SEQPACKET:
344 			error = soreserve(so, unpst_sendspace, unpst_recvspace);
345 			break;
346 
347 		case SOCK_DGRAM:
348 			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
349 			break;
350 
351 		default:
352 			panic("unp_attach");
353 		}
354 		if (error)
355 			return (error);
356 	}
357 	unp = malloc(sizeof(*unp), M_PCB, M_NOWAIT|M_ZERO);
358 	if (unp == NULL)
359 		return (ENOBUFS);
360 	unp->unp_socket = so;
361 	so->so_pcb = unp;
362 	getnanotime(&unp->unp_ctime);
363 	return (0);
364 }
365 
366 void
367 unp_detach(struct unpcb *unp)
368 {
369 
370 	if (unp->unp_vnode) {
371 		unp->unp_vnode->v_socket = NULL;
372 		vrele(unp->unp_vnode);
373 		unp->unp_vnode = NULL;
374 	}
375 	if (unp->unp_conn)
376 		unp_disconnect(unp);
377 	while (unp->unp_refs)
378 		unp_drop(unp->unp_refs, ECONNRESET);
379 	soisdisconnected(unp->unp_socket);
380 	unp->unp_socket->so_pcb = NULL;
381 	m_freem(unp->unp_addr);
382 	if (unp_rights) {
383 		/*
384 		 * Normally the receive buffer is flushed later,
385 		 * in sofree, but if our receive buffer holds references
386 		 * to descriptors that are now garbage, we will dispose
387 		 * of those descriptor references after the garbage collector
388 		 * gets them (resulting in a "panic: closef: count < 0").
389 		 */
390 		sorflush(unp->unp_socket);
391 		free(unp, M_PCB);
392 		unp_gc();
393 	} else
394 		free(unp, M_PCB);
395 }
396 
397 int
398 unp_bind(struct unpcb *unp, struct mbuf *nam, struct proc *p)
399 {
400 	struct sockaddr_un *soun = mtod(nam, struct sockaddr_un *);
401 	struct mbuf *nam2;
402 	struct vnode *vp;
403 	struct vattr vattr;
404 	int error;
405 	struct nameidata nd;
406 	size_t pathlen;
407 
408 	if (unp->unp_vnode != NULL)
409 		return (EINVAL);
410 
411 	if (soun->sun_len > sizeof(struct sockaddr_un) ||
412 	    soun->sun_len < offsetof(struct sockaddr_un, sun_path))
413 		return (EINVAL);
414 	if (soun->sun_family != AF_UNIX)
415 		return (EAFNOSUPPORT);
416 
417 	pathlen = strnlen(soun->sun_path, soun->sun_len -
418 	    offsetof(struct sockaddr_un, sun_path));
419 	if (pathlen == sizeof(soun->sun_path))
420 		return (EINVAL);
421 
422 	nam2 = m_getclr(M_WAITOK, MT_SONAME);
423 	nam2->m_len = sizeof(struct sockaddr_un);
424 	memcpy(mtod(nam2, struct sockaddr_un *), soun,
425 	    offsetof(struct sockaddr_un, sun_path) + pathlen);
426 	/* No need to NUL terminate: m_getclr() returns bzero'd mbufs. */
427 
428 	soun = mtod(nam2, struct sockaddr_un *);
429 
430 	/* Fixup sun_len to keep it in sync with m_len. */
431 	soun->sun_len = nam2->m_len;
432 
433 	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
434 	    soun->sun_path, p);
435 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
436 	if ((error = namei(&nd)) != 0) {
437 		m_freem(nam2);
438 		return (error);
439 	}
440 	vp = nd.ni_vp;
441 	if (vp != NULL) {
442 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
443 		if (nd.ni_dvp == vp)
444 			vrele(nd.ni_dvp);
445 		else
446 			vput(nd.ni_dvp);
447 		vrele(vp);
448 		m_freem(nam2);
449 		return (EADDRINUSE);
450 	}
451 	VATTR_NULL(&vattr);
452 	vattr.va_type = VSOCK;
453 	vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask;
454 	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
455 	if (error) {
456 		m_freem(nam2);
457 		return (error);
458 	}
459 	unp->unp_addr = nam2;
460 	vp = nd.ni_vp;
461 	vp->v_socket = unp->unp_socket;
462 	unp->unp_vnode = vp;
463 	unp->unp_connid.uid = p->p_ucred->cr_uid;
464 	unp->unp_connid.gid = p->p_ucred->cr_gid;
465 	unp->unp_connid.pid = p->p_p->ps_mainproc->p_pid;
466 	unp->unp_flags |= UNP_FEIDSBIND;
467 	VOP_UNLOCK(vp, 0, p);
468 	return (0);
469 }
470 
/*
 * Connect socket so to the socket bound at the path carried in nam.
 *
 * For connection-oriented types a new socket is spawned off the
 * listener with sonewconn() and that one becomes the peer; credentials
 * of both ends are recorded along the way.  Returns 0 or an errno
 * (EAFNOSUPPORT, EINVAL, ENOTSOCK, ECONNREFUSED, EPROTOTYPE, or what
 * namei() / VOP_ACCESS() / unp_connect2() report).
 */
int
unp_connect(struct socket *so, struct mbuf *nam, struct proc *p)
{
	struct sockaddr_un *sun = mtod(nam, struct sockaddr_un *);
	struct nameidata nd;
	struct vnode *vp;
	struct socket *peer, *spawned;
	struct unpcb *self, *listener, *child;
	int error;

	if (sun->sun_family != AF_UNIX)
		return (EAFNOSUPPORT);

	/* Guarantee the path is NUL terminated before handing it to namei. */
	if (nam->m_len > sizeof(struct sockaddr_un))
		return (EINVAL);
	if (nam->m_len < sizeof(struct sockaddr_un)) {
		*(mtod(nam, caddr_t) + nam->m_len) = 0;
	} else if (memchr(sun->sun_path, '\0',
	    sizeof(sun->sun_path)) == NULL) {
		return (EINVAL);
	}

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, sun->sun_path, p);
	error = namei(&nd);
	if (error != 0)
		return (error);
	vp = nd.ni_vp;

	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto done;
	}
	error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p);
	if (error != 0)
		goto done;

	peer = vp->v_socket;
	if (peer == NULL) {
		error = ECONNREFUSED;
		goto done;
	}
	if (so->so_type != peer->so_type) {
		error = EPROTOTYPE;
		goto done;
	}

	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		/* The target must be listening and able to spawn a socket. */
		if ((peer->so_options & SO_ACCEPTCONN) == 0) {
			error = ECONNREFUSED;
			goto done;
		}
		spawned = sonewconn(peer, 0);
		if (spawned == NULL) {
			error = ECONNREFUSED;
			goto done;
		}
		self = sotounpcb(so);
		listener = sotounpcb(peer);
		child = sotounpcb(spawned);

		/* The new socket inherits the listener's bound name ... */
		if (listener->unp_addr)
			child->unp_addr =
			    m_copy(listener->unp_addr, 0, (int)M_COPYALL);
		/* ... and remembers who connected to it. */
		child->unp_connid.uid = p->p_ucred->cr_uid;
		child->unp_connid.gid = p->p_ucred->cr_gid;
		child->unp_connid.pid = p->p_p->ps_mainproc->p_pid;
		child->unp_flags |= UNP_FEIDS;
		peer = spawned;

		/* The connecting side learns who bound the listener. */
		if (listener->unp_flags & UNP_FEIDSBIND) {
			self->unp_connid = listener->unp_connid;
			self->unp_flags |= UNP_FEIDS;
		}
	}
	error = unp_connect2(so, peer);

done:
	vput(vp);
	return (error);
}
537 
538 int
539 unp_connect2(struct socket *so, struct socket *so2)
540 {
541 	struct unpcb *unp = sotounpcb(so);
542 	struct unpcb *unp2;
543 
544 	if (so2->so_type != so->so_type)
545 		return (EPROTOTYPE);
546 	unp2 = sotounpcb(so2);
547 	unp->unp_conn = unp2;
548 	switch (so->so_type) {
549 
550 	case SOCK_DGRAM:
551 		unp->unp_nextref = unp2->unp_refs;
552 		unp2->unp_refs = unp;
553 		soisconnected(so);
554 		break;
555 
556 	case SOCK_STREAM:
557 	case SOCK_SEQPACKET:
558 		unp2->unp_conn = unp;
559 		soisconnected(so);
560 		soisconnected(so2);
561 		break;
562 
563 	default:
564 		panic("unp_connect2");
565 	}
566 	return (0);
567 }
568 
569 void
570 unp_disconnect(struct unpcb *unp)
571 {
572 	struct unpcb *unp2 = unp->unp_conn;
573 
574 	if (unp2 == NULL)
575 		return;
576 	unp->unp_conn = NULL;
577 	switch (unp->unp_socket->so_type) {
578 
579 	case SOCK_DGRAM:
580 		if (unp2->unp_refs == unp)
581 			unp2->unp_refs = unp->unp_nextref;
582 		else {
583 			unp2 = unp2->unp_refs;
584 			for (;;) {
585 				if (unp2 == NULL)
586 					panic("unp_disconnect");
587 				if (unp2->unp_nextref == unp)
588 					break;
589 				unp2 = unp2->unp_nextref;
590 			}
591 			unp2->unp_nextref = unp->unp_nextref;
592 		}
593 		unp->unp_nextref = NULL;
594 		unp->unp_socket->so_state &= ~SS_ISCONNECTED;
595 		break;
596 
597 	case SOCK_STREAM:
598 	case SOCK_SEQPACKET:
599 		soisdisconnected(unp->unp_socket);
600 		unp2->unp_conn = NULL;
601 		soisdisconnected(unp2->unp_socket);
602 		break;
603 	}
604 }
605 
606 void
607 unp_shutdown(struct unpcb *unp)
608 {
609 	struct socket *so;
610 
611 	switch (unp->unp_socket->so_type) {
612 	case SOCK_STREAM:
613 	case SOCK_SEQPACKET:
614 		if (unp->unp_conn && (so = unp->unp_conn->unp_socket))
615 			socantrcvmore(so);
616 		break;
617 	default:
618 		break;
619 	}
620 }
621 
622 void
623 unp_drop(struct unpcb *unp, int errno)
624 {
625 	struct socket *so = unp->unp_socket;
626 
627 	so->so_error = errno;
628 	unp_disconnect(unp);
629 	if (so->so_head) {
630 		so->so_pcb = NULL;
631 		sofree(so);
632 		m_freem(unp->unp_addr);
633 		free(unp, M_PCB);
634 	}
635 }
636 
#ifdef notdef
/* Empty drain hook, kept out of the build by the guard above. */
void
unp_drain(void)
{
}
#endif
643 
644 int
645 unp_externalize(struct mbuf *rights, socklen_t controllen)
646 {
647 	struct proc *p = curproc;		/* XXX */
648 	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
649 	int i, *fdp = NULL;
650 	struct file **rp;
651 	struct file *fp;
652 	int nfds, error = 0;
653 
654 	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
655 	    sizeof(struct file *);
656 	if (controllen < CMSG_ALIGN(sizeof(struct cmsghdr)))
657 		controllen = 0;
658 	else
659 		controllen -= CMSG_ALIGN(sizeof(struct cmsghdr));
660 	if (nfds > controllen / sizeof(int)) {
661 		error = EMSGSIZE;
662 		goto restart;
663 	}
664 
665 	rp = (struct file **)CMSG_DATA(cm);
666 
667 	fdp = malloc(nfds * sizeof(int), M_TEMP, M_WAITOK);
668 
669 	/* Make sure the recipient should be able to see the descriptors.. */
670 	if (p->p_fd->fd_rdir != NULL) {
671 		rp = (struct file **)CMSG_DATA(cm);
672 		for (i = 0; i < nfds; i++) {
673 			fp = *rp++;
674 			/*
675 			 * No to block devices.  If passing a directory,
676 			 * make sure that it is underneath the root.
677 			 */
678 			if (fp->f_type == DTYPE_VNODE) {
679 				struct vnode *vp = (struct vnode *)fp->f_data;
680 
681 				if (vp->v_type == VBLK ||
682 				    (vp->v_type == VDIR &&
683 				    !vn_isunder(vp, p->p_fd->fd_rdir, p))) {
684 					error = EPERM;
685 					break;
686 				}
687 			}
688 		}
689 	}
690 
691 restart:
692 	fdplock(p->p_fd);
693 	if (error != 0) {
694 		rp = ((struct file **)CMSG_DATA(cm));
695 		for (i = 0; i < nfds; i++) {
696 			fp = *rp;
697 			/*
698 			 * zero the pointer before calling unp_discard,
699 			 * since it may end up in unp_gc()..
700 			 */
701 			*rp++ = NULL;
702 			unp_discard(fp);
703 		}
704 		goto out;
705 	}
706 
707 	/*
708 	 * First loop -- allocate file descriptor table slots for the
709 	 * new descriptors.
710 	 */
711 	rp = ((struct file **)CMSG_DATA(cm));
712 	for (i = 0; i < nfds; i++) {
713 		bcopy(rp, &fp, sizeof(fp));
714 		rp++;
715 		if ((error = fdalloc(p, 0, &fdp[i])) != 0) {
716 			/*
717 			 * Back out what we've done so far.
718 			 */
719 			for (--i; i >= 0; i--)
720 				fdremove(p->p_fd, fdp[i]);
721 
722 			if (error == ENOSPC) {
723 				fdexpand(p);
724 				error = 0;
725 			} else {
726 				/*
727 				 * This is the error that has historically
728 				 * been returned, and some callers may
729 				 * expect it.
730 				 */
731 				error = EMSGSIZE;
732 			}
733 			fdpunlock(p->p_fd);
734 			goto restart;
735 		}
736 
737 		/*
738 		 * Make the slot reference the descriptor so that
739 		 * fdalloc() works properly.. We finalize it all
740 		 * in the loop below.
741 		 */
742 		p->p_fd->fd_ofiles[fdp[i]] = fp;
743 	}
744 
745 	/*
746 	 * Now that adding them has succeeded, update all of the
747 	 * descriptor passing state.
748 	 */
749 	rp = (struct file **)CMSG_DATA(cm);
750 	for (i = 0; i < nfds; i++) {
751 		fp = *rp++;
752 		fp->f_msgcount--;
753 		unp_rights--;
754 	}
755 
756 	/*
757 	 * Copy temporary array to message and adjust length, in case of
758 	 * transition from large struct file pointers to ints.
759 	 */
760 	memcpy(CMSG_DATA(cm), fdp, nfds * sizeof(int));
761 	cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
762 	rights->m_len = CMSG_LEN(nfds * sizeof(int));
763  out:
764 	fdpunlock(p->p_fd);
765 	if (fdp)
766 		free(fdp, M_TEMP);
767 	return (error);
768 }
769 
770 int
771 unp_internalize(struct mbuf *control, struct proc *p)
772 {
773 	struct filedesc *fdp = p->p_fd;
774 	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
775 	struct file **rp, *fp;
776 	int i, error;
777 	int nfds, *ip, fd, neededspace;
778 
779 	/*
780 	 * Check for two potential msg_controllen values because
781 	 * IETF stuck their nose in a place it does not belong.
782 	 */
783 	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
784 	    !(cm->cmsg_len == control->m_len ||
785 	    control->m_len == CMSG_ALIGN(cm->cmsg_len)))
786 		return (EINVAL);
787 	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);
788 
789 	if (unp_rights + nfds > maxfiles / 10)
790 		return (EMFILE);
791 
792 	/* Make sure we have room for the struct file pointers */
793 morespace:
794 	neededspace = CMSG_SPACE(nfds * sizeof(struct file *)) -
795 	    control->m_len;
796 	if (neededspace > M_TRAILINGSPACE(control)) {
797 		char *tmp;
798 		/* if we already have a cluster, the message is just too big */
799 		if (control->m_flags & M_EXT)
800 			return (E2BIG);
801 
802 		/* copy cmsg data temporarily out of the mbuf */
803 		tmp = malloc(control->m_len, M_TEMP, M_WAITOK);
804 		memcpy(tmp, mtod(control, caddr_t), control->m_len);
805 
806 		/* allocate a cluster and try again */
807 		MCLGET(control, M_WAIT);
808 		if ((control->m_flags & M_EXT) == 0) {
809 			free(tmp, M_TEMP);
810 			return (ENOBUFS);       /* allocation failed */
811 		}
812 
813 		/* copy the data back into the cluster */
814 		cm = mtod(control, struct cmsghdr *);
815 		memcpy(cm, tmp, control->m_len);
816 		free(tmp, M_TEMP);
817 		goto morespace;
818 	}
819 
820 	/* adjust message & mbuf to note amount of space actually used. */
821 	cm->cmsg_len = CMSG_LEN(nfds * sizeof(struct file *));
822 	control->m_len = CMSG_SPACE(nfds * sizeof(struct file *));
823 
824 	ip = ((int *)CMSG_DATA(cm)) + nfds - 1;
825 	rp = ((struct file **)CMSG_DATA(cm)) + nfds - 1;
826 	for (i = 0; i < nfds; i++) {
827 		bcopy(ip, &fd, sizeof fd);
828 		ip--;
829 		if ((fp = fd_getfile(fdp, fd)) == NULL) {
830 			error = EBADF;
831 			goto fail;
832 		}
833 		if (fp->f_count == LONG_MAX-2 ||
834 		    fp->f_msgcount == LONG_MAX-2) {
835 			error = EDEADLK;
836 			goto fail;
837 		}
838 		/* kq descriptors cannot be copied */
839 		if (fp->f_type == DTYPE_KQUEUE) {
840 			error = EINVAL;
841 			goto fail;
842 		}
843 		bcopy(&fp, rp, sizeof fp);
844 		rp--;
845 		fp->f_count++;
846 		fp->f_msgcount++;
847 		unp_rights++;
848 	}
849 	return (0);
850 fail:
851 	/* Back out what we just did. */
852 	for ( ; i > 0; i--) {
853 		rp++;
854 		bcopy(rp, &fp, sizeof(fp));
855 		fp->f_count--;
856 		fp->f_msgcount--;
857 		unp_rights--;
858 	}
859 
860 	return (error);
861 }
862 
/* State shared between unp_gc() and unp_mark(). */
int	unp_defer;	/* files currently flagged FIF_DEFER */
int	unp_gcing;	/* nonzero while unp_gc() is running */
extern	struct domain unixdomain;
865 
866 void
867 unp_gc(void)
868 {
869 	struct file *fp, *nextfp;
870 	struct socket *so;
871 	struct file **extra_ref, **fpp;
872 	int nunref, i;
873 
874 	if (unp_gcing)
875 		return;
876 	unp_gcing = 1;
877 	unp_defer = 0;
878 	LIST_FOREACH(fp, &filehead, f_list)
879 		fp->f_iflags &= ~(FIF_MARK|FIF_DEFER);
880 	do {
881 		LIST_FOREACH(fp, &filehead, f_list) {
882 			if (fp->f_iflags & FIF_DEFER) {
883 				fp->f_iflags &= ~FIF_DEFER;
884 				unp_defer--;
885 			} else {
886 				if (fp->f_count == 0)
887 					continue;
888 				if (fp->f_iflags & FIF_MARK)
889 					continue;
890 				if (fp->f_count == fp->f_msgcount)
891 					continue;
892 			}
893 			fp->f_iflags |= FIF_MARK;
894 
895 			if (fp->f_type != DTYPE_SOCKET ||
896 			    (so = (struct socket *)fp->f_data) == NULL)
897 				continue;
898 			if (so->so_proto->pr_domain != &unixdomain ||
899 			    (so->so_proto->pr_flags&PR_RIGHTS) == 0)
900 				continue;
901 #ifdef notdef
902 			if (so->so_rcv.sb_flags & SB_LOCK) {
903 				/*
904 				 * This is problematical; it's not clear
905 				 * we need to wait for the sockbuf to be
906 				 * unlocked (on a uniprocessor, at least),
907 				 * and it's also not clear what to do
908 				 * if sbwait returns an error due to receipt
909 				 * of a signal.  If sbwait does return
910 				 * an error, we'll go into an infinite
911 				 * loop.  Delete all of this for now.
912 				 */
913 				(void) sbwait(&so->so_rcv);
914 				goto restart;
915 			}
916 #endif
917 			unp_scan(so->so_rcv.sb_mb, unp_mark, 0);
918 		}
919 	} while (unp_defer);
920 	/*
921 	 * We grab an extra reference to each of the file table entries
922 	 * that are not otherwise accessible and then free the rights
923 	 * that are stored in messages on them.
924 	 *
925 	 * The bug in the original code is a little tricky, so I'll describe
926 	 * what's wrong with it here.
927 	 *
928 	 * It is incorrect to simply unp_discard each entry for f_msgcount
929 	 * times -- consider the case of sockets A and B that contain
930 	 * references to each other.  On a last close of some other socket,
931 	 * we trigger a gc since the number of outstanding rights (unp_rights)
932 	 * is non-zero.  If during the sweep phase the gc code un_discards,
933 	 * we end up doing a (full) closef on the descriptor.  A closef on A
934 	 * results in the following chain.  Closef calls soo_close, which
935 	 * calls soclose.   Soclose calls first (through the switch
936 	 * uipc_usrreq) unp_detach, which re-invokes unp_gc.  Unp_gc simply
937 	 * returns because the previous instance had set unp_gcing, and
938 	 * we return all the way back to soclose, which marks the socket
939 	 * with SS_NOFDREF, and then calls sofree.  Sofree calls sorflush
940 	 * to free up the rights that are queued in messages on the socket A,
941 	 * i.e., the reference on B.  The sorflush calls via the dom_dispose
942 	 * switch unp_dispose, which unp_scans with unp_discard.  This second
943 	 * instance of unp_discard just calls closef on B.
944 	 *
945 	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
946 	 * which results in another closef on A.  Unfortunately, A is already
947 	 * being closed, and the descriptor has already been marked with
948 	 * SS_NOFDREF, and soclose panics at this point.
949 	 *
950 	 * Here, we first take an extra reference to each inaccessible
951 	 * descriptor.  Then, we call sorflush ourself, since we know
952 	 * it is a Unix domain socket anyhow.  After we destroy all the
953 	 * rights carried in messages, we do a last closef to get rid
954 	 * of our extra reference.  This is the last close, and the
955 	 * unp_detach etc will shut down the socket.
956 	 *
957 	 * 91/09/19, bsy@cs.cmu.edu
958 	 */
959 	extra_ref = malloc(nfiles * sizeof(struct file *), M_FILE, M_WAITOK);
960 	for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref;
961 	    fp != NULL; fp = nextfp) {
962 		nextfp = LIST_NEXT(fp, f_list);
963 		if (fp->f_count == 0)
964 			continue;
965 		if (fp->f_count == fp->f_msgcount &&
966 		    !(fp->f_iflags & FIF_MARK)) {
967 			*fpp++ = fp;
968 			nunref++;
969 			FREF(fp);
970 			fp->f_count++;
971 		}
972 	}
973 	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
974 	        if ((*fpp)->f_type == DTYPE_SOCKET && (*fpp)->f_data != NULL)
975 		        sorflush((struct socket *)(*fpp)->f_data);
976 	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
977 		(void) closef(*fpp, NULL);
978 	free((caddr_t)extra_ref, M_FILE);
979 	unp_gcing = 0;
980 }
981 
982 void
983 unp_dispose(struct mbuf *m)
984 {
985 
986 	if (m)
987 		unp_scan(m, unp_discard, 1);
988 }
989 
990 void
991 unp_scan(struct mbuf *m0, void (*op)(struct file *), int discard)
992 {
993 	struct mbuf *m;
994 	struct file **rp, *fp;
995 	struct cmsghdr *cm;
996 	int i;
997 	int qfds;
998 
999 	while (m0) {
1000 		for (m = m0; m; m = m->m_next) {
1001 			if (m->m_type == MT_CONTROL &&
1002 			    m->m_len >= sizeof(*cm)) {
1003 				cm = mtod(m, struct cmsghdr *);
1004 				if (cm->cmsg_level != SOL_SOCKET ||
1005 				    cm->cmsg_type != SCM_RIGHTS)
1006 					continue;
1007 				qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof *cm))
1008 				    / sizeof(struct file *);
1009 				rp = (struct file **)CMSG_DATA(cm);
1010 				for (i = 0; i < qfds; i++) {
1011 					fp = *rp;
1012 					if (discard)
1013 						*rp = 0;
1014 					(*op)(fp);
1015 					rp++;
1016 				}
1017 				break;		/* XXX, but saves time */
1018 			}
1019 		}
1020 		m0 = m0->m_nextpkt;
1021 	}
1022 }
1023 
1024 void
1025 unp_mark(struct file *fp)
1026 {
1027 	if (fp == NULL)
1028 		return;
1029 
1030 	if (fp->f_iflags & (FIF_MARK|FIF_DEFER))
1031 		return;
1032 
1033 	if (fp->f_type == DTYPE_SOCKET) {
1034 		unp_defer++;
1035 		fp->f_iflags |= FIF_DEFER;
1036 	} else {
1037 		fp->f_iflags |= FIF_MARK;
1038 	}
1039 }
1040 
1041 void
1042 unp_discard(struct file *fp)
1043 {
1044 
1045 	if (fp == NULL)
1046 		return;
1047 	FREF(fp);
1048 	fp->f_msgcount--;
1049 	unp_rights--;
1050 	(void) closef(fp, NULL);
1051 }
1052