xref: /openbsd-src/sys/kern/uipc_usrreq.c (revision 03adc85b7600a1f8f04886b8321c1c1c0c4933d4)
1 /*	$OpenBSD: uipc_usrreq.c,v 1.111 2017/01/24 04:09:59 deraadt Exp $	*/
2 /*	$NetBSD: uipc_usrreq.c,v 1.18 1996/02/09 19:00:50 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1989, 1991, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 #include <sys/filedesc.h>
39 #include <sys/domain.h>
40 #include <sys/protosw.h>
41 #include <sys/queue.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/unpcb.h>
45 #include <sys/un.h>
46 #include <sys/namei.h>
47 #include <sys/vnode.h>
48 #include <sys/file.h>
49 #include <sys/stat.h>
50 #include <sys/mbuf.h>
51 #include <sys/task.h>
52 #include <sys/pledge.h>
53 
/* build a sockaddr reply in `nam' from a pcb's bound address (or noname) */
void	uipc_setaddr(const struct unpcb *, struct mbuf *);

/* list of all UNIX domain sockets, for unp_gc() */
LIST_HEAD(unp_head, unpcb) unp_head = LIST_HEAD_INITIALIZER(unp_head);

/* one in-flight passed descriptor: the file and its fd-table flags */
struct fdpass {
	struct file	*fp;
	int		flags;
};

/*
 * Stack of sets of files that were passed over a socket but were
 * not received and need to be closed.
 */
struct	unp_deferral {
	SLIST_ENTRY(unp_deferral)	ud_link;
	int	ud_n;			/* number of entries in ud_fp[] */
	/* followed by ud_n struct fdpass */
	struct fdpass ud_fp[];
};

void	unp_discard(struct fdpass *, int);
void	unp_mark(struct fdpass *, int);
void	unp_scan(struct mbuf *, void (*)(struct fdpass *, int));


/* list of sets of files that were sent over sockets that are now closed */
SLIST_HEAD(,unp_deferral) unp_deferred = SLIST_HEAD_INITIALIZER(unp_deferred);

/* runs unp_gc() from the system task queue to collect in-flight fds */
struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL);


/*
 * Unix communications domain.
 *
 * TODO:
 *	RDM
 *	rethink name space problems
 *	need a proper out-of-band
 */
struct	sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };
ino_t	unp_ino;			/* prototype for fake inode numbers */
96 
97 void
98 uipc_setaddr(const struct unpcb *unp, struct mbuf *nam)
99 {
100 	if (unp != NULL && unp->unp_addr != NULL) {
101 		nam->m_len = unp->unp_addr->m_len;
102 		memcpy(mtod(nam, caddr_t), mtod(unp->unp_addr, caddr_t),
103 		    nam->m_len);
104 	} else {
105 		nam->m_len = sizeof(sun_noname);
106 		memcpy(mtod(nam, struct sockaddr *), &sun_noname,
107 		    nam->m_len);
108 	}
109 }
110 
/*
 * Protocol user-request switch for the UNIX domain.  Dispatches on
 * `req'.  On the common exit path (`release') both `m' and `control'
 * are freed; PRU_SENSE and PRU_RCVOOB return directly and skip that
 * (for PRU_SENSE, `m' is actually a caller-supplied struct stat cast
 * to an mbuf pointer, not a real mbuf).
 */
int
uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control, struct proc *p)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;
	int error = 0;

	if (req == PRU_CONTROL)
		return (EOPNOTSUPP);
	/* control data (SCM_RIGHTS) is only meaningful for PRU_SEND */
	if (req != PRU_SEND && control && control->m_len) {
		error = EOPNOTSUPP;
		goto release;
	}
	/* every request except attach needs an existing pcb */
	if (unp == NULL && req != PRU_ATTACH) {
		error = EINVAL;
		goto release;
	}
	switch (req) {

	case PRU_ATTACH:
		if (unp) {
			error = EISCONN;
			break;
		}
		error = unp_attach(so);
		break;

	case PRU_DETACH:
		unp_detach(unp);
		break;

	case PRU_BIND:
		error = unp_bind(unp, nam, p);
		break;

	case PRU_LISTEN:
		/* listening only makes sense on a bound (named) socket */
		if (unp->unp_vnode == NULL)
			error = EINVAL;
		break;

	case PRU_CONNECT:
		error = unp_connect(so, nam, p);
		break;

	case PRU_CONNECT2:
		/* socketpair(2) path: `nam' smuggles the second socket */
		error = unp_connect2(so, (struct socket *)nam);
		break;

	case PRU_DISCONNECT:
		unp_disconnect(unp);
		break;

	case PRU_ACCEPT:
		/*
		 * Pass back name of connected socket,
		 * if it was bound and we are still connected
		 * (our peer may have closed already!).
		 */
		uipc_setaddr(unp->unp_conn, nam);
		break;

	case PRU_SHUTDOWN:
		socantsendmore(so);
		unp_shutdown(unp);
		break;

	case PRU_RCVD:
		switch (so->so_type) {

		case SOCK_DGRAM:
			/* datagram sockets never generate PRU_RCVD */
			panic("uipc 1");
			/*NOTREACHED*/

		case SOCK_STREAM:
		case SOCK_SEQPACKET:
#define	rcv (&so->so_rcv)
#define snd (&so2->so_snd)
			if (unp->unp_conn == NULL)
				break;
			so2 = unp->unp_conn->unp_socket;
			/*
			 * Adjust backpressure on sender
			 * and wakeup any waiting to write.
			 */
			snd->sb_mbcnt = rcv->sb_mbcnt;
			snd->sb_cc = rcv->sb_cc;
			sowwakeup(so2);
#undef snd
#undef rcv
			break;

		default:
			panic("uipc 2");
		}
		break;

	case PRU_SEND:
		/* convert fds in `control' to file pointers before queuing */
		if (control && (error = unp_internalize(control, p)))
			break;
		switch (so->so_type) {

		case SOCK_DGRAM: {
			struct sockaddr *from;

			if (nam) {
				/* sendto() with an address: temporary connect */
				if (unp->unp_conn) {
					error = EISCONN;
					break;
				}
				error = unp_connect(so, nam, p);
				if (error)
					break;
			} else {
				if (unp->unp_conn == NULL) {
					error = ENOTCONN;
					break;
				}
			}
			so2 = unp->unp_conn->unp_socket;
			if (unp->unp_addr)
				from = mtod(unp->unp_addr, struct sockaddr *);
			else
				from = &sun_noname;
			if (sbappendaddr(&so2->so_rcv, from, m, control)) {
				sorwakeup(so2);
				/* receiver now owns m and control */
				m = NULL;
				control = NULL;
			} else
				error = ENOBUFS;
			/* undo the temporary connect from above */
			if (nam)
				unp_disconnect(unp);
			break;
		}

		case SOCK_STREAM:
		case SOCK_SEQPACKET:
#define	rcv (&so2->so_rcv)
#define	snd (&so->so_snd)
			if (so->so_state & SS_CANTSENDMORE) {
				error = EPIPE;
				break;
			}
			if (unp->unp_conn == NULL) {
				error = ENOTCONN;
				break;
			}
			so2 = unp->unp_conn->unp_socket;
			/*
			 * Send to paired receive port, and then raise
			 * send buffer counts to maintain backpressure.
			 * Wake up readers.
			 */
			if (control) {
				if (sbappendcontrol(rcv, m, control))
					control = NULL;
				else {
					error = ENOBUFS;
					break;
				}
			} else if (so->so_type == SOCK_SEQPACKET)
				sbappendrecord(rcv, m);
			else
				sbappend(rcv, m);
			snd->sb_mbcnt = rcv->sb_mbcnt;
			snd->sb_cc = rcv->sb_cc;
			sorwakeup(so2);
			m = NULL;
#undef snd
#undef rcv
			break;

		default:
			panic("uipc 4");
		}
		/* we need to undo unp_internalize in case of errors */
		if (control && error)
			unp_dispose(control);
		break;

	case PRU_ABORT:
		unp_drop(unp, ECONNABORTED);
		break;

	case PRU_SENSE: {
		/* fstat(2) on the socket: `m' is really a struct stat */
		struct stat *sb = (struct stat *)m;

		sb->st_blksize = so->so_snd.sb_hiwat;
		sb->st_dev = NODEV;
		/* hand out a fake inode number on first use */
		if (unp->unp_ino == 0)
			unp->unp_ino = unp_ino++;
		sb->st_atim.tv_sec =
		    sb->st_mtim.tv_sec =
		    sb->st_ctim.tv_sec = unp->unp_ctime.tv_sec;
		sb->st_atim.tv_nsec =
		    sb->st_mtim.tv_nsec =
		    sb->st_ctim.tv_nsec = unp->unp_ctime.tv_nsec;
		sb->st_ino = unp->unp_ino;
		return (0);
	}

	case PRU_RCVOOB:
		return (EOPNOTSUPP);

	case PRU_SENDOOB:
		error = EOPNOTSUPP;
		break;

	case PRU_SOCKADDR:
		uipc_setaddr(unp, nam);
		break;

	case PRU_PEERADDR:
		uipc_setaddr(unp->unp_conn, nam);
		break;

	case PRU_SLOWTIMO:
		break;

	default:
		panic("piusrreq");
	}
release:
	m_freem(control);
	m_freem(m);
	return (error);
}
338 
/*
 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
 * for stream sockets, although the total for sender and receiver is
 * actually only PIPSIZ.
 * Datagram sockets really use the sendspace as the maximum datagram size,
 * and don't really want to reserve the sendspace.  Their recvspace should
 * be large enough for at least one max-size datagram plus address.
 */
#define	PIPSIZ	4096
u_long	unpst_sendspace = PIPSIZ;	/* stream/seqpacket send buffer */
u_long	unpst_recvspace = PIPSIZ;	/* stream/seqpacket receive buffer */
u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
u_long	unpdg_recvspace = 4*1024;	/* datagram receive buffer */

int	unp_rights;			/* file descriptors in flight */
354 
355 int
356 unp_attach(struct socket *so)
357 {
358 	struct unpcb *unp;
359 	int error;
360 
361 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
362 		switch (so->so_type) {
363 
364 		case SOCK_STREAM:
365 		case SOCK_SEQPACKET:
366 			error = soreserve(so, unpst_sendspace, unpst_recvspace);
367 			break;
368 
369 		case SOCK_DGRAM:
370 			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
371 			break;
372 
373 		default:
374 			panic("unp_attach");
375 		}
376 		if (error)
377 			return (error);
378 	}
379 	unp = malloc(sizeof(*unp), M_PCB, M_NOWAIT|M_ZERO);
380 	if (unp == NULL)
381 		return (ENOBUFS);
382 	unp->unp_socket = so;
383 	so->so_pcb = unp;
384 	getnanotime(&unp->unp_ctime);
385 	LIST_INSERT_HEAD(&unp_head, unp, unp_link);
386 	return (0);
387 }
388 
/*
 * Tear down a unpcb: unhook it from the global list, release any
 * bound vnode, sever our connection and drop any datagram senders
 * still pointing at us, then free the pcb and its address mbuf.
 * If descriptors are still in flight, schedule a GC pass since
 * this close may have completed (or broken) a reference cycle.
 */
void
unp_detach(struct unpcb *unp)
{
	struct vnode *vp;

	LIST_REMOVE(unp, unp_link);
	if (unp->unp_vnode) {
		/* break the vnode->socket back pointer before vrele() */
		unp->unp_vnode->v_socket = NULL;
		vp = unp->unp_vnode;
		unp->unp_vnode = NULL;
		vrele(vp);
	}
	if (unp->unp_conn)
		unp_disconnect(unp);
	/* reset every datagram sender that was connected to us */
	while (!SLIST_EMPTY(&unp->unp_refs))
		unp_drop(SLIST_FIRST(&unp->unp_refs), ECONNRESET);
	soisdisconnected(unp->unp_socket);
	unp->unp_socket->so_pcb = NULL;
	m_freem(unp->unp_addr);
	free(unp, M_PCB, sizeof *unp);
	if (unp_rights)
		task_add(systq, &unp_gc_task);
}
412 
/*
 * Bind a UNIX domain socket to a filesystem pathname by creating a
 * VSOCK vnode for it.  Fails with EINVAL if already bound or if the
 * sockaddr is malformed, EAFNOSUPPORT for a wrong family, and
 * EADDRINUSE if the path already exists (existing nodes are never
 * adopted, see the comment below).
 */
int
unp_bind(struct unpcb *unp, struct mbuf *nam, struct proc *p)
{
	struct sockaddr_un *soun = mtod(nam, struct sockaddr_un *);
	struct mbuf *nam2;
	struct vnode *vp;
	struct vattr vattr;
	int error;
	struct nameidata nd;
	size_t pathlen;

	if (unp->unp_vnode != NULL)
		return (EINVAL);

	/* validate the caller-supplied sockaddr length and family */
	if (soun->sun_len > sizeof(struct sockaddr_un) ||
	    soun->sun_len < offsetof(struct sockaddr_un, sun_path))
		return (EINVAL);
	if (soun->sun_family != AF_UNIX)
		return (EAFNOSUPPORT);

	/* the path must leave room for a terminating NUL in sun_path */
	pathlen = strnlen(soun->sun_path, soun->sun_len -
	    offsetof(struct sockaddr_un, sun_path));
	if (pathlen == sizeof(soun->sun_path))
		return (EINVAL);

	/* keep a private, zero-padded copy of the address */
	nam2 = m_getclr(M_WAITOK, MT_SONAME);
	nam2->m_len = sizeof(struct sockaddr_un);
	memcpy(mtod(nam2, struct sockaddr_un *), soun,
	    offsetof(struct sockaddr_un, sun_path) + pathlen);
	/* No need to NUL terminate: m_getclr() returns zero'd mbufs. */

	soun = mtod(nam2, struct sockaddr_un *);

	/* Fixup sun_len to keep it in sync with m_len. */
	soun->sun_len = nam2->m_len;

	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
	    soun->sun_path, p);
	nd.ni_pledge = PLEDGE_UNIX;
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	if ((error = namei(&nd)) != 0) {
		m_freem(nam2);
		return (error);
	}
	vp = nd.ni_vp;
	if (vp != NULL) {
		/* name already exists: abort the pending create */
		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		m_freem(nam2);
		return (EADDRINUSE);
	}
	/* create the socket node, honoring the process umask */
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask;
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	if (error) {
		m_freem(nam2);
		return (error);
	}
	unp->unp_addr = nam2;
	vp = nd.ni_vp;
	vp->v_socket = unp->unp_socket;
	unp->unp_vnode = vp;
	/* record the binder's identity for peer-credential queries */
	unp->unp_connid.uid = p->p_ucred->cr_uid;
	unp->unp_connid.gid = p->p_ucred->cr_gid;
	unp->unp_connid.pid = p->p_p->ps_pid;
	unp->unp_flags |= UNP_FEIDSBIND;
	VOP_UNLOCK(vp, p);
	return (0);
}
487 
/*
 * Connect `so' to the UNIX domain socket bound at the pathname in
 * `nam'.  For connection-oriented types (PR_CONNREQUIRED) a fresh
 * server-side socket is spawned off the listener with sonewconn()
 * and the connecting process' credentials are attached to it.
 */
int
unp_connect(struct socket *so, struct mbuf *nam, struct proc *p)
{
	struct sockaddr_un *soun = mtod(nam, struct sockaddr_un *);
	struct vnode *vp;
	struct socket *so2, *so3;
	struct unpcb *unp, *unp2, *unp3;
	struct nameidata nd;
	int error, s;

	if (soun->sun_family != AF_UNIX)
		return (EAFNOSUPPORT);

	/* ensure sun_path is NUL terminated within the mbuf data */
	if (nam->m_len < sizeof(struct sockaddr_un))
		*(mtod(nam, caddr_t) + nam->m_len) = 0;
	else if (nam->m_len > sizeof(struct sockaddr_un))
		return (EINVAL);
	else if (memchr(soun->sun_path, '\0', sizeof(soun->sun_path)) == NULL)
		return (EINVAL);

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p);
	nd.ni_pledge = PLEDGE_UNIX;
	if ((error = namei(&nd)) != 0)
		return (error);
	vp = nd.ni_vp;
	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto bad;
	}
	/* connecting requires write permission on the socket node */
	if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0)
		goto bad;
	so2 = vp->v_socket;
	if (so2 == NULL) {
		/* the node exists but nobody is bound to it any more */
		error = ECONNREFUSED;
		goto bad;
	}
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto bad;
	}
	NET_LOCK(s);
	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		/* the target must be listening, and sonewconn() must work */
		if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
		    (so3 = sonewconn(so2, 0)) == 0) {
			error = ECONNREFUSED;
			goto unlock;
		}
		unp = sotounpcb(so);
		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);
		if (unp2->unp_addr)
			unp3->unp_addr =
			    m_copym(unp2->unp_addr, 0, M_COPYALL, M_NOWAIT);
		/* give the spawned server socket our credentials */
		unp3->unp_connid.uid = p->p_ucred->cr_uid;
		unp3->unp_connid.gid = p->p_ucred->cr_gid;
		unp3->unp_connid.pid = p->p_p->ps_pid;
		unp3->unp_flags |= UNP_FEIDS;
		/* from here on we connect to the spawned socket */
		so2 = so3;
		if (unp2->unp_flags & UNP_FEIDSBIND) {
			/* listener bound with creds: copy them for our side */
			unp->unp_connid = unp2->unp_connid;
			unp->unp_flags |= UNP_FEIDS;
		}
	}
	error = unp_connect2(so, so2);
unlock:
	NET_UNLOCK(s);
bad:
	vput(vp);
	return (error);
}
558 
559 int
560 unp_connect2(struct socket *so, struct socket *so2)
561 {
562 	struct unpcb *unp = sotounpcb(so);
563 	struct unpcb *unp2;
564 
565 	if (so2->so_type != so->so_type)
566 		return (EPROTOTYPE);
567 	unp2 = sotounpcb(so2);
568 	unp->unp_conn = unp2;
569 	switch (so->so_type) {
570 
571 	case SOCK_DGRAM:
572 		SLIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_nextref);
573 		soisconnected(so);
574 		break;
575 
576 	case SOCK_STREAM:
577 	case SOCK_SEQPACKET:
578 		unp2->unp_conn = unp;
579 		soisconnected(so);
580 		soisconnected(so2);
581 		break;
582 
583 	default:
584 		panic("unp_connect2");
585 	}
586 	return (0);
587 }
588 
589 void
590 unp_disconnect(struct unpcb *unp)
591 {
592 	struct unpcb *unp2 = unp->unp_conn;
593 
594 	if (unp2 == NULL)
595 		return;
596 	unp->unp_conn = NULL;
597 	switch (unp->unp_socket->so_type) {
598 
599 	case SOCK_DGRAM:
600 		SLIST_REMOVE(&unp2->unp_refs, unp, unpcb, unp_nextref);
601 		unp->unp_socket->so_state &= ~SS_ISCONNECTED;
602 		break;
603 
604 	case SOCK_STREAM:
605 	case SOCK_SEQPACKET:
606 		unp->unp_socket->so_snd.sb_mbcnt = 0;
607 		unp->unp_socket->so_snd.sb_cc = 0;
608 		soisdisconnected(unp->unp_socket);
609 		unp2->unp_conn = NULL;
610 		unp2->unp_socket->so_snd.sb_mbcnt = 0;
611 		unp2->unp_socket->so_snd.sb_cc = 0;
612 		soisdisconnected(unp2->unp_socket);
613 		break;
614 	}
615 }
616 
617 void
618 unp_shutdown(struct unpcb *unp)
619 {
620 	struct socket *so;
621 
622 	switch (unp->unp_socket->so_type) {
623 	case SOCK_STREAM:
624 	case SOCK_SEQPACKET:
625 		if (unp->unp_conn && (so = unp->unp_conn->unp_socket))
626 			socantrcvmore(so);
627 		break;
628 	default:
629 		break;
630 	}
631 }
632 
/*
 * Abort the connection on `unp', reporting `errno' via the owning
 * socket's so_error.  If the socket is still sitting on a listener's
 * queue (so_head != NULL) no process can ever reach it again, so
 * detach and free the pcb and its address here; sofree() disposes
 * of the socket itself.
 */
void
unp_drop(struct unpcb *unp, int errno)
{
	struct socket *so = unp->unp_socket;

	so->so_error = errno;
	unp_disconnect(unp);
	if (so->so_head) {
		/* unreachable embryonic socket: free pcb and socket now */
		so->so_pcb = NULL;
		sofree(so);
		m_freem(unp->unp_addr);
		free(unp, M_PCB, sizeof *unp);
	}
}
647 
648 #ifdef notdef
649 unp_drain(void)
650 {
651 
652 }
653 #endif
654 
655 extern	struct domain unixdomain;
656 
657 static struct unpcb *
658 fptounp(struct file *fp)
659 {
660 	struct socket *so;
661 
662 	if (fp->f_type != DTYPE_SOCKET)
663 		return (NULL);
664 	if ((so = fp->f_data) == NULL)
665 		return (NULL);
666 	if (so->so_proto->pr_domain != &unixdomain)
667 		return (NULL);
668 	return (sotounpcb(so));
669 }
670 
/*
 * Deliver an SCM_RIGHTS message to the receiving process: convert
 * the struct fdpass array built by unp_internalize() into freshly
 * allocated descriptor numbers in curproc's fd table, rewriting the
 * control mbuf in place (fdpass entries shrink to plain ints).  On
 * any error the passed files are handed to unp_discard() so their
 * in-flight references are released.
 */
int
unp_externalize(struct mbuf *rights, socklen_t controllen, int flags)
{
	struct proc *p = curproc;		/* XXX */
	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
	int i, *fdp = NULL;
	struct fdpass *rp;
	struct file *fp;
	int nfds, error = 0;

	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
	    sizeof(struct fdpass);
	if (controllen < CMSG_ALIGN(sizeof(struct cmsghdr)))
		controllen = 0;
	else
		controllen -= CMSG_ALIGN(sizeof(struct cmsghdr));
	/* refuse if the receiver's control buffer cannot hold all fds */
	if (nfds > controllen / sizeof(int)) {
		error = EMSGSIZE;
		/* fdp is still NULL: jump straight to the discard path */
		goto restart;
	}

	/* Make sure the recipient should be able to see the descriptors.. */
	rp = (struct fdpass *)CMSG_DATA(cm);
	for (i = 0; i < nfds; i++) {
		fp = rp->fp;
		rp++;
		error = pledge_recvfd(p, fp);
		if (error)
			break;

		/*
		 * No to block devices.  If passing a directory,
		 * make sure that it is underneath the root.
		 */
		if (p->p_fd->fd_rdir != NULL && fp->f_type == DTYPE_VNODE) {
			struct vnode *vp = (struct vnode *)fp->f_data;

			if (vp->v_type == VBLK ||
			    (vp->v_type == VDIR &&
			    !vn_isunder(vp, p->p_fd->fd_rdir, p))) {
				error = EPERM;
				break;
			}
		}
	}

	fdp = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK);

restart:
	fdplock(p->p_fd);
	if (error != 0) {
		/* drop the in-flight reference on every passed file */
		if (nfds > 0) {
			rp = ((struct fdpass *)CMSG_DATA(cm));
			unp_discard(rp, nfds);
		}
		goto out;
	}

	/*
	 * First loop -- allocate file descriptor table slots for the
	 * new descriptors.
	 */
	rp = ((struct fdpass *)CMSG_DATA(cm));
	for (i = 0; i < nfds; i++) {
		if ((error = fdalloc(p, 0, &fdp[i])) != 0) {
			/*
			 * Back out what we've done so far.
			 */
			for (--i; i >= 0; i--)
				fdremove(p->p_fd, fdp[i]);

			if (error == ENOSPC) {
				/* grow the table and retry from scratch */
				fdexpand(p);
				error = 0;
			} else {
				/*
				 * This is the error that has historically
				 * been returned, and some callers may
				 * expect it.
				 */
				error = EMSGSIZE;
			}
			fdpunlock(p->p_fd);
			goto restart;
		}

		/*
		 * Make the slot reference the descriptor so that
		 * fdalloc() works properly.. We finalize it all
		 * in the loop below.
		 */
		p->p_fd->fd_ofiles[fdp[i]] = rp->fp;
		p->p_fd->fd_ofileflags[fdp[i]] = (rp->flags & UF_PLEDGED);
		rp++;

		if (flags & MSG_CMSG_CLOEXEC)
			p->p_fd->fd_ofileflags[fdp[i]] |= UF_EXCLOSE;
	}

	/*
	 * Now that adding them has succeeded, update all of the
	 * descriptor passing state.
	 */
	rp = (struct fdpass *)CMSG_DATA(cm);
	for (i = 0; i < nfds; i++) {
		struct unpcb *unp;

		fp = rp->fp;
		rp++;
		if ((unp = fptounp(fp)) != NULL)
			unp->unp_msgcount--;
		unp_rights--;
	}

	/*
	 * Copy temporary array to message and adjust length, in case of
	 * transition from large struct file pointers to ints.
	 */
	memcpy(CMSG_DATA(cm), fdp, nfds * sizeof(int));
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
	rights->m_len = CMSG_LEN(nfds * sizeof(int));
 out:
	fdpunlock(p->p_fd);
	if (fdp)
		free(fdp, M_TEMP, nfds * sizeof(int));
	return (error);
}
798 
799 int
800 unp_internalize(struct mbuf *control, struct proc *p)
801 {
802 	struct filedesc *fdp = p->p_fd;
803 	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
804 	struct fdpass *rp;
805 	struct file *fp;
806 	struct unpcb *unp;
807 	int i, error;
808 	int nfds, *ip, fd, neededspace;
809 
810 	/*
811 	 * Check for two potential msg_controllen values because
812 	 * IETF stuck their nose in a place it does not belong.
813 	 */
814 	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
815 	    !(cm->cmsg_len == control->m_len ||
816 	    control->m_len == CMSG_ALIGN(cm->cmsg_len)))
817 		return (EINVAL);
818 	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);
819 
820 	if (unp_rights + nfds > maxfiles / 10)
821 		return (EMFILE);
822 
823 	/* Make sure we have room for the struct file pointers */
824 morespace:
825 	neededspace = CMSG_SPACE(nfds * sizeof(struct fdpass)) -
826 	    control->m_len;
827 	if (neededspace > M_TRAILINGSPACE(control)) {
828 		char *tmp;
829 		/* if we already have a cluster, the message is just too big */
830 		if (control->m_flags & M_EXT)
831 			return (E2BIG);
832 
833 		/* copy cmsg data temporarily out of the mbuf */
834 		tmp = malloc(control->m_len, M_TEMP, M_WAITOK);
835 		memcpy(tmp, mtod(control, caddr_t), control->m_len);
836 
837 		/* allocate a cluster and try again */
838 		MCLGET(control, M_WAIT);
839 		if ((control->m_flags & M_EXT) == 0) {
840 			free(tmp, M_TEMP, control->m_len);
841 			return (ENOBUFS);       /* allocation failed */
842 		}
843 
844 		/* copy the data back into the cluster */
845 		cm = mtod(control, struct cmsghdr *);
846 		memcpy(cm, tmp, control->m_len);
847 		free(tmp, M_TEMP, control->m_len);
848 		goto morespace;
849 	}
850 
851 	/* adjust message & mbuf to note amount of space actually used. */
852 	cm->cmsg_len = CMSG_LEN(nfds * sizeof(struct fdpass));
853 	control->m_len = CMSG_SPACE(nfds * sizeof(struct fdpass));
854 
855 	ip = ((int *)CMSG_DATA(cm)) + nfds - 1;
856 	rp = ((struct fdpass *)CMSG_DATA(cm)) + nfds - 1;
857 	for (i = 0; i < nfds; i++) {
858 		memcpy(&fd, ip, sizeof fd);
859 		ip--;
860 		if ((fp = fd_getfile(fdp, fd)) == NULL) {
861 			error = EBADF;
862 			goto fail;
863 		}
864 		if (fp->f_count == LONG_MAX-2) {
865 			error = EDEADLK;
866 			goto fail;
867 		}
868 		error = pledge_sendfd(p, fp);
869 		if (error)
870 			goto fail;
871 
872 		/* kqueue descriptors cannot be copied */
873 		if (fp->f_type == DTYPE_KQUEUE) {
874 			error = EINVAL;
875 			goto fail;
876 		}
877 		rp->fp = fp;
878 		rp->flags = fdp->fd_ofileflags[fd] & UF_PLEDGED;
879 		rp--;
880 		fp->f_count++;
881 		if ((unp = fptounp(fp)) != NULL) {
882 			unp->unp_file = fp;
883 			unp->unp_msgcount++;
884 		}
885 		unp_rights++;
886 	}
887 	return (0);
888 fail:
889 	/* Back out what we just did. */
890 	for ( ; i > 0; i--) {
891 		rp++;
892 		fp = rp->fp;
893 		fp->f_count--;
894 		if ((unp = fptounp(fp)) != NULL)
895 			unp->unp_msgcount--;
896 		unp_rights--;
897 	}
898 
899 	return (error);
900 }
901 
902 int	unp_defer, unp_gcing;
903 
904 void
905 unp_gc(void *arg __unused)
906 {
907 	struct unp_deferral *defer;
908 	struct file *fp;
909 	struct socket *so;
910 	struct unpcb *unp;
911 	int nunref, i;
912 
913 	if (unp_gcing)
914 		return;
915 	unp_gcing = 1;
916 
917 	/* close any fds on the deferred list */
918 	while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) {
919 		SLIST_REMOVE_HEAD(&unp_deferred, ud_link);
920 		for (i = 0; i < defer->ud_n; i++) {
921 			fp = defer->ud_fp[i].fp;
922 			if (fp == NULL)
923 				continue;
924 			FREF(fp);
925 			if ((unp = fptounp(fp)) != NULL)
926 				unp->unp_msgcount--;
927 			unp_rights--;
928 			(void) closef(fp, NULL);
929 		}
930 		free(defer, M_TEMP, sizeof(*defer) +
931 		    sizeof(struct fdpass) * defer->ud_n);
932 	}
933 
934 	unp_defer = 0;
935 	LIST_FOREACH(unp, &unp_head, unp_link)
936 		unp->unp_flags &= ~(UNP_GCMARK | UNP_GCDEFER | UNP_GCDEAD);
937 	do {
938 		nunref = 0;
939 		LIST_FOREACH(unp, &unp_head, unp_link) {
940 			if (unp->unp_flags & UNP_GCDEFER) {
941 				/*
942 				 * This socket is referenced by another
943 				 * socket which is known to be live,
944 				 * so it's certainly live.
945 				 */
946 				unp->unp_flags &= ~UNP_GCDEFER;
947 				unp_defer--;
948 			} else if (unp->unp_flags & UNP_GCMARK) {
949 				/* marked as live in previous pass */
950 				continue;
951 			} else if ((fp = unp->unp_file) == NULL) {
952 				/* not being passed, so can't be in loop */
953 			} else if (fp->f_count == 0) {
954 				/*
955 				 * Already being closed, let normal close
956 				 * path take its course
957 				 */
958 			} else {
959 				/*
960 				 * Unreferenced by other sockets so far,
961 				 * so if all the references (f_count) are
962 				 * from passing (unp_msgcount) then this
963 				 * socket is prospectively dead
964 				 */
965 				if (fp->f_count == unp->unp_msgcount) {
966 					nunref++;
967 					unp->unp_flags |= UNP_GCDEAD;
968 					continue;
969 				}
970 			}
971 
972 			/*
973 			 * This is the first time we've seen this socket on
974 			 * the mark pass and known it has a live reference,
975 			 * so mark it, then scan its receive buffer for
976 			 * sockets and note them as deferred (== referenced,
977 			 * but not yet marked).
978 			 */
979 			unp->unp_flags |= UNP_GCMARK;
980 
981 			so = unp->unp_socket;
982 #ifdef notdef
983 			if (so->so_rcv.sb_flags & SB_LOCK) {
984 				/*
985 				 * This is problematical; it's not clear
986 				 * we need to wait for the sockbuf to be
987 				 * unlocked (on a uniprocessor, at least),
988 				 * and it's also not clear what to do
989 				 * if sbwait returns an error due to receipt
990 				 * of a signal.  If sbwait does return
991 				 * an error, we'll go into an infinite
992 				 * loop.  Delete all of this for now.
993 				 */
994 				(void) sbwait(&so->so_rcv);
995 				goto restart;
996 			}
997 #endif
998 			unp_scan(so->so_rcv.sb_mb, unp_mark);
999 		}
1000 	} while (unp_defer);
1001 
1002 	/*
1003 	 * If there are any unreferenced sockets, then for each dispose
1004 	 * of files in its receive buffer and then close it.
1005 	 */
1006 	if (nunref) {
1007 		LIST_FOREACH(unp, &unp_head, unp_link) {
1008 			if (unp->unp_flags & UNP_GCDEAD)
1009 				unp_scan(unp->unp_socket->so_rcv.sb_mb,
1010 				    unp_discard);
1011 		}
1012 	}
1013 	unp_gcing = 0;
1014 }
1015 
1016 void
1017 unp_dispose(struct mbuf *m)
1018 {
1019 
1020 	if (m)
1021 		unp_scan(m, unp_discard);
1022 }
1023 
1024 void
1025 unp_scan(struct mbuf *m0, void (*op)(struct fdpass *, int))
1026 {
1027 	struct mbuf *m;
1028 	struct fdpass *rp;
1029 	struct cmsghdr *cm;
1030 	int qfds;
1031 
1032 	while (m0) {
1033 		for (m = m0; m; m = m->m_next) {
1034 			if (m->m_type == MT_CONTROL &&
1035 			    m->m_len >= sizeof(*cm)) {
1036 				cm = mtod(m, struct cmsghdr *);
1037 				if (cm->cmsg_level != SOL_SOCKET ||
1038 				    cm->cmsg_type != SCM_RIGHTS)
1039 					continue;
1040 				qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof *cm))
1041 				    / sizeof(struct fdpass);
1042 				if (qfds > 0) {
1043 					rp = (struct fdpass *)CMSG_DATA(cm);
1044 					op(rp, qfds);
1045 				}
1046 				break;		/* XXX, but saves time */
1047 			}
1048 		}
1049 		m0 = m0->m_nextpkt;
1050 	}
1051 }
1052 
1053 void
1054 unp_mark(struct fdpass *rp, int nfds)
1055 {
1056 	struct unpcb *unp;
1057 	int i;
1058 
1059 	for (i = 0; i < nfds; i++) {
1060 		if (rp[i].fp == NULL)
1061 			continue;
1062 
1063 		unp = fptounp(rp[i].fp);
1064 		if (unp == NULL)
1065 			continue;
1066 
1067 		if (unp->unp_flags & (UNP_GCMARK|UNP_GCDEFER))
1068 			continue;
1069 
1070 		unp_defer++;
1071 		unp->unp_flags |= UNP_GCDEFER;
1072 		unp->unp_flags &= ~UNP_GCDEAD;
1073 	}
1074 }
1075 
1076 void
1077 unp_discard(struct fdpass *rp, int nfds)
1078 {
1079 	struct unp_deferral *defer;
1080 
1081 	/* copy the file pointers to a deferral structure */
1082 	defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
1083 	defer->ud_n = nfds;
1084 	memcpy(&defer->ud_fp[0], rp, sizeof(*rp) * nfds);
1085 	memset(rp, 0, sizeof(*rp) * nfds);
1086 	SLIST_INSERT_HEAD(&unp_deferred, defer, ud_link);
1087 
1088 	task_add(systq, &unp_gc_task);
1089 }
1090