1 /*	$OpenBSD: uipc_usrreq.c,v 1.116 2017/02/14 09:46:21 mpi Exp $	*/
2 /*	$NetBSD: uipc_usrreq.c,v 1.18 1996/02/09 19:00:50 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1989, 1991, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 #include <sys/filedesc.h>
39 #include <sys/domain.h>
40 #include <sys/protosw.h>
41 #include <sys/queue.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/unpcb.h>
45 #include <sys/un.h>
46 #include <sys/namei.h>
47 #include <sys/vnode.h>
48 #include <sys/file.h>
49 #include <sys/stat.h>
50 #include <sys/mbuf.h>
51 #include <sys/task.h>
52 #include <sys/pledge.h>
53 
54 void	uipc_setaddr(const struct unpcb *, struct mbuf *);
55 
56 /* list of all UNIX domain sockets, for unp_gc() */
57 LIST_HEAD(unp_head, unpcb) unp_head = LIST_HEAD_INITIALIZER(unp_head);
58 
59 /*
60  * Stack of sets of files that were passed over a socket but were
61  * not received and need to be closed.
62  */
63 struct	unp_deferral {
64 	SLIST_ENTRY(unp_deferral)	ud_link;
65 	int	ud_n;
66 	/* followed by ud_n struct fdpass */
67 	struct fdpass ud_fp[];
68 };
69 
70 void	unp_discard(struct fdpass *, int);
71 void	unp_mark(struct fdpass *, int);
72 void	unp_scan(struct mbuf *, void (*)(struct fdpass *, int));
73 
74 
75 /* list of sets of files that were sent over sockets that are now closed */
76 SLIST_HEAD(,unp_deferral) unp_deferred = SLIST_HEAD_INITIALIZER(unp_deferred);
77 
78 struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL);
79 
80 
81 /*
82  * Unix communications domain.
83  *
84  * TODO:
85  *	RDM
86  *	rethink name space problems
87  *	need a proper out-of-band
88  */
89 struct	sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };
90 ino_t	unp_ino;			/* prototype for fake inode numbers */
91 
92 void
93 uipc_setaddr(const struct unpcb *unp, struct mbuf *nam)
94 {
95 	if (unp != NULL && unp->unp_addr != NULL) {
96 		nam->m_len = unp->unp_addr->m_len;
97 		memcpy(mtod(nam, caddr_t), mtod(unp->unp_addr, caddr_t),
98 		    nam->m_len);
99 	} else {
100 		nam->m_len = sizeof(sun_noname);
101 		memcpy(mtod(nam, struct sockaddr *), &sun_noname,
102 		    nam->m_len);
103 	}
104 }
105 
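/*
 * Protocol user-request handler for the AF_UNIX domain.  The socket
 * layer funnels all PRU_* requests (attach, bind, connect, send,
 * shutdown, ...) through this single entry point, which dispatches
 * them to the unp_* routines below.  Control messages are only
 * accepted together with PRU_SEND.
 */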
106 int
107 uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
108     struct mbuf *control, struct proc *p)
109 {
110 	struct unpcb *unp = sotounpcb(so);
111 	struct socket *so2;
112 	int error = 0;
113 
114 	if (req == PRU_CONTROL)
115 		return (EOPNOTSUPP);
116 	if (req != PRU_SEND && control && control->m_len) {
117 		error = EOPNOTSUPP;
118 		goto release;
119 	}
120 	if (unp == NULL && req != PRU_ATTACH) {
121 		error = EINVAL;
122 		goto release;
123 	}
124 
125 	NET_ASSERT_UNLOCKED();
126 
127 	switch (req) {
128 
129 	case PRU_ATTACH:
130 		if (unp) {
131 			error = EISCONN;
132 			break;
133 		}
134 		error = unp_attach(so);
135 		break;
136 
137 	case PRU_DETACH:
138 		unp_detach(unp);
139 		break;
140 
141 	case PRU_BIND:
142 		error = unp_bind(unp, nam, p);
143 		break;
144 
145 	case PRU_LISTEN:
146 		if (unp->unp_vnode == NULL)
147 			error = EINVAL;
148 		break;
149 
150 	case PRU_CONNECT:
151 		error = unp_connect(so, nam, p);
152 		break;
153 
154 	case PRU_CONNECT2:
155 		error = unp_connect2(so, (struct socket *)nam);
156 		break;
157 
158 	case PRU_DISCONNECT:
159 		unp_disconnect(unp);
160 		break;
161 
162 	case PRU_ACCEPT:
163 		/*
164 		 * Pass back name of connected socket,
165 		 * if it was bound and we are still connected
166 		 * (our peer may have closed already!).
167 		 */
168 		uipc_setaddr(unp->unp_conn, nam);
169 		break;
170 
171 	case PRU_SHUTDOWN:
172 		socantsendmore(so);
173 		unp_shutdown(unp);
174 		break;
175 
176 	case PRU_RCVD:
177 		switch (so->so_type) {
178 
179 		case SOCK_DGRAM:
180 			panic("uipc 1");
181 			/*NOTREACHED*/
182 
183 		case SOCK_STREAM:
184 		case SOCK_SEQPACKET:
185 #define	rcv (&so->so_rcv)
186 #define snd (&so2->so_snd)
187 			if (unp->unp_conn == NULL)
188 				break;
189 			so2 = unp->unp_conn->unp_socket;
190 			/*
191 			 * Adjust backpressure on sender
192 			 * and wake up anyone waiting to write.
193 			 */
194 			snd->sb_mbcnt = rcv->sb_mbcnt;
195 			snd->sb_cc = rcv->sb_cc;
196 			sowwakeup(so2);
197 #undef snd
198 #undef rcv
199 			break;
200 
201 		default:
202 			panic("uipc 2");
203 		}
204 		break;
205 
206 	case PRU_SEND:
207 		if (control && (error = unp_internalize(control, p)))
208 			break;
209 		switch (so->so_type) {
210 
211 		case SOCK_DGRAM: {
212 			struct sockaddr *from;
213 
214 			if (nam) {
215 				if (unp->unp_conn) {
216 					error = EISCONN;
217 					break;
218 				}
219 				error = unp_connect(so, nam, p);
220 				if (error)
221 					break;
222 			} else {
223 				if (unp->unp_conn == NULL) {
224 					error = ENOTCONN;
225 					break;
226 				}
227 			}
228 			so2 = unp->unp_conn->unp_socket;
229 			if (unp->unp_addr)
230 				from = mtod(unp->unp_addr, struct sockaddr *);
231 			else
232 				from = &sun_noname;
233 			if (sbappendaddr(&so2->so_rcv, from, m, control)) {
234 				sorwakeup(so2);
235 				m = NULL;
236 				control = NULL;
237 			} else
238 				error = ENOBUFS;
239 			if (nam)
240 				unp_disconnect(unp);
241 			break;
242 		}
243 
244 		case SOCK_STREAM:
245 		case SOCK_SEQPACKET:
246 #define	rcv (&so2->so_rcv)
247 #define	snd (&so->so_snd)
248 			if (so->so_state & SS_CANTSENDMORE) {
249 				error = EPIPE;
250 				break;
251 			}
252 			if (unp->unp_conn == NULL) {
253 				error = ENOTCONN;
254 				break;
255 			}
256 			so2 = unp->unp_conn->unp_socket;
257 			/*
258 			 * Send to paired receive port, and then raise
259 			 * send buffer counts to maintain backpressure.
260 			 * Wake up readers.
261 			 */
262 			if (control) {
263 				if (sbappendcontrol(rcv, m, control))
264 					control = NULL;
265 				else {
266 					error = ENOBUFS;
267 					break;
268 				}
269 			} else if (so->so_type == SOCK_SEQPACKET)
270 				sbappendrecord(rcv, m);
271 			else
272 				sbappend(rcv, m);
273 			snd->sb_mbcnt = rcv->sb_mbcnt;
274 			snd->sb_cc = rcv->sb_cc;
275 			sorwakeup(so2);
276 			m = NULL;
277 #undef snd
278 #undef rcv
279 			break;
280 
281 		default:
282 			panic("uipc 4");
283 		}
284 		/* we need to undo unp_internalize in case of errors */
285 		if (control && error)
286 			unp_dispose(control);
287 		break;
288 
289 	case PRU_ABORT:
290 		unp_drop(unp, ECONNABORTED);
291 		break;
292 
293 	case PRU_SENSE: {
294 		struct stat *sb = (struct stat *)m;
295 
296 		sb->st_blksize = so->so_snd.sb_hiwat;
297 		sb->st_dev = NODEV;
298 		if (unp->unp_ino == 0)
299 			unp->unp_ino = unp_ino++;
300 		sb->st_atim.tv_sec =
301 		    sb->st_mtim.tv_sec =
302 		    sb->st_ctim.tv_sec = unp->unp_ctime.tv_sec;
303 		sb->st_atim.tv_nsec =
304 		    sb->st_mtim.tv_nsec =
305 		    sb->st_ctim.tv_nsec = unp->unp_ctime.tv_nsec;
306 		sb->st_ino = unp->unp_ino;
307 		return (0);
308 	}
309 
310 	case PRU_RCVOOB:
311 		return (EOPNOTSUPP);
312 
313 	case PRU_SENDOOB:
314 		error = EOPNOTSUPP;
315 		break;
316 
317 	case PRU_SOCKADDR:
318 		uipc_setaddr(unp, nam);
319 		break;
320 
321 	case PRU_PEERADDR:
322 		uipc_setaddr(unp->unp_conn, nam);
323 		break;
324 
325 	case PRU_SLOWTIMO:
326 		break;
327 
328 	default:
329 		panic("uipc_usrreq");
330 	}
331 release:
332 	m_freem(control);
333 	m_freem(m);
334 	return (error);
335 }
336 
337 /*
338  * For stream sockets the send and receive buffers are each reserved
339  * PIPSIZ bytes, but since the sender's counters mirror the peer's
340  * receive buffer, the effective buffering per direction is only PIPSIZ.
341  * Datagram sockets use the sendspace only as the maximum datagram
342  * size and need no real send buffering; their recvspace should hold
343  * at least one max-size datagram plus address.
344  */
345 #define	PIPSIZ	4096
346 u_long	unpst_sendspace = PIPSIZ;
347 u_long	unpst_recvspace = PIPSIZ;
348 u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
349 u_long	unpdg_recvspace = 4*1024;
350 
351 int	unp_rights;			/* file descriptors in flight */
352 
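/*
 * Create the protocol control block for a new AF_UNIX socket: reserve
 * default buffer space according to the socket type, allocate the
 * unpcb, stamp it with the creation time and link it onto the global
 * unp_head list used by unp_gc().
 */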
353 int
354 unp_attach(struct socket *so)
355 {
356 	struct unpcb *unp;
357 	int error;
358 
359 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
360 		switch (so->so_type) {
361 
362 		case SOCK_STREAM:
363 		case SOCK_SEQPACKET:
364 			error = soreserve(so, unpst_sendspace, unpst_recvspace);
365 			break;
366 
367 		case SOCK_DGRAM:
368 			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
369 			break;
370 
371 		default:
372 			panic("unp_attach");
373 		}
374 		if (error)
375 			return (error);
376 	}
377 	unp = malloc(sizeof(*unp), M_PCB, M_NOWAIT|M_ZERO);
378 	if (unp == NULL)
379 		return (ENOBUFS);
380 	unp->unp_socket = so;
381 	so->so_pcb = unp;
382 	getnanotime(&unp->unp_ctime);
383 	LIST_INSERT_HEAD(&unp_head, unp, unp_link);
384 	return (0);
385 }
386 
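/*
 * Tear down a PCB: unlink it from unp_head, release any bound vnode,
 * disconnect from the peer and from any datagram senders still
 * pointing at us, then free the address mbuf and the unpcb itself.
 * A garbage collection pass is scheduled if descriptors are still in
 * flight.
 */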
387 void
388 unp_detach(struct unpcb *unp)
389 {
390 	struct vnode *vp;
391 
392 	LIST_REMOVE(unp, unp_link);
393 	if (unp->unp_vnode) {
394 		unp->unp_vnode->v_socket = NULL;
395 		vp = unp->unp_vnode;
396 		unp->unp_vnode = NULL;
397 		vrele(vp);
398 	}
399 	if (unp->unp_conn)
400 		unp_disconnect(unp);
401 	while (!SLIST_EMPTY(&unp->unp_refs))
402 		unp_drop(SLIST_FIRST(&unp->unp_refs), ECONNRESET);
403 	soisdisconnected(unp->unp_socket);
404 	unp->unp_socket->so_pcb = NULL;
405 	m_freem(unp->unp_addr);
406 	free(unp, M_PCB, sizeof *unp);
407 	if (unp_rights)
408 		task_add(systq, &unp_gc_task);
409 }
410 
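/*
 * Bind a socket to a path name in the file system.  The sockaddr_un
 * is validated and copied into a private mbuf, a VSOCK vnode is
 * created at the given path, and the vnode and socket are pointed at
 * each other.  The binder's uid/gid/pid are recorded (UNP_FEIDSBIND)
 * so that connecting peers can retrieve them as credentials.
 */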
411 int
412 unp_bind(struct unpcb *unp, struct mbuf *nam, struct proc *p)
413 {
414 	struct sockaddr_un *soun = mtod(nam, struct sockaddr_un *);
415 	struct mbuf *nam2;
416 	struct vnode *vp;
417 	struct vattr vattr;
418 	int error;
419 	struct nameidata nd;
420 	size_t pathlen;
421 
422 	if (unp->unp_vnode != NULL)
423 		return (EINVAL);
424 
425 	if (soun->sun_len > sizeof(struct sockaddr_un) ||
426 	    soun->sun_len < offsetof(struct sockaddr_un, sun_path))
427 		return (EINVAL);
428 	if (soun->sun_family != AF_UNIX)
429 		return (EAFNOSUPPORT);
430 
431 	pathlen = strnlen(soun->sun_path, soun->sun_len -
432 	    offsetof(struct sockaddr_un, sun_path));
433 	if (pathlen == sizeof(soun->sun_path))
434 		return (EINVAL);
435 
436 	nam2 = m_getclr(M_WAITOK, MT_SONAME);
437 	nam2->m_len = sizeof(struct sockaddr_un);
438 	memcpy(mtod(nam2, struct sockaddr_un *), soun,
439 	    offsetof(struct sockaddr_un, sun_path) + pathlen);
440 	/* No need to NUL terminate: m_getclr() returns zero'd mbufs. */
441 
442 	soun = mtod(nam2, struct sockaddr_un *);
443 
444 	/* Fixup sun_len to keep it in sync with m_len. */
445 	soun->sun_len = nam2->m_len;
446 
447 	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
448 	    soun->sun_path, p);
449 	nd.ni_pledge = PLEDGE_UNIX;
450 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
451 	if ((error = namei(&nd)) != 0) {
452 		m_freem(nam2);
453 		return (error);
454 	}
455 	vp = nd.ni_vp;
456 	if (vp != NULL) {
457 		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
458 		if (nd.ni_dvp == vp)
459 			vrele(nd.ni_dvp);
460 		else
461 			vput(nd.ni_dvp);
462 		vrele(vp);
463 		m_freem(nam2);
464 		return (EADDRINUSE);
465 	}
466 	VATTR_NULL(&vattr);
467 	vattr.va_type = VSOCK;
468 	vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask;
469 	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
470 	if (error) {
471 		m_freem(nam2);
472 		return (error);
473 	}
474 	unp->unp_addr = nam2;
475 	vp = nd.ni_vp;
476 	vp->v_socket = unp->unp_socket;
477 	unp->unp_vnode = vp;
478 	unp->unp_connid.uid = p->p_ucred->cr_uid;
479 	unp->unp_connid.gid = p->p_ucred->cr_gid;
480 	unp->unp_connid.pid = p->p_p->ps_pid;
481 	unp->unp_flags |= UNP_FEIDSBIND;
482 	VOP_UNLOCK(vp, p);
483 	return (0);
484 }
485 
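/*
 * Connect to a socket bound in the file system.  The path in *nam is
 * looked up and checked for write access; for connection-oriented
 * sockets a fresh server socket is spawned with sonewconn(), stamped
 * with the connecting process's credentials, and the two ends are
 * then joined with unp_connect2().
 */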
486 int
487 unp_connect(struct socket *so, struct mbuf *nam, struct proc *p)
488 {
489 	struct sockaddr_un *soun = mtod(nam, struct sockaddr_un *);
490 	struct vnode *vp;
491 	struct socket *so2, *so3;
492 	struct unpcb *unp, *unp2, *unp3;
493 	struct nameidata nd;
494 	int error;
495 
496 	if (soun->sun_family != AF_UNIX)
497 		return (EAFNOSUPPORT);
498 
499 	if (nam->m_len < sizeof(struct sockaddr_un))
500 		*(mtod(nam, caddr_t) + nam->m_len) = 0;
501 	else if (nam->m_len > sizeof(struct sockaddr_un))
502 		return (EINVAL);
503 	else if (memchr(soun->sun_path, '\0', sizeof(soun->sun_path)) == NULL)
504 		return (EINVAL);
505 
506 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p);
507 	nd.ni_pledge = PLEDGE_UNIX;
508 	if ((error = namei(&nd)) != 0)
509 		return (error);
510 	vp = nd.ni_vp;
511 	if (vp->v_type != VSOCK) {
512 		error = ENOTSOCK;
513 		goto bad;
514 	}
515 	if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0)
516 		goto bad;
517 	so2 = vp->v_socket;
518 	if (so2 == NULL) {
519 		error = ECONNREFUSED;
520 		goto bad;
521 	}
522 	if (so->so_type != so2->so_type) {
523 		error = EPROTOTYPE;
524 		goto bad;
525 	}
526 	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
527 		if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
528 		    (so3 = sonewconn(so2, 0)) == 0) {
529 			error = ECONNREFUSED;
530 			goto bad;
531 		}
532 		unp = sotounpcb(so);
533 		unp2 = sotounpcb(so2);
534 		unp3 = sotounpcb(so3);
535 		if (unp2->unp_addr)
536 			unp3->unp_addr =
537 			    m_copym(unp2->unp_addr, 0, M_COPYALL, M_NOWAIT);
538 		unp3->unp_connid.uid = p->p_ucred->cr_uid;
539 		unp3->unp_connid.gid = p->p_ucred->cr_gid;
540 		unp3->unp_connid.pid = p->p_p->ps_pid;
541 		unp3->unp_flags |= UNP_FEIDS;
542 		so2 = so3;
543 		if (unp2->unp_flags & UNP_FEIDSBIND) {
544 			unp->unp_connid = unp2->unp_connid;
545 			unp->unp_flags |= UNP_FEIDS;
546 		}
547 	}
548 	error = unp_connect2(so, so2);
549 bad:
550 	vput(vp);
551 	return (error);
552 }
553 
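/*
 * Splice two sockets together.  A datagram socket just records its
 * peer and is added to the peer's unp_refs list; stream and seqpacket
 * sockets are linked symmetrically and both ends are marked connected.
 */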
554 int
555 unp_connect2(struct socket *so, struct socket *so2)
556 {
557 	struct unpcb *unp = sotounpcb(so);
558 	struct unpcb *unp2;
559 
560 	if (so2->so_type != so->so_type)
561 		return (EPROTOTYPE);
562 	unp2 = sotounpcb(so2);
563 	unp->unp_conn = unp2;
564 	switch (so->so_type) {
565 
566 	case SOCK_DGRAM:
567 		SLIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_nextref);
568 		soisconnected(so);
569 		break;
570 
571 	case SOCK_STREAM:
572 	case SOCK_SEQPACKET:
573 		unp2->unp_conn = unp;
574 		soisconnected(so);
575 		soisconnected(so2);
576 		break;
577 
578 	default:
579 		panic("unp_connect2");
580 	}
581 	return (0);
582 }
583 
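/*
 * Undo unp_connect2(): detach from the peer, reset the send buffer
 * accounting that mirrors the peer's receive buffer, and mark the
 * affected ends disconnected.
 */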
584 void
585 unp_disconnect(struct unpcb *unp)
586 {
587 	struct unpcb *unp2 = unp->unp_conn;
588 
589 	if (unp2 == NULL)
590 		return;
591 	unp->unp_conn = NULL;
592 	switch (unp->unp_socket->so_type) {
593 
594 	case SOCK_DGRAM:
595 		SLIST_REMOVE(&unp2->unp_refs, unp, unpcb, unp_nextref);
596 		unp->unp_socket->so_state &= ~SS_ISCONNECTED;
597 		break;
598 
599 	case SOCK_STREAM:
600 	case SOCK_SEQPACKET:
601 		unp->unp_socket->so_snd.sb_mbcnt = 0;
602 		unp->unp_socket->so_snd.sb_cc = 0;
603 		soisdisconnected(unp->unp_socket);
604 		unp2->unp_conn = NULL;
605 		unp2->unp_socket->so_snd.sb_mbcnt = 0;
606 		unp2->unp_socket->so_snd.sb_cc = 0;
607 		soisdisconnected(unp2->unp_socket);
608 		break;
609 	}
610 }
611 
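/*
 * When a stream or seqpacket socket can no longer send, tell the peer
 * it can no longer receive.
 */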
612 void
613 unp_shutdown(struct unpcb *unp)
614 {
615 	struct socket *so;
616 
617 	switch (unp->unp_socket->so_type) {
618 	case SOCK_STREAM:
619 	case SOCK_SEQPACKET:
620 		if (unp->unp_conn && (so = unp->unp_conn->unp_socket))
621 			socantrcvmore(so);
622 		break;
623 	default:
624 		break;
625 	}
626 }
627 
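/*
 * Report an error to a socket and disconnect it.  An embryonic socket
 * still sitting on a listen queue (so_head != NULL) is freed outright
 * together with its PCB.
 */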
628 void
629 unp_drop(struct unpcb *unp, int errno)
630 {
631 	struct socket *so = unp->unp_socket;
632 
633 	so->so_error = errno;
634 	unp_disconnect(unp);
635 	if (so->so_head) {
636 		so->so_pcb = NULL;
637 		sofree(so);
638 		m_freem(unp->unp_addr);
639 		free(unp, M_PCB, sizeof *unp);
640 	}
641 }
642 
643 #ifdef notdef
644 unp_drain(void)
645 {
646 
647 }
648 #endif
649 
650 extern	struct domain unixdomain;
651 
652 static struct unpcb *
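/*
 * Map a struct file back to its unpcb, or NULL if the file is not an
 * AF_UNIX socket.  Used by the descriptor passing and garbage
 * collection code.
 */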
653 fptounp(struct file *fp)
654 {
655 	struct socket *so;
656 
657 	if (fp->f_type != DTYPE_SOCKET)
658 		return (NULL);
659 	if ((so = fp->f_data) == NULL)
660 		return (NULL);
661 	if (so->so_proto->pr_domain != &unixdomain)
662 		return (NULL);
663 	return (sotounpcb(so));
664 }
665 
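/*
 * Externalize received rights: turn the struct fdpass entries that
 * unp_internalize() placed in the control message back into file
 * descriptor numbers in the receiving process, allocating descriptor
 * slots and dropping the in-flight accounting.  On failure the files
 * are handed to unp_discard() so unp_gc() will eventually close them.
 */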
666 int
667 unp_externalize(struct mbuf *rights, socklen_t controllen, int flags)
668 {
669 	struct proc *p = curproc;		/* XXX */
670 	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
671 	int i, *fdp = NULL;
672 	struct fdpass *rp;
673 	struct file *fp;
674 	int nfds, error = 0;
675 
676 	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
677 	    sizeof(struct fdpass);
678 	if (controllen < CMSG_ALIGN(sizeof(struct cmsghdr)))
679 		controllen = 0;
680 	else
681 		controllen -= CMSG_ALIGN(sizeof(struct cmsghdr));
682 	if (nfds > controllen / sizeof(int)) {
683 		error = EMSGSIZE;
684 		goto restart;
685 	}
686 
687 	/* Make sure the recipient is allowed to receive the descriptors. */
688 	rp = (struct fdpass *)CMSG_DATA(cm);
689 	for (i = 0; i < nfds; i++) {
690 		fp = rp->fp;
691 		rp++;
692 		error = pledge_recvfd(p, fp);
693 		if (error)
694 			break;
695 
696 		/*
697 		 * Block devices may not be passed.  If passing a directory,
698 		 * make sure that it is underneath the recipient's root.
699 		 */
700 		if (p->p_fd->fd_rdir != NULL && fp->f_type == DTYPE_VNODE) {
701 			struct vnode *vp = (struct vnode *)fp->f_data;
702 
703 			if (vp->v_type == VBLK ||
704 			    (vp->v_type == VDIR &&
705 			    !vn_isunder(vp, p->p_fd->fd_rdir, p))) {
706 				error = EPERM;
707 				break;
708 			}
709 		}
710 	}
711 
712 	fdp = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK);
713 
714 restart:
715 	fdplock(p->p_fd);
716 	if (error != 0) {
717 		if (nfds > 0) {
718 			rp = ((struct fdpass *)CMSG_DATA(cm));
719 			unp_discard(rp, nfds);
720 		}
721 		goto out;
722 	}
723 
724 	/*
725 	 * First loop -- allocate file descriptor table slots for the
726 	 * new descriptors.
727 	 */
728 	rp = ((struct fdpass *)CMSG_DATA(cm));
729 	for (i = 0; i < nfds; i++) {
730 		if ((error = fdalloc(p, 0, &fdp[i])) != 0) {
731 			/*
732 			 * Back out what we've done so far.
733 			 */
734 			for (--i; i >= 0; i--)
735 				fdremove(p->p_fd, fdp[i]);
736 
737 			if (error == ENOSPC) {
738 				fdexpand(p);
739 				error = 0;
740 			} else {
741 				/*
742 				 * This is the error that has historically
743 				 * been returned, and some callers may
744 				 * expect it.
745 				 */
746 				error = EMSGSIZE;
747 			}
748 			fdpunlock(p->p_fd);
749 			goto restart;
750 		}
751 
752 		/*
753 		 * Make the slot reference the descriptor so that
754 		 * fdalloc() works properly.  The bookkeeping is
755 		 * finalized in the loop below.
756 		 */
757 		p->p_fd->fd_ofiles[fdp[i]] = rp->fp;
758 		p->p_fd->fd_ofileflags[fdp[i]] = (rp->flags & UF_PLEDGED);
759 		rp++;
760 
761 		if (flags & MSG_CMSG_CLOEXEC)
762 			p->p_fd->fd_ofileflags[fdp[i]] |= UF_EXCLOSE;
763 	}
764 
765 	/*
766 	 * Now that adding them has succeeded, update all of the
767 	 * descriptor passing state.
768 	 */
769 	rp = (struct fdpass *)CMSG_DATA(cm);
770 	for (i = 0; i < nfds; i++) {
771 		struct unpcb *unp;
772 
773 		fp = rp->fp;
774 		rp++;
775 		if ((unp = fptounp(fp)) != NULL)
776 			unp->unp_msgcount--;
777 		unp_rights--;
778 	}
779 
780 	/*
781 	 * Copy the temporary fd array into the message and shrink the
782 	 * length: struct fdpass entries have become plain ints.
783 	 */
784 	memcpy(CMSG_DATA(cm), fdp, nfds * sizeof(int));
785 	cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
786 	rights->m_len = CMSG_LEN(nfds * sizeof(int));
787  out:
788 	fdpunlock(p->p_fd);
789 	if (fdp)
790 		free(fdp, M_TEMP, nfds * sizeof(int));
791 	return (error);
792 }
793 
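/*
 * Internalize rights being sent: validate the SCM_RIGHTS control
 * message, grow it so each descriptor number can be replaced by a
 * struct fdpass carrying the struct file pointer and its flags, take
 * a reference on every file and update the in-flight accounting
 * (unp_rights, unp_msgcount).
 *
 * For orientation, a minimal userland sketch of the descriptor
 * passing this serves (illustrative only, not part of this file;
 * assumes sock is a connected AF_UNIX socket and fd_to_send the
 * descriptor to pass; error handling omitted):
 *
 *	struct msghdr msg;
 *	struct iovec iov;
 *	union {
 *		struct cmsghdr hdr;
 *		char buf[CMSG_SPACE(sizeof(int))];
 *	} cmsgbuf;
 *	struct cmsghdr *cmsg;
 *	char byte = 0;
 *
 *	memset(&msg, 0, sizeof(msg));
 *	iov.iov_base = &byte;
 *	iov.iov_len = sizeof(byte);
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cmsgbuf.buf;
 *	msg.msg_controllen = sizeof(cmsgbuf.buf);
 *	cmsg = CMSG_FIRSTHDR(&msg);
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SCM_RIGHTS;
 *	memcpy(CMSG_DATA(cmsg), &fd_to_send, sizeof(int));
 *	sendmsg(sock, &msg, 0);
 */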
794 int
795 unp_internalize(struct mbuf *control, struct proc *p)
796 {
797 	struct filedesc *fdp = p->p_fd;
798 	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
799 	struct fdpass *rp;
800 	struct file *fp;
801 	struct unpcb *unp;
802 	int i, error;
803 	int nfds, *ip, fd, neededspace;
804 
805 	/*
806 	 * Accept both msg_controllen conventions (exact cmsg_len, or
807 	 * CMSG_ALIGN(cmsg_len)); blame the IETF for the ambiguity.
808 	 */
809 	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
810 	    !(cm->cmsg_len == control->m_len ||
811 	    control->m_len == CMSG_ALIGN(cm->cmsg_len)))
812 		return (EINVAL);
813 	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);
814 
815 	if (unp_rights + nfds > maxfiles / 10)
816 		return (EMFILE);
817 
818 	/* Make sure we have room for the struct file pointers */
819 morespace:
820 	neededspace = CMSG_SPACE(nfds * sizeof(struct fdpass)) -
821 	    control->m_len;
822 	if (neededspace > M_TRAILINGSPACE(control)) {
823 		char *tmp;
824 		/* if we already have a cluster, the message is just too big */
825 		if (control->m_flags & M_EXT)
826 			return (E2BIG);
827 
828 		/* copy cmsg data temporarily out of the mbuf */
829 		tmp = malloc(control->m_len, M_TEMP, M_WAITOK);
830 		memcpy(tmp, mtod(control, caddr_t), control->m_len);
831 
832 		/* allocate a cluster and try again */
833 		MCLGET(control, M_WAIT);
834 		if ((control->m_flags & M_EXT) == 0) {
835 			free(tmp, M_TEMP, control->m_len);
836 			return (ENOBUFS);       /* allocation failed */
837 		}
838 
839 		/* copy the data back into the cluster */
840 		cm = mtod(control, struct cmsghdr *);
841 		memcpy(cm, tmp, control->m_len);
842 		free(tmp, M_TEMP, control->m_len);
843 		goto morespace;
844 	}
845 
846 	/* adjust message & mbuf to note amount of space actually used. */
847 	cm->cmsg_len = CMSG_LEN(nfds * sizeof(struct fdpass));
848 	control->m_len = CMSG_SPACE(nfds * sizeof(struct fdpass));
849 
850 	ip = ((int *)CMSG_DATA(cm)) + nfds - 1;
851 	rp = ((struct fdpass *)CMSG_DATA(cm)) + nfds - 1;
852 	for (i = 0; i < nfds; i++) {
853 		memcpy(&fd, ip, sizeof fd);
854 		ip--;
855 		if ((fp = fd_getfile(fdp, fd)) == NULL) {
856 			error = EBADF;
857 			goto fail;
858 		}
859 		if (fp->f_count == LONG_MAX-2) {
860 			error = EDEADLK;
861 			goto fail;
862 		}
863 		error = pledge_sendfd(p, fp);
864 		if (error)
865 			goto fail;
866 
867 		/* kqueue descriptors cannot be copied */
868 		if (fp->f_type == DTYPE_KQUEUE) {
869 			error = EINVAL;
870 			goto fail;
871 		}
872 		rp->fp = fp;
873 		rp->flags = fdp->fd_ofileflags[fd] & UF_PLEDGED;
874 		rp--;
875 		fp->f_count++;
876 		if ((unp = fptounp(fp)) != NULL) {
877 			unp->unp_file = fp;
878 			unp->unp_msgcount++;
879 		}
880 		unp_rights++;
881 	}
882 	return (0);
883 fail:
884 	/* Back out what we just did. */
885 	for ( ; i > 0; i--) {
886 		rp++;
887 		fp = rp->fp;
888 		fp->f_count--;
889 		if ((unp = fptounp(fp)) != NULL)
890 			unp->unp_msgcount--;
891 		unp_rights--;
892 	}
893 
894 	return (error);
895 }
896 
897 int	unp_defer, unp_gcing;
898 
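/*
 * Garbage collect descriptors that became unreachable while in flight,
 * e.g. because the socket carrying them was closed before they were
 * received.  First close the files queued on the deferred list, then
 * run a mark-and-sweep over all AF_UNIX sockets: a socket whose only
 * remaining references stem from being passed in messages (f_count ==
 * unp_msgcount) and that is not reachable from any live socket is
 * declared dead and the rights in its receive buffer are discarded.
 */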
899 void
900 unp_gc(void *arg __unused)
901 {
902 	struct unp_deferral *defer;
903 	struct file *fp;
904 	struct socket *so;
905 	struct unpcb *unp;
906 	int nunref, i;
907 
908 	if (unp_gcing)
909 		return;
910 	unp_gcing = 1;
911 
912 	/* close any fds on the deferred list */
913 	while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) {
914 		SLIST_REMOVE_HEAD(&unp_deferred, ud_link);
915 		for (i = 0; i < defer->ud_n; i++) {
916 			fp = defer->ud_fp[i].fp;
917 			if (fp == NULL)
918 				continue;
919 			FREF(fp);
920 			if ((unp = fptounp(fp)) != NULL)
921 				unp->unp_msgcount--;
922 			unp_rights--;
923 			(void) closef(fp, NULL);
924 		}
925 		free(defer, M_TEMP, sizeof(*defer) +
926 		    sizeof(struct fdpass) * defer->ud_n);
927 	}
928 
929 	unp_defer = 0;
930 	LIST_FOREACH(unp, &unp_head, unp_link)
931 		unp->unp_flags &= ~(UNP_GCMARK | UNP_GCDEFER | UNP_GCDEAD);
932 	do {
933 		nunref = 0;
934 		LIST_FOREACH(unp, &unp_head, unp_link) {
935 			if (unp->unp_flags & UNP_GCDEFER) {
936 				/*
937 				 * This socket is referenced by another
938 				 * socket which is known to be live,
939 				 * so it's certainly live.
940 				 */
941 				unp->unp_flags &= ~UNP_GCDEFER;
942 				unp_defer--;
943 			} else if (unp->unp_flags & UNP_GCMARK) {
944 				/* marked as live in previous pass */
945 				continue;
946 			} else if ((fp = unp->unp_file) == NULL) {
947 				/* not being passed, so can't be in loop */
948 			} else if (fp->f_count == 0) {
949 				/*
950 				 * Already being closed, let normal close
951 				 * path take its course
952 				 */
953 			} else {
954 				/*
955 				 * Unreferenced by other sockets so far,
956 				 * so if all the references (f_count) are
957 				 * from passing (unp_msgcount) then this
958 				 * socket is prospectively dead
959 				 */
960 				if (fp->f_count == unp->unp_msgcount) {
961 					nunref++;
962 					unp->unp_flags |= UNP_GCDEAD;
963 					continue;
964 				}
965 			}
966 
967 			/*
968 			 * This is the first time we've seen this socket on
969 			 * the mark pass and known it has a live reference,
970 			 * so mark it, then scan its receive buffer for
971 			 * sockets and note them as deferred (== referenced,
972 			 * but not yet marked).
973 			 */
974 			unp->unp_flags |= UNP_GCMARK;
975 
976 			so = unp->unp_socket;
977 #ifdef notdef
978 			if (so->so_rcv.sb_flags & SB_LOCK) {
979 				/*
980 				 * This is problematical; it's not clear
981 				 * we need to wait for the sockbuf to be
982 				 * unlocked (on a uniprocessor, at least),
983 				 * and it's also not clear what to do
984 				 * if sbwait returns an error due to receipt
985 				 * of a signal.  If sbwait does return
986 				 * an error, we'll go into an infinite
987 				 * loop.  Delete all of this for now.
988 				 */
989 				(void) sbwait(&so->so_rcv);
990 				goto restart;
991 			}
992 #endif
993 			unp_scan(so->so_rcv.sb_mb, unp_mark);
994 		}
995 	} while (unp_defer);
996 
997 	/*
998 	 * If there are any unreferenced sockets, dispose of the files
999 	 * queued in each one's receive buffer so they can finally be closed.
1000 	 */
1001 	if (nunref) {
1002 		LIST_FOREACH(unp, &unp_head, unp_link) {
1003 			if (unp->unp_flags & UNP_GCDEAD)
1004 				unp_scan(unp->unp_socket->so_rcv.sb_mb,
1005 				    unp_discard);
1006 		}
1007 	}
1008 	unp_gcing = 0;
1009 }
1010 
1011 void
1012 unp_dispose(struct mbuf *m)
1013 {
1014 
1015 	if (m)
1016 		unp_scan(m, unp_discard);
1017 }
1018 
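/*
 * Walk a chain of packets looking for SCM_RIGHTS control messages and
 * apply 'op' (unp_mark or unp_discard) to the struct fdpass array in
 * each one.
 */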
1019 void
1020 unp_scan(struct mbuf *m0, void (*op)(struct fdpass *, int))
1021 {
1022 	struct mbuf *m;
1023 	struct fdpass *rp;
1024 	struct cmsghdr *cm;
1025 	int qfds;
1026 
1027 	while (m0) {
1028 		for (m = m0; m; m = m->m_next) {
1029 			if (m->m_type == MT_CONTROL &&
1030 			    m->m_len >= sizeof(*cm)) {
1031 				cm = mtod(m, struct cmsghdr *);
1032 				if (cm->cmsg_level != SOL_SOCKET ||
1033 				    cm->cmsg_type != SCM_RIGHTS)
1034 					continue;
1035 				qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof *cm))
1036 				    / sizeof(struct fdpass);
1037 				if (qfds > 0) {
1038 					rp = (struct fdpass *)CMSG_DATA(cm);
1039 					op(rp, qfds);
1040 				}
1041 				break;		/* XXX, but saves time */
1042 			}
1043 		}
1044 		m0 = m0->m_nextpkt;
1045 	}
1046 }
1047 
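/*
 * GC helper: the sockets referenced by these in-flight files are
 * reachable, so defer them for another marking pass.
 */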
1048 void
1049 unp_mark(struct fdpass *rp, int nfds)
1050 {
1051 	struct unpcb *unp;
1052 	int i;
1053 
1054 	for (i = 0; i < nfds; i++) {
1055 		if (rp[i].fp == NULL)
1056 			continue;
1057 
1058 		unp = fptounp(rp[i].fp);
1059 		if (unp == NULL)
1060 			continue;
1061 
1062 		if (unp->unp_flags & (UNP_GCMARK|UNP_GCDEFER))
1063 			continue;
1064 
1065 		unp_defer++;
1066 		unp->unp_flags |= UNP_GCDEFER;
1067 		unp->unp_flags &= ~UNP_GCDEAD;
1068 	}
1069 }
1070 
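/*
 * Take custody of a set of in-flight files that can no longer be
 * received: move them onto the deferred list and let the garbage
 * collection task close them in process context.
 */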
1071 void
1072 unp_discard(struct fdpass *rp, int nfds)
1073 {
1074 	struct unp_deferral *defer;
1075 
1076 	/* copy the file pointers to a deferral structure */
1077 	defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
1078 	defer->ud_n = nfds;
1079 	memcpy(&defer->ud_fp[0], rp, sizeof(*rp) * nfds);
1080 	memset(rp, 0, sizeof(*rp) * nfds);
1081 	SLIST_INSERT_HEAD(&unp_deferred, defer, ud_link);
1082 
1083 	task_add(systq, &unp_gc_task);
1084 }
1085