xref: /openbsd-src/sys/kern/uipc_usrreq.c (revision d1df930ffab53da22f3324c32bed7ac5709915e6)
1 /*	$OpenBSD: uipc_usrreq.c,v 1.134 2018/07/09 10:58:21 claudio Exp $	*/
2 /*	$NetBSD: uipc_usrreq.c,v 1.18 1996/02/09 19:00:50 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1989, 1991, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 #include <sys/filedesc.h>
39 #include <sys/domain.h>
40 #include <sys/protosw.h>
41 #include <sys/queue.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/unpcb.h>
45 #include <sys/un.h>
46 #include <sys/namei.h>
47 #include <sys/vnode.h>
48 #include <sys/file.h>
49 #include <sys/stat.h>
50 #include <sys/mbuf.h>
51 #include <sys/task.h>
52 #include <sys/pledge.h>
53 
54 void	uipc_setaddr(const struct unpcb *, struct mbuf *);
55 
56 /* list of all UNIX domain sockets, for unp_gc() */
57 LIST_HEAD(unp_head, unpcb) unp_head = LIST_HEAD_INITIALIZER(unp_head);
58 
59 /*
60  * Stack of sets of files that were passed over a socket but were
61  * not received and need to be closed.
62  */
/*
 * One batch of SCM_RIGHTS file pointers whose carrying message was
 * destroyed before being received; unp_gc() closes them later from
 * task context.
 */
struct	unp_deferral {
	SLIST_ENTRY(unp_deferral)	ud_link;	/* on unp_deferred list */
	int	ud_n;			/* number of entries in ud_fp[] */
	/* followed by ud_n struct fdpass */
	struct fdpass ud_fp[];
};
69 
70 void	unp_discard(struct fdpass *, int);
71 void	unp_mark(struct fdpass *, int);
72 void	unp_scan(struct mbuf *, void (*)(struct fdpass *, int));
73 int	unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *);
74 
75 /* list of sets of files that were sent over sockets that are now closed */
76 SLIST_HEAD(,unp_deferral) unp_deferred = SLIST_HEAD_INITIALIZER(unp_deferred);
77 
78 struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL);
79 
80 
81 /*
82  * Unix communications domain.
83  *
84  * TODO:
85  *	RDM
86  *	rethink name space problems
87  *	need a proper out-of-band
88  */
89 struct	sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };
90 ino_t	unp_ino;			/* prototype for fake inode numbers */
91 
92 void
93 uipc_setaddr(const struct unpcb *unp, struct mbuf *nam)
94 {
95 	if (unp != NULL && unp->unp_addr != NULL) {
96 		nam->m_len = unp->unp_addr->m_len;
97 		memcpy(mtod(nam, caddr_t), mtod(unp->unp_addr, caddr_t),
98 		    nam->m_len);
99 	} else {
100 		nam->m_len = sizeof(sun_noname);
101 		memcpy(mtod(nam, struct sockaddr *), &sun_noname,
102 		    nam->m_len);
103 	}
104 }
105 
/*
 * Protocol user-request switch for Unix domain sockets.
 *
 * `req' is one of the PRU_* codes.  `m' carries data for PRU_SEND and
 * is reused as scratch storage by PRU_SENSE (it is really a struct
 * stat there); `nam' carries an address or, for PRU_CONNECT2, the
 * second socket; `control' carries ancillary data.  Any `m'/`control'
 * chains not consumed by a successful append are freed on exit.
 */
int
uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control, struct proc *p)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;
	int error = 0;

	if (req == PRU_CONTROL)
		return (EOPNOTSUPP);
	/* Only PRU_SEND is allowed to carry control data. */
	if (req != PRU_SEND && control && control->m_len) {
		error = EOPNOTSUPP;
		goto release;
	}
	if (unp == NULL) {
		error = EINVAL;
		goto release;
	}

	NET_ASSERT_UNLOCKED();

	switch (req) {

	case PRU_BIND:
		error = unp_bind(unp, nam, p);
		break;

	case PRU_LISTEN:
		/* listen(2) only makes sense on a socket bound to a path */
		if (unp->unp_vnode == NULL)
			error = EINVAL;
		break;

	case PRU_CONNECT:
		error = unp_connect(so, nam, p);
		break;

	case PRU_CONNECT2:
		/* socketpair(2): `nam' actually holds the second socket */
		error = unp_connect2(so, (struct socket *)nam);
		break;

	case PRU_DISCONNECT:
		unp_disconnect(unp);
		break;

	case PRU_ACCEPT:
		/*
		 * Pass back name of connected socket,
		 * if it was bound and we are still connected
		 * (our peer may have closed already!).
		 */
		uipc_setaddr(unp->unp_conn, nam);
		break;

	case PRU_SHUTDOWN:
		socantsendmore(so);
		unp_shutdown(unp);
		break;

	case PRU_RCVD:
		switch (so->so_type) {

		case SOCK_DGRAM:
			/* datagram sockets never generate PRU_RCVD */
			panic("uipc 1");
			/*NOTREACHED*/

		case SOCK_STREAM:
		case SOCK_SEQPACKET:
			if (unp->unp_conn == NULL)
				break;
			so2 = unp->unp_conn->unp_socket;
			/*
			 * Adjust backpressure on sender
			 * and wakeup any waiting to write.
			 */
			so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
			so2->so_snd.sb_cc = so->so_rcv.sb_cc;
			sowwakeup(so2);
			break;

		default:
			panic("uipc 2");
		}
		break;

	case PRU_SEND:
		/* convert fds in SCM_RIGHTS cmsgs to file pointers first */
		if (control && (error = unp_internalize(control, p)))
			break;
		switch (so->so_type) {

		case SOCK_DGRAM: {
			struct sockaddr *from;

			if (nam) {
				/* sendto(2) with an explicit destination */
				if (unp->unp_conn) {
					error = EISCONN;
					break;
				}
				error = unp_connect(so, nam, p);
				if (error)
					break;
			} else {
				if (unp->unp_conn == NULL) {
					error = ENOTCONN;
					break;
				}
			}
			so2 = unp->unp_conn->unp_socket;
			if (unp->unp_addr)
				from = mtod(unp->unp_addr, struct sockaddr *);
			else
				from = &sun_noname;
			/* on success the receive buffer owns m and control */
			if (sbappendaddr(so2, &so2->so_rcv, from, m, control)) {
				sorwakeup(so2);
				m = NULL;
				control = NULL;
			} else
				error = ENOBUFS;
			/* undo the temporary connect done for sendto(2) */
			if (nam)
				unp_disconnect(unp);
			break;
		}

		case SOCK_STREAM:
		case SOCK_SEQPACKET:
			if (so->so_state & SS_CANTSENDMORE) {
				error = EPIPE;
				break;
			}
			if (unp->unp_conn == NULL) {
				error = ENOTCONN;
				break;
			}
			so2 = unp->unp_conn->unp_socket;
			/*
			 * Send to paired receive port, and then raise
			 * send buffer counts to maintain backpressure.
			 * Wake up readers.
			 */
			if (control) {
				if (sbappendcontrol(so2, &so2->so_rcv, m,
				    control)) {
					control = NULL;
				} else {
					error = ENOBUFS;
					break;
				}
			} else if (so->so_type == SOCK_SEQPACKET)
				sbappendrecord(so2, &so2->so_rcv, m);
			else
				sbappend(so2, &so2->so_rcv, m);
			so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
			so->so_snd.sb_cc = so2->so_rcv.sb_cc;
			sorwakeup(so2);
			m = NULL;
			break;

		default:
			panic("uipc 4");
		}
		/* we need to undo unp_internalize in case of errors */
		if (control && error)
			unp_dispose(control);
		break;

	case PRU_ABORT:
		unp_drop(unp, ECONNABORTED);
		break;

	case PRU_SENSE: {
		/* fstat(2): `m' is really a struct stat to fill in */
		struct stat *sb = (struct stat *)m;

		sb->st_blksize = so->so_snd.sb_hiwat;
		sb->st_dev = NODEV;
		/* hand out a fake inode number on first fstat(2) */
		if (unp->unp_ino == 0)
			unp->unp_ino = unp_ino++;
		sb->st_atim.tv_sec =
		    sb->st_mtim.tv_sec =
		    sb->st_ctim.tv_sec = unp->unp_ctime.tv_sec;
		sb->st_atim.tv_nsec =
		    sb->st_mtim.tv_nsec =
		    sb->st_ctim.tv_nsec = unp->unp_ctime.tv_nsec;
		sb->st_ino = unp->unp_ino;
		/* return directly: `m' is not a real mbuf, don't free it */
		return (0);
	}

	case PRU_RCVOOB:
		return (EOPNOTSUPP);

	case PRU_SENDOOB:
		error = EOPNOTSUPP;
		break;

	case PRU_SOCKADDR:
		uipc_setaddr(unp, nam);
		break;

	case PRU_PEERADDR:
		uipc_setaddr(unp->unp_conn, nam);
		break;

	case PRU_SLOWTIMO:
		break;

	default:
		panic("uipc_usrreq");
	}
release:
	m_freem(control);
	m_freem(m);
	return (error);
}
317 
318 /*
319  * Both send and receive buffers are allocated PIPSIZ bytes of buffering
320  * for stream sockets, although the total for sender and receiver is
321  * actually only PIPSIZ.
322  * Datagram sockets really use the sendspace as the maximum datagram size,
323  * and don't really want to reserve the sendspace.  Their recvspace should
324  * be large enough for at least one max-size datagram plus address.
325  */
326 #define	PIPSIZ	4096
327 u_long	unpst_sendspace = PIPSIZ;
328 u_long	unpst_recvspace = PIPSIZ;
329 u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
330 u_long	unpdg_recvspace = 4*1024;
331 
332 int	unp_rights;			/* file descriptors in flight */
333 
334 int
335 uipc_attach(struct socket *so, int proto)
336 {
337 	struct unpcb *unp;
338 	int error;
339 
340 	if (so->so_pcb)
341 		return EISCONN;
342 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
343 		switch (so->so_type) {
344 
345 		case SOCK_STREAM:
346 		case SOCK_SEQPACKET:
347 			error = soreserve(so, unpst_sendspace, unpst_recvspace);
348 			break;
349 
350 		case SOCK_DGRAM:
351 			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
352 			break;
353 
354 		default:
355 			panic("unp_attach");
356 		}
357 		if (error)
358 			return (error);
359 	}
360 	unp = malloc(sizeof(*unp), M_PCB, M_NOWAIT|M_ZERO);
361 	if (unp == NULL)
362 		return (ENOBUFS);
363 	unp->unp_socket = so;
364 	so->so_pcb = unp;
365 	getnanotime(&unp->unp_ctime);
366 	LIST_INSERT_HEAD(&unp_head, unp, unp_link);
367 	return (0);
368 }
369 
370 int
371 uipc_detach(struct socket *so)
372 {
373 	struct unpcb *unp = sotounpcb(so);
374 
375 	if (unp == NULL)
376 		return (EINVAL);
377 
378 	NET_ASSERT_UNLOCKED();
379 
380 	unp_detach(unp);
381 
382 	return (0);
383 }
384 
/*
 * Tear down a pcb: drop the bound vnode, disconnect from the peer,
 * reset any datagram sockets still connected to us, then free the
 * pcb.  Kicks the fd garbage collector if descriptors are in flight,
 * since this close may have stranded passed files.
 */
void
unp_detach(struct unpcb *unp)
{
	struct vnode *vp;

	LIST_REMOVE(unp, unp_link);
	if (unp->unp_vnode) {
		/* break the vnode<->socket binding set up by bind(2) */
		unp->unp_vnode->v_socket = NULL;
		vp = unp->unp_vnode;
		unp->unp_vnode = NULL;
		vrele(vp);
	}
	if (unp->unp_conn)
		unp_disconnect(unp);
	/* reset every datagram socket still connected to us */
	while (!SLIST_EMPTY(&unp->unp_refs))
		unp_drop(SLIST_FIRST(&unp->unp_refs), ECONNRESET);
	soisdisconnected(unp->unp_socket);
	unp->unp_socket->so_pcb = NULL;
	m_freem(unp->unp_addr);
	free(unp, M_PCB, sizeof *unp);
	if (unp_rights)
		task_add(systq, &unp_gc_task);
}
408 
/*
 * bind(2) on a Unix domain socket: create the filesystem node named
 * by `nam' and remember the address plus the binder's credentials in
 * the pcb.  Returns EINVAL if already bound, EADDRINUSE if the path
 * already exists.
 */
int
unp_bind(struct unpcb *unp, struct mbuf *nam, struct proc *p)
{
	struct sockaddr_un *soun;
	struct mbuf *nam2;
	struct vnode *vp;
	struct vattr vattr;
	int error;
	struct nameidata nd;
	size_t pathlen;

	if (unp->unp_vnode != NULL)
		return (EINVAL);
	if ((error = unp_nam2sun(nam, &soun, &pathlen)))
		return (error);

	/* keep a private, full-size, NUL-terminated copy of the address */
	nam2 = m_getclr(M_WAITOK, MT_SONAME);
	nam2->m_len = sizeof(struct sockaddr_un);
	memcpy(mtod(nam2, struct sockaddr_un *), soun,
	    offsetof(struct sockaddr_un, sun_path) + pathlen);
	/* No need to NUL terminate: m_getclr() returns zero'd mbufs. */

	soun = mtod(nam2, struct sockaddr_un *);

	/* Fixup sun_len to keep it in sync with m_len. */
	soun->sun_len = nam2->m_len;

	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
	    soun->sun_path, p);
	nd.ni_pledge = PLEDGE_UNIX;
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	if ((error = namei(&nd)) != 0) {
		m_freem(nam2);
		return (error);
	}
	vp = nd.ni_vp;
	if (vp != NULL) {
		/* path already exists: abort the pending create and unwind */
		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		m_freem(nam2);
		return (EADDRINUSE);
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask;
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	vput(nd.ni_dvp);
	if (error) {
		m_freem(nam2);
		return (error);
	}
	unp->unp_addr = nam2;
	vp = nd.ni_vp;
	/* link vnode and socket both ways, and record who bound it */
	vp->v_socket = unp->unp_socket;
	unp->unp_vnode = vp;
	unp->unp_connid.uid = p->p_ucred->cr_uid;
	unp->unp_connid.gid = p->p_ucred->cr_gid;
	unp->unp_connid.pid = p->p_p->ps_pid;
	unp->unp_flags |= UNP_FEIDSBIND;
	VOP_UNLOCK(vp);
	return (0);
}
475 
/*
 * connect(2): look up the path in `nam', verify it names a socket of
 * the same type that we may write to, and wire the pcbs together.
 * For connection-oriented types a fresh server-side socket is spawned
 * from the listener with sonewconn() and the client's credentials are
 * stored on it for the peer to query.
 */
int
unp_connect(struct socket *so, struct mbuf *nam, struct proc *p)
{
	struct sockaddr_un *soun;
	struct vnode *vp;
	struct socket *so2, *so3;
	struct unpcb *unp, *unp2, *unp3;
	struct nameidata nd;
	int error;

	if ((error = unp_nam2sun(nam, &soun, NULL)))
		return (error);

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p);
	nd.ni_pledge = PLEDGE_UNIX;
	if ((error = namei(&nd)) != 0)
		return (error);
	vp = nd.ni_vp;
	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto bad;
	}
	/* connecting requires write access to the filesystem node */
	if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0)
		goto bad;
	so2 = vp->v_socket;
	if (so2 == NULL) {
		/* the socket once bound here has gone away */
		error = ECONNREFUSED;
		goto bad;
	}
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto bad;
	}
	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
		    (so3 = sonewconn(so2, 0)) == 0) {
			error = ECONNREFUSED;
			goto bad;
		}
		unp = sotounpcb(so);
		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);
		if (unp2->unp_addr)
			unp3->unp_addr =
			    m_copym(unp2->unp_addr, 0, M_COPYALL, M_NOWAIT);
		/* give the spawned server socket the client's credentials */
		unp3->unp_connid.uid = p->p_ucred->cr_uid;
		unp3->unp_connid.gid = p->p_ucred->cr_gid;
		unp3->unp_connid.pid = p->p_p->ps_pid;
		unp3->unp_flags |= UNP_FEIDS;
		/* from here on we connect to the spawned socket, not the listener */
		so2 = so3;
		if (unp2->unp_flags & UNP_FEIDSBIND) {
			/* listener was bound: pass its binder ids to the client */
			unp->unp_connid = unp2->unp_connid;
			unp->unp_flags |= UNP_FEIDS;
		}
	}
	error = unp_connect2(so, so2);
bad:
	vput(vp);
	return (error);
}
536 
537 int
538 unp_connect2(struct socket *so, struct socket *so2)
539 {
540 	struct unpcb *unp = sotounpcb(so);
541 	struct unpcb *unp2;
542 
543 	if (so2->so_type != so->so_type)
544 		return (EPROTOTYPE);
545 	unp2 = sotounpcb(so2);
546 	unp->unp_conn = unp2;
547 	switch (so->so_type) {
548 
549 	case SOCK_DGRAM:
550 		SLIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_nextref);
551 		soisconnected(so);
552 		break;
553 
554 	case SOCK_STREAM:
555 	case SOCK_SEQPACKET:
556 		unp2->unp_conn = unp;
557 		soisconnected(so);
558 		soisconnected(so2);
559 		break;
560 
561 	default:
562 		panic("unp_connect2");
563 	}
564 	return (0);
565 }
566 
567 void
568 unp_disconnect(struct unpcb *unp)
569 {
570 	struct unpcb *unp2 = unp->unp_conn;
571 
572 	if (unp2 == NULL)
573 		return;
574 	unp->unp_conn = NULL;
575 	switch (unp->unp_socket->so_type) {
576 
577 	case SOCK_DGRAM:
578 		SLIST_REMOVE(&unp2->unp_refs, unp, unpcb, unp_nextref);
579 		unp->unp_socket->so_state &= ~SS_ISCONNECTED;
580 		break;
581 
582 	case SOCK_STREAM:
583 	case SOCK_SEQPACKET:
584 		unp->unp_socket->so_snd.sb_mbcnt = 0;
585 		unp->unp_socket->so_snd.sb_cc = 0;
586 		soisdisconnected(unp->unp_socket);
587 		unp2->unp_conn = NULL;
588 		unp2->unp_socket->so_snd.sb_mbcnt = 0;
589 		unp2->unp_socket->so_snd.sb_cc = 0;
590 		soisdisconnected(unp2->unp_socket);
591 		break;
592 	}
593 }
594 
595 void
596 unp_shutdown(struct unpcb *unp)
597 {
598 	struct socket *so;
599 
600 	switch (unp->unp_socket->so_type) {
601 	case SOCK_STREAM:
602 	case SOCK_SEQPACKET:
603 		if (unp->unp_conn && (so = unp->unp_conn->unp_socket))
604 			socantrcvmore(so);
605 		break;
606 	default:
607 		break;
608 	}
609 }
610 
/*
 * Abort the connection on `unp' with error `errno'.  If the socket is
 * still sitting on a listen queue (so_head set) it can never be
 * accepted now, so free both the socket and the pcb immediately.
 */
void
unp_drop(struct unpcb *unp, int errno)
{
	struct socket *so = unp->unp_socket;

	KERNEL_ASSERT_LOCKED();

	so->so_error = errno;
	unp_disconnect(unp);
	if (so->so_head) {
		so->so_pcb = NULL;
		/*
		 * As long as the KERNEL_LOCK() is the default lock for Unix
		 * sockets, do not release it.
		 */
		sofree(so, SL_NOUNLOCK);
		m_freem(unp->unp_addr);
		free(unp, M_PCB, sizeof *unp);
	}
}
631 
#ifdef notdef
/*
 * Placeholder, compiled out: Unix domain sockets have nothing to
 * drain.  Written with an explicit return type -- the historical
 * implicit-int definition is invalid in C99 and later.
 */
void
unp_drain(void)
{

}
#endif
638 
639 extern	struct domain unixdomain;
640 
641 static struct unpcb *
642 fptounp(struct file *fp)
643 {
644 	struct socket *so;
645 
646 	if (fp->f_type != DTYPE_SOCKET)
647 		return (NULL);
648 	if ((so = fp->f_data) == NULL)
649 		return (NULL);
650 	if (so->so_proto->pr_domain != &unixdomain)
651 		return (NULL);
652 	return (sotounpcb(so));
653 }
654 
/*
 * Receive side of descriptor passing: turn the struct fdpass entries
 * of an SCM_RIGHTS control message back into file descriptors in the
 * receiving process.  `controllen' is the buffer space the receiver
 * supplied; if the resulting ints do not fit, or the receiver may not
 * see one of the files (pledge, chroot checks), all passed files are
 * discarded and an error returned.  On success the cmsg is rewritten
 * in place to hold an array of ints.
 */
int
unp_externalize(struct mbuf *rights, socklen_t controllen, int flags)
{
	struct proc *p = curproc;		/* XXX */
	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
	struct filedesc *fdp = p->p_fd;
	int i, *fds = NULL;
	struct fdpass *rp;
	struct file *fp;
	int nfds, error = 0;

	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
	    sizeof(struct fdpass);
	if (controllen < CMSG_ALIGN(sizeof(struct cmsghdr)))
		controllen = 0;
	else
		controllen -= CMSG_ALIGN(sizeof(struct cmsghdr));
	if (nfds > controllen / sizeof(int)) {
		/* receiver's buffer is too small; discard everything below */
		error = EMSGSIZE;
		goto restart;
	}

	/* Make sure the recipient should be able to see the descriptors.. */
	rp = (struct fdpass *)CMSG_DATA(cm);
	for (i = 0; i < nfds; i++) {
		fp = rp->fp;
		rp++;
		error = pledge_recvfd(p, fp);
		if (error)
			break;

		/*
		 * No to block devices.  If passing a directory,
		 * make sure that it is underneath the root.
		 */
		if (fdp->fd_rdir != NULL && fp->f_type == DTYPE_VNODE) {
			struct vnode *vp = (struct vnode *)fp->f_data;

			if (vp->v_type == VBLK ||
			    (vp->v_type == VDIR &&
			    !vn_isunder(vp, fdp->fd_rdir, p))) {
				error = EPERM;
				break;
			}
		}
	}

	fds = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK);

restart:
	/*
	 * NOTE: the EMSGSIZE path above reaches here with fds == NULL;
	 * the error branch below only discards and never touches fds.
	 */
	fdplock(fdp);
	if (error != 0) {
		if (nfds > 0) {
			rp = ((struct fdpass *)CMSG_DATA(cm));
			unp_discard(rp, nfds);
		}
		goto out;
	}

	/*
	 * First loop -- allocate file descriptor table slots for the
	 * new descriptors.
	 */
	rp = ((struct fdpass *)CMSG_DATA(cm));
	for (i = 0; i < nfds; i++) {
		if ((error = fdalloc(p, 0, &fds[i])) != 0) {
			/*
			 * Back out what we've done so far.
			 */
			for (--i; i >= 0; i--)
				fdremove(fdp, fds[i]);

			if (error == ENOSPC) {
				/* grow the fd table and retry from the top */
				fdexpand(p);
				error = 0;
			} else {
				/*
				 * This is the error that has historically
				 * been returned, and some callers may
				 * expect it.
				 */
				error = EMSGSIZE;
			}
			fdpunlock(fdp);
			goto restart;
		}

		/*
		 * Make the slot reference the descriptor so that
		 * fdalloc() works properly.. We finalize it all
		 * in the loop below.
		 */
		mtx_enter(&fdp->fd_fplock);
		KASSERT(fdp->fd_ofiles[fds[i]] == NULL);
		fdp->fd_ofiles[fds[i]] = rp->fp;
		mtx_leave(&fdp->fd_fplock);

		fdp->fd_ofileflags[fds[i]] = (rp->flags & UF_PLEDGED);
		if (flags & MSG_CMSG_CLOEXEC)
			fdp->fd_ofileflags[fds[i]] |= UF_EXCLOSE;

		rp++;
	}

	/*
	 * Now that adding them has succeeded, update all of the
	 * descriptor passing state.
	 */
	rp = (struct fdpass *)CMSG_DATA(cm);
	for (i = 0; i < nfds; i++) {
		struct unpcb *unp;

		fp = rp->fp;
		rp++;
		if ((unp = fptounp(fp)) != NULL)
			unp->unp_msgcount--;
		unp_rights--;
	}

	/*
	 * Copy temporary array to message and adjust length, in case of
	 * transition from large struct file pointers to ints.
	 */
	memcpy(CMSG_DATA(cm), fds, nfds * sizeof(int));
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
	rights->m_len = CMSG_LEN(nfds * sizeof(int));
 out:
	fdpunlock(fdp);
	if (fds != NULL)
		free(fds, M_TEMP, nfds * sizeof(int));
	return (error);
}
787 
/*
 * Send side of descriptor passing: rewrite an SCM_RIGHTS control
 * message in place, replacing the sender's array of ints with struct
 * fdpass entries (file pointer + flags) and taking a reference on
 * each file.  The rewrite walks backwards because struct fdpass is
 * larger than an int and shares the same buffer.
 */
int
unp_internalize(struct mbuf *control, struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
	struct fdpass *rp;
	struct file *fp;
	struct unpcb *unp;
	int i, error;
	int nfds, *ip, fd, neededspace;

	/*
	 * Check for two potential msg_controllen values because
	 * IETF stuck their nose in a place it does not belong.
	 */
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
	    !(cm->cmsg_len == control->m_len ||
	    control->m_len == CMSG_ALIGN(cm->cmsg_len)))
		return (EINVAL);
	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);

	/* global sanity limit on descriptors in flight */
	if (unp_rights + nfds > maxfiles / 10)
		return (EMFILE);

	/* Make sure we have room for the struct file pointers */
morespace:
	neededspace = CMSG_SPACE(nfds * sizeof(struct fdpass)) -
	    control->m_len;
	if (neededspace > M_TRAILINGSPACE(control)) {
		char *tmp;
		/* if we already have a cluster, the message is just too big */
		if (control->m_flags & M_EXT)
			return (E2BIG);

		/* copy cmsg data temporarily out of the mbuf */
		tmp = malloc(control->m_len, M_TEMP, M_WAITOK);
		memcpy(tmp, mtod(control, caddr_t), control->m_len);

		/* allocate a cluster and try again */
		MCLGET(control, M_WAIT);
		if ((control->m_flags & M_EXT) == 0) {
			free(tmp, M_TEMP, control->m_len);
			return (ENOBUFS);       /* allocation failed */
		}

		/* copy the data back into the cluster */
		cm = mtod(control, struct cmsghdr *);
		memcpy(cm, tmp, control->m_len);
		free(tmp, M_TEMP, control->m_len);
		goto morespace;
	}

	/* adjust message & mbuf to note amount of space actually used. */
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(struct fdpass));
	control->m_len = CMSG_SPACE(nfds * sizeof(struct fdpass));

	/* start at the last int / last fdpass slot and walk backwards */
	ip = ((int *)CMSG_DATA(cm)) + nfds - 1;
	rp = ((struct fdpass *)CMSG_DATA(cm)) + nfds - 1;
	fdplock(fdp);
	for (i = 0; i < nfds; i++) {
		memcpy(&fd, ip, sizeof fd);
		ip--;
		if ((fp = fd_getfile(fdp, fd)) == NULL) {
			error = EBADF;
			goto fail;
		}
		if (fp->f_count >= FDUP_MAX_COUNT) {
			error = EDEADLK;
			goto fail;
		}
		error = pledge_sendfd(p, fp);
		if (error)
			goto fail;

		/* kqueue descriptors cannot be copied */
		if (fp->f_type == DTYPE_KQUEUE) {
			error = EINVAL;
			goto fail;
		}
		rp->fp = fp;
		rp->flags = fdp->fd_ofileflags[fd] & UF_PLEDGED;
		rp--;
		if ((unp = fptounp(fp)) != NULL) {
			unp->unp_file = fp;
			unp->unp_msgcount++;
		}
		unp_rights++;
	}
	fdpunlock(fdp);
	return (0);
fail:
	fdpunlock(fdp);
	/* drop the reference on the file that failed, if we got one */
	if (fp != NULL)
		FRELE(fp, p);
	/* Back out what we just did. */
	for ( ; i > 0; i--) {
		rp++;
		fp = rp->fp;
		if ((unp = fptounp(fp)) != NULL)
			unp->unp_msgcount--;
		FRELE(fp, p);
		unp_rights--;
	}

	return (error);
}
894 
895 int	unp_defer, unp_gcing;
896 
/*
 * Garbage collector for descriptors lost in cycles of in-flight
 * SCM_RIGHTS messages, run from `unp_gc_task'.  First closes any
 * files on the deferred list, then does a mark-and-sweep over all
 * Unix domain pcbs: a passed socket whose every file reference comes
 * from being in-flight (f_count == unp_msgcount) and which is only
 * reachable from other such sockets can never be received, so the
 * rights queued on it are discarded.
 */
void
unp_gc(void *arg __unused)
{
	struct unp_deferral *defer;
	struct file *fp;
	struct socket *so;
	struct unpcb *unp;
	int nunref, i;

	/* unp_gcing guards against re-entry */
	if (unp_gcing)
		return;
	unp_gcing = 1;

	/* close any fds on the deferred list */
	while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) {
		SLIST_REMOVE_HEAD(&unp_deferred, ud_link);
		for (i = 0; i < defer->ud_n; i++) {
			fp = defer->ud_fp[i].fp;
			if (fp == NULL)
				continue;
			 /* closef() expects a refcount of 2 */
			FREF(fp);
			if ((unp = fptounp(fp)) != NULL)
				unp->unp_msgcount--;
			unp_rights--;
			(void) closef(fp, NULL);
		}
		free(defer, M_TEMP, sizeof(*defer) +
		    sizeof(struct fdpass) * defer->ud_n);
	}

	/* mark phase: clear all GC flags, then iterate to a fixed point */
	unp_defer = 0;
	LIST_FOREACH(unp, &unp_head, unp_link)
		unp->unp_flags &= ~(UNP_GCMARK | UNP_GCDEFER | UNP_GCDEAD);
	do {
		nunref = 0;
		LIST_FOREACH(unp, &unp_head, unp_link) {
			fp = unp->unp_file;
			if (unp->unp_flags & UNP_GCDEFER) {
				/*
				 * This socket is referenced by another
				 * socket which is known to be live,
				 * so it's certainly live.
				 */
				unp->unp_flags &= ~UNP_GCDEFER;
				unp_defer--;
			} else if (unp->unp_flags & UNP_GCMARK) {
				/* marked as live in previous pass */
				continue;
			} else if (fp == NULL) {
				/* not being passed, so can't be in loop */
			} else if (fp->f_count == 0) {
				/*
				 * Already being closed, let normal close
				 * path take its course
				 */
			} else {
				/*
				 * Unreferenced by other sockets so far,
				 * so if all the references (f_count) are
				 * from passing (unp_msgcount) then this
				 * socket is prospectively dead
				 */
				if (fp->f_count == unp->unp_msgcount) {
					nunref++;
					unp->unp_flags |= UNP_GCDEAD;
					continue;
				}
			}

			/*
			 * This is the first time we've seen this socket on
			 * the mark pass and known it has a live reference,
			 * so mark it, then scan its receive buffer for
			 * sockets and note them as deferred (== referenced,
			 * but not yet marked).
			 */
			unp->unp_flags |= UNP_GCMARK;

			so = unp->unp_socket;
			unp_scan(so->so_rcv.sb_mb, unp_mark);
		}
	} while (unp_defer);

	/*
	 * If there are any unreferenced sockets, then for each dispose
	 * of files in its receive buffer and then close it.
	 */
	if (nunref) {
		LIST_FOREACH(unp, &unp_head, unp_link) {
			if (unp->unp_flags & UNP_GCDEAD)
				unp_scan(unp->unp_socket->so_rcv.sb_mb,
				    unp_discard);
		}
	}
	unp_gcing = 0;
}
994 
995 void
996 unp_dispose(struct mbuf *m)
997 {
998 
999 	if (m)
1000 		unp_scan(m, unp_discard);
1001 }
1002 
1003 void
1004 unp_scan(struct mbuf *m0, void (*op)(struct fdpass *, int))
1005 {
1006 	struct mbuf *m;
1007 	struct fdpass *rp;
1008 	struct cmsghdr *cm;
1009 	int qfds;
1010 
1011 	while (m0) {
1012 		for (m = m0; m; m = m->m_next) {
1013 			if (m->m_type == MT_CONTROL &&
1014 			    m->m_len >= sizeof(*cm)) {
1015 				cm = mtod(m, struct cmsghdr *);
1016 				if (cm->cmsg_level != SOL_SOCKET ||
1017 				    cm->cmsg_type != SCM_RIGHTS)
1018 					continue;
1019 				qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof *cm))
1020 				    / sizeof(struct fdpass);
1021 				if (qfds > 0) {
1022 					rp = (struct fdpass *)CMSG_DATA(cm);
1023 					op(rp, qfds);
1024 				}
1025 				break;		/* XXX, but saves time */
1026 			}
1027 		}
1028 		m0 = m0->m_nextpkt;
1029 	}
1030 }
1031 
1032 void
1033 unp_mark(struct fdpass *rp, int nfds)
1034 {
1035 	struct unpcb *unp;
1036 	int i;
1037 
1038 	for (i = 0; i < nfds; i++) {
1039 		if (rp[i].fp == NULL)
1040 			continue;
1041 
1042 		unp = fptounp(rp[i].fp);
1043 		if (unp == NULL)
1044 			continue;
1045 
1046 		if (unp->unp_flags & (UNP_GCMARK|UNP_GCDEFER))
1047 			continue;
1048 
1049 		unp_defer++;
1050 		unp->unp_flags |= UNP_GCDEFER;
1051 		unp->unp_flags &= ~UNP_GCDEAD;
1052 	}
1053 }
1054 
1055 void
1056 unp_discard(struct fdpass *rp, int nfds)
1057 {
1058 	struct unp_deferral *defer;
1059 
1060 	/* copy the file pointers to a deferral structure */
1061 	defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
1062 	defer->ud_n = nfds;
1063 	memcpy(&defer->ud_fp[0], rp, sizeof(*rp) * nfds);
1064 	memset(rp, 0, sizeof(*rp) * nfds);
1065 	SLIST_INSERT_HEAD(&unp_deferred, defer, ud_link);
1066 
1067 	task_add(systq, &unp_gc_task);
1068 }
1069 
1070 int
1071 unp_nam2sun(struct mbuf *nam, struct sockaddr_un **sun, size_t *pathlen)
1072 {
1073 	struct sockaddr *sa = mtod(nam, struct sockaddr *);
1074 	size_t size, len;
1075 
1076 	if (nam->m_len < offsetof(struct sockaddr, sa_data))
1077 		return EINVAL;
1078 	if (sa->sa_family != AF_UNIX)
1079 		return EAFNOSUPPORT;
1080 	if (sa->sa_len != nam->m_len)
1081 		return EINVAL;
1082 	if (sa->sa_len > sizeof(struct sockaddr_un))
1083 		return EINVAL;
1084 	*sun = (struct sockaddr_un *)sa;
1085 
1086 	/* ensure that sun_path is NUL terminated and fits */
1087 	size = (*sun)->sun_len - offsetof(struct sockaddr_un, sun_path);
1088 	len = strnlen((*sun)->sun_path, size);
1089 	if (len == sizeof((*sun)->sun_path))
1090 		return EINVAL;
1091 	if (len == size) {
1092 		if (M_TRAILINGSPACE(nam) == 0)
1093 			return EINVAL;
1094 		nam->m_len++;
1095 		(*sun)->sun_len++;
1096 		(*sun)->sun_path[len] = '\0';
1097 	}
1098 	if (pathlen != NULL)
1099 		*pathlen = len;
1100 
1101 	return 0;
1102 }
1103