xref: /openbsd-src/sys/kern/uipc_usrreq.c (revision 9f11ffb7133c203312a01e4b986886bc88c7d74b)
1 /*	$OpenBSD: uipc_usrreq.c,v 1.139 2019/02/13 11:55:21 martijn Exp $	*/
2 /*	$NetBSD: uipc_usrreq.c,v 1.18 1996/02/09 19:00:50 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1989, 1991, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 #include <sys/filedesc.h>
39 #include <sys/domain.h>
40 #include <sys/protosw.h>
41 #include <sys/queue.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/unpcb.h>
45 #include <sys/un.h>
46 #include <sys/namei.h>
47 #include <sys/vnode.h>
48 #include <sys/file.h>
49 #include <sys/stat.h>
50 #include <sys/mbuf.h>
51 #include <sys/task.h>
52 #include <sys/pledge.h>
53 
/* Copy a PCB's bound address (or sun_noname) into a caller-held mbuf. */
void	uipc_setaddr(const struct unpcb *, struct mbuf *);

/* list of all UNIX domain sockets, for unp_gc() */
LIST_HEAD(unp_head, unpcb) unp_head = LIST_HEAD_INITIALIZER(unp_head);

/*
 * Stack of sets of files that were passed over a socket but were
 * not received and need to be closed.
 */
struct	unp_deferral {
	SLIST_ENTRY(unp_deferral)	ud_link;	/* link on unp_deferred */
	int	ud_n;				/* number of entries in ud_fp[] */
	/* followed by ud_n struct fdpass */
	struct fdpass ud_fp[];
};

void	unp_discard(struct fdpass *, int);
void	unp_mark(struct fdpass *, int);
void	unp_scan(struct mbuf *, void (*)(struct fdpass *, int));
int	unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *);

/* list of sets of files that were sent over sockets that are now closed */
SLIST_HEAD(,unp_deferral) unp_deferred = SLIST_HEAD_INITIALIZER(unp_deferred);

/* deferred-close/GC work, run from the system task queue */
struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL);


/*
 * Unix communications domain.
 *
 * TODO:
 *	RDM
 *	rethink name space problems
 *	need a proper out-of-band
 */
struct	sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };
ino_t	unp_ino;			/* prototype for fake inode numbers */
91 
92 void
93 uipc_setaddr(const struct unpcb *unp, struct mbuf *nam)
94 {
95 	if (unp != NULL && unp->unp_addr != NULL) {
96 		nam->m_len = unp->unp_addr->m_len;
97 		memcpy(mtod(nam, caddr_t), mtod(unp->unp_addr, caddr_t),
98 		    nam->m_len);
99 	} else {
100 		nam->m_len = sizeof(sun_noname);
101 		memcpy(mtod(nam, struct sockaddr *), &sun_noname,
102 		    nam->m_len);
103 	}
104 }
105 
/*
 * Protocol user-request handler for unix-domain sockets.  Dispatches
 * on the PRU_* code in @req; @m carries data, @nam an address or peer
 * socket, @control ancillary data (SCM_RIGHTS), @p the calling process.
 * Returns 0 or an errno.  Unless the request is one that borrows the
 * mbufs (PRU_RCVD/PRU_RCVOOB/PRU_SENSE), @m and @control are consumed
 * here on exit.
 */
int
uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control, struct proc *p)
{
	struct unpcb *unp = sotounpcb(so);
	struct unpcb *unp2;
	struct socket *so2;
	int error = 0;

	if (req == PRU_CONTROL)
		return (EOPNOTSUPP);
	/* Control data is only meaningful on PRU_SEND. */
	if (req != PRU_SEND && control && control->m_len) {
		error = EOPNOTSUPP;
		goto release;
	}
	if (unp == NULL) {
		error = EINVAL;
		goto release;
	}

	/* Unix sockets run under the kernel lock, not the net lock. */
	NET_ASSERT_UNLOCKED();

	switch (req) {

	case PRU_BIND:
		error = unp_bind(unp, nam, p);
		break;

	case PRU_LISTEN:
		/* Listening requires a prior bind to a filesystem node. */
		if (unp->unp_vnode == NULL)
			error = EINVAL;
		break;

	case PRU_CONNECT:
		error = unp_connect(so, nam, p);
		break;

	case PRU_CONNECT2:
		/* socketpair(2): @nam is actually the second socket. */
		error = unp_connect2(so, (struct socket *)nam);
		if (!error) {
			/* Stamp both ends with the creator's credentials. */
			unp->unp_connid.uid = p->p_ucred->cr_uid;
			unp->unp_connid.gid = p->p_ucred->cr_gid;
			unp->unp_connid.pid = p->p_p->ps_pid;
			unp->unp_flags |= UNP_FEIDS;
			unp2 = sotounpcb((struct socket *)nam);
			unp2->unp_connid.uid = p->p_ucred->cr_uid;
			unp2->unp_connid.gid = p->p_ucred->cr_gid;
			unp2->unp_connid.pid = p->p_p->ps_pid;
			unp2->unp_flags |= UNP_FEIDS;
		}
		break;

	case PRU_DISCONNECT:
		unp_disconnect(unp);
		break;

	case PRU_ACCEPT:
		/*
		 * Pass back name of connected socket,
		 * if it was bound and we are still connected
		 * (our peer may have closed already!).
		 */
		uipc_setaddr(unp->unp_conn, nam);
		break;

	case PRU_SHUTDOWN:
		socantsendmore(so);
		unp_shutdown(unp);
		break;

	case PRU_RCVD:
		switch (so->so_type) {

		case SOCK_DGRAM:
			/* datagram sockets never generate PRU_RCVD */
			panic("uipc 1");
			/*NOTREACHED*/

		case SOCK_STREAM:
		case SOCK_SEQPACKET:
			if (unp->unp_conn == NULL)
				break;
			so2 = unp->unp_conn->unp_socket;
			/*
			 * Adjust backpressure on sender
			 * and wakeup any waiting to write.
			 */
			so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
			so2->so_snd.sb_cc = so->so_rcv.sb_cc;
			sowwakeup(so2);
			break;

		default:
			panic("uipc 2");
		}
		break;

	case PRU_SEND:
		/* Convert fd numbers in @control into file pointers. */
		if (control && (error = unp_internalize(control, p)))
			break;
		switch (so->so_type) {

		case SOCK_DGRAM: {
			struct sockaddr *from;

			if (nam) {
				/* sendto(2): temporary connect is required. */
				if (unp->unp_conn) {
					error = EISCONN;
					break;
				}
				error = unp_connect(so, nam, p);
				if (error)
					break;
			} else {
				if (unp->unp_conn == NULL) {
					error = ENOTCONN;
					break;
				}
			}
			so2 = unp->unp_conn->unp_socket;
			if (unp->unp_addr)
				from = mtod(unp->unp_addr, struct sockaddr *);
			else
				from = &sun_noname;
			/* On success the receive buffer owns m and control. */
			if (sbappendaddr(so2, &so2->so_rcv, from, m, control)) {
				sorwakeup(so2);
				m = NULL;
				control = NULL;
			} else
				error = ENOBUFS;
			/* Undo the temporary connect from sendto(2). */
			if (nam)
				unp_disconnect(unp);
			break;
		}

		case SOCK_STREAM:
		case SOCK_SEQPACKET:
			if (so->so_state & SS_CANTSENDMORE) {
				error = EPIPE;
				break;
			}
			if (unp->unp_conn == NULL) {
				error = ENOTCONN;
				break;
			}
			so2 = unp->unp_conn->unp_socket;
			/*
			 * Send to paired receive port, and then raise
			 * send buffer counts to maintain backpressure.
			 * Wake up readers.
			 */
			if (control) {
				if (sbappendcontrol(so2, &so2->so_rcv, m,
				    control)) {
					control = NULL;
				} else {
					error = ENOBUFS;
					break;
				}
			} else if (so->so_type == SOCK_SEQPACKET)
				sbappendrecord(so2, &so2->so_rcv, m);
			else
				sbappend(so2, &so2->so_rcv, m);
			so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
			so->so_snd.sb_cc = so2->so_rcv.sb_cc;
			sorwakeup(so2);
			m = NULL;
			break;

		default:
			panic("uipc 4");
		}
		/* we need to undo unp_internalize in case of errors */
		if (control && error)
			unp_dispose(control);
		break;

	case PRU_ABORT:
		unp_drop(unp, ECONNABORTED);
		break;

	case PRU_SENSE: {
		/* fstat(2) on a socket: @m is really a struct stat. */
		struct stat *sb = (struct stat *)m;

		sb->st_blksize = so->so_snd.sb_hiwat;
		sb->st_dev = NODEV;
		/* Lazily assign a fake inode number for this socket. */
		if (unp->unp_ino == 0)
			unp->unp_ino = unp_ino++;
		sb->st_atim.tv_sec =
		    sb->st_mtim.tv_sec =
		    sb->st_ctim.tv_sec = unp->unp_ctime.tv_sec;
		sb->st_atim.tv_nsec =
		    sb->st_mtim.tv_nsec =
		    sb->st_ctim.tv_nsec = unp->unp_ctime.tv_nsec;
		sb->st_ino = unp->unp_ino;
		break;
	}

	case PRU_RCVOOB:
	case PRU_SENDOOB:
		/* no out-of-band data on unix sockets */
		error = EOPNOTSUPP;
		break;

	case PRU_SOCKADDR:
		uipc_setaddr(unp, nam);
		break;

	case PRU_PEERADDR:
		uipc_setaddr(unp->unp_conn, nam);
		break;

	case PRU_SLOWTIMO:
		break;

	default:
		panic("uipc_usrreq");
	}
release:
	/* These requests borrow m/control; everything else consumes them. */
	if (req != PRU_RCVD && req != PRU_RCVOOB && req != PRU_SENSE) {
		m_freem(control);
		m_freem(m);
	}
	return (error);
}
329 
/*
 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
 * for stream sockets, although the total for sender and receiver is
 * actually only PIPSIZ.
 * Datagram sockets really use the sendspace as the maximum datagram size,
 * and don't really want to reserve the sendspace.  Their recvspace should
 * be large enough for at least one max-size datagram plus address.
 */
#define	PIPSIZ	4096
u_long	unpst_sendspace = PIPSIZ;	/* stream/seqpacket send space */
u_long	unpst_recvspace = PIPSIZ;	/* stream/seqpacket receive space */
u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
u_long	unpdg_recvspace = 4*1024;

int	unp_rights;			/* file descriptors in flight */
345 
346 int
347 uipc_attach(struct socket *so, int proto)
348 {
349 	struct unpcb *unp;
350 	int error;
351 
352 	if (so->so_pcb)
353 		return EISCONN;
354 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
355 		switch (so->so_type) {
356 
357 		case SOCK_STREAM:
358 		case SOCK_SEQPACKET:
359 			error = soreserve(so, unpst_sendspace, unpst_recvspace);
360 			break;
361 
362 		case SOCK_DGRAM:
363 			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
364 			break;
365 
366 		default:
367 			panic("unp_attach");
368 		}
369 		if (error)
370 			return (error);
371 	}
372 	unp = malloc(sizeof(*unp), M_PCB, M_NOWAIT|M_ZERO);
373 	if (unp == NULL)
374 		return (ENOBUFS);
375 	unp->unp_socket = so;
376 	so->so_pcb = unp;
377 	getnanotime(&unp->unp_ctime);
378 	LIST_INSERT_HEAD(&unp_head, unp, unp_link);
379 	return (0);
380 }
381 
382 int
383 uipc_detach(struct socket *so)
384 {
385 	struct unpcb *unp = sotounpcb(so);
386 
387 	if (unp == NULL)
388 		return (EINVAL);
389 
390 	NET_ASSERT_UNLOCKED();
391 
392 	unp_detach(unp);
393 
394 	return (0);
395 }
396 
/*
 * Tear down PCB @unp: unlink it from the global list, release any
 * bound vnode, break the connection, reset datagram peers that still
 * reference us, and free the PCB.  If descriptors are still in flight
 * the GC task is kicked, since this close may have broken a cycle.
 */
void
unp_detach(struct unpcb *unp)
{
	struct vnode *vp;

	LIST_REMOVE(unp, unp_link);
	if (unp->unp_vnode) {
		/* Detach from the bound vnode before dropping our ref. */
		unp->unp_vnode->v_socket = NULL;
		vp = unp->unp_vnode;
		unp->unp_vnode = NULL;
		vrele(vp);
	}
	if (unp->unp_conn)
		unp_disconnect(unp);
	/* Reset every datagram socket still connected to us. */
	while (!SLIST_EMPTY(&unp->unp_refs))
		unp_drop(SLIST_FIRST(&unp->unp_refs), ECONNRESET);
	soisdisconnected(unp->unp_socket);
	unp->unp_socket->so_pcb = NULL;
	m_freem(unp->unp_addr);
	free(unp, M_PCB, sizeof *unp);
	if (unp_rights)
		task_add(systq, &unp_gc_task);
}
420 
/*
 * Bind @unp to the filesystem path in @nam: validate the sockaddr_un,
 * take a private NUL-terminated copy of it, create the VSOCK vnode via
 * namei/VOP_CREATE, and wire socket and vnode together.  Also stamps
 * the binder's credentials for later LOCAL_PEERCRED-style retrieval.
 * Returns 0 or an errno (EADDRINUSE when the path already exists).
 */
int
unp_bind(struct unpcb *unp, struct mbuf *nam, struct proc *p)
{
	struct sockaddr_un *soun;
	struct mbuf *nam2;
	struct vnode *vp;
	struct vattr vattr;
	int error;
	struct nameidata nd;
	size_t pathlen;

	/* Only one bind per socket. */
	if (unp->unp_vnode != NULL)
		return (EINVAL);
	if ((error = unp_nam2sun(nam, &soun, &pathlen)))
		return (error);

	/* Keep our own full-size copy of the address. */
	nam2 = m_getclr(M_WAITOK, MT_SONAME);
	nam2->m_len = sizeof(struct sockaddr_un);
	memcpy(mtod(nam2, struct sockaddr_un *), soun,
	    offsetof(struct sockaddr_un, sun_path) + pathlen);
	/* No need to NUL terminate: m_getclr() returns zero'd mbufs. */

	soun = mtod(nam2, struct sockaddr_un *);

	/* Fixup sun_len to keep it in sync with m_len. */
	soun->sun_len = nam2->m_len;

	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
	    soun->sun_path, p);
	nd.ni_pledge = PLEDGE_UNIX;
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	if ((error = namei(&nd)) != 0) {
		m_freem(nam2);
		return (error);
	}
	vp = nd.ni_vp;
	if (vp != NULL) {
		/* Path exists: abort the create and release namei refs. */
		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		m_freem(nam2);
		return (EADDRINUSE);
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	/* Honor the process umask, as for a regular file create. */
	vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask;
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	vput(nd.ni_dvp);
	if (error) {
		m_freem(nam2);
		return (error);
	}
	unp->unp_addr = nam2;
	vp = nd.ni_vp;
	vp->v_socket = unp->unp_socket;
	unp->unp_vnode = vp;
	unp->unp_connid.uid = p->p_ucred->cr_uid;
	unp->unp_connid.gid = p->p_ucred->cr_gid;
	unp->unp_connid.pid = p->p_p->ps_pid;
	unp->unp_flags |= UNP_FEIDSBIND;
	VOP_UNLOCK(vp);
	return (0);
}
487 
/*
 * Connect socket @so to the unix-domain socket bound at the path in
 * @nam.  Looks the path up via namei, checks write access, and for
 * connection-oriented types spawns a server-side socket with
 * sonewconn() and exchanges credentials before joining the pair with
 * unp_connect2().  Returns 0 or an errno.
 */
int
unp_connect(struct socket *so, struct mbuf *nam, struct proc *p)
{
	struct sockaddr_un *soun;
	struct vnode *vp;
	struct socket *so2, *so3;
	struct unpcb *unp, *unp2, *unp3;
	struct nameidata nd;
	int error;

	if ((error = unp_nam2sun(nam, &soun, NULL)))
		return (error);

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p);
	nd.ni_pledge = PLEDGE_UNIX;
	if ((error = namei(&nd)) != 0)
		return (error);
	vp = nd.ni_vp;
	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto bad;
	}
	/* Connecting requires write permission on the socket node. */
	if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0)
		goto bad;
	so2 = vp->v_socket;
	if (so2 == NULL) {
		/* Node exists but no socket is bound to it anymore. */
		error = ECONNREFUSED;
		goto bad;
	}
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto bad;
	}
	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		/* Listener must accept connections and have queue room. */
		if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
		    (so3 = sonewconn(so2, 0)) == 0) {
			error = ECONNREFUSED;
			goto bad;
		}
		unp = sotounpcb(so);
		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);
		/* Give the new server socket the listener's address. */
		if (unp2->unp_addr)
			unp3->unp_addr =
			    m_copym(unp2->unp_addr, 0, M_COPYALL, M_NOWAIT);
		/* Server side learns the connecting process's identity. */
		unp3->unp_connid.uid = p->p_ucred->cr_uid;
		unp3->unp_connid.gid = p->p_ucred->cr_gid;
		unp3->unp_connid.pid = p->p_p->ps_pid;
		unp3->unp_flags |= UNP_FEIDS;
		/* From here on we connect to the spawned socket. */
		so2 = so3;
		if (unp2->unp_flags & UNP_FEIDSBIND) {
			/* Client side learns the binder's identity. */
			unp->unp_connid = unp2->unp_connid;
			unp->unp_flags |= UNP_FEIDS;
		}
	}
	error = unp_connect2(so, so2);
bad:
	vput(vp);
	return (error);
}
548 
549 int
550 unp_connect2(struct socket *so, struct socket *so2)
551 {
552 	struct unpcb *unp = sotounpcb(so);
553 	struct unpcb *unp2;
554 
555 	if (so2->so_type != so->so_type)
556 		return (EPROTOTYPE);
557 	unp2 = sotounpcb(so2);
558 	unp->unp_conn = unp2;
559 	switch (so->so_type) {
560 
561 	case SOCK_DGRAM:
562 		SLIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_nextref);
563 		soisconnected(so);
564 		break;
565 
566 	case SOCK_STREAM:
567 	case SOCK_SEQPACKET:
568 		unp2->unp_conn = unp;
569 		soisconnected(so);
570 		soisconnected(so2);
571 		break;
572 
573 	default:
574 		panic("unp_connect2");
575 	}
576 	return (0);
577 }
578 
/*
 * Break the connection of @unp to its peer.  For datagram sockets we
 * only remove ourselves from the peer's reference list; for stream
 * and seqpacket sockets both ends are marked disconnected and their
 * send-side backpressure counters are reset.
 */
void
unp_disconnect(struct unpcb *unp)
{
	struct unpcb *unp2 = unp->unp_conn;

	if (unp2 == NULL)
		return;
	unp->unp_conn = NULL;
	switch (unp->unp_socket->so_type) {

	case SOCK_DGRAM:
		SLIST_REMOVE(&unp2->unp_refs, unp, unpcb, unp_nextref);
		unp->unp_socket->so_state &= ~SS_ISCONNECTED;
		break;

	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		/* Clear the mirrored backpressure counters on both ends. */
		unp->unp_socket->so_snd.sb_mbcnt = 0;
		unp->unp_socket->so_snd.sb_cc = 0;
		soisdisconnected(unp->unp_socket);
		unp2->unp_conn = NULL;
		unp2->unp_socket->so_snd.sb_mbcnt = 0;
		unp2->unp_socket->so_snd.sb_cc = 0;
		soisdisconnected(unp2->unp_socket);
		break;
	}
}
606 
607 void
608 unp_shutdown(struct unpcb *unp)
609 {
610 	struct socket *so;
611 
612 	switch (unp->unp_socket->so_type) {
613 	case SOCK_STREAM:
614 	case SOCK_SEQPACKET:
615 		if (unp->unp_conn && (so = unp->unp_conn->unp_socket))
616 			socantrcvmore(so);
617 		break;
618 	default:
619 		break;
620 	}
621 }
622 
/*
 * Abort the connection of @unp with error @errno.  If the socket is
 * still on a listener's queue (so_head set) it can never be accepted,
 * so release the socket and free the PCB here.
 */
void
unp_drop(struct unpcb *unp, int errno)
{
	struct socket *so = unp->unp_socket;

	KERNEL_ASSERT_LOCKED();

	so->so_error = errno;
	unp_disconnect(unp);
	if (so->so_head) {
		so->so_pcb = NULL;
		/*
		 * As long as the KERNEL_LOCK() is the default lock for Unix
		 * sockets, do not release it.
		 */
		sofree(so, SL_NOUNLOCK);
		m_freem(unp->unp_addr);
		free(unp, M_PCB, sizeof *unp);
	}
}
643 
#ifdef notdef
/* Never compiled: placeholder for a drain hook that was not implemented. */
unp_drain(void)
{

}
#endif

/* The unix protocol domain; used to recognize unix sockets in fptounp(). */
extern	struct domain unixdomain;
652 
653 static struct unpcb *
654 fptounp(struct file *fp)
655 {
656 	struct socket *so;
657 
658 	if (fp->f_type != DTYPE_SOCKET)
659 		return (NULL);
660 	if ((so = fp->f_data) == NULL)
661 		return (NULL);
662 	if (so->so_proto->pr_domain != &unixdomain)
663 		return (NULL);
664 	return (sotounpcb(so));
665 }
666 
/*
 * Convert an SCM_RIGHTS control message from the internal form (an
 * array of struct fdpass holding file pointers) into the user-visible
 * form (an array of ints), allocating file descriptor slots in the
 * receiving process.  On any error all passed files are discarded via
 * unp_discard() so they are eventually closed by the GC task.
 * Returns 0 or an errno.
 */
int
unp_externalize(struct mbuf *rights, socklen_t controllen, int flags)
{
	struct proc *p = curproc;		/* XXX */
	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
	struct filedesc *fdp = p->p_fd;
	int i, *fds = NULL;
	struct fdpass *rp;
	struct file *fp;
	int nfds, error = 0;

	/*
	 * This code only works because SCM_RIGHTS is the only supported
	 * control message type on unix sockets. Enforce this here.
	 */
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET)
		return EINVAL;

	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
	    sizeof(struct fdpass);
	if (controllen < CMSG_ALIGN(sizeof(struct cmsghdr)))
		controllen = 0;
	else
		controllen -= CMSG_ALIGN(sizeof(struct cmsghdr));
	/* Receiver's buffer too small for all the fds: discard them all. */
	if (nfds > controllen / sizeof(int)) {
		error = EMSGSIZE;
		goto restart;
	}

	/* Make sure the recipient should be able to see the descriptors.. */
	rp = (struct fdpass *)CMSG_DATA(cm);
	for (i = 0; i < nfds; i++) {
		fp = rp->fp;
		rp++;
		error = pledge_recvfd(p, fp);
		if (error)
			break;

		/*
		 * No to block devices.  If passing a directory,
		 * make sure that it is underneath the root.
		 */
		if (fdp->fd_rdir != NULL && fp->f_type == DTYPE_VNODE) {
			struct vnode *vp = (struct vnode *)fp->f_data;

			if (vp->v_type == VBLK ||
			    (vp->v_type == VDIR &&
			    !vn_isunder(vp, fdp->fd_rdir, p))) {
				error = EPERM;
				break;
			}
		}
	}

	fds = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK);

restart:
	fdplock(fdp);
	if (error != 0) {
		/* Drop all passed files; the GC task closes them later. */
		if (nfds > 0) {
			rp = ((struct fdpass *)CMSG_DATA(cm));
			unp_discard(rp, nfds);
		}
		goto out;
	}

	/*
	 * First loop -- allocate file descriptor table slots for the
	 * new descriptors.
	 */
	rp = ((struct fdpass *)CMSG_DATA(cm));
	for (i = 0; i < nfds; i++) {
		if ((error = fdalloc(p, 0, &fds[i])) != 0) {
			/*
			 * Back out what we've done so far.
			 */
			for (--i; i >= 0; i--)
				fdremove(fdp, fds[i]);

			if (error == ENOSPC) {
				/* Grow the table and retry from the top. */
				fdexpand(p);
				error = 0;
			} else {
				/*
				 * This is the error that has historically
				 * been returned, and some callers may
				 * expect it.
				 */
				error = EMSGSIZE;
			}
			fdpunlock(fdp);
			goto restart;
		}

		/*
		 * Make the slot reference the descriptor so that
		 * fdalloc() works properly.. We finalize it all
		 * in the loop below.
		 */
		mtx_enter(&fdp->fd_fplock);
		KASSERT(fdp->fd_ofiles[fds[i]] == NULL);
		fdp->fd_ofiles[fds[i]] = rp->fp;
		mtx_leave(&fdp->fd_fplock);

		/* Carry the pledged flag; honor MSG_CMSG_CLOEXEC. */
		fdp->fd_ofileflags[fds[i]] = (rp->flags & UF_PLEDGED);
		if (flags & MSG_CMSG_CLOEXEC)
			fdp->fd_ofileflags[fds[i]] |= UF_EXCLOSE;

		rp++;
	}

	/*
	 * Now that adding them has succeeded, update all of the
	 * descriptor passing state.
	 */
	rp = (struct fdpass *)CMSG_DATA(cm);
	for (i = 0; i < nfds; i++) {
		struct unpcb *unp;

		fp = rp->fp;
		rp++;
		if ((unp = fptounp(fp)) != NULL)
			unp->unp_msgcount--;
		unp_rights--;
	}

	/*
	 * Copy temporary array to message and adjust length, in case of
	 * transition from large struct file pointers to ints.
	 */
	memcpy(CMSG_DATA(cm), fds, nfds * sizeof(int));
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
	rights->m_len = CMSG_LEN(nfds * sizeof(int));
 out:
	fdpunlock(fdp);
	if (fds != NULL)
		free(fds, M_TEMP, nfds * sizeof(int));
	return (error);
}
806 
/*
 * Convert an SCM_RIGHTS control message from the user form (array of
 * ints) into the internal form (array of struct fdpass holding file
 * pointers with a reference each), expanding the mbuf in place since
 * fdpass entries are larger than ints.  On failure all references
 * taken so far are dropped.  Returns 0 or an errno.
 */
int
unp_internalize(struct mbuf *control, struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
	struct fdpass *rp;
	struct file *fp;
	struct unpcb *unp;
	int i, error;
	int nfds, *ip, fd, neededspace;

	/*
	 * Check for two potential msg_controllen values because
	 * IETF stuck their nose in a place it does not belong.
	 */
	if (control->m_len < CMSG_LEN(0) || cm->cmsg_len < CMSG_LEN(0))
		return (EINVAL);
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
	    !(cm->cmsg_len == control->m_len ||
	    control->m_len == CMSG_ALIGN(cm->cmsg_len)))
		return (EINVAL);
	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);

	/* Cap the total number of in-flight descriptors system-wide. */
	if (unp_rights + nfds > maxfiles / 10)
		return (EMFILE);

	/* Make sure we have room for the struct file pointers */
morespace:
	neededspace = CMSG_SPACE(nfds * sizeof(struct fdpass)) -
	    control->m_len;
	if (neededspace > m_trailingspace(control)) {
		char *tmp;
		/* if we already have a cluster, the message is just too big */
		if (control->m_flags & M_EXT)
			return (E2BIG);

		/* copy cmsg data temporarily out of the mbuf */
		tmp = malloc(control->m_len, M_TEMP, M_WAITOK);
		memcpy(tmp, mtod(control, caddr_t), control->m_len);

		/* allocate a cluster and try again */
		MCLGET(control, M_WAIT);
		if ((control->m_flags & M_EXT) == 0) {
			free(tmp, M_TEMP, control->m_len);
			return (ENOBUFS);       /* allocation failed */
		}

		/* copy the data back into the cluster */
		cm = mtod(control, struct cmsghdr *);
		memcpy(cm, tmp, control->m_len);
		free(tmp, M_TEMP, control->m_len);
		goto morespace;
	}

	/* adjust message & mbuf to note amount of space actually used. */
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(struct fdpass));
	control->m_len = CMSG_SPACE(nfds * sizeof(struct fdpass));

	/*
	 * Walk both arrays back to front so the in-place expansion from
	 * ints to the larger fdpass entries never overwrites unread fds.
	 */
	ip = ((int *)CMSG_DATA(cm)) + nfds - 1;
	rp = ((struct fdpass *)CMSG_DATA(cm)) + nfds - 1;
	fdplock(fdp);
	for (i = 0; i < nfds; i++) {
		memcpy(&fd, ip, sizeof fd);
		ip--;
		if ((fp = fd_getfile(fdp, fd)) == NULL) {
			error = EBADF;
			goto fail;
		}
		if (fp->f_count >= FDUP_MAX_COUNT) {
			error = EDEADLK;
			goto fail;
		}
		error = pledge_sendfd(p, fp);
		if (error)
			goto fail;

		/* kqueue descriptors cannot be copied */
		if (fp->f_type == DTYPE_KQUEUE) {
			error = EINVAL;
			goto fail;
		}
		rp->fp = fp;
		rp->flags = fdp->fd_ofileflags[fd] & UF_PLEDGED;
		rp--;
		if ((unp = fptounp(fp)) != NULL) {
			/* Track in-flight unix sockets for the GC. */
			unp->unp_file = fp;
			unp->unp_msgcount++;
		}
		unp_rights++;
	}
	fdpunlock(fdp);
	return (0);
fail:
	fdpunlock(fdp);
	/* Release the file that failed the checks above, if any. */
	if (fp != NULL)
		FRELE(fp, p);
	/* Back out what we just did. */
	for ( ; i > 0; i--) {
		rp++;
		fp = rp->fp;
		if ((unp = fptounp(fp)) != NULL)
			unp->unp_msgcount--;
		FRELE(fp, p);
		unp_rights--;
	}

	return (error);
}
915 
int	unp_defer;	/* sockets newly marked live, needing another pass */
int	unp_gcing;	/* reentrancy guard for unp_gc() */

/*
 * Garbage-collect unix-domain sockets whose only remaining references
 * are SCM_RIGHTS messages sitting in socket buffers (reference cycles).
 * First closes any files on the deferred-discard list, then runs a
 * mark pass over all unix PCBs until it converges, and finally
 * discards the in-flight files of every socket left unmarked (dead).
 * Runs from the system task queue.
 */
void
unp_gc(void *arg __unused)
{
	struct unp_deferral *defer;
	struct file *fp;
	struct socket *so;
	struct unpcb *unp;
	int nunref, i;

	if (unp_gcing)
		return;
	unp_gcing = 1;

	/* close any fds on the deferred list */
	while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) {
		SLIST_REMOVE_HEAD(&unp_deferred, ud_link);
		for (i = 0; i < defer->ud_n; i++) {
			fp = defer->ud_fp[i].fp;
			if (fp == NULL)
				continue;
			 /* closef() expects a refcount of 2 */
			FREF(fp);
			if ((unp = fptounp(fp)) != NULL)
				unp->unp_msgcount--;
			unp_rights--;
			(void) closef(fp, NULL);
		}
		free(defer, M_TEMP, sizeof(*defer) +
		    sizeof(struct fdpass) * defer->ud_n);
	}

	/* Mark pass: clear marks, then iterate to a fixed point. */
	unp_defer = 0;
	LIST_FOREACH(unp, &unp_head, unp_link)
		unp->unp_flags &= ~(UNP_GCMARK | UNP_GCDEFER | UNP_GCDEAD);
	do {
		nunref = 0;
		LIST_FOREACH(unp, &unp_head, unp_link) {
			fp = unp->unp_file;
			if (unp->unp_flags & UNP_GCDEFER) {
				/*
				 * This socket is referenced by another
				 * socket which is known to be live,
				 * so it's certainly live.
				 */
				unp->unp_flags &= ~UNP_GCDEFER;
				unp_defer--;
			} else if (unp->unp_flags & UNP_GCMARK) {
				/* marked as live in previous pass */
				continue;
			} else if (fp == NULL) {
				/* not being passed, so can't be in loop */
			} else if (fp->f_count == 0) {
				/*
				 * Already being closed, let normal close
				 * path take its course
				 */
			} else {
				/*
				 * Unreferenced by other sockets so far,
				 * so if all the references (f_count) are
				 * from passing (unp_msgcount) then this
				 * socket is prospectively dead
				 */
				if (fp->f_count == unp->unp_msgcount) {
					nunref++;
					unp->unp_flags |= UNP_GCDEAD;
					continue;
				}
			}

			/*
			 * This is the first time we've seen this socket on
			 * the mark pass and known it has a live reference,
			 * so mark it, then scan its receive buffer for
			 * sockets and note them as deferred (== referenced,
			 * but not yet marked).
			 */
			unp->unp_flags |= UNP_GCMARK;

			so = unp->unp_socket;
			unp_scan(so->so_rcv.sb_mb, unp_mark);
		}
	} while (unp_defer);

	/*
	 * If there are any unreferenced sockets, then for each dispose
	 * of files in its receive buffer and then close it.
	 */
	if (nunref) {
		LIST_FOREACH(unp, &unp_head, unp_link) {
			if (unp->unp_flags & UNP_GCDEAD)
				unp_scan(unp->unp_socket->so_rcv.sb_mb,
				    unp_discard);
		}
	}
	unp_gcing = 0;
}
1015 
1016 void
1017 unp_dispose(struct mbuf *m)
1018 {
1019 
1020 	if (m)
1021 		unp_scan(m, unp_discard);
1022 }
1023 
/*
 * Walk every packet in the mbuf chain @m0 looking for SCM_RIGHTS
 * control messages, and apply @op (unp_mark or unp_discard) to the
 * fdpass array of the first such message in each packet.
 */
void
unp_scan(struct mbuf *m0, void (*op)(struct fdpass *, int))
{
	struct mbuf *m;
	struct fdpass *rp;
	struct cmsghdr *cm;
	int qfds;

	while (m0) {
		for (m = m0; m; m = m->m_next) {
			if (m->m_type == MT_CONTROL &&
			    m->m_len >= sizeof(*cm)) {
				cm = mtod(m, struct cmsghdr *);
				/* only SCM_RIGHTS carries file pointers */
				if (cm->cmsg_level != SOL_SOCKET ||
				    cm->cmsg_type != SCM_RIGHTS)
					continue;
				qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof *cm))
				    / sizeof(struct fdpass);
				if (qfds > 0) {
					rp = (struct fdpass *)CMSG_DATA(cm);
					op(rp, qfds);
				}
				break;		/* XXX, but saves time */
			}
		}
		m0 = m0->m_nextpkt;
	}
}
1052 
1053 void
1054 unp_mark(struct fdpass *rp, int nfds)
1055 {
1056 	struct unpcb *unp;
1057 	int i;
1058 
1059 	for (i = 0; i < nfds; i++) {
1060 		if (rp[i].fp == NULL)
1061 			continue;
1062 
1063 		unp = fptounp(rp[i].fp);
1064 		if (unp == NULL)
1065 			continue;
1066 
1067 		if (unp->unp_flags & (UNP_GCMARK|UNP_GCDEFER))
1068 			continue;
1069 
1070 		unp_defer++;
1071 		unp->unp_flags |= UNP_GCDEFER;
1072 		unp->unp_flags &= ~UNP_GCDEAD;
1073 	}
1074 }
1075 
1076 void
1077 unp_discard(struct fdpass *rp, int nfds)
1078 {
1079 	struct unp_deferral *defer;
1080 
1081 	/* copy the file pointers to a deferral structure */
1082 	defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
1083 	defer->ud_n = nfds;
1084 	memcpy(&defer->ud_fp[0], rp, sizeof(*rp) * nfds);
1085 	memset(rp, 0, sizeof(*rp) * nfds);
1086 	SLIST_INSERT_HEAD(&unp_deferred, defer, ud_link);
1087 
1088 	task_add(systq, &unp_gc_task);
1089 }
1090 
1091 int
1092 unp_nam2sun(struct mbuf *nam, struct sockaddr_un **sun, size_t *pathlen)
1093 {
1094 	struct sockaddr *sa = mtod(nam, struct sockaddr *);
1095 	size_t size, len;
1096 
1097 	if (nam->m_len < offsetof(struct sockaddr, sa_data))
1098 		return EINVAL;
1099 	if (sa->sa_family != AF_UNIX)
1100 		return EAFNOSUPPORT;
1101 	if (sa->sa_len != nam->m_len)
1102 		return EINVAL;
1103 	if (sa->sa_len > sizeof(struct sockaddr_un))
1104 		return EINVAL;
1105 	*sun = (struct sockaddr_un *)sa;
1106 
1107 	/* ensure that sun_path is NUL terminated and fits */
1108 	size = (*sun)->sun_len - offsetof(struct sockaddr_un, sun_path);
1109 	len = strnlen((*sun)->sun_path, size);
1110 	if (len == sizeof((*sun)->sun_path))
1111 		return EINVAL;
1112 	if (len == size) {
1113 		if (m_trailingspace(nam) == 0)
1114 			return EINVAL;
1115 		nam->m_len++;
1116 		(*sun)->sun_len++;
1117 		(*sun)->sun_path[len] = '\0';
1118 	}
1119 	if (pathlen != NULL)
1120 		*pathlen = len;
1121 
1122 	return 0;
1123 }
1124