xref: /openbsd-src/sys/kern/uipc_usrreq.c (revision 8550894424f8a4aa4aafb6cd57229dd6ed7cd9dd)
1 /*	$OpenBSD: uipc_usrreq.c,v 1.198 2023/01/21 11:23:23 mvs Exp $	*/
2 /*	$NetBSD: uipc_usrreq.c,v 1.18 1996/02/09 19:00:50 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1989, 1991, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 #include <sys/filedesc.h>
39 #include <sys/domain.h>
40 #include <sys/protosw.h>
41 #include <sys/queue.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/unpcb.h>
45 #include <sys/un.h>
46 #include <sys/namei.h>
47 #include <sys/vnode.h>
48 #include <sys/file.h>
49 #include <sys/stat.h>
50 #include <sys/mbuf.h>
51 #include <sys/task.h>
52 #include <sys/pledge.h>
53 #include <sys/pool.h>
54 #include <sys/rwlock.h>
55 #include <sys/mutex.h>
56 #include <sys/sysctl.h>
57 #include <sys/lock.h>
58 #include <sys/refcnt.h>
59 
60 #include "kcov.h"
61 #if NKCOV > 0
62 #include <sys/kcov.h>
63 #endif
64 
65 /*
66  * Locks used to protect global data and struct members:
67  *      I       immutable after creation
68  *      D       unp_df_lock
69  *      G       unp_gc_lock
70  *      M       unp_ino_mtx
71  *      R       unp_rights_mtx
72  *      a       atomic
73  *      s       socket lock
74  */
75 
76 struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");
77 struct rwlock unp_df_lock = RWLOCK_INITIALIZER("unpdflk");
78 struct rwlock unp_gc_lock = RWLOCK_INITIALIZER("unpgclk");
79 
80 struct mutex unp_rights_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
81 struct mutex unp_ino_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
82 
83 /*
84  * Stack of sets of files that were passed over a socket but were
85  * not received and need to be closed.
86  */
struct	unp_deferral {
	SLIST_ENTRY(unp_deferral)	ud_link;	/* [D] on unp_deferred */
	int				ud_n;		/* [I] entries in ud_fp[] */
	/* followed by ud_n struct fdpass */
	struct fdpass			ud_fp[];	/* [I] files to close */
};
93 
/* Internal helpers; definitions below (unp_scan/unp_discard used by GC). */
void	uipc_setaddr(const struct unpcb *, struct mbuf *);
void	unp_discard(struct fdpass *, int);
void	unp_remove_gcrefs(struct fdpass *, int);
void	unp_restore_gcrefs(struct fdpass *, int);
void	unp_scan(struct mbuf *, void (*)(struct fdpass *, int));
int	unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *);
static inline void unp_ref(struct unpcb *);
static inline void unp_rele(struct unpcb *);
struct socket *unp_solock_peer(struct socket *);

struct pool unpcb_pool;		/* [I] source of all struct unpcb */
struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL);
106 
107 /*
108  * Unix communications domain.
109  *
110  * TODO:
111  *	RDM
112  *	rethink name space problems
113  *	need a proper out-of-band
114  */
const struct	sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };

/* [G] list of all UNIX domain sockets, for unp_gc() */
LIST_HEAD(unp_head, unpcb)	unp_head =
	LIST_HEAD_INITIALIZER(unp_head);
/* [D] list of sets of files that were sent over sockets that are now closed */
SLIST_HEAD(,unp_deferral)	unp_deferred =
	SLIST_HEAD_INITIALIZER(unp_deferred);

/*
 * NOTE(review): unp_ino is incremented under `unp_ino_mtx' in
 * uipc_sense(); the [U] tag below looks stale -- confirm and retag [M].
 */
ino_t	unp_ino;	/* [U] prototype for fake inode numbers */
int	unp_rights;	/* [R] file descriptors in flight */
int	unp_defer;	/* [G] number of deferred fp to close by the GC task */
int	unp_gcing;	/* [G] GC task currently running */
128 
/* User-request switch for connection-oriented (stream/seqpacket) sockets. */
const struct pr_usrreqs uipc_usrreqs = {
	.pru_attach	= uipc_attach,
	.pru_detach	= uipc_detach,
	.pru_bind	= uipc_bind,
	.pru_listen	= uipc_listen,
	.pru_connect	= uipc_connect,
	.pru_accept	= uipc_accept,
	.pru_disconnect	= uipc_disconnect,
	.pru_shutdown	= uipc_shutdown,
	.pru_rcvd	= uipc_rcvd,
	.pru_send	= uipc_send,
	.pru_abort	= uipc_abort,
	.pru_sense	= uipc_sense,
	.pru_sockaddr	= uipc_sockaddr,
	.pru_peeraddr	= uipc_peeraddr,
	.pru_connect2	= uipc_connect2,
};
146 
/* User-request switch for datagram sockets (no accept/rcvd/abort hooks). */
const struct pr_usrreqs uipc_dgram_usrreqs = {
	.pru_attach	= uipc_attach,
	.pru_detach	= uipc_detach,
	.pru_bind	= uipc_bind,
	.pru_listen	= uipc_listen,
	.pru_connect	= uipc_connect,
	.pru_disconnect	= uipc_disconnect,
	.pru_shutdown	= uipc_dgram_shutdown,
	.pru_send	= uipc_dgram_send,
	.pru_sense	= uipc_sense,
	.pru_sockaddr	= uipc_sockaddr,
	.pru_peeraddr	= uipc_peeraddr,
	.pru_connect2	= uipc_connect2,
};
161 
/*
 * One-time initialization: set up the pool unpcbs are allocated from.
 */
void
unp_init(void)
{
	pool_init(&unpcb_pool, sizeof(struct unpcb), 0,
	    IPL_SOFTNET, 0, "unpcb", NULL);
}
168 
/* Take a reference on `unp' to keep it alive while its lock is dropped. */
static inline void
unp_ref(struct unpcb *unp)
{
	refcnt_take(&unp->unp_refcnt);
}
174 
/* Drop a reference taken with unp_ref() and wake a waiting finalizer. */
static inline void
unp_rele(struct unpcb *unp)
{
	refcnt_rele_wake(&unp->unp_refcnt);
}
180 
/*
 * Lock and return the peer socket of `so', or NULL if unconnected.
 * `so' must be locked on entry; on successful return both sockets are
 * locked.  Socket locks are taken in ascending address order: when the
 * peer sorts below `so' we must drop `so' and relock both, holding a
 * reference on the peer pcb across the unlocked window.
 */
struct socket *
unp_solock_peer(struct socket *so)
{
	struct unpcb *unp, *unp2;
	struct socket *so2;

	unp = so->so_pcb;

again:
	if ((unp2 = unp->unp_conn) == NULL)
		return NULL;

	so2 = unp2->unp_socket;

	if (so < so2)
		solock(so2);
	else if (so > so2) {
		/* Relock in address order, keeping `unp2' referenced. */
		unp_ref(unp2);
		sounlock(so);
		solock(so2);
		solock(so);

		/* Datagram socket could be reconnected due to re-lock. */
		if (unp->unp_conn != unp2) {
			sounlock(so2);
			unp_rele(unp2);
			goto again;
		}

		unp_rele(unp2);
	}
	/* so == so2: self-connected datagram socket, already locked. */

	return so2;
}
215 
216 void
217 uipc_setaddr(const struct unpcb *unp, struct mbuf *nam)
218 {
219 	if (unp != NULL && unp->unp_addr != NULL) {
220 		nam->m_len = unp->unp_addr->m_len;
221 		memcpy(mtod(nam, caddr_t), mtod(unp->unp_addr, caddr_t),
222 		    nam->m_len);
223 	} else {
224 		nam->m_len = sizeof(sun_noname);
225 		memcpy(mtod(nam, struct sockaddr *), &sun_noname,
226 		    nam->m_len);
227 	}
228 }
229 
230 /*
231  * Both send and receive buffers are allocated PIPSIZ bytes of buffering
232  * for stream sockets, although the total for sender and receiver is
233  * actually only PIPSIZ.
234  * Datagram sockets really use the sendspace as the maximum datagram size,
235  * and don't really want to reserve the sendspace.  Their recvspace should
236  * be large enough for at least one max-size datagram plus address.
237  */
#define	PIPSIZ	8192
u_int	unpst_sendspace = PIPSIZ;
u_int	unpst_recvspace = PIPSIZ;
u_int	unpsq_sendspace = PIPSIZ;
u_int	unpsq_recvspace = PIPSIZ;
u_int	unpdg_sendspace = 2*1024;	/* really max datagram size */
u_int	unpdg_recvspace = 16*1024;

/* Bounds tables for the net.unix.{stream,seqpacket,dgram} sysctls. */
const struct sysctl_bounded_args unpstctl_vars[] = {
	{ UNPCTL_RECVSPACE, &unpst_recvspace, 0, SB_MAX },
	{ UNPCTL_SENDSPACE, &unpst_sendspace, 0, SB_MAX },
};
const struct sysctl_bounded_args unpsqctl_vars[] = {
	{ UNPCTL_RECVSPACE, &unpsq_recvspace, 0, SB_MAX },
	{ UNPCTL_SENDSPACE, &unpsq_sendspace, 0, SB_MAX },
};
const struct sysctl_bounded_args unpdgctl_vars[] = {
	{ UNPCTL_RECVSPACE, &unpdg_recvspace, 0, SB_MAX },
	{ UNPCTL_SENDSPACE, &unpdg_sendspace, 0, SB_MAX },
};
258 
/*
 * Allocate and attach a new unpcb to socket `so', reserving default
 * send/receive buffer space per socket type.  Runs with the socket
 * locked; the lock is dropped around the `unp_head' insertion to
 * respect the `unp_gc_lock' -> `solock()' lock order.
 */
int
uipc_attach(struct socket *so, int proto, int wait)
{
	struct unpcb *unp;
	int error;

	if (so->so_pcb)
		return EISCONN;
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		switch (so->so_type) {

		case SOCK_STREAM:
			error = soreserve(so, unpst_sendspace, unpst_recvspace);
			break;

		case SOCK_SEQPACKET:
			error = soreserve(so, unpsq_sendspace, unpsq_recvspace);
			break;

		case SOCK_DGRAM:
			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
			break;

		default:
			panic("unp_attach");
		}
		if (error)
			return (error);
	}
	unp = pool_get(&unpcb_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
	    PR_ZERO);
	if (unp == NULL)
		return (ENOBUFS);
	refcnt_init(&unp->unp_refcnt);
	unp->unp_socket = so;
	so->so_pcb = unp;
	getnanotime(&unp->unp_ctime);

	/*
	 * Enforce `unp_gc_lock' -> `solock()' lock order.
	 */
	sounlock(so);
	rw_enter_write(&unp_gc_lock);
	LIST_INSERT_HEAD(&unp_head, unp, unp_link);
	rw_exit_write(&unp_gc_lock);
	solock(so);
	return (0);
}
307 
308 int
309 uipc_detach(struct socket *so)
310 {
311 	struct unpcb *unp = sotounpcb(so);
312 
313 	if (unp == NULL)
314 		return (EINVAL);
315 
316 	unp_detach(unp);
317 
318 	return (0);
319 }
320 
/*
 * Bind the socket to a filesystem name: create a VSOCK vnode at the
 * given path and link it with the socket.  Fails if the socket is
 * already bound, currently binding/connecting, or if the path exists.
 * The socket lock is dropped across namei()/VOP_CREATE() (see the
 * lock-order comment below); UNP_BINDING excludes concurrent bind and
 * connect attempts meanwhile.
 */
int
uipc_bind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	struct unpcb *unp = sotounpcb(so);
	struct sockaddr_un *soun;
	struct mbuf *nam2;
	struct vnode *vp;
	struct vattr vattr;
	int error;
	struct nameidata nd;
	size_t pathlen;

	if (unp->unp_flags & (UNP_BINDING | UNP_CONNECTING))
		return (EINVAL);
	if (unp->unp_vnode != NULL)
		return (EINVAL);
	if ((error = unp_nam2sun(nam, &soun, &pathlen)))
		return (error);

	unp->unp_flags |= UNP_BINDING;

	/*
	 * Enforce `i_lock' -> `unplock' because fifo subsystem
	 * requires it. The socket can't be closed concurrently
	 * because the file descriptor reference is still held.
	 */

	sounlock(unp->unp_socket);

	nam2 = m_getclr(M_WAITOK, MT_SONAME);
	nam2->m_len = sizeof(struct sockaddr_un);
	memcpy(mtod(nam2, struct sockaddr_un *), soun,
	    offsetof(struct sockaddr_un, sun_path) + pathlen);
	/* No need to NUL terminate: m_getclr() returns zero'd mbufs. */

	soun = mtod(nam2, struct sockaddr_un *);

	/* Fixup sun_len to keep it in sync with m_len. */
	soun->sun_len = nam2->m_len;

	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
	    soun->sun_path, p);
	nd.ni_pledge = PLEDGE_UNIX;
	nd.ni_unveil = UNVEIL_CREATE;

	KERNEL_LOCK();
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	error = namei(&nd);
	if (error != 0) {
		m_freem(nam2);
		solock(unp->unp_socket);
		goto out;
	}
	vp = nd.ni_vp;
	if (vp != NULL) {
		/* The path already exists: abort the create and fail. */
		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		m_freem(nam2);
		error = EADDRINUSE;
		solock(unp->unp_socket);
		goto out;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask;
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	vput(nd.ni_dvp);
	if (error) {
		m_freem(nam2);
		solock(unp->unp_socket);
		goto out;
	}
	solock(unp->unp_socket);
	unp->unp_addr = nam2;
	vp = nd.ni_vp;
	vp->v_socket = unp->unp_socket;
	unp->unp_vnode = vp;
	/* Record binder credentials for LOCAL_PEERCRED-style queries. */
	unp->unp_connid.uid = p->p_ucred->cr_uid;
	unp->unp_connid.gid = p->p_ucred->cr_gid;
	unp->unp_connid.pid = p->p_p->ps_pid;
	unp->unp_flags |= UNP_FEIDSBIND;
	VOP_UNLOCK(vp);
out:
	KERNEL_UNLOCK();
	unp->unp_flags &= ~UNP_BINDING;

	return (error);
}
413 
414 int
415 uipc_listen(struct socket *so)
416 {
417 	struct unpcb *unp = sotounpcb(so);
418 
419 	if (unp->unp_vnode == NULL)
420 		return (EINVAL);
421 	return (0);
422 }
423 
424 int
425 uipc_connect(struct socket *so, struct mbuf *nam)
426 {
427 	return unp_connect(so, nam, curproc);
428 }
429 
/*
 * pru_accept: report the peer's name for a freshly accepted socket.
 */
int
uipc_accept(struct socket *so, struct mbuf *nam)
{
	struct socket *so2;
	struct unpcb *unp = sotounpcb(so);

	/*
	 * Pass back name of connected socket, if it was bound and
	 * we are still connected (our peer may have closed already!).
	 */
	so2 = unp_solock_peer(so);
	uipc_setaddr(unp->unp_conn, nam);

	/* A self-connected socket shares the lock; don't unlock twice. */
	if (so2 != NULL && so2 != so)
		sounlock(so2);
	return (0);
}
447 
/*
 * pru_disconnect: sever the connection to our peer, if any.
 */
int
uipc_disconnect(struct socket *so)
{
	unp_disconnect(sotounpcb(so));
	return (0);
}
456 
457 int
458 uipc_shutdown(struct socket *so)
459 {
460 	struct unpcb *unp = sotounpcb(so);
461 	struct socket *so2;
462 
463 	socantsendmore(so);
464 
465 	if ((so2 = unp_solock_peer(unp->unp_socket))){
466 		socantrcvmore(so2);
467 		sounlock(so2);
468 	}
469 
470 	return (0);
471 }
472 
/*
 * pru_shutdown for datagram sockets: only our sending side is
 * closed; the peer, if any, is untouched.
 */
int
uipc_dgram_shutdown(struct socket *so)
{
	socantsendmore(so);
	return (0);
}
479 
/*
 * pru_rcvd: after the receiver consumed data, mirror our receive
 * buffer counters into the peer's send buffer and wake writers.
 */
void
uipc_rcvd(struct socket *so)
{
	struct socket *so2;

	if ((so2 = unp_solock_peer(so)) == NULL)
		return;
	/*
	 * Adjust backpressure on sender
	 * and wakeup any waiting to write.
	 */
	so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
	so2->so_snd.sb_cc = so->so_rcv.sb_cc;
	sowwakeup(so2);
	sounlock(so2);
}
496 
/*
 * pru_send for stream/seqpacket sockets: append `m' (plus optional
 * internalized SCM_RIGHTS `control') to the peer's receive buffer,
 * mirror the peer's buffer counters into our send buffer for
 * backpressure, and wake readers.  Always consumes `m' and `control';
 * on error the internalized rights are discarded first.
 */
int
uipc_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control)
{
	struct socket *so2;
	int error = 0;

	if (control) {
		/* unp_internalize() may sleep; drop the socket lock. */
		sounlock(so);
		error = unp_internalize(control, curproc);
		solock(so);
		if (error)
			goto out;
	}

	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto dispose;
	}
	if ((so2 = unp_solock_peer(so)) == NULL) {
		error = ENOTCONN;
		goto dispose;
	}

	/*
	 * Send to paired receive port, and then raise
	 * send buffer counts to maintain backpressure.
	 * Wake up readers.
	 */
	if (control) {
		if (sbappendcontrol(so2, &so2->so_rcv, m, control)) {
			control = NULL;
		} else {
			sounlock(so2);
			error = ENOBUFS;
			goto dispose;
		}
	} else if (so->so_type == SOCK_SEQPACKET)
		sbappendrecord(so2, &so2->so_rcv, m);
	else
		sbappend(so2, &so2->so_rcv, m);
	so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
	so->so_snd.sb_cc = so2->so_rcv.sb_cc;
	if (so2->so_rcv.sb_cc > 0)
		sorwakeup(so2);

	sounlock(so2);
	m = NULL;

dispose:
	/* we need to undo unp_internalize in case of errors */
	if (control && error)
		unp_dispose(control);

out:
	m_freem(control);
	m_freem(m);

	return (error);
}
557 
/*
 * pru_send for datagram sockets: optionally connect to `nam' for the
 * duration of the call, then append the datagram tagged with our
 * source address (and optional SCM_RIGHTS `control') to the peer's
 * receive buffer.  Always consumes `m' and `control'.
 */
int
uipc_dgram_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;
	const struct sockaddr *from;
	int error = 0;

	if (control) {
		/* unp_internalize() may sleep; drop the socket lock. */
		sounlock(so);
		error = unp_internalize(control, curproc);
		solock(so);
		if (error)
			goto out;
	}

	if (nam) {
		if (unp->unp_conn) {
			error = EISCONN;
			goto dispose;
		}
		error = unp_connect(so, nam, curproc);
		if (error)
			goto dispose;
	}

	if ((so2 = unp_solock_peer(so)) == NULL) {
		if (nam != NULL)
			error = ECONNREFUSED;
		else
			error = ENOTCONN;
		goto dispose;
	}

	if (unp->unp_addr)
		from = mtod(unp->unp_addr, struct sockaddr *);
	else
		from = &sun_noname;
	if (sbappendaddr(so2, &so2->so_rcv, from, m, control)) {
		sorwakeup(so2);
		m = NULL;
		control = NULL;
	} else
		error = ENOBUFS;

	if (so2 != so)
		sounlock(so2);

	/* Temporary connect for a sendto(2): undo it. */
	if (nam)
		unp_disconnect(unp);

dispose:
	/* we need to undo unp_internalize in case of errors */
	if (control && error)
		unp_dispose(control);

out:
	m_freem(control);
	m_freem(m);

	return (error);
}
621 
/*
 * pru_abort: tear down the pcb and release the socket itself.
 */
void
uipc_abort(struct socket *so)
{
	struct unpcb *unp = sotounpcb(so);

	unp_detach(unp);
	sofree(so, 0);
}
630 
/*
 * pru_sense (fstat(2) on a socket): synthesize a block size from the
 * send buffer high-water mark, a lazily allocated fake inode number,
 * and timestamps from the pcb creation time.
 */
int
uipc_sense(struct socket *so, struct stat *sb)
{
	struct unpcb *unp = sotounpcb(so);

	sb->st_blksize = so->so_snd.sb_hiwat;
	sb->st_dev = NODEV;
	/* Assign a fake inode once, serialized by `unp_ino_mtx'. */
	mtx_enter(&unp_ino_mtx);
	if (unp->unp_ino == 0)
		unp->unp_ino = unp_ino++;
	mtx_leave(&unp_ino_mtx);
	sb->st_atim.tv_sec =
	    sb->st_mtim.tv_sec =
	    sb->st_ctim.tv_sec = unp->unp_ctime.tv_sec;
	sb->st_atim.tv_nsec =
	    sb->st_mtim.tv_nsec =
	    sb->st_ctim.tv_nsec = unp->unp_ctime.tv_nsec;
	sb->st_ino = unp->unp_ino;

	return (0);
}
652 
/*
 * pru_sockaddr: report our own bound name (or the unnamed sockaddr).
 */
int
uipc_sockaddr(struct socket *so, struct mbuf *nam)
{
	uipc_setaddr(sotounpcb(so), nam);
	return (0);
}
661 
/*
 * pru_peeraddr: report the peer's bound name, or the unnamed sockaddr
 * when the peer is gone or was never bound.
 */
int
uipc_peeraddr(struct socket *so, struct mbuf *nam)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;

	so2 = unp_solock_peer(so);
	uipc_setaddr(unp->unp_conn, nam);
	if (so2 != NULL && so2 != so)
		sounlock(so2);
	return (0);
}
674 
675 int
676 uipc_connect2(struct socket *so, struct socket *so2)
677 {
678 	struct unpcb *unp = sotounpcb(so), *unp2;
679 	int error;
680 
681 	if ((error = unp_connect2(so, so2)))
682 		return (error);
683 
684 	unp->unp_connid.uid = curproc->p_ucred->cr_uid;
685 	unp->unp_connid.gid = curproc->p_ucred->cr_gid;
686 	unp->unp_connid.pid = curproc->p_p->ps_pid;
687 	unp->unp_flags |= UNP_FEIDS;
688 	unp2 = sotounpcb(so2);
689 	unp2->unp_connid.uid = curproc->p_ucred->cr_uid;
690 	unp2->unp_connid.gid = curproc->p_ucred->cr_gid;
691 	unp2->unp_connid.pid = curproc->p_p->ps_pid;
692 	unp2->unp_flags |= UNP_FEIDS;
693 
694 	return (0);
695 }
696 
/*
 * Handle net.unix.* sysctls: per-type send/receive buffer limits plus
 * the read-only inflight/deferred file counters.
 */
int
uipc_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int *valp = &unp_defer;

	/* All sysctl names at this level are terminal. */
	switch (name[0]) {
	case SOCK_STREAM:
		if (namelen != 2)
			return (ENOTDIR);
		return sysctl_bounded_arr(unpstctl_vars, nitems(unpstctl_vars),
		    name + 1, namelen - 1, oldp, oldlenp, newp, newlen);
	case SOCK_SEQPACKET:
		if (namelen != 2)
			return (ENOTDIR);
		return sysctl_bounded_arr(unpsqctl_vars, nitems(unpsqctl_vars),
		    name + 1, namelen - 1, oldp, oldlenp, newp, newlen);
	case SOCK_DGRAM:
		if (namelen != 2)
			return (ENOTDIR);
		return sysctl_bounded_arr(unpdgctl_vars, nitems(unpdgctl_vars),
		    name + 1, namelen - 1, oldp, oldlenp, newp, newlen);
	case NET_UNIX_INFLIGHT:
		valp = &unp_rights;
		/* FALLTHROUGH */
	case NET_UNIX_DEFERRED:
		if (namelen != 1)
			return (ENOTDIR);
		return sysctl_rdint(oldp, oldlenp, newp, *valp);
	default:
		return (ENOPROTOOPT);
	}
}
731 
/*
 * Common pcb teardown for detach and abort.  Unhooks the pcb from its
 * vnode and the global GC list, disconnects the peer and any datagram
 * sockets still pointing at us (they get ECONNRESET), then waits for
 * all unp_ref() holders before freeing the pcb.  The lock juggling
 * below enforces the `unp_gc_lock' -> `solock()' and
 * `i_lock' -> `solock()' lock orders.
 */
void
unp_detach(struct unpcb *unp)
{
	struct socket *so = unp->unp_socket;
	struct vnode *vp = unp->unp_vnode;
	struct unpcb *unp2;

	unp->unp_vnode = NULL;

	/*
	 * Enforce `unp_gc_lock' -> `solock()' lock order.
	 * Enforce `i_lock' -> `solock()' lock order.
	 */
	sounlock(so);

	rw_enter_write(&unp_gc_lock);
	LIST_REMOVE(unp, unp_link);
	rw_exit_write(&unp_gc_lock);

	if (vp != NULL) {
		VOP_LOCK(vp, LK_EXCLUSIVE);
		vp->v_socket = NULL;

		KERNEL_LOCK();
		vput(vp);
		KERNEL_UNLOCK();
	}

	solock(so);

	if (unp->unp_conn != NULL) {
		/*
		 * Datagram socket could be connected to itself.
		 * Such socket will be disconnected here.
		 */
		unp_disconnect(unp);
	}

	while ((unp2 = SLIST_FIRST(&unp->unp_refs)) != NULL) {
		struct socket *so2 = unp2->unp_socket;

		/* Same address-ordered lock dance as unp_solock_peer(). */
		if (so < so2)
			solock(so2);
		else {
			unp_ref(unp2);
			sounlock(so);
			solock(so2);
			solock(so);

			if (unp2->unp_conn != unp) {
				/* `unp2' was disconnected due to re-lock. */
				sounlock(so2);
				unp_rele(unp2);
				continue;
			}

			unp_rele(unp2);
		}

		unp2->unp_conn = NULL;
		SLIST_REMOVE(&unp->unp_refs, unp2, unpcb, unp_nextref);
		so2->so_error = ECONNRESET;
		so2->so_state &= ~SS_ISCONNECTED;

		sounlock(so2);
	}

	/* Wait out all unp_ref() holders before freeing the pcb. */
	sounlock(so);
	refcnt_finalize(&unp->unp_refcnt, "unpfinal");
	solock(so);

	soisdisconnected(so);
	so->so_pcb = NULL;
	m_freem(unp->unp_addr);
	pool_put(&unpcb_pool, unp);
	if (unp_rights)
		task_add(systqmp, &unp_gc_task);
}
810 
/*
 * Connect `so' to the socket bound at the path in `nam'.  Looks up
 * the path via namei(), checks VWRITE access, and for
 * connection-oriented types spawns a server socket with sonewconn().
 * The socket lock is dropped across the filesystem access (see the
 * lock-order comment below); UNP_CONNECTING excludes concurrent
 * bind/connect attempts meanwhile.
 */
int
unp_connect(struct socket *so, struct mbuf *nam, struct proc *p)
{
	struct sockaddr_un *soun;
	struct vnode *vp;
	struct socket *so2, *so3;
	struct unpcb *unp, *unp2, *unp3;
	struct nameidata nd;
	int error;

	unp = sotounpcb(so);
	if (unp->unp_flags & (UNP_BINDING | UNP_CONNECTING))
		return (EISCONN);
	if ((error = unp_nam2sun(nam, &soun, NULL)))
		return (error);

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p);
	nd.ni_pledge = PLEDGE_UNIX;
	nd.ni_unveil = UNVEIL_WRITE;

	unp->unp_flags |= UNP_CONNECTING;

	/*
	 * Enforce `i_lock' -> `unplock' because fifo subsystem
	 * requires it. The socket can't be closed concurrently
	 * because the file descriptor reference is still held.
	 */

	sounlock(so);

	KERNEL_LOCK();
	error = namei(&nd);
	if (error != 0)
		goto unlock;
	vp = nd.ni_vp;
	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto put;
	}
	if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0)
		goto put;
	so2 = vp->v_socket;
	if (so2 == NULL) {
		error = ECONNREFUSED;
		goto put;
	}
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto put;
	}

	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		solock(so2);

		if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
		    (so3 = sonewconn(so2, 0, M_WAIT)) == NULL) {
			error = ECONNREFUSED;
		}

		sounlock(so2);

		if (error != 0)
			goto put;

		/*
		 * Since `so2' is protected by vnode(9) lock, `so3'
		 * can't be PRU_ABORT'ed here.
		 */
		solock_pair(so, so3);

		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);

		/*
		 * `unp_addr', `unp_connid' and 'UNP_FEIDSBIND' flag
		 * are immutable since we set them in uipc_bind().
		 */
		if (unp2->unp_addr)
			unp3->unp_addr =
			    m_copym(unp2->unp_addr, 0, M_COPYALL, M_NOWAIT);
		unp3->unp_connid.uid = p->p_ucred->cr_uid;
		unp3->unp_connid.gid = p->p_ucred->cr_gid;
		unp3->unp_connid.pid = p->p_p->ps_pid;
		unp3->unp_flags |= UNP_FEIDS;

		if (unp2->unp_flags & UNP_FEIDSBIND) {
			unp->unp_connid = unp2->unp_connid;
			unp->unp_flags |= UNP_FEIDS;
		}

		/* Connect to the spawned server socket, not the listener. */
		so2 = so3;
	} else {
		if (so2 != so)
			solock_pair(so, so2);
		else
			solock(so);
	}

	error = unp_connect2(so, so2);

	sounlock(so);

	/*
	 * `so2' can't be PRU_ABORT'ed concurrently
	 */
	if (so2 != so)
		sounlock(so2);
put:
	vput(vp);
unlock:
	KERNEL_UNLOCK();
	solock(so);
	unp->unp_flags &= ~UNP_CONNECTING;

	/*
	 * The peer socket could be closed by concurrent thread
	 * when `so' and `vp' are unlocked.
	 */
	if (error == 0 && unp->unp_conn == NULL)
		error = ECONNREFUSED;

	return (error);
}
934 
/*
 * Link two sockets of the same type together.  Both must be locked.
 * Datagram sockets are added to the peer's `unp_refs' list (one-way
 * connect); stream/seqpacket sockets are connected in both directions.
 */
int
unp_connect2(struct socket *so, struct socket *so2)
{
	struct unpcb *unp = sotounpcb(so);
	struct unpcb *unp2;

	soassertlocked(so);
	soassertlocked(so2);

	if (so2->so_type != so->so_type)
		return (EPROTOTYPE);
	unp2 = sotounpcb(so2);
	unp->unp_conn = unp2;
	switch (so->so_type) {

	case SOCK_DGRAM:
		SLIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_nextref);
		soisconnected(so);
		break;

	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		unp2->unp_conn = unp;
		soisconnected(so);
		soisconnected(so2);
		break;

	default:
		panic("unp_connect2");
	}
	return (0);
}
967 
/*
 * Break the connection of `unp' to its peer, if any.  For datagram
 * sockets only our side is unhooked from the peer's `unp_refs' list;
 * stream/seqpacket peers are fully disconnected on both ends.
 */
void
unp_disconnect(struct unpcb *unp)
{
	struct socket *so2;
	struct unpcb *unp2;

	if ((so2 = unp_solock_peer(unp->unp_socket)) == NULL)
		return;

	unp2 = unp->unp_conn;
	unp->unp_conn = NULL;

	switch (unp->unp_socket->so_type) {

	case SOCK_DGRAM:
		SLIST_REMOVE(&unp2->unp_refs, unp, unpcb, unp_nextref);
		unp->unp_socket->so_state &= ~SS_ISCONNECTED;
		break;

	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		/* Reset mirrored backpressure counters on both ends. */
		unp->unp_socket->so_snd.sb_mbcnt = 0;
		unp->unp_socket->so_snd.sb_cc = 0;
		soisdisconnected(unp->unp_socket);
		unp2->unp_conn = NULL;
		unp2->unp_socket->so_snd.sb_mbcnt = 0;
		unp2->unp_socket->so_snd.sb_cc = 0;
		soisdisconnected(unp2->unp_socket);
		break;
	}

	/* A self-connected socket shares the lock; don't unlock twice. */
	if (so2 != unp->unp_socket)
		sounlock(so2);
}
1002 
1003 static struct unpcb *
1004 fptounp(struct file *fp)
1005 {
1006 	struct socket *so;
1007 
1008 	if (fp->f_type != DTYPE_SOCKET)
1009 		return (NULL);
1010 	if ((so = fp->f_data) == NULL)
1011 		return (NULL);
1012 	if (so->so_proto->pr_domain != &unixdomain)
1013 		return (NULL);
1014 	return (sotounpcb(so));
1015 }
1016 
/*
 * Convert a received SCM_RIGHTS control message from internal form
 * (struct fdpass entries holding struct file pointers) into file
 * descriptors in the receiving process.  Validates the message
 * against pledge and chroot restrictions, allocates descriptor slots,
 * then rewrites the message in place as an array of ints.  On error
 * the carried files are discarded so their references don't leak.
 */
int
unp_externalize(struct mbuf *rights, socklen_t controllen, int flags)
{
	struct proc *p = curproc;		/* XXX */
	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
	struct filedesc *fdp = p->p_fd;
	int i, *fds = NULL;
	struct fdpass *rp;
	struct file *fp;
	int nfds, error = 0;

	/*
	 * This code only works because SCM_RIGHTS is the only supported
	 * control message type on unix sockets. Enforce this here.
	 */
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET)
		return EINVAL;

	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
	    sizeof(struct fdpass);
	if (controllen < CMSG_ALIGN(sizeof(struct cmsghdr)))
		controllen = 0;
	else
		controllen -= CMSG_ALIGN(sizeof(struct cmsghdr));
	if (nfds > controllen / sizeof(int)) {
		error = EMSGSIZE;
		goto out;
	}

	/* Make sure the recipient should be able to see the descriptors.. */
	rp = (struct fdpass *)CMSG_DATA(cm);

	/* fdp->fd_rdir requires KERNEL_LOCK() */
	KERNEL_LOCK();

	for (i = 0; i < nfds; i++) {
		fp = rp->fp;
		rp++;
		error = pledge_recvfd(p, fp);
		if (error)
			break;

		/*
		 * No to block devices.  If passing a directory,
		 * make sure that it is underneath the root.
		 */
		if (fdp->fd_rdir != NULL && fp->f_type == DTYPE_VNODE) {
			struct vnode *vp = (struct vnode *)fp->f_data;

			if (vp->v_type == VBLK ||
			    (vp->v_type == VDIR &&
			    !vn_isunder(vp, fdp->fd_rdir, p))) {
				error = EPERM;
				break;
			}
		}
	}

	KERNEL_UNLOCK();

	if (error)
		goto out;

	fds = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK);

	fdplock(fdp);
restart:
	/*
	 * First loop -- allocate file descriptor table slots for the
	 * new descriptors.
	 */
	rp = ((struct fdpass *)CMSG_DATA(cm));
	for (i = 0; i < nfds; i++) {
		if ((error = fdalloc(p, 0, &fds[i])) != 0) {
			/*
			 * Back out what we've done so far.
			 */
			for (--i; i >= 0; i--)
				fdremove(fdp, fds[i]);

			if (error == ENOSPC) {
				fdexpand(p);
				goto restart;
			}

			fdpunlock(fdp);

			/*
			 * This is the error that has historically
			 * been returned, and some callers may
			 * expect it.
			 */

			error = EMSGSIZE;
			goto out;
		}

		/*
		 * Make the slot reference the descriptor so that
		 * fdalloc() works properly.. We finalize it all
		 * in the loop below.
		 */
		mtx_enter(&fdp->fd_fplock);
		KASSERT(fdp->fd_ofiles[fds[i]] == NULL);
		fdp->fd_ofiles[fds[i]] = rp->fp;
		mtx_leave(&fdp->fd_fplock);

		fdp->fd_ofileflags[fds[i]] = (rp->flags & UF_PLEDGED);
		if (flags & MSG_CMSG_CLOEXEC)
			fdp->fd_ofileflags[fds[i]] |= UF_EXCLOSE;

		rp++;
	}

	/*
	 * Keep `fdp' locked to prevent concurrent close() of just
	 * inserted descriptors. Such descriptors could have the only
	 * `f_count' reference which is now shared between control
	 * message and `fdp'.
	 */

	/*
	 * Now that adding them has succeeded, update all of the
	 * descriptor passing state.
	 */
	rp = (struct fdpass *)CMSG_DATA(cm);

	for (i = 0; i < nfds; i++) {
		struct unpcb *unp;

		fp = rp->fp;
		rp++;
		if ((unp = fptounp(fp)) != NULL) {
			rw_enter_write(&unp_gc_lock);
			unp->unp_msgcount--;
			rw_exit_write(&unp_gc_lock);
		}
	}
	fdpunlock(fdp);

	mtx_enter(&unp_rights_mtx);
	unp_rights -= nfds;
	mtx_leave(&unp_rights_mtx);

	/*
	 * Copy temporary array to message and adjust length, in case of
	 * transition from large struct file pointers to ints.
	 */
	memcpy(CMSG_DATA(cm), fds, nfds * sizeof(int));
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
	rights->m_len = CMSG_LEN(nfds * sizeof(int));
 out:
	if (fds != NULL)
		free(fds, M_TEMP, nfds * sizeof(int));

	if (error) {
		if (nfds > 0) {
			/*
			 * No lock required. We are the only `cm' holder.
			 */
			rp = ((struct fdpass *)CMSG_DATA(cm));
			unp_discard(rp, nfds);
		}
	}

	return (error);
}
1184 
/*
 * Internalize an SCM_RIGHTS control message: convert the array of int
 * file descriptors supplied by userland into an array of struct fdpass
 * (struct file pointer plus pledge flag), stored in place in the same
 * mbuf.  Conversion runs back to front because each struct fdpass is
 * larger than an int.  On success each passed file holds an extra
 * f_count reference and its unpcb (if any) has unp_msgcount bumped.
 * Returns 0 or an errno; on error all partial state is backed out.
 */
int
unp_internalize(struct mbuf *control, struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
	struct fdpass *rp;
	struct file *fp;
	struct unpcb *unp;
	int i, error;
	int nfds, *ip, fd, neededspace;

	/*
	 * Check for two potential msg_controllen values because
	 * IETF stuck their nose in a place it does not belong.
	 */
	if (control->m_len < CMSG_LEN(0) || cm->cmsg_len < CMSG_LEN(0))
		return (EINVAL);
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
	    !(cm->cmsg_len == control->m_len ||
	    control->m_len == CMSG_ALIGN(cm->cmsg_len)))
		return (EINVAL);
	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);

	/*
	 * Reserve the in-flight slots up front, capped at a tenth of
	 * the system file limit; released again on any failure below.
	 */
	mtx_enter(&unp_rights_mtx);
	if (unp_rights + nfds > maxfiles / 10) {
		mtx_leave(&unp_rights_mtx);
		return (EMFILE);
	}
	unp_rights += nfds;
	mtx_leave(&unp_rights_mtx);

	/* Make sure we have room for the struct file pointers */
morespace:
	neededspace = CMSG_SPACE(nfds * sizeof(struct fdpass)) -
	    control->m_len;
	if (neededspace > m_trailingspace(control)) {
		char *tmp;
		/* if we already have a cluster, the message is just too big */
		if (control->m_flags & M_EXT) {
			error = E2BIG;
			goto nospace;
		}

		/* copy cmsg data temporarily out of the mbuf */
		tmp = malloc(control->m_len, M_TEMP, M_WAITOK);
		memcpy(tmp, mtod(control, caddr_t), control->m_len);

		/* allocate a cluster and try again */
		MCLGET(control, M_WAIT);
		if ((control->m_flags & M_EXT) == 0) {
			free(tmp, M_TEMP, control->m_len);
			error = ENOBUFS;       /* allocation failed */
			goto nospace;
		}

		/* copy the data back into the cluster */
		cm = mtod(control, struct cmsghdr *);
		memcpy(cm, tmp, control->m_len);
		free(tmp, M_TEMP, control->m_len);
		goto morespace;
	}

	/* adjust message & mbuf to note amount of space actually used. */
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(struct fdpass));
	control->m_len = CMSG_SPACE(nfds * sizeof(struct fdpass));

	/*
	 * Walk both arrays from their last element towards the front so
	 * the wider fdpass entries never overwrite ints not yet read.
	 */
	ip = ((int *)CMSG_DATA(cm)) + nfds - 1;
	rp = ((struct fdpass *)CMSG_DATA(cm)) + nfds - 1;
	fdplock(fdp);
	for (i = 0; i < nfds; i++) {
		memcpy(&fd, ip, sizeof fd);
		ip--;
		/* fd_getfile() takes an f_count reference on success */
		if ((fp = fd_getfile(fdp, fd)) == NULL) {
			error = EBADF;
			goto fail;
		}
		if (fp->f_count >= FDUP_MAX_COUNT) {
			error = EDEADLK;
			goto fail;
		}
		error = pledge_sendfd(p, fp);
		if (error)
			goto fail;

		/* kqueue descriptors cannot be copied */
		if (fp->f_type == DTYPE_KQUEUE) {
			error = EINVAL;
			goto fail;
		}
#if NKCOV > 0
		/* kcov descriptors cannot be copied */
		if (fp->f_type == DTYPE_VNODE && kcov_vnode(fp->f_data)) {
			error = EINVAL;
			goto fail;
		}
#endif
		rp->fp = fp;
		rp->flags = fdp->fd_ofileflags[fd] & UF_PLEDGED;
		rp--;
		if ((unp = fptounp(fp)) != NULL) {
			rw_enter_write(&unp_gc_lock);
			unp->unp_msgcount++;
			unp->unp_file = fp;
			rw_exit_write(&unp_gc_lock);
		}
	}
	fdpunlock(fdp);
	return (0);
fail:
	fdpunlock(fdp);
	/* fp is NULL only when fd_getfile() itself failed */
	if (fp != NULL)
		FRELE(fp, p);
	/* Back out what we just did. */
	for ( ; i > 0; i--) {
		/* rp points one below the last stored entry; step back up */
		rp++;
		fp = rp->fp;
		if ((unp = fptounp(fp)) != NULL) {
			rw_enter_write(&unp_gc_lock);
			unp->unp_msgcount--;
			rw_exit_write(&unp_gc_lock);
		}
		FRELE(fp, p);
	}

nospace:
	/* release the unp_rights slots reserved above */
	mtx_enter(&unp_rights_mtx);
	unp_rights -= nfds;
	mtx_leave(&unp_rights_mtx);

	return (error);
}
1316 
/*
 * Garbage collector for unix-domain sockets whose only references are
 * SCM_RIGHTS messages in flight (possibly forming unreachable cycles).
 * Runs from the unp_gc_task.  First drains the deferred-close list,
 * then performs a mark-and-sweep:
 *   1. mark sockets whose f_count equals unp_msgcount as possibly dead;
 *   2. subtract the references such dead sockets hold on each other;
 *   3. iteratively revive any marked socket still externally referenced;
 *   4. discard the fds buffered in the receive queues of the remainder.
 */
void
unp_gc(void *arg __unused)
{
	struct unp_deferral *defer;
	struct file *fp;
	struct socket *so;
	struct unpcb *unp;
	int nunref, i;

	/* single-instance guard: only one GC pass at a time */
	rw_enter_write(&unp_gc_lock);
	if (unp_gcing)
		goto unlock;
	unp_gcing = 1;
	rw_exit_write(&unp_gc_lock);

	rw_enter_write(&unp_df_lock);
	/* close any fds on the deferred list */
	while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) {
		SLIST_REMOVE_HEAD(&unp_deferred, ud_link);
		/* drop the list lock while closing; reacquire afterwards */
		rw_exit_write(&unp_df_lock);
		for (i = 0; i < defer->ud_n; i++) {
			fp = defer->ud_fp[i].fp;
			if (fp == NULL)
				continue;
			/* this file is no longer in an SCM_RIGHTS message */
			if ((unp = fptounp(fp)) != NULL) {
				rw_enter_write(&unp_gc_lock);
				unp->unp_msgcount--;
				rw_exit_write(&unp_gc_lock);
			}
			mtx_enter(&unp_rights_mtx);
			unp_rights--;
			mtx_leave(&unp_rights_mtx);
			 /* closef() expects a refcount of 2 */
			FREF(fp);
			(void) closef(fp, NULL);
		}
		free(defer, M_TEMP, sizeof(*defer) +
		    sizeof(struct fdpass) * defer->ud_n);
		rw_enter_write(&unp_df_lock);
	}
	rw_exit_write(&unp_df_lock);

	nunref = 0;

	/* held for the whole mark-and-sweep below */
	rw_enter_write(&unp_gc_lock);

	/*
	 * Determine sockets which may be prospectively dead. Such
	 * sockets have their `unp_msgcount' equal to the `f_count'.
	 * If `unp_msgcount' is 0, the socket has not been passed
	 * and can't be unreferenced.
	 */
	LIST_FOREACH(unp, &unp_head, unp_link) {
		unp->unp_gcflags = 0;

		if (unp->unp_msgcount == 0)
			continue;
		if ((fp = unp->unp_file) == NULL)
			continue;
		if (fp->f_count == unp->unp_msgcount) {
			unp->unp_gcflags |= UNP_GCDEAD;
			unp->unp_gcrefs = unp->unp_msgcount;
			nunref++;
		}
	}

	/*
	 * Scan all sockets previously marked as dead. Remove
	 * the `unp_gcrefs' reference each socket holds on any
	 * dead socket in its buffer.
	 */
	LIST_FOREACH(unp, &unp_head, unp_link) {
		if ((unp->unp_gcflags & UNP_GCDEAD) == 0)
			continue;
		so = unp->unp_socket;
		solock(so);
		unp_scan(so->so_rcv.sb_mb, unp_remove_gcrefs);
		sounlock(so);
	}

	/*
	 * If the dead socket has `unp_gcrefs' reference counter
	 * greater than 0, it can't be unreferenced. Mark it as
	 * alive and increment the `unp_gcrefs' reference for each
	 * dead socket within its buffer. Repeat this until we
	 * have no new alive sockets found.
	 */
	do {
		/* unp_defer is bumped by unp_restore_gcrefs() */
		unp_defer = 0;

		LIST_FOREACH(unp, &unp_head, unp_link) {
			if ((unp->unp_gcflags & UNP_GCDEAD) == 0)
				continue;
			if (unp->unp_gcrefs == 0)
				continue;

			/* still externally referenced: revive it */
			unp->unp_gcflags &= ~UNP_GCDEAD;

			so = unp->unp_socket;
			solock(so);
			unp_scan(so->so_rcv.sb_mb, unp_restore_gcrefs);
			sounlock(so);

			KASSERT(nunref > 0);
			nunref--;
		}
	} while (unp_defer > 0);

	/*
	 * If there are any unreferenced sockets, then for each dispose
	 * of files in its receive buffer and then close it.
	 */
	if (nunref) {
		LIST_FOREACH(unp, &unp_head, unp_link) {
			if (unp->unp_gcflags & UNP_GCDEAD) {
				/*
				 * This socket could still be connected
				 * and if so it's `so_rcv' is still
				 * accessible by concurrent PRU_SEND
				 * thread.
				 */
				so = unp->unp_socket;
				solock(so);
				unp_scan(so->so_rcv.sb_mb, unp_discard);
				sounlock(so);
			}
		}
	}

	unp_gcing = 0;
unlock:
	rw_exit_write(&unp_gc_lock);
}
1450 
1451 void
1452 unp_dispose(struct mbuf *m)
1453 {
1454 
1455 	if (m)
1456 		unp_scan(m, unp_discard);
1457 }
1458 
1459 void
1460 unp_scan(struct mbuf *m0, void (*op)(struct fdpass *, int))
1461 {
1462 	struct mbuf *m;
1463 	struct fdpass *rp;
1464 	struct cmsghdr *cm;
1465 	int qfds;
1466 
1467 	while (m0) {
1468 		for (m = m0; m; m = m->m_next) {
1469 			if (m->m_type == MT_CONTROL &&
1470 			    m->m_len >= sizeof(*cm)) {
1471 				cm = mtod(m, struct cmsghdr *);
1472 				if (cm->cmsg_level != SOL_SOCKET ||
1473 				    cm->cmsg_type != SCM_RIGHTS)
1474 					continue;
1475 				qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof *cm))
1476 				    / sizeof(struct fdpass);
1477 				if (qfds > 0) {
1478 					rp = (struct fdpass *)CMSG_DATA(cm);
1479 					op(rp, qfds);
1480 				}
1481 				break;		/* XXX, but saves time */
1482 			}
1483 		}
1484 		m0 = m0->m_nextpkt;
1485 	}
1486 }
1487 
1488 void
1489 unp_discard(struct fdpass *rp, int nfds)
1490 {
1491 	struct unp_deferral *defer;
1492 
1493 	/* copy the file pointers to a deferral structure */
1494 	defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
1495 	defer->ud_n = nfds;
1496 	memcpy(&defer->ud_fp[0], rp, sizeof(*rp) * nfds);
1497 	memset(rp, 0, sizeof(*rp) * nfds);
1498 
1499 	rw_enter_write(&unp_df_lock);
1500 	SLIST_INSERT_HEAD(&unp_deferred, defer, ud_link);
1501 	rw_exit_write(&unp_df_lock);
1502 
1503 	task_add(systqmp, &unp_gc_task);
1504 }
1505 
1506 void
1507 unp_remove_gcrefs(struct fdpass *rp, int nfds)
1508 {
1509 	struct unpcb *unp;
1510 	int i;
1511 
1512 	rw_assert_wrlock(&unp_gc_lock);
1513 
1514 	for (i = 0; i < nfds; i++) {
1515 		if (rp[i].fp == NULL)
1516 			continue;
1517 		if ((unp = fptounp(rp[i].fp)) == NULL)
1518 			continue;
1519 		if (unp->unp_gcflags & UNP_GCDEAD) {
1520 			KASSERT(unp->unp_gcrefs > 0);
1521 			unp->unp_gcrefs--;
1522 		}
1523 	}
1524 }
1525 
1526 void
1527 unp_restore_gcrefs(struct fdpass *rp, int nfds)
1528 {
1529 	struct unpcb *unp;
1530 	int i;
1531 
1532 	rw_assert_wrlock(&unp_gc_lock);
1533 
1534 	for (i = 0; i < nfds; i++) {
1535 		if (rp[i].fp == NULL)
1536 			continue;
1537 		if ((unp = fptounp(rp[i].fp)) == NULL)
1538 			continue;
1539 		if (unp->unp_gcflags & UNP_GCDEAD) {
1540 			unp->unp_gcrefs++;
1541 			unp_defer++;
1542 		}
1543 	}
1544 }
1545 
1546 int
1547 unp_nam2sun(struct mbuf *nam, struct sockaddr_un **sun, size_t *pathlen)
1548 {
1549 	struct sockaddr *sa = mtod(nam, struct sockaddr *);
1550 	size_t size, len;
1551 
1552 	if (nam->m_len < offsetof(struct sockaddr, sa_data))
1553 		return EINVAL;
1554 	if (sa->sa_family != AF_UNIX)
1555 		return EAFNOSUPPORT;
1556 	if (sa->sa_len != nam->m_len)
1557 		return EINVAL;
1558 	if (sa->sa_len > sizeof(struct sockaddr_un))
1559 		return EINVAL;
1560 	*sun = (struct sockaddr_un *)sa;
1561 
1562 	/* ensure that sun_path is NUL terminated and fits */
1563 	size = (*sun)->sun_len - offsetof(struct sockaddr_un, sun_path);
1564 	len = strnlen((*sun)->sun_path, size);
1565 	if (len == sizeof((*sun)->sun_path))
1566 		return EINVAL;
1567 	if (len == size) {
1568 		if (m_trailingspace(nam) == 0)
1569 			return EINVAL;
1570 		nam->m_len++;
1571 		(*sun)->sun_len++;
1572 		(*sun)->sun_path[len] = '\0';
1573 	}
1574 	if (pathlen != NULL)
1575 		*pathlen = len;
1576 
1577 	return 0;
1578 }
1579