xref: /openbsd-src/sys/kern/uipc_usrreq.c (revision 097a140d792de8b2bbe59ad827d39eabf9b4280a)
1 /*	$OpenBSD: uipc_usrreq.c,v 1.144 2021/02/22 19:14:01 mvs Exp $	*/
2 /*	$NetBSD: uipc_usrreq.c,v 1.18 1996/02/09 19:00:50 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1989, 1991, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 #include <sys/filedesc.h>
39 #include <sys/domain.h>
40 #include <sys/protosw.h>
41 #include <sys/queue.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/unpcb.h>
45 #include <sys/un.h>
46 #include <sys/namei.h>
47 #include <sys/vnode.h>
48 #include <sys/file.h>
49 #include <sys/stat.h>
50 #include <sys/mbuf.h>
51 #include <sys/task.h>
52 #include <sys/pledge.h>
53 #include <sys/pool.h>
54 #include <sys/rwlock.h>
55 
/*
 * Locks used to protect global data and struct members:
 *      I       immutable after creation
 *      U       unp_lock
 */
struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");

/*
 * Stack of sets of files that were passed over a socket but were
 * not received and need to be closed.
 */
struct	unp_deferral {
	SLIST_ENTRY(unp_deferral)	ud_link;	/* [U] on unp_deferred */
	int				ud_n;		/* [I] entries in ud_fp[] */
	/* followed by ud_n struct fdpass */
	struct fdpass			ud_fp[];	/* [I] the passed files */
};

/* Local helpers; see the function definitions below. */
void	uipc_setaddr(const struct unpcb *, struct mbuf *);
void	unp_discard(struct fdpass *, int);
void	unp_mark(struct fdpass *, int);
void	unp_scan(struct mbuf *, void (*)(struct fdpass *, int));
int	unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *);

struct pool unpcb_pool;		/* backing store for unpcb's; see unp_init() */
/* GC of in-flight descriptors runs from a task on systqmp. */
struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL);

/*
 * Unix communications domain.
 *
 * TODO:
 *	RDM
 *	rethink name space problems
 *	need a proper out-of-band
 */
const struct	sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };

/* [U] list of all UNIX domain sockets, for unp_gc() */
LIST_HEAD(unp_head, unpcb)	unp_head =
	LIST_HEAD_INITIALIZER(unp_head);
/* [U] list of sets of files that were sent over sockets that are now closed */
SLIST_HEAD(,unp_deferral)	unp_deferred =
	SLIST_HEAD_INITIALIZER(unp_deferred);

ino_t	unp_ino;	/* [U] prototype for fake inode numbers */
int	unp_rights;	/* [U] file descriptors in flight */
int	unp_defer;	/* [U] number of deferred fp to close by the GC task */
int	unp_gcing;	/* [U] GC task currently running */
104 
/*
 * One-time initialization of the unix-domain socket layer: set up the
 * pool backing all unpcb allocations (used by uipc_attach()/unp_detach()).
 */
void
unp_init(void)
{
	pool_init(&unpcb_pool, sizeof(struct unpcb), 0,
	    IPL_SOFTNET, 0, "unpcb", NULL);
}
111 
112 void
113 uipc_setaddr(const struct unpcb *unp, struct mbuf *nam)
114 {
115 	if (unp != NULL && unp->unp_addr != NULL) {
116 		nam->m_len = unp->unp_addr->m_len;
117 		memcpy(mtod(nam, caddr_t), mtod(unp->unp_addr, caddr_t),
118 		    nam->m_len);
119 	} else {
120 		nam->m_len = sizeof(sun_noname);
121 		memcpy(mtod(nam, struct sockaddr *), &sun_noname,
122 		    nam->m_len);
123 	}
124 }
125 
/*
 * Protocol user-request handler for unix-domain sockets: dispatch on
 * the PRU_* request code coming from the socket layer.
 *
 * NOTE(review): callees such as unp_connect2() assert that `unp_lock'
 * is write-locked, so this is expected to run with the socket locked.
 * Ownership of `m' and `control' stays with the caller for PRU_RCVD,
 * PRU_RCVOOB and PRU_SENSE; for all other requests they are consumed
 * (freed) here — see the release path at the bottom.
 */
int
uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control, struct proc *p)
{
	struct unpcb *unp = sotounpcb(so);
	struct unpcb *unp2;
	struct socket *so2;
	int error = 0;

	if (req == PRU_CONTROL)
		return (EOPNOTSUPP);
	/* Control data is only meaningful on PRU_SEND (SCM_RIGHTS). */
	if (req != PRU_SEND && control && control->m_len) {
		error = EOPNOTSUPP;
		goto release;
	}
	if (unp == NULL) {
		error = EINVAL;
		goto release;
	}

	NET_ASSERT_UNLOCKED();

	switch (req) {

	case PRU_BIND:
		error = unp_bind(unp, nam, p);
		break;

	case PRU_LISTEN:
		/* Only sockets bound to a path may listen. */
		if (unp->unp_vnode == NULL)
			error = EINVAL;
		break;

	case PRU_CONNECT:
		error = unp_connect(so, nam, p);
		break;

	case PRU_CONNECT2:
		/*
		 * Pair two sockets directly (socketpair-style); `nam'
		 * carries the second socket.  Record the caller's
		 * credentials on both endpoints (UNP_FEIDS).
		 */
		error = unp_connect2(so, (struct socket *)nam);
		if (!error) {
			unp->unp_connid.uid = p->p_ucred->cr_uid;
			unp->unp_connid.gid = p->p_ucred->cr_gid;
			unp->unp_connid.pid = p->p_p->ps_pid;
			unp->unp_flags |= UNP_FEIDS;
			unp2 = sotounpcb((struct socket *)nam);
			unp2->unp_connid.uid = p->p_ucred->cr_uid;
			unp2->unp_connid.gid = p->p_ucred->cr_gid;
			unp2->unp_connid.pid = p->p_p->ps_pid;
			unp2->unp_flags |= UNP_FEIDS;
		}
		break;

	case PRU_DISCONNECT:
		unp_disconnect(unp);
		break;

	case PRU_ACCEPT:
		/*
		 * Pass back name of connected socket,
		 * if it was bound and we are still connected
		 * (our peer may have closed already!).
		 */
		uipc_setaddr(unp->unp_conn, nam);
		break;

	case PRU_SHUTDOWN:
		socantsendmore(so);
		unp_shutdown(unp);
		break;

	case PRU_RCVD:
		switch (so->so_type) {

		case SOCK_DGRAM:
			/* Datagram sockets never generate PRU_RCVD. */
			panic("uipc 1");
			/*NOTREACHED*/

		case SOCK_STREAM:
		case SOCK_SEQPACKET:
			if (unp->unp_conn == NULL)
				break;
			so2 = unp->unp_conn->unp_socket;
			/*
			 * Adjust backpressure on sender
			 * and wakeup any waiting to write.
			 */
			so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
			so2->so_snd.sb_cc = so->so_rcv.sb_cc;
			sowwakeup(so2);
			break;

		default:
			panic("uipc 2");
		}
		break;

	case PRU_SEND:
		/* Convert fd numbers in `control' into file pointers. */
		if (control && (error = unp_internalize(control, p)))
			break;
		switch (so->so_type) {

		case SOCK_DGRAM: {
			const struct sockaddr *from;

			if (nam) {
				/* Implicit connect for this one send. */
				if (unp->unp_conn) {
					error = EISCONN;
					break;
				}
				error = unp_connect(so, nam, p);
				if (error)
					break;
			} else {
				if (unp->unp_conn == NULL) {
					error = ENOTCONN;
					break;
				}
			}
			so2 = unp->unp_conn->unp_socket;
			if (unp->unp_addr)
				from = mtod(unp->unp_addr, struct sockaddr *);
			else
				from = &sun_noname;
			if (sbappendaddr(so2, &so2->so_rcv, from, m, control)) {
				sorwakeup(so2);
				/* Buffers now owned by the receiver. */
				m = NULL;
				control = NULL;
			} else
				error = ENOBUFS;
			/* Undo the implicit connect from above. */
			if (nam)
				unp_disconnect(unp);
			break;
		}

		case SOCK_STREAM:
		case SOCK_SEQPACKET:
			if (so->so_state & SS_CANTSENDMORE) {
				error = EPIPE;
				break;
			}
			if (unp->unp_conn == NULL) {
				error = ENOTCONN;
				break;
			}
			so2 = unp->unp_conn->unp_socket;
			/*
			 * Send to paired receive port, and then raise
			 * send buffer counts to maintain backpressure.
			 * Wake up readers.
			 */
			if (control) {
				if (sbappendcontrol(so2, &so2->so_rcv, m,
				    control)) {
					control = NULL;
				} else {
					error = ENOBUFS;
					break;
				}
			} else if (so->so_type == SOCK_SEQPACKET)
				sbappendrecord(so2, &so2->so_rcv, m);
			else
				sbappend(so2, &so2->so_rcv, m);
			so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
			so->so_snd.sb_cc = so2->so_rcv.sb_cc;
			if (so2->so_rcv.sb_cc > 0)
				sorwakeup(so2);
			m = NULL;
			break;

		default:
			panic("uipc 4");
		}
		/* we need to undo unp_internalize in case of errors */
		if (control && error)
			unp_dispose(control);
		break;

	case PRU_ABORT:
		unp_drop(unp, ECONNABORTED);
		break;

	case PRU_SENSE: {
		/* fstat(2): `m' actually points at a struct stat here. */
		struct stat *sb = (struct stat *)m;

		sb->st_blksize = so->so_snd.sb_hiwat;
		sb->st_dev = NODEV;
		/* Lazily assign a fake inode number from the prototype. */
		if (unp->unp_ino == 0)
			unp->unp_ino = unp_ino++;
		sb->st_atim.tv_sec =
		    sb->st_mtim.tv_sec =
		    sb->st_ctim.tv_sec = unp->unp_ctime.tv_sec;
		sb->st_atim.tv_nsec =
		    sb->st_mtim.tv_nsec =
		    sb->st_ctim.tv_nsec = unp->unp_ctime.tv_nsec;
		sb->st_ino = unp->unp_ino;
		break;
	}

	case PRU_RCVOOB:
	case PRU_SENDOOB:
		/* No out-of-band data on unix sockets. */
		error = EOPNOTSUPP;
		break;

	case PRU_SOCKADDR:
		uipc_setaddr(unp, nam);
		break;

	case PRU_PEERADDR:
		uipc_setaddr(unp->unp_conn, nam);
		break;

	case PRU_SLOWTIMO:
		break;

	default:
		panic("uipc_usrreq");
	}
release:
	/* For these requests `m'/`control' belong to the caller. */
	if (req != PRU_RCVD && req != PRU_RCVOOB && req != PRU_SENSE) {
		m_freem(control);
		m_freem(m);
	}
	return (error);
}
350 
/*
 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
 * for stream sockets, although the total for sender and receiver is
 * actually only PIPSIZ.
 * Datagram sockets really use the sendspace as the maximum datagram size,
 * and don't really want to reserve the sendspace.  Their recvspace should
 * be large enough for at least one max-size datagram plus address.
 */
#define	PIPSIZ	4096
u_long	unpst_sendspace = PIPSIZ;	/* stream/seqpacket send buffer */
u_long	unpst_recvspace = PIPSIZ;	/* stream/seqpacket receive buffer */
u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
u_long	unpdg_recvspace = 4*1024;	/* datagram receive buffer */
364 
365 int
366 uipc_attach(struct socket *so, int proto)
367 {
368 	struct unpcb *unp;
369 	int error;
370 
371 	rw_assert_wrlock(&unp_lock);
372 
373 	if (so->so_pcb)
374 		return EISCONN;
375 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
376 		switch (so->so_type) {
377 
378 		case SOCK_STREAM:
379 		case SOCK_SEQPACKET:
380 			error = soreserve(so, unpst_sendspace, unpst_recvspace);
381 			break;
382 
383 		case SOCK_DGRAM:
384 			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
385 			break;
386 
387 		default:
388 			panic("unp_attach");
389 		}
390 		if (error)
391 			return (error);
392 	}
393 	unp = pool_get(&unpcb_pool, PR_NOWAIT|PR_ZERO);
394 	if (unp == NULL)
395 		return (ENOBUFS);
396 	unp->unp_socket = so;
397 	so->so_pcb = unp;
398 	getnanotime(&unp->unp_ctime);
399 	LIST_INSERT_HEAD(&unp_head, unp, unp_link);
400 	return (0);
401 }
402 
403 int
404 uipc_detach(struct socket *so)
405 {
406 	struct unpcb *unp = sotounpcb(so);
407 
408 	if (unp == NULL)
409 		return (EINVAL);
410 
411 	NET_ASSERT_UNLOCKED();
412 
413 	unp_detach(unp);
414 
415 	return (0);
416 }
417 
/*
 * Tear down a unpcb: unlink it from the global socket list, detach
 * any bound vnode, break connections, reset datagram peers, and free
 * the control block.  Schedules the GC task when descriptors are
 * still in flight.  Called with `unp_lock' write-locked (asserted);
 * may temporarily drop the socket lock to vrele() the vnode.
 */
void
unp_detach(struct unpcb *unp)
{
	struct socket *so = unp->unp_socket;
	struct vnode *vp = NULL;

	rw_assert_wrlock(&unp_lock);

	LIST_REMOVE(unp, unp_link);
	if (unp->unp_vnode) {
		/*
		 * `v_socket' is only read in unp_connect and
		 * unplock prevents concurrent access.
		 */

		unp->unp_vnode->v_socket = NULL;
		vp = unp->unp_vnode;
		unp->unp_vnode = NULL;
	}

	if (unp->unp_conn)
		unp_disconnect(unp);
	/* Reset every datagram sender still connected to us. */
	while (!SLIST_EMPTY(&unp->unp_refs))
		unp_drop(SLIST_FIRST(&unp->unp_refs), ECONNRESET);
	soisdisconnected(so);
	so->so_pcb = NULL;
	m_freem(unp->unp_addr);
	pool_put(&unpcb_pool, unp);
	if (unp_rights)
		task_add(systqmp, &unp_gc_task);

	if (vp != NULL) {
		/*
		 * Enforce `i_lock' -> `unplock' because fifo subsystem
		 * requires it. The socket can't be closed concurrently
		 * because the file descriptor reference is still held.
		 */

		sounlock(so, SL_LOCKED);
		KERNEL_LOCK();
		vrele(vp);
		KERNEL_UNLOCK();
		solock(so);
	}
}
464 
/*
 * Bind the socket behind `unp' to the filesystem path carried in
 * `nam': create a VSOCK vnode via namei()/VOP_CREATE(), record the
 * address on the pcb and stash the binding process' credentials
 * (UNP_FEIDSBIND) for later credential queries.  Fails with EINVAL
 * if the socket is already bound or a bind/connect is in progress,
 * and EADDRINUSE if the path already exists.
 */
int
unp_bind(struct unpcb *unp, struct mbuf *nam, struct proc *p)
{
	struct sockaddr_un *soun;
	struct mbuf *nam2;
	struct vnode *vp;
	struct vattr vattr;
	int error;
	struct nameidata nd;
	size_t pathlen;

	/* Serialize: only one bind or connect per socket at a time. */
	if (unp->unp_flags & (UNP_BINDING | UNP_CONNECTING))
		return (EINVAL);
	if (unp->unp_vnode != NULL)
		return (EINVAL);
	if ((error = unp_nam2sun(nam, &soun, &pathlen)))
		return (error);

	/* Take a private, fixed-size copy of the address. */
	nam2 = m_getclr(M_WAITOK, MT_SONAME);
	nam2->m_len = sizeof(struct sockaddr_un);
	memcpy(mtod(nam2, struct sockaddr_un *), soun,
	    offsetof(struct sockaddr_un, sun_path) + pathlen);
	/* No need to NUL terminate: m_getclr() returns zero'd mbufs. */

	soun = mtod(nam2, struct sockaddr_un *);

	/* Fixup sun_len to keep it in sync with m_len. */
	soun->sun_len = nam2->m_len;

	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
	    soun->sun_path, p);
	nd.ni_pledge = PLEDGE_UNIX;

	unp->unp_flags |= UNP_BINDING;

	/*
	 * Enforce `i_lock' -> `unplock' because fifo subsystem
	 * requires it. The socket can't be closed concurrently
	 * because the file descriptor reference is still held.
	 */

	sounlock(unp->unp_socket, SL_LOCKED);

	KERNEL_LOCK();
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	error = namei(&nd);
	if (error != 0) {
		m_freem(nam2);
		solock(unp->unp_socket);
		goto out;
	}
	vp = nd.ni_vp;
	if (vp != NULL) {
		/* A file already exists at that path: refuse to bind. */
		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		m_freem(nam2);
		error = EADDRINUSE;
		solock(unp->unp_socket);
		goto out;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask;
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	vput(nd.ni_dvp);
	if (error) {
		m_freem(nam2);
		solock(unp->unp_socket);
		goto out;
	}
	solock(unp->unp_socket);
	unp->unp_addr = nam2;
	vp = nd.ni_vp;
	/* Cross-link vnode and socket so unp_connect() can find us. */
	vp->v_socket = unp->unp_socket;
	unp->unp_vnode = vp;
	unp->unp_connid.uid = p->p_ucred->cr_uid;
	unp->unp_connid.gid = p->p_ucred->cr_gid;
	unp->unp_connid.pid = p->p_p->ps_pid;
	unp->unp_flags |= UNP_FEIDSBIND;
	VOP_UNLOCK(vp);
out:
	KERNEL_UNLOCK();
	unp->unp_flags &= ~UNP_BINDING;

	return (error);
}
555 
/*
 * Connect socket `so' to the unix-domain socket bound at the path in
 * `nam'.  Looks the path up with namei(), checks it is a VSOCK with
 * write access, and for connection-oriented types (PR_CONNREQUIRED)
 * spawns a fresh server-side socket with sonewconn(), passing the
 * client's credentials to it (UNP_FEIDS).  Finishes by pairing the
 * pcbs via unp_connect2().
 */
int
unp_connect(struct socket *so, struct mbuf *nam, struct proc *p)
{
	struct sockaddr_un *soun;
	struct vnode *vp;
	struct socket *so2, *so3;
	struct unpcb *unp, *unp2, *unp3;
	struct nameidata nd;
	int error;

	unp = sotounpcb(so);
	/* Serialize: only one bind or connect per socket at a time. */
	if (unp->unp_flags & (UNP_BINDING | UNP_CONNECTING))
		return (EISCONN);
	if ((error = unp_nam2sun(nam, &soun, NULL)))
		return (error);

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p);
	nd.ni_pledge = PLEDGE_UNIX;

	unp->unp_flags |= UNP_CONNECTING;

	/*
	 * Enforce `i_lock' -> `unplock' because fifo subsystem
	 * requires it. The socket can't be closed concurrently
	 * because the file descriptor reference is still held.
	 */

	sounlock(so, SL_LOCKED);

	KERNEL_LOCK();
	error = namei(&nd);
	if (error != 0)
		goto unlock;
	vp = nd.ni_vp;
	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto put;
	}
	/* Connecting requires write permission on the socket file. */
	if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0)
		goto put;
	solock(so);
	so2 = vp->v_socket;
	if (so2 == NULL) {
		/* Bound socket has gone away; path is stale. */
		error = ECONNREFUSED;
		goto put_locked;
	}
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto put_locked;
	}
	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
		    (so3 = sonewconn(so2, 0)) == 0) {
			error = ECONNREFUSED;
			goto put_locked;
		}
		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);
		if (unp2->unp_addr)
			unp3->unp_addr =
			    m_copym(unp2->unp_addr, 0, M_COPYALL, M_NOWAIT);
		/* Give the server-side socket the client's credentials. */
		unp3->unp_connid.uid = p->p_ucred->cr_uid;
		unp3->unp_connid.gid = p->p_ucred->cr_gid;
		unp3->unp_connid.pid = p->p_p->ps_pid;
		unp3->unp_flags |= UNP_FEIDS;
		/* Connect to the freshly spawned socket, not the listener. */
		so2 = so3;
		if (unp2->unp_flags & UNP_FEIDSBIND) {
			/* Client learns the binder's credentials. */
			unp->unp_connid = unp2->unp_connid;
			unp->unp_flags |= UNP_FEIDS;
		}
	}
	error = unp_connect2(so, so2);
put_locked:
	sounlock(so, SL_LOCKED);
put:
	vput(vp);
unlock:
	KERNEL_UNLOCK();
	solock(so);
	unp->unp_flags &= ~UNP_CONNECTING;

	return (error);
}
639 
640 int
641 unp_connect2(struct socket *so, struct socket *so2)
642 {
643 	struct unpcb *unp = sotounpcb(so);
644 	struct unpcb *unp2;
645 
646 	rw_assert_wrlock(&unp_lock);
647 
648 	if (so2->so_type != so->so_type)
649 		return (EPROTOTYPE);
650 	unp2 = sotounpcb(so2);
651 	unp->unp_conn = unp2;
652 	switch (so->so_type) {
653 
654 	case SOCK_DGRAM:
655 		SLIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_nextref);
656 		soisconnected(so);
657 		break;
658 
659 	case SOCK_STREAM:
660 	case SOCK_SEQPACKET:
661 		unp2->unp_conn = unp;
662 		soisconnected(so);
663 		soisconnected(so2);
664 		break;
665 
666 	default:
667 		panic("unp_connect2");
668 	}
669 	return (0);
670 }
671 
672 void
673 unp_disconnect(struct unpcb *unp)
674 {
675 	struct unpcb *unp2 = unp->unp_conn;
676 
677 	if (unp2 == NULL)
678 		return;
679 	unp->unp_conn = NULL;
680 	switch (unp->unp_socket->so_type) {
681 
682 	case SOCK_DGRAM:
683 		SLIST_REMOVE(&unp2->unp_refs, unp, unpcb, unp_nextref);
684 		unp->unp_socket->so_state &= ~SS_ISCONNECTED;
685 		break;
686 
687 	case SOCK_STREAM:
688 	case SOCK_SEQPACKET:
689 		unp->unp_socket->so_snd.sb_mbcnt = 0;
690 		unp->unp_socket->so_snd.sb_cc = 0;
691 		soisdisconnected(unp->unp_socket);
692 		unp2->unp_conn = NULL;
693 		unp2->unp_socket->so_snd.sb_mbcnt = 0;
694 		unp2->unp_socket->so_snd.sb_cc = 0;
695 		soisdisconnected(unp2->unp_socket);
696 		break;
697 	}
698 }
699 
700 void
701 unp_shutdown(struct unpcb *unp)
702 {
703 	struct socket *so;
704 
705 	switch (unp->unp_socket->so_type) {
706 	case SOCK_STREAM:
707 	case SOCK_SEQPACKET:
708 		if (unp->unp_conn && (so = unp->unp_conn->unp_socket))
709 			socantrcvmore(so);
710 		break;
711 	default:
712 		break;
713 	}
714 }
715 
/*
 * Abort the connection of `unp' with error `errno'.  If the socket is
 * still sitting on a listen queue (so_head set) it was never accepted,
 * so detach and free the pcb here as well.  Requires `unp_lock' held
 * for writing.
 */
void
unp_drop(struct unpcb *unp, int errno)
{
	struct socket *so = unp->unp_socket;

	rw_assert_wrlock(&unp_lock);

	so->so_error = errno;
	unp_disconnect(unp);
	if (so->so_head) {
		so->so_pcb = NULL;
		/*
		 * As long as `unp_lock' is taken before entering
		 * uipc_usrreq() releasing it here would lead to a
		 * double unlock.
		 */
		sofree(so, SL_NOUNLOCK);
		m_freem(unp->unp_addr);
		pool_put(&unpcb_pool, unp);
	}
}
737 
#ifdef notdef
/* Historical stub, never compiled (guarded by `notdef'). */
unp_drain(void)
{

}
#endif

/* The AF_UNIX domain descriptor; used by fptounp() to identify
 * unix-domain sockets among passed files. */
extern	struct domain unixdomain;
746 
747 static struct unpcb *
748 fptounp(struct file *fp)
749 {
750 	struct socket *so;
751 
752 	if (fp->f_type != DTYPE_SOCKET)
753 		return (NULL);
754 	if ((so = fp->f_data) == NULL)
755 		return (NULL);
756 	if (so->so_proto->pr_domain != &unixdomain)
757 		return (NULL);
758 	return (sotounpcb(so));
759 }
760 
/*
 * Externalize an SCM_RIGHTS message on receive: convert the in-kernel
 * `struct fdpass' array inside `rights' into file descriptor numbers
 * in the receiving process, allocating descriptor table slots and
 * dropping the in-flight accounting (unp_rights, unp_msgcount).  On
 * any failure the passed files are handed to unp_discard() so the GC
 * task closes them.  Requires `unp_lock' held for writing.
 */
int
unp_externalize(struct mbuf *rights, socklen_t controllen, int flags)
{
	struct proc *p = curproc;		/* XXX */
	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
	struct filedesc *fdp = p->p_fd;
	int i, *fds = NULL;
	struct fdpass *rp;
	struct file *fp;
	int nfds, error = 0;

	rw_assert_wrlock(&unp_lock);

	/*
	 * This code only works because SCM_RIGHTS is the only supported
	 * control message type on unix sockets. Enforce this here.
	 */
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET)
		return EINVAL;

	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
	    sizeof(struct fdpass);
	if (controllen < CMSG_ALIGN(sizeof(struct cmsghdr)))
		controllen = 0;
	else
		controllen -= CMSG_ALIGN(sizeof(struct cmsghdr));
	/*
	 * Recipient's buffer can't hold all the fds; jump straight to
	 * the discard path (fds is still NULL there, which is fine).
	 */
	if (nfds > controllen / sizeof(int)) {
		error = EMSGSIZE;
		goto restart;
	}

	/* Make sure the recipient should be able to see the descriptors.. */
	rp = (struct fdpass *)CMSG_DATA(cm);

	/* fdp->fd_rdir requires KERNEL_LOCK() */
	KERNEL_LOCK();

	for (i = 0; i < nfds; i++) {
		fp = rp->fp;
		rp++;
		error = pledge_recvfd(p, fp);
		if (error)
			break;

		/*
		 * No to block devices.  If passing a directory,
		 * make sure that it is underneath the root.
		 */
		if (fdp->fd_rdir != NULL && fp->f_type == DTYPE_VNODE) {
			struct vnode *vp = (struct vnode *)fp->f_data;

			if (vp->v_type == VBLK ||
			    (vp->v_type == VDIR &&
			    !vn_isunder(vp, fdp->fd_rdir, p))) {
				error = EPERM;
				break;
			}
		}
	}

	KERNEL_UNLOCK();

	fds = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK);

restart:
	fdplock(fdp);
	if (error != 0) {
		/* Failure: let the GC task close all the passed files. */
		if (nfds > 0) {
			rp = ((struct fdpass *)CMSG_DATA(cm));
			unp_discard(rp, nfds);
		}
		goto out;
	}

	/*
	 * First loop -- allocate file descriptor table slots for the
	 * new descriptors.
	 */
	rp = ((struct fdpass *)CMSG_DATA(cm));
	for (i = 0; i < nfds; i++) {
		if ((error = fdalloc(p, 0, &fds[i])) != 0) {
			/*
			 * Back out what we've done so far.
			 */
			for (--i; i >= 0; i--)
				fdremove(fdp, fds[i]);

			if (error == ENOSPC) {
				/* Grow the table and retry from scratch. */
				fdexpand(p);
				error = 0;
			} else {
				/*
				 * This is the error that has historically
				 * been returned, and some callers may
				 * expect it.
				 */
				error = EMSGSIZE;
			}
			fdpunlock(fdp);
			goto restart;
		}

		/*
		 * Make the slot reference the descriptor so that
		 * fdalloc() works properly.. We finalize it all
		 * in the loop below.
		 */
		mtx_enter(&fdp->fd_fplock);
		KASSERT(fdp->fd_ofiles[fds[i]] == NULL);
		fdp->fd_ofiles[fds[i]] = rp->fp;
		mtx_leave(&fdp->fd_fplock);

		fdp->fd_ofileflags[fds[i]] = (rp->flags & UF_PLEDGED);
		if (flags & MSG_CMSG_CLOEXEC)
			fdp->fd_ofileflags[fds[i]] |= UF_EXCLOSE;

		rp++;
	}

	/*
	 * Now that adding them has succeeded, update all of the
	 * descriptor passing state.
	 */
	rp = (struct fdpass *)CMSG_DATA(cm);
	for (i = 0; i < nfds; i++) {
		struct unpcb *unp;

		fp = rp->fp;
		rp++;
		if ((unp = fptounp(fp)) != NULL)
			unp->unp_msgcount--;
		unp_rights--;
	}

	/*
	 * Copy temporary array to message and adjust length, in case of
	 * transition from large struct file pointers to ints.
	 */
	memcpy(CMSG_DATA(cm), fds, nfds * sizeof(int));
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
	rights->m_len = CMSG_LEN(nfds * sizeof(int));
 out:
	fdpunlock(fdp);
	if (fds != NULL)
		free(fds, M_TEMP, nfds * sizeof(int));
	return (error);
}
908 
/*
 * Internalize an SCM_RIGHTS message on send: convert the descriptor
 * numbers in `control' into `struct fdpass' entries (file pointer +
 * pledge flag), taking a reference on each file and bumping the
 * in-flight counters.  Because struct fdpass is larger than an int,
 * the mbuf may be upgraded to a cluster to make room; the conversion
 * is done back-to-front so ints are not overwritten before they are
 * read.  Requires `unp_lock' held for writing.
 */
int
unp_internalize(struct mbuf *control, struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
	struct fdpass *rp;
	struct file *fp;
	struct unpcb *unp;
	int i, error;
	int nfds, *ip, fd, neededspace;

	rw_assert_wrlock(&unp_lock);

	/*
	 * Check for two potential msg_controllen values because
	 * IETF stuck their nose in a place it does not belong.
	 */
	if (control->m_len < CMSG_LEN(0) || cm->cmsg_len < CMSG_LEN(0))
		return (EINVAL);
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
	    !(cm->cmsg_len == control->m_len ||
	    control->m_len == CMSG_ALIGN(cm->cmsg_len)))
		return (EINVAL);
	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);

	/* Global cap on descriptors in flight. */
	if (unp_rights + nfds > maxfiles / 10)
		return (EMFILE);

	/* Make sure we have room for the struct file pointers */
morespace:
	neededspace = CMSG_SPACE(nfds * sizeof(struct fdpass)) -
	    control->m_len;
	if (neededspace > m_trailingspace(control)) {
		char *tmp;
		/* if we already have a cluster, the message is just too big */
		if (control->m_flags & M_EXT)
			return (E2BIG);

		/* copy cmsg data temporarily out of the mbuf */
		tmp = malloc(control->m_len, M_TEMP, M_WAITOK);
		memcpy(tmp, mtod(control, caddr_t), control->m_len);

		/* allocate a cluster and try again */
		MCLGET(control, M_WAIT);
		if ((control->m_flags & M_EXT) == 0) {
			free(tmp, M_TEMP, control->m_len);
			return (ENOBUFS);       /* allocation failed */
		}

		/* copy the data back into the cluster */
		cm = mtod(control, struct cmsghdr *);
		memcpy(cm, tmp, control->m_len);
		free(tmp, M_TEMP, control->m_len);
		goto morespace;
	}

	/* adjust message & mbuf to note amount of space actually used. */
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(struct fdpass));
	control->m_len = CMSG_SPACE(nfds * sizeof(struct fdpass));

	/* Walk both arrays from the last element downwards. */
	ip = ((int *)CMSG_DATA(cm)) + nfds - 1;
	rp = ((struct fdpass *)CMSG_DATA(cm)) + nfds - 1;
	fdplock(fdp);
	for (i = 0; i < nfds; i++) {
		memcpy(&fd, ip, sizeof fd);
		ip--;
		if ((fp = fd_getfile(fdp, fd)) == NULL) {
			error = EBADF;
			goto fail;
		}
		/* Refuse files whose refcount is close to overflow. */
		if (fp->f_count >= FDUP_MAX_COUNT) {
			error = EDEADLK;
			goto fail;
		}
		error = pledge_sendfd(p, fp);
		if (error)
			goto fail;

		/* kqueue descriptors cannot be copied */
		if (fp->f_type == DTYPE_KQUEUE) {
			error = EINVAL;
			goto fail;
		}
		rp->fp = fp;
		rp->flags = fdp->fd_ofileflags[fd] & UF_PLEDGED;
		rp--;
		if ((unp = fptounp(fp)) != NULL) {
			unp->unp_file = fp;
			unp->unp_msgcount++;
		}
		unp_rights++;
	}
	fdpunlock(fdp);
	return (0);
fail:
	fdpunlock(fdp);
	/* Release the file that failed the checks above, if any. */
	if (fp != NULL)
		FRELE(fp, p);
	/* Back out what we just did. */
	for ( ; i > 0; i--) {
		rp++;
		fp = rp->fp;
		if ((unp = fptounp(fp)) != NULL)
			unp->unp_msgcount--;
		FRELE(fp, p);
		unp_rights--;
	}

	return (error);
}
1019 
/*
 * Garbage collector for descriptors in flight.  First closes any
 * file sets parked on the deferred list, then runs a mark-and-sweep
 * over all unix-domain sockets: a socket whose only references come
 * from being passed in messages (f_count == unp_msgcount) and which
 * is never reached from a live socket's receive buffer is part of an
 * unreachable cycle; the rights in its receive buffer are discarded.
 * Runs as a task on systqmp; takes `unp_lock' itself.
 */
void
unp_gc(void *arg __unused)
{
	struct unp_deferral *defer;
	struct file *fp;
	struct socket *so;
	struct unpcb *unp;
	int nunref, i;

	rw_enter_write(&unp_lock);

	/* Only one GC pass at a time. */
	if (unp_gcing)
		goto unlock;
	unp_gcing = 1;

	/* close any fds on the deferred list */
	while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) {
		SLIST_REMOVE_HEAD(&unp_deferred, ud_link);
		for (i = 0; i < defer->ud_n; i++) {
			fp = defer->ud_fp[i].fp;
			if (fp == NULL)
				continue;
			 /* closef() expects a refcount of 2 */
			FREF(fp);
			if ((unp = fptounp(fp)) != NULL)
				unp->unp_msgcount--;
			unp_rights--;
			/* closef() may sleep; drop the lock around it. */
			rw_exit_write(&unp_lock);
			(void) closef(fp, NULL);
			rw_enter_write(&unp_lock);
		}
		free(defer, M_TEMP, sizeof(*defer) +
		    sizeof(struct fdpass) * defer->ud_n);
	}

	/* Mark phase: clear all flags, then iterate to a fixed point. */
	unp_defer = 0;
	LIST_FOREACH(unp, &unp_head, unp_link)
		unp->unp_flags &= ~(UNP_GCMARK | UNP_GCDEFER | UNP_GCDEAD);
	do {
		nunref = 0;
		LIST_FOREACH(unp, &unp_head, unp_link) {
			fp = unp->unp_file;
			if (unp->unp_flags & UNP_GCDEFER) {
				/*
				 * This socket is referenced by another
				 * socket which is known to be live,
				 * so it's certainly live.
				 */
				unp->unp_flags &= ~UNP_GCDEFER;
				unp_defer--;
			} else if (unp->unp_flags & UNP_GCMARK) {
				/* marked as live in previous pass */
				continue;
			} else if (fp == NULL) {
				/* not being passed, so can't be in loop */
			} else if (fp->f_count == 0) {
				/*
				 * Already being closed, let normal close
				 * path take its course
				 */
			} else {
				/*
				 * Unreferenced by other sockets so far,
				 * so if all the references (f_count) are
				 * from passing (unp_msgcount) then this
				 * socket is prospectively dead
				 */
				if (fp->f_count == unp->unp_msgcount) {
					nunref++;
					unp->unp_flags |= UNP_GCDEAD;
					continue;
				}
			}

			/*
			 * This is the first time we've seen this socket on
			 * the mark pass and known it has a live reference,
			 * so mark it, then scan its receive buffer for
			 * sockets and note them as deferred (== referenced,
			 * but not yet marked).
			 */
			unp->unp_flags |= UNP_GCMARK;

			so = unp->unp_socket;
			unp_scan(so->so_rcv.sb_mb, unp_mark);
		}
	} while (unp_defer);

	/*
	 * If there are any unreferenced sockets, then for each dispose
	 * of files in its receive buffer and then close it.
	 */
	if (nunref) {
		LIST_FOREACH(unp, &unp_head, unp_link) {
			if (unp->unp_flags & UNP_GCDEAD)
				unp_scan(unp->unp_socket->so_rcv.sb_mb,
				    unp_discard);
		}
	}
	unp_gcing = 0;
unlock:
	rw_exit_write(&unp_lock);
}
1123 
1124 void
1125 unp_dispose(struct mbuf *m)
1126 {
1127 
1128 	if (m)
1129 		unp_scan(m, unp_discard);
1130 }
1131 
1132 void
1133 unp_scan(struct mbuf *m0, void (*op)(struct fdpass *, int))
1134 {
1135 	struct mbuf *m;
1136 	struct fdpass *rp;
1137 	struct cmsghdr *cm;
1138 	int qfds;
1139 
1140 	while (m0) {
1141 		for (m = m0; m; m = m->m_next) {
1142 			if (m->m_type == MT_CONTROL &&
1143 			    m->m_len >= sizeof(*cm)) {
1144 				cm = mtod(m, struct cmsghdr *);
1145 				if (cm->cmsg_level != SOL_SOCKET ||
1146 				    cm->cmsg_type != SCM_RIGHTS)
1147 					continue;
1148 				qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof *cm))
1149 				    / sizeof(struct fdpass);
1150 				if (qfds > 0) {
1151 					rp = (struct fdpass *)CMSG_DATA(cm);
1152 					op(rp, qfds);
1153 				}
1154 				break;		/* XXX, but saves time */
1155 			}
1156 		}
1157 		m0 = m0->m_nextpkt;
1158 	}
1159 }
1160 
1161 void
1162 unp_mark(struct fdpass *rp, int nfds)
1163 {
1164 	struct unpcb *unp;
1165 	int i;
1166 
1167 	rw_assert_wrlock(&unp_lock);
1168 
1169 	for (i = 0; i < nfds; i++) {
1170 		if (rp[i].fp == NULL)
1171 			continue;
1172 
1173 		unp = fptounp(rp[i].fp);
1174 		if (unp == NULL)
1175 			continue;
1176 
1177 		if (unp->unp_flags & (UNP_GCMARK|UNP_GCDEFER))
1178 			continue;
1179 
1180 		unp_defer++;
1181 		unp->unp_flags |= UNP_GCDEFER;
1182 		unp->unp_flags &= ~UNP_GCDEAD;
1183 	}
1184 }
1185 
1186 void
1187 unp_discard(struct fdpass *rp, int nfds)
1188 {
1189 	struct unp_deferral *defer;
1190 
1191 	rw_assert_wrlock(&unp_lock);
1192 
1193 	/* copy the file pointers to a deferral structure */
1194 	defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
1195 	defer->ud_n = nfds;
1196 	memcpy(&defer->ud_fp[0], rp, sizeof(*rp) * nfds);
1197 	memset(rp, 0, sizeof(*rp) * nfds);
1198 	SLIST_INSERT_HEAD(&unp_deferred, defer, ud_link);
1199 
1200 	task_add(systqmp, &unp_gc_task);
1201 }
1202 
1203 int
1204 unp_nam2sun(struct mbuf *nam, struct sockaddr_un **sun, size_t *pathlen)
1205 {
1206 	struct sockaddr *sa = mtod(nam, struct sockaddr *);
1207 	size_t size, len;
1208 
1209 	if (nam->m_len < offsetof(struct sockaddr, sa_data))
1210 		return EINVAL;
1211 	if (sa->sa_family != AF_UNIX)
1212 		return EAFNOSUPPORT;
1213 	if (sa->sa_len != nam->m_len)
1214 		return EINVAL;
1215 	if (sa->sa_len > sizeof(struct sockaddr_un))
1216 		return EINVAL;
1217 	*sun = (struct sockaddr_un *)sa;
1218 
1219 	/* ensure that sun_path is NUL terminated and fits */
1220 	size = (*sun)->sun_len - offsetof(struct sockaddr_un, sun_path);
1221 	len = strnlen((*sun)->sun_path, size);
1222 	if (len == sizeof((*sun)->sun_path))
1223 		return EINVAL;
1224 	if (len == size) {
1225 		if (m_trailingspace(nam) == 0)
1226 			return EINVAL;
1227 		nam->m_len++;
1228 		(*sun)->sun_len++;
1229 		(*sun)->sun_path[len] = '\0';
1230 	}
1231 	if (pathlen != NULL)
1232 		*pathlen = len;
1233 
1234 	return 0;
1235 }
1236