/*	$OpenBSD: uipc_usrreq.c,v 1.148 2021/05/25 22:45:09 bluhm Exp $	*/
/*	$NetBSD: uipc_usrreq.c,v 1.18 1996/02/09 19:00:50 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/filedesc.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/unpcb.h>
#include <sys/un.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/mbuf.h>
#include <sys/task.h>
#include <sys/pledge.h>
#include <sys/pool.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>

/*
 * Locks used to protect global data and struct members:
 *      I       immutable after creation
 *      U       unp_lock
 */
struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");

/*
 * Stack of sets of files that were passed over a socket but were
 * not received and need to be closed.
 */
struct	unp_deferral {
	SLIST_ENTRY(unp_deferral)	ud_link;	/* [U] */
	int				ud_n;		/* [I] */
	/* followed by ud_n struct fdpass */
	struct fdpass			ud_fp[];	/* [I] */
};

void	uipc_setaddr(const struct unpcb *, struct mbuf *);
void	unp_discard(struct fdpass *, int);
void	unp_mark(struct fdpass *, int);
void	unp_scan(struct mbuf *, void (*)(struct fdpass *, int));
int	unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *);

struct pool unpcb_pool;
struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL);

/*
 * Unix communications domain.
 *
 * TODO:
 *	RDM
 *	rethink name space problems
 *	need a proper out-of-band
 */
const struct	sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };

/* [U] list of all UNIX domain sockets, for unp_gc() */
LIST_HEAD(unp_head, unpcb)	unp_head =
	LIST_HEAD_INITIALIZER(unp_head);
/* [U] list of sets of files that were sent over sockets that are now closed */
SLIST_HEAD(,unp_deferral)	unp_deferred =
	SLIST_HEAD_INITIALIZER(unp_deferred);

ino_t	unp_ino;	/* [U] prototype for fake inode numbers */
int	unp_rights;	/* [U] file descriptors in flight */
int	unp_defer;	/* [U] number of deferred fp to close by the GC task */
int	unp_gcing;	/* [U] GC task currently running */

void
unp_init(void)
{
	pool_init(&unpcb_pool, sizeof(struct unpcb), 0,
	    IPL_SOFTNET, 0, "unpcb", NULL);
}

void
uipc_setaddr(const struct unpcb *unp, struct mbuf *nam)
{
	if (unp != NULL && unp->unp_addr != NULL) {
		nam->m_len = unp->unp_addr->m_len;
		memcpy(mtod(nam, caddr_t), mtod(unp->unp_addr, caddr_t),
		    nam->m_len);
	} else {
		nam->m_len = sizeof(sun_noname);
		memcpy(mtod(nam, struct sockaddr *), &sun_noname,
		    nam->m_len);
	}
}

int
uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control, struct proc *p)
{
	struct unpcb *unp = sotounpcb(so);
	struct unpcb *unp2;
	struct socket *so2;
	int error = 0;

	if (req == PRU_CONTROL)
		return (EOPNOTSUPP);
	if (req != PRU_SEND && control && control->m_len) {
		error = EOPNOTSUPP;
		goto release;
	}
	if (unp == NULL) {
		error = EINVAL;
		goto release;
	}

	switch (req) {

	case PRU_BIND:
		error = unp_bind(unp, nam, p);
		break;

	case PRU_LISTEN:
		if (unp->unp_vnode == NULL)
			error = EINVAL;
		break;

	case PRU_CONNECT:
		error = unp_connect(so, nam, p);
		break;

	case PRU_CONNECT2:
		error = unp_connect2(so, (struct socket *)nam);
		if (!error) {
			unp->unp_connid.uid = p->p_ucred->cr_uid;
			unp->unp_connid.gid = p->p_ucred->cr_gid;
			unp->unp_connid.pid = p->p_p->ps_pid;
			unp->unp_flags |= UNP_FEIDS;
			unp2 = sotounpcb((struct socket *)nam);
			unp2->unp_connid.uid = p->p_ucred->cr_uid;
			unp2->unp_connid.gid = p->p_ucred->cr_gid;
			unp2->unp_connid.pid = p->p_p->ps_pid;
			unp2->unp_flags |= UNP_FEIDS;
		}
		break;

	case PRU_DISCONNECT:
		unp_disconnect(unp);
		break;

	case PRU_ACCEPT:
		/*
		 * Pass back name of connected socket,
		 * if it was bound and we are still connected
		 * (our peer may have closed already!).
		 */
		uipc_setaddr(unp->unp_conn, nam);
		break;

	case PRU_SHUTDOWN:
		socantsendmore(so);
		unp_shutdown(unp);
		break;

	case PRU_RCVD:
		switch (so->so_type) {

		case SOCK_DGRAM:
			panic("uipc 1");
			/*NOTREACHED*/

		case SOCK_STREAM:
		case SOCK_SEQPACKET:
			if (unp->unp_conn == NULL)
				break;
			so2 = unp->unp_conn->unp_socket;
			/*
			 * Adjust backpressure on sender
			 * and wakeup any waiting to write.
			 */
			so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
			so2->so_snd.sb_cc = so->so_rcv.sb_cc;
			sowwakeup(so2);
			break;

		default:
			panic("uipc 2");
		}
		break;

	case PRU_SEND:
		if (control && (error = unp_internalize(control, p)))
			break;
		switch (so->so_type) {

		case SOCK_DGRAM: {
			const struct sockaddr *from;

			if (nam) {
				if (unp->unp_conn) {
					error = EISCONN;
					break;
				}
				error = unp_connect(so, nam, p);
				if (error)
					break;
			} else {
				if (unp->unp_conn == NULL) {
					error = ENOTCONN;
					break;
				}
			}
			so2 = unp->unp_conn->unp_socket;
			if (unp->unp_addr)
				from = mtod(unp->unp_addr, struct sockaddr *);
			else
				from = &sun_noname;
			if (sbappendaddr(so2, &so2->so_rcv, from, m, control)) {
				sorwakeup(so2);
				m = NULL;
				control = NULL;
			} else
				error = ENOBUFS;
			if (nam)
				unp_disconnect(unp);
			break;
		}

		case SOCK_STREAM:
		case SOCK_SEQPACKET:
			if (so->so_state & SS_CANTSENDMORE) {
				error = EPIPE;
				break;
			}
			if (unp->unp_conn == NULL) {
				error = ENOTCONN;
				break;
			}
			so2 = unp->unp_conn->unp_socket;
			/*
			 * Send to paired receive port, and then raise
			 * send buffer counts to maintain backpressure.
			 * Wake up readers.
			 */
			if (control) {
				if (sbappendcontrol(so2, &so2->so_rcv, m,
				    control)) {
					control = NULL;
				} else {
					error = ENOBUFS;
					break;
				}
			} else if (so->so_type == SOCK_SEQPACKET)
				sbappendrecord(so2, &so2->so_rcv, m);
			else
				sbappend(so2, &so2->so_rcv, m);
			so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
			so->so_snd.sb_cc = so2->so_rcv.sb_cc;
			if (so2->so_rcv.sb_cc > 0)
				sorwakeup(so2);
			m = NULL;
			break;

		default:
			panic("uipc 4");
		}
		/* we need to undo unp_internalize in case of errors */
		if (control && error)
			unp_dispose(control);
		break;

	case PRU_ABORT:
		unp_drop(unp, ECONNABORTED);
		break;

	case PRU_SENSE: {
		struct stat *sb = (struct stat *)m;

		sb->st_blksize = so->so_snd.sb_hiwat;
		sb->st_dev = NODEV;
		if (unp->unp_ino == 0)
			unp->unp_ino = unp_ino++;
		sb->st_atim.tv_sec =
		    sb->st_mtim.tv_sec =
		    sb->st_ctim.tv_sec = unp->unp_ctime.tv_sec;
		sb->st_atim.tv_nsec =
		    sb->st_mtim.tv_nsec =
		    sb->st_ctim.tv_nsec = unp->unp_ctime.tv_nsec;
		sb->st_ino = unp->unp_ino;
		break;
	}

	case PRU_RCVOOB:
	case PRU_SENDOOB:
		error = EOPNOTSUPP;
		break;

	case PRU_SOCKADDR:
		uipc_setaddr(unp, nam);
		break;

	case PRU_PEERADDR:
		uipc_setaddr(unp->unp_conn, nam);
		break;

	case PRU_SLOWTIMO:
		break;

	default:
		panic("uipc_usrreq");
	}
release:
	if (req != PRU_RCVD && req != PRU_RCVOOB && req != PRU_SENSE) {
		m_freem(control);
		m_freem(m);
	}
	return (error);
}

/*
 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
 * for stream sockets, although the total for sender and receiver is
 * actually only PIPSIZ.
 * Datagram sockets really use the sendspace as the maximum datagram size,
 * and don't really want to reserve the sendspace.  Their recvspace should
 * be large enough for at least one max-size datagram plus address.
 */
#define	PIPSIZ	8192
u_int	unpst_sendspace = PIPSIZ;
u_int	unpst_recvspace = PIPSIZ;
u_int	unpsq_sendspace = PIPSIZ;
u_int	unpsq_recvspace = PIPSIZ;
u_int	unpdg_sendspace = 2*1024;	/* really max datagram size */
u_int	unpdg_recvspace = 16*1024;

const struct sysctl_bounded_args unpstctl_vars[] = {
	{ UNPCTL_RECVSPACE, &unpst_recvspace, 0, SB_MAX },
	{ UNPCTL_SENDSPACE, &unpst_sendspace, 0, SB_MAX },
};
const struct sysctl_bounded_args unpsqctl_vars[] = {
	{ UNPCTL_RECVSPACE, &unpsq_recvspace, 0, SB_MAX },
	{ UNPCTL_SENDSPACE, &unpsq_sendspace, 0, SB_MAX },
};
const struct sysctl_bounded_args unpdgctl_vars[] = {
	{ UNPCTL_RECVSPACE, &unpdg_recvspace, 0, SB_MAX },
	{ UNPCTL_SENDSPACE, &unpdg_sendspace, 0, SB_MAX },
};

int
uipc_attach(struct socket *so, int proto)
{
	struct unpcb *unp;
	int error;

	rw_assert_wrlock(&unp_lock);

	if (so->so_pcb)
		return EISCONN;
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		switch (so->so_type) {

		case SOCK_STREAM:
			error = soreserve(so, unpst_sendspace, unpst_recvspace);
			break;

		case SOCK_SEQPACKET:
			error = soreserve(so, unpsq_sendspace, unpsq_recvspace);
			break;

		case SOCK_DGRAM:
			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
			break;

		default:
			panic("unp_attach");
		}
		if (error)
			return (error);
	}
	unp = pool_get(&unpcb_pool, PR_NOWAIT|PR_ZERO);
	if (unp == NULL)
		return (ENOBUFS);
	unp->unp_socket = so;
	so->so_pcb = unp;
	getnanotime(&unp->unp_ctime);
	LIST_INSERT_HEAD(&unp_head, unp, unp_link);
	return (0);
}

int
uipc_detach(struct socket *so)
{
	struct unpcb *unp = sotounpcb(so);

	if (unp == NULL)
		return (EINVAL);

	unp_detach(unp);

	return (0);
}

int
uipc_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int *valp = &unp_defer;

	/* All sysctl names at this level are terminal. */
	switch (name[0]) {
	case SOCK_STREAM:
		if (namelen != 2)
			return (ENOTDIR);
		return sysctl_bounded_arr(unpstctl_vars, nitems(unpstctl_vars),
		    name + 1, namelen - 1, oldp, oldlenp, newp, newlen);
	case SOCK_SEQPACKET:
		if (namelen != 2)
			return (ENOTDIR);
		return sysctl_bounded_arr(unpsqctl_vars, nitems(unpsqctl_vars),
		    name + 1, namelen - 1, oldp, oldlenp, newp, newlen);
	case SOCK_DGRAM:
		if (namelen != 2)
			return (ENOTDIR);
		return sysctl_bounded_arr(unpdgctl_vars, nitems(unpdgctl_vars),
		    name + 1, namelen - 1, oldp, oldlenp, newp, newlen);
	case NET_UNIX_INFLIGHT:
		valp = &unp_rights;
		/* FALLTHROUGH */
	case NET_UNIX_DEFERRED:
		if (namelen != 1)
			return (ENOTDIR);
		return sysctl_rdint(oldp, oldlenp, newp, *valp);
	default:
		return (ENOPROTOOPT);
	}
}
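
/*
 * Illustrative userland sketch, never compiled here (wrapped in
 * "notdef" like unp_drain() below): querying the stream send buffer
 * default that uipc_sysctl() exports.  That the caller reaches this
 * handler through { CTL_NET, PF_UNIX, ... } is an assumption about
 * the surrounding sysctl plumbing, not something this file defines.
 */
#ifdef notdef
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/unpcb.h>

int
get_unpst_sendspace(u_int *space)
{
	/* net.unix.stream.sendspace, handled by the switch above */
	int mib[4] = { CTL_NET, PF_UNIX, SOCK_STREAM, UNPCTL_SENDSPACE };
	size_t len = sizeof(*space);

	return (sysctl(mib, 4, space, &len, NULL, 0));
}
#endif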

void
unp_detach(struct unpcb *unp)
{
	struct socket *so = unp->unp_socket;
	struct vnode *vp = NULL;

	rw_assert_wrlock(&unp_lock);

	LIST_REMOVE(unp, unp_link);
	if (unp->unp_vnode) {
		/*
		 * `v_socket' is only read in unp_connect and
		 * unplock prevents concurrent access.
		 */

		unp->unp_vnode->v_socket = NULL;
		vp = unp->unp_vnode;
		unp->unp_vnode = NULL;
	}

	if (unp->unp_conn)
		unp_disconnect(unp);
	while (!SLIST_EMPTY(&unp->unp_refs))
		unp_drop(SLIST_FIRST(&unp->unp_refs), ECONNRESET);
	soisdisconnected(so);
	so->so_pcb = NULL;
	m_freem(unp->unp_addr);
	pool_put(&unpcb_pool, unp);
	if (unp_rights)
		task_add(systqmp, &unp_gc_task);

	if (vp != NULL) {
		/*
		 * Enforce `i_lock' -> `unplock' because fifo subsystem
		 * requires it. The socket can't be closed concurrently
		 * because the file descriptor reference is still held.
		 */

		sounlock(so, SL_LOCKED);
		KERNEL_LOCK();
		vrele(vp);
		KERNEL_UNLOCK();
		solock(so);
	}
}

int
unp_bind(struct unpcb *unp, struct mbuf *nam, struct proc *p)
{
	struct sockaddr_un *soun;
	struct mbuf *nam2;
	struct vnode *vp;
	struct vattr vattr;
	int error;
	struct nameidata nd;
	size_t pathlen;

	if (unp->unp_flags & (UNP_BINDING | UNP_CONNECTING))
		return (EINVAL);
	if (unp->unp_vnode != NULL)
		return (EINVAL);
	if ((error = unp_nam2sun(nam, &soun, &pathlen)))
		return (error);

	unp->unp_flags |= UNP_BINDING;

	/*
	 * Enforce `i_lock' -> `unplock' because fifo subsystem
	 * requires it. The socket can't be closed concurrently
	 * because the file descriptor reference is still held.
	 */

	sounlock(unp->unp_socket, SL_LOCKED);

	nam2 = m_getclr(M_WAITOK, MT_SONAME);
	nam2->m_len = sizeof(struct sockaddr_un);
	memcpy(mtod(nam2, struct sockaddr_un *), soun,
	    offsetof(struct sockaddr_un, sun_path) + pathlen);
	/* No need to NUL terminate: m_getclr() returns zero'd mbufs. */

	soun = mtod(nam2, struct sockaddr_un *);

	/* Fixup sun_len to keep it in sync with m_len. */
	soun->sun_len = nam2->m_len;

	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
	    soun->sun_path, p);
	nd.ni_pledge = PLEDGE_UNIX;

	KERNEL_LOCK();
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	error = namei(&nd);
	if (error != 0) {
		m_freem(nam2);
		solock(unp->unp_socket);
		goto out;
	}
	vp = nd.ni_vp;
	if (vp != NULL) {
		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		m_freem(nam2);
		error = EADDRINUSE;
		solock(unp->unp_socket);
		goto out;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask;
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	vput(nd.ni_dvp);
	if (error) {
		m_freem(nam2);
		solock(unp->unp_socket);
		goto out;
	}
	solock(unp->unp_socket);
	unp->unp_addr = nam2;
	vp = nd.ni_vp;
	vp->v_socket = unp->unp_socket;
	unp->unp_vnode = vp;
	unp->unp_connid.uid = p->p_ucred->cr_uid;
	unp->unp_connid.gid = p->p_ucred->cr_gid;
	unp->unp_connid.pid = p->p_p->ps_pid;
	unp->unp_flags |= UNP_FEIDSBIND;
	VOP_UNLOCK(vp);
out:
	KERNEL_UNLOCK();
	unp->unp_flags &= ~UNP_BINDING;

	return (error);
}

int
unp_connect(struct socket *so, struct mbuf *nam, struct proc *p)
{
	struct sockaddr_un *soun;
	struct vnode *vp;
	struct socket *so2, *so3;
	struct unpcb *unp, *unp2, *unp3;
	struct nameidata nd;
	int error;

	unp = sotounpcb(so);
	if (unp->unp_flags & (UNP_BINDING | UNP_CONNECTING))
		return (EISCONN);
	if ((error = unp_nam2sun(nam, &soun, NULL)))
		return (error);

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p);
	nd.ni_pledge = PLEDGE_UNIX;

	unp->unp_flags |= UNP_CONNECTING;

	/*
	 * Enforce `i_lock' -> `unplock' because fifo subsystem
	 * requires it. The socket can't be closed concurrently
	 * because the file descriptor reference is still held.
	 */

	sounlock(so, SL_LOCKED);

	KERNEL_LOCK();
	error = namei(&nd);
	if (error != 0)
		goto unlock;
	vp = nd.ni_vp;
	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto put;
	}
	if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0)
		goto put;
	solock(so);
	so2 = vp->v_socket;
	if (so2 == NULL) {
		error = ECONNREFUSED;
		goto put_locked;
	}
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto put_locked;
	}
	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
		    (so3 = sonewconn(so2, 0)) == 0) {
			error = ECONNREFUSED;
			goto put_locked;
		}
		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);
		if (unp2->unp_addr)
			unp3->unp_addr =
			    m_copym(unp2->unp_addr, 0, M_COPYALL, M_NOWAIT);
		unp3->unp_connid.uid = p->p_ucred->cr_uid;
		unp3->unp_connid.gid = p->p_ucred->cr_gid;
		unp3->unp_connid.pid = p->p_p->ps_pid;
		unp3->unp_flags |= UNP_FEIDS;
		so2 = so3;
		if (unp2->unp_flags & UNP_FEIDSBIND) {
			unp->unp_connid = unp2->unp_connid;
			unp->unp_flags |= UNP_FEIDS;
		}
	}
	error = unp_connect2(so, so2);
put_locked:
	sounlock(so, SL_LOCKED);
put:
	vput(vp);
unlock:
	KERNEL_UNLOCK();
	solock(so);
	unp->unp_flags &= ~UNP_CONNECTING;

	return (error);
}

int
unp_connect2(struct socket *so, struct socket *so2)
{
	struct unpcb *unp = sotounpcb(so);
	struct unpcb *unp2;

	rw_assert_wrlock(&unp_lock);

	if (so2->so_type != so->so_type)
		return (EPROTOTYPE);
	unp2 = sotounpcb(so2);
	unp->unp_conn = unp2;
	switch (so->so_type) {

	case SOCK_DGRAM:
		SLIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_nextref);
		soisconnected(so);
		break;

	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		unp2->unp_conn = unp;
		soisconnected(so);
		soisconnected(so2);
		break;

	default:
		panic("unp_connect2");
	}
	return (0);
}
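
/*
 * Hedged userland sketch, never compiled here: socketpair(2) ends up
 * in unp_connect2() via the PRU_CONNECT2 request handled above, with
 * both sockets already attached, so the pair comes back connected
 * without any bind(2) or connect(2) step.
 */
#ifdef notdef
#include <sys/socket.h>
#include <unistd.h>

int
demo_socketpair(void)
{
	int sv[2];
	char buf[4];

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1)
		return (-1);
	/* Each end is the other's unp_conn; data flows both ways. */
	(void)write(sv[0], "ping", 4);
	(void)read(sv[1], buf, sizeof(buf));
	close(sv[0]);
	close(sv[1]);
	return (0);
}
#endif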

void
unp_disconnect(struct unpcb *unp)
{
	struct unpcb *unp2 = unp->unp_conn;

	if (unp2 == NULL)
		return;
	unp->unp_conn = NULL;
	switch (unp->unp_socket->so_type) {

	case SOCK_DGRAM:
		SLIST_REMOVE(&unp2->unp_refs, unp, unpcb, unp_nextref);
		unp->unp_socket->so_state &= ~SS_ISCONNECTED;
		break;

	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		unp->unp_socket->so_snd.sb_mbcnt = 0;
		unp->unp_socket->so_snd.sb_cc = 0;
		soisdisconnected(unp->unp_socket);
		unp2->unp_conn = NULL;
		unp2->unp_socket->so_snd.sb_mbcnt = 0;
		unp2->unp_socket->so_snd.sb_cc = 0;
		soisdisconnected(unp2->unp_socket);
		break;
	}
}

void
unp_shutdown(struct unpcb *unp)
{
	struct socket *so;

	switch (unp->unp_socket->so_type) {
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		if (unp->unp_conn && (so = unp->unp_conn->unp_socket))
			socantrcvmore(so);
		break;
	default:
		break;
	}
}

void
unp_drop(struct unpcb *unp, int errno)
{
	struct socket *so = unp->unp_socket;

	rw_assert_wrlock(&unp_lock);

	so->so_error = errno;
	unp_disconnect(unp);
	if (so->so_head) {
		so->so_pcb = NULL;
		/*
		 * As long as `unp_lock' is taken before entering
		 * uipc_usrreq() releasing it here would lead to a
		 * double unlock.
		 */
		sofree(so, SL_NOUNLOCK);
		m_freem(unp->unp_addr);
		pool_put(&unpcb_pool, unp);
	}
}

#ifdef notdef
unp_drain(void)
{

}
#endif

static struct unpcb *
fptounp(struct file *fp)
{
	struct socket *so;

	if (fp->f_type != DTYPE_SOCKET)
		return (NULL);
	if ((so = fp->f_data) == NULL)
		return (NULL);
	if (so->so_proto->pr_domain != &unixdomain)
		return (NULL);
	return (sotounpcb(so));
}

int
unp_externalize(struct mbuf *rights, socklen_t controllen, int flags)
{
	struct proc *p = curproc;		/* XXX */
	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
	struct filedesc *fdp = p->p_fd;
	int i, *fds = NULL;
	struct fdpass *rp;
	struct file *fp;
	int nfds, error = 0;

	rw_assert_wrlock(&unp_lock);

	/*
	 * This code only works because SCM_RIGHTS is the only supported
	 * control message type on unix sockets. Enforce this here.
	 */
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET)
		return EINVAL;

	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
	    sizeof(struct fdpass);
	if (controllen < CMSG_ALIGN(sizeof(struct cmsghdr)))
		controllen = 0;
	else
		controllen -= CMSG_ALIGN(sizeof(struct cmsghdr));
	if (nfds > controllen / sizeof(int)) {
		error = EMSGSIZE;
		goto restart;
	}
	/* Make sure the recipient is allowed to see the descriptors. */
	rp = (struct fdpass *)CMSG_DATA(cm);

	/* fdp->fd_rdir requires KERNEL_LOCK() */
	KERNEL_LOCK();

	for (i = 0; i < nfds; i++) {
		fp = rp->fp;
		rp++;
		error = pledge_recvfd(p, fp);
		if (error)
			break;

		/*
		 * No to block devices.  If passing a directory,
		 * make sure that it is underneath the root.
		 */
		if (fdp->fd_rdir != NULL && fp->f_type == DTYPE_VNODE) {
			struct vnode *vp = (struct vnode *)fp->f_data;

			if (vp->v_type == VBLK ||
			    (vp->v_type == VDIR &&
			    !vn_isunder(vp, fdp->fd_rdir, p))) {
				error = EPERM;
				break;
			}
		}
	}

	KERNEL_UNLOCK();

	fds = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK);

restart:
	fdplock(fdp);
	if (error != 0) {
		if (nfds > 0) {
			rp = ((struct fdpass *)CMSG_DATA(cm));
			unp_discard(rp, nfds);
		}
		goto out;
	}

	/*
	 * First loop -- allocate file descriptor table slots for the
	 * new descriptors.
	 */
	rp = ((struct fdpass *)CMSG_DATA(cm));
	for (i = 0; i < nfds; i++) {
		if ((error = fdalloc(p, 0, &fds[i])) != 0) {
			/*
			 * Back out what we've done so far.
			 */
			for (--i; i >= 0; i--)
				fdremove(fdp, fds[i]);

			if (error == ENOSPC) {
				fdexpand(p);
				error = 0;
			} else {
				/*
				 * This is the error that has historically
				 * been returned, and some callers may
				 * expect it.
				 */
				error = EMSGSIZE;
			}
			fdpunlock(fdp);
			goto restart;
		}

		/*
		 * Make the slot reference the descriptor so that
		 * fdalloc() works properly.  We finalize it all
		 * in the loop below.
		 */
		mtx_enter(&fdp->fd_fplock);
		KASSERT(fdp->fd_ofiles[fds[i]] == NULL);
		fdp->fd_ofiles[fds[i]] = rp->fp;
		mtx_leave(&fdp->fd_fplock);

		fdp->fd_ofileflags[fds[i]] = (rp->flags & UF_PLEDGED);
		if (flags & MSG_CMSG_CLOEXEC)
			fdp->fd_ofileflags[fds[i]] |= UF_EXCLOSE;

		rp++;
	}

	/*
	 * Now that adding them has succeeded, update all of the
	 * descriptor passing state.
	 */
	rp = (struct fdpass *)CMSG_DATA(cm);
	for (i = 0; i < nfds; i++) {
		struct unpcb *unp;

		fp = rp->fp;
		rp++;
		if ((unp = fptounp(fp)) != NULL)
			unp->unp_msgcount--;
		unp_rights--;
	}

	/*
	 * Copy temporary array to message and adjust length, in case of
	 * transition from large struct file pointers to ints.
	 */
	memcpy(CMSG_DATA(cm), fds, nfds * sizeof(int));
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
	rights->m_len = CMSG_LEN(nfds * sizeof(int));
 out:
	fdpunlock(fdp);
	if (fds != NULL)
		free(fds, M_TEMP, nfds * sizeof(int));
	return (error);
}
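
/*
 * Hedged receive-side sketch (userland, never compiled here): the
 * CMSG_SPACE buffer and SCM_RIGHTS walk below are the counterpart of
 * unp_externalize(), which has just rewritten struct fdpass slots
 * into plain ints.  recv_fd() is an illustrative name, not an API.
 */
#ifdef notdef
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

int
recv_fd(int sock)
{
	struct msghdr msg;
	struct cmsghdr *cmsg;
	union {
		struct cmsghdr hdr;
		unsigned char buf[CMSG_SPACE(sizeof(int))];
	} cmsgbuf;
	struct iovec iov;
	char c;
	int fd = -1;

	memset(&msg, 0, sizeof(msg));
	iov.iov_base = &c;
	iov.iov_len = sizeof(c);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = &cmsgbuf.buf;
	msg.msg_controllen = sizeof(cmsgbuf.buf);

	if (recvmsg(sock, &msg, 0) == -1)
		return (-1);
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS)
			memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
	}
	return (fd);
}
#endif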

int
unp_internalize(struct mbuf *control, struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
	struct fdpass *rp;
	struct file *fp;
	struct unpcb *unp;
	int i, error;
	int nfds, *ip, fd, neededspace;

	rw_assert_wrlock(&unp_lock);

	/*
	 * Check for two potential msg_controllen values because
	 * IETF stuck their nose in a place it does not belong.
	 */
	if (control->m_len < CMSG_LEN(0) || cm->cmsg_len < CMSG_LEN(0))
		return (EINVAL);
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
	    !(cm->cmsg_len == control->m_len ||
	    control->m_len == CMSG_ALIGN(cm->cmsg_len)))
		return (EINVAL);
	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);

	if (unp_rights + nfds > maxfiles / 10)
		return (EMFILE);

	/* Make sure we have room for the struct file pointers */
morespace:
	neededspace = CMSG_SPACE(nfds * sizeof(struct fdpass)) -
	    control->m_len;
	if (neededspace > m_trailingspace(control)) {
		char *tmp;
		/* if we already have a cluster, the message is just too big */
		if (control->m_flags & M_EXT)
			return (E2BIG);

		/* copy cmsg data temporarily out of the mbuf */
		tmp = malloc(control->m_len, M_TEMP, M_WAITOK);
		memcpy(tmp, mtod(control, caddr_t), control->m_len);

		/* allocate a cluster and try again */
		MCLGET(control, M_WAIT);
		if ((control->m_flags & M_EXT) == 0) {
			free(tmp, M_TEMP, control->m_len);
			return (ENOBUFS);       /* allocation failed */
		}

		/* copy the data back into the cluster */
		cm = mtod(control, struct cmsghdr *);
		memcpy(cm, tmp, control->m_len);
		free(tmp, M_TEMP, control->m_len);
		goto morespace;
	}

	/* adjust message & mbuf to note amount of space actually used. */
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(struct fdpass));
	control->m_len = CMSG_SPACE(nfds * sizeof(struct fdpass));

	ip = ((int *)CMSG_DATA(cm)) + nfds - 1;
	rp = ((struct fdpass *)CMSG_DATA(cm)) + nfds - 1;
	fdplock(fdp);
	for (i = 0; i < nfds; i++) {
		memcpy(&fd, ip, sizeof fd);
		ip--;
		if ((fp = fd_getfile(fdp, fd)) == NULL) {
			error = EBADF;
			goto fail;
		}
		if (fp->f_count >= FDUP_MAX_COUNT) {
			error = EDEADLK;
			goto fail;
		}
		error = pledge_sendfd(p, fp);
		if (error)
			goto fail;

		/* kqueue descriptors cannot be copied */
		if (fp->f_type == DTYPE_KQUEUE) {
			error = EINVAL;
			goto fail;
		}
		rp->fp = fp;
		rp->flags = fdp->fd_ofileflags[fd] & UF_PLEDGED;
		rp--;
		if ((unp = fptounp(fp)) != NULL) {
			unp->unp_file = fp;
			unp->unp_msgcount++;
		}
		unp_rights++;
	}
	fdpunlock(fdp);
	return (0);
fail:
	fdpunlock(fdp);
	if (fp != NULL)
		FRELE(fp, p);
	/* Back out what we just did. */
	for ( ; i > 0; i--) {
		rp++;
		fp = rp->fp;
		if ((unp = fptounp(fp)) != NULL)
			unp->unp_msgcount--;
		FRELE(fp, p);
		unp_rights--;
	}

	return (error);
}
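
/*
 * Hedged send-side sketch (userland, never compiled here): building
 * the SCM_RIGHTS control message that unp_internalize() expands into
 * struct fdpass records.  send_fd() is an illustrative name, not an
 * API.
 */
#ifdef notdef
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

int
send_fd(int sock, int fd)
{
	struct msghdr msg;
	struct cmsghdr *cmsg;
	union {
		struct cmsghdr hdr;
		unsigned char buf[CMSG_SPACE(sizeof(int))];
	} cmsgbuf;
	struct iovec iov;
	char c = '\0';

	memset(&msg, 0, sizeof(msg));
	iov.iov_base = &c;
	iov.iov_len = sizeof(c);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = &cmsgbuf.buf;
	msg.msg_controllen = sizeof(cmsgbuf.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));

	return (sendmsg(sock, &msg, 0) == -1 ? -1 : 0);
}
#endif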

void
unp_gc(void *arg __unused)
{
	struct unp_deferral *defer;
	struct file *fp;
	struct socket *so;
	struct unpcb *unp;
	int nunref, i;

	rw_enter_write(&unp_lock);

	if (unp_gcing)
		goto unlock;
	unp_gcing = 1;

	/* close any fds on the deferred list */
	while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) {
		SLIST_REMOVE_HEAD(&unp_deferred, ud_link);
		for (i = 0; i < defer->ud_n; i++) {
			fp = defer->ud_fp[i].fp;
			if (fp == NULL)
				continue;
			/* closef() expects a refcount of 2 */
			FREF(fp);
			if ((unp = fptounp(fp)) != NULL)
				unp->unp_msgcount--;
			unp_rights--;
			rw_exit_write(&unp_lock);
			(void) closef(fp, NULL);
			rw_enter_write(&unp_lock);
		}
		free(defer, M_TEMP, sizeof(*defer) +
		    sizeof(struct fdpass) * defer->ud_n);
	}

	unp_defer = 0;
	LIST_FOREACH(unp, &unp_head, unp_link)
		unp->unp_flags &= ~(UNP_GCMARK | UNP_GCDEFER | UNP_GCDEAD);
	do {
		nunref = 0;
		LIST_FOREACH(unp, &unp_head, unp_link) {
			fp = unp->unp_file;
			if (unp->unp_flags & UNP_GCDEFER) {
				/*
				 * This socket is referenced by another
				 * socket which is known to be live,
				 * so it's certainly live.
				 */
				unp->unp_flags &= ~UNP_GCDEFER;
				unp_defer--;
			} else if (unp->unp_flags & UNP_GCMARK) {
				/* marked as live in previous pass */
				continue;
			} else if (fp == NULL) {
				/* not being passed, so can't be in loop */
			} else if (fp->f_count == 0) {
				/*
				 * Already being closed, let normal close
				 * path take its course
				 */
			} else {
				/*
				 * Unreferenced by other sockets so far,
				 * so if all the references (f_count) are
				 * from passing (unp_msgcount) then this
				 * socket is prospectively dead
				 */
				if (fp->f_count == unp->unp_msgcount) {
					nunref++;
					unp->unp_flags |= UNP_GCDEAD;
					continue;
				}
			}

			/*
			 * This is the first time we've seen this socket on
			 * the mark pass and known it has a live reference,
			 * so mark it, then scan its receive buffer for
			 * sockets and note them as deferred (== referenced,
			 * but not yet marked).
			 */
			unp->unp_flags |= UNP_GCMARK;

			so = unp->unp_socket;
			unp_scan(so->so_rcv.sb_mb, unp_mark);
		}
	} while (unp_defer);

	/*
	 * If there are any unreferenced sockets, then for each dispose
	 * of files in its receive buffer and then close it.
	 */
	if (nunref) {
		LIST_FOREACH(unp, &unp_head, unp_link) {
			if (unp->unp_flags & UNP_GCDEAD)
				unp_scan(unp->unp_socket->so_rcv.sb_mb,
				    unp_discard);
		}
	}
	unp_gcing = 0;
unlock:
	rw_exit_write(&unp_lock);
}
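
/*
 * Hedged sketch of the garbage unp_gc() exists to collect (userland,
 * never compiled here): each end of a pair is sent over the other
 * and then closed, so the only remaining references are in-flight
 * ones (the f_count == unp_msgcount case above).  Reuses the
 * illustrative send_fd() sketch following unp_internalize().
 */
#ifdef notdef
#include <sys/socket.h>
#include <unistd.h>

void
make_fd_cycle(void)
{
	int sv[2];

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1)
		return;
	(void)send_fd(sv[0], sv[1]);
	(void)send_fd(sv[1], sv[0]);
	/* Neither message is ever received; only the GC can reap it. */
	close(sv[0]);
	close(sv[1]);
}
#endif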

void
unp_dispose(struct mbuf *m)
{

	if (m)
		unp_scan(m, unp_discard);
}

void
unp_scan(struct mbuf *m0, void (*op)(struct fdpass *, int))
{
	struct mbuf *m;
	struct fdpass *rp;
	struct cmsghdr *cm;
	int qfds;

	while (m0) {
		for (m = m0; m; m = m->m_next) {
			if (m->m_type == MT_CONTROL &&
			    m->m_len >= sizeof(*cm)) {
				cm = mtod(m, struct cmsghdr *);
				if (cm->cmsg_level != SOL_SOCKET ||
				    cm->cmsg_type != SCM_RIGHTS)
					continue;
				qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof *cm))
				    / sizeof(struct fdpass);
				if (qfds > 0) {
					rp = (struct fdpass *)CMSG_DATA(cm);
					op(rp, qfds);
				}
				break;		/* XXX, but saves time */
			}
		}
		m0 = m0->m_nextpkt;
	}
}

void
unp_mark(struct fdpass *rp, int nfds)
{
	struct unpcb *unp;
	int i;

	rw_assert_wrlock(&unp_lock);

	for (i = 0; i < nfds; i++) {
		if (rp[i].fp == NULL)
			continue;

		unp = fptounp(rp[i].fp);
		if (unp == NULL)
			continue;

		if (unp->unp_flags & (UNP_GCMARK|UNP_GCDEFER))
			continue;

		unp_defer++;
		unp->unp_flags |= UNP_GCDEFER;
		unp->unp_flags &= ~UNP_GCDEAD;
	}
}

void
unp_discard(struct fdpass *rp, int nfds)
{
	struct unp_deferral *defer;

	rw_assert_wrlock(&unp_lock);

	/* copy the file pointers to a deferral structure */
	defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
	defer->ud_n = nfds;
	memcpy(&defer->ud_fp[0], rp, sizeof(*rp) * nfds);
	memset(rp, 0, sizeof(*rp) * nfds);
	SLIST_INSERT_HEAD(&unp_deferred, defer, ud_link);

	task_add(systqmp, &unp_gc_task);
}

int
unp_nam2sun(struct mbuf *nam, struct sockaddr_un **sun, size_t *pathlen)
{
	struct sockaddr *sa = mtod(nam, struct sockaddr *);
	size_t size, len;

	if (nam->m_len < offsetof(struct sockaddr, sa_data))
		return EINVAL;
	if (sa->sa_family != AF_UNIX)
		return EAFNOSUPPORT;
	if (sa->sa_len != nam->m_len)
		return EINVAL;
	if (sa->sa_len > sizeof(struct sockaddr_un))
		return EINVAL;
	*sun = (struct sockaddr_un *)sa;

	/* ensure that sun_path is NUL terminated and fits */
	size = (*sun)->sun_len - offsetof(struct sockaddr_un, sun_path);
	len = strnlen((*sun)->sun_path, size);
	if (len == sizeof((*sun)->sun_path))
		return EINVAL;
	if (len == size) {
		if (m_trailingspace(nam) == 0)
			return EINVAL;
		nam->m_len++;
		(*sun)->sun_len++;
		(*sun)->sun_path[len] = '\0';
	}
	if (pathlen != NULL)
		*pathlen = len;

	return 0;
}
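
/*
 * Hedged userland sketch (never compiled here): constructing the
 * sockaddr_un that unp_nam2sun() accepts: AF_UNIX family, sun_len in
 * sync with the length handed to the kernel, and a sun_path that
 * fits with room for NUL termination.  bind_unix() is an
 * illustrative name, not an API.
 */
#ifdef notdef
#include <sys/socket.h>
#include <sys/un.h>
#include <string.h>

int
bind_unix(int sock, const char *path)
{
	struct sockaddr_un sun;

	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_UNIX;
	/* Overlong paths would fail unp_nam2sun()'s strnlen() check. */
	if (strlcpy(sun.sun_path, path, sizeof(sun.sun_path)) >=
	    sizeof(sun.sun_path))
		return (-1);
	sun.sun_len = sizeof(sun);

	return (bind(sock, (struct sockaddr *)&sun, sizeof(sun)));
}
#endif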