xref: /openbsd-src/sys/kern/uipc_usrreq.c (revision 4e1ee0786f11cc571bd0be17d38e46f635c719fc)
1 /*	$OpenBSD: uipc_usrreq.c,v 1.150 2021/10/21 22:11:07 mvs Exp $	*/
2 /*	$NetBSD: uipc_usrreq.c,v 1.18 1996/02/09 19:00:50 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1989, 1991, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 #include <sys/filedesc.h>
39 #include <sys/domain.h>
40 #include <sys/protosw.h>
41 #include <sys/queue.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/unpcb.h>
45 #include <sys/un.h>
46 #include <sys/namei.h>
47 #include <sys/vnode.h>
48 #include <sys/file.h>
49 #include <sys/stat.h>
50 #include <sys/mbuf.h>
51 #include <sys/task.h>
52 #include <sys/pledge.h>
53 #include <sys/pool.h>
54 #include <sys/rwlock.h>
55 #include <sys/sysctl.h>
56 
/*
 * Locks used to protect global data and struct members:
 *      I       immutable after creation
 *      U       unp_lock
 */
struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");

/*
 * Stack of sets of files that were passed over a socket but were
 * not received and need to be closed.
 */
struct	unp_deferral {
	SLIST_ENTRY(unp_deferral)	ud_link;	/* [U] entry on unp_deferred */
	int				ud_n;		/* [I] number of ud_fp[] entries */
	/* followed by ud_n struct fdpass */
	struct fdpass			ud_fp[];	/* [I] the deferred file references */
};

void	uipc_setaddr(const struct unpcb *, struct mbuf *);
void	unp_discard(struct fdpass *, int);
void	unp_mark(struct fdpass *, int);
void	unp_scan(struct mbuf *, void (*)(struct fdpass *, int));
int	unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *);

/* Allocator for unpcb structures; initialized in unp_init(). */
struct pool unpcb_pool;
/* GC task, scheduled when a socket with in-flight descriptors is detached. */
struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL);

/*
 * Unix communications domain.
 *
 * TODO:
 *	RDM
 *	rethink name space problems
 *	need a proper out-of-band
 */
const struct	sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };

/* [U] list of all UNIX domain sockets, for unp_gc() */
LIST_HEAD(unp_head, unpcb)	unp_head =
	LIST_HEAD_INITIALIZER(unp_head);
/* [U] list of sets of files that were sent over sockets that are now closed */
SLIST_HEAD(,unp_deferral)	unp_deferred =
	SLIST_HEAD_INITIALIZER(unp_deferred);

ino_t	unp_ino;	/* [U] prototype for fake inode numbers */
int	unp_rights;	/* [U] file descriptors in flight */
int	unp_defer;	/* [U] number of deferred fp to close by the GC task */
int	unp_gcing;	/* [U] GC task currently running */
105 
/*
 * One-time setup for the UNIX domain: initialize the pool that backs
 * unpcb allocations.
 */
void
unp_init(void)
{
	pool_init(&unpcb_pool, sizeof(struct unpcb), 0,
	    IPL_SOFTNET, 0, "unpcb", NULL);
}
112 
113 void
114 uipc_setaddr(const struct unpcb *unp, struct mbuf *nam)
115 {
116 	if (unp != NULL && unp->unp_addr != NULL) {
117 		nam->m_len = unp->unp_addr->m_len;
118 		memcpy(mtod(nam, caddr_t), mtod(unp->unp_addr, caddr_t),
119 		    nam->m_len);
120 	} else {
121 		nam->m_len = sizeof(sun_noname);
122 		memcpy(mtod(nam, struct sockaddr *), &sun_noname,
123 		    nam->m_len);
124 	}
125 }
126 
/*
 * Protocol user-request handler for the UNIX domain.  Dispatches the
 * PRU_* request `req' for socket `so'.  Entered with the socket lock
 * (unp_lock) held.  `nam' carries an address mbuf (or the second
 * socket for PRU_CONNECT2), `m' data and `control' control messages;
 * both `m' and `control' are freed on exit except for the requests
 * that pass non-mbuf pointers (PRU_RCVD, PRU_RCVOOB, PRU_SENSE).
 * Returns 0 or an errno value.
 */
int
uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control, struct proc *p)
{
	struct unpcb *unp = sotounpcb(so);
	struct unpcb *unp2;
	struct socket *so2;
	int error = 0;

	if (req == PRU_CONTROL)
		return (EOPNOTSUPP);
	/* Control data is only meaningful for PRU_SEND (fd passing). */
	if (req != PRU_SEND && control && control->m_len) {
		error = EOPNOTSUPP;
		goto release;
	}
	if (unp == NULL) {
		error = EINVAL;
		goto release;
	}

	switch (req) {

	case PRU_BIND:
		error = unp_bind(unp, nam, p);
		break;

	case PRU_LISTEN:
		/* Only a socket bound to a filesystem node may listen. */
		if (unp->unp_vnode == NULL)
			error = EINVAL;
		break;

	case PRU_CONNECT:
		error = unp_connect(so, nam, p);
		break;

	case PRU_CONNECT2:
		/* `nam' is really the second socket (socketpair(2)). */
		error = unp_connect2(so, (struct socket *)nam);
		if (!error) {
			/* Record the caller's credentials on both ends. */
			unp->unp_connid.uid = p->p_ucred->cr_uid;
			unp->unp_connid.gid = p->p_ucred->cr_gid;
			unp->unp_connid.pid = p->p_p->ps_pid;
			unp->unp_flags |= UNP_FEIDS;
			unp2 = sotounpcb((struct socket *)nam);
			unp2->unp_connid.uid = p->p_ucred->cr_uid;
			unp2->unp_connid.gid = p->p_ucred->cr_gid;
			unp2->unp_connid.pid = p->p_p->ps_pid;
			unp2->unp_flags |= UNP_FEIDS;
		}
		break;

	case PRU_DISCONNECT:
		unp_disconnect(unp);
		break;

	case PRU_ACCEPT:
		/*
		 * Pass back name of connected socket,
		 * if it was bound and we are still connected
		 * (our peer may have closed already!).
		 */
		uipc_setaddr(unp->unp_conn, nam);
		break;

	case PRU_SHUTDOWN:
		socantsendmore(so);
		unp_shutdown(unp);
		break;

	case PRU_RCVD:
		switch (so->so_type) {

		case SOCK_DGRAM:
			/* Datagram sockets never generate PRU_RCVD. */
			panic("uipc 1");
			/*NOTREACHED*/

		case SOCK_STREAM:
		case SOCK_SEQPACKET:
			if (unp->unp_conn == NULL)
				break;
			so2 = unp->unp_conn->unp_socket;
			/*
			 * Adjust backpressure on sender
			 * and wakeup any waiting to write.
			 */
			so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
			so2->so_snd.sb_cc = so->so_rcv.sb_cc;
			sowwakeup(so2);
			break;

		default:
			panic("uipc 2");
		}
		break;

	case PRU_SEND:
		if (control) {
			/*
			 * unp_internalize() takes fd-table locks and may
			 * sleep, so drop the socket lock around it.
			 */
			sounlock(so, SL_LOCKED);
			error = unp_internalize(control, p);
			solock(so);
			if (error)
				break;
		}
		switch (so->so_type) {

		case SOCK_DGRAM: {
			const struct sockaddr *from;

			if (nam) {
				/* sendto(2) with an address: transient connect. */
				if (unp->unp_conn) {
					error = EISCONN;
					break;
				}
				error = unp_connect(so, nam, p);
				if (error)
					break;
			} else {
				if (unp->unp_conn == NULL) {
					error = ENOTCONN;
					break;
				}
			}
			so2 = unp->unp_conn->unp_socket;
			if (unp->unp_addr)
				from = mtod(unp->unp_addr, struct sockaddr *);
			else
				from = &sun_noname;
			if (sbappendaddr(so2, &so2->so_rcv, from, m, control)) {
				sorwakeup(so2);
				/* Consumed by the receive buffer; don't free. */
				m = NULL;
				control = NULL;
			} else
				error = ENOBUFS;
			/* Undo the transient connect done above. */
			if (nam)
				unp_disconnect(unp);
			break;
		}

		case SOCK_STREAM:
		case SOCK_SEQPACKET:
			if (so->so_state & SS_CANTSENDMORE) {
				error = EPIPE;
				break;
			}
			if (unp->unp_conn == NULL) {
				error = ENOTCONN;
				break;
			}
			so2 = unp->unp_conn->unp_socket;
			/*
			 * Send to paired receive port, and then raise
			 * send buffer counts to maintain backpressure.
			 * Wake up readers.
			 */
			if (control) {
				if (sbappendcontrol(so2, &so2->so_rcv, m,
				    control)) {
					control = NULL;
				} else {
					error = ENOBUFS;
					break;
				}
			} else if (so->so_type == SOCK_SEQPACKET)
				sbappendrecord(so2, &so2->so_rcv, m);
			else
				sbappend(so2, &so2->so_rcv, m);
			/* Mirror the peer's receive counts for backpressure. */
			so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
			so->so_snd.sb_cc = so2->so_rcv.sb_cc;
			if (so2->so_rcv.sb_cc > 0)
				sorwakeup(so2);
			m = NULL;
			break;

		default:
			panic("uipc 4");
		}
		/* we need to undo unp_internalize in case of errors */
		if (control && error)
			unp_dispose(control);
		break;

	case PRU_ABORT:
		unp_drop(unp, ECONNABORTED);
		break;

	case PRU_SENSE: {
		/* fstat(2): `m' is actually a struct stat to fill in. */
		struct stat *sb = (struct stat *)m;

		sb->st_blksize = so->so_snd.sb_hiwat;
		sb->st_dev = NODEV;
		/* Assign a fake inode number on first stat. */
		if (unp->unp_ino == 0)
			unp->unp_ino = unp_ino++;
		sb->st_atim.tv_sec =
		    sb->st_mtim.tv_sec =
		    sb->st_ctim.tv_sec = unp->unp_ctime.tv_sec;
		sb->st_atim.tv_nsec =
		    sb->st_mtim.tv_nsec =
		    sb->st_ctim.tv_nsec = unp->unp_ctime.tv_nsec;
		sb->st_ino = unp->unp_ino;
		break;
	}

	case PRU_RCVOOB:
	case PRU_SENDOOB:
		/* No out-of-band data on UNIX domain sockets. */
		error = EOPNOTSUPP;
		break;

	case PRU_SOCKADDR:
		uipc_setaddr(unp, nam);
		break;

	case PRU_PEERADDR:
		uipc_setaddr(unp->unp_conn, nam);
		break;

	case PRU_SLOWTIMO:
		break;

	default:
		panic("uipc_usrreq");
	}
release:
	/* These three requests pass non-mbuf pointers in `m'/`control'. */
	if (req != PRU_RCVD && req != PRU_RCVOOB && req != PRU_SENSE) {
		m_freem(control);
		m_freem(m);
	}
	return (error);
}
354 
/*
 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
 * for stream sockets, although the total for sender and receiver is
 * actually only PIPSIZ.
 * Datagram sockets really use the sendspace as the maximum datagram size,
 * and don't really want to reserve the sendspace.  Their recvspace should
 * be large enough for at least one max-size datagram plus address.
 */
#define	PIPSIZ	8192
u_int	unpst_sendspace = PIPSIZ;
u_int	unpst_recvspace = PIPSIZ;
u_int	unpsq_sendspace = PIPSIZ;
u_int	unpsq_recvspace = PIPSIZ;
u_int	unpdg_sendspace = 2*1024;	/* really max datagram size */
u_int	unpdg_recvspace = 16*1024;

/* Bounds tables for the per-type net.unix.* sysctl knobs. */
const struct sysctl_bounded_args unpstctl_vars[] = {
	{ UNPCTL_RECVSPACE, &unpst_recvspace, 0, SB_MAX },
	{ UNPCTL_SENDSPACE, &unpst_sendspace, 0, SB_MAX },
};
const struct sysctl_bounded_args unpsqctl_vars[] = {
	{ UNPCTL_RECVSPACE, &unpsq_recvspace, 0, SB_MAX },
	{ UNPCTL_SENDSPACE, &unpsq_sendspace, 0, SB_MAX },
};
const struct sysctl_bounded_args unpdgctl_vars[] = {
	{ UNPCTL_RECVSPACE, &unpdg_recvspace, 0, SB_MAX },
	{ UNPCTL_SENDSPACE, &unpdg_sendspace, 0, SB_MAX },
};
383 
384 int
385 uipc_attach(struct socket *so, int proto)
386 {
387 	struct unpcb *unp;
388 	int error;
389 
390 	rw_assert_wrlock(&unp_lock);
391 
392 	if (so->so_pcb)
393 		return EISCONN;
394 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
395 		switch (so->so_type) {
396 
397 		case SOCK_STREAM:
398 			error = soreserve(so, unpst_sendspace, unpst_recvspace);
399 			break;
400 
401 		case SOCK_SEQPACKET:
402 			error = soreserve(so, unpsq_sendspace, unpsq_recvspace);
403 			break;
404 
405 		case SOCK_DGRAM:
406 			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
407 			break;
408 
409 		default:
410 			panic("unp_attach");
411 		}
412 		if (error)
413 			return (error);
414 	}
415 	unp = pool_get(&unpcb_pool, PR_NOWAIT|PR_ZERO);
416 	if (unp == NULL)
417 		return (ENOBUFS);
418 	unp->unp_socket = so;
419 	so->so_pcb = unp;
420 	getnanotime(&unp->unp_ctime);
421 	LIST_INSERT_HEAD(&unp_head, unp, unp_link);
422 	return (0);
423 }
424 
425 int
426 uipc_detach(struct socket *so)
427 {
428 	struct unpcb *unp = sotounpcb(so);
429 
430 	if (unp == NULL)
431 		return (EINVAL);
432 
433 	unp_detach(unp);
434 
435 	return (0);
436 }
437 
438 int
439 uipc_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
440     size_t newlen)
441 {
442 	int *valp = &unp_defer;
443 
444 	/* All sysctl names at this level are terminal. */
445 	switch (name[0]) {
446 	case SOCK_STREAM:
447 		if (namelen != 2)
448 			return (ENOTDIR);
449 		return sysctl_bounded_arr(unpstctl_vars, nitems(unpstctl_vars),
450 		    name + 1, namelen - 1, oldp, oldlenp, newp, newlen);
451 	case SOCK_SEQPACKET:
452 		if (namelen != 2)
453 			return (ENOTDIR);
454 		return sysctl_bounded_arr(unpsqctl_vars, nitems(unpsqctl_vars),
455 		    name + 1, namelen - 1, oldp, oldlenp, newp, newlen);
456 	case SOCK_DGRAM:
457 		if (namelen != 2)
458 			return (ENOTDIR);
459 		return sysctl_bounded_arr(unpdgctl_vars, nitems(unpdgctl_vars),
460 		    name + 1, namelen - 1, oldp, oldlenp, newp, newlen);
461 	case NET_UNIX_INFLIGHT:
462 		valp = &unp_rights;
463 		/* FALLTHOUGH */
464 	case NET_UNIX_DEFERRED:
465 		if (namelen != 1)
466 			return (ENOTDIR);
467 		return sysctl_rdint(oldp, oldlenp, newp, *valp);
468 	default:
469 		return (ENOPROTOOPT);
470 	}
471 }
472 
/*
 * Tear down pcb `unp': unlink it from the global list, detach any
 * bound vnode, disconnect from peers and free the pcb.  Called with
 * unp_lock held; the socket lock is dropped temporarily to vrele()
 * the vnode under the kernel lock.
 */
void
unp_detach(struct unpcb *unp)
{
	struct socket *so = unp->unp_socket;
	struct vnode *vp = NULL;

	rw_assert_wrlock(&unp_lock);

	LIST_REMOVE(unp, unp_link);
	if (unp->unp_vnode) {
		/*
		 * `v_socket' is only read in unp_connect and
		 * unplock prevents concurrent access.
		 */

		unp->unp_vnode->v_socket = NULL;
		vp = unp->unp_vnode;
		unp->unp_vnode = NULL;
	}

	if (unp->unp_conn)
		unp_disconnect(unp);
	/* Reset every datagram socket still connected to us. */
	while (!SLIST_EMPTY(&unp->unp_refs))
		unp_drop(SLIST_FIRST(&unp->unp_refs), ECONNRESET);
	soisdisconnected(so);
	so->so_pcb = NULL;
	m_freem(unp->unp_addr);
	pool_put(&unpcb_pool, unp);
	/* Descriptors may still be in flight; let the GC close orphans. */
	if (unp_rights)
		task_add(systqmp, &unp_gc_task);

	if (vp != NULL) {
		/*
		 * Enforce `i_lock' -> `unplock' because fifo subsystem
		 * requires it. The socket can't be closed concurrently
		 * because the file descriptor reference is
		 * still held.
		 */

		sounlock(so, SL_LOCKED);
		KERNEL_LOCK();
		vrele(vp);
		KERNEL_UNLOCK();
		solock(so);
	}
}
519 
/*
 * Bind the socket of `unp' to the filesystem path carried in `nam' by
 * creating a VSOCK vnode.  Called with the socket locked; the lock is
 * dropped while sleeping in namei()/VOP_CREATE(), with UNP_BINDING
 * excluding concurrent bind/connect attempts in the meantime.
 */
int
unp_bind(struct unpcb *unp, struct mbuf *nam, struct proc *p)
{
	struct sockaddr_un *soun;
	struct mbuf *nam2;
	struct vnode *vp;
	struct vattr vattr;
	int error;
	struct nameidata nd;
	size_t pathlen;

	if (unp->unp_flags & (UNP_BINDING | UNP_CONNECTING))
		return (EINVAL);
	if (unp->unp_vnode != NULL)
		return (EINVAL);
	if ((error = unp_nam2sun(nam, &soun, &pathlen)))
		return (error);

	unp->unp_flags |= UNP_BINDING;

	/*
	 * Enforce `i_lock' -> `unplock' because fifo subsystem
	 * requires it. The socket can't be closed concurrently
	 * because the file descriptor reference is still held.
	 */

	sounlock(unp->unp_socket, SL_LOCKED);

	/* Take a private copy of the address for the pcb. */
	nam2 = m_getclr(M_WAITOK, MT_SONAME);
	nam2->m_len = sizeof(struct sockaddr_un);
	memcpy(mtod(nam2, struct sockaddr_un *), soun,
	    offsetof(struct sockaddr_un, sun_path) + pathlen);
	/* No need to NUL terminate: m_getclr() returns zero'd mbufs. */

	soun = mtod(nam2, struct sockaddr_un *);

	/* Fixup sun_len to keep it in sync with m_len. */
	soun->sun_len = nam2->m_len;

	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
	    soun->sun_path, p);
	nd.ni_pledge = PLEDGE_UNIX;

	KERNEL_LOCK();
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	error = namei(&nd);
	if (error != 0) {
		m_freem(nam2);
		solock(unp->unp_socket);
		goto out;
	}
	vp = nd.ni_vp;
	if (vp != NULL) {
		/* The path already exists: refuse to bind over it. */
		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		m_freem(nam2);
		error = EADDRINUSE;
		solock(unp->unp_socket);
		goto out;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask;
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	vput(nd.ni_dvp);
	if (error) {
		m_freem(nam2);
		solock(unp->unp_socket);
		goto out;
	}
	solock(unp->unp_socket);
	unp->unp_addr = nam2;
	vp = nd.ni_vp;
	vp->v_socket = unp->unp_socket;
	unp->unp_vnode = vp;
	/* Remember the binder's credentials for peer-credential queries. */
	unp->unp_connid.uid = p->p_ucred->cr_uid;
	unp->unp_connid.gid = p->p_ucred->cr_gid;
	unp->unp_connid.pid = p->p_p->ps_pid;
	unp->unp_flags |= UNP_FEIDSBIND;
	VOP_UNLOCK(vp);
out:
	KERNEL_UNLOCK();
	unp->unp_flags &= ~UNP_BINDING;

	return (error);
}
610 
/*
 * Connect socket `so' to the UNIX domain socket bound at the path in
 * `nam'.  For connection-oriented types a fresh server-side socket is
 * spawned with sonewconn() and credentials are exchanged.  Called with
 * the socket locked; the lock is dropped across namei(), with
 * UNP_CONNECTING excluding concurrent bind/connect attempts.
 */
int
unp_connect(struct socket *so, struct mbuf *nam, struct proc *p)
{
	struct sockaddr_un *soun;
	struct vnode *vp;
	struct socket *so2, *so3;
	struct unpcb *unp, *unp2, *unp3;
	struct nameidata nd;
	int error;

	unp = sotounpcb(so);
	if (unp->unp_flags & (UNP_BINDING | UNP_CONNECTING))
		return (EISCONN);
	if ((error = unp_nam2sun(nam, &soun, NULL)))
		return (error);

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p);
	nd.ni_pledge = PLEDGE_UNIX;

	unp->unp_flags |= UNP_CONNECTING;

	/*
	 * Enforce `i_lock' -> `unplock' because fifo subsystem
	 * requires it. The socket can't be closed concurrently
	 * because the file descriptor reference is still held.
	 */

	sounlock(so, SL_LOCKED);

	KERNEL_LOCK();
	error = namei(&nd);
	if (error != 0)
		goto unlock;
	vp = nd.ni_vp;
	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto put;
	}
	if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0)
		goto put;
	solock(so);
	so2 = vp->v_socket;
	if (so2 == NULL) {
		/* The bound socket was closed; nobody listens anymore. */
		error = ECONNREFUSED;
		goto put_locked;
	}
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto put_locked;
	}
	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		/* Spawn the server-side socket off the listen queue. */
		if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
		    (so3 = sonewconn(so2, 0)) == 0) {
			error = ECONNREFUSED;
			goto put_locked;
		}
		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);
		if (unp2->unp_addr)
			unp3->unp_addr =
			    m_copym(unp2->unp_addr, 0, M_COPYALL, M_NOWAIT);
		/* Give the server our credentials ... */
		unp3->unp_connid.uid = p->p_ucred->cr_uid;
		unp3->unp_connid.gid = p->p_ucred->cr_gid;
		unp3->unp_connid.pid = p->p_p->ps_pid;
		unp3->unp_flags |= UNP_FEIDS;
		so2 = so3;
		/* ... and take the listener's, recorded at bind time. */
		if (unp2->unp_flags & UNP_FEIDSBIND) {
			unp->unp_connid = unp2->unp_connid;
			unp->unp_flags |= UNP_FEIDS;
		}
	}
	error = unp_connect2(so, so2);
put_locked:
	sounlock(so, SL_LOCKED);
put:
	vput(vp);
unlock:
	KERNEL_UNLOCK();
	solock(so);
	unp->unp_flags &= ~UNP_CONNECTING;

	return (error);
}
694 
695 int
696 unp_connect2(struct socket *so, struct socket *so2)
697 {
698 	struct unpcb *unp = sotounpcb(so);
699 	struct unpcb *unp2;
700 
701 	rw_assert_wrlock(&unp_lock);
702 
703 	if (so2->so_type != so->so_type)
704 		return (EPROTOTYPE);
705 	unp2 = sotounpcb(so2);
706 	unp->unp_conn = unp2;
707 	switch (so->so_type) {
708 
709 	case SOCK_DGRAM:
710 		SLIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_nextref);
711 		soisconnected(so);
712 		break;
713 
714 	case SOCK_STREAM:
715 	case SOCK_SEQPACKET:
716 		unp2->unp_conn = unp;
717 		soisconnected(so);
718 		soisconnected(so2);
719 		break;
720 
721 	default:
722 		panic("unp_connect2");
723 	}
724 	return (0);
725 }
726 
/*
 * Break the connection between `unp' and its peer, if any.  For a
 * datagram socket only our one-way reference is removed; for stream
 * and seqpacket sockets both ends are marked disconnected and their
 * mirrored send-buffer backpressure counters are cleared.
 */
void
unp_disconnect(struct unpcb *unp)
{
	struct unpcb *unp2 = unp->unp_conn;

	if (unp2 == NULL)
		return;
	unp->unp_conn = NULL;
	switch (unp->unp_socket->so_type) {

	case SOCK_DGRAM:
		SLIST_REMOVE(&unp2->unp_refs, unp, unpcb, unp_nextref);
		unp->unp_socket->so_state &= ~SS_ISCONNECTED;
		break;

	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		unp->unp_socket->so_snd.sb_mbcnt = 0;
		unp->unp_socket->so_snd.sb_cc = 0;
		soisdisconnected(unp->unp_socket);
		unp2->unp_conn = NULL;
		unp2->unp_socket->so_snd.sb_mbcnt = 0;
		unp2->unp_socket->so_snd.sb_cc = 0;
		soisdisconnected(unp2->unp_socket);
		break;
	}
}
754 
755 void
756 unp_shutdown(struct unpcb *unp)
757 {
758 	struct socket *so;
759 
760 	switch (unp->unp_socket->so_type) {
761 	case SOCK_STREAM:
762 	case SOCK_SEQPACKET:
763 		if (unp->unp_conn && (so = unp->unp_conn->unp_socket))
764 			socantrcvmore(so);
765 		break;
766 	default:
767 		break;
768 	}
769 }
770 
/*
 * Disconnect `unp' and post `errno' on its socket.  If the socket is
 * still sitting on a listen queue (so_head != NULL) userland can never
 * reach it again, so free the socket and pcb right away.  Called with
 * unp_lock held.
 */
void
unp_drop(struct unpcb *unp, int errno)
{
	struct socket *so = unp->unp_socket;

	rw_assert_wrlock(&unp_lock);

	so->so_error = errno;
	unp_disconnect(unp);
	if (so->so_head) {
		so->so_pcb = NULL;
		/*
		 * As long as `unp_lock' is taken before entering
		 * uipc_usrreq() releasing it here would lead to a
		 * double unlock.
		 */
		sofree(so, SL_NOUNLOCK);
		m_freem(unp->unp_addr);
		pool_put(&unpcb_pool, unp);
	}
}
792 
#ifdef notdef
/* Placeholder drain routine; compiled out. */
unp_drain(void)
{

}
#endif
799 
800 static struct unpcb *
801 fptounp(struct file *fp)
802 {
803 	struct socket *so;
804 
805 	if (fp->f_type != DTYPE_SOCKET)
806 		return (NULL);
807 	if ((so = fp->f_data) == NULL)
808 		return (NULL);
809 	if (so->so_proto->pr_domain != &unixdomain)
810 		return (NULL);
811 	return (sotounpcb(so));
812 }
813 
/*
 * Deliver an SCM_RIGHTS message to the receiving process: convert the
 * in-flight struct fdpass records in `rights' into open descriptors
 * in curproc's fd table, rewriting the cmsg in place as an int array.
 * `controllen' bounds how many fds the receiver asked for and `flags'
 * may carry MSG_CMSG_CLOEXEC.  On any failure all passed files are
 * discarded so their references are not leaked.
 */
int
unp_externalize(struct mbuf *rights, socklen_t controllen, int flags)
{
	struct proc *p = curproc;		/* XXX */
	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
	struct filedesc *fdp = p->p_fd;
	int i, *fds = NULL;
	struct fdpass *rp;
	struct file *fp;
	int nfds, error = 0;

	/*
	 * This code only works because SCM_RIGHTS is the only supported
	 * control message type on unix sockets. Enforce this here.
	 */
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET)
		return EINVAL;

	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
	    sizeof(struct fdpass);
	if (controllen < CMSG_ALIGN(sizeof(struct cmsghdr)))
		controllen = 0;
	else
		controllen -= CMSG_ALIGN(sizeof(struct cmsghdr));
	/* The receiver's buffer must have room for all fds as ints. */
	if (nfds > controllen / sizeof(int)) {
		error = EMSGSIZE;
		goto out;
	}

	/* Make sure the recipient should be able to see the descriptors.. */
	rp = (struct fdpass *)CMSG_DATA(cm);

	/* fdp->fd_rdir requires KERNEL_LOCK() */
	KERNEL_LOCK();

	for (i = 0; i < nfds; i++) {
		fp = rp->fp;
		rp++;
		error = pledge_recvfd(p, fp);
		if (error)
			break;

		/*
		 * No to block devices.  If passing a directory,
		 * make sure that it is underneath the root.
		 */
		if (fdp->fd_rdir != NULL && fp->f_type == DTYPE_VNODE) {
			struct vnode *vp = (struct vnode *)fp->f_data;

			if (vp->v_type == VBLK ||
			    (vp->v_type == VDIR &&
			    !vn_isunder(vp, fdp->fd_rdir, p))) {
				error = EPERM;
				break;
			}
		}
	}

	KERNEL_UNLOCK();

	if (error)
		goto out;

	fds = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK);

	fdplock(fdp);
restart:
	/*
	 * First loop -- allocate file descriptor table slots for the
	 * new descriptors.
	 */
	rp = ((struct fdpass *)CMSG_DATA(cm));
	for (i = 0; i < nfds; i++) {
		if ((error = fdalloc(p, 0, &fds[i])) != 0) {
			/*
			 * Back out what we've done so far.
			 */
			for (--i; i >= 0; i--)
				fdremove(fdp, fds[i]);

			if (error == ENOSPC) {
				/* Grow the table and retry from scratch. */
				fdexpand(p);
				goto restart;
			}

			fdpunlock(fdp);

			/*
			 * This is the error that has historically
			 * been returned, and some callers may
			 * expect it.
			 */

			error = EMSGSIZE;
			goto out;
		}

		/*
		 * Make the slot reference the descriptor so that
		 * fdalloc() works properly.. We finalize it all
		 * in the loop below.
		 */
		mtx_enter(&fdp->fd_fplock);
		KASSERT(fdp->fd_ofiles[fds[i]] == NULL);
		fdp->fd_ofiles[fds[i]] = rp->fp;
		mtx_leave(&fdp->fd_fplock);

		fdp->fd_ofileflags[fds[i]] = (rp->flags & UF_PLEDGED);
		if (flags & MSG_CMSG_CLOEXEC)
			fdp->fd_ofileflags[fds[i]] |= UF_EXCLOSE;

		rp++;
	}
	fdpunlock(fdp);

	/*
	 * Now that adding them has succeeded, update all of the
	 * descriptor passing state.
	 */
	rp = (struct fdpass *)CMSG_DATA(cm);

	rw_enter_write(&unp_lock);
	for (i = 0; i < nfds; i++) {
		struct unpcb *unp;

		fp = rp->fp;
		rp++;
		if ((unp = fptounp(fp)) != NULL)
			unp->unp_msgcount--;
		unp_rights--;
	}
	rw_exit_write(&unp_lock);

	/*
	 * Copy temporary array to message and adjust length, in case of
	 * transition from large struct file pointers to ints.
	 */
	memcpy(CMSG_DATA(cm), fds, nfds * sizeof(int));
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
	rights->m_len = CMSG_LEN(nfds * sizeof(int));
 out:
	if (fds != NULL)
		free(fds, M_TEMP, nfds * sizeof(int));

	if (error) {
		/* Drop all the in-flight references we failed to deliver. */
		if (nfds > 0) {
			rp = ((struct fdpass *)CMSG_DATA(cm));
			rw_enter_write(&unp_lock);
			unp_discard(rp, nfds);
			rw_exit_write(&unp_lock);
		}
	}

	return (error);
}
969 
/*
 * Prepare an SCM_RIGHTS message for transmission: convert the int fd
 * array in `control' into struct fdpass records holding struct file
 * references, charging them against the global in-flight count
 * unp_rights.  Runs without the socket lock (the caller drops it)
 * because it takes fd-table locks and may sleep.  On failure all
 * acquired references and the unp_rights charge are backed out.
 */
int
unp_internalize(struct mbuf *control, struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
	struct fdpass *rp;
	struct file *fp;
	struct unpcb *unp;
	int i, error;
	int nfds, *ip, fd, neededspace;

	/*
	 * Check for two potential msg_controllen values because
	 * IETF stuck their nose in a place it does not belong.
	 */
	if (control->m_len < CMSG_LEN(0) || cm->cmsg_len < CMSG_LEN(0))
		return (EINVAL);
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
	    !(cm->cmsg_len == control->m_len ||
	    control->m_len == CMSG_ALIGN(cm->cmsg_len)))
		return (EINVAL);
	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);

	/* Charge the fds against the global in-flight limit up front. */
	rw_enter_write(&unp_lock);
	if (unp_rights + nfds > maxfiles / 10) {
		rw_exit_write(&unp_lock);
		return (EMFILE);
	}
	unp_rights += nfds;
	rw_exit_write(&unp_lock);

	/* Make sure we have room for the struct file pointers */
morespace:
	neededspace = CMSG_SPACE(nfds * sizeof(struct fdpass)) -
	    control->m_len;
	if (neededspace > m_trailingspace(control)) {
		char *tmp;
		/* if we already have a cluster, the message is just too big */
		if (control->m_flags & M_EXT) {
			error = E2BIG;
			goto nospace;
		}

		/* copy cmsg data temporarily out of the mbuf */
		tmp = malloc(control->m_len, M_TEMP, M_WAITOK);
		memcpy(tmp, mtod(control, caddr_t), control->m_len);

		/* allocate a cluster and try again */
		MCLGET(control, M_WAIT);
		if ((control->m_flags & M_EXT) == 0) {
			free(tmp, M_TEMP, control->m_len);
			error = ENOBUFS;       /* allocation failed */
			goto nospace;
		}

		/* copy the data back into the cluster */
		cm = mtod(control, struct cmsghdr *);
		memcpy(cm, tmp, control->m_len);
		free(tmp, M_TEMP, control->m_len);
		goto morespace;
	}

	/* adjust message & mbuf to note amount of space actually used. */
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(struct fdpass));
	control->m_len = CMSG_SPACE(nfds * sizeof(struct fdpass));

	/*
	 * Walk backwards: the fdpass records are larger than the ints
	 * they replace, so in-place expansion must go last-to-first.
	 */
	ip = ((int *)CMSG_DATA(cm)) + nfds - 1;
	rp = ((struct fdpass *)CMSG_DATA(cm)) + nfds - 1;
	fdplock(fdp);
	rw_enter_write(&unp_lock);
	for (i = 0; i < nfds; i++) {
		memcpy(&fd, ip, sizeof fd);
		ip--;
		if ((fp = fd_getfile(fdp, fd)) == NULL) {
			error = EBADF;
			goto fail;
		}
		if (fp->f_count >= FDUP_MAX_COUNT) {
			error = EDEADLK;
			goto fail;
		}
		error = pledge_sendfd(p, fp);
		if (error)
			goto fail;

		/* kqueue descriptors cannot be copied */
		if (fp->f_type == DTYPE_KQUEUE) {
			error = EINVAL;
			goto fail;
		}
		rp->fp = fp;
		rp->flags = fdp->fd_ofileflags[fd] & UF_PLEDGED;
		rp--;
		if ((unp = fptounp(fp)) != NULL) {
			/* Track sockets-in-flight for the GC. */
			unp->unp_file = fp;
			unp->unp_msgcount++;
		}
	}
	rw_exit_write(&unp_lock);
	fdpunlock(fdp);
	return (0);
fail:
	rw_exit_write(&unp_lock);
	fdpunlock(fdp);
	if (fp != NULL)
		FRELE(fp, p);
	/* Back out what we just did. */
	for ( ; i > 0; i--) {
		rp++;
		fp = rp->fp;
		rw_enter_write(&unp_lock);
		if ((unp = fptounp(fp)) != NULL)
			unp->unp_msgcount--;
		rw_exit_write(&unp_lock);
		FRELE(fp, p);
	}

nospace:
	/* Return the unp_rights charge taken above. */
	rw_enter_write(&unp_lock);
	unp_rights -= nfds;
	rw_exit_write(&unp_lock);

	return (error);
}
1094 
/*
 * Garbage collector for in-flight file descriptors.  First closes the
 * fds queued on the deferred list, then runs a mark-and-sweep over all
 * UNIX domain pcbs to find sockets kept alive only by being passed
 * inside each other (reference cycles) and discards their contents.
 * Runs from the unp_gc_task; serialized by unp_gcing under unp_lock.
 */
void
unp_gc(void *arg __unused)
{
	struct unp_deferral *defer;
	struct file *fp;
	struct socket *so;
	struct unpcb *unp;
	int nunref, i;

	rw_enter_write(&unp_lock);

	if (unp_gcing)
		goto unlock;
	unp_gcing = 1;

	/* close any fds on the deferred list */
	while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) {
		SLIST_REMOVE_HEAD(&unp_deferred, ud_link);
		for (i = 0; i < defer->ud_n; i++) {
			fp = defer->ud_fp[i].fp;
			if (fp == NULL)
				continue;
			 /* closef() expects a refcount of 2 */
			FREF(fp);
			if ((unp = fptounp(fp)) != NULL)
				unp->unp_msgcount--;
			unp_rights--;
			/* closef() may sleep; drop the lock around it. */
			rw_exit_write(&unp_lock);
			(void) closef(fp, NULL);
			rw_enter_write(&unp_lock);
		}
		free(defer, M_TEMP, sizeof(*defer) +
		    sizeof(struct fdpass) * defer->ud_n);
	}

	/* Mark phase: clear all marks, then iterate to a fixed point. */
	unp_defer = 0;
	LIST_FOREACH(unp, &unp_head, unp_link)
		unp->unp_flags &= ~(UNP_GCMARK | UNP_GCDEFER | UNP_GCDEAD);
	do {
		nunref = 0;
		LIST_FOREACH(unp, &unp_head, unp_link) {
			fp = unp->unp_file;
			if (unp->unp_flags & UNP_GCDEFER) {
				/*
				 * This socket is referenced by another
				 * socket which is known to be live,
				 * so it's certainly live.
				 */
				unp->unp_flags &= ~UNP_GCDEFER;
				unp_defer--;
			} else if (unp->unp_flags & UNP_GCMARK) {
				/* marked as live in previous pass */
				continue;
			} else if (fp == NULL) {
				/* not being passed, so can't be in loop */
			} else if (fp->f_count == 0) {
				/*
				 * Already being closed, let normal close
				 * path take its course
				 */
			} else {
				/*
				 * Unreferenced by other sockets so far,
				 * so if all the references (f_count) are
				 * from passing (unp_msgcount) then this
				 * socket is prospectively dead
				 */
				if (fp->f_count == unp->unp_msgcount) {
					nunref++;
					unp->unp_flags |= UNP_GCDEAD;
					continue;
				}
			}

			/*
			 * This is the first time we've seen this socket on
			 * the mark pass and known it has a live reference,
			 * so mark it, then scan its receive buffer for
			 * sockets and note them as deferred (== referenced,
			 * but not yet marked).
			 */
			unp->unp_flags |= UNP_GCMARK;

			so = unp->unp_socket;
			unp_scan(so->so_rcv.sb_mb, unp_mark);
		}
	} while (unp_defer);

	/*
	 * If there are any unreferenced sockets, then for each dispose
	 * of files in its receive buffer and then close it.
	 */
	if (nunref) {
		LIST_FOREACH(unp, &unp_head, unp_link) {
			if (unp->unp_flags & UNP_GCDEAD)
				unp_scan(unp->unp_socket->so_rcv.sb_mb,
				    unp_discard);
		}
	}
	unp_gcing = 0;
unlock:
	rw_exit_write(&unp_lock);
}
1198 
1199 void
1200 unp_dispose(struct mbuf *m)
1201 {
1202 
1203 	if (m)
1204 		unp_scan(m, unp_discard);
1205 }
1206 
1207 void
1208 unp_scan(struct mbuf *m0, void (*op)(struct fdpass *, int))
1209 {
1210 	struct mbuf *m;
1211 	struct fdpass *rp;
1212 	struct cmsghdr *cm;
1213 	int qfds;
1214 
1215 	while (m0) {
1216 		for (m = m0; m; m = m->m_next) {
1217 			if (m->m_type == MT_CONTROL &&
1218 			    m->m_len >= sizeof(*cm)) {
1219 				cm = mtod(m, struct cmsghdr *);
1220 				if (cm->cmsg_level != SOL_SOCKET ||
1221 				    cm->cmsg_type != SCM_RIGHTS)
1222 					continue;
1223 				qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof *cm))
1224 				    / sizeof(struct fdpass);
1225 				if (qfds > 0) {
1226 					rp = (struct fdpass *)CMSG_DATA(cm);
1227 					op(rp, qfds);
1228 				}
1229 				break;		/* XXX, but saves time */
1230 			}
1231 		}
1232 		m0 = m0->m_nextpkt;
1233 	}
1234 }
1235 
1236 void
1237 unp_mark(struct fdpass *rp, int nfds)
1238 {
1239 	struct unpcb *unp;
1240 	int i;
1241 
1242 	rw_assert_wrlock(&unp_lock);
1243 
1244 	for (i = 0; i < nfds; i++) {
1245 		if (rp[i].fp == NULL)
1246 			continue;
1247 
1248 		unp = fptounp(rp[i].fp);
1249 		if (unp == NULL)
1250 			continue;
1251 
1252 		if (unp->unp_flags & (UNP_GCMARK|UNP_GCDEFER))
1253 			continue;
1254 
1255 		unp_defer++;
1256 		unp->unp_flags |= UNP_GCDEFER;
1257 		unp->unp_flags &= ~UNP_GCDEAD;
1258 	}
1259 }
1260 
1261 void
1262 unp_discard(struct fdpass *rp, int nfds)
1263 {
1264 	struct unp_deferral *defer;
1265 
1266 	rw_assert_wrlock(&unp_lock);
1267 
1268 	/* copy the file pointers to a deferral structure */
1269 	defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
1270 	defer->ud_n = nfds;
1271 	memcpy(&defer->ud_fp[0], rp, sizeof(*rp) * nfds);
1272 	memset(rp, 0, sizeof(*rp) * nfds);
1273 	SLIST_INSERT_HEAD(&unp_deferred, defer, ud_link);
1274 
1275 	task_add(systqmp, &unp_gc_task);
1276 }
1277 
1278 int
1279 unp_nam2sun(struct mbuf *nam, struct sockaddr_un **sun, size_t *pathlen)
1280 {
1281 	struct sockaddr *sa = mtod(nam, struct sockaddr *);
1282 	size_t size, len;
1283 
1284 	if (nam->m_len < offsetof(struct sockaddr, sa_data))
1285 		return EINVAL;
1286 	if (sa->sa_family != AF_UNIX)
1287 		return EAFNOSUPPORT;
1288 	if (sa->sa_len != nam->m_len)
1289 		return EINVAL;
1290 	if (sa->sa_len > sizeof(struct sockaddr_un))
1291 		return EINVAL;
1292 	*sun = (struct sockaddr_un *)sa;
1293 
1294 	/* ensure that sun_path is NUL terminated and fits */
1295 	size = (*sun)->sun_len - offsetof(struct sockaddr_un, sun_path);
1296 	len = strnlen((*sun)->sun_path, size);
1297 	if (len == sizeof((*sun)->sun_path))
1298 		return EINVAL;
1299 	if (len == size) {
1300 		if (m_trailingspace(nam) == 0)
1301 			return EINVAL;
1302 		nam->m_len++;
1303 		(*sun)->sun_len++;
1304 		(*sun)->sun_path[len] = '\0';
1305 	}
1306 	if (pathlen != NULL)
1307 		*pathlen = len;
1308 
1309 	return 0;
1310 }
1311