xref: /dflybsd-src/sys/kern/uipc_usrreq.c (revision 93bffecadc0caefc46f12b736eab0e62c2b6f42e)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
34  * $FreeBSD: src/sys/kern/uipc_usrreq.c,v 1.54.2.10 2003/03/04 17:28:09 nectar Exp $
35  * $DragonFly: src/sys/kern/uipc_usrreq.c,v 1.44 2008/09/06 05:44:58 dillon Exp $
36  */
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/domain.h>
42 #include <sys/fcntl.h>
43 #include <sys/malloc.h>		/* XXX must be before <sys/file.h> */
44 #include <sys/proc.h>
45 #include <sys/file.h>
46 #include <sys/filedesc.h>
47 #include <sys/mbuf.h>
48 #include <sys/nlookup.h>
49 #include <sys/protosw.h>
50 #include <sys/socket.h>
51 #include <sys/socketvar.h>
52 #include <sys/resourcevar.h>
53 #include <sys/stat.h>
54 #include <sys/mount.h>
55 #include <sys/sysctl.h>
56 #include <sys/un.h>
57 #include <sys/unpcb.h>
58 #include <sys/vnode.h>
59 
60 #include <sys/file2.h>
61 #include <sys/spinlock2.h>
62 #include <sys/socketvar2.h>
63 
64 static	MALLOC_DEFINE(M_UNPCB, "unpcb", "unpcb struct");
65 static	unp_gen_t unp_gencnt;
66 static	u_int unp_count;
67 
68 static	struct unp_head unp_shead, unp_dhead;
69 
70 static struct lwkt_token unp_token = LWKT_TOKEN_MP_INITIALIZER(unp_token);
71 
72 /*
73  * Unix communications domain.
74  *
75  * TODO:
76  *	RDM
77  *	rethink name space problems
78  *	need a proper out-of-band
79  *	lock pushdown
80  */
81 static struct	sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
82 static ino_t	unp_ino = 1;		/* prototype for fake inode numbers */
83 static struct spinlock unp_ino_spin = SPINLOCK_INITIALIZER(&unp_ino_spin);
84 
85 static int     unp_attach (struct socket *, struct pru_attach_info *);
86 static void    unp_detach (struct unpcb *);
87 static int     unp_bind (struct unpcb *,struct sockaddr *, struct thread *);
88 static int     unp_connect (struct socket *,struct sockaddr *,
89 				struct thread *);
90 static void    unp_disconnect (struct unpcb *);
91 static void    unp_shutdown (struct unpcb *);
92 static void    unp_drop (struct unpcb *, int);
93 static void    unp_gc (void);
94 static int     unp_gc_clearmarks(struct file *, void *);
95 static int     unp_gc_checkmarks(struct file *, void *);
96 static int     unp_gc_checkrefs(struct file *, void *);
97 static int     unp_revoke_gc_check(struct file *, void *);
98 static void    unp_scan (struct mbuf *, void (*)(struct file *, void *),
99 				void *data);
100 static void    unp_mark (struct file *, void *data);
101 static void    unp_discard (struct file *, void *);
102 static int     unp_internalize (struct mbuf *, struct thread *);
103 static int     unp_listen (struct unpcb *, struct thread *);
104 static void    unp_fp_externalize(struct lwp *lp, struct file *fp, int fd);
105 
106 /*
107  * NOTE: (so) is referenced from soabort*(), and netmsg_pru_abort()
108  *	 will sofree() it when we return.
109  */
110 static int
111 uipc_abort(struct socket *so)
112 {
113 	struct unpcb *unp;
114 	int error;
115 
116 	lwkt_gettoken(&unp_token);
117 	unp = so->so_pcb;
118 	if (unp) {
119 		unp_drop(unp, ECONNABORTED);
120 		unp_detach(unp);
121 		error = 0;
122 	} else {
123 		error = EINVAL;
124 	}
125 	lwkt_reltoken(&unp_token);
126 
127 	return error;
128 }
129 
130 static int
131 uipc_accept(struct socket *so, struct sockaddr **nam)
132 {
133 	struct unpcb *unp;
134 
135 	lwkt_gettoken(&unp_token);
136 	unp = so->so_pcb;
137 	if (unp == NULL) {
138 		lwkt_reltoken(&unp_token);
139 		return EINVAL;
140 	}
141 
142 	/*
143 	 * Pass back name of connected socket,
144 	 * if it was bound and we are still connected
145 	 * (our peer may have closed already!).
146 	 */
147 	if (unp->unp_conn && unp->unp_conn->unp_addr) {
148 		*nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr);
149 	} else {
150 		*nam = dup_sockaddr((struct sockaddr *)&sun_noname);
151 	}
152 	lwkt_reltoken(&unp_token);
153 	return 0;
154 }
155 
156 static int
157 uipc_attach(struct socket *so, int proto, struct pru_attach_info *ai)
158 {
159 	struct unpcb *unp;
160 	int error;
161 
162 	lwkt_gettoken(&unp_token);
163 	unp = so->so_pcb;
164 	if (unp)
165 		error = EISCONN;
166 	else
167 		error = unp_attach(so, ai);
168 	lwkt_reltoken(&unp_token);
169 
170 	return error;
171 }
172 
173 static int
174 uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
175 {
176 	struct unpcb *unp;
177 	int error;
178 
179 	lwkt_gettoken(&unp_token);
180 	unp = so->so_pcb;
181 	if (unp)
182 		error = unp_bind(unp, nam, td);
183 	else
184 		error = EINVAL;
185 	lwkt_reltoken(&unp_token);
186 
187 	return error;
188 }
189 
190 static int
191 uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
192 {
193 	struct unpcb *unp;
194 	int error;
195 
196 	lwkt_gettoken(&unp_token);
197 	unp = so->so_pcb;
198 	if (unp)
199 		error = unp_connect(so, nam, td);
200 	else
201 		error = EINVAL;
202 	lwkt_reltoken(&unp_token);
203 
204 	return error;
205 }
206 
207 static int
208 uipc_connect2(struct socket *so1, struct socket *so2)
209 {
210 	struct unpcb *unp;
211 	int error;
212 
213 	lwkt_gettoken(&unp_token);
214 	unp = so1->so_pcb;
215 	if (unp)
216 		error = unp_connect2(so1, so2);
217 	else
218 		error = EINVAL;
219 	lwkt_reltoken(&unp_token);
220 
221 	return error;
222 }
223 
224 /* control is EOPNOTSUPP */
225 
226 static int
227 uipc_detach(struct socket *so)
228 {
229 	struct unpcb *unp;
230 	int error;
231 
232 	lwkt_gettoken(&unp_token);
233 	unp = so->so_pcb;
234 	if (unp) {
235 		unp_detach(unp);
236 		error = 0;
237 	} else {
238 		error = EINVAL;
239 	}
240 	lwkt_reltoken(&unp_token);
241 
242 	return error;
243 }
244 
245 static int
246 uipc_disconnect(struct socket *so)
247 {
248 	struct unpcb *unp;
249 	int error;
250 
251 	lwkt_gettoken(&unp_token);
252 	unp = so->so_pcb;
253 	if (unp) {
254 		unp_disconnect(unp);
255 		error = 0;
256 	} else {
257 		error = EINVAL;
258 	}
259 	lwkt_reltoken(&unp_token);
260 
261 	return error;
262 }
263 
264 static int
265 uipc_listen(struct socket *so, struct thread *td)
266 {
267 	struct unpcb *unp;
268 	int error;
269 
270 	lwkt_gettoken(&unp_token);
271 	unp = so->so_pcb;
272 	if (unp == NULL || unp->unp_vnode == NULL)
273 		error = EINVAL;
274 	else
275 		error = unp_listen(unp, td);
276 	lwkt_reltoken(&unp_token);
277 
278 	return error;
279 }
280 
281 static int
282 uipc_peeraddr(struct socket *so, struct sockaddr **nam)
283 {
284 	struct unpcb *unp;
285 	int error;
286 
287 	lwkt_gettoken(&unp_token);
288 	unp = so->so_pcb;
289 	if (unp == NULL) {
290 		error = EINVAL;
291 	} else if (unp->unp_conn && unp->unp_conn->unp_addr) {
292 		*nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr);
293 		error = 0;
294 	} else {
295 		/*
296 		 * XXX: It seems that this test always fails even when the
297 		 * connection is established, so this else clause is added
298 		 * as a workaround to return a PF_LOCAL sockaddr.
299 		 */
300 		*nam = dup_sockaddr((struct sockaddr *)&sun_noname);
301 		error = 0;
302 	}
303 	lwkt_reltoken(&unp_token);
304 
305 	return error;
306 }
307 
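/*
 * pru_rcvd:  the receiver has drained data from its receive buffer.
 * For stream and seqpacket sockets, once the receive side drops back
 * under the peer's send buffer limits, clear SSB_STOP on the peer's
 * send buffer and wake up the writer.
 */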
308 static int
309 uipc_rcvd(struct socket *so, int flags)
310 {
311 	struct unpcb *unp;
312 	struct socket *so2;
313 
314 	lwkt_gettoken(&unp_token);
315 	unp = so->so_pcb;
316 	if (unp == NULL) {
317 		lwkt_reltoken(&unp_token);
318 		return EINVAL;
319 	}
320 
321 	switch (so->so_type) {
322 	case SOCK_DGRAM:
323 		panic("uipc_rcvd DGRAM?");
324 		/*NOTREACHED*/
325 	case SOCK_STREAM:
326 	case SOCK_SEQPACKET:
327 		if (unp->unp_conn == NULL)
328 			break;
329 		/*
330 		 * Because we are transferring mbufs directly to the
331 		 * peer socket we have to use SSB_STOP on the sender
332 		 * to prevent it from queueing an unbounded number of mbufs.
333 		 */
334 		so2 = unp->unp_conn->unp_socket;
335 		if (so->so_rcv.ssb_cc < so2->so_snd.ssb_hiwat &&
336 		    so->so_rcv.ssb_mbcnt < so2->so_snd.ssb_mbmax
337 		) {
338 			atomic_clear_int(&so2->so_snd.ssb_flags, SSB_STOP);
339 			sowwakeup(so2);
340 		}
341 		break;
342 	default:
343 		panic("uipc_rcvd unknown socktype");
344 		/*NOTREACHED*/
345 	}
346 	lwkt_reltoken(&unp_token);
347 
348 	return 0;
349 }
350 
351 /* pru_rcvoob is EOPNOTSUPP */
352 
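/*
 * pru_send:  append data (and any internalized control message) directly
 * to the peer's receive buffer.  Datagram sockets may supply a temporary
 * destination in (nam); stream and seqpacket sockets must be connected.
 * The sender is throttled with SSB_STOP when the peer's receive buffer
 * exceeds our send buffer limits, and PRUS_EOF is handled as a send
 * followed by a shutdown.
 */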
353 static int
354 uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
355 	  struct mbuf *control, struct thread *td)
356 {
357 	struct unpcb *unp;
358 	struct socket *so2;
359 	int error = 0;
360 
361 	lwkt_gettoken(&unp_token);
362 
363 	unp = so->so_pcb;
364 	if (unp == NULL) {
365 		error = EINVAL;
366 		goto release;
367 	}
368 	if (flags & PRUS_OOB) {
369 		error = EOPNOTSUPP;
370 		goto release;
371 	}
372 
373 	if (control && (error = unp_internalize(control, td)))
374 		goto release;
375 
376 	switch (so->so_type) {
377 	case SOCK_DGRAM:
378 	{
379 		struct sockaddr *from;
380 
381 		if (nam) {
382 			if (unp->unp_conn) {
383 				error = EISCONN;
384 				break;
385 			}
386 			error = unp_connect(so, nam, td);
387 			if (error)
388 				break;
389 		} else {
390 			if (unp->unp_conn == NULL) {
391 				error = ENOTCONN;
392 				break;
393 			}
394 		}
395 		so2 = unp->unp_conn->unp_socket;
396 		if (unp->unp_addr)
397 			from = (struct sockaddr *)unp->unp_addr;
398 		else
399 			from = &sun_noname;
400 		if (ssb_appendaddr(&so2->so_rcv, from, m, control)) {
401 			sorwakeup(so2);
402 			m = NULL;
403 			control = NULL;
404 		} else {
405 			error = ENOBUFS;
406 		}
407 		if (nam)
408 			unp_disconnect(unp);
409 		break;
410 	}
411 
412 	case SOCK_STREAM:
413 	case SOCK_SEQPACKET:
414 		/*
415 		 * Connect if not connected yet.  Note: A better
416 		 * implementation would complain if the supplied address
417 		 * does not match the peer's address.
418 		 */
419 		if (!(so->so_state & SS_ISCONNECTED)) {
420 			if (nam) {
421 				error = unp_connect(so, nam, td);
422 				if (error)
423 					break;	/* XXX */
424 			} else {
425 				error = ENOTCONN;
426 				break;
427 			}
428 		}
429 
430 		if (so->so_state & SS_CANTSENDMORE) {
431 			error = EPIPE;
432 			break;
433 		}
434 		if (unp->unp_conn == NULL)
435 			panic("uipc_send connected but no connection?");
436 		so2 = unp->unp_conn->unp_socket;
437 		/*
438 		 * Send to the paired receive port and wake up readers.
439 		 * Backpressure is applied below by setting SSB_STOP on our
440 		 * send buffer instead of by adjusting hiwater marks.
441 		 */
442 		if (control) {
443 			if (ssb_appendcontrol(&so2->so_rcv, m, control)) {
444 				control = NULL;
445 				m = NULL;
446 			}
447 		} else if (so->so_type == SOCK_SEQPACKET) {
448 			sbappendrecord(&so2->so_rcv.sb, m);
449 			m = NULL;
450 		} else {
451 			sbappend(&so2->so_rcv.sb, m);
452 			m = NULL;
453 		}
454 
455 		/*
456 		 * Because we are transferring mbufs directly to the
457 		 * peer socket we have to use SSB_STOP on the sender
458 		 * to prevent it from queueing an unbounded number of mbufs.
459 		 */
460 		if (so2->so_rcv.ssb_cc >= so->so_snd.ssb_hiwat ||
461 		    so2->so_rcv.ssb_mbcnt >= so->so_snd.ssb_mbmax
462 		) {
463 			atomic_set_int(&so->so_snd.ssb_flags, SSB_STOP);
464 		}
465 		sorwakeup(so2);
466 		break;
467 
468 	default:
469 		panic("uipc_send unknown socktype");
470 	}
471 
472 	/*
473 	 * SEND_EOF is equivalent to a SEND followed by a SHUTDOWN.
474 	 */
475 	if (flags & PRUS_EOF) {
476 		socantsendmore(so);
477 		unp_shutdown(unp);
478 	}
479 
480 	if (control && error != 0)
481 		unp_dispose(control);
482 
483 release:
484 	lwkt_reltoken(&unp_token);
485 
486 	if (control)
487 		m_freem(control);
488 	if (m)
489 		m_freem(m);
490 	return error;
491 }
492 
493 /*
494  * MPSAFE
495  */
496 static int
497 uipc_sense(struct socket *so, struct stat *sb)
498 {
499 	struct unpcb *unp;
500 
501 	lwkt_gettoken(&unp_token);
502 	unp = so->so_pcb;
503 	if (unp == NULL) {
504 		lwkt_reltoken(&unp_token);
505 		return EINVAL;
506 	}
507 	sb->st_blksize = so->so_snd.ssb_hiwat;
508 	sb->st_dev = NOUDEV;
509 	if (unp->unp_ino == 0) {	/* make up a non-zero inode number */
510 		spin_lock(&unp_ino_spin);
511 		unp->unp_ino = unp_ino++;
512 		spin_unlock(&unp_ino_spin);
513 	}
514 	sb->st_ino = unp->unp_ino;
515 	lwkt_reltoken(&unp_token);
516 
517 	return (0);
518 }
519 
520 static int
521 uipc_shutdown(struct socket *so)
522 {
523 	struct unpcb *unp;
524 	int error;
525 
526 	lwkt_gettoken(&unp_token);
527 	unp = so->so_pcb;
528 	if (unp) {
529 		socantsendmore(so);
530 		unp_shutdown(unp);
531 		error = 0;
532 	} else {
533 		error = EINVAL;
534 	}
535 	lwkt_reltoken(&unp_token);
536 
537 	return error;
538 }
539 
540 static int
541 uipc_sockaddr(struct socket *so, struct sockaddr **nam)
542 {
543 	struct unpcb *unp;
544 	int error;
545 
546 	lwkt_gettoken(&unp_token);
547 	unp = so->so_pcb;
548 	if (unp) {
549 		if (unp->unp_addr)
550 			*nam = dup_sockaddr((struct sockaddr *)unp->unp_addr);
551 		error = 0;
552 	} else {
553 		error = EINVAL;
554 	}
555 	lwkt_reltoken(&unp_token);
556 
557 	return error;
558 }
559 
560 struct pr_usrreqs uipc_usrreqs = {
561 	.pru_abort = uipc_abort,
562 	.pru_accept = uipc_accept,
563 	.pru_attach = uipc_attach,
564 	.pru_bind = uipc_bind,
565 	.pru_connect = uipc_connect,
566 	.pru_connect2 = uipc_connect2,
567 	.pru_control = pru_control_notsupp,
568 	.pru_detach = uipc_detach,
569 	.pru_disconnect = uipc_disconnect,
570 	.pru_listen = uipc_listen,
571 	.pru_peeraddr = uipc_peeraddr,
572 	.pru_rcvd = uipc_rcvd,
573 	.pru_rcvoob = pru_rcvoob_notsupp,
574 	.pru_send = uipc_send,
575 	.pru_sense = uipc_sense,
576 	.pru_shutdown = uipc_shutdown,
577 	.pru_sockaddr = uipc_sockaddr,
578 	.pru_sosend = sosend,
579 	.pru_soreceive = soreceive
580 };
581 
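/*
 * Socket option handler for the local domain.  Only getsockopt of
 * LOCAL_PEERCRED is supported; it returns the peer credentials cached
 * by unp_connect()/unp_listen() at connect/listen time.
 *
 * Userland sketch (illustrative only; assumes the usual <sys/un.h> and
 * <sys/ucred.h> definitions with LOCAL_PEERCRED at option level 0):
 *
 *	struct xucred xuc;
 *	socklen_t len = sizeof(xuc);
 *
 *	if (getsockopt(s, 0, LOCAL_PEERCRED, &xuc, &len) == 0)
 *		printf("peer uid %d\n", (int)xuc.cr_uid);
 */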
582 int
583 uipc_ctloutput(struct socket *so, struct sockopt *sopt)
584 {
585 	struct unpcb *unp;
586 	int error = 0;
587 
588 	lwkt_gettoken(&unp_token);
589 	unp = so->so_pcb;
	if (unp == NULL) {
		lwkt_reltoken(&unp_token);
		return (EINVAL);
	}
590 
591 	switch (sopt->sopt_dir) {
592 	case SOPT_GET:
593 		switch (sopt->sopt_name) {
594 		case LOCAL_PEERCRED:
595 			if (unp->unp_flags & UNP_HAVEPC)
596 				soopt_from_kbuf(sopt, &unp->unp_peercred,
597 						sizeof(unp->unp_peercred));
598 			else {
599 				if (so->so_type == SOCK_STREAM)
600 					error = ENOTCONN;
601 				else if (so->so_type == SOCK_SEQPACKET)
602 					error = ENOTCONN;
603 				else
604 					error = EINVAL;
605 			}
606 			break;
607 		default:
608 			error = EOPNOTSUPP;
609 			break;
610 		}
611 		break;
612 	case SOPT_SET:
613 	default:
614 		error = EOPNOTSUPP;
615 		break;
616 	}
617 	lwkt_reltoken(&unp_token);
618 
619 	return (error);
620 }
621 
622 /*
623  * Both send and receive buffers are allocated PIPSIZ bytes of buffering
624  * for stream sockets, although the total for sender and receiver is
625  * actually only PIPSIZ.
626  *
627  * Datagram sockets really use the sendspace as the maximum datagram size,
628  * and don't really want to reserve the sendspace.  Their recvspace should
629  * be large enough for at least one max-size datagram plus address.
630  *
631  * We want the local send/recv space to be significantly larger than lo0's
632  * MTU of 16384.
633  */
634 #ifndef PIPSIZ
635 #define	PIPSIZ	57344
636 #endif
637 static u_long	unpst_sendspace = PIPSIZ;
638 static u_long	unpst_recvspace = PIPSIZ;
639 static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
640 static u_long	unpdg_recvspace = 4*1024;
641 
642 static int	unp_rights;			/* file descriptors in flight */
643 static struct spinlock unp_spin = SPINLOCK_INITIALIZER(&unp_spin);
644 
645 SYSCTL_DECL(_net_local_seqpacket);
646 SYSCTL_DECL(_net_local_stream);
647 SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
648 	   &unpst_sendspace, 0, "");
649 SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
650 	   &unpst_recvspace, 0, "");
651 
652 SYSCTL_DECL(_net_local_dgram);
653 SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
654 	   &unpdg_sendspace, 0, "");
655 SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
656 	   &unpdg_recvspace, 0, "");
657 
658 SYSCTL_DECL(_net_local);
659 SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
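/*
 * The buffer limits above are run-time tunable, e.g. (illustrative):
 *
 *	sysctl net.local.stream.sendspace
 *	sysctl net.local.dgram.maxdgram=8192
 */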
660 
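/*
 * Allocate and initialize an unpcb for a newly created socket:  reserve
 * the default send/receive buffer space if the caller has not already
 * done so, then link the pcb onto the global DGRAM or STREAM/SEQPACKET
 * list.
 */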
661 static int
662 unp_attach(struct socket *so, struct pru_attach_info *ai)
663 {
664 	struct unpcb *unp;
665 	int error;
666 
667 	lwkt_gettoken(&unp_token);
668 	if (so->so_snd.ssb_hiwat == 0 || so->so_rcv.ssb_hiwat == 0) {
669 		switch (so->so_type) {
670 
671 		case SOCK_STREAM:
672 		case SOCK_SEQPACKET:
673 			error = soreserve(so, unpst_sendspace, unpst_recvspace,
674 					  ai->sb_rlimit);
675 			break;
676 
677 		case SOCK_DGRAM:
678 			error = soreserve(so, unpdg_sendspace, unpdg_recvspace,
679 					  ai->sb_rlimit);
680 			break;
681 
682 		default:
683 			panic("unp_attach");
684 		}
685 		if (error)
686 			goto failed;
687 	}
688 	unp = kmalloc(sizeof(*unp), M_UNPCB, M_NOWAIT|M_ZERO);
689 	if (unp == NULL) {
690 		error = ENOBUFS;
691 		goto failed;
692 	}
693 	unp->unp_gencnt = ++unp_gencnt;
694 	unp_count++;
695 	LIST_INIT(&unp->unp_refs);
696 	unp->unp_socket = so;
697 	unp->unp_rvnode = ai->fd_rdir;		/* jail cruft XXX JH */
698 	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
699 			 : &unp_shead, unp, unp_link);
700 	so->so_pcb = (caddr_t)unp;
701 	soreference(so);
702 	so->so_port = sync_soport(so, NULL, NULL);
703 	error = 0;
704 failed:
705 	lwkt_reltoken(&unp_token);
706 	return error;
707 }
708 
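/*
 * Tear down an unpcb:  unlink it from the global list, release any bound
 * vnode, disconnect from the peer, drop all datagram references, and
 * detach the pcb from its socket.  If descriptors are still in flight,
 * flush the receive buffer and run the garbage collector before freeing
 * the pcb.
 */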
709 static void
710 unp_detach(struct unpcb *unp)
711 {
712 	struct socket *so;
713 
714 	lwkt_gettoken(&unp_token);
715 
716 	LIST_REMOVE(unp, unp_link);
717 	unp->unp_gencnt = ++unp_gencnt;
718 	--unp_count;
719 	if (unp->unp_vnode) {
720 		unp->unp_vnode->v_socket = NULL;
721 		vrele(unp->unp_vnode);
722 		unp->unp_vnode = NULL;
723 	}
724 	if (unp->unp_conn)
725 		unp_disconnect(unp);
726 	while (!LIST_EMPTY(&unp->unp_refs))
727 		unp_drop(LIST_FIRST(&unp->unp_refs), ECONNRESET);
728 	soisdisconnected(unp->unp_socket);
729 	so = unp->unp_socket;
730 	soreference(so);	/* for delayed sorflush */
731 	so->so_pcb = NULL;
732 	unp->unp_socket = NULL;
733 	sofree(so);		/* remove pcb ref */
734 
735 	if (unp_rights) {
736 		/*
737 		 * Normally the receive buffer is flushed later,
738 		 * in sofree, but if our receive buffer holds references
739 		 * to descriptors that are now garbage, we would dispose of
740 		 * those references only after the garbage collector reclaims
741 		 * them (resulting in a "panic: closef: count < 0").
742 		 */
743 		sorflush(so);
744 		unp_gc();
745 	}
746 	sofree(so);
747 	lwkt_reltoken(&unp_token);
748 
749 	if (unp->unp_addr)
750 		kfree(unp->unp_addr, M_SONAME);
751 	kfree(unp, M_UNPCB);
752 }
753 
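/*
 * Bind the socket to a path name:  create a VSOCK vnode at the supplied
 * path (EADDRINUSE if it already exists) and point the vnode and the pcb
 * at each other.
 */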
754 static int
755 unp_bind(struct unpcb *unp, struct sockaddr *nam, struct thread *td)
756 {
757 	struct proc *p = td->td_proc;
758 	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
759 	struct vnode *vp;
760 	struct vattr vattr;
761 	int error, namelen;
762 	struct nlookupdata nd;
763 	char buf[SOCK_MAXADDRLEN];
764 
765 	lwkt_gettoken(&unp_token);
766 	if (unp->unp_vnode != NULL) {
767 		error = EINVAL;
768 		goto failed;
769 	}
770 	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
771 	if (namelen <= 0) {
772 		error = EINVAL;
773 		goto failed;
774 	}
775 	strncpy(buf, soun->sun_path, namelen);
776 	buf[namelen] = 0;	/* null-terminate the string */
777 	error = nlookup_init(&nd, buf, UIO_SYSSPACE,
778 			     NLC_LOCKVP | NLC_CREATE | NLC_REFDVP);
779 	if (error == 0)
780 		error = nlookup(&nd);
781 	if (error == 0 && nd.nl_nch.ncp->nc_vp != NULL)
782 		error = EADDRINUSE;
783 	if (error)
784 		goto done;
785 
786 	VATTR_NULL(&vattr);
787 	vattr.va_type = VSOCK;
788 	vattr.va_mode = (ACCESSPERMS & ~p->p_fd->fd_cmask);
789 	error = VOP_NCREATE(&nd.nl_nch, nd.nl_dvp, &vp, nd.nl_cred, &vattr);
790 	if (error == 0) {
791 		vp->v_socket = unp->unp_socket;
792 		unp->unp_vnode = vp;
793 		unp->unp_addr = (struct sockaddr_un *)dup_sockaddr(nam);
794 		vn_unlock(vp);
795 	}
796 done:
797 	nlookup_done(&nd);
798 failed:
799 	lwkt_reltoken(&unp_token);
800 	return (error);
801 }
802 
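/*
 * Connect to the socket bound at the path given in (nam).  The path must
 * resolve to a VSOCK vnode we have write access to.  For connection
 * oriented sockets a new socket is cloned off the listener with
 * sonewconn() and peer credentials are exchanged before the two pcbs are
 * linked with unp_connect2().
 */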
803 static int
804 unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
805 {
806 	struct proc *p = td->td_proc;
807 	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
808 	struct vnode *vp;
809 	struct socket *so2, *so3;
810 	struct unpcb *unp, *unp2, *unp3;
811 	int error, len;
812 	struct nlookupdata nd;
813 	char buf[SOCK_MAXADDRLEN];
814 
815 	lwkt_gettoken(&unp_token);
816 
817 	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
818 	if (len <= 0) {
819 		error = EINVAL;
820 		goto failed;
821 	}
822 	strncpy(buf, soun->sun_path, len);
823 	buf[len] = 0;
824 
825 	vp = NULL;
826 	error = nlookup_init(&nd, buf, UIO_SYSSPACE, NLC_FOLLOW);
827 	if (error == 0)
828 		error = nlookup(&nd);
829 	if (error == 0)
830 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
831 	nlookup_done(&nd);
832 	if (error)
833 		goto failed;
834 
835 	if (vp->v_type != VSOCK) {
836 		error = ENOTSOCK;
837 		goto bad;
838 	}
839 	error = VOP_ACCESS(vp, VWRITE, p->p_ucred);
840 	if (error)
841 		goto bad;
842 	so2 = vp->v_socket;
843 	if (so2 == NULL) {
844 		error = ECONNREFUSED;
845 		goto bad;
846 	}
847 	if (so->so_type != so2->so_type) {
848 		error = EPROTOTYPE;
849 		goto bad;
850 	}
851 	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
852 		if (!(so2->so_options & SO_ACCEPTCONN) ||
853 		    (so3 = sonewconn(so2, 0)) == NULL) {
854 			error = ECONNREFUSED;
855 			goto bad;
856 		}
857 		unp = so->so_pcb;
858 		unp2 = so2->so_pcb;
859 		unp3 = so3->so_pcb;
860 		if (unp2->unp_addr)
861 			unp3->unp_addr = (struct sockaddr_un *)
862 				dup_sockaddr((struct sockaddr *)unp2->unp_addr);
863 
864 		/*
865 		 * unp_peercred management:
866 		 *
867 		 * The connecter's (client's) credentials are copied
868 		 * from its process structure at the time of connect()
869 		 * (which is now).
870 		 */
871 		cru2x(p->p_ucred, &unp3->unp_peercred);
872 		unp3->unp_flags |= UNP_HAVEPC;
873 		/*
874 		 * The receiver's (server's) credentials are copied
875 		 * from the unp_peercred member of socket on which the
876 		 * former called listen(); unp_listen() cached that
877 		 * process's credentials at that time so we can use
878 		 * them now.
879 		 */
880 		KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
881 		    ("unp_connect: listener without cached peercred"));
882 		memcpy(&unp->unp_peercred, &unp2->unp_peercred,
883 		    sizeof(unp->unp_peercred));
884 		unp->unp_flags |= UNP_HAVEPC;
885 
886 		so2 = so3;
887 	}
888 	error = unp_connect2(so, so2);
889 bad:
890 	vput(vp);
891 failed:
892 	lwkt_reltoken(&unp_token);
893 	return (error);
894 }
895 
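/*
 * Link two unix domain pcbs together.  Datagram sockets keep a one-way
 * link (the sender is added to the receiver's unp_refs list), while
 * stream and seqpacket sockets are cross-linked and both marked
 * connected.
 */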
896 int
897 unp_connect2(struct socket *so, struct socket *so2)
898 {
899 	struct unpcb *unp;
900 	struct unpcb *unp2;
901 
902 	lwkt_gettoken(&unp_token);
903 	unp = so->so_pcb;
904 	if (so2->so_type != so->so_type) {
905 		lwkt_reltoken(&unp_token);
906 		return (EPROTOTYPE);
907 	}
908 	unp2 = so2->so_pcb;
909 	unp->unp_conn = unp2;
910 
911 	switch (so->so_type) {
912 	case SOCK_DGRAM:
913 		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
914 		soisconnected(so);
915 		break;
916 
917 	case SOCK_STREAM:
918 	case SOCK_SEQPACKET:
919 		unp2->unp_conn = unp;
920 		soisconnected(so);
921 		soisconnected(so2);
922 		break;
923 
924 	default:
925 		panic("unp_connect2");
926 	}
927 	lwkt_reltoken(&unp_token);
928 	return (0);
929 }
930 
931 static void
932 unp_disconnect(struct unpcb *unp)
933 {
934 	struct unpcb *unp2;
935 
936 	lwkt_gettoken(&unp_token);
937 
938 	unp2 = unp->unp_conn;
939 	if (unp2 == NULL) {
940 		lwkt_reltoken(&unp_token);
941 		return;
942 	}
943 
944 	unp->unp_conn = NULL;
945 
946 	switch (unp->unp_socket->so_type) {
947 	case SOCK_DGRAM:
948 		LIST_REMOVE(unp, unp_reflink);
949 		soclrstate(unp->unp_socket, SS_ISCONNECTED);
950 		break;
951 	case SOCK_STREAM:
952 	case SOCK_SEQPACKET:
953 		soisdisconnected(unp->unp_socket);
954 		unp2->unp_conn = NULL;
955 		soisdisconnected(unp2->unp_socket);
956 		break;
957 	}
958 	lwkt_reltoken(&unp_token);
959 }
960 
961 #ifdef notdef
962 void
963 unp_abort(struct unpcb *unp)
964 {
965 	lwkt_gettoken(&unp_token);
966 	unp_detach(unp);
967 	lwkt_reltoken(&unp_token);
968 }
969 #endif
970 
971 static int
972 prison_unpcb(struct thread *td, struct unpcb *unp)
973 {
974 	struct proc *p;
975 
976 	if (td == NULL)
977 		return (0);
978 	if ((p = td->td_proc) == NULL)
979 		return (0);
980 	if (!p->p_ucred->cr_prison)
981 		return (0);
982 	if (p->p_fd->fd_rdir == unp->unp_rvnode)
983 		return (0);
984 	return (1);
985 }
986 
987 static int
988 unp_pcblist(SYSCTL_HANDLER_ARGS)
989 {
990 	int error, i, n;
991 	struct unpcb *unp, **unp_list;
992 	unp_gen_t gencnt;
993 	struct unp_head *head;
994 
995 	head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead);
996 
997 	KKASSERT(curproc != NULL);
998 
999 	/*
1000 	 * The process of preparing the PCB list is too time-consuming and
1001 	 * resource-intensive to repeat twice on every request.
1002 	 */
1003 	if (req->oldptr == NULL) {
1004 		n = unp_count;
1005 		req->oldidx = (n + n/8) * sizeof(struct xunpcb);
1006 		return 0;
1007 	}
1008 
1009 	if (req->newptr != NULL)
1010 		return EPERM;
1011 
1012 	lwkt_gettoken(&unp_token);
1013 
1014 	/*
1015 	 * OK, now we're committed to doing something.
1016 	 */
1017 	gencnt = unp_gencnt;
1018 	n = unp_count;
1019 
1020 	unp_list = kmalloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
1021 
1022 	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
1023 	     unp = LIST_NEXT(unp, unp_link)) {
1024 		if (unp->unp_gencnt <= gencnt && !prison_unpcb(req->td, unp))
1025 			unp_list[i++] = unp;
1026 	}
1027 	n = i;			/* in case we lost some during malloc */
1028 
1029 	error = 0;
1030 	for (i = 0; i < n; i++) {
1031 		unp = unp_list[i];
1032 		if (unp->unp_gencnt <= gencnt) {
1033 			struct xunpcb xu;
			bzero(&xu, sizeof(xu));	/* don't leak uninitialized stack */
1034 			xu.xu_len = sizeof xu;
1035 			xu.xu_unpp = unp;
1036 			/*
1037 			 * XXX - need more locking here to protect against
1038 			 * connect/disconnect races for SMP.
1039 			 */
1040 			if (unp->unp_addr)
1041 				bcopy(unp->unp_addr, &xu.xu_addr,
1042 				      unp->unp_addr->sun_len);
1043 			if (unp->unp_conn && unp->unp_conn->unp_addr)
1044 				bcopy(unp->unp_conn->unp_addr,
1045 				      &xu.xu_caddr,
1046 				      unp->unp_conn->unp_addr->sun_len);
1047 			bcopy(unp, &xu.xu_unp, sizeof *unp);
1048 			sotoxsocket(unp->unp_socket, &xu.xu_socket);
1049 			error = SYSCTL_OUT(req, &xu, sizeof xu);
1050 		}
1051 	}
1052 	lwkt_reltoken(&unp_token);
1053 	kfree(unp_list, M_TEMP);
1054 
1055 	return error;
1056 }
1057 
1058 SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD,
1059 	    (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
1060 	    "List of active local datagram sockets");
1061 SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD,
1062 	    (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
1063 	    "List of active local stream sockets");
1064 SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist, CTLFLAG_RD,
1065 	    (caddr_t)(long)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb",
1066 	    "List of active local seqpacket stream sockets");
1067 
1068 static void
1069 unp_shutdown(struct unpcb *unp)
1070 {
1071 	struct socket *so;
1072 
1073 	if ((unp->unp_socket->so_type == SOCK_STREAM ||
1074 	     unp->unp_socket->so_type == SOCK_SEQPACKET) &&
1075 	    unp->unp_conn != NULL && (so = unp->unp_conn->unp_socket)) {
1076 		socantrcvmore(so);
1077 	}
1078 }
1079 
1080 static void
1081 unp_drop(struct unpcb *unp, int err)
1082 {
1083 	struct socket *so = unp->unp_socket;
1084 
1085 	so->so_error = err;
1086 	unp_disconnect(unp);
1087 }
1088 
1089 #ifdef notdef
1090 void
1091 unp_drain(void)
1092 {
1093 	lwkt_gettoken(&unp_token);
1094 	lwkt_reltoken(&unp_token);
1095 }
1096 #endif
1097 
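/*
 * Externalize SCM_RIGHTS:  convert the in-kernel struct file pointers
 * carried in a rights control message into open descriptors in the
 * receiving process and adjust cmsg_len for the different element size.
 * If the descriptors cannot all be allocated the rights are discarded
 * and EMSGSIZE is returned.
 */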
1098 int
1099 unp_externalize(struct mbuf *rights)
1100 {
1101 	struct thread *td = curthread;
1102 	struct proc *p = td->td_proc;		/* XXX */
1103 	struct lwp *lp = td->td_lwp;
1104 	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
1105 	int *fdp;
1106 	int i;
1107 	struct file **rp;
1108 	struct file *fp;
1109 	int newfds = (cm->cmsg_len - (CMSG_DATA(cm) - (u_char *)cm))
1110 		/ sizeof (struct file *);
1111 	int f;
1112 
1113 	lwkt_gettoken(&unp_token);
1114 
1115 	/*
1116 	 * If the new FDs will not fit, free them all.
1117 	 */
1118 	if (!fdavail(p, newfds)) {
1119 		rp = (struct file **)CMSG_DATA(cm);
1120 		for (i = 0; i < newfds; i++) {
1121 			fp = *rp;
1122 			/*
1123 			 * Zero the pointer before calling unp_discard,
1124 			 * since it may end up in unp_gc().
1125 			 */
1126 			*rp++ = 0;
1127 			unp_discard(fp, NULL);
1128 		}
1129 		lwkt_reltoken(&unp_token);
1130 		return (EMSGSIZE);
1131 	}
1132 
1133 	/*
1134 	 * now change each pointer to an fd in the global table to
1135 	 * an integer that is the index to the local fd table entry
1136 	 * that we set up to point to the global one we are transferring.
1137 	 * If sizeof (struct file *) is bigger than or equal to sizeof int,
1138 	 * then do it in forward order. In that case, an integer will
1139 	 * always come in the same place or before its corresponding
1140 	 * struct file pointer.
1141 	 * If sizeof (struct file *) is smaller than sizeof int, then
1142 	 * do it in reverse order.
1143 	 */
1144 	if (sizeof (struct file *) >= sizeof (int)) {
1145 		fdp = (int *)CMSG_DATA(cm);
1146 		rp = (struct file **)CMSG_DATA(cm);
1147 		for (i = 0; i < newfds; i++) {
1148 			if (fdalloc(p, 0, &f))
1149 				panic("unp_externalize");
1150 			fp = *rp++;
1151 			unp_fp_externalize(lp, fp, f);
1152 			*fdp++ = f;
1153 		}
1154 	} else {
1155 		fdp = (int *)CMSG_DATA(cm) + newfds - 1;
1156 		rp = (struct file **)CMSG_DATA(cm) + newfds - 1;
1157 		for (i = 0; i < newfds; i++) {
1158 			if (fdalloc(p, 0, &f))
1159 				panic("unp_externalize");
1160 			fp = *rp--;
1161 			unp_fp_externalize(lp, fp, f);
1162 			*fdp-- = f;
1163 		}
1164 	}
1165 
1166 	/*
1167 	 * Adjust length, in case sizeof(struct file *) and sizeof(int)
1168 	 * differs.
1169 	 */
1170 	cm->cmsg_len = CMSG_LEN(newfds * sizeof(int));
1171 	rights->m_len = cm->cmsg_len;
1172 
1173 	lwkt_reltoken(&unp_token);
1174 	return (0);
1175 }
1176 
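/*
 * Install (fp) in the receiving process's descriptor table at (fd),
 * substituting a freshly allocated placeholder file if the original was
 * revoked, then drop the in-flight message accounting on the file.
 * Called with a NULL (lp) from unp_revoke_gc() just to drop the
 * accounting.
 */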
1177 static void
1178 unp_fp_externalize(struct lwp *lp, struct file *fp, int fd)
1179 {
1180 	struct file *fx;
1181 	int error;
1182 
1183 	lwkt_gettoken(&unp_token);
1184 
1185 	if (lp) {
1186 		KKASSERT(fd >= 0);
1187 		if (fp->f_flag & FREVOKED) {
1188 			kprintf("Warning: revoked fp exiting unix socket\n");
1189 			fx = NULL;
1190 			error = falloc(lp, &fx, NULL);
1191 			if (error == 0)
1192 				fsetfd(lp->lwp_proc->p_fd, fx, fd);
1193 			else
1194 				fsetfd(lp->lwp_proc->p_fd, NULL, fd);
1195 			fdrop(fx);
1196 		} else {
1197 			fsetfd(lp->lwp_proc->p_fd, fp, fd);
1198 		}
1199 	}
1200 	spin_lock(&unp_spin);
1201 	fp->f_msgcount--;
1202 	unp_rights--;
1203 	spin_unlock(&unp_spin);
1204 	fdrop(fp);
1205 
1206 	lwkt_reltoken(&unp_token);
1207 }
1208 
1209 
1210 void
1211 unp_init(void)
1212 {
1213 	LIST_INIT(&unp_dhead);
1214 	LIST_INIT(&unp_shead);
1215 	spin_init(&unp_spin);
1216 }
1217 
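/*
 * Internalize a control message sent on a unix domain socket.  SCM_CREDS
 * messages are filled in with the sender's credentials; SCM_RIGHTS
 * messages have their descriptor numbers converted to held struct file
 * pointers (growing the mbuf to a cluster if needed) and the in-flight
 * counts are bumped.
 *
 * Userland view, for reference (illustrative sketch using the standard
 * CMSG_* macros to pass one descriptor along with one byte of data):
 *
 *	char data = 0;
 *	struct iovec iov = { &data, 1 };
 *	union { struct cmsghdr hdr; char buf[CMSG_SPACE(sizeof(int))]; } cm;
 *	struct msghdr msg;
 *
 *	bzero(&msg, sizeof(msg));
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cm.buf;
 *	msg.msg_controllen = CMSG_SPACE(sizeof(int));
 *	cm.hdr.cmsg_level = SOL_SOCKET;
 *	cm.hdr.cmsg_type = SCM_RIGHTS;
 *	cm.hdr.cmsg_len = CMSG_LEN(sizeof(int));
 *	*(int *)CMSG_DATA(&cm.hdr) = fd_to_pass;
 *	sendmsg(s, &msg, 0);
 */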
1218 static int
1219 unp_internalize(struct mbuf *control, struct thread *td)
1220 {
1221 	struct proc *p = td->td_proc;
1222 	struct filedesc *fdescp;
1223 	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1224 	struct file **rp;
1225 	struct file *fp;
1226 	int i, fd, *fdp;
1227 	struct cmsgcred *cmcred;
1228 	int oldfds;
1229 	u_int newlen;
1230 	int error;
1231 
1232 	KKASSERT(p);
1233 	lwkt_gettoken(&unp_token);
1234 
1235 	fdescp = p->p_fd;
1236 	if ((cm->cmsg_type != SCM_RIGHTS && cm->cmsg_type != SCM_CREDS) ||
1237 	    cm->cmsg_level != SOL_SOCKET ||
1238 	    CMSG_ALIGN(cm->cmsg_len) != control->m_len) {
1239 		error = EINVAL;
1240 		goto done;
1241 	}
1242 
1243 	/*
1244 	 * Fill in credential information.
1245 	 */
1246 	if (cm->cmsg_type == SCM_CREDS) {
1247 		cmcred = (struct cmsgcred *)CMSG_DATA(cm);
1248 		cmcred->cmcred_pid = p->p_pid;
1249 		cmcred->cmcred_uid = p->p_ucred->cr_ruid;
1250 		cmcred->cmcred_gid = p->p_ucred->cr_rgid;
1251 		cmcred->cmcred_euid = p->p_ucred->cr_uid;
1252 		cmcred->cmcred_ngroups = MIN(p->p_ucred->cr_ngroups,
1253 							CMGROUP_MAX);
1254 		for (i = 0; i < cmcred->cmcred_ngroups; i++)
1255 			cmcred->cmcred_groups[i] = p->p_ucred->cr_groups[i];
1256 		error = 0;
1257 		goto done;
1258 	}
1259 
1260 	/*
1261 	 * cmsghdr may not be aligned, do not allow calculation(s) to
1262 	 * go negative.
1263 	 */
1264 	if (cm->cmsg_len < CMSG_LEN(0)) {
1265 		error = EINVAL;
1266 		goto done;
1267 	}
1268 
1269 	oldfds = (cm->cmsg_len - CMSG_LEN(0)) / sizeof (int);
1270 
1271 	/*
1272 	 * Check that all the FDs passed in refer to legal, open files.
1273 	 * If not, reject the entire operation.
1274 	 */
1275 	fdp = (int *)CMSG_DATA(cm);
1276 	for (i = 0; i < oldfds; i++) {
1277 		fd = *fdp++;
1278 		if ((unsigned)fd >= fdescp->fd_nfiles ||
1279 		    fdescp->fd_files[fd].fp == NULL) {
1280 			error = EBADF;
1281 			goto done;
1282 		}
1283 		if (fdescp->fd_files[fd].fp->f_type == DTYPE_KQUEUE) {
1284 			error = EOPNOTSUPP;
1285 			goto done;
1286 		}
1287 	}
1288 	/*
1289 	 * Now replace the integer FDs with pointers to the associated
1290 	 * global file table entries.
1291 	 * Allocate a bigger buffer as necessary, but if a single cluster
1292 	 * is not enough, return E2BIG.
1293 	 */
1294 	newlen = CMSG_LEN(oldfds * sizeof(struct file *));
1295 	if (newlen > MCLBYTES) {
1296 		error = E2BIG;
1297 		goto done;
1298 	}
1299 	if (newlen - control->m_len > M_TRAILINGSPACE(control)) {
1300 		if (control->m_flags & M_EXT) {
1301 			error = E2BIG;
1302 			goto done;
1303 		}
1304 		MCLGET(control, MB_WAIT);
1305 		if (!(control->m_flags & M_EXT)) {
1306 			error = ENOBUFS;
1307 			goto done;
1308 		}
1309 
1310 		/* copy the data to the cluster */
1311 		memcpy(mtod(control, char *), cm, cm->cmsg_len);
1312 		cm = mtod(control, struct cmsghdr *);
1313 	}
1314 
1315 	/*
1316 	 * Adjust length, in case sizeof(struct file *) and sizeof(int)
1317 	 * differs.
1318 	 */
1319 	cm->cmsg_len = newlen;
1320 	control->m_len = CMSG_ALIGN(newlen);
1321 
1322 	/*
1323 	 * Transform the file descriptors into struct file pointers.
1324 	 * If sizeof (struct file *) is bigger than or equal to sizeof int,
1325 	 * then do it in reverse order so that the ints won't get
1326 	 * overwritten before we are done with them.
1327 	 * If sizeof (struct file *) is smaller than sizeof int, then
1328 	 * do it in forward order.
1329 	 */
1330 	if (sizeof (struct file *) >= sizeof (int)) {
1331 		fdp = (int *)CMSG_DATA(cm) + oldfds - 1;
1332 		rp = (struct file **)CMSG_DATA(cm) + oldfds - 1;
1333 		for (i = 0; i < oldfds; i++) {
1334 			fp = fdescp->fd_files[*fdp--].fp;
1335 			*rp-- = fp;
1336 			fhold(fp);
1337 			spin_lock(&unp_spin);
1338 			fp->f_msgcount++;
1339 			unp_rights++;
1340 			spin_unlock(&unp_spin);
1341 		}
1342 	} else {
1343 		fdp = (int *)CMSG_DATA(cm);
1344 		rp = (struct file **)CMSG_DATA(cm);
1345 		for (i = 0; i < oldfds; i++) {
1346 			fp = fdescp->fd_files[*fdp++].fp;
1347 			*rp++ = fp;
1348 			fhold(fp);
1349 			spin_lock(&unp_spin);
1350 			fp->f_msgcount++;
1351 			unp_rights++;
1352 			spin_unlock(&unp_spin);
1353 		}
1354 	}
1355 	error = 0;
1356 done:
1357 	lwkt_reltoken(&unp_token);
1358 	return error;
1359 }
1360 
1361 /*
1362  * Garbage collect in-transit file descriptors that get lost due to
1363  * loops (i.e. when a socket is sent to another process over itself,
1364  * and more complex situations).
1365  *
1366  * NOT MPSAFE - TODO socket flush code and maybe closef.  Rest is MPSAFE.
1367  */
1368 
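/*
 * Per-pass state for unp_gc():  extra_ref[], index, and maxindex track
 * the extra references taken on unreachable files, locked_fp is the file
 * whose receive buffer is currently being scanned, and defer counts the
 * files flagged FDEFER for another marking pass.
 */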
1369 struct unp_gc_info {
1370 	struct file **extra_ref;
1371 	struct file *locked_fp;
1372 	int defer;
1373 	int index;
1374 	int maxindex;
1375 };
1376 
1377 static void
1378 unp_gc(void)
1379 {
1380 	struct unp_gc_info info;
1381 	static boolean_t unp_gcing;
1382 	struct file **fpp;
1383 	int i;
1384 
1385 	spin_lock(&unp_spin);
1386 	if (unp_gcing) {
1387 		spin_unlock(&unp_spin);
1388 		return;
1389 	}
1390 	unp_gcing = TRUE;
1391 	spin_unlock(&unp_spin);
1392 
1393 	lwkt_gettoken(&unp_token);
1394 
1395 	/*
1396 	 * Before going through all this, mark all FDs as
1397 	 * NOT deferred and NOT externally accessible.
1398 	 */
1399 	info.defer = 0;
1400 	allfiles_scan_exclusive(unp_gc_clearmarks, NULL);
1401 	do {
1402 		allfiles_scan_exclusive(unp_gc_checkmarks, &info);
1403 	} while (info.defer);
1404 
1405 	/*
1406 	 * We grab an extra reference to each of the file table entries
1407 	 * that are not otherwise accessible and then free the rights
1408 	 * that are stored in messages on them.
1409 	 *
1410 	 * The bug in the original code is a little tricky, so I'll describe
1411 	 * what's wrong with it here.
1412 	 *
1413 	 * It is incorrect to simply unp_discard each entry for f_msgcount
1414 	 * times -- consider the case of sockets A and B that contain
1415 	 * references to each other.  On a last close of some other socket,
1416 	 * we trigger a gc since the number of outstanding rights (unp_rights)
1417 	 * is non-zero.  If during the sweep phase the gc code unp_discards,
1418 	 * we end up doing a (full) closef on the descriptor.  A closef on A
1419 	 * results in the following chain.  Closef calls soo_close, which
1420 	 * calls soclose.   Soclose calls first (through the switch
1421 	 * uipc_usrreq) unp_detach, which re-invokes unp_gc.  Unp_gc simply
1422 	 * returns because the previous instance had set unp_gcing, and
1423 	 * we return all the way back to soclose, which marks the socket
1424 	 * with SS_NOFDREF, and then calls sofree.  Sofree calls sorflush
1425 	 * to free up the rights that are queued in messages on the socket A,
1426 	 * i.e., the reference on B.  The sorflush calls via the dom_dispose
1427 	 * switch unp_dispose, which unp_scans with unp_discard.  This second
1428 	 * instance of unp_discard just calls closef on B.
1429 	 *
1430 	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
1431 	 * which results in another closef on A.  Unfortunately, A is already
1432 	 * being closed, and the descriptor has already been marked with
1433 	 * SS_NOFDREF, and soclose panics at this point.
1434 	 *
1435 	 * Here, we first take an extra reference to each inaccessible
1436 	 * descriptor.  Then, we call sorflush ourself, since we know
1437 	 * it is a Unix domain socket anyhow.  After we destroy all the
1438 	 * rights carried in messages, we do a last closef to get rid
1439 	 * of our extra reference.  This is the last close, and the
1440 	 * unp_detach etc will shut down the socket.
1441 	 *
1442 	 * 91/09/19, bsy@cs.cmu.edu
1443 	 */
1444 	info.extra_ref = kmalloc(256 * sizeof(struct file *), M_FILE, M_WAITOK);
1445 	info.maxindex = 256;
1446 
1447 	do {
1448 		/*
1449 		 * Look for matches
1450 		 */
1451 		info.index = 0;
1452 		allfiles_scan_exclusive(unp_gc_checkrefs, &info);
1453 
1454 		/*
1455 		 * For each FD on our hit list, do the following two things
1456 		 */
1457 		for (i = info.index, fpp = info.extra_ref; --i >= 0; ++fpp) {
1458 			struct file *tfp = *fpp;
1459 			if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL)
1460 				sorflush((struct socket *)(tfp->f_data));
1461 		}
1462 		for (i = info.index, fpp = info.extra_ref; --i >= 0; ++fpp)
1463 			closef(*fpp, NULL);
1464 	} while (info.index == info.maxindex);
1465 
1466 	lwkt_reltoken(&unp_token);
1467 
1468 	kfree((caddr_t)info.extra_ref, M_FILE);
1469 	unp_gcing = FALSE;
1470 }
1471 
1472 /*
1473  * MPSAFE - NOTE: filehead list and file pointer spinlocked on entry
1474  */
1475 static int
1476 unp_gc_checkrefs(struct file *fp, void *data)
1477 {
1478 	struct unp_gc_info *info = data;
1479 
1480 	if (fp->f_count == 0)
1481 		return(0);
1482 	if (info->index == info->maxindex)
1483 		return(-1);
1484 
1485 	/*
1486 	 * If all refs are from msgs, and it's not marked accessible
1487 	 * then it must be referenced from some unreachable cycle
1488 	 * of (shut-down) FDs, so include it in our
1489 	 * list of FDs to remove
1490 	 */
1491 	if (fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) {
1492 		info->extra_ref[info->index++] = fp;
1493 		fhold(fp);
1494 	}
1495 	return(0);
1496 }
1497 
1498 /*
1499  * MPSAFE - NOTE: filehead list and file pointer spinlocked on entry
1500  */
1501 static int
1502 unp_gc_clearmarks(struct file *fp, void *data __unused)
1503 {
1504 	atomic_clear_int(&fp->f_flag, FMARK | FDEFER);
1505 	return(0);
1506 }
1507 
1508 /*
1509  * MPSAFE - NOTE: filehead list and file pointer spinlocked on entry
1510  */
1511 static int
1512 unp_gc_checkmarks(struct file *fp, void *data)
1513 {
1514 	struct unp_gc_info *info = data;
1515 	struct socket *so;
1516 
1517 	/*
1518 	 * If the file is not open, skip it
1519 	 */
1520 	if (fp->f_count == 0)
1521 		return(0);
1522 	/*
1523 	 * If we already marked it as 'defer' in a
1524 	 * previous pass, then try to process it this time
1525 	 * and un-mark it.
1526 	 */
1527 	if (fp->f_flag & FDEFER) {
1528 		atomic_clear_int(&fp->f_flag, FDEFER);
1529 		--info->defer;
1530 	} else {
1531 		/*
1532 		 * If it's not deferred, then check if it's
1533 		 * already marked; if so, skip it.
1534 		 */
1535 		if (fp->f_flag & FMARK)
1536 			return(0);
1537 		/*
1538 		 * If all references are from messages
1539 		 * in transit, then skip it.  It's not
1540 		 * externally accessible.
1541 		 */
1542 		if (fp->f_count == fp->f_msgcount)
1543 			return(0);
1544 		/*
1545 		 * If it got this far then it must be
1546 		 * externally accessible.
1547 		 */
1548 		atomic_set_int(&fp->f_flag, FMARK);
1549 	}
1550 
1551 	/*
1552 	 * Either it was deferred, or it is externally
1553 	 * accessible and not already marked so.
1554 	 * Now check if it is possibly one of OUR sockets.
1555 	 */
1556 	if (fp->f_type != DTYPE_SOCKET ||
1557 	    (so = (struct socket *)fp->f_data) == NULL)
1558 		return(0);
1559 	if (so->so_proto->pr_domain != &localdomain ||
1560 	    !(so->so_proto->pr_flags & PR_RIGHTS))
1561 		return(0);
1562 #ifdef notdef
1563 	if (so->so_rcv.ssb_flags & SSB_LOCK) {
1564 		/*
1565 		 * This is problematical; it's not clear
1566 		 * we need to wait for the sockbuf to be
1567 		 * unlocked (on a uniprocessor, at least),
1568 		 * and it's also not clear what to do
1569 		 * if sbwait returns an error due to receipt
1570 		 * of a signal.  If sbwait does return
1571 		 * an error, we'll go into an infinite
1572 		 * loop.  Delete all of this for now.
1573 		 */
1574 		sbwait(&so->so_rcv);
1575 		goto restart;
1576 	}
1577 #endif
1578 	/*
1579 	 * So, OK, it's one of our sockets and it IS externally
1580 	 * accessible (or was deferred).  Now we look
1581 	 * to see if we hold any file descriptors in its
1582 	 * message buffers. Follow those links and mark them
1583 	 * as accessible too.
1584 	 */
1585 	info->locked_fp = fp;
1586 /*	spin_lock_wr(&so->so_rcv.sb_spin); */
1587 	unp_scan(so->so_rcv.ssb_mb, unp_mark, info);
1588 /*	spin_unlock_wr(&so->so_rcv.sb_spin);*/
1589 	return (0);
1590 }
1591 
1592 /*
1593  * Scan all unix domain sockets and replace any revoked file pointers
1594  * found with the dummy file pointer fx.  We don't worry about races
1595  * against file pointers being read out as those are handled in the
1596  * externalize code.
1597  */
1598 
1599 #define REVOKE_GC_MAXFILES	32
1600 
1601 struct unp_revoke_gc_info {
1602 	struct file	*fx;
1603 	struct file	*fary[REVOKE_GC_MAXFILES];
1604 	int		fcount;
1605 };
1606 
1607 void
1608 unp_revoke_gc(struct file *fx)
1609 {
1610 	struct unp_revoke_gc_info info;
1611 	int i;
1612 
1613 	lwkt_gettoken(&unp_token);
1614 	info.fx = fx;
1615 	do {
1616 		info.fcount = 0;
1617 		allfiles_scan_exclusive(unp_revoke_gc_check, &info);
1618 		for (i = 0; i < info.fcount; ++i)
1619 			unp_fp_externalize(NULL, info.fary[i], -1);
1620 	} while (info.fcount == REVOKE_GC_MAXFILES);
1621 	lwkt_reltoken(&unp_token);
1622 }
1623 
1624 /*
1625  * Check for and replace revoked descriptors.
1626  *
1627  * WARNING:  This routine is not allowed to block.
1628  */
1629 static int
1630 unp_revoke_gc_check(struct file *fps, void *vinfo)
1631 {
1632 	struct unp_revoke_gc_info *info = vinfo;
1633 	struct file *fp;
1634 	struct socket *so;
1635 	struct mbuf *m0;
1636 	struct mbuf *m;
1637 	struct file **rp;
1638 	struct cmsghdr *cm;
1639 	int i;
1640 	int qfds;
1641 
1642 	/*
1643 	 * Is this a unix domain socket with rights-passing abilities?
1644 	 */
1645 	if (fps->f_type != DTYPE_SOCKET)
1646 		return (0);
1647 	if ((so = (struct socket *)fps->f_data) == NULL)
1648 		return(0);
1649 	if (so->so_proto->pr_domain != &localdomain)
1650 		return(0);
1651 	if ((so->so_proto->pr_flags & PR_RIGHTS) == 0)
1652 		return(0);
1653 
1654 	/*
1655 	 * Scan the mbufs for control messages and replace any revoked
1656 	 * descriptors we find.
1657 	 */
1658 	m0 = so->so_rcv.ssb_mb;
1659 	while (m0) {
1660 		for (m = m0; m; m = m->m_next) {
1661 			if (m->m_type != MT_CONTROL)
1662 				continue;
1663 			if (m->m_len < sizeof(*cm))
1664 				continue;
1665 			cm = mtod(m, struct cmsghdr *);
1666 			if (cm->cmsg_level != SOL_SOCKET ||
1667 			    cm->cmsg_type != SCM_RIGHTS) {
1668 				continue;
1669 			}
1670 			qfds = (cm->cmsg_len - CMSG_LEN(0)) / sizeof(void *);
1671 			rp = (struct file **)CMSG_DATA(cm);
1672 			for (i = 0; i < qfds; i++) {
1673 				fp = rp[i];
1674 				if (fp->f_flag & FREVOKED) {
1675 					kprintf("Warning: Removing revoked fp from unix domain socket queue\n");
1676 					fhold(info->fx);
1677 					info->fx->f_msgcount++;
1678 					unp_rights++;
1679 					rp[i] = info->fx;
1680 					info->fary[info->fcount++] = fp;
1681 				}
1682 				if (info->fcount == REVOKE_GC_MAXFILES)
1683 					break;
1684 			}
1685 			if (info->fcount == REVOKE_GC_MAXFILES)
1686 				break;
1687 		}
1688 		m0 = m0->m_nextpkt;
1689 		if (info->fcount == REVOKE_GC_MAXFILES)
1690 			break;
1691 	}
1692 
1693 	/*
1694 	 * Stop the scan if we filled up our array.
1695 	 */
1696 	if (info->fcount == REVOKE_GC_MAXFILES)
1697 		return(-1);
1698 	return(0);
1699 }
1700 
1701 void
1702 unp_dispose(struct mbuf *m)
1703 {
1704 	lwkt_gettoken(&unp_token);
1705 	if (m)
1706 		unp_scan(m, unp_discard, NULL);
1707 	lwkt_reltoken(&unp_token);
1708 }
1709 
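/*
 * Cache the listening process's credentials in the pcb at listen() time
 * so unp_connect() can hand them to connecting peers later (see the
 * LOCAL_PEERCRED handling in uipc_ctloutput()).
 */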
1710 static int
1711 unp_listen(struct unpcb *unp, struct thread *td)
1712 {
1713 	struct proc *p = td->td_proc;
1714 
1715 	KKASSERT(p);
1716 	lwkt_gettoken(&unp_token);
1717 	cru2x(p->p_ucred, &unp->unp_peercred);
1718 	unp->unp_flags |= UNP_HAVEPCCACHED;
1719 	lwkt_reltoken(&unp_token);
1720 	return (0);
1721 }
1722 
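/*
 * Walk a chain of mbuf records and invoke (*op) on every struct file
 * pointer found in SCM_RIGHTS control messages.
 */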
1723 static void
1724 unp_scan(struct mbuf *m0, void (*op)(struct file *, void *), void *data)
1725 {
1726 	struct mbuf *m;
1727 	struct file **rp;
1728 	struct cmsghdr *cm;
1729 	int i;
1730 	int qfds;
1731 
1732 	while (m0) {
1733 		for (m = m0; m; m = m->m_next) {
1734 			if (m->m_type == MT_CONTROL &&
1735 			    m->m_len >= sizeof(*cm)) {
1736 				cm = mtod(m, struct cmsghdr *);
1737 				if (cm->cmsg_level != SOL_SOCKET ||
1738 				    cm->cmsg_type != SCM_RIGHTS)
1739 					continue;
1740 				qfds = (cm->cmsg_len - CMSG_LEN(0)) /
1741 					sizeof(void *);
1742 				rp = (struct file **)CMSG_DATA(cm);
1743 				for (i = 0; i < qfds; i++)
1744 					(*op)(*rp++, data);
1745 				break;		/* XXX, but saves time */
1746 			}
1747 		}
1748 		m0 = m0->m_nextpkt;
1749 	}
1750 }
1751 
1752 static void
1753 unp_mark(struct file *fp, void *data)
1754 {
1755 	struct unp_gc_info *info = data;
1756 
1757 	if ((fp->f_flag & FMARK) == 0) {
1758 		++info->defer;
1759 		atomic_set_int(&fp->f_flag, FMARK | FDEFER);
1760 	}
1761 }
1762 
1763 static void
1764 unp_discard(struct file *fp, void *data __unused)
1765 {
1766 	spin_lock(&unp_spin);
1767 	fp->f_msgcount--;
1768 	unp_rights--;
1769 	spin_unlock(&unp_spin);
1770 	closef(fp, NULL);
1771 }
1772 
1773