xref: /openbsd-src/sys/kern/uipc_usrreq.c (revision a28daedfc357b214be5c701aa8ba8adb29a7f1c2)
1 /*	$OpenBSD: uipc_usrreq.c,v 1.45 2009/02/22 07:47:22 otto Exp $	*/
2 /*	$NetBSD: uipc_usrreq.c,v 1.18 1996/02/09 19:00:50 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1989, 1991, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 #include <sys/filedesc.h>
39 #include <sys/domain.h>
40 #include <sys/protosw.h>
41 #include <sys/socket.h>
42 #include <sys/socketvar.h>
43 #include <sys/unpcb.h>
44 #include <sys/un.h>
45 #include <sys/namei.h>
46 #include <sys/vnode.h>
47 #include <sys/file.h>
48 #include <sys/stat.h>
49 #include <sys/mbuf.h>
50 
51 /*
52  * Unix communications domain.
53  *
54  * TODO:
55  *	SEQPACKET, RDM
56  *	rethink name space problems
57  *	need a proper out-of-band
58  */
/* Name returned for unbound sockets when a peer/local name is requested. */
struct	sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };
ino_t	unp_ino;			/* prototype for fake inode numbers */
61 
62 /*ARGSUSED*/
/*
 * Protocol user-request switch for the UNIX (local) domain.
 *
 * "req" selects the operation (PRU_*); the meaning of "m", "nam" and
 * "control" depends on the request.  Note that for PRU_SENSE "m" is
 * really a struct stat pointer, not an mbuf.  On the common exit path
 * (release:) any mbufs still owned here are freed; PRU_SENSE and
 * PRU_RCVOOB return directly and bypass that cleanup.
 *
 * Returns 0 or an errno value.
 */
int
uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control, struct proc *p)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;
	int error = 0;

	if (req == PRU_CONTROL)
		return (EOPNOTSUPP);
	/* Control data (SCM_RIGHTS etc.) is only meaningful for PRU_SEND. */
	if (req != PRU_SEND && control && control->m_len) {
		error = EOPNOTSUPP;
		goto release;
	}
	/* Every request except attach needs an existing pcb. */
	if (unp == NULL && req != PRU_ATTACH) {
		error = EINVAL;
		goto release;
	}
	switch (req) {

	case PRU_ATTACH:
		if (unp) {
			error = EISCONN;
			break;
		}
		error = unp_attach(so);
		break;

	case PRU_DETACH:
		unp_detach(unp);
		break;

	case PRU_BIND:
		error = unp_bind(unp, nam, p);
		break;

	case PRU_LISTEN:
		/* Only sockets bound to a file system name may listen. */
		if (unp->unp_vnode == NULL)
			error = EINVAL;
		break;

	case PRU_CONNECT:
		error = unp_connect(so, nam, p);
		break;

	case PRU_CONNECT2:
		/* socketpair(2): "nam" carries the second socket here. */
		error = unp_connect2(so, (struct socket *)nam);
		break;

	case PRU_DISCONNECT:
		unp_disconnect(unp);
		break;

	case PRU_ACCEPT:
		/*
		 * Pass back name of connected socket,
		 * if it was bound and we are still connected
		 * (our peer may have closed already!).
		 */
		if (unp->unp_conn && unp->unp_conn->unp_addr) {
			nam->m_len = unp->unp_conn->unp_addr->m_len;
			bcopy(mtod(unp->unp_conn->unp_addr, caddr_t),
			    mtod(nam, caddr_t), (unsigned)nam->m_len);
		} else {
			nam->m_len = sizeof(sun_noname);
			*(mtod(nam, struct sockaddr *)) = sun_noname;
		}
		break;

	case PRU_SHUTDOWN:
		socantsendmore(so);
		unp_shutdown(unp);
		break;

	case PRU_RCVD:
		switch (so->so_type) {

		case SOCK_DGRAM:
			/* Datagram sockets never ask for PRU_RCVD. */
			panic("uipc 1");
			/*NOTREACHED*/

		case SOCK_STREAM:
#define	rcv (&so->so_rcv)
#define snd (&so2->so_snd)
			if (unp->unp_conn == NULL)
				break;
			so2 = unp->unp_conn->unp_socket;
			/*
			 * Adjust backpressure on sender
			 * and wakeup any waiting to write.
			 */
			snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt;
			unp->unp_mbcnt = rcv->sb_mbcnt;
			snd->sb_hiwat += unp->unp_cc - rcv->sb_cc;
			unp->unp_cc = rcv->sb_cc;
			sowwakeup(so2);
#undef snd
#undef rcv
			break;

		default:
			panic("uipc 2");
		}
		break;

	case PRU_SEND:
		/* Convert passed descriptors into file pointers first. */
		if (control && (error = unp_internalize(control, p)))
			break;
		switch (so->so_type) {

		case SOCK_DGRAM: {
			struct sockaddr *from;

			if (nam) {
				/* Temporary connect for a sendto(2). */
				if (unp->unp_conn) {
					error = EISCONN;
					break;
				}
				error = unp_connect(so, nam, p);
				if (error)
					break;
			} else {
				if (unp->unp_conn == NULL) {
					error = ENOTCONN;
					break;
				}
			}
			so2 = unp->unp_conn->unp_socket;
			if (unp->unp_addr)
				from = mtod(unp->unp_addr, struct sockaddr *);
			else
				from = &sun_noname;
			/* On success the receiver now owns m and control. */
			if (sbappendaddr(&so2->so_rcv, from, m, control)) {
				sorwakeup(so2);
				m = NULL;
				control = NULL;
			} else
				error = ENOBUFS;
			/* Undo the temporary connect from above. */
			if (nam)
				unp_disconnect(unp);
			break;
		}

		case SOCK_STREAM:
#define	rcv (&so2->so_rcv)
#define	snd (&so->so_snd)
			if (so->so_state & SS_CANTSENDMORE) {
				error = EPIPE;
				break;
			}
			if (unp->unp_conn == NULL) {
				error = ENOTCONN;
				break;
			}
			so2 = unp->unp_conn->unp_socket;
			/*
			 * Send to paired receive port, and then reduce
			 * send buffer hiwater marks to maintain backpressure.
			 * Wake up readers.
			 */
			if (control) {
				if (sbappendcontrol(rcv, m, control))
					control = NULL;
			} else
				sbappend(rcv, m);
			snd->sb_mbmax -=
			    rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
			unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
			snd->sb_hiwat -= rcv->sb_cc - unp->unp_conn->unp_cc;
			unp->unp_conn->unp_cc = rcv->sb_cc;
			sorwakeup(so2);
			m = NULL;
#undef snd
#undef rcv
			break;

		default:
			panic("uipc 4");
		}
		/* we need to undo unp_internalize in case of errors */
		if (control && error)
			unp_dispose(control);
		break;

	case PRU_ABORT:
		unp_drop(unp, ECONNABORTED);
		break;

	case PRU_SENSE:
		/* fstat(2): "m" is really a struct stat pointer here. */
		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
		if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) {
			so2 = unp->unp_conn->unp_socket;
			((struct stat *) m)->st_blksize += so2->so_rcv.sb_cc;
		}
		((struct stat *) m)->st_dev = NODEV;
		/* Hand out a fake inode number on first fstat(2). */
		if (unp->unp_ino == 0)
			unp->unp_ino = unp_ino++;
		((struct stat *) m)->st_atim =
		    ((struct stat *) m)->st_mtim =
		    ((struct stat *) m)->st_ctim = unp->unp_ctime;
		((struct stat *) m)->st_ino = unp->unp_ino;
		/* "m" is not an mbuf; skip the release cleanup. */
		return (0);

	case PRU_RCVOOB:
		return (EOPNOTSUPP);

	case PRU_SENDOOB:
		error = EOPNOTSUPP;
		break;

	case PRU_SOCKADDR:
		if (unp->unp_addr) {
			nam->m_len = unp->unp_addr->m_len;
			bcopy(mtod(unp->unp_addr, caddr_t),
			    mtod(nam, caddr_t), (unsigned)nam->m_len);
		} else
			nam->m_len = 0;
		break;

	case PRU_PEERADDR:
		if (unp->unp_conn && unp->unp_conn->unp_addr) {
			nam->m_len = unp->unp_conn->unp_addr->m_len;
			bcopy(mtod(unp->unp_conn->unp_addr, caddr_t),
			    mtod(nam, caddr_t), (unsigned)nam->m_len);
		} else
			nam->m_len = 0;
		break;

	case PRU_PEEREID:
		/* Peer credentials recorded at connect/bind time, if any. */
		if (unp->unp_flags & UNP_FEIDS) {
			nam->m_len = sizeof(struct unpcbid);
			bcopy((caddr_t)(&(unp->unp_connid)),
			    mtod(nam, caddr_t), (unsigned)nam->m_len);
		} else
			nam->m_len = 0;
		break;

	case PRU_SLOWTIMO:
		break;

	default:
		panic("piusrreq");
	}
release:
	if (control)
		m_freem(control);
	if (m)
		m_freem(m);
	return (error);
}
313 
314 /*
315  * Both send and receive buffers are allocated PIPSIZ bytes of buffering
316  * for stream sockets, although the total for sender and receiver is
317  * actually only PIPSIZ.
318  * Datagram sockets really use the sendspace as the maximum datagram size,
319  * and don't really want to reserve the sendspace.  Their recvspace should
320  * be large enough for at least one max-size datagram plus address.
321  */
#define	PIPSIZ	4096			/* default per-direction stream buffering */
u_long	unpst_sendspace = PIPSIZ;	/* stream send buffer reservation */
u_long	unpst_recvspace = PIPSIZ;	/* stream receive buffer reservation */
u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
u_long	unpdg_recvspace = 4*1024;	/* datagram receive buffer reservation */

int	unp_rights;			/* file descriptors in flight */
329 
330 int
331 unp_attach(struct socket *so)
332 {
333 	struct unpcb *unp;
334 	int error;
335 
336 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
337 		switch (so->so_type) {
338 
339 		case SOCK_STREAM:
340 			error = soreserve(so, unpst_sendspace, unpst_recvspace);
341 			break;
342 
343 		case SOCK_DGRAM:
344 			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
345 			break;
346 
347 		default:
348 			panic("unp_attach");
349 		}
350 		if (error)
351 			return (error);
352 	}
353 	unp = malloc(sizeof(*unp), M_PCB, M_NOWAIT|M_ZERO);
354 	if (unp == NULL)
355 		return (ENOBUFS);
356 	unp->unp_socket = so;
357 	so->so_pcb = unp;
358 	getnanotime(&unp->unp_ctime);
359 	return (0);
360 }
361 
/*
 * Tear down a pcb on final close: release the bound vnode (if any),
 * break the connection, reset any datagram senders still referencing
 * us, free the address mbuf and the pcb itself.  If descriptors are
 * in flight anywhere, flush our receive buffer now and run the
 * garbage collector.
 */
void
unp_detach(struct unpcb *unp)
{

	if (unp->unp_vnode) {
		unp->unp_vnode->v_socket = NULL;
		vrele(unp->unp_vnode);
		unp->unp_vnode = NULL;
	}
	if (unp->unp_conn)
		unp_disconnect(unp);
	/* Drop every datagram sender still connected to us. */
	while (unp->unp_refs)
		unp_drop(unp->unp_refs, ECONNRESET);
	soisdisconnected(unp->unp_socket);
	unp->unp_socket->so_pcb = NULL;
	m_freem(unp->unp_addr);
	if (unp_rights) {
		/*
		 * Normally the receive buffer is flushed later,
		 * in sofree, but if our receive buffer holds references
		 * to descriptors that are now garbage, we will dispose
		 * of those descriptor references after the garbage collector
		 * gets them (resulting in a "panic: closef: count < 0").
		 */
		sorflush(unp->unp_socket);
		free(unp, M_PCB);
		unp_gc();
	} else
		free(unp, M_PCB);
}
392 
/*
 * Bind the socket to a path name in the file system by creating a
 * VSOCK vnode for it.  The sun_path from "nam" is copied into a
 * local buffer and NUL-terminated before the namei() lookup; a name
 * that already exists yields EADDRINUSE.  On success the socket's
 * effective credentials are recorded for LOCAL_PEEREID consumers.
 */
int
unp_bind(struct unpcb *unp, struct mbuf *nam, struct proc *p)
{
	struct sockaddr_un *soun = mtod(nam, struct sockaddr_un *);
	struct vnode *vp;
	struct vattr vattr;
	int error, namelen;
	struct nameidata nd;
	char buf[MLEN];

	/* Only one bind per socket. */
	if (unp->unp_vnode != NULL)
		return (EINVAL);
	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
	if (namelen <= 0 || namelen >= MLEN)
		return EINVAL;
	strncpy(buf, soun->sun_path, namelen);
	buf[namelen] = 0;       /* null-terminate the string */
	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE, buf, p);
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	if ((error = namei(&nd)) != 0)
		return (error);
	vp = nd.ni_vp;
	if (vp != NULL) {
		/* Name exists already: unwind the lookup locks/refs. */
		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		return (EADDRINUSE);
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	/* New node gets full permissions, masked by the process umask. */
	vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask;
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	if (error)
		return (error);
	vp = nd.ni_vp;
	vp->v_socket = unp->unp_socket;
	unp->unp_vnode = vp;
	unp->unp_addr = m_copy(nam, 0, (int)M_COPYALL);
	/* Record our effective credentials for the future peer. */
	unp->unp_connid.unp_euid = p->p_ucred->cr_uid;
	unp->unp_connid.unp_egid = p->p_ucred->cr_gid;
	unp->unp_flags |= UNP_FEIDSBIND;
	VOP_UNLOCK(vp, 0, p);
	return (0);
}
440 
/*
 * Connect socket "so" to the socket bound at the path name carried
 * in "nam".  The target vnode must be a VSOCK with write access.
 * For connection-oriented types a fresh server-side socket is spawned
 * with sonewconn() and effective credentials are exchanged in both
 * directions for LOCAL_PEEREID.
 */
int
unp_connect(struct socket *so, struct mbuf *nam, struct proc *p)
{
	struct sockaddr_un *soun = mtod(nam, struct sockaddr_un *);
	struct vnode *vp;
	struct socket *so2, *so3;
	struct unpcb *unp, *unp2, *unp3;
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p);
	/* Make sure the path is NUL-terminated within the mbuf. */
	if (nam->m_data + nam->m_len == &nam->m_dat[MLEN]) {	/* XXX */
		if (*(mtod(nam, caddr_t) + nam->m_len - 1) != 0)
			return (EMSGSIZE);
	} else
		*(mtod(nam, caddr_t) + nam->m_len) = 0;
	if ((error = namei(&nd)) != 0)
		return (error);
	vp = nd.ni_vp;
	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto bad;
	}
	if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0)
		goto bad;
	so2 = vp->v_socket;
	/* Nobody is listening on the name anymore. */
	if (so2 == NULL) {
		error = ECONNREFUSED;
		goto bad;
	}
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto bad;
	}
	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		/* Spawn the server-side socket for this connection. */
		if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
		    (so3 = sonewconn(so2, 0)) == 0) {
			error = ECONNREFUSED;
			goto bad;
		}
		unp = sotounpcb(so);
		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);
		if (unp2->unp_addr)
			unp3->unp_addr =
			    m_copy(unp2->unp_addr, 0, (int)M_COPYALL);
		/* Give the server our effective credentials... */
		unp3->unp_connid.unp_euid = p->p_ucred->cr_uid;
		unp3->unp_connid.unp_egid = p->p_ucred->cr_gid;
		unp3->unp_flags |= UNP_FEIDS;
		so2 = so3;
		/* ...and take the listener's, recorded at bind time. */
		if (unp2->unp_flags & UNP_FEIDSBIND) {
			unp->unp_connid.unp_euid = unp2->unp_connid.unp_euid;
			unp->unp_connid.unp_egid = unp2->unp_connid.unp_egid;
			unp->unp_flags |= UNP_FEIDS;
		}
	}
	error = unp_connect2(so, so2);
bad:
	vput(vp);
	return (error);
}
502 
503 int
504 unp_connect2(struct socket *so, struct socket *so2)
505 {
506 	struct unpcb *unp = sotounpcb(so);
507 	struct unpcb *unp2;
508 
509 	if (so2->so_type != so->so_type)
510 		return (EPROTOTYPE);
511 	unp2 = sotounpcb(so2);
512 	unp->unp_conn = unp2;
513 	switch (so->so_type) {
514 
515 	case SOCK_DGRAM:
516 		unp->unp_nextref = unp2->unp_refs;
517 		unp2->unp_refs = unp;
518 		soisconnected(so);
519 		break;
520 
521 	case SOCK_STREAM:
522 		unp2->unp_conn = unp;
523 		soisconnected(so);
524 		soisconnected(so2);
525 		break;
526 
527 	default:
528 		panic("unp_connect2");
529 	}
530 	return (0);
531 }
532 
533 void
534 unp_disconnect(struct unpcb *unp)
535 {
536 	struct unpcb *unp2 = unp->unp_conn;
537 
538 	if (unp2 == NULL)
539 		return;
540 	unp->unp_conn = NULL;
541 	switch (unp->unp_socket->so_type) {
542 
543 	case SOCK_DGRAM:
544 		if (unp2->unp_refs == unp)
545 			unp2->unp_refs = unp->unp_nextref;
546 		else {
547 			unp2 = unp2->unp_refs;
548 			for (;;) {
549 				if (unp2 == NULL)
550 					panic("unp_disconnect");
551 				if (unp2->unp_nextref == unp)
552 					break;
553 				unp2 = unp2->unp_nextref;
554 			}
555 			unp2->unp_nextref = unp->unp_nextref;
556 		}
557 		unp->unp_nextref = NULL;
558 		unp->unp_socket->so_state &= ~SS_ISCONNECTED;
559 		break;
560 
561 	case SOCK_STREAM:
562 		soisdisconnected(unp->unp_socket);
563 		unp2->unp_conn = NULL;
564 		soisdisconnected(unp2->unp_socket);
565 		break;
566 	}
567 }
568 
#ifdef notdef
/* Abort hook: disabled; would simply detach the pcb. */
unp_abort(struct unpcb *unp)
{
	unp_detach(unp);
}
#endif
575 
576 void
577 unp_shutdown(struct unpcb *unp)
578 {
579 	struct socket *so;
580 
581 	if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
582 	    (so = unp->unp_conn->unp_socket))
583 		socantrcvmore(so);
584 }
585 
/*
 * Report an error to the socket and disconnect it.  If the socket is
 * an as-yet-unaccepted connection on a listen queue (so_head is set),
 * no descriptor will ever reference it, so the socket and pcb are
 * freed here as well.
 */
void
unp_drop(struct unpcb *unp, int errno)
{
	struct socket *so = unp->unp_socket;

	so->so_error = errno;
	unp_disconnect(unp);
	if (so->so_head) {
		/* Detach pcb before sofree() so it is not freed twice. */
		so->so_pcb = NULL;
		sofree(so);
		m_freem(unp->unp_addr);
		free(unp, M_PCB);
	}
}
600 
#ifdef notdef
/* Drain hook: disabled and intentionally empty. */
unp_drain(void)
{

}
#endif
607 
/*
 * Receive side of descriptor passing: convert the struct file
 * pointers stored in an SCM_RIGHTS message back into descriptors in
 * the receiving process.  The descriptor count is trimmed to what
 * fits in the caller's controllen.  On any error every file reference
 * carried by the message is discarded.  Returns 0 or an errno value.
 */
int
unp_externalize(struct mbuf *rights, socklen_t controllen)
{
	struct proc *p = curproc;		/* XXX */
	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
	int i, *fdp;
	struct file **rp;
	struct file *fp;
	int nfds, error = 0;

	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
	    sizeof(struct file *);
	if (controllen < CMSG_ALIGN(sizeof(struct cmsghdr)))
		controllen = 0;
	else
		controllen -= CMSG_ALIGN(sizeof(struct cmsghdr));
	/* Never deliver more descriptors than the caller's buffer holds. */
	if (nfds > controllen / sizeof(int))
		nfds = controllen / sizeof(int);

	rp = (struct file **)CMSG_DATA(cm);

	/* Temporary array for the new descriptor numbers. */
	fdp = malloc(nfds * sizeof(int), M_TEMP, M_WAITOK);

	/* Make sure the recipient should be able to see the descriptors.. */
	if (p->p_fd->fd_rdir != NULL) {
		rp = (struct file **)CMSG_DATA(cm);
		for (i = 0; i < nfds; i++) {
			fp = *rp++;
			/*
			 * No to block devices.  If passing a directory,
			 * make sure that it is underneath the root.
			 */
			if (fp->f_type == DTYPE_VNODE) {
				struct vnode *vp = (struct vnode *)fp->f_data;

				if (vp->v_type == VBLK ||
				    (vp->v_type == VDIR &&
				    !vn_isunder(vp, p->p_fd->fd_rdir, p))) {
					error = EPERM;
					break;
				}
			}
		}
	}

restart:
	fdplock(p->p_fd);
	if (error != 0) {
		/* Failure: toss every file reference in the message. */
		rp = ((struct file **)CMSG_DATA(cm));
		for (i = 0; i < nfds; i++) {
			fp = *rp;
			/*
			 * zero the pointer before calling unp_discard,
			 * since it may end up in unp_gc()..
			 */
			*rp++ = NULL;
			unp_discard(fp);
		}
		goto out;
	}

	/*
	 * First loop -- allocate file descriptor table slots for the
	 * new descriptors.
	 */
	rp = ((struct file **)CMSG_DATA(cm));
	for (i = 0; i < nfds; i++) {
		/* bcopy: the pointer in the mbuf may be misaligned. */
		bcopy(rp, &fp, sizeof(fp));
		rp++;
		if ((error = fdalloc(p, 0, &fdp[i])) != 0) {
			/*
			 * Back out what we've done so far.
			 */
			for (--i; i >= 0; i--)
				fdremove(p->p_fd, fdp[i]);

			if (error == ENOSPC) {
				/* Grow the table and retry from scratch. */
				fdexpand(p);
				error = 0;
			} else {
				/*
				 * This is the error that has historically
				 * been returned, and some callers may
				 * expect it.
				 */
				error = EMSGSIZE;
			}
			fdpunlock(p->p_fd);
			goto restart;
		}

		/*
		 * Make the slot reference the descriptor so that
		 * fdalloc() works properly.. We finalize it all
		 * in the loop below.
		 */
		p->p_fd->fd_ofiles[fdp[i]] = fp;
	}

	/*
	 * Now that adding them has succeeded, update all of the
	 * descriptor passing state.
	 */
	rp = (struct file **)CMSG_DATA(cm);
	for (i = 0; i < nfds; i++) {
		fp = *rp++;
		fp->f_msgcount--;
		unp_rights--;
	}

	/*
	 * Copy temporary array to message and adjust length, in case of
	 * transition from large struct file pointers to ints.
	 */
	memcpy(CMSG_DATA(cm), fdp, nfds * sizeof(int));
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
	rights->m_len = CMSG_LEN(nfds * sizeof(int));
 out:
	fdpunlock(p->p_fd);
	free(fdp, M_TEMP);
	return (error);
}
730 
/*
 * Send side of descriptor passing: convert the descriptor numbers in
 * an SCM_RIGHTS message into struct file pointers held in the mbuf,
 * taking an extra file reference for each one while it is in flight.
 * The mbuf may be upgraded to a cluster since pointers are wider than
 * ints.  On failure all references taken so far are released.
 */
int
unp_internalize(struct mbuf *control, struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
	struct file **rp, *fp;
	int i, error;
	int nfds, *ip, fd, neededspace;

	/*
	 * Check for two potential msg_controllen values because
	 * IETF stuck their nose in a place it does not belong.
	 */
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
	    !(cm->cmsg_len == control->m_len ||
	    control->m_len == CMSG_ALIGN(cm->cmsg_len)))
		return (EINVAL);
	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof (int);

	/* Make sure we have room for the struct file pointers */
morespace:
	neededspace = CMSG_SPACE(nfds * sizeof(struct file *)) -
	    control->m_len;
	if (neededspace > M_TRAILINGSPACE(control)) {
		/* if we already have a cluster, the message is just too big */
		if (control->m_flags & M_EXT)
			return (E2BIG);

		/* allocate a cluster and try again */
		MCLGET(control, M_WAIT);
		if ((control->m_flags & M_EXT) == 0)
			return (ENOBUFS);       /* allocation failed */

		/* copy the data to the cluster */
		memcpy(mtod(control, char *), cm, cm->cmsg_len);
		cm = mtod(control, struct cmsghdr *);
		goto morespace;
	}

	/* adjust message & mbuf to note amount of space actually used. */
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(struct file *));
	control->m_len = CMSG_SPACE(nfds * sizeof(struct file *));

	/*
	 * Walk the arrays backwards so the wider file pointers never
	 * overwrite descriptor ints that have not been read yet.
	 */
	ip = ((int *)CMSG_DATA(cm)) + nfds - 1;
	rp = ((struct file **)CMSG_DATA(cm)) + nfds - 1;
	for (i = 0; i < nfds; i++) {
		bcopy(ip, &fd, sizeof fd);
		ip--;
		if ((fp = fd_getfile(fdp, fd)) == NULL) {
			error = EBADF;
			goto fail;
		}
		/* Refuse to let the reference counters overflow. */
		if (fp->f_count == LONG_MAX-2 ||
		    fp->f_msgcount == LONG_MAX-2) {
			error = EDEADLK;
			goto fail;
		}
		bcopy(&fp, rp, sizeof fp);
		rp--;
		fp->f_count++;
		fp->f_msgcount++;
		unp_rights++;
	}
	return (0);
fail:
	/* Back out what we just did. */
	for ( ; i > 0; i--) {
		rp++;
		bcopy(rp, &fp, sizeof(fp));
		fp->f_count--;
		fp->f_msgcount--;
		unp_rights--;
	}

	return (error);
}
807 
int	unp_defer, unp_gcing;	/* gc state: deferred rescans / gc in progress */
extern	struct domain unixdomain;
810 
/*
 * Mark-and-sweep garbage collector for descriptors in flight inside
 * SCM_RIGHTS messages.  Files reachable from any live descriptor are
 * marked (FMARK); sockets whose receive buffers carry rights defer a
 * rescan (FDEFER).  Unreachable files (reference cycles) are then
 * flushed and closed.  Re-entry is prevented via unp_gcing.
 */
void
unp_gc(void)
{
	struct file *fp, *nextfp;
	struct socket *so;
	struct file **extra_ref, **fpp;
	int nunref, i;

	if (unp_gcing)
		return;
	unp_gcing = 1;
	unp_defer = 0;
	/* Clear all marks before the scan. */
	LIST_FOREACH(fp, &filehead, f_list)
		fp->f_flag &= ~(FMARK|FDEFER);
	do {
		LIST_FOREACH(fp, &filehead, f_list) {
			if (fp->f_flag & FDEFER) {
				/* Deferred by unp_mark(): scan it now. */
				fp->f_flag &= ~FDEFER;
				unp_defer--;
			} else {
				if (fp->f_count == 0)
					continue;
				if (fp->f_flag & FMARK)
					continue;
				/* All refs are in-flight: not a root. */
				if (fp->f_count == fp->f_msgcount)
					continue;
			}
			fp->f_flag |= FMARK;

			if (fp->f_type != DTYPE_SOCKET ||
			    (so = (struct socket *)fp->f_data) == NULL)
				continue;
			if (so->so_proto->pr_domain != &unixdomain ||
			    (so->so_proto->pr_flags&PR_RIGHTS) == 0)
				continue;
#ifdef notdef
			if (so->so_rcv.sb_flags & SB_LOCK) {
				/*
				 * This is problematical; it's not clear
				 * we need to wait for the sockbuf to be
				 * unlocked (on a uniprocessor, at least),
				 * and it's also not clear what to do
				 * if sbwait returns an error due to receipt
				 * of a signal.  If sbwait does return
				 * an error, we'll go into an infinite
				 * loop.  Delete all of this for now.
				 */
				(void) sbwait(&so->so_rcv);
				goto restart;
			}
#endif
			/* Mark every file queued in this socket's rights. */
			unp_scan(so->so_rcv.sb_mb, unp_mark, 0);
		}
	} while (unp_defer);
	/*
	 * We grab an extra reference to each of the file table entries
	 * that are not otherwise accessible and then free the rights
	 * that are stored in messages on them.
	 *
	 * The bug in the original code is a little tricky, so I'll describe
	 * what's wrong with it here.
	 *
	 * It is incorrect to simply unp_discard each entry for f_msgcount
	 * times -- consider the case of sockets A and B that contain
	 * references to each other.  On a last close of some other socket,
	 * we trigger a gc since the number of outstanding rights (unp_rights)
	 * is non-zero.  If during the sweep phase the gc code un_discards,
	 * we end up doing a (full) closef on the descriptor.  A closef on A
	 * results in the following chain.  Closef calls soo_close, which
	 * calls soclose.   Soclose calls first (through the switch
	 * uipc_usrreq) unp_detach, which re-invokes unp_gc.  Unp_gc simply
	 * returns because the previous instance had set unp_gcing, and
	 * we return all the way back to soclose, which marks the socket
	 * with SS_NOFDREF, and then calls sofree.  Sofree calls sorflush
	 * to free up the rights that are queued in messages on the socket A,
	 * i.e., the reference on B.  The sorflush calls via the dom_dispose
	 * switch unp_dispose, which unp_scans with unp_discard.  This second
	 * instance of unp_discard just calls closef on B.
	 *
	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
	 * which results in another closef on A.  Unfortunately, A is already
	 * being closed, and the descriptor has already been marked with
	 * SS_NOFDREF, and soclose panics at this point.
	 *
	 * Here, we first take an extra reference to each inaccessible
	 * descriptor.  Then, we call sorflush ourself, since we know
	 * it is a Unix domain socket anyhow.  After we destroy all the
	 * rights carried in messages, we do a last closef to get rid
	 * of our extra reference.  This is the last close, and the
	 * unp_detach etc will shut down the socket.
	 *
	 * 91/09/19, bsy@cs.cmu.edu
	 */
	extra_ref = malloc(nfiles * sizeof(struct file *), M_FILE, M_WAITOK);
	for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref;
	    fp != NULL; fp = nextfp) {
		nextfp = LIST_NEXT(fp, f_list);
		if (fp->f_count == 0)
			continue;
		/* Unmarked with only in-flight refs: unreachable garbage. */
		if (fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) {
			*fpp++ = fp;
			nunref++;
			FREF(fp);
			fp->f_count++;
		}
	}
	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
	        if ((*fpp)->f_type == DTYPE_SOCKET && (*fpp)->f_data != NULL)
		        sorflush((struct socket *)(*fpp)->f_data);
	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
		(void) closef(*fpp, NULL);
	free((caddr_t)extra_ref, M_FILE);
	unp_gcing = 0;
}
925 
926 void
927 unp_dispose(struct mbuf *m)
928 {
929 
930 	if (m)
931 		unp_scan(m, unp_discard, 1);
932 }
933 
/*
 * Walk the control mbufs of every packet in chain "m0", applying
 * "op" to each file pointer found in SCM_RIGHTS messages.  When
 * "discard" is set, each stored pointer is cleared before the
 * callback runs so the reference cannot be found again (e.g. by a
 * re-entered gc).
 */
void
unp_scan(struct mbuf *m0, void (*op)(struct file *), int discard)
{
	struct mbuf *m;
	struct file **rp, *fp;
	struct cmsghdr *cm;
	int i;
	int qfds;

	while (m0) {
		for (m = m0; m; m = m->m_next) {
			if (m->m_type == MT_CONTROL &&
			    m->m_len >= sizeof(*cm)) {
				cm = mtod(m, struct cmsghdr *);
				if (cm->cmsg_level != SOL_SOCKET ||
				    cm->cmsg_type != SCM_RIGHTS)
					continue;
				qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof *cm))
				    / sizeof(struct file *);
				rp = (struct file **)CMSG_DATA(cm);
				for (i = 0; i < qfds; i++) {
					fp = *rp;
					if (discard)
						*rp = 0;
					(*op)(fp);
					rp++;
				}
				break;		/* XXX, but saves time */
			}
		}
		m0 = m0->m_nextpkt;
	}
}
967 
968 void
969 unp_mark(struct file *fp)
970 {
971 	if (fp == NULL)
972 		return;
973 
974 	if (fp->f_flag & FMARK)
975 		return;
976 
977 	if (fp->f_flag & FDEFER)
978 		return;
979 
980 	if (fp->f_type == DTYPE_SOCKET) {
981 		unp_defer++;
982 		fp->f_flag |= FDEFER;
983 	} else {
984 		fp->f_flag |= FMARK;
985 	}
986 }
987 
/*
 * Throw away a file reference that was in transit: the message
 * carrying it is being destroyed, so undo the accounting done by
 * unp_internalize() and drop the in-flight reference via closef().
 */
void
unp_discard(struct file *fp)
{

	if (fp == NULL)
		return;
	FREF(fp);
	fp->f_msgcount--;
	unp_rights--;
	(void) closef(fp, NULL);
}
999