xref: /netbsd-src/sys/kern/kern_descrip.c (revision 8ac07aec990b9d2e483062509d0a9fa5b4f57cf2)
1 /*	$NetBSD: kern_descrip.c,v 1.177 2008/04/24 18:39:23 ad Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the NetBSD
18  *	Foundation, Inc. and its contributors.
19  * 4. Neither the name of The NetBSD Foundation nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
25  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
26  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
27  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
30  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33  * POSSIBILITY OF SUCH DAMAGE.
34  */
35 
36 /*
37  * Copyright (c) 1982, 1986, 1989, 1991, 1993
38  *	The Regents of the University of California.  All rights reserved.
39  * (c) UNIX System Laboratories, Inc.
40  * All or some portions of this file are derived from material licensed
41  * to the University of California by American Telephone and Telegraph
42  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
43  * the permission of UNIX System Laboratories, Inc.
44  *
45  * Redistribution and use in source and binary forms, with or without
46  * modification, are permitted provided that the following conditions
47  * are met:
48  * 1. Redistributions of source code must retain the above copyright
49  *    notice, this list of conditions and the following disclaimer.
50  * 2. Redistributions in binary form must reproduce the above copyright
51  *    notice, this list of conditions and the following disclaimer in the
52  *    documentation and/or other materials provided with the distribution.
53  * 3. Neither the name of the University nor the names of its contributors
54  *    may be used to endorse or promote products derived from this software
55  *    without specific prior written permission.
56  *
57  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
58  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
59  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
60  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
61  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
62  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
63  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
64  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
65  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
66  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
67  * SUCH DAMAGE.
68  *
69  *	@(#)kern_descrip.c	8.8 (Berkeley) 2/14/95
70  */
71 
72 /*
73  * File descriptor management.
74  */
75 
76 #include <sys/cdefs.h>
77 __KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.177 2008/04/24 18:39:23 ad Exp $");
78 
79 #include <sys/param.h>
80 #include <sys/systm.h>
81 #include <sys/filedesc.h>
82 #include <sys/kernel.h>
83 #include <sys/vnode.h>
84 #include <sys/proc.h>
85 #include <sys/file.h>
86 #include <sys/namei.h>
87 #include <sys/socket.h>
88 #include <sys/socketvar.h>
89 #include <sys/stat.h>
90 #include <sys/ioctl.h>
91 #include <sys/fcntl.h>
92 #include <sys/malloc.h>
93 #include <sys/pool.h>
94 #include <sys/syslog.h>
95 #include <sys/unistd.h>
96 #include <sys/resourcevar.h>
97 #include <sys/conf.h>
98 #include <sys/event.h>
99 #include <sys/kauth.h>
100 #include <sys/atomic.h>
101 #include <sys/mount.h>
102 #include <sys/syscallargs.h>
103 #include <sys/cpu.h>
104 
/* Pool cache constructors / destructors, defined below. */
static int	cwdi_ctor(void *, void *, int);
static void	cwdi_dtor(void *, void *);
static int	file_ctor(void *, void *, int);
static void	file_dtor(void *, void *);
static int	fdfile_ctor(void *, void *, int);
static void	fdfile_dtor(void *, void *);
static int	filedesc_ctor(void *, void *, int);
static void	filedesc_dtor(void *, void *);
static int	filedescopen(dev_t, int, int, lwp_t *);

kmutex_t	filelist_lock;	/* lock on filehead */
struct filelist	filehead;	/* head of list of open files */
u_int		nfiles;		/* actual number of open files */

/* Pool caches, created once by fd_sys_init(). */
static pool_cache_t cwdi_cache;
static pool_cache_t filedesc_cache;
static pool_cache_t file_cache;
static pool_cache_t fdfile_cache;

MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");

/* Character device switch backed by filedescopen(). */
const struct cdevsw filedesc_cdevsw = {
	filedescopen, noclose, noread, nowrite, noioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_OTHER | D_MPSAFE,
};

/* For ease of reading. */
__strong_alias(fd_putvnode,fd_putfile)
__strong_alias(fd_putsock,fd_putfile)
134 
/*
 * Initialize the descriptor system: the global open-file list lock and
 * the pool caches backing file_t, fdfile_t, cwdinfo and filedesc_t
 * allocations.  Called once during boot.
 */
void
fd_sys_init(void)
{

	mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);

	file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
	    0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
	KASSERT(file_cache != NULL);

	/* fdfile_t allocations are frequent: use a large per-CPU cache. */
	fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0,
	    PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor,
	    NULL);
	KASSERT(fdfile_cache != NULL);

	cwdi_cache = pool_cache_init(sizeof(struct cwdinfo), coherency_unit,
	    0, 0, "cwdi", NULL, IPL_NONE, cwdi_ctor, cwdi_dtor, NULL);
	KASSERT(cwdi_cache != NULL);

	filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
	    0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
	    NULL);
	KASSERT(filedesc_cache != NULL);
}
162 
/*
 * Scan an allocation bitmap for the first clear (free) bit at or after
 * `want'.  `bitmap' holds `bits' bits packed into 32-bit words; a set
 * bit means "in use".  Returns the bit number found, or -1 if none.
 * Used on both levels of the two-level fd_lomap/fd_himap scheme.
 */
static int
fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
{
	int i, off, maxoff;
	uint32_t sub;

	KASSERT(mutex_owned(&fdp->fd_lock));

	if (want > bits)
		return -1;

	/* Split the starting point into word index and bit offset. */
	off = want >> NDENTRYSHIFT;
	i = want & NDENTRYMASK;
	if (i) {
		/*
		 * Partial first word: treat the bits below `want' as
		 * in-use by OR-ing them in, so they cannot be chosen.
		 */
		sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
		if (sub != ~0)
			goto found;
		off++;
	}

	/* Remaining whole words: find the first that is not full. */
	maxoff = NDLOSLOTS(bits);
	while (off < maxoff) {
		if ((sub = bitmap[off]) != ~0)
			goto found;
		off++;
	}

	return (-1);

 found:
	/* ffs() is 1-based; convert back to a global bit number. */
	return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
}
195 
/*
 * Find the highest in-use descriptor number strictly below `last',
 * or -1 if there is none.  Used to recompute fd_lastfile after the
 * current highest descriptor is released.
 */
static int
fd_last_set(filedesc_t *fd, int last)
{
	int off, i;
	fdfile_t **ofiles = fd->fd_ofiles;
	uint32_t *bitmap = fd->fd_lomap;

	KASSERT(mutex_owned(&fd->fd_lock));

	off = (last - 1) >> NDENTRYSHIFT;

	/* Skip whole bitmap words with no allocated descriptors. */
	while (off >= 0 && !bitmap[off])
		off--;

	if (off < 0)
		return (-1);

	/* Start at the highest descriptor covered by this word. */
	i = ((off + 1) << NDENTRYSHIFT) - 1;
	if (i >= last)
		i = last - 1;

	/* Linear scan down for an allocated slot. */
	/* XXX should use bitmap */
	/* XXXAD does not work for fd_copy() */
	while (i > 0 && (ofiles[i] == NULL || !ofiles[i]->ff_allocated))
		i--;

	return (i);
}
224 
/*
 * Mark descriptor `fd' as allocated: set its bit in the two-level
 * bitmap, flag the fdfile_t, and update fd_lastfile / fd_nused.
 * The slot must already have an fdfile_t with no file attached.
 */
void
fd_used(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_ofiles[fd];

	KASSERT(mutex_owned(&fdp->fd_lock));
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) == 0);
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
   	KASSERT(!ff->ff_allocated);

   	ff->ff_allocated = 1;
	fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK);
	/*
	 * If the low-level word just became full, set its summary bit
	 * in the high-level map so searches can skip the whole word.
	 */
	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) == 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK);
	}

	if ((int)fd > fdp->fd_lastfile) {
		fdp->fd_lastfile = fd;
	}

	/* Built-in slots (fd < NDFDFILE) are not counted in fd_nused. */
	if (fd >= NDFDFILE) {
		fdp->fd_nused++;
	} else {
		KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	}
}
257 
/*
 * Mark descriptor `fd' as free again: clear its bitmap bit, reset the
 * fdfile_t, and update fd_freefile / fd_lastfile / fd_nused.  The
 * inverse of fd_used().
 */
void
fd_unused(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_ofiles[fd];

	/*
	 * Don't assert the lock is held here, as we may be copying
	 * the table during exec() and it is not needed there.
	 * procfs and sysctl are locked out by proc::p_reflock.
	 *
	 * KASSERT(mutex_owned(&fdp->fd_lock));
	 */
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
   	KASSERT(ff->ff_allocated);

	/* Remember the lowest numbered free slot for fd_alloc(). */
	if (fd < fdp->fd_freefile) {
		fdp->fd_freefile = fd;
	}

	/*
	 * If the low-level word was full, clear its summary bit in the
	 * high-level map: the word is about to gain a free slot.
	 */
	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) != 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] &=
		    ~(1 << (off & NDENTRYMASK));
	}
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
	ff->ff_allocated = 0;

	/* If this was the highest open descriptor, find the next one. */
	KASSERT(fd <= fdp->fd_lastfile);
	if (fd == fdp->fd_lastfile) {
		fdp->fd_lastfile = fd_last_set(fdp, fd);
	}

	/* Built-in slots (fd < NDFDFILE) are not counted in fd_nused. */
	if (fd >= NDFDFILE) {
		KASSERT(fdp->fd_nused > 0);
		fdp->fd_nused--;
	} else {
		KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	}
}
303 
304 /*
305  * Custom version of fd_unused() for fd_copy(), where the descriptor
306  * table is not yet fully initialized.
307  */
308 static inline void
309 fd_zap(filedesc_t *fdp, unsigned fd)
310 {
311 	u_int off = fd >> NDENTRYSHIFT;
312 
313 	if (fd < fdp->fd_freefile) {
314 		fdp->fd_freefile = fd;
315 	}
316 
317 	if (fdp->fd_lomap[off] == ~0) {
318 		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
319 		    (1 << (off & NDENTRYMASK))) != 0);
320 		fdp->fd_himap[off >> NDENTRYSHIFT] &=
321 		    ~(1 << (off & NDENTRYMASK));
322 	}
323 	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
324 	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
325 }
326 
327 bool
328 fd_isused(filedesc_t *fdp, unsigned fd)
329 {
330 	u_int off = fd >> NDENTRYSHIFT;
331 
332 	KASSERT(fd < fdp->fd_nfiles);
333 
334 	return (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0;
335 }
336 
/*
 * Look up the file structure corresponding to a file descriptor
 * and return the file, holding a reference on the descriptor.
 * Returns NULL if the descriptor is out of range, unallocated,
 * or the file is closed/closing.  Release with fd_putfile().
 */
inline file_t *
fd_getfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;

	fdp = curlwp->l_fd;

	/*
	 * Look up the fdfile structure representing this descriptor.
	 * Ensure that we see fd_nfiles before fd_ofiles since we
	 * are doing this unlocked.  See fd_tryexpand().
	 */
	if (__predict_false(fd >= fdp->fd_nfiles)) {
		return NULL;
	}
	membar_consumer();
	ff = fdp->fd_ofiles[fd];
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	if (__predict_false(ff == NULL)) {
		return NULL;
	}

	/*
	 * Now get a reference to the descriptor.   Issue a memory
	 * barrier to ensure that we acquire the file pointer _after_
	 * adding a reference.  If no memory barrier, we could fetch
	 * a stale pointer.
	 */
	atomic_inc_uint(&ff->ff_refcnt);
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_enter();
#endif

	/*
	 * If the file is not open or is being closed then put the
	 * reference back.
	 */
	fp = ff->ff_file;
	if (__predict_true(fp != NULL)) {
		return fp;
	}
	fd_putfile(fd);
	return NULL;
}
387 
/*
 * Release a reference to a file descriptor acquired with fd_getfile().
 * If a close is pending on the descriptor, joins the close in progress
 * instead of simply dropping the count.
 */
void
fd_putfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	u_int u, v;

	fdp = curlwp->l_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd < fdp->fd_nfiles);
	KASSERT(ff != NULL);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	/*
	 * Ensure that any use of the file is complete and globally
	 * visible before dropping the final reference.  If no membar,
	 * the current CPU could still access memory associated with
	 * the file after it has been freed or recycled by another
	 * CPU.
	 */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_exit();
#endif

	/*
	 * Be optimistic and start out with the assumption that no other
	 * threads are trying to close the descriptor.  If the CAS fails,
	 * we lost a race and/or it's being closed.
	 */
	for (u = ff->ff_refcnt & FR_MASK;; u = v) {
		v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
		if (__predict_true(u == v)) {
			/* CAS succeeded: reference dropped, done. */
			return;
		}
		if (__predict_false((v & FR_CLOSING) != 0)) {
			/* A close is in flight; stop retrying. */
			break;
		}
	}

	/* Another thread is waiting to close the file: join it. */
	(void)fd_close(fd);
}
435 
436 /*
437  * Convenience wrapper around fd_getfile() that returns reference
438  * to a vnode.
439  */
440 int
441 fd_getvnode(unsigned fd, file_t **fpp)
442 {
443 	vnode_t *vp;
444 	file_t *fp;
445 
446 	fp = fd_getfile(fd);
447 	if (__predict_false(fp == NULL)) {
448 		return EBADF;
449 	}
450 	if (__predict_false(fp->f_type != DTYPE_VNODE)) {
451 		fd_putfile(fd);
452 		return EINVAL;
453 	}
454 	vp = fp->f_data;
455 	if (__predict_false(vp->v_type == VBAD)) {
456 		/* XXX Is this case really necessary? */
457 		fd_putfile(fd);
458 		return EBADF;
459 	}
460 	*fpp = fp;
461 	return 0;
462 }
463 
464 /*
465  * Convenience wrapper around fd_getfile() that returns reference
466  * to a socket.
467  */
468 int
469 fd_getsock(unsigned fd, struct socket **sop)
470 {
471 	file_t *fp;
472 
473 	fp = fd_getfile(fd);
474 	if (__predict_false(fp == NULL)) {
475 		return EBADF;
476 	}
477 	if (__predict_false(fp->f_type != DTYPE_SOCKET)) {
478 		fd_putfile(fd);
479 		return ENOTSOCK;
480 	}
481 	*sop = fp->f_data;
482 	return 0;
483 }
484 
485 /*
486  * Look up the file structure corresponding to a file descriptor
487  * and return it with a reference held on the file, not the
488  * descriptor.
489  *
490  * This is heavyweight and only used when accessing descriptors
491  * from a foreign process.  The caller must ensure that `p' does
492  * not exit or fork across this call.
493  *
494  * To release the file (not descriptor) reference, use closef().
495  */
496 file_t *
497 fd_getfile2(proc_t *p, unsigned fd)
498 {
499 	filedesc_t *fdp;
500 	fdfile_t *ff;
501 	file_t *fp;
502 
503 	fdp = p->p_fd;
504 	mutex_enter(&fdp->fd_lock);
505 	if (fd > fdp->fd_nfiles) {
506 		mutex_exit(&fdp->fd_lock);
507 		return NULL;
508 	}
509 	if ((ff = fdp->fd_ofiles[fd]) == NULL) {
510 		mutex_exit(&fdp->fd_lock);
511 		return NULL;
512 	}
513 	mutex_enter(&ff->ff_lock);
514 	if ((fp = ff->ff_file) == NULL) {
515 		mutex_exit(&ff->ff_lock);
516 		mutex_exit(&fdp->fd_lock);
517 		return NULL;
518 	}
519 	mutex_enter(&fp->f_lock);
520 	fp->f_count++;
521 	mutex_exit(&fp->f_lock);
522 	mutex_exit(&ff->ff_lock);
523 	mutex_exit(&fdp->fd_lock);
524 
525 	return fp;
526 }
527 
/*
 * Internal form of close.  Must be called with a reference to the
 * descriptor, and will drop the reference.  When all descriptor
 * references are dropped, releases the descriptor slot and a single
 * reference to the file structure.
 */
int
fd_close(unsigned fd)
{
	struct flock lf;
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;
	proc_t *p;
	lwp_t *l;

	l = curlwp;
	p = l->l_proc;
	fdp = l->l_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	mutex_enter(&ff->ff_lock);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	if (ff->ff_file == NULL) {
		/*
		 * Another user of the file is already closing, and is
		 * waiting for other users of the file to drain.  Release
		 * our reference, and wake up the closer.
		 */
		atomic_dec_uint(&ff->ff_refcnt);
		cv_broadcast(&ff->ff_closing);
		mutex_exit(&ff->ff_lock);

		/*
		 * An application error, so pretend that the descriptor
		 * was already closed.  We can't safely wait for it to
		 * be closed without potentially deadlocking.
		 */
		return (EBADF);
	}
	KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);

	/*
	 * There may be multiple users of this file within the process.
	 * Notify existing and new users that the file is closing.  This
	 * will prevent them from adding additional uses to this file
	 * while we are closing it.
	 */
	fp = ff->ff_file;
	ff->ff_file = NULL;
	ff->ff_exclose = 0;	/* clear per-descriptor exclose flag for reuse */

	/*
	 * We expect the caller to hold a descriptor reference - drop it.
	 * The reference count may increase beyond zero at this point due
	 * to an erroneous descriptor reference by an application, but
	 * fd_getfile() will notice that the file is being closed and drop
	 * the reference again.
	 */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_producer();
#endif
	if (__predict_false(atomic_dec_uint_nv(&ff->ff_refcnt) != 0)) {
		/*
		 * Wait for other references to drain.  This is typically
		 * an application error - the descriptor is being closed
		 * while still in use.
		 *
		 */
		atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
		/*
		 * Remove any knotes attached to the file.  A knote
		 * attached to the descriptor can hold references on it.
		 */
		if (!SLIST_EMPTY(&ff->ff_knlist)) {
			mutex_exit(&ff->ff_lock);
			knote_fdclose(fd);
			mutex_enter(&ff->ff_lock);
		}
		/*
		 * We need to see the count drop to zero at least once,
		 * in order to ensure that all pre-existing references
		 * have been drained.  New references past this point are
		 * of no interest.
		 */
		while ((ff->ff_refcnt & FR_MASK) != 0) {
			cv_wait(&ff->ff_closing, &ff->ff_lock);
		}
		atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
	} else {
		/* If no references, there must be no knotes. */
		KASSERT(SLIST_EMPTY(&ff->ff_knlist));
	}
	mutex_exit(&ff->ff_lock);

	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor.
	 */
	if ((p->p_flag & PK_ADVLOCK) != 0 && fp->f_type == DTYPE_VNODE) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void)VOP_ADVLOCK(fp->f_data, p, F_UNLCK, &lf, F_POSIX);
	}


	/* Free descriptor slot. */
	mutex_enter(&fdp->fd_lock);
	fd_unused(fdp, fd);
	mutex_exit(&fdp->fd_lock);

	/* Now drop reference to the file itself. */
	return closef(fp);
}
650 
651 /*
652  * Duplicate a file descriptor.
653  */
654 int
655 fd_dup(file_t *fp, int minfd, int *newp, int exclose)
656 {
657 	proc_t *p;
658 	int error;
659 
660 	p = curproc;
661 
662 	while ((error = fd_alloc(p, minfd, newp)) != 0) {
663 		if (error != ENOSPC) {
664 			return error;
665 		}
666 		fd_tryexpand(p);
667 	}
668 
669 	curlwp->l_fd->fd_ofiles[*newp]->ff_exclose = exclose;
670 	fd_affix(p, fp, *newp);
671 	return 0;
672 }
673 
/*
 * dup2 operation: install `fp' at descriptor `new', closing any file
 * already open on that descriptor first.
 */
int
fd_dup2(file_t *fp, unsigned new)
{
	filedesc_t *fdp;
	fdfile_t *ff;

	fdp = curlwp->l_fd;

	/*
	 * Ensure there are enough slots in the descriptor table,
	 * and allocate an fdfile_t up front in case we need it.
	 */
	while (new >= fdp->fd_nfiles) {
		fd_tryexpand(curproc);
	}
	ff = pool_cache_get(fdfile_cache, PR_WAITOK);

	/*
	 * If there is already a file open, close it.  If the file is
	 * half open, wait for it to be constructed before closing it.
	 * XXX Potential for deadlock here?
	 */
	mutex_enter(&fdp->fd_lock);
	while (fd_isused(fdp, new)) {
		mutex_exit(&fdp->fd_lock);
		if (fd_getfile(new) != NULL) {
			(void)fd_close(new);
		} else {
			/* XXX Crummy, but unlikely to happen. */
			kpause("dup2", false, 1, NULL);
		}
		mutex_enter(&fdp->fd_lock);
	}
	/* Install the pre-allocated fdfile_t if the slot has none. */
	if (fdp->fd_ofiles[new] == NULL) {
		KASSERT(new >= NDFDFILE);
		fdp->fd_ofiles[new] = ff;
		ff = NULL;
	}
	fd_used(fdp, new);
	mutex_exit(&fdp->fd_lock);

	/* Slot is now allocated.  Insert copy of the file. */
	fd_affix(curproc, fp, new);
	/* Return the fdfile_t to the cache if it was not needed. */
	if (ff != NULL) {
		pool_cache_put(fdfile_cache, ff);
	}
	return 0;
}
725 
/*
 * Drop reference to a file structure.  On the last reference, release
 * any BSD-style (flock) advisory lock, call the object's close routine
 * and free the file.  Returns the error from the close routine, if any.
 */
int
closef(file_t *fp)
{
	struct flock lf;
	int error;

	/*
	 * Drop reference.  If referenced elsewhere it's still open
	 * and we have nothing more to do.
	 */
	mutex_enter(&fp->f_lock);
	KASSERT(fp->f_count > 0);
	if (--fp->f_count > 0) {
		mutex_exit(&fp->f_lock);
		return 0;
	}
	KASSERT(fp->f_count == 0);
	mutex_exit(&fp->f_lock);

	/* We held the last reference - release locks, close and free. */
        if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
        	lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void)VOP_ADVLOCK(fp->f_data, fp, F_UNLCK, &lf, F_FLOCK);
	}
	if (fp->f_ops != NULL) {
		error = (*fp->f_ops->fo_close)(fp);
	} else {
		error = 0;
	}
	ffree(fp);

	return error;
}
765 
/*
 * Allocate a file descriptor for the process.
 *
 * On success, returns 0 and stores the new descriptor via `result'.
 * Returns ENOSPC when the table is full but below the limit (caller
 * should fd_tryexpand() and retry), or EMFILE at the limit.
 */
int
fd_alloc(proc_t *p, int want, int *result)
{
	filedesc_t *fdp;
	int i, lim, last, error;
	u_int off, new;
	fdfile_t *ff;

	KASSERT(p == curproc || p == &proc0);

	/* Pre-allocate an fdfile_t in case the chosen slot has none. */
	fdp = p->p_fd;
	ff = pool_cache_get(fdfile_cache, PR_WAITOK);
	KASSERT(ff->ff_refcnt == 0);
	KASSERT(ff->ff_file == NULL);

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.
	 */
	mutex_enter(&fdp->fd_lock);
	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
	last = min(fdp->fd_nfiles, lim);
	for (;;) {
		/* First find a high-level map word with free slots... */
		if ((i = want) < fdp->fd_freefile)
			i = fdp->fd_freefile;
		off = i >> NDENTRYSHIFT;
		new = fd_next_zero(fdp, fdp->fd_himap, off,
		    (last + NDENTRIES - 1) >> NDENTRYSHIFT);
		if (new == -1)
			break;
		/* ...then the free bit within that word. */
		i = fd_next_zero(fdp, &fdp->fd_lomap[new],
		    new > off ? 0 : i & NDENTRYMASK, NDENTRIES);
		if (i == -1) {
			/*
			 * Free file descriptor in this block was
			 * below want, try again with higher want.
			 */
			want = (new + 1) << NDENTRYSHIFT;
			continue;
		}
		i += (new << NDENTRYSHIFT);
		if (i >= last) {
			break;
		}
		/* Install the pre-allocated fdfile_t if the slot is bare. */
		if (fdp->fd_ofiles[i] == NULL) {
			KASSERT(i >= NDFDFILE);
			fdp->fd_ofiles[i] = ff;
		} else {
		   	pool_cache_put(fdfile_cache, ff);
		}
		KASSERT(fdp->fd_ofiles[i]->ff_file == NULL);
		fd_used(fdp, i);
		if (want <= fdp->fd_freefile) {
			fdp->fd_freefile = i;
		}
		*result = i;
		mutex_exit(&fdp->fd_lock);
		KASSERT(i >= NDFDFILE ||
		    fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
		return 0;
	}

	/* No space in current array.  Let the caller expand and retry. */
	error = (fdp->fd_nfiles >= lim) ? EMFILE : ENOSPC;
	mutex_exit(&fdp->fd_lock);
	pool_cache_put(fdfile_cache, ff);
	return error;
}
838 
/*
 * Expand a process' descriptor table.  Allocates the new arrays
 * before taking the lock; if the table changed in the meantime,
 * backs out and lets the caller retry.
 */
void
fd_tryexpand(proc_t *p)
{
	filedesc_t *fdp;
	int i, numfiles, oldnfiles;
	fdfile_t **newofile;
	uint32_t *newhimap, *newlomap;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	newhimap = NULL;
	newlomap = NULL;
	oldnfiles = fdp->fd_nfiles;

	/* Double the table size, or start with NDEXTENT entries. */
	if (oldnfiles < NDEXTENT)
		numfiles = NDEXTENT;
	else
		numfiles = 2 * oldnfiles;

	newofile = malloc(numfiles * sizeof(fdfile_t *), M_FILEDESC, M_WAITOK);
	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		newhimap = malloc(NDHISLOTS(numfiles) *
		    sizeof(uint32_t), M_FILEDESC, M_WAITOK);
		newlomap = malloc(NDLOSLOTS(numfiles) *
		    sizeof(uint32_t), M_FILEDESC, M_WAITOK);
	}

	mutex_enter(&fdp->fd_lock);
	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	if (fdp->fd_nfiles != oldnfiles) {
		/* fdp changed; caller must retry */
		mutex_exit(&fdp->fd_lock);
		free(newofile, M_FILEDESC);
		if (newhimap != NULL)
			free(newhimap, M_FILEDESC);
		if (newlomap != NULL)
			free(newlomap, M_FILEDESC);
		return;
	}

	/* Copy the existing ofile array and zero the new portion. */
	i = sizeof(fdfile_t *) * fdp->fd_nfiles;
	memcpy(newofile, fdp->fd_ofiles, i);
	memset((uint8_t *)newofile + i, 0, numfiles * sizeof(fdfile_t *) - i);

	/*
	 * Link old ofiles array into list to be discarded.  We defer
	 * freeing until process exit if the descriptor table is visible
	 * to other threads.
	 */
	if (oldnfiles > NDFILE) {
		if ((fdp->fd_refcnt | p->p_nlwps) > 1) {
			*(void **)fdp->fd_ofiles = fdp->fd_discard;
			fdp->fd_discard = fdp->fd_ofiles;
		} else {
			free(fdp->fd_ofiles, M_FILEDESC);
		}
	}

	/* Grow the allocation bitmaps the same way, if required. */
	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newhimap, fdp->fd_himap, i);
		memset((uint8_t *)newhimap + i, 0,
		    NDHISLOTS(numfiles) * sizeof(uint32_t) - i);

		i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newlomap, fdp->fd_lomap, i);
		memset((uint8_t *)newlomap + i, 0,
		    NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);

		/* Only free maps that were allocated here previously. */
		if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
			free(fdp->fd_himap, M_FILEDESC);
			free(fdp->fd_lomap, M_FILEDESC);
		}
		fdp->fd_himap = newhimap;
		fdp->fd_lomap = newlomap;
	}

	/*
	 * All other modifications must become globally visible before
	 * the change to fd_nfiles.  See fd_getfile().
	 */
	fdp->fd_ofiles = newofile;
	membar_producer();
	fdp->fd_nfiles = numfiles;
	mutex_exit(&fdp->fd_lock);

	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
}
932 
/*
 * Create a new open file structure and allocate a file descriptor
 * for the current process.  The pair is returned via `resultfp' and
 * `resultfd'; the file is made visible later with fd_affix(), or
 * backed out with fd_abort().
 */
int
fd_allocfile(file_t **resultfp, int *resultfd)
{
	file_t *fp;
	proc_t *p;
	int error;

	p = curproc;

	/* Retry the allocation, expanding the table as required. */
	while ((error = fd_alloc(p, 0, resultfd)) != 0) {
		if (error != ENOSPC) {
			return error;
		}
		fd_tryexpand(p);
	}

	fp = pool_cache_get(file_cache, PR_WAITOK);
	KASSERT(fp->f_count == 0);
	fp->f_cred = kauth_cred_get();
	kauth_cred_hold(fp->f_cred);

	/* Enforce the global limit on open files. */
	if (__predict_false(atomic_inc_uint_nv(&nfiles) >= maxfiles)) {
		fd_abort(p, fp, *resultfd);
		tablefull("file", "increase kern.maxfiles or MAXFILES");
		return ENFILE;
	}

	fp->f_advice = 0;
	fp->f_msgcount = 0;
	fp->f_offset = 0;
	fp->f_iflags = 0;
	*resultfp = fp;

	return 0;
}
972 
/*
 * Successful creation of a new descriptor: make visible to the process.
 * The slot must have been allocated beforehand (see fd_alloc()) and
 * must not yet have a file attached.
 */
void
fd_affix(proc_t *p, file_t *fp, unsigned fd)
{
	fdfile_t *ff;
	filedesc_t *fdp;

	KASSERT(p == curproc || p == &proc0);

	/* Add a reference to the file structure. */
	mutex_enter(&fp->f_lock);
	fp->f_count++;
	mutex_exit(&fp->f_lock);

	/*
	 * Insert the new file into the descriptor slot.
	 *
	 * The memory barriers provided by lock activity in this routine
	 * ensure that any updates to the file structure become globally
	 * visible before the file becomes visible to other LWPs in the
	 * current process.
	 */
	fdp = p->p_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);
	KASSERT(fd_isused(fdp, fd));
	KASSERT(fd >= NDFDFILE ||
	    fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);

	/* No need to lock in order to make file initially visible. */
	ff->ff_file = fp;
}
1010 
/*
 * Abort creation of a new descriptor: free descriptor slot and file.
 * `fp' may be NULL if no file structure was allocated yet.
 */
void
fd_abort(proc_t *p, file_t *fp, unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd >= NDFDFILE ||
	    fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);

	/* Release the descriptor slot. */
	mutex_enter(&fdp->fd_lock);
	KASSERT(fd_isused(fdp, fd));
	fd_unused(fdp, fd);
	mutex_exit(&fdp->fd_lock);

	/* And the file structure, if one was allocated. */
	if (fp != NULL) {
		ffree(fp);
	}
}
1037 
1038 /*
1039  * Free a file descriptor.
1040  */
1041 void
1042 ffree(file_t *fp)
1043 {
1044 
1045 	KASSERT(fp->f_count == 0);
1046 
1047 	atomic_dec_uint(&nfiles);
1048 	kauth_cred_free(fp->f_cred);
1049 	pool_cache_put(file_cache, fp);
1050 }
1051 
/*
 * Create an initial cwdinfo structure, using the same current and root
 * directories as curproc.  The new structure starts with one reference.
 */
struct cwdinfo *
cwdinit(void)
{
	struct cwdinfo *cwdi;
	struct cwdinfo *copy;

	cwdi = pool_cache_get(cwdi_cache, PR_WAITOK);
	copy = curproc->p_cwdi;

	/* Copy the vnode pointers and gain references under the lock. */
	rw_enter(&copy->cwdi_lock, RW_READER);
	cwdi->cwdi_cdir = copy->cwdi_cdir;
	if (cwdi->cwdi_cdir)
		VREF(cwdi->cwdi_cdir);
	cwdi->cwdi_rdir = copy->cwdi_rdir;
	if (cwdi->cwdi_rdir)
		VREF(cwdi->cwdi_rdir);
	cwdi->cwdi_edir = copy->cwdi_edir;
	if (cwdi->cwdi_edir)
		VREF(cwdi->cwdi_edir);
	cwdi->cwdi_cmask =  copy->cwdi_cmask;
	cwdi->cwdi_refcnt = 1;
	rw_exit(&copy->cwdi_lock);

	return (cwdi);
}
1081 
1082 static int
1083 cwdi_ctor(void *arg, void *obj, int flags)
1084 {
1085 	struct cwdinfo *cwdi = obj;
1086 
1087 	rw_init(&cwdi->cwdi_lock);
1088 
1089 	return 0;
1090 }
1091 
/*
 * Pool cache destructor for cwdinfo structures: tear down the lock.
 */
static void
cwdi_dtor(void *arg, void *obj)
{
	struct cwdinfo *cwdi = obj;

	rw_destroy(&cwdi->cwdi_lock);
}
1099 
/*
 * Pool cache constructor for file structures: zero the file_t,
 * initialize its lock and insert it onto the global list of files.
 */
static int
file_ctor(void *arg, void *obj, int flags)
{
	file_t *fp = obj;

	memset(fp, 0, sizeof(*fp));
	mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);

	/* filelist_lock serializes access to the global file list. */
	mutex_enter(&filelist_lock);
	LIST_INSERT_HEAD(&filehead, fp, f_list);
	mutex_exit(&filelist_lock);

	return 0;
}
1114 
/*
 * Pool cache destructor for file structures: remove the file_t from
 * the global file list and destroy its lock.
 */
static void
file_dtor(void *arg, void *obj)
{
	file_t *fp = obj;

	mutex_enter(&filelist_lock);
	LIST_REMOVE(fp, f_list);
	mutex_exit(&filelist_lock);

	mutex_destroy(&fp->f_lock);
}
1126 
/*
 * Pool cache constructor for fdfile structures: zero the record and
 * initialize the per-descriptor lock and close condition variable.
 */
static int
fdfile_ctor(void *arg, void *obj, int flags)
{
	fdfile_t *ff = obj;

	memset(ff, 0, sizeof(*ff));
	mutex_init(&ff->ff_lock, MUTEX_DEFAULT, IPL_NONE);
	/* ff_closing is waited on by threads racing with fd_close(). */
	cv_init(&ff->ff_closing, "fdclose");

	return 0;
}
1138 
/*
 * Pool cache destructor for fdfile structures: destroy the lock and
 * the close condition variable.
 */
static void
fdfile_dtor(void *arg, void *obj)
{
	fdfile_t *ff = obj;

	mutex_destroy(&ff->ff_lock);
	cv_destroy(&ff->ff_closing);
}
1147 
1148 file_t *
1149 fgetdummy(void)
1150 {
1151 	file_t *fp;
1152 
1153 	fp = kmem_alloc(sizeof(*fp), KM_SLEEP);
1154 	if (fp != NULL) {
1155 		memset(fp, 0, sizeof(*fp));
1156 		mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1157 	}
1158 	return fp;
1159 }
1160 
/*
 * Release a dummy file_t previously allocated with fgetdummy().
 */
void
fputdummy(file_t *fp)
{

	mutex_destroy(&fp->f_lock);
	kmem_free(fp, sizeof(*fp));
}
1168 
1169 /*
1170  * Make p2 share p1's cwdinfo.
1171  */
1172 void
1173 cwdshare(struct proc *p2)
1174 {
1175 	struct cwdinfo *cwdi;
1176 
1177 	cwdi = curproc->p_cwdi;
1178 
1179 	atomic_inc_uint(&cwdi->cwdi_refcnt);
1180 	p2->p_cwdi = cwdi;
1181 }
1182 
1183 /*
1184  * Release a cwdinfo structure.
1185  */
1186 void
1187 cwdfree(struct cwdinfo *cwdi)
1188 {
1189 
1190 	if (atomic_dec_uint_nv(&cwdi->cwdi_refcnt) > 0)
1191 		return;
1192 
1193 	vrele(cwdi->cwdi_cdir);
1194 	if (cwdi->cwdi_rdir)
1195 		vrele(cwdi->cwdi_rdir);
1196 	if (cwdi->cwdi_edir)
1197 		vrele(cwdi->cwdi_edir);
1198 	pool_cache_put(cwdi_cache, cwdi);
1199 }
1200 
1201 /*
1202  * Create an initial filedesc structure.
1203  */
1204 filedesc_t *
1205 fd_init(filedesc_t *fdp)
1206 {
1207 	unsigned fd;
1208 
1209 	if (fdp == NULL) {
1210 		fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1211 	} else {
1212 		filedesc_ctor(NULL, fdp, PR_WAITOK);
1213 	}
1214 
1215 	fdp->fd_refcnt = 1;
1216 	fdp->fd_ofiles = fdp->fd_dfiles;
1217 	fdp->fd_nfiles = NDFILE;
1218 	fdp->fd_himap = fdp->fd_dhimap;
1219 	fdp->fd_lomap = fdp->fd_dlomap;
1220 	KASSERT(fdp->fd_lastfile == -1);
1221 	KASSERT(fdp->fd_lastkqfile == -1);
1222 	KASSERT(fdp->fd_knhash == NULL);
1223 
1224 	memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
1225 	    offsetof(filedesc_t, fd_startzero));
1226 	for (fd = 0; fd < NDFDFILE; fd++) {
1227 		fdp->fd_ofiles[fd] = (fdfile_t *)fdp->fd_dfdfile[fd];
1228 	}
1229 
1230 	return fdp;
1231 }
1232 
1233 /*
1234  * Initialize a file descriptor table.
1235  */
1236 static int
1237 filedesc_ctor(void *arg, void *obj, int flag)
1238 {
1239 	filedesc_t *fdp = obj;
1240 	int i;
1241 
1242 	memset(fdp, 0, sizeof(*fdp));
1243 	mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
1244 	fdp->fd_lastfile = -1;
1245 	fdp->fd_lastkqfile = -1;
1246 
1247 	KASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
1248 	for (i = 0; i < NDFDFILE; i++) {
1249 		fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK);
1250 	}
1251 
1252 	return 0;
1253 }
1254 
/*
 * Pool cache destructor for filedesc_t: destroy the built-in fdfile_t
 * records and the table lock.
 */
static void
filedesc_dtor(void *arg, void *obj)
{
	filedesc_t *fdp = obj;
	int i;

	for (i = 0; i < NDFDFILE; i++) {
		fdfile_dtor(NULL, fdp->fd_dfdfile[i]);
	}

	mutex_destroy(&fdp->fd_lock);
}
1267 
1268 /*
1269  * Make p2 share p1's filedesc structure.
1270  */
1271 void
1272 fd_share(struct proc *p2)
1273 {
1274 	filedesc_t *fdp;
1275 
1276 	fdp = curlwp->l_fd;
1277 	p2->p_fd = fdp;
1278 	atomic_inc_uint(&fdp->fd_refcnt);
1279 }
1280 
1281 /*
1282  * Copy a filedesc structure.
1283  */
1284 filedesc_t *
1285 fd_copy(void)
1286 {
1287 	filedesc_t *newfdp, *fdp;
1288 	fdfile_t *ff, *fflist, **ffp, **nffp, *ff2;
1289 	int i, nused, numfiles, lastfile, j, newlast;
1290 	file_t *fp;
1291 
1292 	fdp = curproc->p_fd;
1293 	newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1294 	newfdp->fd_refcnt = 1;
1295 
1296 	KASSERT(newfdp->fd_knhash == NULL);
1297 	KASSERT(newfdp->fd_knhashmask == 0);
1298 	KASSERT(newfdp->fd_discard == NULL);
1299 
1300 	for (;;) {
1301 		numfiles = fdp->fd_nfiles;
1302 		lastfile = fdp->fd_lastfile;
1303 
1304 		/*
1305 		 * If the number of open files fits in the internal arrays
1306 		 * of the open file structure, use them, otherwise allocate
1307 		 * additional memory for the number of descriptors currently
1308 		 * in use.
1309 		 */
1310 		if (lastfile < NDFILE) {
1311 			i = NDFILE;
1312 			newfdp->fd_ofiles = newfdp->fd_dfiles;
1313 		} else {
1314 			/*
1315 			 * Compute the smallest multiple of NDEXTENT needed
1316 			 * for the file descriptors currently in use,
1317 			 * allowing the table to shrink.
1318 			 */
1319 			i = numfiles;
1320 			while (i >= 2 * NDEXTENT && i > lastfile * 2) {
1321 				i /= 2;
1322 			}
1323 			newfdp->fd_ofiles = malloc(i * sizeof(fdfile_t *),
1324 			    M_FILEDESC, M_WAITOK);
1325 			KASSERT(i >= NDFILE);
1326 		}
1327 		if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
1328 			newfdp->fd_himap = newfdp->fd_dhimap;
1329 			newfdp->fd_lomap = newfdp->fd_dlomap;
1330 		} else {
1331 			newfdp->fd_himap = malloc(NDHISLOTS(i) *
1332 			    sizeof(uint32_t), M_FILEDESC, M_WAITOK);
1333 			newfdp->fd_lomap = malloc(NDLOSLOTS(i) *
1334 			    sizeof(uint32_t), M_FILEDESC, M_WAITOK);
1335 		}
1336 
1337 		/*
1338 		 * Allocate and string together fdfile structures.
1339 		 * We abuse fdfile_t::ff_file here, but it will be
1340 		 * cleared before this routine returns.
1341 		 */
1342 		nused = fdp->fd_nused;
1343 		fflist = NULL;
1344 		for (j = nused; j != 0; j--) {
1345 			ff = pool_cache_get(fdfile_cache, PR_WAITOK);
1346 			ff->ff_file = (void *)fflist;
1347 			fflist = ff;
1348 		}
1349 
1350 		mutex_enter(&fdp->fd_lock);
1351 		if (numfiles == fdp->fd_nfiles && nused == fdp->fd_nused &&
1352 		    lastfile == fdp->fd_lastfile) {
1353 			break;
1354 		}
1355 		mutex_exit(&fdp->fd_lock);
1356 		if (i >= NDFILE) {
1357 			free(newfdp->fd_ofiles, M_FILEDESC);
1358 		}
1359 		if (NDHISLOTS(i) > NDHISLOTS(NDFILE)) {
1360 			free(newfdp->fd_himap, M_FILEDESC);
1361 			free(newfdp->fd_lomap, M_FILEDESC);
1362 		}
1363 		while (fflist != NULL) {
1364 			ff = fflist;
1365 			fflist = (void *)ff->ff_file;
1366 			ff->ff_file = NULL;
1367 			pool_cache_put(fdfile_cache, ff);
1368 		}
1369 	}
1370 
1371 	newfdp->fd_nfiles = i;
1372 	newfdp->fd_freefile = fdp->fd_freefile;
1373 	newfdp->fd_exclose = fdp->fd_exclose;
1374 
1375 	/*
1376 	 * Clear the entries that will not be copied over.
1377 	 * Avoid calling memset with 0 size.
1378 	 */
1379 	if (lastfile < (i-1)) {
1380 		memset(newfdp->fd_ofiles + lastfile + 1, 0,
1381 		    (i - lastfile - 1) * sizeof(file_t **));
1382 	}
1383 	if (i < NDENTRIES * NDENTRIES) {
1384 		i = NDENTRIES * NDENTRIES; /* size of inlined bitmaps */
1385 	}
1386 	memcpy(newfdp->fd_himap, fdp->fd_himap, NDHISLOTS(i)*sizeof(uint32_t));
1387 	memcpy(newfdp->fd_lomap, fdp->fd_lomap, NDLOSLOTS(i)*sizeof(uint32_t));
1388 
1389 	ffp = fdp->fd_ofiles;
1390 	nffp = newfdp->fd_ofiles;
1391 	j = imax(lastfile, (NDFDFILE - 1));
1392 	newlast = -1;
1393 	KASSERT(j < fdp->fd_nfiles);
1394 	for (i = 0; i <= j; i++, ffp++, *nffp++ = ff2) {
1395 		ff = *ffp;
1396 		/* Install built-in fdfiles even if unused here. */
1397 		if (i < NDFDFILE) {
1398 			ff2 = (fdfile_t *)newfdp->fd_dfdfile[i];
1399 		} else {
1400 			ff2 = NULL;
1401 		}
1402 		/* Determine if descriptor is active in parent. */
1403 		if (ff == NULL || !fd_isused(fdp, i)) {
1404 			KASSERT(ff != NULL || i >= NDFDFILE);
1405 			continue;
1406 		}
1407 		mutex_enter(&ff->ff_lock);
1408 		fp = ff->ff_file;
1409 		if (fp == NULL) {
1410 			/* Descriptor is half-open: free slot. */
1411 			fd_zap(newfdp, i);
1412 			mutex_exit(&ff->ff_lock);
1413 			continue;
1414 		}
1415 		if (fp->f_type == DTYPE_KQUEUE) {
1416 			/* kqueue descriptors cannot be copied. */
1417 			fd_zap(newfdp, i);
1418 			mutex_exit(&ff->ff_lock);
1419 			continue;
1420 		}
1421 		/* It's active: add a reference to the file. */
1422 		mutex_enter(&fp->f_lock);
1423 		fp->f_count++;
1424 		mutex_exit(&fp->f_lock);
1425 		/* Consume one fdfile_t to represent it. */
1426 		if (i >= NDFDFILE) {
1427 			ff2 = fflist;
1428 			fflist = (void *)ff2->ff_file;
1429 		}
1430 		ff2->ff_file = fp;
1431 		ff2->ff_exclose = ff->ff_exclose;
1432 		ff2->ff_allocated = 1;
1433 		mutex_exit(&ff->ff_lock);
1434 		if (i > newlast) {
1435 			newlast = i;
1436 		}
1437 	}
1438 	mutex_exit(&fdp->fd_lock);
1439 
1440 	/* Discard unused fdfile_t structures. */
1441 	while (__predict_false(fflist != NULL)) {
1442 		ff = fflist;
1443 		fflist = (void *)ff->ff_file;
1444 		ff->ff_file = NULL;
1445 		pool_cache_put(fdfile_cache, ff);
1446 		nused--;
1447 	}
1448 	KASSERT(nused >= 0);
1449 	KASSERT(newfdp->fd_ofiles[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
1450 
1451 	newfdp->fd_nused = nused;
1452 	newfdp->fd_lastfile = newlast;
1453 
1454 	return (newfdp);
1455 }
1456 
1457 /*
1458  * Release a filedesc structure.
1459  */
1460 void
1461 fd_free(void)
1462 {
1463 	filedesc_t *fdp;
1464 	fdfile_t *ff;
1465 	file_t *fp;
1466 	int fd, lastfd;
1467 	void *discard;
1468 
1469 	fdp = curlwp->l_fd;
1470 
1471 	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
1472 
1473 	if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
1474 		return;
1475 
1476 	/*
1477 	 * Close any files that the process holds open.
1478 	 */
1479 	for (fd = 0, lastfd = fdp->fd_nfiles - 1; fd <= lastfd; fd++) {
1480 		ff = fdp->fd_ofiles[fd];
1481 		KASSERT(fd >= NDFDFILE ||
1482 		    ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1483 		if ((ff = fdp->fd_ofiles[fd]) == NULL)
1484 			continue;
1485 		if ((fp = ff->ff_file) != NULL) {
1486 			/*
1487 			 * Must use fd_close() here as kqueue holds
1488 			 * long term references to descriptors.
1489 			 */
1490 			ff->ff_refcnt++;
1491 			fd_close(fd);
1492 		}
1493 		KASSERT(ff->ff_refcnt == 0);
1494 		KASSERT(ff->ff_file == NULL);
1495 		KASSERT(!ff->ff_exclose);
1496 		KASSERT(!ff->ff_allocated);
1497 		if (fd >= NDFDFILE) {
1498 			pool_cache_put(fdfile_cache, ff);
1499 		}
1500 	}
1501 
1502 	/*
1503 	 * Clean out the descriptor table for the next user and return
1504 	 * to the cache.
1505 	 */
1506 	while ((discard = fdp->fd_discard) != NULL) {
1507 		KASSERT(discard != fdp->fd_ofiles);
1508 		fdp->fd_discard = *(void **)discard;
1509 		free(discard, M_FILEDESC);
1510 	}
1511 	if (NDHISLOTS(fdp->fd_nfiles) > NDHISLOTS(NDFILE)) {
1512 		KASSERT(fdp->fd_himap != fdp->fd_dhimap);
1513 		KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
1514 		free(fdp->fd_himap, M_FILEDESC);
1515 		free(fdp->fd_lomap, M_FILEDESC);
1516 	}
1517 	if (fdp->fd_nfiles > NDFILE) {
1518 		KASSERT(fdp->fd_ofiles != fdp->fd_dfiles);
1519 		free(fdp->fd_ofiles, M_FILEDESC);
1520 	}
1521 	if (fdp->fd_knhash != NULL) {
1522 		hashdone(fdp->fd_knhash, M_KEVENT);
1523 		fdp->fd_knhash = NULL;
1524 		fdp->fd_knhashmask = 0;
1525 	} else {
1526 		KASSERT(fdp->fd_knhashmask == 0);
1527 	}
1528 	fdp->fd_lastkqfile = -1;
1529 	pool_cache_put(filedesc_cache, fdp);
1530 }
1531 
1532 /*
1533  * File Descriptor pseudo-device driver (/dev/fd/).
1534  *
1535  * Opening minor device N dup()s the file (if any) connected to file
1536  * descriptor N belonging to the calling process.  Note that this driver
1537  * consists of only the ``open()'' routine, because all subsequent
1538  * references to this file will be direct to the other driver.
1539  */
1540 static int
1541 filedescopen(dev_t dev, int mode, int type, lwp_t *l)
1542 {
1543 
1544 	/*
1545 	 * XXX Kludge: set dupfd to contain the value of the
1546 	 * the file descriptor being sought for duplication. The error
1547 	 * return ensures that the vnode for this device will be released
1548 	 * by vn_open. Open will detect this special error and take the
1549 	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
1550 	 * will simply report the error.
1551 	 */
1552 	l->l_dupfd = minor(dev);	/* XXX */
1553 	return EDUPFD;
1554 }
1555 
1556 /*
1557  * Duplicate the specified descriptor to a free descriptor.
1558  */
1559 int
1560 fd_dupopen(int old, int *new, int mode, int error)
1561 {
1562 	filedesc_t *fdp;
1563 	fdfile_t *ff;
1564 	file_t *fp;
1565 
1566 	if ((fp = fd_getfile(old)) == NULL) {
1567 		return EBADF;
1568 	}
1569 	fdp = curlwp->l_fd;
1570 	ff = fdp->fd_ofiles[old];
1571 
1572 	/*
1573 	 * There are two cases of interest here.
1574 	 *
1575 	 * For EDUPFD simply dup (dfd) to file descriptor
1576 	 * (indx) and return.
1577 	 *
1578 	 * For EMOVEFD steal away the file structure from (dfd) and
1579 	 * store it in (indx).  (dfd) is effectively closed by
1580 	 * this operation.
1581 	 *
1582 	 * Any other error code is just returned.
1583 	 */
1584 	switch (error) {
1585 	case EDUPFD:
1586 		/*
1587 		 * Check that the mode the file is being opened for is a
1588 		 * subset of the mode of the existing descriptor.
1589 		 */
1590 		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
1591 			error = EACCES;
1592 			break;
1593 		}
1594 
1595 		/* Copy it. */
1596 		error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1597 		break;
1598 
1599 	case EMOVEFD:
1600 		/* Copy it. */
1601 		error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1602 		if (error != 0) {
1603 			break;
1604 		}
1605 
1606 		/* Steal away the file pointer from 'old'. */
1607 		(void)fd_close(old);
1608 		return 0;
1609 	}
1610 
1611 	fd_putfile(old);
1612 	return error;
1613 }
1614 
1615 /*
1616  * Close open files on exec.
1617  */
1618 void
1619 fd_closeexec(void)
1620 {
1621 	struct cwdinfo *cwdi;
1622 	proc_t *p;
1623 	filedesc_t *fdp;
1624 	fdfile_t *ff;
1625 	lwp_t *l;
1626 	int fd;
1627 
1628 	l = curlwp;
1629 	p = l->l_proc;
1630 	fdp = p->p_fd;
1631 	cwdi = p->p_cwdi;
1632 
1633 	if (cwdi->cwdi_refcnt > 1) {
1634 		cwdi = cwdinit();
1635 		cwdfree(p->p_cwdi);
1636 		p->p_cwdi = cwdi;
1637 	}
1638 	if (p->p_cwdi->cwdi_edir) {
1639 		vrele(p->p_cwdi->cwdi_edir);
1640 	}
1641 
1642 	if (fdp->fd_refcnt > 1) {
1643 		fdp = fd_copy();
1644 		fd_free();
1645 		p->p_fd = fdp;
1646 		l->l_fd = fdp;
1647 	}
1648 	if (!fdp->fd_exclose) {
1649 		return;
1650 	}
1651 	fdp->fd_exclose = 0;
1652 
1653 	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
1654 		if ((ff = fdp->fd_ofiles[fd]) == NULL) {
1655 			KASSERT(fd >= NDFDFILE);
1656 			continue;
1657 		}
1658 		KASSERT(fd >= NDFDFILE ||
1659 		    ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1660 		if (ff->ff_file == NULL)
1661 			continue;
1662 		if (ff->ff_exclose) {
1663 			/*
1664 			 * We need a reference to close the file.
1665 			 * No other threads can see the fdfile_t at
1666 			 * this point, so don't bother locking.
1667 			 */
1668 			KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
1669 			ff->ff_refcnt++;
1670 			fd_close(fd);
1671 		}
1672 	}
1673 }
1674 
1675 /*
1676  * It is unsafe for set[ug]id processes to be started with file
1677  * descriptors 0..2 closed, as these descriptors are given implicit
1678  * significance in the Standard C library.  fdcheckstd() will create a
1679  * descriptor referencing /dev/null for each of stdin, stdout, and
1680  * stderr that is not already open.
1681  */
1682 #define CHECK_UPTO 3
1683 int
1684 fd_checkstd(void)
1685 {
1686 	struct proc *p;
1687 	struct nameidata nd;
1688 	filedesc_t *fdp;
1689 	file_t *fp;
1690 	struct proc *pp;
1691 	int fd, i, error, flags = FREAD|FWRITE;
1692 	char closed[CHECK_UPTO * 3 + 1], which[3 + 1];
1693 
1694 	p = curproc;
1695 	closed[0] = '\0';
1696 	if ((fdp = p->p_fd) == NULL)
1697 		return (0);
1698 	for (i = 0; i < CHECK_UPTO; i++) {
1699 		KASSERT(i >= NDFDFILE ||
1700 		    fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
1701 		if (fdp->fd_ofiles[i]->ff_file != NULL)
1702 			continue;
1703 		snprintf(which, sizeof(which), ",%d", i);
1704 		strlcat(closed, which, sizeof(closed));
1705 		if ((error = fd_allocfile(&fp, &fd)) != 0)
1706 			return (error);
1707 		KASSERT(fd < CHECK_UPTO);
1708 		NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null");
1709 		if ((error = vn_open(&nd, flags, 0)) != 0) {
1710 			fd_abort(p, fp, fd);
1711 			return (error);
1712 		}
1713 		fp->f_data = nd.ni_vp;
1714 		fp->f_flag = flags;
1715 		fp->f_ops = &vnops;
1716 		fp->f_type = DTYPE_VNODE;
1717 		VOP_UNLOCK(nd.ni_vp, 0);
1718 		fd_affix(p, fp, fd);
1719 	}
1720 	if (closed[0] != '\0') {
1721 		mutex_enter(proc_lock);
1722 		pp = p->p_pptr;
1723 		mutex_enter(pp->p_lock);
1724 		log(LOG_WARNING, "set{u,g}id pid %d (%s) "
1725 		    "was invoked by uid %d ppid %d (%s) "
1726 		    "with fd %s closed\n",
1727 		    p->p_pid, p->p_comm, kauth_cred_geteuid(pp->p_cred),
1728 		    pp->p_pid, pp->p_comm, &closed[1]);
1729 		mutex_exit(pp->p_lock);
1730 		mutex_exit(proc_lock);
1731 	}
1732 	return (0);
1733 }
1734 #undef CHECK_UPTO
1735 
1736 /*
1737  * Sets descriptor owner. If the owner is a process, 'pgid'
1738  * is set to positive value, process ID. If the owner is process group,
1739  * 'pgid' is set to -pg_id.
1740  */
1741 int
1742 fsetown(pid_t *pgid, int cmd, const void *data)
1743 {
1744 	int id = *(const int *)data;
1745 	int error;
1746 
1747 	switch (cmd) {
1748 	case TIOCSPGRP:
1749 		if (id < 0)
1750 			return (EINVAL);
1751 		id = -id;
1752 		break;
1753 	default:
1754 		break;
1755 	}
1756 
1757 	if (id > 0 && !pfind(id))
1758 		return (ESRCH);
1759 	else if (id < 0 && (error = pgid_in_session(curproc, -id)))
1760 		return (error);
1761 
1762 	*pgid = id;
1763 	return (0);
1764 }
1765 
1766 /*
1767  * Return descriptor owner information. If the value is positive,
1768  * it's process ID. If it's negative, it's process group ID and
1769  * needs the sign removed before use.
1770  */
1771 int
1772 fgetown(pid_t pgid, int cmd, void *data)
1773 {
1774 
1775 	switch (cmd) {
1776 	case TIOCGPGRP:
1777 		*(int *)data = -pgid;
1778 		break;
1779 	default:
1780 		*(int *)data = pgid;
1781 		break;
1782 	}
1783 	return (0);
1784 }
1785 
1786 /*
1787  * Send signal to descriptor owner, either process or process group.
1788  */
1789 void
1790 fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
1791 {
1792 	struct proc *p1;
1793 	struct pgrp *pgrp;
1794 	ksiginfo_t ksi;
1795 
1796 	KASSERT(!cpu_intr_p());
1797 
1798 	KSI_INIT(&ksi);
1799 	ksi.ksi_signo = signo;
1800 	ksi.ksi_code = code;
1801 	ksi.ksi_band = band;
1802 
1803 	mutex_enter(proc_lock);
1804 	if (pgid > 0 && (p1 = p_find(pgid, PFIND_LOCKED)))
1805 		kpsignal(p1, &ksi, fdescdata);
1806 	else if (pgid < 0 && (pgrp = pg_find(-pgid, PFIND_LOCKED)))
1807 		kpgsignal(pgrp, &ksi, fdescdata, 0);
1808 	mutex_exit(proc_lock);
1809 }
1810 
/*
 * Attach ops and state to a freshly allocated cloning file and
 * publish it on descriptor 'fd'.  The EMOVEFD return value (together
 * with l_dupfd) tells the open path which descriptor to hand back
 * to the caller.
 */
int
fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
	 void *data)
{

	fp->f_flag = flag;
	fp->f_type = DTYPE_MISC;
	fp->f_ops = fops;
	fp->f_data = data;
	curlwp->l_dupfd = fd;
	fd_affix(curproc, fp, fd);

	return EMOVEFD;
}
1825 
1826 int
1827 fnullop_fcntl(file_t *fp, u_int cmd, void *data)
1828 {
1829 
1830 	if (cmd == F_SETFL)
1831 		return 0;
1832 
1833 	return EOPNOTSUPP;
1834 }
1835 
/*
 * Default poll op: always reports no conditions pending (returns 0).
 */
int
fnullop_poll(file_t *fp, int which)
{

	return 0;
}
1842 
/*
 * Default kqfilter op: returns success without attaching any state.
 */
int
fnullop_kqfilter(file_t *fp, struct knote *kn)
{

	return 0;
}
1849 
/*
 * Stub read op for file types that do not support reading.
 */
int
fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
	    kauth_cred_t cred, int flags)
{

	return EOPNOTSUPP;
}
1857 
/*
 * Stub write op for file types that do not support writing.
 */
int
fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
	     kauth_cred_t cred, int flags)
{

	return EOPNOTSUPP;
}
1865 
/*
 * Stub ioctl op for file types that do not support ioctls.
 */
int
fbadop_ioctl(file_t *fp, u_long com, void *data)
{

	return EOPNOTSUPP;
}
1872 
/*
 * Stub stat op for file types that cannot be fstat(2)'d.
 */
int
fbadop_stat(file_t *fp, struct stat *sb)
{

	return EOPNOTSUPP;
}
1879 
/*
 * Stub close op for file types with no close-time work.
 */
int
fbadop_close(file_t *fp)
{

	return EOPNOTSUPP;
}
1886