xref: /netbsd-src/sys/kern/kern_descrip.c (revision 404fbe5fb94ca1e054339640cabb2801ce52dd30)
1 /*	$NetBSD: kern_descrip.c,v 1.185 2008/12/21 09:58:22 ad Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Copyright (c) 1982, 1986, 1989, 1991, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  * (c) UNIX System Laboratories, Inc.
33  * All or some portions of this file are derived from material licensed
34  * to the University of California by American Telephone and Telegraph
35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36  * the permission of UNIX System Laboratories, Inc.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *	@(#)kern_descrip.c	8.8 (Berkeley) 2/14/95
63  */
64 
65 /*
66  * File descriptor management.
67  */
68 
69 #include <sys/cdefs.h>
70 __KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.185 2008/12/21 09:58:22 ad Exp $");
71 
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/filedesc.h>
75 #include <sys/kernel.h>
76 #include <sys/proc.h>
77 #include <sys/file.h>
78 #include <sys/socket.h>
79 #include <sys/socketvar.h>
80 #include <sys/stat.h>
81 #include <sys/ioctl.h>
82 #include <sys/fcntl.h>
83 #include <sys/pool.h>
84 #include <sys/unistd.h>
85 #include <sys/resourcevar.h>
86 #include <sys/conf.h>
87 #include <sys/event.h>
88 #include <sys/kauth.h>
89 #include <sys/atomic.h>
90 #include <sys/syscallargs.h>
91 #include <sys/cpu.h>
92 #include <sys/kmem.h>
93 #include <sys/vnode.h>
94 
/*
 * Constructor/destructor hooks handed to pool_cache_init() in
 * fd_sys_init() for the three object caches declared below.
 */
static int	file_ctor(void *, void *, int);
static void	file_dtor(void *, void *);
static int	fdfile_ctor(void *, void *, int);
static void	fdfile_dtor(void *, void *);
static int	filedesc_ctor(void *, void *, int);
static void	filedesc_dtor(void *, void *);
/* First entry of filedesc_cdevsw below. */
static int	filedescopen(dev_t, int, int, lwp_t *);

kmutex_t	filelist_lock;	/* lock on filehead */
struct filelist	filehead;	/* head of list of open files */
u_int		nfiles;		/* actual number of open files */

/* Object caches, created once by fd_sys_init(). */
static pool_cache_t filedesc_cache;	/* filedesc_t: descriptor tables */
static pool_cache_t file_cache;		/* file_t: open file structures */
static pool_cache_t fdfile_cache;	/* fdfile_t: per-descriptor slots */

/*
 * Character device switch for the descriptor pseudo-device.  Only the
 * first entry (filedescopen) is implemented here; every other entry is
 * one of the generic no* stubs.
 */
const struct cdevsw filedesc_cdevsw = {
	filedescopen, noclose, noread, nowrite, noioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_OTHER | D_MPSAFE,
};

/*
 * For ease of reading: fd_putvnode() and fd_putsock() are aliases of
 * fd_putfile(), so callers can pair them with fd_getvnode() and
 * fd_getsock().
 */
__strong_alias(fd_putvnode,fd_putfile)
__strong_alias(fd_putsock,fd_putfile)
119 
120 /*
121  * Initialize the descriptor system.
122  */
123 void
124 fd_sys_init(void)
125 {
126 
127 	mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);
128 
129 	file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
130 	    0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
131 	KASSERT(file_cache != NULL);
132 
133 	fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0,
134 	    PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor,
135 	    NULL);
136 	KASSERT(fdfile_cache != NULL);
137 
138 	filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
139 	    0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
140 	    NULL);
141 	KASSERT(filedesc_cache != NULL);
142 }
143 
144 static int
145 fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
146 {
147 	int i, off, maxoff;
148 	uint32_t sub;
149 
150 	KASSERT(mutex_owned(&fdp->fd_lock));
151 
152 	if (want > bits)
153 		return -1;
154 
155 	off = want >> NDENTRYSHIFT;
156 	i = want & NDENTRYMASK;
157 	if (i) {
158 		sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
159 		if (sub != ~0)
160 			goto found;
161 		off++;
162 	}
163 
164 	maxoff = NDLOSLOTS(bits);
165 	while (off < maxoff) {
166 		if ((sub = bitmap[off]) != ~0)
167 			goto found;
168 		off++;
169 	}
170 
171 	return (-1);
172 
173  found:
174 	return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
175 }
176 
177 static int
178 fd_last_set(filedesc_t *fd, int last)
179 {
180 	int off, i;
181 	fdfile_t **ofiles = fd->fd_ofiles;
182 	uint32_t *bitmap = fd->fd_lomap;
183 
184 	KASSERT(mutex_owned(&fd->fd_lock));
185 
186 	off = (last - 1) >> NDENTRYSHIFT;
187 
188 	while (off >= 0 && !bitmap[off])
189 		off--;
190 
191 	if (off < 0)
192 		return (-1);
193 
194 	i = ((off + 1) << NDENTRYSHIFT) - 1;
195 	if (i >= last)
196 		i = last - 1;
197 
198 	/* XXX should use bitmap */
199 	/* XXXAD does not work for fd_copy() */
200 	while (i > 0 && (ofiles[i] == NULL || !ofiles[i]->ff_allocated))
201 		i--;
202 
203 	return (i);
204 }
205 
206 void
207 fd_used(filedesc_t *fdp, unsigned fd)
208 {
209 	u_int off = fd >> NDENTRYSHIFT;
210 	fdfile_t *ff;
211 
212 	ff = fdp->fd_ofiles[fd];
213 
214 	KASSERT(mutex_owned(&fdp->fd_lock));
215 	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) == 0);
216 	KASSERT(ff != NULL);
217 	KASSERT(ff->ff_file == NULL);
218    	KASSERT(!ff->ff_allocated);
219 
220    	ff->ff_allocated = 1;
221 	fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK);
222 	if (fdp->fd_lomap[off] == ~0) {
223 		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
224 		    (1 << (off & NDENTRYMASK))) == 0);
225 		fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK);
226 	}
227 
228 	if ((int)fd > fdp->fd_lastfile) {
229 		fdp->fd_lastfile = fd;
230 	}
231 
232 	if (fd >= NDFDFILE) {
233 		fdp->fd_nused++;
234 	} else {
235 		KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
236 	}
237 }
238 
239 void
240 fd_unused(filedesc_t *fdp, unsigned fd)
241 {
242 	u_int off = fd >> NDENTRYSHIFT;
243 	fdfile_t *ff;
244 
245 	ff = fdp->fd_ofiles[fd];
246 
247 	/*
248 	 * Don't assert the lock is held here, as we may be copying
249 	 * the table during exec() and it is not needed there.
250 	 * procfs and sysctl are locked out by proc::p_reflock.
251 	 *
252 	 * KASSERT(mutex_owned(&fdp->fd_lock));
253 	 */
254 	KASSERT(ff != NULL);
255 	KASSERT(ff->ff_file == NULL);
256    	KASSERT(ff->ff_allocated);
257 
258 	if (fd < fdp->fd_freefile) {
259 		fdp->fd_freefile = fd;
260 	}
261 
262 	if (fdp->fd_lomap[off] == ~0) {
263 		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
264 		    (1 << (off & NDENTRYMASK))) != 0);
265 		fdp->fd_himap[off >> NDENTRYSHIFT] &=
266 		    ~(1 << (off & NDENTRYMASK));
267 	}
268 	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
269 	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
270 	ff->ff_allocated = 0;
271 
272 	KASSERT(fd <= fdp->fd_lastfile);
273 	if (fd == fdp->fd_lastfile) {
274 		fdp->fd_lastfile = fd_last_set(fdp, fd);
275 	}
276 
277 	if (fd >= NDFDFILE) {
278 		KASSERT(fdp->fd_nused > 0);
279 		fdp->fd_nused--;
280 	} else {
281 		KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
282 	}
283 }
284 
285 /*
286  * Custom version of fd_unused() for fd_copy(), where the descriptor
287  * table is not yet fully initialized.
288  */
289 static inline void
290 fd_zap(filedesc_t *fdp, unsigned fd)
291 {
292 	u_int off = fd >> NDENTRYSHIFT;
293 
294 	if (fd < fdp->fd_freefile) {
295 		fdp->fd_freefile = fd;
296 	}
297 
298 	if (fdp->fd_lomap[off] == ~0) {
299 		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
300 		    (1 << (off & NDENTRYMASK))) != 0);
301 		fdp->fd_himap[off >> NDENTRYSHIFT] &=
302 		    ~(1 << (off & NDENTRYMASK));
303 	}
304 	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
305 	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
306 }
307 
308 bool
309 fd_isused(filedesc_t *fdp, unsigned fd)
310 {
311 	u_int off = fd >> NDENTRYSHIFT;
312 
313 	KASSERT(fd < fdp->fd_nfiles);
314 
315 	return (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0;
316 }
317 
/*
 * Look up the file structure corresponding to a file descriptor
 * and return the file, holding a reference on the descriptor.
 *
 * The lookup is done in the current LWP's descriptor table
 * (curlwp->l_fd) without taking fd_lock.  Returns NULL if `fd' is
 * out of range, if the slot has no fdfile_t, or if no file is
 * attached to the slot (not open, or being closed); no reference is
 * left held in the NULL case.  On success the descriptor reference
 * is released with fd_putfile().
 */
inline file_t *
fd_getfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;

	fdp = curlwp->l_fd;

	/*
	 * Look up the fdfile structure representing this descriptor.
	 * Ensure that we see fd_nfiles before fd_ofiles since we
	 * are doing this unlocked.  See fd_tryexpand().
	 */
	if (__predict_false(fd >= fdp->fd_nfiles)) {
		return NULL;
	}
	membar_consumer();
	ff = fdp->fd_ofiles[fd];
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	if (__predict_false(ff == NULL)) {
		return NULL;
	}

	/*
	 * Now get a reference to the descriptor.   Issue a memory
	 * barrier to ensure that we acquire the file pointer _after_
	 * adding a reference.  If no memory barrier, we could fetch
	 * a stale pointer.
	 */
	atomic_inc_uint(&ff->ff_refcnt);
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_enter();
#endif

	/*
	 * If the file is not open or is being closed then put the
	 * reference back.
	 */
	fp = ff->ff_file;
	if (__predict_true(fp != NULL)) {
		return fp;
	}
	/* fd_putfile() drops the reference taken above. */
	fd_putfile(fd);
	return NULL;
}
368 
/*
 * Release a reference to a file descriptor acquired with fd_getfile().
 *
 * The common case is a single compare-and-swap that decrements the
 * reference count.  If the count is found to carry FR_CLOSING, the
 * reference is instead handed to fd_close(), whose return value is
 * discarded.
 */
void
fd_putfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	u_int u, v;

	fdp = curlwp->l_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd < fdp->fd_nfiles);
	KASSERT(ff != NULL);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	/*
	 * Ensure that any use of the file is complete and globally
	 * visible before dropping the final reference.  If no membar,
	 * the current CPU could still access memory associated with
	 * the file after it has been freed or recycled by another
	 * CPU.
	 */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_exit();
#endif

	/*
	 * Be optimistic and start out with the assumption that no other
	 * threads are trying to close the descriptor.  If the CAS fails,
	 * we lost a race and/or it's being closed.
	 *
	 * `u' is the value we expect to find (initially the count with
	 * the FR_CLOSING bit masked off) and `v' is the value the CAS
	 * actually observed.  On failure without FR_CLOSING we retry
	 * with the observed value as the new expectation.
	 */
	for (u = ff->ff_refcnt & FR_MASK;; u = v) {
		v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
		if (__predict_true(u == v)) {
			return;
		}
		if (__predict_false((v & FR_CLOSING) != 0)) {
			break;
		}
	}

	/* Another thread is waiting to close the file: join it. */
	(void)fd_close(fd);
}
416 
417 /*
418  * Convenience wrapper around fd_getfile() that returns reference
419  * to a vnode.
420  */
421 int
422 fd_getvnode(unsigned fd, file_t **fpp)
423 {
424 	vnode_t *vp;
425 	file_t *fp;
426 
427 	fp = fd_getfile(fd);
428 	if (__predict_false(fp == NULL)) {
429 		return EBADF;
430 	}
431 	if (__predict_false(fp->f_type != DTYPE_VNODE)) {
432 		fd_putfile(fd);
433 		return EINVAL;
434 	}
435 	vp = fp->f_data;
436 	if (__predict_false(vp->v_type == VBAD)) {
437 		/* XXX Is this case really necessary? */
438 		fd_putfile(fd);
439 		return EBADF;
440 	}
441 	*fpp = fp;
442 	return 0;
443 }
444 
445 /*
446  * Convenience wrapper around fd_getfile() that returns reference
447  * to a socket.
448  */
449 int
450 fd_getsock(unsigned fd, struct socket **sop)
451 {
452 	file_t *fp;
453 
454 	fp = fd_getfile(fd);
455 	if (__predict_false(fp == NULL)) {
456 		return EBADF;
457 	}
458 	if (__predict_false(fp->f_type != DTYPE_SOCKET)) {
459 		fd_putfile(fd);
460 		return ENOTSOCK;
461 	}
462 	*sop = fp->f_data;
463 	return 0;
464 }
465 
466 /*
467  * Look up the file structure corresponding to a file descriptor
468  * and return it with a reference held on the file, not the
469  * descriptor.
470  *
471  * This is heavyweight and only used when accessing descriptors
472  * from a foreign process.  The caller must ensure that `p' does
473  * not exit or fork across this call.
474  *
475  * To release the file (not descriptor) reference, use closef().
476  */
477 file_t *
478 fd_getfile2(proc_t *p, unsigned fd)
479 {
480 	filedesc_t *fdp;
481 	fdfile_t *ff;
482 	file_t *fp;
483 
484 	fdp = p->p_fd;
485 	mutex_enter(&fdp->fd_lock);
486 	if (fd > fdp->fd_nfiles) {
487 		mutex_exit(&fdp->fd_lock);
488 		return NULL;
489 	}
490 	if ((ff = fdp->fd_ofiles[fd]) == NULL) {
491 		mutex_exit(&fdp->fd_lock);
492 		return NULL;
493 	}
494 	mutex_enter(&ff->ff_lock);
495 	if ((fp = ff->ff_file) == NULL) {
496 		mutex_exit(&ff->ff_lock);
497 		mutex_exit(&fdp->fd_lock);
498 		return NULL;
499 	}
500 	mutex_enter(&fp->f_lock);
501 	fp->f_count++;
502 	mutex_exit(&fp->f_lock);
503 	mutex_exit(&ff->ff_lock);
504 	mutex_exit(&fdp->fd_lock);
505 
506 	return fp;
507 }
508 
/*
 * Internal form of close.  Must be called with a reference to the
 * descriptor, and will drop the reference.  When all descriptor
 * references are dropped, releases the descriptor slot and a single
 * reference to the file structure.
 *
 * Returns EBADF if the slot no longer has a file attached (another
 * thread is already closing it); otherwise returns the result of
 * closef() on the file.
 */
int
fd_close(unsigned fd)
{
	struct flock lf;
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;
	proc_t *p;
	lwp_t *l;

	l = curlwp;
	p = l->l_proc;
	fdp = l->l_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	mutex_enter(&ff->ff_lock);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	if (ff->ff_file == NULL) {
		/*
		 * Another user of the file is already closing, and is
		 * waiting for other users of the file to drain.  Release
		 * our reference, and wake up the closer.
		 */
		atomic_dec_uint(&ff->ff_refcnt);
		cv_broadcast(&ff->ff_closing);
		mutex_exit(&ff->ff_lock);

		/*
		 * An application error, so pretend that the descriptor
		 * was already closed.  We can't safely wait for it to
		 * be closed without potentially deadlocking.
		 */
		return (EBADF);
	}
	KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);

	/*
	 * There may be multiple users of this file within the process.
	 * Notify existing and new users that the file is closing.  This
	 * will prevent them from adding additional uses to this file
	 * while we are closing it.
	 */
	fp = ff->ff_file;
	ff->ff_file = NULL;
	ff->ff_exclose = false;

	/*
	 * We expect the caller to hold a descriptor reference - drop it.
	 * The reference count may increase beyond zero at this point due
	 * to an erroneous descriptor reference by an application, but
	 * fd_getfile() will notice that the file is being closed and drop
	 * the reference again.
	 */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_producer();
#endif
	if (__predict_false(atomic_dec_uint_nv(&ff->ff_refcnt) != 0)) {
		/*
		 * Wait for other references to drain.  This is typically
		 * an application error - the descriptor is being closed
		 * while still in use.
		 *
		 * Setting FR_CLOSING makes the compare-and-swap in
		 * fd_putfile() fail, so that remaining users come
		 * through fd_close() and broadcast on ff_closing.
		 */
		atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
		/*
		 * Remove any knotes attached to the file.  A knote
		 * attached to the descriptor can hold references on it.
		 * ff_lock is released around the call and retaken
		 * afterwards.
		 */
		if (!SLIST_EMPTY(&ff->ff_knlist)) {
			mutex_exit(&ff->ff_lock);
			knote_fdclose(fd);
			mutex_enter(&ff->ff_lock);
		}
		/*
		 * We need to see the count drop to zero at least once,
		 * in order to ensure that all pre-existing references
		 * have been drained.  New references past this point are
		 * of no interest.
		 */
		while ((ff->ff_refcnt & FR_MASK) != 0) {
			cv_wait(&ff->ff_closing, &ff->ff_lock);
		}
		atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
	} else {
		/* If no references, there must be no knotes. */
		KASSERT(SLIST_EMPTY(&ff->ff_knlist));
	}
	mutex_exit(&ff->ff_lock);

	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor.
	 */
	if ((p->p_flag & PK_ADVLOCK) != 0 && fp->f_type == DTYPE_VNODE) {
		/* Unlock request with start 0, length 0 from SEEK_SET. */
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void)VOP_ADVLOCK(fp->f_data, p, F_UNLCK, &lf, F_POSIX);
	}


	/* Free descriptor slot. */
	mutex_enter(&fdp->fd_lock);
	fd_unused(fdp, fd);
	mutex_exit(&fdp->fd_lock);

	/* Now drop reference to the file itself. */
	return closef(fp);
}
631 
632 /*
633  * Duplicate a file descriptor.
634  */
635 int
636 fd_dup(file_t *fp, int minfd, int *newp, bool exclose)
637 {
638 	proc_t *p;
639 	int error;
640 
641 	p = curproc;
642 
643 	while ((error = fd_alloc(p, minfd, newp)) != 0) {
644 		if (error != ENOSPC) {
645 			return error;
646 		}
647 		fd_tryexpand(p);
648 	}
649 
650 	curlwp->l_fd->fd_ofiles[*newp]->ff_exclose = exclose;
651 	fd_affix(p, fp, *newp);
652 	return 0;
653 }
654 
/*
 * dup2 operation: make descriptor `new' in the current process refer
 * to `fp', closing whatever was previously open on `new'.  Always
 * returns 0.
 */
int
fd_dup2(file_t *fp, unsigned new)
{
	filedesc_t *fdp;
	fdfile_t *ff;

	fdp = curlwp->l_fd;

	/*
	 * Ensure there are enough slots in the descriptor table,
	 * and allocate an fdfile_t up front in case we need it.
	 */
	while (new >= fdp->fd_nfiles) {
		fd_tryexpand(curproc);
	}
	ff = pool_cache_get(fdfile_cache, PR_WAITOK);

	/*
	 * If there is already a file open, close it.  If the file is
	 * half open, wait for it to be constructed before closing it.
	 * XXX Potential for deadlock here?
	 *
	 * fd_lock is dropped for each attempt and retaken before
	 * fd_isused() is checked again.
	 */
	mutex_enter(&fdp->fd_lock);
	while (fd_isused(fdp, new)) {
		mutex_exit(&fdp->fd_lock);
		if (fd_getfile(new) != NULL) {
			/* fd_close() consumes the reference just taken. */
			(void)fd_close(new);
		} else {
			/* XXX Crummy, but unlikely to happen. */
			kpause("dup2", false, 1, NULL);
		}
		mutex_enter(&fdp->fd_lock);
	}
	if (fdp->fd_ofiles[new] == NULL) {
		/* Slot has no fdfile_t yet: install the preallocated one. */
		KASSERT(new >= NDFDFILE);
		fdp->fd_ofiles[new] = ff;
		ff = NULL;
	}
	fd_used(fdp, new);
	mutex_exit(&fdp->fd_lock);

	/* Slot is now allocated.  Insert copy of the file. */
	fd_affix(curproc, fp, new);
	if (ff != NULL) {
		/* The preallocated fdfile_t was not needed. */
		pool_cache_put(fdfile_cache, ff);
	}
	return 0;
}
706 
707 /*
708  * Drop reference to a file structure.
709  */
710 int
711 closef(file_t *fp)
712 {
713 	struct flock lf;
714 	int error;
715 
716 	/*
717 	 * Drop reference.  If referenced elsewhere it's still open
718 	 * and we have nothing more to do.
719 	 */
720 	mutex_enter(&fp->f_lock);
721 	KASSERT(fp->f_count > 0);
722 	if (--fp->f_count > 0) {
723 		mutex_exit(&fp->f_lock);
724 		return 0;
725 	}
726 	KASSERT(fp->f_count == 0);
727 	mutex_exit(&fp->f_lock);
728 
729 	/* We held the last reference - release locks, close and free. */
730         if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
731         	lf.l_whence = SEEK_SET;
732 		lf.l_start = 0;
733 		lf.l_len = 0;
734 		lf.l_type = F_UNLCK;
735 		(void)VOP_ADVLOCK(fp->f_data, fp, F_UNLCK, &lf, F_FLOCK);
736 	}
737 	if (fp->f_ops != NULL) {
738 		error = (*fp->f_ops->fo_close)(fp);
739 	} else {
740 		error = 0;
741 	}
742 	ffree(fp);
743 
744 	return error;
745 }
746 
/*
 * Allocate a file descriptor for the process.
 *
 * Searches for the lowest free descriptor numbered at least `want'
 * and marks it used.  On success returns 0 and stores the descriptor
 * in *result.  On failure returns EMFILE if the table has already
 * reached the limit (the smaller of RLIMIT_NOFILE and maxfiles), or
 * ENOSPC if the caller should expand the table and retry.
 */
int
fd_alloc(proc_t *p, int want, int *result)
{
	filedesc_t *fdp;
	int i, lim, last, error;
	u_int off, new;
	fdfile_t *ff;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	/* Get an fdfile_t before taking the lock, in case the slot needs one. */
	ff = pool_cache_get(fdfile_cache, PR_WAITOK);
	KASSERT(ff->ff_refcnt == 0);
	KASSERT(ff->ff_file == NULL);

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.
	 */
	mutex_enter(&fdp->fd_lock);
	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
	last = min(fdp->fd_nfiles, lim);
	for (;;) {
		if ((i = want) < fdp->fd_freefile)
			i = fdp->fd_freefile;
		off = i >> NDENTRYSHIFT;
		/* Use the high map to find a low-map word that is not full. */
		new = fd_next_zero(fdp, fdp->fd_himap, off,
		    (last + NDENTRIES - 1) >> NDENTRYSHIFT);
		if (new == -1)
			break;
		/* Then find a free bit in that low-map word. */
		i = fd_next_zero(fdp, &fdp->fd_lomap[new],
		    new > off ? 0 : i & NDENTRYMASK, NDENTRIES);
		if (i == -1) {
			/*
			 * Free file descriptor in this block was
			 * below want, try again with higher want.
			 */
			want = (new + 1) << NDENTRYSHIFT;
			continue;
		}
		i += (new << NDENTRYSHIFT);
		if (i >= last) {
			break;
		}
		if (fdp->fd_ofiles[i] == NULL) {
			KASSERT(i >= NDFDFILE);
			fdp->fd_ofiles[i] = ff;
		} else {
			/* Slot already has an fdfile_t: ours is not needed. */
		   	pool_cache_put(fdfile_cache, ff);
		}
		KASSERT(fdp->fd_ofiles[i]->ff_file == NULL);
		fd_used(fdp, i);
		if (want <= fdp->fd_freefile) {
			fdp->fd_freefile = i;
		}
		*result = i;
		mutex_exit(&fdp->fd_lock);
		KASSERT(i >= NDFDFILE ||
		    fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
		return 0;
	}

	/* No space in current array.  Let the caller expand and retry. */
	error = (fdp->fd_nfiles >= lim) ? EMFILE : ENOSPC;
	mutex_exit(&fdp->fd_lock);
	pool_cache_put(fdfile_cache, ff);
	return error;
}
819 
820 /*
821  * Allocate memory for the open files array.
822  */
823 static fdfile_t **
824 fd_ofile_alloc(int n)
825 {
826 	uintptr_t *ptr, sz;
827 
828 	KASSERT(n > NDFILE);
829 
830 	sz = (n + 2) * sizeof(uintptr_t);
831 	ptr = kmem_alloc((size_t)sz, KM_SLEEP);
832 	ptr[1] = sz;
833 
834 	return (fdfile_t **)(ptr + 2);
835 }
836 
837 /*
838  * Free an open files array.
839  */
840 static void
841 fd_ofile_free(int n, fdfile_t **of)
842 {
843 	uintptr_t *ptr, sz;
844 
845 	KASSERT(n > NDFILE);
846 
847 	sz = (n + 2) * sizeof(uintptr_t);
848 	ptr = (uintptr_t *)of - 2;
849 	KASSERT(ptr[1] == sz);
850 	kmem_free(ptr, sz);
851 }
852 
853 /*
854  * Allocate descriptor bitmap.
855  */
856 static void
857 fd_map_alloc(int n, uint32_t **lo, uint32_t **hi)
858 {
859 	uint8_t *ptr;
860 	size_t szlo, szhi;
861 
862 	KASSERT(n > NDENTRIES);
863 
864 	szlo = NDLOSLOTS(n) * sizeof(uint32_t);
865 	szhi = NDHISLOTS(n) * sizeof(uint32_t);
866 	ptr = kmem_alloc(szlo + szhi, KM_SLEEP);
867 	*lo = (uint32_t *)ptr;
868 	*hi = (uint32_t *)(ptr + szlo);
869 }
870 
871 /*
872  * Free descriptor bitmap.
873  */
874 static void
875 fd_map_free(int n, uint32_t *lo, uint32_t *hi)
876 {
877 	size_t szlo, szhi;
878 
879 	KASSERT(n > NDENTRIES);
880 
881 	szlo = NDLOSLOTS(n) * sizeof(uint32_t);
882 	szhi = NDHISLOTS(n) * sizeof(uint32_t);
883 	KASSERT(hi == (uint32_t *)((uint8_t *)lo + szlo));
884 	kmem_free(lo, szlo + szhi);
885 }
886 
/*
 * Expand a process' descriptor table.
 *
 * The table grows to NDEXTENT entries if it is currently smaller than
 * that, and doubles otherwise.  New memory is allocated before fd_lock
 * is taken; if the table size changed in the meantime the new memory
 * is freed and nothing else is done, so callers must re-check and
 * retry.
 */
void
fd_tryexpand(proc_t *p)
{
	filedesc_t *fdp;
	int i, numfiles, oldnfiles;
	fdfile_t **newofile;
	uint32_t *newhimap, *newlomap;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	newhimap = NULL;
	newlomap = NULL;
	oldnfiles = fdp->fd_nfiles;

	if (oldnfiles < NDEXTENT)
		numfiles = NDEXTENT;
	else
		numfiles = 2 * oldnfiles;

	newofile = fd_ofile_alloc(numfiles);
	/* New bitmaps are only needed if the high map gains slots. */
	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		fd_map_alloc(numfiles, &newlomap, &newhimap);
	}

	mutex_enter(&fdp->fd_lock);
	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	if (fdp->fd_nfiles != oldnfiles) {
		/* fdp changed; caller must retry */
		mutex_exit(&fdp->fd_lock);
		fd_ofile_free(numfiles, newofile);
		if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
			fd_map_free(numfiles, newlomap, newhimap);
		}
		return;
	}

	/* Copy the existing ofile array and zero the new portion. */
	i = sizeof(fdfile_t *) * fdp->fd_nfiles;
	memcpy(newofile, fdp->fd_ofiles, i);
	memset((uint8_t *)newofile + i, 0, numfiles * sizeof(fdfile_t *) - i);

	/*
	 * Link old ofiles array into list to be discarded.  We defer
	 * freeing until process exit if the descriptor table is visible
	 * to other threads.
	 *
	 * The first of the two header words in front of the array (see
	 * fd_ofile_alloc()) is used as the list link.
	 */
	if (oldnfiles > NDFILE) {
		if ((fdp->fd_refcnt | p->p_nlwps) > 1) {
			fdp->fd_ofiles[-2] = (void *)fdp->fd_discard;
			fdp->fd_discard = fdp->fd_ofiles - 2;
		} else {
			fd_ofile_free(oldnfiles, fdp->fd_ofiles);
		}
	}

	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		/* Copy the old bitmaps and zero the added portions. */
		i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newhimap, fdp->fd_himap, i);
		memset((uint8_t *)newhimap + i, 0,
		    NDHISLOTS(numfiles) * sizeof(uint32_t) - i);

		i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newlomap, fdp->fd_lomap, i);
		memset((uint8_t *)newlomap + i, 0,
		    NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);

		/* Old maps are freed only if they were separately allocated. */
		if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
			fd_map_free(oldnfiles, fdp->fd_lomap, fdp->fd_himap);
		}
		fdp->fd_himap = newhimap;
		fdp->fd_lomap = newlomap;
	}

	/*
	 * All other modifications must become globally visible before
	 * the change to fd_nfiles.  See fd_getfile().
	 */
	fdp->fd_ofiles = newofile;
	membar_producer();
	fdp->fd_nfiles = numfiles;
	mutex_exit(&fdp->fd_lock);

	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
}
975 
976 /*
977  * Create a new open file structure and allocate a file descriptor
978  * for the current process.
979  */
980 int
981 fd_allocfile(file_t **resultfp, int *resultfd)
982 {
983 	file_t *fp;
984 	proc_t *p;
985 	int error;
986 
987 	p = curproc;
988 
989 	while ((error = fd_alloc(p, 0, resultfd)) != 0) {
990 		if (error != ENOSPC) {
991 			return error;
992 		}
993 		fd_tryexpand(p);
994 	}
995 
996 	fp = pool_cache_get(file_cache, PR_WAITOK);
997 	KASSERT(fp->f_count == 0);
998 	fp->f_cred = kauth_cred_get();
999 	kauth_cred_hold(fp->f_cred);
1000 
1001 	if (__predict_false(atomic_inc_uint_nv(&nfiles) >= maxfiles)) {
1002 		fd_abort(p, fp, *resultfd);
1003 		tablefull("file", "increase kern.maxfiles or MAXFILES");
1004 		return ENFILE;
1005 	}
1006 
1007 	fp->f_advice = 0;
1008 	fp->f_msgcount = 0;
1009 	fp->f_offset = 0;
1010 	fp->f_iflags = 0;
1011 	*resultfp = fp;
1012 
1013 	return 0;
1014 }
1015 
1016 /*
1017  * Successful creation of a new descriptor: make visible to the process.
1018  */
1019 void
1020 fd_affix(proc_t *p, file_t *fp, unsigned fd)
1021 {
1022 	fdfile_t *ff;
1023 	filedesc_t *fdp;
1024 
1025 	KASSERT(p == curproc || p == &proc0);
1026 
1027 	/* Add a reference to the file structure. */
1028 	mutex_enter(&fp->f_lock);
1029 	fp->f_count++;
1030 	mutex_exit(&fp->f_lock);
1031 
1032 	/*
1033 	 * Insert the new file into the descriptor slot.
1034 	 *
1035 	 * The memory barriers provided by lock activity in this routine
1036 	 * ensure that any updates to the file structure become globally
1037 	 * visible before the file becomes visible to other LWPs in the
1038 	 * current process.
1039 	 */
1040 	fdp = p->p_fd;
1041 	ff = fdp->fd_ofiles[fd];
1042 
1043 	KASSERT(ff != NULL);
1044 	KASSERT(ff->ff_file == NULL);
1045 	KASSERT(ff->ff_allocated);
1046 	KASSERT(fd_isused(fdp, fd));
1047 	KASSERT(fd >= NDFDFILE ||
1048 	    fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);
1049 
1050 	/* No need to lock in order to make file initially visible. */
1051 	ff->ff_file = fp;
1052 }
1053 
1054 /*
1055  * Abort creation of a new descriptor: free descriptor slot and file.
1056  */
1057 void
1058 fd_abort(proc_t *p, file_t *fp, unsigned fd)
1059 {
1060 	filedesc_t *fdp;
1061 	fdfile_t *ff;
1062 
1063 	KASSERT(p == curproc || p == &proc0);
1064 
1065 	fdp = p->p_fd;
1066 	ff = fdp->fd_ofiles[fd];
1067 
1068 	KASSERT(fd >= NDFDFILE ||
1069 	    fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);
1070 
1071 	mutex_enter(&fdp->fd_lock);
1072 	KASSERT(fd_isused(fdp, fd));
1073 	fd_unused(fdp, fd);
1074 	mutex_exit(&fdp->fd_lock);
1075 
1076 	if (fp != NULL) {
1077 		ffree(fp);
1078 	}
1079 }
1080 
1081 /*
1082  * Free a file descriptor.
1083  */
1084 void
1085 ffree(file_t *fp)
1086 {
1087 
1088 	KASSERT(fp->f_count == 0);
1089 
1090 	atomic_dec_uint(&nfiles);
1091 	kauth_cred_free(fp->f_cred);
1092 	pool_cache_put(file_cache, fp);
1093 }
1094 
1095 static int
1096 file_ctor(void *arg, void *obj, int flags)
1097 {
1098 	file_t *fp = obj;
1099 
1100 	memset(fp, 0, sizeof(*fp));
1101 	mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1102 
1103 	mutex_enter(&filelist_lock);
1104 	LIST_INSERT_HEAD(&filehead, fp, f_list);
1105 	mutex_exit(&filelist_lock);
1106 
1107 	return 0;
1108 }
1109 
1110 static void
1111 file_dtor(void *arg, void *obj)
1112 {
1113 	file_t *fp = obj;
1114 
1115 	mutex_enter(&filelist_lock);
1116 	LIST_REMOVE(fp, f_list);
1117 	mutex_exit(&filelist_lock);
1118 
1119 	mutex_destroy(&fp->f_lock);
1120 }
1121 
1122 static int
1123 fdfile_ctor(void *arg, void *obj, int flags)
1124 {
1125 	fdfile_t *ff = obj;
1126 
1127 	memset(ff, 0, sizeof(*ff));
1128 	mutex_init(&ff->ff_lock, MUTEX_DEFAULT, IPL_NONE);
1129 	cv_init(&ff->ff_closing, "fdclose");
1130 
1131 	return 0;
1132 }
1133 
1134 static void
1135 fdfile_dtor(void *arg, void *obj)
1136 {
1137 	fdfile_t *ff = obj;
1138 
1139 	mutex_destroy(&ff->ff_lock);
1140 	cv_destroy(&ff->ff_closing);
1141 }
1142 
1143 file_t *
1144 fgetdummy(void)
1145 {
1146 	file_t *fp;
1147 
1148 	fp = kmem_alloc(sizeof(*fp), KM_SLEEP);
1149 	if (fp != NULL) {
1150 		memset(fp, 0, sizeof(*fp));
1151 		mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1152 	}
1153 	return fp;
1154 }
1155 
1156 void
1157 fputdummy(file_t *fp)
1158 {
1159 
1160 	mutex_destroy(&fp->f_lock);
1161 	kmem_free(fp, sizeof(*fp));
1162 }
1163 
1164 /*
1165  * Create an initial filedesc structure.
1166  */
1167 filedesc_t *
1168 fd_init(filedesc_t *fdp)
1169 {
1170 	unsigned fd;
1171 
1172 	if (fdp == NULL) {
1173 		fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1174 	} else {
1175 		filedesc_ctor(NULL, fdp, PR_WAITOK);
1176 	}
1177 
1178 	fdp->fd_refcnt = 1;
1179 	fdp->fd_ofiles = fdp->fd_dfiles;
1180 	fdp->fd_nfiles = NDFILE;
1181 	fdp->fd_himap = fdp->fd_dhimap;
1182 	fdp->fd_lomap = fdp->fd_dlomap;
1183 	KASSERT(fdp->fd_lastfile == -1);
1184 	KASSERT(fdp->fd_lastkqfile == -1);
1185 	KASSERT(fdp->fd_knhash == NULL);
1186 
1187 	memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
1188 	    offsetof(filedesc_t, fd_startzero));
1189 	for (fd = 0; fd < NDFDFILE; fd++) {
1190 		fdp->fd_ofiles[fd] = (fdfile_t *)fdp->fd_dfdfile[fd];
1191 	}
1192 
1193 	return fdp;
1194 }
1195 
1196 /*
1197  * Initialize a file descriptor table.
1198  */
1199 static int
1200 filedesc_ctor(void *arg, void *obj, int flag)
1201 {
1202 	filedesc_t *fdp = obj;
1203 	int i;
1204 
1205 	memset(fdp, 0, sizeof(*fdp));
1206 	mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
1207 	fdp->fd_lastfile = -1;
1208 	fdp->fd_lastkqfile = -1;
1209 
1210 	CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
1211 	for (i = 0; i < NDFDFILE; i++) {
1212 		fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK);
1213 	}
1214 
1215 	return 0;
1216 }
1217 
1218 static void
1219 filedesc_dtor(void *arg, void *obj)
1220 {
1221 	filedesc_t *fdp = obj;
1222 	int i;
1223 
1224 	for (i = 0; i < NDFDFILE; i++) {
1225 		fdfile_dtor(NULL, fdp->fd_dfdfile[i]);
1226 	}
1227 
1228 	mutex_destroy(&fdp->fd_lock);
1229 }
1230 
1231 /*
1232  * Make p2 share p1's filedesc structure.
1233  */
1234 void
1235 fd_share(struct proc *p2)
1236 {
1237 	filedesc_t *fdp;
1238 
1239 	fdp = curlwp->l_fd;
1240 	p2->p_fd = fdp;
1241 	atomic_inc_uint(&fdp->fd_refcnt);
1242 }
1243 
1244 /*
1245  * Copy a filedesc structure.
1246  */
1247 filedesc_t *
1248 fd_copy(void)
1249 {
1250 	filedesc_t *newfdp, *fdp;
1251 	fdfile_t *ff, *fflist, **ffp, **nffp, *ff2;
1252 	int i, nused, numfiles, lastfile, j, newlast;
1253 	file_t *fp;
1254 
1255 	fdp = curproc->p_fd;
1256 	newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1257 	newfdp->fd_refcnt = 1;
1258 
1259 	KASSERT(newfdp->fd_knhash == NULL);
1260 	KASSERT(newfdp->fd_knhashmask == 0);
1261 	KASSERT(newfdp->fd_discard == NULL);
1262 
1263 	for (;;) {
1264 		numfiles = fdp->fd_nfiles;
1265 		lastfile = fdp->fd_lastfile;
1266 
1267 		/*
1268 		 * If the number of open files fits in the internal arrays
1269 		 * of the open file structure, use them, otherwise allocate
1270 		 * additional memory for the number of descriptors currently
1271 		 * in use.
1272 		 */
1273 		if (lastfile < NDFILE) {
1274 			i = NDFILE;
1275 			newfdp->fd_ofiles = newfdp->fd_dfiles;
1276 		} else {
1277 			/*
1278 			 * Compute the smallest multiple of NDEXTENT needed
1279 			 * for the file descriptors currently in use,
1280 			 * allowing the table to shrink.
1281 			 */
1282 			i = numfiles;
1283 			while (i >= 2 * NDEXTENT && i > lastfile * 2) {
1284 				i /= 2;
1285 			}
1286 			newfdp->fd_ofiles = fd_ofile_alloc(i);
1287 			KASSERT(i >= NDFILE);
1288 		}
1289 		if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
1290 			newfdp->fd_himap = newfdp->fd_dhimap;
1291 			newfdp->fd_lomap = newfdp->fd_dlomap;
1292 		} else {
1293 			fd_map_alloc(i, &newfdp->fd_lomap,
1294 			    &newfdp->fd_himap);
1295 		}
1296 
1297 		/*
1298 		 * Allocate and string together fdfile structures.
1299 		 * We abuse fdfile_t::ff_file here, but it will be
1300 		 * cleared before this routine returns.
1301 		 */
1302 		nused = fdp->fd_nused;
1303 		fflist = NULL;
1304 		for (j = nused; j != 0; j--) {
1305 			ff = pool_cache_get(fdfile_cache, PR_WAITOK);
1306 			ff->ff_file = (void *)fflist;
1307 			fflist = ff;
1308 		}
1309 
1310 		mutex_enter(&fdp->fd_lock);
1311 		if (numfiles == fdp->fd_nfiles && nused == fdp->fd_nused &&
1312 		    lastfile == fdp->fd_lastfile) {
1313 			break;
1314 		}
1315 		mutex_exit(&fdp->fd_lock);
1316 		if (i >= NDFILE) {
1317 			fd_ofile_free(i, newfdp->fd_ofiles);
1318 		}
1319 		if (NDHISLOTS(i) > NDHISLOTS(NDFILE)) {
1320 			fd_map_free(i, newfdp->fd_lomap, newfdp->fd_himap);
1321 		}
1322 		while (fflist != NULL) {
1323 			ff = fflist;
1324 			fflist = (void *)ff->ff_file;
1325 			ff->ff_file = NULL;
1326 			pool_cache_put(fdfile_cache, ff);
1327 		}
1328 	}
1329 
1330 	newfdp->fd_nfiles = i;
1331 	newfdp->fd_freefile = fdp->fd_freefile;
1332 	newfdp->fd_exclose = fdp->fd_exclose;
1333 
1334 	/*
1335 	 * Clear the entries that will not be copied over.
1336 	 * Avoid calling memset with 0 size.
1337 	 */
1338 	if (lastfile < (i-1)) {
1339 		memset(newfdp->fd_ofiles + lastfile + 1, 0,
1340 		    (i - lastfile - 1) * sizeof(file_t **));
1341 	}
1342 	if (i < NDENTRIES * NDENTRIES) {
1343 		i = NDENTRIES * NDENTRIES; /* size of inlined bitmaps */
1344 	}
1345 	memcpy(newfdp->fd_himap, fdp->fd_himap, NDHISLOTS(i)*sizeof(uint32_t));
1346 	memcpy(newfdp->fd_lomap, fdp->fd_lomap, NDLOSLOTS(i)*sizeof(uint32_t));
1347 
1348 	ffp = fdp->fd_ofiles;
1349 	nffp = newfdp->fd_ofiles;
1350 	j = imax(lastfile, (NDFDFILE - 1));
1351 	newlast = -1;
1352 	KASSERT(j < fdp->fd_nfiles);
1353 	for (i = 0; i <= j; i++, ffp++, *nffp++ = ff2) {
1354 		ff = *ffp;
1355 		/* Install built-in fdfiles even if unused here. */
1356 		if (i < NDFDFILE) {
1357 			ff2 = (fdfile_t *)newfdp->fd_dfdfile[i];
1358 		} else {
1359 			ff2 = NULL;
1360 		}
1361 		/* Determine if descriptor is active in parent. */
1362 		if (ff == NULL || !fd_isused(fdp, i)) {
1363 			KASSERT(ff != NULL || i >= NDFDFILE);
1364 			continue;
1365 		}
1366 		mutex_enter(&ff->ff_lock);
1367 		fp = ff->ff_file;
1368 		if (fp == NULL) {
1369 			/* Descriptor is half-open: free slot. */
1370 			fd_zap(newfdp, i);
1371 			mutex_exit(&ff->ff_lock);
1372 			continue;
1373 		}
1374 		if (fp->f_type == DTYPE_KQUEUE) {
1375 			/* kqueue descriptors cannot be copied. */
1376 			fd_zap(newfdp, i);
1377 			mutex_exit(&ff->ff_lock);
1378 			continue;
1379 		}
1380 		/* It's active: add a reference to the file. */
1381 		mutex_enter(&fp->f_lock);
1382 		fp->f_count++;
1383 		mutex_exit(&fp->f_lock);
1384 		/* Consume one fdfile_t to represent it. */
1385 		if (i >= NDFDFILE) {
1386 			ff2 = fflist;
1387 			fflist = (void *)ff2->ff_file;
1388 		}
1389 		ff2->ff_file = fp;
1390 		ff2->ff_exclose = ff->ff_exclose;
1391 		ff2->ff_allocated = true;
1392 		mutex_exit(&ff->ff_lock);
1393 		if (i > newlast) {
1394 			newlast = i;
1395 		}
1396 	}
1397 	mutex_exit(&fdp->fd_lock);
1398 
1399 	/* Discard unused fdfile_t structures. */
1400 	while (__predict_false(fflist != NULL)) {
1401 		ff = fflist;
1402 		fflist = (void *)ff->ff_file;
1403 		ff->ff_file = NULL;
1404 		pool_cache_put(fdfile_cache, ff);
1405 		nused--;
1406 	}
1407 	KASSERT(nused >= 0);
1408 	KASSERT(newfdp->fd_ofiles[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
1409 
1410 	newfdp->fd_nused = nused;
1411 	newfdp->fd_lastfile = newlast;
1412 
1413 	return (newfdp);
1414 }
1415 
1416 /*
1417  * Release a filedesc structure.
1418  */
1419 void
1420 fd_free(void)
1421 {
1422 	filedesc_t *fdp;
1423 	fdfile_t *ff;
1424 	file_t *fp;
1425 	int fd, lastfd;
1426 	void **discard;
1427 
1428 	fdp = curlwp->l_fd;
1429 
1430 	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
1431 
1432 	if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
1433 		return;
1434 
1435 	/*
1436 	 * Close any files that the process holds open.
1437 	 */
1438 	for (fd = 0, lastfd = fdp->fd_nfiles - 1; fd <= lastfd; fd++) {
1439 		ff = fdp->fd_ofiles[fd];
1440 		KASSERT(fd >= NDFDFILE ||
1441 		    ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1442 		if ((ff = fdp->fd_ofiles[fd]) == NULL)
1443 			continue;
1444 		if ((fp = ff->ff_file) != NULL) {
1445 			/*
1446 			 * Must use fd_close() here as kqueue holds
1447 			 * long term references to descriptors.
1448 			 */
1449 			ff->ff_refcnt++;
1450 			fd_close(fd);
1451 		}
1452 		KASSERT(ff->ff_refcnt == 0);
1453 		KASSERT(ff->ff_file == NULL);
1454 		KASSERT(!ff->ff_exclose);
1455 		KASSERT(!ff->ff_allocated);
1456 		if (fd >= NDFDFILE) {
1457 			pool_cache_put(fdfile_cache, ff);
1458 		}
1459 	}
1460 
1461 	/*
1462 	 * Clean out the descriptor table for the next user and return
1463 	 * to the cache.
1464 	 */
1465 	while ((discard = fdp->fd_discard) != NULL) {
1466 		fdp->fd_discard = discard[0];
1467 		kmem_free(discard, (uintptr_t)discard[1]);
1468 	}
1469 	if (NDHISLOTS(fdp->fd_nfiles) > NDHISLOTS(NDFILE)) {
1470 		KASSERT(fdp->fd_himap != fdp->fd_dhimap);
1471 		KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
1472 		fd_map_free(fdp->fd_nfiles, fdp->fd_lomap, fdp->fd_himap);
1473 	}
1474 	if (fdp->fd_nfiles > NDFILE) {
1475 		KASSERT(fdp->fd_ofiles != fdp->fd_dfiles);
1476 		fd_ofile_free(fdp->fd_nfiles, fdp->fd_ofiles);
1477 	}
1478 	if (fdp->fd_knhash != NULL) {
1479 		hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask);
1480 		fdp->fd_knhash = NULL;
1481 		fdp->fd_knhashmask = 0;
1482 	} else {
1483 		KASSERT(fdp->fd_knhashmask == 0);
1484 	}
1485 	fdp->fd_lastkqfile = -1;
1486 	pool_cache_put(filedesc_cache, fdp);
1487 }
1488 
1489 /*
1490  * File Descriptor pseudo-device driver (/dev/fd/).
1491  *
1492  * Opening minor device N dup()s the file (if any) connected to file
1493  * descriptor N belonging to the calling process.  Note that this driver
1494  * consists of only the ``open()'' routine, because all subsequent
1495  * references to this file will be direct to the other driver.
1496  */
1497 static int
1498 filedescopen(dev_t dev, int mode, int type, lwp_t *l)
1499 {
1500 
1501 	/*
1502 	 * XXX Kludge: set dupfd to contain the value of the
1503 	 * the file descriptor being sought for duplication. The error
1504 	 * return ensures that the vnode for this device will be released
1505 	 * by vn_open. Open will detect this special error and take the
1506 	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
1507 	 * will simply report the error.
1508 	 */
1509 	l->l_dupfd = minor(dev);	/* XXX */
1510 	return EDUPFD;
1511 }
1512 
1513 /*
1514  * Duplicate the specified descriptor to a free descriptor.
1515  */
1516 int
1517 fd_dupopen(int old, int *new, int mode, int error)
1518 {
1519 	filedesc_t *fdp;
1520 	fdfile_t *ff;
1521 	file_t *fp;
1522 
1523 	if ((fp = fd_getfile(old)) == NULL) {
1524 		return EBADF;
1525 	}
1526 	fdp = curlwp->l_fd;
1527 	ff = fdp->fd_ofiles[old];
1528 
1529 	/*
1530 	 * There are two cases of interest here.
1531 	 *
1532 	 * For EDUPFD simply dup (dfd) to file descriptor
1533 	 * (indx) and return.
1534 	 *
1535 	 * For EMOVEFD steal away the file structure from (dfd) and
1536 	 * store it in (indx).  (dfd) is effectively closed by
1537 	 * this operation.
1538 	 *
1539 	 * Any other error code is just returned.
1540 	 */
1541 	switch (error) {
1542 	case EDUPFD:
1543 		/*
1544 		 * Check that the mode the file is being opened for is a
1545 		 * subset of the mode of the existing descriptor.
1546 		 */
1547 		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
1548 			error = EACCES;
1549 			break;
1550 		}
1551 
1552 		/* Copy it. */
1553 		error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1554 		break;
1555 
1556 	case EMOVEFD:
1557 		/* Copy it. */
1558 		error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1559 		if (error != 0) {
1560 			break;
1561 		}
1562 
1563 		/* Steal away the file pointer from 'old'. */
1564 		(void)fd_close(old);
1565 		return 0;
1566 	}
1567 
1568 	fd_putfile(old);
1569 	return error;
1570 }
1571 
1572 /*
1573  * Sets descriptor owner. If the owner is a process, 'pgid'
1574  * is set to positive value, process ID. If the owner is process group,
1575  * 'pgid' is set to -pg_id.
1576  */
1577 int
1578 fsetown(pid_t *pgid, u_long cmd, const void *data)
1579 {
1580 	int id = *(const int *)data;
1581 	int error;
1582 
1583 	switch (cmd) {
1584 	case TIOCSPGRP:
1585 		if (id < 0)
1586 			return (EINVAL);
1587 		id = -id;
1588 		break;
1589 	default:
1590 		break;
1591 	}
1592 
1593 	if (id > 0 && !pfind(id))
1594 		return (ESRCH);
1595 	else if (id < 0 && (error = pgid_in_session(curproc, -id)))
1596 		return (error);
1597 
1598 	*pgid = id;
1599 	return (0);
1600 }
1601 
1602 /*
1603  * Return descriptor owner information. If the value is positive,
1604  * it's process ID. If it's negative, it's process group ID and
1605  * needs the sign removed before use.
1606  */
1607 int
1608 fgetown(pid_t pgid, u_long cmd, void *data)
1609 {
1610 
1611 	switch (cmd) {
1612 	case TIOCGPGRP:
1613 		*(int *)data = -pgid;
1614 		break;
1615 	default:
1616 		*(int *)data = pgid;
1617 		break;
1618 	}
1619 	return (0);
1620 }
1621 
1622 /*
1623  * Send signal to descriptor owner, either process or process group.
1624  */
1625 void
1626 fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
1627 {
1628 	struct proc *p1;
1629 	struct pgrp *pgrp;
1630 	ksiginfo_t ksi;
1631 
1632 	KASSERT(!cpu_intr_p());
1633 
1634 	KSI_INIT(&ksi);
1635 	ksi.ksi_signo = signo;
1636 	ksi.ksi_code = code;
1637 	ksi.ksi_band = band;
1638 
1639 	mutex_enter(proc_lock);
1640 	if (pgid > 0 && (p1 = p_find(pgid, PFIND_LOCKED)))
1641 		kpsignal(p1, &ksi, fdescdata);
1642 	else if (pgid < 0 && (pgrp = pg_find(-pgid, PFIND_LOCKED)))
1643 		kpgsignal(pgrp, &ksi, fdescdata, 0);
1644 	mutex_exit(proc_lock);
1645 }
1646 
1647 int
1648 fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
1649 	 void *data)
1650 {
1651 
1652 	fp->f_flag = flag;
1653 	fp->f_type = DTYPE_MISC;
1654 	fp->f_ops = fops;
1655 	fp->f_data = data;
1656 	curlwp->l_dupfd = fd;
1657 	fd_affix(curproc, fp, fd);
1658 
1659 	return EMOVEFD;
1660 }
1661 
1662 int
1663 fnullop_fcntl(file_t *fp, u_int cmd, void *data)
1664 {
1665 
1666 	if (cmd == F_SETFL)
1667 		return 0;
1668 
1669 	return EOPNOTSUPP;
1670 }
1671 
1672 int
1673 fnullop_poll(file_t *fp, int which)
1674 {
1675 
1676 	return 0;
1677 }
1678 
1679 int
1680 fnullop_kqfilter(file_t *fp, struct knote *kn)
1681 {
1682 
1683 	return 0;
1684 }
1685 
1686 int
1687 fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
1688 	    kauth_cred_t cred, int flags)
1689 {
1690 
1691 	return EOPNOTSUPP;
1692 }
1693 
1694 int
1695 fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
1696 	     kauth_cred_t cred, int flags)
1697 {
1698 
1699 	return EOPNOTSUPP;
1700 }
1701 
1702 int
1703 fbadop_ioctl(file_t *fp, u_long com, void *data)
1704 {
1705 
1706 	return EOPNOTSUPP;
1707 }
1708 
1709 int
1710 fbadop_stat(file_t *fp, struct stat *sb)
1711 {
1712 
1713 	return EOPNOTSUPP;
1714 }
1715 
1716 int
1717 fbadop_close(file_t *fp)
1718 {
1719 
1720 	return EOPNOTSUPP;
1721 }
1722