xref: /netbsd-src/sys/kern/kern_descrip.c (revision cac8e449158efc7261bebc8657cbb0125a2cfdde)
1 /*	$NetBSD: kern_descrip.c,v 1.182 2008/07/02 16:45:19 matt Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Copyright (c) 1982, 1986, 1989, 1991, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  * (c) UNIX System Laboratories, Inc.
33  * All or some portions of this file are derived from material licensed
34  * to the University of California by American Telephone and Telegraph
35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36  * the permission of UNIX System Laboratories, Inc.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *	@(#)kern_descrip.c	8.8 (Berkeley) 2/14/95
63  */
64 
65 /*
66  * File descriptor management.
67  */
68 
69 #include <sys/cdefs.h>
70 __KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.182 2008/07/02 16:45:19 matt Exp $");
71 
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/filedesc.h>
75 #include <sys/kernel.h>
76 #include <sys/vnode.h>
77 #include <sys/proc.h>
78 #include <sys/file.h>
79 #include <sys/namei.h>
80 #include <sys/socket.h>
81 #include <sys/socketvar.h>
82 #include <sys/stat.h>
83 #include <sys/ioctl.h>
84 #include <sys/fcntl.h>
85 #include <sys/malloc.h>
86 #include <sys/pool.h>
87 #include <sys/syslog.h>
88 #include <sys/unistd.h>
89 #include <sys/resourcevar.h>
90 #include <sys/conf.h>
91 #include <sys/event.h>
92 #include <sys/kauth.h>
93 #include <sys/atomic.h>
94 #include <sys/mount.h>
95 #include <sys/syscallargs.h>
96 #include <sys/cpu.h>
97 
98 static int	cwdi_ctor(void *, void *, int);
99 static void	cwdi_dtor(void *, void *);
100 static int	file_ctor(void *, void *, int);
101 static void	file_dtor(void *, void *);
102 static int	fdfile_ctor(void *, void *, int);
103 static void	fdfile_dtor(void *, void *);
104 static int	filedesc_ctor(void *, void *, int);
105 static void	filedesc_dtor(void *, void *);
106 static int	filedescopen(dev_t, int, int, lwp_t *);
107 
108 kmutex_t	filelist_lock;	/* lock on filehead */
109 struct filelist	filehead;	/* head of list of open files */
110 u_int		nfiles;		/* actual number of open files */
111 
112 static pool_cache_t cwdi_cache;
113 static pool_cache_t filedesc_cache;
114 static pool_cache_t file_cache;
115 static pool_cache_t fdfile_cache;
116 
117 MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
118 
/* Character device switch for /dev/fd: only open is meaningful here. */
const struct cdevsw filedesc_cdevsw = {
	filedescopen, noclose, noread, nowrite, noioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_OTHER | D_MPSAFE,
};
123 
124 /* For ease of reading. */
125 __strong_alias(fd_putvnode,fd_putfile)
126 __strong_alias(fd_putsock,fd_putfile)
127 
128 /*
129  * Initialize the descriptor system.
130  */
131 void
132 fd_sys_init(void)
133 {
134 
135 	mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);
136 
137 	file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
138 	    0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
139 	KASSERT(file_cache != NULL);
140 
141 	fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0,
142 	    PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor,
143 	    NULL);
144 	KASSERT(fdfile_cache != NULL);
145 
146 	cwdi_cache = pool_cache_init(sizeof(struct cwdinfo), coherency_unit,
147 	    0, 0, "cwdi", NULL, IPL_NONE, cwdi_ctor, cwdi_dtor, NULL);
148 	KASSERT(cwdi_cache != NULL);
149 
150 	filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
151 	    0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
152 	    NULL);
153 	KASSERT(filedesc_cache != NULL);
154 }
155 
/*
 * Scan a descriptor bitmap for the first clear (free) bit at or above
 * bit index `want'.  `bitmap' is one level of the two-level allocation
 * map (fd_himap or fd_lomap) and `bits' is the number of bits covered.
 * Returns the bit index found, or -1 if every bit from `want' up is set.
 * Called with fd_lock held.
 */
static int
fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
{
	int i, off, maxoff;
	uint32_t sub;

	KASSERT(mutex_owned(&fdp->fd_lock));

	if (want > bits)
		return -1;

	/* Split the starting bit into word index and bit-within-word. */
	off = want >> NDENTRYSHIFT;
	i = want & NDENTRYMASK;
	if (i) {
		/* Mask off (treat as used) the bits below `want'. */
		sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
		if (sub != ~0)
			goto found;
		off++;
	}

	/* Scan remaining whole words for one that is not all-ones. */
	maxoff = NDLOSLOTS(bits);
	while (off < maxoff) {
		if ((sub = bitmap[off]) != ~0)
			goto found;
		off++;
	}

	return (-1);

 found:
	/* ffs() is 1-based; ~sub has a 1 where the bitmap has a free bit. */
	return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
}
188 
/*
 * Find the highest-numbered descriptor below `last' that is still in
 * use, for maintaining fd_lastfile after a close.  Returns -1 when no
 * descriptor below `last' is allocated.  Called with fd_lock held.
 */
static int
fd_last_set(filedesc_t *fd, int last)
{
	int off, i;
	fdfile_t **ofiles = fd->fd_ofiles;
	uint32_t *bitmap = fd->fd_lomap;

	KASSERT(mutex_owned(&fd->fd_lock));

	off = (last - 1) >> NDENTRYSHIFT;

	/* Walk the low map backwards to a word with any bit set. */
	while (off >= 0 && !bitmap[off])
		off--;

	if (off < 0)
		return (-1);

	/* Start from the top bit of that word, clamped to last - 1. */
	i = ((off + 1) << NDENTRYSHIFT) - 1;
	if (i >= last)
		i = last - 1;

	/* XXX should use bitmap */
	/* XXXAD does not work for fd_copy() */
	while (i > 0 && (ofiles[i] == NULL || !ofiles[i]->ff_allocated))
		i--;

	return (i);
}
217 
/*
 * Mark descriptor `fd' as allocated: set its bit in the low map,
 * propagate to the high map when the low word fills, and maintain
 * fd_lastfile and fd_nused.  Called with fd_lock held, on a slot
 * whose fdfile_t exists but holds no open file yet.
 */
void
fd_used(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_ofiles[fd];

	KASSERT(mutex_owned(&fdp->fd_lock));
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) == 0);
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(!ff->ff_allocated);

	ff->ff_allocated = 1;
	fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK);
	if (fdp->fd_lomap[off] == ~0) {
		/* Low word is now full: record that in the high map. */
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) == 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK);
	}

	if ((int)fd > fdp->fd_lastfile) {
		fdp->fd_lastfile = fd;
	}

	/* Slots below NDFDFILE are built into the filedesc_t itself. */
	if (fd >= NDFDFILE) {
		fdp->fd_nused++;
	} else {
		KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	}
}
250 
/*
 * Mark descriptor `fd' as free: clear its allocation bits, and update
 * fd_freefile, fd_lastfile and fd_nused.  Inverse of fd_used().
 */
void
fd_unused(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_ofiles[fd];

	/*
	 * Don't assert the lock is held here, as we may be copying
	 * the table during exec() and it is not needed there.
	 * procfs and sysctl are locked out by proc::p_reflock.
	 *
	 * KASSERT(mutex_owned(&fdp->fd_lock));
	 */
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);

	/* This may now be the lowest-numbered free descriptor. */
	if (fd < fdp->fd_freefile) {
		fdp->fd_freefile = fd;
	}

	/* Low word was full: clear its "full" bit in the high map. */
	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) != 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] &=
		    ~(1 << (off & NDENTRYMASK));
	}
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
	ff->ff_allocated = 0;

	KASSERT(fd <= fdp->fd_lastfile);
	if (fd == fdp->fd_lastfile) {
		fdp->fd_lastfile = fd_last_set(fdp, fd);
	}

	/* Slots below NDFDFILE are built-in and not counted in fd_nused. */
	if (fd >= NDFDFILE) {
		KASSERT(fdp->fd_nused > 0);
		fdp->fd_nused--;
	} else {
		KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	}
}
296 
/*
 * Custom version of fd_unused() for fd_copy(), where the descriptor
 * table is not yet fully initialized: only the allocation bitmaps and
 * fd_freefile are updated, no fdfile_t state or counters are touched.
 */
static inline void
fd_zap(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;

	/* This may now be the lowest-numbered free descriptor. */
	if (fd < fdp->fd_freefile) {
		fdp->fd_freefile = fd;
	}

	/* Low word was full: clear its "full" bit in the high map. */
	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) != 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] &=
		    ~(1 << (off & NDENTRYMASK));
	}
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
}
319 
320 bool
321 fd_isused(filedesc_t *fdp, unsigned fd)
322 {
323 	u_int off = fd >> NDENTRYSHIFT;
324 
325 	KASSERT(fd < fdp->fd_nfiles);
326 
327 	return (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0;
328 }
329 
/*
 * Look up the file structure corresponding to a file descriptor
 * and return the file, holding a reference on the descriptor.
 *
 * Runs lockless against the current LWP's table; the per-descriptor
 * reference (ff_refcnt) keeps the slot stable.  Returns NULL if the
 * descriptor is out of range, unallocated, or being closed.
 */
inline file_t *
fd_getfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;

	fdp = curlwp->l_fd;

	/*
	 * Look up the fdfile structure representing this descriptor.
	 * Ensure that we see fd_nfiles before fd_ofiles since we
	 * are doing this unlocked.  See fd_tryexpand().
	 */
	if (__predict_false(fd >= fdp->fd_nfiles)) {
		return NULL;
	}
	membar_consumer();
	ff = fdp->fd_ofiles[fd];
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	if (__predict_false(ff == NULL)) {
		return NULL;
	}

	/*
	 * Now get a reference to the descriptor.   Issue a memory
	 * barrier to ensure that we acquire the file pointer _after_
	 * adding a reference.  If no memory barrier, we could fetch
	 * a stale pointer.
	 */
	atomic_inc_uint(&ff->ff_refcnt);
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_enter();
#endif

	/*
	 * If the file is not open or is being closed then put the
	 * reference back.
	 */
	fp = ff->ff_file;
	if (__predict_true(fp != NULL)) {
		return fp;
	}
	fd_putfile(fd);
	return NULL;
}
380 
/*
 * Release a reference to a file descriptor acquired with fd_getfile().
 * If a concurrent close is draining references (FR_CLOSING set), the
 * caller joins the close via fd_close().
 */
void
fd_putfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	u_int u, v;

	fdp = curlwp->l_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd < fdp->fd_nfiles);
	KASSERT(ff != NULL);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	/*
	 * Ensure that any use of the file is complete and globally
	 * visible before dropping the final reference.  If no membar,
	 * the current CPU could still access memory associated with
	 * the file after it has been freed or recycled by another
	 * CPU.
	 */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_exit();
#endif

	/*
	 * Be optimistic and start out with the assumption that no other
	 * threads are trying to close the descriptor.  If the CAS fails,
	 * we lost a race and/or it's being closed.
	 */
	for (u = ff->ff_refcnt & FR_MASK;; u = v) {
		v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
		if (__predict_true(u == v)) {
			return;
		}
		if (__predict_false((v & FR_CLOSING) != 0)) {
			break;
		}
	}

	/* Another thread is waiting to close the file: join it. */
	(void)fd_close(fd);
}
428 
429 /*
430  * Convenience wrapper around fd_getfile() that returns reference
431  * to a vnode.
432  */
433 int
434 fd_getvnode(unsigned fd, file_t **fpp)
435 {
436 	vnode_t *vp;
437 	file_t *fp;
438 
439 	fp = fd_getfile(fd);
440 	if (__predict_false(fp == NULL)) {
441 		return EBADF;
442 	}
443 	if (__predict_false(fp->f_type != DTYPE_VNODE)) {
444 		fd_putfile(fd);
445 		return EINVAL;
446 	}
447 	vp = fp->f_data;
448 	if (__predict_false(vp->v_type == VBAD)) {
449 		/* XXX Is this case really necessary? */
450 		fd_putfile(fd);
451 		return EBADF;
452 	}
453 	*fpp = fp;
454 	return 0;
455 }
456 
457 /*
458  * Convenience wrapper around fd_getfile() that returns reference
459  * to a socket.
460  */
461 int
462 fd_getsock(unsigned fd, struct socket **sop)
463 {
464 	file_t *fp;
465 
466 	fp = fd_getfile(fd);
467 	if (__predict_false(fp == NULL)) {
468 		return EBADF;
469 	}
470 	if (__predict_false(fp->f_type != DTYPE_SOCKET)) {
471 		fd_putfile(fd);
472 		return ENOTSOCK;
473 	}
474 	*sop = fp->f_data;
475 	return 0;
476 }
477 
478 /*
479  * Look up the file structure corresponding to a file descriptor
480  * and return it with a reference held on the file, not the
481  * descriptor.
482  *
483  * This is heavyweight and only used when accessing descriptors
484  * from a foreign process.  The caller must ensure that `p' does
485  * not exit or fork across this call.
486  *
487  * To release the file (not descriptor) reference, use closef().
488  */
489 file_t *
490 fd_getfile2(proc_t *p, unsigned fd)
491 {
492 	filedesc_t *fdp;
493 	fdfile_t *ff;
494 	file_t *fp;
495 
496 	fdp = p->p_fd;
497 	mutex_enter(&fdp->fd_lock);
498 	if (fd > fdp->fd_nfiles) {
499 		mutex_exit(&fdp->fd_lock);
500 		return NULL;
501 	}
502 	if ((ff = fdp->fd_ofiles[fd]) == NULL) {
503 		mutex_exit(&fdp->fd_lock);
504 		return NULL;
505 	}
506 	mutex_enter(&ff->ff_lock);
507 	if ((fp = ff->ff_file) == NULL) {
508 		mutex_exit(&ff->ff_lock);
509 		mutex_exit(&fdp->fd_lock);
510 		return NULL;
511 	}
512 	mutex_enter(&fp->f_lock);
513 	fp->f_count++;
514 	mutex_exit(&fp->f_lock);
515 	mutex_exit(&ff->ff_lock);
516 	mutex_exit(&fdp->fd_lock);
517 
518 	return fp;
519 }
520 
/*
 * Internal form of close.  Must be called with a reference to the
 * descriptor, and will drop the reference.  When all descriptor
 * references are dropped, releases the descriptor slot and a single
 * reference to the file structure.
 *
 * Returns EBADF if another thread was already closing the descriptor,
 * otherwise the result of closef() on the file.
 */
int
fd_close(unsigned fd)
{
	struct flock lf;
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;
	proc_t *p;
	lwp_t *l;

	l = curlwp;
	p = l->l_proc;
	fdp = l->l_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	mutex_enter(&ff->ff_lock);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	if (ff->ff_file == NULL) {
		/*
		 * Another user of the file is already closing, and is
		 * waiting for other users of the file to drain.  Release
		 * our reference, and wake up the closer.
		 */
		atomic_dec_uint(&ff->ff_refcnt);
		cv_broadcast(&ff->ff_closing);
		mutex_exit(&ff->ff_lock);

		/*
		 * An application error, so pretend that the descriptor
		 * was already closed.  We can't safely wait for it to
		 * be closed without potentially deadlocking.
		 */
		return (EBADF);
	}
	KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);

	/*
	 * There may be multiple users of this file within the process.
	 * Notify existing and new users that the file is closing.  This
	 * will prevent them from adding additional uses to this file
	 * while we are closing it.
	 */
	fp = ff->ff_file;
	ff->ff_file = NULL;
	ff->ff_exclose = false;

	/*
	 * We expect the caller to hold a descriptor reference - drop it.
	 * The reference count may increase beyond zero at this point due
	 * to an erroneous descriptor reference by an application, but
	 * fd_getfile() will notice that the file is being closed and drop
	 * the reference again.
	 */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_producer();
#endif
	if (__predict_false(atomic_dec_uint_nv(&ff->ff_refcnt) != 0)) {
		/*
		 * Wait for other references to drain.  This is typically
		 * an application error - the descriptor is being closed
		 * while still in use.
		 *
		 */
		atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
		/*
		 * Remove any knotes attached to the file.  A knote
		 * attached to the descriptor can hold references on it.
		 */
		if (!SLIST_EMPTY(&ff->ff_knlist)) {
			mutex_exit(&ff->ff_lock);
			knote_fdclose(fd);
			mutex_enter(&ff->ff_lock);
		}
		/*
		 * We need to see the count drop to zero at least once,
		 * in order to ensure that all pre-existing references
		 * have been drained.  New references past this point are
		 * of no interest.
		 */
		while ((ff->ff_refcnt & FR_MASK) != 0) {
			cv_wait(&ff->ff_closing, &ff->ff_lock);
		}
		atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
	} else {
		/* If no references, there must be no knotes. */
		KASSERT(SLIST_EMPTY(&ff->ff_knlist));
	}
	mutex_exit(&ff->ff_lock);

	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor.
	 */
	if ((p->p_flag & PK_ADVLOCK) != 0 && fp->f_type == DTYPE_VNODE) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void)VOP_ADVLOCK(fp->f_data, p, F_UNLCK, &lf, F_POSIX);
	}


	/* Free descriptor slot. */
	mutex_enter(&fdp->fd_lock);
	fd_unused(fdp, fd);
	mutex_exit(&fdp->fd_lock);

	/* Now drop reference to the file itself. */
	return closef(fp);
}
643 
644 /*
645  * Duplicate a file descriptor.
646  */
647 int
648 fd_dup(file_t *fp, int minfd, int *newp, bool exclose)
649 {
650 	proc_t *p;
651 	int error;
652 
653 	p = curproc;
654 
655 	while ((error = fd_alloc(p, minfd, newp)) != 0) {
656 		if (error != ENOSPC) {
657 			return error;
658 		}
659 		fd_tryexpand(p);
660 	}
661 
662 	curlwp->l_fd->fd_ofiles[*newp]->ff_exclose = exclose;
663 	fd_affix(p, fp, *newp);
664 	return 0;
665 }
666 
/*
 * dup2 operation: install a reference to fp at descriptor `new',
 * closing whatever was previously open there.  Always succeeds
 * (returns 0); blocks as needed to expand the table or wait out a
 * half-open descriptor.
 */
int
fd_dup2(file_t *fp, unsigned new)
{
	filedesc_t *fdp;
	fdfile_t *ff;

	fdp = curlwp->l_fd;

	/*
	 * Ensure there are enough slots in the descriptor table,
	 * and allocate an fdfile_t up front in case we need it.
	 */
	while (new >= fdp->fd_nfiles) {
		fd_tryexpand(curproc);
	}
	ff = pool_cache_get(fdfile_cache, PR_WAITOK);

	/*
	 * If there is already a file open, close it.  If the file is
	 * half open, wait for it to be constructed before closing it.
	 * XXX Potential for deadlock here?
	 */
	mutex_enter(&fdp->fd_lock);
	while (fd_isused(fdp, new)) {
		mutex_exit(&fdp->fd_lock);
		if (fd_getfile(new) != NULL) {
			(void)fd_close(new);
		} else {
			/* XXX Crummy, but unlikely to happen. */
			kpause("dup2", false, 1, NULL);
		}
		mutex_enter(&fdp->fd_lock);
	}
	/* Install the preallocated fdfile_t if the slot has none. */
	if (fdp->fd_ofiles[new] == NULL) {
		KASSERT(new >= NDFDFILE);
		fdp->fd_ofiles[new] = ff;
		ff = NULL;
	}
	fd_used(fdp, new);
	mutex_exit(&fdp->fd_lock);

	/* Slot is now allocated.  Insert copy of the file. */
	fd_affix(curproc, fp, new);
	/* Return the spare fdfile_t if it went unused. */
	if (ff != NULL) {
		pool_cache_put(fdfile_cache, ff);
	}
	return 0;
}
718 
719 /*
720  * Drop reference to a file structure.
721  */
722 int
723 closef(file_t *fp)
724 {
725 	struct flock lf;
726 	int error;
727 
728 	/*
729 	 * Drop reference.  If referenced elsewhere it's still open
730 	 * and we have nothing more to do.
731 	 */
732 	mutex_enter(&fp->f_lock);
733 	KASSERT(fp->f_count > 0);
734 	if (--fp->f_count > 0) {
735 		mutex_exit(&fp->f_lock);
736 		return 0;
737 	}
738 	KASSERT(fp->f_count == 0);
739 	mutex_exit(&fp->f_lock);
740 
741 	/* We held the last reference - release locks, close and free. */
742         if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
743         	lf.l_whence = SEEK_SET;
744 		lf.l_start = 0;
745 		lf.l_len = 0;
746 		lf.l_type = F_UNLCK;
747 		(void)VOP_ADVLOCK(fp->f_data, fp, F_UNLCK, &lf, F_FLOCK);
748 	}
749 	if (fp->f_ops != NULL) {
750 		error = (*fp->f_ops->fo_close)(fp);
751 	} else {
752 		error = 0;
753 	}
754 	ffree(fp);
755 
756 	return error;
757 }
758 
/*
 * Allocate a file descriptor for the process, at or above `want'.
 * On success returns 0 with the descriptor in *result and its slot
 * marked used (but no file installed yet).  Returns ENOSPC when the
 * current table is full but may be expanded (caller should call
 * fd_tryexpand() and retry), or EMFILE at the resource limit.
 */
int
fd_alloc(proc_t *p, int want, int *result)
{
	filedesc_t *fdp;
	int i, lim, last, error;
	u_int off, new;
	fdfile_t *ff;

	KASSERT(p == curproc || p == &proc0);

	/* Preallocate an fdfile_t before taking fd_lock. */
	fdp = p->p_fd;
	ff = pool_cache_get(fdfile_cache, PR_WAITOK);
	KASSERT(ff->ff_refcnt == 0);
	KASSERT(ff->ff_file == NULL);

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.
	 */
	mutex_enter(&fdp->fd_lock);
	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
	last = min(fdp->fd_nfiles, lim);
	for (;;) {
		if ((i = want) < fdp->fd_freefile)
			i = fdp->fd_freefile;
		/* High map: find a low-map word with a free bit. */
		off = i >> NDENTRYSHIFT;
		new = fd_next_zero(fdp, fdp->fd_himap, off,
		    (last + NDENTRIES - 1) >> NDENTRYSHIFT);
		if (new == -1)
			break;
		/* Low map: find the free bit within that word. */
		i = fd_next_zero(fdp, &fdp->fd_lomap[new],
		    new > off ? 0 : i & NDENTRYMASK, NDENTRIES);
		if (i == -1) {
			/*
			 * Free file descriptor in this block was
			 * below want, try again with higher want.
			 */
			want = (new + 1) << NDENTRYSHIFT;
			continue;
		}
		i += (new << NDENTRYSHIFT);
		if (i >= last) {
			break;
		}
		/* Install the preallocated fdfile_t if slot is empty. */
		if (fdp->fd_ofiles[i] == NULL) {
			KASSERT(i >= NDFDFILE);
			fdp->fd_ofiles[i] = ff;
		} else {
			pool_cache_put(fdfile_cache, ff);
		}
		KASSERT(fdp->fd_ofiles[i]->ff_file == NULL);
		fd_used(fdp, i);
		if (want <= fdp->fd_freefile) {
			fdp->fd_freefile = i;
		}
		*result = i;
		mutex_exit(&fdp->fd_lock);
		KASSERT(i >= NDFDFILE ||
		    fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
		return 0;
	}

	/* No space in current array.  Let the caller expand and retry. */
	error = (fdp->fd_nfiles >= lim) ? EMFILE : ENOSPC;
	mutex_exit(&fdp->fd_lock);
	pool_cache_put(fdfile_cache, ff);
	return error;
}
831 
/*
 * Expand a process' descriptor table: allocate a larger ofiles array
 * (and bitmaps if needed) outside the lock, then swap them in if no
 * other thread expanded the table first.  Safe against concurrent
 * lockless readers (fd_getfile()) via the membar before fd_nfiles.
 */
void
fd_tryexpand(proc_t *p)
{
	filedesc_t *fdp;
	int i, numfiles, oldnfiles;
	fdfile_t **newofile;
	uint32_t *newhimap, *newlomap;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	newhimap = NULL;
	newlomap = NULL;
	oldnfiles = fdp->fd_nfiles;

	/* Double the table size, with NDEXTENT as the minimum. */
	if (oldnfiles < NDEXTENT)
		numfiles = NDEXTENT;
	else
		numfiles = 2 * oldnfiles;

	/* Allocate everything before taking fd_lock (may sleep). */
	newofile = malloc(numfiles * sizeof(fdfile_t *), M_FILEDESC, M_WAITOK);
	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		newhimap = malloc(NDHISLOTS(numfiles) *
		    sizeof(uint32_t), M_FILEDESC, M_WAITOK);
		newlomap = malloc(NDLOSLOTS(numfiles) *
		    sizeof(uint32_t), M_FILEDESC, M_WAITOK);
	}

	mutex_enter(&fdp->fd_lock);
	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	if (fdp->fd_nfiles != oldnfiles) {
		/* fdp changed; caller must retry */
		mutex_exit(&fdp->fd_lock);
		free(newofile, M_FILEDESC);
		if (newhimap != NULL)
			free(newhimap, M_FILEDESC);
		if (newlomap != NULL)
			free(newlomap, M_FILEDESC);
		return;
	}

	/* Copy the existing ofile array and zero the new portion. */
	i = sizeof(fdfile_t *) * fdp->fd_nfiles;
	memcpy(newofile, fdp->fd_ofiles, i);
	memset((uint8_t *)newofile + i, 0, numfiles * sizeof(fdfile_t *) - i);

	/*
	 * Link old ofiles array into list to be discarded.  We defer
	 * freeing until process exit if the descriptor table is visible
	 * to other threads.
	 */
	if (oldnfiles > NDFILE) {
		if ((fdp->fd_refcnt | p->p_nlwps) > 1) {
			*(void **)fdp->fd_ofiles = fdp->fd_discard;
			fdp->fd_discard = fdp->fd_ofiles;
		} else {
			free(fdp->fd_ofiles, M_FILEDESC);
		}
	}

	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		/* Grow the bitmaps the same way as the ofiles array. */
		i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newhimap, fdp->fd_himap, i);
		memset((uint8_t *)newhimap + i, 0,
		    NDHISLOTS(numfiles) * sizeof(uint32_t) - i);

		i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newlomap, fdp->fd_lomap, i);
		memset((uint8_t *)newlomap + i, 0,
		    NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);

		/* Built-in maps (table <= NDFILE) are never freed. */
		if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
			free(fdp->fd_himap, M_FILEDESC);
			free(fdp->fd_lomap, M_FILEDESC);
		}
		fdp->fd_himap = newhimap;
		fdp->fd_lomap = newlomap;
	}

	/*
	 * All other modifications must become globally visible before
	 * the change to fd_nfiles.  See fd_getfile().
	 */
	fdp->fd_ofiles = newofile;
	membar_producer();
	fdp->fd_nfiles = numfiles;
	mutex_exit(&fdp->fd_lock);

	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
}
925 
926 /*
927  * Create a new open file structure and allocate a file descriptor
928  * for the current process.
929  */
930 int
931 fd_allocfile(file_t **resultfp, int *resultfd)
932 {
933 	file_t *fp;
934 	proc_t *p;
935 	int error;
936 
937 	p = curproc;
938 
939 	while ((error = fd_alloc(p, 0, resultfd)) != 0) {
940 		if (error != ENOSPC) {
941 			return error;
942 		}
943 		fd_tryexpand(p);
944 	}
945 
946 	fp = pool_cache_get(file_cache, PR_WAITOK);
947 	KASSERT(fp->f_count == 0);
948 	fp->f_cred = kauth_cred_get();
949 	kauth_cred_hold(fp->f_cred);
950 
951 	if (__predict_false(atomic_inc_uint_nv(&nfiles) >= maxfiles)) {
952 		fd_abort(p, fp, *resultfd);
953 		tablefull("file", "increase kern.maxfiles or MAXFILES");
954 		return ENFILE;
955 	}
956 
957 	fp->f_advice = 0;
958 	fp->f_msgcount = 0;
959 	fp->f_offset = 0;
960 	fp->f_iflags = 0;
961 	*resultfp = fp;
962 
963 	return 0;
964 }
965 
966 /*
967  * Successful creation of a new descriptor: make visible to the process.
968  */
969 void
970 fd_affix(proc_t *p, file_t *fp, unsigned fd)
971 {
972 	fdfile_t *ff;
973 	filedesc_t *fdp;
974 
975 	KASSERT(p == curproc || p == &proc0);
976 
977 	/* Add a reference to the file structure. */
978 	mutex_enter(&fp->f_lock);
979 	fp->f_count++;
980 	mutex_exit(&fp->f_lock);
981 
982 	/*
983 	 * Insert the new file into the descriptor slot.
984 	 *
985 	 * The memory barriers provided by lock activity in this routine
986 	 * ensure that any updates to the file structure become globally
987 	 * visible before the file becomes visible to other LWPs in the
988 	 * current process.
989 	 */
990 	fdp = p->p_fd;
991 	ff = fdp->fd_ofiles[fd];
992 
993 	KASSERT(ff != NULL);
994 	KASSERT(ff->ff_file == NULL);
995 	KASSERT(ff->ff_allocated);
996 	KASSERT(fd_isused(fdp, fd));
997 	KASSERT(fd >= NDFDFILE ||
998 	    fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);
999 
1000 	/* No need to lock in order to make file initially visible. */
1001 	ff->ff_file = fp;
1002 }
1003 
1004 /*
1005  * Abort creation of a new descriptor: free descriptor slot and file.
1006  */
1007 void
1008 fd_abort(proc_t *p, file_t *fp, unsigned fd)
1009 {
1010 	filedesc_t *fdp;
1011 	fdfile_t *ff;
1012 
1013 	KASSERT(p == curproc || p == &proc0);
1014 
1015 	fdp = p->p_fd;
1016 	ff = fdp->fd_ofiles[fd];
1017 
1018 	KASSERT(fd >= NDFDFILE ||
1019 	    fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);
1020 
1021 	mutex_enter(&fdp->fd_lock);
1022 	KASSERT(fd_isused(fdp, fd));
1023 	fd_unused(fdp, fd);
1024 	mutex_exit(&fdp->fd_lock);
1025 
1026 	if (fp != NULL) {
1027 		ffree(fp);
1028 	}
1029 }
1030 
1031 /*
1032  * Free a file descriptor.
1033  */
1034 void
1035 ffree(file_t *fp)
1036 {
1037 
1038 	KASSERT(fp->f_count == 0);
1039 
1040 	atomic_dec_uint(&nfiles);
1041 	kauth_cred_free(fp->f_cred);
1042 	pool_cache_put(file_cache, fp);
1043 }
1044 
1045 /*
1046  * Create an initial cwdinfo structure, using the same current and root
1047  * directories as curproc.
1048  */
1049 struct cwdinfo *
1050 cwdinit(void)
1051 {
1052 	struct cwdinfo *cwdi;
1053 	struct cwdinfo *copy;
1054 
1055 	cwdi = pool_cache_get(cwdi_cache, PR_WAITOK);
1056 	copy = curproc->p_cwdi;
1057 
1058 	rw_enter(&copy->cwdi_lock, RW_READER);
1059 	cwdi->cwdi_cdir = copy->cwdi_cdir;
1060 	if (cwdi->cwdi_cdir)
1061 		VREF(cwdi->cwdi_cdir);
1062 	cwdi->cwdi_rdir = copy->cwdi_rdir;
1063 	if (cwdi->cwdi_rdir)
1064 		VREF(cwdi->cwdi_rdir);
1065 	cwdi->cwdi_edir = copy->cwdi_edir;
1066 	if (cwdi->cwdi_edir)
1067 		VREF(cwdi->cwdi_edir);
1068 	cwdi->cwdi_cmask =  copy->cwdi_cmask;
1069 	cwdi->cwdi_refcnt = 1;
1070 	rw_exit(&copy->cwdi_lock);
1071 
1072 	return (cwdi);
1073 }
1074 
1075 static int
1076 cwdi_ctor(void *arg, void *obj, int flags)
1077 {
1078 	struct cwdinfo *cwdi = obj;
1079 
1080 	rw_init(&cwdi->cwdi_lock);
1081 
1082 	return 0;
1083 }
1084 
1085 static void
1086 cwdi_dtor(void *arg, void *obj)
1087 {
1088 	struct cwdinfo *cwdi = obj;
1089 
1090 	rw_destroy(&cwdi->cwdi_lock);
1091 }
1092 
1093 static int
1094 file_ctor(void *arg, void *obj, int flags)
1095 {
1096 	file_t *fp = obj;
1097 
1098 	memset(fp, 0, sizeof(*fp));
1099 	mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1100 
1101 	mutex_enter(&filelist_lock);
1102 	LIST_INSERT_HEAD(&filehead, fp, f_list);
1103 	mutex_exit(&filelist_lock);
1104 
1105 	return 0;
1106 }
1107 
/*
 * Pool cache destructor for file structures: unlink from the global
 * file list, then destroy the object's lock.
 */
static void
file_dtor(void *arg, void *obj)
{
	file_t *fp = obj;

	mutex_enter(&filelist_lock);
	LIST_REMOVE(fp, f_list);
	mutex_exit(&filelist_lock);

	mutex_destroy(&fp->f_lock);
}
1119 
/*
 * Pool cache constructor for fdfile structures: zero the object and
 * initialize the per-descriptor lock and the condition variable used
 * to wait for a close in progress to complete.
 */
static int
fdfile_ctor(void *arg, void *obj, int flags)
{
	fdfile_t *ff = obj;

	memset(ff, 0, sizeof(*ff));
	mutex_init(&ff->ff_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&ff->ff_closing, "fdclose");

	return 0;
}
1131 
1132 static void
1133 fdfile_dtor(void *arg, void *obj)
1134 {
1135 	fdfile_t *ff = obj;
1136 
1137 	mutex_destroy(&ff->ff_lock);
1138 	cv_destroy(&ff->ff_closing);
1139 }
1140 
1141 file_t *
1142 fgetdummy(void)
1143 {
1144 	file_t *fp;
1145 
1146 	fp = kmem_alloc(sizeof(*fp), KM_SLEEP);
1147 	if (fp != NULL) {
1148 		memset(fp, 0, sizeof(*fp));
1149 		mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1150 	}
1151 	return fp;
1152 }
1153 
/*
 * Release a file_t allocated by fgetdummy().
 */
void
fputdummy(file_t *fp)
{

	mutex_destroy(&fp->f_lock);
	kmem_free(fp, sizeof(*fp));
}
1161 
1162 /*
1163  * Make p2 share p1's cwdinfo.
1164  */
1165 void
1166 cwdshare(struct proc *p2)
1167 {
1168 	struct cwdinfo *cwdi;
1169 
1170 	cwdi = curproc->p_cwdi;
1171 
1172 	atomic_inc_uint(&cwdi->cwdi_refcnt);
1173 	p2->p_cwdi = cwdi;
1174 }
1175 
1176 /*
1177  * Release a cwdinfo structure.
1178  */
1179 void
1180 cwdfree(struct cwdinfo *cwdi)
1181 {
1182 
1183 	if (atomic_dec_uint_nv(&cwdi->cwdi_refcnt) > 0)
1184 		return;
1185 
1186 	vrele(cwdi->cwdi_cdir);
1187 	if (cwdi->cwdi_rdir)
1188 		vrele(cwdi->cwdi_rdir);
1189 	if (cwdi->cwdi_edir)
1190 		vrele(cwdi->cwdi_edir);
1191 	pool_cache_put(cwdi_cache, cwdi);
1192 }
1193 
1194 /*
1195  * Create an initial filedesc structure.
1196  */
1197 filedesc_t *
1198 fd_init(filedesc_t *fdp)
1199 {
1200 	unsigned fd;
1201 
1202 	if (fdp == NULL) {
1203 		fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1204 	} else {
1205 		filedesc_ctor(NULL, fdp, PR_WAITOK);
1206 	}
1207 
1208 	fdp->fd_refcnt = 1;
1209 	fdp->fd_ofiles = fdp->fd_dfiles;
1210 	fdp->fd_nfiles = NDFILE;
1211 	fdp->fd_himap = fdp->fd_dhimap;
1212 	fdp->fd_lomap = fdp->fd_dlomap;
1213 	KASSERT(fdp->fd_lastfile == -1);
1214 	KASSERT(fdp->fd_lastkqfile == -1);
1215 	KASSERT(fdp->fd_knhash == NULL);
1216 
1217 	memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
1218 	    offsetof(filedesc_t, fd_startzero));
1219 	for (fd = 0; fd < NDFDFILE; fd++) {
1220 		fdp->fd_ofiles[fd] = (fdfile_t *)fdp->fd_dfdfile[fd];
1221 	}
1222 
1223 	return fdp;
1224 }
1225 
1226 /*
1227  * Initialize a file descriptor table.
1228  */
1229 static int
1230 filedesc_ctor(void *arg, void *obj, int flag)
1231 {
1232 	filedesc_t *fdp = obj;
1233 	int i;
1234 
1235 	memset(fdp, 0, sizeof(*fdp));
1236 	mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
1237 	fdp->fd_lastfile = -1;
1238 	fdp->fd_lastkqfile = -1;
1239 
1240 	CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
1241 	for (i = 0; i < NDFDFILE; i++) {
1242 		fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK);
1243 	}
1244 
1245 	return 0;
1246 }
1247 
1248 static void
1249 filedesc_dtor(void *arg, void *obj)
1250 {
1251 	filedesc_t *fdp = obj;
1252 	int i;
1253 
1254 	for (i = 0; i < NDFDFILE; i++) {
1255 		fdfile_dtor(NULL, fdp->fd_dfdfile[i]);
1256 	}
1257 
1258 	mutex_destroy(&fdp->fd_lock);
1259 }
1260 
1261 /*
1262  * Make p2 share p1's filedesc structure.
1263  */
1264 void
1265 fd_share(struct proc *p2)
1266 {
1267 	filedesc_t *fdp;
1268 
1269 	fdp = curlwp->l_fd;
1270 	p2->p_fd = fdp;
1271 	atomic_inc_uint(&fdp->fd_refcnt);
1272 }
1273 
1274 /*
1275  * Copy a filedesc structure.
1276  */
1277 filedesc_t *
1278 fd_copy(void)
1279 {
1280 	filedesc_t *newfdp, *fdp;
1281 	fdfile_t *ff, *fflist, **ffp, **nffp, *ff2;
1282 	int i, nused, numfiles, lastfile, j, newlast;
1283 	file_t *fp;
1284 
1285 	fdp = curproc->p_fd;
1286 	newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1287 	newfdp->fd_refcnt = 1;
1288 
1289 	KASSERT(newfdp->fd_knhash == NULL);
1290 	KASSERT(newfdp->fd_knhashmask == 0);
1291 	KASSERT(newfdp->fd_discard == NULL);
1292 
1293 	for (;;) {
1294 		numfiles = fdp->fd_nfiles;
1295 		lastfile = fdp->fd_lastfile;
1296 
1297 		/*
1298 		 * If the number of open files fits in the internal arrays
1299 		 * of the open file structure, use them, otherwise allocate
1300 		 * additional memory for the number of descriptors currently
1301 		 * in use.
1302 		 */
1303 		if (lastfile < NDFILE) {
1304 			i = NDFILE;
1305 			newfdp->fd_ofiles = newfdp->fd_dfiles;
1306 		} else {
1307 			/*
1308 			 * Compute the smallest multiple of NDEXTENT needed
1309 			 * for the file descriptors currently in use,
1310 			 * allowing the table to shrink.
1311 			 */
1312 			i = numfiles;
1313 			while (i >= 2 * NDEXTENT && i > lastfile * 2) {
1314 				i /= 2;
1315 			}
1316 			newfdp->fd_ofiles = malloc(i * sizeof(fdfile_t *),
1317 			    M_FILEDESC, M_WAITOK);
1318 			KASSERT(i >= NDFILE);
1319 		}
1320 		if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
1321 			newfdp->fd_himap = newfdp->fd_dhimap;
1322 			newfdp->fd_lomap = newfdp->fd_dlomap;
1323 		} else {
1324 			newfdp->fd_himap = malloc(NDHISLOTS(i) *
1325 			    sizeof(uint32_t), M_FILEDESC, M_WAITOK);
1326 			newfdp->fd_lomap = malloc(NDLOSLOTS(i) *
1327 			    sizeof(uint32_t), M_FILEDESC, M_WAITOK);
1328 		}
1329 
1330 		/*
1331 		 * Allocate and string together fdfile structures.
1332 		 * We abuse fdfile_t::ff_file here, but it will be
1333 		 * cleared before this routine returns.
1334 		 */
1335 		nused = fdp->fd_nused;
1336 		fflist = NULL;
1337 		for (j = nused; j != 0; j--) {
1338 			ff = pool_cache_get(fdfile_cache, PR_WAITOK);
1339 			ff->ff_file = (void *)fflist;
1340 			fflist = ff;
1341 		}
1342 
1343 		mutex_enter(&fdp->fd_lock);
1344 		if (numfiles == fdp->fd_nfiles && nused == fdp->fd_nused &&
1345 		    lastfile == fdp->fd_lastfile) {
1346 			break;
1347 		}
1348 		mutex_exit(&fdp->fd_lock);
1349 		if (i >= NDFILE) {
1350 			free(newfdp->fd_ofiles, M_FILEDESC);
1351 		}
1352 		if (NDHISLOTS(i) > NDHISLOTS(NDFILE)) {
1353 			free(newfdp->fd_himap, M_FILEDESC);
1354 			free(newfdp->fd_lomap, M_FILEDESC);
1355 		}
1356 		while (fflist != NULL) {
1357 			ff = fflist;
1358 			fflist = (void *)ff->ff_file;
1359 			ff->ff_file = NULL;
1360 			pool_cache_put(fdfile_cache, ff);
1361 		}
1362 	}
1363 
1364 	newfdp->fd_nfiles = i;
1365 	newfdp->fd_freefile = fdp->fd_freefile;
1366 	newfdp->fd_exclose = fdp->fd_exclose;
1367 
1368 	/*
1369 	 * Clear the entries that will not be copied over.
1370 	 * Avoid calling memset with 0 size.
1371 	 */
1372 	if (lastfile < (i-1)) {
1373 		memset(newfdp->fd_ofiles + lastfile + 1, 0,
1374 		    (i - lastfile - 1) * sizeof(file_t **));
1375 	}
1376 	if (i < NDENTRIES * NDENTRIES) {
1377 		i = NDENTRIES * NDENTRIES; /* size of inlined bitmaps */
1378 	}
1379 	memcpy(newfdp->fd_himap, fdp->fd_himap, NDHISLOTS(i)*sizeof(uint32_t));
1380 	memcpy(newfdp->fd_lomap, fdp->fd_lomap, NDLOSLOTS(i)*sizeof(uint32_t));
1381 
1382 	ffp = fdp->fd_ofiles;
1383 	nffp = newfdp->fd_ofiles;
1384 	j = imax(lastfile, (NDFDFILE - 1));
1385 	newlast = -1;
1386 	KASSERT(j < fdp->fd_nfiles);
1387 	for (i = 0; i <= j; i++, ffp++, *nffp++ = ff2) {
1388 		ff = *ffp;
1389 		/* Install built-in fdfiles even if unused here. */
1390 		if (i < NDFDFILE) {
1391 			ff2 = (fdfile_t *)newfdp->fd_dfdfile[i];
1392 		} else {
1393 			ff2 = NULL;
1394 		}
1395 		/* Determine if descriptor is active in parent. */
1396 		if (ff == NULL || !fd_isused(fdp, i)) {
1397 			KASSERT(ff != NULL || i >= NDFDFILE);
1398 			continue;
1399 		}
1400 		mutex_enter(&ff->ff_lock);
1401 		fp = ff->ff_file;
1402 		if (fp == NULL) {
1403 			/* Descriptor is half-open: free slot. */
1404 			fd_zap(newfdp, i);
1405 			mutex_exit(&ff->ff_lock);
1406 			continue;
1407 		}
1408 		if (fp->f_type == DTYPE_KQUEUE) {
1409 			/* kqueue descriptors cannot be copied. */
1410 			fd_zap(newfdp, i);
1411 			mutex_exit(&ff->ff_lock);
1412 			continue;
1413 		}
1414 		/* It's active: add a reference to the file. */
1415 		mutex_enter(&fp->f_lock);
1416 		fp->f_count++;
1417 		mutex_exit(&fp->f_lock);
1418 		/* Consume one fdfile_t to represent it. */
1419 		if (i >= NDFDFILE) {
1420 			ff2 = fflist;
1421 			fflist = (void *)ff2->ff_file;
1422 		}
1423 		ff2->ff_file = fp;
1424 		ff2->ff_exclose = ff->ff_exclose;
1425 		ff2->ff_allocated = true;
1426 		mutex_exit(&ff->ff_lock);
1427 		if (i > newlast) {
1428 			newlast = i;
1429 		}
1430 	}
1431 	mutex_exit(&fdp->fd_lock);
1432 
1433 	/* Discard unused fdfile_t structures. */
1434 	while (__predict_false(fflist != NULL)) {
1435 		ff = fflist;
1436 		fflist = (void *)ff->ff_file;
1437 		ff->ff_file = NULL;
1438 		pool_cache_put(fdfile_cache, ff);
1439 		nused--;
1440 	}
1441 	KASSERT(nused >= 0);
1442 	KASSERT(newfdp->fd_ofiles[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
1443 
1444 	newfdp->fd_nused = nused;
1445 	newfdp->fd_lastfile = newlast;
1446 
1447 	return (newfdp);
1448 }
1449 
1450 /*
1451  * Release a filedesc structure.
1452  */
1453 void
1454 fd_free(void)
1455 {
1456 	filedesc_t *fdp;
1457 	fdfile_t *ff;
1458 	file_t *fp;
1459 	int fd, lastfd;
1460 	void *discard;
1461 
1462 	fdp = curlwp->l_fd;
1463 
1464 	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
1465 
1466 	if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
1467 		return;
1468 
1469 	/*
1470 	 * Close any files that the process holds open.
1471 	 */
1472 	for (fd = 0, lastfd = fdp->fd_nfiles - 1; fd <= lastfd; fd++) {
1473 		ff = fdp->fd_ofiles[fd];
1474 		KASSERT(fd >= NDFDFILE ||
1475 		    ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1476 		if ((ff = fdp->fd_ofiles[fd]) == NULL)
1477 			continue;
1478 		if ((fp = ff->ff_file) != NULL) {
1479 			/*
1480 			 * Must use fd_close() here as kqueue holds
1481 			 * long term references to descriptors.
1482 			 */
1483 			ff->ff_refcnt++;
1484 			fd_close(fd);
1485 		}
1486 		KASSERT(ff->ff_refcnt == 0);
1487 		KASSERT(ff->ff_file == NULL);
1488 		KASSERT(!ff->ff_exclose);
1489 		KASSERT(!ff->ff_allocated);
1490 		if (fd >= NDFDFILE) {
1491 			pool_cache_put(fdfile_cache, ff);
1492 		}
1493 	}
1494 
1495 	/*
1496 	 * Clean out the descriptor table for the next user and return
1497 	 * to the cache.
1498 	 */
1499 	while ((discard = fdp->fd_discard) != NULL) {
1500 		KASSERT(discard != fdp->fd_ofiles);
1501 		fdp->fd_discard = *(void **)discard;
1502 		free(discard, M_FILEDESC);
1503 	}
1504 	if (NDHISLOTS(fdp->fd_nfiles) > NDHISLOTS(NDFILE)) {
1505 		KASSERT(fdp->fd_himap != fdp->fd_dhimap);
1506 		KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
1507 		free(fdp->fd_himap, M_FILEDESC);
1508 		free(fdp->fd_lomap, M_FILEDESC);
1509 	}
1510 	if (fdp->fd_nfiles > NDFILE) {
1511 		KASSERT(fdp->fd_ofiles != fdp->fd_dfiles);
1512 		free(fdp->fd_ofiles, M_FILEDESC);
1513 	}
1514 	if (fdp->fd_knhash != NULL) {
1515 		hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask);
1516 		fdp->fd_knhash = NULL;
1517 		fdp->fd_knhashmask = 0;
1518 	} else {
1519 		KASSERT(fdp->fd_knhashmask == 0);
1520 	}
1521 	fdp->fd_lastkqfile = -1;
1522 	pool_cache_put(filedesc_cache, fdp);
1523 }
1524 
1525 /*
1526  * File Descriptor pseudo-device driver (/dev/fd/).
1527  *
1528  * Opening minor device N dup()s the file (if any) connected to file
1529  * descriptor N belonging to the calling process.  Note that this driver
1530  * consists of only the ``open()'' routine, because all subsequent
1531  * references to this file will be direct to the other driver.
1532  */
1533 static int
1534 filedescopen(dev_t dev, int mode, int type, lwp_t *l)
1535 {
1536 
1537 	/*
1538 	 * XXX Kludge: set dupfd to contain the value of the
1539 	 * the file descriptor being sought for duplication. The error
1540 	 * return ensures that the vnode for this device will be released
1541 	 * by vn_open. Open will detect this special error and take the
1542 	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
1543 	 * will simply report the error.
1544 	 */
1545 	l->l_dupfd = minor(dev);	/* XXX */
1546 	return EDUPFD;
1547 }
1548 
1549 /*
1550  * Duplicate the specified descriptor to a free descriptor.
1551  */
1552 int
1553 fd_dupopen(int old, int *new, int mode, int error)
1554 {
1555 	filedesc_t *fdp;
1556 	fdfile_t *ff;
1557 	file_t *fp;
1558 
1559 	if ((fp = fd_getfile(old)) == NULL) {
1560 		return EBADF;
1561 	}
1562 	fdp = curlwp->l_fd;
1563 	ff = fdp->fd_ofiles[old];
1564 
1565 	/*
1566 	 * There are two cases of interest here.
1567 	 *
1568 	 * For EDUPFD simply dup (dfd) to file descriptor
1569 	 * (indx) and return.
1570 	 *
1571 	 * For EMOVEFD steal away the file structure from (dfd) and
1572 	 * store it in (indx).  (dfd) is effectively closed by
1573 	 * this operation.
1574 	 *
1575 	 * Any other error code is just returned.
1576 	 */
1577 	switch (error) {
1578 	case EDUPFD:
1579 		/*
1580 		 * Check that the mode the file is being opened for is a
1581 		 * subset of the mode of the existing descriptor.
1582 		 */
1583 		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
1584 			error = EACCES;
1585 			break;
1586 		}
1587 
1588 		/* Copy it. */
1589 		error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1590 		break;
1591 
1592 	case EMOVEFD:
1593 		/* Copy it. */
1594 		error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1595 		if (error != 0) {
1596 			break;
1597 		}
1598 
1599 		/* Steal away the file pointer from 'old'. */
1600 		(void)fd_close(old);
1601 		return 0;
1602 	}
1603 
1604 	fd_putfile(old);
1605 	return error;
1606 }
1607 
1608 /*
1609  * Close open files on exec.
1610  */
1611 void
1612 fd_closeexec(void)
1613 {
1614 	struct cwdinfo *cwdi;
1615 	proc_t *p;
1616 	filedesc_t *fdp;
1617 	fdfile_t *ff;
1618 	lwp_t *l;
1619 	int fd;
1620 
1621 	l = curlwp;
1622 	p = l->l_proc;
1623 	fdp = p->p_fd;
1624 	cwdi = p->p_cwdi;
1625 
1626 	if (cwdi->cwdi_refcnt > 1) {
1627 		cwdi = cwdinit();
1628 		cwdfree(p->p_cwdi);
1629 		p->p_cwdi = cwdi;
1630 	}
1631 	if (p->p_cwdi->cwdi_edir) {
1632 		vrele(p->p_cwdi->cwdi_edir);
1633 	}
1634 
1635 	if (fdp->fd_refcnt > 1) {
1636 		fdp = fd_copy();
1637 		fd_free();
1638 		p->p_fd = fdp;
1639 		l->l_fd = fdp;
1640 	}
1641 	if (!fdp->fd_exclose) {
1642 		return;
1643 	}
1644 	fdp->fd_exclose = false;
1645 
1646 	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
1647 		if ((ff = fdp->fd_ofiles[fd]) == NULL) {
1648 			KASSERT(fd >= NDFDFILE);
1649 			continue;
1650 		}
1651 		KASSERT(fd >= NDFDFILE ||
1652 		    ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1653 		if (ff->ff_file == NULL)
1654 			continue;
1655 		if (ff->ff_exclose) {
1656 			/*
1657 			 * We need a reference to close the file.
1658 			 * No other threads can see the fdfile_t at
1659 			 * this point, so don't bother locking.
1660 			 */
1661 			KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
1662 			ff->ff_refcnt++;
1663 			fd_close(fd);
1664 		}
1665 	}
1666 }
1667 
1668 /*
1669  * It is unsafe for set[ug]id processes to be started with file
1670  * descriptors 0..2 closed, as these descriptors are given implicit
1671  * significance in the Standard C library.  fdcheckstd() will create a
1672  * descriptor referencing /dev/null for each of stdin, stdout, and
1673  * stderr that is not already open.
1674  */
1675 #define CHECK_UPTO 3
1676 int
1677 fd_checkstd(void)
1678 {
1679 	struct proc *p;
1680 	struct nameidata nd;
1681 	filedesc_t *fdp;
1682 	file_t *fp;
1683 	struct proc *pp;
1684 	int fd, i, error, flags = FREAD|FWRITE;
1685 	char closed[CHECK_UPTO * 3 + 1], which[3 + 1];
1686 
1687 	p = curproc;
1688 	closed[0] = '\0';
1689 	if ((fdp = p->p_fd) == NULL)
1690 		return (0);
1691 	for (i = 0; i < CHECK_UPTO; i++) {
1692 		KASSERT(i >= NDFDFILE ||
1693 		    fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
1694 		if (fdp->fd_ofiles[i]->ff_file != NULL)
1695 			continue;
1696 		snprintf(which, sizeof(which), ",%d", i);
1697 		strlcat(closed, which, sizeof(closed));
1698 		if ((error = fd_allocfile(&fp, &fd)) != 0)
1699 			return (error);
1700 		KASSERT(fd < CHECK_UPTO);
1701 		NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null");
1702 		if ((error = vn_open(&nd, flags, 0)) != 0) {
1703 			fd_abort(p, fp, fd);
1704 			return (error);
1705 		}
1706 		fp->f_data = nd.ni_vp;
1707 		fp->f_flag = flags;
1708 		fp->f_ops = &vnops;
1709 		fp->f_type = DTYPE_VNODE;
1710 		VOP_UNLOCK(nd.ni_vp, 0);
1711 		fd_affix(p, fp, fd);
1712 	}
1713 	if (closed[0] != '\0') {
1714 		mutex_enter(proc_lock);
1715 		pp = p->p_pptr;
1716 		mutex_enter(pp->p_lock);
1717 		log(LOG_WARNING, "set{u,g}id pid %d (%s) "
1718 		    "was invoked by uid %d ppid %d (%s) "
1719 		    "with fd %s closed\n",
1720 		    p->p_pid, p->p_comm, kauth_cred_geteuid(pp->p_cred),
1721 		    pp->p_pid, pp->p_comm, &closed[1]);
1722 		mutex_exit(pp->p_lock);
1723 		mutex_exit(proc_lock);
1724 	}
1725 	return (0);
1726 }
1727 #undef CHECK_UPTO
1728 
1729 /*
1730  * Sets descriptor owner. If the owner is a process, 'pgid'
1731  * is set to positive value, process ID. If the owner is process group,
1732  * 'pgid' is set to -pg_id.
1733  */
1734 int
1735 fsetown(pid_t *pgid, u_long cmd, const void *data)
1736 {
1737 	int id = *(const int *)data;
1738 	int error;
1739 
1740 	switch (cmd) {
1741 	case TIOCSPGRP:
1742 		if (id < 0)
1743 			return (EINVAL);
1744 		id = -id;
1745 		break;
1746 	default:
1747 		break;
1748 	}
1749 
1750 	if (id > 0 && !pfind(id))
1751 		return (ESRCH);
1752 	else if (id < 0 && (error = pgid_in_session(curproc, -id)))
1753 		return (error);
1754 
1755 	*pgid = id;
1756 	return (0);
1757 }
1758 
1759 /*
1760  * Return descriptor owner information. If the value is positive,
1761  * it's process ID. If it's negative, it's process group ID and
1762  * needs the sign removed before use.
1763  */
1764 int
1765 fgetown(pid_t pgid, u_long cmd, void *data)
1766 {
1767 
1768 	switch (cmd) {
1769 	case TIOCGPGRP:
1770 		*(int *)data = -pgid;
1771 		break;
1772 	default:
1773 		*(int *)data = pgid;
1774 		break;
1775 	}
1776 	return (0);
1777 }
1778 
1779 /*
1780  * Send signal to descriptor owner, either process or process group.
1781  */
1782 void
1783 fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
1784 {
1785 	struct proc *p1;
1786 	struct pgrp *pgrp;
1787 	ksiginfo_t ksi;
1788 
1789 	KASSERT(!cpu_intr_p());
1790 
1791 	KSI_INIT(&ksi);
1792 	ksi.ksi_signo = signo;
1793 	ksi.ksi_code = code;
1794 	ksi.ksi_band = band;
1795 
1796 	mutex_enter(proc_lock);
1797 	if (pgid > 0 && (p1 = p_find(pgid, PFIND_LOCKED)))
1798 		kpsignal(p1, &ksi, fdescdata);
1799 	else if (pgid < 0 && (pgrp = pg_find(-pgid, PFIND_LOCKED)))
1800 		kpgsignal(pgrp, &ksi, fdescdata, 0);
1801 	mutex_exit(proc_lock);
1802 }
1803 
/*
 * Attach the given flags/ops/data to fp and arrange for the in-progress
 * open to complete via the EMOVEFD protocol (see fd_dupopen()).
 */
int
fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
	 void *data)
{

	fp->f_flag = flag;
	fp->f_type = DTYPE_MISC;
	fp->f_ops = fops;
	fp->f_data = data;
	curlwp->l_dupfd = fd;	/* consumed by the EMOVEFD handling */
	fd_affix(curproc, fp, fd);

	return EMOVEFD;
}
1818 
1819 int
1820 fnullop_fcntl(file_t *fp, u_int cmd, void *data)
1821 {
1822 
1823 	if (cmd == F_SETFL)
1824 		return 0;
1825 
1826 	return EOPNOTSUPP;
1827 }
1828 
/*
 * Default poll routine: reports no events ready.
 */
int
fnullop_poll(file_t *fp, int which)
{

	return 0;
}
1835 
/*
 * Default kqfilter routine: returns success without further action.
 */
int
fnullop_kqfilter(file_t *fp, struct knote *kn)
{

	return 0;
}
1842 
/*
 * Stub read routine for file types that do not support reading.
 */
int
fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
	    kauth_cred_t cred, int flags)
{

	return EOPNOTSUPP;
}
1850 
/*
 * Stub write routine for file types that do not support writing.
 */
int
fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
	     kauth_cred_t cred, int flags)
{

	return EOPNOTSUPP;
}
1858 
/*
 * Stub ioctl routine for file types that do not support ioctls.
 */
int
fbadop_ioctl(file_t *fp, u_long com, void *data)
{

	return EOPNOTSUPP;
}
1865 
/*
 * Stub stat routine for file types that do not support fstat().
 */
int
fbadop_stat(file_t *fp, struct stat *sb)
{

	return EOPNOTSUPP;
}
1872 
/*
 * Stub close routine for file types that do not support closing.
 */
int
fbadop_close(file_t *fp)
{

	return EOPNOTSUPP;
}
1879