xref: /netbsd-src/sys/kern/kern_descrip.c (revision 500db002748d9818288e46e10f026a2b09548086)
1 /*	$NetBSD: kern_descrip.c,v 1.188 2009/03/11 06:05:29 mrg Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Copyright (c) 1982, 1986, 1989, 1991, 1993
31  *	The Regents of the University of California.  All rights reserved.
32  * (c) UNIX System Laboratories, Inc.
33  * All or some portions of this file are derived from material licensed
34  * to the University of California by American Telephone and Telegraph
35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36  * the permission of UNIX System Laboratories, Inc.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *	@(#)kern_descrip.c	8.8 (Berkeley) 2/14/95
63  */
64 
65 /*
66  * File descriptor management.
67  */
68 
69 #include <sys/cdefs.h>
70 __KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.188 2009/03/11 06:05:29 mrg Exp $");
71 
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/filedesc.h>
75 #include <sys/kernel.h>
76 #include <sys/proc.h>
77 #include <sys/file.h>
78 #include <sys/socket.h>
79 #include <sys/socketvar.h>
80 #include <sys/stat.h>
81 #include <sys/ioctl.h>
82 #include <sys/fcntl.h>
83 #include <sys/pool.h>
84 #include <sys/unistd.h>
85 #include <sys/resourcevar.h>
86 #include <sys/conf.h>
87 #include <sys/event.h>
88 #include <sys/kauth.h>
89 #include <sys/atomic.h>
90 #include <sys/syscallargs.h>
91 #include <sys/cpu.h>
92 #include <sys/kmem.h>
93 #include <sys/vnode.h>
94 
/*
 * Constructor/destructor hooks handed to pool_cache_init() in
 * fd_sys_init(); filedesc_ctor() is also called directly by fd_init().
 */
95 static int	file_ctor(void *, void *, int);
96 static void	file_dtor(void *, void *);
97 static int	fdfile_ctor(void *, void *, int);
98 static void	fdfile_dtor(void *, void *);
99 static int	filedesc_ctor(void *, void *, int);
100 static void	filedesc_dtor(void *, void *);
/* First entry of filedesc_cdevsw below. */
101 static int	filedescopen(dev_t, int, int, lwp_t *);
102 
103 kmutex_t	filelist_lock;	/* lock on filehead */
104 struct filelist	filehead;	/* head of list of open files */
105 u_int		nfiles;		/* actual number of open files */
106 
/* Pool caches, one per structure type; created in fd_sys_init(). */
107 static pool_cache_t filedesc_cache;
108 static pool_cache_t file_cache;
109 static pool_cache_t fdfile_cache;
110 
/*
 * Character device switch: only filedescopen() is a real handler, every
 * other slot is one of the no*() stubs.
 */
111 const struct cdevsw filedesc_cdevsw = {
112 	filedescopen, noclose, noread, nowrite, noioctl,
113 	nostop, notty, nopoll, nommap, nokqfilter, D_OTHER | D_MPSAFE,
114 };
115 
116 /*
 * For ease of reading: fd_putvnode() and fd_putsock() are aliases of
 * fd_putfile(), pairing by name with fd_getvnode() and fd_getsock().
 */
117 __strong_alias(fd_putvnode,fd_putfile)
118 __strong_alias(fd_putsock,fd_putfile)
119 
120 /*
121  * Initialize the descriptor system.
122  */
123 void
124 fd_sys_init(void)
125 {
126 
127 	mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);
128 
129 	file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
130 	    0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
131 	KASSERT(file_cache != NULL);
132 
133 	fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0,
134 	    PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor,
135 	    NULL);
136 	KASSERT(fdfile_cache != NULL);
137 
138 	filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
139 	    0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
140 	    NULL);
141 	KASSERT(filedesc_cache != NULL);
142 }
143 
144 static int
145 fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
146 {
147 	int i, off, maxoff;
148 	uint32_t sub;
149 
150 	KASSERT(mutex_owned(&fdp->fd_lock));
151 
152 	if (want > bits)
153 		return -1;
154 
155 	off = want >> NDENTRYSHIFT;
156 	i = want & NDENTRYMASK;
157 	if (i) {
158 		sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
159 		if (sub != ~0)
160 			goto found;
161 		off++;
162 	}
163 
164 	maxoff = NDLOSLOTS(bits);
165 	while (off < maxoff) {
166 		if ((sub = bitmap[off]) != ~0)
167 			goto found;
168 		off++;
169 	}
170 
171 	return (-1);
172 
173  found:
174 	return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
175 }
176 
177 static int
178 fd_last_set(filedesc_t *fd, int last)
179 {
180 	int off, i;
181 	fdfile_t **ofiles = fd->fd_ofiles;
182 	uint32_t *bitmap = fd->fd_lomap;
183 
184 	KASSERT(mutex_owned(&fd->fd_lock));
185 
186 	off = (last - 1) >> NDENTRYSHIFT;
187 
188 	while (off >= 0 && !bitmap[off])
189 		off--;
190 
191 	if (off < 0)
192 		return (-1);
193 
194 	i = ((off + 1) << NDENTRYSHIFT) - 1;
195 	if (i >= last)
196 		i = last - 1;
197 
198 	/* XXX should use bitmap */
199 	/* XXXAD does not work for fd_copy() */
200 	while (i > 0 && (ofiles[i] == NULL || !ofiles[i]->ff_allocated))
201 		i--;
202 
203 	return (i);
204 }
205 
206 void
207 fd_used(filedesc_t *fdp, unsigned fd)
208 {
209 	u_int off = fd >> NDENTRYSHIFT;
210 	fdfile_t *ff;
211 
212 	ff = fdp->fd_ofiles[fd];
213 
214 	KASSERT(mutex_owned(&fdp->fd_lock));
215 	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) == 0);
216 	KASSERT(ff != NULL);
217 	KASSERT(ff->ff_file == NULL);
218    	KASSERT(!ff->ff_allocated);
219 
220    	ff->ff_allocated = 1;
221 	fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK);
222 	if (fdp->fd_lomap[off] == ~0) {
223 		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
224 		    (1 << (off & NDENTRYMASK))) == 0);
225 		fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK);
226 	}
227 
228 	if ((int)fd > fdp->fd_lastfile) {
229 		fdp->fd_lastfile = fd;
230 	}
231 
232 	if (fd >= NDFDFILE) {
233 		fdp->fd_nused++;
234 	} else {
235 		KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
236 	}
237 }
238 
239 void
240 fd_unused(filedesc_t *fdp, unsigned fd)
241 {
242 	u_int off = fd >> NDENTRYSHIFT;
243 	fdfile_t *ff;
244 
245 	ff = fdp->fd_ofiles[fd];
246 
247 	/*
248 	 * Don't assert the lock is held here, as we may be copying
249 	 * the table during exec() and it is not needed there.
250 	 * procfs and sysctl are locked out by proc::p_reflock.
251 	 *
252 	 * KASSERT(mutex_owned(&fdp->fd_lock));
253 	 */
254 	KASSERT(ff != NULL);
255 	KASSERT(ff->ff_file == NULL);
256    	KASSERT(ff->ff_allocated);
257 
258 	if (fd < fdp->fd_freefile) {
259 		fdp->fd_freefile = fd;
260 	}
261 
262 	if (fdp->fd_lomap[off] == ~0) {
263 		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
264 		    (1 << (off & NDENTRYMASK))) != 0);
265 		fdp->fd_himap[off >> NDENTRYSHIFT] &=
266 		    ~(1 << (off & NDENTRYMASK));
267 	}
268 	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
269 	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
270 	ff->ff_allocated = 0;
271 
272 	KASSERT(fd <= fdp->fd_lastfile);
273 	if (fd == fdp->fd_lastfile) {
274 		fdp->fd_lastfile = fd_last_set(fdp, fd);
275 	}
276 
277 	if (fd >= NDFDFILE) {
278 		KASSERT(fdp->fd_nused > 0);
279 		fdp->fd_nused--;
280 	} else {
281 		KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
282 	}
283 }
284 
285 /*
286  * Custom version of fd_unused() for fd_copy(), where the descriptor
287  * table is not yet fully initialized.
288  */
289 static inline void
290 fd_zap(filedesc_t *fdp, unsigned fd)
291 {
292 	u_int off = fd >> NDENTRYSHIFT;
293 
294 	if (fd < fdp->fd_freefile) {
295 		fdp->fd_freefile = fd;
296 	}
297 
298 	if (fdp->fd_lomap[off] == ~0) {
299 		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
300 		    (1 << (off & NDENTRYMASK))) != 0);
301 		fdp->fd_himap[off >> NDENTRYSHIFT] &=
302 		    ~(1 << (off & NDENTRYMASK));
303 	}
304 	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
305 	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
306 }
307 
308 bool
309 fd_isused(filedesc_t *fdp, unsigned fd)
310 {
311 	u_int off = fd >> NDENTRYSHIFT;
312 
313 	KASSERT(fd < fdp->fd_nfiles);
314 
315 	return (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0;
316 }
317 
318 /*
319  * Look up the file structure corresponding to a file descriptor
320  * and return the file, holding a reference on the descriptor.
 *
 * The lookup is done in the current LWP's table (curlwp->l_fd) and
 * does not take fd_lock.
 *
 * Returns NULL when fd is not below fd_nfiles, when the slot has no
 * fdfile_t, or when the slot's ff_file is NULL (in that last case the
 * reference taken here is put back with fd_putfile()).  Otherwise
 * returns the file with ff_refcnt incremented; the caller releases
 * that reference with fd_putfile().
321  */
322 inline file_t *
323 fd_getfile(unsigned fd)
324 {
325 	filedesc_t *fdp;
326 	fdfile_t *ff;
327 	file_t *fp;
328 
329 	fdp = curlwp->l_fd;
330 
331 	/*
332 	 * Look up the fdfile structure representing this descriptor.
333 	 * Ensure that we see fd_nfiles before fd_ofiles since we
334 	 * are doing this unlocked.  See fd_tryexpand().
335 	 */
336 	if (__predict_false(fd >= fdp->fd_nfiles)) {
337 		return NULL;
338 	}
339 	membar_consumer();
340 	ff = fdp->fd_ofiles[fd];
341 	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
342 	if (__predict_false(ff == NULL)) {
343 		return NULL;
344 	}
345 
346 	/*
347 	 * Now get a reference to the descriptor.   Issue a memory
348 	 * barrier to ensure that we acquire the file pointer _after_
349 	 * adding a reference.  If no memory barrier, we could fetch
350 	 * a stale pointer.
351 	 */
352 	atomic_inc_uint(&ff->ff_refcnt);
353 #ifndef __HAVE_ATOMIC_AS_MEMBAR
354 	membar_enter();
355 #endif
356 
357 	/*
358 	 * If the file is not open or is being closed then put the
359 	 * reference back.
360 	 */
361 	fp = ff->ff_file;
362 	if (__predict_true(fp != NULL)) {
363 		return fp;
364 	}
365 	fd_putfile(fd);
366 	return NULL;
367 }
368 
369 /*
370  * Release a reference to a file descriptor acquired with fd_getfile().
 *
 * Operates on the current LWP's table (curlwp->l_fd).  The caller must
 * hold a reference on the slot (asserted via ff_refcnt).
 *
 * Normally the reference count is simply decremented with a
 * compare-and-swap.  If the FR_CLOSING bit is observed in ff_refcnt,
 * the reference is instead handed to fd_close(), whose return value
 * is ignored.
371  */
372 void
373 fd_putfile(unsigned fd)
374 {
375 	filedesc_t *fdp;
376 	fdfile_t *ff;
377 	u_int u, v;
378 
379 	fdp = curlwp->l_fd;
380 	ff = fdp->fd_ofiles[fd];
381 
382 	KASSERT(fd < fdp->fd_nfiles);
383 	KASSERT(ff != NULL);
384 	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
385 	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
386 
387 	/*
388 	 * Ensure that any use of the file is complete and globally
389 	 * visible before dropping the final reference.  If no membar,
390 	 * the current CPU could still access memory associated with
391 	 * the file after it has been freed or recycled by another
392 	 * CPU.
393 	 */
394 #ifndef __HAVE_ATOMIC_AS_MEMBAR
395 	membar_exit();
396 #endif
397 
398 	/*
399 	 * Be optimistic and start out with the assumption that no other
400 	 * threads are trying to close the descriptor.  If the CAS fails,
401 	 * we lost a race and/or it's being closed.
 	 *
 	 * The initial expected value is masked with FR_MASK, and later
 	 * expected values are only used when they lack FR_CLOSING, so
 	 * (presumably FR_MASK excludes FR_CLOSING - confirm against the
 	 * header) the CAS cannot succeed while a close is pending.
402 	 */
403 	for (u = ff->ff_refcnt & FR_MASK;; u = v) {
404 		v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
405 		if (__predict_true(u == v)) {
406 			return;
407 		}
408 		if (__predict_false((v & FR_CLOSING) != 0)) {
409 			break;
410 		}
411 	}
412 
413 	/* Another thread is waiting to close the file: join it. */
414 	(void)fd_close(fd);
415 }
416 
417 /*
418  * Convenience wrapper around fd_getfile() that returns reference
419  * to a vnode.
420  */
421 int
422 fd_getvnode(unsigned fd, file_t **fpp)
423 {
424 	vnode_t *vp;
425 	file_t *fp;
426 
427 	fp = fd_getfile(fd);
428 	if (__predict_false(fp == NULL)) {
429 		return EBADF;
430 	}
431 	if (__predict_false(fp->f_type != DTYPE_VNODE)) {
432 		fd_putfile(fd);
433 		return EINVAL;
434 	}
435 	vp = fp->f_data;
436 	if (__predict_false(vp->v_type == VBAD)) {
437 		/* XXX Is this case really necessary? */
438 		fd_putfile(fd);
439 		return EBADF;
440 	}
441 	*fpp = fp;
442 	return 0;
443 }
444 
445 /*
446  * Convenience wrapper around fd_getfile() that returns reference
447  * to a socket.
448  */
449 int
450 fd_getsock(unsigned fd, struct socket **sop)
451 {
452 	file_t *fp;
453 
454 	fp = fd_getfile(fd);
455 	if (__predict_false(fp == NULL)) {
456 		return EBADF;
457 	}
458 	if (__predict_false(fp->f_type != DTYPE_SOCKET)) {
459 		fd_putfile(fd);
460 		return ENOTSOCK;
461 	}
462 	*sop = fp->f_data;
463 	return 0;
464 }
465 
466 /*
467  * Look up the file structure corresponding to a file descriptor
468  * and return it with a reference held on the file, not the
469  * descriptor.
470  *
471  * This is heavyweight and only used when accessing descriptors
472  * from a foreign process.  The caller must ensure that `p' does
473  * not exit or fork across this call.
474  *
475  * To release the file (not descriptor) reference, use closef().
476  */
477 file_t *
478 fd_getfile2(proc_t *p, unsigned fd)
479 {
480 	filedesc_t *fdp;
481 	fdfile_t *ff;
482 	file_t *fp;
483 
484 	fdp = p->p_fd;
485 	mutex_enter(&fdp->fd_lock);
486 	if (fd > fdp->fd_nfiles) {
487 		mutex_exit(&fdp->fd_lock);
488 		return NULL;
489 	}
490 	if ((ff = fdp->fd_ofiles[fd]) == NULL) {
491 		mutex_exit(&fdp->fd_lock);
492 		return NULL;
493 	}
494 	mutex_enter(&ff->ff_lock);
495 	if ((fp = ff->ff_file) == NULL) {
496 		mutex_exit(&ff->ff_lock);
497 		mutex_exit(&fdp->fd_lock);
498 		return NULL;
499 	}
500 	mutex_enter(&fp->f_lock);
501 	fp->f_count++;
502 	mutex_exit(&fp->f_lock);
503 	mutex_exit(&ff->ff_lock);
504 	mutex_exit(&fdp->fd_lock);
505 
506 	return fp;
507 }
508 
509 /*
510  * Internal form of close.  Must be called with a reference to the
511  * descriptor, and will drop the reference.  When all descriptor
512  * references are dropped, releases the descriptor slot and a single
513  * reference to the file structure.
 *
 * Operates on the current LWP's table (curlwp->l_fd).
 *
 * Returns EBADF if the slot's ff_file is already NULL (another
 * closer got there first); otherwise returns the result of closef()
 * on the file that was attached to the slot.
514  */
515 int
516 fd_close(unsigned fd)
517 {
518 	struct flock lf;
519 	filedesc_t *fdp;
520 	fdfile_t *ff;
521 	file_t *fp;
522 	proc_t *p;
523 	lwp_t *l;
524 
525 	l = curlwp;
526 	p = l->l_proc;
527 	fdp = l->l_fd;
528 	ff = fdp->fd_ofiles[fd];
529 
530 	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
531 
532 	mutex_enter(&ff->ff_lock);
533 	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
534 	if (ff->ff_file == NULL) {
535 		/*
536 		 * Another user of the file is already closing, and is
537 		 * waiting for other users of the file to drain.  Release
538 		 * our reference, and wake up the closer.
539 		 */
540 		atomic_dec_uint(&ff->ff_refcnt);
541 		cv_broadcast(&ff->ff_closing);
542 		mutex_exit(&ff->ff_lock);
543 
544 		/*
545 		 * An application error, so pretend that the descriptor
546 		 * was already closed.  We can't safely wait for it to
547 		 * be closed without potentially deadlocking.
548 		 */
549 		return (EBADF);
550 	}
551 	KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
552 
553 	/*
554 	 * There may be multiple users of this file within the process.
555 	 * Notify existing and new users that the file is closing.  This
556 	 * will prevent them from adding additional uses to this file
557 	 * while we are closing it.
558 	 */
559 	fp = ff->ff_file;
560 	ff->ff_file = NULL;
561 	ff->ff_exclose = false;
562 
563 	/*
564 	 * We expect the caller to hold a descriptor reference - drop it.
565 	 * The reference count may increase beyond zero at this point due
566 	 * to an erroneous descriptor reference by an application, but
567 	 * fd_getfile() will notice that the file is being closed and drop
568 	 * the reference again.
569 	 */
570 #ifndef __HAVE_ATOMIC_AS_MEMBAR
571 	membar_producer();
572 #endif
573 	if (__predict_false(atomic_dec_uint_nv(&ff->ff_refcnt) != 0)) {
574 		/*
575 		 * Wait for other references to drain.  This is typically
576 		 * an application error - the descriptor is being closed
577 		 * while still in use.
578 		 *
 		 * Setting FR_CLOSING makes fd_putfile() callers divert
 		 * into fd_close(), which wakes us via ff_closing.
579 		 */
580 		atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
581 		/*
582 		 * Remove any knotes attached to the file.  A knote
583 		 * attached to the descriptor can hold references on it.
 		 * ff_lock is dropped around the knote_fdclose() call.
584 		 */
585 		if (!SLIST_EMPTY(&ff->ff_knlist)) {
586 			mutex_exit(&ff->ff_lock);
587 			knote_fdclose(fd);
588 			mutex_enter(&ff->ff_lock);
589 		}
590 		/*
591 		 * We need to see the count drop to zero at least once,
592 		 * in order to ensure that all pre-existing references
593 		 * have been drained.  New references past this point are
594 		 * of no interest.
595 		 */
596 		while ((ff->ff_refcnt & FR_MASK) != 0) {
597 			cv_wait(&ff->ff_closing, &ff->ff_lock);
598 		}
599 		atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
600 	} else {
601 		/* If no references, there must be no knotes. */
602 		KASSERT(SLIST_EMPTY(&ff->ff_knlist));
603 	}
604 	mutex_exit(&ff->ff_lock);
605 
606 	/*
607 	 * POSIX record locking dictates that any close releases ALL
608 	 * locks owned by this process.  This is handled by setting
609 	 * a flag in the unlock to free ONLY locks obeying POSIX
610 	 * semantics, and not to free BSD-style file locks.
611 	 * If the descriptor was in a message, POSIX-style locks
612 	 * aren't passed with the descriptor.
613 	 */
614 	if ((p->p_flag & PK_ADVLOCK) != 0 && fp->f_type == DTYPE_VNODE) {
615 		lf.l_whence = SEEK_SET;
616 		lf.l_start = 0;
617 		lf.l_len = 0;
618 		lf.l_type = F_UNLCK;
619 		(void)VOP_ADVLOCK(fp->f_data, p, F_UNLCK, &lf, F_POSIX);
620 	}
621 
622 
623 	/* Free descriptor slot. */
624 	mutex_enter(&fdp->fd_lock);
625 	fd_unused(fdp, fd);
626 	mutex_exit(&fdp->fd_lock);
627 
628 	/* Now drop reference to the file itself. */
629 	return closef(fp);
630 }
631 
632 /*
633  * Duplicate a file descriptor.
634  */
635 int
636 fd_dup(file_t *fp, int minfd, int *newp, bool exclose)
637 {
638 	proc_t *p;
639 	int error;
640 
641 	p = curproc;
642 
643 	while ((error = fd_alloc(p, minfd, newp)) != 0) {
644 		if (error != ENOSPC) {
645 			return error;
646 		}
647 		fd_tryexpand(p);
648 	}
649 
650 	curlwp->l_fd->fd_ofiles[*newp]->ff_exclose = exclose;
651 	fd_affix(p, fp, *newp);
652 	return 0;
653 }
654 
655 /*
656  * dup2 operation.
 *
 * Attaches `fp' to descriptor number `new' in the current LWP's
 * table, first closing whatever is currently open there.  Always
 * returns 0.
657  */
658 int
659 fd_dup2(file_t *fp, unsigned new)
660 {
661 	filedesc_t *fdp;
662 	fdfile_t *ff;
663 
664 	fdp = curlwp->l_fd;
665 
666 	/*
667 	 * Ensure there are enough slots in the descriptor table,
668 	 * and allocate an fdfile_t up front in case we need it.
669 	 */
670 	while (new >= fdp->fd_nfiles) {
671 		fd_tryexpand(curproc);
672 	}
673 	ff = pool_cache_get(fdfile_cache, PR_WAITOK);
674 
675 	/*
676 	 * If there is already a file open, close it.  If the file is
677 	 * half open, wait for it to be constructed before closing it.
678 	 * XXX Potential for deadlock here?
 	 *
 	 * fd_lock is dropped inside the loop around the calls to
 	 * fd_getfile(), fd_close() and kpause(), and is held again
 	 * each time the loop condition is evaluated.
679 	 */
680 	mutex_enter(&fdp->fd_lock);
681 	while (fd_isused(fdp, new)) {
682 		mutex_exit(&fdp->fd_lock);
683 		if (fd_getfile(new) != NULL) {
 			/* fd_close() consumes the reference just taken. */
684 			(void)fd_close(new);
685 		} else {
686 			/* XXX Crummy, but unlikely to happen. */
687 			kpause("dup2", false, 1, NULL);
688 		}
689 		mutex_enter(&fdp->fd_lock);
690 	}
 	/* Install the preallocated fdfile_t only if the slot has none. */
691 	if (fdp->fd_ofiles[new] == NULL) {
692 		KASSERT(new >= NDFDFILE);
693 		fdp->fd_ofiles[new] = ff;
694 		ff = NULL;
695 	}
696 	fd_used(fdp, new);
697 	mutex_exit(&fdp->fd_lock);
698 
699 	/* Slot is now allocated.  Insert copy of the file. */
700 	fd_affix(curproc, fp, new);
 	/* The preallocated fdfile_t was not needed: give it back. */
701 	if (ff != NULL) {
702 		pool_cache_put(fdfile_cache, ff);
703 	}
704 	return 0;
705 }
706 
707 /*
708  * Drop reference to a file structure.
709  */
710 int
711 closef(file_t *fp)
712 {
713 	struct flock lf;
714 	int error;
715 
716 	/*
717 	 * Drop reference.  If referenced elsewhere it's still open
718 	 * and we have nothing more to do.
719 	 */
720 	mutex_enter(&fp->f_lock);
721 	KASSERT(fp->f_count > 0);
722 	if (--fp->f_count > 0) {
723 		mutex_exit(&fp->f_lock);
724 		return 0;
725 	}
726 	KASSERT(fp->f_count == 0);
727 	mutex_exit(&fp->f_lock);
728 
729 	/* We held the last reference - release locks, close and free. */
730         if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
731         	lf.l_whence = SEEK_SET;
732 		lf.l_start = 0;
733 		lf.l_len = 0;
734 		lf.l_type = F_UNLCK;
735 		(void)VOP_ADVLOCK(fp->f_data, fp, F_UNLCK, &lf, F_FLOCK);
736 	}
737 	if (fp->f_ops != NULL) {
738 		error = (*fp->f_ops->fo_close)(fp);
739 	} else {
740 		error = 0;
741 	}
742 	ffree(fp);
743 
744 	return error;
745 }
746 
747 /*
748  * Allocate a file descriptor for the process.
 *
 * Finds the lowest free descriptor at or above both `want' and
 * fd_freefile, marks it used and stores its number in *result.
 * `p' must be curproc or proc0.
 *
 * Returns 0 on success.  When no slot is available in the current
 * table, returns EMFILE if the table has already reached the limit
 * (the lower of the RLIMIT_NOFILE soft limit and maxfiles), or
 * ENOSPC if the caller may grow the table with fd_tryexpand() and
 * retry.
749  */
750 int
751 fd_alloc(proc_t *p, int want, int *result)
752 {
753 	filedesc_t *fdp;
754 	int i, lim, last, error;
755 	u_int off, new;
756 	fdfile_t *ff;
757 
758 	KASSERT(p == curproc || p == &proc0);
759 
760 	fdp = p->p_fd;
 	/* Preallocate a slot structure before taking fd_lock. */
761 	ff = pool_cache_get(fdfile_cache, PR_WAITOK);
762 	KASSERT(ff->ff_refcnt == 0);
763 	KASSERT(ff->ff_file == NULL);
764 
765 	/*
766 	 * Search for a free descriptor starting at the higher
767 	 * of want or fd_freefile.
768 	 */
769 	mutex_enter(&fdp->fd_lock);
770 	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
771 	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
772 	last = min(fdp->fd_nfiles, lim);
773 	for (;;) {
774 		if ((i = want) < fdp->fd_freefile)
775 			i = fdp->fd_freefile;
776 		off = i >> NDENTRYSHIFT;
 		/* High map: find a low-map word that is not full. */
777 		new = fd_next_zero(fdp, fdp->fd_himap, off,
778 		    (last + NDENTRIES - 1) >> NDENTRYSHIFT);
 		/* `new' is unsigned; -1 converts to the all-ones value. */
779 		if (new == -1)
780 			break;
 		/* Low map: find a free bit within that word. */
781 		i = fd_next_zero(fdp, &fdp->fd_lomap[new],
782 		    new > off ? 0 : i & NDENTRYMASK, NDENTRIES);
783 		if (i == -1) {
784 			/*
785 			 * Free file descriptor in this block was
786 			 * below want, try again with higher want.
787 			 */
788 			want = (new + 1) << NDENTRYSHIFT;
789 			continue;
790 		}
791 		i += (new << NDENTRYSHIFT);
792 		if (i >= last) {
793 			break;
794 		}
 		/* Use the preallocated fdfile_t only if the slot has none. */
795 		if (fdp->fd_ofiles[i] == NULL) {
796 			KASSERT(i >= NDFDFILE);
797 			fdp->fd_ofiles[i] = ff;
798 		} else {
799 		   	pool_cache_put(fdfile_cache, ff);
800 		}
801 		KASSERT(fdp->fd_ofiles[i]->ff_file == NULL);
802 		fd_used(fdp, i);
803 		if (want <= fdp->fd_freefile) {
804 			fdp->fd_freefile = i;
805 		}
806 		*result = i;
807 		mutex_exit(&fdp->fd_lock);
808 		KASSERT(i >= NDFDFILE ||
809 		    fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
810 		return 0;
811 	}
812 
813 	/* No space in current array.  Let the caller expand and retry. */
814 	error = (fdp->fd_nfiles >= lim) ? EMFILE : ENOSPC;
815 	mutex_exit(&fdp->fd_lock);
816 	pool_cache_put(fdfile_cache, ff);
817 	return error;
818 }
819 
820 /*
821  * Allocate memory for the open files array.
822  */
823 static fdfile_t **
824 fd_ofile_alloc(int n)
825 {
826 	uintptr_t *ptr, sz;
827 
828 	KASSERT(n > NDFILE);
829 
830 	sz = (n + 2) * sizeof(uintptr_t);
831 	ptr = kmem_alloc((size_t)sz, KM_SLEEP);
832 	ptr[1] = sz;
833 
834 	return (fdfile_t **)(ptr + 2);
835 }
836 
837 /*
838  * Free an open files array.
839  */
840 static void
841 fd_ofile_free(int n, fdfile_t **of)
842 {
843 	uintptr_t *ptr, sz;
844 
845 	KASSERT(n > NDFILE);
846 
847 	sz = (n + 2) * sizeof(uintptr_t);
848 	ptr = (uintptr_t *)of - 2;
849 	KASSERT(ptr[1] == sz);
850 	kmem_free(ptr, sz);
851 }
852 
853 /*
854  * Allocate descriptor bitmap.
855  */
856 static void
857 fd_map_alloc(int n, uint32_t **lo, uint32_t **hi)
858 {
859 	uint8_t *ptr;
860 	size_t szlo, szhi;
861 
862 	KASSERT(n > NDENTRIES);
863 
864 	szlo = NDLOSLOTS(n) * sizeof(uint32_t);
865 	szhi = NDHISLOTS(n) * sizeof(uint32_t);
866 	ptr = kmem_alloc(szlo + szhi, KM_SLEEP);
867 	*lo = (uint32_t *)ptr;
868 	*hi = (uint32_t *)(ptr + szlo);
869 }
870 
871 /*
872  * Free descriptor bitmap.
873  */
874 static void
875 fd_map_free(int n, uint32_t *lo, uint32_t *hi)
876 {
877 	size_t szlo, szhi;
878 
879 	KASSERT(n > NDENTRIES);
880 
881 	szlo = NDLOSLOTS(n) * sizeof(uint32_t);
882 	szhi = NDHISLOTS(n) * sizeof(uint32_t);
883 	KASSERT(hi == (uint32_t *)((uint8_t *)lo + szlo));
884 	kmem_free(lo, szlo + szhi);
885 }
886 
887 /*
888  * Expand a process' descriptor table.
 *
 * The table grows to NDEXTENT entries if it is currently smaller than
 * that, and doubles otherwise.  `p' must be curproc or proc0.
 *
 * The new arrays are allocated before fd_lock is taken.  If fd_nfiles
 * changed in the meantime the new arrays are freed and nothing else
 * happens, so callers must re-check and call again as needed; no
 * status is returned.
889  */
890 void
891 fd_tryexpand(proc_t *p)
892 {
893 	filedesc_t *fdp;
894 	int i, numfiles, oldnfiles;
895 	fdfile_t **newofile;
896 	uint32_t *newhimap, *newlomap;
897 
898 	KASSERT(p == curproc || p == &proc0);
899 
900 	fdp = p->p_fd;
901 	newhimap = NULL;
902 	newlomap = NULL;
903 	oldnfiles = fdp->fd_nfiles;
904 
905 	if (oldnfiles < NDEXTENT)
906 		numfiles = NDEXTENT;
907 	else
908 		numfiles = 2 * oldnfiles;
909 
910 	newofile = fd_ofile_alloc(numfiles);
 	/* New bitmaps are only needed when the high map itself grows. */
911 	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
912 		fd_map_alloc(numfiles, &newlomap, &newhimap);
913 	}
914 
915 	mutex_enter(&fdp->fd_lock);
916 	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
917 	if (fdp->fd_nfiles != oldnfiles) {
918 		/* fdp changed; caller must retry */
919 		mutex_exit(&fdp->fd_lock);
920 		fd_ofile_free(numfiles, newofile);
921 		if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
922 			fd_map_free(numfiles, newlomap, newhimap);
923 		}
924 		return;
925 	}
926 
927 	/* Copy the existing ofile array and zero the new portion. */
928 	i = sizeof(fdfile_t *) * fdp->fd_nfiles;
929 	memcpy(newofile, fdp->fd_ofiles, i);
930 	memset((uint8_t *)newofile + i, 0, numfiles * sizeof(fdfile_t *) - i);
931 
932 	/*
933 	 * Link old ofiles array into list to be discarded.  We defer
934 	 * freeing until process exit if the descriptor table is visible
935 	 * to other threads.
 	 *
 	 * Arrays of NDFILE entries or fewer are not handled here (fd_init()
 	 * points fd_ofiles at storage inside the filedesc_t itself).  The
 	 * list link is kept in the first of the two header words that
 	 * fd_ofile_alloc() places in front of the array.
936 	 */
937 	if (oldnfiles > NDFILE) {
938 		if ((fdp->fd_refcnt | p->p_nlwps) > 1) {
939 			fdp->fd_ofiles[-2] = (void *)fdp->fd_discard;
940 			fdp->fd_discard = fdp->fd_ofiles - 2;
941 		} else {
942 			fd_ofile_free(oldnfiles, fdp->fd_ofiles);
943 		}
944 	}
945 
946 	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
 		/* Copy both maps and zero their newly added tails. */
947 		i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
948 		memcpy(newhimap, fdp->fd_himap, i);
949 		memset((uint8_t *)newhimap + i, 0,
950 		    NDHISLOTS(numfiles) * sizeof(uint32_t) - i);
951 
952 		i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
953 		memcpy(newlomap, fdp->fd_lomap, i);
954 		memset((uint8_t *)newlomap + i, 0,
955 		    NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);
956 
 		/* Old maps are freed only if they came from fd_map_alloc(). */
957 		if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
958 			fd_map_free(oldnfiles, fdp->fd_lomap, fdp->fd_himap);
959 		}
960 		fdp->fd_himap = newhimap;
961 		fdp->fd_lomap = newlomap;
962 	}
963 
964 	/*
965 	 * All other modifications must become globally visible before
966 	 * the change to fd_nfiles.  See fd_getfile().
967 	 */
968 	fdp->fd_ofiles = newofile;
969 	membar_producer();
970 	fdp->fd_nfiles = numfiles;
971 	mutex_exit(&fdp->fd_lock);
972 
973 	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
974 }
975 
976 /*
977  * Create a new open file structure and allocate a file descriptor
978  * for the current process.
979  */
980 int
981 fd_allocfile(file_t **resultfp, int *resultfd)
982 {
983 	file_t *fp;
984 	proc_t *p;
985 	int error;
986 
987 	p = curproc;
988 
989 	while ((error = fd_alloc(p, 0, resultfd)) != 0) {
990 		if (error != ENOSPC) {
991 			return error;
992 		}
993 		fd_tryexpand(p);
994 	}
995 
996 	fp = pool_cache_get(file_cache, PR_WAITOK);
997 	KASSERT(fp->f_count == 0);
998 	KASSERT(fp->f_msgcount == 0);
999 	KASSERT(fp->f_unpcount == 0);
1000 	fp->f_cred = kauth_cred_get();
1001 	kauth_cred_hold(fp->f_cred);
1002 
1003 	if (__predict_false(atomic_inc_uint_nv(&nfiles) >= maxfiles)) {
1004 		fd_abort(p, fp, *resultfd);
1005 		tablefull("file", "increase kern.maxfiles or MAXFILES");
1006 		return ENFILE;
1007 	}
1008 
1009 	/*
1010 	 * Don't allow recycled files to be scanned.
1011 	 */
1012 	if ((fp->f_flag & FSCAN) != 0) {
1013 		mutex_enter(&fp->f_lock);
1014 		atomic_and_uint(&fp->f_flag, ~FSCAN);
1015 		mutex_exit(&fp->f_lock);
1016 	}
1017 
1018 	fp->f_advice = 0;
1019 	fp->f_msgcount = 0;
1020 	fp->f_offset = 0;
1021 	*resultfp = fp;
1022 
1023 	return 0;
1024 }
1025 
1026 /*
1027  * Successful creation of a new descriptor: make visible to the process.
1028  */
1029 void
1030 fd_affix(proc_t *p, file_t *fp, unsigned fd)
1031 {
1032 	fdfile_t *ff;
1033 	filedesc_t *fdp;
1034 
1035 	KASSERT(p == curproc || p == &proc0);
1036 
1037 	/* Add a reference to the file structure. */
1038 	mutex_enter(&fp->f_lock);
1039 	fp->f_count++;
1040 	mutex_exit(&fp->f_lock);
1041 
1042 	/*
1043 	 * Insert the new file into the descriptor slot.
1044 	 *
1045 	 * The memory barriers provided by lock activity in this routine
1046 	 * ensure that any updates to the file structure become globally
1047 	 * visible before the file becomes visible to other LWPs in the
1048 	 * current process.
1049 	 */
1050 	fdp = p->p_fd;
1051 	ff = fdp->fd_ofiles[fd];
1052 
1053 	KASSERT(ff != NULL);
1054 	KASSERT(ff->ff_file == NULL);
1055 	KASSERT(ff->ff_allocated);
1056 	KASSERT(fd_isused(fdp, fd));
1057 	KASSERT(fd >= NDFDFILE ||
1058 	    fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);
1059 
1060 	/* No need to lock in order to make file initially visible. */
1061 	ff->ff_file = fp;
1062 }
1063 
1064 /*
1065  * Abort creation of a new descriptor: free descriptor slot and file.
1066  */
1067 void
1068 fd_abort(proc_t *p, file_t *fp, unsigned fd)
1069 {
1070 	filedesc_t *fdp;
1071 	fdfile_t *ff;
1072 
1073 	KASSERT(p == curproc || p == &proc0);
1074 
1075 	fdp = p->p_fd;
1076 	ff = fdp->fd_ofiles[fd];
1077 
1078 	KASSERT(fd >= NDFDFILE ||
1079 	    fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);
1080 
1081 	mutex_enter(&fdp->fd_lock);
1082 	KASSERT(fd_isused(fdp, fd));
1083 	fd_unused(fdp, fd);
1084 	mutex_exit(&fdp->fd_lock);
1085 
1086 	if (fp != NULL) {
1087 		ffree(fp);
1088 	}
1089 }
1090 
1091 /*
1092  * Free a file descriptor.
1093  */
1094 void
1095 ffree(file_t *fp)
1096 {
1097 
1098 	KASSERT(fp->f_count == 0);
1099 
1100 	atomic_dec_uint(&nfiles);
1101 	kauth_cred_free(fp->f_cred);
1102 	pool_cache_put(file_cache, fp);
1103 }
1104 
1105 static int
1106 file_ctor(void *arg, void *obj, int flags)
1107 {
1108 	file_t *fp = obj;
1109 
1110 	memset(fp, 0, sizeof(*fp));
1111 	mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1112 
1113 	mutex_enter(&filelist_lock);
1114 	LIST_INSERT_HEAD(&filehead, fp, f_list);
1115 	mutex_exit(&filelist_lock);
1116 
1117 	return 0;
1118 }
1119 
1120 static void
1121 file_dtor(void *arg, void *obj)
1122 {
1123 	file_t *fp = obj;
1124 
1125 	mutex_enter(&filelist_lock);
1126 	LIST_REMOVE(fp, f_list);
1127 	mutex_exit(&filelist_lock);
1128 
1129 	mutex_destroy(&fp->f_lock);
1130 }
1131 
1132 static int
1133 fdfile_ctor(void *arg, void *obj, int flags)
1134 {
1135 	fdfile_t *ff = obj;
1136 
1137 	memset(ff, 0, sizeof(*ff));
1138 	mutex_init(&ff->ff_lock, MUTEX_DEFAULT, IPL_NONE);
1139 	cv_init(&ff->ff_closing, "fdclose");
1140 
1141 	return 0;
1142 }
1143 
1144 static void
1145 fdfile_dtor(void *arg, void *obj)
1146 {
1147 	fdfile_t *ff = obj;
1148 
1149 	mutex_destroy(&ff->ff_lock);
1150 	cv_destroy(&ff->ff_closing);
1151 }
1152 
1153 file_t *
1154 fgetdummy(void)
1155 {
1156 	file_t *fp;
1157 
1158 	fp = kmem_alloc(sizeof(*fp), KM_SLEEP);
1159 	if (fp != NULL) {
1160 		memset(fp, 0, sizeof(*fp));
1161 		mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1162 	}
1163 	return fp;
1164 }
1165 
1166 void
1167 fputdummy(file_t *fp)
1168 {
1169 
1170 	mutex_destroy(&fp->f_lock);
1171 	kmem_free(fp, sizeof(*fp));
1172 }
1173 
1174 /*
1175  * Create an initial filedesc structure.
1176  */
1177 filedesc_t *
1178 fd_init(filedesc_t *fdp)
1179 {
1180 	unsigned fd;
1181 
1182 	if (fdp == NULL) {
1183 		fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1184 	} else {
1185 		filedesc_ctor(NULL, fdp, PR_WAITOK);
1186 	}
1187 
1188 	fdp->fd_refcnt = 1;
1189 	fdp->fd_ofiles = fdp->fd_dfiles;
1190 	fdp->fd_nfiles = NDFILE;
1191 	fdp->fd_himap = fdp->fd_dhimap;
1192 	fdp->fd_lomap = fdp->fd_dlomap;
1193 	KASSERT(fdp->fd_lastfile == -1);
1194 	KASSERT(fdp->fd_lastkqfile == -1);
1195 	KASSERT(fdp->fd_knhash == NULL);
1196 
1197 	memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
1198 	    offsetof(filedesc_t, fd_startzero));
1199 	for (fd = 0; fd < NDFDFILE; fd++) {
1200 		fdp->fd_ofiles[fd] = (fdfile_t *)fdp->fd_dfdfile[fd];
1201 	}
1202 
1203 	return fdp;
1204 }
1205 
1206 /*
1207  * Initialize a file descriptor table.
1208  */
1209 static int
1210 filedesc_ctor(void *arg, void *obj, int flag)
1211 {
1212 	filedesc_t *fdp = obj;
1213 	int i;
1214 
1215 	memset(fdp, 0, sizeof(*fdp));
1216 	mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
1217 	fdp->fd_lastfile = -1;
1218 	fdp->fd_lastkqfile = -1;
1219 
1220 	CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
1221 	for (i = 0; i < NDFDFILE; i++) {
1222 		fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK);
1223 	}
1224 
1225 	return 0;
1226 }
1227 
1228 static void
1229 filedesc_dtor(void *arg, void *obj)
1230 {
1231 	filedesc_t *fdp = obj;
1232 	int i;
1233 
1234 	for (i = 0; i < NDFDFILE; i++) {
1235 		fdfile_dtor(NULL, fdp->fd_dfdfile[i]);
1236 	}
1237 
1238 	mutex_destroy(&fdp->fd_lock);
1239 }
1240 
1241 /*
1242  * Make p2 share p1's filedesc structure.
1243  */
1244 void
1245 fd_share(struct proc *p2)
1246 {
1247 	filedesc_t *fdp;
1248 
1249 	fdp = curlwp->l_fd;
1250 	p2->p_fd = fdp;
1251 	atomic_inc_uint(&fdp->fd_refcnt);
1252 }
1253 
1254 /*
1255  * Copy a filedesc structure.
1256  */
1257 filedesc_t *
1258 fd_copy(void)
1259 {
1260 	filedesc_t *newfdp, *fdp;
1261 	fdfile_t *ff, *fflist, **ffp, **nffp, *ff2;
1262 	int i, nused, numfiles, lastfile, j, newlast;
1263 	file_t *fp;
1264 
1265 	fdp = curproc->p_fd;
1266 	newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1267 	newfdp->fd_refcnt = 1;
1268 
1269 	KASSERT(newfdp->fd_knhash == NULL);
1270 	KASSERT(newfdp->fd_knhashmask == 0);
1271 	KASSERT(newfdp->fd_discard == NULL);
1272 
1273 	for (;;) {
1274 		numfiles = fdp->fd_nfiles;
1275 		lastfile = fdp->fd_lastfile;
1276 
1277 		/*
1278 		 * If the number of open files fits in the internal arrays
1279 		 * of the open file structure, use them, otherwise allocate
1280 		 * additional memory for the number of descriptors currently
1281 		 * in use.
1282 		 */
1283 		if (lastfile < NDFILE) {
1284 			i = NDFILE;
1285 			newfdp->fd_ofiles = newfdp->fd_dfiles;
1286 		} else {
1287 			/*
1288 			 * Compute the smallest multiple of NDEXTENT needed
1289 			 * for the file descriptors currently in use,
1290 			 * allowing the table to shrink.
1291 			 */
1292 			i = numfiles;
1293 			while (i >= 2 * NDEXTENT && i > lastfile * 2) {
1294 				i /= 2;
1295 			}
1296 			newfdp->fd_ofiles = fd_ofile_alloc(i);
1297 			KASSERT(i > NDFILE);
1298 		}
1299 		if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
1300 			newfdp->fd_himap = newfdp->fd_dhimap;
1301 			newfdp->fd_lomap = newfdp->fd_dlomap;
1302 		} else {
1303 			fd_map_alloc(i, &newfdp->fd_lomap,
1304 			    &newfdp->fd_himap);
1305 		}
1306 
1307 		/*
1308 		 * Allocate and string together fdfile structures.
1309 		 * We abuse fdfile_t::ff_file here, but it will be
1310 		 * cleared before this routine returns.
1311 		 */
1312 		nused = fdp->fd_nused;
1313 		fflist = NULL;
1314 		for (j = nused; j != 0; j--) {
1315 			ff = pool_cache_get(fdfile_cache, PR_WAITOK);
1316 			ff->ff_file = (void *)fflist;
1317 			fflist = ff;
1318 		}
1319 
1320 		mutex_enter(&fdp->fd_lock);
1321 		if (numfiles == fdp->fd_nfiles && nused == fdp->fd_nused &&
1322 		    lastfile == fdp->fd_lastfile) {
1323 			break;
1324 		}
1325 		mutex_exit(&fdp->fd_lock);
1326 		if (i > NDFILE) {
1327 			fd_ofile_free(i, newfdp->fd_ofiles);
1328 		}
1329 		if (NDHISLOTS(i) > NDHISLOTS(NDFILE)) {
1330 			fd_map_free(i, newfdp->fd_lomap, newfdp->fd_himap);
1331 		}
1332 		while (fflist != NULL) {
1333 			ff = fflist;
1334 			fflist = (void *)ff->ff_file;
1335 			ff->ff_file = NULL;
1336 			pool_cache_put(fdfile_cache, ff);
1337 		}
1338 	}
1339 
1340 	newfdp->fd_nfiles = i;
1341 	newfdp->fd_freefile = fdp->fd_freefile;
1342 	newfdp->fd_exclose = fdp->fd_exclose;
1343 
1344 	/*
1345 	 * Clear the entries that will not be copied over.
1346 	 * Avoid calling memset with 0 size.
1347 	 */
1348 	if (lastfile < (i-1)) {
1349 		memset(newfdp->fd_ofiles + lastfile + 1, 0,
1350 		    (i - lastfile - 1) * sizeof(file_t **));
1351 	}
1352 	if (i < NDENTRIES * NDENTRIES) {
1353 		i = NDENTRIES * NDENTRIES; /* size of inlined bitmaps */
1354 	}
1355 	memcpy(newfdp->fd_himap, fdp->fd_himap, NDHISLOTS(i)*sizeof(uint32_t));
1356 	memcpy(newfdp->fd_lomap, fdp->fd_lomap, NDLOSLOTS(i)*sizeof(uint32_t));
1357 
1358 	ffp = fdp->fd_ofiles;
1359 	nffp = newfdp->fd_ofiles;
1360 	j = imax(lastfile, (NDFDFILE - 1));
1361 	newlast = -1;
1362 	KASSERT(j < fdp->fd_nfiles);
1363 	for (i = 0; i <= j; i++, ffp++, *nffp++ = ff2) {
1364 		ff = *ffp;
1365 		/* Install built-in fdfiles even if unused here. */
1366 		if (i < NDFDFILE) {
1367 			ff2 = (fdfile_t *)newfdp->fd_dfdfile[i];
1368 		} else {
1369 			ff2 = NULL;
1370 		}
1371 		/* Determine if descriptor is active in parent. */
1372 		if (ff == NULL || !fd_isused(fdp, i)) {
1373 			KASSERT(ff != NULL || i >= NDFDFILE);
1374 			continue;
1375 		}
1376 		mutex_enter(&ff->ff_lock);
1377 		fp = ff->ff_file;
1378 		if (fp == NULL) {
1379 			/* Descriptor is half-open: free slot. */
1380 			fd_zap(newfdp, i);
1381 			mutex_exit(&ff->ff_lock);
1382 			continue;
1383 		}
1384 		if (fp->f_type == DTYPE_KQUEUE) {
1385 			/* kqueue descriptors cannot be copied. */
1386 			fd_zap(newfdp, i);
1387 			mutex_exit(&ff->ff_lock);
1388 			continue;
1389 		}
1390 		/* It's active: add a reference to the file. */
1391 		mutex_enter(&fp->f_lock);
1392 		fp->f_count++;
1393 		mutex_exit(&fp->f_lock);
1394 		/* Consume one fdfile_t to represent it. */
1395 		if (i >= NDFDFILE) {
1396 			ff2 = fflist;
1397 			fflist = (void *)ff2->ff_file;
1398 		}
1399 		ff2->ff_file = fp;
1400 		ff2->ff_exclose = ff->ff_exclose;
1401 		ff2->ff_allocated = true;
1402 		mutex_exit(&ff->ff_lock);
1403 		if (i > newlast) {
1404 			newlast = i;
1405 		}
1406 	}
1407 	mutex_exit(&fdp->fd_lock);
1408 
1409 	/* Discard unused fdfile_t structures. */
1410 	while (__predict_false(fflist != NULL)) {
1411 		ff = fflist;
1412 		fflist = (void *)ff->ff_file;
1413 		ff->ff_file = NULL;
1414 		pool_cache_put(fdfile_cache, ff);
1415 		nused--;
1416 	}
1417 	KASSERT(nused >= 0);
1418 	KASSERT(newfdp->fd_ofiles[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
1419 
1420 	newfdp->fd_nused = nused;
1421 	newfdp->fd_lastfile = newlast;
1422 
1423 	return (newfdp);
1424 }
1425 
1426 /*
1427  * Release a filedesc structure.
1428  */
1429 void
1430 fd_free(void)
1431 {
1432 	filedesc_t *fdp;
1433 	fdfile_t *ff;
1434 	file_t *fp;
1435 	int fd, lastfd;
1436 	void **discard;
1437 
1438 	fdp = curlwp->l_fd;
1439 
1440 	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
1441 
1442 	if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
1443 		return;
1444 
1445 	/*
1446 	 * Close any files that the process holds open.
1447 	 */
1448 	for (fd = 0, lastfd = fdp->fd_nfiles - 1; fd <= lastfd; fd++) {
1449 		ff = fdp->fd_ofiles[fd];
1450 		KASSERT(fd >= NDFDFILE ||
1451 		    ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1452 		if ((ff = fdp->fd_ofiles[fd]) == NULL)
1453 			continue;
1454 		if ((fp = ff->ff_file) != NULL) {
1455 			/*
1456 			 * Must use fd_close() here as kqueue holds
1457 			 * long term references to descriptors.
1458 			 */
1459 			ff->ff_refcnt++;
1460 			fd_close(fd);
1461 		}
1462 		KASSERT(ff->ff_refcnt == 0);
1463 		KASSERT(ff->ff_file == NULL);
1464 		KASSERT(!ff->ff_exclose);
1465 		KASSERT(!ff->ff_allocated);
1466 		if (fd >= NDFDFILE) {
1467 			pool_cache_put(fdfile_cache, ff);
1468 		}
1469 	}
1470 
1471 	/*
1472 	 * Clean out the descriptor table for the next user and return
1473 	 * to the cache.
1474 	 */
1475 	while ((discard = fdp->fd_discard) != NULL) {
1476 		fdp->fd_discard = discard[0];
1477 		kmem_free(discard, (uintptr_t)discard[1]);
1478 	}
1479 	if (NDHISLOTS(fdp->fd_nfiles) > NDHISLOTS(NDFILE)) {
1480 		KASSERT(fdp->fd_himap != fdp->fd_dhimap);
1481 		KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
1482 		fd_map_free(fdp->fd_nfiles, fdp->fd_lomap, fdp->fd_himap);
1483 	}
1484 	if (fdp->fd_nfiles > NDFILE) {
1485 		KASSERT(fdp->fd_ofiles != fdp->fd_dfiles);
1486 		fd_ofile_free(fdp->fd_nfiles, fdp->fd_ofiles);
1487 	}
1488 	if (fdp->fd_knhash != NULL) {
1489 		hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask);
1490 		fdp->fd_knhash = NULL;
1491 		fdp->fd_knhashmask = 0;
1492 	} else {
1493 		KASSERT(fdp->fd_knhashmask == 0);
1494 	}
1495 	fdp->fd_lastkqfile = -1;
1496 	pool_cache_put(filedesc_cache, fdp);
1497 }
1498 
1499 /*
1500  * File Descriptor pseudo-device driver (/dev/fd/).
1501  *
1502  * Opening minor device N dup()s the file (if any) connected to file
1503  * descriptor N belonging to the calling process.  Note that this driver
1504  * consists of only the ``open()'' routine, because all subsequent
1505  * references to this file will be direct to the other driver.
1506  */
1507 static int
1508 filedescopen(dev_t dev, int mode, int type, lwp_t *l)
1509 {
1510 
1511 	/*
1512 	 * XXX Kludge: set dupfd to contain the value of the
1513 	 * the file descriptor being sought for duplication. The error
1514 	 * return ensures that the vnode for this device will be released
1515 	 * by vn_open. Open will detect this special error and take the
1516 	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
1517 	 * will simply report the error.
1518 	 */
1519 	l->l_dupfd = minor(dev);	/* XXX */
1520 	return EDUPFD;
1521 }
1522 
1523 /*
1524  * Duplicate the specified descriptor to a free descriptor.
1525  */
1526 int
1527 fd_dupopen(int old, int *new, int mode, int error)
1528 {
1529 	filedesc_t *fdp;
1530 	fdfile_t *ff;
1531 	file_t *fp;
1532 
1533 	if ((fp = fd_getfile(old)) == NULL) {
1534 		return EBADF;
1535 	}
1536 	fdp = curlwp->l_fd;
1537 	ff = fdp->fd_ofiles[old];
1538 
1539 	/*
1540 	 * There are two cases of interest here.
1541 	 *
1542 	 * For EDUPFD simply dup (dfd) to file descriptor
1543 	 * (indx) and return.
1544 	 *
1545 	 * For EMOVEFD steal away the file structure from (dfd) and
1546 	 * store it in (indx).  (dfd) is effectively closed by
1547 	 * this operation.
1548 	 *
1549 	 * Any other error code is just returned.
1550 	 */
1551 	switch (error) {
1552 	case EDUPFD:
1553 		/*
1554 		 * Check that the mode the file is being opened for is a
1555 		 * subset of the mode of the existing descriptor.
1556 		 */
1557 		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
1558 			error = EACCES;
1559 			break;
1560 		}
1561 
1562 		/* Copy it. */
1563 		error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1564 		break;
1565 
1566 	case EMOVEFD:
1567 		/* Copy it. */
1568 		error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1569 		if (error != 0) {
1570 			break;
1571 		}
1572 
1573 		/* Steal away the file pointer from 'old'. */
1574 		(void)fd_close(old);
1575 		return 0;
1576 	}
1577 
1578 	fd_putfile(old);
1579 	return error;
1580 }
1581 
1582 /*
1583  * Sets descriptor owner. If the owner is a process, 'pgid'
1584  * is set to positive value, process ID. If the owner is process group,
1585  * 'pgid' is set to -pg_id.
1586  */
1587 int
1588 fsetown(pid_t *pgid, u_long cmd, const void *data)
1589 {
1590 	int id = *(const int *)data;
1591 	int error;
1592 
1593 	switch (cmd) {
1594 	case TIOCSPGRP:
1595 		if (id < 0)
1596 			return (EINVAL);
1597 		id = -id;
1598 		break;
1599 	default:
1600 		break;
1601 	}
1602 
1603 	if (id > 0 && !pfind(id))
1604 		return (ESRCH);
1605 	else if (id < 0 && (error = pgid_in_session(curproc, -id)))
1606 		return (error);
1607 
1608 	*pgid = id;
1609 	return (0);
1610 }
1611 
1612 /*
1613  * Return descriptor owner information. If the value is positive,
1614  * it's process ID. If it's negative, it's process group ID and
1615  * needs the sign removed before use.
1616  */
1617 int
1618 fgetown(pid_t pgid, u_long cmd, void *data)
1619 {
1620 
1621 	switch (cmd) {
1622 	case TIOCGPGRP:
1623 		*(int *)data = -pgid;
1624 		break;
1625 	default:
1626 		*(int *)data = pgid;
1627 		break;
1628 	}
1629 	return (0);
1630 }
1631 
1632 /*
1633  * Send signal to descriptor owner, either process or process group.
1634  */
1635 void
1636 fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
1637 {
1638 	struct proc *p1;
1639 	struct pgrp *pgrp;
1640 	ksiginfo_t ksi;
1641 
1642 	KASSERT(!cpu_intr_p());
1643 
1644 	KSI_INIT(&ksi);
1645 	ksi.ksi_signo = signo;
1646 	ksi.ksi_code = code;
1647 	ksi.ksi_band = band;
1648 
1649 	mutex_enter(proc_lock);
1650 	if (pgid > 0 && (p1 = p_find(pgid, PFIND_LOCKED)))
1651 		kpsignal(p1, &ksi, fdescdata);
1652 	else if (pgid < 0 && (pgrp = pg_find(-pgid, PFIND_LOCKED)))
1653 		kpgsignal(pgrp, &ksi, fdescdata, 0);
1654 	mutex_exit(proc_lock);
1655 }
1656 
1657 int
1658 fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
1659 	 void *data)
1660 {
1661 
1662 	fp->f_flag = flag;
1663 	fp->f_type = DTYPE_MISC;
1664 	fp->f_ops = fops;
1665 	fp->f_data = data;
1666 	curlwp->l_dupfd = fd;
1667 	fd_affix(curproc, fp, fd);
1668 
1669 	return EMOVEFD;
1670 }
1671 
1672 int
1673 fnullop_fcntl(file_t *fp, u_int cmd, void *data)
1674 {
1675 
1676 	if (cmd == F_SETFL)
1677 		return 0;
1678 
1679 	return EOPNOTSUPP;
1680 }
1681 
1682 int
1683 fnullop_poll(file_t *fp, int which)
1684 {
1685 
1686 	return 0;
1687 }
1688 
1689 int
1690 fnullop_kqfilter(file_t *fp, struct knote *kn)
1691 {
1692 
1693 	return 0;
1694 }
1695 
1696 int
1697 fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
1698 	    kauth_cred_t cred, int flags)
1699 {
1700 
1701 	return EOPNOTSUPP;
1702 }
1703 
1704 int
1705 fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
1706 	     kauth_cred_t cred, int flags)
1707 {
1708 
1709 	return EOPNOTSUPP;
1710 }
1711 
1712 int
1713 fbadop_ioctl(file_t *fp, u_long com, void *data)
1714 {
1715 
1716 	return EOPNOTSUPP;
1717 }
1718 
1719 int
1720 fbadop_stat(file_t *fp, struct stat *sb)
1721 {
1722 
1723 	return EOPNOTSUPP;
1724 }
1725 
1726 int
1727 fbadop_close(file_t *fp)
1728 {
1729 
1730 	return EOPNOTSUPP;
1731 }
1732