xref: /openbsd-src/sys/kern/kern_descrip.c (revision 4c1e55dc91edd6e69ccc60ce855900fbc12cf34f)
1 /*	$OpenBSD: kern_descrip.c,v 1.98 2012/07/11 23:07:19 guenther Exp $	*/
2 /*	$NetBSD: kern_descrip.c,v 1.42 1996/03/30 22:24:38 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1989, 1991, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  * (c) UNIX System Laboratories, Inc.
8  * All or some portions of this file are derived from material licensed
9  * to the University of California by American Telephone and Telegraph
10  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
11  * the permission of UNIX System Laboratories, Inc.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
38  */
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/filedesc.h>
43 #include <sys/kernel.h>
44 #include <sys/vnode.h>
45 #include <sys/proc.h>
46 #include <sys/file.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/stat.h>
50 #include <sys/ioctl.h>
51 #include <sys/fcntl.h>
52 #include <sys/malloc.h>
53 #include <sys/syslog.h>
54 #include <sys/ucred.h>
55 #include <sys/unistd.h>
56 #include <sys/resourcevar.h>
57 #include <sys/conf.h>
58 #include <sys/mount.h>
59 #include <sys/syscallargs.h>
60 #include <sys/event.h>
61 #include <sys/pool.h>
62 #include <sys/ktrace.h>
63 
64 #include <uvm/uvm_extern.h>
65 
66 #include <sys/pipe.h>
67 
68 /*
69  * Descriptor management.
70  */
71 struct filelist filehead;	/* head of list of open files */
72 int nfiles;			/* actual number of open files */
73 
74 static __inline void fd_used(struct filedesc *, int);
75 static __inline void fd_unused(struct filedesc *, int);
76 static __inline int find_next_zero(u_int *, int, u_int);
77 int finishdup(struct proc *, struct file *, int, int, register_t *, int);
78 int find_last_set(struct filedesc *, int);
79 
80 struct pool file_pool;
81 struct pool fdesc_pool;
82 
83 void
84 filedesc_init(void)
85 {
86 	pool_init(&file_pool, sizeof(struct file), 0, 0, 0, "filepl",
87 		&pool_allocator_nointr);
88 	pool_init(&fdesc_pool, sizeof(struct filedesc0), 0, 0, 0, "fdescpl",
89 		&pool_allocator_nointr);
90 	LIST_INIT(&filehead);
91 }
92 
93 static __inline int
94 find_next_zero (u_int *bitmap, int want, u_int bits)
95 {
96 	int i, off, maxoff;
97 	u_int sub;
98 
99 	if (want > bits)
100 		return -1;
101 
102 	off = want >> NDENTRYSHIFT;
103 	i = want & NDENTRYMASK;
104 	if (i) {
105 		sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
106 		if (sub != ~0)
107 			goto found;
108 		off++;
109 	}
110 
111 	maxoff = NDLOSLOTS(bits);
112 	while (off < maxoff) {
113 		if ((sub = bitmap[off]) != ~0)
114 			goto found;
115 		off++;
116 	}
117 
118 	return -1;
119 
120  found:
121 	return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
122 }
123 
124 int
125 find_last_set(struct filedesc *fd, int last)
126 {
127 	int off, i;
128 	struct file **ofiles = fd->fd_ofiles;
129 	u_int *bitmap = fd->fd_lomap;
130 
131 	off = (last - 1) >> NDENTRYSHIFT;
132 
133 	while (off >= 0 && !bitmap[off])
134 		off--;
135 	if (off < 0)
136 		return 0;
137 
138 	i = ((off + 1) << NDENTRYSHIFT) - 1;
139 	if (i >= last)
140 		i = last - 1;
141 
142 	while (i > 0 && ofiles[i] == NULL)
143 		i--;
144 	return i;
145 }
146 
147 static __inline void
148 fd_used(struct filedesc *fdp, int fd)
149 {
150 	u_int off = fd >> NDENTRYSHIFT;
151 
152 	fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK);
153 	if (fdp->fd_lomap[off] == ~0)
154 		fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK);
155 
156 	if (fd > fdp->fd_lastfile)
157 		fdp->fd_lastfile = fd;
158 	fdp->fd_openfd++;
159 }
160 
161 static __inline void
162 fd_unused(struct filedesc *fdp, int fd)
163 {
164 	u_int off = fd >> NDENTRYSHIFT;
165 
166 	if (fd < fdp->fd_freefile)
167 		fdp->fd_freefile = fd;
168 
169 	if (fdp->fd_lomap[off] == ~0)
170 		fdp->fd_himap[off >> NDENTRYSHIFT] &= ~(1 << (off & NDENTRYMASK));
171 	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
172 
173 #ifdef DIAGNOSTIC
174 	if (fd > fdp->fd_lastfile)
175 		panic("fd_unused: fd_lastfile inconsistent");
176 #endif
177 	if (fd == fdp->fd_lastfile)
178 		fdp->fd_lastfile = find_last_set(fdp, fd);
179 	fdp->fd_openfd--;
180 }
181 
182 struct file *
183 fd_getfile(struct filedesc *fdp, int fd)
184 {
185 	struct file *fp;
186 
187 	if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL)
188 		return (NULL);
189 
190 	if (!FILE_IS_USABLE(fp))
191 		return (NULL);
192 
193 	return (fp);
194 }
195 
196 /*
197  * System calls on descriptors.
198  */
199 
200 /*
201  * Duplicate a file descriptor.
202  */
203 /* ARGSUSED */
204 int
205 sys_dup(struct proc *p, void *v, register_t *retval)
206 {
207 	struct sys_dup_args /* {
208 		syscallarg(int) fd;
209 	} */ *uap = v;
210 	struct filedesc *fdp = p->p_fd;
211 	int old = SCARG(uap, fd);
212 	struct file *fp;
213 	int new;
214 	int error;
215 
216 restart:
217 	if ((fp = fd_getfile(fdp, old)) == NULL)
218 		return (EBADF);
219 	FREF(fp);
220 	fdplock(fdp);
221 	if ((error = fdalloc(p, 0, &new)) != 0) {
222 		FRELE(fp, p);
223 		if (error == ENOSPC) {
224 			fdexpand(p);
225 			fdpunlock(fdp);
226 			goto restart;
227 		}
228 		goto out;
229 	}
230 	error = finishdup(p, fp, old, new, retval, 0);
231 
232 out:
233 	fdpunlock(fdp);
234 	return (error);
235 }
236 
237 /*
238  * Duplicate a file descriptor to a particular value.
239  */
240 /* ARGSUSED */
241 int
242 sys_dup2(struct proc *p, void *v, register_t *retval)
243 {
244 	struct sys_dup2_args /* {
245 		syscallarg(int) from;
246 		syscallarg(int) to;
247 	} */ *uap = v;
248 	int old = SCARG(uap, from), new = SCARG(uap, to);
249 	struct filedesc *fdp = p->p_fd;
250 	struct file *fp;
251 	int i, error;
252 
253 restart:
254 	if ((fp = fd_getfile(fdp, old)) == NULL)
255 		return (EBADF);
256 	if ((u_int)new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
257 	    (u_int)new >= maxfiles)
258 		return (EBADF);
259 	if (old == new) {
260 		/*
261 		 * NOTE! This doesn't clear the close-on-exec flag. This might
262 		 * or might not be the intended behavior from the start, but
263 		 * this is what everyone else does.
264 		 */
265 		*retval = new;
266 		return (0);
267 	}
268 	FREF(fp);
269 	fdplock(fdp);
270 	if (new >= fdp->fd_nfiles) {
271 		if ((error = fdalloc(p, new, &i)) != 0) {
272 			FRELE(fp, p);
273 			if (error == ENOSPC) {
274 				fdexpand(p);
275 				fdpunlock(fdp);
276 				goto restart;
277 			}
278 			goto out;
279 		}
280 		if (new != i)
281 			panic("dup2: fdalloc");
282 		fd_unused(fdp, new);
283 	}
284 	/* finishdup() does FRELE */
285 	error = finishdup(p, fp, old, new, retval, 1);
286 
287 out:
288 	fdpunlock(fdp);
289 	return (error);
290 }
291 
292 /*
293  * The file control system call.
294  */
295 /* ARGSUSED */
296 int
297 sys_fcntl(struct proc *p, void *v, register_t *retval)
298 {
299 	struct sys_fcntl_args /* {
300 		syscallarg(int) fd;
301 		syscallarg(int) cmd;
302 		syscallarg(void *) arg;
303 	} */ *uap = v;
304 	int fd = SCARG(uap, fd);
305 	struct filedesc *fdp = p->p_fd;
306 	struct file *fp;
307 	struct vnode *vp;
308 	int i, tmp, newmin, flg = F_POSIX;
309 	struct flock fl;
310 	int error = 0;
311 
312 restart:
313 	if ((fp = fd_getfile(fdp, fd)) == NULL)
314 		return (EBADF);
315 	FREF(fp);
316 	switch (SCARG(uap, cmd)) {
317 
318 	case F_DUPFD:
319 	case F_DUPFD_CLOEXEC:
320 		newmin = (long)SCARG(uap, arg);
321 		if ((u_int)newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
322 		    (u_int)newmin >= maxfiles) {
323 			error = EINVAL;
324 			break;
325 		}
326 		fdplock(fdp);
327 		if ((error = fdalloc(p, newmin, &i)) != 0) {
328 			FRELE(fp, p);
329 			if (error == ENOSPC) {
330 				fdexpand(p);
331 				fdpunlock(fdp);
332 				goto restart;
333 			}
334 		} else {
335 			/* finishdup will FRELE for us. */
336 			error = finishdup(p, fp, fd, i, retval, 0);
337 
338 			if (!error && SCARG(uap, cmd) == F_DUPFD_CLOEXEC)
339 				fdp->fd_ofileflags[i] |= UF_EXCLOSE;
340 		}
341 
342 		fdpunlock(fdp);
343 		return (error);
344 
345 	case F_GETFD:
346 		*retval = fdp->fd_ofileflags[fd] & UF_EXCLOSE ? 1 : 0;
347 		break;
348 
349 	case F_SETFD:
350 		fdplock(fdp);
351 		if ((long)SCARG(uap, arg) & 1)
352 			fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
353 		else
354 			fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
355 		fdpunlock(fdp);
356 		break;
357 
358 	case F_GETFL:
359 		*retval = OFLAGS(fp->f_flag);
360 		break;
361 
362 	case F_SETFL:
363 		fp->f_flag &= ~FCNTLFLAGS;
364 		fp->f_flag |= FFLAGS((long)SCARG(uap, arg)) & FCNTLFLAGS;
365 		tmp = fp->f_flag & FNONBLOCK;
366 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
367 		if (error)
368 			break;
369 		tmp = fp->f_flag & FASYNC;
370 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
371 		if (!error)
372 			break;
373 		fp->f_flag &= ~FNONBLOCK;
374 		tmp = 0;
375 		(void) (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
376 		break;
377 
378 	case F_GETOWN:
379 		if (fp->f_type == DTYPE_SOCKET) {
380 			*retval = ((struct socket *)fp->f_data)->so_pgid;
381 			break;
382 		}
383 		error = (*fp->f_ops->fo_ioctl)
384 			(fp, TIOCGPGRP, (caddr_t)&tmp, p);
385 		*retval = -tmp;
386 		break;
387 
388 	case F_SETOWN:
389 		if (fp->f_type == DTYPE_SOCKET) {
390 			struct socket *so = (struct socket *)fp->f_data;
391 
392 			so->so_pgid = (long)SCARG(uap, arg);
393 			so->so_siguid = p->p_cred->p_ruid;
394 			so->so_sigeuid = p->p_ucred->cr_uid;
395 			break;
396 		}
397 		if ((long)SCARG(uap, arg) <= 0) {
398 			SCARG(uap, arg) = (void *)(-(long)SCARG(uap, arg));
399 		} else {
400 			struct process *pr1 = prfind((long)SCARG(uap, arg));
401 			if (pr1 == 0) {
402 				error = ESRCH;
403 				break;
404 			}
405 			SCARG(uap, arg) = (void *)(long)pr1->ps_pgrp->pg_id;
406 		}
407 		error = ((*fp->f_ops->fo_ioctl)
408 			(fp, TIOCSPGRP, (caddr_t)&SCARG(uap, arg), p));
409 		break;
410 
411 	case F_SETLKW:
412 		flg |= F_WAIT;
413 		/* FALLTHROUGH */
414 
415 	case F_SETLK:
416 		if (fp->f_type != DTYPE_VNODE) {
417 			error = EBADF;
418 			break;
419 		}
420 		vp = (struct vnode *)fp->f_data;
421 		/* Copy in the lock structure */
422 		error = copyin((caddr_t)SCARG(uap, arg), (caddr_t)&fl,
423 		    sizeof (fl));
424 		if (error)
425 			break;
426 		if (fl.l_whence == SEEK_CUR) {
427 			if (fl.l_start == 0 && fl.l_len < 0) {
428 				/* lockf(3) compliance hack */
429 				fl.l_len = -fl.l_len;
430 				fl.l_start = fp->f_offset - fl.l_len;
431 			} else
432 				fl.l_start += fp->f_offset;
433 		}
434 		switch (fl.l_type) {
435 
436 		case F_RDLCK:
437 			if ((fp->f_flag & FREAD) == 0) {
438 				error = EBADF;
439 				goto out;
440 			}
441 			atomic_setbits_int(&fdp->fd_flags, FD_ADVLOCK);
442 			error = VOP_ADVLOCK(vp, fdp, F_SETLK, &fl, flg);
443 			break;
444 
445 		case F_WRLCK:
446 			if ((fp->f_flag & FWRITE) == 0) {
447 				error = EBADF;
448 				goto out;
449 			}
450 			atomic_setbits_int(&fdp->fd_flags, FD_ADVLOCK);
451 			error = VOP_ADVLOCK(vp, fdp, F_SETLK, &fl, flg);
452 			break;
453 
454 		case F_UNLCK:
455 			error = VOP_ADVLOCK(vp, fdp, F_UNLCK, &fl, F_POSIX);
456 			goto out;
457 
458 		default:
459 			error = EINVAL;
460 			goto out;
461 		}
462 
463 		if (fp != fd_getfile(fdp, fd)) {
464 			/*
465 			 * We have lost the race with close() or dup2();
466 			 * unlock, pretend that we've won the race and that
467 			 * lock had been removed by close()
468 			 */
469 			fl.l_whence = SEEK_SET;
470 			fl.l_start = 0;
471 			fl.l_len = 0;
472 			VOP_ADVLOCK(vp, fdp, F_UNLCK, &fl, F_POSIX);
473 			fl.l_type = F_UNLCK;
474 		}
475 		goto out;
476 
477 
478 	case F_GETLK:
479 		if (fp->f_type != DTYPE_VNODE) {
480 			error = EBADF;
481 			break;
482 		}
483 		vp = (struct vnode *)fp->f_data;
484 		/* Copy in the lock structure */
485 		error = copyin((caddr_t)SCARG(uap, arg), (caddr_t)&fl,
486 		    sizeof (fl));
487 		if (error)
488 			break;
489 		if (fl.l_whence == SEEK_CUR) {
490 			if (fl.l_start == 0 && fl.l_len < 0) {
491 				/* lockf(3) compliance hack */
492 				fl.l_len = -fl.l_len;
493 				fl.l_start = fp->f_offset - fl.l_len;
494 			} else
495 				fl.l_start += fp->f_offset;
496 		}
497 		if (fl.l_type != F_RDLCK &&
498 		    fl.l_type != F_WRLCK &&
499 		    fl.l_type != F_UNLCK &&
500 		    fl.l_type != 0) {
501 			error = EINVAL;
502 			break;
503 		}
504 		error = VOP_ADVLOCK(vp, fdp, F_GETLK, &fl, F_POSIX);
505 		if (error)
506 			break;
507 		error = (copyout((caddr_t)&fl, (caddr_t)SCARG(uap, arg),
508 		    sizeof (fl)));
509 		break;
510 
511 	default:
512 		error = EINVAL;
513 		break;
514 	}
515 out:
516 	FRELE(fp, p);
517 	return (error);
518 }
519 
520 /*
521  * Common code for dup, dup2, and fcntl(F_DUPFD).
522  */
523 int
524 finishdup(struct proc *p, struct file *fp, int old, int new,
525     register_t *retval, int dup2)
526 {
527 	struct file *oldfp;
528 	struct filedesc *fdp = p->p_fd;
529 
530 	fdpassertlocked(fdp);
531 	if (fp->f_count == LONG_MAX-2) {
532 		FRELE(fp, p);
533 		return (EDEADLK);
534 	}
535 
536 	/*
537 	 * Don't fd_getfile here. We want to closef LARVAL files and
538 	 * closef can deal with that.
539 	 */
540 	oldfp = fdp->fd_ofiles[new];
541 	if (oldfp != NULL)
542 		FREF(oldfp);
543 
544 	fdp->fd_ofiles[new] = fp;
545 	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE;
546 	fp->f_count++;
547 	FRELE(fp, p);
548 	if (dup2 && oldfp == NULL)
549 		fd_used(fdp, new);
550 	*retval = new;
551 
552 	if (oldfp != NULL) {
553 		if (new < fdp->fd_knlistsize)
554 			knote_fdclose(p, new);
555 		closef(oldfp, p);
556 	}
557 
558 	return (0);
559 }
560 
561 void
562 fdremove(struct filedesc *fdp, int fd)
563 {
564 	fdpassertlocked(fdp);
565 	fdp->fd_ofiles[fd] = NULL;
566 	fd_unused(fdp, fd);
567 }
568 
569 int
570 fdrelease(struct proc *p, int fd)
571 {
572 	struct filedesc *fdp = p->p_fd;
573 	struct file **fpp, *fp;
574 
575 	fdpassertlocked(fdp);
576 
577 	/*
578 	 * Don't fd_getfile here. We want to closef LARVAL files and closef
579 	 * can deal with that.
580 	 */
581 	fpp = &fdp->fd_ofiles[fd];
582 	fp = *fpp;
583 	if (fp == NULL)
584 		return (EBADF);
585 	FREF(fp);
586 	*fpp = NULL;
587 	fd_unused(fdp, fd);
588 	if (fd < fdp->fd_knlistsize)
589 		knote_fdclose(p, fd);
590 	return (closef(fp, p));
591 }
592 
593 /*
594  * Close a file descriptor.
595  */
596 /* ARGSUSED */
597 int
598 sys_close(struct proc *p, void *v, register_t *retval)
599 {
600 	struct sys_close_args /* {
601 		syscallarg(int) fd;
602 	} */ *uap = v;
603 	int fd = SCARG(uap, fd), error;
604 	struct filedesc *fdp = p->p_fd;
605 
606 	if (fd_getfile(fdp, fd) == NULL)
607 		return (EBADF);
608 	fdplock(fdp);
609 	error = fdrelease(p, fd);
610 	fdpunlock(fdp);
611 
612 	return (error);
613 }
614 
615 /*
616  * Return status information about a file descriptor.
617  */
618 /* ARGSUSED */
619 int
620 sys_fstat(struct proc *p, void *v, register_t *retval)
621 {
622 	struct sys_fstat_args /* {
623 		syscallarg(int) fd;
624 		syscallarg(struct stat *) sb;
625 	} */ *uap = v;
626 	int fd = SCARG(uap, fd);
627 	struct filedesc *fdp = p->p_fd;
628 	struct file *fp;
629 	struct stat ub;
630 	int error;
631 
632 	if ((fp = fd_getfile(fdp, fd)) == NULL)
633 		return (EBADF);
634 	FREF(fp);
635 	error = (*fp->f_ops->fo_stat)(fp, &ub, p);
636 	FRELE(fp, p);
637 	if (error == 0) {
638 		/*
639 		 * Don't let non-root see generation numbers
640 		 * (for NFS security)
641 		 */
642 		if (suser(p, 0))
643 			ub.st_gen = 0;
644 		error = copyout((caddr_t)&ub, (caddr_t)SCARG(uap, sb),
645 		    sizeof (ub));
646 	}
647 #ifdef KTRACE
648 	if (error == 0 && KTRPOINT(p, KTR_STRUCT))
649 		ktrstat(p, &ub);
650 #endif
651 	return (error);
652 }
653 
654 /*
655  * Return pathconf information about a file descriptor.
656  */
657 /* ARGSUSED */
658 int
659 sys_fpathconf(struct proc *p, void *v, register_t *retval)
660 {
661 	struct sys_fpathconf_args /* {
662 		syscallarg(int) fd;
663 		syscallarg(int) name;
664 	} */ *uap = v;
665 	int fd = SCARG(uap, fd);
666 	struct filedesc *fdp = p->p_fd;
667 	struct file *fp;
668 	struct vnode *vp;
669 	int error;
670 
671 	if ((fp = fd_getfile(fdp, fd)) == NULL)
672 		return (EBADF);
673 	FREF(fp);
674 	switch (fp->f_type) {
675 	case DTYPE_PIPE:
676 	case DTYPE_SOCKET:
677 		if (SCARG(uap, name) != _PC_PIPE_BUF) {
678 			error = EINVAL;
679 			break;
680 		}
681 		*retval = PIPE_BUF;
682 		error = 0;
683 		break;
684 
685 	case DTYPE_VNODE:
686 		vp = (struct vnode *)fp->f_data;
687 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
688 		error = VOP_PATHCONF(vp, SCARG(uap, name), retval);
689 		VOP_UNLOCK(vp, 0, p);
690 		break;
691 
692 	default:
693 		error = EOPNOTSUPP;
694 		break;
695 	}
696 	FRELE(fp, p);
697 	return (error);
698 }
699 
700 /*
701  * Allocate a file descriptor for the process.
702  */
703 int
704 fdalloc(struct proc *p, int want, int *result)
705 {
706 	struct filedesc *fdp = p->p_fd;
707 	int lim, last, i;
708 	u_int new, off;
709 
710 	/*
711 	 * Search for a free descriptor starting at the higher
712 	 * of want or fd_freefile.  If that fails, consider
713 	 * expanding the ofile array.
714 	 */
715 restart:
716 	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
717 	last = min(fdp->fd_nfiles, lim);
718 	if ((i = want) < fdp->fd_freefile)
719 		i = fdp->fd_freefile;
720 	off = i >> NDENTRYSHIFT;
721 	new = find_next_zero(fdp->fd_himap, off,
722 	    (last + NDENTRIES - 1) >> NDENTRYSHIFT);
723 	if (new != -1) {
724 		i = find_next_zero(&fdp->fd_lomap[new],
725 				   new > off ? 0 : i & NDENTRYMASK,
726 				   NDENTRIES);
727 		if (i == -1) {
728 			/*
729 			 * Free file descriptor in this block was
730 			 * below want, try again with higher want.
731 			 */
732 			want = (new + 1) << NDENTRYSHIFT;
733 			goto restart;
734 		}
735 		i += (new << NDENTRYSHIFT);
736 		if (i < last) {
737 			fd_used(fdp, i);
738 			if (want <= fdp->fd_freefile)
739 				fdp->fd_freefile = i;
740 			*result = i;
741 			fdp->fd_ofileflags[i] = 0;
742 			return (0);
743 		}
744 	}
745 	if (fdp->fd_nfiles >= lim)
746 		return (EMFILE);
747 
748 	return (ENOSPC);
749 }
750 
751 void
752 fdexpand(struct proc *p)
753 {
754 	struct filedesc *fdp = p->p_fd;
755 	int nfiles, i;
756 	struct file **newofile;
757 	char *newofileflags;
758 	u_int *newhimap, *newlomap;
759 
760 	fdpassertlocked(fdp);
761 
762 	/*
763 	 * No space in current array.
764 	 */
765 	if (fdp->fd_nfiles < NDEXTENT)
766 		nfiles = NDEXTENT;
767 	else
768 		nfiles = 2 * fdp->fd_nfiles;
769 
770 	newofile = malloc(nfiles * OFILESIZE, M_FILEDESC, M_WAITOK);
771 	newofileflags = (char *) &newofile[nfiles];
772 
773 	/*
774 	 * Copy the existing ofile and ofileflags arrays
775 	 * and zero the new portion of each array.
776 	 */
777 	bcopy(fdp->fd_ofiles, newofile,
778 		(i = sizeof(struct file *) * fdp->fd_nfiles));
779 	bzero((char *)newofile + i, nfiles * sizeof(struct file *) - i);
780 	bcopy(fdp->fd_ofileflags, newofileflags,
781 		(i = sizeof(char) * fdp->fd_nfiles));
782 	bzero(newofileflags + i, nfiles * sizeof(char) - i);
783 
784 	if (fdp->fd_nfiles > NDFILE)
785 		free(fdp->fd_ofiles, M_FILEDESC);
786 
787 	if (NDHISLOTS(nfiles) > NDHISLOTS(fdp->fd_nfiles)) {
788 		newhimap = malloc(NDHISLOTS(nfiles) * sizeof(u_int),
789 		    M_FILEDESC, M_WAITOK);
790 		newlomap = malloc(NDLOSLOTS(nfiles) * sizeof(u_int),
791 		    M_FILEDESC, M_WAITOK);
792 
793 		bcopy(fdp->fd_himap, newhimap,
794 		    (i = NDHISLOTS(fdp->fd_nfiles) * sizeof(u_int)));
795 		bzero((char *)newhimap + i,
796 		    NDHISLOTS(nfiles) * sizeof(u_int) - i);
797 
798 		bcopy(fdp->fd_lomap, newlomap,
799 		    (i = NDLOSLOTS(fdp->fd_nfiles) * sizeof(u_int)));
800 		bzero((char *)newlomap + i,
801 		    NDLOSLOTS(nfiles) * sizeof(u_int) - i);
802 
803 		if (NDHISLOTS(fdp->fd_nfiles) > NDHISLOTS(NDFILE)) {
804 			free(fdp->fd_himap, M_FILEDESC);
805 			free(fdp->fd_lomap, M_FILEDESC);
806 		}
807 		fdp->fd_himap = newhimap;
808 		fdp->fd_lomap = newlomap;
809 	}
810 	fdp->fd_ofiles = newofile;
811 	fdp->fd_ofileflags = newofileflags;
812 	fdp->fd_nfiles = nfiles;
813 }
814 
815 /*
816  * Create a new open file structure and allocate
817  * a file descriptor for the process that refers to it.
818  */
819 int
820 falloc(struct proc *p, struct file **resultfp, int *resultfd)
821 {
822 	struct file *fp, *fq;
823 	int error, i;
824 
825 	fdpassertlocked(p->p_fd);
826 restart:
827 	if ((error = fdalloc(p, 0, &i)) != 0) {
828 		if (error == ENOSPC) {
829 			fdexpand(p);
830 			goto restart;
831 		}
832 		return (error);
833 	}
834 	if (nfiles >= maxfiles) {
835 		fd_unused(p->p_fd, i);
836 		tablefull("file");
837 		return (ENFILE);
838 	}
839 	/*
840 	 * Allocate a new file descriptor.
841 	 * If the process has file descriptor zero open, add to the list
842 	 * of open files at that point, otherwise put it at the front of
843 	 * the list of open files.
844 	 */
845 	nfiles++;
846 	fp = pool_get(&file_pool, PR_WAITOK|PR_ZERO);
847 	fp->f_iflags = FIF_LARVAL;
848 	if ((fq = p->p_fd->fd_ofiles[0]) != NULL) {
849 		LIST_INSERT_AFTER(fq, fp, f_list);
850 	} else {
851 		LIST_INSERT_HEAD(&filehead, fp, f_list);
852 	}
853 	p->p_fd->fd_ofiles[i] = fp;
854 	fp->f_count = 1;
855 	fp->f_cred = p->p_ucred;
856 	crhold(fp->f_cred);
857 	if (resultfp)
858 		*resultfp = fp;
859 	if (resultfd)
860 		*resultfd = i;
861 	FREF(fp);
862 	return (0);
863 }
864 
865 /*
866  * Build a new filedesc structure.
867  */
868 struct filedesc *
869 fdinit(struct proc *p)
870 {
871 	struct filedesc0 *newfdp;
872 	extern int cmask;
873 
874 	newfdp = pool_get(&fdesc_pool, PR_WAITOK|PR_ZERO);
875 	if (p != NULL) {
876 		struct filedesc *fdp = p->p_fd;
877 
878 		newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
879 		vref(newfdp->fd_fd.fd_cdir);
880 		newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
881 		if (newfdp->fd_fd.fd_rdir)
882 			vref(newfdp->fd_fd.fd_rdir);
883 	}
884 	rw_init(&newfdp->fd_fd.fd_lock, "fdlock");
885 
886 	/* Create the file descriptor table. */
887 	newfdp->fd_fd.fd_refcnt = 1;
888 	newfdp->fd_fd.fd_cmask = cmask;
889 	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
890 	newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
891 	newfdp->fd_fd.fd_nfiles = NDFILE;
892 	newfdp->fd_fd.fd_himap = newfdp->fd_dhimap;
893 	newfdp->fd_fd.fd_lomap = newfdp->fd_dlomap;
894 	newfdp->fd_fd.fd_knlistsize = -1;
895 
896 	newfdp->fd_fd.fd_freefile = 0;
897 	newfdp->fd_fd.fd_lastfile = 0;
898 
899 	return (&newfdp->fd_fd);
900 }
901 
902 /*
903  * Share a filedesc structure.
904  */
905 struct filedesc *
906 fdshare(struct proc *p)
907 {
908 	p->p_fd->fd_refcnt++;
909 	return (p->p_fd);
910 }
911 
912 /*
913  * Copy a filedesc structure.
914  */
915 struct filedesc *
916 fdcopy(struct proc *p)
917 {
918 	struct filedesc *newfdp, *fdp = p->p_fd;
919 	struct file **fpp;
920 	int i;
921 
922 	fdplock(fdp);
923 	newfdp = pool_get(&fdesc_pool, PR_WAITOK);
924 	bcopy(fdp, newfdp, sizeof(struct filedesc));
925 	if (newfdp->fd_cdir)
926 		vref(newfdp->fd_cdir);
927 	if (newfdp->fd_rdir)
928 		vref(newfdp->fd_rdir);
929 	newfdp->fd_refcnt = 1;
930 	rw_init(&newfdp->fd_lock, "fdlock");
931 
932 	/*
933 	 * If the number of open files fits in the internal arrays
934 	 * of the open file structure, use them, otherwise allocate
935 	 * additional memory for the number of descriptors currently
936 	 * in use.
937 	 */
938 	if (newfdp->fd_lastfile < NDFILE) {
939 		newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles;
940 		newfdp->fd_ofileflags =
941 		    ((struct filedesc0 *) newfdp)->fd_dfileflags;
942 		i = NDFILE;
943 	} else {
944 		/*
945 		 * Compute the smallest multiple of NDEXTENT needed
946 		 * for the file descriptors currently in use,
947 		 * allowing the table to shrink.
948 		 */
949 		i = newfdp->fd_nfiles;
950 		while (i >= 2 * NDEXTENT && i > newfdp->fd_lastfile * 2)
951 			i /= 2;
952 		newfdp->fd_ofiles = malloc(i * OFILESIZE, M_FILEDESC, M_WAITOK);
953 		newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i];
954 	}
955 	if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
956 		newfdp->fd_himap =
957 			((struct filedesc0 *) newfdp)->fd_dhimap;
958 		newfdp->fd_lomap =
959 			((struct filedesc0 *) newfdp)->fd_dlomap;
960 	} else {
961 		newfdp->fd_himap = malloc(NDHISLOTS(i) * sizeof(u_int),
962 		    M_FILEDESC, M_WAITOK);
963 		newfdp->fd_lomap = malloc(NDLOSLOTS(i) * sizeof(u_int),
964 		    M_FILEDESC, M_WAITOK);
965 	}
966 	newfdp->fd_nfiles = i;
967 	bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **));
968 	bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char));
969 	bcopy(fdp->fd_himap, newfdp->fd_himap, NDHISLOTS(i) * sizeof(u_int));
970 	bcopy(fdp->fd_lomap, newfdp->fd_lomap, NDLOSLOTS(i) * sizeof(u_int));
971 	fdpunlock(fdp);
972 
973 	/*
974 	 * kq descriptors cannot be copied.
975 	 */
976 	fdplock(newfdp);
977 	if (newfdp->fd_knlistsize != -1) {
978 		fpp = newfdp->fd_ofiles;
979 		for (i = 0; i <= newfdp->fd_lastfile; i++, fpp++)
980 			if (*fpp != NULL && (*fpp)->f_type == DTYPE_KQUEUE)
981 				fdremove(newfdp, i);
982 		newfdp->fd_knlist = NULL;
983 		newfdp->fd_knlistsize = -1;
984 		newfdp->fd_knhash = NULL;
985 		newfdp->fd_knhashmask = 0;
986 	}
987 
988 	fpp = newfdp->fd_ofiles;
989 	for (i = 0; i <= newfdp->fd_lastfile; i++, fpp++)
990 		if (*fpp != NULL) {
991 			/*
992 			 * XXX Gruesome hack. If count gets too high, fail
993 			 * to copy an fd, since fdcopy()'s callers do not
994 			 * permit it to indicate failure yet.
995 			 */
996 			if ((*fpp)->f_count == LONG_MAX-2)
997 				fdremove(newfdp, i);
998 			else
999 				(*fpp)->f_count++;
1000 		}
1001 	fdpunlock(newfdp);
1002 	return (newfdp);
1003 }
1004 
1005 /*
1006  * Release a filedesc structure.
1007  */
1008 void
1009 fdfree(struct proc *p)
1010 {
1011 	struct filedesc *fdp = p->p_fd;
1012 	struct file **fpp, *fp;
1013 	int i;
1014 
1015 	if (--fdp->fd_refcnt > 0)
1016 		return;
1017 	fpp = fdp->fd_ofiles;
1018 	for (i = fdp->fd_lastfile; i >= 0; i--, fpp++) {
1019 		fp = *fpp;
1020 		if (fp != NULL) {
1021 			FREF(fp);
1022 			*fpp = NULL;
1023 			(void) closef(fp, p);
1024 		}
1025 	}
1026 	p->p_fd = NULL;
1027 	if (fdp->fd_nfiles > NDFILE)
1028 		free(fdp->fd_ofiles, M_FILEDESC);
1029 	if (NDHISLOTS(fdp->fd_nfiles) > NDHISLOTS(NDFILE)) {
1030 		free(fdp->fd_himap, M_FILEDESC);
1031 		free(fdp->fd_lomap, M_FILEDESC);
1032 	}
1033 	if (fdp->fd_cdir)
1034 		vrele(fdp->fd_cdir);
1035 	if (fdp->fd_rdir)
1036 		vrele(fdp->fd_rdir);
1037 	if (fdp->fd_knlist)
1038 		free(fdp->fd_knlist, M_TEMP);
1039 	if (fdp->fd_knhash)
1040 		free(fdp->fd_knhash, M_TEMP);
1041 	pool_put(&fdesc_pool, fdp);
1042 }
1043 
1044 /*
1045  * Internal form of close.
1046  * Decrement reference count on file structure.
1047  * Note: p may be NULL when closing a file
1048  * that was being passed in a message.
1049  *
1050  * The fp must have its usecount bumped and will be FRELEd here.
1051  */
1052 int
1053 closef(struct file *fp, struct proc *p)
1054 {
1055 	struct filedesc *fdp;
1056 
1057 	if (fp == NULL)
1058 		return (0);
1059 
1060 #ifdef DIAGNOSTIC
1061 	if (fp->f_count < 2)
1062 		panic("closef: count (%d) < 2", fp->f_count);
1063 #endif
1064 	fp->f_count--;
1065 
1066 	/*
1067 	 * POSIX record locking dictates that any close releases ALL
1068 	 * locks owned by this process.  This is handled by setting
1069 	 * a flag in the unlock to free ONLY locks obeying POSIX
1070 	 * semantics, and not to free BSD-style file locks.
1071 	 * If the descriptor was in a message, POSIX-style locks
1072 	 * aren't passed with the descriptor.
1073 	 */
1074 
1075 	if (p && ((fdp = p->p_fd) != NULL) &&
1076 	    (fdp->fd_flags & FD_ADVLOCK) &&
1077 	    fp->f_type == DTYPE_VNODE) {
1078 		struct vnode *vp = fp->f_data;
1079 		struct flock lf;
1080 
1081 		lf.l_whence = SEEK_SET;
1082 		lf.l_start = 0;
1083 		lf.l_len = 0;
1084 		lf.l_type = F_UNLCK;
1085 		(void) VOP_ADVLOCK(vp, fdp, F_UNLCK, &lf, F_POSIX);
1086 	}
1087 
1088 	return (FRELE(fp, p));
1089 }
1090 
1091 int
1092 fdrop(struct file *fp, struct proc *p)
1093 {
1094 	int error;
1095 
1096 #ifdef DIAGNOSTIC
1097 	if (fp->f_count != 0)
1098 		panic("fdrop: count (%d) != 0", fp->f_count);
1099 #endif
1100 
1101 	if (fp->f_ops)
1102 		error = (*fp->f_ops->fo_close)(fp, p);
1103 	else
1104 		error = 0;
1105 
1106 	/* Free fp */
1107 	LIST_REMOVE(fp, f_list);
1108 	crfree(fp->f_cred);
1109 	nfiles--;
1110 	pool_put(&file_pool, fp);
1111 
1112 	return (error);
1113 }
1114 
1115 /*
1116  * Apply an advisory lock on a file descriptor.
1117  *
1118  * Just attempt to get a record lock of the requested type on
1119  * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
1120  */
1121 /* ARGSUSED */
1122 int
1123 sys_flock(struct proc *p, void *v, register_t *retval)
1124 {
1125 	struct sys_flock_args /* {
1126 		syscallarg(int) fd;
1127 		syscallarg(int) how;
1128 	} */ *uap = v;
1129 	int fd = SCARG(uap, fd);
1130 	int how = SCARG(uap, how);
1131 	struct filedesc *fdp = p->p_fd;
1132 	struct file *fp;
1133 	struct vnode *vp;
1134 	struct flock lf;
1135 	int error;
1136 
1137 	if ((fp = fd_getfile(fdp, fd)) == NULL)
1138 		return (EBADF);
1139 	if (fp->f_type != DTYPE_VNODE)
1140 		return (EOPNOTSUPP);
1141 	FREF(fp);
1142 	vp = (struct vnode *)fp->f_data;
1143 	lf.l_whence = SEEK_SET;
1144 	lf.l_start = 0;
1145 	lf.l_len = 0;
1146 	if (how & LOCK_UN) {
1147 		lf.l_type = F_UNLCK;
1148 		fp->f_flag &= ~FHASLOCK;
1149 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
1150 		goto out;
1151 	}
1152 	if (how & LOCK_EX)
1153 		lf.l_type = F_WRLCK;
1154 	else if (how & LOCK_SH)
1155 		lf.l_type = F_RDLCK;
1156 	else {
1157 		error = EINVAL;
1158 		goto out;
1159 	}
1160 	fp->f_flag |= FHASLOCK;
1161 	if (how & LOCK_NB)
1162 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK);
1163 	else
1164 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK|F_WAIT);
1165 out:
1166 	FRELE(fp, p);
1167 	return (error);
1168 }
1169 
1170 /*
1171  * File Descriptor pseudo-device driver (/dev/fd/).
1172  *
1173  * Opening minor device N dup()s the file (if any) connected to file
1174  * descriptor N belonging to the calling process.  Note that this driver
1175  * consists of only the ``open()'' routine, because all subsequent
1176  * references to this file will be direct to the other driver.
1177  */
1178 /* ARGSUSED */
1179 int
1180 filedescopen(dev_t dev, int mode, int type, struct proc *p)
1181 {
1182 
1183 	/*
1184 	 * XXX Kludge: set curproc->p_dupfd to contain the value of the
1185 	 * the file descriptor being sought for duplication. The error
1186 	 * return ensures that the vnode for this device will be released
1187 	 * by vn_open. Open will detect this special error and take the
1188 	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
1189 	 * will simply report the error.
1190 	 */
1191 	p->p_dupfd = minor(dev);
1192 	return (ENODEV);
1193 }
1194 
1195 /*
1196  * Duplicate the specified descriptor to a free descriptor.
1197  */
1198 int
1199 dupfdopen(struct filedesc *fdp, int indx, int dfd, int mode)
1200 {
1201 	struct file *wfp;
1202 
1203 	fdpassertlocked(fdp);
1204 
1205 	/*
1206 	 * Assume that the filename was user-specified; applications do
1207 	 * not tend to open /dev/fd/# when they can just call dup()
1208 	 */
1209 	if ((curproc->p_p->ps_flags & (PS_SUGIDEXEC | PS_SUGID))) {
1210 		if (curproc->p_descfd == 255)
1211 			return (EPERM);
1212 		if (curproc->p_descfd != curproc->p_dupfd)
1213 			return (EPERM);
1214 	}
1215 
1216 	/*
1217 	 * If the to-be-dup'd fd number is greater than the allowed number
1218 	 * of file descriptors, or the fd to be dup'd has already been
1219 	 * closed, reject. Note, there is no need to check for new == old
1220 	 * because fd_getfile will return NULL if the file at indx is
1221 	 * newly created by falloc (FIF_LARVAL).
1222 	 */
1223 	if ((wfp = fd_getfile(fdp, dfd)) == NULL)
1224 		return (EBADF);
1225 
1226 	/*
1227 	 * Check that the mode the file is being opened for is a
1228 	 * subset of the mode of the existing descriptor.
1229 	 */
1230 	if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag)
1231 		return (EACCES);
1232 	if (wfp->f_count == LONG_MAX-2)
1233 		return (EDEADLK);
1234 
1235 	fdp->fd_ofiles[indx] = wfp;
1236 	fdp->fd_ofileflags[indx] = (fdp->fd_ofileflags[indx] & UF_EXCLOSE) |
1237 	    (fdp->fd_ofileflags[dfd] & ~UF_EXCLOSE);
1238 	wfp->f_count++;
1239 	fd_used(fdp, indx);
1240 	return (0);
1241 }
1242 
1243 /*
1244  * Close any files on exec?
1245  */
1246 void
1247 fdcloseexec(struct proc *p)
1248 {
1249 	struct filedesc *fdp = p->p_fd;
1250 	int fd;
1251 
1252 	fdplock(fdp);
1253 	for (fd = 0; fd <= fdp->fd_lastfile; fd++)
1254 		if (fdp->fd_ofileflags[fd] & UF_EXCLOSE)
1255 			(void) fdrelease(p, fd);
1256 	fdpunlock(fdp);
1257 }
1258 
1259 int
1260 sys_closefrom(struct proc *p, void *v, register_t *retval)
1261 {
1262 	struct sys_closefrom_args *uap = v;
1263 	struct filedesc *fdp = p->p_fd;
1264 	u_int startfd, i;
1265 
1266 	startfd = SCARG(uap, fd);
1267 	fdplock(fdp);
1268 
1269 	if (startfd > fdp->fd_lastfile) {
1270 		fdpunlock(fdp);
1271 		return (EBADF);
1272 	}
1273 
1274 	for (i = startfd; i <= fdp->fd_lastfile; i++)
1275 		fdrelease(p, i);
1276 
1277 	fdpunlock(fdp);
1278 	return (0);
1279 }
1280 
1281 int
1282 sys_getdtablecount(struct proc *p, void *v, register_t *retval)
1283 {
1284 	*retval = p->p_fd->fd_openfd;
1285 	return (0);
1286 }
1287