/*	$NetBSD: kern_descrip.c,v 1.182 2008/07/02 16:45:19 matt Exp $	*/

/*-
 * Copyright (c) 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_descrip.c	8.8 (Berkeley) 2/14/95
 */

/*
 * File descriptor management.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.182 2008/07/02 16:45:19 matt Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/namei.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/kmem.h>		/* kmem_alloc()/kmem_free() in fgetdummy()/fputdummy() */
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>

static int	cwdi_ctor(void *, void *, int);
static void	cwdi_dtor(void *, void *);
static int	file_ctor(void *, void *, int);
static void	file_dtor(void *, void *);
static int	fdfile_ctor(void *, void *, int);
static void	fdfile_dtor(void *, void *);
static int	filedesc_ctor(void *, void *, int);
static void	filedesc_dtor(void *, void *);
static int	filedescopen(dev_t, int, int, lwp_t *);

kmutex_t	filelist_lock;	/* lock on filehead */
struct filelist	filehead;	/* head of list of open files */
u_int		nfiles;		/* actual number of open files */

static pool_cache_t	cwdi_cache;
static pool_cache_t	filedesc_cache;
static pool_cache_t	file_cache;
static pool_cache_t	fdfile_cache;

MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");

const struct cdevsw filedesc_cdevsw = {
	filedescopen, noclose, noread, nowrite, noioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_OTHER | D_MPSAFE,
};

/* For ease of reading. */
__strong_alias(fd_putvnode,fd_putfile)
__strong_alias(fd_putsock,fd_putfile)

/*
 * Initialize the descriptor system.
 */
void
fd_sys_init(void)
{

	mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);

	file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
	    0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
	KASSERT(file_cache != NULL);

	fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0,
	    PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor,
	    NULL);
	KASSERT(fdfile_cache != NULL);

	cwdi_cache = pool_cache_init(sizeof(struct cwdinfo), coherency_unit,
	    0, 0, "cwdi", NULL, IPL_NONE, cwdi_ctor, cwdi_dtor, NULL);
	KASSERT(cwdi_cache != NULL);

	filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
	    0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
	    NULL);
	KASSERT(filedesc_cache != NULL);
}

static int
fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
{
	int i, off, maxoff;
	uint32_t sub;

	KASSERT(mutex_owned(&fdp->fd_lock));

	if (want > bits)
		return -1;

	off = want >> NDENTRYSHIFT;
	i = want & NDENTRYMASK;
	if (i) {
		sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
		if (sub != ~0)
			goto found;
		off++;
	}

	maxoff = NDLOSLOTS(bits);
	while (off < maxoff) {
		if ((sub = bitmap[off]) != ~0)
			goto found;
		off++;
	}

	return (-1);

 found:
	return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
}

static int
fd_last_set(filedesc_t *fd, int last)
{
	int off, i;
	fdfile_t **ofiles = fd->fd_ofiles;
	uint32_t *bitmap = fd->fd_lomap;

	KASSERT(mutex_owned(&fd->fd_lock));

	off = (last - 1) >> NDENTRYSHIFT;

	while (off >= 0 && !bitmap[off])
		off--;

	if (off < 0)
		return (-1);

	i = ((off + 1) << NDENTRYSHIFT) - 1;
	if (i >= last)
		i = last - 1;

	/* XXX should use bitmap */
	/* XXXAD does not work for fd_copy() */
	while (i > 0 && (ofiles[i] == NULL || !ofiles[i]->ff_allocated))
		i--;

	return (i);
}

void
fd_used(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_ofiles[fd];

	KASSERT(mutex_owned(&fdp->fd_lock));
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) == 0);
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(!ff->ff_allocated);

	ff->ff_allocated = 1;
	fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK);
	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) == 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK);
	}

	if ((int)fd > fdp->fd_lastfile) {
		fdp->fd_lastfile = fd;
	}

	if (fd >= NDFDFILE) {
		fdp->fd_nused++;
	} else {
		KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	}
}
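
/*
 * Illustrative sketch (not from the original sources): how a descriptor
 * number maps onto the two-level allocation bitmap maintained by
 * fd_used(), fd_unused() and fd_alloc(), assuming NDENTRIES is 32 (one
 * uint32_t per fd_lomap word), so NDENTRYSHIFT is 5 and NDENTRYMASK
 * is 31.  For fd = 100:
 *
 *	off = fd >> NDENTRYSHIFT;		100 / 32 = 3
 *	bit = fd & NDENTRYMASK;			100 % 32 = 4
 *	fdp->fd_lomap[3] |= 1 << 4;		mark fd 100 in use
 *
 * Only when fd_lomap[3] becomes all ones is bit (3 & NDENTRYMASK) set
 * in fd_himap[3 >> NDENTRYSHIFT], which lets fd_alloc() skip fully
 * populated 32-descriptor blocks when scanning for a free slot.
 */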

void
fd_unused(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_ofiles[fd];

	/*
	 * Don't assert the lock is held here, as we may be copying
	 * the table during exec() and it is not needed there.
	 * procfs and sysctl are locked out by proc::p_reflock.
	 *
	 * KASSERT(mutex_owned(&fdp->fd_lock));
	 */
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);

	if (fd < fdp->fd_freefile) {
		fdp->fd_freefile = fd;
	}

	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) != 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] &=
		    ~(1 << (off & NDENTRYMASK));
	}
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
	ff->ff_allocated = 0;

	KASSERT(fd <= fdp->fd_lastfile);
	if (fd == fdp->fd_lastfile) {
		fdp->fd_lastfile = fd_last_set(fdp, fd);
	}

	if (fd >= NDFDFILE) {
		KASSERT(fdp->fd_nused > 0);
		fdp->fd_nused--;
	} else {
		KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	}
}

/*
 * Custom version of fd_unused() for fd_copy(), where the descriptor
 * table is not yet fully initialized.
 */
static inline void
fd_zap(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;

	if (fd < fdp->fd_freefile) {
		fdp->fd_freefile = fd;
	}

	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) != 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] &=
		    ~(1 << (off & NDENTRYMASK));
	}
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
}

bool
fd_isused(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;

	KASSERT(fd < fdp->fd_nfiles);

	return (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0;
}

/*
 * Look up the file structure corresponding to a file descriptor
 * and return the file, holding a reference on the descriptor.
 */
inline file_t *
fd_getfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;

	fdp = curlwp->l_fd;

	/*
	 * Look up the fdfile structure representing this descriptor.
	 * Ensure that we see fd_nfiles before fd_ofiles since we
	 * are doing this unlocked.  See fd_tryexpand().
	 */
	if (__predict_false(fd >= fdp->fd_nfiles)) {
		return NULL;
	}
	membar_consumer();
	ff = fdp->fd_ofiles[fd];
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	if (__predict_false(ff == NULL)) {
		return NULL;
	}

	/*
	 * Now get a reference to the descriptor.  Issue a memory
	 * barrier to ensure that we acquire the file pointer _after_
	 * adding a reference.  If no memory barrier, we could fetch
	 * a stale pointer.
	 */
	atomic_inc_uint(&ff->ff_refcnt);
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_enter();
#endif

	/*
	 * If the file is not open or is being closed then put the
	 * reference back.
	 */
	fp = ff->ff_file;
	if (__predict_true(fp != NULL)) {
		return fp;
	}
	fd_putfile(fd);
	return NULL;
}
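
/*
 * Illustrative sketch (not from the original sources): the usual
 * pairing of fd_getfile() and fd_putfile() in syscall code, where
 * do_something() stands in for whatever operation is performed on
 * the file while the descriptor reference is held.
 *
 *	file_t *fp;
 *	int error;
 *
 *	if ((fp = fd_getfile(fd)) == NULL)
 *		return EBADF;
 *	error = do_something(fp);
 *	fd_putfile(fd);
 *	return error;
 */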

/*
 * Release a reference to a file descriptor acquired with fd_getfile().
 */
void
fd_putfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	u_int u, v;

	fdp = curlwp->l_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd < fdp->fd_nfiles);
	KASSERT(ff != NULL);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	/*
	 * Ensure that any use of the file is complete and globally
	 * visible before dropping the final reference.  If no membar,
	 * the current CPU could still access memory associated with
	 * the file after it has been freed or recycled by another
	 * CPU.
	 */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_exit();
#endif

	/*
	 * Be optimistic and start out with the assumption that no other
	 * threads are trying to close the descriptor.  If the CAS fails,
	 * we lost a race and/or it's being closed.
	 */
	for (u = ff->ff_refcnt & FR_MASK;; u = v) {
		v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
		if (__predict_true(u == v)) {
			return;
		}
		if (__predict_false((v & FR_CLOSING) != 0)) {
			break;
		}
	}

	/* Another thread is waiting to close the file: join it. */
	(void)fd_close(fd);
}

/*
 * Convenience wrapper around fd_getfile() that returns reference
 * to a vnode.
 */
int
fd_getvnode(unsigned fd, file_t **fpp)
{
	vnode_t *vp;
	file_t *fp;

	fp = fd_getfile(fd);
	if (__predict_false(fp == NULL)) {
		return EBADF;
	}
	if (__predict_false(fp->f_type != DTYPE_VNODE)) {
		fd_putfile(fd);
		return EINVAL;
	}
	vp = fp->f_data;
	if (__predict_false(vp->v_type == VBAD)) {
		/* XXX Is this case really necessary? */
		fd_putfile(fd);
		return EBADF;
	}
	*fpp = fp;
	return 0;
}

/*
 * Convenience wrapper around fd_getfile() that returns reference
 * to a socket.
 */
int
fd_getsock(unsigned fd, struct socket **sop)
{
	file_t *fp;

	fp = fd_getfile(fd);
	if (__predict_false(fp == NULL)) {
		return EBADF;
	}
	if (__predict_false(fp->f_type != DTYPE_SOCKET)) {
		fd_putfile(fd);
		return ENOTSOCK;
	}
	*sop = fp->f_data;
	return 0;
}

/*
 * Look up the file structure corresponding to a file descriptor
 * and return it with a reference held on the file, not the
 * descriptor.
 *
 * This is heavyweight and only used when accessing descriptors
 * from a foreign process.  The caller must ensure that `p' does
 * not exit or fork across this call.
 *
 * To release the file (not descriptor) reference, use closef().
 */
file_t *
fd_getfile2(proc_t *p, unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;

	fdp = p->p_fd;
	mutex_enter(&fdp->fd_lock);
	if (fd >= fdp->fd_nfiles) {
		mutex_exit(&fdp->fd_lock);
		return NULL;
	}
	if ((ff = fdp->fd_ofiles[fd]) == NULL) {
		mutex_exit(&fdp->fd_lock);
		return NULL;
	}
	mutex_enter(&ff->ff_lock);
	if ((fp = ff->ff_file) == NULL) {
		mutex_exit(&ff->ff_lock);
		mutex_exit(&fdp->fd_lock);
		return NULL;
	}
	mutex_enter(&fp->f_lock);
	fp->f_count++;
	mutex_exit(&fp->f_lock);
	mutex_exit(&ff->ff_lock);
	mutex_exit(&fdp->fd_lock);

	return fp;
}
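
/*
 * Illustrative sketch (not from the original sources): using the
 * fd_getvnode() wrapper from vnode-oriented syscall code; do_vnode_op()
 * is a stand-in for the real work.
 *
 *	file_t *fp;
 *	vnode_t *vp;
 *	int error;
 *
 *	if ((error = fd_getvnode(fd, &fp)) != 0)
 *		return error;
 *	vp = fp->f_data;
 *	error = do_vnode_op(vp);
 *	fd_putfile(fd);
 *	return error;
 */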

/*
 * Internal form of close.  Must be called with a reference to the
 * descriptor, and will drop the reference.  When all descriptor
 * references are dropped, releases the descriptor slot and a single
 * reference to the file structure.
 */
int
fd_close(unsigned fd)
{
	struct flock lf;
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;
	proc_t *p;
	lwp_t *l;

	l = curlwp;
	p = l->l_proc;
	fdp = l->l_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	mutex_enter(&ff->ff_lock);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	if (ff->ff_file == NULL) {
		/*
		 * Another user of the file is already closing, and is
		 * waiting for other users of the file to drain.  Release
		 * our reference, and wake up the closer.
		 */
		atomic_dec_uint(&ff->ff_refcnt);
		cv_broadcast(&ff->ff_closing);
		mutex_exit(&ff->ff_lock);

		/*
		 * An application error, so pretend that the descriptor
		 * was already closed.  We can't safely wait for it to
		 * be closed without potentially deadlocking.
		 */
		return (EBADF);
	}
	KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);

	/*
	 * There may be multiple users of this file within the process.
	 * Notify existing and new users that the file is closing.  This
	 * will prevent them from adding additional uses to this file
	 * while we are closing it.
	 */
	fp = ff->ff_file;
	ff->ff_file = NULL;
	ff->ff_exclose = false;

	/*
	 * We expect the caller to hold a descriptor reference - drop it.
	 * The reference count may increase beyond zero at this point due
	 * to an erroneous descriptor reference by an application, but
	 * fd_getfile() will notice that the file is being closed and drop
	 * the reference again.
	 */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_producer();
#endif
	if (__predict_false(atomic_dec_uint_nv(&ff->ff_refcnt) != 0)) {
		/*
		 * Wait for other references to drain.  This is typically
		 * an application error - the descriptor is being closed
		 * while still in use.
		 */
		atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
		/*
		 * Remove any knotes attached to the file.  A knote
		 * attached to the descriptor can hold references on it.
		 */
		if (!SLIST_EMPTY(&ff->ff_knlist)) {
			mutex_exit(&ff->ff_lock);
			knote_fdclose(fd);
			mutex_enter(&ff->ff_lock);
		}
		/*
		 * We need to see the count drop to zero at least once,
		 * in order to ensure that all pre-existing references
		 * have been drained.  New references past this point are
		 * of no interest.
		 */
		while ((ff->ff_refcnt & FR_MASK) != 0) {
			cv_wait(&ff->ff_closing, &ff->ff_lock);
		}
		atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
	} else {
		/* If no references, there must be no knotes. */
		KASSERT(SLIST_EMPTY(&ff->ff_knlist));
	}
	mutex_exit(&ff->ff_lock);

	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor.
	 */
	if ((p->p_flag & PK_ADVLOCK) != 0 && fp->f_type == DTYPE_VNODE) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void)VOP_ADVLOCK(fp->f_data, p, F_UNLCK, &lf, F_POSIX);
	}

	/* Free descriptor slot. */
	mutex_enter(&fdp->fd_lock);
	fd_unused(fdp, fd);
	mutex_exit(&fdp->fd_lock);

	/* Now drop reference to the file itself. */
	return closef(fp);
}

/*
 * Duplicate a file descriptor.
 */
int
fd_dup(file_t *fp, int minfd, int *newp, bool exclose)
{
	proc_t *p;
	int error;

	p = curproc;

	while ((error = fd_alloc(p, minfd, newp)) != 0) {
		if (error != ENOSPC) {
			return error;
		}
		fd_tryexpand(p);
	}

	curlwp->l_fd->fd_ofiles[*newp]->ff_exclose = exclose;
	fd_affix(p, fp, *newp);
	return 0;
}

/*
 * dup2 operation.
 */
int
fd_dup2(file_t *fp, unsigned new)
{
	filedesc_t *fdp;
	fdfile_t *ff;

	fdp = curlwp->l_fd;

	/*
	 * Ensure there are enough slots in the descriptor table,
	 * and allocate an fdfile_t up front in case we need it.
	 */
	while (new >= fdp->fd_nfiles) {
		fd_tryexpand(curproc);
	}
	ff = pool_cache_get(fdfile_cache, PR_WAITOK);

	/*
	 * If there is already a file open, close it.  If the file is
	 * half open, wait for it to be constructed before closing it.
	 * XXX Potential for deadlock here?
	 */
	mutex_enter(&fdp->fd_lock);
	while (fd_isused(fdp, new)) {
		mutex_exit(&fdp->fd_lock);
		if (fd_getfile(new) != NULL) {
			(void)fd_close(new);
		} else {
			/* XXX Crummy, but unlikely to happen. */
			kpause("dup2", false, 1, NULL);
		}
		mutex_enter(&fdp->fd_lock);
	}
	if (fdp->fd_ofiles[new] == NULL) {
		KASSERT(new >= NDFDFILE);
		fdp->fd_ofiles[new] = ff;
		ff = NULL;
	}
	fd_used(fdp, new);
	mutex_exit(&fdp->fd_lock);

	/* Slot is now allocated.  Insert copy of the file. */
	fd_affix(curproc, fp, new);
	if (ff != NULL) {
		pool_cache_put(fdfile_cache, ff);
	}
	return 0;
}

/*
 * Drop reference to a file structure.
 */
int
closef(file_t *fp)
{
	struct flock lf;
	int error;

	/*
	 * Drop reference.  If referenced elsewhere it's still open
	 * and we have nothing more to do.
	 */
	mutex_enter(&fp->f_lock);
	KASSERT(fp->f_count > 0);
	if (--fp->f_count > 0) {
		mutex_exit(&fp->f_lock);
		return 0;
	}
	KASSERT(fp->f_count == 0);
	mutex_exit(&fp->f_lock);

	/* We held the last reference - release locks, close and free. */
	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void)VOP_ADVLOCK(fp->f_data, fp, F_UNLCK, &lf, F_FLOCK);
	}
	if (fp->f_ops != NULL) {
		error = (*fp->f_ops->fo_close)(fp);
	} else {
		error = 0;
	}
	ffree(fp);

	return error;
}

/*
 * Allocate a file descriptor for the process.
 */
int
fd_alloc(proc_t *p, int want, int *result)
{
	filedesc_t *fdp;
	int i, lim, last, error;
	u_int off, new;
	fdfile_t *ff;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	ff = pool_cache_get(fdfile_cache, PR_WAITOK);
	KASSERT(ff->ff_refcnt == 0);
	KASSERT(ff->ff_file == NULL);

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.
	 */
	mutex_enter(&fdp->fd_lock);
	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
	last = min(fdp->fd_nfiles, lim);
	for (;;) {
		if ((i = want) < fdp->fd_freefile)
			i = fdp->fd_freefile;
		off = i >> NDENTRYSHIFT;
		new = fd_next_zero(fdp, fdp->fd_himap, off,
		    (last + NDENTRIES - 1) >> NDENTRYSHIFT);
		if (new == -1)
			break;
		i = fd_next_zero(fdp, &fdp->fd_lomap[new],
		    new > off ? 0 : i & NDENTRYMASK, NDENTRIES);
		if (i == -1) {
			/*
			 * Free file descriptor in this block was
			 * below want, try again with higher want.
			 */
			want = (new + 1) << NDENTRYSHIFT;
			continue;
		}
		i += (new << NDENTRYSHIFT);
		if (i >= last) {
			break;
		}
		if (fdp->fd_ofiles[i] == NULL) {
			KASSERT(i >= NDFDFILE);
			fdp->fd_ofiles[i] = ff;
		} else {
			pool_cache_put(fdfile_cache, ff);
		}
		KASSERT(fdp->fd_ofiles[i]->ff_file == NULL);
		fd_used(fdp, i);
		if (want <= fdp->fd_freefile) {
			fdp->fd_freefile = i;
		}
		*result = i;
		mutex_exit(&fdp->fd_lock);
		KASSERT(i >= NDFDFILE ||
		    fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
		return 0;
	}

	/* No space in current array.  Let the caller expand and retry. */
	error = (fdp->fd_nfiles >= lim) ? EMFILE : ENOSPC;
	mutex_exit(&fdp->fd_lock);
	pool_cache_put(fdfile_cache, ff);
	return error;
}

/*
 * Expand a process' descriptor table.
 */
void
fd_tryexpand(proc_t *p)
{
	filedesc_t *fdp;
	int i, numfiles, oldnfiles;
	fdfile_t **newofile;
	uint32_t *newhimap, *newlomap;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	newhimap = NULL;
	newlomap = NULL;
	oldnfiles = fdp->fd_nfiles;

	if (oldnfiles < NDEXTENT)
		numfiles = NDEXTENT;
	else
		numfiles = 2 * oldnfiles;

	newofile = malloc(numfiles * sizeof(fdfile_t *), M_FILEDESC, M_WAITOK);
	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		newhimap = malloc(NDHISLOTS(numfiles) *
		    sizeof(uint32_t), M_FILEDESC, M_WAITOK);
		newlomap = malloc(NDLOSLOTS(numfiles) *
		    sizeof(uint32_t), M_FILEDESC, M_WAITOK);
	}

	mutex_enter(&fdp->fd_lock);
	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	if (fdp->fd_nfiles != oldnfiles) {
		/* fdp changed; caller must retry */
		mutex_exit(&fdp->fd_lock);
		free(newofile, M_FILEDESC);
		if (newhimap != NULL)
			free(newhimap, M_FILEDESC);
		if (newlomap != NULL)
			free(newlomap, M_FILEDESC);
		return;
	}

	/* Copy the existing ofile array and zero the new portion. */
	i = sizeof(fdfile_t *) * fdp->fd_nfiles;
	memcpy(newofile, fdp->fd_ofiles, i);
	memset((uint8_t *)newofile + i, 0, numfiles * sizeof(fdfile_t *) - i);

	/*
	 * Link old ofiles array into list to be discarded.  We defer
	 * freeing until process exit if the descriptor table is visible
	 * to other threads.
	 */
	if (oldnfiles > NDFILE) {
		if ((fdp->fd_refcnt | p->p_nlwps) > 1) {
			*(void **)fdp->fd_ofiles = fdp->fd_discard;
			fdp->fd_discard = fdp->fd_ofiles;
		} else {
			free(fdp->fd_ofiles, M_FILEDESC);
		}
	}

	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newhimap, fdp->fd_himap, i);
		memset((uint8_t *)newhimap + i, 0,
		    NDHISLOTS(numfiles) * sizeof(uint32_t) - i);

		i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newlomap, fdp->fd_lomap, i);
		memset((uint8_t *)newlomap + i, 0,
		    NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);

		if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
			free(fdp->fd_himap, M_FILEDESC);
			free(fdp->fd_lomap, M_FILEDESC);
		}
		fdp->fd_himap = newhimap;
		fdp->fd_lomap = newlomap;
	}

	/*
	 * All other modifications must become globally visible before
	 * the change to fd_nfiles.  See fd_getfile().
	 */
	fdp->fd_ofiles = newofile;
	membar_producer();
	fdp->fd_nfiles = numfiles;
	mutex_exit(&fdp->fd_lock);

	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
}

/*
 * Create a new open file structure and allocate a file descriptor
 * for the current process.
 */
int
fd_allocfile(file_t **resultfp, int *resultfd)
{
	file_t *fp;
	proc_t *p;
	int error;

	p = curproc;

	while ((error = fd_alloc(p, 0, resultfd)) != 0) {
		if (error != ENOSPC) {
			return error;
		}
		fd_tryexpand(p);
	}

	fp = pool_cache_get(file_cache, PR_WAITOK);
	KASSERT(fp->f_count == 0);
	fp->f_cred = kauth_cred_get();
	kauth_cred_hold(fp->f_cred);

	if (__predict_false(atomic_inc_uint_nv(&nfiles) >= maxfiles)) {
		fd_abort(p, fp, *resultfd);
		tablefull("file", "increase kern.maxfiles or MAXFILES");
		return ENFILE;
	}

	fp->f_advice = 0;
	fp->f_msgcount = 0;
	fp->f_offset = 0;
	fp->f_iflags = 0;
	*resultfp = fp;

	return 0;
}

/*
 * Successful creation of a new descriptor: make visible to the process.
 */
void
fd_affix(proc_t *p, file_t *fp, unsigned fd)
{
	fdfile_t *ff;
	filedesc_t *fdp;

	KASSERT(p == curproc || p == &proc0);

	/* Add a reference to the file structure. */
	mutex_enter(&fp->f_lock);
	fp->f_count++;
	mutex_exit(&fp->f_lock);

	/*
	 * Insert the new file into the descriptor slot.
	 *
	 * The memory barriers provided by lock activity in this routine
	 * ensure that any updates to the file structure become globally
	 * visible before the file becomes visible to other LWPs in the
	 * current process.
	 */
	fdp = p->p_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);
	KASSERT(fd_isused(fdp, fd));
	KASSERT(fd >= NDFDFILE ||
	    fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);

	/* No need to lock in order to make file initially visible. */
	ff->ff_file = fp;
}
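
/*
 * Illustrative sketch (not from the original sources): the
 * allocate/initialize/publish pattern used by open(2)-style code,
 * with setup_file() standing in for whatever initializes the file.
 *
 *	file_t *fp;
 *	int fd, error;
 *
 *	if ((error = fd_allocfile(&fp, &fd)) != 0)
 *		return error;
 *	if ((error = setup_file(fp)) != 0) {
 *		fd_abort(curproc, fp, fd);	(undo: slot and file freed)
 *		return error;
 *	}
 *	fd_affix(curproc, fp, fd);		(descriptor becomes visible)
 *	return 0;
 */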

/*
 * Abort creation of a new descriptor: free descriptor slot and file.
 */
void
fd_abort(proc_t *p, file_t *fp, unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd >= NDFDFILE ||
	    fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);

	mutex_enter(&fdp->fd_lock);
	KASSERT(fd_isused(fdp, fd));
	fd_unused(fdp, fd);
	mutex_exit(&fdp->fd_lock);

	if (fp != NULL) {
		ffree(fp);
	}
}

/*
 * Free a file structure.
 */
void
ffree(file_t *fp)
{

	KASSERT(fp->f_count == 0);

	atomic_dec_uint(&nfiles);
	kauth_cred_free(fp->f_cred);
	pool_cache_put(file_cache, fp);
}

/*
 * Create an initial cwdinfo structure, using the same current and root
 * directories as curproc.
 */
struct cwdinfo *
cwdinit(void)
{
	struct cwdinfo *cwdi;
	struct cwdinfo *copy;

	cwdi = pool_cache_get(cwdi_cache, PR_WAITOK);
	copy = curproc->p_cwdi;

	rw_enter(&copy->cwdi_lock, RW_READER);
	cwdi->cwdi_cdir = copy->cwdi_cdir;
	if (cwdi->cwdi_cdir)
		VREF(cwdi->cwdi_cdir);
	cwdi->cwdi_rdir = copy->cwdi_rdir;
	if (cwdi->cwdi_rdir)
		VREF(cwdi->cwdi_rdir);
	cwdi->cwdi_edir = copy->cwdi_edir;
	if (cwdi->cwdi_edir)
		VREF(cwdi->cwdi_edir);
	cwdi->cwdi_cmask = copy->cwdi_cmask;
	cwdi->cwdi_refcnt = 1;
	rw_exit(&copy->cwdi_lock);

	return (cwdi);
}

static int
cwdi_ctor(void *arg, void *obj, int flags)
{
	struct cwdinfo *cwdi = obj;

	rw_init(&cwdi->cwdi_lock);

	return 0;
}

static void
cwdi_dtor(void *arg, void *obj)
{
	struct cwdinfo *cwdi = obj;

	rw_destroy(&cwdi->cwdi_lock);
}

static int
file_ctor(void *arg, void *obj, int flags)
{
	file_t *fp = obj;

	memset(fp, 0, sizeof(*fp));
	mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);

	mutex_enter(&filelist_lock);
	LIST_INSERT_HEAD(&filehead, fp, f_list);
	mutex_exit(&filelist_lock);

	return 0;
}

static void
file_dtor(void *arg, void *obj)
{
	file_t *fp = obj;

	mutex_enter(&filelist_lock);
	LIST_REMOVE(fp, f_list);
	mutex_exit(&filelist_lock);

	mutex_destroy(&fp->f_lock);
}

static int
fdfile_ctor(void *arg, void *obj, int flags)
{
	fdfile_t *ff = obj;

	memset(ff, 0, sizeof(*ff));
	mutex_init(&ff->ff_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&ff->ff_closing, "fdclose");

	return 0;
}

static void
fdfile_dtor(void *arg, void *obj)
{
	fdfile_t *ff = obj;

	mutex_destroy(&ff->ff_lock);
	cv_destroy(&ff->ff_closing);
}

file_t *
fgetdummy(void)
{
	file_t *fp;

	fp = kmem_alloc(sizeof(*fp), KM_SLEEP);
	if (fp != NULL) {
		memset(fp, 0, sizeof(*fp));
		mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
	}
	return fp;
}

void
fputdummy(file_t *fp)
{

	mutex_destroy(&fp->f_lock);
	kmem_free(fp, sizeof(*fp));
}

/*
 * Make p2 share curproc's cwdinfo.
 */
void
cwdshare(struct proc *p2)
{
	struct cwdinfo *cwdi;

	cwdi = curproc->p_cwdi;

	atomic_inc_uint(&cwdi->cwdi_refcnt);
	p2->p_cwdi = cwdi;
}
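
/*
 * Illustrative sketch (not from the original sources): how fork-time
 * code might hand a cwdinfo to the child, either sharing curproc's
 * (reference counted) or giving the child a private copy.  FORK_SHARECWD
 * is the fork1() flag that selects sharing.
 *
 *	if (flags & FORK_SHARECWD)
 *		cwdshare(p2);			(share, bump refcount)
 *	else
 *		p2->p_cwdi = cwdinit();		(private copy of curproc's)
 *	...
 *	cwdfree(p->p_cwdi);			(at exit: drop the reference)
 */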

/*
 * Release a cwdinfo structure.
 */
void
cwdfree(struct cwdinfo *cwdi)
{

	if (atomic_dec_uint_nv(&cwdi->cwdi_refcnt) > 0)
		return;

	vrele(cwdi->cwdi_cdir);
	if (cwdi->cwdi_rdir)
		vrele(cwdi->cwdi_rdir);
	if (cwdi->cwdi_edir)
		vrele(cwdi->cwdi_edir);
	pool_cache_put(cwdi_cache, cwdi);
}

/*
 * Create an initial filedesc structure.
 */
filedesc_t *
fd_init(filedesc_t *fdp)
{
	unsigned fd;

	if (fdp == NULL) {
		fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
	} else {
		filedesc_ctor(NULL, fdp, PR_WAITOK);
	}

	fdp->fd_refcnt = 1;
	fdp->fd_ofiles = fdp->fd_dfiles;
	fdp->fd_nfiles = NDFILE;
	fdp->fd_himap = fdp->fd_dhimap;
	fdp->fd_lomap = fdp->fd_dlomap;
	KASSERT(fdp->fd_lastfile == -1);
	KASSERT(fdp->fd_lastkqfile == -1);
	KASSERT(fdp->fd_knhash == NULL);

	memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
	    offsetof(filedesc_t, fd_startzero));
	for (fd = 0; fd < NDFDFILE; fd++) {
		fdp->fd_ofiles[fd] = (fdfile_t *)fdp->fd_dfdfile[fd];
	}

	return fdp;
}

/*
 * Initialize a file descriptor table.
 */
static int
filedesc_ctor(void *arg, void *obj, int flag)
{
	filedesc_t *fdp = obj;
	int i;

	memset(fdp, 0, sizeof(*fdp));
	mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
	fdp->fd_lastfile = -1;
	fdp->fd_lastkqfile = -1;

	CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
	for (i = 0; i < NDFDFILE; i++) {
		fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK);
	}

	return 0;
}

static void
filedesc_dtor(void *arg, void *obj)
{
	filedesc_t *fdp = obj;
	int i;

	for (i = 0; i < NDFDFILE; i++) {
		fdfile_dtor(NULL, fdp->fd_dfdfile[i]);
	}

	mutex_destroy(&fdp->fd_lock);
}

/*
 * Make p2 share curproc's filedesc structure.
 */
void
fd_share(struct proc *p2)
{
	filedesc_t *fdp;

	fdp = curlwp->l_fd;
	p2->p_fd = fdp;
	atomic_inc_uint(&fdp->fd_refcnt);
}

/*
 * Copy a filedesc structure.
 */
filedesc_t *
fd_copy(void)
{
	filedesc_t *newfdp, *fdp;
	fdfile_t *ff, *fflist, **ffp, **nffp, *ff2;
	int i, nused, numfiles, lastfile, j, newlast;
	file_t *fp;

	fdp = curproc->p_fd;
	newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
	newfdp->fd_refcnt = 1;

	KASSERT(newfdp->fd_knhash == NULL);
	KASSERT(newfdp->fd_knhashmask == 0);
	KASSERT(newfdp->fd_discard == NULL);

	for (;;) {
		numfiles = fdp->fd_nfiles;
		lastfile = fdp->fd_lastfile;

		/*
		 * If the number of open files fits in the internal arrays
		 * of the open file structure, use them, otherwise allocate
		 * additional memory for the number of descriptors currently
		 * in use.
		 */
		if (lastfile < NDFILE) {
			i = NDFILE;
			newfdp->fd_ofiles = newfdp->fd_dfiles;
		} else {
			/*
			 * Compute the smallest multiple of NDEXTENT needed
			 * for the file descriptors currently in use,
			 * allowing the table to shrink.
1311 */ 1312 i = numfiles; 1313 while (i >= 2 * NDEXTENT && i > lastfile * 2) { 1314 i /= 2; 1315 } 1316 newfdp->fd_ofiles = malloc(i * sizeof(fdfile_t *), 1317 M_FILEDESC, M_WAITOK); 1318 KASSERT(i >= NDFILE); 1319 } 1320 if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) { 1321 newfdp->fd_himap = newfdp->fd_dhimap; 1322 newfdp->fd_lomap = newfdp->fd_dlomap; 1323 } else { 1324 newfdp->fd_himap = malloc(NDHISLOTS(i) * 1325 sizeof(uint32_t), M_FILEDESC, M_WAITOK); 1326 newfdp->fd_lomap = malloc(NDLOSLOTS(i) * 1327 sizeof(uint32_t), M_FILEDESC, M_WAITOK); 1328 } 1329 1330 /* 1331 * Allocate and string together fdfile structures. 1332 * We abuse fdfile_t::ff_file here, but it will be 1333 * cleared before this routine returns. 1334 */ 1335 nused = fdp->fd_nused; 1336 fflist = NULL; 1337 for (j = nused; j != 0; j--) { 1338 ff = pool_cache_get(fdfile_cache, PR_WAITOK); 1339 ff->ff_file = (void *)fflist; 1340 fflist = ff; 1341 } 1342 1343 mutex_enter(&fdp->fd_lock); 1344 if (numfiles == fdp->fd_nfiles && nused == fdp->fd_nused && 1345 lastfile == fdp->fd_lastfile) { 1346 break; 1347 } 1348 mutex_exit(&fdp->fd_lock); 1349 if (i >= NDFILE) { 1350 free(newfdp->fd_ofiles, M_FILEDESC); 1351 } 1352 if (NDHISLOTS(i) > NDHISLOTS(NDFILE)) { 1353 free(newfdp->fd_himap, M_FILEDESC); 1354 free(newfdp->fd_lomap, M_FILEDESC); 1355 } 1356 while (fflist != NULL) { 1357 ff = fflist; 1358 fflist = (void *)ff->ff_file; 1359 ff->ff_file = NULL; 1360 pool_cache_put(fdfile_cache, ff); 1361 } 1362 } 1363 1364 newfdp->fd_nfiles = i; 1365 newfdp->fd_freefile = fdp->fd_freefile; 1366 newfdp->fd_exclose = fdp->fd_exclose; 1367 1368 /* 1369 * Clear the entries that will not be copied over. 1370 * Avoid calling memset with 0 size. 1371 */ 1372 if (lastfile < (i-1)) { 1373 memset(newfdp->fd_ofiles + lastfile + 1, 0, 1374 (i - lastfile - 1) * sizeof(file_t **)); 1375 } 1376 if (i < NDENTRIES * NDENTRIES) { 1377 i = NDENTRIES * NDENTRIES; /* size of inlined bitmaps */ 1378 } 1379 memcpy(newfdp->fd_himap, fdp->fd_himap, NDHISLOTS(i)*sizeof(uint32_t)); 1380 memcpy(newfdp->fd_lomap, fdp->fd_lomap, NDLOSLOTS(i)*sizeof(uint32_t)); 1381 1382 ffp = fdp->fd_ofiles; 1383 nffp = newfdp->fd_ofiles; 1384 j = imax(lastfile, (NDFDFILE - 1)); 1385 newlast = -1; 1386 KASSERT(j < fdp->fd_nfiles); 1387 for (i = 0; i <= j; i++, ffp++, *nffp++ = ff2) { 1388 ff = *ffp; 1389 /* Install built-in fdfiles even if unused here. */ 1390 if (i < NDFDFILE) { 1391 ff2 = (fdfile_t *)newfdp->fd_dfdfile[i]; 1392 } else { 1393 ff2 = NULL; 1394 } 1395 /* Determine if descriptor is active in parent. */ 1396 if (ff == NULL || !fd_isused(fdp, i)) { 1397 KASSERT(ff != NULL || i >= NDFDFILE); 1398 continue; 1399 } 1400 mutex_enter(&ff->ff_lock); 1401 fp = ff->ff_file; 1402 if (fp == NULL) { 1403 /* Descriptor is half-open: free slot. */ 1404 fd_zap(newfdp, i); 1405 mutex_exit(&ff->ff_lock); 1406 continue; 1407 } 1408 if (fp->f_type == DTYPE_KQUEUE) { 1409 /* kqueue descriptors cannot be copied. */ 1410 fd_zap(newfdp, i); 1411 mutex_exit(&ff->ff_lock); 1412 continue; 1413 } 1414 /* It's active: add a reference to the file. */ 1415 mutex_enter(&fp->f_lock); 1416 fp->f_count++; 1417 mutex_exit(&fp->f_lock); 1418 /* Consume one fdfile_t to represent it. 
		if (i >= NDFDFILE) {
			ff2 = fflist;
			fflist = (void *)ff2->ff_file;
		}
		ff2->ff_file = fp;
		ff2->ff_exclose = ff->ff_exclose;
		ff2->ff_allocated = true;
		mutex_exit(&ff->ff_lock);
		if (i > newlast) {
			newlast = i;
		}
	}
	mutex_exit(&fdp->fd_lock);

	/* Discard unused fdfile_t structures. */
	while (__predict_false(fflist != NULL)) {
		ff = fflist;
		fflist = (void *)ff->ff_file;
		ff->ff_file = NULL;
		pool_cache_put(fdfile_cache, ff);
		nused--;
	}
	KASSERT(nused >= 0);
	KASSERT(newfdp->fd_ofiles[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);

	newfdp->fd_nused = nused;
	newfdp->fd_lastfile = newlast;

	return (newfdp);
}

/*
 * Release a filedesc structure.
 */
void
fd_free(void)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;
	int fd, lastfd;
	void *discard;

	fdp = curlwp->l_fd;

	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);

	if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
		return;

	/*
	 * Close any files that the process holds open.
	 */
	for (fd = 0, lastfd = fdp->fd_nfiles - 1; fd <= lastfd; fd++) {
		ff = fdp->fd_ofiles[fd];
		KASSERT(fd >= NDFDFILE ||
		    ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
		if ((ff = fdp->fd_ofiles[fd]) == NULL)
			continue;
		if ((fp = ff->ff_file) != NULL) {
			/*
			 * Must use fd_close() here as kqueue holds
			 * long term references to descriptors.
			 */
			ff->ff_refcnt++;
			fd_close(fd);
		}
		KASSERT(ff->ff_refcnt == 0);
		KASSERT(ff->ff_file == NULL);
		KASSERT(!ff->ff_exclose);
		KASSERT(!ff->ff_allocated);
		if (fd >= NDFDFILE) {
			pool_cache_put(fdfile_cache, ff);
		}
	}

	/*
	 * Clean out the descriptor table for the next user and return
	 * to the cache.
	 */
	while ((discard = fdp->fd_discard) != NULL) {
		KASSERT(discard != fdp->fd_ofiles);
		fdp->fd_discard = *(void **)discard;
		free(discard, M_FILEDESC);
	}
	if (NDHISLOTS(fdp->fd_nfiles) > NDHISLOTS(NDFILE)) {
		KASSERT(fdp->fd_himap != fdp->fd_dhimap);
		KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
		free(fdp->fd_himap, M_FILEDESC);
		free(fdp->fd_lomap, M_FILEDESC);
	}
	if (fdp->fd_nfiles > NDFILE) {
		KASSERT(fdp->fd_ofiles != fdp->fd_dfiles);
		free(fdp->fd_ofiles, M_FILEDESC);
	}
	if (fdp->fd_knhash != NULL) {
		hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask);
		fdp->fd_knhash = NULL;
		fdp->fd_knhashmask = 0;
	} else {
		KASSERT(fdp->fd_knhashmask == 0);
	}
	fdp->fd_lastkqfile = -1;
	pool_cache_put(filedesc_cache, fdp);
}
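
/*
 * Illustrative sketch (not from the original sources): how fork and
 * exit code typically use fd_share(), fd_copy() and fd_free().
 * FORK_SHAREFILES is the fork1() flag that selects sharing.
 *
 *	In fork:
 *		if (flags & FORK_SHAREFILES)
 *			fd_share(p2);		(both see one table)
 *		else
 *			p2->p_fd = fd_copy();	(child gets a private copy)
 *
 *	In exit:
 *		fd_free();			(drop curlwp's reference)
 */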

/*
 * File Descriptor pseudo-device driver (/dev/fd/).
 *
 * Opening minor device N dup()s the file (if any) connected to file
 * descriptor N belonging to the calling process.  Note that this driver
 * consists of only the ``open()'' routine, because all subsequent
 * references to this file will be direct to the other driver.
 */
static int
filedescopen(dev_t dev, int mode, int type, lwp_t *l)
{

	/*
	 * XXX Kludge: set dupfd to contain the value of the file
	 * descriptor being sought for duplication.  The error return
	 * ensures that the vnode for this device will be released by
	 * vn_open.  Open will detect this special error and take the
	 * actions in fd_dupopen() below.  Other callers of vn_open or
	 * VOP_OPEN will simply report the error.
	 */
	l->l_dupfd = minor(dev);	/* XXX */
	return EDUPFD;
}

/*
 * Duplicate the specified descriptor to a free descriptor.
 */
int
fd_dupopen(int old, int *new, int mode, int error)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;

	if ((fp = fd_getfile(old)) == NULL) {
		return EBADF;
	}
	fdp = curlwp->l_fd;
	ff = fdp->fd_ofiles[old];

	/*
	 * There are two cases of interest here.
	 *
	 * For EDUPFD simply dup (dfd) to file descriptor
	 * (indx) and return.
	 *
	 * For EMOVEFD steal away the file structure from (dfd) and
	 * store it in (indx).  (dfd) is effectively closed by
	 * this operation.
	 *
	 * Any other error code is just returned.
	 */
	switch (error) {
	case EDUPFD:
		/*
		 * Check that the mode the file is being opened for is a
		 * subset of the mode of the existing descriptor.
		 */
		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
			error = EACCES;
			break;
		}

		/* Copy it. */
		error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
		break;

	case EMOVEFD:
		/* Copy it. */
		error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
		if (error != 0) {
			break;
		}

		/* Steal away the file pointer from 'old'. */
		(void)fd_close(old);
		return 0;
	}

	fd_putfile(old);
	return error;
}

/*
 * Close open files on exec.
 */
void
fd_closeexec(void)
{
	struct cwdinfo *cwdi;
	proc_t *p;
	filedesc_t *fdp;
	fdfile_t *ff;
	lwp_t *l;
	int fd;

	l = curlwp;
	p = l->l_proc;
	fdp = p->p_fd;
	cwdi = p->p_cwdi;

	if (cwdi->cwdi_refcnt > 1) {
		cwdi = cwdinit();
		cwdfree(p->p_cwdi);
		p->p_cwdi = cwdi;
	}
	if (p->p_cwdi->cwdi_edir) {
		vrele(p->p_cwdi->cwdi_edir);
	}

	if (fdp->fd_refcnt > 1) {
		fdp = fd_copy();
		fd_free();
		p->p_fd = fdp;
		l->l_fd = fdp;
	}
	if (!fdp->fd_exclose) {
		return;
	}
	fdp->fd_exclose = false;

	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
		if ((ff = fdp->fd_ofiles[fd]) == NULL) {
			KASSERT(fd >= NDFDFILE);
			continue;
		}
		KASSERT(fd >= NDFDFILE ||
		    ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
		if (ff->ff_file == NULL)
			continue;
		if (ff->ff_exclose) {
			/*
			 * We need a reference to close the file.
			 * No other threads can see the fdfile_t at
			 * this point, so don't bother locking.
			 */
			KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
			ff->ff_refcnt++;
			fd_close(fd);
		}
	}
}

/*
 * It is unsafe for set[ug]id processes to be started with file
 * descriptors 0..2 closed, as these descriptors are given implicit
 * significance in the Standard C library.  fd_checkstd() will create a
 * descriptor referencing /dev/null for each of stdin, stdout, and
 * stderr that is not already open.
 */
#define CHECK_UPTO 3
int
fd_checkstd(void)
{
	struct proc *p;
	struct nameidata nd;
	filedesc_t *fdp;
	file_t *fp;
	struct proc *pp;
	int fd, i, error, flags = FREAD|FWRITE;
	char closed[CHECK_UPTO * 3 + 1], which[3 + 1];

	p = curproc;
	closed[0] = '\0';
	if ((fdp = p->p_fd) == NULL)
		return (0);
	for (i = 0; i < CHECK_UPTO; i++) {
		KASSERT(i >= NDFDFILE ||
		    fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
		if (fdp->fd_ofiles[i]->ff_file != NULL)
			continue;
		snprintf(which, sizeof(which), ",%d", i);
		strlcat(closed, which, sizeof(closed));
		if ((error = fd_allocfile(&fp, &fd)) != 0)
			return (error);
		KASSERT(fd < CHECK_UPTO);
		NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null");
		if ((error = vn_open(&nd, flags, 0)) != 0) {
			fd_abort(p, fp, fd);
			return (error);
		}
		fp->f_data = nd.ni_vp;
		fp->f_flag = flags;
		fp->f_ops = &vnops;
		fp->f_type = DTYPE_VNODE;
		VOP_UNLOCK(nd.ni_vp, 0);
		fd_affix(p, fp, fd);
	}
	if (closed[0] != '\0') {
		mutex_enter(proc_lock);
		pp = p->p_pptr;
		mutex_enter(pp->p_lock);
		log(LOG_WARNING, "set{u,g}id pid %d (%s) "
		    "was invoked by uid %d ppid %d (%s) "
		    "with fd %s closed\n",
		    p->p_pid, p->p_comm, kauth_cred_geteuid(pp->p_cred),
		    pp->p_pid, pp->p_comm, &closed[1]);
		mutex_exit(pp->p_lock);
		mutex_exit(proc_lock);
	}
	return (0);
}
#undef CHECK_UPTO

/*
 * Set descriptor owner.  If the owner is a process, 'pgid' is set
 * to the positive process ID.  If the owner is a process group,
 * 'pgid' is set to the negated process group ID.
 */
int
fsetown(pid_t *pgid, u_long cmd, const void *data)
{
	int id = *(const int *)data;
	int error;

	switch (cmd) {
	case TIOCSPGRP:
		if (id < 0)
			return (EINVAL);
		id = -id;
		break;
	default:
		break;
	}

	if (id > 0 && !pfind(id))
		return (ESRCH);
	else if (id < 0 && (error = pgid_in_session(curproc, -id)))
		return (error);

	*pgid = id;
	return (0);
}

/*
 * Return descriptor owner information.  If the value is positive,
 * it's the process ID.  If it's negative, it's the process group ID
 * and needs the sign removed before use.
 */
int
fgetown(pid_t pgid, u_long cmd, void *data)
{

	switch (cmd) {
	case TIOCGPGRP:
		*(int *)data = -pgid;
		break;
	default:
		*(int *)data = pgid;
		break;
	}
	return (0);
}
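
/*
 * Illustrative sketch (not from the original sources): how a driver's
 * ioctl and asynchronous-I/O paths might use the owner helpers;
 * sc_pgid is a hypothetical per-device softc field.
 *
 *	case FIOSETOWN:
 *	case TIOCSPGRP:
 *		return fsetown(&sc->sc_pgid, cmd, data);
 *	case FIOGETOWN:
 *	case TIOCGPGRP:
 *		return fgetown(sc->sc_pgid, cmd, data);
 *
 * When data arrives and SIGIO delivery is wanted:
 *
 *	fownsignal(sc->sc_pgid, SIGIO, 0, 0, NULL);
 */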

/*
 * Send signal to descriptor owner, either process or process group.
 */
void
fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
{
	struct proc *p1;
	struct pgrp *pgrp;
	ksiginfo_t ksi;

	KASSERT(!cpu_intr_p());

	KSI_INIT(&ksi);
	ksi.ksi_signo = signo;
	ksi.ksi_code = code;
	ksi.ksi_band = band;

	mutex_enter(proc_lock);
	if (pgid > 0 && (p1 = p_find(pgid, PFIND_LOCKED)))
		kpsignal(p1, &ksi, fdescdata);
	else if (pgid < 0 && (pgrp = pg_find(-pgid, PFIND_LOCKED)))
		kpgsignal(pgrp, &ksi, fdescdata, 0);
	mutex_exit(proc_lock);
}

int
fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
	 void *data)
{

	fp->f_flag = flag;
	fp->f_type = DTYPE_MISC;
	fp->f_ops = fops;
	fp->f_data = data;
	curlwp->l_dupfd = fd;
	fd_affix(curproc, fp, fd);

	return EMOVEFD;
}

int
fnullop_fcntl(file_t *fp, u_int cmd, void *data)
{

	if (cmd == F_SETFL)
		return 0;

	return EOPNOTSUPP;
}

int
fnullop_poll(file_t *fp, int which)
{

	return 0;
}

int
fnullop_kqfilter(file_t *fp, struct knote *kn)
{

	return 0;
}

int
fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
	    kauth_cred_t cred, int flags)
{

	return EOPNOTSUPP;
}

int
fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
	     kauth_cred_t cred, int flags)
{

	return EOPNOTSUPP;
}

int
fbadop_ioctl(file_t *fp, u_long com, void *data)
{

	return EOPNOTSUPP;
}

int
fbadop_stat(file_t *fp, struct stat *sb)
{

	return EOPNOTSUPP;
}

int
fbadop_close(file_t *fp)
{

	return EOPNOTSUPP;
}
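
/*
 * Illustrative sketch (not from the original sources): how a cloning
 * pseudo-device might combine fd_allocfile(), fd_clone() and the
 * fnullop_* and fbadop_* defaults.  The "example" names, exampleops
 * and example_attach_instance() are hypothetical.
 *
 *	static const struct fileops exampleops = {
 *		.fo_read = fbadop_read,
 *		.fo_write = fbadop_write,
 *		.fo_ioctl = example_ioctl,
 *		.fo_fcntl = fnullop_fcntl,
 *		.fo_poll = fnullop_poll,
 *		.fo_stat = fbadop_stat,
 *		.fo_close = example_close,
 *		.fo_kqfilter = fnullop_kqfilter,
 *	};
 *
 *	static int
 *	exampleopen(dev_t dev, int flag, int mode, lwp_t *l)
 *	{
 *		struct example_softc *sc;
 *		file_t *fp;
 *		int fd, error;
 *
 *		if ((error = fd_allocfile(&fp, &fd)) != 0)
 *			return error;
 *		sc = example_attach_instance();
 *		return fd_clone(fp, fd, flag, &exampleops, sc);
 *	}
 *
 * fd_clone() returns EMOVEFD, which the open(2) path handles via
 * fd_dupopen(), using the descriptor recorded in l_dupfd rather than
 * treating it as a real error.
 */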