1 /* $NetBSD: kern_descrip.c,v 1.243 2019/02/20 19:42:14 christos Exp $ */ 2 3 /*- 4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 1982, 1986, 1989, 1991, 1993 34 * The Regents of the University of California. All rights reserved. 35 * (c) UNIX System Laboratories, Inc. 36 * All or some portions of this file are derived from material licensed 37 * to the University of California by American Telephone and Telegraph 38 * Co. or Unix System Laboratories, Inc. 
and are reproduced herein with 39 * the permission of UNIX System Laboratories, Inc. 40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 1. Redistributions of source code must retain the above copyright 45 * notice, this list of conditions and the following disclaimer. 46 * 2. Redistributions in binary form must reproduce the above copyright 47 * notice, this list of conditions and the following disclaimer in the 48 * documentation and/or other materials provided with the distribution. 49 * 3. Neither the name of the University nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 63 * SUCH DAMAGE. 64 * 65 * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95 66 */ 67 68 /* 69 * File descriptor management. 
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.243 2019/02/20 19:42:14 christos Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/pool.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/sysctl.h>
#include <sys/ktrace.h>

/*
 * A list (head) of open files, counter, and lock protecting them.
 * file_ctor() and file_dtor() update filehead and nfiles while holding
 * filelist_lock.
 */
struct filelist	filehead __cacheline_aligned;
static u_int	nfiles __cacheline_aligned;
kmutex_t	filelist_lock __cacheline_aligned;

/*
 * Pool caches for filedesc_t, file_t and fdfile_t objects.  All three
 * are created once in fd_sys_init().
 */
static pool_cache_t	filedesc_cache __read_mostly;
static pool_cache_t	file_cache __read_mostly;
static pool_cache_t	fdfile_cache __read_mostly;

/* Pool cache constructors/destructors and the device open routine. */
static int	file_ctor(void *, void *, int);
static void	file_dtor(void *, void *);
static int	fdfile_ctor(void *, void *, int);
static void	fdfile_dtor(void *, void *);
static int	filedesc_ctor(void *, void *, int);
static void	filedesc_dtor(void *, void *);
static int	filedescopen(dev_t, int, int, lwp_t *);

/* sysctl handlers registered by fd_sys_init(), and their helpers. */
static int sysctl_kern_file(SYSCTLFN_PROTO);
static int sysctl_kern_file2(SYSCTLFN_PROTO);
static void fill_file(struct file *, const struct file *);
static void fill_file2(struct kinfo_file *, const file_t *, const fdfile_t *,
		      int, pid_t);

/*
 * Character device switch for the descriptor device.  Only d_open is
 * given a routine from this file (filedescopen); every other entry is
 * set to the corresponding no*() handler.
 */
const struct cdevsw filedesc_cdevsw = {
	.d_open = filedescopen,
	.d_close = noclose,
	.d_read = noread,
	.d_write = nowrite,
	.d_ioctl = noioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_OTHER | D_MPSAFE
};

/* For ease of reading: fd_putvnode() and fd_putsock() are fd_putfile(). */
__strong_alias(fd_putvnode,fd_putfile)
__strong_alias(fd_putsock,fd_putfile)

/*
 * Initialize the descriptor system.
 *
 * Sets up filelist_lock and the (empty) open file list, creates the
 * pool caches for file_t, fdfile_t and filedesc_t objects, and
 * registers the "file" and "file2" sysctl nodes under CTL_KERN.
 */
void
fd_sys_init(void)
{
	static struct sysctllog *clog;

	mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);

	LIST_INIT(&filehead);

	file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
	    0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
	KASSERT(file_cache != NULL);

	fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0,
	    PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor,
	    NULL);
	KASSERT(fdfile_cache != NULL);

	filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
	    0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
	    NULL);
	KASSERT(filedesc_cache != NULL);

	/* The return values of sysctl_createv() are not checked here. */
	sysctl_createv(&clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRUCT, "file",
		       SYSCTL_DESCR("System open file table"),
		       sysctl_kern_file, 0, NULL, 0,
		       CTL_KERN, KERN_FILE, CTL_EOL);
	sysctl_createv(&clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRUCT, "file2",
		       SYSCTL_DESCR("System open file table"),
		       sysctl_kern_file2, 0, NULL, 0,
		       CTL_KERN, KERN_FILE2, CTL_EOL);
}

/*
 * Return true if descriptor `fd' has its bit set in the low-level
 * allocation bitmap (fd_lomap).  `fd' must be below the current
 * table size.
 */
static bool
fd_isused(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;

	KASSERT(fd < fdp->fd_dt->dt_nfiles);

	return (fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) != 0;
}

/*
 * Verify that the bitmaps match the descriptor table.
 *
 * The checks are compiled only under DEBUG; otherwise this function
 * is empty.
 */
static inline void
fd_checkmaps(filedesc_t *fdp)
{
#ifdef DEBUG
	fdtab_t *dt;
	u_int fd;

	dt = fdp->fd_dt;
	if (fdp->fd_refcnt == -1) {
		/*
		 * fd_free tears down the table without maintaining its bitmap.
		 */
		return;
	}
	for (fd = 0; fd < dt->dt_nfiles; fd++) {
		/* The first NDFDFILE slots must be the built-in fdfiles. */
		if (fd < NDFDFILE) {
			KASSERT(dt->dt_ff[fd] ==
			    (fdfile_t *)fdp->fd_dfdfile[fd]);
		}
		/*
		 * No fdfile implies the bit is clear; an open file
		 * implies the bit is set.  A slot with an fdfile but
		 * no file may be in either state.
		 */
		if (dt->dt_ff[fd] == NULL) {
			KASSERT(!fd_isused(fdp, fd));
		} else if (dt->dt_ff[fd]->ff_file != NULL) {
			KASSERT(fd_isused(fdp, fd));
		}
	}
#endif
}

/*
 * Find the lowest clear bit at index >= `want' in `bitmap', which is
 * treated as `bits' bits packed into 32-bit words.
 *
 * Returns the bit index, or -1 if `want' is greater than `bits' or no
 * clear bit was found.  The caller must hold fdp->fd_lock.
 */
static int
fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
{
	int i, off, maxoff;
	uint32_t sub;

	KASSERT(mutex_owned(&fdp->fd_lock));

	fd_checkmaps(fdp);

	if (want > bits)
		return -1;

	off = want >> NDENTRYSHIFT;
	i = want & NDENTRYMASK;
	if (i) {
		/*
		 * `want' is not word aligned: treat the `i' low bits of
		 * the first word as set so that they are skipped.
		 */
		sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
		if (sub != ~0)
			goto found;
		off++;
	}

	/* Scan the remaining words for one that is not completely full. */
	maxoff = NDLOSLOTS(bits);
	while (off < maxoff) {
		if ((sub = bitmap[off]) != ~0)
			goto found;
		off++;
	}

	return -1;

 found:
	/* ffs(~sub) is the 1-based position of the lowest clear bit. */
	return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
}

/*
 * Return the index of the highest allocated descriptor below `last'.
 *
 * Returns -1 if no fd_lomap word at or below the one containing
 * (last - 1) has any bit set.  The caller must hold fd->fd_lock.
 */
static int
fd_last_set(filedesc_t *fd, int last)
{
	int off, i;
	fdfile_t **ff = fd->fd_dt->dt_ff;
	uint32_t *bitmap = fd->fd_lomap;

	KASSERT(mutex_owned(&fd->fd_lock));

	fd_checkmaps(fd);

	off = (last - 1) >> NDENTRYSHIFT;

	/* Walk down to the highest bitmap word that is not empty. */
	while (off >= 0 && !bitmap[off])
		off--;

	if (off < 0)
		return -1;

	/* Start from the top of that word, but never at or above `last'. */
	i = ((off + 1) << NDENTRYSHIFT) - 1;
	if (i >= last)
		i = last - 1;

	/* XXX should use bitmap */
	while (i > 0 && (ff[i] == NULL || !ff[i]->ff_allocated))
		i--;

	return i;
}

/*
 * Mark descriptor slot `fd' as allocated.
 *
 * Sets ff_allocated and the slot's bit in fd_lomap, sets the matching
 * fd_himap bit if the fd_lomap word has become full, and raises
 * fd_lastfile if needed.  The slot must already have an fdfile_t, must
 * have no file attached, and must not be allocated.  The caller must
 * hold fdp->fd_lock.
 */
static inline void
fd_used(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_dt->dt_ff[fd];

	KASSERT(mutex_owned(&fdp->fd_lock));
	KASSERT((fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) == 0);
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(!ff->ff_allocated);

	ff->ff_allocated = true;
	fdp->fd_lomap[off] |= 1U << (fd & NDENTRYMASK);
	if (__predict_false(fdp->fd_lomap[off] == ~0)) {
		/* The low word is now full: record that in the high map. */
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1U << (off & NDENTRYMASK))) == 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] |= 1U << (off & NDENTRYMASK);
	}

	if ((int)fd > fdp->fd_lastfile) {
		fdp->fd_lastfile = fd;
	}

	fd_checkmaps(fdp);
}

/*
 * Mark descriptor slot `fd' as free; the inverse of fd_used().
 *
 * Lowers fd_freefile if `fd' is below it, clears the slot's bit in
 * fd_lomap (and the fd_himap bit if the word was full), clears
 * ff_allocated, and recomputes fd_lastfile if `fd' was the last
 * descriptor.  The slot must have no file attached.
 */
static inline void
fd_unused(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_dt->dt_ff[fd];

	/*
	 * Don't assert the lock is held here, as we may be copying
	 * the table during exec() and it is not needed there.
	 * procfs and sysctl are locked out by proc::p_reflock.
	 *
	 * KASSERT(mutex_owned(&fdp->fd_lock));
	 *
	 * NOTE(review): fd_last_set(), called below when fd is the
	 * last descriptor, does assert that fd_lock is held - confirm
	 * that the unlocked callers never reach that path.
	 */
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);

	if (fd < fdp->fd_freefile) {
		fdp->fd_freefile = fd;
	}

	/* The word is about to stop being full: clear the high map bit. */
	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1U << (off & NDENTRYMASK))) != 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] &=
		    ~(1U << (off & NDENTRYMASK));
	}
	KASSERT((fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) != 0);
	fdp->fd_lomap[off] &= ~(1U << (fd & NDENTRYMASK));
	ff->ff_allocated = false;

	KASSERT(fd <= fdp->fd_lastfile);
	if (fd == fdp->fd_lastfile) {
		fdp->fd_lastfile = fd_last_set(fdp, fd);
	}
	fd_checkmaps(fdp);
}

/*
 * Look up the file structure corresponding to a file descriptor
 * and return the file, holding a reference on the descriptor.
 *
 * The lookup is done in the current LWP's descriptor table.  Returns
 * NULL, with no reference held, if `fd' is out of range, the slot has
 * no fdfile_t, or the slot has no file attached.  A successful call
 * must be paired with fd_putfile(fd).
 */
file_t *
fd_getfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;
	fdtab_t *dt;

	/*
	 * Look up the fdfile structure representing this descriptor.
	 * We are doing this unlocked.  See fd_tryexpand().
	 */
	fdp = curlwp->l_fd;
	dt = fdp->fd_dt;
	if (__predict_false(fd >= dt->dt_nfiles)) {
		return NULL;
	}
	ff = dt->dt_ff[fd];
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	if (__predict_false(ff == NULL)) {
		return NULL;
	}

	/* Now get a reference to the descriptor. */
	if (fdp->fd_refcnt == 1) {
		/*
		 * Single threaded: don't need to worry about concurrent
		 * access (other than earlier calls to kqueue, which may
		 * hold a reference to the descriptor).
		 */
		ff->ff_refcnt++;
	} else {
		/*
		 * Multi threaded: issue a memory barrier to ensure that we
		 * acquire the file pointer _after_ adding a reference.  If
		 * no memory barrier, we could fetch a stale pointer.
		 */
		atomic_inc_uint(&ff->ff_refcnt);
#ifndef __HAVE_ATOMIC_AS_MEMBAR
		membar_enter();
#endif
	}

	/*
	 * If the file is not open or is being closed then put the
	 * reference back.
	 */
	fp = ff->ff_file;
	if (__predict_true(fp != NULL)) {
		return fp;
	}
	fd_putfile(fd);
	return NULL;
}

/*
 * Release a reference to a file descriptor acquired with fd_getfile().
 *
 * If the descriptor is marked FR_CLOSING, the reference is handed to
 * fd_close() instead of simply being dropped.
 */
void
fd_putfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	u_int u, v;

	fdp = curlwp->l_fd;
	ff = fdp->fd_dt->dt_ff[fd];

	KASSERT(fd < fdp->fd_dt->dt_nfiles);
	KASSERT(ff != NULL);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	if (fdp->fd_refcnt == 1) {
		/*
		 * Single threaded: don't need to worry about concurrent
		 * access (other than earlier calls to kqueue, which may
		 * hold a reference to the descriptor).
		 */
		if (__predict_false((ff->ff_refcnt & FR_CLOSING) != 0)) {
			fd_close(fd);
			return;
		}
		ff->ff_refcnt--;
		return;
	}

	/*
	 * Ensure that any use of the file is complete and globally
	 * visible before dropping the final reference.  If no membar,
	 * the current CPU could still access memory associated with
	 * the file after it has been freed or recycled by another
	 * CPU.
	 */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_exit();
#endif

	/*
	 * Be optimistic and start out with the assumption that no other
	 * threads are trying to close the descriptor.  If the CAS fails,
	 * we lost a race and/or it's being closed.
	 *
	 * The initial expected value has FR_CLOSING masked off, so the
	 * first CAS cannot succeed while the flag is set.  On failure,
	 * retry with the value observed unless it carries FR_CLOSING.
	 */
	for (u = ff->ff_refcnt & FR_MASK;; u = v) {
		v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
		if (__predict_true(u == v)) {
			return;
		}
		if (__predict_false((v & FR_CLOSING) != 0)) {
			break;
		}
	}

	/* Another thread is waiting to close the file: join it. */
	(void)fd_close(fd);
}

/*
 * Convenience wrapper around fd_getfile() that returns reference
 * to a vnode.
 *
 * On success returns 0 and stores the file in *fpp; the caller then
 * holds a descriptor reference.  Returns EBADF if the descriptor is
 * not open or its vnode has type VBAD, and EINVAL if the file is not
 * of type DTYPE_VNODE.  No reference is held on failure and *fpp is
 * not written.
 */
int
fd_getvnode(unsigned fd, file_t **fpp)
{
	vnode_t *vp;
	file_t *fp;

	fp = fd_getfile(fd);
	if (__predict_false(fp == NULL)) {
		return EBADF;
	}
	if (__predict_false(fp->f_type != DTYPE_VNODE)) {
		fd_putfile(fd);
		return EINVAL;
	}
	vp = fp->f_vnode;
	if (__predict_false(vp->v_type == VBAD)) {
		/* XXX Is this case really necessary? */
		fd_putfile(fd);
		return EBADF;
	}
	*fpp = fp;
	return 0;
}

/*
 * Convenience wrapper around fd_getfile() that returns reference
 * to a socket.
511 */ 512 int 513 fd_getsock1(unsigned fd, struct socket **sop, file_t **fp) 514 { 515 *fp = fd_getfile(fd); 516 if (__predict_false(*fp == NULL)) { 517 return EBADF; 518 } 519 if (__predict_false((*fp)->f_type != DTYPE_SOCKET)) { 520 fd_putfile(fd); 521 return ENOTSOCK; 522 } 523 *sop = (*fp)->f_socket; 524 return 0; 525 } 526 527 int 528 fd_getsock(unsigned fd, struct socket **sop) 529 { 530 file_t *fp; 531 return fd_getsock1(fd, sop, &fp); 532 } 533 534 /* 535 * Look up the file structure corresponding to a file descriptor 536 * and return it with a reference held on the file, not the 537 * descriptor. 538 * 539 * This is heavyweight and only used when accessing descriptors 540 * from a foreign process. The caller must ensure that `p' does 541 * not exit or fork across this call. 542 * 543 * To release the file (not descriptor) reference, use closef(). 544 */ 545 file_t * 546 fd_getfile2(proc_t *p, unsigned fd) 547 { 548 filedesc_t *fdp; 549 fdfile_t *ff; 550 file_t *fp; 551 fdtab_t *dt; 552 553 fdp = p->p_fd; 554 mutex_enter(&fdp->fd_lock); 555 dt = fdp->fd_dt; 556 if (fd >= dt->dt_nfiles) { 557 mutex_exit(&fdp->fd_lock); 558 return NULL; 559 } 560 if ((ff = dt->dt_ff[fd]) == NULL) { 561 mutex_exit(&fdp->fd_lock); 562 return NULL; 563 } 564 if ((fp = ff->ff_file) == NULL) { 565 mutex_exit(&fdp->fd_lock); 566 return NULL; 567 } 568 mutex_enter(&fp->f_lock); 569 fp->f_count++; 570 mutex_exit(&fp->f_lock); 571 mutex_exit(&fdp->fd_lock); 572 573 return fp; 574 } 575 576 /* 577 * Internal form of close. Must be called with a reference to the 578 * descriptor, and will drop the reference. When all descriptor 579 * references are dropped, releases the descriptor slot and a single 580 * reference to the file structure. 
 */
int
fd_close(unsigned fd)
{
	struct flock lf;
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;
	proc_t *p;
	lwp_t *l;
	u_int refcnt;

	l = curlwp;
	p = l->l_proc;
	fdp = l->l_fd;
	ff = fdp->fd_dt->dt_ff[fd];

	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	mutex_enter(&fdp->fd_lock);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	if (__predict_false(ff->ff_file == NULL)) {
		/*
		 * Another user of the file is already closing, and is
		 * waiting for other users of the file to drain.  Release
		 * our reference, and wake up the closer.
		 */
		atomic_dec_uint(&ff->ff_refcnt);
		cv_broadcast(&ff->ff_closing);
		mutex_exit(&fdp->fd_lock);

		/*
		 * An application error, so pretend that the descriptor
		 * was already closed.  We can't safely wait for it to
		 * be closed without potentially deadlocking.
		 */
		return (EBADF);
	}
	KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);

	/*
	 * There may be multiple users of this file within the process.
	 * Notify existing and new users that the file is closing.  This
	 * will prevent them from adding additional uses to this file
	 * while we are closing it.
	 */
	fp = ff->ff_file;
	ff->ff_file = NULL;
	ff->ff_exclose = false;

	/*
	 * We expect the caller to hold a descriptor reference - drop it.
	 * The reference count may increase beyond zero at this point due
	 * to an erroneous descriptor reference by an application, but
	 * fd_getfile() will notice that the file is being closed and drop
	 * the reference again.
	 */
	if (fdp->fd_refcnt == 1) {
		/* Single threaded. */
		refcnt = --(ff->ff_refcnt);
	} else {
		/* Multi threaded. */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
		membar_producer();
#endif
		refcnt = atomic_dec_uint_nv(&ff->ff_refcnt);
	}
	if (__predict_false(refcnt != 0)) {
		/*
		 * Wait for other references to drain.  This is typically
		 * an application error - the descriptor is being closed
		 * while still in use.
		 * (Or just a threaded application trying to unblock its
		 * thread that sleeps in (say) accept()).
		 */
		atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);

		/*
		 * Remove any knotes attached to the file.  A knote
		 * attached to the descriptor can hold references on it.
		 */
		mutex_exit(&fdp->fd_lock);
		if (!SLIST_EMPTY(&ff->ff_knlist)) {
			knote_fdclose(fd);
		}

		/*
		 * Since the file system code doesn't know which fd
		 * each request came from (think dup()), we have to
		 * ask it to return ERESTART for any long-term blocks.
		 * The re-entry through read/write/etc will detect the
		 * closed fd and return EBADF.
		 * Blocked partial writes may return a short length.
		 */
		(*fp->f_ops->fo_restart)(fp);
		mutex_enter(&fdp->fd_lock);

		/*
		 * We need to see the count drop to zero at least once,
		 * in order to ensure that all pre-existing references
		 * have been drained.  New references past this point are
		 * of no interest.
		 * XXX (dsl) this may need to call fo_restart() after a
		 * timeout to guarantee that all the system calls exit.
		 */
		while ((ff->ff_refcnt & FR_MASK) != 0) {
			cv_wait(&ff->ff_closing, &fdp->fd_lock);
		}
		atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
	} else {
		/* If no references, there must be no knotes. */
		KASSERT(SLIST_EMPTY(&ff->ff_knlist));
	}

	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor.
	 *
	 * fd_lock is dropped around the VOP_ADVLOCK() call.
	 */
	if (__predict_false((p->p_flag & PK_ADVLOCK) != 0 &&
	    fp->f_type == DTYPE_VNODE)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		mutex_exit(&fdp->fd_lock);
		(void)VOP_ADVLOCK(fp->f_vnode, p, F_UNLCK, &lf, F_POSIX);
		mutex_enter(&fdp->fd_lock);
	}

	/* Free descriptor slot. */
	fd_unused(fdp, fd);
	mutex_exit(&fdp->fd_lock);

	/* Now drop reference to the file itself. */
	return closef(fp);
}

/*
 * Duplicate a file descriptor.
 *
 * Allocates the lowest free descriptor >= minfd in the current
 * process, expanding the table as needed, sets its close-on-exec
 * state to `exclose', and attaches `fp' to it.  On success returns 0
 * and stores the new descriptor in *newp; otherwise returns the
 * fd_alloc() error (anything other than ENOSPC, which is retried).
 */
int
fd_dup(file_t *fp, int minfd, int *newp, bool exclose)
{
	proc_t *p = curproc;
	int error;

	while ((error = fd_alloc(p, minfd, newp)) != 0) {
		if (error != ENOSPC) {
			return error;
		}
		fd_tryexpand(p);
	}

	curlwp->l_fd->fd_dt->dt_ff[*newp]->ff_exclose = exclose;
	fd_affix(p, fp, *newp);
	return 0;
}

/*
 * dup2 operation.
 *
 * Makes descriptor `newfd' of the current process refer to `fp',
 * closing whatever was open there first.  `flags' may contain only
 * O_CLOEXEC, O_NONBLOCK and O_NOSIGPIPE; anything else yields EINVAL.
 * Returns 0 on success.
 */
int
fd_dup2(file_t *fp, unsigned newfd, int flags)
{
	filedesc_t *fdp = curlwp->l_fd;
	fdfile_t *ff;
	fdtab_t *dt;

	if (flags & ~(O_CLOEXEC|O_NONBLOCK|O_NOSIGPIPE))
		return EINVAL;
	/*
	 * Ensure there are enough slots in the descriptor table,
	 * and allocate an fdfile_t up front in case we need it.
	 */
	while (newfd >= fdp->fd_dt->dt_nfiles) {
		fd_tryexpand(curproc);
	}
	ff = pool_cache_get(fdfile_cache, PR_WAITOK);

	/*
	 * If there is already a file open, close it.  If the file is
	 * half open, wait for it to be constructed before closing it.
	 * XXX Potential for deadlock here?
	 */
	mutex_enter(&fdp->fd_lock);
	while (fd_isused(fdp, newfd)) {
		mutex_exit(&fdp->fd_lock);
		if (fd_getfile(newfd) != NULL) {
			(void)fd_close(newfd);
		} else {
			/*
			 * Crummy, but unlikely to happen.
			 * Can occur if we interrupt another
			 * thread while it is opening a file.
			 */
			kpause("dup2", false, 1, NULL);
		}
		mutex_enter(&fdp->fd_lock);
	}
	dt = fdp->fd_dt;
	if (dt->dt_ff[newfd] == NULL) {
		/* Slot has no fdfile_t yet: install the preallocated one. */
		KASSERT(newfd >= NDFDFILE);
		dt->dt_ff[newfd] = ff;
		ff = NULL;
	}
	fd_used(fdp, newfd);
	mutex_exit(&fdp->fd_lock);

	dt->dt_ff[newfd]->ff_exclose = (flags & O_CLOEXEC) != 0;
	fp->f_flag |= flags & (FNONBLOCK|FNOSIGPIPE);
	/* Slot is now allocated.  Insert copy of the file. */
	fd_affix(curproc, fp, newfd);
	/* Return the preallocated fdfile_t if it was not needed. */
	if (ff != NULL) {
		pool_cache_put(fdfile_cache, ff);
	}
	return 0;
}

/*
 * Drop reference to a file structure.
 *
 * Returns 0 if other references remain.  When the last reference is
 * dropped, releases any flock-style lock (FHASLOCK on a vnode), calls
 * the file's fo_close routine if it has an ops vector, returns the
 * file_t to file_cache, and returns the fo_close result (0 when there
 * is no ops vector).
 */
int
closef(file_t *fp)
{
	struct flock lf;
	int error;

	/*
	 * Drop reference.  If referenced elsewhere it's still open
	 * and we have nothing more to do.
	 */
	mutex_enter(&fp->f_lock);
	KASSERT(fp->f_count > 0);
	if (--fp->f_count > 0) {
		mutex_exit(&fp->f_lock);
		return 0;
	}
	KASSERT(fp->f_count == 0);
	mutex_exit(&fp->f_lock);

	/* We held the last reference - release locks, close and free. */
	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void)VOP_ADVLOCK(fp->f_vnode, fp, F_UNLCK, &lf, F_FLOCK);
	}
	if (fp->f_ops != NULL) {
		error = (*fp->f_ops->fo_close)(fp);
	} else {
		error = 0;
	}
	KASSERT(fp->f_count == 0);
	KASSERT(fp->f_cred != NULL);
	pool_cache_put(file_cache, fp);

	return error;
}

/*
 * Allocate a file descriptor for the process.
 *
 * Searches for the lowest free slot at or above the higher of `want'
 * and fd_freefile.  On success returns 0 with the slot marked used and
 * its index in *result; no file is attached yet.  Returns EMFILE if
 * the table is already at least as large as the descriptor limit, or
 * ENOSPC if the caller should expand the table (fd_tryexpand()) and
 * retry.  `p' must be curproc or proc0.
 */
int
fd_alloc(proc_t *p, int want, int *result)
{
	filedesc_t *fdp = p->p_fd;
	int i, lim, last, error, hi;
	u_int off;
	fdtab_t *dt;

	KASSERT(p == curproc || p == &proc0);

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.
	 */
	mutex_enter(&fdp->fd_lock);
	fd_checkmaps(fdp);
	dt = fdp->fd_dt;
	KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	/* The limit is the lower of RLIMIT_NOFILE and maxfiles. */
	lim = uimin((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
	last = uimin(dt->dt_nfiles, lim);
	for (;;) {
		if ((i = want) < fdp->fd_freefile)
			i = fdp->fd_freefile;
		off = i >> NDENTRYSHIFT;
		/* First find a low-map word that is not full ... */
		hi = fd_next_zero(fdp, fdp->fd_himap, off,
		    (last + NDENTRIES - 1) >> NDENTRYSHIFT);
		if (hi == -1)
			break;
		/* ... then a clear bit within that word. */
		i = fd_next_zero(fdp, &fdp->fd_lomap[hi],
		    hi > off ? 0 : i & NDENTRYMASK, NDENTRIES);
		if (i == -1) {
			/*
			 * Free file descriptor in this block was
			 * below want, try again with higher want.
			 */
			want = (hi + 1) << NDENTRYSHIFT;
			continue;
		}
		i += (hi << NDENTRYSHIFT);
		if (i >= last) {
			break;
		}
		if (dt->dt_ff[i] == NULL) {
			KASSERT(i >= NDFDFILE);
			dt->dt_ff[i] = pool_cache_get(fdfile_cache, PR_WAITOK);
		}
		KASSERT(dt->dt_ff[i]->ff_file == NULL);
		fd_used(fdp, i);
		if (want <= fdp->fd_freefile) {
			fdp->fd_freefile = i;
		}
		*result = i;
		KASSERT(i >= NDFDFILE ||
		    dt->dt_ff[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
		fd_checkmaps(fdp);
		mutex_exit(&fdp->fd_lock);
		return 0;
	}

	/* No space in current array.  Let the caller expand and retry. */
	error = (dt->dt_nfiles >= lim) ? EMFILE : ENOSPC;
	mutex_exit(&fdp->fd_lock);
	return error;
}

/*
 * Allocate memory for a descriptor table.
 *
 * Returns a table sized for `n' descriptors (n must exceed NDFILE)
 * with dt_nfiles and dt_link initialized.  The dt_ff[] entries are
 * not initialized here (under DIAGNOSTIC they are filled with 0xff).
 */
static fdtab_t *
fd_dtab_alloc(int n)
{
	fdtab_t *dt;
	size_t sz;

	KASSERT(n > NDFILE);

	/* sizeof(*dt) is taken to cover NDFILE entries; add the rest. */
	sz = sizeof(*dt) + (n - NDFILE) * sizeof(dt->dt_ff[0]);
	dt = kmem_alloc(sz, KM_SLEEP);
#ifdef DIAGNOSTIC
	memset(dt, 0xff, sz);
#endif
	dt->dt_nfiles = n;
	dt->dt_link = NULL;
	return dt;
}

/*
 * Free a descriptor table, and all tables linked for deferred free.
936 */ 937 static void 938 fd_dtab_free(fdtab_t *dt) 939 { 940 fdtab_t *next; 941 size_t sz; 942 943 do { 944 next = dt->dt_link; 945 KASSERT(dt->dt_nfiles > NDFILE); 946 sz = sizeof(*dt) + 947 (dt->dt_nfiles - NDFILE) * sizeof(dt->dt_ff[0]); 948 #ifdef DIAGNOSTIC 949 memset(dt, 0xff, sz); 950 #endif 951 kmem_free(dt, sz); 952 dt = next; 953 } while (dt != NULL); 954 } 955 956 /* 957 * Allocate descriptor bitmap. 958 */ 959 static void 960 fd_map_alloc(int n, uint32_t **lo, uint32_t **hi) 961 { 962 uint8_t *ptr; 963 size_t szlo, szhi; 964 965 KASSERT(n > NDENTRIES); 966 967 szlo = NDLOSLOTS(n) * sizeof(uint32_t); 968 szhi = NDHISLOTS(n) * sizeof(uint32_t); 969 ptr = kmem_alloc(szlo + szhi, KM_SLEEP); 970 *lo = (uint32_t *)ptr; 971 *hi = (uint32_t *)(ptr + szlo); 972 } 973 974 /* 975 * Free descriptor bitmap. 976 */ 977 static void 978 fd_map_free(int n, uint32_t *lo, uint32_t *hi) 979 { 980 size_t szlo, szhi; 981 982 KASSERT(n > NDENTRIES); 983 984 szlo = NDLOSLOTS(n) * sizeof(uint32_t); 985 szhi = NDHISLOTS(n) * sizeof(uint32_t); 986 KASSERT(hi == (uint32_t *)((uint8_t *)lo + szlo)); 987 kmem_free(lo, szlo + szhi); 988 } 989 990 /* 991 * Expand a process' descriptor table. 
 *
 * The new size is NDEXTENT if the table is currently smaller than
 * that, otherwise twice the current size.  Memory is allocated before
 * fd_lock is taken; if the table size changed in the meantime the new
 * memory is released and nothing is expanded, so callers must re-check
 * and retry.  `p' must be curproc or proc0.
 */
void
fd_tryexpand(proc_t *p)
{
	filedesc_t *fdp;
	int i, numfiles, oldnfiles;
	fdtab_t *newdt, *dt;
	uint32_t *newhimap, *newlomap;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	newhimap = NULL;
	newlomap = NULL;
	oldnfiles = fdp->fd_dt->dt_nfiles;

	if (oldnfiles < NDEXTENT)
		numfiles = NDEXTENT;
	else
		numfiles = 2 * oldnfiles;

	newdt = fd_dtab_alloc(numfiles);
	/* New bitmaps are needed only if the high map has to grow. */
	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		fd_map_alloc(numfiles, &newlomap, &newhimap);
	}

	mutex_enter(&fdp->fd_lock);
	dt = fdp->fd_dt;
	KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	if (dt->dt_nfiles != oldnfiles) {
		/* fdp changed; caller must retry */
		mutex_exit(&fdp->fd_lock);
		fd_dtab_free(newdt);
		if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
			fd_map_free(numfiles, newlomap, newhimap);
		}
		return;
	}

	/* Copy the existing descriptor table and zero the new portion. */
	i = sizeof(fdfile_t *) * oldnfiles;
	memcpy(newdt->dt_ff, dt->dt_ff, i);
	memset((uint8_t *)newdt->dt_ff + i, 0,
	    numfiles * sizeof(fdfile_t *) - i);

	/*
	 * Link old descriptor array into list to be discarded.  We defer
	 * freeing until the last reference to the descriptor table goes
	 * away (usually process exit).  This allows us to do lockless
	 * lookups in fd_getfile().
	 *
	 * A table of NDFILE entries or fewer is neither linked nor
	 * freed here.
	 */
	if (oldnfiles > NDFILE) {
		if (fdp->fd_refcnt > 1) {
			newdt->dt_link = dt;
		} else {
			fd_dtab_free(dt);
		}
	}

	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		/* Copy both bitmaps and zero the newly added words. */
		i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newhimap, fdp->fd_himap, i);
		memset((uint8_t *)newhimap + i, 0,
		    NDHISLOTS(numfiles) * sizeof(uint32_t) - i);

		i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newlomap, fdp->fd_lomap, i);
		memset((uint8_t *)newlomap + i, 0,
		    NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);

		/* Old maps are freed only if they are larger than the NDFILE-sized ones. */
		if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
			fd_map_free(oldnfiles, fdp->fd_lomap, fdp->fd_himap);
		}
		fdp->fd_himap = newhimap;
		fdp->fd_lomap = newlomap;
	}

	/*
	 * All other modifications must become globally visible before
	 * the change to fd_dt.  See fd_getfile().
	 */
	membar_producer();
	fdp->fd_dt = newdt;
	KASSERT(newdt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	fd_checkmaps(fdp);
	mutex_exit(&fdp->fd_lock);
}

/*
 * Create a new open file structure and allocate a file descriptor
 * for the current process.
 *
 * On success returns 0, stores the descriptor in *resultfd and the
 * file in *resultfp.  The descriptor slot is allocated but the file is
 * not yet attached to it; the caller completes the operation with
 * fd_affix() or backs it out with fd_abort().  Returns the fd_alloc()
 * error (other than ENOSPC, which is retried after expanding the
 * table), or ENFILE if no file structure could be obtained.
 */
int
fd_allocfile(file_t **resultfp, int *resultfd)
{
	proc_t *p = curproc;
	kauth_cred_t cred;
	file_t *fp;
	int error;

	while ((error = fd_alloc(p, 0, resultfd)) != 0) {
		if (error != ENOSPC) {
			return error;
		}
		fd_tryexpand(p);
	}

	fp = pool_cache_get(file_cache, PR_WAITOK);
	if (fp == NULL) {
		/* Release the descriptor slot allocated above. */
		fd_abort(p, NULL, *resultfd);
		return ENFILE;
	}
	KASSERT(fp->f_count == 0);
	KASSERT(fp->f_msgcount == 0);
	KASSERT(fp->f_unpcount == 0);

	/* Replace cached credentials if not what we need. */
	cred = curlwp->l_cred;
	if (__predict_false(cred != fp->f_cred)) {
		kauth_cred_free(fp->f_cred);
		kauth_cred_hold(cred);
		fp->f_cred = cred;
	}

	/*
	 * Don't allow recycled files to be scanned.
	 * See uipc_usrreq.c.
	 */
	if (__predict_false((fp->f_flag & FSCAN) != 0)) {
		mutex_enter(&fp->f_lock);
		atomic_and_uint(&fp->f_flag, ~FSCAN);
		mutex_exit(&fp->f_lock);
	}

	fp->f_advice = 0;
	fp->f_offset = 0;
	*resultfp = fp;

	return 0;
}

/*
 * Successful creation of a new descriptor: make visible to the process.
 *
 * Takes a reference on `fp' (f_count) and stores it in descriptor slot
 * `fd' of process `p'.  The slot must already be allocated and must
 * not have a file attached.  `p' must be curproc or proc0.
 */
void
fd_affix(proc_t *p, file_t *fp, unsigned fd)
{
	fdfile_t *ff;
	filedesc_t *fdp;

	KASSERT(p == curproc || p == &proc0);

	/* Add a reference to the file structure. */
	mutex_enter(&fp->f_lock);
	fp->f_count++;
	mutex_exit(&fp->f_lock);

	/*
	 * Insert the new file into the descriptor slot.
	 *
	 * The memory barriers provided by lock activity in this routine
	 * ensure that any updates to the file structure become globally
	 * visible before the file becomes visible to other LWPs in the
	 * current process.
	 */
	fdp = p->p_fd;
	ff = fdp->fd_dt->dt_ff[fd];

	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);
	KASSERT(fd_isused(fdp, fd));
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	/* No need to lock in order to make file initially visible. */
	ff->ff_file = fp;
}

/*
 * Abort creation of a new descriptor: free descriptor slot and file.
1172 */ 1173 void 1174 fd_abort(proc_t *p, file_t *fp, unsigned fd) 1175 { 1176 filedesc_t *fdp; 1177 fdfile_t *ff; 1178 1179 KASSERT(p == curproc || p == &proc0); 1180 1181 fdp = p->p_fd; 1182 ff = fdp->fd_dt->dt_ff[fd]; 1183 ff->ff_exclose = false; 1184 1185 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]); 1186 1187 mutex_enter(&fdp->fd_lock); 1188 KASSERT(fd_isused(fdp, fd)); 1189 fd_unused(fdp, fd); 1190 mutex_exit(&fdp->fd_lock); 1191 1192 if (fp != NULL) { 1193 KASSERT(fp->f_count == 0); 1194 KASSERT(fp->f_cred != NULL); 1195 pool_cache_put(file_cache, fp); 1196 } 1197 } 1198 1199 static int 1200 file_ctor(void *arg, void *obj, int flags) 1201 { 1202 file_t *fp = obj; 1203 1204 memset(fp, 0, sizeof(*fp)); 1205 1206 mutex_enter(&filelist_lock); 1207 if (__predict_false(nfiles >= maxfiles)) { 1208 mutex_exit(&filelist_lock); 1209 tablefull("file", "increase kern.maxfiles or MAXFILES"); 1210 return ENFILE; 1211 } 1212 nfiles++; 1213 LIST_INSERT_HEAD(&filehead, fp, f_list); 1214 mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE); 1215 fp->f_cred = curlwp->l_cred; 1216 kauth_cred_hold(fp->f_cred); 1217 mutex_exit(&filelist_lock); 1218 1219 return 0; 1220 } 1221 1222 static void 1223 file_dtor(void *arg, void *obj) 1224 { 1225 file_t *fp = obj; 1226 1227 mutex_enter(&filelist_lock); 1228 nfiles--; 1229 LIST_REMOVE(fp, f_list); 1230 mutex_exit(&filelist_lock); 1231 1232 KASSERT(fp->f_count == 0); 1233 kauth_cred_free(fp->f_cred); 1234 mutex_destroy(&fp->f_lock); 1235 } 1236 1237 static int 1238 fdfile_ctor(void *arg, void *obj, int flags) 1239 { 1240 fdfile_t *ff = obj; 1241 1242 memset(ff, 0, sizeof(*ff)); 1243 cv_init(&ff->ff_closing, "fdclose"); 1244 1245 return 0; 1246 } 1247 1248 static void 1249 fdfile_dtor(void *arg, void *obj) 1250 { 1251 fdfile_t *ff = obj; 1252 1253 cv_destroy(&ff->ff_closing); 1254 } 1255 1256 file_t * 1257 fgetdummy(void) 1258 { 1259 file_t *fp; 1260 1261 fp = kmem_zalloc(sizeof(*fp), KM_SLEEP); 1262 
mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE); 1263 return fp; 1264 } 1265 1266 void 1267 fputdummy(file_t *fp) 1268 { 1269 1270 mutex_destroy(&fp->f_lock); 1271 kmem_free(fp, sizeof(*fp)); 1272 } 1273 1274 /* 1275 * Create an initial filedesc structure. 1276 */ 1277 filedesc_t * 1278 fd_init(filedesc_t *fdp) 1279 { 1280 #ifdef DIAGNOSTIC 1281 unsigned fd; 1282 #endif 1283 1284 if (__predict_true(fdp == NULL)) { 1285 fdp = pool_cache_get(filedesc_cache, PR_WAITOK); 1286 } else { 1287 KASSERT(fdp == &filedesc0); 1288 filedesc_ctor(NULL, fdp, PR_WAITOK); 1289 } 1290 1291 #ifdef DIAGNOSTIC 1292 KASSERT(fdp->fd_lastfile == -1); 1293 KASSERT(fdp->fd_lastkqfile == -1); 1294 KASSERT(fdp->fd_knhash == NULL); 1295 KASSERT(fdp->fd_freefile == 0); 1296 KASSERT(fdp->fd_exclose == false); 1297 KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin); 1298 KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE); 1299 for (fd = 0; fd < NDFDFILE; fd++) { 1300 KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] == 1301 (fdfile_t *)fdp->fd_dfdfile[fd]); 1302 } 1303 for (fd = NDFDFILE; fd < NDFILE; fd++) { 1304 KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] == NULL); 1305 } 1306 KASSERT(fdp->fd_himap == fdp->fd_dhimap); 1307 KASSERT(fdp->fd_lomap == fdp->fd_dlomap); 1308 #endif /* DIAGNOSTIC */ 1309 1310 fdp->fd_refcnt = 1; 1311 fd_checkmaps(fdp); 1312 1313 return fdp; 1314 } 1315 1316 /* 1317 * Initialize a file descriptor table. 
1318 */ 1319 static int 1320 filedesc_ctor(void *arg, void *obj, int flag) 1321 { 1322 filedesc_t *fdp = obj; 1323 fdfile_t **ffp; 1324 int i; 1325 1326 memset(fdp, 0, sizeof(*fdp)); 1327 mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE); 1328 fdp->fd_lastfile = -1; 1329 fdp->fd_lastkqfile = -1; 1330 fdp->fd_dt = &fdp->fd_dtbuiltin; 1331 fdp->fd_dtbuiltin.dt_nfiles = NDFILE; 1332 fdp->fd_himap = fdp->fd_dhimap; 1333 fdp->fd_lomap = fdp->fd_dlomap; 1334 1335 CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t)); 1336 for (i = 0, ffp = fdp->fd_dt->dt_ff; i < NDFDFILE; i++, ffp++) { 1337 *ffp = (fdfile_t *)fdp->fd_dfdfile[i]; 1338 (void)fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK); 1339 } 1340 1341 return 0; 1342 } 1343 1344 static void 1345 filedesc_dtor(void *arg, void *obj) 1346 { 1347 filedesc_t *fdp = obj; 1348 int i; 1349 1350 for (i = 0; i < NDFDFILE; i++) { 1351 fdfile_dtor(NULL, fdp->fd_dfdfile[i]); 1352 } 1353 1354 mutex_destroy(&fdp->fd_lock); 1355 } 1356 1357 /* 1358 * Make p share curproc's filedesc structure. 1359 */ 1360 void 1361 fd_share(struct proc *p) 1362 { 1363 filedesc_t *fdp; 1364 1365 fdp = curlwp->l_fd; 1366 p->p_fd = fdp; 1367 atomic_inc_uint(&fdp->fd_refcnt); 1368 } 1369 1370 /* 1371 * Acquire a hold on a filedesc structure. 1372 */ 1373 void 1374 fd_hold(lwp_t *l) 1375 { 1376 filedesc_t *fdp = l->l_fd; 1377 1378 atomic_inc_uint(&fdp->fd_refcnt); 1379 } 1380 1381 /* 1382 * Copy a filedesc structure. 
1383 */ 1384 filedesc_t * 1385 fd_copy(void) 1386 { 1387 filedesc_t *newfdp, *fdp; 1388 fdfile_t *ff, **ffp, **nffp, *ff2; 1389 int i, j, numfiles, lastfile, newlast; 1390 file_t *fp; 1391 fdtab_t *newdt; 1392 1393 fdp = curproc->p_fd; 1394 newfdp = pool_cache_get(filedesc_cache, PR_WAITOK); 1395 newfdp->fd_refcnt = 1; 1396 1397 #ifdef DIAGNOSTIC 1398 KASSERT(newfdp->fd_lastfile == -1); 1399 KASSERT(newfdp->fd_lastkqfile == -1); 1400 KASSERT(newfdp->fd_knhash == NULL); 1401 KASSERT(newfdp->fd_freefile == 0); 1402 KASSERT(newfdp->fd_exclose == false); 1403 KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin); 1404 KASSERT(newfdp->fd_dtbuiltin.dt_nfiles == NDFILE); 1405 for (i = 0; i < NDFDFILE; i++) { 1406 KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] == 1407 (fdfile_t *)&newfdp->fd_dfdfile[i]); 1408 } 1409 for (i = NDFDFILE; i < NDFILE; i++) { 1410 KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] == NULL); 1411 } 1412 #endif /* DIAGNOSTIC */ 1413 1414 mutex_enter(&fdp->fd_lock); 1415 fd_checkmaps(fdp); 1416 numfiles = fdp->fd_dt->dt_nfiles; 1417 lastfile = fdp->fd_lastfile; 1418 1419 /* 1420 * If the number of open files fits in the internal arrays 1421 * of the open file structure, use them, otherwise allocate 1422 * additional memory for the number of descriptors currently 1423 * in use. 1424 */ 1425 if (lastfile < NDFILE) { 1426 i = NDFILE; 1427 newdt = newfdp->fd_dt; 1428 KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin); 1429 } else { 1430 /* 1431 * Compute the smallest multiple of NDEXTENT needed 1432 * for the file descriptors currently in use, 1433 * allowing the table to shrink. 
1434 */ 1435 i = numfiles; 1436 while (i >= 2 * NDEXTENT && i > lastfile * 2) { 1437 i /= 2; 1438 } 1439 KASSERT(i > NDFILE); 1440 newdt = fd_dtab_alloc(i); 1441 newfdp->fd_dt = newdt; 1442 memcpy(newdt->dt_ff, newfdp->fd_dtbuiltin.dt_ff, 1443 NDFDFILE * sizeof(fdfile_t **)); 1444 memset(newdt->dt_ff + NDFDFILE, 0, 1445 (i - NDFDFILE) * sizeof(fdfile_t **)); 1446 } 1447 if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) { 1448 newfdp->fd_himap = newfdp->fd_dhimap; 1449 newfdp->fd_lomap = newfdp->fd_dlomap; 1450 } else { 1451 fd_map_alloc(i, &newfdp->fd_lomap, &newfdp->fd_himap); 1452 KASSERT(i >= NDENTRIES * NDENTRIES); 1453 memset(newfdp->fd_himap, 0, NDHISLOTS(i)*sizeof(uint32_t)); 1454 memset(newfdp->fd_lomap, 0, NDLOSLOTS(i)*sizeof(uint32_t)); 1455 } 1456 newfdp->fd_freefile = fdp->fd_freefile; 1457 newfdp->fd_exclose = fdp->fd_exclose; 1458 1459 ffp = fdp->fd_dt->dt_ff; 1460 nffp = newdt->dt_ff; 1461 newlast = -1; 1462 for (i = 0; i <= lastfile; i++, ffp++, nffp++) { 1463 KASSERT(i >= NDFDFILE || 1464 *nffp == (fdfile_t *)newfdp->fd_dfdfile[i]); 1465 ff = *ffp; 1466 if (ff == NULL || (fp = ff->ff_file) == NULL) { 1467 /* Descriptor unused, or descriptor half open. */ 1468 KASSERT(!fd_isused(newfdp, i)); 1469 continue; 1470 } 1471 if (__predict_false(fp->f_type == DTYPE_KQUEUE)) { 1472 /* kqueue descriptors cannot be copied. */ 1473 if (i < newfdp->fd_freefile) { 1474 newfdp->fd_freefile = i; 1475 } 1476 continue; 1477 } 1478 /* It's active: add a reference to the file. */ 1479 mutex_enter(&fp->f_lock); 1480 fp->f_count++; 1481 mutex_exit(&fp->f_lock); 1482 1483 /* Allocate an fdfile_t to represent it. */ 1484 if (i >= NDFDFILE) { 1485 ff2 = pool_cache_get(fdfile_cache, PR_WAITOK); 1486 *nffp = ff2; 1487 } else { 1488 ff2 = newdt->dt_ff[i]; 1489 } 1490 ff2->ff_file = fp; 1491 ff2->ff_exclose = ff->ff_exclose; 1492 ff2->ff_allocated = true; 1493 1494 /* Fix up bitmaps. 
*/ 1495 j = i >> NDENTRYSHIFT; 1496 KASSERT((newfdp->fd_lomap[j] & (1U << (i & NDENTRYMASK))) == 0); 1497 newfdp->fd_lomap[j] |= 1U << (i & NDENTRYMASK); 1498 if (__predict_false(newfdp->fd_lomap[j] == ~0)) { 1499 KASSERT((newfdp->fd_himap[j >> NDENTRYSHIFT] & 1500 (1U << (j & NDENTRYMASK))) == 0); 1501 newfdp->fd_himap[j >> NDENTRYSHIFT] |= 1502 1U << (j & NDENTRYMASK); 1503 } 1504 newlast = i; 1505 } 1506 KASSERT(newdt->dt_ff[0] == (fdfile_t *)newfdp->fd_dfdfile[0]); 1507 newfdp->fd_lastfile = newlast; 1508 fd_checkmaps(newfdp); 1509 mutex_exit(&fdp->fd_lock); 1510 1511 return newfdp; 1512 } 1513 1514 /* 1515 * Release a filedesc structure. 1516 */ 1517 void 1518 fd_free(void) 1519 { 1520 fdfile_t *ff; 1521 file_t *fp; 1522 int fd, nf; 1523 fdtab_t *dt; 1524 lwp_t * const l = curlwp; 1525 filedesc_t * const fdp = l->l_fd; 1526 const bool noadvlock = (l->l_proc->p_flag & PK_ADVLOCK) == 0; 1527 1528 KASSERT(fdp->fd_dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]); 1529 KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE); 1530 KASSERT(fdp->fd_dtbuiltin.dt_link == NULL); 1531 1532 #ifndef __HAVE_ATOMIC_AS_MEMBAR 1533 membar_exit(); 1534 #endif 1535 if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0) 1536 return; 1537 1538 /* 1539 * Close any files that the process holds open. 1540 */ 1541 dt = fdp->fd_dt; 1542 fd_checkmaps(fdp); 1543 #ifdef DEBUG 1544 fdp->fd_refcnt = -1; /* see fd_checkmaps */ 1545 #endif 1546 for (fd = 0, nf = dt->dt_nfiles; fd < nf; fd++) { 1547 ff = dt->dt_ff[fd]; 1548 KASSERT(fd >= NDFDFILE || 1549 ff == (fdfile_t *)fdp->fd_dfdfile[fd]); 1550 if (ff == NULL) 1551 continue; 1552 if ((fp = ff->ff_file) != NULL) { 1553 /* 1554 * Must use fd_close() here if there is 1555 * a reference from kqueue or we might have posix 1556 * advisory locks. 
1557 */ 1558 if (__predict_true(ff->ff_refcnt == 0) && 1559 (noadvlock || fp->f_type != DTYPE_VNODE)) { 1560 ff->ff_file = NULL; 1561 ff->ff_exclose = false; 1562 ff->ff_allocated = false; 1563 closef(fp); 1564 } else { 1565 ff->ff_refcnt++; 1566 fd_close(fd); 1567 } 1568 } 1569 KASSERT(ff->ff_refcnt == 0); 1570 KASSERT(ff->ff_file == NULL); 1571 KASSERT(!ff->ff_exclose); 1572 KASSERT(!ff->ff_allocated); 1573 if (fd >= NDFDFILE) { 1574 pool_cache_put(fdfile_cache, ff); 1575 dt->dt_ff[fd] = NULL; 1576 } 1577 } 1578 1579 /* 1580 * Clean out the descriptor table for the next user and return 1581 * to the cache. 1582 */ 1583 if (__predict_false(dt != &fdp->fd_dtbuiltin)) { 1584 fd_dtab_free(fdp->fd_dt); 1585 /* Otherwise, done above. */ 1586 memset(&fdp->fd_dtbuiltin.dt_ff[NDFDFILE], 0, 1587 (NDFILE - NDFDFILE) * sizeof(fdp->fd_dtbuiltin.dt_ff[0])); 1588 fdp->fd_dt = &fdp->fd_dtbuiltin; 1589 } 1590 if (__predict_false(NDHISLOTS(nf) > NDHISLOTS(NDFILE))) { 1591 KASSERT(fdp->fd_himap != fdp->fd_dhimap); 1592 KASSERT(fdp->fd_lomap != fdp->fd_dlomap); 1593 fd_map_free(nf, fdp->fd_lomap, fdp->fd_himap); 1594 } 1595 if (__predict_false(fdp->fd_knhash != NULL)) { 1596 hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask); 1597 fdp->fd_knhash = NULL; 1598 fdp->fd_knhashmask = 0; 1599 } else { 1600 KASSERT(fdp->fd_knhashmask == 0); 1601 } 1602 fdp->fd_dt = &fdp->fd_dtbuiltin; 1603 fdp->fd_lastkqfile = -1; 1604 fdp->fd_lastfile = -1; 1605 fdp->fd_freefile = 0; 1606 fdp->fd_exclose = false; 1607 memset(&fdp->fd_startzero, 0, sizeof(*fdp) - 1608 offsetof(filedesc_t, fd_startzero)); 1609 fdp->fd_himap = fdp->fd_dhimap; 1610 fdp->fd_lomap = fdp->fd_dlomap; 1611 KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE); 1612 KASSERT(fdp->fd_dtbuiltin.dt_link == NULL); 1613 KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin); 1614 #ifdef DEBUG 1615 fdp->fd_refcnt = 0; /* see fd_checkmaps */ 1616 #endif 1617 fd_checkmaps(fdp); 1618 pool_cache_put(filedesc_cache, fdp); 1619 } 1620 1621 /* 1622 * File 
Descriptor pseudo-device driver (/dev/fd/). 1623 * 1624 * Opening minor device N dup()s the file (if any) connected to file 1625 * descriptor N belonging to the calling process. Note that this driver 1626 * consists of only the ``open()'' routine, because all subsequent 1627 * references to this file will be direct to the other driver. 1628 */ 1629 static int 1630 filedescopen(dev_t dev, int mode, int type, lwp_t *l) 1631 { 1632 1633 /* 1634 * XXX Kludge: set dupfd to contain the value of the 1635 * the file descriptor being sought for duplication. The error 1636 * return ensures that the vnode for this device will be released 1637 * by vn_open. Open will detect this special error and take the 1638 * actions in fd_dupopen below. Other callers of vn_open or VOP_OPEN 1639 * will simply report the error. 1640 */ 1641 l->l_dupfd = minor(dev); /* XXX */ 1642 return EDUPFD; 1643 } 1644 1645 /* 1646 * Duplicate the specified descriptor to a free descriptor. 1647 */ 1648 int 1649 fd_dupopen(int old, int *newp, int mode, int error) 1650 { 1651 filedesc_t *fdp; 1652 fdfile_t *ff; 1653 file_t *fp; 1654 fdtab_t *dt; 1655 1656 if ((fp = fd_getfile(old)) == NULL) { 1657 return EBADF; 1658 } 1659 fdp = curlwp->l_fd; 1660 dt = fdp->fd_dt; 1661 ff = dt->dt_ff[old]; 1662 1663 /* 1664 * There are two cases of interest here. 1665 * 1666 * For EDUPFD simply dup (old) to file descriptor 1667 * (new) and return. 1668 * 1669 * For EMOVEFD steal away the file structure from (old) and 1670 * store it in (new). (old) is effectively closed by 1671 * this operation. 1672 * 1673 * Any other error code is just returned. 1674 */ 1675 switch (error) { 1676 case EDUPFD: 1677 /* 1678 * Check that the mode the file is being opened for is a 1679 * subset of the mode of the existing descriptor. 1680 */ 1681 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { 1682 error = EACCES; 1683 break; 1684 } 1685 1686 /* Copy it. 
*/ 1687 error = fd_dup(fp, 0, newp, ff->ff_exclose); 1688 break; 1689 1690 case EMOVEFD: 1691 /* Copy it. */ 1692 error = fd_dup(fp, 0, newp, ff->ff_exclose); 1693 if (error != 0) { 1694 break; 1695 } 1696 1697 /* Steal away the file pointer from 'old'. */ 1698 (void)fd_close(old); 1699 return 0; 1700 } 1701 1702 fd_putfile(old); 1703 return error; 1704 } 1705 1706 /* 1707 * Close open files on exec. 1708 */ 1709 void 1710 fd_closeexec(void) 1711 { 1712 proc_t *p; 1713 filedesc_t *fdp; 1714 fdfile_t *ff; 1715 lwp_t *l; 1716 fdtab_t *dt; 1717 int fd; 1718 1719 l = curlwp; 1720 p = l->l_proc; 1721 fdp = p->p_fd; 1722 1723 if (fdp->fd_refcnt > 1) { 1724 fdp = fd_copy(); 1725 fd_free(); 1726 p->p_fd = fdp; 1727 l->l_fd = fdp; 1728 } 1729 if (!fdp->fd_exclose) { 1730 return; 1731 } 1732 fdp->fd_exclose = false; 1733 dt = fdp->fd_dt; 1734 1735 for (fd = 0; fd <= fdp->fd_lastfile; fd++) { 1736 if ((ff = dt->dt_ff[fd]) == NULL) { 1737 KASSERT(fd >= NDFDFILE); 1738 continue; 1739 } 1740 KASSERT(fd >= NDFDFILE || 1741 ff == (fdfile_t *)fdp->fd_dfdfile[fd]); 1742 if (ff->ff_file == NULL) 1743 continue; 1744 if (ff->ff_exclose) { 1745 /* 1746 * We need a reference to close the file. 1747 * No other threads can see the fdfile_t at 1748 * this point, so don't bother locking. 1749 */ 1750 KASSERT((ff->ff_refcnt & FR_CLOSING) == 0); 1751 ff->ff_refcnt++; 1752 fd_close(fd); 1753 } 1754 } 1755 } 1756 1757 /* 1758 * Sets descriptor owner. If the owner is a process, 'pgid' 1759 * is set to positive value, process ID. If the owner is process group, 1760 * 'pgid' is set to -pg_id. 1761 */ 1762 int 1763 fsetown(pid_t *pgid, u_long cmd, const void *data) 1764 { 1765 pid_t id = *(const pid_t *)data; 1766 int error; 1767 1768 switch (cmd) { 1769 case TIOCSPGRP: 1770 if (id < 0) 1771 return EINVAL; 1772 id = -id; 1773 break; 1774 default: 1775 break; 1776 } 1777 if (id > 0) { 1778 mutex_enter(proc_lock); 1779 error = proc_find(id) ? 
0 : ESRCH; 1780 mutex_exit(proc_lock); 1781 } else if (id < 0) { 1782 error = pgid_in_session(curproc, -id); 1783 } else { 1784 error = 0; 1785 } 1786 if (!error) { 1787 *pgid = id; 1788 } 1789 return error; 1790 } 1791 1792 void 1793 fd_set_exclose(struct lwp *l, int fd, bool exclose) 1794 { 1795 filedesc_t *fdp = l->l_fd; 1796 fdfile_t *ff = fdp->fd_dt->dt_ff[fd]; 1797 1798 ff->ff_exclose = exclose; 1799 if (exclose) 1800 fdp->fd_exclose = true; 1801 } 1802 1803 /* 1804 * Return descriptor owner information. If the value is positive, 1805 * it's process ID. If it's negative, it's process group ID and 1806 * needs the sign removed before use. 1807 */ 1808 int 1809 fgetown(pid_t pgid, u_long cmd, void *data) 1810 { 1811 1812 switch (cmd) { 1813 case TIOCGPGRP: 1814 *(int *)data = -pgid; 1815 break; 1816 default: 1817 *(int *)data = pgid; 1818 break; 1819 } 1820 return 0; 1821 } 1822 1823 /* 1824 * Send signal to descriptor owner, either process or process group. 1825 */ 1826 void 1827 fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata) 1828 { 1829 ksiginfo_t ksi; 1830 1831 KASSERT(!cpu_intr_p()); 1832 1833 if (pgid == 0) { 1834 return; 1835 } 1836 1837 KSI_INIT(&ksi); 1838 ksi.ksi_signo = signo; 1839 ksi.ksi_code = code; 1840 ksi.ksi_band = band; 1841 1842 mutex_enter(proc_lock); 1843 if (pgid > 0) { 1844 struct proc *p1; 1845 1846 p1 = proc_find(pgid); 1847 if (p1 != NULL) { 1848 kpsignal(p1, &ksi, fdescdata); 1849 } 1850 } else { 1851 struct pgrp *pgrp; 1852 1853 KASSERT(pgid < 0); 1854 pgrp = pgrp_find(-pgid); 1855 if (pgrp != NULL) { 1856 kpgsignal(pgrp, &ksi, fdescdata, 0); 1857 } 1858 } 1859 mutex_exit(proc_lock); 1860 } 1861 1862 int 1863 fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops, 1864 void *data) 1865 { 1866 fdfile_t *ff; 1867 filedesc_t *fdp; 1868 1869 fp->f_flag = flag & FMASK; 1870 fdp = curproc->p_fd; 1871 ff = fdp->fd_dt->dt_ff[fd]; 1872 KASSERT(ff != NULL); 1873 ff->ff_exclose = (flag & O_CLOEXEC) != 0; 
1874 fp->f_type = DTYPE_MISC; 1875 fp->f_ops = fops; 1876 fp->f_data = data; 1877 curlwp->l_dupfd = fd; 1878 fd_affix(curproc, fp, fd); 1879 1880 return EMOVEFD; 1881 } 1882 1883 int 1884 fnullop_fcntl(file_t *fp, u_int cmd, void *data) 1885 { 1886 1887 if (cmd == F_SETFL) 1888 return 0; 1889 1890 return EOPNOTSUPP; 1891 } 1892 1893 int 1894 fnullop_poll(file_t *fp, int which) 1895 { 1896 1897 return 0; 1898 } 1899 1900 int 1901 fnullop_kqfilter(file_t *fp, struct knote *kn) 1902 { 1903 1904 return EOPNOTSUPP; 1905 } 1906 1907 void 1908 fnullop_restart(file_t *fp) 1909 { 1910 1911 } 1912 1913 int 1914 fbadop_read(file_t *fp, off_t *offset, struct uio *uio, 1915 kauth_cred_t cred, int flags) 1916 { 1917 1918 return EOPNOTSUPP; 1919 } 1920 1921 int 1922 fbadop_write(file_t *fp, off_t *offset, struct uio *uio, 1923 kauth_cred_t cred, int flags) 1924 { 1925 1926 return EOPNOTSUPP; 1927 } 1928 1929 int 1930 fbadop_ioctl(file_t *fp, u_long com, void *data) 1931 { 1932 1933 return EOPNOTSUPP; 1934 } 1935 1936 int 1937 fbadop_stat(file_t *fp, struct stat *sb) 1938 { 1939 1940 return EOPNOTSUPP; 1941 } 1942 1943 int 1944 fbadop_close(file_t *fp) 1945 { 1946 1947 return EOPNOTSUPP; 1948 } 1949 1950 /* 1951 * sysctl routines pertaining to file descriptors 1952 */ 1953 1954 /* Initialized in sysctl_init() for now... */ 1955 extern kmutex_t sysctl_file_marker_lock; 1956 static u_int sysctl_file_marker = 1; 1957 1958 /* 1959 * Expects to be called with proc_lock and sysctl_file_marker_lock locked. 
1960 */ 1961 static void 1962 sysctl_file_marker_reset(void) 1963 { 1964 struct proc *p; 1965 1966 PROCLIST_FOREACH(p, &allproc) { 1967 struct filedesc *fd = p->p_fd; 1968 fdtab_t *dt; 1969 u_int i; 1970 1971 mutex_enter(&fd->fd_lock); 1972 dt = fd->fd_dt; 1973 for (i = 0; i < dt->dt_nfiles; i++) { 1974 struct file *fp; 1975 fdfile_t *ff; 1976 1977 if ((ff = dt->dt_ff[i]) == NULL) { 1978 continue; 1979 } 1980 if ((fp = ff->ff_file) == NULL) { 1981 continue; 1982 } 1983 fp->f_marker = 0; 1984 } 1985 mutex_exit(&fd->fd_lock); 1986 } 1987 } 1988 1989 /* 1990 * sysctl helper routine for kern.file pseudo-subtree. 1991 */ 1992 static int 1993 sysctl_kern_file(SYSCTLFN_ARGS) 1994 { 1995 const bool allowaddr = get_expose_address(curproc); 1996 struct filelist flist; 1997 int error; 1998 size_t buflen; 1999 struct file *fp, fbuf; 2000 char *start, *where; 2001 struct proc *p; 2002 2003 start = where = oldp; 2004 buflen = *oldlenp; 2005 2006 if (where == NULL) { 2007 /* 2008 * overestimate by 10 files 2009 */ 2010 *oldlenp = sizeof(filehead) + (nfiles + 10) * 2011 sizeof(struct file); 2012 return 0; 2013 } 2014 2015 /* 2016 * first sysctl_copyout filehead 2017 */ 2018 if (buflen < sizeof(filehead)) { 2019 *oldlenp = 0; 2020 return 0; 2021 } 2022 sysctl_unlock(); 2023 if (allowaddr) { 2024 memcpy(&flist, &filehead, sizeof(flist)); 2025 } else { 2026 memset(&flist, 0, sizeof(flist)); 2027 } 2028 error = sysctl_copyout(l, &flist, where, sizeof(flist)); 2029 if (error) { 2030 sysctl_relock(); 2031 return error; 2032 } 2033 buflen -= sizeof(flist); 2034 where += sizeof(flist); 2035 2036 /* 2037 * followed by an array of file structures 2038 */ 2039 mutex_enter(&sysctl_file_marker_lock); 2040 mutex_enter(proc_lock); 2041 PROCLIST_FOREACH(p, &allproc) { 2042 struct filedesc *fd; 2043 fdtab_t *dt; 2044 u_int i; 2045 2046 if (p->p_stat == SIDL) { 2047 /* skip embryonic processes */ 2048 continue; 2049 } 2050 mutex_enter(p->p_lock); 2051 error = kauth_authorize_process(l->l_cred, 2052 
KAUTH_PROCESS_CANSEE, p, 2053 KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES), 2054 NULL, NULL); 2055 mutex_exit(p->p_lock); 2056 if (error != 0) { 2057 /* 2058 * Don't leak kauth retval if we're silently 2059 * skipping this entry. 2060 */ 2061 error = 0; 2062 continue; 2063 } 2064 2065 /* 2066 * Grab a hold on the process. 2067 */ 2068 if (!rw_tryenter(&p->p_reflock, RW_READER)) { 2069 continue; 2070 } 2071 mutex_exit(proc_lock); 2072 2073 fd = p->p_fd; 2074 mutex_enter(&fd->fd_lock); 2075 dt = fd->fd_dt; 2076 for (i = 0; i < dt->dt_nfiles; i++) { 2077 fdfile_t *ff; 2078 2079 if ((ff = dt->dt_ff[i]) == NULL) { 2080 continue; 2081 } 2082 if ((fp = ff->ff_file) == NULL) { 2083 continue; 2084 } 2085 2086 mutex_enter(&fp->f_lock); 2087 2088 if ((fp->f_count == 0) || 2089 (fp->f_marker == sysctl_file_marker)) { 2090 mutex_exit(&fp->f_lock); 2091 continue; 2092 } 2093 2094 /* Check that we have enough space. */ 2095 if (buflen < sizeof(struct file)) { 2096 *oldlenp = where - start; 2097 mutex_exit(&fp->f_lock); 2098 error = ENOMEM; 2099 break; 2100 } 2101 2102 fill_file(&fbuf, fp); 2103 mutex_exit(&fp->f_lock); 2104 error = sysctl_copyout(l, &fbuf, where, sizeof(fbuf)); 2105 if (error) { 2106 break; 2107 } 2108 buflen -= sizeof(struct file); 2109 where += sizeof(struct file); 2110 2111 fp->f_marker = sysctl_file_marker; 2112 } 2113 mutex_exit(&fd->fd_lock); 2114 2115 /* 2116 * Release reference to process. 2117 */ 2118 mutex_enter(proc_lock); 2119 rw_exit(&p->p_reflock); 2120 2121 if (error) 2122 break; 2123 } 2124 2125 sysctl_file_marker++; 2126 /* Reset all markers if wrapped. 
*/ 2127 if (sysctl_file_marker == 0) { 2128 sysctl_file_marker_reset(); 2129 sysctl_file_marker++; 2130 } 2131 2132 mutex_exit(proc_lock); 2133 mutex_exit(&sysctl_file_marker_lock); 2134 2135 *oldlenp = where - start; 2136 sysctl_relock(); 2137 return error; 2138 } 2139 2140 /* 2141 * sysctl helper function for kern.file2 2142 */ 2143 static int 2144 sysctl_kern_file2(SYSCTLFN_ARGS) 2145 { 2146 struct proc *p; 2147 struct file *fp; 2148 struct filedesc *fd; 2149 struct kinfo_file kf; 2150 char *dp; 2151 u_int i, op; 2152 size_t len, needed, elem_size, out_size; 2153 int error, arg, elem_count; 2154 fdfile_t *ff; 2155 fdtab_t *dt; 2156 2157 if (namelen == 1 && name[0] == CTL_QUERY) 2158 return sysctl_query(SYSCTLFN_CALL(rnode)); 2159 2160 if (namelen != 4) 2161 return EINVAL; 2162 2163 error = 0; 2164 dp = oldp; 2165 len = (oldp != NULL) ? *oldlenp : 0; 2166 op = name[0]; 2167 arg = name[1]; 2168 elem_size = name[2]; 2169 elem_count = name[3]; 2170 out_size = MIN(sizeof(kf), elem_size); 2171 needed = 0; 2172 2173 if (elem_size < 1 || elem_count < 0) 2174 return EINVAL; 2175 2176 switch (op) { 2177 case KERN_FILE_BYFILE: 2178 case KERN_FILE_BYPID: 2179 /* 2180 * We're traversing the process list in both cases; the BYFILE 2181 * case does additional work of keeping track of files already 2182 * looked at. 
2183 */ 2184 2185 /* doesn't use arg so it must be zero */ 2186 if ((op == KERN_FILE_BYFILE) && (arg != 0)) 2187 return EINVAL; 2188 2189 if ((op == KERN_FILE_BYPID) && (arg < -1)) 2190 /* -1 means all processes */ 2191 return EINVAL; 2192 2193 sysctl_unlock(); 2194 if (op == KERN_FILE_BYFILE) 2195 mutex_enter(&sysctl_file_marker_lock); 2196 mutex_enter(proc_lock); 2197 PROCLIST_FOREACH(p, &allproc) { 2198 if (p->p_stat == SIDL) { 2199 /* skip embryonic processes */ 2200 continue; 2201 } 2202 if (arg > 0 && p->p_pid != arg) { 2203 /* pick only the one we want */ 2204 /* XXX want 0 to mean "kernel files" */ 2205 continue; 2206 } 2207 mutex_enter(p->p_lock); 2208 error = kauth_authorize_process(l->l_cred, 2209 KAUTH_PROCESS_CANSEE, p, 2210 KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES), 2211 NULL, NULL); 2212 mutex_exit(p->p_lock); 2213 if (error != 0) { 2214 /* 2215 * Don't leak kauth retval if we're silently 2216 * skipping this entry. 2217 */ 2218 error = 0; 2219 continue; 2220 } 2221 2222 /* 2223 * Grab a hold on the process. 
2224 */ 2225 if (!rw_tryenter(&p->p_reflock, RW_READER)) { 2226 continue; 2227 } 2228 mutex_exit(proc_lock); 2229 2230 fd = p->p_fd; 2231 mutex_enter(&fd->fd_lock); 2232 dt = fd->fd_dt; 2233 for (i = 0; i < dt->dt_nfiles; i++) { 2234 if ((ff = dt->dt_ff[i]) == NULL) { 2235 continue; 2236 } 2237 if ((fp = ff->ff_file) == NULL) { 2238 continue; 2239 } 2240 2241 if ((op == KERN_FILE_BYFILE) && 2242 (fp->f_marker == sysctl_file_marker)) { 2243 continue; 2244 } 2245 if (len >= elem_size && elem_count > 0) { 2246 mutex_enter(&fp->f_lock); 2247 fill_file2(&kf, fp, ff, i, p->p_pid); 2248 mutex_exit(&fp->f_lock); 2249 mutex_exit(&fd->fd_lock); 2250 error = sysctl_copyout(l, 2251 &kf, dp, out_size); 2252 mutex_enter(&fd->fd_lock); 2253 if (error) 2254 break; 2255 dp += elem_size; 2256 len -= elem_size; 2257 } 2258 if (op == KERN_FILE_BYFILE) 2259 fp->f_marker = sysctl_file_marker; 2260 needed += elem_size; 2261 if (elem_count > 0 && elem_count != INT_MAX) 2262 elem_count--; 2263 } 2264 mutex_exit(&fd->fd_lock); 2265 2266 /* 2267 * Release reference to process. 2268 */ 2269 mutex_enter(proc_lock); 2270 rw_exit(&p->p_reflock); 2271 } 2272 if (op == KERN_FILE_BYFILE) { 2273 sysctl_file_marker++; 2274 2275 /* Reset all markers if wrapped. 
*/ 2276 if (sysctl_file_marker == 0) { 2277 sysctl_file_marker_reset(); 2278 sysctl_file_marker++; 2279 } 2280 } 2281 mutex_exit(proc_lock); 2282 if (op == KERN_FILE_BYFILE) 2283 mutex_exit(&sysctl_file_marker_lock); 2284 sysctl_relock(); 2285 break; 2286 default: 2287 return EINVAL; 2288 } 2289 2290 if (oldp == NULL) 2291 needed += KERN_FILESLOP * elem_size; 2292 *oldlenp = needed; 2293 2294 return error; 2295 } 2296 2297 static void 2298 fill_file(struct file *fp, const struct file *fpsrc) 2299 { 2300 const bool allowaddr = get_expose_address(curproc); 2301 2302 memset(fp, 0, sizeof(*fp)); 2303 2304 fp->f_offset = fpsrc->f_offset; 2305 COND_SET_VALUE(fp->f_cred, fpsrc->f_cred, allowaddr); 2306 COND_SET_VALUE(fp->f_ops, fpsrc->f_ops, allowaddr); 2307 COND_SET_VALUE(fp->f_undata, fpsrc->f_undata, allowaddr); 2308 COND_SET_VALUE(fp->f_list, fpsrc->f_list, allowaddr); 2309 COND_SET_VALUE(fp->f_lock, fpsrc->f_lock, allowaddr); 2310 fp->f_flag = fpsrc->f_flag; 2311 fp->f_marker = fpsrc->f_marker; 2312 fp->f_type = fpsrc->f_type; 2313 fp->f_advice = fpsrc->f_advice; 2314 fp->f_count = fpsrc->f_count; 2315 fp->f_msgcount = fpsrc->f_msgcount; 2316 fp->f_unpcount = fpsrc->f_unpcount; 2317 COND_SET_VALUE(fp->f_unplist, fpsrc->f_unplist, allowaddr); 2318 } 2319 2320 static void 2321 fill_file2(struct kinfo_file *kp, const file_t *fp, const fdfile_t *ff, 2322 int i, pid_t pid) 2323 { 2324 const bool allowaddr = get_expose_address(curproc); 2325 2326 memset(kp, 0, sizeof(*kp)); 2327 2328 COND_SET_VALUE(kp->ki_fileaddr, PTRTOUINT64(fp), allowaddr); 2329 kp->ki_flag = fp->f_flag; 2330 kp->ki_iflags = 0; 2331 kp->ki_ftype = fp->f_type; 2332 kp->ki_count = fp->f_count; 2333 kp->ki_msgcount = fp->f_msgcount; 2334 COND_SET_VALUE(kp->ki_fucred, PTRTOUINT64(fp->f_cred), allowaddr); 2335 kp->ki_fuid = kauth_cred_geteuid(fp->f_cred); 2336 kp->ki_fgid = kauth_cred_getegid(fp->f_cred); 2337 COND_SET_VALUE(kp->ki_fops, PTRTOUINT64(fp->f_ops), allowaddr); 2338 kp->ki_foffset = fp->f_offset; 
2339 COND_SET_VALUE(kp->ki_fdata, PTRTOUINT64(fp->f_data), allowaddr); 2340 2341 /* vnode information to glue this file to something */ 2342 if (fp->f_type == DTYPE_VNODE) { 2343 struct vnode *vp = fp->f_vnode; 2344 2345 COND_SET_VALUE(kp->ki_vun, PTRTOUINT64(vp->v_un.vu_socket), 2346 allowaddr); 2347 kp->ki_vsize = vp->v_size; 2348 kp->ki_vtype = vp->v_type; 2349 kp->ki_vtag = vp->v_tag; 2350 COND_SET_VALUE(kp->ki_vdata, PTRTOUINT64(vp->v_data), 2351 allowaddr); 2352 } 2353 2354 /* process information when retrieved via KERN_FILE_BYPID */ 2355 if (ff != NULL) { 2356 kp->ki_pid = pid; 2357 kp->ki_fd = i; 2358 kp->ki_ofileflags = ff->ff_exclose; 2359 kp->ki_usecount = ff->ff_refcnt; 2360 } 2361 } 2362