/*	$NetBSD: vfs_subr.c,v 1.339 2008/04/30 12:49:16 ad Exp $	*/

/*-
 * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * External virtual filesystem routines.
 *
 * This file contains vfs subroutines which are heavily dependent on
 * the kernel and are not suitable for standalone use.  Examples include
 * routines involving vnode and mountpoint management.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.339 2008/04/30 12:49:16 ad Exp $");

#include "opt_ddb.h"
#include "opt_compat_netbsd.h"
#include "opt_compat_43.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/syscallargs.h>
#include <sys/device.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/kthread.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/syncfs/syncfs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_ddb.h>

#include <sys/sysctl.h>

extern int dovfsusermount;	/* 1 => permit any user to mount filesystems */
extern int vfs_magiclinks;	/* 1 => expand "magic" symlinks */

static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);
static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list);

static int vrele_pending;
static kmutex_t vrele_lock;
static kcondvar_t vrele_cv;
static lwp_t *vrele_lwp;

static pool_cache_t vnode_cache;

MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");

/*
 * Local declarations.
 */

static void vrele_thread(void *);
static void insmntque(vnode_t *, struct mount *);
static int getdevvp(dev_t, vnode_t **, enum vtype);
static vnode_t *getcleanvnode(void);
void vpanic(vnode_t *, const char *);

#ifdef DIAGNOSTIC
void
vpanic(vnode_t *vp, const char *msg)
{

	vprint(NULL, vp);
	panic("%s\n", msg);
}
#else
#define	vpanic(vp, msg)	/* nothing */
#endif

void
vn_init1(void)
{

	vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl",
	    NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vnode_cache != NULL);

	/* Create deferred release thread. */
	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vrele_cv, "vrele");
	if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
	    NULL, &vrele_lwp, "vrele"))
		panic("fork vrele");
}

int
vfs_drainvnodes(long target, struct lwp *l)
{

	while (numvnodes > target) {
		vnode_t *vp;

		mutex_enter(&vnode_free_list_lock);
		vp = getcleanvnode();
		if (vp == NULL)
			return EBUSY;	/* give up */
		ungetnewvnode(vp);
	}

	return 0;
}

/*
 * grab a vnode from freelist and clean it.
 */
vnode_t *
getcleanvnode(void)
{
	vnode_t *vp;
	vnodelst_t *listhd;

	KASSERT(mutex_owned(&vnode_free_list_lock));

retry:
	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		/*
		 * It's safe to test v_usecount and v_iflag
		 * without holding the interlock here, since
		 * these vnodes should never appear on the
		 * lists.
		 */
		if (vp->v_usecount != 0) {
			vpanic(vp, "free vnode isn't");
		}
		if ((vp->v_iflag & VI_CLEAN) != 0) {
			vpanic(vp, "clean vnode on freelist");
		}
		if (vp->v_freelisthd != listhd) {
			printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd);
			vpanic(vp, "list head mismatch");
		}
		if (!mutex_tryenter(&vp->v_interlock))
			continue;
		/*
		 * Our lwp might hold the underlying vnode
		 * locked, so don't try to reclaim a VI_LAYER
		 * node if it's locked.
		 */
		if ((vp->v_iflag & VI_XLOCK) == 0 &&
		    ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
			break;
		}
		mutex_exit(&vp->v_interlock);
	}

	if (vp == NULL) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		mutex_exit(&vnode_free_list_lock);
		return NULL;
	}

	/* Remove it from the freelist. */
	TAILQ_REMOVE(listhd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);

	/*
	 * The vnode is still associated with a file system, so we must
	 * clean it out before reusing it.  We need to add a reference
	 * before doing this.  If the vnode gains another reference while
	 * being cleaned out then we lose - retry.
	 */
	vp->v_usecount++;
	vclean(vp, DOCLOSE);
	if (vp->v_usecount == 1) {
		/* We're about to dirty it. */
		vp->v_iflag &= ~VI_CLEAN;
		mutex_exit(&vp->v_interlock);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vp->v_type = VNON;
	} else {
		/*
		 * Don't return to freelist - the holder of the last
		 * reference will destroy it.
		 */
		KASSERT(vp->v_usecount > 1);
		vp->v_usecount--;
		mutex_exit(&vp->v_interlock);
		mutex_enter(&vnode_free_list_lock);
		goto retry;
	}

	if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 ||
	    !TAILQ_EMPTY(&vp->v_uobj.memq)) {
		vpanic(vp, "cleaned vnode isn't");
	}
	if (vp->v_numoutput != 0) {
		vpanic(vp, "clean vnode has pending I/O's");
	}
	if ((vp->v_iflag & VI_ONWORKLST) != 0) {
		vpanic(vp, "clean vnode on syncer list");
	}

	return vp;
}

static inline int
vfs_dobusy(struct mount *mp, const krw_t op, struct mount **nextp)
{

	KASSERT(mp->mnt_refcnt > 0);

	atomic_inc_uint(&mp->mnt_refcnt);
	if (nextp != NULL) {
		mutex_exit(&mountlist_lock);
	}
	if (mp->mnt_writer == curlwp) {
		mp->mnt_recursecnt++;
	} else {
		rw_enter(&mp->mnt_lock, op);
		if (op == RW_WRITER) {
			KASSERT(mp->mnt_writer == NULL);
			mp->mnt_writer = curlwp;
		}
	}
	if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
		if (nextp != NULL) {
			mutex_enter(&mountlist_lock);
		}
		vfs_unbusy(mp, false, nextp);
		return ENOENT;
	}

	return 0;
}

/*
 * Mark a mount point as busy, and gain a new reference to it.  Used to
 * synchronize access and to delay unmounting.
 *
 * => The caller must hold a pre-existing reference to the mount.
 */
int
vfs_busy(struct mount *mp, const krw_t op)
{
	int error;

	for (;;) {
		error = vfs_dobusy(mp, op, NULL);
		if (error != 0) {
			return error;
		}
		if (__predict_true(mp->mnt_unmounter == NULL)) {
			return 0;
		}
		mutex_enter(&mount_lock);
		if (mp->mnt_unmounter != NULL) {
			vfs_unbusy(mp, false, NULL);
			cv_wait(&mount_cv, &mount_lock);
		}
		mutex_exit(&mount_lock);
	}
}

/*
 * As vfs_busy(), but return error if the file system is being
 * unmounted (and do not wait for the unmount).
 *
 * => If nextp != NULL, mountlist_lock is understood to be held.  On
 *    failure a pointer to the next mount will be returned via nextp.
 *    The caller need not hold a reference to the mount.
 *
 * => If nextp == NULL, the caller is expected to hold a reference
 *    to the mount.
 */
int
vfs_trybusy(struct mount *mp, krw_t op, struct mount **nextp)
{
	lwp_t *l;
	int error;

	KASSERT(nextp == NULL || mutex_owned(&mountlist_lock));

	if (nextp != NULL) {
		/*
		 * We need to prevent adding a reference to the mount
		 * if it is already on the way out: the reference count
		 * could be zero, and as a result another thread could
		 * be in vfs_destroy() trying to throw away the mount.
		 *
		 * mnt_iflag is protected by mnt_lock, but this check is
		 * safe if mountlist_lock is held.  mountlist_lock will
		 * be held by vfs_destroy() before removing the mount
		 * from mountlist.
		 */
		if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
			*nextp = CIRCLEQ_NEXT(mp, mnt_list);
			return ENOENT;
		}
	}

	error = vfs_dobusy(mp, op, nextp);
	l = mp->mnt_unmounter;
	if (error == 0 && (l != NULL && l != curlwp)) {
		if (nextp != NULL) {
			mutex_enter(&mountlist_lock);
		}
		vfs_unbusy(mp, false, nextp);
		error = EBUSY;
	}
	return error;
}

/*
 * Unlock a busy filesystem and drop reference to it.  If 'keepref' is
 * true, unlock but preserve the reference.
 *
 * => If nextp != NULL, mountlist_lock is understood to be held.
 *    On failure a pointer to the next mount will be returned via nextp.
 */
void
vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp)
{

	KASSERT(mp->mnt_refcnt > 0);

	if (mp->mnt_writer == curlwp) {
		KASSERT(rw_write_held(&mp->mnt_lock));
		if (mp->mnt_recursecnt != 0) {
			mp->mnt_recursecnt--;
		} else {
			mp->mnt_writer = NULL;
			rw_exit(&mp->mnt_lock);
		}
	} else {
		rw_exit(&mp->mnt_lock);
	}
	if (nextp != NULL) {
		*nextp = CIRCLEQ_NEXT(mp, mnt_list);
	}
	if (!keepref) {
		vfs_destroy(mp, nextp != NULL);
	}
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(const char *fstypename, const char *devname,
    struct mount **mpp)
{
	struct vfsops *vfsp = NULL;
	struct mount *mp;

	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
		if (!strncmp(vfsp->vfs_name, fstypename,
		    sizeof(mp->mnt_stat.f_fstypename)))
			break;
	if (vfsp == NULL) {
		mutex_exit(&vfs_list_lock);
		return (ENODEV);
	}
	vfsp->vfs_refcount++;
	mutex_exit(&vfs_list_lock);

	mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
	if (mp == NULL)
		return ENOMEM;
	mp->mnt_refcnt = 1;
	rw_init(&mp->mnt_lock);
	mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
	(void)vfs_busy(mp, RW_WRITER);
	TAILQ_INIT(&mp->mnt_vnodelist);
	mp->mnt_op = vfsp;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULL;
	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
	    sizeof(mp->mnt_stat.f_fstypename));
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = '\0';
	mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
	    '\0';
	(void)copystr(devname, mp->mnt_stat.f_mntfromname,
	    sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
	mount_initspecific(mp);
	*mpp = mp;
	return (0);
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern int (**dead_vnodeop_p)(void *);

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    vnode_t **vpp)
{
	struct uvm_object *uobj;
	static int toggle;
	vnode_t *vp;
	int error = 0, tryalloc;

try_again:
	if (mp != NULL) {
		/*
		 * Mark filesystem busy while we're creating a
		 * vnode.  If unmount is in progress, this will
		 * wait; if the unmount succeeds (only if umount
		 * -f), this will return an error.  If the
		 * unmount fails, we'll keep going afterwards.
		 */
		error = vfs_busy(mp, RW_READER);
		if (error)
			return error;
	}

	/*
	 * We must choose whether to allocate a new vnode or recycle an
	 * existing one.  The criterion for allocating a new one is that
	 * the total number of vnodes is less than the number desired or
	 * there are no vnodes on either free list.  Generally we only
	 * want to recycle vnodes that have no buffers associated with
	 * them, so we look first on the vnode_free_list.  If it is empty,
	 * we next consider vnodes with referencing buffers on the
	 * vnode_hold_list.  The toggle ensures that half the time we
	 * will use a buffer from the vnode_hold_list, and half the time
	 * we will allocate a new one unless the list has grown to twice
	 * the desired size.  We are reticent to recycle vnodes from the
	 * vnode_hold_list because we will lose the identity of all its
	 * referencing buffers.
	 */

	vp = NULL;

	mutex_enter(&vnode_free_list_lock);

	toggle ^= 1;
	if (numvnodes > 2 * desiredvnodes)
		toggle = 0;

	tryalloc = numvnodes < desiredvnodes ||
	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
	     (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));

	if (tryalloc) {
		numvnodes++;
		mutex_exit(&vnode_free_list_lock);
		if ((vp = vnalloc(NULL)) == NULL) {
			mutex_enter(&vnode_free_list_lock);
			numvnodes--;
		} else
			vp->v_usecount = 1;
	}

	if (vp == NULL) {
		vp = getcleanvnode();
		if (vp == NULL) {
			if (mp != NULL) {
				vfs_unbusy(mp, false, NULL);
			}
			if (tryalloc) {
				printf("WARNING: unable to allocate new "
				    "vnode, retrying...\n");
				(void) tsleep(&lbolt, PRIBIO, "newvn", hz);
				goto try_again;
			}
			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
			*vpp = 0;
			return (ENFILE);
		}
		vp->v_iflag = 0;
		vp->v_vflag = 0;
		vp->v_uflag = 0;
		vp->v_socket = NULL;
	}

	KASSERT(vp->v_usecount == 1);
	KASSERT(vp->v_freelisthd == NULL);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));

	vp->v_type = VNON;
	vp->v_vnlock = &vp->v_lock;
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_data = 0;

	/*
	 * initialize uvm_object within vnode.
	 */

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	if (mp != NULL) {
		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
			vp->v_vflag |= VV_MPSAFE;
		vfs_unbusy(mp, true, NULL);
	}

	return (0);
}

/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions who may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 1);
	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_freelisthd == NULL);

	mutex_enter(&vp->v_interlock);
	vp->v_iflag |= VI_CLEAN;
	vrelel(vp, 0);
}

/*
 * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
 * marker vnode and we are prepared to wait for the allocation.
 */
vnode_t *
vnalloc(struct mount *mp)
{
	vnode_t *vp;

	vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT));
	if (vp == NULL) {
		return NULL;
	}

	memset(vp, 0, sizeof(*vp));
	UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0);
	cv_init(&vp->v_cv, "vnode");
	/*
	 * done by memset() above.
	 *	LIST_INIT(&vp->v_nclist);
	 *	LIST_INIT(&vp->v_dnclist);
	 */

	if (mp != NULL) {
		vp->v_mount = mp;
		vp->v_type = VBAD;
		vp->v_iflag = VI_MARKER;
	} else {
		rw_init(&vp->v_lock.vl_lock);
	}

	return vp;
}

/*
 * Free an unused, unreferenced vnode.
650 */ 651 void 652 vnfree(vnode_t *vp) 653 { 654 655 KASSERT(vp->v_usecount == 0); 656 657 if ((vp->v_iflag & VI_MARKER) == 0) { 658 rw_destroy(&vp->v_lock.vl_lock); 659 mutex_enter(&vnode_free_list_lock); 660 numvnodes--; 661 mutex_exit(&vnode_free_list_lock); 662 } 663 664 UVM_OBJ_DESTROY(&vp->v_uobj); 665 cv_destroy(&vp->v_cv); 666 pool_cache_put(vnode_cache, vp); 667 } 668 669 /* 670 * Remove a vnode from its freelist. 671 */ 672 static inline void 673 vremfree(vnode_t *vp) 674 { 675 676 KASSERT(mutex_owned(&vp->v_interlock)); 677 KASSERT(vp->v_usecount == 0); 678 679 /* 680 * Note that the reference count must not change until 681 * the vnode is removed. 682 */ 683 mutex_enter(&vnode_free_list_lock); 684 if (vp->v_holdcnt > 0) { 685 KASSERT(vp->v_freelisthd == &vnode_hold_list); 686 } else { 687 KASSERT(vp->v_freelisthd == &vnode_free_list); 688 } 689 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 690 vp->v_freelisthd = NULL; 691 mutex_exit(&vnode_free_list_lock); 692 } 693 694 /* 695 * Move a vnode from one mount queue to another. 696 */ 697 static void 698 insmntque(vnode_t *vp, struct mount *mp) 699 { 700 struct mount *omp; 701 702 #ifdef DIAGNOSTIC 703 if ((mp != NULL) && 704 (mp->mnt_iflag & IMNT_UNMOUNT) && 705 !(mp->mnt_flag & MNT_SOFTDEP) && 706 vp->v_tag != VT_VFS) { 707 panic("insmntque into dying filesystem"); 708 } 709 #endif 710 711 mutex_enter(&mntvnode_lock); 712 /* 713 * Delete from old mount point vnode list, if on one. 714 */ 715 if ((omp = vp->v_mount) != NULL) 716 TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes); 717 /* 718 * Insert into list of vnodes for the new mount point, if 719 * available. The caller must take a reference on the mount 720 * structure and donate to the vnode. 721 */ 722 if ((vp->v_mount = mp) != NULL) 723 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes); 724 mutex_exit(&mntvnode_lock); 725 726 if (omp != NULL) { 727 /* Release reference to old mount. */ 728 vfs_destroy(omp, false); 729 } 730 } 731 732 /* 733 * Create a vnode for a block device. 734 * Used for root filesystem and swap areas. 735 * Also used for memory file system special devices. 736 */ 737 int 738 bdevvp(dev_t dev, vnode_t **vpp) 739 { 740 741 return (getdevvp(dev, vpp, VBLK)); 742 } 743 744 /* 745 * Create a vnode for a character device. 746 * Used for kernfs and some console handling. 747 */ 748 int 749 cdevvp(dev_t dev, vnode_t **vpp) 750 { 751 752 return (getdevvp(dev, vpp, VCHR)); 753 } 754 755 /* 756 * Create a vnode for a device. 757 * Used by bdevvp (block device) for root file system etc., 758 * and by cdevvp (character device) for console and kernfs. 759 */ 760 static int 761 getdevvp(dev_t dev, vnode_t **vpp, enum vtype type) 762 { 763 vnode_t *vp; 764 vnode_t *nvp; 765 int error; 766 767 if (dev == NODEV) { 768 *vpp = NULL; 769 return (0); 770 } 771 error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp); 772 if (error) { 773 *vpp = NULL; 774 return (error); 775 } 776 vp = nvp; 777 vp->v_type = type; 778 vp->v_vflag |= VV_MPSAFE; 779 uvm_vnp_setsize(vp, 0); 780 spec_node_init(vp, dev); 781 *vpp = vp; 782 return (0); 783 } 784 785 /* 786 * Grab a particular vnode from the free list, increment its 787 * reference count and lock it. If the vnode lock bit is set the 788 * vnode is being eliminated in vgone. In that case, we can not 789 * grab the vnode, so the process is awakened when the transition is 790 * completed, and an error returned to indicate that the vnode is no 791 * longer usable (possibly having been changed to a new file system type). 
792 */ 793 int 794 vget(vnode_t *vp, int flags) 795 { 796 int error; 797 798 KASSERT((vp->v_iflag & VI_MARKER) == 0); 799 800 if ((flags & LK_INTERLOCK) == 0) 801 mutex_enter(&vp->v_interlock); 802 803 /* 804 * Before adding a reference, we must remove the vnode 805 * from its freelist. 806 */ 807 if (vp->v_usecount == 0) { 808 vremfree(vp); 809 } 810 if (++vp->v_usecount == 0) { 811 vpanic(vp, "vget: usecount overflow"); 812 } 813 814 /* 815 * If the vnode is in the process of being cleaned out for 816 * another use, we wait for the cleaning to finish and then 817 * return failure. Cleaning is determined by checking if 818 * the VI_XLOCK or VI_FREEING flags are set. 819 */ 820 if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) { 821 if ((flags & LK_NOWAIT) != 0) { 822 vrelel(vp, 0); 823 return EBUSY; 824 } 825 vwait(vp, VI_XLOCK | VI_FREEING); 826 vrelel(vp, 0); 827 return ENOENT; 828 } 829 if (flags & LK_TYPE_MASK) { 830 error = vn_lock(vp, flags | LK_INTERLOCK); 831 if (error != 0) { 832 vrele(vp); 833 } 834 return error; 835 } 836 mutex_exit(&vp->v_interlock); 837 return 0; 838 } 839 840 /* 841 * vput(), just unlock and vrele() 842 */ 843 void 844 vput(vnode_t *vp) 845 { 846 847 KASSERT((vp->v_iflag & VI_MARKER) == 0); 848 849 VOP_UNLOCK(vp, 0); 850 vrele(vp); 851 } 852 853 /* 854 * Vnode release. If reference count drops to zero, call inactive 855 * routine and either return to freelist or free to the pool. 856 */ 857 void 858 vrelel(vnode_t *vp, int flags) 859 { 860 bool recycle, defer; 861 int error; 862 863 KASSERT(mutex_owned(&vp->v_interlock)); 864 KASSERT((vp->v_iflag & VI_MARKER) == 0); 865 KASSERT(vp->v_freelisthd == NULL); 866 867 if (vp->v_op == dead_vnodeop_p && (vp->v_iflag & VI_CLEAN) == 0) { 868 vpanic(vp, "dead but not clean"); 869 } 870 871 /* 872 * If not the last reference, just drop the reference count 873 * and unlock. 874 */ 875 if (vp->v_usecount > 1) { 876 vp->v_usecount--; 877 vp->v_iflag |= VI_INACTREDO; 878 mutex_exit(&vp->v_interlock); 879 return; 880 } 881 if (vp->v_usecount <= 0 || vp->v_writecount != 0) { 882 vpanic(vp, "vput: bad ref count"); 883 } 884 885 /* 886 * If not clean, deactivate the vnode, but preserve 887 * our reference across the call to VOP_INACTIVE(). 888 */ 889 retry: 890 if ((vp->v_iflag & VI_CLEAN) == 0) { 891 recycle = false; 892 /* 893 * XXX This ugly block can be largely eliminated if 894 * locking is pushed down into the file systems. 895 */ 896 if (curlwp == uvm.pagedaemon_lwp) { 897 /* The pagedaemon can't wait around; defer. */ 898 defer = true; 899 } else if (curlwp == vrele_lwp) { 900 /* We have to try harder. */ 901 vp->v_iflag &= ~VI_INACTREDO; 902 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | 903 LK_RETRY); 904 if (error != 0) { 905 /* XXX */ 906 vpanic(vp, "vrele: unable to lock %p"); 907 } 908 defer = false; 909 } else if ((vp->v_iflag & VI_LAYER) != 0) { 910 /* 911 * Acquiring the stack's lock in vclean() even 912 * for an honest vput/vrele is dangerous because 913 * our caller may hold other vnode locks; defer. 914 */ 915 defer = true; 916 } else { 917 /* If we can't acquire the lock, then defer. */ 918 vp->v_iflag &= ~VI_INACTREDO; 919 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | 920 LK_NOWAIT); 921 if (error != 0) { 922 defer = true; 923 mutex_enter(&vp->v_interlock); 924 } else { 925 defer = false; 926 } 927 } 928 929 if (defer) { 930 /* 931 * Defer reclaim to the kthread; it's not safe to 932 * clean it here. We donate it our last reference. 
933 */ 934 KASSERT(mutex_owned(&vp->v_interlock)); 935 KASSERT((vp->v_iflag & VI_INACTPEND) == 0); 936 vp->v_iflag |= VI_INACTPEND; 937 mutex_enter(&vrele_lock); 938 TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist); 939 if (++vrele_pending > (desiredvnodes >> 8)) 940 cv_signal(&vrele_cv); 941 mutex_exit(&vrele_lock); 942 mutex_exit(&vp->v_interlock); 943 return; 944 } 945 946 #ifdef DIAGNOSTIC 947 if ((vp->v_type == VBLK || vp->v_type == VCHR) && 948 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) { 949 vprint("vrelel: missing VOP_CLOSE()", vp); 950 } 951 #endif 952 953 /* 954 * The vnode can gain another reference while being 955 * deactivated. If VOP_INACTIVE() indicates that 956 * the described file has been deleted, then recycle 957 * the vnode irrespective of additional references. 958 * Another thread may be waiting to re-use the on-disk 959 * inode. 960 * 961 * Note that VOP_INACTIVE() will drop the vnode lock. 962 */ 963 VOP_INACTIVE(vp, &recycle); 964 mutex_enter(&vp->v_interlock); 965 if (!recycle) { 966 if (vp->v_usecount > 1) { 967 vp->v_usecount--; 968 mutex_exit(&vp->v_interlock); 969 return; 970 } 971 972 /* 973 * If we grew another reference while 974 * VOP_INACTIVE() was underway, retry. 975 */ 976 if ((vp->v_iflag & VI_INACTREDO) != 0) { 977 goto retry; 978 } 979 } 980 981 /* Take care of space accounting. */ 982 if (vp->v_iflag & VI_EXECMAP) { 983 atomic_add_int(&uvmexp.execpages, 984 -vp->v_uobj.uo_npages); 985 atomic_add_int(&uvmexp.filepages, 986 vp->v_uobj.uo_npages); 987 } 988 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP|VI_MAPPED); 989 vp->v_vflag &= ~VV_MAPPED; 990 991 /* 992 * Recycle the vnode if the file is now unused (unlinked), 993 * otherwise just free it. 994 */ 995 if (recycle) { 996 vclean(vp, DOCLOSE); 997 } 998 KASSERT(vp->v_usecount > 0); 999 } 1000 1001 if (--vp->v_usecount != 0) { 1002 /* Gained another reference while being reclaimed. */ 1003 mutex_exit(&vp->v_interlock); 1004 return; 1005 } 1006 1007 if ((vp->v_iflag & VI_CLEAN) != 0) { 1008 /* 1009 * It's clean so destroy it. It isn't referenced 1010 * anywhere since it has been reclaimed. 1011 */ 1012 KASSERT(vp->v_holdcnt == 0); 1013 KASSERT(vp->v_writecount == 0); 1014 mutex_exit(&vp->v_interlock); 1015 insmntque(vp, NULL); 1016 if (vp->v_type == VBLK || vp->v_type == VCHR) { 1017 spec_node_destroy(vp); 1018 } 1019 vnfree(vp); 1020 } else { 1021 /* 1022 * Otherwise, put it back onto the freelist. It 1023 * can't be destroyed while still associated with 1024 * a file system. 1025 */ 1026 mutex_enter(&vnode_free_list_lock); 1027 if (vp->v_holdcnt > 0) { 1028 vp->v_freelisthd = &vnode_hold_list; 1029 } else { 1030 vp->v_freelisthd = &vnode_free_list; 1031 } 1032 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 1033 mutex_exit(&vnode_free_list_lock); 1034 mutex_exit(&vp->v_interlock); 1035 } 1036 } 1037 1038 void 1039 vrele(vnode_t *vp) 1040 { 1041 1042 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1043 1044 mutex_enter(&vp->v_interlock); 1045 vrelel(vp, 0); 1046 } 1047 1048 static void 1049 vrele_thread(void *cookie) 1050 { 1051 vnode_t *vp; 1052 1053 for (;;) { 1054 mutex_enter(&vrele_lock); 1055 while (TAILQ_EMPTY(&vrele_list)) { 1056 cv_timedwait(&vrele_cv, &vrele_lock, hz); 1057 } 1058 vp = TAILQ_FIRST(&vrele_list); 1059 TAILQ_REMOVE(&vrele_list, vp, v_freelist); 1060 vrele_pending--; 1061 mutex_exit(&vrele_lock); 1062 1063 /* 1064 * If not the last reference, then ignore the vnode 1065 * and look for more work. 
1066 */ 1067 mutex_enter(&vp->v_interlock); 1068 KASSERT((vp->v_iflag & VI_INACTPEND) != 0); 1069 vp->v_iflag &= ~VI_INACTPEND; 1070 if (vp->v_usecount > 1) { 1071 vp->v_usecount--; 1072 mutex_exit(&vp->v_interlock); 1073 continue; 1074 } 1075 vrelel(vp, 0); 1076 } 1077 } 1078 1079 /* 1080 * Page or buffer structure gets a reference. 1081 * Called with v_interlock held. 1082 */ 1083 void 1084 vholdl(vnode_t *vp) 1085 { 1086 1087 KASSERT(mutex_owned(&vp->v_interlock)); 1088 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1089 1090 if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) { 1091 mutex_enter(&vnode_free_list_lock); 1092 KASSERT(vp->v_freelisthd == &vnode_free_list); 1093 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 1094 vp->v_freelisthd = &vnode_hold_list; 1095 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 1096 mutex_exit(&vnode_free_list_lock); 1097 } 1098 } 1099 1100 /* 1101 * Page or buffer structure frees a reference. 1102 * Called with v_interlock held. 1103 */ 1104 void 1105 holdrelel(vnode_t *vp) 1106 { 1107 1108 KASSERT(mutex_owned(&vp->v_interlock)); 1109 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1110 1111 if (vp->v_holdcnt <= 0) { 1112 vpanic(vp, "holdrelel: holdcnt vp %p"); 1113 } 1114 1115 vp->v_holdcnt--; 1116 if (vp->v_holdcnt == 0 && vp->v_usecount == 0) { 1117 mutex_enter(&vnode_free_list_lock); 1118 KASSERT(vp->v_freelisthd == &vnode_hold_list); 1119 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 1120 vp->v_freelisthd = &vnode_free_list; 1121 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 1122 mutex_exit(&vnode_free_list_lock); 1123 } 1124 } 1125 1126 /* 1127 * Vnode reference, where a reference is already held by some other 1128 * object (for example, a file structure). 1129 */ 1130 void 1131 vref(vnode_t *vp) 1132 { 1133 1134 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1135 1136 mutex_enter(&vp->v_interlock); 1137 if (vp->v_usecount <= 0) { 1138 vpanic(vp, "vref used where vget required"); 1139 } 1140 if (++vp->v_usecount == 0) { 1141 vpanic(vp, "vref: usecount overflow"); 1142 } 1143 mutex_exit(&vp->v_interlock); 1144 } 1145 1146 /* 1147 * Remove any vnodes in the vnode table belonging to mount point mp. 1148 * 1149 * If FORCECLOSE is not specified, there should not be any active ones, 1150 * return error if any are found (nb: this is a user error, not a 1151 * system error). If FORCECLOSE is specified, detach any active vnodes 1152 * that are found. 1153 * 1154 * If WRITECLOSE is set, only flush out regular file vnodes open for 1155 * writing. 1156 * 1157 * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped. 1158 */ 1159 #ifdef DEBUG 1160 int busyprt = 0; /* print out busy vnodes */ 1161 struct ctldebug debug1 = { "busyprt", &busyprt }; 1162 #endif 1163 1164 static vnode_t * 1165 vflushnext(vnode_t *mvp, int *when) 1166 { 1167 1168 if (hardclock_ticks > *when) { 1169 mutex_exit(&mntvnode_lock); 1170 yield(); 1171 mutex_enter(&mntvnode_lock); 1172 *when = hardclock_ticks + hz / 10; 1173 } 1174 1175 return vunmark(mvp); 1176 } 1177 1178 int 1179 vflush(struct mount *mp, vnode_t *skipvp, int flags) 1180 { 1181 vnode_t *vp, *mvp; 1182 int busy = 0, when = 0; 1183 1184 /* Allocate a marker vnode. 
*/ 1185 if ((mvp = vnalloc(mp)) == NULL) 1186 return (ENOMEM); 1187 1188 mutex_enter(&mntvnode_lock); 1189 /* 1190 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() 1191 * and vclean() are called 1192 */ 1193 for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL; 1194 vp = vflushnext(mvp, &when)) { 1195 vmark(mvp, vp); 1196 if (vp->v_mount != mp || vismarker(vp)) 1197 continue; 1198 /* 1199 * Skip over a selected vnode. 1200 */ 1201 if (vp == skipvp) 1202 continue; 1203 mutex_enter(&vp->v_interlock); 1204 /* 1205 * Ignore clean but still referenced vnodes. 1206 */ 1207 if ((vp->v_iflag & VI_CLEAN) != 0) { 1208 mutex_exit(&vp->v_interlock); 1209 continue; 1210 } 1211 /* 1212 * Skip over a vnodes marked VSYSTEM. 1213 */ 1214 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 1215 mutex_exit(&vp->v_interlock); 1216 continue; 1217 } 1218 /* 1219 * If WRITECLOSE is set, only flush out regular file 1220 * vnodes open for writing. 1221 */ 1222 if ((flags & WRITECLOSE) && 1223 (vp->v_writecount == 0 || vp->v_type != VREG)) { 1224 mutex_exit(&vp->v_interlock); 1225 continue; 1226 } 1227 /* 1228 * With v_usecount == 0, all we need to do is clear 1229 * out the vnode data structures and we are done. 1230 */ 1231 if (vp->v_usecount == 0) { 1232 mutex_exit(&mntvnode_lock); 1233 vremfree(vp); 1234 vp->v_usecount++; 1235 vclean(vp, DOCLOSE); 1236 vrelel(vp, 0); 1237 mutex_enter(&mntvnode_lock); 1238 continue; 1239 } 1240 /* 1241 * If FORCECLOSE is set, forcibly close the vnode. 1242 * For block or character devices, revert to an 1243 * anonymous device. For all other files, just 1244 * kill them. 1245 */ 1246 if (flags & FORCECLOSE) { 1247 mutex_exit(&mntvnode_lock); 1248 vp->v_usecount++; 1249 if (vp->v_type != VBLK && vp->v_type != VCHR) { 1250 vclean(vp, DOCLOSE); 1251 vrelel(vp, 0); 1252 } else { 1253 vclean(vp, 0); 1254 vp->v_op = spec_vnodeop_p; /* XXXSMP */ 1255 mutex_exit(&vp->v_interlock); 1256 /* 1257 * The vnode isn't clean, but still resides 1258 * on the mount list. Remove it. XXX This 1259 * is a bit dodgy. 1260 */ 1261 insmntque(vp, NULL); 1262 vrele(vp); 1263 } 1264 mutex_enter(&mntvnode_lock); 1265 continue; 1266 } 1267 #ifdef DEBUG 1268 if (busyprt) 1269 vprint("vflush: busy vnode", vp); 1270 #endif 1271 mutex_exit(&vp->v_interlock); 1272 busy++; 1273 } 1274 mutex_exit(&mntvnode_lock); 1275 vnfree(mvp); 1276 if (busy) 1277 return (EBUSY); 1278 return (0); 1279 } 1280 1281 /* 1282 * Disassociate the underlying file system from a vnode. 1283 * 1284 * Must be called with the interlock held, and will return with it held. 1285 */ 1286 void 1287 vclean(vnode_t *vp, int flags) 1288 { 1289 lwp_t *l = curlwp; 1290 bool recycle, active; 1291 int error; 1292 1293 KASSERT(mutex_owned(&vp->v_interlock)); 1294 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1295 KASSERT(vp->v_usecount != 0); 1296 1297 /* If cleaning is already in progress wait until done and return. */ 1298 if (vp->v_iflag & VI_XLOCK) { 1299 vwait(vp, VI_XLOCK); 1300 return; 1301 } 1302 1303 /* If already clean, nothing to do. */ 1304 if ((vp->v_iflag & VI_CLEAN) != 0) { 1305 return; 1306 } 1307 1308 /* 1309 * Prevent the vnode from being recycled or brought into use 1310 * while we clean it out. 
1311 */ 1312 vp->v_iflag |= VI_XLOCK; 1313 if (vp->v_iflag & VI_EXECMAP) { 1314 atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages); 1315 atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages); 1316 } 1317 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP); 1318 active = (vp->v_usecount > 1); 1319 1320 /* XXXAD should not lock vnode under layer */ 1321 VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK); 1322 1323 /* 1324 * Clean out any cached data associated with the vnode. 1325 * If purging an active vnode, it must be closed and 1326 * deactivated before being reclaimed. Note that the 1327 * VOP_INACTIVE will unlock the vnode. 1328 */ 1329 if (flags & DOCLOSE) { 1330 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); 1331 if (error != 0) 1332 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0); 1333 KASSERT(error == 0); 1334 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1335 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) { 1336 spec_node_revoke(vp); 1337 } 1338 } 1339 if (active) { 1340 VOP_INACTIVE(vp, &recycle); 1341 } else { 1342 /* 1343 * Any other processes trying to obtain this lock must first 1344 * wait for VI_XLOCK to clear, then call the new lock operation. 1345 */ 1346 VOP_UNLOCK(vp, 0); 1347 } 1348 1349 /* Disassociate the underlying file system from the vnode. */ 1350 if (VOP_RECLAIM(vp)) { 1351 vpanic(vp, "vclean: cannot reclaim"); 1352 } 1353 1354 KASSERT(vp->v_uobj.uo_npages == 0); 1355 if (vp->v_type == VREG && vp->v_ractx != NULL) { 1356 uvm_ra_freectx(vp->v_ractx); 1357 vp->v_ractx = NULL; 1358 } 1359 cache_purge(vp); 1360 1361 /* Done with purge, notify sleepers of the grim news. */ 1362 vp->v_op = dead_vnodeop_p; 1363 vp->v_tag = VT_NON; 1364 mutex_enter(&vp->v_interlock); 1365 vp->v_vnlock = &vp->v_lock; 1366 KNOTE(&vp->v_klist, NOTE_REVOKE); 1367 vp->v_iflag &= ~(VI_XLOCK | VI_FREEING); 1368 vp->v_vflag &= ~VV_LOCKSWORK; 1369 if ((flags & DOCLOSE) != 0) { 1370 vp->v_iflag |= VI_CLEAN; 1371 } 1372 cv_broadcast(&vp->v_cv); 1373 1374 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1375 } 1376 1377 /* 1378 * Recycle an unused vnode to the front of the free list. 1379 * Release the passed interlock if the vnode will be recycled. 1380 */ 1381 int 1382 vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l) 1383 { 1384 1385 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1386 1387 mutex_enter(&vp->v_interlock); 1388 if (vp->v_usecount != 0) { 1389 mutex_exit(&vp->v_interlock); 1390 return (0); 1391 } 1392 if (inter_lkp) 1393 mutex_exit(inter_lkp); 1394 vremfree(vp); 1395 vp->v_usecount++; 1396 vclean(vp, DOCLOSE); 1397 vrelel(vp, 0); 1398 return (1); 1399 } 1400 1401 /* 1402 * Eliminate all activity associated with a vnode in preparation for 1403 * reuse. Drops a reference from the vnode. 1404 */ 1405 void 1406 vgone(vnode_t *vp) 1407 { 1408 1409 mutex_enter(&vp->v_interlock); 1410 vclean(vp, DOCLOSE); 1411 vrelel(vp, 0); 1412 } 1413 1414 /* 1415 * Lookup a vnode by device number. 1416 */ 1417 int 1418 vfinddev(dev_t dev, enum vtype type, vnode_t **vpp) 1419 { 1420 vnode_t *vp; 1421 int rc = 0; 1422 1423 mutex_enter(&specfs_lock); 1424 for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) { 1425 if (dev != vp->v_rdev || type != vp->v_type) 1426 continue; 1427 *vpp = vp; 1428 rc = 1; 1429 break; 1430 } 1431 mutex_exit(&specfs_lock); 1432 return (rc); 1433 } 1434 1435 /* 1436 * Revoke all the vnodes corresponding to the specified minor number 1437 * range (endpoints inclusive) of the specified major. 
1438 */ 1439 void 1440 vdevgone(int maj, int minl, int minh, enum vtype type) 1441 { 1442 vnode_t *vp, **vpp; 1443 dev_t dev; 1444 int mn; 1445 1446 vp = NULL; /* XXX gcc */ 1447 1448 mutex_enter(&specfs_lock); 1449 for (mn = minl; mn <= minh; mn++) { 1450 dev = makedev(maj, mn); 1451 vpp = &specfs_hash[SPECHASH(dev)]; 1452 for (vp = *vpp; vp != NULL;) { 1453 mutex_enter(&vp->v_interlock); 1454 if ((vp->v_iflag & VI_CLEAN) != 0 || 1455 dev != vp->v_rdev || type != vp->v_type) { 1456 mutex_exit(&vp->v_interlock); 1457 vp = vp->v_specnext; 1458 continue; 1459 } 1460 mutex_exit(&specfs_lock); 1461 if (vget(vp, LK_INTERLOCK) == 0) { 1462 VOP_REVOKE(vp, REVOKEALL); 1463 vrele(vp); 1464 } 1465 mutex_enter(&specfs_lock); 1466 vp = *vpp; 1467 } 1468 } 1469 mutex_exit(&specfs_lock); 1470 } 1471 1472 /* 1473 * Calculate the total number of references to a special device. 1474 */ 1475 int 1476 vcount(vnode_t *vp) 1477 { 1478 int count; 1479 1480 mutex_enter(&specfs_lock); 1481 mutex_enter(&vp->v_interlock); 1482 if (vp->v_specnode == NULL) { 1483 count = vp->v_usecount - ((vp->v_iflag & VI_INACTPEND) != 0); 1484 mutex_exit(&vp->v_interlock); 1485 mutex_exit(&specfs_lock); 1486 return (count); 1487 } 1488 mutex_exit(&vp->v_interlock); 1489 count = vp->v_specnode->sn_dev->sd_opencnt; 1490 mutex_exit(&specfs_lock); 1491 return (count); 1492 } 1493 1494 /* 1495 * Eliminate all activity associated with the requested vnode 1496 * and with all vnodes aliased to the requested vnode. 1497 */ 1498 void 1499 vrevoke(vnode_t *vp) 1500 { 1501 vnode_t *vq, **vpp; 1502 enum vtype type; 1503 dev_t dev; 1504 1505 KASSERT(vp->v_usecount > 0); 1506 1507 mutex_enter(&vp->v_interlock); 1508 if ((vp->v_iflag & VI_CLEAN) != 0) { 1509 mutex_exit(&vp->v_interlock); 1510 return; 1511 } else { 1512 dev = vp->v_rdev; 1513 type = vp->v_type; 1514 mutex_exit(&vp->v_interlock); 1515 } 1516 1517 vpp = &specfs_hash[SPECHASH(dev)]; 1518 mutex_enter(&specfs_lock); 1519 for (vq = *vpp; vq != NULL;) { 1520 /* If clean or being cleaned, then ignore it. 
*/ 1521 mutex_enter(&vq->v_interlock); 1522 if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 || 1523 vq->v_rdev != dev || vq->v_type != type) { 1524 mutex_exit(&vq->v_interlock); 1525 vq = vq->v_specnext; 1526 continue; 1527 } 1528 mutex_exit(&specfs_lock); 1529 if (vq->v_usecount == 0) { 1530 vremfree(vq); 1531 } 1532 vq->v_usecount++; 1533 vclean(vq, DOCLOSE); 1534 vrelel(vq, 0); 1535 mutex_enter(&specfs_lock); 1536 vq = *vpp; 1537 } 1538 mutex_exit(&specfs_lock); 1539 } 1540 1541 /* 1542 * sysctl helper routine to return list of supported fstypes 1543 */ 1544 static int 1545 sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS) 1546 { 1547 char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)]; 1548 char *where = oldp; 1549 struct vfsops *v; 1550 size_t needed, left, slen; 1551 int error, first; 1552 1553 if (newp != NULL) 1554 return (EPERM); 1555 if (namelen != 0) 1556 return (EINVAL); 1557 1558 first = 1; 1559 error = 0; 1560 needed = 0; 1561 left = *oldlenp; 1562 1563 sysctl_unlock(); 1564 mutex_enter(&vfs_list_lock); 1565 LIST_FOREACH(v, &vfs_list, vfs_list) { 1566 if (where == NULL) 1567 needed += strlen(v->vfs_name) + 1; 1568 else { 1569 memset(bf, 0, sizeof(bf)); 1570 if (first) { 1571 strncpy(bf, v->vfs_name, sizeof(bf)); 1572 first = 0; 1573 } else { 1574 bf[0] = ' '; 1575 strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1); 1576 } 1577 bf[sizeof(bf)-1] = '\0'; 1578 slen = strlen(bf); 1579 if (left < slen + 1) 1580 break; 1581 /* +1 to copy out the trailing NUL byte */ 1582 v->vfs_refcount++; 1583 mutex_exit(&vfs_list_lock); 1584 error = copyout(bf, where, slen + 1); 1585 mutex_enter(&vfs_list_lock); 1586 v->vfs_refcount--; 1587 if (error) 1588 break; 1589 where += slen; 1590 needed += slen; 1591 left -= slen; 1592 } 1593 } 1594 mutex_exit(&vfs_list_lock); 1595 sysctl_relock(); 1596 *oldlenp = needed; 1597 return (error); 1598 } 1599 1600 /* 1601 * Top level filesystem related information gathering. 1602 */ 1603 SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup") 1604 { 1605 sysctl_createv(clog, 0, NULL, NULL, 1606 CTLFLAG_PERMANENT, 1607 CTLTYPE_NODE, "vfs", NULL, 1608 NULL, 0, NULL, 0, 1609 CTL_VFS, CTL_EOL); 1610 sysctl_createv(clog, 0, NULL, NULL, 1611 CTLFLAG_PERMANENT, 1612 CTLTYPE_NODE, "generic", 1613 SYSCTL_DESCR("Non-specific vfs related information"), 1614 NULL, 0, NULL, 0, 1615 CTL_VFS, VFS_GENERIC, CTL_EOL); 1616 sysctl_createv(clog, 0, NULL, NULL, 1617 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1618 CTLTYPE_INT, "usermount", 1619 SYSCTL_DESCR("Whether unprivileged users may mount " 1620 "filesystems"), 1621 NULL, 0, &dovfsusermount, 0, 1622 CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL); 1623 sysctl_createv(clog, 0, NULL, NULL, 1624 CTLFLAG_PERMANENT, 1625 CTLTYPE_STRING, "fstypes", 1626 SYSCTL_DESCR("List of file systems present"), 1627 sysctl_vfs_generic_fstypes, 0, NULL, 0, 1628 CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL); 1629 sysctl_createv(clog, 0, NULL, NULL, 1630 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1631 CTLTYPE_INT, "magiclinks", 1632 SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"), 1633 NULL, 0, &vfs_magiclinks, 0, 1634 CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL); 1635 } 1636 1637 1638 int kinfo_vdebug = 1; 1639 int kinfo_vgetfailed; 1640 #define KINFO_VNODESLOP 10 1641 /* 1642 * Dump vnode list (via sysctl). 1643 * Copyout address of vnode followed by vnode. 
1644 */ 1645 /* ARGSUSED */ 1646 int 1647 sysctl_kern_vnode(SYSCTLFN_ARGS) 1648 { 1649 char *where = oldp; 1650 size_t *sizep = oldlenp; 1651 struct mount *mp, *nmp; 1652 vnode_t *vp, *mvp, vbuf; 1653 char *bp = where, *savebp; 1654 char *ewhere; 1655 int error; 1656 1657 if (namelen != 0) 1658 return (EOPNOTSUPP); 1659 if (newp != NULL) 1660 return (EPERM); 1661 1662 #define VPTRSZ sizeof(vnode_t *) 1663 #define VNODESZ sizeof(vnode_t) 1664 if (where == NULL) { 1665 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ); 1666 return (0); 1667 } 1668 ewhere = where + *sizep; 1669 1670 sysctl_unlock(); 1671 mutex_enter(&mountlist_lock); 1672 for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; 1673 mp = nmp) { 1674 if (vfs_trybusy(mp, RW_READER, &nmp)) { 1675 continue; 1676 } 1677 savebp = bp; 1678 /* Allocate a marker vnode. */ 1679 if ((mvp = vnalloc(mp)) == NULL) { 1680 sysctl_relock(); 1681 return (ENOMEM); 1682 } 1683 mutex_enter(&mntvnode_lock); 1684 for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) { 1685 vmark(mvp, vp); 1686 /* 1687 * Check that the vp is still associated with 1688 * this filesystem. RACE: could have been 1689 * recycled onto the same filesystem. 1690 */ 1691 if (vp->v_mount != mp || vismarker(vp)) 1692 continue; 1693 if (bp + VPTRSZ + VNODESZ > ewhere) { 1694 (void)vunmark(mvp); 1695 mutex_exit(&mntvnode_lock); 1696 vnfree(mvp); 1697 sysctl_relock(); 1698 *sizep = bp - where; 1699 return (ENOMEM); 1700 } 1701 memcpy(&vbuf, vp, VNODESZ); 1702 mutex_exit(&mntvnode_lock); 1703 if ((error = copyout(vp, bp, VPTRSZ)) || 1704 (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) { 1705 mutex_enter(&mntvnode_lock); 1706 (void)vunmark(mvp); 1707 mutex_exit(&mntvnode_lock); 1708 vnfree(mvp); 1709 sysctl_relock(); 1710 return (error); 1711 } 1712 bp += VPTRSZ + VNODESZ; 1713 mutex_enter(&mntvnode_lock); 1714 } 1715 mutex_exit(&mntvnode_lock); 1716 mutex_enter(&mountlist_lock); 1717 vfs_unbusy(mp, false, &nmp); 1718 vnfree(mvp); 1719 } 1720 mutex_exit(&mountlist_lock); 1721 sysctl_relock(); 1722 1723 *sizep = bp - where; 1724 return (0); 1725 } 1726 1727 /* 1728 * Remove clean vnodes from a mountpoint's vnode list. 1729 */ 1730 void 1731 vfs_scrubvnlist(struct mount *mp) 1732 { 1733 vnode_t *vp, *nvp; 1734 1735 retry: 1736 mutex_enter(&mntvnode_lock); 1737 for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) { 1738 nvp = TAILQ_NEXT(vp, v_mntvnodes); 1739 mutex_enter(&vp->v_interlock); 1740 if ((vp->v_iflag & VI_CLEAN) != 0) { 1741 TAILQ_REMOVE(&mp->mnt_vnodelist, vp, v_mntvnodes); 1742 vp->v_mount = NULL; 1743 mutex_exit(&mntvnode_lock); 1744 mutex_exit(&vp->v_interlock); 1745 vfs_destroy(mp, false); 1746 goto retry; 1747 } 1748 mutex_exit(&vp->v_interlock); 1749 } 1750 mutex_exit(&mntvnode_lock); 1751 } 1752 1753 /* 1754 * Check to see if a filesystem is mounted on a block device. 1755 */ 1756 int 1757 vfs_mountedon(vnode_t *vp) 1758 { 1759 vnode_t *vq; 1760 int error = 0; 1761 1762 if (vp->v_type != VBLK) 1763 return ENOTBLK; 1764 if (vp->v_specmountpoint != NULL) 1765 return (EBUSY); 1766 mutex_enter(&specfs_lock); 1767 for (vq = specfs_hash[SPECHASH(vp->v_rdev)]; vq != NULL; 1768 vq = vq->v_specnext) { 1769 if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) 1770 continue; 1771 if (vq->v_specmountpoint != NULL) { 1772 error = EBUSY; 1773 break; 1774 } 1775 } 1776 mutex_exit(&specfs_lock); 1777 return (error); 1778 } 1779 1780 /* 1781 * Unmount all file systems. 
1782 * We traverse the list in reverse order under the assumption that doing so 1783 * will avoid needing to worry about dependencies. 1784 */ 1785 void 1786 vfs_unmountall(struct lwp *l) 1787 { 1788 struct mount *mp, *nmp; 1789 int allerror, error; 1790 1791 printf("unmounting file systems..."); 1792 for (allerror = 0, mp = CIRCLEQ_LAST(&mountlist); 1793 !CIRCLEQ_EMPTY(&mountlist); 1794 mp = nmp) { 1795 nmp = CIRCLEQ_PREV(mp, mnt_list); 1796 #ifdef DEBUG 1797 printf("\nunmounting %s (%s)...", 1798 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname); 1799 #endif 1800 /* 1801 * XXX Freeze syncer. Must do this before locking the 1802 * mount point. See dounmount() for details. 1803 */ 1804 mutex_enter(&syncer_mutex); 1805 if (vfs_busy(mp, RW_WRITER)) { 1806 mutex_exit(&syncer_mutex); 1807 continue; 1808 } 1809 if ((error = dounmount(mp, MNT_FORCE, l)) != 0) { 1810 printf("unmount of %s failed with error %d\n", 1811 mp->mnt_stat.f_mntonname, error); 1812 allerror = 1; 1813 } 1814 } 1815 printf(" done\n"); 1816 if (allerror) 1817 printf("WARNING: some file systems would not unmount\n"); 1818 } 1819 1820 /* 1821 * Sync and unmount file systems before shutting down. 1822 */ 1823 void 1824 vfs_shutdown(void) 1825 { 1826 struct lwp *l; 1827 1828 /* XXX we're certainly not running in lwp0's context! */ 1829 l = curlwp; 1830 if (l == NULL) 1831 l = &lwp0; 1832 1833 printf("syncing disks... "); 1834 1835 /* remove user processes from run queue */ 1836 suspendsched(); 1837 (void) spl0(); 1838 1839 /* avoid coming back this way again if we panic. */ 1840 doing_shutdown = 1; 1841 1842 sys_sync(l, NULL, NULL); 1843 1844 /* Wait for sync to finish. */ 1845 if (buf_syncwait() != 0) { 1846 #if defined(DDB) && defined(DEBUG_HALT_BUSY) 1847 Debugger(); 1848 #endif 1849 printf("giving up\n"); 1850 return; 1851 } else 1852 printf("done\n"); 1853 1854 /* 1855 * If we've panic'd, don't make the situation potentially 1856 * worse by unmounting the file systems. 1857 */ 1858 if (panicstr != NULL) 1859 return; 1860 1861 /* Release inodes held by texts before update. */ 1862 #ifdef notdef 1863 vnshutdown(); 1864 #endif 1865 /* Unmount file systems. */ 1866 vfs_unmountall(l); 1867 } 1868 1869 /* 1870 * Mount the root file system. If the operator didn't specify a 1871 * file system to use, try all possible file systems until one 1872 * succeeds. 1873 */ 1874 int 1875 vfs_mountroot(void) 1876 { 1877 struct vfsops *v; 1878 int error = ENODEV; 1879 1880 if (root_device == NULL) 1881 panic("vfs_mountroot: root device unknown"); 1882 1883 switch (device_class(root_device)) { 1884 case DV_IFNET: 1885 if (rootdev != NODEV) 1886 panic("vfs_mountroot: rootdev set for DV_IFNET " 1887 "(0x%08x -> %d,%d)", rootdev, 1888 major(rootdev), minor(rootdev)); 1889 break; 1890 1891 case DV_DISK: 1892 if (rootdev == NODEV) 1893 panic("vfs_mountroot: rootdev not set for DV_DISK"); 1894 if (bdevvp(rootdev, &rootvp)) 1895 panic("vfs_mountroot: can't get vnode for rootdev"); 1896 error = VOP_OPEN(rootvp, FREAD, FSCRED); 1897 if (error) { 1898 printf("vfs_mountroot: can't open root device\n"); 1899 return (error); 1900 } 1901 break; 1902 1903 default: 1904 printf("%s: inappropriate for root file system\n", 1905 device_xname(root_device)); 1906 return (ENODEV); 1907 } 1908 1909 /* 1910 * If user specified a file system, use it. 1911 */ 1912 if (mountroot != NULL) { 1913 error = (*mountroot)(); 1914 goto done; 1915 } 1916 1917 /* 1918 * Try each file system currently configured into the kernel. 
1919 */ 1920 mutex_enter(&vfs_list_lock); 1921 LIST_FOREACH(v, &vfs_list, vfs_list) { 1922 if (v->vfs_mountroot == NULL) 1923 continue; 1924 #ifdef DEBUG 1925 aprint_normal("mountroot: trying %s...\n", v->vfs_name); 1926 #endif 1927 v->vfs_refcount++; 1928 mutex_exit(&vfs_list_lock); 1929 error = (*v->vfs_mountroot)(); 1930 mutex_enter(&vfs_list_lock); 1931 v->vfs_refcount--; 1932 if (!error) { 1933 aprint_normal("root file system type: %s\n", 1934 v->vfs_name); 1935 break; 1936 } 1937 } 1938 mutex_exit(&vfs_list_lock); 1939 1940 if (v == NULL) { 1941 printf("no file system for %s", device_xname(root_device)); 1942 if (device_class(root_device) == DV_DISK) 1943 printf(" (dev 0x%x)", rootdev); 1944 printf("\n"); 1945 error = EFTYPE; 1946 } 1947 1948 done: 1949 if (error && device_class(root_device) == DV_DISK) { 1950 VOP_CLOSE(rootvp, FREAD, FSCRED); 1951 vrele(rootvp); 1952 } 1953 return (error); 1954 } 1955 1956 /* 1957 * Sham lock manager for vnodes. This is a temporary measure. 1958 */ 1959 int 1960 vlockmgr(struct vnlock *vl, int flags) 1961 { 1962 1963 KASSERT((flags & ~(LK_CANRECURSE | LK_NOWAIT | LK_TYPE_MASK)) == 0); 1964 1965 switch (flags & LK_TYPE_MASK) { 1966 case LK_SHARED: 1967 if (rw_tryenter(&vl->vl_lock, RW_READER)) { 1968 return 0; 1969 } 1970 if ((flags & LK_NOWAIT) != 0) { 1971 return EBUSY; 1972 } 1973 rw_enter(&vl->vl_lock, RW_READER); 1974 return 0; 1975 1976 case LK_EXCLUSIVE: 1977 if (rw_tryenter(&vl->vl_lock, RW_WRITER)) { 1978 return 0; 1979 } 1980 if ((vl->vl_canrecurse || (flags & LK_CANRECURSE) != 0) && 1981 rw_write_held(&vl->vl_lock)) { 1982 vl->vl_recursecnt++; 1983 return 0; 1984 } 1985 if ((flags & LK_NOWAIT) != 0) { 1986 return EBUSY; 1987 } 1988 rw_enter(&vl->vl_lock, RW_WRITER); 1989 return 0; 1990 1991 case LK_RELEASE: 1992 if (vl->vl_recursecnt != 0) { 1993 KASSERT(rw_write_held(&vl->vl_lock)); 1994 vl->vl_recursecnt--; 1995 return 0; 1996 } 1997 rw_exit(&vl->vl_lock); 1998 return 0; 1999 2000 default: 2001 panic("vlockmgr: flags %x", flags); 2002 } 2003 } 2004 2005 int 2006 vlockstatus(struct vnlock *vl) 2007 { 2008 2009 if (rw_write_held(&vl->vl_lock)) { 2010 return LK_EXCLUSIVE; 2011 } 2012 if (rw_read_held(&vl->vl_lock)) { 2013 return LK_SHARED; 2014 } 2015 return 0; 2016 } 2017