1 /* $NetBSD: vfs_subr.c,v 1.340 2008/05/02 17:40:30 ad Exp $ */ 2 3 /*- 4 * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 /* 34 * Copyright (c) 1989, 1993 35 * The Regents of the University of California. All rights reserved. 36 * (c) UNIX System Laboratories, Inc. 
37 * All or some portions of this file are derived from material licensed 38 * to the University of California by American Telephone and Telegraph 39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 40 * the permission of UNIX System Laboratories, Inc. 41 * 42 * Redistribution and use in source and binary forms, with or without 43 * modification, are permitted provided that the following conditions 44 * are met: 45 * 1. Redistributions of source code must retain the above copyright 46 * notice, this list of conditions and the following disclaimer. 47 * 2. Redistributions in binary form must reproduce the above copyright 48 * notice, this list of conditions and the following disclaimer in the 49 * documentation and/or other materials provided with the distribution. 50 * 3. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * SUCH DAMAGE. 65 * 66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 67 */ 68 69 /* 70 * External virtual filesystem routines. 
71 * 72 * This file contains vfs subroutines which are heavily dependant on 73 * the kernel and are not suitable for standalone use. Examples include 74 * routines involved vnode and mountpoint management. 75 */ 76 77 #include <sys/cdefs.h> 78 __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.340 2008/05/02 17:40:30 ad Exp $"); 79 80 #include "opt_ddb.h" 81 #include "opt_compat_netbsd.h" 82 #include "opt_compat_43.h" 83 84 #include <sys/param.h> 85 #include <sys/systm.h> 86 #include <sys/proc.h> 87 #include <sys/kernel.h> 88 #include <sys/mount.h> 89 #include <sys/fcntl.h> 90 #include <sys/vnode.h> 91 #include <sys/stat.h> 92 #include <sys/namei.h> 93 #include <sys/ucred.h> 94 #include <sys/buf.h> 95 #include <sys/errno.h> 96 #include <sys/malloc.h> 97 #include <sys/syscallargs.h> 98 #include <sys/device.h> 99 #include <sys/filedesc.h> 100 #include <sys/kauth.h> 101 #include <sys/atomic.h> 102 #include <sys/kthread.h> 103 104 #include <miscfs/specfs/specdev.h> 105 #include <miscfs/syncfs/syncfs.h> 106 107 #include <uvm/uvm.h> 108 #include <uvm/uvm_readahead.h> 109 #include <uvm/uvm_ddb.h> 110 111 #include <sys/sysctl.h> 112 113 extern int dovfsusermount; /* 1 => permit any user to mount filesystems */ 114 extern int vfs_magiclinks; /* 1 => expand "magic" symlinks */ 115 116 static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list); 117 static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list); 118 static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list); 119 120 static int vrele_pending; 121 static kmutex_t vrele_lock; 122 static kcondvar_t vrele_cv; 123 static lwp_t *vrele_lwp; 124 125 static pool_cache_t vnode_cache; 126 127 MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes"); 128 129 /* 130 * Local declarations. 
131 */ 132 133 static void vrele_thread(void *); 134 static void insmntque(vnode_t *, struct mount *); 135 static int getdevvp(dev_t, vnode_t **, enum vtype); 136 static vnode_t *getcleanvnode(void);; 137 void vpanic(vnode_t *, const char *); 138 139 #ifdef DIAGNOSTIC 140 void 141 vpanic(vnode_t *vp, const char *msg) 142 { 143 144 vprint(NULL, vp); 145 panic("%s\n", msg); 146 } 147 #else 148 #define vpanic(vp, msg) /* nothing */ 149 #endif 150 151 void 152 vn_init1(void) 153 { 154 155 vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl", 156 NULL, IPL_NONE, NULL, NULL, NULL); 157 KASSERT(vnode_cache != NULL); 158 159 /* Create deferred release thread. */ 160 mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE); 161 cv_init(&vrele_cv, "vrele"); 162 if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread, 163 NULL, &vrele_lwp, "vrele")) 164 panic("fork vrele"); 165 } 166 167 int 168 vfs_drainvnodes(long target, struct lwp *l) 169 { 170 171 while (numvnodes > target) { 172 vnode_t *vp; 173 174 mutex_enter(&vnode_free_list_lock); 175 vp = getcleanvnode(); 176 if (vp == NULL) 177 return EBUSY; /* give up */ 178 ungetnewvnode(vp); 179 } 180 181 return 0; 182 } 183 184 /* 185 * grab a vnode from freelist and clean it. 186 */ 187 vnode_t * 188 getcleanvnode(void) 189 { 190 vnode_t *vp; 191 vnodelst_t *listhd; 192 193 KASSERT(mutex_owned(&vnode_free_list_lock)); 194 195 retry: 196 listhd = &vnode_free_list; 197 try_nextlist: 198 TAILQ_FOREACH(vp, listhd, v_freelist) { 199 /* 200 * It's safe to test v_usecount and v_iflag 201 * without holding the interlock here, since 202 * these vnodes should never appear on the 203 * lists. 
204 */ 205 if (vp->v_usecount != 0) { 206 vpanic(vp, "free vnode isn't"); 207 } 208 if ((vp->v_iflag & VI_CLEAN) != 0) { 209 vpanic(vp, "clean vnode on freelist"); 210 } 211 if (vp->v_freelisthd != listhd) { 212 printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd); 213 vpanic(vp, "list head mismatch"); 214 } 215 if (!mutex_tryenter(&vp->v_interlock)) 216 continue; 217 /* 218 * Our lwp might hold the underlying vnode 219 * locked, so don't try to reclaim a VI_LAYER 220 * node if it's locked. 221 */ 222 if ((vp->v_iflag & VI_XLOCK) == 0 && 223 ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) { 224 break; 225 } 226 mutex_exit(&vp->v_interlock); 227 } 228 229 if (vp == NULL) { 230 if (listhd == &vnode_free_list) { 231 listhd = &vnode_hold_list; 232 goto try_nextlist; 233 } 234 mutex_exit(&vnode_free_list_lock); 235 return NULL; 236 } 237 238 /* Remove it from the freelist. */ 239 TAILQ_REMOVE(listhd, vp, v_freelist); 240 vp->v_freelisthd = NULL; 241 mutex_exit(&vnode_free_list_lock); 242 243 /* 244 * The vnode is still associated with a file system, so we must 245 * clean it out before reusing it. We need to add a reference 246 * before doing this. If the vnode gains another reference while 247 * being cleaned out then we lose - retry. 248 */ 249 vp->v_usecount++; 250 vclean(vp, DOCLOSE); 251 if (vp->v_usecount == 1) { 252 /* We're about to dirty it. */ 253 vp->v_iflag &= ~VI_CLEAN; 254 mutex_exit(&vp->v_interlock); 255 if (vp->v_type == VBLK || vp->v_type == VCHR) { 256 spec_node_destroy(vp); 257 } 258 vp->v_type = VNON; 259 } else { 260 /* 261 * Don't return to freelist - the holder of the last 262 * reference will destroy it. 
263 */ 264 KASSERT(vp->v_usecount > 1); 265 vp->v_usecount--; 266 mutex_exit(&vp->v_interlock); 267 mutex_enter(&vnode_free_list_lock); 268 goto retry; 269 } 270 271 if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 || 272 !TAILQ_EMPTY(&vp->v_uobj.memq)) { 273 vpanic(vp, "cleaned vnode isn't"); 274 } 275 if (vp->v_numoutput != 0) { 276 vpanic(vp, "clean vnode has pending I/O's"); 277 } 278 if ((vp->v_iflag & VI_ONWORKLST) != 0) { 279 vpanic(vp, "clean vnode on syncer list"); 280 } 281 282 return vp; 283 } 284 285 static inline int 286 vfs_dobusy(struct mount *mp, const krw_t op, struct mount **nextp) 287 { 288 lwp_t *l; 289 290 KASSERT(mp->mnt_refcnt > 0); 291 292 atomic_inc_uint(&mp->mnt_refcnt); 293 if (nextp != NULL) { 294 mutex_exit(&mountlist_lock); 295 } 296 l = curlwp; 297 if (l->l_mpbusy == mp) { 298 if (op == RW_WRITER) { 299 KASSERT(rw_write_held(&mp->mnt_lock)); 300 } else { 301 KASSERT(rw_lock_held(&mp->mnt_lock)); 302 } 303 l->l_mprecurse++; 304 } else { 305 rw_enter(&mp->mnt_lock, op); 306 l->l_mpbusy = mp; 307 } 308 if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) { 309 if (nextp != NULL) { 310 mutex_enter(&mountlist_lock); 311 } 312 vfs_unbusy(mp, false, nextp); 313 return ENOENT; 314 } 315 316 return 0; 317 } 318 319 /* 320 * Mark a mount point as busy, and gain a new reference to it. Used to 321 * synchronize access and to delay unmounting. 322 * 323 * => The caller must hold a pre-existing reference to the mount. 
 */
int
vfs_busy(struct mount *mp, const krw_t op)
{
	int error;

	for (;;) {
		error = vfs_dobusy(mp, op, NULL);
		if (error != 0) {
			/* Mount is gone (IMNT_GONE); give up. */
			return error;
		}
		if (__predict_true(mp->mnt_unmounter == NULL)) {
			/* No unmount in progress: we hold the busy lock. */
			return 0;
		}
		/*
		 * An unmount is in progress.  Drop our busy state and
		 * wait for the unmounter to finish, then retry from the
		 * top (the mount may be gone by then).
		 */
		mutex_enter(&mount_lock);
		if (mp->mnt_unmounter != NULL) {
			vfs_unbusy(mp, false, NULL);
			cv_wait(&mount_cv, &mount_lock);
		}
		mutex_exit(&mount_lock);
	}
}

/*
 * As vfs_busy(), but return error if the file system is being
 * unmounted (and do not wait for the unmount).
 *
 * => If nextp != NULL, mountlist_lock is understood to be held.  On
 *    failure a pointer to the next mount will be returned via nextp.
 *    The caller need not hold a reference to the mount.
 *
 * => If nextp == NULL, the caller is expected to hold a reference
 *    to the mount.
 */
int
vfs_trybusy(struct mount *mp, krw_t op, struct mount **nextp)
{
	lwp_t *l;
	int error;

	KASSERT(nextp == NULL || mutex_owned(&mountlist_lock));

	if (nextp != NULL) {
		/*
		 * We need to prevent adding a reference to the mount
		 * if it is already on the way out: the reference count
		 * could be zero, and as a result another thread could
		 * be in vfs_destroy() trying to throw away the mount.
		 *
		 * mnt_iflag is protected by mnt_lock, but this check is
		 * safe if mountlist_lock is held.  mountlist_lock will
		 * be held by vfs_destroy() before removing the mount
		 * from mountlist.
		 */
		if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
			*nextp = CIRCLEQ_NEXT(mp, mnt_list);
			return ENOENT;
		}
	}

	error = vfs_dobusy(mp, op, nextp);
	/*
	 * If another LWP is unmounting, back out and report EBUSY
	 * rather than waiting.  An unmount by our own LWP is fine.
	 */
	l = mp->mnt_unmounter;
	if (error == 0 && (l != NULL && l != curlwp)) {
		if (nextp != NULL) {
			mutex_enter(&mountlist_lock);
		}
		vfs_unbusy(mp, false, nextp);
		error = EBUSY;
	}
	return error;
}

/*
 * Unlock a busy filesystem and drop reference to it.
If 'keepref' is 398 * true, unlock but preserve the reference. 399 * 400 * => If nextp != NULL, mountlist_lock is understood to be held. On 401 * failure a pointer to the next mount will be returned via nextp. 402 */ 403 void 404 vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp) 405 { 406 lwp_t *l; 407 408 KASSERT(mp->mnt_refcnt > 0); 409 410 l = curlwp; 411 if (l->l_mpbusy != NULL) { 412 KASSERT(l->l_mpbusy == mp); 413 KASSERT(rw_lock_held(&mp->mnt_lock)); 414 if (l->l_mprecurse != 0) { 415 l->l_mprecurse--; 416 } else { 417 l->l_mpbusy = NULL; 418 rw_exit(&mp->mnt_lock); 419 } 420 } else { 421 rw_exit(&mp->mnt_lock); 422 } 423 if (nextp != NULL) { 424 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 425 } 426 if (!keepref) { 427 vfs_destroy(mp, nextp != NULL); 428 } 429 } 430 431 /* 432 * Lookup a filesystem type, and if found allocate and initialize 433 * a mount structure for it. 434 * 435 * Devname is usually updated by mount(8) after booting. 436 */ 437 int 438 vfs_rootmountalloc(const char *fstypename, const char *devname, 439 struct mount **mpp) 440 { 441 struct vfsops *vfsp = NULL; 442 struct mount *mp; 443 444 mutex_enter(&vfs_list_lock); 445 LIST_FOREACH(vfsp, &vfs_list, vfs_list) 446 if (!strncmp(vfsp->vfs_name, fstypename, 447 sizeof(mp->mnt_stat.f_fstypename))) 448 break; 449 if (vfsp == NULL) { 450 mutex_exit(&vfs_list_lock); 451 return (ENODEV); 452 } 453 vfsp->vfs_refcount++; 454 mutex_exit(&vfs_list_lock); 455 456 mp = kmem_zalloc(sizeof(*mp), KM_SLEEP); 457 if (mp == NULL) 458 return ENOMEM; 459 mp->mnt_refcnt = 1; 460 rw_init(&mp->mnt_lock); 461 mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE); 462 (void)vfs_busy(mp, RW_WRITER); 463 TAILQ_INIT(&mp->mnt_vnodelist); 464 mp->mnt_op = vfsp; 465 mp->mnt_flag = MNT_RDONLY; 466 mp->mnt_vnodecovered = NULL; 467 (void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, 468 sizeof(mp->mnt_stat.f_fstypename)); 469 mp->mnt_stat.f_mntonname[0] = '/'; 470 mp->mnt_stat.f_mntonname[1] = '\0'; 471 
mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] = 472 '\0'; 473 (void)copystr(devname, mp->mnt_stat.f_mntfromname, 474 sizeof(mp->mnt_stat.f_mntfromname) - 1, 0); 475 mount_initspecific(mp); 476 *mpp = mp; 477 return (0); 478 } 479 480 /* 481 * Routines having to do with the management of the vnode table. 482 */ 483 extern int (**dead_vnodeop_p)(void *); 484 485 /* 486 * Return the next vnode from the free list. 487 */ 488 int 489 getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *), 490 vnode_t **vpp) 491 { 492 struct uvm_object *uobj; 493 static int toggle; 494 vnode_t *vp; 495 int error = 0, tryalloc; 496 497 try_again: 498 if (mp != NULL) { 499 /* 500 * Mark filesystem busy while we're creating a 501 * vnode. If unmount is in progress, this will 502 * wait; if the unmount succeeds (only if umount 503 * -f), this will return an error. If the 504 * unmount fails, we'll keep going afterwards. 505 */ 506 error = vfs_busy(mp, RW_READER); 507 if (error) 508 return error; 509 } 510 511 /* 512 * We must choose whether to allocate a new vnode or recycle an 513 * existing one. The criterion for allocating a new one is that 514 * the total number of vnodes is less than the number desired or 515 * there are no vnodes on either free list. Generally we only 516 * want to recycle vnodes that have no buffers associated with 517 * them, so we look first on the vnode_free_list. If it is empty, 518 * we next consider vnodes with referencing buffers on the 519 * vnode_hold_list. The toggle ensures that half the time we 520 * will use a buffer from the vnode_hold_list, and half the time 521 * we will allocate a new one unless the list has grown to twice 522 * the desired size. We are reticent to recycle vnodes from the 523 * vnode_hold_list because we will lose the identity of all its 524 * referencing buffers. 
525 */ 526 527 vp = NULL; 528 529 mutex_enter(&vnode_free_list_lock); 530 531 toggle ^= 1; 532 if (numvnodes > 2 * desiredvnodes) 533 toggle = 0; 534 535 tryalloc = numvnodes < desiredvnodes || 536 (TAILQ_FIRST(&vnode_free_list) == NULL && 537 (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle)); 538 539 if (tryalloc) { 540 numvnodes++; 541 mutex_exit(&vnode_free_list_lock); 542 if ((vp = vnalloc(NULL)) == NULL) { 543 mutex_enter(&vnode_free_list_lock); 544 numvnodes--; 545 } else 546 vp->v_usecount = 1; 547 } 548 549 if (vp == NULL) { 550 vp = getcleanvnode(); 551 if (vp == NULL) { 552 if (mp != NULL) { 553 vfs_unbusy(mp, false, NULL); 554 } 555 if (tryalloc) { 556 printf("WARNING: unable to allocate new " 557 "vnode, retrying...\n"); 558 (void) tsleep(&lbolt, PRIBIO, "newvn", hz); 559 goto try_again; 560 } 561 tablefull("vnode", "increase kern.maxvnodes or NVNODE"); 562 *vpp = 0; 563 return (ENFILE); 564 } 565 vp->v_iflag = 0; 566 vp->v_vflag = 0; 567 vp->v_uflag = 0; 568 vp->v_socket = NULL; 569 } 570 571 KASSERT(vp->v_usecount == 1); 572 KASSERT(vp->v_freelisthd == NULL); 573 KASSERT(LIST_EMPTY(&vp->v_nclist)); 574 KASSERT(LIST_EMPTY(&vp->v_dnclist)); 575 576 vp->v_type = VNON; 577 vp->v_vnlock = &vp->v_lock; 578 vp->v_tag = tag; 579 vp->v_op = vops; 580 insmntque(vp, mp); 581 *vpp = vp; 582 vp->v_data = 0; 583 584 /* 585 * initialize uvm_object within vnode. 586 */ 587 588 uobj = &vp->v_uobj; 589 KASSERT(uobj->pgops == &uvm_vnodeops); 590 KASSERT(uobj->uo_npages == 0); 591 KASSERT(TAILQ_FIRST(&uobj->memq) == NULL); 592 vp->v_size = vp->v_writesize = VSIZENOTSET; 593 594 if (mp != NULL) { 595 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 596 vp->v_vflag |= VV_MPSAFE; 597 vfs_unbusy(mp, true, NULL); 598 } 599 600 return (0); 601 } 602 603 /* 604 * This is really just the reverse of getnewvnode(). Needed for 605 * VFS_VGET functions who may need to push back a vnode in case 606 * of a locking race. 
607 */ 608 void 609 ungetnewvnode(vnode_t *vp) 610 { 611 612 KASSERT(vp->v_usecount == 1); 613 KASSERT(vp->v_data == NULL); 614 KASSERT(vp->v_freelisthd == NULL); 615 616 mutex_enter(&vp->v_interlock); 617 vp->v_iflag |= VI_CLEAN; 618 vrelel(vp, 0); 619 } 620 621 /* 622 * Allocate a new, uninitialized vnode. If 'mp' is non-NULL, this is a 623 * marker vnode and we are prepared to wait for the allocation. 624 */ 625 vnode_t * 626 vnalloc(struct mount *mp) 627 { 628 vnode_t *vp; 629 630 vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT)); 631 if (vp == NULL) { 632 return NULL; 633 } 634 635 memset(vp, 0, sizeof(*vp)); 636 UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0); 637 cv_init(&vp->v_cv, "vnode"); 638 /* 639 * done by memset() above. 640 * LIST_INIT(&vp->v_nclist); 641 * LIST_INIT(&vp->v_dnclist); 642 */ 643 644 if (mp != NULL) { 645 vp->v_mount = mp; 646 vp->v_type = VBAD; 647 vp->v_iflag = VI_MARKER; 648 } else { 649 rw_init(&vp->v_lock.vl_lock); 650 } 651 652 return vp; 653 } 654 655 /* 656 * Free an unused, unreferenced vnode. 657 */ 658 void 659 vnfree(vnode_t *vp) 660 { 661 662 KASSERT(vp->v_usecount == 0); 663 664 if ((vp->v_iflag & VI_MARKER) == 0) { 665 rw_destroy(&vp->v_lock.vl_lock); 666 mutex_enter(&vnode_free_list_lock); 667 numvnodes--; 668 mutex_exit(&vnode_free_list_lock); 669 } 670 671 UVM_OBJ_DESTROY(&vp->v_uobj); 672 cv_destroy(&vp->v_cv); 673 pool_cache_put(vnode_cache, vp); 674 } 675 676 /* 677 * Remove a vnode from its freelist. 678 */ 679 static inline void 680 vremfree(vnode_t *vp) 681 { 682 683 KASSERT(mutex_owned(&vp->v_interlock)); 684 KASSERT(vp->v_usecount == 0); 685 686 /* 687 * Note that the reference count must not change until 688 * the vnode is removed. 
689 */ 690 mutex_enter(&vnode_free_list_lock); 691 if (vp->v_holdcnt > 0) { 692 KASSERT(vp->v_freelisthd == &vnode_hold_list); 693 } else { 694 KASSERT(vp->v_freelisthd == &vnode_free_list); 695 } 696 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 697 vp->v_freelisthd = NULL; 698 mutex_exit(&vnode_free_list_lock); 699 } 700 701 /* 702 * Move a vnode from one mount queue to another. 703 */ 704 static void 705 insmntque(vnode_t *vp, struct mount *mp) 706 { 707 struct mount *omp; 708 709 #ifdef DIAGNOSTIC 710 if ((mp != NULL) && 711 (mp->mnt_iflag & IMNT_UNMOUNT) && 712 !(mp->mnt_flag & MNT_SOFTDEP) && 713 vp->v_tag != VT_VFS) { 714 panic("insmntque into dying filesystem"); 715 } 716 #endif 717 718 mutex_enter(&mntvnode_lock); 719 /* 720 * Delete from old mount point vnode list, if on one. 721 */ 722 if ((omp = vp->v_mount) != NULL) 723 TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes); 724 /* 725 * Insert into list of vnodes for the new mount point, if 726 * available. The caller must take a reference on the mount 727 * structure and donate to the vnode. 728 */ 729 if ((vp->v_mount = mp) != NULL) 730 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes); 731 mutex_exit(&mntvnode_lock); 732 733 if (omp != NULL) { 734 /* Release reference to old mount. */ 735 vfs_destroy(omp, false); 736 } 737 } 738 739 /* 740 * Create a vnode for a block device. 741 * Used for root filesystem and swap areas. 742 * Also used for memory file system special devices. 743 */ 744 int 745 bdevvp(dev_t dev, vnode_t **vpp) 746 { 747 748 return (getdevvp(dev, vpp, VBLK)); 749 } 750 751 /* 752 * Create a vnode for a character device. 753 * Used for kernfs and some console handling. 754 */ 755 int 756 cdevvp(dev_t dev, vnode_t **vpp) 757 { 758 759 return (getdevvp(dev, vpp, VCHR)); 760 } 761 762 /* 763 * Create a vnode for a device. 764 * Used by bdevvp (block device) for root file system etc., 765 * and by cdevvp (character device) for console and kernfs. 
766 */ 767 static int 768 getdevvp(dev_t dev, vnode_t **vpp, enum vtype type) 769 { 770 vnode_t *vp; 771 vnode_t *nvp; 772 int error; 773 774 if (dev == NODEV) { 775 *vpp = NULL; 776 return (0); 777 } 778 error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp); 779 if (error) { 780 *vpp = NULL; 781 return (error); 782 } 783 vp = nvp; 784 vp->v_type = type; 785 vp->v_vflag |= VV_MPSAFE; 786 uvm_vnp_setsize(vp, 0); 787 spec_node_init(vp, dev); 788 *vpp = vp; 789 return (0); 790 } 791 792 /* 793 * Grab a particular vnode from the free list, increment its 794 * reference count and lock it. If the vnode lock bit is set the 795 * vnode is being eliminated in vgone. In that case, we can not 796 * grab the vnode, so the process is awakened when the transition is 797 * completed, and an error returned to indicate that the vnode is no 798 * longer usable (possibly having been changed to a new file system type). 799 */ 800 int 801 vget(vnode_t *vp, int flags) 802 { 803 int error; 804 805 KASSERT((vp->v_iflag & VI_MARKER) == 0); 806 807 if ((flags & LK_INTERLOCK) == 0) 808 mutex_enter(&vp->v_interlock); 809 810 /* 811 * Before adding a reference, we must remove the vnode 812 * from its freelist. 813 */ 814 if (vp->v_usecount == 0) { 815 vremfree(vp); 816 } 817 if (++vp->v_usecount == 0) { 818 vpanic(vp, "vget: usecount overflow"); 819 } 820 821 /* 822 * If the vnode is in the process of being cleaned out for 823 * another use, we wait for the cleaning to finish and then 824 * return failure. Cleaning is determined by checking if 825 * the VI_XLOCK or VI_FREEING flags are set. 
826 */ 827 if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) { 828 if ((flags & LK_NOWAIT) != 0) { 829 vrelel(vp, 0); 830 return EBUSY; 831 } 832 vwait(vp, VI_XLOCK | VI_FREEING); 833 vrelel(vp, 0); 834 return ENOENT; 835 } 836 if (flags & LK_TYPE_MASK) { 837 error = vn_lock(vp, flags | LK_INTERLOCK); 838 if (error != 0) { 839 vrele(vp); 840 } 841 return error; 842 } 843 mutex_exit(&vp->v_interlock); 844 return 0; 845 } 846 847 /* 848 * vput(), just unlock and vrele() 849 */ 850 void 851 vput(vnode_t *vp) 852 { 853 854 KASSERT((vp->v_iflag & VI_MARKER) == 0); 855 856 VOP_UNLOCK(vp, 0); 857 vrele(vp); 858 } 859 860 /* 861 * Vnode release. If reference count drops to zero, call inactive 862 * routine and either return to freelist or free to the pool. 863 */ 864 void 865 vrelel(vnode_t *vp, int flags) 866 { 867 bool recycle, defer; 868 int error; 869 870 KASSERT(mutex_owned(&vp->v_interlock)); 871 KASSERT((vp->v_iflag & VI_MARKER) == 0); 872 KASSERT(vp->v_freelisthd == NULL); 873 874 if (vp->v_op == dead_vnodeop_p && (vp->v_iflag & VI_CLEAN) == 0) { 875 vpanic(vp, "dead but not clean"); 876 } 877 878 /* 879 * If not the last reference, just drop the reference count 880 * and unlock. 881 */ 882 if (vp->v_usecount > 1) { 883 vp->v_usecount--; 884 vp->v_iflag |= VI_INACTREDO; 885 mutex_exit(&vp->v_interlock); 886 return; 887 } 888 if (vp->v_usecount <= 0 || vp->v_writecount != 0) { 889 vpanic(vp, "vput: bad ref count"); 890 } 891 892 /* 893 * If not clean, deactivate the vnode, but preserve 894 * our reference across the call to VOP_INACTIVE(). 895 */ 896 retry: 897 if ((vp->v_iflag & VI_CLEAN) == 0) { 898 recycle = false; 899 /* 900 * XXX This ugly block can be largely eliminated if 901 * locking is pushed down into the file systems. 902 */ 903 if (curlwp == uvm.pagedaemon_lwp) { 904 /* The pagedaemon can't wait around; defer. */ 905 defer = true; 906 } else if (curlwp == vrele_lwp) { 907 /* We have to try harder. 
*/ 908 vp->v_iflag &= ~VI_INACTREDO; 909 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | 910 LK_RETRY); 911 if (error != 0) { 912 /* XXX */ 913 vpanic(vp, "vrele: unable to lock %p"); 914 } 915 defer = false; 916 } else if ((vp->v_iflag & VI_LAYER) != 0) { 917 /* 918 * Acquiring the stack's lock in vclean() even 919 * for an honest vput/vrele is dangerous because 920 * our caller may hold other vnode locks; defer. 921 */ 922 defer = true; 923 } else { 924 /* If we can't acquire the lock, then defer. */ 925 vp->v_iflag &= ~VI_INACTREDO; 926 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | 927 LK_NOWAIT); 928 if (error != 0) { 929 defer = true; 930 mutex_enter(&vp->v_interlock); 931 } else { 932 defer = false; 933 } 934 } 935 936 if (defer) { 937 /* 938 * Defer reclaim to the kthread; it's not safe to 939 * clean it here. We donate it our last reference. 940 */ 941 KASSERT(mutex_owned(&vp->v_interlock)); 942 KASSERT((vp->v_iflag & VI_INACTPEND) == 0); 943 vp->v_iflag |= VI_INACTPEND; 944 mutex_enter(&vrele_lock); 945 TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist); 946 if (++vrele_pending > (desiredvnodes >> 8)) 947 cv_signal(&vrele_cv); 948 mutex_exit(&vrele_lock); 949 mutex_exit(&vp->v_interlock); 950 return; 951 } 952 953 #ifdef DIAGNOSTIC 954 if ((vp->v_type == VBLK || vp->v_type == VCHR) && 955 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) { 956 vprint("vrelel: missing VOP_CLOSE()", vp); 957 } 958 #endif 959 960 /* 961 * The vnode can gain another reference while being 962 * deactivated. If VOP_INACTIVE() indicates that 963 * the described file has been deleted, then recycle 964 * the vnode irrespective of additional references. 965 * Another thread may be waiting to re-use the on-disk 966 * inode. 967 * 968 * Note that VOP_INACTIVE() will drop the vnode lock. 
969 */ 970 VOP_INACTIVE(vp, &recycle); 971 mutex_enter(&vp->v_interlock); 972 if (!recycle) { 973 if (vp->v_usecount > 1) { 974 vp->v_usecount--; 975 mutex_exit(&vp->v_interlock); 976 return; 977 } 978 979 /* 980 * If we grew another reference while 981 * VOP_INACTIVE() was underway, retry. 982 */ 983 if ((vp->v_iflag & VI_INACTREDO) != 0) { 984 goto retry; 985 } 986 } 987 988 /* Take care of space accounting. */ 989 if (vp->v_iflag & VI_EXECMAP) { 990 atomic_add_int(&uvmexp.execpages, 991 -vp->v_uobj.uo_npages); 992 atomic_add_int(&uvmexp.filepages, 993 vp->v_uobj.uo_npages); 994 } 995 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP|VI_MAPPED); 996 vp->v_vflag &= ~VV_MAPPED; 997 998 /* 999 * Recycle the vnode if the file is now unused (unlinked), 1000 * otherwise just free it. 1001 */ 1002 if (recycle) { 1003 vclean(vp, DOCLOSE); 1004 } 1005 KASSERT(vp->v_usecount > 0); 1006 } 1007 1008 if (--vp->v_usecount != 0) { 1009 /* Gained another reference while being reclaimed. */ 1010 mutex_exit(&vp->v_interlock); 1011 return; 1012 } 1013 1014 if ((vp->v_iflag & VI_CLEAN) != 0) { 1015 /* 1016 * It's clean so destroy it. It isn't referenced 1017 * anywhere since it has been reclaimed. 1018 */ 1019 KASSERT(vp->v_holdcnt == 0); 1020 KASSERT(vp->v_writecount == 0); 1021 mutex_exit(&vp->v_interlock); 1022 insmntque(vp, NULL); 1023 if (vp->v_type == VBLK || vp->v_type == VCHR) { 1024 spec_node_destroy(vp); 1025 } 1026 vnfree(vp); 1027 } else { 1028 /* 1029 * Otherwise, put it back onto the freelist. It 1030 * can't be destroyed while still associated with 1031 * a file system. 
 */
		mutex_enter(&vnode_free_list_lock);
		if (vp->v_holdcnt > 0) {
			/* Page/buffer holds exist: park on the hold list. */
			vp->v_freelisthd = &vnode_hold_list;
		} else {
			vp->v_freelisthd = &vnode_free_list;
		}
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
		mutex_exit(&vp->v_interlock);
	}
}

/*
 * Drop a reference to a vnode: take the interlock and hand off to
 * vrelel(), which consumes the interlock.
 */
void
vrele(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	mutex_enter(&vp->v_interlock);
	vrelel(vp, 0);
}

/*
 * Kernel thread that processes deferred vrele requests queued on
 * vrele_list (see the VI_INACTPEND path in vrelel()).  Never returns.
 */
static void
vrele_thread(void *cookie)
{
	vnode_t *vp;

	for (;;) {
		mutex_enter(&vrele_lock);
		while (TAILQ_EMPTY(&vrele_list)) {
			/* Timed wait: re-check periodically even unsignalled. */
			cv_timedwait(&vrele_cv, &vrele_lock, hz);
		}
		vp = TAILQ_FIRST(&vrele_list);
		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
		vrele_pending--;
		mutex_exit(&vrele_lock);

		/*
		 * If not the last reference, then ignore the vnode
		 * and look for more work.
		 */
		mutex_enter(&vp->v_interlock);
		KASSERT((vp->v_iflag & VI_INACTPEND) != 0);
		vp->v_iflag &= ~VI_INACTPEND;
		if (vp->v_usecount > 1) {
			vp->v_usecount--;
			mutex_exit(&vp->v_interlock);
			continue;
		}
		vrelel(vp, 0);
	}
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
 */
void
vholdl(vnode_t *vp)
{

	KASSERT(mutex_owned(&vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	/*
	 * First hold on an unreferenced vnode moves it from the free
	 * list to the hold list so it is not reclaimed out from under
	 * the holder.
	 */
	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_free_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_hold_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
1110 */ 1111 void 1112 holdrelel(vnode_t *vp) 1113 { 1114 1115 KASSERT(mutex_owned(&vp->v_interlock)); 1116 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1117 1118 if (vp->v_holdcnt <= 0) { 1119 vpanic(vp, "holdrelel: holdcnt vp %p"); 1120 } 1121 1122 vp->v_holdcnt--; 1123 if (vp->v_holdcnt == 0 && vp->v_usecount == 0) { 1124 mutex_enter(&vnode_free_list_lock); 1125 KASSERT(vp->v_freelisthd == &vnode_hold_list); 1126 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 1127 vp->v_freelisthd = &vnode_free_list; 1128 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 1129 mutex_exit(&vnode_free_list_lock); 1130 } 1131 } 1132 1133 /* 1134 * Vnode reference, where a reference is already held by some other 1135 * object (for example, a file structure). 1136 */ 1137 void 1138 vref(vnode_t *vp) 1139 { 1140 1141 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1142 1143 mutex_enter(&vp->v_interlock); 1144 if (vp->v_usecount <= 0) { 1145 vpanic(vp, "vref used where vget required"); 1146 } 1147 if (++vp->v_usecount == 0) { 1148 vpanic(vp, "vref: usecount overflow"); 1149 } 1150 mutex_exit(&vp->v_interlock); 1151 } 1152 1153 /* 1154 * Remove any vnodes in the vnode table belonging to mount point mp. 1155 * 1156 * If FORCECLOSE is not specified, there should not be any active ones, 1157 * return error if any are found (nb: this is a user error, not a 1158 * system error). If FORCECLOSE is specified, detach any active vnodes 1159 * that are found. 1160 * 1161 * If WRITECLOSE is set, only flush out regular file vnodes open for 1162 * writing. 1163 * 1164 * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped. 
 */
#ifdef DEBUG
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

/*
 * vflushnext: advance past the marker vnode to the next entry on the
 * mount's vnode list.  Periodically (every hz/10 ticks) drops
 * mntvnode_lock and yields so a long scan does not monopolize the
 * lock or the CPU.
 */
static vnode_t *
vflushnext(vnode_t *mvp, int *when)
{

	if (hardclock_ticks > *when) {
		mutex_exit(&mntvnode_lock);
		yield();
		mutex_enter(&mntvnode_lock);
		*when = hardclock_ticks + hz / 10;
	}

	return vunmark(mvp);
}

/*
 * vflush: flush vnodes belonging to mount point mp; see the block
 * comment above for the meaning of flags (SKIPSYSTEM, WRITECLOSE,
 * FORCECLOSE).  Returns 0 on success, ENOMEM if the marker vnode
 * cannot be allocated, or EBUSY if busy vnodes remain.
 */
int
vflush(struct mount *mp, vnode_t *skipvp, int flags)
{
	vnode_t *vp, *mvp;
	int busy = 0, when = 0;

	/* Allocate a marker vnode. */
	if ((mvp = vnalloc(mp)) == NULL)
		return (ENOMEM);

	mutex_enter(&mntvnode_lock);
	/*
	 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
	 * and vclean() are called
	 */
	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
	    vp = vflushnext(mvp, &when)) {
		vmark(mvp, vp);
		if (vp->v_mount != mp || vismarker(vp))
			continue;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;
		mutex_enter(&vp->v_interlock);
		/*
		 * Ignore clean but still referenced vnodes.
		 */
		if ((vp->v_iflag & VI_CLEAN) != 0) {
			mutex_exit(&vp->v_interlock);
			continue;
		}
		/*
		 * Skip over a vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
			mutex_exit(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			mutex_exit(&vp->v_interlock);
			continue;
		}
		/*
		 * With v_usecount == 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 * vclean() is entered with the interlock held and a
		 * transient use reference; vrelel() drops both.
		 */
		if (vp->v_usecount == 0) {
			mutex_exit(&mntvnode_lock);
			vremfree(vp);
			vp->v_usecount++;
			vclean(vp, DOCLOSE);
			vrelel(vp, 0);
			mutex_enter(&mntvnode_lock);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device. For all other files, just
		 * kill them.
		 */
		if (flags & FORCECLOSE) {
			mutex_exit(&mntvnode_lock);
			vp->v_usecount++;
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vclean(vp, DOCLOSE);
				vrelel(vp, 0);
			} else {
				vclean(vp, 0);
				vp->v_op = spec_vnodeop_p; /* XXXSMP */
				mutex_exit(&vp->v_interlock);
				/*
				 * The vnode isn't clean, but still resides
				 * on the mount list. Remove it. XXX This
				 * is a bit dodgy.
				 */
				insmntque(vp, NULL);
				vrele(vp);
			}
			mutex_enter(&mntvnode_lock);
			continue;
		}
#ifdef DEBUG
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		mutex_exit(&vp->v_interlock);
		busy++;
	}
	mutex_exit(&mntvnode_lock);
	vnfree(mvp);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the interlock held, and will return with it held.
 * The caller must hold a use reference (v_usecount != 0).
 */
void
vclean(vnode_t *vp, int flags)
{
	lwp_t *l = curlwp;
	bool recycle, active;
	int error;

	KASSERT(mutex_owned(&vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	/* If cleaning is already in progress wait until done and return. */
	if (vp->v_iflag & VI_XLOCK) {
		vwait(vp, VI_XLOCK);
		return;
	}

	/* If already clean, nothing to do. */
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		return;
	}

	/*
	 * Prevent the vnode from being recycled or brought into use
	 * while we clean it out.
	 */
	vp->v_iflag |= VI_XLOCK;
	if (vp->v_iflag & VI_EXECMAP) {
		/* Reclassify this vnode's pages from exec to file pages. */
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
	/* "active" means some caller beyond our own reference has it open. */
	active = (vp->v_usecount > 1);

	/* XXXAD should not lock vnode under layer */
	VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (flags & DOCLOSE) {
		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
		if (error != 0)
			/* Could not save dirty buffers: discard them. */
			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
		KASSERT(error == 0);
		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
			spec_node_revoke(vp);
		}
	}
	if (active) {
		VOP_INACTIVE(vp, &recycle);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VI_XLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0);
	}

	/* Disassociate the underlying file system from the vnode. */
	if (VOP_RECLAIM(vp)) {
		vpanic(vp, "vclean: cannot reclaim");
	}

	KASSERT(vp->v_uobj.uo_npages == 0);
	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}
	cache_purge(vp);

	/* Done with purge, notify sleepers of the grim news. */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	mutex_enter(&vp->v_interlock);
	vp->v_vnlock = &vp->v_lock;
	KNOTE(&vp->v_klist, NOTE_REVOKE);
	vp->v_iflag &= ~(VI_XLOCK | VI_FREEING);
	vp->v_vflag &= ~VV_LOCKSWORK;
	/* Only a DOCLOSE clean marks the vnode fully clean. */
	if ((flags & DOCLOSE) != 0) {
		vp->v_iflag |= VI_CLEAN;
	}
	cv_broadcast(&vp->v_cv);

	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 * Returns 1 if the vnode was recycled, 0 if it was in use.
 */
int
vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	mutex_enter(&vp->v_interlock);
	if (vp->v_usecount != 0) {
		mutex_exit(&vp->v_interlock);
		return (0);
	}
	if (inter_lkp)
		mutex_exit(inter_lkp);
	/* Take a transient reference for vclean(); vrelel() drops it. */
	vremfree(vp);
	vp->v_usecount++;
	vclean(vp, DOCLOSE);
	vrelel(vp, 0);
	return (1);
}

/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse. Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{

	mutex_enter(&vp->v_interlock);
	vclean(vp, DOCLOSE);
	vrelel(vp, 0);
}

/*
 * Lookup a vnode by device number.  Stores the vnode in *vpp and
 * returns 1 if found, returns 0 otherwise.  No reference is taken
 * on the returned vnode.
 */
int
vfinddev(dev_t dev, enum vtype type, vnode_t **vpp)
{
	vnode_t *vp;
	int rc = 0;

	mutex_enter(&specfs_lock);
	for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	mutex_exit(&specfs_lock);
	return (rc);
}

/*
 * Revoke all the vnodes corresponding to the specified minor number
 * range (endpoints inclusive) of the specified major.
 */
void
vdevgone(int maj, int minl, int minh, enum vtype type)
{
	vnode_t *vp, **vpp;
	dev_t dev;
	int mn;

	vp = NULL;	/* XXX gcc */

	mutex_enter(&specfs_lock);
	for (mn = minl; mn <= minh; mn++) {
		dev = makedev(maj, mn);
		vpp = &specfs_hash[SPECHASH(dev)];
		for (vp = *vpp; vp != NULL;) {
			mutex_enter(&vp->v_interlock);
			if ((vp->v_iflag & VI_CLEAN) != 0 ||
			    dev != vp->v_rdev || type != vp->v_type) {
				mutex_exit(&vp->v_interlock);
				vp = vp->v_specnext;
				continue;
			}
			mutex_exit(&specfs_lock);
			if (vget(vp, LK_INTERLOCK) == 0) {
				VOP_REVOKE(vp, REVOKEALL);
				vrele(vp);
			}
			mutex_enter(&specfs_lock);
			/*
			 * specfs_lock was dropped above, so the hash
			 * chain may have changed: restart the scan.
			 */
			vp = *vpp;
		}
	}
	mutex_exit(&specfs_lock);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vnode_t *vp)
{
	int count;

	mutex_enter(&specfs_lock);
	mutex_enter(&vp->v_interlock);
	if (vp->v_specnode == NULL) {
		/*
		 * Not a device node: report the use count, discounting
		 * the reference held by a pending deferred release
		 * (VI_INACTPEND).
		 */
		count = vp->v_usecount - ((vp->v_iflag & VI_INACTPEND) != 0);
		mutex_exit(&vp->v_interlock);
		mutex_exit(&specfs_lock);
		return (count);
	}
	mutex_exit(&vp->v_interlock);
	/* Device node: the per-device open count is authoritative. */
	count = vp->v_specnode->sn_dev->sd_opencnt;
	mutex_exit(&specfs_lock);
	return (count);
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vrevoke(vnode_t *vp)
{
	vnode_t *vq, **vpp;
	enum vtype type;
	dev_t dev;

	KASSERT(vp->v_usecount > 0);

	mutex_enter(&vp->v_interlock);
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		mutex_exit(&vp->v_interlock);
		return;
	} else {
		dev = vp->v_rdev;
		type = vp->v_type;
		mutex_exit(&vp->v_interlock);
	}

	vpp = &specfs_hash[SPECHASH(dev)];
	mutex_enter(&specfs_lock);
	for (vq = *vpp; vq != NULL;) {
		/* If clean or being cleaned, then ignore it. */
		mutex_enter(&vq->v_interlock);
		if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 ||
		    vq->v_rdev != dev || vq->v_type != type) {
			mutex_exit(&vq->v_interlock);
			vq = vq->v_specnext;
			continue;
		}
		mutex_exit(&specfs_lock);
		/* Take a transient reference for vclean(); vrelel() drops it. */
		if (vq->v_usecount == 0) {
			vremfree(vq);
		}
		vq->v_usecount++;
		vclean(vq, DOCLOSE);
		vrelel(vq, 0);
		mutex_enter(&specfs_lock);
		/*
		 * specfs_lock was dropped above, so the hash chain may
		 * have changed: restart the scan.
		 */
		vq = *vpp;
	}
	mutex_exit(&specfs_lock);
}

/*
 * sysctl helper routine to return list of supported fstypes.
 * Writes a single space-separated string of file system names to
 * oldp; read-only (rejects newp with EPERM).
 */
static int
sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
{
	char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
	char *where = oldp;
	struct vfsops *v;
	size_t needed, left, slen;
	int error, first;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	first = 1;
	error = 0;
	needed = 0;
	left = *oldlenp;

	sysctl_unlock();
	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (where == NULL)
			/* Size query only: count, do not copy out. */
			needed += strlen(v->vfs_name) + 1;
		else {
			memset(bf, 0, sizeof(bf));
			if (first) {
				strncpy(bf, v->vfs_name, sizeof(bf));
				first = 0;
			} else {
				/* Subsequent names are space-separated. */
				bf[0] = ' ';
				strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
			}
			/* Guarantee NUL termination after strncpy. */
			bf[sizeof(bf)-1] = '\0';
			slen = strlen(bf);
			if (left < slen + 1)
				break;
			/* +1 to copy out the trailing NUL byte */
			/*
			 * Hold a reference on the vfsops while the list
			 * lock is dropped across copyout().
			 */
			v->vfs_refcount++;
			mutex_exit(&vfs_list_lock);
			error = copyout(bf, where, slen + 1);
			mutex_enter(&vfs_list_lock);
			v->vfs_refcount--;
			if (error)
				break;
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	mutex_exit(&vfs_list_lock);
	sysctl_relock();
	*oldlenp = needed;
	return (error);
}

/*
 * Top level filesystem related information gathering.
 */
SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup")
{
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "vfs", NULL,
		NULL, 0, NULL, 0,
		CTL_VFS, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "generic",
		SYSCTL_DESCR("Non-specific vfs related information"),
		NULL, 0, NULL, 0,
		CTL_VFS, VFS_GENERIC, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		CTLTYPE_INT, "usermount",
		SYSCTL_DESCR("Whether unprivileged users may mount "
		    "filesystems"),
		NULL, 0, &dovfsusermount, 0,
		CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_STRING, "fstypes",
		SYSCTL_DESCR("List of file systems present"),
		sysctl_vfs_generic_fstypes, 0, NULL, 0,
		CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		CTLTYPE_INT, "magiclinks",
		SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"),
		NULL, 0, &vfs_magiclinks, 0,
		CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL);
}


int kinfo_vdebug = 1;
int kinfo_vgetfailed;
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
int
sysctl_kern_vnode(SYSCTLFN_ARGS)
{
	char *where = oldp;
	size_t *sizep = oldlenp;
	struct mount *mp, *nmp;
	vnode_t *vp, *mvp, vbuf;
	char *bp = where, *savebp;
	char *ewhere;
	int error;

	if (namelen != 0)
		return (EOPNOTSUPP);
	if (newp != NULL)
		return (EPERM);

#define VPTRSZ	sizeof(vnode_t *)
#define VNODESZ	sizeof(vnode_t)
	if (where == NULL) {
		/* Size query: estimate with slop for concurrent growth. */
		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
		return (0);
	}
	ewhere = where + *sizep;

	sysctl_unlock();
	mutex_enter(&mountlist_lock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	    mp = nmp) {
		if (vfs_trybusy(mp, RW_READER, &nmp)) {
			continue;
		}
		savebp = bp;
		/* Allocate a marker vnode. */
		if ((mvp = vnalloc(mp)) == NULL) {
			sysctl_relock();
			return (ENOMEM);
		}
		mutex_enter(&mntvnode_lock);
		for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp;
		    vp = vunmark(mvp)) {
			vmark(mvp, vp);
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp || vismarker(vp))
				continue;
			if (bp + VPTRSZ + VNODESZ > ewhere) {
				/* Out of user buffer space: bail out. */
				(void)vunmark(mvp);
				mutex_exit(&mntvnode_lock);
				vnfree(mvp);
				sysctl_relock();
				*sizep = bp - where;
				return (ENOMEM);
			}
			/*
			 * Snapshot the vnode under mntvnode_lock, then
			 * drop the lock for the (possibly faulting)
			 * copyout.
			 */
			memcpy(&vbuf, vp, VNODESZ);
			mutex_exit(&mntvnode_lock);
			if ((error = copyout(vp, bp, VPTRSZ)) ||
			    (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) {
				mutex_enter(&mntvnode_lock);
				(void)vunmark(mvp);
				mutex_exit(&mntvnode_lock);
				vnfree(mvp);
				sysctl_relock();
				return (error);
			}
			bp += VPTRSZ + VNODESZ;
			mutex_enter(&mntvnode_lock);
		}
		mutex_exit(&mntvnode_lock);
		mutex_enter(&mountlist_lock);
		vfs_unbusy(mp, false, &nmp);
		vnfree(mvp);
	}
	mutex_exit(&mountlist_lock);
	sysctl_relock();

	*sizep = bp - where;
	return (0);
}

/*
 * Remove clean vnodes from a mountpoint's vnode list.
 */
void
vfs_scrubvnlist(struct mount *mp)
{
	vnode_t *vp, *nvp;

 retry:
	mutex_enter(&mntvnode_lock);
	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		nvp = TAILQ_NEXT(vp, v_mntvnodes);
		mutex_enter(&vp->v_interlock);
		if ((vp->v_iflag & VI_CLEAN) != 0) {
			TAILQ_REMOVE(&mp->mnt_vnodelist, vp, v_mntvnodes);
			vp->v_mount = NULL;
			mutex_exit(&mntvnode_lock);
			mutex_exit(&vp->v_interlock);
			vfs_destroy(mp, false);
			/*
			 * Both locks were dropped, so nvp may be stale:
			 * restart the scan from the head of the list.
			 */
			goto retry;
		}
		mutex_exit(&vp->v_interlock);
	}
	mutex_exit(&mntvnode_lock);
}

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vnode_t *vp)
{
	vnode_t *vq;
	int error = 0;

	if (vp->v_type != VBLK)
		return ENOTBLK;
	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	/* Also check aliases of the same device in the specfs hash. */
	mutex_enter(&specfs_lock);
	for (vq = specfs_hash[SPECHASH(vp->v_rdev)]; vq != NULL;
	    vq = vq->v_specnext) {
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		if (vq->v_specmountpoint != NULL) {
			error = EBUSY;
			break;
		}
	}
	mutex_exit(&specfs_lock);
	return (error);
}

/*
 * Unmount all file systems.
 * We traverse the list in reverse order under the assumption that doing so
 * will avoid needing to worry about dependencies.
 */
void
vfs_unmountall(struct lwp *l)
{
	struct mount *mp, *nmp;
	int allerror, error;

	printf("unmounting file systems...");
	for (allerror = 0, mp = CIRCLEQ_LAST(&mountlist);
	    !CIRCLEQ_EMPTY(&mountlist);
	    mp = nmp) {
		nmp = CIRCLEQ_PREV(mp, mnt_list);
#ifdef DEBUG
		printf("\nunmounting %s (%s)...",
		    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
#endif
		/*
		 * XXX Freeze syncer. Must do this before locking the
		 * mount point. See dounmount() for details.
		 */
		mutex_enter(&syncer_mutex);
		if (vfs_busy(mp, RW_WRITER)) {
			mutex_exit(&syncer_mutex);
			continue;
		}
		if ((error = dounmount(mp, MNT_FORCE, l)) != 0) {
			printf("unmount of %s failed with error %d\n",
			    mp->mnt_stat.f_mntonname, error);
			allerror = 1;
		}
	}
	printf(" done\n");
	if (allerror)
		printf("WARNING: some file systems would not unmount\n");
}

/*
 * Sync and unmount file systems before shutting down.
 */
void
vfs_shutdown(void)
{
	struct lwp *l;

	/* XXX we're certainly not running in lwp0's context! */
	l = curlwp;
	if (l == NULL)
		l = &lwp0;

	printf("syncing disks... ");

	/* remove user processes from run queue */
	suspendsched();
	(void) spl0();

	/* avoid coming back this way again if we panic. */
	doing_shutdown = 1;

	sys_sync(l, NULL, NULL);

	/* Wait for sync to finish. */
	if (buf_syncwait() != 0) {
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
		Debugger();
#endif
		printf("giving up\n");
		return;
	} else
		printf("done\n");

	/*
	 * If we've panic'd, don't make the situation potentially
	 * worse by unmounting the file systems.
	 */
	if (panicstr != NULL)
		return;

	/* Release inodes held by texts before update. */
#ifdef notdef
	vnshutdown();
#endif
	/* Unmount file systems. */
	vfs_unmountall(l);
}

/*
 * Mount the root file system.  If the operator didn't specify a
 * file system to use, try all possible file systems until one
 * succeeds.
 */
int
vfs_mountroot(void)
{
	struct vfsops *v;
	int error = ENODEV;

	if (root_device == NULL)
		panic("vfs_mountroot: root device unknown");

	switch (device_class(root_device)) {
	case DV_IFNET:
		/* Network root: a device node must NOT be configured. */
		if (rootdev != NODEV)
			panic("vfs_mountroot: rootdev set for DV_IFNET "
			    "(0x%08x -> %d,%d)", rootdev,
			    major(rootdev), minor(rootdev));
		break;

	case DV_DISK:
		if (rootdev == NODEV)
			panic("vfs_mountroot: rootdev not set for DV_DISK");
		if (bdevvp(rootdev, &rootvp))
			panic("vfs_mountroot: can't get vnode for rootdev");
		error = VOP_OPEN(rootvp, FREAD, FSCRED);
		if (error) {
			printf("vfs_mountroot: can't open root device\n");
			return (error);
		}
		break;

	default:
		printf("%s: inappropriate for root file system\n",
		    device_xname(root_device));
		return (ENODEV);
	}

	/*
	 * If user specified a file system, use it.
	 */
	if (mountroot != NULL) {
		error = (*mountroot)();
		goto done;
	}

	/*
	 * Try each file system currently configured into the kernel.
	 */
	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (v->vfs_mountroot == NULL)
			continue;
#ifdef DEBUG
		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
#endif
		/*
		 * Hold a reference on the vfsops while the list lock
		 * is dropped across the mountroot attempt.
		 */
		v->vfs_refcount++;
		mutex_exit(&vfs_list_lock);
		error = (*v->vfs_mountroot)();
		mutex_enter(&vfs_list_lock);
		v->vfs_refcount--;
		if (!error) {
			aprint_normal("root file system type: %s\n",
			    v->vfs_name);
			break;
		}
	}
	mutex_exit(&vfs_list_lock);

	if (v == NULL) {
		printf("no file system for %s", device_xname(root_device));
		if (device_class(root_device) == DV_DISK)
			printf(" (dev 0x%x)", rootdev);
		printf("\n");
		error = EFTYPE;
	}

done:
	/* On failure, undo the VOP_OPEN() of the disk root vnode. */
	if (error && device_class(root_device) == DV_DISK) {
		VOP_CLOSE(rootvp, FREAD, FSCRED);
		vrele(rootvp);
	}
	return (error);
}

/*
 * Sham lock manager for vnodes.  This is a temporary measure.
 * Implements LK_SHARED/LK_EXCLUSIVE/LK_RELEASE (optionally with
 * LK_NOWAIT and LK_CANRECURSE) on top of a reader/writer lock,
 * with an explicit recursion count for recursive exclusive holds.
 */
int
vlockmgr(struct vnlock *vl, int flags)
{

	KASSERT((flags & ~(LK_CANRECURSE | LK_NOWAIT | LK_TYPE_MASK)) == 0);

	switch (flags & LK_TYPE_MASK) {
	case LK_SHARED:
		if (rw_tryenter(&vl->vl_lock, RW_READER)) {
			return 0;
		}
		if ((flags & LK_NOWAIT) != 0) {
			return EBUSY;
		}
		rw_enter(&vl->vl_lock, RW_READER);
		return 0;

	case LK_EXCLUSIVE:
		if (rw_tryenter(&vl->vl_lock, RW_WRITER)) {
			return 0;
		}
		/*
		 * Already write-held by us and recursion is allowed:
		 * bump the recursion count instead of blocking.
		 */
		if ((vl->vl_canrecurse || (flags & LK_CANRECURSE) != 0) &&
		    rw_write_held(&vl->vl_lock)) {
			vl->vl_recursecnt++;
			return 0;
		}
		if ((flags & LK_NOWAIT) != 0) {
			return EBUSY;
		}
		rw_enter(&vl->vl_lock, RW_WRITER);
		return 0;

	case LK_RELEASE:
		/* Unwind a recursive exclusive hold before rw_exit(). */
		if (vl->vl_recursecnt != 0) {
			KASSERT(rw_write_held(&vl->vl_lock));
			vl->vl_recursecnt--;
			return 0;
		}
		rw_exit(&vl->vl_lock);
		return 0;

	default:
		panic("vlockmgr: flags %x", flags);
	}
}

/*
 * vlockstatus: report how the sham vnode lock is held by the caller:
 * LK_EXCLUSIVE, LK_SHARED, or 0 if not held.
 */
int
vlockstatus(struct vnlock *vl)
{

	if (rw_write_held(&vl->vl_lock)) {
		return LK_EXCLUSIVE;
	}
	if (rw_read_held(&vl->vl_lock)) {
		return LK_SHARED;
	}
	return 0;
}