/*	$NetBSD: vfs_subr.c,v 1.345 2008/05/27 17:49:07 ad Exp $	*/

/*-
 * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California. All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * External virtual filesystem routines.
 *
 * This file contains vfs subroutines which are heavily dependent on
 * the kernel and are not suitable for standalone use. Examples include
 * routines involved in vnode and mountpoint management.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.345 2008/05/27 17:49:07 ad Exp $");

#include "opt_ddb.h"
#include "opt_compat_netbsd.h"
#include "opt_compat_43.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/syscallargs.h>
#include <sys/device.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/kthread.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/syncfs/syncfs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_ddb.h>

#include <sys/sysctl.h>

extern int dovfsusermount;	/* 1 => permit any user to mount filesystems */
extern int vfs_magiclinks;	/* 1 => expand "magic" symlinks */

static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);
static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list);

static int vrele_pending;
static kmutex_t	vrele_lock;
static kcondvar_t vrele_cv;
static lwp_t *vrele_lwp;

static pool_cache_t vnode_cache;

MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");

/*
 * Local declarations.
 */

static void vrele_thread(void *);
static void insmntque(vnode_t *, struct mount *);
static int getdevvp(dev_t, vnode_t **, enum vtype);
static vnode_t *getcleanvnode(void);
void vpanic(vnode_t *, const char *);

#ifdef DIAGNOSTIC
void
vpanic(vnode_t *vp, const char *msg)
{

	vprint(NULL, vp);
	panic("%s\n", msg);
}
#else
#define	vpanic(vp, msg)	/* nothing */
#endif

void
vn_init1(void)
{

	vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl",
	    NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vnode_cache != NULL);

	/* Create deferred release thread.
*/ 160 mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE); 161 cv_init(&vrele_cv, "vrele"); 162 if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread, 163 NULL, &vrele_lwp, "vrele")) 164 panic("fork vrele"); 165 } 166 167 int 168 vfs_drainvnodes(long target, struct lwp *l) 169 { 170 171 while (numvnodes > target) { 172 vnode_t *vp; 173 174 mutex_enter(&vnode_free_list_lock); 175 vp = getcleanvnode(); 176 if (vp == NULL) 177 return EBUSY; /* give up */ 178 ungetnewvnode(vp); 179 } 180 181 return 0; 182 } 183 184 /* 185 * grab a vnode from freelist and clean it. 186 */ 187 vnode_t * 188 getcleanvnode(void) 189 { 190 vnode_t *vp; 191 vnodelst_t *listhd; 192 193 KASSERT(mutex_owned(&vnode_free_list_lock)); 194 195 retry: 196 listhd = &vnode_free_list; 197 try_nextlist: 198 TAILQ_FOREACH(vp, listhd, v_freelist) { 199 /* 200 * It's safe to test v_usecount and v_iflag 201 * without holding the interlock here, since 202 * these vnodes should never appear on the 203 * lists. 204 */ 205 if (vp->v_usecount != 0) { 206 vpanic(vp, "free vnode isn't"); 207 } 208 if ((vp->v_iflag & VI_CLEAN) != 0) { 209 vpanic(vp, "clean vnode on freelist"); 210 } 211 if (vp->v_freelisthd != listhd) { 212 printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd); 213 vpanic(vp, "list head mismatch"); 214 } 215 if (!mutex_tryenter(&vp->v_interlock)) 216 continue; 217 /* 218 * Our lwp might hold the underlying vnode 219 * locked, so don't try to reclaim a VI_LAYER 220 * node if it's locked. 221 */ 222 if ((vp->v_iflag & VI_XLOCK) == 0 && 223 ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) { 224 break; 225 } 226 mutex_exit(&vp->v_interlock); 227 } 228 229 if (vp == NULL) { 230 if (listhd == &vnode_free_list) { 231 listhd = &vnode_hold_list; 232 goto try_nextlist; 233 } 234 mutex_exit(&vnode_free_list_lock); 235 return NULL; 236 } 237 238 /* Remove it from the freelist. */ 239 TAILQ_REMOVE(listhd, vp, v_freelist); 240 vp->v_freelisthd = NULL; 241 mutex_exit(&vnode_free_list_lock); 242 243 /* 244 * The vnode is still associated with a file system, so we must 245 * clean it out before reusing it. We need to add a reference 246 * before doing this. If the vnode gains another reference while 247 * being cleaned out then we lose - retry. 248 */ 249 vp->v_usecount++; 250 vclean(vp, DOCLOSE); 251 if (vp->v_usecount == 1) { 252 /* We're about to dirty it. */ 253 vp->v_iflag &= ~VI_CLEAN; 254 mutex_exit(&vp->v_interlock); 255 if (vp->v_type == VBLK || vp->v_type == VCHR) { 256 spec_node_destroy(vp); 257 } 258 vp->v_type = VNON; 259 } else { 260 /* 261 * Don't return to freelist - the holder of the last 262 * reference will destroy it. 263 */ 264 KASSERT(vp->v_usecount > 1); 265 vp->v_usecount--; 266 mutex_exit(&vp->v_interlock); 267 mutex_enter(&vnode_free_list_lock); 268 goto retry; 269 } 270 271 if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 || 272 !TAILQ_EMPTY(&vp->v_uobj.memq)) { 273 vpanic(vp, "cleaned vnode isn't"); 274 } 275 if (vp->v_numoutput != 0) { 276 vpanic(vp, "clean vnode has pending I/O's"); 277 } 278 if ((vp->v_iflag & VI_ONWORKLST) != 0) { 279 vpanic(vp, "clean vnode on syncer list"); 280 } 281 282 return vp; 283 } 284 285 /* 286 * Mark a mount point as busy, and gain a new reference to it. Used to 287 * prevent the file system from being unmounted during critical sections. 288 * 289 * => The caller must hold a pre-existing reference to the mount. 290 * => Will fail if the file system is being unmounted, or is unmounted. 
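 *
 * A typical caller walks the mount list roughly as the sysctl and unmount
 * code later in this file does; the sketch below is illustrative only.
 * When vfs_busy() succeeds with nextp != NULL it drops mountlist_lock,
 * and vfs_unbusy() reacquires it and hands back the next mount to visit:
 *
 *	mutex_enter(&mountlist_lock);
 *	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
 *	    mp = nmp) {
 *		if (vfs_busy(mp, &nmp) != 0)
 *			continue;
 *		... operate on the busied mount ...
 *		vfs_unbusy(mp, false, &nmp);
 *	}
 *	mutex_exit(&mountlist_lock);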
291 */ 292 int 293 vfs_busy(struct mount *mp, struct mount **nextp) 294 { 295 296 KASSERT(mp->mnt_refcnt > 0); 297 298 if (__predict_false(!rw_tryenter(&mp->mnt_unmounting, RW_READER))) { 299 if (nextp != NULL) { 300 KASSERT(mutex_owned(&mountlist_lock)); 301 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 302 } 303 return EBUSY; 304 } 305 if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) { 306 rw_exit(&mp->mnt_unmounting); 307 if (nextp != NULL) { 308 KASSERT(mutex_owned(&mountlist_lock)); 309 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 310 } 311 return ENOENT; 312 } 313 if (nextp != NULL) { 314 mutex_exit(&mountlist_lock); 315 } 316 atomic_inc_uint(&mp->mnt_refcnt); 317 return 0; 318 } 319 320 /* 321 * Unbusy a busy filesystem. 322 * 323 * => If keepref is true, preserve reference added by vfs_busy(). 324 * => If nextp != NULL, acquire mountlist_lock. 325 */ 326 void 327 vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp) 328 { 329 330 KASSERT(mp->mnt_refcnt > 0); 331 332 if (nextp != NULL) { 333 mutex_enter(&mountlist_lock); 334 } 335 rw_exit(&mp->mnt_unmounting); 336 if (!keepref) { 337 vfs_destroy(mp); 338 } 339 if (nextp != NULL) { 340 KASSERT(mutex_owned(&mountlist_lock)); 341 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 342 } 343 } 344 345 /* 346 * Lookup a filesystem type, and if found allocate and initialize 347 * a mount structure for it. 348 * 349 * Devname is usually updated by mount(8) after booting. 350 */ 351 int 352 vfs_rootmountalloc(const char *fstypename, const char *devname, 353 struct mount **mpp) 354 { 355 struct vfsops *vfsp = NULL; 356 struct mount *mp; 357 358 mutex_enter(&vfs_list_lock); 359 LIST_FOREACH(vfsp, &vfs_list, vfs_list) 360 if (!strncmp(vfsp->vfs_name, fstypename, 361 sizeof(mp->mnt_stat.f_fstypename))) 362 break; 363 if (vfsp == NULL) { 364 mutex_exit(&vfs_list_lock); 365 return (ENODEV); 366 } 367 vfsp->vfs_refcount++; 368 mutex_exit(&vfs_list_lock); 369 370 mp = kmem_zalloc(sizeof(*mp), KM_SLEEP); 371 if (mp == NULL) 372 return ENOMEM; 373 mp->mnt_refcnt = 1; 374 rw_init(&mp->mnt_unmounting); 375 mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE); 376 mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE); 377 (void)vfs_busy(mp, NULL); 378 TAILQ_INIT(&mp->mnt_vnodelist); 379 mp->mnt_op = vfsp; 380 mp->mnt_flag = MNT_RDONLY; 381 mp->mnt_vnodecovered = NULL; 382 (void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, 383 sizeof(mp->mnt_stat.f_fstypename)); 384 mp->mnt_stat.f_mntonname[0] = '/'; 385 mp->mnt_stat.f_mntonname[1] = '\0'; 386 mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] = 387 '\0'; 388 (void)copystr(devname, mp->mnt_stat.f_mntfromname, 389 sizeof(mp->mnt_stat.f_mntfromname) - 1, 0); 390 mount_initspecific(mp); 391 *mpp = mp; 392 return (0); 393 } 394 395 /* 396 * Routines having to do with the management of the vnode table. 397 */ 398 extern int (**dead_vnodeop_p)(void *); 399 400 /* 401 * Return the next vnode from the free list. 402 */ 403 int 404 getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *), 405 vnode_t **vpp) 406 { 407 struct uvm_object *uobj; 408 static int toggle; 409 vnode_t *vp; 410 int error = 0, tryalloc; 411 412 try_again: 413 if (mp != NULL) { 414 /* 415 * Mark filesystem busy while we're creating a 416 * vnode. If unmount is in progress, this will 417 * fail. 418 */ 419 error = vfs_busy(mp, NULL); 420 if (error) 421 return error; 422 } 423 424 /* 425 * We must choose whether to allocate a new vnode or recycle an 426 * existing one. 
The criterion for allocating a new one is that 427 * the total number of vnodes is less than the number desired or 428 * there are no vnodes on either free list. Generally we only 429 * want to recycle vnodes that have no buffers associated with 430 * them, so we look first on the vnode_free_list. If it is empty, 431 * we next consider vnodes with referencing buffers on the 432 * vnode_hold_list. The toggle ensures that half the time we 433 * will use a buffer from the vnode_hold_list, and half the time 434 * we will allocate a new one unless the list has grown to twice 435 * the desired size. We are reticent to recycle vnodes from the 436 * vnode_hold_list because we will lose the identity of all its 437 * referencing buffers. 438 */ 439 440 vp = NULL; 441 442 mutex_enter(&vnode_free_list_lock); 443 444 toggle ^= 1; 445 if (numvnodes > 2 * desiredvnodes) 446 toggle = 0; 447 448 tryalloc = numvnodes < desiredvnodes || 449 (TAILQ_FIRST(&vnode_free_list) == NULL && 450 (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle)); 451 452 if (tryalloc) { 453 numvnodes++; 454 mutex_exit(&vnode_free_list_lock); 455 if ((vp = vnalloc(NULL)) == NULL) { 456 mutex_enter(&vnode_free_list_lock); 457 numvnodes--; 458 } else 459 vp->v_usecount = 1; 460 } 461 462 if (vp == NULL) { 463 vp = getcleanvnode(); 464 if (vp == NULL) { 465 if (mp != NULL) { 466 vfs_unbusy(mp, false, NULL); 467 } 468 if (tryalloc) { 469 printf("WARNING: unable to allocate new " 470 "vnode, retrying...\n"); 471 kpause("newvn", false, hz, NULL); 472 goto try_again; 473 } 474 tablefull("vnode", "increase kern.maxvnodes or NVNODE"); 475 *vpp = 0; 476 return (ENFILE); 477 } 478 vp->v_iflag = 0; 479 vp->v_vflag = 0; 480 vp->v_uflag = 0; 481 vp->v_socket = NULL; 482 } 483 484 KASSERT(vp->v_usecount == 1); 485 KASSERT(vp->v_freelisthd == NULL); 486 KASSERT(LIST_EMPTY(&vp->v_nclist)); 487 KASSERT(LIST_EMPTY(&vp->v_dnclist)); 488 489 vp->v_type = VNON; 490 vp->v_vnlock = &vp->v_lock; 491 vp->v_tag = tag; 492 vp->v_op = vops; 493 insmntque(vp, mp); 494 *vpp = vp; 495 vp->v_data = 0; 496 497 /* 498 * initialize uvm_object within vnode. 499 */ 500 501 uobj = &vp->v_uobj; 502 KASSERT(uobj->pgops == &uvm_vnodeops); 503 KASSERT(uobj->uo_npages == 0); 504 KASSERT(TAILQ_FIRST(&uobj->memq) == NULL); 505 vp->v_size = vp->v_writesize = VSIZENOTSET; 506 507 if (mp != NULL) { 508 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 509 vp->v_vflag |= VV_MPSAFE; 510 vfs_unbusy(mp, true, NULL); 511 } 512 513 return (0); 514 } 515 516 /* 517 * This is really just the reverse of getnewvnode(). Needed for 518 * VFS_VGET functions who may need to push back a vnode in case 519 * of a locking race. 520 */ 521 void 522 ungetnewvnode(vnode_t *vp) 523 { 524 525 KASSERT(vp->v_usecount == 1); 526 KASSERT(vp->v_data == NULL); 527 KASSERT(vp->v_freelisthd == NULL); 528 529 mutex_enter(&vp->v_interlock); 530 vp->v_iflag |= VI_CLEAN; 531 vrelel(vp, 0); 532 } 533 534 /* 535 * Allocate a new, uninitialized vnode. If 'mp' is non-NULL, this is a 536 * marker vnode and we are prepared to wait for the allocation. 537 */ 538 vnode_t * 539 vnalloc(struct mount *mp) 540 { 541 vnode_t *vp; 542 543 vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT)); 544 if (vp == NULL) { 545 return NULL; 546 } 547 548 memset(vp, 0, sizeof(*vp)); 549 UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0); 550 cv_init(&vp->v_cv, "vnode"); 551 /* 552 * done by memset() above. 
553 * LIST_INIT(&vp->v_nclist); 554 * LIST_INIT(&vp->v_dnclist); 555 */ 556 557 if (mp != NULL) { 558 vp->v_mount = mp; 559 vp->v_type = VBAD; 560 vp->v_iflag = VI_MARKER; 561 } else { 562 rw_init(&vp->v_lock.vl_lock); 563 } 564 565 return vp; 566 } 567 568 /* 569 * Free an unused, unreferenced vnode. 570 */ 571 void 572 vnfree(vnode_t *vp) 573 { 574 575 KASSERT(vp->v_usecount == 0); 576 577 if ((vp->v_iflag & VI_MARKER) == 0) { 578 rw_destroy(&vp->v_lock.vl_lock); 579 mutex_enter(&vnode_free_list_lock); 580 numvnodes--; 581 mutex_exit(&vnode_free_list_lock); 582 } 583 584 UVM_OBJ_DESTROY(&vp->v_uobj); 585 cv_destroy(&vp->v_cv); 586 pool_cache_put(vnode_cache, vp); 587 } 588 589 /* 590 * Remove a vnode from its freelist. 591 */ 592 static inline void 593 vremfree(vnode_t *vp) 594 { 595 596 KASSERT(mutex_owned(&vp->v_interlock)); 597 KASSERT(vp->v_usecount == 0); 598 599 /* 600 * Note that the reference count must not change until 601 * the vnode is removed. 602 */ 603 mutex_enter(&vnode_free_list_lock); 604 if (vp->v_holdcnt > 0) { 605 KASSERT(vp->v_freelisthd == &vnode_hold_list); 606 } else { 607 KASSERT(vp->v_freelisthd == &vnode_free_list); 608 } 609 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 610 vp->v_freelisthd = NULL; 611 mutex_exit(&vnode_free_list_lock); 612 } 613 614 /* 615 * Move a vnode from one mount queue to another. 616 */ 617 static void 618 insmntque(vnode_t *vp, struct mount *mp) 619 { 620 struct mount *omp; 621 622 #ifdef DIAGNOSTIC 623 if ((mp != NULL) && 624 (mp->mnt_iflag & IMNT_UNMOUNT) && 625 !(mp->mnt_flag & MNT_SOFTDEP) && 626 vp->v_tag != VT_VFS) { 627 panic("insmntque into dying filesystem"); 628 } 629 #endif 630 631 mutex_enter(&mntvnode_lock); 632 /* 633 * Delete from old mount point vnode list, if on one. 634 */ 635 if ((omp = vp->v_mount) != NULL) 636 TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes); 637 /* 638 * Insert into list of vnodes for the new mount point, if 639 * available. The caller must take a reference on the mount 640 * structure and donate to the vnode. 641 */ 642 if ((vp->v_mount = mp) != NULL) 643 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes); 644 mutex_exit(&mntvnode_lock); 645 646 if (omp != NULL) { 647 /* Release reference to old mount. */ 648 vfs_destroy(omp); 649 } 650 } 651 652 /* 653 * Create a vnode for a block device. 654 * Used for root filesystem and swap areas. 655 * Also used for memory file system special devices. 656 */ 657 int 658 bdevvp(dev_t dev, vnode_t **vpp) 659 { 660 661 return (getdevvp(dev, vpp, VBLK)); 662 } 663 664 /* 665 * Create a vnode for a character device. 666 * Used for kernfs and some console handling. 667 */ 668 int 669 cdevvp(dev_t dev, vnode_t **vpp) 670 { 671 672 return (getdevvp(dev, vpp, VCHR)); 673 } 674 675 /* 676 * Create a vnode for a device. 677 * Used by bdevvp (block device) for root file system etc., 678 * and by cdevvp (character device) for console and kernfs. 679 */ 680 static int 681 getdevvp(dev_t dev, vnode_t **vpp, enum vtype type) 682 { 683 vnode_t *vp; 684 vnode_t *nvp; 685 int error; 686 687 if (dev == NODEV) { 688 *vpp = NULL; 689 return (0); 690 } 691 error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp); 692 if (error) { 693 *vpp = NULL; 694 return (error); 695 } 696 vp = nvp; 697 vp->v_type = type; 698 vp->v_vflag |= VV_MPSAFE; 699 uvm_vnp_setsize(vp, 0); 700 spec_node_init(vp, dev); 701 *vpp = vp; 702 return (0); 703 } 704 705 /* 706 * Grab a particular vnode from the free list, increment its 707 * reference count and lock it. 
If the vnode lock bit is set the 708 * vnode is being eliminated in vgone. In that case, we can not 709 * grab the vnode, so the process is awakened when the transition is 710 * completed, and an error returned to indicate that the vnode is no 711 * longer usable (possibly having been changed to a new file system type). 712 */ 713 int 714 vget(vnode_t *vp, int flags) 715 { 716 int error; 717 718 KASSERT((vp->v_iflag & VI_MARKER) == 0); 719 720 if ((flags & LK_INTERLOCK) == 0) 721 mutex_enter(&vp->v_interlock); 722 723 /* 724 * Before adding a reference, we must remove the vnode 725 * from its freelist. 726 */ 727 if (vp->v_usecount == 0) { 728 vremfree(vp); 729 } 730 if (++vp->v_usecount == 0) { 731 vpanic(vp, "vget: usecount overflow"); 732 } 733 734 /* 735 * If the vnode is in the process of being cleaned out for 736 * another use, we wait for the cleaning to finish and then 737 * return failure. Cleaning is determined by checking if 738 * the VI_XLOCK or VI_FREEING flags are set. 739 */ 740 if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) { 741 if ((flags & LK_NOWAIT) != 0) { 742 vrelel(vp, 0); 743 return EBUSY; 744 } 745 vwait(vp, VI_XLOCK | VI_FREEING); 746 vrelel(vp, 0); 747 return ENOENT; 748 } 749 if (flags & LK_TYPE_MASK) { 750 error = vn_lock(vp, flags | LK_INTERLOCK); 751 if (error != 0) { 752 vrele(vp); 753 } 754 return error; 755 } 756 mutex_exit(&vp->v_interlock); 757 return 0; 758 } 759 760 /* 761 * vput(), just unlock and vrele() 762 */ 763 void 764 vput(vnode_t *vp) 765 { 766 767 KASSERT((vp->v_iflag & VI_MARKER) == 0); 768 769 VOP_UNLOCK(vp, 0); 770 vrele(vp); 771 } 772 773 /* 774 * Vnode release. If reference count drops to zero, call inactive 775 * routine and either return to freelist or free to the pool. 776 */ 777 void 778 vrelel(vnode_t *vp, int flags) 779 { 780 bool recycle, defer; 781 int error; 782 783 KASSERT(mutex_owned(&vp->v_interlock)); 784 KASSERT((vp->v_iflag & VI_MARKER) == 0); 785 KASSERT(vp->v_freelisthd == NULL); 786 787 if (vp->v_op == dead_vnodeop_p && (vp->v_iflag & VI_CLEAN) == 0) { 788 vpanic(vp, "dead but not clean"); 789 } 790 791 /* 792 * If not the last reference, just drop the reference count 793 * and unlock. 794 */ 795 if (vp->v_usecount > 1) { 796 vp->v_usecount--; 797 vp->v_iflag |= VI_INACTREDO; 798 mutex_exit(&vp->v_interlock); 799 return; 800 } 801 if (vp->v_usecount <= 0 || vp->v_writecount != 0) { 802 vpanic(vp, "vput: bad ref count"); 803 } 804 805 /* 806 * If not clean, deactivate the vnode, but preserve 807 * our reference across the call to VOP_INACTIVE(). 808 */ 809 retry: 810 if ((vp->v_iflag & VI_CLEAN) == 0) { 811 recycle = false; 812 /* 813 * XXX This ugly block can be largely eliminated if 814 * locking is pushed down into the file systems. 815 */ 816 if (curlwp == uvm.pagedaemon_lwp) { 817 /* The pagedaemon can't wait around; defer. */ 818 defer = true; 819 } else if (curlwp == vrele_lwp) { 820 /* We have to try harder. */ 821 vp->v_iflag &= ~VI_INACTREDO; 822 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | 823 LK_RETRY); 824 if (error != 0) { 825 /* XXX */ 826 vpanic(vp, "vrele: unable to lock %p"); 827 } 828 defer = false; 829 } else if ((vp->v_iflag & VI_LAYER) != 0) { 830 /* 831 * Acquiring the stack's lock in vclean() even 832 * for an honest vput/vrele is dangerous because 833 * our caller may hold other vnode locks; defer. 834 */ 835 defer = true; 836 } else { 837 /* If we can't acquire the lock, then defer. 
*/ 838 vp->v_iflag &= ~VI_INACTREDO; 839 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | 840 LK_NOWAIT); 841 if (error != 0) { 842 defer = true; 843 mutex_enter(&vp->v_interlock); 844 } else { 845 defer = false; 846 } 847 } 848 849 if (defer) { 850 /* 851 * Defer reclaim to the kthread; it's not safe to 852 * clean it here. We donate it our last reference. 853 */ 854 KASSERT(mutex_owned(&vp->v_interlock)); 855 KASSERT((vp->v_iflag & VI_INACTPEND) == 0); 856 vp->v_iflag |= VI_INACTPEND; 857 mutex_enter(&vrele_lock); 858 TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist); 859 if (++vrele_pending > (desiredvnodes >> 8)) 860 cv_signal(&vrele_cv); 861 mutex_exit(&vrele_lock); 862 mutex_exit(&vp->v_interlock); 863 return; 864 } 865 866 #ifdef DIAGNOSTIC 867 if ((vp->v_type == VBLK || vp->v_type == VCHR) && 868 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) { 869 vprint("vrelel: missing VOP_CLOSE()", vp); 870 } 871 #endif 872 873 /* 874 * The vnode can gain another reference while being 875 * deactivated. If VOP_INACTIVE() indicates that 876 * the described file has been deleted, then recycle 877 * the vnode irrespective of additional references. 878 * Another thread may be waiting to re-use the on-disk 879 * inode. 880 * 881 * Note that VOP_INACTIVE() will drop the vnode lock. 882 */ 883 VOP_INACTIVE(vp, &recycle); 884 mutex_enter(&vp->v_interlock); 885 if (!recycle) { 886 if (vp->v_usecount > 1) { 887 vp->v_usecount--; 888 mutex_exit(&vp->v_interlock); 889 return; 890 } 891 892 /* 893 * If we grew another reference while 894 * VOP_INACTIVE() was underway, retry. 895 */ 896 if ((vp->v_iflag & VI_INACTREDO) != 0) { 897 goto retry; 898 } 899 } 900 901 /* Take care of space accounting. */ 902 if (vp->v_iflag & VI_EXECMAP) { 903 atomic_add_int(&uvmexp.execpages, 904 -vp->v_uobj.uo_npages); 905 atomic_add_int(&uvmexp.filepages, 906 vp->v_uobj.uo_npages); 907 } 908 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP|VI_MAPPED); 909 vp->v_vflag &= ~VV_MAPPED; 910 911 /* 912 * Recycle the vnode if the file is now unused (unlinked), 913 * otherwise just free it. 914 */ 915 if (recycle) { 916 vclean(vp, DOCLOSE); 917 } 918 KASSERT(vp->v_usecount > 0); 919 } 920 921 if (--vp->v_usecount != 0) { 922 /* Gained another reference while being reclaimed. */ 923 mutex_exit(&vp->v_interlock); 924 return; 925 } 926 927 if ((vp->v_iflag & VI_CLEAN) != 0) { 928 /* 929 * It's clean so destroy it. It isn't referenced 930 * anywhere since it has been reclaimed. 931 */ 932 KASSERT(vp->v_holdcnt == 0); 933 KASSERT(vp->v_writecount == 0); 934 mutex_exit(&vp->v_interlock); 935 insmntque(vp, NULL); 936 if (vp->v_type == VBLK || vp->v_type == VCHR) { 937 spec_node_destroy(vp); 938 } 939 vnfree(vp); 940 } else { 941 /* 942 * Otherwise, put it back onto the freelist. It 943 * can't be destroyed while still associated with 944 * a file system. 
945 */ 946 mutex_enter(&vnode_free_list_lock); 947 if (vp->v_holdcnt > 0) { 948 vp->v_freelisthd = &vnode_hold_list; 949 } else { 950 vp->v_freelisthd = &vnode_free_list; 951 } 952 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 953 mutex_exit(&vnode_free_list_lock); 954 mutex_exit(&vp->v_interlock); 955 } 956 } 957 958 void 959 vrele(vnode_t *vp) 960 { 961 962 KASSERT((vp->v_iflag & VI_MARKER) == 0); 963 964 mutex_enter(&vp->v_interlock); 965 vrelel(vp, 0); 966 } 967 968 static void 969 vrele_thread(void *cookie) 970 { 971 vnode_t *vp; 972 973 for (;;) { 974 mutex_enter(&vrele_lock); 975 while (TAILQ_EMPTY(&vrele_list)) { 976 cv_timedwait(&vrele_cv, &vrele_lock, hz); 977 } 978 vp = TAILQ_FIRST(&vrele_list); 979 TAILQ_REMOVE(&vrele_list, vp, v_freelist); 980 vrele_pending--; 981 mutex_exit(&vrele_lock); 982 983 /* 984 * If not the last reference, then ignore the vnode 985 * and look for more work. 986 */ 987 mutex_enter(&vp->v_interlock); 988 KASSERT((vp->v_iflag & VI_INACTPEND) != 0); 989 vp->v_iflag &= ~VI_INACTPEND; 990 if (vp->v_usecount > 1) { 991 vp->v_usecount--; 992 mutex_exit(&vp->v_interlock); 993 continue; 994 } 995 vrelel(vp, 0); 996 } 997 } 998 999 /* 1000 * Page or buffer structure gets a reference. 1001 * Called with v_interlock held. 1002 */ 1003 void 1004 vholdl(vnode_t *vp) 1005 { 1006 1007 KASSERT(mutex_owned(&vp->v_interlock)); 1008 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1009 1010 if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) { 1011 mutex_enter(&vnode_free_list_lock); 1012 KASSERT(vp->v_freelisthd == &vnode_free_list); 1013 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 1014 vp->v_freelisthd = &vnode_hold_list; 1015 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 1016 mutex_exit(&vnode_free_list_lock); 1017 } 1018 } 1019 1020 /* 1021 * Page or buffer structure frees a reference. 1022 * Called with v_interlock held. 1023 */ 1024 void 1025 holdrelel(vnode_t *vp) 1026 { 1027 1028 KASSERT(mutex_owned(&vp->v_interlock)); 1029 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1030 1031 if (vp->v_holdcnt <= 0) { 1032 vpanic(vp, "holdrelel: holdcnt vp %p"); 1033 } 1034 1035 vp->v_holdcnt--; 1036 if (vp->v_holdcnt == 0 && vp->v_usecount == 0) { 1037 mutex_enter(&vnode_free_list_lock); 1038 KASSERT(vp->v_freelisthd == &vnode_hold_list); 1039 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 1040 vp->v_freelisthd = &vnode_free_list; 1041 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 1042 mutex_exit(&vnode_free_list_lock); 1043 } 1044 } 1045 1046 /* 1047 * Vnode reference, where a reference is already held by some other 1048 * object (for example, a file structure). 1049 */ 1050 void 1051 vref(vnode_t *vp) 1052 { 1053 1054 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1055 1056 mutex_enter(&vp->v_interlock); 1057 if (vp->v_usecount <= 0) { 1058 vpanic(vp, "vref used where vget required"); 1059 } 1060 if (++vp->v_usecount == 0) { 1061 vpanic(vp, "vref: usecount overflow"); 1062 } 1063 mutex_exit(&vp->v_interlock); 1064 } 1065 1066 /* 1067 * Remove any vnodes in the vnode table belonging to mount point mp. 1068 * 1069 * If FORCECLOSE is not specified, there should not be any active ones, 1070 * return error if any are found (nb: this is a user error, not a 1071 * system error). If FORCECLOSE is specified, detach any active vnodes 1072 * that are found. 1073 * 1074 * If WRITECLOSE is set, only flush out regular file vnodes open for 1075 * writing. 1076 * 1077 * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped. 
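 *
 * A file system's unmount path would typically drive this function along
 * the following lines (an illustrative sketch only; "xxx_unmount" and its
 * locals are hypothetical, not part of this file):
 *
 *	int
 *	xxx_unmount(struct mount *mp, int mntflags)
 *	{
 *		int flags = 0, error;
 *
 *		if (mntflags & MNT_FORCE)
 *			flags |= FORCECLOSE;
 *		error = vflush(mp, NULL, flags);
 *		if (error != 0)
 *			return error;
 *		... tear down file-system private state ...
 *		return 0;
 *	}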
1078 */ 1079 #ifdef DEBUG 1080 int busyprt = 0; /* print out busy vnodes */ 1081 struct ctldebug debug1 = { "busyprt", &busyprt }; 1082 #endif 1083 1084 static vnode_t * 1085 vflushnext(vnode_t *mvp, int *when) 1086 { 1087 1088 if (hardclock_ticks > *when) { 1089 mutex_exit(&mntvnode_lock); 1090 yield(); 1091 mutex_enter(&mntvnode_lock); 1092 *when = hardclock_ticks + hz / 10; 1093 } 1094 1095 return vunmark(mvp); 1096 } 1097 1098 int 1099 vflush(struct mount *mp, vnode_t *skipvp, int flags) 1100 { 1101 vnode_t *vp, *mvp; 1102 int busy = 0, when = 0; 1103 1104 /* Allocate a marker vnode. */ 1105 if ((mvp = vnalloc(mp)) == NULL) 1106 return (ENOMEM); 1107 1108 mutex_enter(&mntvnode_lock); 1109 /* 1110 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() 1111 * and vclean() are called 1112 */ 1113 for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL; 1114 vp = vflushnext(mvp, &when)) { 1115 vmark(mvp, vp); 1116 if (vp->v_mount != mp || vismarker(vp)) 1117 continue; 1118 /* 1119 * Skip over a selected vnode. 1120 */ 1121 if (vp == skipvp) 1122 continue; 1123 mutex_enter(&vp->v_interlock); 1124 /* 1125 * Ignore clean but still referenced vnodes. 1126 */ 1127 if ((vp->v_iflag & VI_CLEAN) != 0) { 1128 mutex_exit(&vp->v_interlock); 1129 continue; 1130 } 1131 /* 1132 * Skip over a vnodes marked VSYSTEM. 1133 */ 1134 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 1135 mutex_exit(&vp->v_interlock); 1136 continue; 1137 } 1138 /* 1139 * If WRITECLOSE is set, only flush out regular file 1140 * vnodes open for writing. 1141 */ 1142 if ((flags & WRITECLOSE) && 1143 (vp->v_writecount == 0 || vp->v_type != VREG)) { 1144 mutex_exit(&vp->v_interlock); 1145 continue; 1146 } 1147 /* 1148 * With v_usecount == 0, all we need to do is clear 1149 * out the vnode data structures and we are done. 1150 */ 1151 if (vp->v_usecount == 0) { 1152 mutex_exit(&mntvnode_lock); 1153 vremfree(vp); 1154 vp->v_usecount++; 1155 vclean(vp, DOCLOSE); 1156 vrelel(vp, 0); 1157 mutex_enter(&mntvnode_lock); 1158 continue; 1159 } 1160 /* 1161 * If FORCECLOSE is set, forcibly close the vnode. 1162 * For block or character devices, revert to an 1163 * anonymous device. For all other files, just 1164 * kill them. 1165 */ 1166 if (flags & FORCECLOSE) { 1167 mutex_exit(&mntvnode_lock); 1168 vp->v_usecount++; 1169 if (vp->v_type != VBLK && vp->v_type != VCHR) { 1170 vclean(vp, DOCLOSE); 1171 vrelel(vp, 0); 1172 } else { 1173 vclean(vp, 0); 1174 vp->v_op = spec_vnodeop_p; /* XXXSMP */ 1175 mutex_exit(&vp->v_interlock); 1176 /* 1177 * The vnode isn't clean, but still resides 1178 * on the mount list. Remove it. XXX This 1179 * is a bit dodgy. 1180 */ 1181 insmntque(vp, NULL); 1182 vrele(vp); 1183 } 1184 mutex_enter(&mntvnode_lock); 1185 continue; 1186 } 1187 #ifdef DEBUG 1188 if (busyprt) 1189 vprint("vflush: busy vnode", vp); 1190 #endif 1191 mutex_exit(&vp->v_interlock); 1192 busy++; 1193 } 1194 mutex_exit(&mntvnode_lock); 1195 vnfree(mvp); 1196 if (busy) 1197 return (EBUSY); 1198 return (0); 1199 } 1200 1201 /* 1202 * Disassociate the underlying file system from a vnode. 1203 * 1204 * Must be called with the interlock held, and will return with it held. 1205 */ 1206 void 1207 vclean(vnode_t *vp, int flags) 1208 { 1209 lwp_t *l = curlwp; 1210 bool recycle, active; 1211 int error; 1212 1213 KASSERT(mutex_owned(&vp->v_interlock)); 1214 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1215 KASSERT(vp->v_usecount != 0); 1216 1217 /* If cleaning is already in progress wait until done and return. 
*/ 1218 if (vp->v_iflag & VI_XLOCK) { 1219 vwait(vp, VI_XLOCK); 1220 return; 1221 } 1222 1223 /* If already clean, nothing to do. */ 1224 if ((vp->v_iflag & VI_CLEAN) != 0) { 1225 return; 1226 } 1227 1228 /* 1229 * Prevent the vnode from being recycled or brought into use 1230 * while we clean it out. 1231 */ 1232 vp->v_iflag |= VI_XLOCK; 1233 if (vp->v_iflag & VI_EXECMAP) { 1234 atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages); 1235 atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages); 1236 } 1237 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP); 1238 active = (vp->v_usecount > 1); 1239 1240 /* XXXAD should not lock vnode under layer */ 1241 VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK); 1242 1243 /* 1244 * Clean out any cached data associated with the vnode. 1245 * If purging an active vnode, it must be closed and 1246 * deactivated before being reclaimed. Note that the 1247 * VOP_INACTIVE will unlock the vnode. 1248 */ 1249 if (flags & DOCLOSE) { 1250 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); 1251 if (error != 0) 1252 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0); 1253 KASSERT(error == 0); 1254 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1255 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) { 1256 spec_node_revoke(vp); 1257 } 1258 } 1259 if (active) { 1260 VOP_INACTIVE(vp, &recycle); 1261 } else { 1262 /* 1263 * Any other processes trying to obtain this lock must first 1264 * wait for VI_XLOCK to clear, then call the new lock operation. 1265 */ 1266 VOP_UNLOCK(vp, 0); 1267 } 1268 1269 /* Disassociate the underlying file system from the vnode. */ 1270 if (VOP_RECLAIM(vp)) { 1271 vpanic(vp, "vclean: cannot reclaim"); 1272 } 1273 1274 KASSERT(vp->v_uobj.uo_npages == 0); 1275 if (vp->v_type == VREG && vp->v_ractx != NULL) { 1276 uvm_ra_freectx(vp->v_ractx); 1277 vp->v_ractx = NULL; 1278 } 1279 cache_purge(vp); 1280 1281 /* Done with purge, notify sleepers of the grim news. */ 1282 vp->v_op = dead_vnodeop_p; 1283 vp->v_tag = VT_NON; 1284 mutex_enter(&vp->v_interlock); 1285 vp->v_vnlock = &vp->v_lock; 1286 KNOTE(&vp->v_klist, NOTE_REVOKE); 1287 vp->v_iflag &= ~(VI_XLOCK | VI_FREEING); 1288 vp->v_vflag &= ~VV_LOCKSWORK; 1289 if ((flags & DOCLOSE) != 0) { 1290 vp->v_iflag |= VI_CLEAN; 1291 } 1292 cv_broadcast(&vp->v_cv); 1293 1294 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1295 } 1296 1297 /* 1298 * Recycle an unused vnode to the front of the free list. 1299 * Release the passed interlock if the vnode will be recycled. 1300 */ 1301 int 1302 vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l) 1303 { 1304 1305 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1306 1307 mutex_enter(&vp->v_interlock); 1308 if (vp->v_usecount != 0) { 1309 mutex_exit(&vp->v_interlock); 1310 return (0); 1311 } 1312 if (inter_lkp) 1313 mutex_exit(inter_lkp); 1314 vremfree(vp); 1315 vp->v_usecount++; 1316 vclean(vp, DOCLOSE); 1317 vrelel(vp, 0); 1318 return (1); 1319 } 1320 1321 /* 1322 * Eliminate all activity associated with a vnode in preparation for 1323 * reuse. Drops a reference from the vnode. 1324 */ 1325 void 1326 vgone(vnode_t *vp) 1327 { 1328 1329 mutex_enter(&vp->v_interlock); 1330 vclean(vp, DOCLOSE); 1331 vrelel(vp, 0); 1332 } 1333 1334 /* 1335 * Lookup a vnode by device number. 
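 *
 * On success the vnode is returned in *vpp without gaining a new
 * reference; a hypothetical caller looks like:
 *
 *	vnode_t *vp;
 *
 *	if (vfinddev(dev, VBLK, &vp) != 0)
 *		... vp names the block-device vnode for dev ...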
1336 */ 1337 int 1338 vfinddev(dev_t dev, enum vtype type, vnode_t **vpp) 1339 { 1340 vnode_t *vp; 1341 int rc = 0; 1342 1343 mutex_enter(&specfs_lock); 1344 for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) { 1345 if (dev != vp->v_rdev || type != vp->v_type) 1346 continue; 1347 *vpp = vp; 1348 rc = 1; 1349 break; 1350 } 1351 mutex_exit(&specfs_lock); 1352 return (rc); 1353 } 1354 1355 /* 1356 * Revoke all the vnodes corresponding to the specified minor number 1357 * range (endpoints inclusive) of the specified major. 1358 */ 1359 void 1360 vdevgone(int maj, int minl, int minh, enum vtype type) 1361 { 1362 vnode_t *vp, **vpp; 1363 dev_t dev; 1364 int mn; 1365 1366 vp = NULL; /* XXX gcc */ 1367 1368 mutex_enter(&specfs_lock); 1369 for (mn = minl; mn <= minh; mn++) { 1370 dev = makedev(maj, mn); 1371 vpp = &specfs_hash[SPECHASH(dev)]; 1372 for (vp = *vpp; vp != NULL;) { 1373 mutex_enter(&vp->v_interlock); 1374 if ((vp->v_iflag & VI_CLEAN) != 0 || 1375 dev != vp->v_rdev || type != vp->v_type) { 1376 mutex_exit(&vp->v_interlock); 1377 vp = vp->v_specnext; 1378 continue; 1379 } 1380 mutex_exit(&specfs_lock); 1381 if (vget(vp, LK_INTERLOCK) == 0) { 1382 VOP_REVOKE(vp, REVOKEALL); 1383 vrele(vp); 1384 } 1385 mutex_enter(&specfs_lock); 1386 vp = *vpp; 1387 } 1388 } 1389 mutex_exit(&specfs_lock); 1390 } 1391 1392 /* 1393 * Calculate the total number of references to a special device. 1394 */ 1395 int 1396 vcount(vnode_t *vp) 1397 { 1398 int count; 1399 1400 mutex_enter(&specfs_lock); 1401 mutex_enter(&vp->v_interlock); 1402 if (vp->v_specnode == NULL) { 1403 count = vp->v_usecount - ((vp->v_iflag & VI_INACTPEND) != 0); 1404 mutex_exit(&vp->v_interlock); 1405 mutex_exit(&specfs_lock); 1406 return (count); 1407 } 1408 mutex_exit(&vp->v_interlock); 1409 count = vp->v_specnode->sn_dev->sd_opencnt; 1410 mutex_exit(&specfs_lock); 1411 return (count); 1412 } 1413 1414 /* 1415 * Eliminate all activity associated with the requested vnode 1416 * and with all vnodes aliased to the requested vnode. 1417 */ 1418 void 1419 vrevoke(vnode_t *vp) 1420 { 1421 vnode_t *vq, **vpp; 1422 enum vtype type; 1423 dev_t dev; 1424 1425 KASSERT(vp->v_usecount > 0); 1426 1427 mutex_enter(&vp->v_interlock); 1428 if ((vp->v_iflag & VI_CLEAN) != 0) { 1429 mutex_exit(&vp->v_interlock); 1430 return; 1431 } else { 1432 dev = vp->v_rdev; 1433 type = vp->v_type; 1434 mutex_exit(&vp->v_interlock); 1435 } 1436 1437 vpp = &specfs_hash[SPECHASH(dev)]; 1438 mutex_enter(&specfs_lock); 1439 for (vq = *vpp; vq != NULL;) { 1440 /* If clean or being cleaned, then ignore it. 
*/ 1441 mutex_enter(&vq->v_interlock); 1442 if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 || 1443 vq->v_rdev != dev || vq->v_type != type) { 1444 mutex_exit(&vq->v_interlock); 1445 vq = vq->v_specnext; 1446 continue; 1447 } 1448 mutex_exit(&specfs_lock); 1449 if (vq->v_usecount == 0) { 1450 vremfree(vq); 1451 } 1452 vq->v_usecount++; 1453 vclean(vq, DOCLOSE); 1454 vrelel(vq, 0); 1455 mutex_enter(&specfs_lock); 1456 vq = *vpp; 1457 } 1458 mutex_exit(&specfs_lock); 1459 } 1460 1461 /* 1462 * sysctl helper routine to return list of supported fstypes 1463 */ 1464 static int 1465 sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS) 1466 { 1467 char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)]; 1468 char *where = oldp; 1469 struct vfsops *v; 1470 size_t needed, left, slen; 1471 int error, first; 1472 1473 if (newp != NULL) 1474 return (EPERM); 1475 if (namelen != 0) 1476 return (EINVAL); 1477 1478 first = 1; 1479 error = 0; 1480 needed = 0; 1481 left = *oldlenp; 1482 1483 sysctl_unlock(); 1484 mutex_enter(&vfs_list_lock); 1485 LIST_FOREACH(v, &vfs_list, vfs_list) { 1486 if (where == NULL) 1487 needed += strlen(v->vfs_name) + 1; 1488 else { 1489 memset(bf, 0, sizeof(bf)); 1490 if (first) { 1491 strncpy(bf, v->vfs_name, sizeof(bf)); 1492 first = 0; 1493 } else { 1494 bf[0] = ' '; 1495 strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1); 1496 } 1497 bf[sizeof(bf)-1] = '\0'; 1498 slen = strlen(bf); 1499 if (left < slen + 1) 1500 break; 1501 /* +1 to copy out the trailing NUL byte */ 1502 v->vfs_refcount++; 1503 mutex_exit(&vfs_list_lock); 1504 error = copyout(bf, where, slen + 1); 1505 mutex_enter(&vfs_list_lock); 1506 v->vfs_refcount--; 1507 if (error) 1508 break; 1509 where += slen; 1510 needed += slen; 1511 left -= slen; 1512 } 1513 } 1514 mutex_exit(&vfs_list_lock); 1515 sysctl_relock(); 1516 *oldlenp = needed; 1517 return (error); 1518 } 1519 1520 /* 1521 * Top level filesystem related information gathering. 1522 */ 1523 SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup") 1524 { 1525 sysctl_createv(clog, 0, NULL, NULL, 1526 CTLFLAG_PERMANENT, 1527 CTLTYPE_NODE, "vfs", NULL, 1528 NULL, 0, NULL, 0, 1529 CTL_VFS, CTL_EOL); 1530 sysctl_createv(clog, 0, NULL, NULL, 1531 CTLFLAG_PERMANENT, 1532 CTLTYPE_NODE, "generic", 1533 SYSCTL_DESCR("Non-specific vfs related information"), 1534 NULL, 0, NULL, 0, 1535 CTL_VFS, VFS_GENERIC, CTL_EOL); 1536 sysctl_createv(clog, 0, NULL, NULL, 1537 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1538 CTLTYPE_INT, "usermount", 1539 SYSCTL_DESCR("Whether unprivileged users may mount " 1540 "filesystems"), 1541 NULL, 0, &dovfsusermount, 0, 1542 CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL); 1543 sysctl_createv(clog, 0, NULL, NULL, 1544 CTLFLAG_PERMANENT, 1545 CTLTYPE_STRING, "fstypes", 1546 SYSCTL_DESCR("List of file systems present"), 1547 sysctl_vfs_generic_fstypes, 0, NULL, 0, 1548 CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL); 1549 sysctl_createv(clog, 0, NULL, NULL, 1550 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1551 CTLTYPE_INT, "magiclinks", 1552 SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"), 1553 NULL, 0, &vfs_magiclinks, 0, 1554 CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL); 1555 } 1556 1557 1558 int kinfo_vdebug = 1; 1559 int kinfo_vgetfailed; 1560 #define KINFO_VNODESLOP 10 1561 /* 1562 * Dump vnode list (via sysctl). 1563 * Copyout address of vnode followed by vnode. 
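 *
 * The old data is therefore a packed sequence of records, each a kernel
 * vnode pointer followed by an image of the vnode itself.  A consumer
 * might walk the returned buffer roughly like this (sketch only; "buf"
 * and "len" stand for whatever the sysctl call returned):
 *
 *	size_t reclen = sizeof(vnode_t *) + sizeof(vnode_t);
 *	for (char *p = buf; p + reclen <= buf + len; p += reclen) {
 *		vnode_t *kaddr;
 *		vnode_t vn;
 *
 *		memcpy(&kaddr, p, sizeof(kaddr));
 *		memcpy(&vn, p + sizeof(kaddr), sizeof(vn));
 *		... inspect vn, identified by kernel address kaddr ...
 *	}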
1564 */ 1565 /* ARGSUSED */ 1566 int 1567 sysctl_kern_vnode(SYSCTLFN_ARGS) 1568 { 1569 char *where = oldp; 1570 size_t *sizep = oldlenp; 1571 struct mount *mp, *nmp; 1572 vnode_t *vp, *mvp, vbuf; 1573 char *bp = where, *savebp; 1574 char *ewhere; 1575 int error; 1576 1577 if (namelen != 0) 1578 return (EOPNOTSUPP); 1579 if (newp != NULL) 1580 return (EPERM); 1581 1582 #define VPTRSZ sizeof(vnode_t *) 1583 #define VNODESZ sizeof(vnode_t) 1584 if (where == NULL) { 1585 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ); 1586 return (0); 1587 } 1588 ewhere = where + *sizep; 1589 1590 sysctl_unlock(); 1591 mutex_enter(&mountlist_lock); 1592 for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; 1593 mp = nmp) { 1594 if (vfs_busy(mp, &nmp)) { 1595 continue; 1596 } 1597 savebp = bp; 1598 /* Allocate a marker vnode. */ 1599 if ((mvp = vnalloc(mp)) == NULL) { 1600 sysctl_relock(); 1601 return (ENOMEM); 1602 } 1603 mutex_enter(&mntvnode_lock); 1604 for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) { 1605 vmark(mvp, vp); 1606 /* 1607 * Check that the vp is still associated with 1608 * this filesystem. RACE: could have been 1609 * recycled onto the same filesystem. 1610 */ 1611 if (vp->v_mount != mp || vismarker(vp)) 1612 continue; 1613 if (bp + VPTRSZ + VNODESZ > ewhere) { 1614 (void)vunmark(mvp); 1615 mutex_exit(&mntvnode_lock); 1616 vnfree(mvp); 1617 sysctl_relock(); 1618 *sizep = bp - where; 1619 return (ENOMEM); 1620 } 1621 memcpy(&vbuf, vp, VNODESZ); 1622 mutex_exit(&mntvnode_lock); 1623 if ((error = copyout(vp, bp, VPTRSZ)) || 1624 (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) { 1625 mutex_enter(&mntvnode_lock); 1626 (void)vunmark(mvp); 1627 mutex_exit(&mntvnode_lock); 1628 vnfree(mvp); 1629 sysctl_relock(); 1630 return (error); 1631 } 1632 bp += VPTRSZ + VNODESZ; 1633 mutex_enter(&mntvnode_lock); 1634 } 1635 mutex_exit(&mntvnode_lock); 1636 vnfree(mvp); 1637 vfs_unbusy(mp, false, &nmp); 1638 } 1639 mutex_exit(&mountlist_lock); 1640 sysctl_relock(); 1641 1642 *sizep = bp - where; 1643 return (0); 1644 } 1645 1646 /* 1647 * Remove clean vnodes from a mountpoint's vnode list. 1648 */ 1649 void 1650 vfs_scrubvnlist(struct mount *mp) 1651 { 1652 vnode_t *vp, *nvp; 1653 1654 retry: 1655 mutex_enter(&mntvnode_lock); 1656 for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) { 1657 nvp = TAILQ_NEXT(vp, v_mntvnodes); 1658 mutex_enter(&vp->v_interlock); 1659 if ((vp->v_iflag & VI_CLEAN) != 0) { 1660 TAILQ_REMOVE(&mp->mnt_vnodelist, vp, v_mntvnodes); 1661 vp->v_mount = NULL; 1662 mutex_exit(&mntvnode_lock); 1663 mutex_exit(&vp->v_interlock); 1664 vfs_destroy(mp); 1665 goto retry; 1666 } 1667 mutex_exit(&vp->v_interlock); 1668 } 1669 mutex_exit(&mntvnode_lock); 1670 } 1671 1672 /* 1673 * Check to see if a filesystem is mounted on a block device. 1674 */ 1675 int 1676 vfs_mountedon(vnode_t *vp) 1677 { 1678 vnode_t *vq; 1679 int error = 0; 1680 1681 if (vp->v_type != VBLK) 1682 return ENOTBLK; 1683 if (vp->v_specmountpoint != NULL) 1684 return (EBUSY); 1685 mutex_enter(&specfs_lock); 1686 for (vq = specfs_hash[SPECHASH(vp->v_rdev)]; vq != NULL; 1687 vq = vq->v_specnext) { 1688 if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) 1689 continue; 1690 if (vq->v_specmountpoint != NULL) { 1691 error = EBUSY; 1692 break; 1693 } 1694 } 1695 mutex_exit(&specfs_lock); 1696 return (error); 1697 } 1698 1699 /* 1700 * Unmount all file systems. 1701 * We traverse the list in reverse order under the assumption that doing so 1702 * will avoid needing to worry about dependencies. 
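 * For example, with /usr/obj mounted on top of /usr, the reverse walk
 * unmounts /usr/obj before /usr, since later mounts appear later in
 * mountlist.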
1703 */ 1704 void 1705 vfs_unmountall(struct lwp *l) 1706 { 1707 struct mount *mp, *nmp; 1708 int allerror, error; 1709 1710 printf("unmounting file systems..."); 1711 for (allerror = 0, mp = CIRCLEQ_LAST(&mountlist); 1712 !CIRCLEQ_EMPTY(&mountlist); 1713 mp = nmp) { 1714 nmp = CIRCLEQ_PREV(mp, mnt_list); 1715 #ifdef DEBUG 1716 printf("\nunmounting %s (%s)...", 1717 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname); 1718 #endif 1719 atomic_inc_uint(&mp->mnt_refcnt); 1720 if ((error = dounmount(mp, MNT_FORCE, l)) != 0) { 1721 printf("unmount of %s failed with error %d\n", 1722 mp->mnt_stat.f_mntonname, error); 1723 allerror = 1; 1724 } 1725 } 1726 printf(" done\n"); 1727 if (allerror) 1728 printf("WARNING: some file systems would not unmount\n"); 1729 } 1730 1731 /* 1732 * Sync and unmount file systems before shutting down. 1733 */ 1734 void 1735 vfs_shutdown(void) 1736 { 1737 struct lwp *l; 1738 1739 /* XXX we're certainly not running in lwp0's context! */ 1740 l = curlwp; 1741 if (l == NULL) 1742 l = &lwp0; 1743 1744 printf("syncing disks... "); 1745 1746 /* remove user processes from run queue */ 1747 suspendsched(); 1748 (void) spl0(); 1749 1750 /* avoid coming back this way again if we panic. */ 1751 doing_shutdown = 1; 1752 1753 sys_sync(l, NULL, NULL); 1754 1755 /* Wait for sync to finish. */ 1756 if (buf_syncwait() != 0) { 1757 #if defined(DDB) && defined(DEBUG_HALT_BUSY) 1758 Debugger(); 1759 #endif 1760 printf("giving up\n"); 1761 return; 1762 } else 1763 printf("done\n"); 1764 1765 /* 1766 * If we've panic'd, don't make the situation potentially 1767 * worse by unmounting the file systems. 1768 */ 1769 if (panicstr != NULL) 1770 return; 1771 1772 /* Release inodes held by texts before update. */ 1773 #ifdef notdef 1774 vnshutdown(); 1775 #endif 1776 /* Unmount file systems. */ 1777 vfs_unmountall(l); 1778 } 1779 1780 /* 1781 * Mount the root file system. If the operator didn't specify a 1782 * file system to use, try all possible file systems until one 1783 * succeeds. 1784 */ 1785 int 1786 vfs_mountroot(void) 1787 { 1788 struct vfsops *v; 1789 int error = ENODEV; 1790 1791 if (root_device == NULL) 1792 panic("vfs_mountroot: root device unknown"); 1793 1794 switch (device_class(root_device)) { 1795 case DV_IFNET: 1796 if (rootdev != NODEV) 1797 panic("vfs_mountroot: rootdev set for DV_IFNET " 1798 "(0x%08x -> %d,%d)", rootdev, 1799 major(rootdev), minor(rootdev)); 1800 break; 1801 1802 case DV_DISK: 1803 if (rootdev == NODEV) 1804 panic("vfs_mountroot: rootdev not set for DV_DISK"); 1805 if (bdevvp(rootdev, &rootvp)) 1806 panic("vfs_mountroot: can't get vnode for rootdev"); 1807 error = VOP_OPEN(rootvp, FREAD, FSCRED); 1808 if (error) { 1809 printf("vfs_mountroot: can't open root device\n"); 1810 return (error); 1811 } 1812 break; 1813 1814 default: 1815 printf("%s: inappropriate for root file system\n", 1816 device_xname(root_device)); 1817 return (ENODEV); 1818 } 1819 1820 /* 1821 * If user specified a file system, use it. 1822 */ 1823 if (mountroot != NULL) { 1824 error = (*mountroot)(); 1825 goto done; 1826 } 1827 1828 /* 1829 * Try each file system currently configured into the kernel. 
1830 */ 1831 mutex_enter(&vfs_list_lock); 1832 LIST_FOREACH(v, &vfs_list, vfs_list) { 1833 if (v->vfs_mountroot == NULL) 1834 continue; 1835 #ifdef DEBUG 1836 aprint_normal("mountroot: trying %s...\n", v->vfs_name); 1837 #endif 1838 v->vfs_refcount++; 1839 mutex_exit(&vfs_list_lock); 1840 error = (*v->vfs_mountroot)(); 1841 mutex_enter(&vfs_list_lock); 1842 v->vfs_refcount--; 1843 if (!error) { 1844 aprint_normal("root file system type: %s\n", 1845 v->vfs_name); 1846 break; 1847 } 1848 } 1849 mutex_exit(&vfs_list_lock); 1850 1851 if (v == NULL) { 1852 printf("no file system for %s", device_xname(root_device)); 1853 if (device_class(root_device) == DV_DISK) 1854 printf(" (dev 0x%x)", rootdev); 1855 printf("\n"); 1856 error = EFTYPE; 1857 } 1858 1859 done: 1860 if (error && device_class(root_device) == DV_DISK) { 1861 VOP_CLOSE(rootvp, FREAD, FSCRED); 1862 vrele(rootvp); 1863 } 1864 return (error); 1865 } 1866 1867 /* 1868 * Sham lock manager for vnodes. This is a temporary measure. 1869 */ 1870 int 1871 vlockmgr(struct vnlock *vl, int flags) 1872 { 1873 1874 KASSERT((flags & ~(LK_CANRECURSE | LK_NOWAIT | LK_TYPE_MASK)) == 0); 1875 1876 switch (flags & LK_TYPE_MASK) { 1877 case LK_SHARED: 1878 if (rw_tryenter(&vl->vl_lock, RW_READER)) { 1879 return 0; 1880 } 1881 if ((flags & LK_NOWAIT) != 0) { 1882 return EBUSY; 1883 } 1884 rw_enter(&vl->vl_lock, RW_READER); 1885 return 0; 1886 1887 case LK_EXCLUSIVE: 1888 if (rw_tryenter(&vl->vl_lock, RW_WRITER)) { 1889 return 0; 1890 } 1891 if ((vl->vl_canrecurse || (flags & LK_CANRECURSE) != 0) && 1892 rw_write_held(&vl->vl_lock)) { 1893 vl->vl_recursecnt++; 1894 return 0; 1895 } 1896 if ((flags & LK_NOWAIT) != 0) { 1897 return EBUSY; 1898 } 1899 rw_enter(&vl->vl_lock, RW_WRITER); 1900 return 0; 1901 1902 case LK_RELEASE: 1903 if (vl->vl_recursecnt != 0) { 1904 KASSERT(rw_write_held(&vl->vl_lock)); 1905 vl->vl_recursecnt--; 1906 return 0; 1907 } 1908 rw_exit(&vl->vl_lock); 1909 return 0; 1910 1911 default: 1912 panic("vlockmgr: flags %x", flags); 1913 } 1914 } 1915 1916 int 1917 vlockstatus(struct vnlock *vl) 1918 { 1919 1920 if (rw_write_held(&vl->vl_lock)) { 1921 return LK_EXCLUSIVE; 1922 } 1923 if (rw_read_held(&vl->vl_lock)) { 1924 return LK_SHARED; 1925 } 1926 return 0; 1927 } 1928
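
/*
 * Illustrative sketch of the sham lock manager above (callers normally go
 * through VOP_LOCK()/VOP_UNLOCK() rather than using it directly):
 *
 *	if (vlockmgr(vp->v_vnlock, LK_EXCLUSIVE) == 0) {
 *		KASSERT(vlockstatus(vp->v_vnlock) == LK_EXCLUSIVE);
 *		... the vnode lock is held exclusively ...
 *		vlockmgr(vp->v_vnlock, LK_RELEASE);
 *	}
 */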