/*	$NetBSD: vfs_subr.c,v 1.334 2008/02/15 13:06:02 ad Exp $	*/

/*-
 * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * External virtual filesystem routines.
 *
 * This file contains vfs subroutines which are heavily dependent on
 * the kernel and are not suitable for standalone use.  Examples include
 * routines involving vnode and mountpoint management.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.334 2008/02/15 13:06:02 ad Exp $");

#include "opt_ddb.h"
#include "opt_compat_netbsd.h"
#include "opt_compat_43.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/syscallargs.h>
#include <sys/device.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/kthread.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/syncfs/syncfs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_ddb.h>

#include <sys/sysctl.h>

extern int dovfsusermount;	/* 1 => permit any user to mount filesystems */
extern int vfs_magiclinks;	/* 1 => expand "magic" symlinks */

static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);
static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list);

static int vrele_pending;
static kmutex_t	vrele_lock;
static kcondvar_t vrele_cv;
static lwp_t *vrele_lwp;

static pool_cache_t vnode_cache;

MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");

/*
 * Local declarations.
 */

static void vrele_thread(void *);
static void insmntque(vnode_t *, struct mount *);
static int getdevvp(dev_t, vnode_t **, enum vtype);
static vnode_t *getcleanvnode(void);
void vpanic(vnode_t *, const char *);

#ifdef DIAGNOSTIC
void
vpanic(vnode_t *vp, const char *msg)
{

	vprint(NULL, vp);
	panic("%s\n", msg);
}
#else
#define	vpanic(vp, msg)	/* nothing */
#endif

void
vn_init1(void)
{

	vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl",
	    NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vnode_cache != NULL);
	/* Create deferred release thread. */
	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vrele_cv, "vrele");
	if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
	    NULL, &vrele_lwp, "vrele"))
		panic("fork vrele");
}

int
vfs_drainvnodes(long target, struct lwp *l)
{

	while (numvnodes > target) {
		vnode_t *vp;

		mutex_enter(&vnode_free_list_lock);
		vp = getcleanvnode();
		if (vp == NULL)
			return EBUSY; /* give up */
		ungetnewvnode(vp);
	}

	return 0;
}

/*
 * Grab a vnode from the freelist and clean it.
 */
vnode_t *
getcleanvnode(void)
{
	vnode_t *vp;
	vnodelst_t *listhd;

	KASSERT(mutex_owned(&vnode_free_list_lock));

retry:
	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		/*
		 * It's safe to test v_usecount and v_iflag
		 * without holding the interlock here, since
		 * these vnodes should never appear on the
		 * lists.
		 */
		if (vp->v_usecount != 0) {
			vpanic(vp, "free vnode isn't");
		}
		if ((vp->v_iflag & VI_CLEAN) != 0) {
			vpanic(vp, "clean vnode on freelist");
		}
		if (vp->v_freelisthd != listhd) {
			printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd);
			vpanic(vp, "list head mismatch");
		}
		if (!mutex_tryenter(&vp->v_interlock))
			continue;
		/*
		 * Our lwp might hold the underlying vnode
		 * locked, so don't try to reclaim a VI_LAYER
		 * node if it's locked.
		 */
		if ((vp->v_iflag & VI_XLOCK) == 0 &&
		    ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
			break;
		}
		mutex_exit(&vp->v_interlock);
	}

	if (vp == NULL) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		mutex_exit(&vnode_free_list_lock);
		return NULL;
	}

	/* Remove it from the freelist. */
	TAILQ_REMOVE(listhd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);

	/*
	 * The vnode is still associated with a file system, so we must
	 * clean it out before reusing it.  We need to add a reference
	 * before doing this.  If the vnode gains another reference while
	 * being cleaned out then we lose - retry.
	 */
	vp->v_usecount++;
	vclean(vp, DOCLOSE);
	if (vp->v_usecount == 1) {
		/* We're about to dirty it. */
		vp->v_iflag &= ~VI_CLEAN;
		mutex_exit(&vp->v_interlock);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vp->v_type = VNON;
	} else {
		/*
		 * Don't return to freelist - the holder of the last
		 * reference will destroy it.
		 */
		KASSERT(vp->v_usecount > 1);
		vp->v_usecount--;
		mutex_exit(&vp->v_interlock);
		mutex_enter(&vnode_free_list_lock);
		goto retry;
	}

	if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 ||
	    !TAILQ_EMPTY(&vp->v_uobj.memq)) {
		vpanic(vp, "cleaned vnode isn't");
	}
	if (vp->v_numoutput != 0) {
		vpanic(vp, "clean vnode has pending I/O's");
	}
	if ((vp->v_iflag & VI_ONWORKLST) != 0) {
		vpanic(vp, "clean vnode on syncer list");
	}

	return vp;
}
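/*
 * Example: how vfs_drainvnodes() is meant to be driven when the vnode
 * limit is lowered, e.g. from a kern.maxvnodes-style sysctl handler.
 * This is a minimal sketch, kept out of the build; the handler name
 * and restore-on-failure policy are assumptions for illustration.
 */
#if 0
static int
example_set_maxvnodes(long newmax, struct lwp *l)
{
	long olddesired = desiredvnodes;
	int error;

	desiredvnodes = newmax;
	if (newmax < olddesired) {
		/* Try to release surplus vnodes; back out on failure. */
		error = vfs_drainvnodes(newmax, l);
		if (error) {
			desiredvnodes = olddesired;
			return error;
		}
	}
	return 0;
}
#endif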
/*
 * Mark a mount point as busy, and gain a new reference to it.  Used to
 * synchronize access and to delay unmounting.
 *
 * => Interlock is not released on failure.
 * => If no interlock, the caller is expected to already hold a reference
 *    on the mount.
 * => If interlocked, the interlock must prevent the last reference to
 *    the mount from disappearing.
 */
int
vfs_busy(struct mount *mp, const krw_t op, kmutex_t *interlock)
{

	KASSERT(mp->mnt_refcnt > 0);

	atomic_inc_uint(&mp->mnt_refcnt);
	if (interlock != NULL) {
		mutex_exit(interlock);
	}
	if (mp->mnt_writer == curlwp) {
		mp->mnt_recursecnt++;
	} else {
		rw_enter(&mp->mnt_lock, op);
		if (op == RW_WRITER) {
			KASSERT(mp->mnt_writer == NULL);
			mp->mnt_writer = curlwp;
		}
	}
	if ((mp->mnt_iflag & IMNT_GONE) != 0) {
		vfs_unbusy(mp, false);
		if (interlock != NULL) {
			mutex_enter(interlock);
		}
		return ENOENT;
	}

	return 0;
}

/*
 * As vfs_busy(), but return immediately if the mount cannot be
 * locked without waiting.
 */
int
vfs_trybusy(struct mount *mp, krw_t op, kmutex_t *interlock)
{

	KASSERT(mp->mnt_refcnt > 0);

	if (mp->mnt_writer == curlwp) {
		mp->mnt_recursecnt++;
	} else {
		if (!rw_tryenter(&mp->mnt_lock, op)) {
			return EBUSY;
		}
		if (op == RW_WRITER) {
			KASSERT(mp->mnt_writer == NULL);
			mp->mnt_writer = curlwp;
		}
	}
	atomic_inc_uint(&mp->mnt_refcnt);
	if ((mp->mnt_iflag & IMNT_GONE) != 0) {
		vfs_unbusy(mp, false);
		return ENOENT;
	}
	if (interlock != NULL) {
		mutex_exit(interlock);
	}
	return 0;
}

/*
 * Unlock a busy filesystem and drop reference to it.  If 'keepref' is
 * true, unlock but preserve the reference.
 */
void
vfs_unbusy(struct mount *mp, bool keepref)
{

	KASSERT(mp->mnt_refcnt > 0);

	if (mp->mnt_writer == curlwp) {
		KASSERT(rw_write_held(&mp->mnt_lock));
		if (mp->mnt_recursecnt != 0) {
			mp->mnt_recursecnt--;
		} else {
			mp->mnt_writer = NULL;
			rw_exit(&mp->mnt_lock);
		}
	} else {
		rw_exit(&mp->mnt_lock);
	}
	if (!keepref) {
		vfs_destroy(mp);
	}
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(const char *fstypename, const char *devname,
    struct mount **mpp)
{
	struct vfsops *vfsp = NULL;
	struct mount *mp;

	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
		if (!strncmp(vfsp->vfs_name, fstypename,
		    sizeof(mp->mnt_stat.f_fstypename)))
			break;
	if (vfsp == NULL) {
		mutex_exit(&vfs_list_lock);
		return (ENODEV);
	}
	vfsp->vfs_refcount++;
	mutex_exit(&vfs_list_lock);

	mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
	if (mp == NULL)
		return ENOMEM;
	mp->mnt_refcnt = 1;
	rw_init(&mp->mnt_lock);
	mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
	(void)vfs_busy(mp, RW_WRITER, NULL);
	TAILQ_INIT(&mp->mnt_vnodelist);
	mp->mnt_op = vfsp;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULL;
	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
	    sizeof(mp->mnt_stat.f_fstypename));
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = '\0';
	mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
	    '\0';
	(void)copystr(devname, mp->mnt_stat.f_mntfromname,
	    sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
	mount_initspecific(mp);
	*mpp = mp;
	return (0);
}
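/*
 * Example: the canonical pattern for walking the mount list with
 * vfs_trybusy() using mountlist_lock as the interlock, and
 * vfs_unbusy() to drop the busy reference.  A minimal sketch, kept
 * out of the build; it mirrors the loop in sysctl_kern_vnode() below.
 */
#if 0
static void
example_walk_mounts(void)
{
	struct mount *mp, *nmp;

	mutex_enter(&mountlist_lock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	    mp = nmp) {
		if (vfs_trybusy(mp, RW_READER, &mountlist_lock)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		/* mountlist_lock was released by vfs_trybusy(). */
		/* ... inspect the busied mount here ... */
		mutex_enter(&mountlist_lock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, false);
	}
	mutex_exit(&mountlist_lock);
}
#endif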
/*
 * Routines having to do with the management of the vnode table.
 */
extern int (**dead_vnodeop_p)(void *);

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    vnode_t **vpp)
{
	struct uvm_object *uobj;
	static int toggle;
	vnode_t *vp;
	int error = 0, tryalloc;

try_again:
	if (mp != NULL) {
		/*
		 * Mark filesystem busy while we're creating a
		 * vnode.  If unmount is in progress, this will
		 * wait; if the unmount succeeds (only if umount
		 * -f), this will return an error.  If the
		 * unmount fails, we'll keep going afterwards.
		 */
		error = vfs_busy(mp, RW_READER, NULL);
		if (error)
			return error;
	}

	/*
	 * We must choose whether to allocate a new vnode or recycle an
	 * existing one.  The criterion for allocating a new one is that
	 * the total number of vnodes is less than the number desired or
	 * there are no vnodes on either free list.  Generally we only
	 * want to recycle vnodes that have no buffers associated with
	 * them, so we look first on the vnode_free_list.  If it is empty,
	 * we next consider vnodes with referencing buffers on the
	 * vnode_hold_list.  The toggle ensures that half the time we
	 * will use a buffer from the vnode_hold_list, and half the time
	 * we will allocate a new one unless the list has grown to twice
	 * the desired size.  We are reticent to recycle vnodes from the
	 * vnode_hold_list because we will lose the identity of all its
	 * referencing buffers.
	 */

	vp = NULL;

	mutex_enter(&vnode_free_list_lock);

	toggle ^= 1;
	if (numvnodes > 2 * desiredvnodes)
		toggle = 0;

	tryalloc = numvnodes < desiredvnodes ||
	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
	     (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));

	if (tryalloc) {
		numvnodes++;
		mutex_exit(&vnode_free_list_lock);
		if ((vp = vnalloc(NULL)) == NULL) {
			mutex_enter(&vnode_free_list_lock);
			numvnodes--;
		} else
			vp->v_usecount = 1;
	}

	if (vp == NULL) {
		vp = getcleanvnode();
		if (vp == NULL) {
			if (mp != NULL) {
				vfs_unbusy(mp, false);
			}
			if (tryalloc) {
				printf("WARNING: unable to allocate new "
				    "vnode, retrying...\n");
				(void) tsleep(&lbolt, PRIBIO, "newvn", hz);
				goto try_again;
			}
			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
			*vpp = NULL;
			return (ENFILE);
		}
		vp->v_iflag = 0;
		vp->v_vflag = 0;
		vp->v_uflag = 0;
		vp->v_socket = NULL;
	}

	KASSERT(vp->v_usecount == 1);
	KASSERT(vp->v_freelisthd == NULL);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));

	vp->v_type = VNON;
	vp->v_vnlock = &vp->v_lock;
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_data = NULL;

	/*
	 * Initialize uvm_object within vnode.
	 */

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	if (mp != NULL) {
		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
			vp->v_vflag |= VV_MPSAFE;
		vfs_unbusy(mp, true);
	}

	return (0);
}
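/*
 * Example: the shape of a file system VFS_VGET implementation that
 * allocates with getnewvnode() and pushes the vnode back with
 * ungetnewvnode() when it loses an allocation race, as described
 * below.  A minimal sketch, kept out of the build;
 * example_hash_lookup() and example_vnodeop_p are hypothetical.
 */
#if 0
int
example_vget(struct mount *mp, ino_t ino, vnode_t **vpp)
{
	vnode_t *vp;
	int error;

	error = getnewvnode(VT_UFS, mp, example_vnodeop_p, &vp);
	if (error)
		return error;
	/* Another thread may have created the vnode meanwhile. */
	if (example_hash_lookup(mp, ino, vpp) == 0) {
		ungetnewvnode(vp);
		return 0;
	}
	/* ... initialize v_data, insert into the hash, lock ... */
	*vpp = vp;
	return 0;
}
#endif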
/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 1);
	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_freelisthd == NULL);

	mutex_enter(&vp->v_interlock);
	vp->v_iflag |= VI_CLEAN;
	vrelel(vp, 0);
}

/*
 * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
 * marker vnode and we are prepared to wait for the allocation.
 */
vnode_t *
vnalloc(struct mount *mp)
{
	vnode_t *vp;

	vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT));
	if (vp == NULL) {
		return NULL;
	}

	memset(vp, 0, sizeof(*vp));
	UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0);
	cv_init(&vp->v_cv, "vnode");
	/*
	 * Done by memset() above.
	 *	LIST_INIT(&vp->v_nclist);
	 *	LIST_INIT(&vp->v_dnclist);
	 */

	if (mp != NULL) {
		vp->v_mount = mp;
		vp->v_type = VBAD;
		vp->v_iflag = VI_MARKER;
	} else {
		rw_init(&vp->v_lock.vl_lock);
	}

	return vp;
}

/*
 * Free an unused, unreferenced vnode.
 */
void
vnfree(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 0);

	if ((vp->v_iflag & VI_MARKER) == 0) {
		rw_destroy(&vp->v_lock.vl_lock);
		mutex_enter(&vnode_free_list_lock);
		numvnodes--;
		mutex_exit(&vnode_free_list_lock);
	}

	UVM_OBJ_DESTROY(&vp->v_uobj);
	cv_destroy(&vp->v_cv);
	pool_cache_put(vnode_cache, vp);
}

/*
 * Remove a vnode from its freelist.
 */
static inline void
vremfree(vnode_t *vp)
{

	KASSERT(mutex_owned(&vp->v_interlock));
	KASSERT(vp->v_usecount == 0);

	/*
	 * Note that the reference count must not change until
	 * the vnode is removed.
	 */
	mutex_enter(&vnode_free_list_lock);
	if (vp->v_holdcnt > 0) {
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
	} else {
		KASSERT(vp->v_freelisthd == &vnode_free_list);
	}
	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);
}

/*
 * Move a vnode from one mount queue to another.
 */
static void
insmntque(vnode_t *vp, struct mount *mp)
{
	struct mount *omp;

#ifdef DIAGNOSTIC
	if ((mp != NULL) &&
	    (mp->mnt_iflag & IMNT_UNMOUNT) &&
	    !(mp->mnt_flag & MNT_SOFTDEP) &&
	    vp->v_tag != VT_VFS) {
		panic("insmntque into dying filesystem");
	}
#endif

	mutex_enter(&mntvnode_lock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if ((omp = vp->v_mount) != NULL)
		TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if
	 * available.  The caller must take a reference on the mount
	 * structure and donate it to the vnode.
	 */
	if ((vp->v_mount = mp) != NULL)
		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
	mutex_exit(&mntvnode_lock);

	if (omp != NULL) {
		/* Release reference to old mount. */
		vfs_destroy(omp);
	}
}

/*
 * Create a vnode for a block device.
 * Used for root filesystem and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev_t dev, vnode_t **vpp)
{

	return (getdevvp(dev, vpp, VBLK));
}
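/*
 * Example: a marker vnode from vnalloc(mp) lets a traversal of
 * mnt_vnodelist drop mntvnode_lock safely, via vmark()/vunmark(), the
 * same way vflush() and sysctl_kern_vnode() do below.  A minimal
 * sketch, kept out of the build.
 */
#if 0
static void
example_scan_mount(struct mount *mp)
{
	vnode_t *vp, *mvp;

	if ((mvp = vnalloc(mp)) == NULL)
		return;
	mutex_enter(&mntvnode_lock);
	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
	    vp = vunmark(mvp)) {
		vmark(mvp, vp);
		if (vp->v_mount != mp || vismarker(vp))
			continue;
		/* ... examine vp; mntvnode_lock may be dropped here ... */
	}
	mutex_exit(&mntvnode_lock);
	vnfree(mvp);
}
#endif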
/*
 * Create a vnode for a character device.
 * Used for kernfs and some console handling.
 */
int
cdevvp(dev_t dev, vnode_t **vpp)
{

	return (getdevvp(dev, vpp, VCHR));
}

/*
 * Create a vnode for a device.
 * Used by bdevvp (block device) for root file system etc.,
 * and by cdevvp (character device) for console and kernfs.
 */
static int
getdevvp(dev_t dev, vnode_t **vpp, enum vtype type)
{
	vnode_t *vp;
	vnode_t *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULL;
		return (0);
	}
	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULL;
		return (error);
	}
	vp = nvp;
	vp->v_type = type;
	vp->v_vflag |= VV_MPSAFE;
	uvm_vnp_setsize(vp, 0);
	spec_node_init(vp, dev);
	*vpp = vp;
	return (0);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  If the vnode lock bit is set the
 * vnode is being eliminated in vgone.  In that case, we cannot
 * grab the vnode, so the process is awakened when the transition is
 * completed, and an error returned to indicate that the vnode is no
 * longer usable (possibly having been changed to a new file system type).
 */
int
vget(vnode_t *vp, int flags)
{
	int error;

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if ((flags & LK_INTERLOCK) == 0)
		mutex_enter(&vp->v_interlock);

	/*
	 * Before adding a reference, we must remove the vnode
	 * from its freelist.
	 */
	if (vp->v_usecount == 0) {
		vremfree(vp);
	}
	if (++vp->v_usecount == 0) {
		vpanic(vp, "vget: usecount overflow");
	}

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking if
	 * the VI_XLOCK or VI_FREEING flags are set.
	 */
	if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) {
		if ((flags & LK_NOWAIT) != 0) {
			vrelel(vp, 0);
			return EBUSY;
		}
		vwait(vp, VI_XLOCK | VI_FREEING);
		vrelel(vp, 0);
		return ENOENT;
	}
	if (flags & LK_TYPE_MASK) {
		error = vn_lock(vp, flags | LK_INTERLOCK);
		if (error != 0) {
			vrele(vp);
		}
		return error;
	}
	mutex_exit(&vp->v_interlock);
	return 0;
}

/*
 * vput(), just unlock and vrele().
 */
void
vput(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	VOP_UNLOCK(vp, 0);
	vrele(vp);
}
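/*
 * Example: the usual reference-and-lock pairing.  vget() with
 * LK_EXCLUSIVE takes a reference and locks the vnode; vput() undoes
 * both.  A minimal sketch, kept out of the build.
 */
#if 0
static int
example_use_vnode(vnode_t *vp)
{
	int error;

	mutex_enter(&vp->v_interlock);
	error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK);
	if (error)
		return error;	/* being cleaned out, or now unusable */
	/* ... operate on the locked, referenced vnode ... */
	vput(vp);
	return 0;
}
#endif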
/*
 * Vnode release.  If reference count drops to zero, call inactive
 * routine and either return to freelist or free to the pool.
 */
void
vrelel(vnode_t *vp, int flags)
{
	bool recycle, defer;
	int error;

	KASSERT(mutex_owned(&vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_freelisthd == NULL);

	if (vp->v_op == dead_vnodeop_p && (vp->v_iflag & VI_CLEAN) == 0) {
		vpanic(vp, "dead but not clean");
	}

	/*
	 * If not the last reference, just drop the reference count
	 * and unlock.
	 */
	if (vp->v_usecount > 1) {
		vp->v_usecount--;
		vp->v_iflag |= VI_INACTREDO;
		mutex_exit(&vp->v_interlock);
		return;
	}
	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
		vpanic(vp, "vrelel: bad ref count");
	}

	/*
	 * If not clean, deactivate the vnode, but preserve
	 * our reference across the call to VOP_INACTIVE().
	 */
retry:
	if ((vp->v_iflag & VI_CLEAN) == 0) {
		recycle = false;
		/*
		 * XXX This ugly block can be largely eliminated if
		 * locking is pushed down into the file systems.
		 */
		if (curlwp == uvm.pagedaemon_lwp) {
			/* The pagedaemon can't wait around; defer. */
			defer = true;
		} else if (curlwp == vrele_lwp) {
			/* We have to try harder. */
			vp->v_iflag &= ~VI_INACTREDO;
			error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK |
			    LK_RETRY);
			if (error != 0) {
				/* XXX */
				vpanic(vp, "vrelel: unable to lock vnode");
			}
			defer = false;
		} else if ((vp->v_iflag & VI_LAYER) != 0) {
			/*
			 * Acquiring the stack's lock in vclean() even
			 * for an honest vput/vrele is dangerous because
			 * our caller may hold other vnode locks; defer.
			 */
			defer = true;
		} else {
			/* If we can't acquire the lock, then defer. */
			vp->v_iflag &= ~VI_INACTREDO;
			error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK |
			    LK_NOWAIT);
			if (error != 0) {
				defer = true;
				mutex_enter(&vp->v_interlock);
			} else {
				defer = false;
			}
		}

		if (defer) {
			/*
			 * Defer reclaim to the kthread; it's not safe to
			 * clean it here.  We donate it our last reference.
			 */
			KASSERT(mutex_owned(&vp->v_interlock));
			KASSERT((vp->v_iflag & VI_INACTPEND) == 0);
			vp->v_iflag |= VI_INACTPEND;
			mutex_enter(&vrele_lock);
			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
			if (++vrele_pending > (desiredvnodes >> 8))
				cv_signal(&vrele_cv);
			mutex_exit(&vrele_lock);
			mutex_exit(&vp->v_interlock);
			return;
		}

#ifdef DIAGNOSTIC
		if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
		    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
			vprint("vrelel: missing VOP_CLOSE()", vp);
		}
#endif

		/*
		 * The vnode can gain another reference while being
		 * deactivated.  If VOP_INACTIVE() indicates that
		 * the described file has been deleted, then recycle
		 * the vnode irrespective of additional references.
		 * Another thread may be waiting to re-use the on-disk
		 * inode.
		 *
		 * Note that VOP_INACTIVE() will drop the vnode lock.
		 */
		VOP_INACTIVE(vp, &recycle);
		mutex_enter(&vp->v_interlock);
		if (!recycle) {
			if (vp->v_usecount > 1) {
				vp->v_usecount--;
				mutex_exit(&vp->v_interlock);
				return;
			}

			/*
			 * If we grew another reference while
			 * VOP_INACTIVE() was underway, retry.
			 */
			if ((vp->v_iflag & VI_INACTREDO) != 0) {
				goto retry;
			}
		}

		/* Take care of space accounting. */
		if (vp->v_iflag & VI_EXECMAP) {
			atomic_add_int(&uvmexp.execpages,
			    -vp->v_uobj.uo_npages);
			atomic_add_int(&uvmexp.filepages,
			    vp->v_uobj.uo_npages);
		}
		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP|VI_MAPPED);
		vp->v_vflag &= ~VV_MAPPED;

		/*
		 * Recycle the vnode if the file is now unused (unlinked),
		 * otherwise just free it.
		 */
		if (recycle) {
			vclean(vp, DOCLOSE);
		}
		KASSERT(vp->v_usecount > 0);
	}

	if (--vp->v_usecount != 0) {
		/* Gained another reference while being reclaimed. */
		mutex_exit(&vp->v_interlock);
		return;
	}

	if ((vp->v_iflag & VI_CLEAN) != 0) {
		/*
		 * It's clean so destroy it.  It isn't referenced
		 * anywhere since it has been reclaimed.
		 */
		KASSERT(vp->v_holdcnt == 0);
		KASSERT(vp->v_writecount == 0);
		mutex_exit(&vp->v_interlock);
		insmntque(vp, NULL);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vnfree(vp);
	} else {
		/*
		 * Otherwise, put it back onto the freelist.  It
		 * can't be destroyed while still associated with
		 * a file system.
		 */
		mutex_enter(&vnode_free_list_lock);
		if (vp->v_holdcnt > 0) {
			vp->v_freelisthd = &vnode_hold_list;
		} else {
			vp->v_freelisthd = &vnode_free_list;
		}
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
		mutex_exit(&vp->v_interlock);
	}
}
void
vrele(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	mutex_enter(&vp->v_interlock);
	vrelel(vp, 0);
}

static void
vrele_thread(void *cookie)
{
	vnode_t *vp;

	for (;;) {
		mutex_enter(&vrele_lock);
		while (TAILQ_EMPTY(&vrele_list)) {
			cv_timedwait(&vrele_cv, &vrele_lock, hz);
		}
		vp = TAILQ_FIRST(&vrele_list);
		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
		vrele_pending--;
		mutex_exit(&vrele_lock);

		/*
		 * If not the last reference, then ignore the vnode
		 * and look for more work.
		 */
		mutex_enter(&vp->v_interlock);
		KASSERT((vp->v_iflag & VI_INACTPEND) != 0);
		vp->v_iflag &= ~VI_INACTPEND;
		if (vp->v_usecount > 1) {
			vp->v_usecount--;
			mutex_exit(&vp->v_interlock);
			continue;
		}
		vrelel(vp, 0);
	}
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
 */
void
vholdl(vnode_t *vp)
{

	KASSERT(mutex_owned(&vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_free_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_hold_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(vnode_t *vp)
{

	KASSERT(mutex_owned(&vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt <= 0) {
		vpanic(vp, "holdrelel: bad holdcnt");
	}

	vp->v_holdcnt--;
	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_free_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Vnode reference, where a reference is already held by some other
 * object (for example, a file structure).
 */
void
vref(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	mutex_enter(&vp->v_interlock);
	if (vp->v_usecount <= 0) {
		vpanic(vp, "vref used where vget required");
	}
	if (++vp->v_usecount == 0) {
		vpanic(vp, "vref: usecount overflow");
	}
	mutex_exit(&vp->v_interlock);
}
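/*
 * Example: vref() is only legal when a reference is already
 * guaranteed, e.g. when copying the vnode pointer out of a file
 * structure that itself holds one.  A minimal sketch, kept out of
 * the build.
 */
#if 0
static void
example_take_extra_ref(struct file *fp)
{
	vnode_t *vp = fp->f_data;	/* fp already references vp */

	vref(vp);
	/* ... use vp independently of fp's lifetime ... */
	vrele(vp);
}
#endif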
/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error).  If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
 */
#ifdef DEBUG
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

static vnode_t *
vflushnext(vnode_t *mvp, int *when)
{

	if (hardclock_ticks > *when) {
		mutex_exit(&mntvnode_lock);
		yield();
		mutex_enter(&mntvnode_lock);
		*when = hardclock_ticks + hz / 10;
	}

	return vunmark(mvp);
}

int
vflush(struct mount *mp, vnode_t *skipvp, int flags)
{
	vnode_t *vp, *mvp;
	int busy = 0, when = 0;

	/* Allocate a marker vnode. */
	if ((mvp = vnalloc(mp)) == NULL)
		return (ENOMEM);

	mutex_enter(&mntvnode_lock);
	/*
	 * NOTE: not using TAILQ_FOREACH here, since vgone() and vclean()
	 * are called within the loop.
	 */
	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
	    vp = vflushnext(mvp, &when)) {
		vmark(mvp, vp);
		if (vp->v_mount != mp || vismarker(vp))
			continue;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;
		mutex_enter(&vp->v_interlock);
		/*
		 * Ignore clean but still referenced vnodes.
		 */
		if ((vp->v_iflag & VI_CLEAN) != 0) {
			mutex_exit(&vp->v_interlock);
			continue;
		}
		/*
		 * Skip over vnodes marked VV_SYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
			mutex_exit(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			mutex_exit(&vp->v_interlock);
			continue;
		}
		/*
		 * With v_usecount == 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			mutex_exit(&mntvnode_lock);
			vremfree(vp);
			vp->v_usecount++;
			vclean(vp, DOCLOSE);
			vrelel(vp, 0);
			mutex_enter(&mntvnode_lock);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device.  For all other files, just
		 * kill them.
		 */
		if (flags & FORCECLOSE) {
			mutex_exit(&mntvnode_lock);
			vp->v_usecount++;
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vclean(vp, DOCLOSE);
				vrelel(vp, 0);
			} else {
				vclean(vp, 0);
				vp->v_op = spec_vnodeop_p;	/* XXXSMP */
				mutex_exit(&vp->v_interlock);
				/*
				 * The vnode isn't clean, but still resides
				 * on the mount list.  Remove it.  XXX This
				 * is a bit dodgy.
				 */
				insmntque(vp, NULL);
				vrele(vp);
			}
			mutex_enter(&mntvnode_lock);
			continue;
		}
#ifdef DEBUG
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		mutex_exit(&vp->v_interlock);
		busy++;
	}
	mutex_exit(&mntvnode_lock);
	vnfree(mvp);
	if (busy)
		return (EBUSY);
	return (0);
}
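/*
 * Example: the typical unmount-side caller of vflush().  A forced
 * unmount passes FORCECLOSE so active vnodes are disassociated rather
 * than making the flush fail with EBUSY.  A minimal sketch, kept out
 * of the build; example_unmount() is hypothetical.
 */
#if 0
static int
example_unmount(struct mount *mp, int mntflags)
{
	int flags = 0;

	if (mntflags & MNT_FORCE)
		flags |= FORCECLOSE;
	return vflush(mp, NULLVP, flags);
}
#endif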
/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the interlock held, and will return with it held.
 */
void
vclean(vnode_t *vp, int flags)
{
	lwp_t *l = curlwp;
	bool recycle, active;
	int error;

	KASSERT(mutex_owned(&vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	/* If cleaning is already in progress wait until done and return. */
	if (vp->v_iflag & VI_XLOCK) {
		vwait(vp, VI_XLOCK);
		return;
	}

	/* If already clean, nothing to do. */
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		return;
	}

	/*
	 * Prevent the vnode from being recycled or brought into use
	 * while we clean it out.
	 */
	vp->v_iflag |= VI_XLOCK;
	if (vp->v_iflag & VI_EXECMAP) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
	active = (vp->v_usecount > 1);

	/* XXXAD should not lock vnode under layer */
	VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (flags & DOCLOSE) {
		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
		if (error != 0)
			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
		KASSERT(error == 0);
		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
			spec_node_revoke(vp);
		}
	}
	if (active) {
		VOP_INACTIVE(vp, &recycle);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VI_XLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0);
	}

	/* Disassociate the underlying file system from the vnode. */
	if (VOP_RECLAIM(vp)) {
		vpanic(vp, "vclean: cannot reclaim");
	}

	KASSERT(vp->v_uobj.uo_npages == 0);
	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}
	cache_purge(vp);

	/* Done with purge, notify sleepers of the grim news. */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	mutex_enter(&vp->v_interlock);
	vp->v_vnlock = &vp->v_lock;
	KNOTE(&vp->v_klist, NOTE_REVOKE);
	vp->v_iflag &= ~(VI_XLOCK | VI_FREEING);
	vp->v_vflag &= ~VV_LOCKSWORK;
	if ((flags & DOCLOSE) != 0) {
		vp->v_iflag |= VI_CLEAN;
	}
	cv_broadcast(&vp->v_cv);

	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	mutex_enter(&vp->v_interlock);
	if (vp->v_usecount != 0) {
		mutex_exit(&vp->v_interlock);
		return (0);
	}
	if (inter_lkp)
		mutex_exit(inter_lkp);
	vremfree(vp);
	vp->v_usecount++;
	vclean(vp, DOCLOSE);
	vrelel(vp, 0);
	return (1);
}
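/*
 * Example: file systems call vrecycle() to dispose of an unused
 * vnode, passing the lock guarding their lookup structure as
 * inter_lkp so the vnode cannot be found and reused mid-recycle.
 * A minimal sketch, kept out of the build; example_hash_lock is
 * hypothetical.
 */
#if 0
static void
example_discard_if_unused(vnode_t *vp, struct lwp *l)
{

	mutex_enter(&example_hash_lock);
	if (vrecycle(vp, &example_hash_lock, l)) {
		/* Recycled; vrecycle() released example_hash_lock. */
		return;
	}
	mutex_exit(&example_hash_lock);
}
#endif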
/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{

	mutex_enter(&vp->v_interlock);
	vclean(vp, DOCLOSE);
	vrelel(vp, 0);
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev_t dev, enum vtype type, vnode_t **vpp)
{
	vnode_t *vp;
	int rc = 0;

	mutex_enter(&specfs_lock);
	for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	mutex_exit(&specfs_lock);
	return (rc);
}

/*
 * Revoke all the vnodes corresponding to the specified minor number
 * range (endpoints inclusive) of the specified major.
 */
void
vdevgone(int maj, int minl, int minh, enum vtype type)
{
	vnode_t *vp, **vpp;
	dev_t dev;
	int mn;

	vp = NULL;	/* XXX gcc */

	mutex_enter(&specfs_lock);
	for (mn = minl; mn <= minh; mn++) {
		dev = makedev(maj, mn);
		vpp = &specfs_hash[SPECHASH(dev)];
		for (vp = *vpp; vp != NULL;) {
			mutex_enter(&vp->v_interlock);
			if ((vp->v_iflag & VI_CLEAN) != 0 ||
			    dev != vp->v_rdev || type != vp->v_type) {
				mutex_exit(&vp->v_interlock);
				vp = vp->v_specnext;
				continue;
			}
			mutex_exit(&specfs_lock);
			if (vget(vp, LK_INTERLOCK) == 0) {
				VOP_REVOKE(vp, REVOKEALL);
				vrele(vp);
			}
			mutex_enter(&specfs_lock);
			vp = *vpp;
		}
	}
	mutex_exit(&specfs_lock);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vnode_t *vp)
{
	int count;

	mutex_enter(&specfs_lock);
	mutex_enter(&vp->v_interlock);
	if (vp->v_specnode == NULL) {
		count = vp->v_usecount - ((vp->v_iflag & VI_INACTPEND) != 0);
		mutex_exit(&vp->v_interlock);
		mutex_exit(&specfs_lock);
		return (count);
	}
	mutex_exit(&vp->v_interlock);
	count = vp->v_specnode->sn_dev->sd_opencnt;
	mutex_exit(&specfs_lock);
	return (count);
}
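/*
 * Example: device driver detach routines use vdevgone() to revoke
 * every vnode referring to the departing unit, once per device type
 * the driver exposes.  A minimal sketch, kept out of the build;
 * example_bdevsw/example_cdevsw and the detach function itself are
 * hypothetical.
 */
#if 0
static int
example_detach(device_t self, int flags)
{
	int bmaj = bdevsw_lookup_major(&example_bdevsw);
	int cmaj = cdevsw_lookup_major(&example_cdevsw);
	int unit = device_unit(self);

	vdevgone(bmaj, unit, unit, VBLK);
	vdevgone(cmaj, unit, unit, VCHR);
	return 0;
}
#endif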
/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vrevoke(vnode_t *vp)
{
	vnode_t *vq, **vpp;
	enum vtype type;
	dev_t dev;

	KASSERT(vp->v_usecount > 0);

	mutex_enter(&vp->v_interlock);
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		mutex_exit(&vp->v_interlock);
		return;
	} else {
		dev = vp->v_rdev;
		type = vp->v_type;
		mutex_exit(&vp->v_interlock);
	}

	vpp = &specfs_hash[SPECHASH(dev)];
	mutex_enter(&specfs_lock);
	for (vq = *vpp; vq != NULL;) {
		/* If clean or being cleaned, then ignore it. */
		mutex_enter(&vq->v_interlock);
		if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 ||
		    vq->v_rdev != dev || vq->v_type != type) {
			mutex_exit(&vq->v_interlock);
			vq = vq->v_specnext;
			continue;
		}
		mutex_exit(&specfs_lock);
		if (vq->v_usecount == 0) {
			vremfree(vq);
		}
		vq->v_usecount++;
		vclean(vq, DOCLOSE);
		vrelel(vq, 0);
		mutex_enter(&specfs_lock);
		vq = *vpp;
	}
	mutex_exit(&specfs_lock);
}

/*
 * sysctl helper routine to return the list of supported fstypes.
 */
static int
sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
{
	char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
	char *where = oldp;
	struct vfsops *v;
	size_t needed, left, slen;
	int error, first;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	first = 1;
	error = 0;
	needed = 0;
	left = *oldlenp;

	sysctl_unlock();
	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (where == NULL)
			needed += strlen(v->vfs_name) + 1;
		else {
			memset(bf, 0, sizeof(bf));
			if (first) {
				strncpy(bf, v->vfs_name, sizeof(bf));
				first = 0;
			} else {
				bf[0] = ' ';
				strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
			}
			bf[sizeof(bf)-1] = '\0';
			slen = strlen(bf);
			if (left < slen + 1)
				break;
			/* +1 to copy out the trailing NUL byte */
			v->vfs_refcount++;
			mutex_exit(&vfs_list_lock);
			error = copyout(bf, where, slen + 1);
			mutex_enter(&vfs_list_lock);
			v->vfs_refcount--;
			if (error)
				break;
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	mutex_exit(&vfs_list_lock);
	sysctl_relock();
	*oldlenp = needed;
	return (error);
}

/*
 * Top level filesystem related information gathering.
 */
SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup")
{
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "vfs", NULL,
		       NULL, 0, NULL, 0,
		       CTL_VFS, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "generic",
		       SYSCTL_DESCR("Non-specific vfs related information"),
		       NULL, 0, NULL, 0,
		       CTL_VFS, VFS_GENERIC, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "usermount",
		       SYSCTL_DESCR("Whether unprivileged users may mount "
				    "filesystems"),
		       NULL, 0, &dovfsusermount, 0,
		       CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRING, "fstypes",
		       SYSCTL_DESCR("List of file systems present"),
		       sysctl_vfs_generic_fstypes, 0, NULL, 0,
		       CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "magiclinks",
		       SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"),
		       NULL, 0, &vfs_magiclinks, 0,
		       CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL);
}
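/*
 * Example: from userland, the vfs.generic.fstypes node created above
 * can be read with sysctlbyname(3), sizing the buffer with a first
 * probe call.  A minimal userland sketch, kept out of the kernel
 * build.
 */
#if 0
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	size_t len;
	char *buf;

	/* First call reports the required buffer length. */
	if (sysctlbyname("vfs.generic.fstypes", NULL, &len, NULL, 0) == -1)
		return 1;
	if ((buf = malloc(len)) == NULL)
		return 1;
	if (sysctlbyname("vfs.generic.fstypes", buf, &len, NULL, 0) == -1)
		return 1;
	printf("%s\n", buf);
	free(buf);
	return 0;
}
#endif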
int kinfo_vdebug = 1;
int kinfo_vgetfailed;

#define KINFO_VNODESLOP	10

/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
int
sysctl_kern_vnode(SYSCTLFN_ARGS)
{
	char *where = oldp;
	size_t *sizep = oldlenp;
	struct mount *mp, *nmp;
	vnode_t *vp, *mvp, vbuf;
	char *bp = where, *savebp;
	char *ewhere;
	int error;

	if (namelen != 0)
		return (EOPNOTSUPP);
	if (newp != NULL)
		return (EPERM);

#define VPTRSZ	sizeof(vnode_t *)
#define VNODESZ	sizeof(vnode_t)
	if (where == NULL) {
		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
		return (0);
	}
	ewhere = where + *sizep;

	sysctl_unlock();
	mutex_enter(&mountlist_lock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	    mp = nmp) {
		if (vfs_trybusy(mp, RW_READER, &mountlist_lock)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		savebp = bp;
		/* Allocate a marker vnode. */
		if ((mvp = vnalloc(mp)) == NULL) {
			sysctl_relock();
			return (ENOMEM);
		}
		mutex_enter(&mntvnode_lock);
		for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp;
		    vp = vunmark(mvp)) {
			vmark(mvp, vp);
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp || vismarker(vp))
				continue;
			if (bp + VPTRSZ + VNODESZ > ewhere) {
				(void)vunmark(mvp);
				mutex_exit(&mntvnode_lock);
				vnfree(mvp);
				sysctl_relock();
				*sizep = bp - where;
				return (ENOMEM);
			}
			memcpy(&vbuf, vp, VNODESZ);
			mutex_exit(&mntvnode_lock);
			if ((error = copyout(&vp, bp, VPTRSZ)) ||
			    (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) {
				mutex_enter(&mntvnode_lock);
				(void)vunmark(mvp);
				mutex_exit(&mntvnode_lock);
				vnfree(mvp);
				sysctl_relock();
				return (error);
			}
			bp += VPTRSZ + VNODESZ;
			mutex_enter(&mntvnode_lock);
		}
		mutex_exit(&mntvnode_lock);
		mutex_enter(&mountlist_lock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, false);
		vnfree(mvp);
	}
	mutex_exit(&mountlist_lock);
	sysctl_relock();

	*sizep = bp - where;
	return (0);
}

/*
 * Remove clean vnodes from a mountpoint's vnode list.
 */
void
vfs_scrubvnlist(struct mount *mp)
{
	vnode_t *vp, *nvp;

retry:
	mutex_enter(&mntvnode_lock);
	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		nvp = TAILQ_NEXT(vp, v_mntvnodes);
		mutex_enter(&vp->v_interlock);
		if ((vp->v_iflag & VI_CLEAN) != 0) {
			TAILQ_REMOVE(&mp->mnt_vnodelist, vp, v_mntvnodes);
			vp->v_mount = NULL;
			mutex_exit(&mntvnode_lock);
			mutex_exit(&vp->v_interlock);
			vfs_destroy(mp);
			goto retry;
		}
		mutex_exit(&vp->v_interlock);
	}
	mutex_exit(&mntvnode_lock);
}

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vnode_t *vp)
{
	vnode_t *vq;
	int error = 0;

	if (vp->v_type != VBLK)
		return ENOTBLK;
	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	mutex_enter(&specfs_lock);
	for (vq = specfs_hash[SPECHASH(vp->v_rdev)]; vq != NULL;
	    vq = vq->v_specnext) {
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		if (vq->v_specmountpoint != NULL) {
			error = EBUSY;
			break;
		}
	}
	mutex_exit(&specfs_lock);
	return (error);
}
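/*
 * Example: a file system's mount path uses vfs_mountedon() to refuse
 * a device that already carries a mounted file system, and vcount()
 * to detect other opens.  A minimal sketch, kept out of the build;
 * example_mountfs() is hypothetical.
 */
#if 0
static int
example_mountfs(vnode_t *devvp)
{
	int error;

	error = vfs_mountedon(devvp);
	if (error)		/* EBUSY or ENOTBLK */
		return error;
	if (vcount(devvp) > 1)
		return EBUSY;	/* device is open elsewhere */
	/* ... proceed to read the superblock ... */
	return 0;
}
#endif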
/*
 * Unmount all file systems.
 * We traverse the list in reverse order under the assumption that doing so
 * will avoid needing to worry about dependencies.
 */
void
vfs_unmountall(struct lwp *l)
{
	struct mount *mp, *nmp;
	int allerror, error;

	printf("unmounting file systems...");
	for (allerror = 0, mp = CIRCLEQ_LAST(&mountlist);
	     !CIRCLEQ_EMPTY(&mountlist);
	     mp = nmp) {
		nmp = CIRCLEQ_PREV(mp, mnt_list);
#ifdef DEBUG
		printf("\nunmounting %s (%s)...",
		    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
#endif
		/*
		 * XXX Freeze syncer.  Must do this before locking the
		 * mount point.  See dounmount() for details.
		 */
		mutex_enter(&syncer_mutex);
		if (vfs_busy(mp, RW_WRITER, NULL)) {
			mutex_exit(&syncer_mutex);
			continue;
		}
		if ((error = dounmount(mp, MNT_FORCE, l)) != 0) {
			printf("unmount of %s failed with error %d\n",
			    mp->mnt_stat.f_mntonname, error);
			allerror = 1;
		}
	}
	printf(" done\n");
	if (allerror)
		printf("WARNING: some file systems would not unmount\n");
}

/*
 * Sync and unmount file systems before shutting down.
 */
void
vfs_shutdown(void)
{
	struct lwp *l;

	/* XXX we're certainly not running in lwp0's context! */
	l = curlwp;
	if (l == NULL)
		l = &lwp0;

	printf("syncing disks... ");

	/* remove user processes from run queue */
	suspendsched();
	(void) spl0();

	/* avoid coming back this way again if we panic. */
	doing_shutdown = 1;

	sys_sync(l, NULL, NULL);

	/* Wait for sync to finish. */
	if (buf_syncwait() != 0) {
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
		Debugger();
#endif
		printf("giving up\n");
		return;
	} else
		printf("done\n");

	/*
	 * If we've panic'd, don't make the situation potentially
	 * worse by unmounting the file systems.
	 */
	if (panicstr != NULL)
		return;

	/* Release inodes held by texts before update. */
#ifdef notdef
	vnshutdown();
#endif
	/* Unmount file systems. */
	vfs_unmountall(l);
}

/*
 * Mount the root file system.  If the operator didn't specify a
 * file system to use, try all possible file systems until one
 * succeeds.
 */
int
vfs_mountroot(void)
{
	struct vfsops *v;
	int error = ENODEV;

	if (root_device == NULL)
		panic("vfs_mountroot: root device unknown");

	switch (device_class(root_device)) {
	case DV_IFNET:
		if (rootdev != NODEV)
			panic("vfs_mountroot: rootdev set for DV_IFNET "
			    "(0x%08x -> %d,%d)", rootdev,
			    major(rootdev), minor(rootdev));
		break;

	case DV_DISK:
		if (rootdev == NODEV)
			panic("vfs_mountroot: rootdev not set for DV_DISK");
		if (bdevvp(rootdev, &rootvp))
			panic("vfs_mountroot: can't get vnode for rootdev");
		error = VOP_OPEN(rootvp, FREAD, FSCRED);
		if (error) {
			printf("vfs_mountroot: can't open root device\n");
			return (error);
		}
		break;

	default:
		printf("%s: inappropriate for root file system\n",
		    root_device->dv_xname);
		return (ENODEV);
	}

	/*
	 * If user specified a file system, use it.
	 */
	if (mountroot != NULL) {
		error = (*mountroot)();
		goto done;
	}
1887 */ 1888 mutex_enter(&vfs_list_lock); 1889 LIST_FOREACH(v, &vfs_list, vfs_list) { 1890 if (v->vfs_mountroot == NULL) 1891 continue; 1892 #ifdef DEBUG 1893 aprint_normal("mountroot: trying %s...\n", v->vfs_name); 1894 #endif 1895 v->vfs_refcount++; 1896 mutex_exit(&vfs_list_lock); 1897 error = (*v->vfs_mountroot)(); 1898 mutex_enter(&vfs_list_lock); 1899 v->vfs_refcount--; 1900 if (!error) { 1901 aprint_normal("root file system type: %s\n", 1902 v->vfs_name); 1903 break; 1904 } 1905 } 1906 mutex_exit(&vfs_list_lock); 1907 1908 if (v == NULL) { 1909 printf("no file system for %s", root_device->dv_xname); 1910 if (device_class(root_device) == DV_DISK) 1911 printf(" (dev 0x%x)", rootdev); 1912 printf("\n"); 1913 error = EFTYPE; 1914 } 1915 1916 done: 1917 if (error && device_class(root_device) == DV_DISK) { 1918 VOP_CLOSE(rootvp, FREAD, FSCRED); 1919 vrele(rootvp); 1920 } 1921 return (error); 1922 } 1923 1924 /* 1925 * Sham lock manager for vnodes. This is a temporary measure. 1926 */ 1927 int 1928 vlockmgr(struct vnlock *vl, int flags) 1929 { 1930 1931 KASSERT((flags & ~(LK_CANRECURSE | LK_NOWAIT | LK_TYPE_MASK)) == 0); 1932 1933 switch (flags & LK_TYPE_MASK) { 1934 case LK_SHARED: 1935 if (rw_tryenter(&vl->vl_lock, RW_READER)) { 1936 return 0; 1937 } 1938 if ((flags & LK_NOWAIT) != 0) { 1939 return EBUSY; 1940 } 1941 rw_enter(&vl->vl_lock, RW_READER); 1942 return 0; 1943 1944 case LK_EXCLUSIVE: 1945 if (rw_tryenter(&vl->vl_lock, RW_WRITER)) { 1946 return 0; 1947 } 1948 if ((vl->vl_canrecurse || (flags & LK_CANRECURSE) != 0) && 1949 rw_write_held(&vl->vl_lock)) { 1950 vl->vl_recursecnt++; 1951 return 0; 1952 } 1953 if ((flags & LK_NOWAIT) != 0) { 1954 return EBUSY; 1955 } 1956 rw_enter(&vl->vl_lock, RW_WRITER); 1957 return 0; 1958 1959 case LK_RELEASE: 1960 if (vl->vl_recursecnt != 0) { 1961 KASSERT(rw_write_held(&vl->vl_lock)); 1962 vl->vl_recursecnt--; 1963 return 0; 1964 } 1965 rw_exit(&vl->vl_lock); 1966 return 0; 1967 1968 default: 1969 panic("vlockmgr: flags %x", flags); 1970 } 1971 } 1972 1973 int 1974 vlockstatus(struct vnlock *vl) 1975 { 1976 1977 if (rw_write_held(&vl->vl_lock)) { 1978 return LK_EXCLUSIVE; 1979 } 1980 if (rw_read_held(&vl->vl_lock)) { 1981 return LK_SHARED; 1982 } 1983 return 0; 1984 } 1985