/*	$NetBSD: vfs_subr.c,v 1.305 2007/11/04 17:31:16 pooka Exp $	*/

/*-
 * Copyright (c) 1997, 1998, 2004, 2005, 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * External virtual filesystem routines.
 *
 * This file contains vfs subroutines which are heavily dependent on
 * the kernel and are not suitable for standalone use.  Examples include
 * routines involved in vnode and mountpoint management.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.305 2007/11/04 17:31:16 pooka Exp $");

#include "opt_inet.h"
#include "opt_ddb.h"
#include "opt_compat_netbsd.h"
#include "opt_compat_43.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/syscallargs.h>
#include <sys/device.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/syncfs/syncfs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_ddb.h>

#include <sys/sysctl.h>

extern int dovfsusermount;	/* 1 => permit any user to mount filesystems */
extern int vfs_magiclinks;	/* 1 => expand "magic" symlinks */

/* TAILQ_HEAD(freelst, vnode) vnode_free_list = vnode free list (in vnode.h) */
struct freelst vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
struct freelst vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);

struct simplelock vnode_free_list_slock = SIMPLELOCK_INITIALIZER;

POOL_INIT(vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodepl",
    &pool_allocator_nointr, IPL_NONE);

MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");

/*
 * Local declarations.
 */

static void insmntque(struct vnode *, struct mount *);
static int getdevvp(dev_t, struct vnode **, enum vtype);
static void vclean(struct vnode *, int, struct lwp *);
static struct vnode *getcleanvnode(struct lwp *);

/*
 * Reclaim clean vnodes and return them to the vnode pool until the
 * total number of vnodes drops to "target".  Returns EBUSY if no more
 * clean vnodes can be found.
 */
int
vfs_drainvnodes(long target, struct lwp *l)
{

	simple_lock(&vnode_free_list_slock);
	while (numvnodes > target) {
		struct vnode *vp;

		vp = getcleanvnode(l);
		if (vp == NULL)
			return EBUSY; /* give up */
		pool_put(&vnode_pool, vp);
		simple_lock(&vnode_free_list_slock);
		numvnodes--;
	}
	simple_unlock(&vnode_free_list_slock);

	return 0;
}

/*
 * grab a vnode from freelist and clean it.
 */
struct vnode *
getcleanvnode(struct lwp *l)
{
	struct vnode *vp;
	struct freelst *listhd;

	LOCK_ASSERT(simple_lock_held(&vnode_free_list_slock));

	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		if (!simple_lock_try(&vp->v_interlock))
			continue;
		/*
		 * as our lwp might hold the underlying vnode locked,
		 * don't try to reclaim the VLAYER vnode if it's locked.
		 */
		if ((vp->v_iflag & VI_XLOCK) == 0 &&
		    ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
			break;
		}
		simple_unlock(&vp->v_interlock);
	}

	if (vp == NULLVP) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		simple_unlock(&vnode_free_list_slock);
		return NULLVP;
	}

	if (vp->v_usecount)
		panic("free vnode isn't, vp %p", vp);
	TAILQ_REMOVE(listhd, vp, v_freelist);
	/* see comment on why 0xdeadb is set at end of vgone (below) */
	vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
	simple_unlock(&vnode_free_list_slock);

	if (vp->v_type != VBAD)
		vgonel(vp, l);
	else
		simple_unlock(&vp->v_interlock);
#ifdef DIAGNOSTIC
	if (vp->v_data || vp->v_uobj.uo_npages ||
	    TAILQ_FIRST(&vp->v_uobj.memq))
		panic("cleaned vnode isn't, vp %p", vp);
	if (vp->v_numoutput)
		panic("clean vnode has pending I/O's, vp %p", vp);
#endif
	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);

	return vp;
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Interlock is not released on failure.
 */
int
vfs_busy(struct mount *mp, int flags, kmutex_t *interlkp)
{
	int lkflags;

	while (mp->mnt_iflag & IMNT_UNMOUNT) {
		int gone, n;

		if (flags & LK_NOWAIT)
			return (ENOENT);
		if ((flags & LK_RECURSEFAIL) && mp->mnt_unmounter != NULL
		    && mp->mnt_unmounter == curlwp)
			return (EDEADLK);
		if (interlkp)
			mutex_exit(interlkp);
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		simple_lock(&mp->mnt_slock);
		mp->mnt_wcnt++;
		ltsleep((void *)mp, PVFS, "vfs_busy", 0, &mp->mnt_slock);
		n = --mp->mnt_wcnt;
		simple_unlock(&mp->mnt_slock);
		gone = mp->mnt_iflag & IMNT_GONE;

		if (n == 0)
			wakeup(&mp->mnt_wcnt);
		if (interlkp)
			mutex_enter(interlkp);
		if (gone)
			return (ENOENT);
	}
	lkflags = LK_SHARED;
	if (interlkp) {
		/* lkflags |= LK_INTERLOCK; XXX */
		mutex_exit(interlkp);	/* XXX */
	}
	if (lockmgr(&mp->mnt_lock, lkflags, NULL))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp)
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(const char *fstypename, const char *devname,
	struct mount **mpp)
{
	struct vfsops *vfsp = NULL;
	struct mount *mp;

	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
		if (!strncmp(vfsp->vfs_name, fstypename,
		    sizeof(mp->mnt_stat.f_fstypename)))
			break;

	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	memset((char *)mp, 0, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
	simple_lock_init(&mp->mnt_slock);
	(void)vfs_busy(mp, LK_NOWAIT, 0);
	TAILQ_INIT(&mp->mnt_vnodelist);
	mp->mnt_op = vfsp;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfs_refcount++;
	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
	    sizeof(mp->mnt_stat.f_fstypename));
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = '\0';
	mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
	    '\0';
	(void)copystr(devname, mp->mnt_stat.f_mntfromname,
	    sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
	mount_initspecific(mp);
	*mpp = mp;
	return (0);
}


/*
 * Routines having to do with the management of the vnode table.
 */
extern int (**dead_vnodeop_p)(void *);

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
	struct vnode **vpp)
{
	extern struct uvm_pagerops uvm_vnodeops;
	struct uvm_object *uobj;
	struct lwp *l = curlwp;	/* XXX */
	static int toggle;
	struct vnode *vp;
	int error = 0, tryalloc;

try_again:
	if (mp) {
		/*
		 * Mark filesystem busy while we're creating a vnode.
		 * If unmount is in progress, this will wait; if the
		 * unmount succeeds (only if umount -f), this will
		 * return an error.  If the unmount fails, we'll keep
		 * going afterwards.
		 * (This puts the per-mount vnode list logically under
		 * the protection of the vfs_busy lock).
		 */
		error = vfs_busy(mp, LK_RECURSEFAIL, 0);
		if (error && error != EDEADLK)
			return error;
	}

	/*
	 * We must choose whether to allocate a new vnode or recycle an
	 * existing one. The criterion for allocating a new one is that
	 * the total number of vnodes is less than the number desired or
	 * there are no vnodes on either free list. Generally we only
	 * want to recycle vnodes that have no buffers associated with
	 * them, so we look first on the vnode_free_list. If it is empty,
	 * we next consider vnodes with referencing buffers on the
	 * vnode_hold_list. The toggle ensures that half the time we
	 * will use a buffer from the vnode_hold_list, and half the time
	 * we will allocate a new one unless the list has grown to twice
	 * the desired size. We are reluctant to recycle vnodes from the
	 * vnode_hold_list because we will lose the identity of all its
	 * referencing buffers.
	 */

	vp = NULL;

	simple_lock(&vnode_free_list_slock);

	toggle ^= 1;
	if (numvnodes > 2 * desiredvnodes)
		toggle = 0;

	tryalloc = numvnodes < desiredvnodes ||
	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
	     (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));

	if (tryalloc &&
	    (vp = pool_get(&vnode_pool, PR_NOWAIT)) != NULL) {
		numvnodes++;
		simple_unlock(&vnode_free_list_slock);
		memset(vp, 0, sizeof(*vp));
		UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 1);
		/*
		 * done by memset() above.
		 *	LIST_INIT(&vp->v_nclist);
		 *	LIST_INIT(&vp->v_dnclist);
		 */
	} else {
		vp = getcleanvnode(l);
		/*
		 * Unless this is a bad time of the month, at most
		 * the first NCPUS items on the free list are
		 * locked, so this is close enough to being empty.
		 */
		if (vp == NULLVP) {
			if (mp && error != EDEADLK)
				vfs_unbusy(mp);
			if (tryalloc) {
				printf("WARNING: unable to allocate new "
				    "vnode, retrying...\n");
				(void) tsleep(&lbolt, PRIBIO, "newvn", hz);
				goto try_again;
			}
			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
			*vpp = 0;
			return (ENFILE);
		}
		vp->v_usecount = 1;
		vp->v_iflag = 0;
		vp->v_vflag = 0;
		vp->v_uflag = 0;
		vp->v_socket = NULL;
	}
	vp->v_type = VNON;
	vp->v_vnlock = &vp->v_lock;
	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_data = 0;
	simple_lock_init(&vp->v_interlock);

	/*
	 * initialize uvm_object within vnode.
	 */

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	if (mp && error != EDEADLK)
		vfs_unbusy(mp);
	return (0);
}

/*
 * This is really just the reverse of getnewvnode(). Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(struct vnode *vp)
{
#ifdef DIAGNOSTIC
	if (vp->v_usecount != 1)
		panic("ungetnewvnode: busy vnode");
#endif
	vp->v_usecount--;
	insmntque(vp, NULL);
	vp->v_type = VBAD;

	simple_lock(&vp->v_interlock);
	/*
	 * Insert at head of LRU list
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_HEAD(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	simple_unlock(&vp->v_interlock);
}

/*
 * Move a vnode from one mount queue to another.
 */
static void
insmntque(struct vnode *vp, struct mount *mp)
{

#ifdef DIAGNOSTIC
	if ((mp != NULL) &&
	    (mp->mnt_iflag & IMNT_UNMOUNT) &&
	    !(mp->mnt_flag & MNT_SOFTDEP) &&
	    vp->v_tag != VT_VFS) {
		panic("insmntque into dying filesystem");
	}
#endif

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) != NULL)
		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Create a vnode for a block device.
 * Used for root filesystem and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev_t dev, struct vnode **vpp)
{

	return (getdevvp(dev, vpp, VBLK));
}

/*
 * Create a vnode for a character device.
 * Used for kernfs and some console handling.
 */
int
cdevvp(dev_t dev, struct vnode **vpp)
{

	return (getdevvp(dev, vpp, VCHR));
}

/*
 * Create a vnode for a device.
 * Used by bdevvp (block device) for root file system etc.,
 * and by cdevvp (character device) for console and kernfs.
 */
static int
getdevvp(dev_t dev, struct vnode **vpp, enum vtype type)
{
	struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULL;
		return (0);
	}
	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULL;
		return (error);
	}
	vp = nvp;
	vp->v_type = type;
	uvm_vnp_setsize(vp, 0);
	if ((nvp = checkalias(vp, dev, NULL)) != 0) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(struct vnode *nvp, dev_t nvp_rdev, struct mount *mp)
{
	struct lwp *l = curlwp;	/* XXX */
	struct vnode *vp;
	struct vnode **vpp;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		simple_lock(&vp->v_interlock);
		simple_unlock(&spechash_slock);
		if (vp->v_usecount == 0) {
			vgonel(vp, l);
			goto loop;
		}
		/*
		 * What we're interested in knowing here is whether someone
		 * else has removed this vnode from the device hash list
		 * while we were waiting.  This can only happen if vclean()
		 * did it, and that requires the vnode to be locked.
		 */
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK))
			goto loop;
		if (vp->v_specinfo == NULL) {
			vput(vp);
			goto loop;
		}
		simple_lock(&spechash_slock);
		break;
	}
	if (vp == NULL || vp->v_tag != VT_NON || vp->v_type != VBLK) {
		MALLOC(nvp->v_specinfo, struct specinfo *,
			sizeof(struct specinfo), M_VNODE, M_NOWAIT);
		/* XXX Erg. */
		if (nvp->v_specinfo == NULL) {
			simple_unlock(&spechash_slock);
			uvm_wait("checkalias");
			goto loop;
		}

		nvp->v_rdev = nvp_rdev;
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		nvp->v_specmountpoint = NULL;
		simple_unlock(&spechash_slock);
		nvp->v_speclockf = NULL;

		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_iflag |= VI_ALIASED;
			vp->v_iflag |= VI_ALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, l);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	vp->v_vnlock = &vp->v_lock;
	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. If the vnode lock bit is set the
 * vnode is being eliminated in vgone. In that case, we can not
 * grab the vnode, so the process is awakened when the transition is
 * completed, and an error returned to indicate that the vnode is no
 * longer usable (possibly having been changed to a new file system type).
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VI_XLOCK flag is set.
	 */

	if ((flags & LK_INTERLOCK) == 0)
		simple_lock(&vp->v_interlock);
	if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) {
		if (flags & LK_NOWAIT) {
			simple_unlock(&vp->v_interlock);
			return EBUSY;
		}
		vp->v_iflag |= VI_XWANT;
		ltsleep(vp, PINOD|PNORELOCK, "vget", 0, &vp->v_interlock);
		return (ENOENT);
	}
	if (vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		else
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_usecount++;
#ifdef DIAGNOSTIC
	if (vp->v_usecount == 0) {
		vprint("vget", vp);
		panic("vget: usecount overflow, vp %p", vp);
	}
#endif
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK))) {
			vrele(vp);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * vput(), just unlock and vrele()
 */
void
vput(struct vnode *vp)
{
	struct lwp *l = curlwp;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vput: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		VOP_UNLOCK(vp, 0);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vput: bad ref count", vp);
		panic("vput: ref cnt");
	}
#endif
	/*
	 * Insert at tail of LRU list.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	if (vp->v_iflag & VI_EXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP|VI_MAPPED);
	vp->v_vflag &= ~VV_MAPPED;
	simple_unlock(&vp->v_interlock);
	VOP_INACTIVE(vp, l);
}

/*
 * Vnode release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
static void
do_vrele(struct vnode *vp, int doinactive, int onhead)
{
	struct lwp *l = curlwp;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vrele: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vrele: bad ref count", vp);
		panic("vrele: ref cnt vp %p", vp);
	}
#endif
	/*
	 * Insert at tail of LRU list.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0) {
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	} else {
		if (onhead)
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		else
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	}
	simple_unlock(&vnode_free_list_slock);
	if (vp->v_iflag & VI_EXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP|VI_MAPPED);
	vp->v_vflag &= ~VV_MAPPED;

	if (doinactive) {
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0)
			VOP_INACTIVE(vp, l);
	} else {
		simple_unlock(&vp->v_interlock);
	}
}

void
vrele(struct vnode *vp)
{

	do_vrele(vp, 1, 0);
}

void
vrele2(struct vnode *vp, int onhead)
{

	do_vrele(vp, 0, onhead);
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
 */
void
vholdl(struct vnode *vp)
{

	/*
	 * If it is on the freelist and the hold count is currently
	 * zero, move it to the hold list. The test of the back
	 * pointer and the use reference count of zero is because
	 * it will be removed from a free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from a freelist to ensure
	 * that we do not try to move it here.
	 */
	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_holdcnt++;
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(struct vnode *vp)
{

	if (vp->v_holdcnt <= 0)
		panic("holdrelel: holdcnt vp %p", vp);
	vp->v_holdcnt--;

	/*
	 * If it is on the holdlist and the hold count drops to
	 * zero, move it to the free list. The test of the back
	 * pointer and the use reference count of zero is because
	 * it will be removed from a free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from a freelist to ensure
	 * that we do not try to move it here.
	 */

	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
}

/*
 * Vnode reference.
 */
void
vref(struct vnode *vp)
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount <= 0)
		panic("vref used where vget required, vp %p", vp);
	vp->v_usecount++;
#ifdef DIAGNOSTIC
	if (vp->v_usecount == 0) {
		vprint("vref", vp);
		panic("vref: usecount overflow, vp %p", vp);
	}
#endif
	simple_unlock(&vp->v_interlock);
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped.
 */
#ifdef DEBUG
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

int
vflush(struct mount *mp, struct vnode *skipvp, int flags)
{
	struct lwp *l = curlwp;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	/*
	 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
	 * and vclean() are called
	 */
	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		if (vp->v_mount != mp)
			goto loop;
		nvp = TAILQ_NEXT(vp, v_mntvnodes);
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;
		simple_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VV_SYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * With v_usecount == 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, l);
			simple_lock(&mntvnode_slock);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device. For all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, l);
			} else {
				vclean(vp, 0, l);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *)0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DEBUG
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 */
static void
vclean(struct vnode *vp, int flags, struct lwp *l)
{
	int active;

	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));

	/*
	 * Check to see if the vnode is in use.
	 * If so we have to reference it before we clean it out
	 * so that its count cannot fall to zero and generate a
	 * race against ourselves to recycle it.
	 */

	if ((active = vp->v_usecount) != 0) {
		vp->v_usecount++;
#ifdef DIAGNOSTIC
		if (vp->v_usecount == 0) {
			vprint("vclean", vp);
			panic("vclean: usecount overflow");
		}
#endif
	}

	/*
	 * Prevent the vnode from being recycled or
	 * brought into use while we clean it out.
	 */
	if (vp->v_iflag & VI_XLOCK)
		panic("vclean: deadlock, vp %p", vp);
	vp->v_iflag |= VI_XLOCK;
	if (vp->v_iflag & VI_EXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);

	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. For
	 * active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 *
	 * We drain the lock to make sure we are the last one trying to
	 * get it and immediately resurrect the lock. Future accesses
	 * for locking this _vnode_ will be protected by VI_XLOCK. However,
	 * upper layers might be using the _lock_ in case the file system
	 * exported it and might access it while the vnode lingers in
	 * deadfs.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_RESURRECT | LK_INTERLOCK);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If it is a special device, remove it from the special device
	 * alias list if it is on one.
	 */
	if (flags & DOCLOSE) {
		int error;
		struct vnode *vq, *vx;

		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
		if (error)
			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
		KASSERT(error == 0);
		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);

		if (active)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL);

		if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
		    vp->v_specinfo != 0) {
			simple_lock(&spechash_slock);
			if (vp->v_hashchain != NULL) {
				if (*vp->v_hashchain == vp) {
					*vp->v_hashchain = vp->v_specnext;
				} else {
					for (vq = *vp->v_hashchain; vq;
					    vq = vq->v_specnext) {
						if (vq->v_specnext != vp)
							continue;
						vq->v_specnext = vp->v_specnext;
						break;
					}
					if (vq == NULL)
						panic("missing bdev");
				}
				if (vp->v_iflag & VI_ALIASED) {
					vx = NULL;
					for (vq = *vp->v_hashchain; vq;
					    vq = vq->v_specnext) {
						if (vq->v_rdev != vp->v_rdev ||
						    vq->v_type != vp->v_type)
							continue;
						if (vx)
							break;
						vx = vq;
					}
					if (vx == NULL)
						panic("missing alias");
					if (vq == NULL)
						vx->v_iflag &= ~VI_ALIASED;
					vp->v_iflag &= ~VI_ALIASED;
				}
			}
			simple_unlock(&spechash_slock);
			FREE(vp->v_specinfo, M_VNODE);
			vp->v_specinfo = NULL;
		}
	}

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		VOP_INACTIVE(vp, l);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VI_XLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, l))
		panic("vclean: cannot reclaim, vp %p", vp);
	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
		 */
		simple_lock(&vp->v_interlock);
		if (--vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			/*
			 * Insert at tail of LRU list.
			 */

			simple_unlock(&vp->v_interlock);
			simple_lock(&vnode_free_list_slock);
#ifdef DIAGNOSTIC
			if (vp->v_holdcnt > 0)
				panic("vclean: not clean, vp %p", vp);
#endif
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			simple_unlock(&vnode_free_list_slock);
		} else
			simple_unlock(&vp->v_interlock);
	}

	KASSERT(vp->v_uobj.uo_npages == 0);
	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}
	cache_purge(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	vp->v_vnlock = NULL;
	simple_lock(&vp->v_interlock);
	VN_KNOTE(vp, NOTE_REVOKE);	/* FreeBSD has this in vn_pollgone() */
	vp->v_iflag &= ~VI_XLOCK;
	vp->v_vflag &= ~VV_LOCKSWORK;
	if (vp->v_iflag & VI_XWANT) {
		vp->v_iflag &= ~VI_XWANT;
		simple_unlock(&vp->v_interlock);
		wakeup((void *)vp);
	} else
		simple_unlock(&vp->v_interlock);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(struct vnode *vp, struct simplelock *inter_lkp, struct lwp *l)
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp)
			simple_unlock(inter_lkp);
		vgonel(vp, l);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(struct vnode *vp)
{
	struct lwp *l = curlwp;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, l);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(struct vnode *vp, struct lwp *l)
{

	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */

	if (vp->v_iflag & VI_XLOCK) {
		vp->v_iflag |= VI_XWANT;
		ltsleep(vp, PINOD | PNORELOCK, "vgone", 0, &vp->v_interlock);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */

	vclean(vp, DOCLOSE, l);
	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */

	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);

	/*
	 * The test of the back pointer and the reference count of
	 * zero is because it will be removed from the free list by
	 * getcleanvnode, but will not have its reference count
	 * incremented until after calling vgone. If the reference
	 * count were incremented first, vgone would (incorrectly)
	 * try to close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from the freelist to ensure
	 * that we do not try to move it here.
	 */

	vp->v_type = VBAD;
	if (vp->v_usecount == 0) {
		bool dofree;

		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			panic("vgonel: not clean, vp %p", vp);
		/*
		 * if it isn't on the freelist, we're called by getcleanvnode
		 * and vnode is being re-used.  otherwise, we'll free it.
		 */
		dofree = vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb;
		if (dofree) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			numvnodes--;
		}
		simple_unlock(&vnode_free_list_slock);
		if (dofree)
			pool_put(&vnode_pool, vp);
	}
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev_t dev, enum vtype type, struct vnode **vpp)
{
	struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Revoke all the vnodes corresponding to the specified minor number
 * range (endpoints inclusive) of the specified major.
 */
void
vdevgone(int maj, int minl, int minh, enum vtype type)
{
	struct vnode *vp;
	int mn;

	vp = NULL;	/* XXX gcc */

	for (mn = minl; mn <= minh; mn++)
		if (vfinddev(makedev(maj, mn), type, &vp))
			VOP_REVOKE(vp, REVOKEALL);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(struct vnode *vp)
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_iflag & VI_ALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp &&
		    (vq->v_iflag & VI_XLOCK) == 0) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}


/*
 * sysctl helper routine to return list of supported fstypes
 */
static int
sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
{
	char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
	char *where = oldp;
	struct vfsops *v;
	size_t needed, left, slen;
	int error, first;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	first = 1;
	error = 0;
	needed = 0;
	left = *oldlenp;

	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (where == NULL)
			needed += strlen(v->vfs_name) + 1;
		else {
			memset(bf, 0, sizeof(bf));
			if (first) {
				strncpy(bf, v->vfs_name, sizeof(bf));
				first = 0;
			} else {
				bf[0] = ' ';
				strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
			}
			bf[sizeof(bf)-1] = '\0';
			slen = strlen(bf);
			if (left < slen + 1)
				break;
			/* +1 to copy out the trailing NUL byte */
			v->vfs_refcount++;
			mutex_exit(&vfs_list_lock);
			error = copyout(bf, where, slen + 1);
			mutex_enter(&vfs_list_lock);
			v->vfs_refcount--;
			if (error)
				break;
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	mutex_exit(&vfs_list_lock);
	*oldlenp = needed;
	return (error);
}

/*
 * Top level filesystem related information gathering.
 */
SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup")
{
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "vfs", NULL,
		       NULL, 0, NULL, 0,
		       CTL_VFS, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "generic",
		       SYSCTL_DESCR("Non-specific vfs related information"),
		       NULL, 0, NULL, 0,
		       CTL_VFS, VFS_GENERIC, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "usermount",
		       SYSCTL_DESCR("Whether unprivileged users may mount "
			   "filesystems"),
		       NULL, 0, &dovfsusermount, 0,
		       CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRING, "fstypes",
		       SYSCTL_DESCR("List of file systems present"),
		       sysctl_vfs_generic_fstypes, 0, NULL, 0,
		       CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "magiclinks",
		       SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"),
		       NULL, 0, &vfs_magiclinks, 0,
		       CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL);
}


int kinfo_vdebug = 1;
int kinfo_vgetfailed;
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
int
sysctl_kern_vnode(SYSCTLFN_ARGS)
{
	char *where = oldp;
	size_t *sizep = oldlenp;
	struct mount *mp, *nmp;
	struct vnode *vp;
	char *bp = where, *savebp;
	char *ewhere;
	int error;

	if (namelen != 0)
		return (EOPNOTSUPP);
	if (newp != NULL)
		return (EPERM);

#define VPTRSZ	sizeof(struct vnode *)
#define VNODESZ	sizeof(struct vnode)
	if (where == NULL) {
		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
		return (0);
	}
	ewhere = where + *sizep;

	mutex_enter(&mountlist_lock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	    mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_lock)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		savebp = bp;
again:
		simple_lock(&mntvnode_slock);
		TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				if (kinfo_vdebug)
					printf("kinfo: vp changed\n");
				bp = savebp;
				goto again;
			}
			if (bp + VPTRSZ + VNODESZ > ewhere) {
				simple_unlock(&mntvnode_slock);
				*sizep = bp - where;
				return (ENOMEM);
			}
			simple_unlock(&mntvnode_slock);
			if ((error = copyout((void *)&vp, bp, VPTRSZ)) ||
			    (error = copyout((void *)vp, bp + VPTRSZ, VNODESZ)))
				return (error);
			bp += VPTRSZ + VNODESZ;
			simple_lock(&mntvnode_slock);
		}
		simple_unlock(&mntvnode_slock);
		mutex_enter(&mountlist_lock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	mutex_exit(&mountlist_lock);

	*sizep = bp - where;
	return (0);
}

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(struct vnode *vp)
{
	struct vnode *vq;
	int error = 0;

	if (vp->v_type != VBLK)
		return ENOTBLK;
	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	if (vp->v_iflag & VI_ALIASED) {
		simple_lock(&spechash_slock);
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type)
				continue;
			if (vq->v_specmountpoint != NULL) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	return (error);
}

/*
 * Unmount all file systems.
 * We traverse the list in reverse order under the assumption that doing so
 * will avoid needing to worry about dependencies.
 */
void
vfs_unmountall(struct lwp *l)
{
	struct mount *mp, *nmp;
	int allerror, error;

	printf("unmounting file systems...");
	for (allerror = 0,
	    mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;
#ifdef DEBUG
		printf("\nunmounting %s (%s)...",
		    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
#endif
		/*
		 * XXX Freeze syncer.  Must do this before locking the
		 * mount point.  See dounmount() for details.
		 */
		mutex_enter(&syncer_mutex);
		if (vfs_busy(mp, 0, 0)) {
			mutex_exit(&syncer_mutex);
			continue;
		}
		if ((error = dounmount(mp, MNT_FORCE, l)) != 0) {
			printf("unmount of %s failed with error %d\n",
			    mp->mnt_stat.f_mntonname, error);
			allerror = 1;
		}
	}
	printf(" done\n");
	if (allerror)
		printf("WARNING: some file systems would not unmount\n");
}

extern struct simplelock bqueue_slock; /* XXX */

/*
 * Sync and unmount file systems before shutting down.
 */
void
vfs_shutdown(void)
{
	struct lwp *l;

	/* XXX we're certainly not running in lwp0's context! */
	l = curlwp;
	if (l == NULL)
		l = &lwp0;

	printf("syncing disks... ");

	/* remove user processes from run queue */
	suspendsched();
	(void) spl0();

	/* avoid coming back this way again if we panic. */
	doing_shutdown = 1;

	sys_sync(l, NULL, NULL);

	/* Wait for sync to finish. */
	if (buf_syncwait() != 0) {
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
		Debugger();
#endif
		printf("giving up\n");
		return;
	} else
		printf("done\n");

	/*
	 * If we've panic'd, don't make the situation potentially
	 * worse by unmounting the file systems.
	 */
	if (panicstr != NULL)
		return;

	/* Release inodes held by texts before update. */
#ifdef notdef
	vnshutdown();
#endif
	/* Unmount file systems. */
	vfs_unmountall(l);
}

/*
 * Mount the root file system.  If the operator didn't specify a
 * file system to use, try all possible file systems until one
 * succeeds.
 */
int
vfs_mountroot(void)
{
	struct vfsops *v;
	int error = ENODEV;

	if (root_device == NULL)
		panic("vfs_mountroot: root device unknown");

	switch (device_class(root_device)) {
	case DV_IFNET:
		if (rootdev != NODEV)
			panic("vfs_mountroot: rootdev set for DV_IFNET "
			    "(0x%08x -> %d,%d)", rootdev,
			    major(rootdev), minor(rootdev));
		break;

	case DV_DISK:
		if (rootdev == NODEV)
			panic("vfs_mountroot: rootdev not set for DV_DISK");
		if (bdevvp(rootdev, &rootvp))
			panic("vfs_mountroot: can't get vnode for rootdev");
		error = VOP_OPEN(rootvp, FREAD, FSCRED, curlwp);
		if (error) {
			printf("vfs_mountroot: can't open root device\n");
			return (error);
		}
		break;

	default:
		printf("%s: inappropriate for root file system\n",
		    root_device->dv_xname);
		return (ENODEV);
	}

	/*
	 * If user specified a file system, use it.
	 */
	if (mountroot != NULL) {
		error = (*mountroot)();
		goto done;
	}

	/*
	 * Try each file system currently configured into the kernel.
	 */
	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (v->vfs_mountroot == NULL)
			continue;
#ifdef DEBUG
		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
#endif
		v->vfs_refcount++;
		mutex_exit(&vfs_list_lock);
		error = (*v->vfs_mountroot)();
		mutex_enter(&vfs_list_lock);
		v->vfs_refcount--;
		if (!error) {
			aprint_normal("root file system type: %s\n",
			    v->vfs_name);
			break;
		}
	}
	mutex_exit(&vfs_list_lock);

	if (v == NULL) {
		printf("no file system for %s", root_device->dv_xname);
		if (device_class(root_device) == DV_DISK)
			printf(" (dev 0x%x)", rootdev);
		printf("\n");
		error = EFTYPE;
	}

done:
	if (error && device_class(root_device) == DV_DISK) {
		VOP_CLOSE(rootvp, FREAD, FSCRED, curlwp);
		vrele(rootvp);
	}
	return (error);
}