/*	$NetBSD: vfs_subr.c,v 1.308 2007/12/01 10:36:47 yamt Exp $	*/

/*-
 * Copyright (c) 1997, 1998, 2004, 2005, 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * External virtual filesystem routines.
 *
 * This file contains vfs subroutines which are heavily dependent on
 * the kernel and are not suitable for standalone use.  Examples include
 * routines involved in vnode and mount point management.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.308 2007/12/01 10:36:47 yamt Exp $");

#include "opt_inet.h"
#include "opt_ddb.h"
#include "opt_compat_netbsd.h"
#include "opt_compat_43.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/syscallargs.h>
#include <sys/device.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/atomic.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/syncfs/syncfs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_ddb.h>

#include <sys/sysctl.h>

extern int dovfsusermount;	/* 1 => permit any user to mount filesystems */
extern int vfs_magiclinks;	/* 1 => expand "magic" symlinks */

/* TAILQ_HEAD(freelst, vnode) vnode_free_list = vnode free list (in vnode.h) */
struct freelst vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
struct freelst vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);

struct simplelock vnode_free_list_slock = SIMPLELOCK_INITIALIZER;

POOL_INIT(vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodepl",
    &pool_allocator_nointr, IPL_NONE);

MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");

/*
 * Local declarations.
 */

static void insmntque(struct vnode *, struct mount *);
static int getdevvp(dev_t, struct vnode **, enum vtype);
static void vclean(struct vnode *, int, struct lwp *);
static struct vnode *getcleanvnode(struct lwp *);

int
vfs_drainvnodes(long target, struct lwp *l)
{

	simple_lock(&vnode_free_list_slock);
	while (numvnodes > target) {
		struct vnode *vp;

		vp = getcleanvnode(l);
		if (vp == NULL)
			return EBUSY; /* give up */
		pool_put(&vnode_pool, vp);
		simple_lock(&vnode_free_list_slock);
		numvnodes--;
	}
	simple_unlock(&vnode_free_list_slock);

	return 0;
}

/*
 * Grab a vnode from the free list and clean it.
 */
struct vnode *
getcleanvnode(struct lwp *l)
{
	struct vnode *vp;
	struct freelst *listhd;

	LOCK_ASSERT(simple_lock_held(&vnode_free_list_slock));

	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		if (!simple_lock_try(&vp->v_interlock))
			continue;
		/*
		 * as our lwp might hold the underlying vnode locked,
		 * don't try to reclaim the VLAYER vnode if it's locked.
		 */
		if ((vp->v_iflag & VI_XLOCK) == 0 &&
		    ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
			break;
		}
		simple_unlock(&vp->v_interlock);
	}

	if (vp == NULLVP) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		simple_unlock(&vnode_free_list_slock);
		return NULLVP;
	}

	if (vp->v_usecount)
		panic("free vnode isn't, vp %p", vp);
	TAILQ_REMOVE(listhd, vp, v_freelist);
	/* see comment on why 0xdeadb is set at end of vgone (below) */
	vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
	simple_unlock(&vnode_free_list_slock);

	if (vp->v_type != VBAD)
		vgonel(vp, l);
	else
		simple_unlock(&vp->v_interlock);
#ifdef DIAGNOSTIC
	if (vp->v_data || vp->v_uobj.uo_npages ||
	    TAILQ_FIRST(&vp->v_uobj.memq))
		panic("cleaned vnode isn't, vp %p", vp);
	if (vp->v_numoutput)
		panic("clean vnode has pending I/O's, vp %p", vp);
#endif
	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);

	return vp;
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Interlock is not released on failure.
 */
int
vfs_busy(struct mount *mp, int flags, kmutex_t *interlkp)
{
	int lkflags;

	while (mp->mnt_iflag & IMNT_UNMOUNT) {
		int gone, n;

		if (flags & LK_NOWAIT)
			return (ENOENT);
		if ((flags & LK_RECURSEFAIL) && mp->mnt_unmounter != NULL
		    && mp->mnt_unmounter == curlwp)
			return (EDEADLK);
		if (interlkp)
			mutex_exit(interlkp);
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		simple_lock(&mp->mnt_slock);
		mp->mnt_wcnt++;
		ltsleep((void *)mp, PVFS, "vfs_busy", 0, &mp->mnt_slock);
		n = --mp->mnt_wcnt;
		simple_unlock(&mp->mnt_slock);
		gone = mp->mnt_iflag & IMNT_GONE;

		if (n == 0)
			wakeup(&mp->mnt_wcnt);
		if (interlkp)
			mutex_enter(interlkp);
		if (gone)
			return (ENOENT);
	}
	lkflags = LK_SHARED;
	if (interlkp) {
		/* lkflags |= LK_INTERLOCK; XXX */
		mutex_exit(interlkp);	/* XXX */
	}
	if (lockmgr(&mp->mnt_lock, lkflags, NULL))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp)
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
}

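/*
 * Illustrative sketch (not a required calling pattern): callers normally
 * bracket work on a mount point with vfs_busy()/vfs_unbusy(), e.g.
 *
 *	if (vfs_busy(mp, LK_NOWAIT, NULL) == 0) {
 *		... operate on mp ...
 *		vfs_unbusy(mp);
 *	}
 *
 * sysctl_kern_vnode() below is a real caller that additionally passes
 * the mountlist lock as the interlock argument.
 */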

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(const char *fstypename, const char *devname,
    struct mount **mpp)
{
	struct vfsops *vfsp = NULL;
	struct mount *mp;

	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
		if (!strncmp(vfsp->vfs_name, fstypename,
		    sizeof(mp->mnt_stat.f_fstypename)))
			break;

	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	memset((char *)mp, 0, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
	simple_lock_init(&mp->mnt_slock);
	(void)vfs_busy(mp, LK_NOWAIT, 0);
	TAILQ_INIT(&mp->mnt_vnodelist);
	mp->mnt_op = vfsp;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfs_refcount++;
	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
	    sizeof(mp->mnt_stat.f_fstypename));
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = '\0';
	mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
	    '\0';
	(void)copystr(devname, mp->mnt_stat.f_mntfromname,
	    sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
	mount_initspecific(mp);
	*mpp = mp;
	return (0);
}


/*
 * Routines having to do with the management of the vnode table.
 */
extern int (**dead_vnodeop_p)(void *);

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    struct vnode **vpp)
{
	struct uvm_object *uobj;
	struct lwp *l = curlwp;		/* XXX */
	static int toggle;
	struct vnode *vp;
	int error = 0, tryalloc;

 try_again:
	if (mp) {
		/*
		 * Mark filesystem busy while we're creating a vnode.
		 * If unmount is in progress, this will wait; if the
		 * unmount succeeds (only if umount -f), this will
		 * return an error.  If the unmount fails, we'll keep
		 * going afterwards.
		 * (This puts the per-mount vnode list logically under
		 * the protection of the vfs_busy lock).
		 */
		error = vfs_busy(mp, LK_RECURSEFAIL, 0);
		if (error && error != EDEADLK)
			return error;
	}

	/*
	 * We must choose whether to allocate a new vnode or recycle an
	 * existing one. The criterion for allocating a new one is that
	 * the total number of vnodes is less than the number desired or
	 * there are no vnodes on either free list. Generally we only
	 * want to recycle vnodes that have no buffers associated with
	 * them, so we look first on the vnode_free_list. If it is empty,
	 * we next consider vnodes with referencing buffers on the
	 * vnode_hold_list. The toggle ensures that half the time we
	 * will use a vnode from the vnode_hold_list, and half the time
	 * we will allocate a new one unless the list has grown to twice
	 * the desired size. We are reluctant to recycle vnodes from the
	 * vnode_hold_list because we will lose the identity of all its
	 * referencing buffers.
	 */

	vp = NULL;

	simple_lock(&vnode_free_list_slock);

	toggle ^= 1;
	if (numvnodes > 2 * desiredvnodes)
		toggle = 0;

	tryalloc = numvnodes < desiredvnodes ||
	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
	     (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));

	if (tryalloc &&
	    (vp = pool_get(&vnode_pool, PR_NOWAIT)) != NULL) {
		numvnodes++;
		simple_unlock(&vnode_free_list_slock);
		memset(vp, 0, sizeof(*vp));
		UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 1);
		/*
		 * done by memset() above.
		 *	LIST_INIT(&vp->v_nclist);
		 *	LIST_INIT(&vp->v_dnclist);
		 */
	} else {
		vp = getcleanvnode(l);
		/*
		 * Unless this is a bad time of the month, at most
		 * the first NCPUS items on the free list are
		 * locked, so this is close enough to being empty.
		 */
		if (vp == NULLVP) {
			if (mp && error != EDEADLK)
				vfs_unbusy(mp);
			if (tryalloc) {
				printf("WARNING: unable to allocate new "
				    "vnode, retrying...\n");
				(void) tsleep(&lbolt, PRIBIO, "newvn", hz);
				goto try_again;
			}
			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
			*vpp = 0;
			return (ENFILE);
		}
		vp->v_usecount = 1;
		vp->v_iflag = 0;
		vp->v_vflag = 0;
		vp->v_uflag = 0;
		vp->v_socket = NULL;
	}
	vp->v_type = VNON;
	vp->v_vnlock = &vp->v_lock;
	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_data = 0;
	simple_lock_init(&vp->v_interlock);

	/*
	 * initialize uvm_object within vnode.
	 */

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	if (mp && error != EDEADLK)
		vfs_unbusy(mp);
	return (0);
}

/*
 * This is really just the reverse of getnewvnode(). Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(struct vnode *vp)
{
#ifdef DIAGNOSTIC
	if (vp->v_usecount != 1)
		panic("ungetnewvnode: busy vnode");
#endif
	vp->v_usecount--;
	insmntque(vp, NULL);
	vp->v_type = VBAD;

	simple_lock(&vp->v_interlock);
	/*
	 * Insert at head of LRU list
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_HEAD(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	simple_unlock(&vp->v_interlock);
}

/*
 * Move a vnode from one mount queue to another.
 */
static void
insmntque(struct vnode *vp, struct mount *mp)
{

#ifdef DIAGNOSTIC
	if ((mp != NULL) &&
	    (mp->mnt_iflag & IMNT_UNMOUNT) &&
	    !(mp->mnt_flag & MNT_SOFTDEP) &&
	    vp->v_tag != VT_VFS) {
		panic("insmntque into dying filesystem");
	}
#endif

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) != NULL)
		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Create a vnode for a block device.
 * Used for root filesystem and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev_t dev, struct vnode **vpp)
{

	return (getdevvp(dev, vpp, VBLK));
}

/*
 * Create a vnode for a character device.
 * Used for kernfs and some console handling.
 */
int
cdevvp(dev_t dev, struct vnode **vpp)
{

	return (getdevvp(dev, vpp, VCHR));
}

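/*
 * For example, vfs_mountroot() below uses bdevvp() to obtain rootvp for
 * the root device before calling VOP_OPEN() on it; swap configuration
 * is expected to obtain its device vnodes the same way.
 */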

/*
 * Create a vnode for a device.
 * Used by bdevvp (block device) for root file system etc.,
 * and by cdevvp (character device) for console and kernfs.
 */
static int
getdevvp(dev_t dev, struct vnode **vpp, enum vtype type)
{
	struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULL;
		return (0);
	}
	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULL;
		return (error);
	}
	vp = nvp;
	vp->v_type = type;
	uvm_vnp_setsize(vp, 0);
	if ((nvp = checkalias(vp, dev, NULL)) != 0) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(struct vnode *nvp, dev_t nvp_rdev, struct mount *mp)
{
	struct lwp *l = curlwp;		/* XXX */
	struct vnode *vp;
	struct vnode **vpp;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		simple_lock(&vp->v_interlock);
		simple_unlock(&spechash_slock);
		if (vp->v_usecount == 0) {
			vgonel(vp, l);
			goto loop;
		}
		/*
		 * What we're interested in knowing here is whether someone
		 * else has removed this vnode from the device hash list
		 * while we were waiting.  This can only happen if vclean()
		 * did it, and this requires the vnode to be locked.
		 */
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK))
			goto loop;
		if (vp->v_specinfo == NULL) {
			vput(vp);
			goto loop;
		}
		simple_lock(&spechash_slock);
		break;
	}
	if (vp == NULL || vp->v_tag != VT_NON || vp->v_type != VBLK) {
		MALLOC(nvp->v_specinfo, struct specinfo *,
		    sizeof(struct specinfo), M_VNODE, M_NOWAIT);
		/* XXX Erg. */
		if (nvp->v_specinfo == NULL) {
			simple_unlock(&spechash_slock);
			uvm_wait("checkalias");
			goto loop;
		}

		nvp->v_rdev = nvp_rdev;
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		nvp->v_specmountpoint = NULL;
		simple_unlock(&spechash_slock);
		nvp->v_speclockf = NULL;

		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_iflag |= VI_ALIASED;
			vp->v_iflag |= VI_ALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, l);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	vp->v_vnlock = &vp->v_lock;
	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}

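/*
 * Note on checkalias(): it returns NULLVP when the caller's new vnode has
 * become the canonical vnode for the device (the caller keeps using it),
 * and it returns the pre-existing vnode when an existing VT_NON block
 * device alias was found; that vnode is cleaned and handed back for reuse.
 */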

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. If the vnode lock bit is set the
 * vnode is being eliminated in vgone. In that case, we can not
 * grab the vnode, so the process is awakened when the transition is
 * completed, and an error returned to indicate that the vnode is no
 * longer usable (possibly having been changed to a new file system type).
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking that
	 * the VI_XLOCK flag is set.
	 */

	if ((flags & LK_INTERLOCK) == 0)
		simple_lock(&vp->v_interlock);
	if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) {
		if (flags & LK_NOWAIT) {
			simple_unlock(&vp->v_interlock);
			return EBUSY;
		}
		vp->v_iflag |= VI_XWANT;
		ltsleep(vp, PINOD|PNORELOCK, "vget", 0, &vp->v_interlock);
		return (ENOENT);
	}
	if (vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		else
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_usecount++;
#ifdef DIAGNOSTIC
	if (vp->v_usecount == 0) {
		vprint("vget", vp);
		panic("vget: usecount overflow, vp %p", vp);
	}
#endif
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK))) {
			vrele(vp);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * vput(), just unlock and vrele()
 */
void
vput(struct vnode *vp)
{

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vput: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		VOP_UNLOCK(vp, 0);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vput: bad ref count", vp);
		panic("vput: ref cnt");
	}
#endif
	/*
	 * Insert at tail of LRU list.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	if (vp->v_iflag & VI_EXECMAP) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP|VI_MAPPED);
	vp->v_vflag &= ~VV_MAPPED;
	simple_unlock(&vp->v_interlock);
	VOP_INACTIVE(vp);
}

/*
 * Vnode release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
static void
do_vrele(struct vnode *vp, int doinactive, int onhead)
{

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vrele: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vrele: bad ref count", vp);
		panic("vrele: ref cnt vp %p", vp);
	}
#endif
	/*
	 * Insert at tail of LRU list.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0) {
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	} else {
		if (onhead)
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		else
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	}
	simple_unlock(&vnode_free_list_slock);
	if (vp->v_iflag & VI_EXECMAP) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP|VI_MAPPED);
	vp->v_vflag &= ~VV_MAPPED;

	if (doinactive) {
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0)
			VOP_INACTIVE(vp);
	} else {
		simple_unlock(&vp->v_interlock);
	}
}

void
vrele(struct vnode *vp)
{

	do_vrele(vp, 1, 0);
}

void
vrele2(struct vnode *vp, int onhead)
{

	do_vrele(vp, 0, onhead);
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
 */
void
vholdl(struct vnode *vp)
{

	/*
	 * If it is on the freelist and the hold count is currently
	 * zero, move it to the hold list. The test of the back
	 * pointer and the use reference count of zero is because
	 * it will be removed from a free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from a freelist to ensure
	 * that we do not try to move it here.
	 */
	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_holdcnt++;
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(struct vnode *vp)
{

	if (vp->v_holdcnt <= 0)
		panic("holdrelel: holdcnt vp %p", vp);
	vp->v_holdcnt--;

	/*
	 * If it is on the holdlist and the hold count drops to
	 * zero, move it to the free list. The test of the back
	 * pointer and the use reference count of zero is because
	 * it will be removed from a free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from a freelist to ensure
	 * that we do not try to move it here.
	 */

	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
}

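/*
 * vholdl()/holdrelel() are typically used by the buffer cache and pager
 * code (e.g. when buffers are attached to or detached from a vnode) so
 * that a vnode with such references sits on the hold list rather than
 * the free list.
 */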

/*
 * Vnode reference.
 */
void
vref(struct vnode *vp)
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount <= 0)
		panic("vref used where vget required, vp %p", vp);
	vp->v_usecount++;
#ifdef DIAGNOSTIC
	if (vp->v_usecount == 0) {
		vprint("vref", vp);
		panic("vref: usecount overflow, vp %p", vp);
	}
#endif
	simple_unlock(&vp->v_interlock);
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
 */
#ifdef DEBUG
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

int
vflush(struct mount *mp, struct vnode *skipvp, int flags)
{
	struct lwp *l = curlwp;		/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	/*
	 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
	 * and vclean() are called
	 */
	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		if (vp->v_mount != mp)
			goto loop;
		nvp = TAILQ_NEXT(vp, v_mntvnodes);
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;
		simple_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VV_SYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * With v_usecount == 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, l);
			simple_lock(&mntvnode_slock);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device. For all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, l);
			} else {
				vclean(vp, 0, l);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *)0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DEBUG
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

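/*
 * Note: vflush() only counts the vnodes it could not flush; with the
 * (DEBUG-only) busyprt tunable set, each busy vnode is also printed via
 * vprint() before EBUSY is returned.
 */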

/*
 * Disassociate the underlying file system from a vnode.
 */
static void
vclean(struct vnode *vp, int flags, struct lwp *l)
{
	int active;

	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));

	/*
	 * Check to see if the vnode is in use.
	 * If so we have to reference it before we clean it out
	 * so that its count cannot fall to zero and generate a
	 * race against ourselves to recycle it.
	 */

	if ((active = vp->v_usecount) != 0) {
		vp->v_usecount++;
#ifdef DIAGNOSTIC
		if (vp->v_usecount == 0) {
			vprint("vclean", vp);
			panic("vclean: usecount overflow");
		}
#endif
	}

	/*
	 * Prevent the vnode from being recycled or
	 * brought into use while we clean it out.
	 */
	if (vp->v_iflag & VI_XLOCK)
		panic("vclean: deadlock, vp %p", vp);
	vp->v_iflag |= VI_XLOCK;
	if (vp->v_iflag & VI_EXECMAP) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);

	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. For
	 * active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 *
	 * We drain the lock to make sure we are the last one trying to
	 * get it and immediately resurrect the lock. Future accesses
	 * for locking this _vnode_ will be protected by VI_XLOCK. However,
	 * upper layers might be using the _lock_ in case the file system
	 * exported it and might access it while the vnode lingers in
	 * deadfs.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_RESURRECT | LK_INTERLOCK);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If special device, remove it from the special device alias list,
	 * if it is on one.
	 */
	if (flags & DOCLOSE) {
		int error;
		struct vnode *vq, *vx;

		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
		if (error)
			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
		KASSERT(error == 0);
		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);

		if (active)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED);

		if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
		    vp->v_specinfo != 0) {
			simple_lock(&spechash_slock);
			if (vp->v_hashchain != NULL) {
				if (*vp->v_hashchain == vp) {
					*vp->v_hashchain = vp->v_specnext;
				} else {
					for (vq = *vp->v_hashchain; vq;
					    vq = vq->v_specnext) {
						if (vq->v_specnext != vp)
							continue;
						vq->v_specnext = vp->v_specnext;
						break;
					}
					if (vq == NULL)
						panic("missing bdev");
				}
				if (vp->v_iflag & VI_ALIASED) {
					vx = NULL;
					for (vq = *vp->v_hashchain; vq;
					    vq = vq->v_specnext) {
						if (vq->v_rdev != vp->v_rdev ||
						    vq->v_type != vp->v_type)
							continue;
						if (vx)
							break;
						vx = vq;
					}
					if (vx == NULL)
						panic("missing alias");
					if (vq == NULL)
						vx->v_iflag &= ~VI_ALIASED;
					vp->v_iflag &= ~VI_ALIASED;
				}
			}
			simple_unlock(&spechash_slock);
			FREE(vp->v_specinfo, M_VNODE);
			vp->v_specinfo = NULL;
		}
	}

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		VOP_INACTIVE(vp);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VI_XLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp))
		panic("vclean: cannot reclaim, vp %p", vp);
	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
		 */
		simple_lock(&vp->v_interlock);
		if (--vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			/*
			 * Insert at tail of LRU list.
			 */

			simple_unlock(&vp->v_interlock);
			simple_lock(&vnode_free_list_slock);
#ifdef DIAGNOSTIC
			if (vp->v_holdcnt > 0)
				panic("vclean: not clean, vp %p", vp);
#endif
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			simple_unlock(&vnode_free_list_slock);
		} else
			simple_unlock(&vp->v_interlock);
	}

	KASSERT(vp->v_uobj.uo_npages == 0);
	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}
	cache_purge(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	vp->v_vnlock = NULL;
	simple_lock(&vp->v_interlock);
	VN_KNOTE(vp, NOTE_REVOKE);	/* FreeBSD has this in vn_pollgone() */
	vp->v_iflag &= ~VI_XLOCK;
	vp->v_vflag &= ~VV_LOCKSWORK;
	if (vp->v_iflag & VI_XWANT) {
		vp->v_iflag &= ~VI_XWANT;
		simple_unlock(&vp->v_interlock);
		wakeup((void *)vp);
	} else
		simple_unlock(&vp->v_interlock);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(struct vnode *vp, struct simplelock *inter_lkp, struct lwp *l)
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp)
			simple_unlock(inter_lkp);
		vgonel(vp, l);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(struct vnode *vp)
{
	struct lwp *l = curlwp;		/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, l);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(struct vnode *vp, struct lwp *l)
{

	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */

	if (vp->v_iflag & VI_XLOCK) {
		vp->v_iflag |= VI_XWANT;
		ltsleep(vp, PINOD | PNORELOCK, "vgone", 0, &vp->v_interlock);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */

	vclean(vp, DOCLOSE, l);
	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */

	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);

	/*
	 * The test of the back pointer and the reference count of
	 * zero is because it will be removed from the free list by
	 * getcleanvnode, but will not have its reference count
	 * incremented until after calling vgone. If the reference
	 * count were incremented first, vgone would (incorrectly)
	 * try to close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from the freelist to ensure
	 * that we do not try to move it here.
	 */

	vp->v_type = VBAD;
	if (vp->v_usecount == 0) {
		bool dofree;

		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			panic("vgonel: not clean, vp %p", vp);
		/*
		 * if it isn't on the freelist, we're called by getcleanvnode
		 * and vnode is being re-used.  otherwise, we'll free it.
		 */
		dofree = vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb;
		if (dofree) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			numvnodes--;
		}
		simple_unlock(&vnode_free_list_slock);
		if (dofree)
			pool_put(&vnode_pool, vp);
	}
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev_t dev, enum vtype type, struct vnode **vpp)
{
	struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Revoke all the vnodes corresponding to the specified minor number
 * range (endpoints inclusive) of the specified major.
 */
void
vdevgone(int maj, int minl, int minh, enum vtype type)
{
	struct vnode *vp;
	int mn;

	vp = NULL;	/* XXX gcc */

	for (mn = minl; mn <= minh; mn++)
		if (vfinddev(makedev(maj, mn), type, &vp))
			VOP_REVOKE(vp, REVOKEALL);
}

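/*
 * vdevgone() is typically called from device detach paths so that any
 * vnodes still referencing the departing device are revoked before the
 * driver goes away.
 */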

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(struct vnode *vp)
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_iflag & VI_ALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp &&
		    (vq->v_iflag & VI_XLOCK) == 0) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}


/*
 * sysctl helper routine to return list of supported fstypes
 */
static int
sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
{
	char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
	char *where = oldp;
	struct vfsops *v;
	size_t needed, left, slen;
	int error, first;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	first = 1;
	error = 0;
	needed = 0;
	left = *oldlenp;

	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (where == NULL)
			needed += strlen(v->vfs_name) + 1;
		else {
			memset(bf, 0, sizeof(bf));
			if (first) {
				strncpy(bf, v->vfs_name, sizeof(bf));
				first = 0;
			} else {
				bf[0] = ' ';
				strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
			}
			bf[sizeof(bf)-1] = '\0';
			slen = strlen(bf);
			if (left < slen + 1)
				break;
			/* +1 to copy out the trailing NUL byte */
			v->vfs_refcount++;
			mutex_exit(&vfs_list_lock);
			error = copyout(bf, where, slen + 1);
			mutex_enter(&vfs_list_lock);
			v->vfs_refcount--;
			if (error)
				break;
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	mutex_exit(&vfs_list_lock);
	*oldlenp = needed;
	return (error);
}

/*
 * Top level filesystem related information gathering.
 */
SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup")
{
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "vfs", NULL,
		       NULL, 0, NULL, 0,
		       CTL_VFS, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "generic",
		       SYSCTL_DESCR("Non-specific vfs related information"),
		       NULL, 0, NULL, 0,
		       CTL_VFS, VFS_GENERIC, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "usermount",
		       SYSCTL_DESCR("Whether unprivileged users may mount "
				    "filesystems"),
		       NULL, 0, &dovfsusermount, 0,
		       CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRING, "fstypes",
		       SYSCTL_DESCR("List of file systems present"),
		       sysctl_vfs_generic_fstypes, 0, NULL, 0,
		       CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "magiclinks",
		       SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"),
		       NULL, 0, &vfs_magiclinks, 0,
		       CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL);
}

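/*
 * The nodes created above are reachable from userland as
 * vfs.generic.usermount, vfs.generic.fstypes and vfs.generic.magiclinks
 * via sysctl(3)/sysctl(8).
 */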

int kinfo_vdebug = 1;
int kinfo_vgetfailed;
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
int
sysctl_kern_vnode(SYSCTLFN_ARGS)
{
	char *where = oldp;
	size_t *sizep = oldlenp;
	struct mount *mp, *nmp;
	struct vnode *vp;
	char *bp = where, *savebp;
	char *ewhere;
	int error;

	if (namelen != 0)
		return (EOPNOTSUPP);
	if (newp != NULL)
		return (EPERM);

#define VPTRSZ	sizeof(struct vnode *)
#define VNODESZ	sizeof(struct vnode)
	if (where == NULL) {
		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
		return (0);
	}
	ewhere = where + *sizep;

	mutex_enter(&mountlist_lock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	     mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_lock)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		savebp = bp;
again:
		simple_lock(&mntvnode_slock);
		TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				if (kinfo_vdebug)
					printf("kinfo: vp changed\n");
				bp = savebp;
				goto again;
			}
			if (bp + VPTRSZ + VNODESZ > ewhere) {
				simple_unlock(&mntvnode_slock);
				*sizep = bp - where;
				return (ENOMEM);
			}
			simple_unlock(&mntvnode_slock);
			if ((error = copyout((void *)&vp, bp, VPTRSZ)) ||
			    (error = copyout((void *)vp, bp + VPTRSZ, VNODESZ)))
				return (error);
			bp += VPTRSZ + VNODESZ;
			simple_lock(&mntvnode_slock);
		}
		simple_unlock(&mntvnode_slock);
		mutex_enter(&mountlist_lock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	mutex_exit(&mountlist_lock);

	*sizep = bp - where;
	return (0);
}

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(struct vnode *vp)
{
	struct vnode *vq;
	int error = 0;

	if (vp->v_type != VBLK)
		return ENOTBLK;
	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	if (vp->v_iflag & VI_ALIASED) {
		simple_lock(&spechash_slock);
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type)
				continue;
			if (vq->v_specmountpoint != NULL) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	return (error);
}

/*
 * Unmount all file systems.
 * We traverse the list in reverse order under the assumption that doing so
 * will avoid needing to worry about dependencies.
 */
void
vfs_unmountall(struct lwp *l)
{
	struct mount *mp, *nmp;
	int allerror, error;

	printf("unmounting file systems...");
	for (allerror = 0,
	     mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;
#ifdef DEBUG
		printf("\nunmounting %s (%s)...",
		    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
#endif
		/*
		 * XXX Freeze syncer.  Must do this before locking the
		 * mount point.  See dounmount() for details.
		 */
		mutex_enter(&syncer_mutex);
		if (vfs_busy(mp, 0, 0)) {
			mutex_exit(&syncer_mutex);
			continue;
		}
		if ((error = dounmount(mp, MNT_FORCE, l)) != 0) {
			printf("unmount of %s failed with error %d\n",
			    mp->mnt_stat.f_mntonname, error);
			allerror = 1;
		}
	}
	printf(" done\n");
	if (allerror)
		printf("WARNING: some file systems would not unmount\n");
}

extern struct simplelock bqueue_slock; /* XXX */

/*
 * Sync and unmount file systems before shutting down.
 */
void
vfs_shutdown(void)
{
	struct lwp *l;

	/* XXX we're certainly not running in lwp0's context! */
	l = curlwp;
	if (l == NULL)
		l = &lwp0;

	printf("syncing disks... ");

	/* remove user processes from run queue */
	suspendsched();
	(void) spl0();

	/* avoid coming back this way again if we panic. */
	doing_shutdown = 1;

	sys_sync(l, NULL, NULL);

	/* Wait for sync to finish. */
	if (buf_syncwait() != 0) {
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
		Debugger();
#endif
		printf("giving up\n");
		return;
	} else
		printf("done\n");

	/*
	 * If we've panic'd, don't make the situation potentially
	 * worse by unmounting the file systems.
	 */
	if (panicstr != NULL)
		return;

	/* Release inodes held by texts before update. */
#ifdef notdef
	vnshutdown();
#endif
	/* Unmount file systems. */
	vfs_unmountall(l);
}

/*
 * Mount the root file system.  If the operator didn't specify a
 * file system to use, try all possible file systems until one
 * succeeds.
 */
int
vfs_mountroot(void)
{
	struct vfsops *v;
	int error = ENODEV;

	if (root_device == NULL)
		panic("vfs_mountroot: root device unknown");

	switch (device_class(root_device)) {
	case DV_IFNET:
		if (rootdev != NODEV)
			panic("vfs_mountroot: rootdev set for DV_IFNET "
			    "(0x%08x -> %d,%d)", rootdev,
			    major(rootdev), minor(rootdev));
		break;

	case DV_DISK:
		if (rootdev == NODEV)
			panic("vfs_mountroot: rootdev not set for DV_DISK");
		if (bdevvp(rootdev, &rootvp))
			panic("vfs_mountroot: can't get vnode for rootdev");
		error = VOP_OPEN(rootvp, FREAD, FSCRED);
		if (error) {
			printf("vfs_mountroot: can't open root device\n");
			return (error);
		}
		break;

	default:
		printf("%s: inappropriate for root file system\n",
		    root_device->dv_xname);
		return (ENODEV);
	}

	/*
	 * If user specified a file system, use it.
	 */
	if (mountroot != NULL) {
		error = (*mountroot)();
		goto done;
	}

	/*
	 * Try each file system currently configured into the kernel.
	 */
	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (v->vfs_mountroot == NULL)
			continue;
#ifdef DEBUG
		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
#endif
		v->vfs_refcount++;
		mutex_exit(&vfs_list_lock);
		error = (*v->vfs_mountroot)();
		mutex_enter(&vfs_list_lock);
		v->vfs_refcount--;
		if (!error) {
			aprint_normal("root file system type: %s\n",
			    v->vfs_name);
			break;
		}
	}
	mutex_exit(&vfs_list_lock);

	if (v == NULL) {
		printf("no file system for %s", root_device->dv_xname);
		if (device_class(root_device) == DV_DISK)
			printf(" (dev 0x%x)", rootdev);
		printf("\n");
		error = EFTYPE;
	}

done:
	if (error && device_class(root_device) == DV_DISK) {
		VOP_CLOSE(rootvp, FREAD, FSCRED);
		vrele(rootvp);
	}
	return (error);
}