1 /* $NetBSD: vfs_subr.c,v 1.252 2005/07/23 12:18:41 yamt Exp $ */ 2 3 /*- 4 * Copyright (c) 1997, 1998, 2004, 2005 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 9 * NASA Ames Research Center. 10 * This code is derived from software contributed to The NetBSD Foundation 11 * by Charles M. Hannum. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. All advertising materials mentioning features or use of this software 22 * must display the following acknowledgement: 23 * This product includes software developed by the NetBSD 24 * Foundation, Inc. and its contributors. 25 * 4. Neither the name of The NetBSD Foundation nor the names of its 26 * contributors may be used to endorse or promote products derived 27 * from this software without specific prior written permission. 28 * 29 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 30 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 31 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 32 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 33 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 34 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 35 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 36 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 37 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 38 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 39 * POSSIBILITY OF SUCH DAMAGE. 40 */ 41 42 /* 43 * Copyright (c) 1989, 1993 44 * The Regents of the University of California. All rights reserved. 45 * (c) UNIX System Laboratories, Inc. 46 * All or some portions of this file are derived from material licensed 47 * to the University of California by American Telephone and Telegraph 48 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 49 * the permission of UNIX System Laboratories, Inc. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions 53 * are met: 54 * 1. Redistributions of source code must retain the above copyright 55 * notice, this list of conditions and the following disclaimer. 56 * 2. Redistributions in binary form must reproduce the above copyright 57 * notice, this list of conditions and the following disclaimer in the 58 * documentation and/or other materials provided with the distribution. 59 * 3. Neither the name of the University nor the names of its contributors 60 * may be used to endorse or promote products derived from this software 61 * without specific prior written permission. 62 * 63 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 64 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 65 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 66 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 67 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 68 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 69 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 70 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 71 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 72 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 73 * SUCH DAMAGE. 74 * 75 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 76 */ 77 78 /* 79 * External virtual filesystem routines 80 */ 81 82 #include <sys/cdefs.h> 83 __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.252 2005/07/23 12:18:41 yamt Exp $"); 84 85 #include "opt_inet.h" 86 #include "opt_ddb.h" 87 #include "opt_compat_netbsd.h" 88 #include "opt_compat_43.h" 89 90 #include <sys/param.h> 91 #include <sys/systm.h> 92 #include <sys/proc.h> 93 #include <sys/kernel.h> 94 #include <sys/mount.h> 95 #include <sys/time.h> 96 #include <sys/event.h> 97 #include <sys/fcntl.h> 98 #include <sys/vnode.h> 99 #include <sys/stat.h> 100 #include <sys/namei.h> 101 #include <sys/ucred.h> 102 #include <sys/buf.h> 103 #include <sys/errno.h> 104 #include <sys/malloc.h> 105 #include <sys/domain.h> 106 #include <sys/mbuf.h> 107 #include <sys/sa.h> 108 #include <sys/syscallargs.h> 109 #include <sys/device.h> 110 #include <sys/dirent.h> 111 #include <sys/filedesc.h> 112 113 #include <miscfs/specfs/specdev.h> 114 #include <miscfs/genfs/genfs.h> 115 #include <miscfs/syncfs/syncfs.h> 116 117 #include <netinet/in.h> 118 119 #include <uvm/uvm.h> 120 #include <uvm/uvm_ddb.h> 121 122 #include <netinet/in.h> 123 124 #include <sys/sysctl.h> 125 126 const enum vtype iftovt_tab[16] = { 127 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 128 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, 129 }; 130 const int vttoif_tab[9] = { 131 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 132 S_IFSOCK, S_IFIFO, S_IFMT, 133 }; 134 135 int doforce = 1; /* 1 => permit forcible unmounting */ 136 int prtactive = 0; /* 1 => print out reclaim of active vnodes */ 137 138 extern int dovfsusermount; /* 1 => permit any user to mount filesystems */ 139 140 /* 141 * Insq/Remq for the vnode usage lists. 142 */ 143 #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) 144 #define bufremvn(bp) { \ 145 LIST_REMOVE(bp, b_vnbufs); \ 146 (bp)->b_vnbufs.le_next = NOLIST; \ 147 } 148 /* TAILQ_HEAD(freelst, vnode) vnode_free_list = vnode free list (in vnode.h) */ 149 struct freelst vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list); 150 struct freelst vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list); 151 152 struct mntlist mountlist = /* mounted filesystem list */ 153 CIRCLEQ_HEAD_INITIALIZER(mountlist); 154 struct vfs_list_head vfs_list = /* vfs list */ 155 LIST_HEAD_INITIALIZER(vfs_list); 156 157 struct nfs_public nfs_pub; /* publicly exported FS */ 158 159 struct simplelock mountlist_slock = SIMPLELOCK_INITIALIZER; 160 static struct simplelock mntid_slock = SIMPLELOCK_INITIALIZER; 161 struct simplelock mntvnode_slock = SIMPLELOCK_INITIALIZER; 162 struct simplelock vnode_free_list_slock = SIMPLELOCK_INITIALIZER; 163 struct simplelock spechash_slock = SIMPLELOCK_INITIALIZER; 164 165 /* XXX - gross; single global lock to protect v_numoutput */ 166 struct simplelock global_v_numoutput_slock = SIMPLELOCK_INITIALIZER; 167 168 /* 169 * These define the root filesystem and device. 170 */ 171 struct mount *rootfs; 172 struct vnode *rootvnode; 173 struct device *root_device; /* root device */ 174 175 POOL_INIT(vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodepl", 176 &pool_allocator_nointr); 177 178 MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes"); 179 180 /* 181 * Local declarations. 182 */ 183 void insmntque(struct vnode *, struct mount *); 184 int getdevvp(dev_t, struct vnode **, enum vtype); 185 186 void vclean(struct vnode *, int, struct proc *); 187 188 static int vfs_hang_addrlist(struct mount *, struct netexport *, 189 struct export_args *); 190 static int vfs_free_netcred(struct radix_node *, void *); 191 static void vfs_free_addrlist(struct netexport *); 192 static struct vnode *getcleanvnode(struct proc *); 193 194 #ifdef DEBUG 195 void printlockedvnodes(void); 196 #endif 197 198 /* 199 * Initialize the vnode management data structures. 200 */ 201 void 202 vntblinit(void) 203 { 204 205 /* 206 * Initialize the filesystem syncer. 207 */ 208 vn_initialize_syncerd(); 209 } 210 211 int 212 vfs_drainvnodes(long target, struct proc *p) 213 { 214 215 simple_lock(&vnode_free_list_slock); 216 while (numvnodes > target) { 217 struct vnode *vp; 218 219 vp = getcleanvnode(p); 220 if (vp == NULL) 221 return EBUSY; /* give up */ 222 pool_put(&vnode_pool, vp); 223 simple_lock(&vnode_free_list_slock); 224 numvnodes--; 225 } 226 simple_unlock(&vnode_free_list_slock); 227 228 return 0; 229 } 230 231 /* 232 * grab a vnode from freelist and clean it. 233 */ 234 struct vnode * 235 getcleanvnode(struct proc *p) 236 { 237 struct vnode *vp; 238 struct mount *mp; 239 struct freelst *listhd; 240 241 LOCK_ASSERT(simple_lock_held(&vnode_free_list_slock)); 242 243 listhd = &vnode_free_list; 244 try_nextlist: 245 TAILQ_FOREACH(vp, listhd, v_freelist) { 246 if (!simple_lock_try(&vp->v_interlock)) 247 continue; 248 /* 249 * as our lwp might hold the underlying vnode locked, 250 * don't try to reclaim the VLAYER vnode if it's locked. 251 */ 252 if ((vp->v_flag & VXLOCK) == 0 && 253 ((vp->v_flag & VLAYER) == 0 || VOP_ISLOCKED(vp) == 0)) { 254 if (vn_start_write(vp, &mp, V_NOWAIT) == 0) 255 break; 256 } 257 mp = NULL; 258 simple_unlock(&vp->v_interlock); 259 } 260 261 if (vp == NULLVP) { 262 if (listhd == &vnode_free_list) { 263 listhd = &vnode_hold_list; 264 goto try_nextlist; 265 } 266 simple_unlock(&vnode_free_list_slock); 267 return NULLVP; 268 } 269 270 if (vp->v_usecount) 271 panic("free vnode isn't, vp %p", vp); 272 TAILQ_REMOVE(listhd, vp, v_freelist); 273 /* see comment on why 0xdeadb is set at end of vgone (below) */ 274 vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb; 275 simple_unlock(&vnode_free_list_slock); 276 vp->v_lease = NULL; 277 278 if (vp->v_type != VBAD) 279 vgonel(vp, p); 280 else 281 simple_unlock(&vp->v_interlock); 282 vn_finished_write(mp, 0); 283 #ifdef DIAGNOSTIC 284 if (vp->v_data || vp->v_uobj.uo_npages || 285 TAILQ_FIRST(&vp->v_uobj.memq)) 286 panic("cleaned vnode isn't, vp %p", vp); 287 if (vp->v_numoutput) 288 panic("clean vnode has pending I/O's, vp %p", vp); 289 #endif 290 KASSERT((vp->v_flag & VONWORKLST) == 0); 291 292 return vp; 293 } 294 295 /* 296 * Mark a mount point as busy. Used to synchronize access and to delay 297 * unmounting. Interlock is not released on failure. 298 */ 299 int 300 vfs_busy(struct mount *mp, int flags, struct simplelock *interlkp) 301 { 302 int lkflags; 303 304 while (mp->mnt_iflag & IMNT_UNMOUNT) { 305 int gone, n; 306 307 if (flags & LK_NOWAIT) 308 return (ENOENT); 309 if ((flags & LK_RECURSEFAIL) && mp->mnt_unmounter != NULL 310 && mp->mnt_unmounter == curproc) 311 return (EDEADLK); 312 if (interlkp) 313 simple_unlock(interlkp); 314 /* 315 * Since all busy locks are shared except the exclusive 316 * lock granted when unmounting, the only place that a 317 * wakeup needs to be done is at the release of the 318 * exclusive lock at the end of dounmount. 319 */ 320 simple_lock(&mp->mnt_slock); 321 mp->mnt_wcnt++; 322 ltsleep((caddr_t)mp, PVFS, "vfs_busy", 0, &mp->mnt_slock); 323 n = --mp->mnt_wcnt; 324 simple_unlock(&mp->mnt_slock); 325 gone = mp->mnt_iflag & IMNT_GONE; 326 327 if (n == 0) 328 wakeup(&mp->mnt_wcnt); 329 if (interlkp) 330 simple_lock(interlkp); 331 if (gone) 332 return (ENOENT); 333 } 334 lkflags = LK_SHARED; 335 if (interlkp) 336 lkflags |= LK_INTERLOCK; 337 if (lockmgr(&mp->mnt_lock, lkflags, interlkp)) 338 panic("vfs_busy: unexpected lock failure"); 339 return (0); 340 } 341 342 /* 343 * Free a busy filesystem. 344 */ 345 void 346 vfs_unbusy(struct mount *mp) 347 { 348 349 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL); 350 } 351 352 /* 353 * Lookup a filesystem type, and if found allocate and initialize 354 * a mount structure for it. 355 * 356 * Devname is usually updated by mount(8) after booting. 357 */ 358 int 359 vfs_rootmountalloc(const char *fstypename, const char *devname, 360 struct mount **mpp) 361 { 362 struct vfsops *vfsp = NULL; 363 struct mount *mp; 364 365 LIST_FOREACH(vfsp, &vfs_list, vfs_list) 366 if (!strncmp(vfsp->vfs_name, fstypename, MFSNAMELEN)) 367 break; 368 369 if (vfsp == NULL) 370 return (ENODEV); 371 mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); 372 memset((char *)mp, 0, (u_long)sizeof(struct mount)); 373 lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0); 374 simple_lock_init(&mp->mnt_slock); 375 (void)vfs_busy(mp, LK_NOWAIT, 0); 376 LIST_INIT(&mp->mnt_vnodelist); 377 mp->mnt_op = vfsp; 378 mp->mnt_flag = MNT_RDONLY; 379 mp->mnt_vnodecovered = NULLVP; 380 mp->mnt_leaf = mp; 381 vfsp->vfs_refcount++; 382 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, MFSNAMELEN); 383 mp->mnt_stat.f_mntonname[0] = '/'; 384 (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); 385 *mpp = mp; 386 return (0); 387 } 388 389 /* 390 * Lookup a mount point by filesystem identifier. 391 */ 392 struct mount * 393 vfs_getvfs(fsid_t *fsid) 394 { 395 struct mount *mp; 396 397 simple_lock(&mountlist_slock); 398 CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) { 399 if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] && 400 mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) { 401 simple_unlock(&mountlist_slock); 402 return (mp); 403 } 404 } 405 simple_unlock(&mountlist_slock); 406 return ((struct mount *)0); 407 } 408 409 /* 410 * Get a new unique fsid 411 */ 412 void 413 vfs_getnewfsid(struct mount *mp) 414 { 415 static u_short xxxfs_mntid; 416 fsid_t tfsid; 417 int mtype; 418 419 simple_lock(&mntid_slock); 420 mtype = makefstype(mp->mnt_op->vfs_name); 421 mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0); 422 mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype; 423 mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; 424 if (xxxfs_mntid == 0) 425 ++xxxfs_mntid; 426 tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid); 427 tfsid.__fsid_val[1] = mtype; 428 if (!CIRCLEQ_EMPTY(&mountlist)) { 429 while (vfs_getvfs(&tfsid)) { 430 tfsid.__fsid_val[0]++; 431 xxxfs_mntid++; 432 } 433 } 434 mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0]; 435 mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; 436 simple_unlock(&mntid_slock); 437 } 438 439 /* 440 * Make a 'unique' number from a mount type name. 441 */ 442 long 443 makefstype(const char *type) 444 { 445 long rv; 446 447 for (rv = 0; *type; type++) { 448 rv <<= 2; 449 rv ^= *type; 450 } 451 return rv; 452 } 453 454 455 /* 456 * Set vnode attributes to VNOVAL 457 */ 458 void 459 vattr_null(struct vattr *vap) 460 { 461 462 vap->va_type = VNON; 463 464 /* 465 * Assign individually so that it is safe even if size and 466 * sign of each member are varied. 467 */ 468 vap->va_mode = VNOVAL; 469 vap->va_nlink = VNOVAL; 470 vap->va_uid = VNOVAL; 471 vap->va_gid = VNOVAL; 472 vap->va_fsid = VNOVAL; 473 vap->va_fileid = VNOVAL; 474 vap->va_size = VNOVAL; 475 vap->va_blocksize = VNOVAL; 476 vap->va_atime.tv_sec = 477 vap->va_mtime.tv_sec = 478 vap->va_ctime.tv_sec = 479 vap->va_birthtime.tv_sec = VNOVAL; 480 vap->va_atime.tv_nsec = 481 vap->va_mtime.tv_nsec = 482 vap->va_ctime.tv_nsec = 483 vap->va_birthtime.tv_nsec = VNOVAL; 484 vap->va_gen = VNOVAL; 485 vap->va_flags = VNOVAL; 486 vap->va_rdev = VNOVAL; 487 vap->va_bytes = VNOVAL; 488 vap->va_vaflags = 0; 489 } 490 491 /* 492 * Routines having to do with the management of the vnode table. 493 */ 494 extern int (**dead_vnodeop_p)(void *); 495 long numvnodes; 496 497 /* 498 * Return the next vnode from the free list. 499 */ 500 int 501 getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *), 502 struct vnode **vpp) 503 { 504 extern struct uvm_pagerops uvm_vnodeops; 505 struct uvm_object *uobj; 506 struct proc *p = curproc; /* XXX */ 507 static int toggle; 508 struct vnode *vp; 509 int error = 0, tryalloc; 510 511 try_again: 512 if (mp) { 513 /* 514 * Mark filesystem busy while we're creating a vnode. 515 * If unmount is in progress, this will wait; if the 516 * unmount succeeds (only if umount -f), this will 517 * return an error. If the unmount fails, we'll keep 518 * going afterwards. 519 * (This puts the per-mount vnode list logically under 520 * the protection of the vfs_busy lock). 521 */ 522 error = vfs_busy(mp, LK_RECURSEFAIL, 0); 523 if (error && error != EDEADLK) 524 return error; 525 } 526 527 /* 528 * We must choose whether to allocate a new vnode or recycle an 529 * existing one. The criterion for allocating a new one is that 530 * the total number of vnodes is less than the number desired or 531 * there are no vnodes on either free list. Generally we only 532 * want to recycle vnodes that have no buffers associated with 533 * them, so we look first on the vnode_free_list. If it is empty, 534 * we next consider vnodes with referencing buffers on the 535 * vnode_hold_list. The toggle ensures that half the time we 536 * will use a buffer from the vnode_hold_list, and half the time 537 * we will allocate a new one unless the list has grown to twice 538 * the desired size. We are reticent to recycle vnodes from the 539 * vnode_hold_list because we will lose the identity of all its 540 * referencing buffers. 541 */ 542 543 vp = NULL; 544 545 simple_lock(&vnode_free_list_slock); 546 547 toggle ^= 1; 548 if (numvnodes > 2 * desiredvnodes) 549 toggle = 0; 550 551 tryalloc = numvnodes < desiredvnodes || 552 (TAILQ_FIRST(&vnode_free_list) == NULL && 553 (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle)); 554 555 if (tryalloc && 556 (vp = pool_get(&vnode_pool, PR_NOWAIT)) != NULL) { 557 numvnodes++; 558 simple_unlock(&vnode_free_list_slock); 559 memset(vp, 0, sizeof(*vp)); 560 UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 1); 561 /* 562 * done by memset() above. 563 * LIST_INIT(&vp->v_nclist); 564 * LIST_INIT(&vp->v_dnclist); 565 */ 566 } else { 567 vp = getcleanvnode(p); 568 /* 569 * Unless this is a bad time of the month, at most 570 * the first NCPUS items on the free list are 571 * locked, so this is close enough to being empty. 572 */ 573 if (vp == NULLVP) { 574 if (mp && error != EDEADLK) 575 vfs_unbusy(mp); 576 if (tryalloc) { 577 printf("WARNING: unable to allocate new " 578 "vnode, retrying...\n"); 579 (void) tsleep(&lbolt, PRIBIO, "newvn", hz); 580 goto try_again; 581 } 582 tablefull("vnode", "increase kern.maxvnodes or NVNODE"); 583 *vpp = 0; 584 return (ENFILE); 585 } 586 vp->v_usecount = 1; 587 vp->v_flag = 0; 588 vp->v_socket = NULL; 589 } 590 vp->v_type = VNON; 591 vp->v_vnlock = &vp->v_lock; 592 lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0); 593 KASSERT(LIST_EMPTY(&vp->v_nclist)); 594 KASSERT(LIST_EMPTY(&vp->v_dnclist)); 595 vp->v_tag = tag; 596 vp->v_op = vops; 597 insmntque(vp, mp); 598 *vpp = vp; 599 vp->v_data = 0; 600 simple_lock_init(&vp->v_interlock); 601 602 /* 603 * initialize uvm_object within vnode. 604 */ 605 606 uobj = &vp->v_uobj; 607 KASSERT(uobj->pgops == &uvm_vnodeops); 608 KASSERT(uobj->uo_npages == 0); 609 KASSERT(TAILQ_FIRST(&uobj->memq) == NULL); 610 vp->v_size = VSIZENOTSET; 611 612 if (mp && error != EDEADLK) 613 vfs_unbusy(mp); 614 return (0); 615 } 616 617 /* 618 * This is really just the reverse of getnewvnode(). Needed for 619 * VFS_VGET functions who may need to push back a vnode in case 620 * of a locking race. 621 */ 622 void 623 ungetnewvnode(struct vnode *vp) 624 { 625 #ifdef DIAGNOSTIC 626 if (vp->v_usecount != 1) 627 panic("ungetnewvnode: busy vnode"); 628 #endif 629 vp->v_usecount--; 630 insmntque(vp, NULL); 631 vp->v_type = VBAD; 632 633 simple_lock(&vp->v_interlock); 634 /* 635 * Insert at head of LRU list 636 */ 637 simple_lock(&vnode_free_list_slock); 638 if (vp->v_holdcnt > 0) 639 TAILQ_INSERT_HEAD(&vnode_hold_list, vp, v_freelist); 640 else 641 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 642 simple_unlock(&vnode_free_list_slock); 643 simple_unlock(&vp->v_interlock); 644 } 645 646 /* 647 * Move a vnode from one mount queue to another. 648 */ 649 void 650 insmntque(struct vnode *vp, struct mount *mp) 651 { 652 653 #ifdef DIAGNOSTIC 654 if ((mp != NULL) && 655 (mp->mnt_iflag & IMNT_UNMOUNT) && 656 !(mp->mnt_flag & MNT_SOFTDEP) && 657 vp->v_tag != VT_VFS) { 658 panic("insmntque into dying filesystem"); 659 } 660 #endif 661 662 simple_lock(&mntvnode_slock); 663 /* 664 * Delete from old mount point vnode list, if on one. 665 */ 666 if (vp->v_mount != NULL) 667 LIST_REMOVE(vp, v_mntvnodes); 668 /* 669 * Insert into list of vnodes for the new mount point, if available. 670 */ 671 if ((vp->v_mount = mp) != NULL) 672 LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); 673 simple_unlock(&mntvnode_slock); 674 } 675 676 /* 677 * Update outstanding I/O count and do wakeup if requested. 678 */ 679 void 680 vwakeup(struct buf *bp) 681 { 682 struct vnode *vp; 683 684 if ((vp = bp->b_vp) != NULL) { 685 /* XXX global lock hack 686 * can't use v_interlock here since this is called 687 * in interrupt context from biodone(). 688 */ 689 simple_lock(&global_v_numoutput_slock); 690 if (--vp->v_numoutput < 0) 691 panic("vwakeup: neg numoutput, vp %p", vp); 692 if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) { 693 vp->v_flag &= ~VBWAIT; 694 wakeup((caddr_t)&vp->v_numoutput); 695 } 696 simple_unlock(&global_v_numoutput_slock); 697 } 698 } 699 700 /* 701 * Flush out and invalidate all buffers associated with a vnode. 702 * Called with the underlying vnode locked, which should prevent new dirty 703 * buffers from being queued. 704 */ 705 int 706 vinvalbuf(struct vnode *vp, int flags, struct ucred *cred, struct proc *p, 707 int slpflag, int slptimeo) 708 { 709 struct buf *bp, *nbp; 710 int s, error; 711 int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO | 712 (flags & V_SAVE ? PGO_CLEANIT : 0); 713 714 /* XXXUBC this doesn't look at flags or slp* */ 715 simple_lock(&vp->v_interlock); 716 error = VOP_PUTPAGES(vp, 0, 0, flushflags); 717 if (error) { 718 return error; 719 } 720 721 if (flags & V_SAVE) { 722 error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0, p); 723 if (error) 724 return (error); 725 #ifdef DIAGNOSTIC 726 s = splbio(); 727 if (vp->v_numoutput > 0 || !LIST_EMPTY(&vp->v_dirtyblkhd)) 728 panic("vinvalbuf: dirty bufs, vp %p", vp); 729 splx(s); 730 #endif 731 } 732 733 s = splbio(); 734 735 restart: 736 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 737 nbp = LIST_NEXT(bp, b_vnbufs); 738 simple_lock(&bp->b_interlock); 739 if (bp->b_flags & B_BUSY) { 740 bp->b_flags |= B_WANTED; 741 error = ltsleep((caddr_t)bp, 742 slpflag | (PRIBIO + 1) | PNORELOCK, 743 "vinvalbuf", slptimeo, &bp->b_interlock); 744 if (error) { 745 splx(s); 746 return (error); 747 } 748 goto restart; 749 } 750 bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH; 751 simple_unlock(&bp->b_interlock); 752 brelse(bp); 753 } 754 755 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 756 nbp = LIST_NEXT(bp, b_vnbufs); 757 simple_lock(&bp->b_interlock); 758 if (bp->b_flags & B_BUSY) { 759 bp->b_flags |= B_WANTED; 760 error = ltsleep((caddr_t)bp, 761 slpflag | (PRIBIO + 1) | PNORELOCK, 762 "vinvalbuf", slptimeo, &bp->b_interlock); 763 if (error) { 764 splx(s); 765 return (error); 766 } 767 goto restart; 768 } 769 /* 770 * XXX Since there are no node locks for NFS, I believe 771 * there is a slight chance that a delayed write will 772 * occur while sleeping just above, so check for it. 773 */ 774 if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) { 775 #ifdef DEBUG 776 printf("buffer still DELWRI\n"); 777 #endif 778 bp->b_flags |= B_BUSY | B_VFLUSH; 779 simple_unlock(&bp->b_interlock); 780 VOP_BWRITE(bp); 781 goto restart; 782 } 783 bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH; 784 simple_unlock(&bp->b_interlock); 785 brelse(bp); 786 } 787 788 #ifdef DIAGNOSTIC 789 if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd)) 790 panic("vinvalbuf: flush failed, vp %p", vp); 791 #endif 792 793 splx(s); 794 795 return (0); 796 } 797 798 /* 799 * Destroy any in core blocks past the truncation length. 800 * Called with the underlying vnode locked, which should prevent new dirty 801 * buffers from being queued. 802 */ 803 int 804 vtruncbuf(struct vnode *vp, daddr_t lbn, int slpflag, int slptimeo) 805 { 806 struct buf *bp, *nbp; 807 int s, error; 808 voff_t off; 809 810 off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift); 811 simple_lock(&vp->v_interlock); 812 error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO); 813 if (error) { 814 return error; 815 } 816 817 s = splbio(); 818 819 restart: 820 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 821 nbp = LIST_NEXT(bp, b_vnbufs); 822 if (bp->b_lblkno < lbn) 823 continue; 824 simple_lock(&bp->b_interlock); 825 if (bp->b_flags & B_BUSY) { 826 bp->b_flags |= B_WANTED; 827 error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK, 828 "vtruncbuf", slptimeo, &bp->b_interlock); 829 if (error) { 830 splx(s); 831 return (error); 832 } 833 goto restart; 834 } 835 bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH; 836 simple_unlock(&bp->b_interlock); 837 brelse(bp); 838 } 839 840 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 841 nbp = LIST_NEXT(bp, b_vnbufs); 842 if (bp->b_lblkno < lbn) 843 continue; 844 simple_lock(&bp->b_interlock); 845 if (bp->b_flags & B_BUSY) { 846 bp->b_flags |= B_WANTED; 847 error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK, 848 "vtruncbuf", slptimeo, &bp->b_interlock); 849 if (error) { 850 splx(s); 851 return (error); 852 } 853 goto restart; 854 } 855 bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH; 856 simple_unlock(&bp->b_interlock); 857 brelse(bp); 858 } 859 860 splx(s); 861 862 return (0); 863 } 864 865 void 866 vflushbuf(struct vnode *vp, int sync) 867 { 868 struct buf *bp, *nbp; 869 int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0); 870 int s; 871 872 simple_lock(&vp->v_interlock); 873 (void) VOP_PUTPAGES(vp, 0, 0, flags); 874 875 loop: 876 s = splbio(); 877 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 878 nbp = LIST_NEXT(bp, b_vnbufs); 879 simple_lock(&bp->b_interlock); 880 if ((bp->b_flags & B_BUSY)) { 881 simple_unlock(&bp->b_interlock); 882 continue; 883 } 884 if ((bp->b_flags & B_DELWRI) == 0) 885 panic("vflushbuf: not dirty, bp %p", bp); 886 bp->b_flags |= B_BUSY | B_VFLUSH; 887 simple_unlock(&bp->b_interlock); 888 splx(s); 889 /* 890 * Wait for I/O associated with indirect blocks to complete, 891 * since there is no way to quickly wait for them below. 892 */ 893 if (bp->b_vp == vp || sync == 0) 894 (void) bawrite(bp); 895 else 896 (void) bwrite(bp); 897 goto loop; 898 } 899 if (sync == 0) { 900 splx(s); 901 return; 902 } 903 simple_lock(&global_v_numoutput_slock); 904 while (vp->v_numoutput) { 905 vp->v_flag |= VBWAIT; 906 ltsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflushbuf", 0, 907 &global_v_numoutput_slock); 908 } 909 simple_unlock(&global_v_numoutput_slock); 910 splx(s); 911 if (!LIST_EMPTY(&vp->v_dirtyblkhd)) { 912 vprint("vflushbuf: dirty", vp); 913 goto loop; 914 } 915 } 916 917 /* 918 * Associate a buffer with a vnode. 919 */ 920 void 921 bgetvp(struct vnode *vp, struct buf *bp) 922 { 923 int s; 924 925 if (bp->b_vp) 926 panic("bgetvp: not free, bp %p", bp); 927 VHOLD(vp); 928 s = splbio(); 929 bp->b_vp = vp; 930 if (vp->v_type == VBLK || vp->v_type == VCHR) 931 bp->b_dev = vp->v_rdev; 932 else 933 bp->b_dev = NODEV; 934 /* 935 * Insert onto list for new vnode. 936 */ 937 bufinsvn(bp, &vp->v_cleanblkhd); 938 splx(s); 939 } 940 941 /* 942 * Disassociate a buffer from a vnode. 943 */ 944 void 945 brelvp(struct buf *bp) 946 { 947 struct vnode *vp; 948 int s; 949 950 if (bp->b_vp == NULL) 951 panic("brelvp: vp NULL, bp %p", bp); 952 953 s = splbio(); 954 vp = bp->b_vp; 955 /* 956 * Delete from old vnode list, if on one. 957 */ 958 if (LIST_NEXT(bp, b_vnbufs) != NOLIST) 959 bufremvn(bp); 960 961 if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_flag & VONWORKLST) && 962 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { 963 vp->v_flag &= ~(VWRITEMAPDIRTY|VONWORKLST); 964 LIST_REMOVE(vp, v_synclist); 965 } 966 967 bp->b_vp = NULL; 968 HOLDRELE(vp); 969 splx(s); 970 } 971 972 /* 973 * Reassign a buffer from one vnode to another. 974 * Used to assign file specific control information 975 * (indirect blocks) to the vnode to which they belong. 976 * 977 * This function must be called at splbio(). 978 */ 979 void 980 reassignbuf(struct buf *bp, struct vnode *newvp) 981 { 982 struct buflists *listheadp; 983 int delayx; 984 985 /* 986 * Delete from old vnode list, if on one. 987 */ 988 if (LIST_NEXT(bp, b_vnbufs) != NOLIST) 989 bufremvn(bp); 990 /* 991 * If dirty, put on list of dirty buffers; 992 * otherwise insert onto list of clean buffers. 993 */ 994 if ((bp->b_flags & B_DELWRI) == 0) { 995 listheadp = &newvp->v_cleanblkhd; 996 if (TAILQ_EMPTY(&newvp->v_uobj.memq) && 997 (newvp->v_flag & VONWORKLST) && 998 LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) { 999 newvp->v_flag &= ~(VWRITEMAPDIRTY|VONWORKLST); 1000 LIST_REMOVE(newvp, v_synclist); 1001 } 1002 } else { 1003 listheadp = &newvp->v_dirtyblkhd; 1004 if ((newvp->v_flag & VONWORKLST) == 0) { 1005 switch (newvp->v_type) { 1006 case VDIR: 1007 delayx = dirdelay; 1008 break; 1009 case VBLK: 1010 if (newvp->v_specmountpoint != NULL) { 1011 delayx = metadelay; 1012 break; 1013 } 1014 /* fall through */ 1015 default: 1016 delayx = filedelay; 1017 break; 1018 } 1019 if (!newvp->v_mount || 1020 (newvp->v_mount->mnt_flag & MNT_ASYNC) == 0) 1021 vn_syncer_add_to_worklist(newvp, delayx); 1022 } 1023 } 1024 bufinsvn(bp, listheadp); 1025 } 1026 1027 /* 1028 * Create a vnode for a block device. 1029 * Used for root filesystem and swap areas. 1030 * Also used for memory file system special devices. 1031 */ 1032 int 1033 bdevvp(dev_t dev, struct vnode **vpp) 1034 { 1035 1036 return (getdevvp(dev, vpp, VBLK)); 1037 } 1038 1039 /* 1040 * Create a vnode for a character device. 1041 * Used for kernfs and some console handling. 1042 */ 1043 int 1044 cdevvp(dev_t dev, struct vnode **vpp) 1045 { 1046 1047 return (getdevvp(dev, vpp, VCHR)); 1048 } 1049 1050 /* 1051 * Create a vnode for a device. 1052 * Used by bdevvp (block device) for root file system etc., 1053 * and by cdevvp (character device) for console and kernfs. 1054 */ 1055 int 1056 getdevvp(dev_t dev, struct vnode **vpp, enum vtype type) 1057 { 1058 struct vnode *vp; 1059 struct vnode *nvp; 1060 int error; 1061 1062 if (dev == NODEV) { 1063 *vpp = NULLVP; 1064 return (0); 1065 } 1066 error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp); 1067 if (error) { 1068 *vpp = NULLVP; 1069 return (error); 1070 } 1071 vp = nvp; 1072 vp->v_type = type; 1073 if ((nvp = checkalias(vp, dev, NULL)) != 0) { 1074 vput(vp); 1075 vp = nvp; 1076 } 1077 *vpp = vp; 1078 return (0); 1079 } 1080 1081 /* 1082 * Check to see if the new vnode represents a special device 1083 * for which we already have a vnode (either because of 1084 * bdevvp() or because of a different vnode representing 1085 * the same block device). If such an alias exists, deallocate 1086 * the existing contents and return the aliased vnode. The 1087 * caller is responsible for filling it with its new contents. 1088 */ 1089 struct vnode * 1090 checkalias(struct vnode *nvp, dev_t nvp_rdev, struct mount *mp) 1091 { 1092 struct proc *p = curproc; /* XXX */ 1093 struct vnode *vp; 1094 struct vnode **vpp; 1095 1096 if (nvp->v_type != VBLK && nvp->v_type != VCHR) 1097 return (NULLVP); 1098 1099 vpp = &speclisth[SPECHASH(nvp_rdev)]; 1100 loop: 1101 simple_lock(&spechash_slock); 1102 for (vp = *vpp; vp; vp = vp->v_specnext) { 1103 if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) 1104 continue; 1105 /* 1106 * Alias, but not in use, so flush it out. 1107 */ 1108 simple_lock(&vp->v_interlock); 1109 simple_unlock(&spechash_slock); 1110 if (vp->v_usecount == 0) { 1111 vgonel(vp, p); 1112 goto loop; 1113 } 1114 /* 1115 * What we're interested to know here is if someone else has 1116 * removed this vnode from the device hash list while we were 1117 * waiting. This can only happen if vclean() did it, and 1118 * this requires the vnode to be locked. Therefore, we use 1119 * LK_SLEEPFAIL and retry. 1120 */ 1121 if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL)) 1122 goto loop; 1123 simple_lock(&spechash_slock); 1124 break; 1125 } 1126 if (vp == NULL || vp->v_tag != VT_NON || vp->v_type != VBLK) { 1127 MALLOC(nvp->v_specinfo, struct specinfo *, 1128 sizeof(struct specinfo), M_VNODE, M_NOWAIT); 1129 /* XXX Erg. */ 1130 if (nvp->v_specinfo == NULL) { 1131 simple_unlock(&spechash_slock); 1132 uvm_wait("checkalias"); 1133 goto loop; 1134 } 1135 1136 nvp->v_rdev = nvp_rdev; 1137 nvp->v_hashchain = vpp; 1138 nvp->v_specnext = *vpp; 1139 nvp->v_specmountpoint = NULL; 1140 simple_unlock(&spechash_slock); 1141 nvp->v_speclockf = NULL; 1142 simple_lock_init(&nvp->v_spec_cow_slock); 1143 SLIST_INIT(&nvp->v_spec_cow_head); 1144 nvp->v_spec_cow_req = 0; 1145 nvp->v_spec_cow_count = 0; 1146 1147 *vpp = nvp; 1148 if (vp != NULLVP) { 1149 nvp->v_flag |= VALIASED; 1150 vp->v_flag |= VALIASED; 1151 vput(vp); 1152 } 1153 return (NULLVP); 1154 } 1155 simple_unlock(&spechash_slock); 1156 VOP_UNLOCK(vp, 0); 1157 simple_lock(&vp->v_interlock); 1158 vclean(vp, 0, p); 1159 vp->v_op = nvp->v_op; 1160 vp->v_tag = nvp->v_tag; 1161 vp->v_vnlock = &vp->v_lock; 1162 lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0); 1163 nvp->v_type = VNON; 1164 insmntque(vp, mp); 1165 return (vp); 1166 } 1167 1168 /* 1169 * Grab a particular vnode from the free list, increment its 1170 * reference count and lock it. If the vnode lock bit is set the 1171 * vnode is being eliminated in vgone. In that case, we can not 1172 * grab the vnode, so the process is awakened when the transition is 1173 * completed, and an error returned to indicate that the vnode is no 1174 * longer usable (possibly having been changed to a new file system type). 1175 */ 1176 int 1177 vget(struct vnode *vp, int flags) 1178 { 1179 int error; 1180 1181 /* 1182 * If the vnode is in the process of being cleaned out for 1183 * another use, we wait for the cleaning to finish and then 1184 * return failure. Cleaning is determined by checking that 1185 * the VXLOCK flag is set. 1186 */ 1187 1188 if ((flags & LK_INTERLOCK) == 0) 1189 simple_lock(&vp->v_interlock); 1190 if (vp->v_flag & VXLOCK) { 1191 if (flags & LK_NOWAIT) { 1192 simple_unlock(&vp->v_interlock); 1193 return EBUSY; 1194 } 1195 vp->v_flag |= VXWANT; 1196 ltsleep(vp, PINOD|PNORELOCK, "vget", 0, &vp->v_interlock); 1197 return (ENOENT); 1198 } 1199 if (vp->v_usecount == 0) { 1200 simple_lock(&vnode_free_list_slock); 1201 if (vp->v_holdcnt > 0) 1202 TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist); 1203 else 1204 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 1205 simple_unlock(&vnode_free_list_slock); 1206 } 1207 vp->v_usecount++; 1208 #ifdef DIAGNOSTIC 1209 if (vp->v_usecount == 0) { 1210 vprint("vget", vp); 1211 panic("vget: usecount overflow, vp %p", vp); 1212 } 1213 #endif 1214 if (flags & LK_TYPE_MASK) { 1215 if ((error = vn_lock(vp, flags | LK_INTERLOCK))) { 1216 /* 1217 * must expand vrele here because we do not want 1218 * to call VOP_INACTIVE if the reference count 1219 * drops back to zero since it was never really 1220 * active. We must remove it from the free list 1221 * before sleeping so that multiple processes do 1222 * not try to recycle it. 1223 */ 1224 simple_lock(&vp->v_interlock); 1225 vp->v_usecount--; 1226 if (vp->v_usecount > 0) { 1227 simple_unlock(&vp->v_interlock); 1228 return (error); 1229 } 1230 /* 1231 * insert at tail of LRU list 1232 */ 1233 simple_lock(&vnode_free_list_slock); 1234 if (vp->v_holdcnt > 0) 1235 TAILQ_INSERT_TAIL(&vnode_hold_list, vp, 1236 v_freelist); 1237 else 1238 TAILQ_INSERT_TAIL(&vnode_free_list, vp, 1239 v_freelist); 1240 simple_unlock(&vnode_free_list_slock); 1241 simple_unlock(&vp->v_interlock); 1242 } 1243 return (error); 1244 } 1245 simple_unlock(&vp->v_interlock); 1246 return (0); 1247 } 1248 1249 /* 1250 * vput(), just unlock and vrele() 1251 */ 1252 void 1253 vput(struct vnode *vp) 1254 { 1255 struct proc *p = curproc; /* XXX */ 1256 1257 #ifdef DIAGNOSTIC 1258 if (vp == NULL) 1259 panic("vput: null vp"); 1260 #endif 1261 simple_lock(&vp->v_interlock); 1262 vp->v_usecount--; 1263 if (vp->v_usecount > 0) { 1264 simple_unlock(&vp->v_interlock); 1265 VOP_UNLOCK(vp, 0); 1266 return; 1267 } 1268 #ifdef DIAGNOSTIC 1269 if (vp->v_usecount < 0 || vp->v_writecount != 0) { 1270 vprint("vput: bad ref count", vp); 1271 panic("vput: ref cnt"); 1272 } 1273 #endif 1274 /* 1275 * Insert at tail of LRU list. 1276 */ 1277 simple_lock(&vnode_free_list_slock); 1278 if (vp->v_holdcnt > 0) 1279 TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist); 1280 else 1281 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 1282 simple_unlock(&vnode_free_list_slock); 1283 if (vp->v_flag & VEXECMAP) { 1284 uvmexp.execpages -= vp->v_uobj.uo_npages; 1285 uvmexp.filepages += vp->v_uobj.uo_npages; 1286 } 1287 vp->v_flag &= ~(VTEXT|VEXECMAP|VWRITEMAP); 1288 simple_unlock(&vp->v_interlock); 1289 VOP_INACTIVE(vp, p); 1290 } 1291 1292 /* 1293 * Vnode release. 1294 * If count drops to zero, call inactive routine and return to freelist. 1295 */ 1296 void 1297 vrele(struct vnode *vp) 1298 { 1299 struct proc *p = curproc; /* XXX */ 1300 1301 #ifdef DIAGNOSTIC 1302 if (vp == NULL) 1303 panic("vrele: null vp"); 1304 #endif 1305 simple_lock(&vp->v_interlock); 1306 vp->v_usecount--; 1307 if (vp->v_usecount > 0) { 1308 simple_unlock(&vp->v_interlock); 1309 return; 1310 } 1311 #ifdef DIAGNOSTIC 1312 if (vp->v_usecount < 0 || vp->v_writecount != 0) { 1313 vprint("vrele: bad ref count", vp); 1314 panic("vrele: ref cnt vp %p", vp); 1315 } 1316 #endif 1317 /* 1318 * Insert at tail of LRU list. 1319 */ 1320 simple_lock(&vnode_free_list_slock); 1321 if (vp->v_holdcnt > 0) 1322 TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist); 1323 else 1324 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 1325 simple_unlock(&vnode_free_list_slock); 1326 if (vp->v_flag & VEXECMAP) { 1327 uvmexp.execpages -= vp->v_uobj.uo_npages; 1328 uvmexp.filepages += vp->v_uobj.uo_npages; 1329 } 1330 vp->v_flag &= ~(VTEXT|VEXECMAP|VWRITEMAP); 1331 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) 1332 VOP_INACTIVE(vp, p); 1333 } 1334 1335 #ifdef DIAGNOSTIC 1336 /* 1337 * Page or buffer structure gets a reference. 1338 */ 1339 void 1340 vholdl(struct vnode *vp) 1341 { 1342 1343 /* 1344 * If it is on the freelist and the hold count is currently 1345 * zero, move it to the hold list. The test of the back 1346 * pointer and the use reference count of zero is because 1347 * it will be removed from a free list by getnewvnode, 1348 * but will not have its reference count incremented until 1349 * after calling vgone. If the reference count were 1350 * incremented first, vgone would (incorrectly) try to 1351 * close the previous instance of the underlying object. 1352 * So, the back pointer is explicitly set to `0xdeadb' in 1353 * getnewvnode after removing it from a freelist to ensure 1354 * that we do not try to move it here. 1355 */ 1356 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) && 1357 vp->v_holdcnt == 0 && vp->v_usecount == 0) { 1358 simple_lock(&vnode_free_list_slock); 1359 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 1360 TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist); 1361 simple_unlock(&vnode_free_list_slock); 1362 } 1363 vp->v_holdcnt++; 1364 } 1365 1366 /* 1367 * Page or buffer structure frees a reference. 1368 */ 1369 void 1370 holdrelel(struct vnode *vp) 1371 { 1372 1373 if (vp->v_holdcnt <= 0) 1374 panic("holdrelel: holdcnt vp %p", vp); 1375 vp->v_holdcnt--; 1376 1377 /* 1378 * If it is on the holdlist and the hold count drops to 1379 * zero, move it to the free list. The test of the back 1380 * pointer and the use reference count of zero is because 1381 * it will be removed from a free list by getnewvnode, 1382 * but will not have its reference count incremented until 1383 * after calling vgone. If the reference count were 1384 * incremented first, vgone would (incorrectly) try to 1385 * close the previous instance of the underlying object. 1386 * So, the back pointer is explicitly set to `0xdeadb' in 1387 * getnewvnode after removing it from a freelist to ensure 1388 * that we do not try to move it here. 1389 */ 1390 1391 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) && 1392 vp->v_holdcnt == 0 && vp->v_usecount == 0) { 1393 simple_lock(&vnode_free_list_slock); 1394 TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist); 1395 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 1396 simple_unlock(&vnode_free_list_slock); 1397 } 1398 } 1399 1400 /* 1401 * Vnode reference. 1402 */ 1403 void 1404 vref(struct vnode *vp) 1405 { 1406 1407 simple_lock(&vp->v_interlock); 1408 if (vp->v_usecount <= 0) 1409 panic("vref used where vget required, vp %p", vp); 1410 vp->v_usecount++; 1411 #ifdef DIAGNOSTIC 1412 if (vp->v_usecount == 0) { 1413 vprint("vref", vp); 1414 panic("vref: usecount overflow, vp %p", vp); 1415 } 1416 #endif 1417 simple_unlock(&vp->v_interlock); 1418 } 1419 #endif /* DIAGNOSTIC */ 1420 1421 /* 1422 * Remove any vnodes in the vnode table belonging to mount point mp. 1423 * 1424 * If FORCECLOSE is not specified, there should not be any active ones, 1425 * return error if any are found (nb: this is a user error, not a 1426 * system error). If FORCECLOSE is specified, detach any active vnodes 1427 * that are found. 1428 * 1429 * If WRITECLOSE is set, only flush out regular file vnodes open for 1430 * writing. 1431 * 1432 * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped. 1433 */ 1434 #ifdef DEBUG 1435 int busyprt = 0; /* print out busy vnodes */ 1436 struct ctldebug debug1 = { "busyprt", &busyprt }; 1437 #endif 1438 1439 int 1440 vflush(struct mount *mp, struct vnode *skipvp, int flags) 1441 { 1442 struct proc *p = curproc; /* XXX */ 1443 struct vnode *vp, *nvp; 1444 int busy = 0; 1445 1446 simple_lock(&mntvnode_slock); 1447 loop: 1448 for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) { 1449 if (vp->v_mount != mp) 1450 goto loop; 1451 nvp = LIST_NEXT(vp, v_mntvnodes); 1452 /* 1453 * Skip over a selected vnode. 1454 */ 1455 if (vp == skipvp) 1456 continue; 1457 simple_lock(&vp->v_interlock); 1458 /* 1459 * Skip over a vnodes marked VSYSTEM. 1460 */ 1461 if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { 1462 simple_unlock(&vp->v_interlock); 1463 continue; 1464 } 1465 /* 1466 * If WRITECLOSE is set, only flush out regular file 1467 * vnodes open for writing. 1468 */ 1469 if ((flags & WRITECLOSE) && 1470 (vp->v_writecount == 0 || vp->v_type != VREG)) { 1471 simple_unlock(&vp->v_interlock); 1472 continue; 1473 } 1474 /* 1475 * With v_usecount == 0, all we need to do is clear 1476 * out the vnode data structures and we are done. 1477 */ 1478 if (vp->v_usecount == 0) { 1479 simple_unlock(&mntvnode_slock); 1480 vgonel(vp, p); 1481 simple_lock(&mntvnode_slock); 1482 continue; 1483 } 1484 /* 1485 * If FORCECLOSE is set, forcibly close the vnode. 1486 * For block or character devices, revert to an 1487 * anonymous device. For all other files, just kill them. 1488 */ 1489 if (flags & FORCECLOSE) { 1490 simple_unlock(&mntvnode_slock); 1491 if (vp->v_type != VBLK && vp->v_type != VCHR) { 1492 vgonel(vp, p); 1493 } else { 1494 vclean(vp, 0, p); 1495 vp->v_op = spec_vnodeop_p; 1496 insmntque(vp, (struct mount *)0); 1497 } 1498 simple_lock(&mntvnode_slock); 1499 continue; 1500 } 1501 #ifdef DEBUG 1502 if (busyprt) 1503 vprint("vflush: busy vnode", vp); 1504 #endif 1505 simple_unlock(&vp->v_interlock); 1506 busy++; 1507 } 1508 simple_unlock(&mntvnode_slock); 1509 if (busy) 1510 return (EBUSY); 1511 return (0); 1512 } 1513 1514 /* 1515 * Disassociate the underlying file system from a vnode. 1516 */ 1517 void 1518 vclean(struct vnode *vp, int flags, struct proc *p) 1519 { 1520 struct mount *mp; 1521 int active; 1522 1523 LOCK_ASSERT(simple_lock_held(&vp->v_interlock)); 1524 1525 /* 1526 * Check to see if the vnode is in use. 1527 * If so we have to reference it before we clean it out 1528 * so that its count cannot fall to zero and generate a 1529 * race against ourselves to recycle it. 1530 */ 1531 1532 if ((active = vp->v_usecount) != 0) { 1533 vp->v_usecount++; 1534 #ifdef DIAGNOSTIC 1535 if (vp->v_usecount == 0) { 1536 vprint("vclean", vp); 1537 panic("vclean: usecount overflow"); 1538 } 1539 #endif 1540 } 1541 1542 /* 1543 * Prevent the vnode from being recycled or 1544 * brought into use while we clean it out. 1545 */ 1546 if (vp->v_flag & VXLOCK) 1547 panic("vclean: deadlock, vp %p", vp); 1548 vp->v_flag |= VXLOCK; 1549 if (vp->v_flag & VEXECMAP) { 1550 uvmexp.execpages -= vp->v_uobj.uo_npages; 1551 uvmexp.filepages += vp->v_uobj.uo_npages; 1552 } 1553 vp->v_flag &= ~(VTEXT|VEXECMAP); 1554 1555 /* 1556 * Even if the count is zero, the VOP_INACTIVE routine may still 1557 * have the object locked while it cleans it out. The VOP_LOCK 1558 * ensures that the VOP_INACTIVE routine is done with its work. 1559 * For active vnodes, it ensures that no other activity can 1560 * occur while the underlying object is being cleaned out. 1561 */ 1562 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK); 1563 1564 /* 1565 * Clean out any cached data associated with the vnode. 1566 * If special device, remove it from special device alias list. 1567 * if it is on one. 1568 */ 1569 if (flags & DOCLOSE) { 1570 int error; 1571 struct vnode *vq, *vx; 1572 1573 vn_start_write(vp, &mp, V_WAIT | V_LOWER); 1574 error = vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0); 1575 vn_finished_write(mp, V_LOWER); 1576 if (error) 1577 error = vinvalbuf(vp, 0, NOCRED, p, 0, 0); 1578 KASSERT(error == 0); 1579 KASSERT((vp->v_flag & VONWORKLST) == 0); 1580 1581 if (active) 1582 VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL); 1583 1584 if ((vp->v_type == VBLK || vp->v_type == VCHR) && 1585 vp->v_specinfo != 0) { 1586 simple_lock(&spechash_slock); 1587 if (vp->v_hashchain != NULL) { 1588 if (*vp->v_hashchain == vp) { 1589 *vp->v_hashchain = vp->v_specnext; 1590 } else { 1591 for (vq = *vp->v_hashchain; vq; 1592 vq = vq->v_specnext) { 1593 if (vq->v_specnext != vp) 1594 continue; 1595 vq->v_specnext = vp->v_specnext; 1596 break; 1597 } 1598 if (vq == NULL) 1599 panic("missing bdev"); 1600 } 1601 if (vp->v_flag & VALIASED) { 1602 vx = NULL; 1603 for (vq = *vp->v_hashchain; vq; 1604 vq = vq->v_specnext) { 1605 if (vq->v_rdev != vp->v_rdev || 1606 vq->v_type != vp->v_type) 1607 continue; 1608 if (vx) 1609 break; 1610 vx = vq; 1611 } 1612 if (vx == NULL) 1613 panic("missing alias"); 1614 if (vq == NULL) 1615 vx->v_flag &= ~VALIASED; 1616 vp->v_flag &= ~VALIASED; 1617 } 1618 } 1619 simple_unlock(&spechash_slock); 1620 FREE(vp->v_specinfo, M_VNODE); 1621 vp->v_specinfo = NULL; 1622 } 1623 } 1624 LOCK_ASSERT(!simple_lock_held(&vp->v_interlock)); 1625 1626 /* 1627 * If purging an active vnode, it must be closed and 1628 * deactivated before being reclaimed. Note that the 1629 * VOP_INACTIVE will unlock the vnode. 1630 */ 1631 if (active) { 1632 VOP_INACTIVE(vp, p); 1633 } else { 1634 /* 1635 * Any other processes trying to obtain this lock must first 1636 * wait for VXLOCK to clear, then call the new lock operation. 1637 */ 1638 VOP_UNLOCK(vp, 0); 1639 } 1640 /* 1641 * Reclaim the vnode. 1642 */ 1643 if (VOP_RECLAIM(vp, p)) 1644 panic("vclean: cannot reclaim, vp %p", vp); 1645 if (active) { 1646 /* 1647 * Inline copy of vrele() since VOP_INACTIVE 1648 * has already been called. 1649 */ 1650 simple_lock(&vp->v_interlock); 1651 if (--vp->v_usecount <= 0) { 1652 #ifdef DIAGNOSTIC 1653 if (vp->v_usecount < 0 || vp->v_writecount != 0) { 1654 vprint("vclean: bad ref count", vp); 1655 panic("vclean: ref cnt"); 1656 } 1657 #endif 1658 /* 1659 * Insert at tail of LRU list. 1660 */ 1661 1662 simple_unlock(&vp->v_interlock); 1663 simple_lock(&vnode_free_list_slock); 1664 #ifdef DIAGNOSTIC 1665 if (vp->v_holdcnt > 0) 1666 panic("vclean: not clean, vp %p", vp); 1667 #endif 1668 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 1669 simple_unlock(&vnode_free_list_slock); 1670 } else 1671 simple_unlock(&vp->v_interlock); 1672 } 1673 1674 KASSERT(vp->v_uobj.uo_npages == 0); 1675 cache_purge(vp); 1676 1677 /* 1678 * Done with purge, notify sleepers of the grim news. 1679 */ 1680 vp->v_op = dead_vnodeop_p; 1681 vp->v_tag = VT_NON; 1682 simple_lock(&vp->v_interlock); 1683 VN_KNOTE(vp, NOTE_REVOKE); /* FreeBSD has this in vn_pollgone() */ 1684 vp->v_flag &= ~(VXLOCK|VLOCKSWORK); 1685 if (vp->v_flag & VXWANT) { 1686 vp->v_flag &= ~VXWANT; 1687 simple_unlock(&vp->v_interlock); 1688 wakeup((caddr_t)vp); 1689 } else 1690 simple_unlock(&vp->v_interlock); 1691 } 1692 1693 /* 1694 * Recycle an unused vnode to the front of the free list. 1695 * Release the passed interlock if the vnode will be recycled. 1696 */ 1697 int 1698 vrecycle(struct vnode *vp, struct simplelock *inter_lkp, struct proc *p) 1699 { 1700 1701 simple_lock(&vp->v_interlock); 1702 if (vp->v_usecount == 0) { 1703 if (inter_lkp) 1704 simple_unlock(inter_lkp); 1705 vgonel(vp, p); 1706 return (1); 1707 } 1708 simple_unlock(&vp->v_interlock); 1709 return (0); 1710 } 1711 1712 /* 1713 * Eliminate all activity associated with a vnode 1714 * in preparation for reuse. 1715 */ 1716 void 1717 vgone(struct vnode *vp) 1718 { 1719 struct proc *p = curproc; /* XXX */ 1720 1721 simple_lock(&vp->v_interlock); 1722 vgonel(vp, p); 1723 } 1724 1725 /* 1726 * vgone, with the vp interlock held. 1727 */ 1728 void 1729 vgonel(struct vnode *vp, struct proc *p) 1730 { 1731 1732 LOCK_ASSERT(simple_lock_held(&vp->v_interlock)); 1733 1734 /* 1735 * If a vgone (or vclean) is already in progress, 1736 * wait until it is done and return. 1737 */ 1738 1739 if (vp->v_flag & VXLOCK) { 1740 vp->v_flag |= VXWANT; 1741 ltsleep(vp, PINOD | PNORELOCK, "vgone", 0, &vp->v_interlock); 1742 return; 1743 } 1744 1745 /* 1746 * Clean out the filesystem specific data. 1747 */ 1748 1749 vclean(vp, DOCLOSE, p); 1750 KASSERT((vp->v_flag & VONWORKLST) == 0); 1751 1752 /* 1753 * Delete from old mount point vnode list, if on one. 1754 */ 1755 1756 if (vp->v_mount != NULL) 1757 insmntque(vp, (struct mount *)0); 1758 1759 /* 1760 * The test of the back pointer and the reference count of 1761 * zero is because it will be removed from the free list by 1762 * getcleanvnode, but will not have its reference count 1763 * incremented until after calling vgone. If the reference 1764 * count were incremented first, vgone would (incorrectly) 1765 * try to close the previous instance of the underlying object. 1766 * So, the back pointer is explicitly set to `0xdeadb' in 1767 * getnewvnode after removing it from the freelist to ensure 1768 * that we do not try to move it here. 1769 */ 1770 1771 vp->v_type = VBAD; 1772 if (vp->v_usecount == 0) { 1773 boolean_t dofree; 1774 1775 simple_lock(&vnode_free_list_slock); 1776 if (vp->v_holdcnt > 0) 1777 panic("vgonel: not clean, vp %p", vp); 1778 /* 1779 * if it isn't on the freelist, we're called by getcleanvnode 1780 * and vnode is being re-used. otherwise, we'll free it. 1781 */ 1782 dofree = vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb; 1783 if (dofree) { 1784 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 1785 numvnodes--; 1786 } 1787 simple_unlock(&vnode_free_list_slock); 1788 if (dofree) 1789 pool_put(&vnode_pool, vp); 1790 } 1791 } 1792 1793 /* 1794 * Lookup a vnode by device number. 1795 */ 1796 int 1797 vfinddev(dev_t dev, enum vtype type, struct vnode **vpp) 1798 { 1799 struct vnode *vp; 1800 int rc = 0; 1801 1802 simple_lock(&spechash_slock); 1803 for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) { 1804 if (dev != vp->v_rdev || type != vp->v_type) 1805 continue; 1806 *vpp = vp; 1807 rc = 1; 1808 break; 1809 } 1810 simple_unlock(&spechash_slock); 1811 return (rc); 1812 } 1813 1814 /* 1815 * Revoke all the vnodes corresponding to the specified minor number 1816 * range (endpoints inclusive) of the specified major. 1817 */ 1818 void 1819 vdevgone(int maj, int minl, int minh, enum vtype type) 1820 { 1821 struct vnode *vp; 1822 int mn; 1823 1824 for (mn = minl; mn <= minh; mn++) 1825 if (vfinddev(makedev(maj, mn), type, &vp)) 1826 VOP_REVOKE(vp, REVOKEALL); 1827 } 1828 1829 /* 1830 * Calculate the total number of references to a special device. 1831 */ 1832 int 1833 vcount(struct vnode *vp) 1834 { 1835 struct vnode *vq, *vnext; 1836 int count; 1837 1838 loop: 1839 if ((vp->v_flag & VALIASED) == 0) 1840 return (vp->v_usecount); 1841 simple_lock(&spechash_slock); 1842 for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) { 1843 vnext = vq->v_specnext; 1844 if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) 1845 continue; 1846 /* 1847 * Alias, but not in use, so flush it out. 1848 */ 1849 if (vq->v_usecount == 0 && vq != vp && 1850 (vq->v_flag & VXLOCK) == 0) { 1851 simple_unlock(&spechash_slock); 1852 vgone(vq); 1853 goto loop; 1854 } 1855 count += vq->v_usecount; 1856 } 1857 simple_unlock(&spechash_slock); 1858 return (count); 1859 } 1860 1861 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) 1862 #define ARRAY_PRINT(idx, arr) \ 1863 ((idx) > 0 && (idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN") 1864 1865 const char * const vnode_tags[] = { VNODE_TAGS }; 1866 const char * const vnode_types[] = { VNODE_TYPES }; 1867 const char vnode_flagbits[] = VNODE_FLAGBITS; 1868 1869 /* 1870 * Print out a description of a vnode. 1871 */ 1872 void 1873 vprint(const char *label, struct vnode *vp) 1874 { 1875 char bf[96]; 1876 1877 if (label != NULL) 1878 printf("%s: ", label); 1879 printf("tag %s(%d) type %s(%d), usecount %d, writecount %ld, " 1880 "refcount %ld,", ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag, 1881 ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type, 1882 vp->v_usecount, vp->v_writecount, vp->v_holdcnt); 1883 bitmask_snprintf(vp->v_flag, vnode_flagbits, bf, sizeof(bf)); 1884 if (bf[0] != '\0') 1885 printf(" flags (%s)", &bf[1]); 1886 if (vp->v_data == NULL) { 1887 printf("\n"); 1888 } else { 1889 printf("\n\t"); 1890 VOP_PRINT(vp); 1891 } 1892 } 1893 1894 #ifdef DEBUG 1895 /* 1896 * List all of the locked vnodes in the system. 1897 * Called when debugging the kernel. 1898 */ 1899 void 1900 printlockedvnodes(void) 1901 { 1902 struct mount *mp, *nmp; 1903 struct vnode *vp; 1904 1905 printf("Locked vnodes\n"); 1906 simple_lock(&mountlist_slock); 1907 for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; 1908 mp = nmp) { 1909 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) { 1910 nmp = CIRCLEQ_NEXT(mp, mnt_list); 1911 continue; 1912 } 1913 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 1914 if (VOP_ISLOCKED(vp)) 1915 vprint(NULL, vp); 1916 } 1917 simple_lock(&mountlist_slock); 1918 nmp = CIRCLEQ_NEXT(mp, mnt_list); 1919 vfs_unbusy(mp); 1920 } 1921 simple_unlock(&mountlist_slock); 1922 } 1923 #endif 1924 1925 /* 1926 * sysctl helper routine for vfs.generic.conf lookups. 1927 */ 1928 #if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44) 1929 static int 1930 sysctl_vfs_generic_conf(SYSCTLFN_ARGS) 1931 { 1932 struct vfsconf vfc; 1933 extern const char * const mountcompatnames[]; 1934 extern int nmountcompatnames; 1935 struct sysctlnode node; 1936 struct vfsops *vfsp; 1937 u_int vfsnum; 1938 1939 if (namelen != 1) 1940 return (ENOTDIR); 1941 vfsnum = name[0]; 1942 if (vfsnum >= nmountcompatnames || 1943 mountcompatnames[vfsnum] == NULL) 1944 return (EOPNOTSUPP); 1945 vfsp = vfs_getopsbyname(mountcompatnames[vfsnum]); 1946 if (vfsp == NULL) 1947 return (EOPNOTSUPP); 1948 1949 vfc.vfc_vfsops = vfsp; 1950 strncpy(vfc.vfc_name, vfsp->vfs_name, MFSNAMELEN); 1951 vfc.vfc_typenum = vfsnum; 1952 vfc.vfc_refcount = vfsp->vfs_refcount; 1953 vfc.vfc_flags = 0; 1954 vfc.vfc_mountroot = vfsp->vfs_mountroot; 1955 vfc.vfc_next = NULL; 1956 1957 node = *rnode; 1958 node.sysctl_data = &vfc; 1959 return (sysctl_lookup(SYSCTLFN_CALL(&node))); 1960 } 1961 #endif 1962 1963 /* 1964 * sysctl helper routine to return list of supported fstypes 1965 */ 1966 static int 1967 sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS) 1968 { 1969 char bf[MFSNAMELEN]; 1970 char *where = oldp; 1971 struct vfsops *v; 1972 size_t needed, left, slen; 1973 int error, first; 1974 1975 if (newp != NULL) 1976 return (EPERM); 1977 if (namelen != 0) 1978 return (EINVAL); 1979 1980 first = 1; 1981 error = 0; 1982 needed = 0; 1983 left = *oldlenp; 1984 1985 LIST_FOREACH(v, &vfs_list, vfs_list) { 1986 if (where == NULL) 1987 needed += strlen(v->vfs_name) + 1; 1988 else { 1989 memset(bf, 0, sizeof(bf)); 1990 if (first) { 1991 strncpy(bf, v->vfs_name, sizeof(bf)); 1992 first = 0; 1993 } else { 1994 bf[0] = ' '; 1995 strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1); 1996 } 1997 bf[sizeof(bf)-1] = '\0'; 1998 slen = strlen(bf); 1999 if (left < slen + 1) 2000 break; 2001 /* +1 to copy out the trailing NUL byte */ 2002 error = copyout(bf, where, slen + 1); 2003 if (error) 2004 break; 2005 where += slen; 2006 needed += slen; 2007 left -= slen; 2008 } 2009 } 2010 *oldlenp = needed; 2011 return (error); 2012 } 2013 2014 /* 2015 * Top level filesystem related information gathering. 2016 */ 2017 SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup") 2018 { 2019 #if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44) 2020 extern int nmountcompatnames; 2021 #endif 2022 2023 sysctl_createv(clog, 0, NULL, NULL, 2024 CTLFLAG_PERMANENT, 2025 CTLTYPE_NODE, "vfs", NULL, 2026 NULL, 0, NULL, 0, 2027 CTL_VFS, CTL_EOL); 2028 sysctl_createv(clog, 0, NULL, NULL, 2029 CTLFLAG_PERMANENT, 2030 CTLTYPE_NODE, "generic", 2031 SYSCTL_DESCR("Non-specific vfs related information"), 2032 NULL, 0, NULL, 0, 2033 CTL_VFS, VFS_GENERIC, CTL_EOL); 2034 2035 #if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44) 2036 sysctl_createv(clog, 0, NULL, NULL, 2037 CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, 2038 CTLTYPE_INT, "maxtypenum", 2039 SYSCTL_DESCR("Highest valid filesystem type number"), 2040 NULL, nmountcompatnames, NULL, 0, 2041 CTL_VFS, VFS_GENERIC, VFS_MAXTYPENUM, CTL_EOL); 2042 #endif 2043 sysctl_createv(clog, 0, NULL, NULL, 2044 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2045 CTLTYPE_INT, "usermount", 2046 SYSCTL_DESCR("Whether unprivileged users may mount " 2047 "filesystems"), 2048 NULL, 0, &dovfsusermount, 0, 2049 CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL); 2050 sysctl_createv(clog, 0, NULL, NULL, 2051 CTLFLAG_PERMANENT, 2052 CTLTYPE_STRING, "fstypes", 2053 SYSCTL_DESCR("List of file systems present"), 2054 sysctl_vfs_generic_fstypes, 0, NULL, 0, 2055 CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL); 2056 #if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44) 2057 sysctl_createv(clog, 0, NULL, NULL, 2058 CTLFLAG_PERMANENT, 2059 CTLTYPE_STRUCT, "conf", 2060 SYSCTL_DESCR("Filesystem configuration information"), 2061 sysctl_vfs_generic_conf, 0, NULL, 2062 sizeof(struct vfsconf), 2063 CTL_VFS, VFS_GENERIC, VFS_CONF, CTL_EOL); 2064 #endif 2065 } 2066 2067 2068 int kinfo_vdebug = 1; 2069 int kinfo_vgetfailed; 2070 #define KINFO_VNODESLOP 10 2071 /* 2072 * Dump vnode list (via sysctl). 2073 * Copyout address of vnode followed by vnode. 2074 */ 2075 /* ARGSUSED */ 2076 int 2077 sysctl_kern_vnode(SYSCTLFN_ARGS) 2078 { 2079 char *where = oldp; 2080 size_t *sizep = oldlenp; 2081 struct mount *mp, *nmp; 2082 struct vnode *nvp, *vp; 2083 char *bp = where, *savebp; 2084 char *ewhere; 2085 int error; 2086 2087 if (namelen != 0) 2088 return (EOPNOTSUPP); 2089 if (newp != NULL) 2090 return (EPERM); 2091 2092 #define VPTRSZ sizeof(struct vnode *) 2093 #define VNODESZ sizeof(struct vnode) 2094 if (where == NULL) { 2095 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ); 2096 return (0); 2097 } 2098 ewhere = where + *sizep; 2099 2100 simple_lock(&mountlist_slock); 2101 for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; 2102 mp = nmp) { 2103 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) { 2104 nmp = CIRCLEQ_NEXT(mp, mnt_list); 2105 continue; 2106 } 2107 savebp = bp; 2108 again: 2109 simple_lock(&mntvnode_slock); 2110 for (vp = LIST_FIRST(&mp->mnt_vnodelist); 2111 vp != NULL; 2112 vp = nvp) { 2113 /* 2114 * Check that the vp is still associated with 2115 * this filesystem. RACE: could have been 2116 * recycled onto the same filesystem. 2117 */ 2118 if (vp->v_mount != mp) { 2119 simple_unlock(&mntvnode_slock); 2120 if (kinfo_vdebug) 2121 printf("kinfo: vp changed\n"); 2122 bp = savebp; 2123 goto again; 2124 } 2125 nvp = LIST_NEXT(vp, v_mntvnodes); 2126 if (bp + VPTRSZ + VNODESZ > ewhere) { 2127 simple_unlock(&mntvnode_slock); 2128 *sizep = bp - where; 2129 return (ENOMEM); 2130 } 2131 simple_unlock(&mntvnode_slock); 2132 if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) || 2133 (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ))) 2134 return (error); 2135 bp += VPTRSZ + VNODESZ; 2136 simple_lock(&mntvnode_slock); 2137 } 2138 simple_unlock(&mntvnode_slock); 2139 simple_lock(&mountlist_slock); 2140 nmp = CIRCLEQ_NEXT(mp, mnt_list); 2141 vfs_unbusy(mp); 2142 } 2143 simple_unlock(&mountlist_slock); 2144 2145 *sizep = bp - where; 2146 return (0); 2147 } 2148 2149 /* 2150 * Check to see if a filesystem is mounted on a block device. 2151 */ 2152 int 2153 vfs_mountedon(struct vnode *vp) 2154 { 2155 struct vnode *vq; 2156 int error = 0; 2157 2158 if (vp->v_specmountpoint != NULL) 2159 return (EBUSY); 2160 if (vp->v_flag & VALIASED) { 2161 simple_lock(&spechash_slock); 2162 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { 2163 if (vq->v_rdev != vp->v_rdev || 2164 vq->v_type != vp->v_type) 2165 continue; 2166 if (vq->v_specmountpoint != NULL) { 2167 error = EBUSY; 2168 break; 2169 } 2170 } 2171 simple_unlock(&spechash_slock); 2172 } 2173 return (error); 2174 } 2175 2176 static int 2177 sacheck(struct sockaddr *sa) 2178 { 2179 switch (sa->sa_family) { 2180 #ifdef INET 2181 case AF_INET: { 2182 struct sockaddr_in *sin = (struct sockaddr_in *)sa; 2183 char *p = (char *)sin->sin_zero; 2184 size_t i; 2185 2186 if (sin->sin_len != sizeof(*sin)) 2187 return -1; 2188 if (sin->sin_port != 0) 2189 return -1; 2190 for (i = 0; i < sizeof(sin->sin_zero); i++) 2191 if (*p++ != '\0') 2192 return -1; 2193 return 0; 2194 } 2195 #endif 2196 #ifdef INET6 2197 case AF_INET6: { 2198 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa; 2199 2200 if (sin6->sin6_len != sizeof(*sin6)) 2201 return -1; 2202 if (sin6->sin6_port != 0) 2203 return -1; 2204 return 0; 2205 } 2206 #endif 2207 default: 2208 return -1; 2209 } 2210 } 2211 2212 /* 2213 * Build hash lists of net addresses and hang them off the mount point. 2214 * Called by ufs_mount() to set up the lists of export addresses. 2215 */ 2216 static int 2217 vfs_hang_addrlist(struct mount *mp, struct netexport *nep, 2218 struct export_args *argp) 2219 { 2220 struct netcred *np, *enp; 2221 struct radix_node_head *rnh; 2222 int i; 2223 struct sockaddr *saddr, *smask = 0; 2224 struct domain *dom; 2225 int error; 2226 2227 if (argp->ex_addrlen == 0) { 2228 if (mp->mnt_flag & MNT_DEFEXPORTED) 2229 return (EPERM); 2230 np = &nep->ne_defexported; 2231 np->netc_exflags = argp->ex_flags; 2232 crcvt(&np->netc_anon, &argp->ex_anon); 2233 np->netc_anon.cr_ref = 1; 2234 mp->mnt_flag |= MNT_DEFEXPORTED; 2235 return (0); 2236 } 2237 2238 if (argp->ex_addrlen > MLEN || argp->ex_masklen > MLEN) 2239 return (EINVAL); 2240 2241 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; 2242 np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK); 2243 memset((caddr_t)np, 0, i); 2244 saddr = (struct sockaddr *)(np + 1); 2245 error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen); 2246 if (error) 2247 goto out; 2248 if (saddr->sa_len > argp->ex_addrlen) 2249 saddr->sa_len = argp->ex_addrlen; 2250 if (sacheck(saddr) == -1) 2251 return EINVAL; 2252 if (argp->ex_masklen) { 2253 smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen); 2254 error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen); 2255 if (error) 2256 goto out; 2257 if (smask->sa_len > argp->ex_masklen) 2258 smask->sa_len = argp->ex_masklen; 2259 if (smask->sa_family != saddr->sa_family) 2260 return EINVAL; 2261 if (sacheck(smask) == -1) 2262 return EINVAL; 2263 } 2264 i = saddr->sa_family; 2265 if ((rnh = nep->ne_rtable[i]) == 0) { 2266 /* 2267 * Seems silly to initialize every AF when most are not 2268 * used, do so on demand here 2269 */ 2270 DOMAIN_FOREACH(dom) { 2271 if (dom->dom_family == i && dom->dom_rtattach) { 2272 dom->dom_rtattach((void **)&nep->ne_rtable[i], 2273 dom->dom_rtoffset); 2274 break; 2275 } 2276 } 2277 if ((rnh = nep->ne_rtable[i]) == 0) { 2278 error = ENOBUFS; 2279 goto out; 2280 } 2281 } 2282 2283 enp = (struct netcred *)(*rnh->rnh_addaddr)(saddr, smask, rnh, 2284 np->netc_rnodes); 2285 if (enp != np) { 2286 if (enp == NULL) { 2287 enp = (struct netcred *)(*rnh->rnh_lookup)(saddr, 2288 smask, rnh); 2289 if (enp == NULL) { 2290 error = EPERM; 2291 goto out; 2292 } 2293 } else 2294 enp->netc_refcnt++; 2295 2296 goto check; 2297 } else 2298 enp->netc_refcnt = 1; 2299 2300 np->netc_exflags = argp->ex_flags; 2301 crcvt(&np->netc_anon, &argp->ex_anon); 2302 np->netc_anon.cr_ref = 1; 2303 return 0; 2304 check: 2305 if (enp->netc_exflags != argp->ex_flags || 2306 crcmp(&enp->netc_anon, &argp->ex_anon) != 0) 2307 error = EPERM; 2308 else 2309 error = 0; 2310 out: 2311 free(np, M_NETADDR); 2312 return error; 2313 } 2314 2315 /* ARGSUSED */ 2316 static int 2317 vfs_free_netcred(struct radix_node *rn, void *w) 2318 { 2319 struct radix_node_head *rnh = (struct radix_node_head *)w; 2320 struct netcred *np = (struct netcred *)(void *)rn; 2321 2322 (*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh); 2323 if (--(np->netc_refcnt) <= 0) 2324 free(np, M_NETADDR); 2325 return (0); 2326 } 2327 2328 /* 2329 * Free the net address hash lists that are hanging off the mount points. 2330 */ 2331 static void 2332 vfs_free_addrlist(struct netexport *nep) 2333 { 2334 int i; 2335 struct radix_node_head *rnh; 2336 2337 for (i = 0; i <= AF_MAX; i++) 2338 if ((rnh = nep->ne_rtable[i]) != NULL) { 2339 (*rnh->rnh_walktree)(rnh, vfs_free_netcred, rnh); 2340 free((caddr_t)rnh, M_RTABLE); 2341 nep->ne_rtable[i] = 0; 2342 } 2343 } 2344 2345 int 2346 vfs_export(struct mount *mp, struct netexport *nep, struct export_args *argp) 2347 { 2348 int error; 2349 2350 if (argp->ex_flags & MNT_DELEXPORT) { 2351 if (mp->mnt_flag & MNT_EXPUBLIC) { 2352 vfs_setpublicfs(NULL, NULL, NULL); 2353 mp->mnt_flag &= ~MNT_EXPUBLIC; 2354 } 2355 vfs_free_addrlist(nep); 2356 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); 2357 } 2358 if (argp->ex_flags & MNT_EXPORTED) { 2359 if (argp->ex_flags & MNT_EXPUBLIC) { 2360 if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) 2361 return (error); 2362 mp->mnt_flag |= MNT_EXPUBLIC; 2363 } 2364 if ((error = vfs_hang_addrlist(mp, nep, argp)) != 0) 2365 return (error); 2366 mp->mnt_flag |= MNT_EXPORTED; 2367 } 2368 return (0); 2369 } 2370 2371 /* 2372 * Set the publicly exported filesystem (WebNFS). Currently, only 2373 * one public filesystem is possible in the spec (RFC 2054 and 2055) 2374 */ 2375 int 2376 vfs_setpublicfs(struct mount *mp, struct netexport *nep, 2377 struct export_args *argp) 2378 { 2379 int error; 2380 struct vnode *rvp; 2381 char *cp; 2382 2383 /* 2384 * mp == NULL -> invalidate the current info, the FS is 2385 * no longer exported. May be called from either vfs_export 2386 * or unmount, so check if it hasn't already been done. 2387 */ 2388 if (mp == NULL) { 2389 if (nfs_pub.np_valid) { 2390 nfs_pub.np_valid = 0; 2391 if (nfs_pub.np_index != NULL) { 2392 FREE(nfs_pub.np_index, M_TEMP); 2393 nfs_pub.np_index = NULL; 2394 } 2395 } 2396 return (0); 2397 } 2398 2399 /* 2400 * Only one allowed at a time. 2401 */ 2402 if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) 2403 return (EBUSY); 2404 2405 /* 2406 * Get real filehandle for root of exported FS. 2407 */ 2408 memset((caddr_t)&nfs_pub.np_handle, 0, sizeof(nfs_pub.np_handle)); 2409 nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsidx; 2410 2411 if ((error = VFS_ROOT(mp, &rvp))) 2412 return (error); 2413 2414 if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) 2415 return (error); 2416 2417 vput(rvp); 2418 2419 /* 2420 * If an indexfile was specified, pull it in. 2421 */ 2422 if (argp->ex_indexfile != NULL) { 2423 MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, 2424 M_WAITOK); 2425 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, 2426 MAXNAMLEN, (size_t *)0); 2427 if (!error) { 2428 /* 2429 * Check for illegal filenames. 2430 */ 2431 for (cp = nfs_pub.np_index; *cp; cp++) { 2432 if (*cp == '/') { 2433 error = EINVAL; 2434 break; 2435 } 2436 } 2437 } 2438 if (error) { 2439 FREE(nfs_pub.np_index, M_TEMP); 2440 return (error); 2441 } 2442 } 2443 2444 nfs_pub.np_mount = mp; 2445 nfs_pub.np_valid = 1; 2446 return (0); 2447 } 2448 2449 struct netcred * 2450 vfs_export_lookup(struct mount *mp, struct netexport *nep, struct mbuf *nam) 2451 { 2452 struct netcred *np; 2453 struct radix_node_head *rnh; 2454 struct sockaddr *saddr; 2455 2456 np = NULL; 2457 if (mp->mnt_flag & MNT_EXPORTED) { 2458 /* 2459 * Lookup in the export list first. 2460 */ 2461 if (nam != NULL) { 2462 saddr = mtod(nam, struct sockaddr *); 2463 rnh = nep->ne_rtable[saddr->sa_family]; 2464 if (rnh != NULL) { 2465 np = (struct netcred *) 2466 (*rnh->rnh_matchaddr)((caddr_t)saddr, 2467 rnh); 2468 if (np && np->netc_rnodes->rn_flags & RNF_ROOT) 2469 np = NULL; 2470 } 2471 } 2472 /* 2473 * If no address match, use the default if it exists. 2474 */ 2475 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) 2476 np = &nep->ne_defexported; 2477 } 2478 return (np); 2479 } 2480 2481 /* 2482 * Do the usual access checking. 2483 * file_mode, uid and gid are from the vnode in question, 2484 * while acc_mode and cred are from the VOP_ACCESS parameter list 2485 */ 2486 int 2487 vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid, 2488 mode_t acc_mode, struct ucred *cred) 2489 { 2490 mode_t mask; 2491 2492 /* 2493 * Super-user always gets read/write access, but execute access depends 2494 * on at least one execute bit being set. 2495 */ 2496 if (cred->cr_uid == 0) { 2497 if ((acc_mode & VEXEC) && type != VDIR && 2498 (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0) 2499 return (EACCES); 2500 return (0); 2501 } 2502 2503 mask = 0; 2504 2505 /* Otherwise, check the owner. */ 2506 if (cred->cr_uid == uid) { 2507 if (acc_mode & VEXEC) 2508 mask |= S_IXUSR; 2509 if (acc_mode & VREAD) 2510 mask |= S_IRUSR; 2511 if (acc_mode & VWRITE) 2512 mask |= S_IWUSR; 2513 return ((file_mode & mask) == mask ? 0 : EACCES); 2514 } 2515 2516 /* Otherwise, check the groups. */ 2517 if (cred->cr_gid == gid || groupmember(gid, cred)) { 2518 if (acc_mode & VEXEC) 2519 mask |= S_IXGRP; 2520 if (acc_mode & VREAD) 2521 mask |= S_IRGRP; 2522 if (acc_mode & VWRITE) 2523 mask |= S_IWGRP; 2524 return ((file_mode & mask) == mask ? 0 : EACCES); 2525 } 2526 2527 /* Otherwise, check everyone else. */ 2528 if (acc_mode & VEXEC) 2529 mask |= S_IXOTH; 2530 if (acc_mode & VREAD) 2531 mask |= S_IROTH; 2532 if (acc_mode & VWRITE) 2533 mask |= S_IWOTH; 2534 return ((file_mode & mask) == mask ? 0 : EACCES); 2535 } 2536 2537 /* 2538 * Unmount all file systems. 2539 * We traverse the list in reverse order under the assumption that doing so 2540 * will avoid needing to worry about dependencies. 2541 */ 2542 void 2543 vfs_unmountall(struct proc *p) 2544 { 2545 struct mount *mp, *nmp; 2546 int allerror, error; 2547 2548 printf("unmounting file systems..."); 2549 for (allerror = 0, 2550 mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { 2551 nmp = mp->mnt_list.cqe_prev; 2552 #ifdef DEBUG 2553 printf("\nunmounting %s (%s)...", 2554 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname); 2555 #endif 2556 /* 2557 * XXX Freeze syncer. Must do this before locking the 2558 * mount point. See dounmount() for details. 2559 */ 2560 lockmgr(&syncer_lock, LK_EXCLUSIVE, NULL); 2561 if (vfs_busy(mp, 0, 0)) { 2562 lockmgr(&syncer_lock, LK_RELEASE, NULL); 2563 continue; 2564 } 2565 if ((error = dounmount(mp, MNT_FORCE, p)) != 0) { 2566 printf("unmount of %s failed with error %d\n", 2567 mp->mnt_stat.f_mntonname, error); 2568 allerror = 1; 2569 } 2570 } 2571 printf(" done\n"); 2572 if (allerror) 2573 printf("WARNING: some file systems would not unmount\n"); 2574 } 2575 2576 extern struct simplelock bqueue_slock; /* XXX */ 2577 2578 /* 2579 * Sync and unmount file systems before shutting down. 2580 */ 2581 void 2582 vfs_shutdown(void) 2583 { 2584 struct lwp *l = curlwp; 2585 struct proc *p; 2586 2587 /* XXX we're certainly not running in proc0's context! */ 2588 if (l == NULL || (p = l->l_proc) == NULL) 2589 p = &proc0; 2590 2591 printf("syncing disks... "); 2592 2593 /* remove user process from run queue */ 2594 suspendsched(); 2595 (void) spl0(); 2596 2597 /* avoid coming back this way again if we panic. */ 2598 doing_shutdown = 1; 2599 2600 sys_sync(l, NULL, NULL); 2601 2602 /* Wait for sync to finish. */ 2603 if (buf_syncwait() != 0) { 2604 #if defined(DDB) && defined(DEBUG_HALT_BUSY) 2605 Debugger(); 2606 #endif 2607 printf("giving up\n"); 2608 return; 2609 } else 2610 printf("done\n"); 2611 2612 /* 2613 * If we've panic'd, don't make the situation potentially 2614 * worse by unmounting the file systems. 2615 */ 2616 if (panicstr != NULL) 2617 return; 2618 2619 /* Release inodes held by texts before update. */ 2620 #ifdef notdef 2621 vnshutdown(); 2622 #endif 2623 /* Unmount file systems. */ 2624 vfs_unmountall(p); 2625 } 2626 2627 /* 2628 * Mount the root file system. If the operator didn't specify a 2629 * file system to use, try all possible file systems until one 2630 * succeeds. 2631 */ 2632 int 2633 vfs_mountroot(void) 2634 { 2635 struct vfsops *v; 2636 int error = ENODEV; 2637 2638 if (root_device == NULL) 2639 panic("vfs_mountroot: root device unknown"); 2640 2641 switch (root_device->dv_class) { 2642 case DV_IFNET: 2643 if (rootdev != NODEV) 2644 panic("vfs_mountroot: rootdev set for DV_IFNET " 2645 "(0x%08x -> %d,%d)", rootdev, 2646 major(rootdev), minor(rootdev)); 2647 break; 2648 2649 case DV_DISK: 2650 if (rootdev == NODEV) 2651 panic("vfs_mountroot: rootdev not set for DV_DISK"); 2652 if (bdevvp(rootdev, &rootvp)) 2653 panic("vfs_mountroot: can't get vnode for rootdev"); 2654 error = VOP_OPEN(rootvp, FREAD, FSCRED, curproc); 2655 if (error) { 2656 printf("vfs_mountroot: can't open root device\n"); 2657 return (error); 2658 } 2659 break; 2660 2661 default: 2662 printf("%s: inappropriate for root file system\n", 2663 root_device->dv_xname); 2664 return (ENODEV); 2665 } 2666 2667 /* 2668 * If user specified a file system, use it. 2669 */ 2670 if (mountroot != NULL) { 2671 error = (*mountroot)(); 2672 goto done; 2673 } 2674 2675 /* 2676 * Try each file system currently configured into the kernel. 2677 */ 2678 LIST_FOREACH(v, &vfs_list, vfs_list) { 2679 if (v->vfs_mountroot == NULL) 2680 continue; 2681 #ifdef DEBUG 2682 aprint_normal("mountroot: trying %s...\n", v->vfs_name); 2683 #endif 2684 error = (*v->vfs_mountroot)(); 2685 if (!error) { 2686 aprint_normal("root file system type: %s\n", 2687 v->vfs_name); 2688 break; 2689 } 2690 } 2691 2692 if (v == NULL) { 2693 printf("no file system for %s", root_device->dv_xname); 2694 if (root_device->dv_class == DV_DISK) 2695 printf(" (dev 0x%x)", rootdev); 2696 printf("\n"); 2697 error = EFTYPE; 2698 } 2699 2700 done: 2701 if (error && root_device->dv_class == DV_DISK) { 2702 VOP_CLOSE(rootvp, FREAD, FSCRED, curproc); 2703 vrele(rootvp); 2704 } 2705 return (error); 2706 } 2707 2708 /* 2709 * Given a file system name, look up the vfsops for that 2710 * file system, or return NULL if file system isn't present 2711 * in the kernel. 2712 */ 2713 struct vfsops * 2714 vfs_getopsbyname(const char *name) 2715 { 2716 struct vfsops *v; 2717 2718 LIST_FOREACH(v, &vfs_list, vfs_list) { 2719 if (strcmp(v->vfs_name, name) == 0) 2720 break; 2721 } 2722 2723 return (v); 2724 } 2725 2726 /* 2727 * Establish a file system and initialize it. 2728 */ 2729 int 2730 vfs_attach(struct vfsops *vfs) 2731 { 2732 struct vfsops *v; 2733 int error = 0; 2734 2735 2736 /* 2737 * Make sure this file system doesn't already exist. 2738 */ 2739 LIST_FOREACH(v, &vfs_list, vfs_list) { 2740 if (strcmp(vfs->vfs_name, v->vfs_name) == 0) { 2741 error = EEXIST; 2742 goto out; 2743 } 2744 } 2745 2746 /* 2747 * Initialize the vnode operations for this file system. 2748 */ 2749 vfs_opv_init(vfs->vfs_opv_descs); 2750 2751 /* 2752 * Now initialize the file system itself. 2753 */ 2754 (*vfs->vfs_init)(); 2755 2756 /* 2757 * ...and link it into the kernel's list. 2758 */ 2759 LIST_INSERT_HEAD(&vfs_list, vfs, vfs_list); 2760 2761 /* 2762 * Sanity: make sure the reference count is 0. 2763 */ 2764 vfs->vfs_refcount = 0; 2765 2766 out: 2767 return (error); 2768 } 2769 2770 /* 2771 * Remove a file system from the kernel. 2772 */ 2773 int 2774 vfs_detach(struct vfsops *vfs) 2775 { 2776 struct vfsops *v; 2777 2778 /* 2779 * Make sure no one is using the filesystem. 2780 */ 2781 if (vfs->vfs_refcount != 0) 2782 return (EBUSY); 2783 2784 /* 2785 * ...and remove it from the kernel's list. 2786 */ 2787 LIST_FOREACH(v, &vfs_list, vfs_list) { 2788 if (v == vfs) { 2789 LIST_REMOVE(v, vfs_list); 2790 break; 2791 } 2792 } 2793 2794 if (v == NULL) 2795 return (ESRCH); 2796 2797 /* 2798 * Now run the file system-specific cleanups. 2799 */ 2800 (*vfs->vfs_done)(); 2801 2802 /* 2803 * Free the vnode operations vector. 2804 */ 2805 vfs_opv_free(vfs->vfs_opv_descs); 2806 return (0); 2807 } 2808 2809 void 2810 vfs_reinit(void) 2811 { 2812 struct vfsops *vfs; 2813 2814 LIST_FOREACH(vfs, &vfs_list, vfs_list) { 2815 if (vfs->vfs_reinit) { 2816 (*vfs->vfs_reinit)(); 2817 } 2818 } 2819 } 2820 2821 /* 2822 * Request a filesystem to suspend write operations. 2823 */ 2824 int 2825 vfs_write_suspend(struct mount *mp, int slpflag, int slptimeo) 2826 { 2827 struct proc *p = curproc; /* XXX */ 2828 int error; 2829 2830 while ((mp->mnt_iflag & IMNT_SUSPEND)) { 2831 if (slptimeo < 0) 2832 return EWOULDBLOCK; 2833 error = tsleep(&mp->mnt_flag, slpflag, "suspwt1", slptimeo); 2834 if (error) 2835 return error; 2836 } 2837 mp->mnt_iflag |= IMNT_SUSPEND; 2838 2839 simple_lock(&mp->mnt_slock); 2840 if (mp->mnt_writeopcountupper > 0) 2841 ltsleep(&mp->mnt_writeopcountupper, PUSER - 1, "suspwt", 2842 0, &mp->mnt_slock); 2843 simple_unlock(&mp->mnt_slock); 2844 2845 error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p); 2846 if (error) { 2847 vfs_write_resume(mp); 2848 return error; 2849 } 2850 mp->mnt_iflag |= IMNT_SUSPENDLOW; 2851 2852 simple_lock(&mp->mnt_slock); 2853 if (mp->mnt_writeopcountlower > 0) 2854 ltsleep(&mp->mnt_writeopcountlower, PUSER - 1, "suspwt", 2855 0, &mp->mnt_slock); 2856 mp->mnt_iflag |= IMNT_SUSPENDED; 2857 simple_unlock(&mp->mnt_slock); 2858 2859 return 0; 2860 } 2861 2862 /* 2863 * Request a filesystem to resume write operations. 2864 */ 2865 void 2866 vfs_write_resume(struct mount *mp) 2867 { 2868 2869 if ((mp->mnt_iflag & IMNT_SUSPEND) == 0) 2870 return; 2871 mp->mnt_iflag &= ~(IMNT_SUSPEND | IMNT_SUSPENDLOW | IMNT_SUSPENDED); 2872 wakeup(&mp->mnt_flag); 2873 } 2874 2875 void 2876 copy_statvfs_info(struct statvfs *sbp, const struct mount *mp) 2877 { 2878 const struct statvfs *mbp; 2879 2880 if (sbp == (mbp = &mp->mnt_stat)) 2881 return; 2882 2883 (void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx)); 2884 sbp->f_fsid = mbp->f_fsid; 2885 sbp->f_owner = mbp->f_owner; 2886 sbp->f_flag = mbp->f_flag; 2887 sbp->f_syncwrites = mbp->f_syncwrites; 2888 sbp->f_asyncwrites = mbp->f_asyncwrites; 2889 sbp->f_syncreads = mbp->f_syncreads; 2890 sbp->f_asyncreads = mbp->f_asyncreads; 2891 (void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare)); 2892 (void)memcpy(sbp->f_fstypename, mbp->f_fstypename, 2893 sizeof(sbp->f_fstypename)); 2894 (void)memcpy(sbp->f_mntonname, mbp->f_mntonname, 2895 sizeof(sbp->f_mntonname)); 2896 (void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname, 2897 sizeof(sbp->f_mntfromname)); 2898 sbp->f_namemax = mbp->f_namemax; 2899 } 2900 2901 int 2902 set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom, 2903 struct mount *mp, struct proc *p) 2904 { 2905 int error; 2906 size_t size; 2907 struct statvfs *sfs = &mp->mnt_stat; 2908 int (*fun)(const void *, void *, size_t, size_t *); 2909 2910 (void)strncpy(mp->mnt_stat.f_fstypename, mp->mnt_op->vfs_name, 2911 sizeof(mp->mnt_stat.f_fstypename)); 2912 2913 if (onp) { 2914 struct cwdinfo *cwdi = p->p_cwdi; 2915 fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr; 2916 if (cwdi->cwdi_rdir != NULL) { 2917 size_t len; 2918 char *bp; 2919 char *path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); 2920 2921 if (!path) /* XXX can't happen with M_WAITOK */ 2922 return ENOMEM; 2923 2924 bp = path + MAXPATHLEN; 2925 *--bp = '\0'; 2926 error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, 2927 path, MAXPATHLEN / 2, 0, p); 2928 if (error) { 2929 free(path, M_TEMP); 2930 return error; 2931 } 2932 2933 len = strlen(bp); 2934 if (len > sizeof(sfs->f_mntonname) - 1) 2935 len = sizeof(sfs->f_mntonname) - 1; 2936 (void)strncpy(sfs->f_mntonname, bp, len); 2937 free(path, M_TEMP); 2938 2939 if (len < sizeof(sfs->f_mntonname) - 1) { 2940 error = (*fun)(onp, &sfs->f_mntonname[len], 2941 sizeof(sfs->f_mntonname) - len - 1, &size); 2942 if (error) 2943 return error; 2944 size += len; 2945 } else { 2946 size = len; 2947 } 2948 } else { 2949 error = (*fun)(onp, &sfs->f_mntonname, 2950 sizeof(sfs->f_mntonname) - 1, &size); 2951 if (error) 2952 return error; 2953 } 2954 (void)memset(sfs->f_mntonname + size, 0, 2955 sizeof(sfs->f_mntonname) - size); 2956 } 2957 2958 if (fromp) { 2959 fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr; 2960 error = (*fun)(fromp, sfs->f_mntfromname, 2961 sizeof(sfs->f_mntfromname) - 1, &size); 2962 if (error) 2963 return error; 2964 (void)memset(sfs->f_mntfromname + size, 0, 2965 sizeof(sfs->f_mntfromname) - size); 2966 } 2967 return 0; 2968 } 2969 2970 #ifdef DDB 2971 static const char buf_flagbits[] = BUF_FLAGBITS; 2972 2973 void 2974 vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...)) 2975 { 2976 char bf[1024]; 2977 2978 (*pr)(" vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" dev 0x%x\n", 2979 bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_dev); 2980 2981 bitmask_snprintf(bp->b_flags, buf_flagbits, bf, sizeof(bf)); 2982 (*pr)(" error %d flags 0x%s\n", bp->b_error, bf); 2983 2984 (*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n", 2985 bp->b_bufsize, bp->b_bcount, bp->b_resid); 2986 (*pr)(" data %p saveaddr %p dep %p\n", 2987 bp->b_data, bp->b_saveaddr, LIST_FIRST(&bp->b_dep)); 2988 (*pr)(" iodone %p\n", bp->b_iodone); 2989 } 2990 2991 2992 void 2993 vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...)) 2994 { 2995 char bf[256]; 2996 2997 uvm_object_printit(&vp->v_uobj, full, pr); 2998 bitmask_snprintf(vp->v_flag, vnode_flagbits, bf, sizeof(bf)); 2999 (*pr)("\nVNODE flags %s\n", bf); 3000 (*pr)("mp %p numoutput %d size 0x%llx\n", 3001 vp->v_mount, vp->v_numoutput, vp->v_size); 3002 3003 (*pr)("data %p usecount %d writecount %ld holdcnt %ld numoutput %d\n", 3004 vp->v_data, vp->v_usecount, vp->v_writecount, 3005 vp->v_holdcnt, vp->v_numoutput); 3006 3007 (*pr)("tag %s(%d) type %s(%d) mount %p typedata %p\n", 3008 ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag, 3009 ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type, 3010 vp->v_mount, vp->v_mountedhere); 3011 3012 if (full) { 3013 struct buf *bp; 3014 3015 (*pr)("clean bufs:\n"); 3016 LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) { 3017 (*pr)(" bp %p\n", bp); 3018 vfs_buf_print(bp, full, pr); 3019 } 3020 3021 (*pr)("dirty bufs:\n"); 3022 LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) { 3023 (*pr)(" bp %p\n", bp); 3024 vfs_buf_print(bp, full, pr); 3025 } 3026 } 3027 } 3028 3029 void 3030 vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...)) 3031 { 3032 char sbuf[256]; 3033 3034 (*pr)("vnodecovered = %p syncer = %p data = %p\n", 3035 mp->mnt_vnodecovered,mp->mnt_syncer,mp->mnt_data); 3036 3037 (*pr)("fs_bshift %d dev_bshift = %d\n", 3038 mp->mnt_fs_bshift,mp->mnt_dev_bshift); 3039 3040 bitmask_snprintf(mp->mnt_flag, __MNT_FLAG_BITS, sbuf, sizeof(sbuf)); 3041 (*pr)("flag = %s\n", sbuf); 3042 3043 bitmask_snprintf(mp->mnt_iflag, __IMNT_FLAG_BITS, sbuf, sizeof(sbuf)); 3044 (*pr)("iflag = %s\n", sbuf); 3045 3046 /* XXX use lockmgr_printinfo */ 3047 if (mp->mnt_lock.lk_sharecount) 3048 (*pr)(" lock type %s: SHARED (count %d)", mp->mnt_lock.lk_wmesg, 3049 mp->mnt_lock.lk_sharecount); 3050 else if (mp->mnt_lock.lk_flags & LK_HAVE_EXCL) { 3051 (*pr)(" lock type %s: EXCL (count %d) by ", 3052 mp->mnt_lock.lk_wmesg, mp->mnt_lock.lk_exclusivecount); 3053 if (mp->mnt_lock.lk_flags & LK_SPIN) 3054 (*pr)("processor %lu", mp->mnt_lock.lk_cpu); 3055 else 3056 (*pr)("pid %d.%d", mp->mnt_lock.lk_lockholder, 3057 mp->mnt_lock.lk_locklwp); 3058 } else 3059 (*pr)(" not locked"); 3060 if ((mp->mnt_lock.lk_flags & LK_SPIN) == 0 && mp->mnt_lock.lk_waitcount > 0) 3061 (*pr)(" with %d pending", mp->mnt_lock.lk_waitcount); 3062 3063 (*pr)("\n"); 3064 3065 if (mp->mnt_unmounter) { 3066 (*pr)("unmounter pid = %d ",mp->mnt_unmounter->p_pid); 3067 } 3068 (*pr)("wcnt = %d, writeopcountupper = %d, writeopcountupper = %d\n", 3069 mp->mnt_wcnt,mp->mnt_writeopcountupper,mp->mnt_writeopcountlower); 3070 3071 (*pr)("statvfs cache:\n"); 3072 (*pr)("\tbsize = %lu\n",mp->mnt_stat.f_bsize); 3073 (*pr)("\tfrsize = %lu\n",mp->mnt_stat.f_frsize); 3074 (*pr)("\tiosize = %lu\n",mp->mnt_stat.f_iosize); 3075 3076 (*pr)("\tblocks = "PRIu64"\n",mp->mnt_stat.f_blocks); 3077 (*pr)("\tbfree = "PRIu64"\n",mp->mnt_stat.f_bfree); 3078 (*pr)("\tbavail = "PRIu64"\n",mp->mnt_stat.f_bavail); 3079 (*pr)("\tbresvd = "PRIu64"\n",mp->mnt_stat.f_bresvd); 3080 3081 (*pr)("\tfiles = "PRIu64"\n",mp->mnt_stat.f_files); 3082 (*pr)("\tffree = "PRIu64"\n",mp->mnt_stat.f_ffree); 3083 (*pr)("\tfavail = "PRIu64"\n",mp->mnt_stat.f_favail); 3084 (*pr)("\tfresvd = "PRIu64"\n",mp->mnt_stat.f_fresvd); 3085 3086 (*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n", 3087 mp->mnt_stat.f_fsidx.__fsid_val[0], 3088 mp->mnt_stat.f_fsidx.__fsid_val[1]); 3089 3090 (*pr)("\towner = %"PRIu32"\n",mp->mnt_stat.f_owner); 3091 (*pr)("\tnamemax = %lu\n",mp->mnt_stat.f_namemax); 3092 3093 bitmask_snprintf(mp->mnt_stat.f_flag, __MNT_FLAG_BITS, sbuf, 3094 sizeof(sbuf)); 3095 (*pr)("\tflag = %s\n",sbuf); 3096 (*pr)("\tsyncwrites = " PRIu64 "\n",mp->mnt_stat.f_syncwrites); 3097 (*pr)("\tasyncwrites = " PRIu64 "\n",mp->mnt_stat.f_asyncwrites); 3098 (*pr)("\tsyncreads = " PRIu64 "\n",mp->mnt_stat.f_syncreads); 3099 (*pr)("\tasyncreads = " PRIu64 "\n",mp->mnt_stat.f_asyncreads); 3100 (*pr)("\tfstypename = %s\n",mp->mnt_stat.f_fstypename); 3101 (*pr)("\tmntonname = %s\n",mp->mnt_stat.f_mntonname); 3102 (*pr)("\tmntfromname = %s\n",mp->mnt_stat.f_mntfromname); 3103 3104 { 3105 int cnt = 0; 3106 struct vnode *vp; 3107 (*pr)("locked vnodes ="); 3108 /* XXX would take mountlist lock, except ddb may not have context */ 3109 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 3110 if (VOP_ISLOCKED(vp)) { 3111 if ((++cnt % 6) == 0) { 3112 (*pr)(" %p,\n\t", vp); 3113 } else { 3114 (*pr)(" %p,", vp); 3115 } 3116 } 3117 } 3118 (*pr)("\n"); 3119 } 3120 3121 if (full) { 3122 int cnt = 0; 3123 struct vnode *vp; 3124 (*pr)("all vnodes ="); 3125 /* XXX would take mountlist lock, except ddb may not have context */ 3126 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 3127 if (!LIST_NEXT(vp, v_mntvnodes)) { 3128 (*pr)(" %p", vp); 3129 } else if ((++cnt % 6) == 0) { 3130 (*pr)(" %p,\n\t", vp); 3131 } else { 3132 (*pr)(" %p,", vp); 3133 } 3134 } 3135 (*pr)("\n", vp); 3136 } 3137 } 3138 #endif /* DDB */ 3139