1 /* $NetBSD: vfs_subr.c,v 1.215 2004/01/14 11:28:05 yamt Exp $ */ 2 3 /*- 4 * Copyright (c) 1997, 1998 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 9 * NASA Ames Research Center. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the NetBSD 22 * Foundation, Inc. and its contributors. 23 * 4. Neither the name of The NetBSD Foundation nor the names of its 24 * contributors may be used to endorse or promote products derived 25 * from this software without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 28 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 29 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 31 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 * POSSIBILITY OF SUCH DAMAGE. 38 */ 39 40 /* 41 * Copyright (c) 1989, 1993 42 * The Regents of the University of California. All rights reserved. 43 * (c) UNIX System Laboratories, Inc. 44 * All or some portions of this file are derived from material licensed 45 * to the University of California by American Telephone and Telegraph 46 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 47 * the permission of UNIX System Laboratories, Inc. 48 * 49 * Redistribution and use in source and binary forms, with or without 50 * modification, are permitted provided that the following conditions 51 * are met: 52 * 1. Redistributions of source code must retain the above copyright 53 * notice, this list of conditions and the following disclaimer. 54 * 2. Redistributions in binary form must reproduce the above copyright 55 * notice, this list of conditions and the following disclaimer in the 56 * documentation and/or other materials provided with the distribution. 57 * 3. Neither the name of the University nor the names of its contributors 58 * may be used to endorse or promote products derived from this software 59 * without specific prior written permission. 60 * 61 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 62 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 63 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 64 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 65 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 66 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 67 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 68 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 69 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 70 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 71 * SUCH DAMAGE. 72 * 73 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 74 */ 75 76 /* 77 * External virtual filesystem routines 78 */ 79 80 #include <sys/cdefs.h> 81 __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.215 2004/01/14 11:28:05 yamt Exp $"); 82 83 #include "opt_inet.h" 84 #include "opt_ddb.h" 85 #include "opt_compat_netbsd.h" 86 #include "opt_compat_43.h" 87 88 #include <sys/param.h> 89 #include <sys/systm.h> 90 #include <sys/proc.h> 91 #include <sys/kernel.h> 92 #include <sys/mount.h> 93 #include <sys/time.h> 94 #include <sys/event.h> 95 #include <sys/fcntl.h> 96 #include <sys/vnode.h> 97 #include <sys/stat.h> 98 #include <sys/namei.h> 99 #include <sys/ucred.h> 100 #include <sys/buf.h> 101 #include <sys/errno.h> 102 #include <sys/malloc.h> 103 #include <sys/domain.h> 104 #include <sys/mbuf.h> 105 #include <sys/sa.h> 106 #include <sys/syscallargs.h> 107 #include <sys/device.h> 108 #include <sys/dirent.h> 109 #include <sys/filedesc.h> 110 111 #include <miscfs/specfs/specdev.h> 112 #include <miscfs/genfs/genfs.h> 113 #include <miscfs/syncfs/syncfs.h> 114 115 #include <netinet/in.h> 116 117 #include <uvm/uvm.h> 118 #include <uvm/uvm_ddb.h> 119 120 #include <netinet/in.h> 121 122 #include <sys/sysctl.h> 123 124 const enum vtype iftovt_tab[16] = { 125 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 126 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, 127 }; 128 const int vttoif_tab[9] = { 129 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 130 S_IFSOCK, S_IFIFO, S_IFMT, 131 }; 132 133 int doforce = 1; /* 1 => permit forcible unmounting */ 134 int prtactive = 0; /* 1 => print out reclaim of active vnodes */ 135 136 extern int dovfsusermount; /* 1 => permit any user to mount filesystems */ 137 138 /* 139 * Insq/Remq for the vnode usage lists. 140 */ 141 #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) 142 #define bufremvn(bp) { \ 143 LIST_REMOVE(bp, b_vnbufs); \ 144 (bp)->b_vnbufs.le_next = NOLIST; \ 145 } 146 /* TAILQ_HEAD(freelst, vnode) vnode_free_list = vnode free list (in vnode.h) */ 147 struct freelst vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list); 148 struct freelst vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list); 149 150 struct mntlist mountlist = /* mounted filesystem list */ 151 CIRCLEQ_HEAD_INITIALIZER(mountlist); 152 struct vfs_list_head vfs_list = /* vfs list */ 153 LIST_HEAD_INITIALIZER(vfs_list); 154 155 struct nfs_public nfs_pub; /* publicly exported FS */ 156 157 struct simplelock mountlist_slock = SIMPLELOCK_INITIALIZER; 158 static struct simplelock mntid_slock = SIMPLELOCK_INITIALIZER; 159 struct simplelock mntvnode_slock = SIMPLELOCK_INITIALIZER; 160 struct simplelock vnode_free_list_slock = SIMPLELOCK_INITIALIZER; 161 struct simplelock spechash_slock = SIMPLELOCK_INITIALIZER; 162 163 /* XXX - gross; single global lock to protect v_numoutput */ 164 struct simplelock global_v_numoutput_slock = SIMPLELOCK_INITIALIZER; 165 166 /* 167 * These define the root filesystem and device. 168 */ 169 struct mount *rootfs; 170 struct vnode *rootvnode; 171 struct device *root_device; /* root device */ 172 173 struct pool vnode_pool; /* memory pool for vnodes */ 174 175 MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes"); 176 177 /* 178 * Local declarations. 179 */ 180 void insmntque __P((struct vnode *, struct mount *)); 181 int getdevvp __P((dev_t, struct vnode **, enum vtype)); 182 void vgoneall __P((struct vnode *)); 183 184 void vclean(struct vnode *, int, struct proc *); 185 186 static int vfs_hang_addrlist __P((struct mount *, struct netexport *, 187 struct export_args *)); 188 static int vfs_free_netcred __P((struct radix_node *, void *)); 189 static void vfs_free_addrlist __P((struct netexport *)); 190 static struct vnode *getcleanvnode __P((struct proc *)); 191 192 #ifdef DEBUG 193 void printlockedvnodes __P((void)); 194 #endif 195 196 /* 197 * Initialize the vnode management data structures. 198 */ 199 void 200 vntblinit() 201 { 202 203 pool_init(&vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodepl", 204 &pool_allocator_nointr); 205 206 /* 207 * Initialize the filesystem syncer. 208 */ 209 vn_initialize_syncerd(); 210 } 211 212 int 213 vfs_drainvnodes(long target, struct proc *p) 214 { 215 216 simple_lock(&vnode_free_list_slock); 217 while (numvnodes > target) { 218 struct vnode *vp; 219 220 vp = getcleanvnode(p); 221 if (vp == NULL) 222 return EBUSY; /* give up */ 223 pool_put(&vnode_pool, vp); 224 simple_lock(&vnode_free_list_slock); 225 numvnodes--; 226 } 227 simple_unlock(&vnode_free_list_slock); 228 229 return 0; 230 } 231 232 /* 233 * grab a vnode from freelist and clean it. 234 */ 235 struct vnode * 236 getcleanvnode(p) 237 struct proc *p; 238 { 239 struct vnode *vp; 240 struct mount *mp; 241 struct freelst *listhd; 242 243 LOCK_ASSERT(simple_lock_held(&vnode_free_list_slock)); 244 if ((vp = TAILQ_FIRST(listhd = &vnode_free_list)) == NULL) 245 vp = TAILQ_FIRST(listhd = &vnode_hold_list); 246 for (; vp != NULL; vp = TAILQ_NEXT(vp, v_freelist)) { 247 if (!simple_lock_try(&vp->v_interlock)) 248 continue; 249 if ((vp->v_flag & VLAYER) == 0) { 250 if (vn_start_write(vp, &mp, V_NOWAIT) == 0) 251 break; 252 } else if (VOP_ISLOCKED(vp) == 0) { 253 if (vn_start_write(vp, &mp, V_NOWAIT) == 0) 254 break; 255 } 256 mp = NULL; 257 simple_unlock(&vp->v_interlock); 258 } 259 260 if (vp == NULLVP) { 261 simple_unlock(&vnode_free_list_slock); 262 return NULLVP; 263 } 264 265 if (vp->v_usecount) 266 panic("free vnode isn't, vp %p", vp); 267 TAILQ_REMOVE(listhd, vp, v_freelist); 268 /* see comment on why 0xdeadb is set at end of vgone (below) */ 269 vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb; 270 simple_unlock(&vnode_free_list_slock); 271 vp->v_lease = NULL; 272 273 if (vp->v_type != VBAD) 274 vgonel(vp, p); 275 else 276 simple_unlock(&vp->v_interlock); 277 vn_finished_write(mp, 0); 278 #ifdef DIAGNOSTIC 279 if (vp->v_data || vp->v_uobj.uo_npages || 280 TAILQ_FIRST(&vp->v_uobj.memq)) 281 panic("cleaned vnode isn't, vp %p", vp); 282 if (vp->v_numoutput) 283 panic("clean vnode has pending I/O's, vp %p", vp); 284 #endif 285 KASSERT((vp->v_flag & VONWORKLST) == 0); 286 287 return vp; 288 } 289 290 /* 291 * Mark a mount point as busy. Used to synchronize access and to delay 292 * unmounting. Interlock is not released on failure. 293 */ 294 int 295 vfs_busy(mp, flags, interlkp) 296 struct mount *mp; 297 int flags; 298 struct simplelock *interlkp; 299 { 300 int lkflags; 301 302 while (mp->mnt_iflag & IMNT_UNMOUNT) { 303 int gone; 304 305 if (flags & LK_NOWAIT) 306 return (ENOENT); 307 if ((flags & LK_RECURSEFAIL) && mp->mnt_unmounter != NULL 308 && mp->mnt_unmounter == curproc) 309 return (EDEADLK); 310 if (interlkp) 311 simple_unlock(interlkp); 312 /* 313 * Since all busy locks are shared except the exclusive 314 * lock granted when unmounting, the only place that a 315 * wakeup needs to be done is at the release of the 316 * exclusive lock at the end of dounmount. 317 * 318 * XXX MP: add spinlock protecting mnt_wcnt here once you 319 * can atomically unlock-and-sleep. 320 */ 321 mp->mnt_wcnt++; 322 tsleep((caddr_t)mp, PVFS, "vfs_busy", 0); 323 mp->mnt_wcnt--; 324 gone = mp->mnt_iflag & IMNT_GONE; 325 326 if (mp->mnt_wcnt == 0) 327 wakeup(&mp->mnt_wcnt); 328 if (interlkp) 329 simple_lock(interlkp); 330 if (gone) 331 return (ENOENT); 332 } 333 lkflags = LK_SHARED; 334 if (interlkp) 335 lkflags |= LK_INTERLOCK; 336 if (lockmgr(&mp->mnt_lock, lkflags, interlkp)) 337 panic("vfs_busy: unexpected lock failure"); 338 return (0); 339 } 340 341 /* 342 * Free a busy filesystem. 343 */ 344 void 345 vfs_unbusy(mp) 346 struct mount *mp; 347 { 348 349 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL); 350 } 351 352 /* 353 * Lookup a filesystem type, and if found allocate and initialize 354 * a mount structure for it. 355 * 356 * Devname is usually updated by mount(8) after booting. 357 */ 358 int 359 vfs_rootmountalloc(fstypename, devname, mpp) 360 char *fstypename; 361 char *devname; 362 struct mount **mpp; 363 { 364 struct vfsops *vfsp = NULL; 365 struct mount *mp; 366 367 LIST_FOREACH(vfsp, &vfs_list, vfs_list) 368 if (!strncmp(vfsp->vfs_name, fstypename, MFSNAMELEN)) 369 break; 370 371 if (vfsp == NULL) 372 return (ENODEV); 373 mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); 374 memset((char *)mp, 0, (u_long)sizeof(struct mount)); 375 lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0); 376 (void)vfs_busy(mp, LK_NOWAIT, 0); 377 LIST_INIT(&mp->mnt_vnodelist); 378 mp->mnt_op = vfsp; 379 mp->mnt_flag = MNT_RDONLY; 380 mp->mnt_vnodecovered = NULLVP; 381 vfsp->vfs_refcount++; 382 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, MFSNAMELEN); 383 mp->mnt_stat.f_mntonname[0] = '/'; 384 (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); 385 *mpp = mp; 386 return (0); 387 } 388 389 /* 390 * Lookup a mount point by filesystem identifier. 391 */ 392 struct mount * 393 vfs_getvfs(fsid) 394 fsid_t *fsid; 395 { 396 struct mount *mp; 397 398 simple_lock(&mountlist_slock); 399 CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) { 400 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && 401 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { 402 simple_unlock(&mountlist_slock); 403 return (mp); 404 } 405 } 406 simple_unlock(&mountlist_slock); 407 return ((struct mount *)0); 408 } 409 410 /* 411 * Get a new unique fsid 412 */ 413 void 414 vfs_getnewfsid(mp) 415 struct mount *mp; 416 { 417 static u_short xxxfs_mntid; 418 fsid_t tfsid; 419 int mtype; 420 421 simple_lock(&mntid_slock); 422 mtype = makefstype(mp->mnt_op->vfs_name); 423 mp->mnt_stat.f_fsid.val[0] = makedev(mtype, 0); 424 mp->mnt_stat.f_fsid.val[1] = mtype; 425 if (xxxfs_mntid == 0) 426 ++xxxfs_mntid; 427 tfsid.val[0] = makedev(mtype & 0xff, xxxfs_mntid); 428 tfsid.val[1] = mtype; 429 if (!CIRCLEQ_EMPTY(&mountlist)) { 430 while (vfs_getvfs(&tfsid)) { 431 tfsid.val[0]++; 432 xxxfs_mntid++; 433 } 434 } 435 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; 436 simple_unlock(&mntid_slock); 437 } 438 439 /* 440 * Make a 'unique' number from a mount type name. 441 */ 442 long 443 makefstype(type) 444 const char *type; 445 { 446 long rv; 447 448 for (rv = 0; *type; type++) { 449 rv <<= 2; 450 rv ^= *type; 451 } 452 return rv; 453 } 454 455 456 /* 457 * Set vnode attributes to VNOVAL 458 */ 459 void 460 vattr_null(vap) 461 struct vattr *vap; 462 { 463 464 vap->va_type = VNON; 465 466 /* 467 * Assign individually so that it is safe even if size and 468 * sign of each member are varied. 469 */ 470 vap->va_mode = VNOVAL; 471 vap->va_nlink = VNOVAL; 472 vap->va_uid = VNOVAL; 473 vap->va_gid = VNOVAL; 474 vap->va_fsid = VNOVAL; 475 vap->va_fileid = VNOVAL; 476 vap->va_size = VNOVAL; 477 vap->va_blocksize = VNOVAL; 478 vap->va_atime.tv_sec = 479 vap->va_mtime.tv_sec = 480 vap->va_ctime.tv_sec = 481 vap->va_birthtime.tv_sec = VNOVAL; 482 vap->va_atime.tv_nsec = 483 vap->va_mtime.tv_nsec = 484 vap->va_ctime.tv_nsec = 485 vap->va_birthtime.tv_nsec = VNOVAL; 486 vap->va_gen = VNOVAL; 487 vap->va_flags = VNOVAL; 488 vap->va_rdev = VNOVAL; 489 vap->va_bytes = VNOVAL; 490 vap->va_vaflags = 0; 491 } 492 493 /* 494 * Routines having to do with the management of the vnode table. 495 */ 496 extern int (**dead_vnodeop_p) __P((void *)); 497 long numvnodes; 498 499 /* 500 * Return the next vnode from the free list. 501 */ 502 int 503 getnewvnode(tag, mp, vops, vpp) 504 enum vtagtype tag; 505 struct mount *mp; 506 int (**vops) __P((void *)); 507 struct vnode **vpp; 508 { 509 extern struct uvm_pagerops uvm_vnodeops; 510 struct uvm_object *uobj; 511 struct proc *p = curproc; /* XXX */ 512 static int toggle; 513 struct vnode *vp; 514 int error = 0, tryalloc; 515 516 try_again: 517 if (mp) { 518 /* 519 * Mark filesystem busy while we're creating a vnode. 520 * If unmount is in progress, this will wait; if the 521 * unmount succeeds (only if umount -f), this will 522 * return an error. If the unmount fails, we'll keep 523 * going afterwards. 524 * (This puts the per-mount vnode list logically under 525 * the protection of the vfs_busy lock). 526 */ 527 error = vfs_busy(mp, LK_RECURSEFAIL, 0); 528 if (error && error != EDEADLK) 529 return error; 530 } 531 532 /* 533 * We must choose whether to allocate a new vnode or recycle an 534 * existing one. The criterion for allocating a new one is that 535 * the total number of vnodes is less than the number desired or 536 * there are no vnodes on either free list. Generally we only 537 * want to recycle vnodes that have no buffers associated with 538 * them, so we look first on the vnode_free_list. If it is empty, 539 * we next consider vnodes with referencing buffers on the 540 * vnode_hold_list. The toggle ensures that half the time we 541 * will use a buffer from the vnode_hold_list, and half the time 542 * we will allocate a new one unless the list has grown to twice 543 * the desired size. We are reticent to recycle vnodes from the 544 * vnode_hold_list because we will lose the identity of all its 545 * referencing buffers. 546 */ 547 548 vp = NULL; 549 550 simple_lock(&vnode_free_list_slock); 551 552 toggle ^= 1; 553 if (numvnodes > 2 * desiredvnodes) 554 toggle = 0; 555 556 tryalloc = numvnodes < desiredvnodes || 557 (TAILQ_FIRST(&vnode_free_list) == NULL && 558 (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle)); 559 560 if (tryalloc && 561 (vp = pool_get(&vnode_pool, PR_NOWAIT)) != NULL) { 562 numvnodes++; 563 simple_unlock(&vnode_free_list_slock); 564 memset(vp, 0, sizeof(*vp)); 565 simple_lock_init(&vp->v_interlock); 566 uobj = &vp->v_uobj; 567 uobj->pgops = &uvm_vnodeops; 568 uobj->uo_npages = 0; 569 TAILQ_INIT(&uobj->memq); 570 } else { 571 vp = getcleanvnode(p); 572 /* 573 * Unless this is a bad time of the month, at most 574 * the first NCPUS items on the free list are 575 * locked, so this is close enough to being empty. 576 */ 577 if (vp == NULLVP) { 578 if (mp && error != EDEADLK) 579 vfs_unbusy(mp); 580 if (tryalloc) { 581 printf("WARNING: unable to allocate new " 582 "vnode, retrying...\n"); 583 (void) tsleep(&lbolt, PRIBIO, "newvn", hz); 584 goto try_again; 585 } 586 tablefull("vnode", "increase kern.maxvnodes or NVNODE"); 587 *vpp = 0; 588 return (ENFILE); 589 } 590 vp->v_flag = 0; 591 vp->v_socket = NULL; 592 #ifdef VERIFIED_EXEC 593 vp->fp_status = FINGERPRINT_INVALID; 594 #endif 595 } 596 vp->v_type = VNON; 597 vp->v_vnlock = &vp->v_lock; 598 lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0); 599 cache_purge(vp); 600 vp->v_tag = tag; 601 vp->v_op = vops; 602 insmntque(vp, mp); 603 *vpp = vp; 604 vp->v_usecount = 1; 605 vp->v_data = 0; 606 simple_lock_init(&vp->v_uobj.vmobjlock); 607 608 /* 609 * initialize uvm_object within vnode. 610 */ 611 612 uobj = &vp->v_uobj; 613 KASSERT(uobj->pgops == &uvm_vnodeops); 614 KASSERT(uobj->uo_npages == 0); 615 KASSERT(TAILQ_FIRST(&uobj->memq) == NULL); 616 vp->v_size = VSIZENOTSET; 617 618 if (mp && error != EDEADLK) 619 vfs_unbusy(mp); 620 return (0); 621 } 622 623 /* 624 * This is really just the reverse of getnewvnode(). Needed for 625 * VFS_VGET functions who may need to push back a vnode in case 626 * of a locking race. 627 */ 628 void 629 ungetnewvnode(vp) 630 struct vnode *vp; 631 { 632 #ifdef DIAGNOSTIC 633 if (vp->v_usecount != 1) 634 panic("ungetnewvnode: busy vnode"); 635 #endif 636 vp->v_usecount--; 637 insmntque(vp, NULL); 638 vp->v_type = VBAD; 639 640 simple_lock(&vp->v_interlock); 641 /* 642 * Insert at head of LRU list 643 */ 644 simple_lock(&vnode_free_list_slock); 645 if (vp->v_holdcnt > 0) 646 TAILQ_INSERT_HEAD(&vnode_hold_list, vp, v_freelist); 647 else 648 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 649 simple_unlock(&vnode_free_list_slock); 650 simple_unlock(&vp->v_interlock); 651 } 652 653 /* 654 * Move a vnode from one mount queue to another. 655 */ 656 void 657 insmntque(vp, mp) 658 struct vnode *vp; 659 struct mount *mp; 660 { 661 662 #ifdef DIAGNOSTIC 663 if ((mp != NULL) && 664 (mp->mnt_iflag & IMNT_UNMOUNT) && 665 !(mp->mnt_flag & MNT_SOFTDEP) && 666 vp->v_tag != VT_VFS) { 667 panic("insmntque into dying filesystem"); 668 } 669 #endif 670 671 simple_lock(&mntvnode_slock); 672 /* 673 * Delete from old mount point vnode list, if on one. 674 */ 675 if (vp->v_mount != NULL) 676 LIST_REMOVE(vp, v_mntvnodes); 677 /* 678 * Insert into list of vnodes for the new mount point, if available. 679 */ 680 if ((vp->v_mount = mp) != NULL) 681 LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); 682 simple_unlock(&mntvnode_slock); 683 } 684 685 /* 686 * Update outstanding I/O count and do wakeup if requested. 687 */ 688 void 689 vwakeup(bp) 690 struct buf *bp; 691 { 692 struct vnode *vp; 693 694 if ((vp = bp->b_vp) != NULL) { 695 /* XXX global lock hack 696 * can't use v_interlock here since this is called 697 * in interrupt context from biodone(). 698 */ 699 simple_lock(&global_v_numoutput_slock); 700 if (--vp->v_numoutput < 0) 701 panic("vwakeup: neg numoutput, vp %p", vp); 702 if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) { 703 vp->v_flag &= ~VBWAIT; 704 wakeup((caddr_t)&vp->v_numoutput); 705 } 706 simple_unlock(&global_v_numoutput_slock); 707 } 708 } 709 710 /* 711 * Flush out and invalidate all buffers associated with a vnode. 712 * Called with the underlying vnode locked, which should prevent new dirty 713 * buffers from being queued. 714 */ 715 int 716 vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) 717 struct vnode *vp; 718 int flags; 719 struct ucred *cred; 720 struct proc *p; 721 int slpflag, slptimeo; 722 { 723 struct buf *bp, *nbp; 724 int s, error; 725 int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO | 726 (flags & V_SAVE ? PGO_CLEANIT : 0); 727 728 /* XXXUBC this doesn't look at flags or slp* */ 729 simple_lock(&vp->v_interlock); 730 error = VOP_PUTPAGES(vp, 0, 0, flushflags); 731 if (error) { 732 return error; 733 } 734 735 if (flags & V_SAVE) { 736 error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0, p); 737 if (error) 738 return (error); 739 #ifdef DIAGNOSTIC 740 s = splbio(); 741 if (vp->v_numoutput > 0 || !LIST_EMPTY(&vp->v_dirtyblkhd)) 742 panic("vinvalbuf: dirty bufs, vp %p", vp); 743 splx(s); 744 #endif 745 } 746 747 s = splbio(); 748 749 restart: 750 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 751 nbp = LIST_NEXT(bp, b_vnbufs); 752 simple_lock(&bp->b_interlock); 753 if (bp->b_flags & B_BUSY) { 754 bp->b_flags |= B_WANTED; 755 error = ltsleep((caddr_t)bp, 756 slpflag | (PRIBIO + 1) | PNORELOCK, 757 "vinvalbuf", slptimeo, &bp->b_interlock); 758 if (error) { 759 splx(s); 760 return (error); 761 } 762 goto restart; 763 } 764 bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH; 765 simple_unlock(&bp->b_interlock); 766 brelse(bp); 767 } 768 769 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 770 nbp = LIST_NEXT(bp, b_vnbufs); 771 simple_lock(&bp->b_interlock); 772 if (bp->b_flags & B_BUSY) { 773 bp->b_flags |= B_WANTED; 774 error = ltsleep((caddr_t)bp, 775 slpflag | (PRIBIO + 1) | PNORELOCK, 776 "vinvalbuf", slptimeo, &bp->b_interlock); 777 if (error) { 778 splx(s); 779 return (error); 780 } 781 goto restart; 782 } 783 /* 784 * XXX Since there are no node locks for NFS, I believe 785 * there is a slight chance that a delayed write will 786 * occur while sleeping just above, so check for it. 787 */ 788 if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) { 789 #ifdef DEBUG 790 printf("buffer still DELWRI\n"); 791 #endif 792 bp->b_flags |= B_BUSY | B_VFLUSH; 793 simple_unlock(&bp->b_interlock); 794 VOP_BWRITE(bp); 795 goto restart; 796 } 797 bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH; 798 simple_unlock(&bp->b_interlock); 799 brelse(bp); 800 } 801 802 #ifdef DIAGNOSTIC 803 if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd)) 804 panic("vinvalbuf: flush failed, vp %p", vp); 805 #endif 806 807 splx(s); 808 809 return (0); 810 } 811 812 /* 813 * Destroy any in core blocks past the truncation length. 814 * Called with the underlying vnode locked, which should prevent new dirty 815 * buffers from being queued. 816 */ 817 int 818 vtruncbuf(vp, lbn, slpflag, slptimeo) 819 struct vnode *vp; 820 daddr_t lbn; 821 int slpflag, slptimeo; 822 { 823 struct buf *bp, *nbp; 824 int s, error; 825 voff_t off; 826 827 off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift); 828 simple_lock(&vp->v_interlock); 829 error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO); 830 if (error) { 831 return error; 832 } 833 834 s = splbio(); 835 836 restart: 837 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 838 nbp = LIST_NEXT(bp, b_vnbufs); 839 if (bp->b_lblkno < lbn) 840 continue; 841 simple_lock(&bp->b_interlock); 842 if (bp->b_flags & B_BUSY) { 843 bp->b_flags |= B_WANTED; 844 error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK, 845 "vtruncbuf", slptimeo, &bp->b_interlock); 846 if (error) { 847 splx(s); 848 return (error); 849 } 850 goto restart; 851 } 852 bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH; 853 simple_unlock(&bp->b_interlock); 854 brelse(bp); 855 } 856 857 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 858 nbp = LIST_NEXT(bp, b_vnbufs); 859 if (bp->b_lblkno < lbn) 860 continue; 861 simple_lock(&bp->b_interlock); 862 if (bp->b_flags & B_BUSY) { 863 bp->b_flags |= B_WANTED; 864 error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK, 865 "vtruncbuf", slptimeo, &bp->b_interlock); 866 if (error) { 867 splx(s); 868 return (error); 869 } 870 goto restart; 871 } 872 bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH; 873 simple_unlock(&bp->b_interlock); 874 brelse(bp); 875 } 876 877 splx(s); 878 879 return (0); 880 } 881 882 void 883 vflushbuf(vp, sync) 884 struct vnode *vp; 885 int sync; 886 { 887 struct buf *bp, *nbp; 888 int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0); 889 int s; 890 891 simple_lock(&vp->v_interlock); 892 (void) VOP_PUTPAGES(vp, 0, 0, flags); 893 894 loop: 895 s = splbio(); 896 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 897 nbp = LIST_NEXT(bp, b_vnbufs); 898 simple_lock(&bp->b_interlock); 899 if ((bp->b_flags & B_BUSY)) { 900 simple_unlock(&bp->b_interlock); 901 continue; 902 } 903 if ((bp->b_flags & B_DELWRI) == 0) 904 panic("vflushbuf: not dirty, bp %p", bp); 905 bp->b_flags |= B_BUSY | B_VFLUSH; 906 simple_unlock(&bp->b_interlock); 907 splx(s); 908 /* 909 * Wait for I/O associated with indirect blocks to complete, 910 * since there is no way to quickly wait for them below. 911 */ 912 if (bp->b_vp == vp || sync == 0) 913 (void) bawrite(bp); 914 else 915 (void) bwrite(bp); 916 goto loop; 917 } 918 if (sync == 0) { 919 splx(s); 920 return; 921 } 922 simple_lock(&global_v_numoutput_slock); 923 while (vp->v_numoutput) { 924 vp->v_flag |= VBWAIT; 925 ltsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflushbuf", 0, 926 &global_v_numoutput_slock); 927 } 928 simple_unlock(&global_v_numoutput_slock); 929 splx(s); 930 if (!LIST_EMPTY(&vp->v_dirtyblkhd)) { 931 vprint("vflushbuf: dirty", vp); 932 goto loop; 933 } 934 } 935 936 /* 937 * Associate a buffer with a vnode. 938 */ 939 void 940 bgetvp(vp, bp) 941 struct vnode *vp; 942 struct buf *bp; 943 { 944 int s; 945 946 if (bp->b_vp) 947 panic("bgetvp: not free, bp %p", bp); 948 VHOLD(vp); 949 s = splbio(); 950 bp->b_vp = vp; 951 if (vp->v_type == VBLK || vp->v_type == VCHR) 952 bp->b_dev = vp->v_rdev; 953 else 954 bp->b_dev = NODEV; 955 /* 956 * Insert onto list for new vnode. 957 */ 958 bufinsvn(bp, &vp->v_cleanblkhd); 959 splx(s); 960 } 961 962 /* 963 * Disassociate a buffer from a vnode. 964 */ 965 void 966 brelvp(bp) 967 struct buf *bp; 968 { 969 struct vnode *vp; 970 int s; 971 972 if (bp->b_vp == NULL) 973 panic("brelvp: vp NULL, bp %p", bp); 974 975 s = splbio(); 976 vp = bp->b_vp; 977 /* 978 * Delete from old vnode list, if on one. 979 */ 980 if (LIST_NEXT(bp, b_vnbufs) != NOLIST) 981 bufremvn(bp); 982 983 if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_flag & VONWORKLST) && 984 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { 985 vp->v_flag &= ~VONWORKLST; 986 LIST_REMOVE(vp, v_synclist); 987 } 988 989 bp->b_vp = NULL; 990 HOLDRELE(vp); 991 splx(s); 992 } 993 994 /* 995 * Reassign a buffer from one vnode to another. 996 * Used to assign file specific control information 997 * (indirect blocks) to the vnode to which they belong. 998 * 999 * This function must be called at splbio(). 1000 */ 1001 void 1002 reassignbuf(bp, newvp) 1003 struct buf *bp; 1004 struct vnode *newvp; 1005 { 1006 struct buflists *listheadp; 1007 int delay; 1008 1009 /* 1010 * Delete from old vnode list, if on one. 1011 */ 1012 if (LIST_NEXT(bp, b_vnbufs) != NOLIST) 1013 bufremvn(bp); 1014 /* 1015 * If dirty, put on list of dirty buffers; 1016 * otherwise insert onto list of clean buffers. 1017 */ 1018 if ((bp->b_flags & B_DELWRI) == 0) { 1019 listheadp = &newvp->v_cleanblkhd; 1020 if (TAILQ_EMPTY(&newvp->v_uobj.memq) && 1021 (newvp->v_flag & VONWORKLST) && 1022 LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) { 1023 newvp->v_flag &= ~VONWORKLST; 1024 LIST_REMOVE(newvp, v_synclist); 1025 } 1026 } else { 1027 listheadp = &newvp->v_dirtyblkhd; 1028 if ((newvp->v_flag & VONWORKLST) == 0) { 1029 switch (newvp->v_type) { 1030 case VDIR: 1031 delay = dirdelay; 1032 break; 1033 case VBLK: 1034 if (newvp->v_specmountpoint != NULL) { 1035 delay = metadelay; 1036 break; 1037 } 1038 /* fall through */ 1039 default: 1040 delay = filedelay; 1041 break; 1042 } 1043 if (!newvp->v_mount || 1044 (newvp->v_mount->mnt_flag & MNT_ASYNC) == 0) 1045 vn_syncer_add_to_worklist(newvp, delay); 1046 } 1047 } 1048 bufinsvn(bp, listheadp); 1049 } 1050 1051 /* 1052 * Create a vnode for a block device. 1053 * Used for root filesystem and swap areas. 1054 * Also used for memory file system special devices. 1055 */ 1056 int 1057 bdevvp(dev, vpp) 1058 dev_t dev; 1059 struct vnode **vpp; 1060 { 1061 1062 return (getdevvp(dev, vpp, VBLK)); 1063 } 1064 1065 /* 1066 * Create a vnode for a character device. 1067 * Used for kernfs and some console handling. 1068 */ 1069 int 1070 cdevvp(dev, vpp) 1071 dev_t dev; 1072 struct vnode **vpp; 1073 { 1074 1075 return (getdevvp(dev, vpp, VCHR)); 1076 } 1077 1078 /* 1079 * Create a vnode for a device. 1080 * Used by bdevvp (block device) for root file system etc., 1081 * and by cdevvp (character device) for console and kernfs. 1082 */ 1083 int 1084 getdevvp(dev, vpp, type) 1085 dev_t dev; 1086 struct vnode **vpp; 1087 enum vtype type; 1088 { 1089 struct vnode *vp; 1090 struct vnode *nvp; 1091 int error; 1092 1093 if (dev == NODEV) { 1094 *vpp = NULLVP; 1095 return (0); 1096 } 1097 error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp); 1098 if (error) { 1099 *vpp = NULLVP; 1100 return (error); 1101 } 1102 vp = nvp; 1103 vp->v_type = type; 1104 if ((nvp = checkalias(vp, dev, NULL)) != 0) { 1105 vput(vp); 1106 vp = nvp; 1107 } 1108 *vpp = vp; 1109 return (0); 1110 } 1111 1112 /* 1113 * Check to see if the new vnode represents a special device 1114 * for which we already have a vnode (either because of 1115 * bdevvp() or because of a different vnode representing 1116 * the same block device). If such an alias exists, deallocate 1117 * the existing contents and return the aliased vnode. The 1118 * caller is responsible for filling it with its new contents. 1119 */ 1120 struct vnode * 1121 checkalias(nvp, nvp_rdev, mp) 1122 struct vnode *nvp; 1123 dev_t nvp_rdev; 1124 struct mount *mp; 1125 { 1126 struct proc *p = curproc; /* XXX */ 1127 struct vnode *vp; 1128 struct vnode **vpp; 1129 1130 if (nvp->v_type != VBLK && nvp->v_type != VCHR) 1131 return (NULLVP); 1132 1133 vpp = &speclisth[SPECHASH(nvp_rdev)]; 1134 loop: 1135 simple_lock(&spechash_slock); 1136 for (vp = *vpp; vp; vp = vp->v_specnext) { 1137 if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) 1138 continue; 1139 /* 1140 * Alias, but not in use, so flush it out. 1141 */ 1142 simple_lock(&vp->v_interlock); 1143 if (vp->v_usecount == 0) { 1144 simple_unlock(&spechash_slock); 1145 vgonel(vp, p); 1146 goto loop; 1147 } 1148 if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) { 1149 simple_unlock(&spechash_slock); 1150 goto loop; 1151 } 1152 break; 1153 } 1154 if (vp == NULL || vp->v_tag != VT_NON || vp->v_type != VBLK) { 1155 MALLOC(nvp->v_specinfo, struct specinfo *, 1156 sizeof(struct specinfo), M_VNODE, M_NOWAIT); 1157 /* XXX Erg. */ 1158 if (nvp->v_specinfo == NULL) { 1159 simple_unlock(&spechash_slock); 1160 uvm_wait("checkalias"); 1161 goto loop; 1162 } 1163 1164 nvp->v_rdev = nvp_rdev; 1165 nvp->v_hashchain = vpp; 1166 nvp->v_specnext = *vpp; 1167 nvp->v_specmountpoint = NULL; 1168 simple_unlock(&spechash_slock); 1169 nvp->v_speclockf = NULL; 1170 *vpp = nvp; 1171 if (vp != NULLVP) { 1172 nvp->v_flag |= VALIASED; 1173 vp->v_flag |= VALIASED; 1174 vput(vp); 1175 } 1176 return (NULLVP); 1177 } 1178 simple_unlock(&spechash_slock); 1179 VOP_UNLOCK(vp, 0); 1180 simple_lock(&vp->v_interlock); 1181 vclean(vp, 0, p); 1182 vp->v_op = nvp->v_op; 1183 vp->v_tag = nvp->v_tag; 1184 vp->v_vnlock = &vp->v_lock; 1185 lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0); 1186 nvp->v_type = VNON; 1187 insmntque(vp, mp); 1188 return (vp); 1189 } 1190 1191 /* 1192 * Grab a particular vnode from the free list, increment its 1193 * reference count and lock it. If the vnode lock bit is set the 1194 * vnode is being eliminated in vgone. In that case, we can not 1195 * grab the vnode, so the process is awakened when the transition is 1196 * completed, and an error returned to indicate that the vnode is no 1197 * longer usable (possibly having been changed to a new file system type). 1198 */ 1199 int 1200 vget(vp, flags) 1201 struct vnode *vp; 1202 int flags; 1203 { 1204 int error; 1205 1206 /* 1207 * If the vnode is in the process of being cleaned out for 1208 * another use, we wait for the cleaning to finish and then 1209 * return failure. Cleaning is determined by checking that 1210 * the VXLOCK flag is set. 1211 */ 1212 1213 if ((flags & LK_INTERLOCK) == 0) 1214 simple_lock(&vp->v_interlock); 1215 if (vp->v_flag & VXLOCK) { 1216 if (flags & LK_NOWAIT) { 1217 simple_unlock(&vp->v_interlock); 1218 return EBUSY; 1219 } 1220 vp->v_flag |= VXWANT; 1221 ltsleep(vp, PINOD|PNORELOCK, "vget", 0, &vp->v_interlock); 1222 return (ENOENT); 1223 } 1224 if (vp->v_usecount == 0) { 1225 simple_lock(&vnode_free_list_slock); 1226 if (vp->v_holdcnt > 0) 1227 TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist); 1228 else 1229 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 1230 simple_unlock(&vnode_free_list_slock); 1231 } 1232 vp->v_usecount++; 1233 #ifdef DIAGNOSTIC 1234 if (vp->v_usecount == 0) { 1235 vprint("vget", vp); 1236 panic("vget: usecount overflow, vp %p", vp); 1237 } 1238 #endif 1239 if (flags & LK_TYPE_MASK) { 1240 if ((error = vn_lock(vp, flags | LK_INTERLOCK))) { 1241 /* 1242 * must expand vrele here because we do not want 1243 * to call VOP_INACTIVE if the reference count 1244 * drops back to zero since it was never really 1245 * active. We must remove it from the free list 1246 * before sleeping so that multiple processes do 1247 * not try to recycle it. 1248 */ 1249 simple_lock(&vp->v_interlock); 1250 vp->v_usecount--; 1251 if (vp->v_usecount > 0) { 1252 simple_unlock(&vp->v_interlock); 1253 return (error); 1254 } 1255 /* 1256 * insert at tail of LRU list 1257 */ 1258 simple_lock(&vnode_free_list_slock); 1259 if (vp->v_holdcnt > 0) 1260 TAILQ_INSERT_TAIL(&vnode_hold_list, vp, 1261 v_freelist); 1262 else 1263 TAILQ_INSERT_TAIL(&vnode_free_list, vp, 1264 v_freelist); 1265 simple_unlock(&vnode_free_list_slock); 1266 simple_unlock(&vp->v_interlock); 1267 } 1268 return (error); 1269 } 1270 simple_unlock(&vp->v_interlock); 1271 return (0); 1272 } 1273 1274 /* 1275 * vput(), just unlock and vrele() 1276 */ 1277 void 1278 vput(vp) 1279 struct vnode *vp; 1280 { 1281 struct proc *p = curproc; /* XXX */ 1282 1283 #ifdef DIAGNOSTIC 1284 if (vp == NULL) 1285 panic("vput: null vp"); 1286 #endif 1287 simple_lock(&vp->v_interlock); 1288 vp->v_usecount--; 1289 if (vp->v_usecount > 0) { 1290 simple_unlock(&vp->v_interlock); 1291 VOP_UNLOCK(vp, 0); 1292 return; 1293 } 1294 #ifdef DIAGNOSTIC 1295 if (vp->v_usecount < 0 || vp->v_writecount != 0) { 1296 vprint("vput: bad ref count", vp); 1297 panic("vput: ref cnt"); 1298 } 1299 #endif 1300 /* 1301 * Insert at tail of LRU list. 1302 */ 1303 simple_lock(&vnode_free_list_slock); 1304 if (vp->v_holdcnt > 0) 1305 TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist); 1306 else 1307 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 1308 simple_unlock(&vnode_free_list_slock); 1309 if (vp->v_flag & VEXECMAP) { 1310 uvmexp.execpages -= vp->v_uobj.uo_npages; 1311 uvmexp.filepages += vp->v_uobj.uo_npages; 1312 } 1313 vp->v_flag &= ~(VTEXT|VEXECMAP); 1314 simple_unlock(&vp->v_interlock); 1315 VOP_INACTIVE(vp, p); 1316 } 1317 1318 /* 1319 * Vnode release. 1320 * If count drops to zero, call inactive routine and return to freelist. 1321 */ 1322 void 1323 vrele(vp) 1324 struct vnode *vp; 1325 { 1326 struct proc *p = curproc; /* XXX */ 1327 1328 #ifdef DIAGNOSTIC 1329 if (vp == NULL) 1330 panic("vrele: null vp"); 1331 #endif 1332 simple_lock(&vp->v_interlock); 1333 vp->v_usecount--; 1334 if (vp->v_usecount > 0) { 1335 simple_unlock(&vp->v_interlock); 1336 return; 1337 } 1338 #ifdef DIAGNOSTIC 1339 if (vp->v_usecount < 0 || vp->v_writecount != 0) { 1340 vprint("vrele: bad ref count", vp); 1341 panic("vrele: ref cnt vp %p", vp); 1342 } 1343 #endif 1344 /* 1345 * Insert at tail of LRU list. 1346 */ 1347 simple_lock(&vnode_free_list_slock); 1348 if (vp->v_holdcnt > 0) 1349 TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist); 1350 else 1351 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 1352 simple_unlock(&vnode_free_list_slock); 1353 if (vp->v_flag & VEXECMAP) { 1354 uvmexp.execpages -= vp->v_uobj.uo_npages; 1355 uvmexp.filepages += vp->v_uobj.uo_npages; 1356 } 1357 vp->v_flag &= ~(VTEXT|VEXECMAP); 1358 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) 1359 VOP_INACTIVE(vp, p); 1360 } 1361 1362 #ifdef DIAGNOSTIC 1363 /* 1364 * Page or buffer structure gets a reference. 1365 */ 1366 void 1367 vholdl(vp) 1368 struct vnode *vp; 1369 { 1370 1371 /* 1372 * If it is on the freelist and the hold count is currently 1373 * zero, move it to the hold list. The test of the back 1374 * pointer and the use reference count of zero is because 1375 * it will be removed from a free list by getnewvnode, 1376 * but will not have its reference count incremented until 1377 * after calling vgone. If the reference count were 1378 * incremented first, vgone would (incorrectly) try to 1379 * close the previous instance of the underlying object. 1380 * So, the back pointer is explicitly set to `0xdeadb' in 1381 * getnewvnode after removing it from a freelist to ensure 1382 * that we do not try to move it here. 1383 */ 1384 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) && 1385 vp->v_holdcnt == 0 && vp->v_usecount == 0) { 1386 simple_lock(&vnode_free_list_slock); 1387 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 1388 TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist); 1389 simple_unlock(&vnode_free_list_slock); 1390 } 1391 vp->v_holdcnt++; 1392 } 1393 1394 /* 1395 * Page or buffer structure frees a reference. 1396 */ 1397 void 1398 holdrelel(vp) 1399 struct vnode *vp; 1400 { 1401 1402 if (vp->v_holdcnt <= 0) 1403 panic("holdrelel: holdcnt vp %p", vp); 1404 vp->v_holdcnt--; 1405 1406 /* 1407 * If it is on the holdlist and the hold count drops to 1408 * zero, move it to the free list. The test of the back 1409 * pointer and the use reference count of zero is because 1410 * it will be removed from a free list by getnewvnode, 1411 * but will not have its reference count incremented until 1412 * after calling vgone. If the reference count were 1413 * incremented first, vgone would (incorrectly) try to 1414 * close the previous instance of the underlying object. 1415 * So, the back pointer is explicitly set to `0xdeadb' in 1416 * getnewvnode after removing it from a freelist to ensure 1417 * that we do not try to move it here. 1418 */ 1419 1420 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) && 1421 vp->v_holdcnt == 0 && vp->v_usecount == 0) { 1422 simple_lock(&vnode_free_list_slock); 1423 TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist); 1424 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 1425 simple_unlock(&vnode_free_list_slock); 1426 } 1427 } 1428 1429 /* 1430 * Vnode reference. 1431 */ 1432 void 1433 vref(vp) 1434 struct vnode *vp; 1435 { 1436 1437 simple_lock(&vp->v_interlock); 1438 if (vp->v_usecount <= 0) 1439 panic("vref used where vget required, vp %p", vp); 1440 vp->v_usecount++; 1441 #ifdef DIAGNOSTIC 1442 if (vp->v_usecount == 0) { 1443 vprint("vref", vp); 1444 panic("vref: usecount overflow, vp %p", vp); 1445 } 1446 #endif 1447 simple_unlock(&vp->v_interlock); 1448 } 1449 #endif /* DIAGNOSTIC */ 1450 1451 /* 1452 * Remove any vnodes in the vnode table belonging to mount point mp. 1453 * 1454 * If FORCECLOSE is not specified, there should not be any active ones, 1455 * return error if any are found (nb: this is a user error, not a 1456 * system error). If FORCECLOSE is specified, detach any active vnodes 1457 * that are found. 1458 * 1459 * If WRITECLOSE is set, only flush out regular file vnodes open for 1460 * writing. 1461 * 1462 * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped. 1463 */ 1464 #ifdef DEBUG 1465 int busyprt = 0; /* print out busy vnodes */ 1466 struct ctldebug debug1 = { "busyprt", &busyprt }; 1467 #endif 1468 1469 int 1470 vflush(mp, skipvp, flags) 1471 struct mount *mp; 1472 struct vnode *skipvp; 1473 int flags; 1474 { 1475 struct proc *p = curproc; /* XXX */ 1476 struct vnode *vp, *nvp; 1477 int busy = 0; 1478 1479 simple_lock(&mntvnode_slock); 1480 loop: 1481 for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) { 1482 if (vp->v_mount != mp) 1483 goto loop; 1484 nvp = LIST_NEXT(vp, v_mntvnodes); 1485 /* 1486 * Skip over a selected vnode. 1487 */ 1488 if (vp == skipvp) 1489 continue; 1490 simple_lock(&vp->v_interlock); 1491 /* 1492 * Skip over a vnodes marked VSYSTEM. 1493 */ 1494 if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { 1495 simple_unlock(&vp->v_interlock); 1496 continue; 1497 } 1498 /* 1499 * If WRITECLOSE is set, only flush out regular file 1500 * vnodes open for writing. 1501 */ 1502 if ((flags & WRITECLOSE) && 1503 (vp->v_writecount == 0 || vp->v_type != VREG)) { 1504 simple_unlock(&vp->v_interlock); 1505 continue; 1506 } 1507 /* 1508 * With v_usecount == 0, all we need to do is clear 1509 * out the vnode data structures and we are done. 1510 */ 1511 if (vp->v_usecount == 0) { 1512 simple_unlock(&mntvnode_slock); 1513 vgonel(vp, p); 1514 simple_lock(&mntvnode_slock); 1515 continue; 1516 } 1517 /* 1518 * If FORCECLOSE is set, forcibly close the vnode. 1519 * For block or character devices, revert to an 1520 * anonymous device. For all other files, just kill them. 1521 */ 1522 if (flags & FORCECLOSE) { 1523 simple_unlock(&mntvnode_slock); 1524 if (vp->v_type != VBLK && vp->v_type != VCHR) { 1525 vgonel(vp, p); 1526 } else { 1527 vclean(vp, 0, p); 1528 vp->v_op = spec_vnodeop_p; 1529 insmntque(vp, (struct mount *)0); 1530 } 1531 simple_lock(&mntvnode_slock); 1532 continue; 1533 } 1534 #ifdef DEBUG 1535 if (busyprt) 1536 vprint("vflush: busy vnode", vp); 1537 #endif 1538 simple_unlock(&vp->v_interlock); 1539 busy++; 1540 } 1541 simple_unlock(&mntvnode_slock); 1542 if (busy) 1543 return (EBUSY); 1544 return (0); 1545 } 1546 1547 /* 1548 * Disassociate the underlying file system from a vnode. 1549 */ 1550 void 1551 vclean(vp, flags, p) 1552 struct vnode *vp; 1553 int flags; 1554 struct proc *p; 1555 { 1556 struct mount *mp; 1557 int active; 1558 1559 LOCK_ASSERT(simple_lock_held(&vp->v_interlock)); 1560 1561 /* 1562 * Check to see if the vnode is in use. 1563 * If so we have to reference it before we clean it out 1564 * so that its count cannot fall to zero and generate a 1565 * race against ourselves to recycle it. 1566 */ 1567 1568 if ((active = vp->v_usecount) != 0) { 1569 vp->v_usecount++; 1570 #ifdef DIAGNOSTIC 1571 if (vp->v_usecount == 0) { 1572 vprint("vclean", vp); 1573 panic("vclean: usecount overflow"); 1574 } 1575 #endif 1576 } 1577 1578 /* 1579 * Prevent the vnode from being recycled or 1580 * brought into use while we clean it out. 1581 */ 1582 if (vp->v_flag & VXLOCK) 1583 panic("vclean: deadlock, vp %p", vp); 1584 vp->v_flag |= VXLOCK; 1585 if (vp->v_flag & VEXECMAP) { 1586 uvmexp.execpages -= vp->v_uobj.uo_npages; 1587 uvmexp.filepages += vp->v_uobj.uo_npages; 1588 } 1589 vp->v_flag &= ~(VTEXT|VEXECMAP); 1590 1591 /* 1592 * Even if the count is zero, the VOP_INACTIVE routine may still 1593 * have the object locked while it cleans it out. The VOP_LOCK 1594 * ensures that the VOP_INACTIVE routine is done with its work. 1595 * For active vnodes, it ensures that no other activity can 1596 * occur while the underlying object is being cleaned out. 1597 */ 1598 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK); 1599 1600 /* 1601 * Clean out any cached data associated with the vnode. 1602 */ 1603 if (flags & DOCLOSE) { 1604 int error; 1605 vn_start_write(vp, &mp, V_WAIT | V_LOWER); 1606 error = vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0); 1607 vn_finished_write(mp, V_LOWER); 1608 if (error) 1609 error = vinvalbuf(vp, 0, NOCRED, p, 0, 0); 1610 KASSERT(error == 0); 1611 KASSERT((vp->v_flag & VONWORKLST) == 0); 1612 } 1613 LOCK_ASSERT(!simple_lock_held(&vp->v_interlock)); 1614 1615 /* 1616 * If purging an active vnode, it must be closed and 1617 * deactivated before being reclaimed. Note that the 1618 * VOP_INACTIVE will unlock the vnode. 1619 */ 1620 if (active) { 1621 if (flags & DOCLOSE) 1622 VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL); 1623 VOP_INACTIVE(vp, p); 1624 } else { 1625 /* 1626 * Any other processes trying to obtain this lock must first 1627 * wait for VXLOCK to clear, then call the new lock operation. 1628 */ 1629 VOP_UNLOCK(vp, 0); 1630 } 1631 /* 1632 * Reclaim the vnode. 1633 */ 1634 if (VOP_RECLAIM(vp, p)) 1635 panic("vclean: cannot reclaim, vp %p", vp); 1636 if (active) { 1637 /* 1638 * Inline copy of vrele() since VOP_INACTIVE 1639 * has already been called. 1640 */ 1641 simple_lock(&vp->v_interlock); 1642 if (--vp->v_usecount <= 0) { 1643 #ifdef DIAGNOSTIC 1644 if (vp->v_usecount < 0 || vp->v_writecount != 0) { 1645 vprint("vclean: bad ref count", vp); 1646 panic("vclean: ref cnt"); 1647 } 1648 #endif 1649 /* 1650 * Insert at tail of LRU list. 1651 */ 1652 1653 simple_unlock(&vp->v_interlock); 1654 simple_lock(&vnode_free_list_slock); 1655 #ifdef DIAGNOSTIC 1656 if (vp->v_holdcnt > 0) 1657 panic("vclean: not clean, vp %p", vp); 1658 #endif 1659 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 1660 simple_unlock(&vnode_free_list_slock); 1661 } else 1662 simple_unlock(&vp->v_interlock); 1663 } 1664 1665 KASSERT(vp->v_uobj.uo_npages == 0); 1666 cache_purge(vp); 1667 1668 /* 1669 * Done with purge, notify sleepers of the grim news. 1670 */ 1671 vp->v_op = dead_vnodeop_p; 1672 vp->v_tag = VT_NON; 1673 simple_lock(&vp->v_interlock); 1674 VN_KNOTE(vp, NOTE_REVOKE); /* FreeBSD has this in vn_pollgone() */ 1675 vp->v_flag &= ~VXLOCK; 1676 if (vp->v_flag & VXWANT) { 1677 vp->v_flag &= ~VXWANT; 1678 simple_unlock(&vp->v_interlock); 1679 wakeup((caddr_t)vp); 1680 } else 1681 simple_unlock(&vp->v_interlock); 1682 } 1683 1684 /* 1685 * Recycle an unused vnode to the front of the free list. 1686 * Release the passed interlock if the vnode will be recycled. 1687 */ 1688 int 1689 vrecycle(vp, inter_lkp, p) 1690 struct vnode *vp; 1691 struct simplelock *inter_lkp; 1692 struct proc *p; 1693 { 1694 1695 simple_lock(&vp->v_interlock); 1696 if (vp->v_usecount == 0) { 1697 if (inter_lkp) 1698 simple_unlock(inter_lkp); 1699 vgonel(vp, p); 1700 return (1); 1701 } 1702 simple_unlock(&vp->v_interlock); 1703 return (0); 1704 } 1705 1706 /* 1707 * Eliminate all activity associated with a vnode 1708 * in preparation for reuse. 1709 */ 1710 void 1711 vgone(vp) 1712 struct vnode *vp; 1713 { 1714 struct proc *p = curproc; /* XXX */ 1715 1716 simple_lock(&vp->v_interlock); 1717 vgonel(vp, p); 1718 } 1719 1720 /* 1721 * vgone, with the vp interlock held. 1722 */ 1723 void 1724 vgonel(vp, p) 1725 struct vnode *vp; 1726 struct proc *p; 1727 { 1728 struct vnode *vq; 1729 struct vnode *vx; 1730 1731 LOCK_ASSERT(simple_lock_held(&vp->v_interlock)); 1732 1733 /* 1734 * If a vgone (or vclean) is already in progress, 1735 * wait until it is done and return. 1736 */ 1737 1738 if (vp->v_flag & VXLOCK) { 1739 vp->v_flag |= VXWANT; 1740 ltsleep(vp, PINOD | PNORELOCK, "vgone", 0, &vp->v_interlock); 1741 return; 1742 } 1743 1744 /* 1745 * Clean out the filesystem specific data. 1746 */ 1747 1748 vclean(vp, DOCLOSE, p); 1749 KASSERT((vp->v_flag & VONWORKLST) == 0); 1750 1751 /* 1752 * Delete from old mount point vnode list, if on one. 1753 */ 1754 1755 if (vp->v_mount != NULL) 1756 insmntque(vp, (struct mount *)0); 1757 1758 /* 1759 * If special device, remove it from special device alias list. 1760 * if it is on one. 1761 */ 1762 1763 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) { 1764 simple_lock(&spechash_slock); 1765 if (vp->v_hashchain != NULL) { 1766 if (*vp->v_hashchain == vp) { 1767 *vp->v_hashchain = vp->v_specnext; 1768 } else { 1769 for (vq = *vp->v_hashchain; vq; 1770 vq = vq->v_specnext) { 1771 if (vq->v_specnext != vp) 1772 continue; 1773 vq->v_specnext = vp->v_specnext; 1774 break; 1775 } 1776 if (vq == NULL) 1777 panic("missing bdev"); 1778 } 1779 if (vp->v_flag & VALIASED) { 1780 vx = NULL; 1781 for (vq = *vp->v_hashchain; vq; 1782 vq = vq->v_specnext) { 1783 if (vq->v_rdev != vp->v_rdev || 1784 vq->v_type != vp->v_type) 1785 continue; 1786 if (vx) 1787 break; 1788 vx = vq; 1789 } 1790 if (vx == NULL) 1791 panic("missing alias"); 1792 if (vq == NULL) 1793 vx->v_flag &= ~VALIASED; 1794 vp->v_flag &= ~VALIASED; 1795 } 1796 } 1797 simple_unlock(&spechash_slock); 1798 FREE(vp->v_specinfo, M_VNODE); 1799 vp->v_specinfo = NULL; 1800 } 1801 1802 /* 1803 * The test of the back pointer and the reference count of 1804 * zero is because it will be removed from the free list by 1805 * getcleanvnode, but will not have its reference count 1806 * incremented until after calling vgone. If the reference 1807 * count were incremented first, vgone would (incorrectly) 1808 * try to close the previous instance of the underlying object. 1809 * So, the back pointer is explicitly set to `0xdeadb' in 1810 * getnewvnode after removing it from the freelist to ensure 1811 * that we do not try to move it here. 1812 */ 1813 1814 vp->v_type = VBAD; 1815 if (vp->v_usecount == 0) { 1816 boolean_t dofree; 1817 1818 simple_lock(&vnode_free_list_slock); 1819 if (vp->v_holdcnt > 0) 1820 panic("vgonel: not clean, vp %p", vp); 1821 /* 1822 * if it isn't on the freelist, we're called by getcleanvnode 1823 * and vnode is being re-used. otherwise, we'll free it. 1824 */ 1825 dofree = vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb; 1826 if (dofree) { 1827 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 1828 numvnodes--; 1829 } 1830 simple_unlock(&vnode_free_list_slock); 1831 if (dofree) 1832 pool_put(&vnode_pool, vp); 1833 } 1834 } 1835 1836 /* 1837 * Lookup a vnode by device number. 1838 */ 1839 int 1840 vfinddev(dev, type, vpp) 1841 dev_t dev; 1842 enum vtype type; 1843 struct vnode **vpp; 1844 { 1845 struct vnode *vp; 1846 int rc = 0; 1847 1848 simple_lock(&spechash_slock); 1849 for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) { 1850 if (dev != vp->v_rdev || type != vp->v_type) 1851 continue; 1852 *vpp = vp; 1853 rc = 1; 1854 break; 1855 } 1856 simple_unlock(&spechash_slock); 1857 return (rc); 1858 } 1859 1860 /* 1861 * Revoke all the vnodes corresponding to the specified minor number 1862 * range (endpoints inclusive) of the specified major. 1863 */ 1864 void 1865 vdevgone(maj, minl, minh, type) 1866 int maj, minl, minh; 1867 enum vtype type; 1868 { 1869 struct vnode *vp; 1870 int mn; 1871 1872 for (mn = minl; mn <= minh; mn++) 1873 if (vfinddev(makedev(maj, mn), type, &vp)) 1874 VOP_REVOKE(vp, REVOKEALL); 1875 } 1876 1877 /* 1878 * Calculate the total number of references to a special device. 1879 */ 1880 int 1881 vcount(vp) 1882 struct vnode *vp; 1883 { 1884 struct vnode *vq, *vnext; 1885 int count; 1886 1887 loop: 1888 if ((vp->v_flag & VALIASED) == 0) 1889 return (vp->v_usecount); 1890 simple_lock(&spechash_slock); 1891 for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) { 1892 vnext = vq->v_specnext; 1893 if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) 1894 continue; 1895 /* 1896 * Alias, but not in use, so flush it out. 1897 */ 1898 if (vq->v_usecount == 0 && vq != vp && 1899 (vq->v_flag & VXLOCK) == 0) { 1900 simple_unlock(&spechash_slock); 1901 vgone(vq); 1902 goto loop; 1903 } 1904 count += vq->v_usecount; 1905 } 1906 simple_unlock(&spechash_slock); 1907 return (count); 1908 } 1909 1910 /* 1911 * Print out a description of a vnode. 1912 */ 1913 const char * const vnode_types[] = { 1914 "VNON", 1915 "VREG", 1916 "VDIR", 1917 "VBLK", 1918 "VCHR", 1919 "VLNK", 1920 "VSOCK", 1921 "VFIFO", 1922 "VBAD" 1923 }; 1924 1925 void 1926 vprint(label, vp) 1927 char *label; 1928 struct vnode *vp; 1929 { 1930 char buf[96]; 1931 1932 if (label != NULL) 1933 printf("%s: ", label); 1934 printf("tag %d type %s, usecount %d, writecount %ld, refcount %ld,", 1935 vp->v_tag, vnode_types[vp->v_type], 1936 vp->v_usecount, vp->v_writecount, vp->v_holdcnt); 1937 buf[0] = '\0'; 1938 if (vp->v_flag & VROOT) 1939 strlcat(buf, "|VROOT", sizeof(buf)); 1940 if (vp->v_flag & VTEXT) 1941 strlcat(buf, "|VTEXT", sizeof(buf)); 1942 if (vp->v_flag & VEXECMAP) 1943 strlcat(buf, "|VEXECMAP", sizeof(buf)); 1944 if (vp->v_flag & VSYSTEM) 1945 strlcat(buf, "|VSYSTEM", sizeof(buf)); 1946 if (vp->v_flag & VXLOCK) 1947 strlcat(buf, "|VXLOCK", sizeof(buf)); 1948 if (vp->v_flag & VXWANT) 1949 strlcat(buf, "|VXWANT", sizeof(buf)); 1950 if (vp->v_flag & VBWAIT) 1951 strlcat(buf, "|VBWAIT", sizeof(buf)); 1952 if (vp->v_flag & VALIASED) 1953 strlcat(buf, "|VALIASED", sizeof(buf)); 1954 if (buf[0] != '\0') 1955 printf(" flags (%s)", &buf[1]); 1956 if (vp->v_data == NULL) { 1957 printf("\n"); 1958 } else { 1959 printf("\n\t"); 1960 VOP_PRINT(vp); 1961 } 1962 } 1963 1964 #ifdef DEBUG 1965 /* 1966 * List all of the locked vnodes in the system. 1967 * Called when debugging the kernel. 1968 */ 1969 void 1970 printlockedvnodes() 1971 { 1972 struct mount *mp, *nmp; 1973 struct vnode *vp; 1974 1975 printf("Locked vnodes\n"); 1976 simple_lock(&mountlist_slock); 1977 for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; 1978 mp = nmp) { 1979 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) { 1980 nmp = CIRCLEQ_NEXT(mp, mnt_list); 1981 continue; 1982 } 1983 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 1984 if (VOP_ISLOCKED(vp)) 1985 vprint(NULL, vp); 1986 } 1987 simple_lock(&mountlist_slock); 1988 nmp = CIRCLEQ_NEXT(mp, mnt_list); 1989 vfs_unbusy(mp); 1990 } 1991 simple_unlock(&mountlist_slock); 1992 } 1993 #endif 1994 1995 /* 1996 * sysctl helper routine for vfs.generic.conf lookups. 1997 */ 1998 #if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44) 1999 static int 2000 sysctl_vfs_generic_conf(SYSCTLFN_ARGS) 2001 { 2002 struct vfsconf vfc; 2003 extern const char * const mountcompatnames[]; 2004 extern int nmountcompatnames; 2005 struct sysctlnode node; 2006 struct vfsops *vfsp; 2007 u_int vfsnum; 2008 2009 if (namelen != 1) 2010 return (ENOTDIR); 2011 vfsnum = name[0]; 2012 if (vfsnum >= nmountcompatnames || 2013 mountcompatnames[vfsnum] == NULL) 2014 return (EOPNOTSUPP); 2015 vfsp = vfs_getopsbyname(mountcompatnames[vfsnum]); 2016 if (vfsp == NULL) 2017 return (EOPNOTSUPP); 2018 2019 vfc.vfc_vfsops = vfsp; 2020 strncpy(vfc.vfc_name, vfsp->vfs_name, MFSNAMELEN); 2021 vfc.vfc_typenum = vfsnum; 2022 vfc.vfc_refcount = vfsp->vfs_refcount; 2023 vfc.vfc_flags = 0; 2024 vfc.vfc_mountroot = vfsp->vfs_mountroot; 2025 vfc.vfc_next = NULL; 2026 2027 node = *rnode; 2028 node.sysctl_data = &vfc; 2029 return (sysctl_lookup(SYSCTLFN_CALL(&node))); 2030 } 2031 #endif 2032 2033 /* 2034 * Top level filesystem related information gathering. 2035 */ 2036 SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup") 2037 { 2038 #if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44) 2039 extern int nmountcompatnames; 2040 #endif 2041 2042 sysctl_createv(SYSCTL_PERMANENT, 2043 CTLTYPE_NODE, "vfs", NULL, 2044 NULL, 0, NULL, 0, 2045 CTL_VFS, CTL_EOL); 2046 sysctl_createv(SYSCTL_PERMANENT, 2047 CTLTYPE_NODE, "generic", NULL, 2048 NULL, 0, NULL, 0, 2049 CTL_VFS, VFS_GENERIC, CTL_EOL); 2050 2051 #if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44) 2052 sysctl_createv(SYSCTL_PERMANENT|SYSCTL_IMMEDIATE, 2053 CTLTYPE_INT, "maxtypenum", NULL, 2054 NULL, nmountcompatnames, NULL, 0, 2055 CTL_VFS, VFS_GENERIC, VFS_MAXTYPENUM, CTL_EOL); 2056 #endif 2057 sysctl_createv(SYSCTL_PERMANENT|SYSCTL_READWRITE, 2058 CTLTYPE_INT, "usermount", NULL, 2059 NULL, 0, &dovfsusermount, 0, 2060 CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL); 2061 #if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44) 2062 sysctl_createv(SYSCTL_PERMANENT, 2063 CTLTYPE_STRUCT, "conf", NULL, 2064 sysctl_vfs_generic_conf, 0, NULL, 2065 sizeof(struct vfsconf), 2066 CTL_VFS, VFS_GENERIC, VFS_CONF, CTL_EOL); 2067 #endif 2068 } 2069 2070 2071 int kinfo_vdebug = 1; 2072 int kinfo_vgetfailed; 2073 #define KINFO_VNODESLOP 10 2074 /* 2075 * Dump vnode list (via sysctl). 2076 * Copyout address of vnode followed by vnode. 2077 */ 2078 /* ARGSUSED */ 2079 int 2080 sysctl_kern_vnode(SYSCTLFN_ARGS) 2081 { 2082 char *where = oldp; 2083 size_t *sizep = oldlenp; 2084 struct mount *mp, *nmp; 2085 struct vnode *nvp, *vp; 2086 char *bp = where, *savebp; 2087 char *ewhere; 2088 int error; 2089 2090 if (namelen != 0) 2091 return (EOPNOTSUPP); 2092 if (newp != NULL) 2093 return (EPERM); 2094 2095 #define VPTRSZ sizeof(struct vnode *) 2096 #define VNODESZ sizeof(struct vnode) 2097 if (where == NULL) { 2098 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ); 2099 return (0); 2100 } 2101 ewhere = where + *sizep; 2102 2103 simple_lock(&mountlist_slock); 2104 for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; 2105 mp = nmp) { 2106 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) { 2107 nmp = CIRCLEQ_NEXT(mp, mnt_list); 2108 continue; 2109 } 2110 savebp = bp; 2111 again: 2112 simple_lock(&mntvnode_slock); 2113 for (vp = LIST_FIRST(&mp->mnt_vnodelist); 2114 vp != NULL; 2115 vp = nvp) { 2116 /* 2117 * Check that the vp is still associated with 2118 * this filesystem. RACE: could have been 2119 * recycled onto the same filesystem. 2120 */ 2121 if (vp->v_mount != mp) { 2122 simple_unlock(&mntvnode_slock); 2123 if (kinfo_vdebug) 2124 printf("kinfo: vp changed\n"); 2125 bp = savebp; 2126 goto again; 2127 } 2128 nvp = LIST_NEXT(vp, v_mntvnodes); 2129 if (bp + VPTRSZ + VNODESZ > ewhere) { 2130 simple_unlock(&mntvnode_slock); 2131 *sizep = bp - where; 2132 return (ENOMEM); 2133 } 2134 simple_unlock(&mntvnode_slock); 2135 if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) || 2136 (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ))) 2137 return (error); 2138 bp += VPTRSZ + VNODESZ; 2139 simple_lock(&mntvnode_slock); 2140 } 2141 simple_unlock(&mntvnode_slock); 2142 simple_lock(&mountlist_slock); 2143 nmp = CIRCLEQ_NEXT(mp, mnt_list); 2144 vfs_unbusy(mp); 2145 } 2146 simple_unlock(&mountlist_slock); 2147 2148 *sizep = bp - where; 2149 return (0); 2150 } 2151 2152 /* 2153 * Check to see if a filesystem is mounted on a block device. 2154 */ 2155 int 2156 vfs_mountedon(vp) 2157 struct vnode *vp; 2158 { 2159 struct vnode *vq; 2160 int error = 0; 2161 2162 if (vp->v_specmountpoint != NULL) 2163 return (EBUSY); 2164 if (vp->v_flag & VALIASED) { 2165 simple_lock(&spechash_slock); 2166 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { 2167 if (vq->v_rdev != vp->v_rdev || 2168 vq->v_type != vp->v_type) 2169 continue; 2170 if (vq->v_specmountpoint != NULL) { 2171 error = EBUSY; 2172 break; 2173 } 2174 } 2175 simple_unlock(&spechash_slock); 2176 } 2177 return (error); 2178 } 2179 2180 static int 2181 sacheck(struct sockaddr *sa) 2182 { 2183 switch (sa->sa_family) { 2184 #ifdef INET 2185 case AF_INET: { 2186 struct sockaddr_in *sin = (struct sockaddr_in *)sa; 2187 char *p = (char *)sin->sin_zero; 2188 size_t i; 2189 2190 if (sin->sin_len != sizeof(*sin)) 2191 return -1; 2192 if (sin->sin_port != 0) 2193 return -1; 2194 for (i = 0; i < sizeof(sin->sin_zero); i++) 2195 if (*p++ != '\0') 2196 return -1; 2197 return 0; 2198 } 2199 #endif 2200 #ifdef INET6 2201 case AF_INET6: { 2202 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa; 2203 2204 if (sin6->sin6_len != sizeof(*sin6)) 2205 return -1; 2206 if (sin6->sin6_port != 0) 2207 return -1; 2208 return 0; 2209 } 2210 #endif 2211 default: 2212 return -1; 2213 } 2214 } 2215 2216 /* 2217 * Build hash lists of net addresses and hang them off the mount point. 2218 * Called by ufs_mount() to set up the lists of export addresses. 2219 */ 2220 static int 2221 vfs_hang_addrlist(mp, nep, argp) 2222 struct mount *mp; 2223 struct netexport *nep; 2224 struct export_args *argp; 2225 { 2226 struct netcred *np, *enp; 2227 struct radix_node_head *rnh; 2228 int i; 2229 struct sockaddr *saddr, *smask = 0; 2230 struct domain *dom; 2231 int error; 2232 2233 if (argp->ex_addrlen == 0) { 2234 if (mp->mnt_flag & MNT_DEFEXPORTED) 2235 return (EPERM); 2236 np = &nep->ne_defexported; 2237 np->netc_exflags = argp->ex_flags; 2238 crcvt(&np->netc_anon, &argp->ex_anon); 2239 np->netc_anon.cr_ref = 1; 2240 mp->mnt_flag |= MNT_DEFEXPORTED; 2241 return (0); 2242 } 2243 2244 if (argp->ex_addrlen > MLEN || argp->ex_masklen > MLEN) 2245 return (EINVAL); 2246 2247 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; 2248 np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK); 2249 memset((caddr_t)np, 0, i); 2250 saddr = (struct sockaddr *)(np + 1); 2251 error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen); 2252 if (error) 2253 goto out; 2254 if (saddr->sa_len > argp->ex_addrlen) 2255 saddr->sa_len = argp->ex_addrlen; 2256 if (sacheck(saddr) == -1) 2257 return EINVAL; 2258 if (argp->ex_masklen) { 2259 smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen); 2260 error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen); 2261 if (error) 2262 goto out; 2263 if (smask->sa_len > argp->ex_masklen) 2264 smask->sa_len = argp->ex_masklen; 2265 if (smask->sa_family != saddr->sa_family) 2266 return EINVAL; 2267 if (sacheck(smask) == -1) 2268 return EINVAL; 2269 } 2270 i = saddr->sa_family; 2271 if ((rnh = nep->ne_rtable[i]) == 0) { 2272 /* 2273 * Seems silly to initialize every AF when most are not 2274 * used, do so on demand here 2275 */ 2276 for (dom = domains; dom; dom = dom->dom_next) 2277 if (dom->dom_family == i && dom->dom_rtattach) { 2278 dom->dom_rtattach((void **)&nep->ne_rtable[i], 2279 dom->dom_rtoffset); 2280 break; 2281 } 2282 if ((rnh = nep->ne_rtable[i]) == 0) { 2283 error = ENOBUFS; 2284 goto out; 2285 } 2286 } 2287 2288 enp = (struct netcred *)(*rnh->rnh_addaddr)(saddr, smask, rnh, 2289 np->netc_rnodes); 2290 if (enp != np) { 2291 if (enp == NULL) { 2292 enp = (struct netcred *)(*rnh->rnh_lookup)(saddr, 2293 smask, rnh); 2294 if (enp == NULL) { 2295 error = EPERM; 2296 goto out; 2297 } 2298 } else 2299 enp->netc_refcnt++; 2300 2301 goto check; 2302 } else 2303 enp->netc_refcnt = 1; 2304 2305 np->netc_exflags = argp->ex_flags; 2306 crcvt(&np->netc_anon, &argp->ex_anon); 2307 np->netc_anon.cr_ref = 1; 2308 return 0; 2309 check: 2310 if (enp->netc_exflags != argp->ex_flags || 2311 crcmp(&enp->netc_anon, &argp->ex_anon) != 0) 2312 error = EPERM; 2313 else 2314 error = 0; 2315 out: 2316 free(np, M_NETADDR); 2317 return error; 2318 } 2319 2320 /* ARGSUSED */ 2321 static int 2322 vfs_free_netcred(rn, w) 2323 struct radix_node *rn; 2324 void *w; 2325 { 2326 struct radix_node_head *rnh = (struct radix_node_head *)w; 2327 struct netcred *np = (struct netcred *)(void *)rn; 2328 2329 (*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh); 2330 if (--(np->netc_refcnt) <= 0) 2331 free(np, M_NETADDR); 2332 return (0); 2333 } 2334 2335 /* 2336 * Free the net address hash lists that are hanging off the mount points. 2337 */ 2338 static void 2339 vfs_free_addrlist(nep) 2340 struct netexport *nep; 2341 { 2342 int i; 2343 struct radix_node_head *rnh; 2344 2345 for (i = 0; i <= AF_MAX; i++) 2346 if ((rnh = nep->ne_rtable[i]) != NULL) { 2347 (*rnh->rnh_walktree)(rnh, vfs_free_netcred, rnh); 2348 free((caddr_t)rnh, M_RTABLE); 2349 nep->ne_rtable[i] = 0; 2350 } 2351 } 2352 2353 int 2354 vfs_export(mp, nep, argp) 2355 struct mount *mp; 2356 struct netexport *nep; 2357 struct export_args *argp; 2358 { 2359 int error; 2360 2361 if (argp->ex_flags & MNT_DELEXPORT) { 2362 if (mp->mnt_flag & MNT_EXPUBLIC) { 2363 vfs_setpublicfs(NULL, NULL, NULL); 2364 mp->mnt_flag &= ~MNT_EXPUBLIC; 2365 } 2366 vfs_free_addrlist(nep); 2367 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); 2368 } 2369 if (argp->ex_flags & MNT_EXPORTED) { 2370 if (argp->ex_flags & MNT_EXPUBLIC) { 2371 if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) 2372 return (error); 2373 mp->mnt_flag |= MNT_EXPUBLIC; 2374 } 2375 if ((error = vfs_hang_addrlist(mp, nep, argp)) != 0) 2376 return (error); 2377 mp->mnt_flag |= MNT_EXPORTED; 2378 } 2379 return (0); 2380 } 2381 2382 /* 2383 * Set the publicly exported filesystem (WebNFS). Currently, only 2384 * one public filesystem is possible in the spec (RFC 2054 and 2055) 2385 */ 2386 int 2387 vfs_setpublicfs(mp, nep, argp) 2388 struct mount *mp; 2389 struct netexport *nep; 2390 struct export_args *argp; 2391 { 2392 int error; 2393 struct vnode *rvp; 2394 char *cp; 2395 2396 /* 2397 * mp == NULL -> invalidate the current info, the FS is 2398 * no longer exported. May be called from either vfs_export 2399 * or unmount, so check if it hasn't already been done. 2400 */ 2401 if (mp == NULL) { 2402 if (nfs_pub.np_valid) { 2403 nfs_pub.np_valid = 0; 2404 if (nfs_pub.np_index != NULL) { 2405 FREE(nfs_pub.np_index, M_TEMP); 2406 nfs_pub.np_index = NULL; 2407 } 2408 } 2409 return (0); 2410 } 2411 2412 /* 2413 * Only one allowed at a time. 2414 */ 2415 if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) 2416 return (EBUSY); 2417 2418 /* 2419 * Get real filehandle for root of exported FS. 2420 */ 2421 memset((caddr_t)&nfs_pub.np_handle, 0, sizeof(nfs_pub.np_handle)); 2422 nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; 2423 2424 if ((error = VFS_ROOT(mp, &rvp))) 2425 return (error); 2426 2427 if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) 2428 return (error); 2429 2430 vput(rvp); 2431 2432 /* 2433 * If an indexfile was specified, pull it in. 2434 */ 2435 if (argp->ex_indexfile != NULL) { 2436 MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, 2437 M_WAITOK); 2438 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, 2439 MAXNAMLEN, (size_t *)0); 2440 if (!error) { 2441 /* 2442 * Check for illegal filenames. 2443 */ 2444 for (cp = nfs_pub.np_index; *cp; cp++) { 2445 if (*cp == '/') { 2446 error = EINVAL; 2447 break; 2448 } 2449 } 2450 } 2451 if (error) { 2452 FREE(nfs_pub.np_index, M_TEMP); 2453 return (error); 2454 } 2455 } 2456 2457 nfs_pub.np_mount = mp; 2458 nfs_pub.np_valid = 1; 2459 return (0); 2460 } 2461 2462 struct netcred * 2463 vfs_export_lookup(mp, nep, nam) 2464 struct mount *mp; 2465 struct netexport *nep; 2466 struct mbuf *nam; 2467 { 2468 struct netcred *np; 2469 struct radix_node_head *rnh; 2470 struct sockaddr *saddr; 2471 2472 np = NULL; 2473 if (mp->mnt_flag & MNT_EXPORTED) { 2474 /* 2475 * Lookup in the export list first. 2476 */ 2477 if (nam != NULL) { 2478 saddr = mtod(nam, struct sockaddr *); 2479 rnh = nep->ne_rtable[saddr->sa_family]; 2480 if (rnh != NULL) { 2481 np = (struct netcred *) 2482 (*rnh->rnh_matchaddr)((caddr_t)saddr, 2483 rnh); 2484 if (np && np->netc_rnodes->rn_flags & RNF_ROOT) 2485 np = NULL; 2486 } 2487 } 2488 /* 2489 * If no address match, use the default if it exists. 2490 */ 2491 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) 2492 np = &nep->ne_defexported; 2493 } 2494 return (np); 2495 } 2496 2497 /* 2498 * Do the usual access checking. 2499 * file_mode, uid and gid are from the vnode in question, 2500 * while acc_mode and cred are from the VOP_ACCESS parameter list 2501 */ 2502 int 2503 vaccess(type, file_mode, uid, gid, acc_mode, cred) 2504 enum vtype type; 2505 mode_t file_mode; 2506 uid_t uid; 2507 gid_t gid; 2508 mode_t acc_mode; 2509 struct ucred *cred; 2510 { 2511 mode_t mask; 2512 2513 /* 2514 * Super-user always gets read/write access, but execute access depends 2515 * on at least one execute bit being set. 2516 */ 2517 if (cred->cr_uid == 0) { 2518 if ((acc_mode & VEXEC) && type != VDIR && 2519 (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0) 2520 return (EACCES); 2521 return (0); 2522 } 2523 2524 mask = 0; 2525 2526 /* Otherwise, check the owner. */ 2527 if (cred->cr_uid == uid) { 2528 if (acc_mode & VEXEC) 2529 mask |= S_IXUSR; 2530 if (acc_mode & VREAD) 2531 mask |= S_IRUSR; 2532 if (acc_mode & VWRITE) 2533 mask |= S_IWUSR; 2534 return ((file_mode & mask) == mask ? 0 : EACCES); 2535 } 2536 2537 /* Otherwise, check the groups. */ 2538 if (cred->cr_gid == gid || groupmember(gid, cred)) { 2539 if (acc_mode & VEXEC) 2540 mask |= S_IXGRP; 2541 if (acc_mode & VREAD) 2542 mask |= S_IRGRP; 2543 if (acc_mode & VWRITE) 2544 mask |= S_IWGRP; 2545 return ((file_mode & mask) == mask ? 0 : EACCES); 2546 } 2547 2548 /* Otherwise, check everyone else. */ 2549 if (acc_mode & VEXEC) 2550 mask |= S_IXOTH; 2551 if (acc_mode & VREAD) 2552 mask |= S_IROTH; 2553 if (acc_mode & VWRITE) 2554 mask |= S_IWOTH; 2555 return ((file_mode & mask) == mask ? 0 : EACCES); 2556 } 2557 2558 /* 2559 * Unmount all file systems. 2560 * We traverse the list in reverse order under the assumption that doing so 2561 * will avoid needing to worry about dependencies. 2562 */ 2563 void 2564 vfs_unmountall(p) 2565 struct proc *p; 2566 { 2567 struct mount *mp, *nmp; 2568 int allerror, error; 2569 2570 for (allerror = 0, 2571 mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { 2572 nmp = mp->mnt_list.cqe_prev; 2573 #ifdef DEBUG 2574 printf("unmounting %s (%s)...\n", 2575 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname); 2576 #endif 2577 /* 2578 * XXX Freeze syncer. Must do this before locking the 2579 * mount point. See dounmount() for details. 2580 */ 2581 lockmgr(&syncer_lock, LK_EXCLUSIVE, NULL); 2582 if (vfs_busy(mp, 0, 0)) { 2583 lockmgr(&syncer_lock, LK_RELEASE, NULL); 2584 continue; 2585 } 2586 if ((error = dounmount(mp, MNT_FORCE, p)) != 0) { 2587 printf("unmount of %s failed with error %d\n", 2588 mp->mnt_stat.f_mntonname, error); 2589 allerror = 1; 2590 } 2591 } 2592 if (allerror) 2593 printf("WARNING: some file systems would not unmount\n"); 2594 } 2595 2596 extern struct simplelock bqueue_slock; /* XXX */ 2597 2598 /* 2599 * Sync and unmount file systems before shutting down. 2600 */ 2601 void 2602 vfs_shutdown() 2603 { 2604 struct lwp *l = curlwp; 2605 struct proc *p; 2606 2607 /* XXX we're certainly not running in proc0's context! */ 2608 if (l == NULL || (p = l->l_proc) == NULL) 2609 p = &proc0; 2610 2611 printf("syncing disks... "); 2612 2613 /* remove user process from run queue */ 2614 suspendsched(); 2615 (void) spl0(); 2616 2617 /* avoid coming back this way again if we panic. */ 2618 doing_shutdown = 1; 2619 2620 sys_sync(l, NULL, NULL); 2621 2622 /* Wait for sync to finish. */ 2623 if (buf_syncwait() != 0) { 2624 #if defined(DDB) && defined(DEBUG_HALT_BUSY) 2625 Debugger(); 2626 #endif 2627 printf("giving up\n"); 2628 return; 2629 } else 2630 printf("done\n"); 2631 2632 /* 2633 * If we've panic'd, don't make the situation potentially 2634 * worse by unmounting the file systems. 2635 */ 2636 if (panicstr != NULL) 2637 return; 2638 2639 /* Release inodes held by texts before update. */ 2640 #ifdef notdef 2641 vnshutdown(); 2642 #endif 2643 /* Unmount file systems. */ 2644 vfs_unmountall(p); 2645 } 2646 2647 /* 2648 * Mount the root file system. If the operator didn't specify a 2649 * file system to use, try all possible file systems until one 2650 * succeeds. 2651 */ 2652 int 2653 vfs_mountroot() 2654 { 2655 struct vfsops *v; 2656 2657 if (root_device == NULL) 2658 panic("vfs_mountroot: root device unknown"); 2659 2660 switch (root_device->dv_class) { 2661 case DV_IFNET: 2662 if (rootdev != NODEV) 2663 panic("vfs_mountroot: rootdev set for DV_IFNET " 2664 "(0x%08x -> %d,%d)", rootdev, 2665 major(rootdev), minor(rootdev)); 2666 break; 2667 2668 case DV_DISK: 2669 if (rootdev == NODEV) 2670 panic("vfs_mountroot: rootdev not set for DV_DISK"); 2671 break; 2672 2673 default: 2674 printf("%s: inappropriate for root file system\n", 2675 root_device->dv_xname); 2676 return (ENODEV); 2677 } 2678 2679 /* 2680 * If user specified a file system, use it. 2681 */ 2682 if (mountroot != NULL) 2683 return ((*mountroot)()); 2684 2685 /* 2686 * Try each file system currently configured into the kernel. 2687 */ 2688 for (v = LIST_FIRST(&vfs_list); v != NULL; v = LIST_NEXT(v, vfs_list)) { 2689 if (v->vfs_mountroot == NULL) 2690 continue; 2691 #ifdef DEBUG 2692 aprint_normal("mountroot: trying %s...\n", v->vfs_name); 2693 #endif 2694 if ((*v->vfs_mountroot)() == 0) { 2695 aprint_normal("root file system type: %s\n", 2696 v->vfs_name); 2697 break; 2698 } 2699 } 2700 2701 if (v == NULL) { 2702 printf("no file system for %s", root_device->dv_xname); 2703 if (root_device->dv_class == DV_DISK) 2704 printf(" (dev 0x%x)", rootdev); 2705 printf("\n"); 2706 return (EFTYPE); 2707 } 2708 return (0); 2709 } 2710 2711 /* 2712 * Given a file system name, look up the vfsops for that 2713 * file system, or return NULL if file system isn't present 2714 * in the kernel. 2715 */ 2716 struct vfsops * 2717 vfs_getopsbyname(name) 2718 const char *name; 2719 { 2720 struct vfsops *v; 2721 2722 for (v = LIST_FIRST(&vfs_list); v != NULL; v = LIST_NEXT(v, vfs_list)) { 2723 if (strcmp(v->vfs_name, name) == 0) 2724 break; 2725 } 2726 2727 return (v); 2728 } 2729 2730 /* 2731 * Establish a file system and initialize it. 2732 */ 2733 int 2734 vfs_attach(vfs) 2735 struct vfsops *vfs; 2736 { 2737 struct vfsops *v; 2738 int error = 0; 2739 2740 2741 /* 2742 * Make sure this file system doesn't already exist. 2743 */ 2744 LIST_FOREACH(v, &vfs_list, vfs_list) { 2745 if (strcmp(vfs->vfs_name, v->vfs_name) == 0) { 2746 error = EEXIST; 2747 goto out; 2748 } 2749 } 2750 2751 /* 2752 * Initialize the vnode operations for this file system. 2753 */ 2754 vfs_opv_init(vfs->vfs_opv_descs); 2755 2756 /* 2757 * Now initialize the file system itself. 2758 */ 2759 (*vfs->vfs_init)(); 2760 2761 /* 2762 * ...and link it into the kernel's list. 2763 */ 2764 LIST_INSERT_HEAD(&vfs_list, vfs, vfs_list); 2765 2766 /* 2767 * Sanity: make sure the reference count is 0. 2768 */ 2769 vfs->vfs_refcount = 0; 2770 2771 out: 2772 return (error); 2773 } 2774 2775 /* 2776 * Remove a file system from the kernel. 2777 */ 2778 int 2779 vfs_detach(vfs) 2780 struct vfsops *vfs; 2781 { 2782 struct vfsops *v; 2783 2784 /* 2785 * Make sure no one is using the filesystem. 2786 */ 2787 if (vfs->vfs_refcount != 0) 2788 return (EBUSY); 2789 2790 /* 2791 * ...and remove it from the kernel's list. 2792 */ 2793 LIST_FOREACH(v, &vfs_list, vfs_list) { 2794 if (v == vfs) { 2795 LIST_REMOVE(v, vfs_list); 2796 break; 2797 } 2798 } 2799 2800 if (v == NULL) 2801 return (ESRCH); 2802 2803 /* 2804 * Now run the file system-specific cleanups. 2805 */ 2806 (*vfs->vfs_done)(); 2807 2808 /* 2809 * Free the vnode operations vector. 2810 */ 2811 vfs_opv_free(vfs->vfs_opv_descs); 2812 return (0); 2813 } 2814 2815 void 2816 vfs_reinit(void) 2817 { 2818 struct vfsops *vfs; 2819 2820 LIST_FOREACH(vfs, &vfs_list, vfs_list) { 2821 if (vfs->vfs_reinit) { 2822 (*vfs->vfs_reinit)(); 2823 } 2824 } 2825 } 2826 2827 /* 2828 * Request a filesystem to suspend write operations. 2829 */ 2830 int 2831 vfs_write_suspend(struct mount *mp, int slpflag, int slptimeo) 2832 { 2833 struct proc *p = curproc; /* XXX */ 2834 int error; 2835 2836 while ((mp->mnt_iflag & IMNT_SUSPEND)) { 2837 if (slptimeo < 0) 2838 return EWOULDBLOCK; 2839 error = tsleep(&mp->mnt_flag, slpflag, "suspwt1", slptimeo); 2840 if (error) 2841 return error; 2842 } 2843 mp->mnt_iflag |= IMNT_SUSPEND; 2844 2845 if (mp->mnt_writeopcountupper > 0) 2846 tsleep(&mp->mnt_writeopcountupper, PUSER - 1, "suspwt", 0); 2847 2848 error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p); 2849 if (error) { 2850 vfs_write_resume(mp); 2851 return error; 2852 } 2853 mp->mnt_iflag |= IMNT_SUSPENDLOW; 2854 2855 if (mp->mnt_writeopcountlower > 0) 2856 tsleep(&mp->mnt_writeopcountlower, PUSER - 1, "suspwt", 0); 2857 mp->mnt_iflag |= IMNT_SUSPENDED; 2858 2859 return 0; 2860 } 2861 2862 /* 2863 * Request a filesystem to resume write operations. 2864 */ 2865 void 2866 vfs_write_resume(struct mount *mp) 2867 { 2868 2869 if ((mp->mnt_iflag & IMNT_SUSPEND) == 0) 2870 return; 2871 mp->mnt_iflag &= ~(IMNT_SUSPEND | IMNT_SUSPENDLOW | IMNT_SUSPENDED); 2872 wakeup(&mp->mnt_flag); 2873 } 2874 2875 void 2876 copy_statfs_info(struct statfs *sbp, const struct mount *mp) 2877 { 2878 const struct statfs *mbp; 2879 2880 if (sbp == (mbp = &mp->mnt_stat)) 2881 return; 2882 2883 sbp->f_oflags = mbp->f_oflags; 2884 sbp->f_type = mbp->f_type; 2885 (void)memcpy(&sbp->f_fsid, &mbp->f_fsid, sizeof(sbp->f_fsid)); 2886 sbp->f_owner = mbp->f_owner; 2887 sbp->f_flags = mbp->f_flags; 2888 sbp->f_syncwrites = mbp->f_syncwrites; 2889 sbp->f_asyncwrites = mbp->f_asyncwrites; 2890 sbp->f_spare[0] = mbp->f_spare[0]; 2891 (void)memcpy(sbp->f_fstypename, mbp->f_fstypename, 2892 sizeof(sbp->f_fstypename)); 2893 (void)memcpy(sbp->f_mntonname, mbp->f_mntonname, 2894 sizeof(sbp->f_mntonname)); 2895 (void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname, 2896 sizeof(sbp->f_mntfromname)); 2897 } 2898 2899 int 2900 set_statfs_info(const char *onp, int ukon, const char *fromp, int ukfrom, 2901 struct mount *mp, struct proc *p) 2902 { 2903 int error; 2904 size_t size; 2905 struct statfs *sfs = &mp->mnt_stat; 2906 int (*fun)(const void *, void *, size_t, size_t *); 2907 2908 (void)strncpy(mp->mnt_stat.f_fstypename, mp->mnt_op->vfs_name, 2909 sizeof(mp->mnt_stat.f_fstypename)); 2910 2911 if (onp) { 2912 struct cwdinfo *cwdi = p->p_cwdi; 2913 fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr; 2914 if (cwdi->cwdi_rdir != NULL) { 2915 size_t len; 2916 char *bp; 2917 char *path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); 2918 2919 if (!path) /* XXX can't happen with M_WAITOK */ 2920 return ENOMEM; 2921 2922 bp = path + MAXPATHLEN; 2923 *--bp = '\0'; 2924 error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, 2925 path, MAXPATHLEN / 2, 0, p); 2926 if (error) { 2927 free(path, M_TEMP); 2928 return error; 2929 } 2930 2931 len = strlen(bp); 2932 if (len > sizeof(sfs->f_mntonname) - 1) 2933 len = sizeof(sfs->f_mntonname) - 1; 2934 (void)strncpy(sfs->f_mntonname, bp, len); 2935 free(path, M_TEMP); 2936 2937 if (len < sizeof(sfs->f_mntonname) - 1) { 2938 error = (*fun)(onp, &sfs->f_mntonname[len], 2939 sizeof(sfs->f_mntonname) - len - 1, &size); 2940 if (error) 2941 return error; 2942 size += len; 2943 } else { 2944 size = len; 2945 } 2946 } else { 2947 error = (*fun)(onp, &sfs->f_mntonname, 2948 sizeof(sfs->f_mntonname) - 1, &size); 2949 if (error) 2950 return error; 2951 } 2952 (void)memset(sfs->f_mntonname + size, 0, 2953 sizeof(sfs->f_mntonname) - size); 2954 } 2955 2956 if (fromp) { 2957 fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr; 2958 error = (*fun)(fromp, sfs->f_mntfromname, 2959 sizeof(sfs->f_mntfromname) - 1, &size); 2960 if (error) 2961 return error; 2962 (void)memset(sfs->f_mntfromname + size, 0, 2963 sizeof(sfs->f_mntfromname) - size); 2964 } 2965 return 0; 2966 } 2967 2968 #ifdef DDB 2969 const char buf_flagbits[] = 2970 "\20\1AGE\2NEEDCOMMIT\3ASYNC\4BAD\5BUSY\6SCANNED\7CALL\10DELWRI" 2971 "\11DIRTY\12DONE\13EINTR\14ERROR\15GATHERED\16INVAL\17LOCKED\20NOCACHE" 2972 "\21ORDERED\22CACHE\23PHYS\24RAW\25READ\26TAPE\30WANTED" 2973 "\32XXX\33VFLUSH"; 2974 2975 void 2976 vfs_buf_print(bp, full, pr) 2977 struct buf *bp; 2978 int full; 2979 void (*pr) __P((const char *, ...)); 2980 { 2981 char buf[1024]; 2982 2983 (*pr)(" vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" dev 0x%x\n", 2984 bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_dev); 2985 2986 bitmask_snprintf(bp->b_flags, buf_flagbits, buf, sizeof(buf)); 2987 (*pr)(" error %d flags 0x%s\n", bp->b_error, buf); 2988 2989 (*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n", 2990 bp->b_bufsize, bp->b_bcount, bp->b_resid); 2991 (*pr)(" data %p saveaddr %p dep %p\n", 2992 bp->b_data, bp->b_saveaddr, LIST_FIRST(&bp->b_dep)); 2993 (*pr)(" iodone %p\n", bp->b_iodone); 2994 } 2995 2996 2997 const char vnode_flagbits[] = 2998 "\20\1ROOT\2TEXT\3SYSTEM\4ISTTY\5EXECMAP" 2999 "\11XLOCK\12XWANT\13BWAIT\14ALIASED" 3000 "\15DIROP\16LAYER\17ONWORKLIST\20DIRTY"; 3001 3002 const char * const vnode_tags[] = { 3003 "VT_NON", 3004 "VT_UFS", 3005 "VT_NFS", 3006 "VT_MFS", 3007 "VT_MSDOSFS", 3008 "VT_LFS", 3009 "VT_LOFS", 3010 "VT_FDESC", 3011 "VT_PORTAL", 3012 "VT_NULL", 3013 "VT_UMAP", 3014 "VT_KERNFS", 3015 "VT_PROCFS", 3016 "VT_AFS", 3017 "VT_ISOFS", 3018 "VT_UNION", 3019 "VT_ADOSFS", 3020 "VT_EXT2FS", 3021 "VT_CODA", 3022 "VT_FILECORE", 3023 "VT_NTFS", 3024 "VT_VFS", 3025 "VT_OVERLAY", 3026 "VT_SMBFS" 3027 }; 3028 3029 void 3030 vfs_vnode_print(vp, full, pr) 3031 struct vnode *vp; 3032 int full; 3033 void (*pr) __P((const char *, ...)); 3034 { 3035 char buf[256]; 3036 const char *vtype, *vtag; 3037 3038 uvm_object_printit(&vp->v_uobj, full, pr); 3039 bitmask_snprintf(vp->v_flag, vnode_flagbits, buf, sizeof(buf)); 3040 (*pr)("\nVNODE flags %s\n", buf); 3041 (*pr)("mp %p numoutput %d size 0x%llx\n", 3042 vp->v_mount, vp->v_numoutput, vp->v_size); 3043 3044 (*pr)("data %p usecount %d writecount %ld holdcnt %ld numoutput %d\n", 3045 vp->v_data, vp->v_usecount, vp->v_writecount, 3046 vp->v_holdcnt, vp->v_numoutput); 3047 3048 vtype = (vp->v_type >= 0 && 3049 vp->v_type < sizeof(vnode_types) / sizeof(vnode_types[0])) ? 3050 vnode_types[vp->v_type] : "UNKNOWN"; 3051 vtag = (vp->v_tag >= 0 && 3052 vp->v_tag < sizeof(vnode_tags) / sizeof(vnode_tags[0])) ? 3053 vnode_tags[vp->v_tag] : "UNKNOWN"; 3054 3055 (*pr)("type %s(%d) tag %s(%d) mount %p typedata %p\n", 3056 vtype, vp->v_type, vtag, vp->v_tag, 3057 vp->v_mount, vp->v_mountedhere); 3058 3059 if (full) { 3060 struct buf *bp; 3061 3062 (*pr)("clean bufs:\n"); 3063 LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) { 3064 (*pr)(" bp %p\n", bp); 3065 vfs_buf_print(bp, full, pr); 3066 } 3067 3068 (*pr)("dirty bufs:\n"); 3069 LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) { 3070 (*pr)(" bp %p\n", bp); 3071 vfs_buf_print(bp, full, pr); 3072 } 3073 } 3074 } 3075 3076 void 3077 vfs_mount_print(mp, full, pr) 3078 struct mount *mp; 3079 int full; 3080 void (*pr) __P((const char *, ...)); 3081 { 3082 char sbuf[256]; 3083 3084 (*pr)("vnodecovered = %p syncer = %p data = %p\n", 3085 mp->mnt_vnodecovered,mp->mnt_syncer,mp->mnt_data); 3086 3087 (*pr)("fs_bshift %d dev_bshift = %d maxsymlinklen = %d\n", 3088 mp->mnt_fs_bshift,mp->mnt_dev_bshift,mp->mnt_maxsymlinklen); 3089 3090 bitmask_snprintf(mp->mnt_flag, __MNT_FLAG_BITS, sbuf, sizeof(sbuf)); 3091 (*pr)("flag = %s\n", sbuf); 3092 3093 bitmask_snprintf(mp->mnt_iflag, __IMNT_FLAG_BITS, sbuf, sizeof(sbuf)); 3094 (*pr)("iflag = %s\n", sbuf); 3095 3096 /* XXX use lockmgr_printinfo */ 3097 if (mp->mnt_lock.lk_sharecount) 3098 (*pr)(" lock type %s: SHARED (count %d)", mp->mnt_lock.lk_wmesg, 3099 mp->mnt_lock.lk_sharecount); 3100 else if (mp->mnt_lock.lk_flags & LK_HAVE_EXCL) { 3101 (*pr)(" lock type %s: EXCL (count %d) by ", 3102 mp->mnt_lock.lk_wmesg, mp->mnt_lock.lk_exclusivecount); 3103 if (mp->mnt_lock.lk_flags & LK_SPIN) 3104 (*pr)("processor %lu", mp->mnt_lock.lk_cpu); 3105 else 3106 (*pr)("pid %d.%d", mp->mnt_lock.lk_lockholder, 3107 mp->mnt_lock.lk_locklwp); 3108 } else 3109 (*pr)(" not locked"); 3110 if ((mp->mnt_lock.lk_flags & LK_SPIN) == 0 && mp->mnt_lock.lk_waitcount > 0) 3111 (*pr)(" with %d pending", mp->mnt_lock.lk_waitcount); 3112 3113 (*pr)("\n"); 3114 3115 if (mp->mnt_unmounter) { 3116 (*pr)("unmounter pid = %d ",mp->mnt_unmounter->p_pid); 3117 } 3118 (*pr)("wcnt = %d, writeopcountupper = %d, writeopcountupper = %d\n", 3119 mp->mnt_wcnt,mp->mnt_writeopcountupper,mp->mnt_writeopcountlower); 3120 3121 (*pr)("statfs cache:\n"); 3122 (*pr)("\ttype = %d\n",mp->mnt_stat.f_type); 3123 (*pr)("\toflags = 0x%04x\n",mp->mnt_stat.f_oflags); 3124 (*pr)("\tbsize = %d\n",mp->mnt_stat.f_bsize); 3125 (*pr)("\tiosize = %d\n",mp->mnt_stat.f_iosize); 3126 (*pr)("\tblocks = %d\n",mp->mnt_stat.f_blocks); 3127 (*pr)("\tbfree = %d\n",mp->mnt_stat.f_bfree); 3128 (*pr)("\tbavail = %d\n",mp->mnt_stat.f_bavail); 3129 (*pr)("\tfiles = %d\n",mp->mnt_stat.f_files); 3130 (*pr)("\tffree = %d\n",mp->mnt_stat.f_ffree); 3131 (*pr)("\tf_fsid = { 0x%"PRIx32", 0x%"PRIx32" }\n", 3132 mp->mnt_stat.f_fsid.val[0],mp->mnt_stat.f_fsid.val[1]); 3133 (*pr)("\towner = %"PRIu32"\n",mp->mnt_stat.f_owner); 3134 bitmask_snprintf(mp->mnt_stat.f_flags, __MNT_FLAG_BITS, sbuf, sizeof(sbuf)); 3135 (*pr)("\tflags = %s\n",sbuf); 3136 (*pr)("\tsyncwrites = %d\n",mp->mnt_stat.f_syncwrites); 3137 (*pr)("\tasyncwrites = %d\n",mp->mnt_stat.f_asyncwrites); 3138 (*pr)("\tfstypename = %s\n",mp->mnt_stat.f_fstypename); 3139 (*pr)("\tmntonname = %s\n",mp->mnt_stat.f_mntonname); 3140 (*pr)("\tmntfromname = %s\n",mp->mnt_stat.f_mntfromname); 3141 3142 { 3143 int cnt = 0; 3144 struct vnode *vp; 3145 (*pr)("locked vnodes ="); 3146 /* XXX would take mountlist lock, except ddb may not have context */ 3147 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 3148 if (VOP_ISLOCKED(vp)) { 3149 if ((++cnt % 6) == 0) { 3150 (*pr)(" %p,\n\t", vp); 3151 } else { 3152 (*pr)(" %p,", vp); 3153 } 3154 } 3155 } 3156 (*pr)("\n"); 3157 } 3158 3159 if (full) { 3160 int cnt = 0; 3161 struct vnode *vp; 3162 (*pr)("all vnodes ="); 3163 /* XXX would take mountlist lock, except ddb may not have context */ 3164 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 3165 if (!LIST_NEXT(vp, v_mntvnodes)) { 3166 (*pr)(" %p", vp); 3167 } else if ((++cnt % 6) == 0) { 3168 (*pr)(" %p,\n\t", vp); 3169 } else { 3170 (*pr)(" %p,", vp); 3171 } 3172 } 3173 (*pr)("\n", vp); 3174 } 3175 } 3176 3177 #endif 3178