/*	$NetBSD: vfs_subr.c,v 1.235 2004/09/22 11:47:23 lukem Exp $	*/

/*-
 * Copyright (c) 1997, 1998 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * External virtual filesystem routines
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.235 2004/09/22 11:47:23 lukem Exp $");

#include "opt_inet.h"
#include "opt_ddb.h"
#include "opt_compat_netbsd.h"
#include "opt_compat_43.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/time.h>
#include <sys/event.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/sa.h>
#include <sys/syscallargs.h>
#include <sys/device.h>
#include <sys/dirent.h>
#include <sys/filedesc.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/syncfs/syncfs.h>

#include <netinet/in.h>

#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>

#include <sys/sysctl.h>

const enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
const int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

int doforce = 1;		/* 1 => permit forcible unmounting */
int prtactive = 0;		/* 1 => print out reclaim of active vnodes */

extern int dovfsusermount;	/* 1 => permit any user to mount filesystems */

/*
 * Insq/Remq for the vnode usage lists.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) {							\
	LIST_REMOVE(bp, b_vnbufs);					\
	(bp)->b_vnbufs.le_next = NOLIST;				\
}
/* TAILQ_HEAD(freelst, vnode) vnode_free_list = vnode free list (in vnode.h) */
struct freelst vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
struct freelst vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);

struct mntlist mountlist =			/* mounted filesystem list */
    CIRCLEQ_HEAD_INITIALIZER(mountlist);
struct vfs_list_head vfs_list =			/* vfs list */
    LIST_HEAD_INITIALIZER(vfs_list);

struct nfs_public nfs_pub;		/* publicly exported FS */

struct simplelock mountlist_slock = SIMPLELOCK_INITIALIZER;
static struct simplelock mntid_slock = SIMPLELOCK_INITIALIZER;
struct simplelock mntvnode_slock = SIMPLELOCK_INITIALIZER;
struct simplelock vnode_free_list_slock = SIMPLELOCK_INITIALIZER;
struct simplelock spechash_slock = SIMPLELOCK_INITIALIZER;

/* XXX - gross; single global lock to protect v_numoutput */
struct simplelock global_v_numoutput_slock = SIMPLELOCK_INITIALIZER;
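/*
 * A rough sketch of the lock ordering implied by the code below (an
 * inference from this file, not an authoritative list):
 *
 *	mountlist_slock -> mnt_lock (via vfs_busy)
 *	mntvnode_slock -> v_interlock -> vnode_free_list_slock
 *
 * spechash_slock and global_v_numoutput_slock are taken only around
 * short leaf critical sections.
 */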
/*
 * These define the root filesystem and device.
 */
struct mount *rootfs;
struct vnode *rootvnode;
struct device *root_device;		/* root device */

POOL_INIT(vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodepl",
    &pool_allocator_nointr);

MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");

/*
 * Local declarations.
 */
void insmntque(struct vnode *, struct mount *);
int getdevvp(dev_t, struct vnode **, enum vtype);
void vgoneall(struct vnode *);

void vclean(struct vnode *, int, struct proc *);

static int vfs_hang_addrlist(struct mount *, struct netexport *,
    struct export_args *);
static int vfs_free_netcred(struct radix_node *, void *);
static void vfs_free_addrlist(struct netexport *);
static struct vnode *getcleanvnode(struct proc *);

#ifdef DEBUG
void printlockedvnodes(void);
#endif

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	/*
	 * Initialize the filesystem syncer.
	 */
	vn_initialize_syncerd();
}

int
vfs_drainvnodes(long target, struct proc *p)
{

	simple_lock(&vnode_free_list_slock);
	while (numvnodes > target) {
		struct vnode *vp;

		vp = getcleanvnode(p);
		if (vp == NULL)
			return EBUSY;	/* give up */
		pool_put(&vnode_pool, vp);
		simple_lock(&vnode_free_list_slock);
		numvnodes--;
	}
	simple_unlock(&vnode_free_list_slock);

	return 0;
}

/*
 * grab a vnode from freelist and clean it.
 */
struct vnode *
getcleanvnode(p)
	struct proc *p;
{
	struct vnode *vp;
	struct mount *mp;
	struct freelst *listhd;

	LOCK_ASSERT(simple_lock_held(&vnode_free_list_slock));

	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		if (!simple_lock_try(&vp->v_interlock))
			continue;
		/*
		 * as our lwp might hold the underlying vnode locked,
		 * don't try to reclaim the VLAYER vnode if it's locked.
		 */
		if ((vp->v_flag & VXLOCK) == 0 &&
		    ((vp->v_flag & VLAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
			if (vn_start_write(vp, &mp, V_NOWAIT) == 0)
				break;
		}
		mp = NULL;
		simple_unlock(&vp->v_interlock);
	}

	if (vp == NULLVP) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		simple_unlock(&vnode_free_list_slock);
		return NULLVP;
	}

	if (vp->v_usecount)
		panic("free vnode isn't, vp %p", vp);
	TAILQ_REMOVE(listhd, vp, v_freelist);
	/* see comment on why 0xdeadb is set at end of vgone (below) */
	vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
	simple_unlock(&vnode_free_list_slock);
	vp->v_lease = NULL;

	if (vp->v_type != VBAD)
		vgonel(vp, p);
	else
		simple_unlock(&vp->v_interlock);
	vn_finished_write(mp, 0);
#ifdef DIAGNOSTIC
	if (vp->v_data || vp->v_uobj.uo_npages ||
	    TAILQ_FIRST(&vp->v_uobj.memq))
		panic("cleaned vnode isn't, vp %p", vp);
	if (vp->v_numoutput)
		panic("clean vnode has pending I/O's, vp %p", vp);
#endif
	KASSERT((vp->v_flag & VONWORKLST) == 0);

	return vp;
}
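/*
 * Summary of the getcleanvnode() locking protocol, restating the code
 * above: the caller must hold vnode_free_list_slock, and the lock has
 * always been released by the time getcleanvnode() returns, whether or
 * not a vnode was found.  A sketch of a typical caller (cf.
 * vfs_drainvnodes() above):
 *
 *	simple_lock(&vnode_free_list_slock);
 *	vp = getcleanvnode(p);	(returns with the list lock released)
 *	if (vp == NULL)
 *		nothing was reclaimable; every free vnode was busy
 */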
/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp)
	struct mount *mp;
	int flags;
	struct simplelock *interlkp;
{
	int lkflags;

	while (mp->mnt_iflag & IMNT_UNMOUNT) {
		int gone, n;

		if (flags & LK_NOWAIT)
			return (ENOENT);
		if ((flags & LK_RECURSEFAIL) && mp->mnt_unmounter != NULL
		    && mp->mnt_unmounter == curproc)
			return (EDEADLK);
		if (interlkp)
			simple_unlock(interlkp);
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		simple_lock(&mp->mnt_slock);
		mp->mnt_wcnt++;
		ltsleep((caddr_t)mp, PVFS, "vfs_busy", 0, &mp->mnt_slock);
		n = --mp->mnt_wcnt;
		simple_unlock(&mp->mnt_slock);
		gone = mp->mnt_iflag & IMNT_GONE;

		if (n == 0)
			wakeup(&mp->mnt_wcnt);
		if (interlkp)
			simple_lock(interlkp);
		if (gone)
			return (ENOENT);
	}
	lkflags = LK_SHARED;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp)
	struct mount *mp;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct vfsops *vfsp = NULL;
	struct mount *mp;

	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
		if (!strncmp(vfsp->vfs_name, fstypename, MFSNAMELEN))
			break;

	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	memset((char *)mp, 0, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
	simple_lock_init(&mp->mnt_slock);
	(void)vfs_busy(mp, LK_NOWAIT, 0);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_op = vfsp;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	mp->mnt_leaf = mp;
	vfsp->vfs_refcount++;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}
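/*
 * Typical vfs_busy()/vfs_unbusy() bracketing when walking the mount
 * list, as done by printlockedvnodes() and sysctl_kern_vnode() later
 * in this file (a usage sketch, not a new interface):
 *
 *	simple_lock(&mountlist_slock);
 *	if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock) == 0) {
 *		... examine mp; vfs_busy() released the interlock ...
 *		vfs_unbusy(mp);
 *	}
 */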
/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	struct mount *mp;

	simple_lock(&mountlist_slock);
	CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
		    mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *)0);
}

/*
 * Get a new unique fsid
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_short xxxfs_mntid;
	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = makefstype(mp->mnt_op->vfs_name);
	mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0);
	mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype;
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
	tfsid.__fsid_val[1] = mtype;
	if (!CIRCLEQ_EMPTY(&mountlist)) {
		while (vfs_getvfs(&tfsid)) {
			tfsid.__fsid_val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	simple_unlock(&mntid_slock);
}

/*
 * Make a 'unique' number from a mount type name.
 */
long
makefstype(type)
	const char *type;
{
	long rv;

	for (rv = 0; *type; type++) {
		rv <<= 2;
		rv ^= *type;
	}
	return rv;
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	struct vattr *vap;
{

	vap->va_type = VNON;

	/*
	 * Assign individually so that it is safe even if size and
	 * sign of each member are varied.
	 */
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_size = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_atime.tv_sec =
	    vap->va_mtime.tv_sec =
	    vap->va_ctime.tv_sec =
	    vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec =
	    vap->va_mtime.tv_nsec =
	    vap->va_ctime.tv_nsec =
	    vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern int (**dead_vnodeop_p)(void *);
long numvnodes;
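/*
 * numvnodes counts every vnode allocated from vnode_pool; the target
 * population is desiredvnodes (tunable as kern.maxvnodes -- see the
 * tablefull() message in getnewvnode() below, and vfs_drainvnodes()
 * above for how the count is shrunk).
 */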
/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	int (**vops)(void *);
	struct vnode **vpp;
{
	extern struct uvm_pagerops uvm_vnodeops;
	struct uvm_object *uobj;
	struct proc *p = curproc;	/* XXX */
	static int toggle;
	struct vnode *vp;
	int error = 0, tryalloc;

try_again:
	if (mp) {
		/*
		 * Mark filesystem busy while we're creating a vnode.
		 * If unmount is in progress, this will wait; if the
		 * unmount succeeds (only if umount -f), this will
		 * return an error.  If the unmount fails, we'll keep
		 * going afterwards.
		 * (This puts the per-mount vnode list logically under
		 * the protection of the vfs_busy lock).
		 */
		error = vfs_busy(mp, LK_RECURSEFAIL, 0);
		if (error && error != EDEADLK)
			return error;
	}

	/*
	 * We must choose whether to allocate a new vnode or recycle an
	 * existing one.  The criterion for allocating a new one is that
	 * the total number of vnodes is less than the number desired or
	 * there are no vnodes on either free list.  Generally we only
	 * want to recycle vnodes that have no buffers associated with
	 * them, so we look first on the vnode_free_list.  If it is empty,
	 * we next consider vnodes with referencing buffers on the
	 * vnode_hold_list.  The toggle ensures that half the time we
	 * will use a buffer from the vnode_hold_list, and half the time
	 * we will allocate a new one unless the list has grown to twice
	 * the desired size.  We are reluctant to recycle vnodes from the
	 * vnode_hold_list because we will lose the identity of all its
	 * referencing buffers.
	 */

	vp = NULL;

	simple_lock(&vnode_free_list_slock);

	toggle ^= 1;
	if (numvnodes > 2 * desiredvnodes)
		toggle = 0;

	tryalloc = numvnodes < desiredvnodes ||
	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
	     (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));

	if (tryalloc &&
	    (vp = pool_get(&vnode_pool, PR_NOWAIT)) != NULL) {
		numvnodes++;
		simple_unlock(&vnode_free_list_slock);
		memset(vp, 0, sizeof(*vp));
		simple_lock_init(&vp->v_interlock);
		uobj = &vp->v_uobj;
		uobj->pgops = &uvm_vnodeops;
		TAILQ_INIT(&uobj->memq);
		/*
		 * done by memset() above.
		 *	uobj->uo_npages = 0;
		 *	LIST_INIT(&vp->v_nclist);
		 *	LIST_INIT(&vp->v_dnclist);
		 */
	} else {
		vp = getcleanvnode(p);
		/*
		 * Unless this is a bad time of the month, at most
		 * the first NCPUS items on the free list are
		 * locked, so this is close enough to being empty.
		 */
		if (vp == NULLVP) {
			if (mp && error != EDEADLK)
				vfs_unbusy(mp);
			if (tryalloc) {
				printf("WARNING: unable to allocate new "
				    "vnode, retrying...\n");
				(void) tsleep(&lbolt, PRIBIO, "newvn", hz);
				goto try_again;
			}
			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
			*vpp = 0;
			return (ENFILE);
		}
		vp->v_flag = 0;
		vp->v_socket = NULL;
#ifdef VERIFIED_EXEC
		vp->fp_status = FINGERPRINT_INVALID;
#endif
	}
	vp->v_type = VNON;
	vp->v_vnlock = &vp->v_lock;
	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	simple_lock_init(&vp->v_uobj.vmobjlock);

	/*
	 * initialize uvm_object within vnode.
	 */

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = VSIZENOTSET;

	if (mp && error != EDEADLK)
		vfs_unbusy(mp);
	return (0);
}
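/*
 * A sketch of how a filesystem's vget/create path typically obtains a
 * vnode here (the VT_UFS tag and ufs_vnodeop_p vector are examples
 * from elsewhere in the tree, not names defined in this file):
 *
 *	error = getnewvnode(VT_UFS, mp, ufs_vnodeop_p, &vp);
 *	if (error)
 *		return (error);
 *	vp->v_data = ip;	(attach fs-private data)
 */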
/*
 * This is really just the reverse of getnewvnode(). Needed for
 * VFS_VGET functions who may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vp)
	struct vnode *vp;
{
#ifdef DIAGNOSTIC
	if (vp->v_usecount != 1)
		panic("ungetnewvnode: busy vnode");
#endif
	vp->v_usecount--;
	insmntque(vp, NULL);
	vp->v_type = VBAD;

	simple_lock(&vp->v_interlock);
	/*
	 * Insert at head of LRU list
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_HEAD(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	simple_unlock(&vp->v_interlock);
}

/*
 * Move a vnode from one mount queue to another.
 */
void
insmntque(vp, mp)
	struct vnode *vp;
	struct mount *mp;
{

#ifdef DIAGNOSTIC
	if ((mp != NULL) &&
	    (mp->mnt_iflag & IMNT_UNMOUNT) &&
	    !(mp->mnt_flag & MNT_SOFTDEP) &&
	    vp->v_tag != VT_VFS) {
		panic("insmntque into dying filesystem");
	}
#endif

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) != NULL)
		LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	struct buf *bp;
{
	struct vnode *vp;

	if ((vp = bp->b_vp) != NULL) {
		/* XXX global lock hack
		 * can't use v_interlock here since this is called
		 * in interrupt context from biodone().
		 */
		simple_lock(&global_v_numoutput_slock);
		if (--vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput, vp %p", vp);
		if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t)&vp->v_numoutput);
		}
		simple_unlock(&global_v_numoutput_slock);
	}
}
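/*
 * The v_numoutput protocol, as inferred from vwakeup() above and
 * vflushbuf() below: a writer raises the count under the global lock
 * before starting asynchronous I/O, and the completion path
 * (biodone() -> vwakeup()) decrements it and wakes any VBWAIT sleeper:
 *
 *	simple_lock(&global_v_numoutput_slock);
 *	vp->v_numoutput++;
 *	simple_unlock(&global_v_numoutput_slock);
 *	... start the write; vwakeup(bp) runs at completion ...
 */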
/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	struct buf *bp, *nbp;
	int s, error;
	int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
	    (flags & V_SAVE ? PGO_CLEANIT : 0);

	/* XXXUBC this doesn't look at flags or slp* */
	simple_lock(&vp->v_interlock);
	error = VOP_PUTPAGES(vp, 0, 0, flushflags);
	if (error) {
		return error;
	}

	if (flags & V_SAVE) {
		error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0, p);
		if (error)
			return (error);
#ifdef DIAGNOSTIC
		s = splbio();
		if (vp->v_numoutput > 0 || !LIST_EMPTY(&vp->v_dirtyblkhd))
			panic("vinvalbuf: dirty bufs, vp %p", vp);
		splx(s);
#endif
	}

	s = splbio();

restart:
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep((caddr_t)bp,
			    slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vinvalbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep((caddr_t)bp,
			    slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vinvalbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		/*
		 * XXX Since there are no node locks for NFS, I believe
		 * there is a slight chance that a delayed write will
		 * occur while sleeping just above, so check for it.
		 */
		if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
#ifdef DEBUG
			printf("buffer still DELWRI\n");
#endif
			bp->b_flags |= B_BUSY | B_VFLUSH;
			simple_unlock(&bp->b_interlock);
			VOP_BWRITE(bp);
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

#ifdef DIAGNOSTIC
	if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
		panic("vinvalbuf: flush failed, vp %p", vp);
#endif

	splx(s);

	return (0);
}
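/*
 * Usage sketch: vclean() below calls this as
 *
 *	error = vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
 *
 * i.e. write dirty data back first (V_SAVE) and then invalidate;
 * passing flags of 0 instead discards dirty buffers outright.
 */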
/*
 * Destroy any in core blocks past the truncation length.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 */
int
vtruncbuf(vp, lbn, slpflag, slptimeo)
	struct vnode *vp;
	daddr_t lbn;
	int slpflag, slptimeo;
{
	struct buf *bp, *nbp;
	int s, error;
	voff_t off;

	off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
	simple_lock(&vp->v_interlock);
	error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
	if (error) {
		return error;
	}

	s = splbio();

restart:
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_lblkno < lbn)
			continue;
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vtruncbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_lblkno < lbn)
			continue;
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vtruncbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	splx(s);

	return (0);
}

void
vflushbuf(vp, sync)
	struct vnode *vp;
	int sync;
{
	struct buf *bp, *nbp;
	int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0);
	int s;

	simple_lock(&vp->v_interlock);
	(void) VOP_PUTPAGES(vp, 0, 0, flags);

loop:
	s = splbio();
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		if ((bp->b_flags & B_BUSY)) {
			simple_unlock(&bp->b_interlock);
			continue;
		}
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("vflushbuf: not dirty, bp %p", bp);
		bp->b_flags |= B_BUSY | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		splx(s);
		/*
		 * Wait for I/O associated with indirect blocks to complete,
		 * since there is no way to quickly wait for them below.
		 */
		if (bp->b_vp == vp || sync == 0)
			(void) bawrite(bp);
		else
			(void) bwrite(bp);
		goto loop;
	}
	if (sync == 0) {
		splx(s);
		return;
	}
	simple_lock(&global_v_numoutput_slock);
	while (vp->v_numoutput) {
		vp->v_flag |= VBWAIT;
		ltsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflushbuf", 0,
		    &global_v_numoutput_slock);
	}
	simple_unlock(&global_v_numoutput_slock);
	splx(s);
	if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
		vprint("vflushbuf: dirty", vp);
		goto loop;
	}
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	struct vnode *vp;
	struct buf *bp;
{
	int s;

	if (bp->b_vp)
		panic("bgetvp: not free, bp %p", bp);
	VHOLD(vp);
	s = splbio();
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	bufinsvn(bp, &vp->v_cleanblkhd);
	splx(s);
}
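/*
 * bgetvp() and brelvp() below form a strict pair: bgetvp() takes a
 * hold reference on the vnode (VHOLD) and queues the buffer on
 * v_cleanblkhd, brelvp() undoes both.  A buffer that becomes dirty in
 * between is moved to v_dirtyblkhd by reassignbuf().
 */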
/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	struct buf *bp;
{
	struct vnode *vp;
	int s;

	if (bp->b_vp == NULL)
		panic("brelvp: vp NULL, bp %p", bp);

	s = splbio();
	vp = bp->b_vp;
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);

	if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_flag & VONWORKLST) &&
	    LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}

	bp->b_vp = NULL;
	HOLDRELE(vp);
	splx(s);
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 *
 * This function must be called at splbio().
 */
void
reassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;

	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);
	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if ((bp->b_flags & B_DELWRI) == 0) {
		listheadp = &newvp->v_cleanblkhd;
		if (TAILQ_EMPTY(&newvp->v_uobj.memq) &&
		    (newvp->v_flag & VONWORKLST) &&
		    LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	} else {
		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VBLK:
				if (newvp->v_specmountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
				break;
			}
			if (!newvp->v_mount ||
			    (newvp->v_mount->mnt_flag & MNT_ASYNC) == 0)
				vn_syncer_add_to_worklist(newvp, delay);
		}
	}
	bufinsvn(bp, listheadp);
}

/*
 * Create a vnode for a block device.
 * Used for root filesystem and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{

	return (getdevvp(dev, vpp, VBLK));
}

/*
 * Create a vnode for a character device.
 * Used for kernfs and some console handling.
 */
int
cdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{

	return (getdevvp(dev, vpp, VCHR));
}

/*
 * Create a vnode for a device.
 * Used by bdevvp (block device) for root file system etc.,
 * and by cdevvp (character device) for console and kernfs.
 */
int
getdevvp(dev, vpp, type)
	dev_t dev;
	struct vnode **vpp;
	enum vtype type;
{
	struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (0);
	}
	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = type;
	if ((nvp = checkalias(vp, dev, NULL)) != 0) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}
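/*
 * A sketch of the classic bdevvp() use when wiring up the root device
 * (rootdev/rootvp as used by the autoconfiguration and swap code;
 * shown only as an example):
 *
 *	if (bdevvp(rootdev, &rootvp))
 *		panic("can't set up root block device vnode");
 */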
/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device).  If such an alias exists, deallocate
 * the existing contents and return the aliased vnode.  The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	struct vnode *nvp;
	dev_t nvp_rdev;
	struct mount *mp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;
	struct vnode **vpp;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		simple_lock(&vp->v_interlock);
		simple_unlock(&spechash_slock);
		if (vp->v_usecount == 0) {
			vgonel(vp, p);
			goto loop;
		}
		/*
		 * What we're interested to know here is if someone else has
		 * removed this vnode from the device hash list while we were
		 * waiting.  This can only happen if vclean() did it, and
		 * this requires the vnode to be locked.  Therefore, we use
		 * LK_SLEEPFAIL and retry.
		 */
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL))
			goto loop;
		simple_lock(&spechash_slock);
		break;
	}
	if (vp == NULL || vp->v_tag != VT_NON || vp->v_type != VBLK) {
		MALLOC(nvp->v_specinfo, struct specinfo *,
		    sizeof(struct specinfo), M_VNODE, M_NOWAIT);
		/* XXX Erg. */
		if (nvp->v_specinfo == NULL) {
			simple_unlock(&spechash_slock);
			uvm_wait("checkalias");
			goto loop;
		}

		nvp->v_rdev = nvp_rdev;
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		nvp->v_specmountpoint = NULL;
		simple_unlock(&spechash_slock);
		nvp->v_speclockf = NULL;
		simple_lock_init(&nvp->v_spec_cow_slock);
		SLIST_INIT(&nvp->v_spec_cow_head);
		nvp->v_spec_cow_req = 0;
		nvp->v_spec_cow_count = 0;

		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	vp->v_vnlock = &vp->v_lock;
	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}
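/*
 * Summary of checkalias() results, restating the code above: a NULLVP
 * return means nvp is the vnode to use and has been entered in the
 * spec hash (both vnodes marked VALIASED if an active alias exists);
 * a non-NULL return is a pre-existing aliased vnode that was cleaned
 * for reuse, and the caller releases the vnode it passed in and uses
 * the returned one instead (as getdevvp() above does).
 */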
/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  If the vnode lock bit is set the
 * vnode is being eliminated in vgone.  In that case, we can not
 * grab the vnode, so the process is awakened when the transition is
 * completed, and an error returned to indicate that the vnode is no
 * longer usable (possibly having been changed to a new file system type).
 */
int
vget(vp, flags)
	struct vnode *vp;
	int flags;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */

	if ((flags & LK_INTERLOCK) == 0)
		simple_lock(&vp->v_interlock);
	if (vp->v_flag & VXLOCK) {
		if (flags & LK_NOWAIT) {
			simple_unlock(&vp->v_interlock);
			return EBUSY;
		}
		vp->v_flag |= VXWANT;
		ltsleep(vp, PINOD|PNORELOCK, "vget", 0, &vp->v_interlock);
		return (ENOENT);
	}
	if (vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		else
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_usecount++;
#ifdef DIAGNOSTIC
	if (vp->v_usecount == 0) {
		vprint("vget", vp);
		panic("vget: usecount overflow, vp %p", vp);
	}
#endif
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK))) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active.  We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			simple_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (vp->v_usecount > 0) {
				simple_unlock(&vp->v_interlock);
				return (error);
			}
			/*
			 * insert at tail of LRU list
			 */
			simple_lock(&vnode_free_list_slock);
			if (vp->v_holdcnt > 0)
				TAILQ_INSERT_TAIL(&vnode_hold_list, vp,
				    v_freelist);
			else
				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
				    v_freelist);
			simple_unlock(&vnode_free_list_slock);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * vput(), just unlock and vrele()
 */
void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vput: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		VOP_UNLOCK(vp, 0);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vput: bad ref count", vp);
		panic("vput: ref cnt");
	}
#endif
	/*
	 * Insert at tail of LRU list.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	if (vp->v_flag & VEXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP);
	simple_unlock(&vp->v_interlock);
	VOP_INACTIVE(vp, p);
}
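/*
 * Reference protocol sketch (an illustration of vget/vput as used all
 * over the kernel, not a new rule):
 *
 *	if (vget(vp, LK_EXCLUSIVE) == 0) {	(reference + lock)
 *		... use the vnode ...
 *		vput(vp);			(unlock + release)
 *	}
 *
 * vrele() below is the unlocked counterpart: it only drops the
 * reference.
 */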
/*
 * Vnode release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vrele: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vrele: bad ref count", vp);
		panic("vrele: ref cnt vp %p", vp);
	}
#endif
	/*
	 * Insert at tail of LRU list.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	if (vp->v_flag & VEXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP);
	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0)
		VOP_INACTIVE(vp, p);
}

#ifdef DIAGNOSTIC
/*
 * Page or buffer structure gets a reference.
 */
void
vholdl(vp)
	struct vnode *vp;
{

	/*
	 * If it is on the freelist and the hold count is currently
	 * zero, move it to the hold list.  The test of the back
	 * pointer and the use reference count of zero is because
	 * it will be removed from a free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone.  If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from a freelist to ensure
	 * that we do not try to move it here.
	 */
	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_holdcnt++;
}

/*
 * Page or buffer structure frees a reference.
 */
void
holdrelel(vp)
	struct vnode *vp;
{

	if (vp->v_holdcnt <= 0)
		panic("holdrelel: holdcnt vp %p", vp);
	vp->v_holdcnt--;

	/*
	 * If it is on the holdlist and the hold count drops to
	 * zero, move it to the free list.  The test of the back
	 * pointer and the use reference count of zero is because
	 * it will be removed from a free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone.  If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from a freelist to ensure
	 * that we do not try to move it here.
	 */

	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
}
/*
 * Vnode reference.
 */
void
vref(vp)
	struct vnode *vp;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount <= 0)
		panic("vref used where vget required, vp %p", vp);
	vp->v_usecount++;
#ifdef DIAGNOSTIC
	if (vp->v_usecount == 0) {
		vprint("vref", vp);
		panic("vref: usecount overflow, vp %p", vp);
	}
#endif
	simple_unlock(&vp->v_interlock);
}
#endif /* DIAGNOSTIC */

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error).  If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped.
 */
#ifdef DEBUG
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		if (vp->v_mount != mp)
			goto loop;
		nvp = LIST_NEXT(vp, v_mntvnodes);
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;
		simple_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * With v_usecount == 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device.  For all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *)0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DEBUG
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}
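/*
 * Usage sketch: a filesystem's unmount entry point traditionally
 * flushes its vnodes with something like
 *
 *	error = vflush(mp, NULLVP, flags);
 *
 * passing FORCECLOSE when MNT_FORCE was given (an example; the actual
 * callers live in the individual filesystems, not in this file).
 */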
/*
 * Disassociate the underlying file system from a vnode.
 */
void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	struct mount *mp;
	int active;

	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));

	/*
	 * Check to see if the vnode is in use.
	 * If so we have to reference it before we clean it out
	 * so that its count cannot fall to zero and generate a
	 * race against ourselves to recycle it.
	 */

	if ((active = vp->v_usecount) != 0) {
		vp->v_usecount++;
#ifdef DIAGNOSTIC
		if (vp->v_usecount == 0) {
			vprint("vclean", vp);
			panic("vclean: usecount overflow");
		}
#endif
	}

	/*
	 * Prevent the vnode from being recycled or
	 * brought into use while we clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock, vp %p", vp);
	vp->v_flag |= VXLOCK;
	if (vp->v_flag & VEXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP);

	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out.  The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If special device, remove it from the special device alias
	 * list, if it is on one.
	 */
	if (flags & DOCLOSE) {
		int error;
		struct vnode *vq, *vx;

		vn_start_write(vp, &mp, V_WAIT | V_LOWER);
		error = vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
		vn_finished_write(mp, V_LOWER);
		if (error)
			error = vinvalbuf(vp, 0, NOCRED, p, 0, 0);
		KASSERT(error == 0);
		KASSERT((vp->v_flag & VONWORKLST) == 0);

		if (active)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL);

		if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
		    vp->v_specinfo != 0) {
			simple_lock(&spechash_slock);
			if (vp->v_hashchain != NULL) {
				if (*vp->v_hashchain == vp) {
					*vp->v_hashchain = vp->v_specnext;
				} else {
					for (vq = *vp->v_hashchain; vq;
					    vq = vq->v_specnext) {
						if (vq->v_specnext != vp)
							continue;
						vq->v_specnext = vp->v_specnext;
						break;
					}
					if (vq == NULL)
						panic("missing bdev");
				}
				if (vp->v_flag & VALIASED) {
					vx = NULL;
					for (vq = *vp->v_hashchain; vq;
					    vq = vq->v_specnext) {
						if (vq->v_rdev != vp->v_rdev ||
						    vq->v_type != vp->v_type)
							continue;
						if (vx)
							break;
						vx = vq;
					}
					if (vx == NULL)
						panic("missing alias");
					if (vq == NULL)
						vx->v_flag &= ~VALIASED;
					vp->v_flag &= ~VALIASED;
				}
			}
			simple_unlock(&spechash_slock);
			FREE(vp->v_specinfo, M_VNODE);
			vp->v_specinfo = NULL;
		}
	}
	LOCK_ASSERT(!simple_lock_held(&vp->v_interlock));

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim, vp %p", vp);
	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
		 */
		simple_lock(&vp->v_interlock);
		if (--vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			/*
			 * Insert at tail of LRU list.
			 */

			simple_unlock(&vp->v_interlock);
			simple_lock(&vnode_free_list_slock);
#ifdef DIAGNOSTIC
			if (vp->v_holdcnt > 0)
				panic("vclean: not clean, vp %p", vp);
#endif
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			simple_unlock(&vnode_free_list_slock);
		} else
			simple_unlock(&vp->v_interlock);
	}

	KASSERT(vp->v_uobj.uo_npages == 0);
	cache_purge(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	simple_lock(&vp->v_interlock);
	VN_KNOTE(vp, NOTE_REVOKE);	/* FreeBSD has this in vn_pollgone() */
	vp->v_flag &= ~(VXLOCK|VLOCKSWORK);
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		simple_unlock(&vp->v_interlock);
		wakeup((caddr_t)vp);
	} else
		simple_unlock(&vp->v_interlock);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp)
			simple_unlock(inter_lkp);
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}
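/*
 * Usage sketch: a filesystem's VOP_INACTIVE commonly calls
 *
 *	vrecycle(vp, NULL, p);
 *
 * once it knows the underlying object is gone (e.g. an inode whose
 * link count reached zero), sending the vnode straight back for reuse
 * (an example; the callers live in the individual filesystems).
 */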
/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{

	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */

	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		ltsleep(vp, PINOD | PNORELOCK, "vgone", 0, &vp->v_interlock);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */

	vclean(vp, DOCLOSE, p);
	KASSERT((vp->v_flag & VONWORKLST) == 0);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */

	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);

	/*
	 * The test of the back pointer and the reference count of
	 * zero is because it will be removed from the free list by
	 * getcleanvnode, but will not have its reference count
	 * incremented until after calling vgone.  If the reference
	 * count were incremented first, vgone would (incorrectly)
	 * try to close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from the freelist to ensure
	 * that we do not try to move it here.
	 */

	vp->v_type = VBAD;
	if (vp->v_usecount == 0) {
		boolean_t dofree;

		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			panic("vgonel: not clean, vp %p", vp);
		/*
		 * if it isn't on the freelist, we're called by getcleanvnode
		 * and vnode is being re-used.  otherwise, we'll free it.
		 */
		dofree = vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb;
		if (dofree) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			numvnodes--;
		}
		simple_unlock(&vnode_free_list_slock);
		if (dofree)
			pool_put(&vnode_pool, vp);
	}
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Revoke all the vnodes corresponding to the specified minor number
 * range (endpoints inclusive) of the specified major.
 */
void
vdevgone(maj, minl, minh, type)
	int maj, minl, minh;
	enum vtype type;
{
	struct vnode *vp;
	int mn;

	for (mn = minl; mn <= minh; mn++)
		if (vfinddev(makedev(maj, mn), type, &vp))
			VOP_REVOKE(vp, REVOKEALL);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp &&
		    (vq->v_flag & VXLOCK) == 0) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}
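/*
 * vcount() is what device close routines use to distinguish a last
 * close from an intermediate one across all aliases of a device; a
 * sketch of that pattern (the real logic lives in specfs):
 *
 *	if (vcount(vp) > 1)
 *		return 0;	(other aliases still hold it open)
 *	... perform the real last-close work ...
 */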
/*
 * Print out a description of a vnode.
 */
const char * const vnode_types[] = {
	"VNON",
	"VREG",
	"VDIR",
	"VBLK",
	"VCHR",
	"VLNK",
	"VSOCK",
	"VFIFO",
	"VBAD"
};

void
vprint(label, vp)
	char *label;
	struct vnode *vp;
{
	char buf[96];

	if (label != NULL)
		printf("%s: ", label);
	printf("tag %d type %s, usecount %d, writecount %ld, refcount %ld,",
	    vp->v_tag, vnode_types[vp->v_type],
	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strlcat(buf, "|VROOT", sizeof(buf));
	if (vp->v_flag & VTEXT)
		strlcat(buf, "|VTEXT", sizeof(buf));
	if (vp->v_flag & VEXECMAP)
		strlcat(buf, "|VEXECMAP", sizeof(buf));
	if (vp->v_flag & VSYSTEM)
		strlcat(buf, "|VSYSTEM", sizeof(buf));
	if (vp->v_flag & VXLOCK)
		strlcat(buf, "|VXLOCK", sizeof(buf));
	if (vp->v_flag & VXWANT)
		strlcat(buf, "|VXWANT", sizeof(buf));
	if (vp->v_flag & VBWAIT)
		strlcat(buf, "|VBWAIT", sizeof(buf));
	if (vp->v_flag & VALIASED)
		strlcat(buf, "|VALIASED", sizeof(buf));
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DEBUG
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
void
printlockedvnodes()
{
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	    mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
			if (VOP_ISLOCKED(vp))
				vprint(NULL, vp);
		}
		simple_lock(&mountlist_slock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	simple_unlock(&mountlist_slock);
}
#endif
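/*
 * printlockedvnodes() above is meant to be invoked by hand from the
 * kernel debugger, e.g. (illustrative):
 *
 *	db> call printlockedvnodes
 */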
/*
 * sysctl helper routine for vfs.generic.conf lookups.
 */
#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
static int
sysctl_vfs_generic_conf(SYSCTLFN_ARGS)
{
	struct vfsconf vfc;
	extern const char * const mountcompatnames[];
	extern int nmountcompatnames;
	struct sysctlnode node;
	struct vfsops *vfsp;
	u_int vfsnum;

	if (namelen != 1)
		return (ENOTDIR);
	vfsnum = name[0];
	if (vfsnum >= nmountcompatnames ||
	    mountcompatnames[vfsnum] == NULL)
		return (EOPNOTSUPP);
	vfsp = vfs_getopsbyname(mountcompatnames[vfsnum]);
	if (vfsp == NULL)
		return (EOPNOTSUPP);

	vfc.vfc_vfsops = vfsp;
	strncpy(vfc.vfc_name, vfsp->vfs_name, MFSNAMELEN);
	vfc.vfc_typenum = vfsnum;
	vfc.vfc_refcount = vfsp->vfs_refcount;
	vfc.vfc_flags = 0;
	vfc.vfc_mountroot = vfsp->vfs_mountroot;
	vfc.vfc_next = NULL;

	node = *rnode;
	node.sysctl_data = &vfc;
	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}
#endif

/*
 * sysctl helper routine to return list of supported fstypes
 */
static int
sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
{
	char buf[MFSNAMELEN];
	char *where = oldp;
	struct vfsops *v;
	size_t needed, left, slen;
	int error, first;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	first = 1;
	error = 0;
	needed = 0;
	left = *oldlenp;

	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (where == NULL)
			needed += strlen(v->vfs_name) + 1;
		else {
			memset(buf, 0, sizeof(buf));
			if (first) {
				strncpy(buf, v->vfs_name, sizeof(buf));
				first = 0;
			} else {
				buf[0] = ' ';
				strncpy(buf + 1, v->vfs_name, sizeof(buf) - 1);
			}
			buf[sizeof(buf)-1] = '\0';
			slen = strlen(buf);
			if (left < slen + 1)
				break;
			/* +1 to copy out the trailing NUL byte */
			error = copyout(buf, where, slen + 1);
			if (error)
				break;
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	*oldlenp = needed;
	return (error);
}
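/*
 * The node created below exposes this helper as vfs.generic.fstypes;
 * from userland it reads as one space-separated string, e.g. (output
 * is illustrative):
 *
 *	$ sysctl vfs.generic.fstypes
 *	vfs.generic.fstypes = ffs nfs kernfs procfs ...
 */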
/*
 * Top level filesystem related information gathering.
 */
SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup")
{
#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
	extern int nmountcompatnames;
#endif

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "vfs", NULL,
	    NULL, 0, NULL, 0,
	    CTL_VFS, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "generic",
	    SYSCTL_DESCR("Non-specific vfs related information"),
	    NULL, 0, NULL, 0,
	    CTL_VFS, VFS_GENERIC, CTL_EOL);

#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
	    CTLTYPE_INT, "maxtypenum",
	    SYSCTL_DESCR("Highest valid filesystem type number"),
	    NULL, nmountcompatnames, NULL, 0,
	    CTL_VFS, VFS_GENERIC, VFS_MAXTYPENUM, CTL_EOL);
#endif
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "usermount",
	    SYSCTL_DESCR("Whether unprivileged users may mount "
		"filesystems"),
	    NULL, 0, &dovfsusermount, 0,
	    CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_STRING, "fstypes",
	    SYSCTL_DESCR("List of file systems present"),
	    sysctl_vfs_generic_fstypes, 0, NULL, 0,
	    CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_STRUCT, "conf",
	    SYSCTL_DESCR("Filesystem configuration information"),
	    sysctl_vfs_generic_conf, 0, NULL,
	    sizeof(struct vfsconf),
	    CTL_VFS, VFS_GENERIC, VFS_CONF, CTL_EOL);
#endif
}

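/*
 * Example (illustrative userland sketch): the "usermount" node created
 * above is read-write, so unprivileged mounts can be enabled with the
 * equivalent of "sysctl -w vfs.generic.usermount=1":
 */
#if 0
	int one = 1;

	if (sysctlbyname("vfs.generic.usermount", NULL, NULL,
	    &one, sizeof(one)) == -1)
		err(1, "sysctlbyname");
#endif
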
int kinfo_vdebug = 1;
int kinfo_vgetfailed;
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
int
sysctl_kern_vnode(SYSCTLFN_ARGS)
{
	char *where = oldp;
	size_t *sizep = oldlenp;
	struct mount *mp, *nmp;
	struct vnode *nvp, *vp;
	char *bp = where, *savebp;
	char *ewhere;
	int error;

	if (namelen != 0)
		return (EOPNOTSUPP);
	if (newp != NULL)
		return (EPERM);

#define VPTRSZ	sizeof(struct vnode *)
#define VNODESZ	sizeof(struct vnode)
	if (where == NULL) {
		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
		return (0);
	}
	ewhere = where + *sizep;

	simple_lock(&mountlist_slock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	    mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		savebp = bp;
again:
		simple_lock(&mntvnode_slock);
		for (vp = LIST_FIRST(&mp->mnt_vnodelist);
		    vp != NULL;
		    vp = nvp) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				if (kinfo_vdebug)
					printf("kinfo: vp changed\n");
				bp = savebp;
				goto again;
			}
			nvp = LIST_NEXT(vp, v_mntvnodes);
			if (bp + VPTRSZ + VNODESZ > ewhere) {
				simple_unlock(&mntvnode_slock);
				*sizep = bp - where;
				/* drop the busy reference before bailing */
				vfs_unbusy(mp);
				return (ENOMEM);
			}
			simple_unlock(&mntvnode_slock);
			if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
			    (error = copyout((caddr_t)vp, bp + VPTRSZ,
			    VNODESZ))) {
				vfs_unbusy(mp);
				return (error);
			}
			bp += VPTRSZ + VNODESZ;
			simple_lock(&mntvnode_slock);
		}
		simple_unlock(&mntvnode_slock);
		simple_lock(&mountlist_slock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	simple_unlock(&mountlist_slock);

	*sizep = bp - where;
	return (0);
}

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int error = 0;

	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	if (vp->v_flag & VALIASED) {
		simple_lock(&spechash_slock);
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type)
				continue;
			if (vq->v_specmountpoint != NULL) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	return (error);
}

/*
 * Sanity-check a sockaddr destined for an export list: the length,
 * port and padding must all be plausible for the address family.
 */
static int
sacheck(struct sockaddr *sa)
{
	switch (sa->sa_family) {
#ifdef INET
	case AF_INET: {
		struct sockaddr_in *sin = (struct sockaddr_in *)sa;
		char *p = (char *)sin->sin_zero;
		size_t i;

		if (sin->sin_len != sizeof(*sin))
			return -1;
		if (sin->sin_port != 0)
			return -1;
		for (i = 0; i < sizeof(sin->sin_zero); i++)
			if (*p++ != '\0')
				return -1;
		return 0;
	}
#endif
#ifdef INET6
	case AF_INET6: {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;

		if (sin6->sin6_len != sizeof(*sin6))
			return -1;
		if (sin6->sin6_port != 0)
			return -1;
		return 0;
	}
#endif
	default:
		return -1;
	}
}

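/*
 * Example (illustrative): a sockaddr_in that satisfies sacheck() above --
 * correct sa_len for the family, zero port, and fully zeroed padding.
 */
#if 0
	struct sockaddr_in sin;

	memset(&sin, 0, sizeof(sin));
	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	KASSERT(sacheck((struct sockaddr *)&sin) == 0);
#endif
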
/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
 */
static int
vfs_hang_addrlist(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	struct netcred *np, *enp;
	struct radix_node_head *rnh;
	int i;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		crcvt(&np->netc_anon, &argp->ex_anon);
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}

	if (argp->ex_addrlen > MLEN || argp->ex_masklen > MLEN)
		return (EINVAL);

	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK);
	memset((caddr_t)np, 0, i);
	saddr = (struct sockaddr *)(np + 1);
	error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen);
	if (error)
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (sacheck(saddr) == -1) {
		/* go through "out" so np is not leaked */
		error = EINVAL;
		goto out;
	}
	if (argp->ex_masklen) {
		smask = (struct sockaddr *)((caddr_t)saddr +
		    argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t)smask,
		    argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
		if (smask->sa_family != saddr->sa_family) {
			error = EINVAL;
			goto out;
		}
		if (sacheck(smask) == -1) {
			error = EINVAL;
			goto out;
		}
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * It seems silly to initialize every address family when
		 * most are never used; do it on demand here.
		 */
		for (dom = domains; dom; dom = dom->dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **)&nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}

	enp = (struct netcred *)(*rnh->rnh_addaddr)(saddr, smask, rnh,
	    np->netc_rnodes);
	if (enp != np) {
		if (enp == NULL) {
			enp = (struct netcred *)(*rnh->rnh_lookup)(saddr,
			    smask, rnh);
			if (enp == NULL) {
				error = EPERM;
				goto out;
			}
		} else
			enp->netc_refcnt++;

		goto check;
	} else
		enp->netc_refcnt = 1;

	np->netc_exflags = argp->ex_flags;
	crcvt(&np->netc_anon, &argp->ex_anon);
	np->netc_anon.cr_ref = 1;
	return 0;
check:
	if (enp->netc_exflags != argp->ex_flags ||
	    crcmp(&enp->netc_anon, &argp->ex_anon) != 0)
		error = EPERM;
	else
		error = 0;
out:
	free(np, M_NETADDR);
	return error;
}

/* ARGSUSED */
static int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	void *w;
{
	struct radix_node_head *rnh = (struct radix_node_head *)w;
	struct netcred *np = (struct netcred *)(void *)rn;

	(*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
	if (--(np->netc_refcnt) <= 0)
		free(np, M_NETADDR);
	return (0);
}

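/*
 * Example (illustrative sketch): requesting a default export through
 * vfs_export() below.  With ex_addrlen == 0, vfs_hang_addrlist() above
 * records the entry in ne_defexported rather than in a radix tree.
 * "mp" and "nep" stand for a real mount point and its netexport.
 */
#if 0
	struct export_args ea;

	memset(&ea, 0, sizeof(ea));
	ea.ex_flags = MNT_EXPORTED | MNT_EXRDONLY; /* read-only export */
	ea.ex_addrlen = 0;			   /* no address: default */
	error = vfs_export(mp, nep, &ea);
#endif
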
/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(nep)
	struct netexport *nep;
{
	int i;
	struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i]) != NULL) {
			(*rnh->rnh_walktree)(rnh, vfs_free_netcred, rnh);
			free((caddr_t)rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)) != 0)
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}

/*
 * Set the publicly exported filesystem (WebNFS).  Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055).
 */
int
vfs_setpublicfs(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info; the FS is
	 * no longer exported.  May be called from either vfs_export
	 * or unmount, so check if it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	memset((caddr_t)&nfs_pub.np_handle, 0, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsidx;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) {
		/* don't leak the locked root vnode on error */
		vput(rvp);
		return (error);
	}

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    MAXNAMLEN, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			nfs_pub.np_index = NULL; /* no dangling pointer */
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}

struct netcred *
vfs_export_lookup(mp, nep, nam)
	struct mount *mp;
	struct netexport *nep;
	struct mbuf *nam;
{
	struct netcred *np;
	struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = mtod(nam, struct sockaddr *);
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
				    (*rnh->rnh_matchaddr)((caddr_t)saddr,
					rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

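/*
 * Example (illustrative sketch): how an NFS-style request path might use
 * vfs_export_lookup() above to derive credentials for a client.  "nep"
 * and "nam" come from the caller; "wantwrite" is hypothetical.
 */
#if 0
	struct netcred *nc;

	nc = vfs_export_lookup(mp, nep, nam);
	if (nc == NULL)
		return (EACCES);	/* address not in the export lists */
	if ((nc->netc_exflags & MNT_EXRDONLY) && wantwrite)
		return (EROFS);		/* exported read-only to this host */
#endif
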
/*
 * Do the usual access checking.
 * file_mode, uid and gid are from the vnode in question,
 * while acc_mode and cred are from the VOP_ACCESS parameter list.
 */
int
vaccess(type, file_mode, uid, gid, acc_mode, cred)
	enum vtype type;
	mode_t file_mode;
	uid_t uid;
	gid_t gid;
	mode_t acc_mode;
	struct ucred *cred;
{
	mode_t mask;

	/*
	 * Super-user always gets read/write access, but execute access
	 * depends on at least one execute bit being set.
	 */
	if (cred->cr_uid == 0) {
		if ((acc_mode & VEXEC) && type != VDIR &&
		    (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)
			return (EACCES);
		return (0);
	}

	mask = 0;

	/* Otherwise, check the owner. */
	if (cred->cr_uid == uid) {
		if (acc_mode & VEXEC)
			mask |= S_IXUSR;
		if (acc_mode & VREAD)
			mask |= S_IRUSR;
		if (acc_mode & VWRITE)
			mask |= S_IWUSR;
		return ((file_mode & mask) == mask ? 0 : EACCES);
	}

	/* Otherwise, check the groups. */
	if (cred->cr_gid == gid || groupmember(gid, cred)) {
		if (acc_mode & VEXEC)
			mask |= S_IXGRP;
		if (acc_mode & VREAD)
			mask |= S_IRGRP;
		if (acc_mode & VWRITE)
			mask |= S_IWGRP;
		return ((file_mode & mask) == mask ? 0 : EACCES);
	}

	/* Otherwise, check everyone else. */
	if (acc_mode & VEXEC)
		mask |= S_IXOTH;
	if (acc_mode & VREAD)
		mask |= S_IROTH;
	if (acc_mode & VWRITE)
		mask |= S_IWOTH;
	return ((file_mode & mask) == mask ? 0 : EACCES);
}

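/*
 * Example (illustrative): a file system's access VOP typically reduces
 * to vaccess() with attributes it already holds, in the style of
 * ufs_access(); the "ip" inode pointer here is hypothetical.
 */
#if 0
	return (vaccess(vp->v_type, ip->i_mode & ALLPERMS, ip->i_uid,
	    ip->i_gid, ap->a_mode, ap->a_cred));
#endif
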
/*
 * Unmount all file systems.
 * We traverse the list in reverse order under the assumption that doing so
 * will avoid needing to worry about dependencies.
 */
void
vfs_unmountall(p)
	struct proc *p;
{
	struct mount *mp, *nmp;
	int allerror, error;

	printf("unmounting file systems...");
	for (allerror = 0,
	    mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;
#ifdef DEBUG
		printf("\nunmounting %s (%s)...",
		    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
#endif
		/*
		 * XXX Freeze syncer.  Must do this before locking the
		 * mount point.  See dounmount() for details.
		 */
		lockmgr(&syncer_lock, LK_EXCLUSIVE, NULL);
		if (vfs_busy(mp, 0, 0)) {
			lockmgr(&syncer_lock, LK_RELEASE, NULL);
			continue;
		}
		if ((error = dounmount(mp, MNT_FORCE, p)) != 0) {
			printf("unmount of %s failed with error %d\n",
			    mp->mnt_stat.f_mntonname, error);
			allerror = 1;
		}
	}
	printf(" done\n");
	if (allerror)
		printf("WARNING: some file systems would not unmount\n");
}

extern struct simplelock bqueue_slock; /* XXX */

/*
 * Sync and unmount file systems before shutting down.
 */
void
vfs_shutdown()
{
	struct lwp *l = curlwp;
	struct proc *p;

	/* XXX we're certainly not running in proc0's context! */
	if (l == NULL || (p = l->l_proc) == NULL)
		p = &proc0;

	printf("syncing disks... ");

	/* remove user processes from the run queue */
	suspendsched();
	(void) spl0();

	/* avoid coming back this way again if we panic. */
	doing_shutdown = 1;

	sys_sync(l, NULL, NULL);

	/* Wait for sync to finish. */
	if (buf_syncwait() != 0) {
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
		Debugger();
#endif
		printf("giving up\n");
		return;
	} else
		printf("done\n");

	/*
	 * If we've panic'd, don't make the situation potentially
	 * worse by unmounting the file systems.
	 */
	if (panicstr != NULL)
		return;

	/* Release inodes held by texts before update. */
#ifdef notdef
	vnshutdown();
#endif
	/* Unmount file systems. */
	vfs_unmountall(p);
}

/*
 * Mount the root file system.  If the operator didn't specify a
 * file system to use, try all possible file systems until one
 * succeeds.
 */
int
vfs_mountroot()
{
	struct vfsops *v;

	if (root_device == NULL)
		panic("vfs_mountroot: root device unknown");

	switch (root_device->dv_class) {
	case DV_IFNET:
		if (rootdev != NODEV)
			panic("vfs_mountroot: rootdev set for DV_IFNET "
			    "(0x%08x -> %d,%d)", rootdev,
			    major(rootdev), minor(rootdev));
		break;

	case DV_DISK:
		if (rootdev == NODEV)
			panic("vfs_mountroot: rootdev not set for DV_DISK");
		break;

	default:
		printf("%s: inappropriate for root file system\n",
		    root_device->dv_xname);
		return (ENODEV);
	}

	/*
	 * If the user specified a file system, use it.
	 */
	if (mountroot != NULL)
		return ((*mountroot)());

	/*
	 * Try each file system currently configured into the kernel.
	 */
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (v->vfs_mountroot == NULL)
			continue;
#ifdef DEBUG
		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
#endif
		if ((*v->vfs_mountroot)() == 0) {
			aprint_normal("root file system type: %s\n",
			    v->vfs_name);
			break;
		}
	}

	if (v == NULL) {
		printf("no file system for %s", root_device->dv_xname);
		if (root_device->dv_class == DV_DISK)
			printf(" (dev 0x%x)", rootdev);
		printf("\n");
		return (EFTYPE);
	}
	return (0);
}

/*
 * Given a file system name, look up the vfsops for that
 * file system, or return NULL if the file system isn't present
 * in the kernel.
 */
struct vfsops *
vfs_getopsbyname(name)
	const char *name;
{
	struct vfsops *v;

	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (strcmp(v->vfs_name, name) == 0)
			break;
	}

	return (v);
}

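/*
 * Example (illustrative): resolving a file system by name before a
 * mount; vfs_getopsbyname() above returns NULL when the file system
 * is not configured into the kernel.
 */
#if 0
	struct vfsops *vfsp;

	if ((vfsp = vfs_getopsbyname("ffs")) == NULL)
		return (ENODEV);
#endif
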
/*
 * Establish a file system and initialize it.
 */
int
vfs_attach(vfs)
	struct vfsops *vfs;
{
	struct vfsops *v;
	int error = 0;

	/*
	 * Make sure this file system doesn't already exist.
	 */
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (strcmp(vfs->vfs_name, v->vfs_name) == 0) {
			error = EEXIST;
			goto out;
		}
	}

	/*
	 * Initialize the vnode operations for this file system.
	 */
	vfs_opv_init(vfs->vfs_opv_descs);

	/*
	 * Now initialize the file system itself.
	 */
	(*vfs->vfs_init)();

	/*
	 * ...and link it into the kernel's list.
	 */
	LIST_INSERT_HEAD(&vfs_list, vfs, vfs_list);

	/*
	 * Sanity: make sure the reference count is 0.
	 */
	vfs->vfs_refcount = 0;

 out:
	return (error);
}

/*
 * Remove a file system from the kernel.
 */
int
vfs_detach(vfs)
	struct vfsops *vfs;
{
	struct vfsops *v;

	/*
	 * Make sure no one is using the filesystem.
	 */
	if (vfs->vfs_refcount != 0)
		return (EBUSY);

	/*
	 * ...and remove it from the kernel's list.
	 */
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (v == vfs) {
			LIST_REMOVE(v, vfs_list);
			break;
		}
	}

	if (v == NULL)
		return (ESRCH);

	/*
	 * Now run the file system-specific cleanups.
	 */
	(*vfs->vfs_done)();

	/*
	 * Free the vnode operations vector.
	 */
	vfs_opv_free(vfs->vfs_opv_descs);
	return (0);
}

void
vfs_reinit(void)
{
	struct vfsops *vfs;

	LIST_FOREACH(vfs, &vfs_list, vfs_list) {
		if (vfs->vfs_reinit) {
			(*vfs->vfs_reinit)();
		}
	}
}

/*
 * Request a filesystem to suspend write operations.
 */
int
vfs_write_suspend(struct mount *mp, int slpflag, int slptimeo)
{
	struct proc *p = curproc;	/* XXX */
	int error;

	while ((mp->mnt_iflag & IMNT_SUSPEND)) {
		if (slptimeo < 0)
			return EWOULDBLOCK;
		error = tsleep(&mp->mnt_flag, slpflag, "suspwt1", slptimeo);
		if (error)
			return error;
	}
	mp->mnt_iflag |= IMNT_SUSPEND;

	simple_lock(&mp->mnt_slock);
	if (mp->mnt_writeopcountupper > 0)
		ltsleep(&mp->mnt_writeopcountupper, PUSER - 1, "suspwt",
		    0, &mp->mnt_slock);
	simple_unlock(&mp->mnt_slock);

	error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p);
	if (error) {
		vfs_write_resume(mp);
		return error;
	}
	mp->mnt_iflag |= IMNT_SUSPENDLOW;

	simple_lock(&mp->mnt_slock);
	if (mp->mnt_writeopcountlower > 0)
		ltsleep(&mp->mnt_writeopcountlower, PUSER - 1, "suspwt",
		    0, &mp->mnt_slock);
	mp->mnt_iflag |= IMNT_SUSPENDED;
	simple_unlock(&mp->mnt_slock);

	return 0;
}

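/*
 * Example (illustrative sketch): the intended pairing for the
 * suspension API, roughly as a snapshot-style consumer might use it.
 */
#if 0
	error = vfs_write_suspend(mp, PUSER, 0);
	if (error == 0) {
		/* ... operate on the now-quiescent file system ... */
		vfs_write_resume(mp);
	}
#endif
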
/*
 * Request a filesystem to resume write operations.
 */
void
vfs_write_resume(struct mount *mp)
{

	if ((mp->mnt_iflag & IMNT_SUSPEND) == 0)
		return;
	mp->mnt_iflag &= ~(IMNT_SUSPEND | IMNT_SUSPENDLOW | IMNT_SUSPENDED);
	wakeup(&mp->mnt_flag);
}

void
copy_statvfs_info(struct statvfs *sbp, const struct mount *mp)
{
	const struct statvfs *mbp;

	if (sbp == (mbp = &mp->mnt_stat))
		return;

	(void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx));
	sbp->f_fsid = mbp->f_fsid;
	sbp->f_owner = mbp->f_owner;
	sbp->f_flag = mbp->f_flag;
	sbp->f_syncwrites = mbp->f_syncwrites;
	sbp->f_asyncwrites = mbp->f_asyncwrites;
	sbp->f_syncreads = mbp->f_syncreads;
	sbp->f_asyncreads = mbp->f_asyncreads;
	(void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare));
	(void)memcpy(sbp->f_fstypename, mbp->f_fstypename,
	    sizeof(sbp->f_fstypename));
	(void)memcpy(sbp->f_mntonname, mbp->f_mntonname,
	    sizeof(sbp->f_mntonname));
	(void)memcpy(sbp->f_mntfromname, mbp->f_mntfromname,
	    sizeof(sbp->f_mntfromname));
	sbp->f_namemax = mbp->f_namemax;
}

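/*
 * Example (illustrative): a VFS_STATVFS implementation usually fills in
 * the fs-specific fields itself and calls copy_statvfs_info() above for
 * the generic ones.  The "fs" superblock pointer is hypothetical.
 */
#if 0
	sbp->f_bsize = fs->fs_bsize;
	sbp->f_frsize = fs->fs_fsize;
	/* ... block and file counts ... */
	copy_statvfs_info(sbp, mp);
	return (0);
#endif
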
int
set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom,
    struct mount *mp, struct proc *p)
{
	int error;
	size_t size;
	struct statvfs *sfs = &mp->mnt_stat;
	int (*fun)(const void *, void *, size_t, size_t *);

	(void)strncpy(mp->mnt_stat.f_fstypename, mp->mnt_op->vfs_name,
	    sizeof(mp->mnt_stat.f_fstypename));

	if (onp) {
		struct cwdinfo *cwdi = p->p_cwdi;
		fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr;
		if (cwdi->cwdi_rdir != NULL) {
			size_t len;
			char *bp;
			char *path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);

			if (!path) /* XXX can't happen with M_WAITOK */
				return ENOMEM;

			bp = path + MAXPATHLEN;
			*--bp = '\0';
			error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp,
			    path, MAXPATHLEN / 2, 0, p);
			if (error) {
				free(path, M_TEMP);
				return error;
			}

			len = strlen(bp);
			if (len > sizeof(sfs->f_mntonname) - 1)
				len = sizeof(sfs->f_mntonname) - 1;
			(void)strncpy(sfs->f_mntonname, bp, len);
			free(path, M_TEMP);

			if (len < sizeof(sfs->f_mntonname) - 1) {
				error = (*fun)(onp, &sfs->f_mntonname[len],
				    sizeof(sfs->f_mntonname) - len - 1, &size);
				if (error)
					return error;
				size += len;
			} else {
				size = len;
			}
		} else {
			error = (*fun)(onp, &sfs->f_mntonname,
			    sizeof(sfs->f_mntonname) - 1, &size);
			if (error)
				return error;
		}
		(void)memset(sfs->f_mntonname + size, 0,
		    sizeof(sfs->f_mntonname) - size);
	}

	if (fromp) {
		fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr;
		error = (*fun)(fromp, sfs->f_mntfromname,
		    sizeof(sfs->f_mntfromname) - 1, &size);
		if (error)
			return error;
		(void)memset(sfs->f_mntfromname + size, 0,
		    sizeof(sfs->f_mntfromname) - size);
	}
	return 0;
}

#ifdef DDB
const char buf_flagbits[] =
	"\20\1AGE\2NEEDCOMMIT\3ASYNC\4BAD\5BUSY\6SCANNED\7CALL\10DELWRI"
	"\11DIRTY\12DONE\13EINTR\14ERROR\15GATHERED\16INVAL\17LOCKED\20NOCACHE"
	"\21ORDERED\22CACHE\23PHYS\24RAW\25READ\26TAPE\30WANTED"
	"\32XXX\33VFLUSH";

void
vfs_buf_print(bp, full, pr)
	struct buf *bp;
	int full;
	void (*pr)(const char *, ...);
{
	char buf[1024];

	(*pr)(" vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" dev 0x%x\n",
	    bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_dev);

	bitmask_snprintf(bp->b_flags, buf_flagbits, buf, sizeof(buf));
	(*pr)(" error %d flags 0x%s\n", bp->b_error, buf);

	(*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n",
	    bp->b_bufsize, bp->b_bcount, bp->b_resid);
	(*pr)(" data %p saveaddr %p dep %p\n",
	    bp->b_data, bp->b_saveaddr, LIST_FIRST(&bp->b_dep));
	(*pr)(" iodone %p\n", bp->b_iodone);
}

const char vnode_flagbits[] =
	"\20\1ROOT\2TEXT\3SYSTEM\4ISTTY\5EXECMAP"
	"\11XLOCK\12XWANT\13BWAIT\14ALIASED"
	"\15DIROP\16LAYER\17ONWORKLIST\20DIRTY";

const char * const vnode_tags[] = {
	"VT_NON",
	"VT_UFS",
	"VT_NFS",
	"VT_MFS",
	"VT_MSDOSFS",
	"VT_LFS",
	"VT_LOFS",
	"VT_FDESC",
	"VT_PORTAL",
	"VT_NULL",
	"VT_UMAP",
	"VT_KERNFS",
	"VT_PROCFS",
	"VT_AFS",
	"VT_ISOFS",
	"VT_UNION",
	"VT_ADOSFS",
	"VT_EXT2FS",
	"VT_CODA",
	"VT_FILECORE",
	"VT_NTFS",
	"VT_VFS",
	"VT_OVERLAY",
	"VT_SMBFS"
};

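/*
 * Example (illustrative): the strings above use the old-style
 * bitmask_snprintf(9) format.  The leading '\20' (020 == 16) selects
 * hexadecimal output, and each '\<n>NAME' pair names bit n, counted
 * from 1 at the least significant bit.  So for vnode_flagbits:
 */
#if 0
	char ebuf[64];

	/* VROOT (bit 1) | VSYSTEM (bit 3) == 0x5 yields "0x5<ROOT,SYSTEM>" */
	bitmask_snprintf(0x5, vnode_flagbits, ebuf, sizeof(ebuf));
#endif
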
void
vfs_vnode_print(vp, full, pr)
	struct vnode *vp;
	int full;
	void (*pr)(const char *, ...);
{
	char buf[256];
	const char *vtype, *vtag;

	uvm_object_printit(&vp->v_uobj, full, pr);
	bitmask_snprintf(vp->v_flag, vnode_flagbits, buf, sizeof(buf));
	(*pr)("\nVNODE flags %s\n", buf);
	(*pr)("mp %p numoutput %d size 0x%llx\n",
	    vp->v_mount, vp->v_numoutput, vp->v_size);

	(*pr)("data %p usecount %d writecount %ld holdcnt %ld numoutput %d\n",
	    vp->v_data, vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt, vp->v_numoutput);

	vtype = (vp->v_type >= 0 &&
	    vp->v_type < sizeof(vnode_types) / sizeof(vnode_types[0])) ?
	    vnode_types[vp->v_type] : "UNKNOWN";
	vtag = (vp->v_tag >= 0 &&
	    vp->v_tag < sizeof(vnode_tags) / sizeof(vnode_tags[0])) ?
	    vnode_tags[vp->v_tag] : "UNKNOWN";

	(*pr)("type %s(%d) tag %s(%d) mount %p typedata %p\n",
	    vtype, vp->v_type, vtag, vp->v_tag,
	    vp->v_mount, vp->v_mountedhere);

	if (full) {
		struct buf *bp;

		(*pr)("clean bufs:\n");
		LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
			(*pr)(" bp %p\n", bp);
			vfs_buf_print(bp, full, pr);
		}

		(*pr)("dirty bufs:\n");
		LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
			(*pr)(" bp %p\n", bp);
			vfs_buf_print(bp, full, pr);
		}
	}
}

%s\n",mp->mnt_stat.f_mntonname); 3253 (*pr)("\tmntfromname = %s\n",mp->mnt_stat.f_mntfromname); 3254 3255 { 3256 int cnt = 0; 3257 struct vnode *vp; 3258 (*pr)("locked vnodes ="); 3259 /* XXX would take mountlist lock, except ddb may not have context */ 3260 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 3261 if (VOP_ISLOCKED(vp)) { 3262 if ((++cnt % 6) == 0) { 3263 (*pr)(" %p,\n\t", vp); 3264 } else { 3265 (*pr)(" %p,", vp); 3266 } 3267 } 3268 } 3269 (*pr)("\n"); 3270 } 3271 3272 if (full) { 3273 int cnt = 0; 3274 struct vnode *vp; 3275 (*pr)("all vnodes ="); 3276 /* XXX would take mountlist lock, except ddb may not have context */ 3277 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 3278 if (!LIST_NEXT(vp, v_mntvnodes)) { 3279 (*pr)(" %p", vp); 3280 } else if ((++cnt % 6) == 0) { 3281 (*pr)(" %p,\n\t", vp); 3282 } else { 3283 (*pr)(" %p,", vp); 3284 } 3285 } 3286 (*pr)("\n", vp); 3287 } 3288 } 3289 3290 #endif 3291