/*	$NetBSD: vfs_subr.c,v 1.236 2004/11/14 00:36:21 christos Exp $	*/

/*-
 * Copyright (c) 1997, 1998 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * External virtual filesystem routines
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.236 2004/11/14 00:36:21 christos Exp $");

#include "opt_inet.h"
#include "opt_ddb.h"
#include "opt_compat_netbsd.h"
#include "opt_compat_43.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/time.h>
#include <sys/event.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/sa.h>
#include <sys/syscallargs.h>
#include <sys/device.h>
#include <sys/dirent.h>
#include <sys/filedesc.h>
#include <sys/sysctl.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/syncfs/syncfs.h>

#include <netinet/in.h>

#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>

const enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
const int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

int doforce = 1;		/* 1 => permit forcible unmounting */
int prtactive = 0;		/* 1 => print out reclaim of active vnodes */

extern int dovfsusermount;	/* 1 => permit any user to mount filesystems */

/*
 * Insq/Remq for the vnode usage lists.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) {						\
	LIST_REMOVE(bp, b_vnbufs);				\
	(bp)->b_vnbufs.le_next = NOLIST;			\
}

/* TAILQ_HEAD(freelst, vnode) vnode_free_list = vnode free list (in vnode.h) */
struct freelst vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
struct freelst vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);

struct mntlist mountlist =			/* mounted filesystem list */
    CIRCLEQ_HEAD_INITIALIZER(mountlist);
struct vfs_list_head vfs_list =			/* vfs list */
    LIST_HEAD_INITIALIZER(vfs_list);

struct nfs_public nfs_pub;		/* publicly exported FS */

struct simplelock mountlist_slock = SIMPLELOCK_INITIALIZER;
static struct simplelock mntid_slock = SIMPLELOCK_INITIALIZER;
struct simplelock mntvnode_slock = SIMPLELOCK_INITIALIZER;
struct simplelock vnode_free_list_slock = SIMPLELOCK_INITIALIZER;
struct simplelock spechash_slock = SIMPLELOCK_INITIALIZER;

/* XXX - gross; single global lock to protect v_numoutput */
struct simplelock global_v_numoutput_slock = SIMPLELOCK_INITIALIZER;

/*
 * These define the root filesystem and device.
 */
struct mount *rootfs;
struct vnode *rootvnode;
struct device *root_device;		/* root device */

POOL_INIT(vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodepl",
    &pool_allocator_nointr);

MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");

/*
 * Local declarations.
 */
void insmntque(struct vnode *, struct mount *);
int getdevvp(dev_t, struct vnode **, enum vtype);

void vclean(struct vnode *, int, struct proc *);

static int vfs_hang_addrlist(struct mount *, struct netexport *,
    struct export_args *);
static int vfs_free_netcred(struct radix_node *, void *);
static void vfs_free_addrlist(struct netexport *);
static struct vnode *getcleanvnode(struct proc *);

#ifdef DEBUG
void printlockedvnodes(void);
#endif

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	/*
	 * Initialize the filesystem syncer.
	 */
	vn_initialize_syncerd();
}

int
vfs_drainvnodes(long target, struct proc *p)
{

	simple_lock(&vnode_free_list_slock);
	while (numvnodes > target) {
		struct vnode *vp;

		vp = getcleanvnode(p);
		if (vp == NULL)
			return EBUSY;	/* give up */
		pool_put(&vnode_pool, vp);
		simple_lock(&vnode_free_list_slock);
		numvnodes--;
	}
	simple_unlock(&vnode_free_list_slock);

	return 0;
}

/*
 * Grab a vnode from the free list and clean it.
 */
struct vnode *
getcleanvnode(p)
	struct proc *p;
{
	struct vnode *vp;
	struct mount *mp;
	struct freelst *listhd;

	LOCK_ASSERT(simple_lock_held(&vnode_free_list_slock));

	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		if (!simple_lock_try(&vp->v_interlock))
			continue;
		/*
		 * As our lwp might hold the underlying vnode locked,
		 * don't try to reclaim the VLAYER vnode if it's locked.
		 */
		if ((vp->v_flag & VXLOCK) == 0 &&
		    ((vp->v_flag & VLAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
			if (vn_start_write(vp, &mp, V_NOWAIT) == 0)
				break;
		}
		mp = NULL;
		simple_unlock(&vp->v_interlock);
	}

	if (vp == NULLVP) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		simple_unlock(&vnode_free_list_slock);
		return NULLVP;
	}

	if (vp->v_usecount)
		panic("free vnode isn't, vp %p", vp);
	TAILQ_REMOVE(listhd, vp, v_freelist);
	/* see comment on why 0xdeadb is set at end of vgone (below) */
	vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
	simple_unlock(&vnode_free_list_slock);
	vp->v_lease = NULL;

	if (vp->v_type != VBAD)
		vgonel(vp, p);
	else
		simple_unlock(&vp->v_interlock);
	vn_finished_write(mp, 0);
#ifdef DIAGNOSTIC
	if (vp->v_data || vp->v_uobj.uo_npages ||
	    TAILQ_FIRST(&vp->v_uobj.memq))
		panic("cleaned vnode isn't, vp %p", vp);
	if (vp->v_numoutput)
		panic("clean vnode has pending I/O's, vp %p", vp);
#endif
	KASSERT((vp->v_flag & VONWORKLST) == 0);

	return vp;
}
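
/*
 * Note: getcleanvnode() is entered with vnode_free_list_slock held and
 * returns with it released on both the success and the failure path,
 * which is why vfs_drainvnodes() above must re-take the lock on each
 * pass around its loop.
 */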
/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. The interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp)
	struct mount *mp;
	int flags;
	struct simplelock *interlkp;
{
	int lkflags;

	while (mp->mnt_iflag & IMNT_UNMOUNT) {
		int gone, n;

		if (flags & LK_NOWAIT)
			return (ENOENT);
		if ((flags & LK_RECURSEFAIL) && mp->mnt_unmounter != NULL
		    && mp->mnt_unmounter == curproc)
			return (EDEADLK);
		if (interlkp)
			simple_unlock(interlkp);
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		simple_lock(&mp->mnt_slock);
		mp->mnt_wcnt++;
		ltsleep((caddr_t)mp, PVFS, "vfs_busy", 0, &mp->mnt_slock);
		n = --mp->mnt_wcnt;
		simple_unlock(&mp->mnt_slock);
		gone = mp->mnt_iflag & IMNT_GONE;

		if (n == 0)
			wakeup(&mp->mnt_wcnt);
		if (interlkp)
			simple_lock(interlkp);
		if (gone)
			return (ENOENT);
	}
	lkflags = LK_SHARED;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp)
	struct mount *mp;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct vfsops *vfsp = NULL;
	struct mount *mp;

	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
		if (!strncmp(vfsp->vfs_name, fstypename, MFSNAMELEN))
			break;

	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	memset((char *)mp, 0, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
	simple_lock_init(&mp->mnt_slock);
	(void)vfs_busy(mp, LK_NOWAIT, 0);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_op = vfsp;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	mp->mnt_leaf = mp;
	vfsp->vfs_refcount++;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}
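
/*
 * A typical vfs_busy()/vfs_unbusy() bracket, as used by the mount
 * list walkers later in this file (sketch):
 *
 *	if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock) == 0) {
 *		... operate on mp ...
 *		vfs_unbusy(mp);
 *	}
 */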
/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	struct mount *mp;

	simple_lock(&mountlist_slock);
	CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
		    mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *)0);
}

/*
 * Get a new unique fsid.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_short xxxfs_mntid;
	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = makefstype(mp->mnt_op->vfs_name);
	mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0);
	mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype;
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
	tfsid.__fsid_val[1] = mtype;
	if (!CIRCLEQ_EMPTY(&mountlist)) {
		while (vfs_getvfs(&tfsid)) {
			tfsid.__fsid_val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	simple_unlock(&mntid_slock);
}

/*
 * Make a 'unique' number from a mount type name.
 */
long
makefstype(type)
	const char *type;
{
	long rv;

	for (rv = 0; *type; type++) {
		rv <<= 2;
		rv ^= *type;
	}
	return rv;
}
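
/*
 * For example, makefstype("ffs") computes
 * ((('f' << 2) ^ 'f') << 2) ^ 's' == 0x78b; the result is stable
 * across boots but, as the quotes above suggest, not guaranteed
 * collision-free between type names.
 */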
/*
 * Set vnode attributes to VNOVAL.
 */
void
vattr_null(vap)
	struct vattr *vap;
{

	vap->va_type = VNON;

	/*
	 * Assign individually so that it is safe even if size and
	 * sign of each member are varied.
	 */
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_size = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_atime.tv_sec =
	    vap->va_mtime.tv_sec =
	    vap->va_ctime.tv_sec =
	    vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec =
	    vap->va_mtime.tv_nsec =
	    vap->va_ctime.tv_nsec =
	    vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern int (**dead_vnodeop_p)(void *);
long numvnodes;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	int (**vops)(void *);
	struct vnode **vpp;
{
	extern struct uvm_pagerops uvm_vnodeops;
	struct uvm_object *uobj;
	struct proc *p = curproc;	/* XXX */
	static int toggle;
	struct vnode *vp;
	int error = 0, tryalloc;

 try_again:
	if (mp) {
		/*
		 * Mark the filesystem busy while we're creating a vnode.
		 * If unmount is in progress, this will wait; if the
		 * unmount succeeds (only if umount -f), this will
		 * return an error.  If the unmount fails, we'll keep
		 * going afterwards.
		 * (This puts the per-mount vnode list logically under
		 * the protection of the vfs_busy lock.)
		 */
		error = vfs_busy(mp, LK_RECURSEFAIL, 0);
		if (error && error != EDEADLK)
			return error;
	}

	/*
	 * We must choose whether to allocate a new vnode or recycle an
	 * existing one.  The criterion for allocating a new one is that
	 * the total number of vnodes is less than the number desired or
	 * there are no vnodes on either free list.  Generally we only
	 * want to recycle vnodes that have no buffers associated with
	 * them, so we look first on the vnode_free_list.  If it is empty,
	 * we next consider vnodes with referencing buffers on the
	 * vnode_hold_list.  The toggle ensures that half the time we
	 * will use a vnode from the vnode_hold_list, and half the time
	 * we will allocate a new one unless the list has grown to twice
	 * the desired size.  We are reluctant to recycle vnodes from the
	 * vnode_hold_list because we will lose the identity of all their
	 * referencing buffers.
	 */

	vp = NULL;

	simple_lock(&vnode_free_list_slock);

	toggle ^= 1;
	if (numvnodes > 2 * desiredvnodes)
		toggle = 0;

	tryalloc = numvnodes < desiredvnodes ||
	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
	     (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));

	if (tryalloc &&
	    (vp = pool_get(&vnode_pool, PR_NOWAIT)) != NULL) {
		numvnodes++;
		simple_unlock(&vnode_free_list_slock);
		memset(vp, 0, sizeof(*vp));
		simple_lock_init(&vp->v_interlock);
		uobj = &vp->v_uobj;
		uobj->pgops = &uvm_vnodeops;
		TAILQ_INIT(&uobj->memq);
		/*
		 * done by memset() above.
		 *	uobj->uo_npages = 0;
		 *	LIST_INIT(&vp->v_nclist);
		 *	LIST_INIT(&vp->v_dnclist);
		 */
	} else {
		vp = getcleanvnode(p);
		/*
		 * Unless this is a bad time of the month, at most
		 * the first NCPUS items on the free list are
		 * locked, so this is close enough to being empty.
		 */
		if (vp == NULLVP) {
			if (mp && error != EDEADLK)
				vfs_unbusy(mp);
			if (tryalloc) {
				printf("WARNING: unable to allocate new "
				    "vnode, retrying...\n");
				(void) tsleep(&lbolt, PRIBIO, "newvn", hz);
				goto try_again;
			}
			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
			*vpp = 0;
			return (ENFILE);
		}
		vp->v_flag = 0;
		vp->v_socket = NULL;
#ifdef VERIFIED_EXEC
		vp->fp_status = FINGERPRINT_INVALID;
#endif
	}
	vp->v_type = VNON;
	vp->v_vnlock = &vp->v_lock;
	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	simple_lock_init(&vp->v_uobj.vmobjlock);

	/*
	 * initialize uvm_object within vnode.
	 */

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = VSIZENOTSET;

	if (mp && error != EDEADLK)
		vfs_unbusy(mp);
	return (0);
}
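
/*
 * Usage sketch for a file system's inode-allocation path; the tag
 * and vnodeop vector below are placeholders for whatever the caller
 * actually uses:
 *
 *	struct vnode *vp;
 *	int error;
 *
 *	error = getnewvnode(VT_NON, mp, xxx_vnodeop_p, &vp);
 *	if (error)
 *		return (error);
 *	vp->v_type = VREG;
 *	vp->v_data = private_data;
 *
 * The new vnode comes back with v_usecount == 1 and is already on
 * mp's vnode list via insmntque().
 */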
/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vp)
	struct vnode *vp;
{
#ifdef DIAGNOSTIC
	if (vp->v_usecount != 1)
		panic("ungetnewvnode: busy vnode");
#endif
	vp->v_usecount--;
	insmntque(vp, NULL);
	vp->v_type = VBAD;

	simple_lock(&vp->v_interlock);
	/*
	 * Insert at head of LRU list.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_HEAD(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	simple_unlock(&vp->v_interlock);
}

/*
 * Move a vnode from one mount queue to another.
 */
void
insmntque(vp, mp)
	struct vnode *vp;
	struct mount *mp;
{

#ifdef DIAGNOSTIC
	if ((mp != NULL) &&
	    (mp->mnt_iflag & IMNT_UNMOUNT) &&
	    !(mp->mnt_flag & MNT_SOFTDEP) &&
	    vp->v_tag != VT_VFS) {
		panic("insmntque into dying filesystem");
	}
#endif

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) != NULL)
		LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	struct buf *bp;
{
	struct vnode *vp;

	if ((vp = bp->b_vp) != NULL) {
		/* XXX global lock hack
		 * can't use v_interlock here since this is called
		 * in interrupt context from biodone().
		 */
		simple_lock(&global_v_numoutput_slock);
		if (--vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput, vp %p", vp);
		if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t)&vp->v_numoutput);
		}
		simple_unlock(&global_v_numoutput_slock);
	}
}
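
/*
 * The matching increment of v_numoutput happens where asynchronous
 * writes are started; both sides must take the same global lock,
 * e.g. (sketch):
 *
 *	simple_lock(&global_v_numoutput_slock);
 *	vp->v_numoutput++;
 *	simple_unlock(&global_v_numoutput_slock);
 */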
/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	struct buf *bp, *nbp;
	int s, error;
	int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
	    (flags & V_SAVE ? PGO_CLEANIT : 0);

	/* XXXUBC this doesn't look at flags or slp* */
	simple_lock(&vp->v_interlock);
	error = VOP_PUTPAGES(vp, 0, 0, flushflags);
	if (error) {
		return error;
	}

	if (flags & V_SAVE) {
		error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0, p);
		if (error)
			return (error);
#ifdef DIAGNOSTIC
		s = splbio();
		if (vp->v_numoutput > 0 || !LIST_EMPTY(&vp->v_dirtyblkhd))
			panic("vinvalbuf: dirty bufs, vp %p", vp);
		splx(s);
#endif
	}

	s = splbio();

restart:
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep((caddr_t)bp,
			    slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vinvalbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep((caddr_t)bp,
			    slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vinvalbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		/*
		 * XXX Since there are no node locks for NFS, I believe
		 * there is a slight chance that a delayed write will
		 * occur while sleeping just above, so check for it.
		 */
		if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
#ifdef DEBUG
			printf("buffer still DELWRI\n");
#endif
			bp->b_flags |= B_BUSY | B_VFLUSH;
			simple_unlock(&bp->b_interlock);
			VOP_BWRITE(bp);
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

#ifdef DIAGNOSTIC
	if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
		panic("vinvalbuf: flush failed, vp %p", vp);
#endif

	splx(s);

	return (0);
}
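
/*
 * Callers that want the data preserved pass V_SAVE, which writes
 * dirty buffers back before invalidating them; passing 0 simply
 * discards them.  A typical "clean everything, wait as needed"
 * call is (sketch):
 *
 *	error = vinvalbuf(vp, V_SAVE, cred, p, 0, 0);
 */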
/*
 * Destroy any in-core blocks past the truncation length.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 */
int
vtruncbuf(vp, lbn, slpflag, slptimeo)
	struct vnode *vp;
	daddr_t lbn;
	int slpflag, slptimeo;
{
	struct buf *bp, *nbp;
	int s, error;
	voff_t off;

	off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
	simple_lock(&vp->v_interlock);
	error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
	if (error) {
		return error;
	}

	s = splbio();

restart:
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_lblkno < lbn)
			continue;
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vtruncbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_lblkno < lbn)
			continue;
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vtruncbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	splx(s);

	return (0);
}

void
vflushbuf(vp, sync)
	struct vnode *vp;
	int sync;
{
	struct buf *bp, *nbp;
	int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0);
	int s;

	simple_lock(&vp->v_interlock);
	(void) VOP_PUTPAGES(vp, 0, 0, flags);

loop:
	s = splbio();
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		if ((bp->b_flags & B_BUSY)) {
			simple_unlock(&bp->b_interlock);
			continue;
		}
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("vflushbuf: not dirty, bp %p", bp);
		bp->b_flags |= B_BUSY | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		splx(s);
		/*
		 * Wait for I/O associated with indirect blocks to complete,
		 * since there is no way to quickly wait for them below.
		 */
		if (bp->b_vp == vp || sync == 0)
			(void) bawrite(bp);
		else
			(void) bwrite(bp);
		goto loop;
	}
	if (sync == 0) {
		splx(s);
		return;
	}
	simple_lock(&global_v_numoutput_slock);
	while (vp->v_numoutput) {
		vp->v_flag |= VBWAIT;
		ltsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflushbuf", 0,
		    &global_v_numoutput_slock);
	}
	simple_unlock(&global_v_numoutput_slock);
	splx(s);
	if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
		vprint("vflushbuf: dirty", vp);
		goto loop;
	}
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	struct vnode *vp;
	struct buf *bp;
{
	int s;

	if (bp->b_vp)
		panic("bgetvp: not free, bp %p", bp);
	VHOLD(vp);
	s = splbio();
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	bufinsvn(bp, &vp->v_cleanblkhd);
	splx(s);
}
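
/*
 * The VHOLD() in bgetvp() (paired with the HOLDRELE() in brelvp()
 * below) is what moves a vnode between the free list and the hold
 * list, so vnodes with cached buffers are recycled only as a
 * second choice.
 */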
/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	struct buf *bp;
{
	struct vnode *vp;
	int s;

	if (bp->b_vp == NULL)
		panic("brelvp: vp NULL, bp %p", bp);

	s = splbio();
	vp = bp->b_vp;
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);

	if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_flag & VONWORKLST) &&
	    LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}

	bp->b_vp = NULL;
	HOLDRELE(vp);
	splx(s);
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 *
 * This function must be called at splbio().
 */
void
reassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;

	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);
	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if ((bp->b_flags & B_DELWRI) == 0) {
		listheadp = &newvp->v_cleanblkhd;
		if (TAILQ_EMPTY(&newvp->v_uobj.memq) &&
		    (newvp->v_flag & VONWORKLST) &&
		    LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	} else {
		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VBLK:
				if (newvp->v_specmountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
				break;
			}
			if (!newvp->v_mount ||
			    (newvp->v_mount->mnt_flag & MNT_ASYNC) == 0)
				vn_syncer_add_to_worklist(newvp, delay);
		}
	}
	bufinsvn(bp, listheadp);
}

/*
 * Create a vnode for a block device.
 * Used for root filesystem and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{

	return (getdevvp(dev, vpp, VBLK));
}

/*
 * Create a vnode for a character device.
 * Used for kernfs and some console handling.
 */
int
cdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{

	return (getdevvp(dev, vpp, VCHR));
}

/*
 * Create a vnode for a device.
 * Used by bdevvp (block device) for root file system etc.,
 * and by cdevvp (character device) for console and kernfs.
 */
int
getdevvp(dev, vpp, type)
	dev_t dev;
	struct vnode **vpp;
	enum vtype type;
{
	struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (0);
	}
	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = type;
	if ((nvp = checkalias(vp, dev, NULL)) != 0) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}
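
/*
 * E.g. mounting the root file system starts with something like
 * (sketch):
 *
 *	if (bdevvp(rootdev, &rootvp))
 *		panic("can't set up rootvp");
 */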
/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device).  If such an alias exists, deallocate
 * the existing contents and return the aliased vnode.  The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	struct vnode *nvp;
	dev_t nvp_rdev;
	struct mount *mp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;
	struct vnode **vpp;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		simple_lock(&vp->v_interlock);
		simple_unlock(&spechash_slock);
		if (vp->v_usecount == 0) {
			vgonel(vp, p);
			goto loop;
		}
		/*
		 * What we want to know here is whether someone else has
		 * removed this vnode from the device hash list while we
		 * were waiting.  This can only happen if vclean() did it,
		 * and that requires the vnode to be locked.  Therefore,
		 * we use LK_SLEEPFAIL and retry.
		 */
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL))
			goto loop;
		simple_lock(&spechash_slock);
		break;
	}
	if (vp == NULL || vp->v_tag != VT_NON || vp->v_type != VBLK) {
		MALLOC(nvp->v_specinfo, struct specinfo *,
		    sizeof(struct specinfo), M_VNODE, M_NOWAIT);
		/* XXX Erg. */
		if (nvp->v_specinfo == NULL) {
			simple_unlock(&spechash_slock);
			uvm_wait("checkalias");
			goto loop;
		}

		nvp->v_rdev = nvp_rdev;
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		nvp->v_specmountpoint = NULL;
		simple_unlock(&spechash_slock);
		nvp->v_speclockf = NULL;
		simple_lock_init(&nvp->v_spec_cow_slock);
		SLIST_INIT(&nvp->v_spec_cow_head);
		nvp->v_spec_cow_req = 0;
		nvp->v_spec_cow_count = 0;

		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	vp->v_vnlock = &vp->v_lock;
	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}
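
/*
 * vget() below is the companion primitive: vref() may only be applied
 * to a vnode that already has a reference (it panics otherwise), so
 * anything found on a free list or hash chain, as above, must be
 * acquired through vget().
 */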
/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  If the vnode lock bit is set the
 * vnode is being eliminated in vgone.  In that case, we can not
 * grab the vnode, so the process is awakened when the transition is
 * completed, and an error returned to indicate that the vnode is no
 * longer usable (possibly having been changed to a new file system type).
 */
int
vget(vp, flags)
	struct vnode *vp;
	int flags;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */

	if ((flags & LK_INTERLOCK) == 0)
		simple_lock(&vp->v_interlock);
	if (vp->v_flag & VXLOCK) {
		if (flags & LK_NOWAIT) {
			simple_unlock(&vp->v_interlock);
			return EBUSY;
		}
		vp->v_flag |= VXWANT;
		ltsleep(vp, PINOD|PNORELOCK, "vget", 0, &vp->v_interlock);
		return (ENOENT);
	}
	if (vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		else
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_usecount++;
#ifdef DIAGNOSTIC
	if (vp->v_usecount == 0) {
		vprint("vget", vp);
		panic("vget: usecount overflow, vp %p", vp);
	}
#endif
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK))) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active.  We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			simple_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (vp->v_usecount > 0) {
				simple_unlock(&vp->v_interlock);
				return (error);
			}
			/*
			 * insert at tail of LRU list
			 */
			simple_lock(&vnode_free_list_slock);
			if (vp->v_holdcnt > 0)
				TAILQ_INSERT_TAIL(&vnode_hold_list, vp,
				    v_freelist);
			else
				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
				    v_freelist);
			simple_unlock(&vnode_free_list_slock);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * vput(), just unlock and vrele()
 */
void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vput: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		VOP_UNLOCK(vp, 0);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vput: bad ref count", vp);
		panic("vput: ref cnt");
	}
#endif
	/*
	 * Insert at tail of LRU list.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	if (vp->v_flag & VEXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP);
	simple_unlock(&vp->v_interlock);
	VOP_INACTIVE(vp, p);
}
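
/*
 * Typical lookup usage (sketch): VOP_LOOKUP hands the caller a
 * locked, referenced vnode, and the caller drops both with a single
 * vput(vp); vrele() below is for references held on unlocked vnodes.
 */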
/*
 * Vnode release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vrele: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vrele: bad ref count", vp);
		panic("vrele: ref cnt vp %p", vp);
	}
#endif
	/*
	 * Insert at tail of LRU list.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	if (vp->v_flag & VEXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP);
	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0)
		VOP_INACTIVE(vp, p);
}

#ifdef DIAGNOSTIC
/*
 * Page or buffer structure gets a reference.
 */
void
vholdl(vp)
	struct vnode *vp;
{

	/*
	 * If it is on the freelist and the hold count is currently
	 * zero, move it to the hold list.  The test of the back
	 * pointer and the use reference count of zero is because
	 * it will be removed from a free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone.  If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from a freelist to ensure
	 * that we do not try to move it here.
	 */
	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_holdcnt++;
}

/*
 * Page or buffer structure frees a reference.
 */
void
holdrelel(vp)
	struct vnode *vp;
{

	if (vp->v_holdcnt <= 0)
		panic("holdrelel: holdcnt vp %p", vp);
	vp->v_holdcnt--;

	/*
	 * If it is on the holdlist and the hold count drops to
	 * zero, move it to the free list.  The test of the back
	 * pointer and the use reference count of zero is because
	 * it will be removed from a free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone.  If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from a freelist to ensure
	 * that we do not try to move it here.
	 */

	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
}
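
/*
 * The trailing "l" in vholdl()/holdrelel() follows the convention
 * that the caller already holds v_interlock; the VHOLD() and
 * HOLDRELE() macros used elsewhere in this file wrap them.
 */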
/*
 * Vnode reference.
 */
void
vref(vp)
	struct vnode *vp;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount <= 0)
		panic("vref used where vget required, vp %p", vp);
	vp->v_usecount++;
#ifdef DIAGNOSTIC
	if (vp->v_usecount == 0) {
		vprint("vref", vp);
		panic("vref: usecount overflow, vp %p", vp);
	}
#endif
	simple_unlock(&vp->v_interlock);
}
#endif /* DIAGNOSTIC */

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones;
 * return an error if any are found (nb: this is a user error, not a
 * system error).  If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped.
 */
#ifdef DEBUG
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		if (vp->v_mount != mp)
			goto loop;
		nvp = LIST_NEXT(vp, v_mntvnodes);
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;
		simple_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * With v_usecount == 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device.  For all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *)0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DEBUG
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}
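
/*
 * Unmount is the classic caller; a file system's VFS_UNMOUNT
 * typically does something like (sketch):
 *
 *	error = vflush(mp, NULLVP,
 *	    (mntflags & MNT_FORCE) ? FORCECLOSE : 0);
 */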
/*
 * Disassociate the underlying file system from a vnode.
 */
void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	struct mount *mp;
	int active;

	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));

	/*
	 * Check to see if the vnode is in use.
	 * If so, we have to reference it before we clean it out
	 * so that its count cannot fall to zero and generate a
	 * race against ourselves to recycle it.
	 */

	if ((active = vp->v_usecount) != 0) {
		vp->v_usecount++;
#ifdef DIAGNOSTIC
		if (vp->v_usecount == 0) {
			vprint("vclean", vp);
			panic("vclean: usecount overflow");
		}
#endif
	}

	/*
	 * Prevent the vnode from being recycled or
	 * brought into use while we clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock, vp %p", vp);
	vp->v_flag |= VXLOCK;
	if (vp->v_flag & VEXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP);

	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out.  The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If it is a special device, also remove it from the special
	 * device alias list, if it is on one.
	 */
	if (flags & DOCLOSE) {
		int error;
		struct vnode *vq, *vx;

		vn_start_write(vp, &mp, V_WAIT | V_LOWER);
		error = vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
		vn_finished_write(mp, V_LOWER);
		if (error)
			error = vinvalbuf(vp, 0, NOCRED, p, 0, 0);
		KASSERT(error == 0);
		KASSERT((vp->v_flag & VONWORKLST) == 0);

		if (active)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL);

		if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
		    vp->v_specinfo != 0) {
			simple_lock(&spechash_slock);
			if (vp->v_hashchain != NULL) {
				if (*vp->v_hashchain == vp) {
					*vp->v_hashchain = vp->v_specnext;
				} else {
					for (vq = *vp->v_hashchain; vq;
					    vq = vq->v_specnext) {
						if (vq->v_specnext != vp)
							continue;
						vq->v_specnext = vp->v_specnext;
						break;
					}
					if (vq == NULL)
						panic("missing bdev");
				}
				if (vp->v_flag & VALIASED) {
					vx = NULL;
					for (vq = *vp->v_hashchain; vq;
					    vq = vq->v_specnext) {
						if (vq->v_rdev != vp->v_rdev ||
						    vq->v_type != vp->v_type)
							continue;
						if (vx)
							break;
						vx = vq;
					}
					if (vx == NULL)
						panic("missing alias");
					if (vq == NULL)
						vx->v_flag &= ~VALIASED;
					vp->v_flag &= ~VALIASED;
				}
			}
			simple_unlock(&spechash_slock);
			FREE(vp->v_specinfo, M_VNODE);
			vp->v_specinfo = NULL;
		}
	}
	LOCK_ASSERT(!simple_lock_held(&vp->v_interlock));

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim, vp %p", vp);
	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
		 */
		simple_lock(&vp->v_interlock);
		if (--vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			/*
			 * Insert at tail of LRU list.
			 */

			simple_unlock(&vp->v_interlock);
			simple_lock(&vnode_free_list_slock);
#ifdef DIAGNOSTIC
			if (vp->v_holdcnt > 0)
				panic("vclean: not clean, vp %p", vp);
#endif
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			simple_unlock(&vnode_free_list_slock);
		} else
			simple_unlock(&vp->v_interlock);
	}

	KASSERT(vp->v_uobj.uo_npages == 0);
	cache_purge(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	simple_lock(&vp->v_interlock);
	VN_KNOTE(vp, NOTE_REVOKE);	/* FreeBSD has this in vn_pollgone() */
	vp->v_flag &= ~(VXLOCK|VLOCKSWORK);
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		simple_unlock(&vp->v_interlock);
		wakeup((caddr_t)vp);
	} else
		simple_unlock(&vp->v_interlock);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp)
			simple_unlock(inter_lkp);
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{

	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */

	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		ltsleep(vp, PINOD | PNORELOCK, "vgone", 0, &vp->v_interlock);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */

	vclean(vp, DOCLOSE, p);
	KASSERT((vp->v_flag & VONWORKLST) == 0);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */

	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);

	/*
	 * The test of the back pointer and the reference count of
	 * zero is because it will be removed from the free list by
	 * getcleanvnode, but will not have its reference count
	 * incremented until after calling vgone.  If the reference
	 * count were incremented first, vgone would (incorrectly)
	 * try to close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from the freelist to ensure
	 * that we do not try to move it here.
	 */

	vp->v_type = VBAD;
	if (vp->v_usecount == 0) {
		boolean_t dofree;

		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			panic("vgonel: not clean, vp %p", vp);
		/*
		 * If it isn't on the freelist, we're called by
		 * getcleanvnode and the vnode is being re-used.
		 * Otherwise, we'll free it.
		 */
		dofree = vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb;
		if (dofree) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			numvnodes--;
		}
		simple_unlock(&vnode_free_list_slock);
		if (dofree)
			pool_put(&vnode_pool, vp);
	}
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Revoke all the vnodes corresponding to the specified minor number
 * range (endpoints inclusive) of the specified major.
 */
void
vdevgone(maj, minl, minh, type)
	int maj, minl, minh;
	enum vtype type;
{
	struct vnode *vp;
	int mn;

	for (mn = minl; mn <= minh; mn++)
		if (vfinddev(makedev(maj, mn), type, &vp))
			VOP_REVOKE(vp, REVOKEALL);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp &&
		    (vq->v_flag & VXLOCK) == 0) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}
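
/*
 * E.g. a disk driver detaching a unit with 8 partitions might revoke
 * its device vnodes with (sketch; bmaj/cmaj are the driver's block
 * and character majors):
 *
 *	vdevgone(bmaj, unit * 8, unit * 8 + 7, VBLK);
 *	vdevgone(cmaj, unit * 8, unit * 8 + 7, VCHR);
 */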
/*
 * Print out a description of a vnode.
 */
const char * const vnode_types[] = {
	"VNON",
	"VREG",
	"VDIR",
	"VBLK",
	"VCHR",
	"VLNK",
	"VSOCK",
	"VFIFO",
	"VBAD"
};

void
vprint(label, vp)
	char *label;
	struct vnode *vp;
{
	char buf[96];

	if (label != NULL)
		printf("%s: ", label);
	printf("tag %d type %s, usecount %d, writecount %ld, refcount %ld,",
	    vp->v_tag, vnode_types[vp->v_type],
	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strlcat(buf, "|VROOT", sizeof(buf));
	if (vp->v_flag & VTEXT)
		strlcat(buf, "|VTEXT", sizeof(buf));
	if (vp->v_flag & VEXECMAP)
		strlcat(buf, "|VEXECMAP", sizeof(buf));
	if (vp->v_flag & VSYSTEM)
		strlcat(buf, "|VSYSTEM", sizeof(buf));
	if (vp->v_flag & VXLOCK)
		strlcat(buf, "|VXLOCK", sizeof(buf));
	if (vp->v_flag & VXWANT)
		strlcat(buf, "|VXWANT", sizeof(buf));
	if (vp->v_flag & VBWAIT)
		strlcat(buf, "|VBWAIT", sizeof(buf));
	if (vp->v_flag & VALIASED)
		strlcat(buf, "|VALIASED", sizeof(buf));
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DEBUG
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
void
printlockedvnodes()
{
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	    mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
			if (VOP_ISLOCKED(vp))
				vprint(NULL, vp);
		}
		simple_lock(&mountlist_slock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	simple_unlock(&mountlist_slock);
}
#endif
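
/*
 * Sample vprint() output (illustrative):
 *
 *	vflush: busy vnode: tag 1 type VREG, usecount 2, writecount 0,
 *	refcount 1, flags (VROOT|VTEXT)
 */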
/*
 * sysctl helper routine for vfs.generic.conf lookups.
 */
#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
static int
sysctl_vfs_generic_conf(SYSCTLFN_ARGS)
{
	struct vfsconf vfc;
	extern const char * const mountcompatnames[];
	extern int nmountcompatnames;
	struct sysctlnode node;
	struct vfsops *vfsp;
	u_int vfsnum;

	if (namelen != 1)
		return (ENOTDIR);
	vfsnum = name[0];
	if (vfsnum >= nmountcompatnames ||
	    mountcompatnames[vfsnum] == NULL)
		return (EOPNOTSUPP);
	vfsp = vfs_getopsbyname(mountcompatnames[vfsnum]);
	if (vfsp == NULL)
		return (EOPNOTSUPP);

	vfc.vfc_vfsops = vfsp;
	strncpy(vfc.vfc_name, vfsp->vfs_name, MFSNAMELEN);
	vfc.vfc_typenum = vfsnum;
	vfc.vfc_refcount = vfsp->vfs_refcount;
	vfc.vfc_flags = 0;
	vfc.vfc_mountroot = vfsp->vfs_mountroot;
	vfc.vfc_next = NULL;

	node = *rnode;
	node.sysctl_data = &vfc;
	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}
#endif

/*
 * sysctl helper routine to return the list of supported fstypes.
 */
static int
sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
{
	char buf[MFSNAMELEN];
	char *where = oldp;
	struct vfsops *v;
	size_t needed, left, slen;
	int error, first;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	first = 1;
	error = 0;
	needed = 0;
	left = *oldlenp;

	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (where == NULL)
			needed += strlen(v->vfs_name) + 1;
		else {
			memset(buf, 0, sizeof(buf));
			if (first) {
				strncpy(buf, v->vfs_name, sizeof(buf));
				first = 0;
			} else {
				buf[0] = ' ';
				strncpy(buf + 1, v->vfs_name, sizeof(buf) - 1);
			}
			buf[sizeof(buf)-1] = '\0';
			slen = strlen(buf);
			if (left < slen + 1)
				break;
			/* +1 to copy out the trailing NUL byte */
			error = copyout(buf, where, slen + 1);
			if (error)
				break;
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	*oldlenp = needed;
	return (error);
}
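
/*
 * The result is a single space-separated string, so e.g.
 * "sysctl vfs.generic.fstypes" might print (illustrative):
 *
 *	vfs.generic.fstypes = ffs nfs msdosfs kernfs procfs
 */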
2109 */ 2110 SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup") 2111 { 2112 #if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44) 2113 extern int nmountcompatnames; 2114 #endif 2115 2116 sysctl_createv(clog, 0, NULL, NULL, 2117 CTLFLAG_PERMANENT, 2118 CTLTYPE_NODE, "vfs", NULL, 2119 NULL, 0, NULL, 0, 2120 CTL_VFS, CTL_EOL); 2121 sysctl_createv(clog, 0, NULL, NULL, 2122 CTLFLAG_PERMANENT, 2123 CTLTYPE_NODE, "generic", 2124 SYSCTL_DESCR("Non-specific vfs related information"), 2125 NULL, 0, NULL, 0, 2126 CTL_VFS, VFS_GENERIC, CTL_EOL); 2127 2128 #if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44) 2129 sysctl_createv(clog, 0, NULL, NULL, 2130 CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, 2131 CTLTYPE_INT, "maxtypenum", 2132 SYSCTL_DESCR("Highest valid filesystem type number"), 2133 NULL, nmountcompatnames, NULL, 0, 2134 CTL_VFS, VFS_GENERIC, VFS_MAXTYPENUM, CTL_EOL); 2135 #endif 2136 sysctl_createv(clog, 0, NULL, NULL, 2137 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2138 CTLTYPE_INT, "usermount", 2139 SYSCTL_DESCR("Whether unprivileged users may mount " 2140 "filesystems"), 2141 NULL, 0, &dovfsusermount, 0, 2142 CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL); 2143 sysctl_createv(clog, 0, NULL, NULL, 2144 CTLFLAG_PERMANENT, 2145 CTLTYPE_STRING, "fstypes", 2146 SYSCTL_DESCR("List of file systems present"), 2147 sysctl_vfs_generic_fstypes, 0, NULL, 0, 2148 CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL); 2149 #if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44) 2150 sysctl_createv(clog, 0, NULL, NULL, 2151 CTLFLAG_PERMANENT, 2152 CTLTYPE_STRUCT, "conf", 2153 SYSCTL_DESCR("Filesystem configuration information"), 2154 sysctl_vfs_generic_conf, 0, NULL, 2155 sizeof(struct vfsconf), 2156 CTL_VFS, VFS_GENERIC, VFS_CONF, CTL_EOL); 2157 #endif 2158 } 2159 2160 2161 int kinfo_vdebug = 1; 2162 int kinfo_vgetfailed; 2163 #define KINFO_VNODESLOP 10 2164 /* 2165 * Dump vnode list (via sysctl). 2166 * Copyout address of vnode followed by vnode. 2167 */ 2168 /* ARGSUSED */ 2169 int 2170 sysctl_kern_vnode(SYSCTLFN_ARGS) 2171 { 2172 char *where = oldp; 2173 size_t *sizep = oldlenp; 2174 struct mount *mp, *nmp; 2175 struct vnode *nvp, *vp; 2176 char *bp = where, *savebp; 2177 char *ewhere; 2178 int error; 2179 2180 if (namelen != 0) 2181 return (EOPNOTSUPP); 2182 if (newp != NULL) 2183 return (EPERM); 2184 2185 #define VPTRSZ sizeof(struct vnode *) 2186 #define VNODESZ sizeof(struct vnode) 2187 if (where == NULL) { 2188 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ); 2189 return (0); 2190 } 2191 ewhere = where + *sizep; 2192 2193 simple_lock(&mountlist_slock); 2194 for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; 2195 mp = nmp) { 2196 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) { 2197 nmp = CIRCLEQ_NEXT(mp, mnt_list); 2198 continue; 2199 } 2200 savebp = bp; 2201 again: 2202 simple_lock(&mntvnode_slock); 2203 for (vp = LIST_FIRST(&mp->mnt_vnodelist); 2204 vp != NULL; 2205 vp = nvp) { 2206 /* 2207 * Check that the vp is still associated with 2208 * this filesystem. RACE: could have been 2209 * recycled onto the same filesystem. 
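 * If it has been recycled, the partial output generated for
 * this mount is discarded (bp is rewound to savebp below) and
 * the walk of this mount's vnode list starts over from the
 * head, so stale vnode data is not left in the output.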
2210 */
2211 if (vp->v_mount != mp) {
2212 simple_unlock(&mntvnode_slock);
2213 if (kinfo_vdebug)
2214 printf("kinfo: vp changed\n");
2215 bp = savebp;
2216 goto again;
2217 }
2218 nvp = LIST_NEXT(vp, v_mntvnodes);
2219 if (bp + VPTRSZ + VNODESZ > ewhere) {
2220 simple_unlock(&mntvnode_slock);
2221 *sizep = bp - where;
2222 vfs_unbusy(mp); /* drop the busy reference before bailing out */ return (ENOMEM);
2223 }
2224 simple_unlock(&mntvnode_slock);
2225 if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
2226 (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ))) {
2227 vfs_unbusy(mp); /* drop the busy reference before bailing out */ return (error); }
2228 bp += VPTRSZ + VNODESZ;
2229 simple_lock(&mntvnode_slock);
2230 }
2231 simple_unlock(&mntvnode_slock);
2232 simple_lock(&mountlist_slock);
2233 nmp = CIRCLEQ_NEXT(mp, mnt_list);
2234 vfs_unbusy(mp);
2235 }
2236 simple_unlock(&mountlist_slock);
2237 
2238 *sizep = bp - where;
2239 return (0);
2240 }
2241 
2242 /*
2243 * Check to see if a filesystem is mounted on a block device.
2244 */
2245 int
2246 vfs_mountedon(vp)
2247 struct vnode *vp;
2248 {
2249 struct vnode *vq;
2250 int error = 0;
2251 
2252 if (vp->v_specmountpoint != NULL)
2253 return (EBUSY);
2254 if (vp->v_flag & VALIASED) {
2255 simple_lock(&spechash_slock);
2256 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2257 if (vq->v_rdev != vp->v_rdev ||
2258 vq->v_type != vp->v_type)
2259 continue;
2260 if (vq->v_specmountpoint != NULL) {
2261 error = EBUSY;
2262 break;
2263 }
2264 }
2265 simple_unlock(&spechash_slock);
2266 }
2267 return (error);
2268 }
2269 
2270 static int
2271 sacheck(struct sockaddr *sa)
2272 {
2273 switch (sa->sa_family) {
2274 #ifdef INET
2275 case AF_INET: {
2276 struct sockaddr_in *sin = (struct sockaddr_in *)sa;
2277 char *p = (char *)sin->sin_zero;
2278 size_t i;
2279 
2280 if (sin->sin_len != sizeof(*sin))
2281 return -1;
2282 if (sin->sin_port != 0)
2283 return -1;
2284 for (i = 0; i < sizeof(sin->sin_zero); i++)
2285 if (*p++ != '\0')
2286 return -1;
2287 return 0;
2288 }
2289 #endif
2290 #ifdef INET6
2291 case AF_INET6: {
2292 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;
2293 
2294 if (sin6->sin6_len != sizeof(*sin6))
2295 return -1;
2296 if (sin6->sin6_port != 0)
2297 return -1;
2298 return 0;
2299 }
2300 #endif
2301 default:
2302 return -1;
2303 }
2304 }
2305 
2306 /*
2307 * Build hash lists of net addresses and hang them off the mount point.
2308 * Called by ufs_mount() to set up the lists of export addresses.
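 * One radix tree is attached per address family the first time an
 * export for that family is installed (nep->ne_rtable[af]).  An
 * export request with no address at all does not enter a tree;
 * it becomes the default export (MNT_DEFEXPORTED) instead.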
2309 */
2310 static int
2311 vfs_hang_addrlist(mp, nep, argp)
2312 struct mount *mp;
2313 struct netexport *nep;
2314 struct export_args *argp;
2315 {
2316 struct netcred *np, *enp;
2317 struct radix_node_head *rnh;
2318 int i;
2319 struct sockaddr *saddr, *smask = 0;
2320 struct domain *dom;
2321 int error;
2322 
2323 if (argp->ex_addrlen == 0) {
2324 if (mp->mnt_flag & MNT_DEFEXPORTED)
2325 return (EPERM);
2326 np = &nep->ne_defexported;
2327 np->netc_exflags = argp->ex_flags;
2328 crcvt(&np->netc_anon, &argp->ex_anon);
2329 np->netc_anon.cr_ref = 1;
2330 mp->mnt_flag |= MNT_DEFEXPORTED;
2331 return (0);
2332 }
2333 
2334 if (argp->ex_addrlen > MLEN || argp->ex_masklen > MLEN)
2335 return (EINVAL);
2336 
2337 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2338 np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK);
2339 memset((caddr_t)np, 0, i);
2340 saddr = (struct sockaddr *)(np + 1);
2341 error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen);
2342 if (error)
2343 goto out;
2344 if (saddr->sa_len > argp->ex_addrlen)
2345 saddr->sa_len = argp->ex_addrlen;
2346 if (sacheck(saddr) == -1) {
2347 error = EINVAL; goto out; /* np is freed at "out" */ }
2348 if (argp->ex_masklen) {
2349 smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
2350 error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
2351 if (error)
2352 goto out;
2353 if (smask->sa_len > argp->ex_masklen)
2354 smask->sa_len = argp->ex_masklen;
2355 if (smask->sa_family != saddr->sa_family) {
2356 error = EINVAL; goto out; /* np is freed at "out" */ }
2357 if (sacheck(smask) == -1) {
2358 error = EINVAL; goto out; /* np is freed at "out" */ }
2359 }
2360 i = saddr->sa_family;
2361 if ((rnh = nep->ne_rtable[i]) == 0) {
2362 /*
2363 * Seems silly to initialize every AF when most are not
2364 * used; do so on demand here.
2365 */
2366 for (dom = domains; dom; dom = dom->dom_next)
2367 if (dom->dom_family == i && dom->dom_rtattach) {
2368 dom->dom_rtattach((void **)&nep->ne_rtable[i],
2369 dom->dom_rtoffset);
2370 break;
2371 }
2372 if ((rnh = nep->ne_rtable[i]) == 0) {
2373 error = ENOBUFS;
2374 goto out;
2375 }
2376 }
2377 
2378 enp = (struct netcred *)(*rnh->rnh_addaddr)(saddr, smask, rnh,
2379 np->netc_rnodes);
2380 if (enp != np) {
2381 if (enp == NULL) {
2382 enp = (struct netcred *)(*rnh->rnh_lookup)(saddr,
2383 smask, rnh);
2384 if (enp == NULL) {
2385 error = EPERM;
2386 goto out;
2387 }
2388 } else
2389 enp->netc_refcnt++;
2390 
2391 goto check;
2392 } else
2393 enp->netc_refcnt = 1;
2394 
2395 np->netc_exflags = argp->ex_flags;
2396 crcvt(&np->netc_anon, &argp->ex_anon);
2397 np->netc_anon.cr_ref = 1;
2398 return 0;
2399 check:
2400 if (enp->netc_exflags != argp->ex_flags ||
2401 crcmp(&enp->netc_anon, &argp->ex_anon) != 0)
2402 error = EPERM;
2403 else
2404 error = 0;
2405 out:
2406 free(np, M_NETADDR);
2407 return error;
2408 }
2409 
2410 /* ARGSUSED */
2411 static int
2412 vfs_free_netcred(rn, w)
2413 struct radix_node *rn;
2414 void *w;
2415 {
2416 struct radix_node_head *rnh = (struct radix_node_head *)w;
2417 struct netcred *np = (struct netcred *)(void *)rn;
2418 
2419 (*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
2420 if (--(np->netc_refcnt) <= 0)
2421 free(np, M_NETADDR);
2422 return (0);
2423 }
2424 
2425 /*
2426 * Free the net address hash lists that are hanging off the mount points.
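 * Each per-family radix tree is walked with vfs_free_netcred(),
 * which removes every entry and frees it once its reference
 * count drops to zero; the tree head itself is then freed.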
2427 */
2428 static void
2429 vfs_free_addrlist(nep)
2430 struct netexport *nep;
2431 {
2432 int i;
2433 struct radix_node_head *rnh;
2434 
2435 for (i = 0; i <= AF_MAX; i++)
2436 if ((rnh = nep->ne_rtable[i]) != NULL) {
2437 (*rnh->rnh_walktree)(rnh, vfs_free_netcred, rnh);
2438 free((caddr_t)rnh, M_RTABLE);
2439 nep->ne_rtable[i] = 0;
2440 }
2441 }
2442 
2443 int
2444 vfs_export(mp, nep, argp)
2445 struct mount *mp;
2446 struct netexport *nep;
2447 struct export_args *argp;
2448 {
2449 int error;
2450 
2451 if (argp->ex_flags & MNT_DELEXPORT) {
2452 if (mp->mnt_flag & MNT_EXPUBLIC) {
2453 vfs_setpublicfs(NULL, NULL, NULL);
2454 mp->mnt_flag &= ~MNT_EXPUBLIC;
2455 }
2456 vfs_free_addrlist(nep);
2457 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2458 }
2459 if (argp->ex_flags & MNT_EXPORTED) {
2460 if (argp->ex_flags & MNT_EXPUBLIC) {
2461 if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2462 return (error);
2463 mp->mnt_flag |= MNT_EXPUBLIC;
2464 }
2465 if ((error = vfs_hang_addrlist(mp, nep, argp)) != 0)
2466 return (error);
2467 mp->mnt_flag |= MNT_EXPORTED;
2468 }
2469 return (0);
2470 }
2471 
2472 /*
2473 * Set the publicly exported filesystem (WebNFS). Currently, only
2474 * one public filesystem is possible in the spec (RFC 2054 and 2055).
2475 */
2476 int
2477 vfs_setpublicfs(mp, nep, argp)
2478 struct mount *mp;
2479 struct netexport *nep;
2480 struct export_args *argp;
2481 {
2482 int error;
2483 struct vnode *rvp;
2484 char *cp;
2485 
2486 /*
2487 * mp == NULL -> invalidate the current info; the FS is
2488 * no longer exported. May be called from either vfs_export
2489 * or unmount, so check if it hasn't already been done.
2490 */
2491 if (mp == NULL) {
2492 if (nfs_pub.np_valid) {
2493 nfs_pub.np_valid = 0;
2494 if (nfs_pub.np_index != NULL) {
2495 FREE(nfs_pub.np_index, M_TEMP);
2496 nfs_pub.np_index = NULL;
2497 }
2498 }
2499 return (0);
2500 }
2501 
2502 /*
2503 * Only one allowed at a time.
2504 */
2505 if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2506 return (EBUSY);
2507 
2508 /*
2509 * Get real filehandle for root of exported FS.
2510 */
2511 memset((caddr_t)&nfs_pub.np_handle, 0, sizeof(nfs_pub.np_handle));
2512 nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsidx;
2513 
2514 if ((error = VFS_ROOT(mp, &rvp)))
2515 return (error);
2516 
2517 error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid);
2518 vput(rvp); /* rvp is no longer needed, error or not */
2519 if (error)
2520 return (error);
2521 
2522 /*
2523 * If an indexfile was specified, pull it in.
2524 */
2525 if (argp->ex_indexfile != NULL) {
2526 MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2527 M_WAITOK);
2528 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2529 MAXNAMLEN, (size_t *)0);
2530 if (!error) {
2531 /*
2532 * Check for illegal filenames.
2533 */
2534 for (cp = nfs_pub.np_index; *cp; cp++) {
2535 if (*cp == '/') {
2536 error = EINVAL;
2537 break;
2538 }
2539 }
2540 }
2541 if (error) {
2542 FREE(nfs_pub.np_index, M_TEMP); nfs_pub.np_index = NULL;
2543 return (error);
2544 }
2545 }
2546 
2547 nfs_pub.np_mount = mp;
2548 nfs_pub.np_valid = 1;
2549 return (0);
2550 }
2551 
2552 struct netcred *
2553 vfs_export_lookup(mp, nep, nam)
2554 struct mount *mp;
2555 struct netexport *nep;
2556 struct mbuf *nam;
2557 {
2558 struct netcred *np;
2559 struct radix_node_head *rnh;
2560 struct sockaddr *saddr;
2561 
2562 np = NULL;
2563 if (mp->mnt_flag & MNT_EXPORTED) {
2564 /*
2565 * Lookup in the export list first.
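 * A match returned by rnh_matchaddr() that is the radix tree's
 * own root node (RNF_ROOT) is not a real export entry and is
 * rejected; the default export below may still apply.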
2566 */ 2567 if (nam != NULL) { 2568 saddr = mtod(nam, struct sockaddr *); 2569 rnh = nep->ne_rtable[saddr->sa_family]; 2570 if (rnh != NULL) { 2571 np = (struct netcred *) 2572 (*rnh->rnh_matchaddr)((caddr_t)saddr, 2573 rnh); 2574 if (np && np->netc_rnodes->rn_flags & RNF_ROOT) 2575 np = NULL; 2576 } 2577 } 2578 /* 2579 * If no address match, use the default if it exists. 2580 */ 2581 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) 2582 np = &nep->ne_defexported; 2583 } 2584 return (np); 2585 } 2586 2587 /* 2588 * Do the usual access checking. 2589 * file_mode, uid and gid are from the vnode in question, 2590 * while acc_mode and cred are from the VOP_ACCESS parameter list 2591 */ 2592 int 2593 vaccess(type, file_mode, uid, gid, acc_mode, cred) 2594 enum vtype type; 2595 mode_t file_mode; 2596 uid_t uid; 2597 gid_t gid; 2598 mode_t acc_mode; 2599 struct ucred *cred; 2600 { 2601 mode_t mask; 2602 2603 /* 2604 * Super-user always gets read/write access, but execute access depends 2605 * on at least one execute bit being set. 2606 */ 2607 if (cred->cr_uid == 0) { 2608 if ((acc_mode & VEXEC) && type != VDIR && 2609 (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0) 2610 return (EACCES); 2611 return (0); 2612 } 2613 2614 mask = 0; 2615 2616 /* Otherwise, check the owner. */ 2617 if (cred->cr_uid == uid) { 2618 if (acc_mode & VEXEC) 2619 mask |= S_IXUSR; 2620 if (acc_mode & VREAD) 2621 mask |= S_IRUSR; 2622 if (acc_mode & VWRITE) 2623 mask |= S_IWUSR; 2624 return ((file_mode & mask) == mask ? 0 : EACCES); 2625 } 2626 2627 /* Otherwise, check the groups. */ 2628 if (cred->cr_gid == gid || groupmember(gid, cred)) { 2629 if (acc_mode & VEXEC) 2630 mask |= S_IXGRP; 2631 if (acc_mode & VREAD) 2632 mask |= S_IRGRP; 2633 if (acc_mode & VWRITE) 2634 mask |= S_IWGRP; 2635 return ((file_mode & mask) == mask ? 0 : EACCES); 2636 } 2637 2638 /* Otherwise, check everyone else. */ 2639 if (acc_mode & VEXEC) 2640 mask |= S_IXOTH; 2641 if (acc_mode & VREAD) 2642 mask |= S_IROTH; 2643 if (acc_mode & VWRITE) 2644 mask |= S_IWOTH; 2645 return ((file_mode & mask) == mask ? 0 : EACCES); 2646 } 2647 2648 /* 2649 * Unmount all file systems. 2650 * We traverse the list in reverse order under the assumption that doing so 2651 * will avoid needing to worry about dependencies. 2652 */ 2653 void 2654 vfs_unmountall(p) 2655 struct proc *p; 2656 { 2657 struct mount *mp, *nmp; 2658 int allerror, error; 2659 2660 printf("unmounting file systems..."); 2661 for (allerror = 0, 2662 mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { 2663 nmp = mp->mnt_list.cqe_prev; 2664 #ifdef DEBUG 2665 printf("\nunmounting %s (%s)...", 2666 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname); 2667 #endif 2668 /* 2669 * XXX Freeze syncer. Must do this before locking the 2670 * mount point. See dounmount() for details. 2671 */ 2672 lockmgr(&syncer_lock, LK_EXCLUSIVE, NULL); 2673 if (vfs_busy(mp, 0, 0)) { 2674 lockmgr(&syncer_lock, LK_RELEASE, NULL); 2675 continue; 2676 } 2677 if ((error = dounmount(mp, MNT_FORCE, p)) != 0) { 2678 printf("unmount of %s failed with error %d\n", 2679 mp->mnt_stat.f_mntonname, error); 2680 allerror = 1; 2681 } 2682 } 2683 printf(" done\n"); 2684 if (allerror) 2685 printf("WARNING: some file systems would not unmount\n"); 2686 } 2687 2688 extern struct simplelock bqueue_slock; /* XXX */ 2689 2690 /* 2691 * Sync and unmount file systems before shutting down. 
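 * The sequence is: park other processes (suspendsched), push all
 * dirty buffers out with sys_sync() and wait for the writes to
 * drain (buf_syncwait), then force-unmount everything, unless we
 * got here via panic, in which case unmounting could make matters
 * worse.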
2692 */ 2693 void 2694 vfs_shutdown() 2695 { 2696 struct lwp *l = curlwp; 2697 struct proc *p; 2698 2699 /* XXX we're certainly not running in proc0's context! */ 2700 if (l == NULL || (p = l->l_proc) == NULL) 2701 p = &proc0; 2702 2703 printf("syncing disks... "); 2704 2705 /* remove user process from run queue */ 2706 suspendsched(); 2707 (void) spl0(); 2708 2709 /* avoid coming back this way again if we panic. */ 2710 doing_shutdown = 1; 2711 2712 sys_sync(l, NULL, NULL); 2713 2714 /* Wait for sync to finish. */ 2715 if (buf_syncwait() != 0) { 2716 #if defined(DDB) && defined(DEBUG_HALT_BUSY) 2717 Debugger(); 2718 #endif 2719 printf("giving up\n"); 2720 return; 2721 } else 2722 printf("done\n"); 2723 2724 /* 2725 * If we've panic'd, don't make the situation potentially 2726 * worse by unmounting the file systems. 2727 */ 2728 if (panicstr != NULL) 2729 return; 2730 2731 /* Release inodes held by texts before update. */ 2732 #ifdef notdef 2733 vnshutdown(); 2734 #endif 2735 /* Unmount file systems. */ 2736 vfs_unmountall(p); 2737 } 2738 2739 /* 2740 * Mount the root file system. If the operator didn't specify a 2741 * file system to use, try all possible file systems until one 2742 * succeeds. 2743 */ 2744 int 2745 vfs_mountroot() 2746 { 2747 struct vfsops *v; 2748 2749 if (root_device == NULL) 2750 panic("vfs_mountroot: root device unknown"); 2751 2752 switch (root_device->dv_class) { 2753 case DV_IFNET: 2754 if (rootdev != NODEV) 2755 panic("vfs_mountroot: rootdev set for DV_IFNET " 2756 "(0x%08x -> %d,%d)", rootdev, 2757 major(rootdev), minor(rootdev)); 2758 break; 2759 2760 case DV_DISK: 2761 if (rootdev == NODEV) 2762 panic("vfs_mountroot: rootdev not set for DV_DISK"); 2763 break; 2764 2765 default: 2766 printf("%s: inappropriate for root file system\n", 2767 root_device->dv_xname); 2768 return (ENODEV); 2769 } 2770 2771 /* 2772 * If user specified a file system, use it. 2773 */ 2774 if (mountroot != NULL) 2775 return ((*mountroot)()); 2776 2777 /* 2778 * Try each file system currently configured into the kernel. 2779 */ 2780 LIST_FOREACH(v, &vfs_list, vfs_list) { 2781 if (v->vfs_mountroot == NULL) 2782 continue; 2783 #ifdef DEBUG 2784 aprint_normal("mountroot: trying %s...\n", v->vfs_name); 2785 #endif 2786 if ((*v->vfs_mountroot)() == 0) { 2787 aprint_normal("root file system type: %s\n", 2788 v->vfs_name); 2789 break; 2790 } 2791 } 2792 2793 if (v == NULL) { 2794 printf("no file system for %s", root_device->dv_xname); 2795 if (root_device->dv_class == DV_DISK) 2796 printf(" (dev 0x%x)", rootdev); 2797 printf("\n"); 2798 return (EFTYPE); 2799 } 2800 return (0); 2801 } 2802 2803 /* 2804 * Given a file system name, look up the vfsops for that 2805 * file system, or return NULL if file system isn't present 2806 * in the kernel. 2807 */ 2808 struct vfsops * 2809 vfs_getopsbyname(name) 2810 const char *name; 2811 { 2812 struct vfsops *v; 2813 2814 LIST_FOREACH(v, &vfs_list, vfs_list) { 2815 if (strcmp(v->vfs_name, name) == 0) 2816 break; 2817 } 2818 2819 return (v); 2820 } 2821 2822 /* 2823 * Establish a file system and initialize it. 2824 */ 2825 int 2826 vfs_attach(vfs) 2827 struct vfsops *vfs; 2828 { 2829 struct vfsops *v; 2830 int error = 0; 2831 2832 2833 /* 2834 * Make sure this file system doesn't already exist. 2835 */ 2836 LIST_FOREACH(v, &vfs_list, vfs_list) { 2837 if (strcmp(vfs->vfs_name, v->vfs_name) == 0) { 2838 error = EEXIST; 2839 goto out; 2840 } 2841 } 2842 2843 /* 2844 * Initialize the vnode operations for this file system. 
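 * vfs_opv_init() fills in the vnode operations vector(s) described
 * by vfs_opv_descs, so that VOP_*() calls can be dispatched to
 * vnodes of this file system type.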
2845 */ 2846 vfs_opv_init(vfs->vfs_opv_descs); 2847 2848 /* 2849 * Now initialize the file system itself. 2850 */ 2851 (*vfs->vfs_init)(); 2852 2853 /* 2854 * ...and link it into the kernel's list. 2855 */ 2856 LIST_INSERT_HEAD(&vfs_list, vfs, vfs_list); 2857 2858 /* 2859 * Sanity: make sure the reference count is 0. 2860 */ 2861 vfs->vfs_refcount = 0; 2862 2863 out: 2864 return (error); 2865 } 2866 2867 /* 2868 * Remove a file system from the kernel. 2869 */ 2870 int 2871 vfs_detach(vfs) 2872 struct vfsops *vfs; 2873 { 2874 struct vfsops *v; 2875 2876 /* 2877 * Make sure no one is using the filesystem. 2878 */ 2879 if (vfs->vfs_refcount != 0) 2880 return (EBUSY); 2881 2882 /* 2883 * ...and remove it from the kernel's list. 2884 */ 2885 LIST_FOREACH(v, &vfs_list, vfs_list) { 2886 if (v == vfs) { 2887 LIST_REMOVE(v, vfs_list); 2888 break; 2889 } 2890 } 2891 2892 if (v == NULL) 2893 return (ESRCH); 2894 2895 /* 2896 * Now run the file system-specific cleanups. 2897 */ 2898 (*vfs->vfs_done)(); 2899 2900 /* 2901 * Free the vnode operations vector. 2902 */ 2903 vfs_opv_free(vfs->vfs_opv_descs); 2904 return (0); 2905 } 2906 2907 void 2908 vfs_reinit(void) 2909 { 2910 struct vfsops *vfs; 2911 2912 LIST_FOREACH(vfs, &vfs_list, vfs_list) { 2913 if (vfs->vfs_reinit) { 2914 (*vfs->vfs_reinit)(); 2915 } 2916 } 2917 } 2918 2919 /* 2920 * Request a filesystem to suspend write operations. 2921 */ 2922 int 2923 vfs_write_suspend(struct mount *mp, int slpflag, int slptimeo) 2924 { 2925 struct proc *p = curproc; /* XXX */ 2926 int error; 2927 2928 while ((mp->mnt_iflag & IMNT_SUSPEND)) { 2929 if (slptimeo < 0) 2930 return EWOULDBLOCK; 2931 error = tsleep(&mp->mnt_flag, slpflag, "suspwt1", slptimeo); 2932 if (error) 2933 return error; 2934 } 2935 mp->mnt_iflag |= IMNT_SUSPEND; 2936 2937 simple_lock(&mp->mnt_slock); 2938 if (mp->mnt_writeopcountupper > 0) 2939 ltsleep(&mp->mnt_writeopcountupper, PUSER - 1, "suspwt", 2940 0, &mp->mnt_slock); 2941 simple_unlock(&mp->mnt_slock); 2942 2943 error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p); 2944 if (error) { 2945 vfs_write_resume(mp); 2946 return error; 2947 } 2948 mp->mnt_iflag |= IMNT_SUSPENDLOW; 2949 2950 simple_lock(&mp->mnt_slock); 2951 if (mp->mnt_writeopcountlower > 0) 2952 ltsleep(&mp->mnt_writeopcountlower, PUSER - 1, "suspwt", 2953 0, &mp->mnt_slock); 2954 mp->mnt_iflag |= IMNT_SUSPENDED; 2955 simple_unlock(&mp->mnt_slock); 2956 2957 return 0; 2958 } 2959 2960 /* 2961 * Request a filesystem to resume write operations. 
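 * This clears all of the IMNT_SUSPEND* state set by
 * vfs_write_suspend() and wakes any writer sleeping on mnt_flag
 * in that routine.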
2962 */ 2963 void 2964 vfs_write_resume(struct mount *mp) 2965 { 2966 2967 if ((mp->mnt_iflag & IMNT_SUSPEND) == 0) 2968 return; 2969 mp->mnt_iflag &= ~(IMNT_SUSPEND | IMNT_SUSPENDLOW | IMNT_SUSPENDED); 2970 wakeup(&mp->mnt_flag); 2971 } 2972 2973 void 2974 copy_statvfs_info(struct statvfs *sbp, const struct mount *mp) 2975 { 2976 const struct statvfs *mbp; 2977 2978 if (sbp == (mbp = &mp->mnt_stat)) 2979 return; 2980 2981 (void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx)); 2982 sbp->f_fsid = mbp->f_fsid; 2983 sbp->f_owner = mbp->f_owner; 2984 sbp->f_flag = mbp->f_flag; 2985 sbp->f_syncwrites = mbp->f_syncwrites; 2986 sbp->f_asyncwrites = mbp->f_asyncwrites; 2987 sbp->f_syncreads = mbp->f_syncreads; 2988 sbp->f_asyncreads = mbp->f_asyncreads; 2989 (void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare)); 2990 (void)memcpy(sbp->f_fstypename, mbp->f_fstypename, 2991 sizeof(sbp->f_fstypename)); 2992 (void)memcpy(sbp->f_mntonname, mbp->f_mntonname, 2993 sizeof(sbp->f_mntonname)); 2994 (void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname, 2995 sizeof(sbp->f_mntfromname)); 2996 sbp->f_namemax = mbp->f_namemax; 2997 } 2998 2999 int 3000 set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom, 3001 struct mount *mp, struct proc *p) 3002 { 3003 int error; 3004 size_t size; 3005 struct statvfs *sfs = &mp->mnt_stat; 3006 int (*fun)(const void *, void *, size_t, size_t *); 3007 3008 (void)strncpy(mp->mnt_stat.f_fstypename, mp->mnt_op->vfs_name, 3009 sizeof(mp->mnt_stat.f_fstypename)); 3010 3011 if (onp) { 3012 struct cwdinfo *cwdi = p->p_cwdi; 3013 fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr; 3014 if (cwdi->cwdi_rdir != NULL) { 3015 size_t len; 3016 char *bp; 3017 char *path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); 3018 3019 if (!path) /* XXX can't happen with M_WAITOK */ 3020 return ENOMEM; 3021 3022 bp = path + MAXPATHLEN; 3023 *--bp = '\0'; 3024 error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, 3025 path, MAXPATHLEN / 2, 0, p); 3026 if (error) { 3027 free(path, M_TEMP); 3028 return error; 3029 } 3030 3031 len = strlen(bp); 3032 if (len > sizeof(sfs->f_mntonname) - 1) 3033 len = sizeof(sfs->f_mntonname) - 1; 3034 (void)strncpy(sfs->f_mntonname, bp, len); 3035 free(path, M_TEMP); 3036 3037 if (len < sizeof(sfs->f_mntonname) - 1) { 3038 error = (*fun)(onp, &sfs->f_mntonname[len], 3039 sizeof(sfs->f_mntonname) - len - 1, &size); 3040 if (error) 3041 return error; 3042 size += len; 3043 } else { 3044 size = len; 3045 } 3046 } else { 3047 error = (*fun)(onp, &sfs->f_mntonname, 3048 sizeof(sfs->f_mntonname) - 1, &size); 3049 if (error) 3050 return error; 3051 } 3052 (void)memset(sfs->f_mntonname + size, 0, 3053 sizeof(sfs->f_mntonname) - size); 3054 } 3055 3056 if (fromp) { 3057 fun = (ukfrom == UIO_SYSSPACE) ? 
copystr : copyinstr; 3058 error = (*fun)(fromp, sfs->f_mntfromname, 3059 sizeof(sfs->f_mntfromname) - 1, &size); 3060 if (error) 3061 return error; 3062 (void)memset(sfs->f_mntfromname + size, 0, 3063 sizeof(sfs->f_mntfromname) - size); 3064 } 3065 return 0; 3066 } 3067 3068 #ifdef DDB 3069 const char buf_flagbits[] = 3070 "\20\1AGE\2NEEDCOMMIT\3ASYNC\4BAD\5BUSY\6SCANNED\7CALL\10DELWRI" 3071 "\11DIRTY\12DONE\13EINTR\14ERROR\15GATHERED\16INVAL\17LOCKED\20NOCACHE" 3072 "\21ORDERED\22CACHE\23PHYS\24RAW\25READ\26TAPE\30WANTED" 3073 "\32XXX\33VFLUSH"; 3074 3075 void 3076 vfs_buf_print(bp, full, pr) 3077 struct buf *bp; 3078 int full; 3079 void (*pr)(const char *, ...); 3080 { 3081 char buf[1024]; 3082 3083 (*pr)(" vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" dev 0x%x\n", 3084 bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_dev); 3085 3086 bitmask_snprintf(bp->b_flags, buf_flagbits, buf, sizeof(buf)); 3087 (*pr)(" error %d flags 0x%s\n", bp->b_error, buf); 3088 3089 (*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n", 3090 bp->b_bufsize, bp->b_bcount, bp->b_resid); 3091 (*pr)(" data %p saveaddr %p dep %p\n", 3092 bp->b_data, bp->b_saveaddr, LIST_FIRST(&bp->b_dep)); 3093 (*pr)(" iodone %p\n", bp->b_iodone); 3094 } 3095 3096 3097 const char vnode_flagbits[] = 3098 "\20\1ROOT\2TEXT\3SYSTEM\4ISTTY\5EXECMAP" 3099 "\11XLOCK\12XWANT\13BWAIT\14ALIASED" 3100 "\15DIROP\16LAYER\17ONWORKLIST\20DIRTY"; 3101 3102 const char * const vnode_tags[] = { 3103 "VT_NON", 3104 "VT_UFS", 3105 "VT_NFS", 3106 "VT_MFS", 3107 "VT_MSDOSFS", 3108 "VT_LFS", 3109 "VT_LOFS", 3110 "VT_FDESC", 3111 "VT_PORTAL", 3112 "VT_NULL", 3113 "VT_UMAP", 3114 "VT_KERNFS", 3115 "VT_PROCFS", 3116 "VT_AFS", 3117 "VT_ISOFS", 3118 "VT_UNION", 3119 "VT_ADOSFS", 3120 "VT_EXT2FS", 3121 "VT_CODA", 3122 "VT_FILECORE", 3123 "VT_NTFS", 3124 "VT_VFS", 3125 "VT_OVERLAY", 3126 "VT_SMBFS" 3127 }; 3128 3129 void 3130 vfs_vnode_print(vp, full, pr) 3131 struct vnode *vp; 3132 int full; 3133 void (*pr)(const char *, ...); 3134 { 3135 char buf[256]; 3136 const char *vtype, *vtag; 3137 3138 uvm_object_printit(&vp->v_uobj, full, pr); 3139 bitmask_snprintf(vp->v_flag, vnode_flagbits, buf, sizeof(buf)); 3140 (*pr)("\nVNODE flags %s\n", buf); 3141 (*pr)("mp %p numoutput %d size 0x%llx\n", 3142 vp->v_mount, vp->v_numoutput, vp->v_size); 3143 3144 (*pr)("data %p usecount %d writecount %ld holdcnt %ld numoutput %d\n", 3145 vp->v_data, vp->v_usecount, vp->v_writecount, 3146 vp->v_holdcnt, vp->v_numoutput); 3147 3148 vtype = (vp->v_type >= 0 && 3149 vp->v_type < sizeof(vnode_types) / sizeof(vnode_types[0])) ? 3150 vnode_types[vp->v_type] : "UNKNOWN"; 3151 vtag = (vp->v_tag >= 0 && 3152 vp->v_tag < sizeof(vnode_tags) / sizeof(vnode_tags[0])) ? 
3153 vnode_tags[vp->v_tag] : "UNKNOWN";
3154 
3155 (*pr)("type %s(%d) tag %s(%d) mount %p typedata %p\n",
3156 vtype, vp->v_type, vtag, vp->v_tag,
3157 vp->v_mount, vp->v_mountedhere);
3158 
3159 if (full) {
3160 struct buf *bp;
3161 
3162 (*pr)("clean bufs:\n");
3163 LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
3164 (*pr)(" bp %p\n", bp);
3165 vfs_buf_print(bp, full, pr);
3166 }
3167 
3168 (*pr)("dirty bufs:\n");
3169 LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
3170 (*pr)(" bp %p\n", bp);
3171 vfs_buf_print(bp, full, pr);
3172 }
3173 }
3174 }
3175 
3176 void
3177 vfs_mount_print(mp, full, pr)
3178 struct mount *mp;
3179 int full;
3180 void (*pr)(const char *, ...);
3181 {
3182 char sbuf[256];
3183 
3184 (*pr)("vnodecovered = %p syncer = %p data = %p\n",
3185 mp->mnt_vnodecovered,mp->mnt_syncer,mp->mnt_data);
3186 
3187 (*pr)("fs_bshift %d dev_bshift = %d\n",
3188 mp->mnt_fs_bshift,mp->mnt_dev_bshift);
3189 
3190 bitmask_snprintf(mp->mnt_flag, __MNT_FLAG_BITS, sbuf, sizeof(sbuf));
3191 (*pr)("flag = %s\n", sbuf);
3192 
3193 bitmask_snprintf(mp->mnt_iflag, __IMNT_FLAG_BITS, sbuf, sizeof(sbuf));
3194 (*pr)("iflag = %s\n", sbuf);
3195 
3196 /* XXX use lockmgr_printinfo */
3197 if (mp->mnt_lock.lk_sharecount)
3198 (*pr)(" lock type %s: SHARED (count %d)", mp->mnt_lock.lk_wmesg,
3199 mp->mnt_lock.lk_sharecount);
3200 else if (mp->mnt_lock.lk_flags & LK_HAVE_EXCL) {
3201 (*pr)(" lock type %s: EXCL (count %d) by ",
3202 mp->mnt_lock.lk_wmesg, mp->mnt_lock.lk_exclusivecount);
3203 if (mp->mnt_lock.lk_flags & LK_SPIN)
3204 (*pr)("processor %lu", mp->mnt_lock.lk_cpu);
3205 else
3206 (*pr)("pid %d.%d", mp->mnt_lock.lk_lockholder,
3207 mp->mnt_lock.lk_locklwp);
3208 } else
3209 (*pr)(" not locked");
3210 if ((mp->mnt_lock.lk_flags & LK_SPIN) == 0 && mp->mnt_lock.lk_waitcount > 0)
3211 (*pr)(" with %d pending", mp->mnt_lock.lk_waitcount);
3212 
3213 (*pr)("\n");
3214 
3215 if (mp->mnt_unmounter) {
3216 (*pr)("unmounter pid = %d ",mp->mnt_unmounter->p_pid);
3217 }
3218 (*pr)("wcnt = %d, writeopcountupper = %d, writeopcountlower = %d\n",
3219 mp->mnt_wcnt,mp->mnt_writeopcountupper,mp->mnt_writeopcountlower);
3220 
3221 (*pr)("statvfs cache:\n");
3222 (*pr)("\tbsize = %lu\n",mp->mnt_stat.f_bsize);
3223 (*pr)("\tfrsize = %lu\n",mp->mnt_stat.f_frsize);
3224 (*pr)("\tiosize = %lu\n",mp->mnt_stat.f_iosize);
3225 
3226 (*pr)("\tblocks = %" PRIu64 "\n",mp->mnt_stat.f_blocks);
3227 (*pr)("\tbfree = %" PRIu64 "\n",mp->mnt_stat.f_bfree);
3228 (*pr)("\tbavail = %" PRIu64 "\n",mp->mnt_stat.f_bavail);
3229 (*pr)("\tbresvd = %" PRIu64 "\n",mp->mnt_stat.f_bresvd);
3230 
3231 (*pr)("\tfiles = %" PRIu64 "\n",mp->mnt_stat.f_files);
3232 (*pr)("\tffree = %" PRIu64 "\n",mp->mnt_stat.f_ffree);
3233 (*pr)("\tfavail = %" PRIu64 "\n",mp->mnt_stat.f_favail);
3234 (*pr)("\tfresvd = %" PRIu64 "\n",mp->mnt_stat.f_fresvd);
3235 
3236 (*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n",
3237 mp->mnt_stat.f_fsidx.__fsid_val[0],
3238 mp->mnt_stat.f_fsidx.__fsid_val[1]);
3239 
3240 (*pr)("\towner = %"PRIu32"\n",mp->mnt_stat.f_owner);
3241 (*pr)("\tnamemax = %lu\n",mp->mnt_stat.f_namemax);
3242 
3243 bitmask_snprintf(mp->mnt_stat.f_flag, __MNT_FLAG_BITS, sbuf,
3244 sizeof(sbuf));
3245 (*pr)("\tflag = %s\n",sbuf);
3246 (*pr)("\tsyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_syncwrites);
3247 (*pr)("\tasyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_asyncwrites);
3248 (*pr)("\tsyncreads = %" PRIu64 "\n",mp->mnt_stat.f_syncreads);
3249 (*pr)("\tasyncreads = %" PRIu64 "\n",mp->mnt_stat.f_asyncreads);
3250 (*pr)("\tfstypename = %s\n",mp->mnt_stat.f_fstypename);
3251 (*pr)("\tmntonname = %s\n",mp->mnt_stat.f_mntonname);
3252 (*pr)("\tmntfromname = %s\n",mp->mnt_stat.f_mntfromname);
3253 
3254 {
3255 int cnt = 0;
3256 struct vnode *vp;
3257 (*pr)("locked vnodes =");
3258 /* XXX would take mountlist lock, except ddb may not have context */
3259 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
3260 if (VOP_ISLOCKED(vp)) {
3261 if ((++cnt % 6) == 0) {
3262 (*pr)(" %p,\n\t", vp);
3263 } else {
3264 (*pr)(" %p,", vp);
3265 }
3266 }
3267 }
3268 (*pr)("\n");
3269 }
3270 
3271 if (full) {
3272 int cnt = 0;
3273 struct vnode *vp;
3274 (*pr)("all vnodes =");
3275 /* XXX would take mountlist lock, except ddb may not have context */
3276 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
3277 if (!LIST_NEXT(vp, v_mntvnodes)) {
3278 (*pr)(" %p", vp);
3279 } else if ((++cnt % 6) == 0) {
3280 (*pr)(" %p,\n\t", vp);
3281 } else {
3282 (*pr)(" %p,", vp);
3283 }
3284 }
3285 (*pr)("\n");
3286 }
3287 }
3288 
3289 #endif
3290 