/*	$NetBSD: vfs_subr.c,v 1.226 2004/05/25 04:44:44 atatat Exp $	*/

/*-
 * Copyright (c) 1997, 1998 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * External virtual filesystem routines
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.226 2004/05/25 04:44:44 atatat Exp $");

#include "opt_inet.h"
#include "opt_ddb.h"
#include "opt_compat_netbsd.h"
#include "opt_compat_43.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/time.h>
#include <sys/event.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/sa.h>
#include <sys/syscallargs.h>
#include <sys/device.h>
#include <sys/dirent.h>
#include <sys/filedesc.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/syncfs/syncfs.h>

#include <netinet/in.h>

#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>

#include <sys/sysctl.h>

const enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
const int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

int doforce = 1;		/* 1 => permit forcible unmounting */
int prtactive = 0;		/* 1 => print out reclaim of active vnodes */

extern int dovfsusermount;	/* 1 => permit any user to mount filesystems */

/*
 * Insq/Remq for the vnode usage lists.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) {							\
	LIST_REMOVE(bp, b_vnbufs);					\
	(bp)->b_vnbufs.le_next = NOLIST;				\
}
/* TAILQ_HEAD(freelst, vnode) vnode_free_list = vnode free list (in vnode.h) */
struct freelst vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
struct freelst vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);

struct mntlist mountlist =			/* mounted filesystem list */
    CIRCLEQ_HEAD_INITIALIZER(mountlist);
struct vfs_list_head vfs_list =			/* vfs list */
    LIST_HEAD_INITIALIZER(vfs_list);

struct nfs_public nfs_pub;		/* publicly exported FS */

struct simplelock mountlist_slock = SIMPLELOCK_INITIALIZER;
static struct simplelock mntid_slock = SIMPLELOCK_INITIALIZER;
struct simplelock mntvnode_slock = SIMPLELOCK_INITIALIZER;
struct simplelock vnode_free_list_slock = SIMPLELOCK_INITIALIZER;
struct simplelock spechash_slock = SIMPLELOCK_INITIALIZER;

/* XXX - gross; single global lock to protect v_numoutput */
struct simplelock global_v_numoutput_slock = SIMPLELOCK_INITIALIZER;

/*
 * These define the root filesystem and device.
 */
struct mount *rootfs;
struct vnode *rootvnode;
struct device *root_device;		/* root device */

POOL_INIT(vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodepl",
    &pool_allocator_nointr);

MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");

/*
 * Local declarations.
 */
void insmntque(struct vnode *, struct mount *);
int getdevvp(dev_t, struct vnode **, enum vtype);
void vgoneall(struct vnode *);

void vclean(struct vnode *, int, struct proc *);

static int vfs_hang_addrlist(struct mount *, struct netexport *,
    struct export_args *);
static int vfs_free_netcred(struct radix_node *, void *);
static void vfs_free_addrlist(struct netexport *);
static struct vnode *getcleanvnode(struct proc *);

#ifdef DEBUG
void printlockedvnodes(void);
#endif

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	/*
	 * Initialize the filesystem syncer.
	 */
	vn_initialize_syncerd();
}

int
vfs_drainvnodes(long target, struct proc *p)
{

	simple_lock(&vnode_free_list_slock);
	while (numvnodes > target) {
		struct vnode *vp;

		vp = getcleanvnode(p);
		if (vp == NULL)
			return EBUSY; /* give up */
		pool_put(&vnode_pool, vp);
		simple_lock(&vnode_free_list_slock);
		numvnodes--;
	}
	simple_unlock(&vnode_free_list_slock);

	return 0;
}

/*
 * Grab a vnode from the freelist and clean it.
 */
struct vnode *
getcleanvnode(p)
	struct proc *p;
{
	struct vnode *vp;
	struct mount *mp;
	struct freelst *listhd;

	LOCK_ASSERT(simple_lock_held(&vnode_free_list_slock));
	if ((vp = TAILQ_FIRST(listhd = &vnode_free_list)) == NULL)
		vp = TAILQ_FIRST(listhd = &vnode_hold_list);
	for (; vp != NULL; vp = TAILQ_NEXT(vp, v_freelist)) {
		if (!simple_lock_try(&vp->v_interlock))
			continue;
		if ((vp->v_flag & VLAYER) == 0) {
			if (vn_start_write(vp, &mp, V_NOWAIT) == 0)
				break;
		} else if (VOP_ISLOCKED(vp) == 0) {
			if (vn_start_write(vp, &mp, V_NOWAIT) == 0)
				break;
		}
		mp = NULL;
		simple_unlock(&vp->v_interlock);
	}

	if (vp == NULLVP) {
		simple_unlock(&vnode_free_list_slock);
		return NULLVP;
	}

	if (vp->v_usecount)
		panic("free vnode isn't, vp %p", vp);
	TAILQ_REMOVE(listhd, vp, v_freelist);
	/* see comment on why 0xdeadb is set at end of vgone (below) */
	vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
	simple_unlock(&vnode_free_list_slock);
	vp->v_lease = NULL;

	if (vp->v_type != VBAD)
		vgonel(vp, p);
	else
		simple_unlock(&vp->v_interlock);
	vn_finished_write(mp, 0);
#ifdef DIAGNOSTIC
	if (vp->v_data || vp->v_uobj.uo_npages ||
	    TAILQ_FIRST(&vp->v_uobj.memq))
		panic("cleaned vnode isn't, vp %p", vp);
	if (vp->v_numoutput)
		panic("clean vnode has pending I/O's, vp %p", vp);
#endif
	KASSERT((vp->v_flag & VONWORKLST) == 0);

	return vp;
}

/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp)
	struct mount *mp;
	int flags;
	struct simplelock *interlkp;
{
	int lkflags;

	while (mp->mnt_iflag & IMNT_UNMOUNT) {
		int gone, n;

		if (flags & LK_NOWAIT)
			return (ENOENT);
		if ((flags & LK_RECURSEFAIL) && mp->mnt_unmounter != NULL
		    && mp->mnt_unmounter == curproc)
			return (EDEADLK);
		if (interlkp)
			simple_unlock(interlkp);
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		simple_lock(&mp->mnt_slock);
		mp->mnt_wcnt++;
		ltsleep((caddr_t)mp, PVFS, "vfs_busy", 0, &mp->mnt_slock);
		n = --mp->mnt_wcnt;
		simple_unlock(&mp->mnt_slock);
		gone = mp->mnt_iflag & IMNT_GONE;

		if (n == 0)
			wakeup(&mp->mnt_wcnt);
		if (interlkp)
			simple_lock(interlkp);
		if (gone)
			return (ENOENT);
	}
	lkflags = LK_SHARED;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp)
	struct mount *mp;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct vfsops *vfsp = NULL;
	struct mount *mp;

	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
		if (!strncmp(vfsp->vfs_name, fstypename, MFSNAMELEN))
			break;

	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	memset((char *)mp, 0, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
	simple_lock_init(&mp->mnt_slock);
	(void)vfs_busy(mp, LK_NOWAIT, 0);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_op = vfsp;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfs_refcount++;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}
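/*
 * Example: a minimal sketch of how a caller typically brackets work on a
 * mount point with vfs_busy()/vfs_unbusy().  This fragment is illustrative
 * and not part of the original file; "example_scan_mount" is a hypothetical
 * helper and "mp" is assumed to come from a mountlist walk like the ones
 * later in this file.
 */
#if 0
static void
example_scan_mount(struct mount *mp)
{

	/* Fail rather than sleep if an unmount is in progress. */
	if (vfs_busy(mp, LK_NOWAIT, NULL) != 0)
		return;
	/* ... safely walk mp->mnt_vnodelist here ... */
	vfs_unbusy(mp);
}
#endif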
/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	struct mount *mp;

	simple_lock(&mountlist_slock);
	CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
		    mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *)0);
}

/*
 * Get a new unique fsid
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_short xxxfs_mntid;
	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = makefstype(mp->mnt_op->vfs_name);
	mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0);
	mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype;
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
	tfsid.__fsid_val[1] = mtype;
	if (!CIRCLEQ_EMPTY(&mountlist)) {
		while (vfs_getvfs(&tfsid)) {
			tfsid.__fsid_val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	simple_unlock(&mntid_slock);
}

/*
 * Make a 'unique' number from a mount type name.
 */
long
makefstype(type)
	const char *type;
{
	long rv;

	for (rv = 0; *type; type++) {
		rv <<= 2;
		rv ^= *type;
	}
	return rv;
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	struct vattr *vap;
{

	vap->va_type = VNON;

	/*
	 * Assign individually so that it is safe even if the size and
	 * sign of each member vary.
	 */
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_size = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_atime.tv_sec =
	    vap->va_mtime.tv_sec =
	    vap->va_ctime.tv_sec =
	    vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec =
	    vap->va_mtime.tv_nsec =
	    vap->va_ctime.tv_nsec =
	    vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern int (**dead_vnodeop_p)(void *);
long numvnodes;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	int (**vops)(void *);
	struct vnode **vpp;
{
	extern struct uvm_pagerops uvm_vnodeops;
	struct uvm_object *uobj;
	struct proc *p = curproc;	/* XXX */
	static int toggle;
	struct vnode *vp;
	int error = 0, tryalloc;

 try_again:
	if (mp) {
		/*
		 * Mark filesystem busy while we're creating a vnode.
		 * If unmount is in progress, this will wait; if the
		 * unmount succeeds (only if umount -f), this will
		 * return an error.  If the unmount fails, we'll keep
		 * going afterwards.
		 * (This puts the per-mount vnode list logically under
		 * the protection of the vfs_busy lock).
		 */
		error = vfs_busy(mp, LK_RECURSEFAIL, 0);
		if (error && error != EDEADLK)
			return error;
	}

	/*
	 * We must choose whether to allocate a new vnode or recycle an
	 * existing one.  The criterion for allocating a new one is that
	 * the total number of vnodes is less than the number desired or
	 * there are no vnodes on either free list.  Generally we only
	 * want to recycle vnodes that have no buffers associated with
	 * them, so we look first on the vnode_free_list.  If it is empty,
	 * we next consider vnodes with referencing buffers on the
	 * vnode_hold_list.  The toggle ensures that half the time we
	 * will use a buffer from the vnode_hold_list, and half the time
	 * we will allocate a new one unless the list has grown to twice
	 * the desired size.  We are reluctant to recycle vnodes from the
	 * vnode_hold_list because we will lose the identity of all its
	 * referencing buffers.
	 */

	vp = NULL;

	simple_lock(&vnode_free_list_slock);

	toggle ^= 1;
	if (numvnodes > 2 * desiredvnodes)
		toggle = 0;

	tryalloc = numvnodes < desiredvnodes ||
	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
	    (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));

	if (tryalloc &&
	    (vp = pool_get(&vnode_pool, PR_NOWAIT)) != NULL) {
		numvnodes++;
		simple_unlock(&vnode_free_list_slock);
		memset(vp, 0, sizeof(*vp));
		simple_lock_init(&vp->v_interlock);
		uobj = &vp->v_uobj;
		uobj->pgops = &uvm_vnodeops;
		TAILQ_INIT(&uobj->memq);
		/*
		 * done by memset() above.
		 *	uobj->uo_npages = 0;
		 *	LIST_INIT(&vp->v_nclist);
		 *	LIST_INIT(&vp->v_dnclist);
		 */
	} else {
		vp = getcleanvnode(p);
		/*
		 * Unless this is a bad time of the month, at most
		 * the first NCPUS items on the free list are
		 * locked, so this is close enough to being empty.
		 */
		if (vp == NULLVP) {
			if (mp && error != EDEADLK)
				vfs_unbusy(mp);
			if (tryalloc) {
				printf("WARNING: unable to allocate new "
				    "vnode, retrying...\n");
				(void) tsleep(&lbolt, PRIBIO, "newvn", hz);
				goto try_again;
			}
			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
			*vpp = 0;
			return (ENFILE);
		}
		vp->v_flag = 0;
		vp->v_socket = NULL;
#ifdef VERIFIED_EXEC
		vp->fp_status = FINGERPRINT_INVALID;
#endif
	}
	vp->v_type = VNON;
	vp->v_vnlock = &vp->v_lock;
	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	simple_lock_init(&vp->v_uobj.vmobjlock);

	/*
	 * initialize uvm_object within vnode.
	 */

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = VSIZENOTSET;

	if (mp && error != EDEADLK)
		vfs_unbusy(mp);
	return (0);
}
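/*
 * Example: a sketch of the getnewvnode()/ungetnewvnode() pattern used by
 * VFS_VGET implementations.  Illustrative only: "myfs_vnodeop_p", the
 * race test, and the EEXIST return are hypothetical stand-ins for a real
 * filesystem's hash lookup code.
 */
#if 0
extern int (**myfs_vnodeop_p)(void *);

static int
example_vget(struct mount *mp, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	error = getnewvnode(VT_NON, mp, myfs_vnodeop_p, &vp);
	if (error)
		return error;
	/*
	 * If another thread raced us and already entered an equivalent
	 * vnode in the filesystem's hash, push this one back.
	 */
	if (0 /* lost the race? (hypothetical test) */) {
		ungetnewvnode(vp);
		return EEXIST;
	}
	*vpp = vp;
	return 0;
}
#endif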
/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vp)
	struct vnode *vp;
{
#ifdef DIAGNOSTIC
	if (vp->v_usecount != 1)
		panic("ungetnewvnode: busy vnode");
#endif
	vp->v_usecount--;
	insmntque(vp, NULL);
	vp->v_type = VBAD;

	simple_lock(&vp->v_interlock);
	/*
	 * Insert at head of LRU list
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_HEAD(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	simple_unlock(&vp->v_interlock);
}

/*
 * Move a vnode from one mount queue to another.
 */
void
insmntque(vp, mp)
	struct vnode *vp;
	struct mount *mp;
{

#ifdef DIAGNOSTIC
	if ((mp != NULL) &&
	    (mp->mnt_iflag & IMNT_UNMOUNT) &&
	    !(mp->mnt_flag & MNT_SOFTDEP) &&
	    vp->v_tag != VT_VFS) {
		panic("insmntque into dying filesystem");
	}
#endif

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) != NULL)
		LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	struct buf *bp;
{
	struct vnode *vp;

	if ((vp = bp->b_vp) != NULL) {
		/* XXX global lock hack
		 * can't use v_interlock here since this is called
		 * in interrupt context from biodone().
		 */
		simple_lock(&global_v_numoutput_slock);
		if (--vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput, vp %p", vp);
		if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t)&vp->v_numoutput);
		}
		simple_unlock(&global_v_numoutput_slock);
	}
}
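/*
 * Example: a sketch of the v_numoutput protocol that vwakeup() completes.
 * A writer counts each write it starts under the global lock; biodone()
 * later calls vwakeup() to drop the count and wake any VBWAIT sleeper.
 * Illustrative fragment only; the exact strategy call is assumed, not
 * taken from this file.
 */
#if 0
	/* when starting an asynchronous write of bp to vp: */
	s = splbio();
	simple_lock(&global_v_numoutput_slock);
	vp->v_numoutput++;
	simple_unlock(&global_v_numoutput_slock);
	splx(s);
	VOP_STRATEGY(bp);	/* biodone() -> vwakeup() on completion */
#endif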
/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	struct buf *bp, *nbp;
	int s, error;
	int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
	    (flags & V_SAVE ? PGO_CLEANIT : 0);

	/* XXXUBC this doesn't look at flags or slp* */
	simple_lock(&vp->v_interlock);
	error = VOP_PUTPAGES(vp, 0, 0, flushflags);
	if (error) {
		return error;
	}

	if (flags & V_SAVE) {
		error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0, p);
		if (error)
			return (error);
#ifdef DIAGNOSTIC
		s = splbio();
		if (vp->v_numoutput > 0 || !LIST_EMPTY(&vp->v_dirtyblkhd))
			panic("vinvalbuf: dirty bufs, vp %p", vp);
		splx(s);
#endif
	}

	s = splbio();

restart:
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep((caddr_t)bp,
			    slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vinvalbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep((caddr_t)bp,
			    slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vinvalbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		/*
		 * XXX Since there are no node locks for NFS, I believe
		 * there is a slight chance that a delayed write will
		 * occur while sleeping just above, so check for it.
		 */
		if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
#ifdef DEBUG
			printf("buffer still DELWRI\n");
#endif
			bp->b_flags |= B_BUSY | B_VFLUSH;
			simple_unlock(&bp->b_interlock);
			VOP_BWRITE(bp);
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

#ifdef DIAGNOSTIC
	if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
		panic("vinvalbuf: flush failed, vp %p", vp);
#endif

	splx(s);

	return (0);
}
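/*
 * Example: a sketch of a typical vinvalbuf() call as made from a close or
 * revoke path.  V_SAVE writes dirty data back first; passing 0 instead
 * would discard it.  Illustrative fragment only; "cred" and "p" are
 * assumed to be the caller's credentials and process.
 */
#if 0
	error = vinvalbuf(vp, V_SAVE, cred, p, 0, 0);
	if (error)
		return error;
#endif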
/*
 * Destroy any in core blocks past the truncation length.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 */
int
vtruncbuf(vp, lbn, slpflag, slptimeo)
	struct vnode *vp;
	daddr_t lbn;
	int slpflag, slptimeo;
{
	struct buf *bp, *nbp;
	int s, error;
	voff_t off;

	off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
	simple_lock(&vp->v_interlock);
	error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
	if (error) {
		return error;
	}

	s = splbio();

restart:
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_lblkno < lbn)
			continue;
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vtruncbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_lblkno < lbn)
			continue;
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vtruncbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	splx(s);

	return (0);
}

void
vflushbuf(vp, sync)
	struct vnode *vp;
	int sync;
{
	struct buf *bp, *nbp;
	int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0);
	int s;

	simple_lock(&vp->v_interlock);
	(void) VOP_PUTPAGES(vp, 0, 0, flags);

loop:
	s = splbio();
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		if ((bp->b_flags & B_BUSY)) {
			simple_unlock(&bp->b_interlock);
			continue;
		}
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("vflushbuf: not dirty, bp %p", bp);
		bp->b_flags |= B_BUSY | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		splx(s);
		/*
		 * Wait for I/O associated with indirect blocks to complete,
		 * since there is no way to quickly wait for them below.
		 */
		if (bp->b_vp == vp || sync == 0)
			(void) bawrite(bp);
		else
			(void) bwrite(bp);
		goto loop;
	}
	if (sync == 0) {
		splx(s);
		return;
	}
	simple_lock(&global_v_numoutput_slock);
	while (vp->v_numoutput) {
		vp->v_flag |= VBWAIT;
		ltsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflushbuf", 0,
		    &global_v_numoutput_slock);
	}
	simple_unlock(&global_v_numoutput_slock);
	splx(s);
	if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
		vprint("vflushbuf: dirty", vp);
		goto loop;
	}
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	struct vnode *vp;
	struct buf *bp;
{
	int s;

	if (bp->b_vp)
		panic("bgetvp: not free, bp %p", bp);
	VHOLD(vp);
	s = splbio();
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	bufinsvn(bp, &vp->v_cleanblkhd);
	splx(s);
}
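/*
 * Example: buffers stay associated with a vnode for the life of their I/O;
 * a minimal sketch of the pairing (illustrative only):
 */
#if 0
	bgetvp(vp, bp);		/* takes a hold on vp via VHOLD() */
	/* ... fill in bp, queue the I/O, wait for completion ... */
	brelvp(bp);		/* drops the hold via HOLDRELE() */
#endif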
/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	struct buf *bp;
{
	struct vnode *vp;
	int s;

	if (bp->b_vp == NULL)
		panic("brelvp: vp NULL, bp %p", bp);

	s = splbio();
	vp = bp->b_vp;
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);

	if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_flag & VONWORKLST) &&
	    LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}

	bp->b_vp = NULL;
	HOLDRELE(vp);
	splx(s);
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 *
 * This function must be called at splbio().
 */
void
reassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;

	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);
	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if ((bp->b_flags & B_DELWRI) == 0) {
		listheadp = &newvp->v_cleanblkhd;
		if (TAILQ_EMPTY(&newvp->v_uobj.memq) &&
		    (newvp->v_flag & VONWORKLST) &&
		    LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	} else {
		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VBLK:
				if (newvp->v_specmountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
				break;
			}
			if (!newvp->v_mount ||
			    (newvp->v_mount->mnt_flag & MNT_ASYNC) == 0)
				vn_syncer_add_to_worklist(newvp, delay);
		}
	}
	bufinsvn(bp, listheadp);
}

/*
 * Create a vnode for a block device.
 * Used for root filesystem and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{

	return (getdevvp(dev, vpp, VBLK));
}

/*
 * Create a vnode for a character device.
 * Used for kernfs and some console handling.
 */
int
cdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{

	return (getdevvp(dev, vpp, VCHR));
}

/*
 * Create a vnode for a device.
 * Used by bdevvp (block device) for root file system etc.,
 * and by cdevvp (character device) for console and kernfs.
 */
int
getdevvp(dev, vpp, type)
	dev_t dev;
	struct vnode **vpp;
	enum vtype type;
{
	struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (0);
	}
	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = type;
	if ((nvp = checkalias(vp, dev, NULL)) != 0) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}
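/*
 * Example: a sketch of how early boot code might obtain a vnode for the
 * root block device.  "rootdev" is the usual kernel variable; the panic
 * message and error handling here are illustrative, not from this file.
 */
#if 0
	if (bdevvp(rootdev, &rootvp))
		panic("example: can't set up root device vnode");
#endif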
/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device).  If such an alias exists, deallocate
 * the existing contents and return the aliased vnode.  The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	struct vnode *nvp;
	dev_t nvp_rdev;
	struct mount *mp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;
	struct vnode **vpp;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		simple_lock(&vp->v_interlock);
		if (vp->v_usecount == 0) {
			simple_unlock(&spechash_slock);
			vgonel(vp, p);
			goto loop;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT)) {
			simple_unlock(&spechash_slock);
			goto loop;
		}
		break;
	}
	if (vp == NULL || vp->v_tag != VT_NON || vp->v_type != VBLK) {
		MALLOC(nvp->v_specinfo, struct specinfo *,
		    sizeof(struct specinfo), M_VNODE, M_NOWAIT);
		/* XXX Erg. */
		if (nvp->v_specinfo == NULL) {
			simple_unlock(&spechash_slock);
			uvm_wait("checkalias");
			goto loop;
		}

		nvp->v_rdev = nvp_rdev;
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		nvp->v_specmountpoint = NULL;
		simple_unlock(&spechash_slock);
		nvp->v_speclockf = NULL;
		simple_lock_init(&nvp->v_spec_cow_slock);
		SLIST_INIT(&nvp->v_spec_cow_head);
		nvp->v_spec_cow_req = 0;
		nvp->v_spec_cow_count = 0;

		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	vp->v_vnlock = &vp->v_lock;
	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  If the vnode lock bit is set the
 * vnode is being eliminated in vgone.  In that case, we can not
 * grab the vnode, so the process is awakened when the transition is
 * completed, and an error returned to indicate that the vnode is no
 * longer usable (possibly having been changed to a new file system type).
 */
int
vget(vp, flags)
	struct vnode *vp;
	int flags;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */

	if ((flags & LK_INTERLOCK) == 0)
		simple_lock(&vp->v_interlock);
	if (vp->v_flag & VXLOCK) {
		if (flags & LK_NOWAIT) {
			simple_unlock(&vp->v_interlock);
			return EBUSY;
		}
		vp->v_flag |= VXWANT;
		ltsleep(vp, PINOD|PNORELOCK, "vget", 0, &vp->v_interlock);
		return (ENOENT);
	}
	if (vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		else
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_usecount++;
#ifdef DIAGNOSTIC
	if (vp->v_usecount == 0) {
		vprint("vget", vp);
		panic("vget: usecount overflow, vp %p", vp);
	}
#endif
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK))) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active.  We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			simple_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (vp->v_usecount > 0) {
				simple_unlock(&vp->v_interlock);
				return (error);
			}
			/*
			 * insert at tail of LRU list
			 */
			simple_lock(&vnode_free_list_slock);
			if (vp->v_holdcnt > 0)
				TAILQ_INSERT_TAIL(&vnode_hold_list, vp,
				    v_freelist);
			else
				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
				    v_freelist);
			simple_unlock(&vnode_free_list_slock);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * vput(), just unlock and vrele()
 */
void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vput: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		VOP_UNLOCK(vp, 0);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vput: bad ref count", vp);
		panic("vput: ref cnt");
	}
#endif
	/*
	 * Insert at tail of LRU list.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	if (vp->v_flag & VEXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP);
	simple_unlock(&vp->v_interlock);
	VOP_INACTIVE(vp, p);
}
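/*
 * Example: the usual reference protocol around vget()/vput().  A vnode
 * taken with vget(..., LK_EXCLUSIVE) is both referenced and locked;
 * vput() undoes both in one call.  Illustrative fragment only.
 */
#if 0
	if (vget(vp, LK_EXCLUSIVE) != 0)
		return ENOENT;	/* vnode was being recycled */
	/* ... use the locked vnode ... */
	vput(vp);		/* unlock + release the reference */
#endif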
/*
 * Vnode release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vrele: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vrele: bad ref count", vp);
		panic("vrele: ref cnt vp %p", vp);
	}
#endif
	/*
	 * Insert at tail of LRU list.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	if (vp->v_flag & VEXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP);
	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0)
		VOP_INACTIVE(vp, p);
}

#ifdef DIAGNOSTIC
/*
 * Page or buffer structure gets a reference.
 */
void
vholdl(vp)
	struct vnode *vp;
{

	/*
	 * If it is on the freelist and the hold count is currently
	 * zero, move it to the hold list.  The test of the back
	 * pointer and the use reference count of zero is because
	 * it will be removed from a free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone.  If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from a freelist to ensure
	 * that we do not try to move it here.
	 */
	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_holdcnt++;
}

/*
 * Page or buffer structure frees a reference.
 */
void
holdrelel(vp)
	struct vnode *vp;
{

	if (vp->v_holdcnt <= 0)
		panic("holdrelel: holdcnt vp %p", vp);
	vp->v_holdcnt--;

	/*
	 * If it is on the holdlist and the hold count drops to
	 * zero, move it to the free list.  The test of the back
	 * pointer and the use reference count of zero is because
	 * it will be removed from a free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone.  If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from a freelist to ensure
	 * that we do not try to move it here.
	 */

	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
}

/*
 * Vnode reference.
 */
void
vref(vp)
	struct vnode *vp;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount <= 0)
		panic("vref used where vget required, vp %p", vp);
	vp->v_usecount++;
#ifdef DIAGNOSTIC
	if (vp->v_usecount == 0) {
		vprint("vref", vp);
		panic("vref: usecount overflow, vp %p", vp);
	}
#endif
	simple_unlock(&vp->v_interlock);
}
#endif /* DIAGNOSTIC */

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error).  If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped.
 */
#ifdef DEBUG
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		if (vp->v_mount != mp)
			goto loop;
		nvp = LIST_NEXT(vp, v_mntvnodes);
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;
		simple_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * With v_usecount == 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device.  For all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *)0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DEBUG
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}
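/*
 * Example: a sketch of the classic vflush() call made from a filesystem's
 * unmount entry point.  "rootvp" stands for a vnode the caller still
 * needs (it is skipped), and "forced" is a hypothetical flag derived from
 * MNT_FORCE.  Illustrative only.
 */
#if 0
	flags = forced ? FORCECLOSE : 0;
	error = vflush(mp, rootvp, flags);
	if (error)
		return error;	/* EBUSY: active vnodes remain */
#endif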
/*
 * Disassociate the underlying file system from a vnode.
 */
void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	struct mount *mp;
	int active;

	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));

	/*
	 * Check to see if the vnode is in use.
	 * If so we have to reference it before we clean it out
	 * so that its count cannot fall to zero and generate a
	 * race against ourselves to recycle it.
	 */

	if ((active = vp->v_usecount) != 0) {
		vp->v_usecount++;
#ifdef DIAGNOSTIC
		if (vp->v_usecount == 0) {
			vprint("vclean", vp);
			panic("vclean: usecount overflow");
		}
#endif
	}

	/*
	 * Prevent the vnode from being recycled or
	 * brought into use while we clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock, vp %p", vp);
	vp->v_flag |= VXLOCK;
	if (vp->v_flag & VEXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP);

	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out.  The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK);

	/*
	 * Clean out any cached data associated with the vnode.
	 */
	if (flags & DOCLOSE) {
		int error;
		vn_start_write(vp, &mp, V_WAIT | V_LOWER);
		error = vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
		vn_finished_write(mp, V_LOWER);
		if (error)
			error = vinvalbuf(vp, 0, NOCRED, p, 0, 0);
		KASSERT(error == 0);
		KASSERT((vp->v_flag & VONWORKLST) == 0);
	}
	LOCK_ASSERT(!simple_lock_held(&vp->v_interlock));

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim, vp %p", vp);
	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
		 */
		simple_lock(&vp->v_interlock);
		if (--vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			/*
			 * Insert at tail of LRU list.
			 */

			simple_unlock(&vp->v_interlock);
			simple_lock(&vnode_free_list_slock);
#ifdef DIAGNOSTIC
			if (vp->v_holdcnt > 0)
				panic("vclean: not clean, vp %p", vp);
#endif
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			simple_unlock(&vnode_free_list_slock);
		} else
			simple_unlock(&vp->v_interlock);
	}

	KASSERT(vp->v_uobj.uo_npages == 0);
	cache_purge(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	simple_lock(&vp->v_interlock);
	VN_KNOTE(vp, NOTE_REVOKE);	/* FreeBSD has this in vn_pollgone() */
	vp->v_flag &= ~VXLOCK;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		simple_unlock(&vp->v_interlock);
		wakeup((caddr_t)vp);
	} else
		simple_unlock(&vp->v_interlock);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp)
			simple_unlock(inter_lkp);
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	struct vnode *vq;
	struct vnode *vx;

	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */

	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		ltsleep(vp, PINOD | PNORELOCK, "vgone", 0, &vp->v_interlock);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */

	vclean(vp, DOCLOSE, p);
	KASSERT((vp->v_flag & VONWORKLST) == 0);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */

	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);

	/*
	 * If special device, remove it from the special device alias
	 * list, if it is on one.
	 */

	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
		simple_lock(&spechash_slock);
		if (vp->v_hashchain != NULL) {
			if (*vp->v_hashchain == vp) {
				*vp->v_hashchain = vp->v_specnext;
			} else {
				for (vq = *vp->v_hashchain; vq;
				    vq = vq->v_specnext) {
					if (vq->v_specnext != vp)
						continue;
					vq->v_specnext = vp->v_specnext;
					break;
				}
				if (vq == NULL)
					panic("missing bdev");
			}
			if (vp->v_flag & VALIASED) {
				vx = NULL;
				for (vq = *vp->v_hashchain; vq;
				    vq = vq->v_specnext) {
					if (vq->v_rdev != vp->v_rdev ||
					    vq->v_type != vp->v_type)
						continue;
					if (vx)
						break;
					vx = vq;
				}
				if (vx == NULL)
					panic("missing alias");
				if (vq == NULL)
					vx->v_flag &= ~VALIASED;
				vp->v_flag &= ~VALIASED;
			}
		}
		simple_unlock(&spechash_slock);
		FREE(vp->v_specinfo, M_VNODE);
		vp->v_specinfo = NULL;
	}

	/*
	 * The test of the back pointer and the reference count of
	 * zero is because it will be removed from the free list by
	 * getcleanvnode, but will not have its reference count
	 * incremented until after calling vgone.  If the reference
	 * count were incremented first, vgone would (incorrectly)
	 * try to close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from the freelist to ensure
	 * that we do not try to move it here.
	 */

	vp->v_type = VBAD;
	if (vp->v_usecount == 0) {
		boolean_t dofree;

		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			panic("vgonel: not clean, vp %p", vp);
		/*
		 * if it isn't on the freelist, we're called by getcleanvnode
		 * and vnode is being re-used.  otherwise, we'll free it.
		 */
		dofree = vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb;
		if (dofree) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			numvnodes--;
		}
		simple_unlock(&vnode_free_list_slock);
		if (dofree)
			pool_put(&vnode_pool, vp);
	}
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Revoke all the vnodes corresponding to the specified minor number
 * range (endpoints inclusive) of the specified major.
 */
void
vdevgone(maj, minl, minh, type)
	int maj, minl, minh;
	enum vtype type;
{
	struct vnode *vp;
	int mn;

	for (mn = minl; mn <= minh; mn++)
		if (vfinddev(makedev(maj, mn), type, &vp))
			VOP_REVOKE(vp, REVOKEALL);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp &&
		    (vq->v_flag & VXLOCK) == 0) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}
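/*
 * Example: a device driver's detach routine can use vdevgone() to revoke
 * any vnodes still referring to its units.  A minimal sketch; the major
 * number variables and the single-unit range are hypothetical.
 */
#if 0
	/* revoke both block and character vnodes for unit 0 */
	vdevgone(example_bmajor, 0, 0, VBLK);
	vdevgone(example_cmajor, 0, 0, VCHR);
#endif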
/*
 * Print out a description of a vnode.
 */
const char * const vnode_types[] = {
	"VNON",
	"VREG",
	"VDIR",
	"VBLK",
	"VCHR",
	"VLNK",
	"VSOCK",
	"VFIFO",
	"VBAD"
};

void
vprint(label, vp)
	char *label;
	struct vnode *vp;
{
	char buf[96];

	if (label != NULL)
		printf("%s: ", label);
	printf("tag %d type %s, usecount %d, writecount %ld, refcount %ld,",
	    vp->v_tag, vnode_types[vp->v_type],
	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strlcat(buf, "|VROOT", sizeof(buf));
	if (vp->v_flag & VTEXT)
		strlcat(buf, "|VTEXT", sizeof(buf));
	if (vp->v_flag & VEXECMAP)
		strlcat(buf, "|VEXECMAP", sizeof(buf));
	if (vp->v_flag & VSYSTEM)
		strlcat(buf, "|VSYSTEM", sizeof(buf));
	if (vp->v_flag & VXLOCK)
		strlcat(buf, "|VXLOCK", sizeof(buf));
	if (vp->v_flag & VXWANT)
		strlcat(buf, "|VXWANT", sizeof(buf));
	if (vp->v_flag & VBWAIT)
		strlcat(buf, "|VBWAIT", sizeof(buf));
	if (vp->v_flag & VALIASED)
		strlcat(buf, "|VALIASED", sizeof(buf));
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DEBUG
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
void
printlockedvnodes()
{
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	    mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
			if (VOP_ISLOCKED(vp))
				vprint(NULL, vp);
		}
		simple_lock(&mountlist_slock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	simple_unlock(&mountlist_slock);
}
#endif

/*
 * sysctl helper routine for vfs.generic.conf lookups.
 */
#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
static int
sysctl_vfs_generic_conf(SYSCTLFN_ARGS)
{
	struct vfsconf vfc;
	extern const char * const mountcompatnames[];
	extern int nmountcompatnames;
	struct sysctlnode node;
	struct vfsops *vfsp;
	u_int vfsnum;

	if (namelen != 1)
		return (ENOTDIR);
	vfsnum = name[0];
	if (vfsnum >= nmountcompatnames ||
	    mountcompatnames[vfsnum] == NULL)
		return (EOPNOTSUPP);
	vfsp = vfs_getopsbyname(mountcompatnames[vfsnum]);
	if (vfsp == NULL)
		return (EOPNOTSUPP);

	vfc.vfc_vfsops = vfsp;
	strncpy(vfc.vfc_name, vfsp->vfs_name, MFSNAMELEN);
	vfc.vfc_typenum = vfsnum;
	vfc.vfc_refcount = vfsp->vfs_refcount;
	vfc.vfc_flags = 0;
	vfc.vfc_mountroot = vfsp->vfs_mountroot;
	vfc.vfc_next = NULL;

	node = *rnode;
	node.sysctl_data = &vfc;
	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}
#endif

/*
 * sysctl helper routine to return list of supported fstypes
 */
static int
sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
{
	char buf[MFSNAMELEN];
	char *where = oldp;
	struct vfsops *v;
	size_t needed, left, slen;
	int error, first;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	first = 1;
	error = 0;
	needed = 0;
	left = *oldlenp;

	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (where == NULL)
			needed += strlen(v->vfs_name) + 1;
		else {
			memset(buf, 0, sizeof(buf));
			if (first) {
				strncpy(buf, v->vfs_name, sizeof(buf));
				first = 0;
			} else {
				buf[0] = ' ';
				strncpy(buf + 1, v->vfs_name, sizeof(buf) - 1);
			}
			buf[sizeof(buf)-1] = '\0';
			slen = strlen(buf);
			if (left < slen + 1)
				break;
			/* +1 to copy out the trailing NUL byte */
			error = copyout(buf, where, slen + 1);
			if (error)
				break;
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	*oldlenp = needed;
	return (error);
}
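/*
 * Example: from userland the list built above is readable as the string
 * sysctl "vfs.generic.fstypes".  A minimal sketch using sysctlbyname(3);
 * the buffer size and output handling are illustrative.
 */
#if 0
	char buf[256];
	size_t len = sizeof(buf);

	if (sysctlbyname("vfs.generic.fstypes", buf, &len, NULL, 0) == 0)
		printf("%s\n", buf);	/* e.g. a space-separated list */
#endif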

/*
 * Top level filesystem related information gathering.
 */
SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup")
{
#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
	extern int nmountcompatnames;
#endif

	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "vfs", NULL,
		NULL, 0, NULL, 0,
		CTL_VFS, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "generic",
		SYSCTL_DESCR("Non-specific vfs related information"),
		NULL, 0, NULL, 0,
		CTL_VFS, VFS_GENERIC, CTL_EOL);

#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
		CTLTYPE_INT, "maxtypenum",
		SYSCTL_DESCR("Highest valid filesystem type number"),
		NULL, nmountcompatnames, NULL, 0,
		CTL_VFS, VFS_GENERIC, VFS_MAXTYPENUM, CTL_EOL);
#endif
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		CTLTYPE_INT, "usermount",
		SYSCTL_DESCR("Whether unprivileged users may mount "
		    "filesystems"),
		NULL, 0, &dovfsusermount, 0,
		CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_STRING, "fstypes",
		SYSCTL_DESCR("List of file systems present"),
		sysctl_vfs_generic_fstypes, 0, NULL, 0,
		CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_STRUCT, "conf",
		SYSCTL_DESCR("Filesystem configuration information"),
		sysctl_vfs_generic_conf, 0, NULL,
		sizeof(struct vfsconf),
		CTL_VFS, VFS_GENERIC, VFS_CONF, CTL_EOL);
#endif
}

int kinfo_vdebug = 1;
int kinfo_vgetfailed;
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
int
sysctl_kern_vnode(SYSCTLFN_ARGS)
{
	char *where = oldp;
	size_t *sizep = oldlenp;
	struct mount *mp, *nmp;
	struct vnode *nvp, *vp;
	char *bp = where, *savebp;
	char *ewhere;
	int error;

	if (namelen != 0)
		return (EOPNOTSUPP);
	if (newp != NULL)
		return (EPERM);

#define VPTRSZ	sizeof(struct vnode *)
#define VNODESZ	sizeof(struct vnode)
	if (where == NULL) {
		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
		return (0);
	}
	ewhere = where + *sizep;

	simple_lock(&mountlist_slock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	    mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		savebp = bp;
again:
		simple_lock(&mntvnode_slock);
		for (vp = LIST_FIRST(&mp->mnt_vnodelist);
		    vp != NULL;
		    vp = nvp) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				if (kinfo_vdebug)
					printf("kinfo: vp changed\n");
				bp = savebp;
				goto again;
			}
			nvp = LIST_NEXT(vp, v_mntvnodes);
			if (bp + VPTRSZ + VNODESZ > ewhere) {
				simple_unlock(&mntvnode_slock);
				*sizep = bp - where;
				return (ENOMEM);
			}
			simple_unlock(&mntvnode_slock);
			if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
			    (error = copyout((caddr_t)vp, bp + VPTRSZ,
			    VNODESZ)))
				return (error);
			bp += VPTRSZ + VNODESZ;
			simple_lock(&mntvnode_slock);
		}
		simple_unlock(&mntvnode_slock);
		simple_lock(&mountlist_slock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	simple_unlock(&mountlist_slock);

	*sizep = bp - where;
	return (0);
}

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int error = 0;

	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	if (vp->v_flag & VALIASED) {
		simple_lock(&spechash_slock);
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type)
				continue;
			if (vq->v_specmountpoint != NULL) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	return (error);
}

static int
sacheck(struct sockaddr *sa)
{
	switch (sa->sa_family) {
#ifdef INET
	case AF_INET: {
		struct sockaddr_in *sin = (struct sockaddr_in *)sa;
		char *p = (char *)sin->sin_zero;
		size_t i;

		if (sin->sin_len != sizeof(*sin))
			return -1;
		if (sin->sin_port != 0)
			return -1;
		for (i = 0; i < sizeof(sin->sin_zero); i++)
			if (*p++ != '\0')
				return -1;
		return 0;
	}
#endif
#ifdef INET6
	case AF_INET6: {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;

		if (sin6->sin6_len != sizeof(*sin6))
			return -1;
		if (sin6->sin6_port != 0)
			return -1;
		return 0;
	}
#endif
	default:
		return -1;
	}
}
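
/*
 * Example (sketch, assuming INET): sacheck() accepts only addresses
 * whose length, port and padding fields are normalized, i.e. ones
 * prepared like this:
 */
#if 0
	struct sockaddr_in sin;

	memset(&sin, 0, sizeof(sin));		/* clears sin_zero[] too */
	sin.sin_family = AF_INET;
	sin.sin_len = sizeof(sin);		/* must match exactly */
	sin.sin_port = 0;			/* non-zero ports are rejected */
	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	KASSERT(sacheck((struct sockaddr *)&sin) == 0);
#endif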

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
 */
static int
vfs_hang_addrlist(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	struct netcred *np, *enp;
	struct radix_node_head *rnh;
	int i;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		crcvt(&np->netc_anon, &argp->ex_anon);
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}

	if (argp->ex_addrlen > MLEN || argp->ex_masklen > MLEN)
		return (EINVAL);

	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK);
	memset((caddr_t)np, 0, i);
	saddr = (struct sockaddr *)(np + 1);
	error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen);
	if (error)
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (sacheck(saddr) == -1) {
		error = EINVAL;
		goto out;
	}
	if (argp->ex_masklen) {
		smask = (struct sockaddr *)((caddr_t)saddr +
		    argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t)smask,
		    argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
		if (smask->sa_family != saddr->sa_family) {
			error = EINVAL;
			goto out;
		}
		if (sacheck(smask) == -1) {
			error = EINVAL;
			goto out;
		}
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not
		 * used, do so on demand here.
		 */
		for (dom = domains; dom; dom = dom->dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **)&nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}

	enp = (struct netcred *)(*rnh->rnh_addaddr)(saddr, smask, rnh,
	    np->netc_rnodes);
	if (enp != np) {
		if (enp == NULL) {
			enp = (struct netcred *)(*rnh->rnh_lookup)(saddr,
			    smask, rnh);
			if (enp == NULL) {
				error = EPERM;
				goto out;
			}
		} else
			enp->netc_refcnt++;

		goto check;
	} else
		enp->netc_refcnt = 1;

	np->netc_exflags = argp->ex_flags;
	crcvt(&np->netc_anon, &argp->ex_anon);
	np->netc_anon.cr_ref = 1;
	return 0;
check:
	if (enp->netc_exflags != argp->ex_flags ||
	    crcmp(&enp->netc_anon, &argp->ex_anon) != 0)
		error = EPERM;
	else
		error = 0;
out:
	free(np, M_NETADDR);
	return error;
}

/* ARGSUSED */
static int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	void *w;
{
	struct radix_node_head *rnh = (struct radix_node_head *)w;
	struct netcred *np = (struct netcred *)(void *)rn;

	(*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
	if (--(np->netc_refcnt) <= 0)
		free(np, M_NETADDR);
	return (0);
}
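
/*
 * Example (hypothetical sketch): the address list built by
 * vfs_hang_addrlist() above is described by a struct export_args
 * handed down from mount(2), prepared roughly like this:
 */
#if 0
	struct export_args ea;
	struct sockaddr_in addr, mask;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_len = sizeof(addr);
	addr.sin_addr.s_addr = htonl(0xc0a80100);	/* 192.168.1.0 */

	memset(&mask, 0, sizeof(mask));
	mask.sin_family = AF_INET;
	mask.sin_len = sizeof(mask);
	mask.sin_addr.s_addr = htonl(0xffffff00);	/* 255.255.255.0 */

	memset(&ea, 0, sizeof(ea));
	ea.ex_flags = MNT_EXPORTED | MNT_EXRDONLY;
	ea.ex_addr = (struct sockaddr *)&addr;
	ea.ex_addrlen = sizeof(addr);
	ea.ex_mask = (struct sockaddr *)&mask;
	ea.ex_masklen = sizeof(mask);
	/* the fs-specific mount code passes this to vfs_export() */
#endif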

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(nep)
	struct netexport *nep;
{
	int i;
	struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i]) != NULL) {
			(*rnh->rnh_walktree)(rnh, vfs_free_netcred, rnh);
			free((caddr_t)rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)) != 0)
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}

/*
 * Set the publicly exported filesystem (WebNFS).  Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055).
 */
int
vfs_setpublicfs(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info, the FS is
	 * no longer exported.  May be called from either vfs_export
	 * or unmount, so check if it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	memset((caddr_t)&nfs_pub.np_handle, 0, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsidx;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) {
		vput(rvp);
		return (error);
	}

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    MAXNAMLEN, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			nfs_pub.np_index = NULL;
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}
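
/*
 * Example (sketch): withdrawing every export, including a WebNFS
 * public one, takes a single vfs_export() call with MNT_DELEXPORT
 * set; mp and nep come from the caller's file system:
 */
#if 0
	struct export_args ea;

	memset(&ea, 0, sizeof(ea));
	ea.ex_flags = MNT_DELEXPORT;
	(void)vfs_export(mp, nep, &ea);
#endif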

struct netcred *
vfs_export_lookup(mp, nep, nam)
	struct mount *mp;
	struct netexport *nep;
	struct mbuf *nam;
{
	struct netcred *np;
	struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = mtod(nam, struct sockaddr *);
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
				    (*rnh->rnh_matchaddr)((caddr_t)saddr,
				    rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * Do the usual access checking.
 * file_mode, uid and gid are from the vnode in question,
 * while acc_mode and cred are from the VOP_ACCESS parameter list.
 */
int
vaccess(type, file_mode, uid, gid, acc_mode, cred)
	enum vtype type;
	mode_t file_mode;
	uid_t uid;
	gid_t gid;
	mode_t acc_mode;
	struct ucred *cred;
{
	mode_t mask;

	/*
	 * Super-user always gets read/write access, but execute access
	 * depends on at least one execute bit being set.
	 */
	if (cred->cr_uid == 0) {
		if ((acc_mode & VEXEC) && type != VDIR &&
		    (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)
			return (EACCES);
		return (0);
	}

	mask = 0;

	/* Otherwise, check the owner. */
	if (cred->cr_uid == uid) {
		if (acc_mode & VEXEC)
			mask |= S_IXUSR;
		if (acc_mode & VREAD)
			mask |= S_IRUSR;
		if (acc_mode & VWRITE)
			mask |= S_IWUSR;
		return ((file_mode & mask) == mask ? 0 : EACCES);
	}

	/* Otherwise, check the groups. */
	if (cred->cr_gid == gid || groupmember(gid, cred)) {
		if (acc_mode & VEXEC)
			mask |= S_IXGRP;
		if (acc_mode & VREAD)
			mask |= S_IRGRP;
		if (acc_mode & VWRITE)
			mask |= S_IWGRP;
		return ((file_mode & mask) == mask ? 0 : EACCES);
	}

	/* Otherwise, check everyone else. */
	if (acc_mode & VEXEC)
		mask |= S_IXOTH;
	if (acc_mode & VREAD)
		mask |= S_IROTH;
	if (acc_mode & VWRITE)
		mask |= S_IWOTH;
	return ((file_mode & mask) == mask ? 0 : EACCES);
}
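
/*
 * Example (hypothetical sketch): a file system's VOP_ACCESS
 * implementation typically ends by delegating the mode-bit check to
 * vaccess(); "examplefs" and its inode accessor are made up:
 */
#if 0
int
examplefs_access(void *v)
{
	struct vop_access_args *ap = v;
	struct examplefs_inode *ip = EXAMPLEFS_VTOI(ap->a_vp);

	return (vaccess(ap->a_vp->v_type, ip->i_mode & ALLPERMS,
	    ip->i_uid, ip->i_gid, ap->a_mode, ap->a_cred));
}
#endif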

/*
 * Unmount all file systems.
 * We traverse the list in reverse order under the assumption that doing so
 * will avoid needing to worry about dependencies.
 */
void
vfs_unmountall(p)
	struct proc *p;
{
	struct mount *mp, *nmp;
	int allerror, error;

	for (allerror = 0,
	    mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;
#ifdef DEBUG
		printf("unmounting %s (%s)...\n",
		    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
#endif
		/*
		 * XXX Freeze syncer.  Must do this before locking the
		 * mount point.  See dounmount() for details.
		 */
		lockmgr(&syncer_lock, LK_EXCLUSIVE, NULL);
		if (vfs_busy(mp, 0, 0)) {
			lockmgr(&syncer_lock, LK_RELEASE, NULL);
			continue;
		}
		if ((error = dounmount(mp, MNT_FORCE, p)) != 0) {
			printf("unmount of %s failed with error %d\n",
			    mp->mnt_stat.f_mntonname, error);
			allerror = 1;
		}
	}
	if (allerror)
		printf("WARNING: some file systems would not unmount\n");
}

extern struct simplelock bqueue_slock; /* XXX */

/*
 * Sync and unmount file systems before shutting down.
 */
void
vfs_shutdown()
{
	struct lwp *l = curlwp;
	struct proc *p;

	/* XXX we're certainly not running in proc0's context! */
	if (l == NULL || (p = l->l_proc) == NULL)
		p = &proc0;

	printf("syncing disks... ");

	/* remove user processes from the run queue */
	suspendsched();
	(void) spl0();

	/* avoid coming back this way again if we panic. */
	doing_shutdown = 1;

	sys_sync(l, NULL, NULL);

	/* Wait for sync to finish. */
	if (buf_syncwait() != 0) {
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
		Debugger();
#endif
		printf("giving up\n");
		return;
	} else
		printf("done\n");

	/*
	 * If we've panic'd, don't make the situation potentially
	 * worse by unmounting the file systems.
	 */
	if (panicstr != NULL)
		return;

	/* Release inodes held by texts before update. */
#ifdef notdef
	vnshutdown();
#endif
	/* Unmount file systems. */
	vfs_unmountall(p);
}

/*
 * Mount the root file system.  If the operator didn't specify a
 * file system to use, try all possible file systems until one
 * succeeds.
 */
int
vfs_mountroot()
{
	struct vfsops *v;

	if (root_device == NULL)
		panic("vfs_mountroot: root device unknown");

	switch (root_device->dv_class) {
	case DV_IFNET:
		if (rootdev != NODEV)
			panic("vfs_mountroot: rootdev set for DV_IFNET "
			    "(0x%08x -> %d,%d)", rootdev,
			    major(rootdev), minor(rootdev));
		break;

	case DV_DISK:
		if (rootdev == NODEV)
			panic("vfs_mountroot: rootdev not set for DV_DISK");
		break;

	default:
		printf("%s: inappropriate for root file system\n",
		    root_device->dv_xname);
		return (ENODEV);
	}

	/*
	 * If user specified a file system, use it.
	 */
	if (mountroot != NULL)
		return ((*mountroot)());

	/*
	 * Try each file system currently configured into the kernel.
	 */
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (v->vfs_mountroot == NULL)
			continue;
#ifdef DEBUG
		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
#endif
		if ((*v->vfs_mountroot)() == 0) {
			aprint_normal("root file system type: %s\n",
			    v->vfs_name);
			break;
		}
	}

	if (v == NULL) {
		printf("no file system for %s", root_device->dv_xname);
		if (root_device->dv_class == DV_DISK)
			printf(" (dev 0x%x)", rootdev);
		printf("\n");
		return (EFTYPE);
	}
	return (0);
}

/*
 * Given a file system name, look up the vfsops for that
 * file system, or return NULL if the file system isn't present
 * in the kernel.
 */
struct vfsops *
vfs_getopsbyname(name)
	const char *name;
{
	struct vfsops *v;

	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (strcmp(v->vfs_name, name) == 0)
			break;
	}

	return (v);
}

/*
 * Establish a file system and initialize it.
 */
int
vfs_attach(vfs)
	struct vfsops *vfs;
{
	struct vfsops *v;
	int error = 0;

	/*
	 * Make sure this file system doesn't already exist.
	 */
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (strcmp(vfs->vfs_name, v->vfs_name) == 0) {
			error = EEXIST;
			goto out;
		}
	}

	/*
	 * Initialize the vnode operations for this file system.
	 */
	vfs_opv_init(vfs->vfs_opv_descs);

	/*
	 * Now initialize the file system itself.
	 */
	(*vfs->vfs_init)();

	/*
	 * ...and link it into the kernel's list.
	 */
	LIST_INSERT_HEAD(&vfs_list, vfs, vfs_list);

	/*
	 * Sanity: make sure the reference count is 0.
	 */
	vfs->vfs_refcount = 0;

out:
	return (error);
}

/*
 * Remove a file system from the kernel.
 */
int
vfs_detach(vfs)
	struct vfsops *vfs;
{
	struct vfsops *v;

	/*
	 * Make sure no one is using the filesystem.
	 */
	if (vfs->vfs_refcount != 0)
		return (EBUSY);

	/*
	 * ...and remove it from the kernel's list.
	 */
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (v == vfs) {
			LIST_REMOVE(v, vfs_list);
			break;
		}
	}

	if (v == NULL)
		return (ESRCH);

	/*
	 * Now run the file system-specific cleanups.
	 */
	(*vfs->vfs_done)();

	/*
	 * Free the vnode operations vector.
	 */
	vfs_opv_free(vfs->vfs_opv_descs);
	return (0);
}

void
vfs_reinit(void)
{
	struct vfsops *vfs;

	LIST_FOREACH(vfs, &vfs_list, vfs_list) {
		if (vfs->vfs_reinit) {
			(*vfs->vfs_reinit)();
		}
	}
}

/*
 * Request a filesystem to suspend write operations.
 */
int
vfs_write_suspend(struct mount *mp, int slpflag, int slptimeo)
{
	struct proc *p = curproc;	/* XXX */
	int error;

	while ((mp->mnt_iflag & IMNT_SUSPEND)) {
		if (slptimeo < 0)
			return EWOULDBLOCK;
		error = tsleep(&mp->mnt_flag, slpflag, "suspwt1", slptimeo);
		if (error)
			return error;
	}
	mp->mnt_iflag |= IMNT_SUSPEND;

	simple_lock(&mp->mnt_slock);
	if (mp->mnt_writeopcountupper > 0)
		ltsleep(&mp->mnt_writeopcountupper, PUSER - 1, "suspwt",
		    0, &mp->mnt_slock);
	simple_unlock(&mp->mnt_slock);

	error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p);
	if (error) {
		vfs_write_resume(mp);
		return error;
	}
	mp->mnt_iflag |= IMNT_SUSPENDLOW;

	simple_lock(&mp->mnt_slock);
	if (mp->mnt_writeopcountlower > 0)
		ltsleep(&mp->mnt_writeopcountlower, PUSER - 1, "suspwt",
		    0, &mp->mnt_slock);
	mp->mnt_iflag |= IMNT_SUSPENDED;
	simple_unlock(&mp->mnt_slock);

	return 0;
}
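
/*
 * Example (sketch): a consumer such as a snapshot facility would
 * bracket its critical section with the suspend/resume pair;
 * vfs_write_resume() follows below:
 */
#if 0
	error = vfs_write_suspend(mp, 0, 0);
	if (error)
		return (error);
	/* ... all writers are drained; mp is now quiescent ... */
	vfs_write_resume(mp);
#endif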

/*
 * Request a filesystem to resume write operations.
 */
void
vfs_write_resume(struct mount *mp)
{

	if ((mp->mnt_iflag & IMNT_SUSPEND) == 0)
		return;
	mp->mnt_iflag &= ~(IMNT_SUSPEND | IMNT_SUSPENDLOW | IMNT_SUSPENDED);
	wakeup(&mp->mnt_flag);
}

void
copy_statvfs_info(struct statvfs *sbp, const struct mount *mp)
{
	const struct statvfs *mbp;

	if (sbp == (mbp = &mp->mnt_stat))
		return;

	(void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx));
	sbp->f_fsid = mbp->f_fsid;
	sbp->f_owner = mbp->f_owner;
	sbp->f_flag = mbp->f_flag;
	sbp->f_syncwrites = mbp->f_syncwrites;
	sbp->f_asyncwrites = mbp->f_asyncwrites;
	sbp->f_syncreads = mbp->f_syncreads;
	sbp->f_asyncreads = mbp->f_asyncreads;
	(void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare));
	(void)memcpy(sbp->f_fstypename, mbp->f_fstypename,
	    sizeof(sbp->f_fstypename));
	(void)memcpy(sbp->f_mntonname, mbp->f_mntonname,
	    sizeof(sbp->f_mntonname));
	(void)memcpy(sbp->f_mntfromname, mbp->f_mntfromname,
	    sizeof(sbp->f_mntfromname));
}
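
/*
 * Example (hypothetical sketch): a file system's VFS_STATVFS
 * implementation fills in the fields it owns and lets
 * copy_statvfs_info() supply the generic ones cached in mp->mnt_stat;
 * "examplefs" and its constant sizes are made up:
 */
#if 0
int
examplefs_statvfs(struct mount *mp, struct statvfs *sbp, struct proc *p)
{

	sbp->f_bsize = DEV_BSIZE;
	sbp->f_frsize = DEV_BSIZE;
	sbp->f_iosize = DEV_BSIZE;
	sbp->f_blocks = 0;
	sbp->f_bfree = 0;
	sbp->f_bavail = 0;
	sbp->f_files = 0;
	sbp->f_ffree = 0;
	copy_statvfs_info(sbp, mp);
	return (0);
}
#endif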

int
set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom,
    struct mount *mp, struct proc *p)
{
	int error;
	size_t size;
	struct statvfs *sfs = &mp->mnt_stat;
	int (*fun)(const void *, void *, size_t, size_t *);

	(void)strncpy(mp->mnt_stat.f_fstypename, mp->mnt_op->vfs_name,
	    sizeof(mp->mnt_stat.f_fstypename));

	if (onp) {
		struct cwdinfo *cwdi = p->p_cwdi;
		fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr;
		if (cwdi->cwdi_rdir != NULL) {
			size_t len;
			char *bp;
			char *path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);

			if (!path) /* XXX can't happen with M_WAITOK */
				return ENOMEM;

			bp = path + MAXPATHLEN;
			*--bp = '\0';
			error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp,
			    path, MAXPATHLEN / 2, 0, p);
			if (error) {
				free(path, M_TEMP);
				return error;
			}

			len = strlen(bp);
			if (len > sizeof(sfs->f_mntonname) - 1)
				len = sizeof(sfs->f_mntonname) - 1;
			(void)strncpy(sfs->f_mntonname, bp, len);
			free(path, M_TEMP);

			if (len < sizeof(sfs->f_mntonname) - 1) {
				error = (*fun)(onp, &sfs->f_mntonname[len],
				    sizeof(sfs->f_mntonname) - len - 1, &size);
				if (error)
					return error;
				size += len;
			} else {
				size = len;
			}
		} else {
			error = (*fun)(onp, &sfs->f_mntonname,
			    sizeof(sfs->f_mntonname) - 1, &size);
			if (error)
				return error;
		}
		(void)memset(sfs->f_mntonname + size, 0,
		    sizeof(sfs->f_mntonname) - size);
	}

	if (fromp) {
		fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr;
		error = (*fun)(fromp, sfs->f_mntfromname,
		    sizeof(sfs->f_mntfromname) - 1, &size);
		if (error)
			return error;
		(void)memset(sfs->f_mntfromname + size, 0,
		    sizeof(sfs->f_mntfromname) - size);
	}
	return 0;
}

#ifdef DDB
const char buf_flagbits[] =
	"\20\1AGE\2NEEDCOMMIT\3ASYNC\4BAD\5BUSY\6SCANNED\7CALL\10DELWRI"
	"\11DIRTY\12DONE\13EINTR\14ERROR\15GATHERED\16INVAL\17LOCKED\20NOCACHE"
	"\21ORDERED\22CACHE\23PHYS\24RAW\25READ\26TAPE\30WANTED"
	"\32XXX\33VFLUSH";

void
vfs_buf_print(bp, full, pr)
	struct buf *bp;
	int full;
	void (*pr)(const char *, ...);
{
	char buf[1024];

	(*pr)("  vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" dev 0x%x\n",
	    bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_dev);

	bitmask_snprintf(bp->b_flags, buf_flagbits, buf, sizeof(buf));
	(*pr)("  error %d flags 0x%s\n", bp->b_error, buf);

	(*pr)("  bufsize 0x%lx bcount 0x%lx resid 0x%lx\n",
	    bp->b_bufsize, bp->b_bcount, bp->b_resid);
	(*pr)("  data %p saveaddr %p dep %p\n",
	    bp->b_data, bp->b_saveaddr, LIST_FIRST(&bp->b_dep));
	(*pr)("  iodone %p\n", bp->b_iodone);
}

const char vnode_flagbits[] =
	"\20\1ROOT\2TEXT\3SYSTEM\4ISTTY\5EXECMAP"
	"\11XLOCK\12XWANT\13BWAIT\14ALIASED"
	"\15DIROP\16LAYER\17ONWORKLIST\20DIRTY";

const char * const vnode_tags[] = {
	"VT_NON",
	"VT_UFS",
	"VT_NFS",
	"VT_MFS",
	"VT_MSDOSFS",
	"VT_LFS",
	"VT_LOFS",
	"VT_FDESC",
	"VT_PORTAL",
	"VT_NULL",
	"VT_UMAP",
	"VT_KERNFS",
	"VT_PROCFS",
	"VT_AFS",
	"VT_ISOFS",
	"VT_UNION",
	"VT_ADOSFS",
	"VT_EXT2FS",
	"VT_CODA",
	"VT_FILECORE",
	"VT_NTFS",
	"VT_VFS",
	"VT_OVERLAY",
	"VT_SMBFS"
};

void
vfs_vnode_print(vp, full, pr)
	struct vnode *vp;
	int full;
	void (*pr)(const char *, ...);
{
	char buf[256];
	const char *vtype, *vtag;

	uvm_object_printit(&vp->v_uobj, full, pr);
	bitmask_snprintf(vp->v_flag, vnode_flagbits, buf, sizeof(buf));
	(*pr)("\nVNODE flags %s\n", buf);
	(*pr)("mp %p numoutput %d size 0x%llx\n",
	    vp->v_mount, vp->v_numoutput, vp->v_size);

	(*pr)("data %p usecount %d writecount %ld holdcnt %ld numoutput %d\n",
	    vp->v_data, vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt, vp->v_numoutput);

	vtype = (vp->v_type >= 0 &&
	    vp->v_type < sizeof(vnode_types) / sizeof(vnode_types[0])) ?
	    vnode_types[vp->v_type] : "UNKNOWN";
	vtag = (vp->v_tag >= 0 &&
	    vp->v_tag < sizeof(vnode_tags) / sizeof(vnode_tags[0])) ?
	    vnode_tags[vp->v_tag] : "UNKNOWN";

	(*pr)("type %s(%d) tag %s(%d) mount %p typedata %p\n",
	    vtype, vp->v_type, vtag, vp->v_tag,
	    vp->v_mount, vp->v_mountedhere);

	if (full) {
		struct buf *bp;

		(*pr)("clean bufs:\n");
		LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
			(*pr)(" bp %p\n", bp);
			vfs_buf_print(bp, full, pr);
		}

		(*pr)("dirty bufs:\n");
		LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
			(*pr)(" bp %p\n", bp);
			vfs_buf_print(bp, full, pr);
		}
	}
}

void
vfs_mount_print(mp, full, pr)
	struct mount *mp;
	int full;
	void (*pr)(const char *, ...);
{
	char sbuf[256];

	(*pr)("vnodecovered = %p syncer = %p data = %p\n",
	    mp->mnt_vnodecovered, mp->mnt_syncer, mp->mnt_data);

	(*pr)("fs_bshift %d dev_bshift = %d maxsymlinklen = %d\n",
	    mp->mnt_fs_bshift, mp->mnt_dev_bshift, mp->mnt_maxsymlinklen);

	bitmask_snprintf(mp->mnt_flag, __MNT_FLAG_BITS, sbuf, sizeof(sbuf));
	(*pr)("flag = %s\n", sbuf);

	bitmask_snprintf(mp->mnt_iflag, __IMNT_FLAG_BITS, sbuf, sizeof(sbuf));
	(*pr)("iflag = %s\n", sbuf);

	/* XXX use lockmgr_printinfo */
	if (mp->mnt_lock.lk_sharecount)
		(*pr)(" lock type %s: SHARED (count %d)",
		    mp->mnt_lock.lk_wmesg, mp->mnt_lock.lk_sharecount);
	else if (mp->mnt_lock.lk_flags & LK_HAVE_EXCL) {
		(*pr)(" lock type %s: EXCL (count %d) by ",
		    mp->mnt_lock.lk_wmesg, mp->mnt_lock.lk_exclusivecount);
		if (mp->mnt_lock.lk_flags & LK_SPIN)
			(*pr)("processor %lu", mp->mnt_lock.lk_cpu);
		else
			(*pr)("pid %d.%d", mp->mnt_lock.lk_lockholder,
			    mp->mnt_lock.lk_locklwp);
	} else
		(*pr)(" not locked");
	if ((mp->mnt_lock.lk_flags & LK_SPIN) == 0 &&
	    mp->mnt_lock.lk_waitcount > 0)
		(*pr)(" with %d pending", mp->mnt_lock.lk_waitcount);

	(*pr)("\n");

	if (mp->mnt_unmounter) {
		(*pr)("unmounter pid = %d ", mp->mnt_unmounter->p_pid);
	}
	(*pr)("wcnt = %d, writeopcountupper = %d, writeopcountlower = %d\n",
	    mp->mnt_wcnt, mp->mnt_writeopcountupper,
	    mp->mnt_writeopcountlower);

	(*pr)("statvfs cache:\n");
	(*pr)("\tbsize = %lu\n", mp->mnt_stat.f_bsize);
	(*pr)("\tfrsize = %lu\n", mp->mnt_stat.f_frsize);
	(*pr)("\tiosize = %lu\n", mp->mnt_stat.f_iosize);

	(*pr)("\tblocks = %"PRIu64"\n", mp->mnt_stat.f_blocks);
	(*pr)("\tbfree = %"PRIu64"\n", mp->mnt_stat.f_bfree);
	(*pr)("\tbavail = %"PRIu64"\n", mp->mnt_stat.f_bavail);
	(*pr)("\tbresvd = %"PRIu64"\n", mp->mnt_stat.f_bresvd);

	(*pr)("\tfiles = %"PRIu64"\n", mp->mnt_stat.f_files);
	(*pr)("\tffree = %"PRIu64"\n", mp->mnt_stat.f_ffree);
	(*pr)("\tfavail = %"PRIu64"\n", mp->mnt_stat.f_favail);
	(*pr)("\tfresvd = %"PRIu64"\n", mp->mnt_stat.f_fresvd);

	(*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n",
	    mp->mnt_stat.f_fsidx.__fsid_val[0],
	    mp->mnt_stat.f_fsidx.__fsid_val[1]);

	(*pr)("\towner = %"PRIu32"\n", mp->mnt_stat.f_owner);
	(*pr)("\tnamemax = %lu\n", mp->mnt_stat.f_namemax);

	bitmask_snprintf(mp->mnt_stat.f_flag, __MNT_FLAG_BITS, sbuf,
	    sizeof(sbuf));
	(*pr)("\tflag = %s\n", sbuf);
	(*pr)("\tsyncwrites = %"PRIu64"\n", mp->mnt_stat.f_syncwrites);
	(*pr)("\tasyncwrites = %"PRIu64"\n", mp->mnt_stat.f_asyncwrites);
	(*pr)("\tsyncreads = %"PRIu64"\n", mp->mnt_stat.f_syncreads);
	(*pr)("\tasyncreads = %"PRIu64"\n", mp->mnt_stat.f_asyncreads);
	(*pr)("\tfstypename = %s\n", mp->mnt_stat.f_fstypename);
%s\n",mp->mnt_stat.f_fstypename); 3236 (*pr)("\tmntonname = %s\n",mp->mnt_stat.f_mntonname); 3237 (*pr)("\tmntfromname = %s\n",mp->mnt_stat.f_mntfromname); 3238 3239 { 3240 int cnt = 0; 3241 struct vnode *vp; 3242 (*pr)("locked vnodes ="); 3243 /* XXX would take mountlist lock, except ddb may not have context */ 3244 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 3245 if (VOP_ISLOCKED(vp)) { 3246 if ((++cnt % 6) == 0) { 3247 (*pr)(" %p,\n\t", vp); 3248 } else { 3249 (*pr)(" %p,", vp); 3250 } 3251 } 3252 } 3253 (*pr)("\n"); 3254 } 3255 3256 if (full) { 3257 int cnt = 0; 3258 struct vnode *vp; 3259 (*pr)("all vnodes ="); 3260 /* XXX would take mountlist lock, except ddb may not have context */ 3261 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 3262 if (!LIST_NEXT(vp, v_mntvnodes)) { 3263 (*pr)(" %p", vp); 3264 } else if ((++cnt % 6) == 0) { 3265 (*pr)(" %p,\n\t", vp); 3266 } else { 3267 (*pr)(" %p,", vp); 3268 } 3269 } 3270 (*pr)("\n", vp); 3271 } 3272 } 3273 3274 #endif 3275