/*	$NetBSD: vfs_subr.c,v 1.250 2005/06/19 18:22:36 elad Exp $	*/

/*-
 * Copyright (c) 1997, 1998, 2004, 2005 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * External virtual filesystem routines
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.250 2005/06/19 18:22:36 elad Exp $");

#include "opt_inet.h"
#include "opt_ddb.h"
#include "opt_compat_netbsd.h"
#include "opt_compat_43.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/time.h>
#include <sys/event.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/sa.h>
#include <sys/syscallargs.h>
#include <sys/device.h>
#include <sys/extattr.h>
#include <sys/dirent.h>
#include <sys/filedesc.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/syncfs/syncfs.h>

#include <netinet/in.h>

#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>

#include <sys/sysctl.h>

const enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
const int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};
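
/*
 * Illustrative note: these two tables back the IFTOVT() and VTTOIF()
 * macros in <sys/vnode.h>, which translate between the S_IFMT file
 * type bits of a mode and the corresponding vnode type; for example,
 * IFTOVT(S_IFREG) yields VREG and VTTOIF(VREG) yields S_IFREG.
 */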

int doforce = 1;		/* 1 => permit forcible unmounting */
int prtactive = 0;		/* 1 => print out reclaim of active vnodes */

extern int dovfsusermount;	/* 1 => permit any user to mount filesystems */

/*
 * Insq/Remq for the vnode usage lists.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) {							\
	LIST_REMOVE(bp, b_vnbufs);					\
	(bp)->b_vnbufs.le_next = NOLIST;				\
}
/* TAILQ_HEAD(freelst, vnode) vnode_free_list = vnode free list (in vnode.h) */
struct freelst vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
struct freelst vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);

struct mntlist mountlist =			/* mounted filesystem list */
    CIRCLEQ_HEAD_INITIALIZER(mountlist);
struct vfs_list_head vfs_list =			/* vfs list */
    LIST_HEAD_INITIALIZER(vfs_list);

struct nfs_public nfs_pub;			/* publicly exported FS */

struct simplelock mountlist_slock = SIMPLELOCK_INITIALIZER;
static struct simplelock mntid_slock = SIMPLELOCK_INITIALIZER;
struct simplelock mntvnode_slock = SIMPLELOCK_INITIALIZER;
struct simplelock vnode_free_list_slock = SIMPLELOCK_INITIALIZER;
struct simplelock spechash_slock = SIMPLELOCK_INITIALIZER;

/* XXX - gross; single global lock to protect v_numoutput */
struct simplelock global_v_numoutput_slock = SIMPLELOCK_INITIALIZER;

/*
 * These define the root filesystem and device.
 */
struct mount *rootfs;
struct vnode *rootvnode;
struct device *root_device;			/* root device */

POOL_INIT(vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodepl",
    &pool_allocator_nointr);

MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");

/*
 * Local declarations.
 */
void insmntque(struct vnode *, struct mount *);
int getdevvp(dev_t, struct vnode **, enum vtype);

void vclean(struct vnode *, int, struct proc *);

static int vfs_hang_addrlist(struct mount *, struct netexport *,
    struct export_args *);
static int vfs_free_netcred(struct radix_node *, void *);
static void vfs_free_addrlist(struct netexport *);
static struct vnode *getcleanvnode(struct proc *);

#ifdef DEBUG
void printlockedvnodes(void);
#endif

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit(void)
{

	/*
	 * Initialize the filesystem syncer.
	 */
	vn_initialize_syncerd();
}

int
vfs_drainvnodes(long target, struct proc *p)
{

	simple_lock(&vnode_free_list_slock);
	while (numvnodes > target) {
		struct vnode *vp;

		vp = getcleanvnode(p);
		if (vp == NULL)
			return EBUSY; /* give up */
		pool_put(&vnode_pool, vp);
		simple_lock(&vnode_free_list_slock);
		numvnodes--;
	}
	simple_unlock(&vnode_free_list_slock);

	return 0;
}

/*
 * Grab a vnode from the free list and clean it.
 */
struct vnode *
getcleanvnode(struct proc *p)
{
	struct vnode *vp;
	struct mount *mp;
	struct freelst *listhd;

	LOCK_ASSERT(simple_lock_held(&vnode_free_list_slock));

	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		if (!simple_lock_try(&vp->v_interlock))
			continue;
		/*
		 * As our lwp might hold the underlying vnode locked,
		 * don't try to reclaim the VLAYER vnode if it's locked.
		 */
		if ((vp->v_flag & VXLOCK) == 0 &&
		    ((vp->v_flag & VLAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
			if (vn_start_write(vp, &mp, V_NOWAIT) == 0)
				break;
		}
		mp = NULL;
		simple_unlock(&vp->v_interlock);
	}

	if (vp == NULLVP) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		simple_unlock(&vnode_free_list_slock);
		return NULLVP;
	}

	if (vp->v_usecount)
		panic("free vnode isn't, vp %p", vp);
	TAILQ_REMOVE(listhd, vp, v_freelist);
	/* see comment on why 0xdeadb is set at end of vgone (below) */
	vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
	simple_unlock(&vnode_free_list_slock);
	vp->v_lease = NULL;

	if (vp->v_type != VBAD)
		vgonel(vp, p);
	else
		simple_unlock(&vp->v_interlock);
	vn_finished_write(mp, 0);
#ifdef DIAGNOSTIC
	if (vp->v_data || vp->v_uobj.uo_npages ||
	    TAILQ_FIRST(&vp->v_uobj.memq))
		panic("cleaned vnode isn't, vp %p", vp);
	if (vp->v_numoutput)
		panic("clean vnode has pending I/O's, vp %p", vp);
#endif
	KASSERT((vp->v_flag & VONWORKLST) == 0);

	return vp;
}

/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Interlock is not released on failure.
 */
int
vfs_busy(struct mount *mp, int flags, struct simplelock *interlkp)
{
	int lkflags;

	while (mp->mnt_iflag & IMNT_UNMOUNT) {
		int gone, n;

		if (flags & LK_NOWAIT)
			return (ENOENT);
		if ((flags & LK_RECURSEFAIL) && mp->mnt_unmounter != NULL
		    && mp->mnt_unmounter == curproc)
			return (EDEADLK);
		if (interlkp)
			simple_unlock(interlkp);
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		simple_lock(&mp->mnt_slock);
		mp->mnt_wcnt++;
		ltsleep((caddr_t)mp, PVFS, "vfs_busy", 0, &mp->mnt_slock);
		n = --mp->mnt_wcnt;
		simple_unlock(&mp->mnt_slock);
		gone = mp->mnt_iflag & IMNT_GONE;

		if (n == 0)
			wakeup(&mp->mnt_wcnt);
		if (interlkp)
			simple_lock(interlkp);
		if (gone)
			return (ENOENT);
	}
	lkflags = LK_SHARED;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp)
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
}
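
/*
 * Usage sketch (illustrative): callers walking mountlist pair these
 * two routines as follows, exactly as printlockedvnodes() and
 * sysctl_kern_vnode() do later in this file:
 *
 *	if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock) == 0) {
 *		... inspect mp ...
 *		vfs_unbusy(mp);
 *	}
 */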

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(const char *fstypename, const char *devname,
    struct mount **mpp)
{
	struct vfsops *vfsp = NULL;
	struct mount *mp;

	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
		if (!strncmp(vfsp->vfs_name, fstypename, MFSNAMELEN))
			break;

	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	memset((char *)mp, 0, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
	simple_lock_init(&mp->mnt_slock);
	(void)vfs_busy(mp, LK_NOWAIT, 0);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_op = vfsp;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	mp->mnt_leaf = mp;
	vfsp->vfs_refcount++;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	struct mount *mp;

	simple_lock(&mountlist_slock);
	CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
		    mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *)0);
}

/*
 * Get a new unique fsid.
 */
void
vfs_getnewfsid(struct mount *mp)
{
	static u_short xxxfs_mntid;
	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = makefstype(mp->mnt_op->vfs_name);
	mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0);
	mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype;
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
	tfsid.__fsid_val[1] = mtype;
	if (!CIRCLEQ_EMPTY(&mountlist)) {
		while (vfs_getvfs(&tfsid)) {
			tfsid.__fsid_val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	simple_unlock(&mntid_slock);
}

/*
 * Make a 'unique' number from a mount type name.
 */
long
makefstype(const char *type)
{
	long rv;

	for (rv = 0; *type; type++) {
		rv <<= 2;
		rv ^= *type;
	}
	return rv;
}

/*
 * Set vnode attributes to VNOVAL.
 */
void
vattr_null(struct vattr *vap)
{

	vap->va_type = VNON;

	/*
	 * Assign individually so that this is safe even if the size
	 * and sign of each member vary.
	 */
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_size = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_atime.tv_sec =
	    vap->va_mtime.tv_sec =
	    vap->va_ctime.tv_sec =
	    vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec =
	    vap->va_mtime.tv_nsec =
	    vap->va_ctime.tv_nsec =
	    vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_vaflags = 0;
}
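
/*
 * Worked example (illustrative) for makefstype() above: hashing the
 * name "ffs" computes ((('f' << 2) ^ 'f') << 2) ^ 's'
 * = ((408 ^ 102) << 2) ^ 115 = (510 << 2) ^ 115 = 2040 ^ 115 = 1931,
 * which vfs_getnewfsid() then folds into the synthetic fsid.
 */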

/*
 * Routines having to do with the management of the vnode table.
 */
extern int (**dead_vnodeop_p)(void *);
long numvnodes;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    struct vnode **vpp)
{
	extern struct uvm_pagerops uvm_vnodeops;
	struct uvm_object *uobj;
	struct proc *p = curproc;	/* XXX */
	static int toggle;
	struct vnode *vp;
	int error = 0, tryalloc;

 try_again:
	if (mp) {
		/*
		 * Mark filesystem busy while we're creating a vnode.
		 * If unmount is in progress, this will wait; if the
		 * unmount succeeds (only if umount -f), this will
		 * return an error.  If the unmount fails, we'll keep
		 * going afterwards.
		 * (This puts the per-mount vnode list logically under
		 * the protection of the vfs_busy lock).
		 */
		error = vfs_busy(mp, LK_RECURSEFAIL, 0);
		if (error && error != EDEADLK)
			return error;
	}

	/*
	 * We must choose whether to allocate a new vnode or recycle an
	 * existing one.  The criterion for allocating a new one is that
	 * the total number of vnodes is less than the number desired or
	 * there are no vnodes on either free list.  Generally we only
	 * want to recycle vnodes that have no buffers associated with
	 * them, so we look first on the vnode_free_list.  If it is empty,
	 * we next consider vnodes with referencing buffers on the
	 * vnode_hold_list.  The toggle ensures that half the time we
	 * will use a buffer from the vnode_hold_list, and half the time
	 * we will allocate a new one unless the list has grown to twice
	 * the desired size.  We are reluctant to recycle vnodes from the
	 * vnode_hold_list because we will lose the identity of all its
	 * referencing buffers.
	 */

	vp = NULL;

	simple_lock(&vnode_free_list_slock);

	toggle ^= 1;
	if (numvnodes > 2 * desiredvnodes)
		toggle = 0;

	tryalloc = numvnodes < desiredvnodes ||
	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
	     (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));

	if (tryalloc &&
	    (vp = pool_get(&vnode_pool, PR_NOWAIT)) != NULL) {
		numvnodes++;
		simple_unlock(&vnode_free_list_slock);
		memset(vp, 0, sizeof(*vp));
		UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 1);
		/*
		 * done by memset() above.
		 *	LIST_INIT(&vp->v_nclist);
		 *	LIST_INIT(&vp->v_dnclist);
		 */
	} else {
		vp = getcleanvnode(p);
		/*
		 * Unless this is a bad time of the month, at most
		 * the first NCPUS items on the free list are
		 * locked, so this is close enough to being empty.
		 */
		if (vp == NULLVP) {
			if (mp && error != EDEADLK)
				vfs_unbusy(mp);
			if (tryalloc) {
				printf("WARNING: unable to allocate new "
				    "vnode, retrying...\n");
				(void) tsleep(&lbolt, PRIBIO, "newvn", hz);
				goto try_again;
			}
			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
			*vpp = 0;
			return (ENFILE);
		}
		vp->v_usecount = 1;
		vp->v_flag = 0;
		vp->v_socket = NULL;
	}
	vp->v_type = VNON;
	vp->v_vnlock = &vp->v_lock;
	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_data = 0;
	simple_lock_init(&vp->v_interlock);

	/*
	 * initialize uvm_object within vnode.
	 */

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = VSIZENOTSET;

	if (mp && error != EDEADLK)
		vfs_unbusy(mp);
	return (0);
}

/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions who may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(struct vnode *vp)
{
#ifdef DIAGNOSTIC
	if (vp->v_usecount != 1)
		panic("ungetnewvnode: busy vnode");
#endif
	vp->v_usecount--;
	insmntque(vp, NULL);
	vp->v_type = VBAD;

	simple_lock(&vp->v_interlock);
	/*
	 * Insert at head of LRU list
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_HEAD(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	simple_unlock(&vp->v_interlock);
}

/*
 * Move a vnode from one mount queue to another.
 */
void
insmntque(struct vnode *vp, struct mount *mp)
{

#ifdef DIAGNOSTIC
	if ((mp != NULL) &&
	    (mp->mnt_iflag & IMNT_UNMOUNT) &&
	    !(mp->mnt_flag & MNT_SOFTDEP) &&
	    vp->v_tag != VT_VFS) {
		panic("insmntque into dying filesystem");
	}
#endif

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) != NULL)
		LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(struct buf *bp)
{
	struct vnode *vp;

	if ((vp = bp->b_vp) != NULL) {
		/* XXX global lock hack
		 * can't use v_interlock here since this is called
		 * in interrupt context from biodone().
		 */
		simple_lock(&global_v_numoutput_slock);
		if (--vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput, vp %p", vp);
		if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t)&vp->v_numoutput);
		}
		simple_unlock(&global_v_numoutput_slock);
	}
}
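
/*
 * Sketch of the v_numoutput protocol (illustrative): code issuing a
 * write bumps vp->v_numoutput under global_v_numoutput_slock; a waiter
 * such as vflushbuf() below sets VBWAIT and sleeps on &vp->v_numoutput
 * until vwakeup() above, called from biodone(), drops the count back
 * to zero and issues the wakeup.
 */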
/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 */
int
vinvalbuf(struct vnode *vp, int flags, struct ucred *cred, struct proc *p,
    int slpflag, int slptimeo)
{
	struct buf *bp, *nbp;
	int s, error;
	int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
	    (flags & V_SAVE ? PGO_CLEANIT : 0);

	/* XXXUBC this doesn't look at flags or slp* */
	simple_lock(&vp->v_interlock);
	error = VOP_PUTPAGES(vp, 0, 0, flushflags);
	if (error) {
		return error;
	}

	if (flags & V_SAVE) {
		error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0, p);
		if (error)
			return (error);
#ifdef DIAGNOSTIC
		s = splbio();
		if (vp->v_numoutput > 0 || !LIST_EMPTY(&vp->v_dirtyblkhd))
			panic("vinvalbuf: dirty bufs, vp %p", vp);
		splx(s);
#endif
	}

	s = splbio();

restart:
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep((caddr_t)bp,
			    slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vinvalbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep((caddr_t)bp,
			    slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vinvalbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		/*
		 * XXX Since there are no node locks for NFS, I believe
		 * there is a slight chance that a delayed write will
		 * occur while sleeping just above, so check for it.
		 */
		if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
#ifdef DEBUG
			printf("buffer still DELWRI\n");
#endif
			bp->b_flags |= B_BUSY | B_VFLUSH;
			simple_unlock(&bp->b_interlock);
			VOP_BWRITE(bp);
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

#ifdef DIAGNOSTIC
	if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
		panic("vinvalbuf: flush failed, vp %p", vp);
#endif

	splx(s);

	return (0);
}
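
/*
 * Calling convention (illustrative): vinvalbuf(vp, V_SAVE, cred, p, 0, 0)
 * writes dirty data back before invalidating, as vclean() does below for
 * the DOCLOSE case, while vinvalbuf(vp, 0, ...) simply discards all
 * buffers.
 */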

/*
 * Destroy any in core blocks past the truncation length.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 */
int
vtruncbuf(struct vnode *vp, daddr_t lbn, int slpflag, int slptimeo)
{
	struct buf *bp, *nbp;
	int s, error;
	voff_t off;

	off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
	simple_lock(&vp->v_interlock);
	error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
	if (error) {
		return error;
	}

	s = splbio();

restart:
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_lblkno < lbn)
			continue;
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vtruncbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_lblkno < lbn)
			continue;
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vtruncbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	splx(s);

	return (0);
}

void
vflushbuf(struct vnode *vp, int sync)
{
	struct buf *bp, *nbp;
	int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0);
	int s;

	simple_lock(&vp->v_interlock);
	(void) VOP_PUTPAGES(vp, 0, 0, flags);

loop:
	s = splbio();
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		if ((bp->b_flags & B_BUSY)) {
			simple_unlock(&bp->b_interlock);
			continue;
		}
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("vflushbuf: not dirty, bp %p", bp);
		bp->b_flags |= B_BUSY | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		splx(s);
		/*
		 * Wait for I/O associated with indirect blocks to complete,
		 * since there is no way to quickly wait for them below.
		 */
		if (bp->b_vp == vp || sync == 0)
			(void) bawrite(bp);
		else
			(void) bwrite(bp);
		goto loop;
	}
	if (sync == 0) {
		splx(s);
		return;
	}
	simple_lock(&global_v_numoutput_slock);
	while (vp->v_numoutput) {
		vp->v_flag |= VBWAIT;
		ltsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflushbuf", 0,
		    &global_v_numoutput_slock);
	}
	simple_unlock(&global_v_numoutput_slock);
	splx(s);
	if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
		vprint("vflushbuf: dirty", vp);
		goto loop;
	}
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(struct vnode *vp, struct buf *bp)
{
	int s;

	if (bp->b_vp)
		panic("bgetvp: not free, bp %p", bp);
	VHOLD(vp);
	s = splbio();
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	bufinsvn(bp, &vp->v_cleanblkhd);
	splx(s);
}
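
/*
 * Illustrative pairing: every bgetvp() is eventually balanced by the
 * brelvp() below; the VHOLD()/HOLDRELE() calls they make keep a vnode
 * with attached buffers on vnode_hold_list rather than vnode_free_list.
 */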

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(struct buf *bp)
{
	struct vnode *vp;
	int s;

	if (bp->b_vp == NULL)
		panic("brelvp: vp NULL, bp %p", bp);

	s = splbio();
	vp = bp->b_vp;
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);

	if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_flag & VONWORKLST) &&
	    LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}

	bp->b_vp = NULL;
	HOLDRELE(vp);
	splx(s);
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 *
 * This function must be called at splbio().
 */
void
reassignbuf(struct buf *bp, struct vnode *newvp)
{
	struct buflists *listheadp;
	int delayx;

	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);
	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if ((bp->b_flags & B_DELWRI) == 0) {
		listheadp = &newvp->v_cleanblkhd;
		if (TAILQ_EMPTY(&newvp->v_uobj.memq) &&
		    (newvp->v_flag & VONWORKLST) &&
		    LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	} else {
		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delayx = dirdelay;
				break;
			case VBLK:
				if (newvp->v_specmountpoint != NULL) {
					delayx = metadelay;
					break;
				}
				/* fall through */
			default:
				delayx = filedelay;
				break;
			}
			if (!newvp->v_mount ||
			    (newvp->v_mount->mnt_flag & MNT_ASYNC) == 0)
				vn_syncer_add_to_worklist(newvp, delayx);
		}
	}
	bufinsvn(bp, listheadp);
}

/*
 * Create a vnode for a block device.
 * Used for root filesystem and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev_t dev, struct vnode **vpp)
{

	return (getdevvp(dev, vpp, VBLK));
}

/*
 * Create a vnode for a character device.
 * Used for kernfs and some console handling.
 */
int
cdevvp(dev_t dev, struct vnode **vpp)
{

	return (getdevvp(dev, vpp, VCHR));
}

/*
 * Create a vnode for a device.
 * Used by bdevvp (block device) for root file system etc.,
 * and by cdevvp (character device) for console and kernfs.
 */
int
getdevvp(dev_t dev, struct vnode **vpp, enum vtype type)
{
	struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (0);
	}
	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = type;
	if ((nvp = checkalias(vp, dev, NULL)) != 0) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}
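
/*
 * Usage sketch (illustrative): early boot and swap configuration
 * obtain a device vnode with something like
 *
 *	if (bdevvp(rootdev, &rootvp) != 0)
 *		panic("can't set up root device vnode");
 *
 * and then hand the vnode to the filesystem's mountroot routine.
 */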
/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device).  If such an alias exists, deallocate
 * the existing contents and return the aliased vnode.  The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(struct vnode *nvp, dev_t nvp_rdev, struct mount *mp)
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;
	struct vnode **vpp;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		simple_lock(&vp->v_interlock);
		simple_unlock(&spechash_slock);
		if (vp->v_usecount == 0) {
			vgonel(vp, p);
			goto loop;
		}
		/*
		 * What we want to know here is whether someone else has
		 * removed this vnode from the device hash list while we
		 * were waiting.  This can only happen if vclean() did it,
		 * and that requires the vnode to be locked.  Therefore,
		 * we use LK_SLEEPFAIL and retry.
		 */
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL))
			goto loop;
		simple_lock(&spechash_slock);
		break;
	}
	if (vp == NULL || vp->v_tag != VT_NON || vp->v_type != VBLK) {
		MALLOC(nvp->v_specinfo, struct specinfo *,
		    sizeof(struct specinfo), M_VNODE, M_NOWAIT);
		/* XXX Erg. */
		if (nvp->v_specinfo == NULL) {
			simple_unlock(&spechash_slock);
			uvm_wait("checkalias");
			goto loop;
		}

		nvp->v_rdev = nvp_rdev;
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		nvp->v_specmountpoint = NULL;
		simple_unlock(&spechash_slock);
		nvp->v_speclockf = NULL;
		simple_lock_init(&nvp->v_spec_cow_slock);
		SLIST_INIT(&nvp->v_spec_cow_head);
		nvp->v_spec_cow_req = 0;
		nvp->v_spec_cow_count = 0;

		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	vp->v_vnlock = &vp->v_lock;
	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}
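
/*
 * Usage sketch (illustrative) for vget() below: a caller that found
 * vp on some hash chain typically does
 *
 *	if (vget(vp, LK_EXCLUSIVE) == 0) {
 *		... vp is referenced and locked ...
 *		vput(vp);
 *	}
 *
 * and retries its lookup on failure, since an error means the vnode
 * was being recycled out from under it.
 */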

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  If the vnode lock bit is set the
 * vnode is being eliminated in vgone.  In that case, we can not
 * grab the vnode, so the process is awakened when the transition is
 * completed, and an error returned to indicate that the vnode is no
 * longer usable (possibly having been changed to a new file system type).
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */

	if ((flags & LK_INTERLOCK) == 0)
		simple_lock(&vp->v_interlock);
	if (vp->v_flag & VXLOCK) {
		if (flags & LK_NOWAIT) {
			simple_unlock(&vp->v_interlock);
			return EBUSY;
		}
		vp->v_flag |= VXWANT;
		ltsleep(vp, PINOD|PNORELOCK, "vget", 0, &vp->v_interlock);
		return (ENOENT);
	}
	if (vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		else
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_usecount++;
#ifdef DIAGNOSTIC
	if (vp->v_usecount == 0) {
		vprint("vget", vp);
		panic("vget: usecount overflow, vp %p", vp);
	}
#endif
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK))) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active.  We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			simple_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (vp->v_usecount > 0) {
				simple_unlock(&vp->v_interlock);
				return (error);
			}
			/*
			 * insert at tail of LRU list
			 */
			simple_lock(&vnode_free_list_slock);
			if (vp->v_holdcnt > 0)
				TAILQ_INSERT_TAIL(&vnode_hold_list, vp,
				    v_freelist);
			else
				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
				    v_freelist);
			simple_unlock(&vnode_free_list_slock);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * vput(), just unlock and vrele()
 */
void
vput(struct vnode *vp)
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vput: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		VOP_UNLOCK(vp, 0);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vput: bad ref count", vp);
		panic("vput: ref cnt");
	}
#endif
	/*
	 * Insert at tail of LRU list.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	if (vp->v_flag & VEXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP);
	simple_unlock(&vp->v_interlock);
	VOP_INACTIVE(vp, p);
}
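
/*
 * Note (illustrative): vput(vp) above is the unlock-plus-release form
 * for a vnode the caller holds locked, whereas vrele() below drops a
 * reference that was never locked and therefore must take the vnode
 * lock itself before it may call VOP_INACTIVE().
 */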

/*
 * Vnode release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(struct vnode *vp)
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vrele: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vrele: bad ref count", vp);
		panic("vrele: ref cnt vp %p", vp);
	}
#endif
	/*
	 * Insert at tail of LRU list.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	if (vp->v_flag & VEXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP);
	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0)
		VOP_INACTIVE(vp, p);
}

#ifdef DIAGNOSTIC
/*
 * Page or buffer structure gets a reference.
 */
void
vholdl(struct vnode *vp)
{

	/*
	 * If it is on the freelist and the hold count is currently
	 * zero, move it to the hold list.  The test of the back
	 * pointer and the use reference count of zero is because
	 * it will be removed from a free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone.  If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from a freelist to ensure
	 * that we do not try to move it here.
	 */
	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_holdcnt++;
}

/*
 * Page or buffer structure frees a reference.
 */
void
holdrelel(struct vnode *vp)
{

	if (vp->v_holdcnt <= 0)
		panic("holdrelel: holdcnt vp %p", vp);
	vp->v_holdcnt--;

	/*
	 * If it is on the holdlist and the hold count drops to
	 * zero, move it to the free list.  The test of the back
	 * pointer and the use reference count of zero is because
	 * it will be removed from a free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone.  If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from a freelist to ensure
	 * that we do not try to move it here.
	 */

	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
}

/*
 * Vnode reference.
 */
void
vref(struct vnode *vp)
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount <= 0)
		panic("vref used where vget required, vp %p", vp);
	vp->v_usecount++;
#ifdef DIAGNOSTIC
	if (vp->v_usecount == 0) {
		vprint("vref", vp);
		panic("vref: usecount overflow, vp %p", vp);
	}
#endif
	simple_unlock(&vp->v_interlock);
}
#endif /* DIAGNOSTIC */

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active
 * vnodes; an error is returned if any are found (nb: this is a user
 * error, not a system error).  If FORCECLOSE is specified, detach any
 * active vnodes that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped.
 */
#ifdef DEBUG
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

int
vflush(struct mount *mp, struct vnode *skipvp, int flags)
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		if (vp->v_mount != mp)
			goto loop;
		nvp = LIST_NEXT(vp, v_mntvnodes);
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;
		simple_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * With v_usecount == 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device.  For all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *)0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DEBUG
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 */
void
vclean(struct vnode *vp, int flags, struct proc *p)
{
	struct mount *mp;
	int active;

	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));

	/*
	 * Check to see if the vnode is in use.
	 * If so we have to reference it before we clean it out
	 * so that its count cannot fall to zero and generate a
	 * race against ourselves to recycle it.
	 */

	if ((active = vp->v_usecount) != 0) {
		vp->v_usecount++;
#ifdef DIAGNOSTIC
		if (vp->v_usecount == 0) {
			vprint("vclean", vp);
			panic("vclean: usecount overflow");
		}
#endif
	}

	/*
	 * Prevent the vnode from being recycled or
	 * brought into use while we clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock, vp %p", vp);
	vp->v_flag |= VXLOCK;
	if (vp->v_flag & VEXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP);

	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out.  The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If special device, remove it from the special device alias
	 * list, if it is on one.
	 */
	if (flags & DOCLOSE) {
		int error;
		struct vnode *vq, *vx;

		vn_start_write(vp, &mp, V_WAIT | V_LOWER);
		error = vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
		vn_finished_write(mp, V_LOWER);
		if (error)
			error = vinvalbuf(vp, 0, NOCRED, p, 0, 0);
		KASSERT(error == 0);
		KASSERT((vp->v_flag & VONWORKLST) == 0);

		if (active)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL);

		if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
		    vp->v_specinfo != 0) {
			simple_lock(&spechash_slock);
			if (vp->v_hashchain != NULL) {
				if (*vp->v_hashchain == vp) {
					*vp->v_hashchain = vp->v_specnext;
				} else {
					for (vq = *vp->v_hashchain; vq;
					    vq = vq->v_specnext) {
						if (vq->v_specnext != vp)
							continue;
						vq->v_specnext = vp->v_specnext;
						break;
					}
					if (vq == NULL)
						panic("missing bdev");
				}
				if (vp->v_flag & VALIASED) {
					vx = NULL;
					for (vq = *vp->v_hashchain; vq;
					    vq = vq->v_specnext) {
						if (vq->v_rdev != vp->v_rdev ||
						    vq->v_type != vp->v_type)
							continue;
						if (vx)
							break;
						vx = vq;
					}
					if (vx == NULL)
						panic("missing alias");
					if (vq == NULL)
						vx->v_flag &= ~VALIASED;
					vp->v_flag &= ~VALIASED;
				}
			}
			simple_unlock(&spechash_slock);
			FREE(vp->v_specinfo, M_VNODE);
			vp->v_specinfo = NULL;
		}
	}
	LOCK_ASSERT(!simple_lock_held(&vp->v_interlock));

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim, vp %p", vp);
	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
		 */
		simple_lock(&vp->v_interlock);
		if (--vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			/*
			 * Insert at tail of LRU list.
			 */

			simple_unlock(&vp->v_interlock);
			simple_lock(&vnode_free_list_slock);
#ifdef DIAGNOSTIC
			if (vp->v_holdcnt > 0)
				panic("vclean: not clean, vp %p", vp);
#endif
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			simple_unlock(&vnode_free_list_slock);
		} else
			simple_unlock(&vp->v_interlock);
	}

	KASSERT(vp->v_uobj.uo_npages == 0);
	cache_purge(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	simple_lock(&vp->v_interlock);
	VN_KNOTE(vp, NOTE_REVOKE);	/* FreeBSD has this in vn_pollgone() */
	vp->v_flag &= ~(VXLOCK|VLOCKSWORK);
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		simple_unlock(&vp->v_interlock);
		wakeup((caddr_t)vp);
	} else
		simple_unlock(&vp->v_interlock);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(struct vnode *vp, struct simplelock *inter_lkp, struct proc *p)
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp)
			simple_unlock(inter_lkp);
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(struct vnode *vp)
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(struct vnode *vp, struct proc *p)
{

	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */

	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		ltsleep(vp, PINOD | PNORELOCK, "vgone", 0, &vp->v_interlock);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */

	vclean(vp, DOCLOSE, p);
	KASSERT((vp->v_flag & VONWORKLST) == 0);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */

	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);

	/*
	 * The test of the back pointer and the reference count of
	 * zero is because it will be removed from the free list by
	 * getcleanvnode, but will not have its reference count
	 * incremented until after calling vgone.  If the reference
	 * count were incremented first, vgone would (incorrectly)
	 * try to close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from the freelist to ensure
	 * that we do not try to move it here.
	 */

	vp->v_type = VBAD;
	if (vp->v_usecount == 0) {
		boolean_t dofree;

		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			panic("vgonel: not clean, vp %p", vp);
		/*
		 * If it isn't on the freelist, we're called by getcleanvnode
		 * and the vnode is being re-used; otherwise, we'll free it.
		 */
		dofree = vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb;
		if (dofree) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			numvnodes--;
		}
		simple_unlock(&vnode_free_list_slock);
		if (dofree)
			pool_put(&vnode_pool, vp);
	}
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev_t dev, enum vtype type, struct vnode **vpp)
{
	struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Revoke all the vnodes corresponding to the specified minor number
 * range (endpoints inclusive) of the specified major.
 */
void
vdevgone(int maj, int minl, int minh, enum vtype type)
{
	struct vnode *vp;
	int mn;

	for (mn = minl; mn <= minh; mn++)
		if (vfinddev(makedev(maj, mn), type, &vp))
			VOP_REVOKE(vp, REVOKEALL);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(struct vnode *vp)
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp &&
		    (vq->v_flag & VXLOCK) == 0) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}

#define ARRAY_SIZE(arr)		(sizeof(arr) / sizeof(arr[0]))
#define ARRAY_PRINT(idx, arr) \
    ((idx) > 0 && (idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN")

const char * const vnode_tags[] = { VNODE_TAGS };
const char * const vnode_types[] = { VNODE_TYPES };
const char vnode_flagbits[] = VNODE_FLAGBITS;

/*
 * Print out a description of a vnode.
 */
void
vprint(const char *label, struct vnode *vp)
{
	char bf[96];

	if (label != NULL)
		printf("%s: ", label);
	printf("tag %s(%d) type %s(%d), usecount %d, writecount %ld, "
	    "refcount %ld,", ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
	    ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
	bitmask_snprintf(vp->v_flag, vnode_flagbits, bf, sizeof(bf));
	if (bf[0] != '\0')
		printf(" flags (%s)", &bf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DEBUG
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
void
printlockedvnodes(void)
{
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	    mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
			if (VOP_ISLOCKED(vp))
				vprint(NULL, vp);
		}
		simple_lock(&mountlist_slock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	simple_unlock(&mountlist_slock);
}
#endif

/*
 * sysctl helper routine for vfs.generic.conf lookups.
 */
#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
static int
sysctl_vfs_generic_conf(SYSCTLFN_ARGS)
{
	struct vfsconf vfc;
	extern const char * const mountcompatnames[];
	extern int nmountcompatnames;
	struct sysctlnode node;
	struct vfsops *vfsp;
	u_int vfsnum;

	if (namelen != 1)
		return (ENOTDIR);
	vfsnum = name[0];
	if (vfsnum >= nmountcompatnames ||
	    mountcompatnames[vfsnum] == NULL)
		return (EOPNOTSUPP);
	vfsp = vfs_getopsbyname(mountcompatnames[vfsnum]);
	if (vfsp == NULL)
		return (EOPNOTSUPP);

	vfc.vfc_vfsops = vfsp;
	strncpy(vfc.vfc_name, vfsp->vfs_name, MFSNAMELEN);
	vfc.vfc_typenum = vfsnum;
	vfc.vfc_refcount = vfsp->vfs_refcount;
	vfc.vfc_flags = 0;
	vfc.vfc_mountroot = vfsp->vfs_mountroot;
	vfc.vfc_next = NULL;

	node = *rnode;
	node.sysctl_data = &vfc;
	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}
#endif

/*
 * sysctl helper routine to return list of supported fstypes
 */
static int
sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
{
	char bf[MFSNAMELEN];
	char *where = oldp;
	struct vfsops *v;
	size_t needed, left, slen;
	int error, first;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	first = 1;
	error = 0;
	needed = 0;
	left = *oldlenp;

	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (where == NULL)
			needed += strlen(v->vfs_name) + 1;
		else {
			memset(bf, 0, sizeof(bf));
			if (first) {
				strncpy(bf, v->vfs_name, sizeof(bf));
				first = 0;
			} else {
				bf[0] = ' ';
				strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
			}
			bf[sizeof(bf)-1] = '\0';
			slen = strlen(bf);
			if (left < slen + 1)
				break;
			/* +1 to copy out the trailing NUL byte */
			error = copyout(bf, where, slen + 1);
			if (error)
				break;
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	*oldlenp = needed;
	return (error);
}
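
/*
 * Illustrative result: the handler above produces one string of
 * space-separated filesystem names, so "sysctl vfs.generic.fstypes"
 * prints something like "ffs nfs ..." depending on what is compiled in.
 */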
 */
SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup")
{
#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
	extern int nmountcompatnames;
#endif

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "vfs", NULL,
		       NULL, 0, NULL, 0,
		       CTL_VFS, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "generic",
		       SYSCTL_DESCR("Non-specific vfs related information"),
		       NULL, 0, NULL, 0,
		       CTL_VFS, VFS_GENERIC, CTL_EOL);

#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
		       CTLTYPE_INT, "maxtypenum",
		       SYSCTL_DESCR("Highest valid filesystem type number"),
		       NULL, nmountcompatnames, NULL, 0,
		       CTL_VFS, VFS_GENERIC, VFS_MAXTYPENUM, CTL_EOL);
#endif
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "usermount",
		       SYSCTL_DESCR("Whether unprivileged users may mount "
				    "filesystems"),
		       NULL, 0, &dovfsusermount, 0,
		       CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRING, "fstypes",
		       SYSCTL_DESCR("List of file systems present"),
		       sysctl_vfs_generic_fstypes, 0, NULL, 0,
		       CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRUCT, "conf",
		       SYSCTL_DESCR("Filesystem configuration information"),
		       sysctl_vfs_generic_conf, 0, NULL,
		       sizeof(struct vfsconf),
		       CTL_VFS, VFS_GENERIC, VFS_CONF, CTL_EOL);
#endif
}

int kinfo_vdebug = 1;
int kinfo_vgetfailed;
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
int
sysctl_kern_vnode(SYSCTLFN_ARGS)
{
	char *where = oldp;
	size_t *sizep = oldlenp;
	struct mount *mp, *nmp;
	struct vnode *nvp, *vp;
	char *bp = where, *savebp;
	char *ewhere;
	int error;

	if (namelen != 0)
		return (EOPNOTSUPP);
	if (newp != NULL)
		return (EPERM);

#define VPTRSZ	sizeof(struct vnode *)
#define VNODESZ	sizeof(struct vnode)
	if (where == NULL) {
		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
		return (0);
	}
	ewhere = where + *sizep;

	simple_lock(&mountlist_slock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	     mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		savebp = bp;
again:
		simple_lock(&mntvnode_slock);
		for (vp = LIST_FIRST(&mp->mnt_vnodelist);
		     vp != NULL;
		     vp = nvp) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem. RACE: could have been
			 * recycled onto the same filesystem.
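			 * If that happens, the scan of this mount is
			 * simply restarted from savebp so that no
			 * stale or duplicate entries are copied out.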
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				if (kinfo_vdebug)
					printf("kinfo: vp changed\n");
				bp = savebp;
				goto again;
			}
			nvp = LIST_NEXT(vp, v_mntvnodes);
			if (bp + VPTRSZ + VNODESZ > ewhere) {
				simple_unlock(&mntvnode_slock);
				*sizep = bp - where;
				return (ENOMEM);
			}
			simple_unlock(&mntvnode_slock);
			if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
			    (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ)))
				return (error);
			bp += VPTRSZ + VNODESZ;
			simple_lock(&mntvnode_slock);
		}
		simple_unlock(&mntvnode_slock);
		simple_lock(&mountlist_slock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	simple_unlock(&mountlist_slock);

	*sizep = bp - where;
	return (0);
}

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(struct vnode *vp)
{
	struct vnode *vq;
	int error = 0;

	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	if (vp->v_flag & VALIASED) {
		simple_lock(&spechash_slock);
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type)
				continue;
			if (vq->v_specmountpoint != NULL) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	return (error);
}

/*
 * Sanity check an export address passed in from userland: verify the
 * expected length for the address family, a zero port, and (for
 * AF_INET) zeroed padding.  Returns 0 on success, -1 on failure.
 */
static int
sacheck(struct sockaddr *sa)
{
	switch (sa->sa_family) {
#ifdef INET
	case AF_INET: {
		struct sockaddr_in *sin = (struct sockaddr_in *)sa;
		char *p = (char *)sin->sin_zero;
		size_t i;

		if (sin->sin_len != sizeof(*sin))
			return -1;
		if (sin->sin_port != 0)
			return -1;
		for (i = 0; i < sizeof(sin->sin_zero); i++)
			if (*p++ != '\0')
				return -1;
		return 0;
	}
#endif
#ifdef INET6
	case AF_INET6: {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;

		if (sin6->sin6_len != sizeof(*sin6))
			return -1;
		if (sin6->sin6_port != 0)
			return -1;
		return 0;
	}
#endif
	default:
		return -1;
	}
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
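 *
 * Reached via vfs_export() below; installing a read-only default
 * export entry (ex_addrlen == 0) might look like this on the
 * caller's side (illustrative sketch, hypothetical values):
 *
 *	struct export_args ea;
 *
 *	memset(&ea, 0, sizeof(ea));
 *	ea.ex_flags = MNT_EXPORTED | MNT_EXRDONLY;
 *	error = vfs_export(mp, nep, &ea);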
 */
static int
vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
    struct export_args *argp)
{
	struct netcred *np, *enp;
	struct radix_node_head *rnh;
	int i;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		crcvt(&np->netc_anon, &argp->ex_anon);
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}

	if (argp->ex_addrlen > MLEN || argp->ex_masklen > MLEN)
		return (EINVAL);

	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK);
	memset((caddr_t)np, 0, i);
	saddr = (struct sockaddr *)(np + 1);
	error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen);
	if (error)
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (sacheck(saddr) == -1) {
		error = EINVAL;
		goto out;
	}
	if (argp->ex_masklen) {
		smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
		if (smask->sa_family != saddr->sa_family) {
			error = EINVAL;
			goto out;
		}
		if (sacheck(smask) == -1) {
			error = EINVAL;
			goto out;
		}
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not
		 * used, do so on demand here.
		 */
		DOMAIN_FOREACH(dom) {
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **)&nep->ne_rtable[i],
					dom->dom_rtoffset);
				break;
			}
		}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}

	enp = (struct netcred *)(*rnh->rnh_addaddr)(saddr, smask, rnh,
	    np->netc_rnodes);
	if (enp != np) {
		if (enp == NULL) {
			enp = (struct netcred *)(*rnh->rnh_lookup)(saddr,
			    smask, rnh);
			if (enp == NULL) {
				error = EPERM;
				goto out;
			}
		} else
			enp->netc_refcnt++;

		goto check;
	} else
		enp->netc_refcnt = 1;

	np->netc_exflags = argp->ex_flags;
	crcvt(&np->netc_anon, &argp->ex_anon);
	np->netc_anon.cr_ref = 1;
	return 0;
check:
	if (enp->netc_exflags != argp->ex_flags ||
	    crcmp(&enp->netc_anon, &argp->ex_anon) != 0)
		error = EPERM;
	else
		error = 0;
out:
	free(np, M_NETADDR);
	return error;
}

/* ARGSUSED */
static int
vfs_free_netcred(struct radix_node *rn, void *w)
{
	struct radix_node_head *rnh = (struct radix_node_head *)w;
	struct netcred *np = (struct netcred *)(void *)rn;

	(*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
	if (--(np->netc_refcnt) <= 0)
		free(np, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
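 *
 * Each per-address-family radix tree is walked with vfs_free_netcred()
 * to delete and free the individual entries before the tree itself is
 * freed.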
 */
static void
vfs_free_addrlist(struct netexport *nep)
{
	int i;
	struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i]) != NULL) {
			(*rnh->rnh_walktree)(rnh, vfs_free_netcred, rnh);
			free((caddr_t)rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

/*
 * Update the export list of a mount point as described by argp:
 * MNT_DELEXPORT removes the existing exports (including any WebNFS
 * public filesystem), and MNT_EXPORTED installs new entries via
 * vfs_hang_addrlist().
 */
int
vfs_export(struct mount *mp, struct netexport *nep, struct export_args *argp)
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)) != 0)
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}

/*
 * Set the publicly exported filesystem (WebNFS).  Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055).
 */
int
vfs_setpublicfs(struct mount *mp, struct netexport *nep,
    struct export_args *argp)
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info, the FS is
	 * no longer exported.  May be called from either vfs_export
	 * or unmount, so check if it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	memset((caddr_t)&nfs_pub.np_handle, 0, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsidx;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) {
		vput(rvp);
		return (error);
	}

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    MAXNAMLEN, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}

/*
 * Look up the export credentials (if any) that apply to the client
 * address in nam for the given mount point.
 */
struct netcred *
vfs_export_lookup(struct mount *mp, struct netexport *nep, struct mbuf *nam)
{
	struct netcred *np;
	struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
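		 * A match on the radix tree root (RNF_ROOT) is not a
		 * real export entry and is rejected below.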
		 */
		if (nam != NULL) {
			saddr = mtod(nam, struct sockaddr *);
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
					(*rnh->rnh_matchaddr)((caddr_t)saddr,
					    rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * Do the usual access checking.
 * file_mode, uid and gid are from the vnode in question,
 * while acc_mode and cred are from the VOP_ACCESS parameter list.
 */
int
vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid,
    mode_t acc_mode, struct ucred *cred)
{
	mode_t mask;

	/*
	 * Super-user always gets read/write access, but execute access depends
	 * on at least one execute bit being set.
	 */
	if (cred->cr_uid == 0) {
		if ((acc_mode & VEXEC) && type != VDIR &&
		    (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)
			return (EACCES);
		return (0);
	}

	mask = 0;

	/* Otherwise, check the owner. */
	if (cred->cr_uid == uid) {
		if (acc_mode & VEXEC)
			mask |= S_IXUSR;
		if (acc_mode & VREAD)
			mask |= S_IRUSR;
		if (acc_mode & VWRITE)
			mask |= S_IWUSR;
		return ((file_mode & mask) == mask ? 0 : EACCES);
	}

	/* Otherwise, check the groups. */
	if (cred->cr_gid == gid || groupmember(gid, cred)) {
		if (acc_mode & VEXEC)
			mask |= S_IXGRP;
		if (acc_mode & VREAD)
			mask |= S_IRGRP;
		if (acc_mode & VWRITE)
			mask |= S_IWGRP;
		return ((file_mode & mask) == mask ? 0 : EACCES);
	}

	/* Otherwise, check everyone else. */
	if (acc_mode & VEXEC)
		mask |= S_IXOTH;
	if (acc_mode & VREAD)
		mask |= S_IROTH;
	if (acc_mode & VWRITE)
		mask |= S_IWOTH;
	return ((file_mode & mask) == mask ? 0 : EACCES);
}

/*
 * Unmount all file systems.
 * We traverse the list in reverse order under the assumption that doing so
 * will avoid needing to worry about dependencies.
 */
void
vfs_unmountall(struct proc *p)
{
	struct mount *mp, *nmp;
	int allerror, error;

	printf("unmounting file systems...");
	for (allerror = 0,
	     mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;
#ifdef DEBUG
		printf("\nunmounting %s (%s)...",
		    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
#endif
		/*
		 * XXX Freeze syncer.  Must do this before locking the
		 * mount point.  See dounmount() for details.
		 */
		lockmgr(&syncer_lock, LK_EXCLUSIVE, NULL);
		if (vfs_busy(mp, 0, 0)) {
			lockmgr(&syncer_lock, LK_RELEASE, NULL);
			continue;
		}
		if ((error = dounmount(mp, MNT_FORCE, p)) != 0) {
			printf("unmount of %s failed with error %d\n",
			    mp->mnt_stat.f_mntonname, error);
			allerror = 1;
		}
	}
	printf(" done\n");
	if (allerror)
		printf("WARNING: some file systems would not unmount\n");
}

extern struct simplelock bqueue_slock; /* XXX */

/*
 * Sync and unmount file systems before shutting down.
 */
void
vfs_shutdown(void)
{
	struct lwp *l = curlwp;
	struct proc *p;

	/* XXX we're certainly not running in proc0's context!
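	 * Fall back to proc0 below if no lwp/proc is available at
	 * this point in the shutdown.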
	 */
	if (l == NULL || (p = l->l_proc) == NULL)
		p = &proc0;

	printf("syncing disks... ");

	/* remove user processes from run queue */
	suspendsched();
	(void) spl0();

	/* avoid coming back this way again if we panic. */
	doing_shutdown = 1;

	sys_sync(l, NULL, NULL);

	/* Wait for sync to finish. */
	if (buf_syncwait() != 0) {
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
		Debugger();
#endif
		printf("giving up\n");
		return;
	} else
		printf("done\n");

	/*
	 * If we've panic'd, don't make the situation potentially
	 * worse by unmounting the file systems.
	 */
	if (panicstr != NULL)
		return;

	/* Release inodes held by texts before update. */
#ifdef notdef
	vnshutdown();
#endif
	/* Unmount file systems. */
	vfs_unmountall(p);
}

/*
 * Mount the root file system.  If the operator didn't specify a
 * file system to use, try all possible file systems until one
 * succeeds.
 */
int
vfs_mountroot(void)
{
	struct vfsops *v;
	int error = ENODEV;

	if (root_device == NULL)
		panic("vfs_mountroot: root device unknown");

	switch (root_device->dv_class) {
	case DV_IFNET:
		if (rootdev != NODEV)
			panic("vfs_mountroot: rootdev set for DV_IFNET "
			    "(0x%08x -> %d,%d)", rootdev,
			    major(rootdev), minor(rootdev));
		break;

	case DV_DISK:
		if (rootdev == NODEV)
			panic("vfs_mountroot: rootdev not set for DV_DISK");
		if (bdevvp(rootdev, &rootvp))
			panic("vfs_mountroot: can't get vnode for rootdev");
		error = VOP_OPEN(rootvp, FREAD, FSCRED, curproc);
		if (error) {
			printf("vfs_mountroot: can't open root device\n");
			return (error);
		}
		break;

	default:
		printf("%s: inappropriate for root file system\n",
		    root_device->dv_xname);
		return (ENODEV);
	}

	/*
	 * If user specified a file system, use it.
	 */
	if (mountroot != NULL) {
		error = (*mountroot)();
		goto done;
	}

	/*
	 * Try each file system currently configured into the kernel.
	 */
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (v->vfs_mountroot == NULL)
			continue;
#ifdef DEBUG
		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
#endif
		error = (*v->vfs_mountroot)();
		if (!error) {
			aprint_normal("root file system type: %s\n",
			    v->vfs_name);
			break;
		}
	}

	if (v == NULL) {
		printf("no file system for %s", root_device->dv_xname);
		if (root_device->dv_class == DV_DISK)
			printf(" (dev 0x%x)", rootdev);
		printf("\n");
		error = EFTYPE;
	}

done:
	if (error && root_device->dv_class == DV_DISK) {
		VOP_CLOSE(rootvp, FREAD, FSCRED, curproc);
		vrele(rootvp);
	}
	return (error);
}

/*
 * Given a file system name, look up the vfsops for that
 * file system, or return NULL if the file system isn't present
 * in the kernel.
 */
struct vfsops *
vfs_getopsbyname(const char *name)
{
	struct vfsops *v;

	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (strcmp(v->vfs_name, name) == 0)
			break;
	}

	return (v);
}

/*
 * Establish a file system and initialize it.
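 *
 * For example, a file system compiled into the kernel or loaded as a
 * module registers its vfsops this way (illustrative sketch):
 *
 *	extern struct vfsops ffs_vfsops;
 *
 *	if ((error = vfs_attach(&ffs_vfsops)) != 0)
 *		return (error);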
 */
int
vfs_attach(struct vfsops *vfs)
{
	struct vfsops *v;
	int error = 0;

	/*
	 * Make sure this file system doesn't already exist.
	 */
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (strcmp(vfs->vfs_name, v->vfs_name) == 0) {
			error = EEXIST;
			goto out;
		}
	}

	/*
	 * Initialize the vnode operations for this file system.
	 */
	vfs_opv_init(vfs->vfs_opv_descs);

	/*
	 * Now initialize the file system itself.
	 */
	(*vfs->vfs_init)();

	/*
	 * ...and link it into the kernel's list.
	 */
	LIST_INSERT_HEAD(&vfs_list, vfs, vfs_list);

	/*
	 * Sanity: make sure the reference count is 0.
	 */
	vfs->vfs_refcount = 0;

out:
	return (error);
}

/*
 * Remove a file system from the kernel.
 */
int
vfs_detach(struct vfsops *vfs)
{
	struct vfsops *v;

	/*
	 * Make sure no one is using the filesystem.
	 */
	if (vfs->vfs_refcount != 0)
		return (EBUSY);

	/*
	 * ...and remove it from the kernel's list.
	 */
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (v == vfs) {
			LIST_REMOVE(v, vfs_list);
			break;
		}
	}

	if (v == NULL)
		return (ESRCH);

	/*
	 * Now run the file system-specific cleanups.
	 */
	(*vfs->vfs_done)();

	/*
	 * Free the vnode operations vector.
	 */
	vfs_opv_free(vfs->vfs_opv_descs);
	return (0);
}

/*
 * Invoke the reinit routine, if any, of every attached file system.
 */
void
vfs_reinit(void)
{
	struct vfsops *vfs;

	LIST_FOREACH(vfs, &vfs_list, vfs_list) {
		if (vfs->vfs_reinit) {
			(*vfs->vfs_reinit)();
		}
	}
}

/*
 * Request a filesystem to suspend write operations.
 */
int
vfs_write_suspend(struct mount *mp, int slpflag, int slptimeo)
{
	struct proc *p = curproc;	/* XXX */
	int error;

	while ((mp->mnt_iflag & IMNT_SUSPEND)) {
		if (slptimeo < 0)
			return EWOULDBLOCK;
		error = tsleep(&mp->mnt_flag, slpflag, "suspwt1", slptimeo);
		if (error)
			return error;
	}
	mp->mnt_iflag |= IMNT_SUSPEND;

	simple_lock(&mp->mnt_slock);
	if (mp->mnt_writeopcountupper > 0)
		ltsleep(&mp->mnt_writeopcountupper, PUSER - 1, "suspwt",
		    0, &mp->mnt_slock);
	simple_unlock(&mp->mnt_slock);

	error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p);
	if (error) {
		vfs_write_resume(mp);
		return error;
	}
	mp->mnt_iflag |= IMNT_SUSPENDLOW;

	simple_lock(&mp->mnt_slock);
	if (mp->mnt_writeopcountlower > 0)
		ltsleep(&mp->mnt_writeopcountlower, PUSER - 1, "suspwt",
		    0, &mp->mnt_slock);
	mp->mnt_iflag |= IMNT_SUSPENDED;
	simple_unlock(&mp->mnt_slock);

	return 0;
}

/*
 * Request a filesystem to resume write operations.
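 *
 * Always paired with a successful vfs_write_suspend(), e.g.
 * (illustrative sketch):
 *
 *	if (vfs_write_suspend(mp, 0, 0) == 0) {
 *		... operate on the now-quiescent file system ...
 *		vfs_write_resume(mp);
 *	}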
 */
void
vfs_write_resume(struct mount *mp)
{

	if ((mp->mnt_iflag & IMNT_SUSPEND) == 0)
		return;
	mp->mnt_iflag &= ~(IMNT_SUSPEND | IMNT_SUSPENDLOW | IMNT_SUSPENDED);
	wakeup(&mp->mnt_flag);
}

/*
 * Copy the generic (file system independent) statvfs fields cached in
 * the mount point into the caller's buffer.
 */
void
copy_statvfs_info(struct statvfs *sbp, const struct mount *mp)
{
	const struct statvfs *mbp;

	if (sbp == (mbp = &mp->mnt_stat))
		return;

	(void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx));
	sbp->f_fsid = mbp->f_fsid;
	sbp->f_owner = mbp->f_owner;
	sbp->f_flag = mbp->f_flag;
	sbp->f_syncwrites = mbp->f_syncwrites;
	sbp->f_asyncwrites = mbp->f_asyncwrites;
	sbp->f_syncreads = mbp->f_syncreads;
	sbp->f_asyncreads = mbp->f_asyncreads;
	(void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare));
	(void)memcpy(sbp->f_fstypename, mbp->f_fstypename,
	    sizeof(sbp->f_fstypename));
	(void)memcpy(sbp->f_mntonname, mbp->f_mntonname,
	    sizeof(sbp->f_mntonname));
	(void)memcpy(sbp->f_mntfromname, mbp->f_mntfromname,
	    sizeof(sbp->f_mntfromname));
	sbp->f_namemax = mbp->f_namemax;
}

/*
 * Record the mounted-on and mounted-from names in the mount point's
 * statvfs cache; ukon/ukfrom indicate whether the strings live in
 * user or kernel space.  If the process is chrooted, the path of the
 * current root directory is prepended to the mounted-on name.
 */
int
set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom,
    struct mount *mp, struct proc *p)
{
	int error;
	size_t size;
	struct statvfs *sfs = &mp->mnt_stat;
	int (*fun)(const void *, void *, size_t, size_t *);

	(void)strncpy(mp->mnt_stat.f_fstypename, mp->mnt_op->vfs_name,
	    sizeof(mp->mnt_stat.f_fstypename));

	if (onp) {
		struct cwdinfo *cwdi = p->p_cwdi;
		fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr;
		if (cwdi->cwdi_rdir != NULL) {
			size_t len;
			char *bp;
			char *path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);

			if (!path)	/* XXX can't happen with M_WAITOK */
				return ENOMEM;

			bp = path + MAXPATHLEN;
			*--bp = '\0';
			error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp,
			    path, MAXPATHLEN / 2, 0, p);
			if (error) {
				free(path, M_TEMP);
				return error;
			}

			len = strlen(bp);
			if (len > sizeof(sfs->f_mntonname) - 1)
				len = sizeof(sfs->f_mntonname) - 1;
			(void)strncpy(sfs->f_mntonname, bp, len);
			free(path, M_TEMP);

			if (len < sizeof(sfs->f_mntonname) - 1) {
				error = (*fun)(onp, &sfs->f_mntonname[len],
				    sizeof(sfs->f_mntonname) - len - 1, &size);
				if (error)
					return error;
				size += len;
			} else {
				size = len;
			}
		} else {
			error = (*fun)(onp, &sfs->f_mntonname,
			    sizeof(sfs->f_mntonname) - 1, &size);
			if (error)
				return error;
		}
		(void)memset(sfs->f_mntonname + size, 0,
		    sizeof(sfs->f_mntonname) - size);
	}

	if (fromp) {
		fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr;
		error = (*fun)(fromp, sfs->f_mntfromname,
		    sizeof(sfs->f_mntfromname) - 1, &size);
		if (error)
			return error;
		(void)memset(sfs->f_mntfromname + size, 0,
		    sizeof(sfs->f_mntfromname) - size);
	}
	return 0;
}

/*
 * Default vfs_extattrctl routine for file systems that do not support
 * it.
 */
/*ARGSUSED*/
int
vfs_stdextattrctl(struct mount *mp, int cmd, struct vnode *vp,
    int attrnamespace, const char *attrname, struct proc *p)
{

	if (vp != NULL)
		VOP_UNLOCK(vp, 0);
	return (EOPNOTSUPP);
}

/*
 * Credential check based on process requesting service, and per-attribute
 * permissions.
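 *
 * A file system's extended attribute VOPs would typically call this
 * before touching an attribute, e.g. (illustrative sketch):
 *
 *	error = extattr_check_cred(vp, attrnamespace, cred, p, VWRITE);
 *	if (error)
 *		return (error);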
2989 * 2990 * NOTE: Vnode must be locked. 2991 */ 2992 int 2993 extattr_check_cred(struct vnode *vp, int attrnamespace, 2994 struct ucred *cred, struct proc *p, int access) 2995 { 2996 2997 if (cred == NOCRED) 2998 return (0); 2999 3000 switch (attrnamespace) { 3001 case EXTATTR_NAMESPACE_SYSTEM: 3002 /* 3003 * Do we really want to allow this, or just require that 3004 * these requests come from kernel code (NOCRED case above)? 3005 */ 3006 return (suser(cred, &p->p_acflag)); 3007 3008 case EXTATTR_NAMESPACE_USER: 3009 return (VOP_ACCESS(vp, access, cred, p)); 3010 3011 default: 3012 return (EPERM); 3013 } 3014 } 3015 3016 #ifdef DDB 3017 static const char buf_flagbits[] = BUF_FLAGBITS; 3018 3019 void 3020 vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...)) 3021 { 3022 char bf[1024]; 3023 3024 (*pr)(" vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" dev 0x%x\n", 3025 bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_dev); 3026 3027 bitmask_snprintf(bp->b_flags, buf_flagbits, bf, sizeof(bf)); 3028 (*pr)(" error %d flags 0x%s\n", bp->b_error, bf); 3029 3030 (*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n", 3031 bp->b_bufsize, bp->b_bcount, bp->b_resid); 3032 (*pr)(" data %p saveaddr %p dep %p\n", 3033 bp->b_data, bp->b_saveaddr, LIST_FIRST(&bp->b_dep)); 3034 (*pr)(" iodone %p\n", bp->b_iodone); 3035 } 3036 3037 3038 void 3039 vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...)) 3040 { 3041 char bf[256]; 3042 3043 uvm_object_printit(&vp->v_uobj, full, pr); 3044 bitmask_snprintf(vp->v_flag, vnode_flagbits, bf, sizeof(bf)); 3045 (*pr)("\nVNODE flags %s\n", bf); 3046 (*pr)("mp %p numoutput %d size 0x%llx\n", 3047 vp->v_mount, vp->v_numoutput, vp->v_size); 3048 3049 (*pr)("data %p usecount %d writecount %ld holdcnt %ld numoutput %d\n", 3050 vp->v_data, vp->v_usecount, vp->v_writecount, 3051 vp->v_holdcnt, vp->v_numoutput); 3052 3053 (*pr)("tag %s(%d) type %s(%d) mount %p typedata %p\n", 3054 ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag, 3055 ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type, 3056 vp->v_mount, vp->v_mountedhere); 3057 3058 if (full) { 3059 struct buf *bp; 3060 3061 (*pr)("clean bufs:\n"); 3062 LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) { 3063 (*pr)(" bp %p\n", bp); 3064 vfs_buf_print(bp, full, pr); 3065 } 3066 3067 (*pr)("dirty bufs:\n"); 3068 LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) { 3069 (*pr)(" bp %p\n", bp); 3070 vfs_buf_print(bp, full, pr); 3071 } 3072 } 3073 } 3074 3075 void 3076 vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...)) 3077 { 3078 char sbuf[256]; 3079 3080 (*pr)("vnodecovered = %p syncer = %p data = %p\n", 3081 mp->mnt_vnodecovered,mp->mnt_syncer,mp->mnt_data); 3082 3083 (*pr)("fs_bshift %d dev_bshift = %d\n", 3084 mp->mnt_fs_bshift,mp->mnt_dev_bshift); 3085 3086 bitmask_snprintf(mp->mnt_flag, __MNT_FLAG_BITS, sbuf, sizeof(sbuf)); 3087 (*pr)("flag = %s\n", sbuf); 3088 3089 bitmask_snprintf(mp->mnt_iflag, __IMNT_FLAG_BITS, sbuf, sizeof(sbuf)); 3090 (*pr)("iflag = %s\n", sbuf); 3091 3092 /* XXX use lockmgr_printinfo */ 3093 if (mp->mnt_lock.lk_sharecount) 3094 (*pr)(" lock type %s: SHARED (count %d)", mp->mnt_lock.lk_wmesg, 3095 mp->mnt_lock.lk_sharecount); 3096 else if (mp->mnt_lock.lk_flags & LK_HAVE_EXCL) { 3097 (*pr)(" lock type %s: EXCL (count %d) by ", 3098 mp->mnt_lock.lk_wmesg, mp->mnt_lock.lk_exclusivecount); 3099 if (mp->mnt_lock.lk_flags & LK_SPIN) 3100 (*pr)("processor %lu", mp->mnt_lock.lk_cpu); 3101 else 3102 (*pr)("pid %d.%d", mp->mnt_lock.lk_lockholder, 3103 mp->mnt_lock.lk_locklwp); 3104 
	} else
		(*pr)(" not locked");
	if ((mp->mnt_lock.lk_flags & LK_SPIN) == 0 &&
	    mp->mnt_lock.lk_waitcount > 0)
		(*pr)(" with %d pending", mp->mnt_lock.lk_waitcount);

	(*pr)("\n");

	if (mp->mnt_unmounter) {
		(*pr)("unmounter pid = %d ", mp->mnt_unmounter->p_pid);
	}
	(*pr)("wcnt = %d, writeopcountupper = %d, writeopcountlower = %d\n",
	    mp->mnt_wcnt, mp->mnt_writeopcountupper,
	    mp->mnt_writeopcountlower);

	(*pr)("statvfs cache:\n");
	(*pr)("\tbsize = %lu\n", mp->mnt_stat.f_bsize);
	(*pr)("\tfrsize = %lu\n", mp->mnt_stat.f_frsize);
	(*pr)("\tiosize = %lu\n", mp->mnt_stat.f_iosize);

	(*pr)("\tblocks = %"PRIu64"\n", mp->mnt_stat.f_blocks);
	(*pr)("\tbfree = %"PRIu64"\n", mp->mnt_stat.f_bfree);
	(*pr)("\tbavail = %"PRIu64"\n", mp->mnt_stat.f_bavail);
	(*pr)("\tbresvd = %"PRIu64"\n", mp->mnt_stat.f_bresvd);

	(*pr)("\tfiles = %"PRIu64"\n", mp->mnt_stat.f_files);
	(*pr)("\tffree = %"PRIu64"\n", mp->mnt_stat.f_ffree);
	(*pr)("\tfavail = %"PRIu64"\n", mp->mnt_stat.f_favail);
	(*pr)("\tfresvd = %"PRIu64"\n", mp->mnt_stat.f_fresvd);

	(*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n",
	    mp->mnt_stat.f_fsidx.__fsid_val[0],
	    mp->mnt_stat.f_fsidx.__fsid_val[1]);

	(*pr)("\towner = %"PRIu32"\n", mp->mnt_stat.f_owner);
	(*pr)("\tnamemax = %lu\n", mp->mnt_stat.f_namemax);

	bitmask_snprintf(mp->mnt_stat.f_flag, __MNT_FLAG_BITS, sbuf,
	    sizeof(sbuf));
	(*pr)("\tflag = %s\n", sbuf);
	(*pr)("\tsyncwrites = %" PRIu64 "\n", mp->mnt_stat.f_syncwrites);
	(*pr)("\tasyncwrites = %" PRIu64 "\n", mp->mnt_stat.f_asyncwrites);
	(*pr)("\tsyncreads = %" PRIu64 "\n", mp->mnt_stat.f_syncreads);
	(*pr)("\tasyncreads = %" PRIu64 "\n", mp->mnt_stat.f_asyncreads);
	(*pr)("\tfstypename = %s\n", mp->mnt_stat.f_fstypename);
	(*pr)("\tmntonname = %s\n", mp->mnt_stat.f_mntonname);
	(*pr)("\tmntfromname = %s\n", mp->mnt_stat.f_mntfromname);

	{
		int cnt = 0;
		struct vnode *vp;
		(*pr)("locked vnodes =");
		/* XXX would take mountlist lock, except ddb may not have context */
		LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
			if (VOP_ISLOCKED(vp)) {
				if ((++cnt % 6) == 0) {
					(*pr)(" %p,\n\t", vp);
				} else {
					(*pr)(" %p,", vp);
				}
			}
		}
		(*pr)("\n");
	}

	if (full) {
		int cnt = 0;
		struct vnode *vp;
		(*pr)("all vnodes =");
		/* XXX would take mountlist lock, except ddb may not have context */
		LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
			if (!LIST_NEXT(vp, v_mntvnodes)) {
				(*pr)(" %p", vp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", vp);
			} else {
				(*pr)(" %p,", vp);
			}
		}
		(*pr)("\n");
	}
}
#endif /* DDB */