/*	$NetBSD: vfs_subr.c,v 1.243 2005/03/02 11:05:34 mycroft Exp $	*/

/*-
 * Copyright (c) 1997, 1998, 2004, 2005 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * External virtual filesystem routines
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.243 2005/03/02 11:05:34 mycroft Exp $");

#include "opt_inet.h"
#include "opt_ddb.h"
#include "opt_compat_netbsd.h"
#include "opt_compat_43.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/time.h>
#include <sys/event.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/sa.h>
#include <sys/syscallargs.h>
#include <sys/device.h>
#include <sys/extattr.h>
#include <sys/dirent.h>
#include <sys/filedesc.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/syncfs/syncfs.h>

#include <netinet/in.h>

#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>

#include <sys/sysctl.h>

const enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
const int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

int doforce = 1;		/* 1 => permit forcible unmounting */
int prtactive = 0;		/* 1 => print out reclaim of active vnodes */

extern int dovfsusermount;	/* 1 => permit any user to mount filesystems */

/*
 * Insq/Remq for the vnode usage lists.
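 * bufinsvn() links a buffer onto one of a vnode's buffer lists;
 * bufremvn() unlinks it and sets the list pointer to NOLIST so that
 * callers can tell the buffer is no longer on any vnode list.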
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) {							\
	LIST_REMOVE(bp, b_vnbufs);					\
	(bp)->b_vnbufs.le_next = NOLIST;				\
}
/* TAILQ_HEAD(freelst, vnode) vnode_free_list = vnode free list (in vnode.h) */
struct freelst vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
struct freelst vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);

struct mntlist mountlist =			/* mounted filesystem list */
    CIRCLEQ_HEAD_INITIALIZER(mountlist);
struct vfs_list_head vfs_list =			/* vfs list */
    LIST_HEAD_INITIALIZER(vfs_list);

struct nfs_public nfs_pub;			/* publicly exported FS */

struct simplelock mountlist_slock = SIMPLELOCK_INITIALIZER;
static struct simplelock mntid_slock = SIMPLELOCK_INITIALIZER;
struct simplelock mntvnode_slock = SIMPLELOCK_INITIALIZER;
struct simplelock vnode_free_list_slock = SIMPLELOCK_INITIALIZER;
struct simplelock spechash_slock = SIMPLELOCK_INITIALIZER;

/* XXX - gross; single global lock to protect v_numoutput */
struct simplelock global_v_numoutput_slock = SIMPLELOCK_INITIALIZER;

/*
 * These define the root filesystem and device.
 */
struct mount *rootfs;
struct vnode *rootvnode;
struct device *root_device;			/* root device */

POOL_INIT(vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodepl",
    &pool_allocator_nointr);

MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");

/*
 * Local declarations.
 */
void insmntque(struct vnode *, struct mount *);
int getdevvp(dev_t, struct vnode **, enum vtype);

void vclean(struct vnode *, int, struct proc *);

static int vfs_hang_addrlist(struct mount *, struct netexport *,
    struct export_args *);
static int vfs_free_netcred(struct radix_node *, void *);
static void vfs_free_addrlist(struct netexport *);
static struct vnode *getcleanvnode(struct proc *);

#ifdef DEBUG
void printlockedvnodes(void);
#endif

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	/*
	 * Initialize the filesystem syncer.
	 */
	vn_initialize_syncerd();
}

int
vfs_drainvnodes(long target, struct proc *p)
{

	simple_lock(&vnode_free_list_slock);
	while (numvnodes > target) {
		struct vnode *vp;

		vp = getcleanvnode(p);
		if (vp == NULL)
			return EBUSY;		/* give up */
		pool_put(&vnode_pool, vp);
		simple_lock(&vnode_free_list_slock);
		numvnodes--;
	}
	simple_unlock(&vnode_free_list_slock);

	return 0;
}

/*
 * Grab a vnode from the freelist and clean it.
 */
struct vnode *
getcleanvnode(p)
	struct proc *p;
{
	struct vnode *vp;
	struct mount *mp;
	struct freelst *listhd;

	LOCK_ASSERT(simple_lock_held(&vnode_free_list_slock));

	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		if (!simple_lock_try(&vp->v_interlock))
			continue;
		/*
		 * As our lwp might hold the underlying vnode locked,
		 * don't try to reclaim the VLAYER vnode if it's locked.
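		 * (VLAYER marks vnodes of layered filesystems such as
		 * nullfs, which share their lock with the vnode stacked
		 * below them.)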
		 */
		if ((vp->v_flag & VXLOCK) == 0 &&
		    ((vp->v_flag & VLAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
			if (vn_start_write(vp, &mp, V_NOWAIT) == 0)
				break;
		}
		mp = NULL;
		simple_unlock(&vp->v_interlock);
	}

	if (vp == NULLVP) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		simple_unlock(&vnode_free_list_slock);
		return NULLVP;
	}

	if (vp->v_usecount)
		panic("free vnode isn't, vp %p", vp);
	TAILQ_REMOVE(listhd, vp, v_freelist);
	/* see comment on why 0xdeadb is set at end of vgone (below) */
	vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
	simple_unlock(&vnode_free_list_slock);
	vp->v_lease = NULL;

	if (vp->v_type != VBAD)
		vgonel(vp, p);
	else
		simple_unlock(&vp->v_interlock);
	vn_finished_write(mp, 0);
#ifdef DIAGNOSTIC
	if (vp->v_data || vp->v_uobj.uo_npages ||
	    TAILQ_FIRST(&vp->v_uobj.memq))
		panic("cleaned vnode isn't, vp %p", vp);
	if (vp->v_numoutput)
		panic("clean vnode has pending I/O's, vp %p", vp);
#endif
	KASSERT((vp->v_flag & VONWORKLST) == 0);

	return vp;
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp)
	struct mount *mp;
	int flags;
	struct simplelock *interlkp;
{
	int lkflags;

	while (mp->mnt_iflag & IMNT_UNMOUNT) {
		int gone, n;

		if (flags & LK_NOWAIT)
			return (ENOENT);
		if ((flags & LK_RECURSEFAIL) && mp->mnt_unmounter != NULL
		    && mp->mnt_unmounter == curproc)
			return (EDEADLK);
		if (interlkp)
			simple_unlock(interlkp);
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		simple_lock(&mp->mnt_slock);
		mp->mnt_wcnt++;
		ltsleep((caddr_t)mp, PVFS, "vfs_busy", 0, &mp->mnt_slock);
		n = --mp->mnt_wcnt;
		simple_unlock(&mp->mnt_slock);
		gone = mp->mnt_iflag & IMNT_GONE;

		if (n == 0)
			wakeup(&mp->mnt_wcnt);
		if (interlkp)
			simple_lock(interlkp);
		if (gone)
			return (ENOENT);
	}
	lkflags = LK_SHARED;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp)
	struct mount *mp;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
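 * A typical caller is a filesystem's mountroot routine, along these
 * lines (a sketch, not verbatim from any particular filesystem):
 *
 *	if ((error = vfs_rootmountalloc("ffs", "root_device", &mp)) != 0)
 *		return (error);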
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct vfsops *vfsp = NULL;
	struct mount *mp;

	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
		if (!strncmp(vfsp->vfs_name, fstypename, MFSNAMELEN))
			break;

	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	memset((char *)mp, 0, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
	simple_lock_init(&mp->mnt_slock);
	(void)vfs_busy(mp, LK_NOWAIT, 0);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_op = vfsp;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	mp->mnt_leaf = mp;
	vfsp->vfs_refcount++;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	struct mount *mp;

	simple_lock(&mountlist_slock);
	CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
		    mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *)0);
}

/*
 * Get a new unique fsid
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_short xxxfs_mntid;
	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = makefstype(mp->mnt_op->vfs_name);
	mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0);
	mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype;
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
	tfsid.__fsid_val[1] = mtype;
	if (!CIRCLEQ_EMPTY(&mountlist)) {
		while (vfs_getvfs(&tfsid)) {
			tfsid.__fsid_val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	simple_unlock(&mntid_slock);
}

/*
 * Make a 'unique' number from a mount type name.
 */
long
makefstype(type)
	const char *type;
{
	long rv;

	for (rv = 0; *type; type++) {
		rv <<= 2;
		rv ^= *type;
	}
	return rv;
}


/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	struct vattr *vap;
{

	vap->va_type = VNON;

	/*
	 * Assign individually so that it is safe even if size and
	 * sign of each member are varied.
	 */
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_size = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_atime.tv_sec =
	    vap->va_mtime.tv_sec =
	    vap->va_ctime.tv_sec =
	    vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec =
	    vap->va_mtime.tv_nsec =
	    vap->va_ctime.tv_nsec =
	    vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
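 * Vnodes are allocated from vnode_pool and recycled through the
 * vnode_free_list/vnode_hold_list declared above; numvnodes below
 * counts every vnode currently allocated.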
 */
extern int (**dead_vnodeop_p)(void *);
long numvnodes;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	int (**vops)(void *);
	struct vnode **vpp;
{
	extern struct uvm_pagerops uvm_vnodeops;
	struct uvm_object *uobj;
	struct proc *p = curproc;	/* XXX */
	static int toggle;
	struct vnode *vp;
	int error = 0, tryalloc;

try_again:
	if (mp) {
		/*
		 * Mark filesystem busy while we're creating a vnode.
		 * If unmount is in progress, this will wait; if the
		 * unmount succeeds (only if umount -f), this will
		 * return an error.  If the unmount fails, we'll keep
		 * going afterwards.
		 * (This puts the per-mount vnode list logically under
		 * the protection of the vfs_busy lock).
		 */
		error = vfs_busy(mp, LK_RECURSEFAIL, 0);
		if (error && error != EDEADLK)
			return error;
	}

	/*
	 * We must choose whether to allocate a new vnode or recycle an
	 * existing one. The criterion for allocating a new one is that
	 * the total number of vnodes is less than the number desired or
	 * there are no vnodes on either free list. Generally we only
	 * want to recycle vnodes that have no buffers associated with
	 * them, so we look first on the vnode_free_list. If it is empty,
	 * we next consider vnodes with referencing buffers on the
	 * vnode_hold_list. The toggle ensures that half the time we
	 * will use a buffer from the vnode_hold_list, and half the time
	 * we will allocate a new one unless the list has grown to twice
	 * the desired size. We are reticent to recycle vnodes from the
	 * vnode_hold_list because we will lose the identity of all its
	 * referencing buffers.
	 */

	vp = NULL;

	simple_lock(&vnode_free_list_slock);

	toggle ^= 1;
	if (numvnodes > 2 * desiredvnodes)
		toggle = 0;

	tryalloc = numvnodes < desiredvnodes ||
	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
	     (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));

	if (tryalloc &&
	    (vp = pool_get(&vnode_pool, PR_NOWAIT)) != NULL) {
		numvnodes++;
		simple_unlock(&vnode_free_list_slock);
		memset(vp, 0, sizeof(*vp));
		simple_lock_init(&vp->v_interlock);
		uobj = &vp->v_uobj;
		uobj->pgops = &uvm_vnodeops;
		TAILQ_INIT(&uobj->memq);
		/*
		 * done by memset() above.
		 *	uobj->uo_npages = 0;
		 *	LIST_INIT(&vp->v_nclist);
		 *	LIST_INIT(&vp->v_dnclist);
		 */
	} else {
		vp = getcleanvnode(p);
		/*
		 * Unless this is a bad time of the month, at most
		 * the first NCPUS items on the free list are
		 * locked, so this is close enough to being empty.
		 */
		if (vp == NULLVP) {
			if (mp && error != EDEADLK)
				vfs_unbusy(mp);
			if (tryalloc) {
				printf("WARNING: unable to allocate new "
				    "vnode, retrying...\n");
				(void) tsleep(&lbolt, PRIBIO, "newvn", hz);
				goto try_again;
			}
			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
			*vpp = 0;
			return (ENFILE);
		}
		vp->v_flag = 0;
		vp->v_socket = NULL;
#ifdef VERIFIED_EXEC
		vp->fp_status = FINGERPRINT_INVALID;
#endif
	}
	vp->v_type = VNON;
	vp->v_vnlock = &vp->v_lock;
	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	simple_lock_init(&vp->v_interlock);

	/*
	 * initialize uvm_object within vnode.
	 */

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = VSIZENOTSET;

	if (mp && error != EDEADLK)
		vfs_unbusy(mp);
	return (0);
}

/*
 * This is really just the reverse of getnewvnode(). Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vp)
	struct vnode *vp;
{
#ifdef DIAGNOSTIC
	if (vp->v_usecount != 1)
		panic("ungetnewvnode: busy vnode");
#endif
	vp->v_usecount--;
	insmntque(vp, NULL);
	vp->v_type = VBAD;

	simple_lock(&vp->v_interlock);
	/*
	 * Insert at head of LRU list
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_HEAD(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	simple_unlock(&vp->v_interlock);
}

/*
 * Move a vnode from one mount queue to another.
 */
void
insmntque(vp, mp)
	struct vnode *vp;
	struct mount *mp;
{

#ifdef DIAGNOSTIC
	if ((mp != NULL) &&
	    (mp->mnt_iflag & IMNT_UNMOUNT) &&
	    !(mp->mnt_flag & MNT_SOFTDEP) &&
	    vp->v_tag != VT_VFS) {
		panic("insmntque into dying filesystem");
	}
#endif

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) != NULL)
		LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	struct buf *bp;
{
	struct vnode *vp;

	if ((vp = bp->b_vp) != NULL) {
		/* XXX global lock hack
		 * can't use v_interlock here since this is called
		 * in interrupt context from biodone().
		 */
		simple_lock(&global_v_numoutput_slock);
		if (--vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput, vp %p", vp);
		if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t)&vp->v_numoutput);
		}
		simple_unlock(&global_v_numoutput_slock);
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
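 * If V_SAVE is set in flags, modified data is written back with
 * VOP_FSYNC() before the buffers are invalidated; otherwise any
 * delayed writes are simply discarded.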
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	struct buf *bp, *nbp;
	int s, error;
	int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
	    (flags & V_SAVE ? PGO_CLEANIT : 0);

	/* XXXUBC this doesn't look at flags or slp* */
	simple_lock(&vp->v_interlock);
	error = VOP_PUTPAGES(vp, 0, 0, flushflags);
	if (error) {
		return error;
	}

	if (flags & V_SAVE) {
		error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0, p);
		if (error)
			return (error);
#ifdef DIAGNOSTIC
		s = splbio();
		if (vp->v_numoutput > 0 || !LIST_EMPTY(&vp->v_dirtyblkhd))
			panic("vinvalbuf: dirty bufs, vp %p", vp);
		splx(s);
#endif
	}

	s = splbio();

restart:
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep((caddr_t)bp,
			    slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vinvalbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep((caddr_t)bp,
			    slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vinvalbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		/*
		 * XXX Since there are no node locks for NFS, I believe
		 * there is a slight chance that a delayed write will
		 * occur while sleeping just above, so check for it.
		 */
		if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
#ifdef DEBUG
			printf("buffer still DELWRI\n");
#endif
			bp->b_flags |= B_BUSY | B_VFLUSH;
			simple_unlock(&bp->b_interlock);
			VOP_BWRITE(bp);
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

#ifdef DIAGNOSTIC
	if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
		panic("vinvalbuf: flush failed, vp %p", vp);
#endif

	splx(s);

	return (0);
}

/*
 * Destroy any in core blocks past the truncation length.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
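 * An illustrative use from a truncate path (a sketch; "newlastblock"
 * is a hypothetical name for the last logical block still in use):
 *
 *	error = vtruncbuf(vp, newlastblock + 1, 0, 0);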
 */
int
vtruncbuf(vp, lbn, slpflag, slptimeo)
	struct vnode *vp;
	daddr_t lbn;
	int slpflag, slptimeo;
{
	struct buf *bp, *nbp;
	int s, error;
	voff_t off;

	off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
	simple_lock(&vp->v_interlock);
	error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
	if (error) {
		return error;
	}

	s = splbio();

restart:
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_lblkno < lbn)
			continue;
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vtruncbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_lblkno < lbn)
			continue;
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vtruncbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	splx(s);

	return (0);
}

void
vflushbuf(vp, sync)
	struct vnode *vp;
	int sync;
{
	struct buf *bp, *nbp;
	int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0);
	int s;

	simple_lock(&vp->v_interlock);
	(void) VOP_PUTPAGES(vp, 0, 0, flags);

loop:
	s = splbio();
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		if ((bp->b_flags & B_BUSY)) {
			simple_unlock(&bp->b_interlock);
			continue;
		}
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("vflushbuf: not dirty, bp %p", bp);
		bp->b_flags |= B_BUSY | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		splx(s);
		/*
		 * Wait for I/O associated with indirect blocks to complete,
		 * since there is no way to quickly wait for them below.
		 */
		if (bp->b_vp == vp || sync == 0)
			(void) bawrite(bp);
		else
			(void) bwrite(bp);
		goto loop;
	}
	if (sync == 0) {
		splx(s);
		return;
	}
	simple_lock(&global_v_numoutput_slock);
	while (vp->v_numoutput) {
		vp->v_flag |= VBWAIT;
		ltsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflushbuf", 0,
		    &global_v_numoutput_slock);
	}
	simple_unlock(&global_v_numoutput_slock);
	splx(s);
	if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
		vprint("vflushbuf: dirty", vp);
		goto loop;
	}
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	struct vnode *vp;
	struct buf *bp;
{
	int s;

	if (bp->b_vp)
		panic("bgetvp: not free, bp %p", bp);
	VHOLD(vp);
	s = splbio();
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	bufinsvn(bp, &vp->v_cleanblkhd);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode.
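 * The inverse of bgetvp(): removes the buffer from the vnode's buffer
 * list and drops the hold reference that bgetvp() took with VHOLD().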
 */
void
brelvp(bp)
	struct buf *bp;
{
	struct vnode *vp;
	int s;

	if (bp->b_vp == NULL)
		panic("brelvp: vp NULL, bp %p", bp);

	s = splbio();
	vp = bp->b_vp;
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);

	if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_flag & VONWORKLST) &&
	    LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}

	bp->b_vp = NULL;
	HOLDRELE(vp);
	splx(s);
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 *
 * This function must be called at splbio().
 */
void
reassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;

	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);
	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if ((bp->b_flags & B_DELWRI) == 0) {
		listheadp = &newvp->v_cleanblkhd;
		if (TAILQ_EMPTY(&newvp->v_uobj.memq) &&
		    (newvp->v_flag & VONWORKLST) &&
		    LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	} else {
		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VBLK:
				if (newvp->v_specmountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
				break;
			}
			if (!newvp->v_mount ||
			    (newvp->v_mount->mnt_flag & MNT_ASYNC) == 0)
				vn_syncer_add_to_worklist(newvp, delay);
		}
	}
	bufinsvn(bp, listheadp);
}

/*
 * Create a vnode for a block device.
 * Used for root filesystem and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{

	return (getdevvp(dev, vpp, VBLK));
}

/*
 * Create a vnode for a character device.
 * Used for kernfs and some console handling.
 */
int
cdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{

	return (getdevvp(dev, vpp, VCHR));
}

/*
 * Create a vnode for a device.
 * Used by bdevvp (block device) for root file system etc.,
 * and by cdevvp (character device) for console and kernfs.
 */
int
getdevvp(dev, vpp, type)
	dev_t dev;
	struct vnode **vpp;
	enum vtype type;
{
	struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (0);
	}
	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = type;
	if ((nvp = checkalias(vp, dev, NULL)) != 0) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device).
 * If such an alias exists, deallocate
 * the existing contents and return the aliased vnode.  The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	struct vnode *nvp;
	dev_t nvp_rdev;
	struct mount *mp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;
	struct vnode **vpp;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		simple_lock(&vp->v_interlock);
		simple_unlock(&spechash_slock);
		if (vp->v_usecount == 0) {
			vgonel(vp, p);
			goto loop;
		}
		/*
		 * What we want to know here is whether someone else has
		 * removed this vnode from the device hash list while we
		 * were waiting.  That can only happen if vclean() did it,
		 * which requires the vnode to be locked.  Therefore, we
		 * use LK_SLEEPFAIL and retry.
		 */
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL))
			goto loop;
		simple_lock(&spechash_slock);
		break;
	}
	if (vp == NULL || vp->v_tag != VT_NON || vp->v_type != VBLK) {
		MALLOC(nvp->v_specinfo, struct specinfo *,
		    sizeof(struct specinfo), M_VNODE, M_NOWAIT);
		/* XXX Erg. */
		if (nvp->v_specinfo == NULL) {
			simple_unlock(&spechash_slock);
			uvm_wait("checkalias");
			goto loop;
		}

		nvp->v_rdev = nvp_rdev;
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		nvp->v_specmountpoint = NULL;
		simple_unlock(&spechash_slock);
		nvp->v_speclockf = NULL;
		simple_lock_init(&nvp->v_spec_cow_slock);
		SLIST_INIT(&nvp->v_spec_cow_head);
		nvp->v_spec_cow_req = 0;
		nvp->v_spec_cow_count = 0;

		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	vp->v_vnlock = &vp->v_lock;
	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. If the vnode lock bit is set the
 * vnode is being eliminated in vgone. In that case, we can not
 * grab the vnode, so the process is awakened when the transition is
 * completed, and an error returned to indicate that the vnode is no
 * longer usable (possibly having been changed to a new file system type).
 */
int
vget(vp, flags)
	struct vnode *vp;
	int flags;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking that
	 * the VXLOCK flag is set.
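	 * Callers that obtained vp from a lookup typically redo the
	 * lookup when this fails, as checkalias() above does with its
	 * "goto loop" retry.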
	 */

	if ((flags & LK_INTERLOCK) == 0)
		simple_lock(&vp->v_interlock);
	if (vp->v_flag & VXLOCK) {
		if (flags & LK_NOWAIT) {
			simple_unlock(&vp->v_interlock);
			return EBUSY;
		}
		vp->v_flag |= VXWANT;
		ltsleep(vp, PINOD|PNORELOCK, "vget", 0, &vp->v_interlock);
		return (ENOENT);
	}
	if (vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		else
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_usecount++;
#ifdef DIAGNOSTIC
	if (vp->v_usecount == 0) {
		vprint("vget", vp);
		panic("vget: usecount overflow, vp %p", vp);
	}
#endif
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK))) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active. We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			simple_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (vp->v_usecount > 0) {
				simple_unlock(&vp->v_interlock);
				return (error);
			}
			/*
			 * insert at tail of LRU list
			 */
			simple_lock(&vnode_free_list_slock);
			if (vp->v_holdcnt > 0)
				TAILQ_INSERT_TAIL(&vnode_hold_list, vp,
				    v_freelist);
			else
				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
				    v_freelist);
			simple_unlock(&vnode_free_list_slock);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * vput(), just unlock and vrele()
 */
void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vput: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		VOP_UNLOCK(vp, 0);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vput: bad ref count", vp);
		panic("vput: ref cnt");
	}
#endif
	/*
	 * Insert at tail of LRU list.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	if (vp->v_flag & VEXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP);
	simple_unlock(&vp->v_interlock);
	VOP_INACTIVE(vp, p);
}

/*
 * Vnode release.
 * If count drops to zero, call inactive routine and return to freelist.
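 * Unlike vput(), vrele() expects the vnode to be unlocked on entry
 * and takes the lock itself before calling VOP_INACTIVE().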
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vrele: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vrele: bad ref count", vp);
		panic("vrele: ref cnt vp %p", vp);
	}
#endif
	/*
	 * Insert at tail of LRU list.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	if (vp->v_flag & VEXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP);
	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0)
		VOP_INACTIVE(vp, p);
}

#ifdef DIAGNOSTIC
/*
 * Page or buffer structure gets a reference.
 */
void
vholdl(vp)
	struct vnode *vp;
{

	/*
	 * If it is on the freelist and the hold count is currently
	 * zero, move it to the hold list. The test of the back
	 * pointer and the use reference count of zero is because
	 * it will be removed from a free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from a freelist to ensure
	 * that we do not try to move it here.
	 */
	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_holdcnt++;
}

/*
 * Page or buffer structure frees a reference.
 */
void
holdrelel(vp)
	struct vnode *vp;
{

	if (vp->v_holdcnt <= 0)
		panic("holdrelel: holdcnt vp %p", vp);
	vp->v_holdcnt--;

	/*
	 * If it is on the holdlist and the hold count drops to
	 * zero, move it to the free list. The test of the back
	 * pointer and the use reference count of zero is because
	 * it will be removed from a free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from a freelist to ensure
	 * that we do not try to move it here.
	 */

	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
}

/*
 * Vnode reference.
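 * Only legal while the caller already holds a reference, i.e.
 * v_usecount > 0; otherwise vget() must be used, as the panic
 * below enforces.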
 */
void
vref(vp)
	struct vnode *vp;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount <= 0)
		panic("vref used where vget required, vp %p", vp);
	vp->v_usecount++;
#ifdef DIAGNOSTIC
	if (vp->v_usecount == 0) {
		vprint("vref", vp);
		panic("vref: usecount overflow, vp %p", vp);
	}
#endif
	simple_unlock(&vp->v_interlock);
}
#endif /* DIAGNOSTIC */

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped.
 */
#ifdef DEBUG
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		if (vp->v_mount != mp)
			goto loop;
		nvp = LIST_NEXT(vp, v_mntvnodes);
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;
		simple_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * With v_usecount == 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device. For all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *)0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DEBUG
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 */
void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	struct mount *mp;
	int active;

	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));

	/*
	 * Check to see if the vnode is in use.
	 * If so we have to reference it before we clean it out
	 * so that its count cannot fall to zero and generate a
	 * race against ourselves to recycle it.
	 */

	if ((active = vp->v_usecount) != 0) {
		vp->v_usecount++;
#ifdef DIAGNOSTIC
		if (vp->v_usecount == 0) {
			vprint("vclean", vp);
			panic("vclean: usecount overflow");
		}
#endif
	}

	/*
	 * Prevent the vnode from being recycled or
	 * brought into use while we clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock, vp %p", vp);
	vp->v_flag |= VXLOCK;
	if (vp->v_flag & VEXECMAP) {
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP);

	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If special device, remove it from the special device alias
	 * list, if it is on one.
	 */
	if (flags & DOCLOSE) {
		int error;
		struct vnode *vq, *vx;

		vn_start_write(vp, &mp, V_WAIT | V_LOWER);
		error = vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
		vn_finished_write(mp, V_LOWER);
		if (error)
			error = vinvalbuf(vp, 0, NOCRED, p, 0, 0);
		KASSERT(error == 0);
		KASSERT((vp->v_flag & VONWORKLST) == 0);

		if (active)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL);

		if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
		    vp->v_specinfo != 0) {
			simple_lock(&spechash_slock);
			if (vp->v_hashchain != NULL) {
				if (*vp->v_hashchain == vp) {
					*vp->v_hashchain = vp->v_specnext;
				} else {
					for (vq = *vp->v_hashchain; vq;
					    vq = vq->v_specnext) {
						if (vq->v_specnext != vp)
							continue;
						vq->v_specnext = vp->v_specnext;
						break;
					}
					if (vq == NULL)
						panic("missing bdev");
				}
				if (vp->v_flag & VALIASED) {
					vx = NULL;
					for (vq = *vp->v_hashchain; vq;
					    vq = vq->v_specnext) {
						if (vq->v_rdev != vp->v_rdev ||
						    vq->v_type != vp->v_type)
							continue;
						if (vx)
							break;
						vx = vq;
					}
					if (vx == NULL)
						panic("missing alias");
					if (vq == NULL)
						vx->v_flag &= ~VALIASED;
					vp->v_flag &= ~VALIASED;
				}
			}
			simple_unlock(&spechash_slock);
			FREE(vp->v_specinfo, M_VNODE);
			vp->v_specinfo = NULL;
		}
	}
	LOCK_ASSERT(!simple_lock_held(&vp->v_interlock));

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim, vp %p", vp);
	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
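		 * (Calling vrele() itself here could invoke
		 * VOP_INACTIVE() a second time.)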
		 */
		simple_lock(&vp->v_interlock);
		if (--vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			/*
			 * Insert at tail of LRU list.
			 */

			simple_unlock(&vp->v_interlock);
			simple_lock(&vnode_free_list_slock);
#ifdef DIAGNOSTIC
			if (vp->v_holdcnt > 0)
				panic("vclean: not clean, vp %p", vp);
#endif
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			simple_unlock(&vnode_free_list_slock);
		} else
			simple_unlock(&vp->v_interlock);
	}

	KASSERT(vp->v_uobj.uo_npages == 0);
	cache_purge(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	simple_lock(&vp->v_interlock);
	VN_KNOTE(vp, NOTE_REVOKE);	/* FreeBSD has this in vn_pollgone() */
	vp->v_flag &= ~(VXLOCK|VLOCKSWORK);
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		simple_unlock(&vp->v_interlock);
		wakeup((caddr_t)vp);
	} else
		simple_unlock(&vp->v_interlock);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp)
			simple_unlock(inter_lkp);
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{

	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */

	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		ltsleep(vp, PINOD | PNORELOCK, "vgone", 0, &vp->v_interlock);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */

	vclean(vp, DOCLOSE, p);
	KASSERT((vp->v_flag & VONWORKLST) == 0);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */

	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);

	/*
	 * The test of the back pointer and the reference count of
	 * zero is because it will be removed from the free list by
	 * getcleanvnode, but will not have its reference count
	 * incremented until after calling vgone. If the reference
	 * count were incremented first, vgone would (incorrectly)
	 * try to close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from the freelist to ensure
	 * that we do not try to move it here.
	 */

	vp->v_type = VBAD;
	if (vp->v_usecount == 0) {
		boolean_t dofree;

		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			panic("vgonel: not clean, vp %p", vp);
		/*
		 * if it isn't on the freelist, we're called by getcleanvnode
		 * and vnode is being re-used.  otherwise, we'll free it.
		 */
		dofree = vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb;
		if (dofree) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			numvnodes--;
		}
		simple_unlock(&vnode_free_list_slock);
		if (dofree)
			pool_put(&vnode_pool, vp);
	}
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Revoke all the vnodes corresponding to the specified minor number
 * range (endpoints inclusive) of the specified major.
 */
void
vdevgone(maj, minl, minh, type)
	int maj, minl, minh;
	enum vtype type;
{
	struct vnode *vp;
	int mn;

	for (mn = minl; mn <= minh; mn++)
		if (vfinddev(makedev(maj, mn), type, &vp))
			VOP_REVOKE(vp, REVOKEALL);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp &&
		    (vq->v_flag & VXLOCK) == 0) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
#define ARRAY_PRINT(idx, arr) \
    ((idx) > 0 && (idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN")

const char * const vnode_tags[] = { VNODE_TAGS };
const char * const vnode_types[] = { VNODE_TYPES };
const char vnode_flagbits[] = VNODE_FLAGBITS;

/*
 * Print out a description of a vnode.
 */
void
vprint(label, vp)
	char *label;
	struct vnode *vp;
{
	char buf[96];

	if (label != NULL)
		printf("%s: ", label);
	printf("tag %s(%d) type %s(%d), usecount %d, writecount %ld, "
	    "refcount %ld,", ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
	    ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
	bitmask_snprintf(vp->v_flag, vnode_flagbits, buf, sizeof(buf));
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DEBUG
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
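 * The mount list is walked with vfs_busy(), so filesystems that are
 * in the middle of being unmounted are skipped rather than examined.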
 */
void
printlockedvnodes()
{
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	    mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
			if (VOP_ISLOCKED(vp))
				vprint(NULL, vp);
		}
		simple_lock(&mountlist_slock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	simple_unlock(&mountlist_slock);
}
#endif

/*
 * sysctl helper routine for vfs.generic.conf lookups.
 */
#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
static int
sysctl_vfs_generic_conf(SYSCTLFN_ARGS)
{
	struct vfsconf vfc;
	extern const char * const mountcompatnames[];
	extern int nmountcompatnames;
	struct sysctlnode node;
	struct vfsops *vfsp;
	u_int vfsnum;

	if (namelen != 1)
		return (ENOTDIR);
	vfsnum = name[0];
	if (vfsnum >= nmountcompatnames ||
	    mountcompatnames[vfsnum] == NULL)
		return (EOPNOTSUPP);
	vfsp = vfs_getopsbyname(mountcompatnames[vfsnum]);
	if (vfsp == NULL)
		return (EOPNOTSUPP);

	vfc.vfc_vfsops = vfsp;
	strncpy(vfc.vfc_name, vfsp->vfs_name, MFSNAMELEN);
	vfc.vfc_typenum = vfsnum;
	vfc.vfc_refcount = vfsp->vfs_refcount;
	vfc.vfc_flags = 0;
	vfc.vfc_mountroot = vfsp->vfs_mountroot;
	vfc.vfc_next = NULL;

	node = *rnode;
	node.sysctl_data = &vfc;
	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}
#endif

/*
 * sysctl helper routine to return list of supported fstypes
 */
static int
sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
{
	char buf[MFSNAMELEN];
	char *where = oldp;
	struct vfsops *v;
	size_t needed, left, slen;
	int error, first;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	first = 1;
	error = 0;
	needed = 0;
	left = *oldlenp;

	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (where == NULL)
			needed += strlen(v->vfs_name) + 1;
		else {
			memset(buf, 0, sizeof(buf));
			if (first) {
				strncpy(buf, v->vfs_name, sizeof(buf));
				first = 0;
			} else {
				buf[0] = ' ';
				strncpy(buf + 1, v->vfs_name, sizeof(buf) - 1);
			}
			buf[sizeof(buf)-1] = '\0';
			slen = strlen(buf);
			if (left < slen + 1)
				break;
			/* +1 to copy out the trailing NUL byte */
			error = copyout(buf, where, slen + 1);
			if (error)
				break;
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	*oldlenp = needed;
	return (error);
}

/*
 * Top level filesystem related information gathering.
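 * This creates the vfs.generic.* sysctl nodes; from userland, for
 * example, "sysctl vfs.generic.fstypes" prints the space-separated
 * list produced by sysctl_vfs_generic_fstypes() above.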

/*
 * Top level filesystem related information gathering.
 */
SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup")
{
#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
	extern int nmountcompatnames;
#endif

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "vfs", NULL,
		       NULL, 0, NULL, 0,
		       CTL_VFS, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "generic",
		       SYSCTL_DESCR("Non-specific vfs related information"),
		       NULL, 0, NULL, 0,
		       CTL_VFS, VFS_GENERIC, CTL_EOL);

#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
		       CTLTYPE_INT, "maxtypenum",
		       SYSCTL_DESCR("Highest valid filesystem type number"),
		       NULL, nmountcompatnames, NULL, 0,
		       CTL_VFS, VFS_GENERIC, VFS_MAXTYPENUM, CTL_EOL);
#endif
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "usermount",
		       SYSCTL_DESCR("Whether unprivileged users may mount "
				    "filesystems"),
		       NULL, 0, &dovfsusermount, 0,
		       CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRING, "fstypes",
		       SYSCTL_DESCR("List of file systems present"),
		       sysctl_vfs_generic_fstypes, 0, NULL, 0,
		       CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRUCT, "conf",
		       SYSCTL_DESCR("Filesystem configuration information"),
		       sysctl_vfs_generic_conf, 0, NULL,
		       sizeof(struct vfsconf),
		       CTL_VFS, VFS_GENERIC, VFS_CONF, CTL_EOL);
#endif
}

int kinfo_vdebug = 1;
int kinfo_vgetfailed;
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
int
sysctl_kern_vnode(SYSCTLFN_ARGS)
{
	char *where = oldp;
	size_t *sizep = oldlenp;
	struct mount *mp, *nmp;
	struct vnode *nvp, *vp;
	char *bp = where, *savebp;
	char *ewhere;
	int error;

	if (namelen != 0)
		return (EOPNOTSUPP);
	if (newp != NULL)
		return (EPERM);

#define VPTRSZ	sizeof(struct vnode *)
#define VNODESZ	sizeof(struct vnode)
	if (where == NULL) {
		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
		return (0);
	}
	ewhere = where + *sizep;

	simple_lock(&mountlist_slock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	     mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		savebp = bp;
again:
		simple_lock(&mntvnode_slock);
		for (vp = LIST_FIRST(&mp->mnt_vnodelist);
		     vp != NULL;
		     vp = nvp) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				if (kinfo_vdebug)
					printf("kinfo: vp changed\n");
				bp = savebp;
				goto again;
			}
			nvp = LIST_NEXT(vp, v_mntvnodes);
			if (bp + VPTRSZ + VNODESZ > ewhere) {
				simple_unlock(&mntvnode_slock);
				*sizep = bp - where;
				return (ENOMEM);
			}
			simple_unlock(&mntvnode_slock);
			if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
			    (error = copyout((caddr_t)vp, bp + VPTRSZ,
			    VNODESZ)))
				return (error);
			bp += VPTRSZ + VNODESZ;
			simple_lock(&mntvnode_slock);
		}
		simple_unlock(&mntvnode_slock);
		simple_lock(&mountlist_slock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	simple_unlock(&mountlist_slock);

	*sizep = bp - where;
	return (0);
}
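
/*
 * Userland sketch (illustrative only, not compiled into the kernel):
 * the classic consumer pattern for sysctl_kern_vnode().  A NULL-oldp
 * probe returns a padded size estimate (KINFO_VNODESLOP slop); the
 * second call copies out (struct vnode *, struct vnode) pairs.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdlib.h>

static char *
get_vnode_dump(size_t *lenp)
{
	int mib[2] = { CTL_KERN, KERN_VNODE };
	char *buf;

	if (sysctl(mib, 2, NULL, lenp, NULL, 0) == -1)
		return (NULL);
	if ((buf = malloc(*lenp)) == NULL)
		return (NULL);
	if (sysctl(mib, 2, buf, lenp, NULL, 0) == -1) {
		free(buf);
		return (NULL);
	}
	return (buf);	/* sequence of pointer + vnode records */
}
#endif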

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int error = 0;

	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	if (vp->v_flag & VALIASED) {
		simple_lock(&spechash_slock);
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type)
				continue;
			if (vq->v_specmountpoint != NULL) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	return (error);
}
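
/*
 * Usage sketch (illustrative only, not compiled): a file system's
 * mount path typically rejects a block device that already carries a
 * mounted file system, in the style of ffs.  "xx_mountfs" and the
 * elided superblock work are hypothetical.
 */
#if 0
static int
xx_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p)
{
	int error;

	if ((error = vfs_mountedon(devvp)) != 0)
		return (error);		/* EBUSY: fs already mounted here */
	if (vcount(devvp) > 1 && devvp != rootvp)
		return (EBUSY);		/* device is otherwise open */
	/* ... read the superblock and fill in mp ... */
	return (0);
}
#endif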

static int
sacheck(struct sockaddr *sa)
{
	switch (sa->sa_family) {
#ifdef INET
	case AF_INET: {
		struct sockaddr_in *sin = (struct sockaddr_in *)sa;
		char *p = (char *)sin->sin_zero;
		size_t i;

		if (sin->sin_len != sizeof(*sin))
			return -1;
		if (sin->sin_port != 0)
			return -1;
		for (i = 0; i < sizeof(sin->sin_zero); i++)
			if (*p++ != '\0')
				return -1;
		return 0;
	}
#endif
#ifdef INET6
	case AF_INET6: {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;

		if (sin6->sin6_len != sizeof(*sin6))
			return -1;
		if (sin6->sin6_port != 0)
			return -1;
		return 0;
	}
#endif
	default:
		return -1;
	}
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
 */
static int
vfs_hang_addrlist(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	struct netcred *np, *enp;
	struct radix_node_head *rnh;
	int i;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		crcvt(&np->netc_anon, &argp->ex_anon);
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}

	if (argp->ex_addrlen > MLEN || argp->ex_masklen > MLEN)
		return (EINVAL);

	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK);
	memset((caddr_t)np, 0, i);
	saddr = (struct sockaddr *)(np + 1);
	error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen);
	if (error)
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (sacheck(saddr) == -1) {
		/* Must "goto out", not return: np is freed there. */
		error = EINVAL;
		goto out;
	}
	if (argp->ex_masklen) {
		smask = (struct sockaddr *)((caddr_t)saddr +
		    argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t)smask,
		    argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
		if (smask->sa_family != saddr->sa_family) {
			error = EINVAL;
			goto out;
		}
		if (sacheck(smask) == -1) {
			error = EINVAL;
			goto out;
		}
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not
		 * used, do so on demand here
		 */
		DOMAIN_FOREACH(dom) {
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **)&nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}

	enp = (struct netcred *)(*rnh->rnh_addaddr)(saddr, smask, rnh,
	    np->netc_rnodes);
	if (enp != np) {
		if (enp == NULL) {
			enp = (struct netcred *)(*rnh->rnh_lookup)(saddr,
			    smask, rnh);
			if (enp == NULL) {
				error = EPERM;
				goto out;
			}
		} else
			enp->netc_refcnt++;

		goto check;
	} else
		enp->netc_refcnt = 1;

	np->netc_exflags = argp->ex_flags;
	crcvt(&np->netc_anon, &argp->ex_anon);
	np->netc_anon.cr_ref = 1;
	return 0;
check:
	if (enp->netc_exflags != argp->ex_flags ||
	    crcmp(&enp->netc_anon, &argp->ex_anon) != 0)
		error = EPERM;
	else
		error = 0;
out:
	free(np, M_NETADDR);
	return error;
}

/* ARGSUSED */
static int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	void *w;
{
	struct radix_node_head *rnh = (struct radix_node_head *)w;
	struct netcred *np = (struct netcred *)(void *)rn;

	(*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
	if (--(np->netc_refcnt) <= 0)
		free(np, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(nep)
	struct netexport *nep;
{
	int i;
	struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i]) != NULL) {
			(*rnh->rnh_walktree)(rnh, vfs_free_netcred, rnh);
			free((caddr_t)rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)) != 0)
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}
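
/*
 * Usage sketch (illustrative only, not compiled): during an MNT_UPDATE
 * mount call, a file system forwards its export_args to vfs_export(),
 * in the style of ffs with the netexport embedded in its mount
 * structure.  "xx_args" and "xx_mount_update" are hypothetical.
 */
#if 0
struct xx_args {
	char *fspec;			/* NULL for export-only updates */
	struct export_args export;	/* network export information */
};

static int
xx_mount_update(struct mount *mp, struct xx_args *args,
    struct netexport *nep)
{

	/* No device to (re)open: the caller only wants export changes. */
	if (args->fspec == NULL)
		return (vfs_export(mp, nep, &args->export));
	return (EOPNOTSUPP);
}
#endif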

/*
 * Set the publicly exported filesystem (WebNFS).  Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055)
 */
int
vfs_setpublicfs(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info, the FS is
	 * no longer exported.  May be called from either vfs_export
	 * or unmount, so check if it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	memset((caddr_t)&nfs_pub.np_handle, 0, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsidx;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
		return (error);

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    MAXNAMLEN, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}

struct netcred *
vfs_export_lookup(mp, nep, nam)
	struct mount *mp;
	struct netexport *nep;
	struct mbuf *nam;
{
	struct netcred *np;
	struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = mtod(nam, struct sockaddr *);
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
				    (*rnh->rnh_matchaddr)((caddr_t)saddr,
				    rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * Do the usual access checking.
 * file_mode, uid and gid are from the vnode in question,
 * while acc_mode and cred are from the VOP_ACCESS parameter list
 */
int
vaccess(type, file_mode, uid, gid, acc_mode, cred)
	enum vtype type;
	mode_t file_mode;
	uid_t uid;
	gid_t gid;
	mode_t acc_mode;
	struct ucred *cred;
{
	mode_t mask;

	/*
	 * Super-user always gets read/write access, but execute access depends
	 * on at least one execute bit being set.
	 */
	if (cred->cr_uid == 0) {
		if ((acc_mode & VEXEC) && type != VDIR &&
		    (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)
			return (EACCES);
		return (0);
	}

	mask = 0;

	/* Otherwise, check the owner. */
	if (cred->cr_uid == uid) {
		if (acc_mode & VEXEC)
			mask |= S_IXUSR;
		if (acc_mode & VREAD)
			mask |= S_IRUSR;
		if (acc_mode & VWRITE)
			mask |= S_IWUSR;
		return ((file_mode & mask) == mask ? 0 : EACCES);
	}

	/* Otherwise, check the groups. */
	if (cred->cr_gid == gid || groupmember(gid, cred)) {
		if (acc_mode & VEXEC)
			mask |= S_IXGRP;
		if (acc_mode & VREAD)
			mask |= S_IRGRP;
		if (acc_mode & VWRITE)
			mask |= S_IWGRP;
		return ((file_mode & mask) == mask ? 0 : EACCES);
	}

	/* Otherwise, check everyone else. */
	if (acc_mode & VEXEC)
		mask |= S_IXOTH;
	if (acc_mode & VREAD)
		mask |= S_IROTH;
	if (acc_mode & VWRITE)
		mask |= S_IWOTH;
	return ((file_mode & mask) == mask ? 0 : EACCES);
}
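
/*
 * Usage sketch (illustrative only, not compiled): a file system's
 * VOP_ACCESS implementation normally reduces to a vaccess() call on
 * its cached attributes, in the style of ufs.  "xx_node" and its
 * fields are hypothetical.
 */
#if 0
static int
xx_access(void *v)
{
	struct vop_access_args *ap = v;
	struct xx_node *np = ap->a_vp->v_data;

	/* Permission bits, owner and group come from the in-core node. */
	return (vaccess(ap->a_vp->v_type, np->xn_mode & ALLPERMS,
	    np->xn_uid, np->xn_gid, ap->a_mode, ap->a_cred));
}
#endif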

/*
 * Unmount all file systems.
 * We traverse the list in reverse order under the assumption that doing so
 * will avoid needing to worry about dependencies.
 */
void
vfs_unmountall(p)
	struct proc *p;
{
	struct mount *mp, *nmp;
	int allerror, error;

	printf("unmounting file systems...");
	for (allerror = 0,
	     mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;
#ifdef DEBUG
		printf("\nunmounting %s (%s)...",
		    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
#endif
		/*
		 * XXX Freeze syncer.  Must do this before locking the
		 * mount point.  See dounmount() for details.
		 */
		lockmgr(&syncer_lock, LK_EXCLUSIVE, NULL);
		if (vfs_busy(mp, 0, 0)) {
			lockmgr(&syncer_lock, LK_RELEASE, NULL);
			continue;
		}
		if ((error = dounmount(mp, MNT_FORCE, p)) != 0) {
			printf("unmount of %s failed with error %d\n",
			    mp->mnt_stat.f_mntonname, error);
			allerror = 1;
		}
	}
	printf(" done\n");
	if (allerror)
		printf("WARNING: some file systems would not unmount\n");
}

extern struct simplelock bqueue_slock; /* XXX */

/*
 * Sync and unmount file systems before shutting down.
 */
void
vfs_shutdown()
{
	struct lwp *l = curlwp;
	struct proc *p;

	/* XXX we're certainly not running in proc0's context! */
	if (l == NULL || (p = l->l_proc) == NULL)
		p = &proc0;

	printf("syncing disks... ");

	/* remove user process from run queue */
	suspendsched();
	(void) spl0();

	/* avoid coming back this way again if we panic. */
	doing_shutdown = 1;

	sys_sync(l, NULL, NULL);

	/* Wait for sync to finish. */
	if (buf_syncwait() != 0) {
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
		Debugger();
#endif
		printf("giving up\n");
		return;
	} else
		printf("done\n");

	/*
	 * If we've panic'd, don't make the situation potentially
	 * worse by unmounting the file systems.
	 */
	if (panicstr != NULL)
		return;

	/* Release inodes held by texts before update. */
#ifdef notdef
	vnshutdown();
#endif
	/* Unmount file systems. */
	vfs_unmountall(p);
}

/*
 * Mount the root file system.  If the operator didn't specify a
 * file system to use, try all possible file systems until one
 * succeeds.
 */
int
vfs_mountroot()
{
	struct vfsops *v;
	int error = ENODEV;

	if (root_device == NULL)
		panic("vfs_mountroot: root device unknown");

	switch (root_device->dv_class) {
	case DV_IFNET:
		if (rootdev != NODEV)
			panic("vfs_mountroot: rootdev set for DV_IFNET "
			    "(0x%08x -> %d,%d)", rootdev,
			    major(rootdev), minor(rootdev));
		break;

	case DV_DISK:
		if (rootdev == NODEV)
			panic("vfs_mountroot: rootdev not set for DV_DISK");
		if (bdevvp(rootdev, &rootvp))
			panic("vfs_mountroot: can't get vnode for rootdev");
		error = VOP_OPEN(rootvp, FREAD, FSCRED, curproc);
		if (error) {
			printf("vfs_mountroot: can't open root device\n");
			return (error);
		}
		break;

	default:
		printf("%s: inappropriate for root file system\n",
		    root_device->dv_xname);
		return (ENODEV);
	}

	/*
	 * If user specified a file system, use it.
	 */
	if (mountroot != NULL) {
		error = (*mountroot)();
		goto done;
	}

	/*
	 * Try each file system currently configured into the kernel.
	 */
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (v->vfs_mountroot == NULL)
			continue;
#ifdef DEBUG
		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
#endif
		error = (*v->vfs_mountroot)();
		if (!error) {
			aprint_normal("root file system type: %s\n",
			    v->vfs_name);
			break;
		}
	}

	if (v == NULL) {
		printf("no file system for %s", root_device->dv_xname);
		if (root_device->dv_class == DV_DISK)
			printf(" (dev 0x%x)", rootdev);
		printf("\n");
		error = EFTYPE;
	}

done:
	if (error && root_device->dv_class == DV_DISK) {
		VOP_CLOSE(rootvp, FREAD, FSCRED, curproc);
		vrele(rootvp);
	}
	return (error);
}
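
/*
 * Usage sketch (illustrative only, not compiled): the vfs_mountroot
 * hooks tried by the loop above typically allocate a root mount,
 * do their file system specific setup against rootvp, and put the
 * result on the mountlist, in the style of ffs_mountroot.  The "xx"
 * file system and MOUNT_XX name below are hypothetical.
 */
#if 0
static int
xx_mountroot(void)
{
	struct mount *mp;
	int error;

	if (root_device->dv_class != DV_DISK)
		return (ENODEV);
	if ((error = vfs_rootmountalloc(MOUNT_XX, "root_device", &mp)) != 0)
		return (error);
	/* ... fs-specific work: read the superblock from rootvp ... */
	simple_lock(&mountlist_slock);
	CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
	simple_unlock(&mountlist_slock);
	vfs_unbusy(mp);
	return (0);
}
#endif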

/*
 * Given a file system name, look up the vfsops for that
 * file system, or return NULL if file system isn't present
 * in the kernel.
 */
struct vfsops *
vfs_getopsbyname(name)
	const char *name;
{
	struct vfsops *v;

	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (strcmp(v->vfs_name, name) == 0)
			break;
	}

	return (v);
}

/*
 * Establish a file system and initialize it.
 */
int
vfs_attach(vfs)
	struct vfsops *vfs;
{
	struct vfsops *v;
	int error = 0;

	/*
	 * Make sure this file system doesn't already exist.
	 */
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (strcmp(vfs->vfs_name, v->vfs_name) == 0) {
			error = EEXIST;
			goto out;
		}
	}

	/*
	 * Initialize the vnode operations for this file system.
	 */
	vfs_opv_init(vfs->vfs_opv_descs);

	/*
	 * Now initialize the file system itself.
	 */
	(*vfs->vfs_init)();

	/*
	 * ...and link it into the kernel's list.
	 */
	LIST_INSERT_HEAD(&vfs_list, vfs, vfs_list);

	/*
	 * Sanity: make sure the reference count is 0.
	 */
	vfs->vfs_refcount = 0;

out:
	return (error);
}

/*
 * Remove a file system from the kernel.
 */
int
vfs_detach(vfs)
	struct vfsops *vfs;
{
	struct vfsops *v;

	/*
	 * Make sure no one is using the filesystem.
	 */
	if (vfs->vfs_refcount != 0)
		return (EBUSY);

	/*
	 * ...and remove it from the kernel's list.
	 */
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (v == vfs) {
			LIST_REMOVE(v, vfs_list);
			break;
		}
	}

	if (v == NULL)
		return (ESRCH);

	/*
	 * Now run the file system-specific cleanups.
	 */
	(*vfs->vfs_done)();

	/*
	 * Free the vnode operations vector.
	 */
	vfs_opv_free(vfs->vfs_opv_descs);
	return (0);
}

void
vfs_reinit(void)
{
	struct vfsops *vfs;

	LIST_FOREACH(vfs, &vfs_list, vfs_list) {
		if (vfs->vfs_reinit) {
			(*vfs->vfs_reinit)();
		}
	}
}
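
/*
 * Usage sketch (illustrative only, not compiled): a loadable file
 * system registers and unregisters its vfsops through vfs_attach()
 * and vfs_detach().  "xx_vfsops" and the module hooks below are
 * hypothetical.
 */
#if 0
extern struct vfsops xx_vfsops;

static int
xx_modload(void)
{
	/* Fails with EEXIST if an "xx" file system is already present. */
	return (vfs_attach(&xx_vfsops));
}

static int
xx_modunload(void)
{
	/* Fails with EBUSY while any "xx" file system remains mounted. */
	return (vfs_detach(&xx_vfsops));
}
#endif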

/*
 * Request a filesystem to suspend write operations.
 */
int
vfs_write_suspend(struct mount *mp, int slpflag, int slptimeo)
{
	struct proc *p = curproc;	/* XXX */
	int error;

	while ((mp->mnt_iflag & IMNT_SUSPEND)) {
		if (slptimeo < 0)
			return EWOULDBLOCK;
		error = tsleep(&mp->mnt_flag, slpflag, "suspwt1", slptimeo);
		if (error)
			return error;
	}
	mp->mnt_iflag |= IMNT_SUSPEND;

	simple_lock(&mp->mnt_slock);
	if (mp->mnt_writeopcountupper > 0)
		ltsleep(&mp->mnt_writeopcountupper, PUSER - 1, "suspwt",
		    0, &mp->mnt_slock);
	simple_unlock(&mp->mnt_slock);

	error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p);
	if (error) {
		vfs_write_resume(mp);
		return error;
	}
	mp->mnt_iflag |= IMNT_SUSPENDLOW;

	simple_lock(&mp->mnt_slock);
	if (mp->mnt_writeopcountlower > 0)
		ltsleep(&mp->mnt_writeopcountlower, PUSER - 1, "suspwt",
		    0, &mp->mnt_slock);
	mp->mnt_iflag |= IMNT_SUSPENDED;
	simple_unlock(&mp->mnt_slock);

	return 0;
}

/*
 * Request a filesystem to resume write operations.
 */
void
vfs_write_resume(struct mount *mp)
{

	if ((mp->mnt_iflag & IMNT_SUSPEND) == 0)
		return;
	mp->mnt_iflag &= ~(IMNT_SUSPEND | IMNT_SUSPENDLOW | IMNT_SUSPENDED);
	wakeup(&mp->mnt_flag);
}

void
copy_statvfs_info(struct statvfs *sbp, const struct mount *mp)
{
	const struct statvfs *mbp;

	if (sbp == (mbp = &mp->mnt_stat))
		return;

	(void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx));
	sbp->f_fsid = mbp->f_fsid;
	sbp->f_owner = mbp->f_owner;
	sbp->f_flag = mbp->f_flag;
	sbp->f_syncwrites = mbp->f_syncwrites;
	sbp->f_asyncwrites = mbp->f_asyncwrites;
	sbp->f_syncreads = mbp->f_syncreads;
	sbp->f_asyncreads = mbp->f_asyncreads;
	(void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare));
	(void)memcpy(sbp->f_fstypename, mbp->f_fstypename,
	    sizeof(sbp->f_fstypename));
	(void)memcpy(sbp->f_mntonname, mbp->f_mntonname,
	    sizeof(sbp->f_mntonname));
	(void)memcpy(sbp->f_mntfromname, mbp->f_mntfromname,
	    sizeof(sbp->f_mntfromname));
	sbp->f_namemax = mbp->f_namemax;
}

int
set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom,
    struct mount *mp, struct proc *p)
{
	int error;
	size_t size;
	struct statvfs *sfs = &mp->mnt_stat;
	int (*fun)(const void *, void *, size_t, size_t *);

	(void)strncpy(mp->mnt_stat.f_fstypename, mp->mnt_op->vfs_name,
	    sizeof(mp->mnt_stat.f_fstypename));

	if (onp) {
		struct cwdinfo *cwdi = p->p_cwdi;
		fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr;
		if (cwdi->cwdi_rdir != NULL) {
			size_t len;
			char *bp;
			char *path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);

			if (!path)	/* XXX can't happen with M_WAITOK */
				return ENOMEM;

			bp = path + MAXPATHLEN;
			*--bp = '\0';
			/*
			 * Chrooted: prefix the mount point with the path
			 * from the real root to the process' root directory.
			 */
			error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp,
			    path, MAXPATHLEN / 2, 0, p);
			if (error) {
				free(path, M_TEMP);
				return error;
			}

			len = strlen(bp);
			if (len > sizeof(sfs->f_mntonname) - 1)
				len = sizeof(sfs->f_mntonname) - 1;
			(void)strncpy(sfs->f_mntonname, bp, len);
			free(path, M_TEMP);

			if (len < sizeof(sfs->f_mntonname) - 1) {
				error = (*fun)(onp, &sfs->f_mntonname[len],
				    sizeof(sfs->f_mntonname) - len - 1, &size);
				if (error)
					return error;
				size += len;
			} else {
				size = len;
			}
		} else {
			error = (*fun)(onp, &sfs->f_mntonname,
			    sizeof(sfs->f_mntonname) - 1, &size);
			if (error)
				return error;
		}
		(void)memset(sfs->f_mntonname + size, 0,
		    sizeof(sfs->f_mntonname) - size);
	}

	if (fromp) {
		fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr;
		error = (*fun)(fromp, sfs->f_mntfromname,
		    sizeof(sfs->f_mntfromname) - 1, &size);
		if (error)
			return error;
		(void)memset(sfs->f_mntfromname + size, 0,
		    sizeof(sfs->f_mntfromname) - size);
	}
	return 0;
}
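
/*
 * Usage sketch (illustrative only, not compiled): a VFS_STATVFS
 * implementation fills in the counts it owns and lets
 * copy_statvfs_info() supply the generic fields cached in
 * mp->mnt_stat.  The "xx" file system and its fields are hypothetical.
 */
#if 0
static int
xx_statvfs(struct mount *mp, struct statvfs *sbp, struct proc *p)
{
	struct xx_mount *xmp = mp->mnt_data;

	sbp->f_bsize = xmp->xm_bsize;
	sbp->f_frsize = xmp->xm_fsize;
	sbp->f_iosize = xmp->xm_bsize;
	sbp->f_blocks = xmp->xm_nblocks;
	sbp->f_bfree = xmp->xm_nfree;
	sbp->f_bavail = xmp->xm_nfree;
	sbp->f_files = xmp->xm_nfiles;
	sbp->f_ffree = xmp->xm_ffree;
	/* Names, fsid, mount points and flags come from mp->mnt_stat. */
	copy_statvfs_info(sbp, mp);
	return (0);
}
#endif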

/*
 * Default vfs_extattrctl routine for file systems that do not support
 * it.
 */
/*ARGSUSED*/
int
vfs_stdextattrctl(struct mount *mp, int cmt, struct vnode *vp,
    int attrnamespace, const char *attrname, struct proc *p)
{

	if (vp != NULL)
		VOP_UNLOCK(vp, 0);
	return (EOPNOTSUPP);
}

/*
 * Credential check based on process requesting service, and per-attribute
 * permissions.
 *
 * NOTE: Vnode must be locked.
 */
int
extattr_check_cred(struct vnode *vp, int attrnamespace,
    struct ucred *cred, struct proc *p, int access)
{

	if (cred == NOCRED)
		return (0);

	switch (attrnamespace) {
	case EXTATTR_NAMESPACE_SYSTEM:
		/*
		 * Do we really want to allow this, or just require that
		 * these requests come from kernel code (NOCRED case above)?
		 */
		return (suser(cred, &p->p_acflag));

	case EXTATTR_NAMESPACE_USER:
		return (VOP_ACCESS(vp, access, cred, p));

	default:
		return (EPERM);
	}
}

#ifdef DDB
const char buf_flagbits[] = BUF_FLAGBITS;

void
vfs_buf_print(bp, full, pr)
	struct buf *bp;
	int full;
	void (*pr)(const char *, ...);
{
	char buf[1024];

	(*pr)(" vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" dev 0x%x\n",
	    bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_dev);

	bitmask_snprintf(bp->b_flags, buf_flagbits, buf, sizeof(buf));
	(*pr)(" error %d flags 0x%s\n", bp->b_error, buf);

	(*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n",
	    bp->b_bufsize, bp->b_bcount, bp->b_resid);
	(*pr)(" data %p saveaddr %p dep %p\n",
	    bp->b_data, bp->b_saveaddr, LIST_FIRST(&bp->b_dep));
	(*pr)(" iodone %p\n", bp->b_iodone);
}

void
vfs_vnode_print(vp, full, pr)
	struct vnode *vp;
	int full;
	void (*pr)(const char *, ...);
{
	char buf[256];

	uvm_object_printit(&vp->v_uobj, full, pr);
	bitmask_snprintf(vp->v_flag, vnode_flagbits, buf, sizeof(buf));
	(*pr)("\nVNODE flags %s\n", buf);
	(*pr)("mp %p numoutput %d size 0x%llx\n",
	    vp->v_mount, vp->v_numoutput, vp->v_size);

	(*pr)("data %p usecount %d writecount %ld holdcnt %ld numoutput %d\n",
	    vp->v_data, vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt, vp->v_numoutput);

	(*pr)("tag %s(%d) type %s(%d) mount %p typedata %p\n",
	    ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
	    ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
	    vp->v_mount, vp->v_mountedhere);

	if (full) {
		struct buf *bp;

		(*pr)("clean bufs:\n");
		LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
			(*pr)(" bp %p\n", bp);
			vfs_buf_print(bp, full, pr);
		}

		(*pr)("dirty bufs:\n");
		LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
			(*pr)(" bp %p\n", bp);
			vfs_buf_print(bp, full, pr);
		}
	}
}
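
/*
 * These printers back the ddb(4) "show buf", "show vnode" and
 * "show mount" commands (assumption: the command tables that dispatch
 * to them live in ddb; see ddb(4) for the authoritative list).  A
 * typical session, with hypothetical addresses:
 *
 *	db> show vnode 0xc0a1b2c0
 *	db> show mount 0xc09f0000 /f
 *
 * where the "/f" modifier requests the full listing.
 */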
3210 (*pr)("pid %d.%d", mp->mnt_lock.lk_lockholder, 3211 mp->mnt_lock.lk_locklwp); 3212 } else 3213 (*pr)(" not locked"); 3214 if ((mp->mnt_lock.lk_flags & LK_SPIN) == 0 && mp->mnt_lock.lk_waitcount > 0) 3215 (*pr)(" with %d pending", mp->mnt_lock.lk_waitcount); 3216 3217 (*pr)("\n"); 3218 3219 if (mp->mnt_unmounter) { 3220 (*pr)("unmounter pid = %d ",mp->mnt_unmounter->p_pid); 3221 } 3222 (*pr)("wcnt = %d, writeopcountupper = %d, writeopcountupper = %d\n", 3223 mp->mnt_wcnt,mp->mnt_writeopcountupper,mp->mnt_writeopcountlower); 3224 3225 (*pr)("statvfs cache:\n"); 3226 (*pr)("\tbsize = %lu\n",mp->mnt_stat.f_bsize); 3227 (*pr)("\tfrsize = %lu\n",mp->mnt_stat.f_frsize); 3228 (*pr)("\tiosize = %lu\n",mp->mnt_stat.f_iosize); 3229 3230 (*pr)("\tblocks = "PRIu64"\n",mp->mnt_stat.f_blocks); 3231 (*pr)("\tbfree = "PRIu64"\n",mp->mnt_stat.f_bfree); 3232 (*pr)("\tbavail = "PRIu64"\n",mp->mnt_stat.f_bavail); 3233 (*pr)("\tbresvd = "PRIu64"\n",mp->mnt_stat.f_bresvd); 3234 3235 (*pr)("\tfiles = "PRIu64"\n",mp->mnt_stat.f_files); 3236 (*pr)("\tffree = "PRIu64"\n",mp->mnt_stat.f_ffree); 3237 (*pr)("\tfavail = "PRIu64"\n",mp->mnt_stat.f_favail); 3238 (*pr)("\tfresvd = "PRIu64"\n",mp->mnt_stat.f_fresvd); 3239 3240 (*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n", 3241 mp->mnt_stat.f_fsidx.__fsid_val[0], 3242 mp->mnt_stat.f_fsidx.__fsid_val[1]); 3243 3244 (*pr)("\towner = %"PRIu32"\n",mp->mnt_stat.f_owner); 3245 (*pr)("\tnamemax = %lu\n",mp->mnt_stat.f_namemax); 3246 3247 bitmask_snprintf(mp->mnt_stat.f_flag, __MNT_FLAG_BITS, sbuf, 3248 sizeof(sbuf)); 3249 (*pr)("\tflag = %s\n",sbuf); 3250 (*pr)("\tsyncwrites = " PRIu64 "\n",mp->mnt_stat.f_syncwrites); 3251 (*pr)("\tasyncwrites = " PRIu64 "\n",mp->mnt_stat.f_asyncwrites); 3252 (*pr)("\tsyncreads = " PRIu64 "\n",mp->mnt_stat.f_syncreads); 3253 (*pr)("\tasyncreads = " PRIu64 "\n",mp->mnt_stat.f_asyncreads); 3254 (*pr)("\tfstypename = %s\n",mp->mnt_stat.f_fstypename); 3255 (*pr)("\tmntonname = %s\n",mp->mnt_stat.f_mntonname); 3256 (*pr)("\tmntfromname = %s\n",mp->mnt_stat.f_mntfromname); 3257 3258 { 3259 int cnt = 0; 3260 struct vnode *vp; 3261 (*pr)("locked vnodes ="); 3262 /* XXX would take mountlist lock, except ddb may not have context */ 3263 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 3264 if (VOP_ISLOCKED(vp)) { 3265 if ((++cnt % 6) == 0) { 3266 (*pr)(" %p,\n\t", vp); 3267 } else { 3268 (*pr)(" %p,", vp); 3269 } 3270 } 3271 } 3272 (*pr)("\n"); 3273 } 3274 3275 if (full) { 3276 int cnt = 0; 3277 struct vnode *vp; 3278 (*pr)("all vnodes ="); 3279 /* XXX would take mountlist lock, except ddb may not have context */ 3280 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 3281 if (!LIST_NEXT(vp, v_mntvnodes)) { 3282 (*pr)(" %p", vp); 3283 } else if ((++cnt % 6) == 0) { 3284 (*pr)(" %p,\n\t", vp); 3285 } else { 3286 (*pr)(" %p,", vp); 3287 } 3288 } 3289 (*pr)("\n", vp); 3290 } 3291 } 3292 3293 #endif 3294