1 /* $NetBSD: vfs_subr.c,v 1.379 2009/05/16 08:29:53 yamt Exp $ */ 2 3 /*- 4 * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 /* 34 * Copyright (c) 1989, 1993 35 * The Regents of the University of California. All rights reserved. 36 * (c) UNIX System Laboratories, Inc. 37 * All or some portions of this file are derived from material licensed 38 * to the University of California by American Telephone and Telegraph 39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 40 * the permission of UNIX System Laboratories, Inc. 41 * 42 * Redistribution and use in source and binary forms, with or without 43 * modification, are permitted provided that the following conditions 44 * are met: 45 * 1. Redistributions of source code must retain the above copyright 46 * notice, this list of conditions and the following disclaimer. 47 * 2. Redistributions in binary form must reproduce the above copyright 48 * notice, this list of conditions and the following disclaimer in the 49 * documentation and/or other materials provided with the distribution. 50 * 3. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * Note on v_usecount and locking:
 *
 * At nearly all points where it is known that v_usecount could be zero,
 * the vnode interlock will be held.
 *
 * To change v_usecount away from zero, the interlock must be held.  To
 * change from a non-zero value to zero, again the interlock must be
 * held.
 *
 * There's a flag bit, VC_XLOCK, embedded in v_usecount.
 * To raise v_usecount, if the VC_XLOCK bit is set in it, the interlock
 * must be held.
 * To modify the VC_XLOCK bit, the interlock must be held.
 * We always keep the usecount (v_usecount & VC_MASK) non-zero while the
 * VC_XLOCK bit is set.
 *
 * Unless the VC_XLOCK bit is set, changing the usecount from a non-zero
 * value to a non-zero value can safely be done using atomic operations,
 * without the interlock held.
 * Even if the VC_XLOCK bit is set, decreasing the usecount to a non-zero
 * value can be done using atomic operations, without the interlock held.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.379 2009/05/16 08:29:53 yamt Exp $");

#include "opt_ddb.h"
#include "opt_compat_netbsd.h"
#include "opt_compat_43.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/syscallargs.h>
#include <sys/device.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/kthread.h>
#include <sys/wapbl.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/syncfs/syncfs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_ddb.h>

#include <sys/sysctl.h>

const enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
const int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

/*
 * Insq/Remq for the per-vnode buffer lists.
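 *
 * For example, bgetvp() below attaches a buffer to a vnode's clean
 * buffer list with (illustrative fragment only):
 *
 *	bufinsvn(bp, &vp->v_cleanblkhd);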
142 */ 143 #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) 144 #define bufremvn(bp) { \ 145 LIST_REMOVE(bp, b_vnbufs); \ 146 (bp)->b_vnbufs.le_next = NOLIST; \ 147 } 148 149 int doforce = 1; /* 1 => permit forcible unmounting */ 150 int prtactive = 0; /* 1 => print out reclaim of active vnodes */ 151 152 static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list); 153 static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list); 154 static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list); 155 156 struct mntlist mountlist = /* mounted filesystem list */ 157 CIRCLEQ_HEAD_INITIALIZER(mountlist); 158 159 u_int numvnodes; 160 static specificdata_domain_t mount_specificdata_domain; 161 162 static int vrele_pending; 163 static int vrele_gen; 164 static kmutex_t vrele_lock; 165 static kcondvar_t vrele_cv; 166 static lwp_t *vrele_lwp; 167 168 kmutex_t mountlist_lock; 169 kmutex_t mntid_lock; 170 kmutex_t mntvnode_lock; 171 kmutex_t vnode_free_list_lock; 172 kmutex_t vfs_list_lock; 173 174 static pool_cache_t vnode_cache; 175 176 /* 177 * These define the root filesystem and device. 178 */ 179 struct vnode *rootvnode; 180 struct device *root_device; /* root device */ 181 182 /* 183 * Local declarations. 184 */ 185 186 static void vrele_thread(void *); 187 static void insmntque(vnode_t *, struct mount *); 188 static int getdevvp(dev_t, vnode_t **, enum vtype); 189 static vnode_t *getcleanvnode(void); 190 void vpanic(vnode_t *, const char *); 191 192 #ifdef DEBUG 193 void printlockedvnodes(void); 194 #endif 195 196 #ifdef DIAGNOSTIC 197 void 198 vpanic(vnode_t *vp, const char *msg) 199 { 200 201 vprint(NULL, vp); 202 panic("%s\n", msg); 203 } 204 #else 205 #define vpanic(vp, msg) /* nothing */ 206 #endif 207 208 void 209 vn_init1(void) 210 { 211 212 vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl", 213 NULL, IPL_NONE, NULL, NULL, NULL); 214 KASSERT(vnode_cache != NULL); 215 216 /* Create deferred release thread. */ 217 mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE); 218 cv_init(&vrele_cv, "vrele"); 219 if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread, 220 NULL, &vrele_lwp, "vrele")) 221 panic("fork vrele"); 222 } 223 224 /* 225 * Initialize the vnode management data structures. 226 */ 227 void 228 vntblinit(void) 229 { 230 231 mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE); 232 mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE); 233 mutex_init(&mntvnode_lock, MUTEX_DEFAULT, IPL_NONE); 234 mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE); 235 mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE); 236 237 mount_specificdata_domain = specificdata_domain_create(); 238 239 /* Initialize the filesystem syncer. */ 240 vn_initialize_syncerd(); 241 vn_init1(); 242 } 243 244 int 245 vfs_drainvnodes(long target, struct lwp *l) 246 { 247 248 while (numvnodes > target) { 249 vnode_t *vp; 250 251 mutex_enter(&vnode_free_list_lock); 252 vp = getcleanvnode(); 253 if (vp == NULL) 254 return EBUSY; /* give up */ 255 ungetnewvnode(vp); 256 } 257 258 return 0; 259 } 260 261 /* 262 * Lookup a mount point by filesystem identifier. 263 * 264 * XXX Needs to add a reference to the mount point. 
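 *
 * Illustrative sketch of a typical lookup (the caller-side code and the
 * error value chosen here are assumptions, not part of this file); note
 * that, per the XXX above, the returned pointer carries no extra
 * reference:
 *
 *	fsid_t fsid;		/* e.g. obtained from a file handle */
 *	struct mount *mp;
 *
 *	mp = vfs_getvfs(&fsid);
 *	if (mp == NULL)
 *		return ESRCH;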
265 */ 266 struct mount * 267 vfs_getvfs(fsid_t *fsid) 268 { 269 struct mount *mp; 270 271 mutex_enter(&mountlist_lock); 272 CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) { 273 if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] && 274 mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) { 275 mutex_exit(&mountlist_lock); 276 return (mp); 277 } 278 } 279 mutex_exit(&mountlist_lock); 280 return ((struct mount *)0); 281 } 282 283 /* 284 * Drop a reference to a mount structure, freeing if the last reference. 285 */ 286 void 287 vfs_destroy(struct mount *mp) 288 { 289 290 if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) { 291 return; 292 } 293 294 /* 295 * Nothing else has visibility of the mount: we can now 296 * free the data structures. 297 */ 298 KASSERT(mp->mnt_refcnt == 0); 299 specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref); 300 rw_destroy(&mp->mnt_unmounting); 301 mutex_destroy(&mp->mnt_updating); 302 mutex_destroy(&mp->mnt_renamelock); 303 if (mp->mnt_op != NULL) { 304 vfs_delref(mp->mnt_op); 305 } 306 kmem_free(mp, sizeof(*mp)); 307 } 308 309 /* 310 * grab a vnode from freelist and clean it. 311 */ 312 vnode_t * 313 getcleanvnode(void) 314 { 315 vnode_t *vp; 316 vnodelst_t *listhd; 317 318 KASSERT(mutex_owned(&vnode_free_list_lock)); 319 320 retry: 321 listhd = &vnode_free_list; 322 try_nextlist: 323 TAILQ_FOREACH(vp, listhd, v_freelist) { 324 /* 325 * It's safe to test v_usecount and v_iflag 326 * without holding the interlock here, since 327 * these vnodes should never appear on the 328 * lists. 329 */ 330 if (vp->v_usecount != 0) { 331 vpanic(vp, "free vnode isn't"); 332 } 333 if ((vp->v_iflag & VI_CLEAN) != 0) { 334 vpanic(vp, "clean vnode on freelist"); 335 } 336 if (vp->v_freelisthd != listhd) { 337 printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd); 338 vpanic(vp, "list head mismatch"); 339 } 340 if (!mutex_tryenter(&vp->v_interlock)) 341 continue; 342 /* 343 * Our lwp might hold the underlying vnode 344 * locked, so don't try to reclaim a VI_LAYER 345 * node if it's locked. 346 */ 347 if ((vp->v_iflag & VI_XLOCK) == 0 && 348 ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) { 349 break; 350 } 351 mutex_exit(&vp->v_interlock); 352 } 353 354 if (vp == NULL) { 355 if (listhd == &vnode_free_list) { 356 listhd = &vnode_hold_list; 357 goto try_nextlist; 358 } 359 mutex_exit(&vnode_free_list_lock); 360 return NULL; 361 } 362 363 /* Remove it from the freelist. */ 364 TAILQ_REMOVE(listhd, vp, v_freelist); 365 vp->v_freelisthd = NULL; 366 mutex_exit(&vnode_free_list_lock); 367 368 /* 369 * The vnode is still associated with a file system, so we must 370 * clean it out before reusing it. We need to add a reference 371 * before doing this. If the vnode gains another reference while 372 * being cleaned out then we lose - retry. 373 */ 374 atomic_add_int(&vp->v_usecount, 1 + VC_XLOCK); 375 vclean(vp, DOCLOSE); 376 KASSERT(vp->v_usecount >= 1 + VC_XLOCK); 377 atomic_add_int(&vp->v_usecount, -VC_XLOCK); 378 if (vp->v_usecount == 1) { 379 /* We're about to dirty it. */ 380 vp->v_iflag &= ~VI_CLEAN; 381 mutex_exit(&vp->v_interlock); 382 if (vp->v_type == VBLK || vp->v_type == VCHR) { 383 spec_node_destroy(vp); 384 } 385 vp->v_type = VNON; 386 } else { 387 /* 388 * Don't return to freelist - the holder of the last 389 * reference will destroy it. 
390 */ 391 vrelel(vp, 0); /* releases vp->v_interlock */ 392 mutex_enter(&vnode_free_list_lock); 393 goto retry; 394 } 395 396 if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 || 397 !TAILQ_EMPTY(&vp->v_uobj.memq)) { 398 vpanic(vp, "cleaned vnode isn't"); 399 } 400 if (vp->v_numoutput != 0) { 401 vpanic(vp, "clean vnode has pending I/O's"); 402 } 403 if ((vp->v_iflag & VI_ONWORKLST) != 0) { 404 vpanic(vp, "clean vnode on syncer list"); 405 } 406 407 return vp; 408 } 409 410 /* 411 * Mark a mount point as busy, and gain a new reference to it. Used to 412 * prevent the file system from being unmounted during critical sections. 413 * 414 * => The caller must hold a pre-existing reference to the mount. 415 * => Will fail if the file system is being unmounted, or is unmounted. 416 */ 417 int 418 vfs_busy(struct mount *mp, struct mount **nextp) 419 { 420 421 KASSERT(mp->mnt_refcnt > 0); 422 423 if (__predict_false(!rw_tryenter(&mp->mnt_unmounting, RW_READER))) { 424 if (nextp != NULL) { 425 KASSERT(mutex_owned(&mountlist_lock)); 426 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 427 } 428 return EBUSY; 429 } 430 if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) { 431 rw_exit(&mp->mnt_unmounting); 432 if (nextp != NULL) { 433 KASSERT(mutex_owned(&mountlist_lock)); 434 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 435 } 436 return ENOENT; 437 } 438 if (nextp != NULL) { 439 mutex_exit(&mountlist_lock); 440 } 441 atomic_inc_uint(&mp->mnt_refcnt); 442 return 0; 443 } 444 445 /* 446 * Unbusy a busy filesystem. 447 * 448 * => If keepref is true, preserve reference added by vfs_busy(). 449 * => If nextp != NULL, acquire mountlist_lock. 450 */ 451 void 452 vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp) 453 { 454 455 KASSERT(mp->mnt_refcnt > 0); 456 457 if (nextp != NULL) { 458 mutex_enter(&mountlist_lock); 459 } 460 rw_exit(&mp->mnt_unmounting); 461 if (!keepref) { 462 vfs_destroy(mp); 463 } 464 if (nextp != NULL) { 465 KASSERT(mutex_owned(&mountlist_lock)); 466 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 467 } 468 } 469 470 struct mount * 471 vfs_mountalloc(struct vfsops *vfsops, struct vnode *vp) 472 { 473 int error; 474 struct mount *mp; 475 476 mp = kmem_zalloc(sizeof(*mp), KM_SLEEP); 477 if (mp == NULL) 478 return NULL; 479 480 mp->mnt_op = vfsops; 481 mp->mnt_refcnt = 1; 482 TAILQ_INIT(&mp->mnt_vnodelist); 483 rw_init(&mp->mnt_unmounting); 484 mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE); 485 mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE); 486 error = vfs_busy(mp, NULL); 487 KASSERT(error == 0); 488 mp->mnt_vnodecovered = vp; 489 mount_initspecific(mp); 490 491 return mp; 492 } 493 494 /* 495 * Lookup a filesystem type, and if found allocate and initialize 496 * a mount structure for it. 497 * 498 * Devname is usually updated by mount(8) after booting. 
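 *
 * A hedged sketch of how a file system's mountroot routine might use
 * this (the file system name and the error handling shown are
 * illustrative assumptions):
 *
 *	struct mount *mp;
 *	int error;
 *
 *	error = vfs_rootmountalloc(MOUNT_FFS, "root_device", &mp);
 *	if (error)
 *		return error;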
 */
int
vfs_rootmountalloc(const char *fstypename, const char *devname,
    struct mount **mpp)
{
	struct vfsops *vfsp = NULL;
	struct mount *mp;

	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
		if (!strncmp(vfsp->vfs_name, fstypename,
		    sizeof(mp->mnt_stat.f_fstypename)))
			break;
	if (vfsp == NULL) {
		mutex_exit(&vfs_list_lock);
		return (ENODEV);
	}
	vfsp->vfs_refcount++;
	mutex_exit(&vfs_list_lock);

	if ((mp = vfs_mountalloc(vfsp, NULL)) == NULL)
		return ENOMEM;
	mp->mnt_flag = MNT_RDONLY;
	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
	    sizeof(mp->mnt_stat.f_fstypename));
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = '\0';
	mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
	    '\0';
	(void)copystr(devname, mp->mnt_stat.f_mntfromname,
	    sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern int (**dead_vnodeop_p)(void *);

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    vnode_t **vpp)
{
	struct uvm_object *uobj;
	static int toggle;
	vnode_t *vp;
	int error = 0, tryalloc;

 try_again:
	if (mp != NULL) {
		/*
		 * Mark filesystem busy while we're creating a
		 * vnode.  If unmount is in progress, this will
		 * fail.
		 */
		error = vfs_busy(mp, NULL);
		if (error)
			return error;
	}

	/*
	 * We must choose whether to allocate a new vnode or recycle an
	 * existing one.  The criterion for allocating a new one is that
	 * the total number of vnodes is less than the number desired or
	 * there are no vnodes on either free list.  Generally we only
	 * want to recycle vnodes that have no buffers associated with
	 * them, so we look first on the vnode_free_list.  If it is empty,
	 * we next consider vnodes with referencing buffers on the
	 * vnode_hold_list.  The toggle ensures that half the time we
	 * will use a buffer from the vnode_hold_list, and half the time
	 * we will allocate a new one unless the list has grown to twice
	 * the desired size.  We are reluctant to recycle vnodes from the
	 * vnode_hold_list because we will lose the identity of all its
	 * referencing buffers.
577 */ 578 579 vp = NULL; 580 581 mutex_enter(&vnode_free_list_lock); 582 583 toggle ^= 1; 584 if (numvnodes > 2 * desiredvnodes) 585 toggle = 0; 586 587 tryalloc = numvnodes < desiredvnodes || 588 (TAILQ_FIRST(&vnode_free_list) == NULL && 589 (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle)); 590 591 if (tryalloc) { 592 numvnodes++; 593 mutex_exit(&vnode_free_list_lock); 594 if ((vp = vnalloc(NULL)) == NULL) { 595 mutex_enter(&vnode_free_list_lock); 596 numvnodes--; 597 } else 598 vp->v_usecount = 1; 599 } 600 601 if (vp == NULL) { 602 vp = getcleanvnode(); 603 if (vp == NULL) { 604 if (mp != NULL) { 605 vfs_unbusy(mp, false, NULL); 606 } 607 if (tryalloc) { 608 printf("WARNING: unable to allocate new " 609 "vnode, retrying...\n"); 610 kpause("newvn", false, hz, NULL); 611 goto try_again; 612 } 613 tablefull("vnode", "increase kern.maxvnodes or NVNODE"); 614 *vpp = 0; 615 return (ENFILE); 616 } 617 vp->v_iflag = 0; 618 vp->v_vflag = 0; 619 vp->v_uflag = 0; 620 vp->v_socket = NULL; 621 } 622 623 KASSERT(vp->v_usecount == 1); 624 KASSERT(vp->v_freelisthd == NULL); 625 KASSERT(LIST_EMPTY(&vp->v_nclist)); 626 KASSERT(LIST_EMPTY(&vp->v_dnclist)); 627 628 vp->v_type = VNON; 629 vp->v_vnlock = &vp->v_lock; 630 vp->v_tag = tag; 631 vp->v_op = vops; 632 insmntque(vp, mp); 633 *vpp = vp; 634 vp->v_data = 0; 635 636 /* 637 * initialize uvm_object within vnode. 638 */ 639 640 uobj = &vp->v_uobj; 641 KASSERT(uobj->pgops == &uvm_vnodeops); 642 KASSERT(uobj->uo_npages == 0); 643 KASSERT(TAILQ_FIRST(&uobj->memq) == NULL); 644 vp->v_size = vp->v_writesize = VSIZENOTSET; 645 646 if (mp != NULL) { 647 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 648 vp->v_vflag |= VV_MPSAFE; 649 vfs_unbusy(mp, true, NULL); 650 } 651 652 return (0); 653 } 654 655 /* 656 * This is really just the reverse of getnewvnode(). Needed for 657 * VFS_VGET functions who may need to push back a vnode in case 658 * of a locking race. 659 */ 660 void 661 ungetnewvnode(vnode_t *vp) 662 { 663 664 KASSERT(vp->v_usecount == 1); 665 KASSERT(vp->v_data == NULL); 666 KASSERT(vp->v_freelisthd == NULL); 667 668 mutex_enter(&vp->v_interlock); 669 vp->v_iflag |= VI_CLEAN; 670 vrelel(vp, 0); 671 } 672 673 /* 674 * Allocate a new, uninitialized vnode. If 'mp' is non-NULL, this is a 675 * marker vnode and we are prepared to wait for the allocation. 676 */ 677 vnode_t * 678 vnalloc(struct mount *mp) 679 { 680 vnode_t *vp; 681 682 vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT)); 683 if (vp == NULL) { 684 return NULL; 685 } 686 687 memset(vp, 0, sizeof(*vp)); 688 UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0); 689 cv_init(&vp->v_cv, "vnode"); 690 /* 691 * done by memset() above. 692 * LIST_INIT(&vp->v_nclist); 693 * LIST_INIT(&vp->v_dnclist); 694 */ 695 696 if (mp != NULL) { 697 vp->v_mount = mp; 698 vp->v_type = VBAD; 699 vp->v_iflag = VI_MARKER; 700 } else { 701 rw_init(&vp->v_lock.vl_lock); 702 } 703 704 return vp; 705 } 706 707 /* 708 * Free an unused, unreferenced vnode. 709 */ 710 void 711 vnfree(vnode_t *vp) 712 { 713 714 KASSERT(vp->v_usecount == 0); 715 716 if ((vp->v_iflag & VI_MARKER) == 0) { 717 rw_destroy(&vp->v_lock.vl_lock); 718 mutex_enter(&vnode_free_list_lock); 719 numvnodes--; 720 mutex_exit(&vnode_free_list_lock); 721 } 722 723 UVM_OBJ_DESTROY(&vp->v_uobj); 724 cv_destroy(&vp->v_cv); 725 pool_cache_put(vnode_cache, vp); 726 } 727 728 /* 729 * Remove a vnode from its freelist. 
730 */ 731 static inline void 732 vremfree(vnode_t *vp) 733 { 734 735 KASSERT(mutex_owned(&vp->v_interlock)); 736 KASSERT(vp->v_usecount == 0); 737 738 /* 739 * Note that the reference count must not change until 740 * the vnode is removed. 741 */ 742 mutex_enter(&vnode_free_list_lock); 743 if (vp->v_holdcnt > 0) { 744 KASSERT(vp->v_freelisthd == &vnode_hold_list); 745 } else { 746 KASSERT(vp->v_freelisthd == &vnode_free_list); 747 } 748 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 749 vp->v_freelisthd = NULL; 750 mutex_exit(&vnode_free_list_lock); 751 } 752 753 /* 754 * Move a vnode from one mount queue to another. 755 */ 756 static void 757 insmntque(vnode_t *vp, struct mount *mp) 758 { 759 struct mount *omp; 760 761 #ifdef DIAGNOSTIC 762 if ((mp != NULL) && 763 (mp->mnt_iflag & IMNT_UNMOUNT) && 764 vp->v_tag != VT_VFS) { 765 panic("insmntque into dying filesystem"); 766 } 767 #endif 768 769 mutex_enter(&mntvnode_lock); 770 /* 771 * Delete from old mount point vnode list, if on one. 772 */ 773 if ((omp = vp->v_mount) != NULL) 774 TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes); 775 /* 776 * Insert into list of vnodes for the new mount point, if 777 * available. The caller must take a reference on the mount 778 * structure and donate to the vnode. 779 */ 780 if ((vp->v_mount = mp) != NULL) 781 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes); 782 mutex_exit(&mntvnode_lock); 783 784 if (omp != NULL) { 785 /* Release reference to old mount. */ 786 vfs_destroy(omp); 787 } 788 } 789 790 /* 791 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or 792 * recycled. 793 */ 794 void 795 vwait(vnode_t *vp, int flags) 796 { 797 798 KASSERT(mutex_owned(&vp->v_interlock)); 799 KASSERT(vp->v_usecount != 0); 800 801 while ((vp->v_iflag & flags) != 0) 802 cv_wait(&vp->v_cv, &vp->v_interlock); 803 } 804 805 /* 806 * Insert a marker vnode into a mount's vnode list, after the 807 * specified vnode. mntvnode_lock must be held. 808 */ 809 void 810 vmark(vnode_t *mvp, vnode_t *vp) 811 { 812 struct mount *mp; 813 814 mp = mvp->v_mount; 815 816 KASSERT(mutex_owned(&mntvnode_lock)); 817 KASSERT((mvp->v_iflag & VI_MARKER) != 0); 818 KASSERT(vp->v_mount == mp); 819 820 TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vp, mvp, v_mntvnodes); 821 } 822 823 /* 824 * Remove a marker vnode from a mount's vnode list, and return 825 * a pointer to the next vnode in the list. mntvnode_lock must 826 * be held. 827 */ 828 vnode_t * 829 vunmark(vnode_t *mvp) 830 { 831 vnode_t *vp; 832 struct mount *mp; 833 834 mp = mvp->v_mount; 835 836 KASSERT(mutex_owned(&mntvnode_lock)); 837 KASSERT((mvp->v_iflag & VI_MARKER) != 0); 838 839 vp = TAILQ_NEXT(mvp, v_mntvnodes); 840 TAILQ_REMOVE(&mp->mnt_vnodelist, mvp, v_mntvnodes); 841 842 KASSERT(vp == NULL || vp->v_mount == mp); 843 844 return vp; 845 } 846 847 /* 848 * Update outstanding I/O count and do wakeup if requested. 849 */ 850 void 851 vwakeup(struct buf *bp) 852 { 853 struct vnode *vp; 854 855 if ((vp = bp->b_vp) == NULL) 856 return; 857 858 KASSERT(bp->b_objlock == &vp->v_interlock); 859 KASSERT(mutex_owned(bp->b_objlock)); 860 861 if (--vp->v_numoutput < 0) 862 panic("vwakeup: neg numoutput, vp %p", vp); 863 if (vp->v_numoutput == 0) 864 cv_broadcast(&vp->v_cv); 865 } 866 867 /* 868 * Flush out and invalidate all buffers associated with a vnode. 869 * Called with the underlying vnode locked, which should prevent new dirty 870 * buffers from being queued. 
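 *
 * A representative call, modelled on the DOCLOSE path in vclean() below
 * (the credential and lwp arguments are whatever the caller has at
 * hand):
 *
 *	error = vinvalbuf(vp, V_SAVE, NOCRED, curlwp, false, 0);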
871 */ 872 int 873 vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l, 874 bool catch, int slptimeo) 875 { 876 struct buf *bp, *nbp; 877 int error; 878 int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO | 879 (flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0); 880 881 /* XXXUBC this doesn't look at flags or slp* */ 882 mutex_enter(&vp->v_interlock); 883 error = VOP_PUTPAGES(vp, 0, 0, flushflags); 884 if (error) { 885 return error; 886 } 887 888 if (flags & V_SAVE) { 889 error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0); 890 if (error) 891 return (error); 892 KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd)); 893 } 894 895 mutex_enter(&bufcache_lock); 896 restart: 897 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 898 nbp = LIST_NEXT(bp, b_vnbufs); 899 error = bbusy(bp, catch, slptimeo, NULL); 900 if (error != 0) { 901 if (error == EPASSTHROUGH) 902 goto restart; 903 mutex_exit(&bufcache_lock); 904 return (error); 905 } 906 brelsel(bp, BC_INVAL | BC_VFLUSH); 907 } 908 909 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 910 nbp = LIST_NEXT(bp, b_vnbufs); 911 error = bbusy(bp, catch, slptimeo, NULL); 912 if (error != 0) { 913 if (error == EPASSTHROUGH) 914 goto restart; 915 mutex_exit(&bufcache_lock); 916 return (error); 917 } 918 /* 919 * XXX Since there are no node locks for NFS, I believe 920 * there is a slight chance that a delayed write will 921 * occur while sleeping just above, so check for it. 922 */ 923 if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) { 924 #ifdef DEBUG 925 printf("buffer still DELWRI\n"); 926 #endif 927 bp->b_cflags |= BC_BUSY | BC_VFLUSH; 928 mutex_exit(&bufcache_lock); 929 VOP_BWRITE(bp); 930 mutex_enter(&bufcache_lock); 931 goto restart; 932 } 933 brelsel(bp, BC_INVAL | BC_VFLUSH); 934 } 935 936 #ifdef DIAGNOSTIC 937 if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd)) 938 panic("vinvalbuf: flush failed, vp %p", vp); 939 #endif 940 941 mutex_exit(&bufcache_lock); 942 943 return (0); 944 } 945 946 /* 947 * Destroy any in core blocks past the truncation length. 948 * Called with the underlying vnode locked, which should prevent new dirty 949 * buffers from being queued. 950 */ 951 int 952 vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch, int slptimeo) 953 { 954 struct buf *bp, *nbp; 955 int error; 956 voff_t off; 957 958 off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift); 959 mutex_enter(&vp->v_interlock); 960 error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO); 961 if (error) { 962 return error; 963 } 964 965 mutex_enter(&bufcache_lock); 966 restart: 967 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 968 nbp = LIST_NEXT(bp, b_vnbufs); 969 if (bp->b_lblkno < lbn) 970 continue; 971 error = bbusy(bp, catch, slptimeo, NULL); 972 if (error != 0) { 973 if (error == EPASSTHROUGH) 974 goto restart; 975 mutex_exit(&bufcache_lock); 976 return (error); 977 } 978 brelsel(bp, BC_INVAL | BC_VFLUSH); 979 } 980 981 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 982 nbp = LIST_NEXT(bp, b_vnbufs); 983 if (bp->b_lblkno < lbn) 984 continue; 985 error = bbusy(bp, catch, slptimeo, NULL); 986 if (error != 0) { 987 if (error == EPASSTHROUGH) 988 goto restart; 989 mutex_exit(&bufcache_lock); 990 return (error); 991 } 992 brelsel(bp, BC_INVAL | BC_VFLUSH); 993 } 994 mutex_exit(&bufcache_lock); 995 996 return (0); 997 } 998 999 /* 1000 * Flush all dirty buffers from a vnode. 1001 * Called with the underlying vnode locked, which should prevent new dirty 1002 * buffers from being queued. 
1003 */ 1004 void 1005 vflushbuf(struct vnode *vp, int sync) 1006 { 1007 struct buf *bp, *nbp; 1008 int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0); 1009 bool dirty; 1010 1011 mutex_enter(&vp->v_interlock); 1012 (void) VOP_PUTPAGES(vp, 0, 0, flags); 1013 1014 loop: 1015 mutex_enter(&bufcache_lock); 1016 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 1017 nbp = LIST_NEXT(bp, b_vnbufs); 1018 if ((bp->b_cflags & BC_BUSY)) 1019 continue; 1020 if ((bp->b_oflags & BO_DELWRI) == 0) 1021 panic("vflushbuf: not dirty, bp %p", bp); 1022 bp->b_cflags |= BC_BUSY | BC_VFLUSH; 1023 mutex_exit(&bufcache_lock); 1024 /* 1025 * Wait for I/O associated with indirect blocks to complete, 1026 * since there is no way to quickly wait for them below. 1027 */ 1028 if (bp->b_vp == vp || sync == 0) 1029 (void) bawrite(bp); 1030 else 1031 (void) bwrite(bp); 1032 goto loop; 1033 } 1034 mutex_exit(&bufcache_lock); 1035 1036 if (sync == 0) 1037 return; 1038 1039 mutex_enter(&vp->v_interlock); 1040 while (vp->v_numoutput != 0) 1041 cv_wait(&vp->v_cv, &vp->v_interlock); 1042 dirty = !LIST_EMPTY(&vp->v_dirtyblkhd); 1043 mutex_exit(&vp->v_interlock); 1044 1045 if (dirty) { 1046 vprint("vflushbuf: dirty", vp); 1047 goto loop; 1048 } 1049 } 1050 1051 /* 1052 * Create a vnode for a block device. 1053 * Used for root filesystem and swap areas. 1054 * Also used for memory file system special devices. 1055 */ 1056 int 1057 bdevvp(dev_t dev, vnode_t **vpp) 1058 { 1059 1060 return (getdevvp(dev, vpp, VBLK)); 1061 } 1062 1063 /* 1064 * Create a vnode for a character device. 1065 * Used for kernfs and some console handling. 1066 */ 1067 int 1068 cdevvp(dev_t dev, vnode_t **vpp) 1069 { 1070 1071 return (getdevvp(dev, vpp, VCHR)); 1072 } 1073 1074 /* 1075 * Associate a buffer with a vnode. There must already be a hold on 1076 * the vnode. 1077 */ 1078 void 1079 bgetvp(struct vnode *vp, struct buf *bp) 1080 { 1081 1082 KASSERT(bp->b_vp == NULL); 1083 KASSERT(bp->b_objlock == &buffer_lock); 1084 KASSERT(mutex_owned(&vp->v_interlock)); 1085 KASSERT(mutex_owned(&bufcache_lock)); 1086 KASSERT((bp->b_cflags & BC_BUSY) != 0); 1087 KASSERT(!cv_has_waiters(&bp->b_done)); 1088 1089 vholdl(vp); 1090 bp->b_vp = vp; 1091 if (vp->v_type == VBLK || vp->v_type == VCHR) 1092 bp->b_dev = vp->v_rdev; 1093 else 1094 bp->b_dev = NODEV; 1095 1096 /* 1097 * Insert onto list for new vnode. 1098 */ 1099 bufinsvn(bp, &vp->v_cleanblkhd); 1100 bp->b_objlock = &vp->v_interlock; 1101 } 1102 1103 /* 1104 * Disassociate a buffer from a vnode. 1105 */ 1106 void 1107 brelvp(struct buf *bp) 1108 { 1109 struct vnode *vp = bp->b_vp; 1110 1111 KASSERT(vp != NULL); 1112 KASSERT(bp->b_objlock == &vp->v_interlock); 1113 KASSERT(mutex_owned(&vp->v_interlock)); 1114 KASSERT(mutex_owned(&bufcache_lock)); 1115 KASSERT((bp->b_cflags & BC_BUSY) != 0); 1116 KASSERT(!cv_has_waiters(&bp->b_done)); 1117 1118 /* 1119 * Delete from old vnode list, if on one. 1120 */ 1121 if (LIST_NEXT(bp, b_vnbufs) != NOLIST) 1122 bufremvn(bp); 1123 1124 if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_iflag & VI_ONWORKLST) && 1125 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { 1126 vp->v_iflag &= ~VI_WRMAPDIRTY; 1127 vn_syncer_remove_from_worklist(vp); 1128 } 1129 1130 bp->b_objlock = &buffer_lock; 1131 bp->b_vp = NULL; 1132 holdrelel(vp); 1133 } 1134 1135 /* 1136 * Reassign a buffer from one vnode list to another. 1137 * The list reassignment must be within the same vnode. 1138 * Used to assign file specific control information 1139 * (indirect blocks) to the list to which they belong. 
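 *
 * Per the assertions in the function, callers hold bufcache_lock and the
 * vnode interlock (bp->b_objlock) with the buffer marked BC_BUSY, roughly
 * as follows (sketch only, not a complete caller):
 *
 *	mutex_enter(&bufcache_lock);
 *	mutex_enter(&vp->v_interlock);
 *	reassignbuf(bp, vp);
 *	mutex_exit(&vp->v_interlock);
 *	mutex_exit(&bufcache_lock);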
1140 */ 1141 void 1142 reassignbuf(struct buf *bp, struct vnode *vp) 1143 { 1144 struct buflists *listheadp; 1145 int delayx; 1146 1147 KASSERT(mutex_owned(&bufcache_lock)); 1148 KASSERT(bp->b_objlock == &vp->v_interlock); 1149 KASSERT(mutex_owned(&vp->v_interlock)); 1150 KASSERT((bp->b_cflags & BC_BUSY) != 0); 1151 1152 /* 1153 * Delete from old vnode list, if on one. 1154 */ 1155 if (LIST_NEXT(bp, b_vnbufs) != NOLIST) 1156 bufremvn(bp); 1157 1158 /* 1159 * If dirty, put on list of dirty buffers; 1160 * otherwise insert onto list of clean buffers. 1161 */ 1162 if ((bp->b_oflags & BO_DELWRI) == 0) { 1163 listheadp = &vp->v_cleanblkhd; 1164 if (TAILQ_EMPTY(&vp->v_uobj.memq) && 1165 (vp->v_iflag & VI_ONWORKLST) && 1166 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { 1167 vp->v_iflag &= ~VI_WRMAPDIRTY; 1168 vn_syncer_remove_from_worklist(vp); 1169 } 1170 } else { 1171 listheadp = &vp->v_dirtyblkhd; 1172 if ((vp->v_iflag & VI_ONWORKLST) == 0) { 1173 switch (vp->v_type) { 1174 case VDIR: 1175 delayx = dirdelay; 1176 break; 1177 case VBLK: 1178 if (vp->v_specmountpoint != NULL) { 1179 delayx = metadelay; 1180 break; 1181 } 1182 /* fall through */ 1183 default: 1184 delayx = filedelay; 1185 break; 1186 } 1187 if (!vp->v_mount || 1188 (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) 1189 vn_syncer_add_to_worklist(vp, delayx); 1190 } 1191 } 1192 bufinsvn(bp, listheadp); 1193 } 1194 1195 /* 1196 * Create a vnode for a device. 1197 * Used by bdevvp (block device) for root file system etc., 1198 * and by cdevvp (character device) for console and kernfs. 1199 */ 1200 static int 1201 getdevvp(dev_t dev, vnode_t **vpp, enum vtype type) 1202 { 1203 vnode_t *vp; 1204 vnode_t *nvp; 1205 int error; 1206 1207 if (dev == NODEV) { 1208 *vpp = NULL; 1209 return (0); 1210 } 1211 error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp); 1212 if (error) { 1213 *vpp = NULL; 1214 return (error); 1215 } 1216 vp = nvp; 1217 vp->v_type = type; 1218 vp->v_vflag |= VV_MPSAFE; 1219 uvm_vnp_setsize(vp, 0); 1220 spec_node_init(vp, dev); 1221 *vpp = vp; 1222 return (0); 1223 } 1224 1225 /* 1226 * Try to gain a reference to a vnode, without acquiring its interlock. 1227 * The caller must hold a lock that will prevent the vnode from being 1228 * recycled or freed. 1229 */ 1230 bool 1231 vtryget(vnode_t *vp) 1232 { 1233 u_int use, next; 1234 1235 /* 1236 * If the vnode is being freed, don't make life any harder 1237 * for vclean() by adding another reference without waiting. 1238 * This is not strictly necessary, but we'll do it anyway. 1239 */ 1240 if (__predict_false((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0)) { 1241 return false; 1242 } 1243 for (use = vp->v_usecount;; use = next) { 1244 if (use == 0 || __predict_false((use & VC_XLOCK) != 0)) { 1245 /* Need interlock held if first reference. */ 1246 return false; 1247 } 1248 next = atomic_cas_uint(&vp->v_usecount, use, use + 1); 1249 if (__predict_true(next == use)) { 1250 return true; 1251 } 1252 } 1253 } 1254 1255 /* 1256 * Grab a particular vnode from the free list, increment its 1257 * reference count and lock it. If the vnode lock bit is set the 1258 * vnode is being eliminated in vgone. In that case, we can not 1259 * grab the vnode, so the process is awakened when the transition is 1260 * completed, and an error returned to indicate that the vnode is no 1261 * longer usable (possibly having been changed to a new file system type). 
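 *
 * A common calling pattern (sketch; the lock type and flags depend on
 * the caller's needs):
 *
 *	mutex_enter(&vp->v_interlock);
 *	error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK);
 *	if (error != 0)
 *		return error;	/* vnode was being cleaned out or is busy */
 *	...
 *	vput(vp);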
1262 */ 1263 int 1264 vget(vnode_t *vp, int flags) 1265 { 1266 int error; 1267 1268 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1269 1270 if ((flags & LK_INTERLOCK) == 0) 1271 mutex_enter(&vp->v_interlock); 1272 1273 /* 1274 * Before adding a reference, we must remove the vnode 1275 * from its freelist. 1276 */ 1277 if (vp->v_usecount == 0) { 1278 vremfree(vp); 1279 vp->v_usecount = 1; 1280 } else { 1281 atomic_inc_uint(&vp->v_usecount); 1282 } 1283 1284 /* 1285 * If the vnode is in the process of being cleaned out for 1286 * another use, we wait for the cleaning to finish and then 1287 * return failure. Cleaning is determined by checking if 1288 * the VI_XLOCK or VI_FREEING flags are set. 1289 */ 1290 if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) { 1291 if ((flags & LK_NOWAIT) != 0) { 1292 vrelel(vp, 0); 1293 return EBUSY; 1294 } 1295 vwait(vp, VI_XLOCK | VI_FREEING); 1296 vrelel(vp, 0); 1297 return ENOENT; 1298 } 1299 if (flags & LK_TYPE_MASK) { 1300 error = vn_lock(vp, flags | LK_INTERLOCK); 1301 if (error != 0) { 1302 vrele(vp); 1303 } 1304 return error; 1305 } 1306 mutex_exit(&vp->v_interlock); 1307 return 0; 1308 } 1309 1310 /* 1311 * vput(), just unlock and vrele() 1312 */ 1313 void 1314 vput(vnode_t *vp) 1315 { 1316 1317 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1318 1319 VOP_UNLOCK(vp, 0); 1320 vrele(vp); 1321 } 1322 1323 /* 1324 * Try to drop reference on a vnode. Abort if we are releasing the 1325 * last reference. Note: this _must_ succeed if not the last reference. 1326 */ 1327 static inline bool 1328 vtryrele(vnode_t *vp) 1329 { 1330 u_int use, next; 1331 1332 for (use = vp->v_usecount;; use = next) { 1333 if (use == 1) { 1334 return false; 1335 } 1336 KASSERT((use & VC_MASK) > 1); 1337 next = atomic_cas_uint(&vp->v_usecount, use, use - 1); 1338 if (__predict_true(next == use)) { 1339 return true; 1340 } 1341 } 1342 } 1343 1344 /* 1345 * Vnode release. If reference count drops to zero, call inactive 1346 * routine and either return to freelist or free to the pool. 1347 */ 1348 void 1349 vrelel(vnode_t *vp, int flags) 1350 { 1351 bool recycle, defer; 1352 int error; 1353 1354 KASSERT(mutex_owned(&vp->v_interlock)); 1355 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1356 KASSERT(vp->v_freelisthd == NULL); 1357 1358 if (__predict_false(vp->v_op == dead_vnodeop_p && 1359 (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) { 1360 vpanic(vp, "dead but not clean"); 1361 } 1362 1363 /* 1364 * If not the last reference, just drop the reference count 1365 * and unlock. 1366 */ 1367 if (vtryrele(vp)) { 1368 vp->v_iflag |= VI_INACTREDO; 1369 mutex_exit(&vp->v_interlock); 1370 return; 1371 } 1372 if (vp->v_usecount <= 0 || vp->v_writecount != 0) { 1373 vpanic(vp, "vrelel: bad ref count"); 1374 } 1375 1376 KASSERT((vp->v_iflag & VI_XLOCK) == 0); 1377 1378 /* 1379 * If not clean, deactivate the vnode, but preserve 1380 * our reference across the call to VOP_INACTIVE(). 1381 */ 1382 retry: 1383 if ((vp->v_iflag & VI_CLEAN) == 0) { 1384 recycle = false; 1385 vp->v_iflag |= VI_INACTNOW; 1386 1387 /* 1388 * XXX This ugly block can be largely eliminated if 1389 * locking is pushed down into the file systems. 1390 */ 1391 if (curlwp == uvm.pagedaemon_lwp) { 1392 /* The pagedaemon can't wait around; defer. */ 1393 defer = true; 1394 } else if (curlwp == vrele_lwp) { 1395 /* We have to try harder. 
*/ 1396 vp->v_iflag &= ~VI_INACTREDO; 1397 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | 1398 LK_RETRY); 1399 if (error != 0) { 1400 /* XXX */ 1401 vpanic(vp, "vrele: unable to lock %p"); 1402 } 1403 defer = false; 1404 } else if ((vp->v_iflag & VI_LAYER) != 0) { 1405 /* 1406 * Acquiring the stack's lock in vclean() even 1407 * for an honest vput/vrele is dangerous because 1408 * our caller may hold other vnode locks; defer. 1409 */ 1410 defer = true; 1411 } else { 1412 /* If we can't acquire the lock, then defer. */ 1413 vp->v_iflag &= ~VI_INACTREDO; 1414 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | 1415 LK_NOWAIT); 1416 if (error != 0) { 1417 defer = true; 1418 mutex_enter(&vp->v_interlock); 1419 } else { 1420 defer = false; 1421 } 1422 } 1423 1424 if (defer) { 1425 /* 1426 * Defer reclaim to the kthread; it's not safe to 1427 * clean it here. We donate it our last reference. 1428 */ 1429 KASSERT(mutex_owned(&vp->v_interlock)); 1430 KASSERT((vp->v_iflag & VI_INACTPEND) == 0); 1431 vp->v_iflag &= ~VI_INACTNOW; 1432 vp->v_iflag |= VI_INACTPEND; 1433 mutex_enter(&vrele_lock); 1434 TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist); 1435 if (++vrele_pending > (desiredvnodes >> 8)) 1436 cv_signal(&vrele_cv); 1437 mutex_exit(&vrele_lock); 1438 mutex_exit(&vp->v_interlock); 1439 return; 1440 } 1441 1442 #ifdef DIAGNOSTIC 1443 if ((vp->v_type == VBLK || vp->v_type == VCHR) && 1444 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) { 1445 vprint("vrelel: missing VOP_CLOSE()", vp); 1446 } 1447 #endif 1448 1449 /* 1450 * The vnode can gain another reference while being 1451 * deactivated. If VOP_INACTIVE() indicates that 1452 * the described file has been deleted, then recycle 1453 * the vnode irrespective of additional references. 1454 * Another thread may be waiting to re-use the on-disk 1455 * inode. 1456 * 1457 * Note that VOP_INACTIVE() will drop the vnode lock. 1458 */ 1459 VOP_INACTIVE(vp, &recycle); 1460 mutex_enter(&vp->v_interlock); 1461 vp->v_iflag &= ~VI_INACTNOW; 1462 if (!recycle) { 1463 if (vtryrele(vp)) { 1464 mutex_exit(&vp->v_interlock); 1465 return; 1466 } 1467 1468 /* 1469 * If we grew another reference while 1470 * VOP_INACTIVE() was underway, retry. 1471 */ 1472 if ((vp->v_iflag & VI_INACTREDO) != 0) { 1473 goto retry; 1474 } 1475 } 1476 1477 /* Take care of space accounting. */ 1478 if (vp->v_iflag & VI_EXECMAP) { 1479 atomic_add_int(&uvmexp.execpages, 1480 -vp->v_uobj.uo_npages); 1481 atomic_add_int(&uvmexp.filepages, 1482 vp->v_uobj.uo_npages); 1483 } 1484 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP); 1485 vp->v_vflag &= ~VV_MAPPED; 1486 1487 /* 1488 * Recycle the vnode if the file is now unused (unlinked), 1489 * otherwise just free it. 1490 */ 1491 if (recycle) { 1492 vclean(vp, DOCLOSE); 1493 } 1494 KASSERT(vp->v_usecount > 0); 1495 } 1496 1497 if (atomic_dec_uint_nv(&vp->v_usecount) != 0) { 1498 /* Gained another reference while being reclaimed. */ 1499 mutex_exit(&vp->v_interlock); 1500 return; 1501 } 1502 1503 if ((vp->v_iflag & VI_CLEAN) != 0) { 1504 /* 1505 * It's clean so destroy it. It isn't referenced 1506 * anywhere since it has been reclaimed. 1507 */ 1508 KASSERT(vp->v_holdcnt == 0); 1509 KASSERT(vp->v_writecount == 0); 1510 mutex_exit(&vp->v_interlock); 1511 insmntque(vp, NULL); 1512 if (vp->v_type == VBLK || vp->v_type == VCHR) { 1513 spec_node_destroy(vp); 1514 } 1515 vnfree(vp); 1516 } else { 1517 /* 1518 * Otherwise, put it back onto the freelist. It 1519 * can't be destroyed while still associated with 1520 * a file system. 
1521 */ 1522 mutex_enter(&vnode_free_list_lock); 1523 if (vp->v_holdcnt > 0) { 1524 vp->v_freelisthd = &vnode_hold_list; 1525 } else { 1526 vp->v_freelisthd = &vnode_free_list; 1527 } 1528 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 1529 mutex_exit(&vnode_free_list_lock); 1530 mutex_exit(&vp->v_interlock); 1531 } 1532 } 1533 1534 void 1535 vrele(vnode_t *vp) 1536 { 1537 1538 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1539 1540 if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) { 1541 return; 1542 } 1543 mutex_enter(&vp->v_interlock); 1544 vrelel(vp, 0); 1545 } 1546 1547 static void 1548 vrele_thread(void *cookie) 1549 { 1550 vnode_t *vp; 1551 1552 for (;;) { 1553 mutex_enter(&vrele_lock); 1554 while (TAILQ_EMPTY(&vrele_list)) { 1555 vrele_gen++; 1556 cv_broadcast(&vrele_cv); 1557 cv_timedwait(&vrele_cv, &vrele_lock, hz); 1558 } 1559 vp = TAILQ_FIRST(&vrele_list); 1560 TAILQ_REMOVE(&vrele_list, vp, v_freelist); 1561 vrele_pending--; 1562 mutex_exit(&vrele_lock); 1563 1564 /* 1565 * If not the last reference, then ignore the vnode 1566 * and look for more work. 1567 */ 1568 mutex_enter(&vp->v_interlock); 1569 KASSERT((vp->v_iflag & VI_INACTPEND) != 0); 1570 vp->v_iflag &= ~VI_INACTPEND; 1571 vrelel(vp, 0); 1572 } 1573 } 1574 1575 /* 1576 * Page or buffer structure gets a reference. 1577 * Called with v_interlock held. 1578 */ 1579 void 1580 vholdl(vnode_t *vp) 1581 { 1582 1583 KASSERT(mutex_owned(&vp->v_interlock)); 1584 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1585 1586 if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) { 1587 mutex_enter(&vnode_free_list_lock); 1588 KASSERT(vp->v_freelisthd == &vnode_free_list); 1589 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 1590 vp->v_freelisthd = &vnode_hold_list; 1591 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 1592 mutex_exit(&vnode_free_list_lock); 1593 } 1594 } 1595 1596 /* 1597 * Page or buffer structure frees a reference. 1598 * Called with v_interlock held. 1599 */ 1600 void 1601 holdrelel(vnode_t *vp) 1602 { 1603 1604 KASSERT(mutex_owned(&vp->v_interlock)); 1605 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1606 1607 if (vp->v_holdcnt <= 0) { 1608 vpanic(vp, "holdrelel: holdcnt vp %p"); 1609 } 1610 1611 vp->v_holdcnt--; 1612 if (vp->v_holdcnt == 0 && vp->v_usecount == 0) { 1613 mutex_enter(&vnode_free_list_lock); 1614 KASSERT(vp->v_freelisthd == &vnode_hold_list); 1615 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 1616 vp->v_freelisthd = &vnode_free_list; 1617 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 1618 mutex_exit(&vnode_free_list_lock); 1619 } 1620 } 1621 1622 /* 1623 * Vnode reference, where a reference is already held by some other 1624 * object (for example, a file structure). 1625 */ 1626 void 1627 vref(vnode_t *vp) 1628 { 1629 1630 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1631 KASSERT(vp->v_usecount != 0); 1632 1633 atomic_inc_uint(&vp->v_usecount); 1634 } 1635 1636 /* 1637 * Remove any vnodes in the vnode table belonging to mount point mp. 1638 * 1639 * If FORCECLOSE is not specified, there should not be any active ones, 1640 * return error if any are found (nb: this is a user error, not a 1641 * system error). If FORCECLOSE is specified, detach any active vnodes 1642 * that are found. 1643 * 1644 * If WRITECLOSE is set, only flush out regular file vnodes open for 1645 * writing. 1646 * 1647 * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped. 
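 *
 * Typical use from a file system's unmount path (sketch; the flag
 * selection shown is an assumption about the caller):
 *
 *	int flags = SKIPSYSTEM;
 *
 *	if (mntflags & MNT_FORCE)
 *		flags |= FORCECLOSE;
 *	error = vflush(mp, NULLVP, flags);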
 */
#ifdef DEBUG
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

static vnode_t *
vflushnext(vnode_t *mvp, int *when)
{

	if (hardclock_ticks > *when) {
		mutex_exit(&mntvnode_lock);
		yield();
		mutex_enter(&mntvnode_lock);
		*when = hardclock_ticks + hz / 10;
	}

	return vunmark(mvp);
}

int
vflush(struct mount *mp, vnode_t *skipvp, int flags)
{
	vnode_t *vp, *mvp;
	int busy = 0, when = 0, gen;

	/*
	 * First, flush out any vnode references from vrele_list.
	 */
	mutex_enter(&vrele_lock);
	gen = vrele_gen;
	while (vrele_pending && gen == vrele_gen) {
		cv_broadcast(&vrele_cv);
		cv_wait(&vrele_cv, &vrele_lock);
	}
	mutex_exit(&vrele_lock);

	/* Allocate a marker vnode. */
	if ((mvp = vnalloc(mp)) == NULL)
		return (ENOMEM);

	/*
	 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
	 * and vclean() are called.
	 */
	mutex_enter(&mntvnode_lock);
	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
	    vp = vflushnext(mvp, &when)) {
		vmark(mvp, vp);
		if (vp->v_mount != mp || vismarker(vp))
			continue;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;
		mutex_enter(&vp->v_interlock);
		/*
		 * Ignore clean but still referenced vnodes.
		 */
		if ((vp->v_iflag & VI_CLEAN) != 0) {
			mutex_exit(&vp->v_interlock);
			continue;
		}
		/*
		 * Skip over vnodes marked VV_SYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
			mutex_exit(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			mutex_exit(&vp->v_interlock);
			continue;
		}
		/*
		 * With v_usecount == 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			mutex_exit(&mntvnode_lock);
			vremfree(vp);
			vp->v_usecount = 1;
			vclean(vp, DOCLOSE);
			vrelel(vp, 0);
			mutex_enter(&mntvnode_lock);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device.  For all other files, just
		 * kill them.
		 */
		if (flags & FORCECLOSE) {
			mutex_exit(&mntvnode_lock);
			atomic_inc_uint(&vp->v_usecount);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vclean(vp, DOCLOSE);
				vrelel(vp, 0);
			} else {
				vclean(vp, 0);
				vp->v_op = spec_vnodeop_p; /* XXXSMP */
				mutex_exit(&vp->v_interlock);
				/*
				 * The vnode isn't clean, but still resides
				 * on the mount list.  Remove it. XXX This
				 * is a bit dodgy.
				 */
				insmntque(vp, NULL);
				vrele(vp);
			}
			mutex_enter(&mntvnode_lock);
			continue;
		}
#ifdef DEBUG
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		mutex_exit(&vp->v_interlock);
		busy++;
	}
	mutex_exit(&mntvnode_lock);
	vnfree(mvp);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the interlock held, and will return with it held.
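 *
 * The calling pattern used elsewhere in this file (see for instance
 * vgone() below) is:
 *
 *	mutex_enter(&vp->v_interlock);
 *	vclean(vp, DOCLOSE);
 *	vrelel(vp, 0);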
1786 */ 1787 void 1788 vclean(vnode_t *vp, int flags) 1789 { 1790 lwp_t *l = curlwp; 1791 bool recycle, active; 1792 int error; 1793 1794 KASSERT(mutex_owned(&vp->v_interlock)); 1795 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1796 KASSERT(vp->v_usecount != 0); 1797 1798 /* If cleaning is already in progress wait until done and return. */ 1799 if (vp->v_iflag & VI_XLOCK) { 1800 vwait(vp, VI_XLOCK); 1801 return; 1802 } 1803 1804 /* If already clean, nothing to do. */ 1805 if ((vp->v_iflag & VI_CLEAN) != 0) { 1806 return; 1807 } 1808 1809 /* 1810 * Prevent the vnode from being recycled or brought into use 1811 * while we clean it out. 1812 */ 1813 vp->v_iflag |= VI_XLOCK; 1814 if (vp->v_iflag & VI_EXECMAP) { 1815 atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages); 1816 atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages); 1817 } 1818 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP); 1819 active = (vp->v_usecount > 1); 1820 1821 /* XXXAD should not lock vnode under layer */ 1822 VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK); 1823 1824 /* 1825 * Clean out any cached data associated with the vnode. 1826 * If purging an active vnode, it must be closed and 1827 * deactivated before being reclaimed. Note that the 1828 * VOP_INACTIVE will unlock the vnode. 1829 */ 1830 if (flags & DOCLOSE) { 1831 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); 1832 if (error != 0) { 1833 /* XXX, fix vn_start_write's grab of mp and use that. */ 1834 1835 if (wapbl_vphaswapbl(vp)) 1836 WAPBL_DISCARD(wapbl_vptomp(vp)); 1837 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0); 1838 } 1839 KASSERT(error == 0); 1840 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1841 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) { 1842 spec_node_revoke(vp); 1843 } 1844 } 1845 if (active) { 1846 VOP_INACTIVE(vp, &recycle); 1847 } else { 1848 /* 1849 * Any other processes trying to obtain this lock must first 1850 * wait for VI_XLOCK to clear, then call the new lock operation. 1851 */ 1852 VOP_UNLOCK(vp, 0); 1853 } 1854 1855 /* Disassociate the underlying file system from the vnode. */ 1856 if (VOP_RECLAIM(vp)) { 1857 vpanic(vp, "vclean: cannot reclaim"); 1858 } 1859 1860 KASSERT(vp->v_uobj.uo_npages == 0); 1861 if (vp->v_type == VREG && vp->v_ractx != NULL) { 1862 uvm_ra_freectx(vp->v_ractx); 1863 vp->v_ractx = NULL; 1864 } 1865 cache_purge(vp); 1866 1867 /* Done with purge, notify sleepers of the grim news. */ 1868 mutex_enter(&vp->v_interlock); 1869 vp->v_op = dead_vnodeop_p; 1870 vp->v_tag = VT_NON; 1871 vp->v_vnlock = &vp->v_lock; 1872 KNOTE(&vp->v_klist, NOTE_REVOKE); 1873 vp->v_iflag &= ~(VI_XLOCK | VI_FREEING); 1874 vp->v_vflag &= ~VV_LOCKSWORK; 1875 if ((flags & DOCLOSE) != 0) { 1876 vp->v_iflag |= VI_CLEAN; 1877 } 1878 cv_broadcast(&vp->v_cv); 1879 1880 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1881 } 1882 1883 /* 1884 * Recycle an unused vnode to the front of the free list. 1885 * Release the passed interlock if the vnode will be recycled. 1886 */ 1887 int 1888 vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l) 1889 { 1890 1891 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1892 1893 mutex_enter(&vp->v_interlock); 1894 if (vp->v_usecount != 0) { 1895 mutex_exit(&vp->v_interlock); 1896 return (0); 1897 } 1898 if (inter_lkp) 1899 mutex_exit(inter_lkp); 1900 vremfree(vp); 1901 vp->v_usecount = 1; 1902 vclean(vp, DOCLOSE); 1903 vrelel(vp, 0); 1904 return (1); 1905 } 1906 1907 /* 1908 * Eliminate all activity associated with a vnode in preparation for 1909 * reuse. Drops a reference from the vnode. 
1910 */ 1911 void 1912 vgone(vnode_t *vp) 1913 { 1914 1915 mutex_enter(&vp->v_interlock); 1916 vclean(vp, DOCLOSE); 1917 vrelel(vp, 0); 1918 } 1919 1920 /* 1921 * Lookup a vnode by device number. 1922 */ 1923 int 1924 vfinddev(dev_t dev, enum vtype type, vnode_t **vpp) 1925 { 1926 vnode_t *vp; 1927 int rc = 0; 1928 1929 mutex_enter(&device_lock); 1930 for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) { 1931 if (dev != vp->v_rdev || type != vp->v_type) 1932 continue; 1933 *vpp = vp; 1934 rc = 1; 1935 break; 1936 } 1937 mutex_exit(&device_lock); 1938 return (rc); 1939 } 1940 1941 /* 1942 * Revoke all the vnodes corresponding to the specified minor number 1943 * range (endpoints inclusive) of the specified major. 1944 */ 1945 void 1946 vdevgone(int maj, int minl, int minh, enum vtype type) 1947 { 1948 vnode_t *vp, **vpp; 1949 dev_t dev; 1950 int mn; 1951 1952 vp = NULL; /* XXX gcc */ 1953 1954 mutex_enter(&device_lock); 1955 for (mn = minl; mn <= minh; mn++) { 1956 dev = makedev(maj, mn); 1957 vpp = &specfs_hash[SPECHASH(dev)]; 1958 for (vp = *vpp; vp != NULL;) { 1959 mutex_enter(&vp->v_interlock); 1960 if ((vp->v_iflag & VI_CLEAN) != 0 || 1961 dev != vp->v_rdev || type != vp->v_type) { 1962 mutex_exit(&vp->v_interlock); 1963 vp = vp->v_specnext; 1964 continue; 1965 } 1966 mutex_exit(&device_lock); 1967 if (vget(vp, LK_INTERLOCK) == 0) { 1968 VOP_REVOKE(vp, REVOKEALL); 1969 vrele(vp); 1970 } 1971 mutex_enter(&device_lock); 1972 vp = *vpp; 1973 } 1974 } 1975 mutex_exit(&device_lock); 1976 } 1977 1978 /* 1979 * Calculate the total number of references to a special device. 1980 */ 1981 int 1982 vcount(vnode_t *vp) 1983 { 1984 int count; 1985 1986 mutex_enter(&device_lock); 1987 mutex_enter(&vp->v_interlock); 1988 if (vp->v_specnode == NULL) { 1989 count = vp->v_usecount - ((vp->v_iflag & VI_INACTPEND) != 0); 1990 mutex_exit(&vp->v_interlock); 1991 mutex_exit(&device_lock); 1992 return (count); 1993 } 1994 mutex_exit(&vp->v_interlock); 1995 count = vp->v_specnode->sn_dev->sd_opencnt; 1996 mutex_exit(&device_lock); 1997 return (count); 1998 } 1999 2000 /* 2001 * Eliminate all activity associated with the requested vnode 2002 * and with all vnodes aliased to the requested vnode. 2003 */ 2004 void 2005 vrevoke(vnode_t *vp) 2006 { 2007 vnode_t *vq, **vpp; 2008 enum vtype type; 2009 dev_t dev; 2010 2011 KASSERT(vp->v_usecount > 0); 2012 2013 mutex_enter(&vp->v_interlock); 2014 if ((vp->v_iflag & VI_CLEAN) != 0) { 2015 mutex_exit(&vp->v_interlock); 2016 return; 2017 } else if (vp->v_type != VBLK && vp->v_type != VCHR) { 2018 atomic_inc_uint(&vp->v_usecount); 2019 vclean(vp, DOCLOSE); 2020 vrelel(vp, 0); 2021 return; 2022 } else { 2023 dev = vp->v_rdev; 2024 type = vp->v_type; 2025 mutex_exit(&vp->v_interlock); 2026 } 2027 2028 vpp = &specfs_hash[SPECHASH(dev)]; 2029 mutex_enter(&device_lock); 2030 for (vq = *vpp; vq != NULL;) { 2031 /* If clean or being cleaned, then ignore it. 
*/ 2032 mutex_enter(&vq->v_interlock); 2033 if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 || 2034 vq->v_rdev != dev || vq->v_type != type) { 2035 mutex_exit(&vq->v_interlock); 2036 vq = vq->v_specnext; 2037 continue; 2038 } 2039 mutex_exit(&device_lock); 2040 if (vq->v_usecount == 0) { 2041 vremfree(vq); 2042 vq->v_usecount = 1; 2043 } else { 2044 atomic_inc_uint(&vq->v_usecount); 2045 } 2046 vclean(vq, DOCLOSE); 2047 vrelel(vq, 0); 2048 mutex_enter(&device_lock); 2049 vq = *vpp; 2050 } 2051 mutex_exit(&device_lock); 2052 } 2053 2054 /* 2055 * sysctl helper routine to return list of supported fstypes 2056 */ 2057 int 2058 sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS) 2059 { 2060 char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)]; 2061 char *where = oldp; 2062 struct vfsops *v; 2063 size_t needed, left, slen; 2064 int error, first; 2065 2066 if (newp != NULL) 2067 return (EPERM); 2068 if (namelen != 0) 2069 return (EINVAL); 2070 2071 first = 1; 2072 error = 0; 2073 needed = 0; 2074 left = *oldlenp; 2075 2076 sysctl_unlock(); 2077 mutex_enter(&vfs_list_lock); 2078 LIST_FOREACH(v, &vfs_list, vfs_list) { 2079 if (where == NULL) 2080 needed += strlen(v->vfs_name) + 1; 2081 else { 2082 memset(bf, 0, sizeof(bf)); 2083 if (first) { 2084 strncpy(bf, v->vfs_name, sizeof(bf)); 2085 first = 0; 2086 } else { 2087 bf[0] = ' '; 2088 strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1); 2089 } 2090 bf[sizeof(bf)-1] = '\0'; 2091 slen = strlen(bf); 2092 if (left < slen + 1) 2093 break; 2094 v->vfs_refcount++; 2095 mutex_exit(&vfs_list_lock); 2096 /* +1 to copy out the trailing NUL byte */ 2097 error = copyout(bf, where, slen + 1); 2098 mutex_enter(&vfs_list_lock); 2099 v->vfs_refcount--; 2100 if (error) 2101 break; 2102 where += slen; 2103 needed += slen; 2104 left -= slen; 2105 } 2106 } 2107 mutex_exit(&vfs_list_lock); 2108 sysctl_relock(); 2109 *oldlenp = needed; 2110 return (error); 2111 } 2112 2113 2114 int kinfo_vdebug = 1; 2115 int kinfo_vgetfailed; 2116 #define KINFO_VNODESLOP 10 2117 /* 2118 * Dump vnode list (via sysctl). 2119 * Copyout address of vnode followed by vnode. 2120 */ 2121 /* ARGSUSED */ 2122 int 2123 sysctl_kern_vnode(SYSCTLFN_ARGS) 2124 { 2125 char *where = oldp; 2126 size_t *sizep = oldlenp; 2127 struct mount *mp, *nmp; 2128 vnode_t *vp, *mvp, vbuf; 2129 char *bp = where, *savebp; 2130 char *ewhere; 2131 int error; 2132 2133 if (namelen != 0) 2134 return (EOPNOTSUPP); 2135 if (newp != NULL) 2136 return (EPERM); 2137 2138 #define VPTRSZ sizeof(vnode_t *) 2139 #define VNODESZ sizeof(vnode_t) 2140 if (where == NULL) { 2141 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ); 2142 return (0); 2143 } 2144 ewhere = where + *sizep; 2145 2146 sysctl_unlock(); 2147 mutex_enter(&mountlist_lock); 2148 for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; 2149 mp = nmp) { 2150 if (vfs_busy(mp, &nmp)) { 2151 continue; 2152 } 2153 savebp = bp; 2154 /* Allocate a marker vnode. */ 2155 mvp = vnalloc(mp); 2156 /* Should never fail for mp != NULL */ 2157 KASSERT(mvp != NULL); 2158 mutex_enter(&mntvnode_lock); 2159 for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) { 2160 vmark(mvp, vp); 2161 /* 2162 * Check that the vp is still associated with 2163 * this filesystem. RACE: could have been 2164 * recycled onto the same filesystem. 
2165 */ 2166 if (vp->v_mount != mp || vismarker(vp)) 2167 continue; 2168 if (bp + VPTRSZ + VNODESZ > ewhere) { 2169 (void)vunmark(mvp); 2170 mutex_exit(&mntvnode_lock); 2171 vnfree(mvp); 2172 sysctl_relock(); 2173 *sizep = bp - where; 2174 return (ENOMEM); 2175 } 2176 memcpy(&vbuf, vp, VNODESZ); 2177 mutex_exit(&mntvnode_lock); 2178 if ((error = copyout(&vp, bp, VPTRSZ)) || 2179 (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) { 2180 mutex_enter(&mntvnode_lock); 2181 (void)vunmark(mvp); 2182 mutex_exit(&mntvnode_lock); 2183 vnfree(mvp); 2184 sysctl_relock(); 2185 return (error); 2186 } 2187 bp += VPTRSZ + VNODESZ; 2188 mutex_enter(&mntvnode_lock); 2189 } 2190 mutex_exit(&mntvnode_lock); 2191 vnfree(mvp); 2192 vfs_unbusy(mp, false, &nmp); 2193 } 2194 mutex_exit(&mountlist_lock); 2195 sysctl_relock(); 2196 2197 *sizep = bp - where; 2198 return (0); 2199 } 2200 2201 /* 2202 * Remove clean vnodes from a mountpoint's vnode list. 2203 */ 2204 void 2205 vfs_scrubvnlist(struct mount *mp) 2206 { 2207 vnode_t *vp, *nvp; 2208 2209 retry: 2210 mutex_enter(&mntvnode_lock); 2211 for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) { 2212 nvp = TAILQ_NEXT(vp, v_mntvnodes); 2213 mutex_enter(&vp->v_interlock); 2214 if ((vp->v_iflag & VI_CLEAN) != 0) { 2215 TAILQ_REMOVE(&mp->mnt_vnodelist, vp, v_mntvnodes); 2216 vp->v_mount = NULL; 2217 mutex_exit(&mntvnode_lock); 2218 mutex_exit(&vp->v_interlock); 2219 vfs_destroy(mp); 2220 goto retry; 2221 } 2222 mutex_exit(&vp->v_interlock); 2223 } 2224 mutex_exit(&mntvnode_lock); 2225 } 2226 2227 /* 2228 * Check to see if a filesystem is mounted on a block device. 2229 */ 2230 int 2231 vfs_mountedon(vnode_t *vp) 2232 { 2233 vnode_t *vq; 2234 int error = 0; 2235 2236 if (vp->v_type != VBLK) 2237 return ENOTBLK; 2238 if (vp->v_specmountpoint != NULL) 2239 return (EBUSY); 2240 mutex_enter(&device_lock); 2241 for (vq = specfs_hash[SPECHASH(vp->v_rdev)]; vq != NULL; 2242 vq = vq->v_specnext) { 2243 if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) 2244 continue; 2245 if (vq->v_specmountpoint != NULL) { 2246 error = EBUSY; 2247 break; 2248 } 2249 } 2250 mutex_exit(&device_lock); 2251 return (error); 2252 } 2253 2254 /* 2255 * Unmount all file systems. 2256 * We traverse the list in reverse order under the assumption that doing so 2257 * will avoid needing to worry about dependencies. 2258 */ 2259 bool 2260 vfs_unmountall(struct lwp *l) 2261 { 2262 printf("unmounting file systems..."); 2263 return vfs_unmountall1(l, true, true); 2264 } 2265 2266 bool 2267 vfs_unmountall1(struct lwp *l, bool force, bool verbose) 2268 { 2269 struct mount *mp, *nmp; 2270 bool any_error, progress; 2271 int error; 2272 2273 for (any_error = false, mp = CIRCLEQ_LAST(&mountlist); 2274 !CIRCLEQ_EMPTY(&mountlist); 2275 mp = nmp) { 2276 nmp = CIRCLEQ_PREV(mp, mnt_list); 2277 #ifdef DEBUG 2278 printf("\nunmounting %s (%s)...", 2279 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname); 2280 #endif 2281 atomic_inc_uint(&mp->mnt_refcnt); 2282 if ((error = dounmount(mp, force ? MNT_FORCE : 0, l)) == 0) 2283 progress = true; 2284 else { 2285 if (verbose) { 2286 printf("unmount of %s failed with error %d\n", 2287 mp->mnt_stat.f_mntonname, error); 2288 } 2289 any_error = true; 2290 } 2291 } 2292 if (verbose) 2293 printf(" done\n"); 2294 if (any_error && verbose) 2295 printf("WARNING: some file systems would not unmount\n"); 2296 return progress; 2297 } 2298 2299 /* 2300 * Sync and unmount file systems before shutting down. 
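 * The sequence below: take user processes off the run queue
 * (suspendsched), issue a sync, wait for the buffers to drain
 * (buf_syncwait), then unmount everything.  If we got here via a
 * panic, the unmount step is skipped to avoid making the situation
 * worse.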
2301 */ 2302 void 2303 vfs_shutdown(void) 2304 { 2305 struct lwp *l; 2306 2307 /* XXX we're certainly not running in lwp0's context! */ 2308 l = (curlwp == NULL) ? &lwp0 : curlwp; 2309 2310 printf("syncing disks... "); 2311 2312 /* remove user processes from run queue */ 2313 suspendsched(); 2314 (void) spl0(); 2315 2316 /* avoid coming back this way again if we panic. */ 2317 doing_shutdown = 1; 2318 2319 sys_sync(l, NULL, NULL); 2320 2321 /* Wait for sync to finish. */ 2322 if (buf_syncwait() != 0) { 2323 #if defined(DDB) && defined(DEBUG_HALT_BUSY) 2324 Debugger(); 2325 #endif 2326 printf("giving up\n"); 2327 return; 2328 } else 2329 printf("done\n"); 2330 2331 /* 2332 * If we've panic'd, don't make the situation potentially 2333 * worse by unmounting the file systems. 2334 */ 2335 if (panicstr != NULL) 2336 return; 2337 2338 /* Release inodes held by texts before update. */ 2339 #ifdef notdef 2340 vnshutdown(); 2341 #endif 2342 /* Unmount file systems. */ 2343 vfs_unmountall(l); 2344 } 2345 2346 /* 2347 * Mount the root file system. If the operator didn't specify a 2348 * file system to use, try all possible file systems until one 2349 * succeeds. 2350 */ 2351 int 2352 vfs_mountroot(void) 2353 { 2354 struct vfsops *v; 2355 int error = ENODEV; 2356 2357 if (root_device == NULL) 2358 panic("vfs_mountroot: root device unknown"); 2359 2360 switch (device_class(root_device)) { 2361 case DV_IFNET: 2362 if (rootdev != NODEV) 2363 panic("vfs_mountroot: rootdev set for DV_IFNET " 2364 "(0x%llx -> %llu,%llu)", 2365 (unsigned long long)rootdev, 2366 (unsigned long long)major(rootdev), 2367 (unsigned long long)minor(rootdev)); 2368 break; 2369 2370 case DV_DISK: 2371 if (rootdev == NODEV) 2372 panic("vfs_mountroot: rootdev not set for DV_DISK"); 2373 if (bdevvp(rootdev, &rootvp)) 2374 panic("vfs_mountroot: can't get vnode for rootdev"); 2375 error = VOP_OPEN(rootvp, FREAD, FSCRED); 2376 if (error) { 2377 printf("vfs_mountroot: can't open root device\n"); 2378 return (error); 2379 } 2380 break; 2381 2382 default: 2383 printf("%s: inappropriate for root file system\n", 2384 device_xname(root_device)); 2385 return (ENODEV); 2386 } 2387 2388 /* 2389 * If user specified a root fs type, use it. Make sure the 2390 * specified type exists and has a mount_root() 2391 */ 2392 if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) { 2393 v = vfs_getopsbyname(rootfstype); 2394 error = EFTYPE; 2395 if (v != NULL) { 2396 if (v->vfs_mountroot != NULL) { 2397 error = (v->vfs_mountroot)(); 2398 } 2399 v->vfs_refcount--; 2400 } 2401 goto done; 2402 } 2403 2404 /* 2405 * Try each file system currently configured into the kernel. 
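 * vfs_list is walked under vfs_list_lock; a reference is taken on
 * each candidate's vfsops and the lock is dropped around its
 * vfs_mountroot() hook, stopping at the first one that succeeds.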
2406 */ 2407 mutex_enter(&vfs_list_lock); 2408 LIST_FOREACH(v, &vfs_list, vfs_list) { 2409 if (v->vfs_mountroot == NULL) 2410 continue; 2411 #ifdef DEBUG 2412 aprint_normal("mountroot: trying %s...\n", v->vfs_name); 2413 #endif 2414 v->vfs_refcount++; 2415 mutex_exit(&vfs_list_lock); 2416 error = (*v->vfs_mountroot)(); 2417 mutex_enter(&vfs_list_lock); 2418 v->vfs_refcount--; 2419 if (!error) { 2420 aprint_normal("root file system type: %s\n", 2421 v->vfs_name); 2422 break; 2423 } 2424 } 2425 mutex_exit(&vfs_list_lock); 2426 2427 if (v == NULL) { 2428 printf("no file system for %s", device_xname(root_device)); 2429 if (device_class(root_device) == DV_DISK) 2430 printf(" (dev 0x%llx)", (unsigned long long)rootdev); 2431 printf("\n"); 2432 error = EFTYPE; 2433 } 2434 2435 done: 2436 if (error && device_class(root_device) == DV_DISK) { 2437 VOP_CLOSE(rootvp, FREAD, FSCRED); 2438 vrele(rootvp); 2439 } 2440 return (error); 2441 } 2442 2443 /* 2444 * Get a new unique fsid 2445 */ 2446 void 2447 vfs_getnewfsid(struct mount *mp) 2448 { 2449 static u_short xxxfs_mntid; 2450 fsid_t tfsid; 2451 int mtype; 2452 2453 mutex_enter(&mntid_lock); 2454 mtype = makefstype(mp->mnt_op->vfs_name); 2455 mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0); 2456 mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype; 2457 mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; 2458 if (xxxfs_mntid == 0) 2459 ++xxxfs_mntid; 2460 tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid); 2461 tfsid.__fsid_val[1] = mtype; 2462 if (!CIRCLEQ_EMPTY(&mountlist)) { 2463 while (vfs_getvfs(&tfsid)) { 2464 tfsid.__fsid_val[0]++; 2465 xxxfs_mntid++; 2466 } 2467 } 2468 mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0]; 2469 mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; 2470 mutex_exit(&mntid_lock); 2471 } 2472 2473 /* 2474 * Make a 'unique' number from a mount type name. 2475 */ 2476 long 2477 makefstype(const char *type) 2478 { 2479 long rv; 2480 2481 for (rv = 0; *type; type++) { 2482 rv <<= 2; 2483 rv ^= *type; 2484 } 2485 return rv; 2486 } 2487 2488 /* 2489 * Set vnode attributes to VNOVAL 2490 */ 2491 void 2492 vattr_null(struct vattr *vap) 2493 { 2494 2495 vap->va_type = VNON; 2496 2497 /* 2498 * Assign individually so that it is safe even if size and 2499 * sign of each member are varied. 2500 */ 2501 vap->va_mode = VNOVAL; 2502 vap->va_nlink = VNOVAL; 2503 vap->va_uid = VNOVAL; 2504 vap->va_gid = VNOVAL; 2505 vap->va_fsid = VNOVAL; 2506 vap->va_fileid = VNOVAL; 2507 vap->va_size = VNOVAL; 2508 vap->va_blocksize = VNOVAL; 2509 vap->va_atime.tv_sec = 2510 vap->va_mtime.tv_sec = 2511 vap->va_ctime.tv_sec = 2512 vap->va_birthtime.tv_sec = VNOVAL; 2513 vap->va_atime.tv_nsec = 2514 vap->va_mtime.tv_nsec = 2515 vap->va_ctime.tv_nsec = 2516 vap->va_birthtime.tv_nsec = VNOVAL; 2517 vap->va_gen = VNOVAL; 2518 vap->va_flags = VNOVAL; 2519 vap->va_rdev = VNOVAL; 2520 vap->va_bytes = VNOVAL; 2521 vap->va_vaflags = 0; 2522 } 2523 2524 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) 2525 #define ARRAY_PRINT(idx, arr) \ 2526 ((unsigned int)(idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN") 2527 2528 const char * const vnode_tags[] = { VNODE_TAGS }; 2529 const char * const vnode_types[] = { VNODE_TYPES }; 2530 const char vnode_flagbits[] = VNODE_FLAGBITS; 2531 2532 /* 2533 * Print out a description of a vnode. 2534 */ 2535 void 2536 vprint(const char *label, struct vnode *vp) 2537 { 2538 struct vnlock *vl; 2539 char bf[96]; 2540 int flag; 2541 2542 vl = (vp->v_vnlock != NULL ? 
vp->v_vnlock : &vp->v_lock); 2543 flag = vp->v_iflag | vp->v_vflag | vp->v_uflag; 2544 snprintb(bf, sizeof(bf), vnode_flagbits, flag); 2545 2546 if (label != NULL) 2547 printf("%s: ", label); 2548 printf("vnode @ %p, flags (%s)\n\ttag %s(%d), type %s(%d), " 2549 "usecount %d, writecount %d, holdcount %d\n" 2550 "\tfreelisthd %p, mount %p, data %p lock %p recursecnt %d\n", 2551 vp, bf, ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag, 2552 ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type, 2553 vp->v_usecount, vp->v_writecount, vp->v_holdcnt, 2554 vp->v_freelisthd, vp->v_mount, vp->v_data, vl, vl->vl_recursecnt); 2555 if (vp->v_data != NULL) { 2556 printf("\t"); 2557 VOP_PRINT(vp); 2558 } 2559 } 2560 2561 #ifdef DEBUG 2562 /* 2563 * List all of the locked vnodes in the system. 2564 * Called when debugging the kernel. 2565 */ 2566 void 2567 printlockedvnodes(void) 2568 { 2569 struct mount *mp, *nmp; 2570 struct vnode *vp; 2571 2572 printf("Locked vnodes\n"); 2573 mutex_enter(&mountlist_lock); 2574 for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; 2575 mp = nmp) { 2576 if (vfs_busy(mp, &nmp)) { 2577 continue; 2578 } 2579 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 2580 if (VOP_ISLOCKED(vp)) 2581 vprint(NULL, vp); 2582 } 2583 mutex_enter(&mountlist_lock); 2584 vfs_unbusy(mp, false, &nmp); 2585 } 2586 mutex_exit(&mountlist_lock); 2587 } 2588 #endif 2589 2590 /* 2591 * Do the usual access checking. 2592 * file_mode, uid and gid are from the vnode in question, 2593 * while acc_mode and cred are from the VOP_ACCESS parameter list 2594 */ 2595 int 2596 vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid, 2597 mode_t acc_mode, kauth_cred_t cred) 2598 { 2599 mode_t mask; 2600 int error, ismember; 2601 2602 /* 2603 * Super-user always gets read/write access, but execute access depends 2604 * on at least one execute bit being set. 2605 */ 2606 if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL) == 0) { 2607 if ((acc_mode & VEXEC) && type != VDIR && 2608 (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0) 2609 return (EACCES); 2610 return (0); 2611 } 2612 2613 mask = 0; 2614 2615 /* Otherwise, check the owner. */ 2616 if (kauth_cred_geteuid(cred) == uid) { 2617 if (acc_mode & VEXEC) 2618 mask |= S_IXUSR; 2619 if (acc_mode & VREAD) 2620 mask |= S_IRUSR; 2621 if (acc_mode & VWRITE) 2622 mask |= S_IWUSR; 2623 return ((file_mode & mask) == mask ? 0 : EACCES); 2624 } 2625 2626 /* Otherwise, check the groups. */ 2627 error = kauth_cred_ismember_gid(cred, gid, &ismember); 2628 if (error) 2629 return (error); 2630 if (kauth_cred_getegid(cred) == gid || ismember) { 2631 if (acc_mode & VEXEC) 2632 mask |= S_IXGRP; 2633 if (acc_mode & VREAD) 2634 mask |= S_IRGRP; 2635 if (acc_mode & VWRITE) 2636 mask |= S_IWGRP; 2637 return ((file_mode & mask) == mask ? 0 : EACCES); 2638 } 2639 2640 /* Otherwise, check everyone else. */ 2641 if (acc_mode & VEXEC) 2642 mask |= S_IXOTH; 2643 if (acc_mode & VREAD) 2644 mask |= S_IROTH; 2645 if (acc_mode & VWRITE) 2646 mask |= S_IWOTH; 2647 return ((file_mode & mask) == mask ? 0 : EACCES); 2648 } 2649 2650 /* 2651 * Given a file system name, look up the vfsops for that 2652 * file system, or return NULL if file system isn't present 2653 * in the kernel. 
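 * A successful lookup takes a reference on the vfsops
 * (vfs_refcount++); the caller is expected to drop it when done,
 * as vfs_mountroot() does above.  A rough usage sketch (not taken
 * from this file):
 *
 *	if ((v = vfs_getopsbyname("ffs")) != NULL) {
 *		if (v->vfs_mountroot != NULL)
 *			error = (*v->vfs_mountroot)();
 *		v->vfs_refcount--;
 *	}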
2654 */ 2655 struct vfsops * 2656 vfs_getopsbyname(const char *name) 2657 { 2658 struct vfsops *v; 2659 2660 mutex_enter(&vfs_list_lock); 2661 LIST_FOREACH(v, &vfs_list, vfs_list) { 2662 if (strcmp(v->vfs_name, name) == 0) 2663 break; 2664 } 2665 if (v != NULL) 2666 v->vfs_refcount++; 2667 mutex_exit(&vfs_list_lock); 2668 2669 return (v); 2670 } 2671 2672 void 2673 copy_statvfs_info(struct statvfs *sbp, const struct mount *mp) 2674 { 2675 const struct statvfs *mbp; 2676 2677 if (sbp == (mbp = &mp->mnt_stat)) 2678 return; 2679 2680 (void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx)); 2681 sbp->f_fsid = mbp->f_fsid; 2682 sbp->f_owner = mbp->f_owner; 2683 sbp->f_flag = mbp->f_flag; 2684 sbp->f_syncwrites = mbp->f_syncwrites; 2685 sbp->f_asyncwrites = mbp->f_asyncwrites; 2686 sbp->f_syncreads = mbp->f_syncreads; 2687 sbp->f_asyncreads = mbp->f_asyncreads; 2688 (void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare)); 2689 (void)memcpy(sbp->f_fstypename, mbp->f_fstypename, 2690 sizeof(sbp->f_fstypename)); 2691 (void)memcpy(sbp->f_mntonname, mbp->f_mntonname, 2692 sizeof(sbp->f_mntonname)); 2693 (void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname, 2694 sizeof(sbp->f_mntfromname)); 2695 sbp->f_namemax = mbp->f_namemax; 2696 } 2697 2698 int 2699 set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom, 2700 const char *vfsname, struct mount *mp, struct lwp *l) 2701 { 2702 int error; 2703 size_t size; 2704 struct statvfs *sfs = &mp->mnt_stat; 2705 int (*fun)(const void *, void *, size_t, size_t *); 2706 2707 (void)strlcpy(mp->mnt_stat.f_fstypename, vfsname, 2708 sizeof(mp->mnt_stat.f_fstypename)); 2709 2710 if (onp) { 2711 struct cwdinfo *cwdi = l->l_proc->p_cwdi; 2712 fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr; 2713 if (cwdi->cwdi_rdir != NULL) { 2714 size_t len; 2715 char *bp; 2716 char *path = PNBUF_GET(); 2717 2718 bp = path + MAXPATHLEN; 2719 *--bp = '\0'; 2720 rw_enter(&cwdi->cwdi_lock, RW_READER); 2721 error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, 2722 path, MAXPATHLEN / 2, 0, l); 2723 rw_exit(&cwdi->cwdi_lock); 2724 if (error) { 2725 PNBUF_PUT(path); 2726 return error; 2727 } 2728 2729 len = strlen(bp); 2730 if (len > sizeof(sfs->f_mntonname) - 1) 2731 len = sizeof(sfs->f_mntonname) - 1; 2732 (void)strncpy(sfs->f_mntonname, bp, len); 2733 PNBUF_PUT(path); 2734 2735 if (len < sizeof(sfs->f_mntonname) - 1) { 2736 error = (*fun)(onp, &sfs->f_mntonname[len], 2737 sizeof(sfs->f_mntonname) - len - 1, &size); 2738 if (error) 2739 return error; 2740 size += len; 2741 } else { 2742 size = len; 2743 } 2744 } else { 2745 error = (*fun)(onp, &sfs->f_mntonname, 2746 sizeof(sfs->f_mntonname) - 1, &size); 2747 if (error) 2748 return error; 2749 } 2750 (void)memset(sfs->f_mntonname + size, 0, 2751 sizeof(sfs->f_mntonname) - size); 2752 } 2753 2754 if (fromp) { 2755 fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr; 2756 error = (*fun)(fromp, sfs->f_mntfromname, 2757 sizeof(sfs->f_mntfromname) - 1, &size); 2758 if (error) 2759 return error; 2760 (void)memset(sfs->f_mntfromname + size, 0, 2761 sizeof(sfs->f_mntfromname) - size); 2762 } 2763 return 0; 2764 } 2765 2766 void 2767 vfs_timestamp(struct timespec *ts) 2768 { 2769 2770 nanotime(ts); 2771 } 2772 2773 time_t rootfstime; /* recorded root fs time, if known */ 2774 void 2775 setrootfstime(time_t t) 2776 { 2777 rootfstime = t; 2778 } 2779 2780 /* 2781 * Sham lock manager for vnodes. This is a temporary measure. 
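 * The LK_* requests map onto the embedded rwlock: LK_SHARED takes
 * vl_lock as reader, LK_EXCLUSIVE as writer (recursing only when
 * LK_CANRECURSE or vl_canrecurse allows and the lock is already
 * write held), and LK_RELEASE drops one level.  A rough usage
 * sketch (not taken from this file):
 *
 *	if (vlockmgr(&vp->v_lock, LK_EXCLUSIVE) == 0) {
 *		... vnode lock held exclusively ...
 *		vlockmgr(&vp->v_lock, LK_RELEASE);
 *	}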
2782 */ 2783 int 2784 vlockmgr(struct vnlock *vl, int flags) 2785 { 2786 2787 KASSERT((flags & ~(LK_CANRECURSE | LK_NOWAIT | LK_TYPE_MASK)) == 0); 2788 2789 switch (flags & LK_TYPE_MASK) { 2790 case LK_SHARED: 2791 if (rw_tryenter(&vl->vl_lock, RW_READER)) { 2792 return 0; 2793 } 2794 if ((flags & LK_NOWAIT) != 0) { 2795 return EBUSY; 2796 } 2797 rw_enter(&vl->vl_lock, RW_READER); 2798 return 0; 2799 2800 case LK_EXCLUSIVE: 2801 if (rw_tryenter(&vl->vl_lock, RW_WRITER)) { 2802 return 0; 2803 } 2804 if ((vl->vl_canrecurse || (flags & LK_CANRECURSE) != 0) && 2805 rw_write_held(&vl->vl_lock)) { 2806 vl->vl_recursecnt++; 2807 return 0; 2808 } 2809 if ((flags & LK_NOWAIT) != 0) { 2810 return EBUSY; 2811 } 2812 rw_enter(&vl->vl_lock, RW_WRITER); 2813 return 0; 2814 2815 case LK_RELEASE: 2816 if (vl->vl_recursecnt != 0) { 2817 KASSERT(rw_write_held(&vl->vl_lock)); 2818 vl->vl_recursecnt--; 2819 return 0; 2820 } 2821 rw_exit(&vl->vl_lock); 2822 return 0; 2823 2824 default: 2825 panic("vlockmgr: flags %x", flags); 2826 } 2827 } 2828 2829 int 2830 vlockstatus(struct vnlock *vl) 2831 { 2832 2833 if (rw_write_held(&vl->vl_lock)) { 2834 return LK_EXCLUSIVE; 2835 } 2836 if (rw_read_held(&vl->vl_lock)) { 2837 return LK_SHARED; 2838 } 2839 return 0; 2840 } 2841 2842 /* 2843 * mount_specific_key_create -- 2844 * Create a key for subsystem mount-specific data. 2845 */ 2846 int 2847 mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor) 2848 { 2849 2850 return (specificdata_key_create(mount_specificdata_domain, keyp, dtor)); 2851 } 2852 2853 /* 2854 * mount_specific_key_delete -- 2855 * Delete a key for subsystem mount-specific data. 2856 */ 2857 void 2858 mount_specific_key_delete(specificdata_key_t key) 2859 { 2860 2861 specificdata_key_delete(mount_specificdata_domain, key); 2862 } 2863 2864 /* 2865 * mount_initspecific -- 2866 * Initialize a mount's specificdata container. 2867 */ 2868 void 2869 mount_initspecific(struct mount *mp) 2870 { 2871 int error; 2872 2873 error = specificdata_init(mount_specificdata_domain, 2874 &mp->mnt_specdataref); 2875 KASSERT(error == 0); 2876 } 2877 2878 /* 2879 * mount_finispecific -- 2880 * Finalize a mount's specificdata container. 2881 */ 2882 void 2883 mount_finispecific(struct mount *mp) 2884 { 2885 2886 specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref); 2887 } 2888 2889 /* 2890 * mount_getspecific -- 2891 * Return mount-specific data corresponding to the specified key. 2892 */ 2893 void * 2894 mount_getspecific(struct mount *mp, specificdata_key_t key) 2895 { 2896 2897 return (specificdata_getspecific(mount_specificdata_domain, 2898 &mp->mnt_specdataref, key)); 2899 } 2900 2901 /* 2902 * mount_setspecific -- 2903 * Set mount-specific data corresponding to the specified key. 
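 *	The usual pattern: a subsystem creates a key once with
 *	mount_specific_key_create(), attaches per-mount state with
 *	mount_setspecific(mp, key, data) and reads it back later with
 *	mount_getspecific(mp, key).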
2904 */ 2905 void 2906 mount_setspecific(struct mount *mp, specificdata_key_t key, void *data) 2907 { 2908 2909 specificdata_setspecific(mount_specificdata_domain, 2910 &mp->mnt_specdataref, key, data); 2911 } 2912 2913 int 2914 VFS_MOUNT(struct mount *mp, const char *a, void *b, size_t *c) 2915 { 2916 int error; 2917 2918 KERNEL_LOCK(1, NULL); 2919 error = (*(mp->mnt_op->vfs_mount))(mp, a, b, c); 2920 KERNEL_UNLOCK_ONE(NULL); 2921 2922 return error; 2923 } 2924 2925 int 2926 VFS_START(struct mount *mp, int a) 2927 { 2928 int error; 2929 2930 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2931 KERNEL_LOCK(1, NULL); 2932 } 2933 error = (*(mp->mnt_op->vfs_start))(mp, a); 2934 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2935 KERNEL_UNLOCK_ONE(NULL); 2936 } 2937 2938 return error; 2939 } 2940 2941 int 2942 VFS_UNMOUNT(struct mount *mp, int a) 2943 { 2944 int error; 2945 2946 KERNEL_LOCK(1, NULL); 2947 error = (*(mp->mnt_op->vfs_unmount))(mp, a); 2948 KERNEL_UNLOCK_ONE(NULL); 2949 2950 return error; 2951 } 2952 2953 int 2954 VFS_ROOT(struct mount *mp, struct vnode **a) 2955 { 2956 int error; 2957 2958 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2959 KERNEL_LOCK(1, NULL); 2960 } 2961 error = (*(mp->mnt_op->vfs_root))(mp, a); 2962 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2963 KERNEL_UNLOCK_ONE(NULL); 2964 } 2965 2966 return error; 2967 } 2968 2969 int 2970 VFS_QUOTACTL(struct mount *mp, int a, uid_t b, void *c) 2971 { 2972 int error; 2973 2974 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2975 KERNEL_LOCK(1, NULL); 2976 } 2977 error = (*(mp->mnt_op->vfs_quotactl))(mp, a, b, c); 2978 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2979 KERNEL_UNLOCK_ONE(NULL); 2980 } 2981 2982 return error; 2983 } 2984 2985 int 2986 VFS_STATVFS(struct mount *mp, struct statvfs *a) 2987 { 2988 int error; 2989 2990 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2991 KERNEL_LOCK(1, NULL); 2992 } 2993 error = (*(mp->mnt_op->vfs_statvfs))(mp, a); 2994 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2995 KERNEL_UNLOCK_ONE(NULL); 2996 } 2997 2998 return error; 2999 } 3000 3001 int 3002 VFS_SYNC(struct mount *mp, int a, struct kauth_cred *b) 3003 { 3004 int error; 3005 3006 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3007 KERNEL_LOCK(1, NULL); 3008 } 3009 error = (*(mp->mnt_op->vfs_sync))(mp, a, b); 3010 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3011 KERNEL_UNLOCK_ONE(NULL); 3012 } 3013 3014 return error; 3015 } 3016 3017 int 3018 VFS_FHTOVP(struct mount *mp, struct fid *a, struct vnode **b) 3019 { 3020 int error; 3021 3022 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3023 KERNEL_LOCK(1, NULL); 3024 } 3025 error = (*(mp->mnt_op->vfs_fhtovp))(mp, a, b); 3026 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3027 KERNEL_UNLOCK_ONE(NULL); 3028 } 3029 3030 return error; 3031 } 3032 3033 int 3034 VFS_VPTOFH(struct vnode *vp, struct fid *a, size_t *b) 3035 { 3036 int error; 3037 3038 if ((vp->v_vflag & VV_MPSAFE) == 0) { 3039 KERNEL_LOCK(1, NULL); 3040 } 3041 error = (*(vp->v_mount->mnt_op->vfs_vptofh))(vp, a, b); 3042 if ((vp->v_vflag & VV_MPSAFE) == 0) { 3043 KERNEL_UNLOCK_ONE(NULL); 3044 } 3045 3046 return error; 3047 } 3048 3049 int 3050 VFS_SNAPSHOT(struct mount *mp, struct vnode *a, struct timespec *b) 3051 { 3052 int error; 3053 3054 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3055 KERNEL_LOCK(1, NULL); 3056 } 3057 error = (*(mp->mnt_op->vfs_snapshot))(mp, a, b); 3058 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3059 KERNEL_UNLOCK_ONE(NULL); 3060 } 3061 3062 return error; 3063 } 3064 3065 int 3066 VFS_EXTATTRCTL(struct mount *mp, int a, struct vnode *b, int c, const 
char *d) 3067 { 3068 int error; 3069 3070 KERNEL_LOCK(1, NULL); /* XXXSMP check ffs */ 3071 error = (*(mp->mnt_op->vfs_extattrctl))(mp, a, b, c, d); 3072 KERNEL_UNLOCK_ONE(NULL); /* XXX */ 3073 3074 return error; 3075 } 3076 3077 int 3078 VFS_SUSPENDCTL(struct mount *mp, int a) 3079 { 3080 int error; 3081 3082 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3083 KERNEL_LOCK(1, NULL); 3084 } 3085 error = (*(mp->mnt_op->vfs_suspendctl))(mp, a); 3086 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3087 KERNEL_UNLOCK_ONE(NULL); 3088 } 3089 3090 return error; 3091 } 3092 3093 #if defined(DDB) || defined(DEBUGPRINT) 3094 static const char buf_flagbits[] = BUF_FLAGBITS; 3095 3096 void 3097 vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...)) 3098 { 3099 char bf[1024]; 3100 3101 (*pr)(" vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" rawblkno 0x%" 3102 PRIx64 " dev 0x%x\n", 3103 bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_rawblkno, bp->b_dev); 3104 3105 snprintb(bf, sizeof(bf), 3106 buf_flagbits, bp->b_flags | bp->b_oflags | bp->b_cflags); 3107 (*pr)(" error %d flags 0x%s\n", bp->b_error, bf); 3108 3109 (*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n", 3110 bp->b_bufsize, bp->b_bcount, bp->b_resid); 3111 (*pr)(" data %p saveaddr %p\n", 3112 bp->b_data, bp->b_saveaddr); 3113 (*pr)(" iodone %p objlock %p\n", bp->b_iodone, bp->b_objlock); 3114 } 3115 3116 3117 void 3118 vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...)) 3119 { 3120 char bf[256]; 3121 3122 uvm_object_printit(&vp->v_uobj, full, pr); 3123 snprintb(bf, sizeof(bf), 3124 vnode_flagbits, vp->v_iflag | vp->v_vflag | vp->v_uflag); 3125 (*pr)("\nVNODE flags %s\n", bf); 3126 (*pr)("mp %p numoutput %d size 0x%llx writesize 0x%llx\n", 3127 vp->v_mount, vp->v_numoutput, vp->v_size, vp->v_writesize); 3128 3129 (*pr)("data %p writecount %ld holdcnt %ld\n", 3130 vp->v_data, vp->v_writecount, vp->v_holdcnt); 3131 3132 (*pr)("tag %s(%d) type %s(%d) mount %p typedata %p\n", 3133 ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag, 3134 ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type, 3135 vp->v_mount, vp->v_mountedhere); 3136 3137 (*pr)("v_lock %p v_vnlock %p\n", &vp->v_lock, vp->v_vnlock); 3138 3139 if (full) { 3140 struct buf *bp; 3141 3142 (*pr)("clean bufs:\n"); 3143 LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) { 3144 (*pr)(" bp %p\n", bp); 3145 vfs_buf_print(bp, full, pr); 3146 } 3147 3148 (*pr)("dirty bufs:\n"); 3149 LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) { 3150 (*pr)(" bp %p\n", bp); 3151 vfs_buf_print(bp, full, pr); 3152 } 3153 } 3154 } 3155 3156 void 3157 vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...)) 3158 { 3159 char sbuf[256]; 3160 3161 (*pr)("vnodecovered = %p syncer = %p data = %p\n", 3162 mp->mnt_vnodecovered,mp->mnt_syncer,mp->mnt_data); 3163 3164 (*pr)("fs_bshift %d dev_bshift = %d\n", 3165 mp->mnt_fs_bshift,mp->mnt_dev_bshift); 3166 3167 snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_flag); 3168 (*pr)("flag = %s\n", sbuf); 3169 3170 snprintb(sbuf, sizeof(sbuf), __IMNT_FLAG_BITS, mp->mnt_iflag); 3171 (*pr)("iflag = %s\n", sbuf); 3172 3173 (*pr)("refcnt = %d unmounting @ %p updating @ %p\n", mp->mnt_refcnt, 3174 &mp->mnt_unmounting, &mp->mnt_updating); 3175 3176 (*pr)("statvfs cache:\n"); 3177 (*pr)("\tbsize = %lu\n",mp->mnt_stat.f_bsize); 3178 (*pr)("\tfrsize = %lu\n",mp->mnt_stat.f_frsize); 3179 (*pr)("\tiosize = %lu\n",mp->mnt_stat.f_iosize); 3180 3181 (*pr)("\tblocks = %"PRIu64"\n",mp->mnt_stat.f_blocks); 3182 (*pr)("\tbfree = %"PRIu64"\n",mp->mnt_stat.f_bfree); 3183 
(*pr)("\tbavail = %"PRIu64"\n",mp->mnt_stat.f_bavail); 3184 (*pr)("\tbresvd = %"PRIu64"\n",mp->mnt_stat.f_bresvd); 3185 3186 (*pr)("\tfiles = %"PRIu64"\n",mp->mnt_stat.f_files); 3187 (*pr)("\tffree = %"PRIu64"\n",mp->mnt_stat.f_ffree); 3188 (*pr)("\tfavail = %"PRIu64"\n",mp->mnt_stat.f_favail); 3189 (*pr)("\tfresvd = %"PRIu64"\n",mp->mnt_stat.f_fresvd); 3190 3191 (*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n", 3192 mp->mnt_stat.f_fsidx.__fsid_val[0], 3193 mp->mnt_stat.f_fsidx.__fsid_val[1]); 3194 3195 (*pr)("\towner = %"PRIu32"\n",mp->mnt_stat.f_owner); 3196 (*pr)("\tnamemax = %lu\n",mp->mnt_stat.f_namemax); 3197 3198 snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_stat.f_flag); 3199 3200 (*pr)("\tflag = %s\n",sbuf); 3201 (*pr)("\tsyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_syncwrites); 3202 (*pr)("\tasyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_asyncwrites); 3203 (*pr)("\tsyncreads = %" PRIu64 "\n",mp->mnt_stat.f_syncreads); 3204 (*pr)("\tasyncreads = %" PRIu64 "\n",mp->mnt_stat.f_asyncreads); 3205 (*pr)("\tfstypename = %s\n",mp->mnt_stat.f_fstypename); 3206 (*pr)("\tmntonname = %s\n",mp->mnt_stat.f_mntonname); 3207 (*pr)("\tmntfromname = %s\n",mp->mnt_stat.f_mntfromname); 3208 3209 { 3210 int cnt = 0; 3211 struct vnode *vp; 3212 (*pr)("locked vnodes ="); 3213 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 3214 if (VOP_ISLOCKED(vp)) { 3215 if ((++cnt % 6) == 0) { 3216 (*pr)(" %p,\n\t", vp); 3217 } else { 3218 (*pr)(" %p,", vp); 3219 } 3220 } 3221 } 3222 (*pr)("\n"); 3223 } 3224 3225 if (full) { 3226 int cnt = 0; 3227 struct vnode *vp; 3228 (*pr)("all vnodes ="); 3229 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 3230 if (!TAILQ_NEXT(vp, v_mntvnodes)) { 3231 (*pr)(" %p", vp); 3232 } else if ((++cnt % 6) == 0) { 3233 (*pr)(" %p,\n\t", vp); 3234 } else { 3235 (*pr)(" %p,", vp); 3236 } 3237 } 3238 (*pr)("\n", vp); 3239 } 3240 } 3241 #endif /* DDB || DEBUGPRINT */ 3242 3243