1 /* $NetBSD: vfs_subr.c,v 1.383 2009/06/26 18:58:14 dyoung Exp $ */ 2 3 /*- 4 * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 /* 34 * Copyright (c) 1989, 1993 35 * The Regents of the University of California. All rights reserved. 36 * (c) UNIX System Laboratories, Inc. 37 * All or some portions of this file are derived from material licensed 38 * to the University of California by American Telephone and Telegraph 39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 40 * the permission of UNIX System Laboratories, Inc. 41 * 42 * Redistribution and use in source and binary forms, with or without 43 * modification, are permitted provided that the following conditions 44 * are met: 45 * 1. Redistributions of source code must retain the above copyright 46 * notice, this list of conditions and the following disclaimer. 47 * 2. Redistributions in binary form must reproduce the above copyright 48 * notice, this list of conditions and the following disclaimer in the 49 * documentation and/or other materials provided with the distribution. 50 * 3. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * SUCH DAMAGE. 65 * 66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 67 */ 68 69 /* 70 * Note on v_usecount and locking: 71 * 72 * At nearly all points it is known that v_usecount could be zero, the 73 * vnode interlock will be held. 74 * 75 * To change v_usecount away from zero, the interlock must be held. To 76 * change from a non-zero value to zero, again the interlock must be 77 * held. 78 * 79 * There's a flag bit, VC_XLOCK, embedded in v_usecount. 80 * To raise v_usecount, if the VC_XLOCK bit is set in it, the interlock 81 * must be held. 82 * To modify the VC_XLOCK bit, the interlock must be held. 83 * We always keep the usecount (v_usecount & VC_MASK) non-zero while the 84 * VC_XLOCK bit is set. 85 * 86 * Unless the VC_XLOCK bit is set, changing the usecount from a non-zero 87 * value to a non-zero value can safely be done using atomic operations, 88 * without the interlock held. 89 * Even if the VC_XLOCK bit is set, decreasing the usecount to a non-zero 90 * value can be done using atomic operations, without the interlock held. 91 */ 92 93 #include <sys/cdefs.h> 94 __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.383 2009/06/26 18:58:14 dyoung Exp $"); 95 96 #include "opt_ddb.h" 97 #include "opt_compat_netbsd.h" 98 #include "opt_compat_43.h" 99 100 #include <sys/param.h> 101 #include <sys/systm.h> 102 #include <sys/conf.h> 103 #include <sys/proc.h> 104 #include <sys/kernel.h> 105 #include <sys/mount.h> 106 #include <sys/fcntl.h> 107 #include <sys/vnode.h> 108 #include <sys/stat.h> 109 #include <sys/namei.h> 110 #include <sys/ucred.h> 111 #include <sys/buf.h> 112 #include <sys/errno.h> 113 #include <sys/kmem.h> 114 #include <sys/syscallargs.h> 115 #include <sys/device.h> 116 #include <sys/filedesc.h> 117 #include <sys/kauth.h> 118 #include <sys/atomic.h> 119 #include <sys/kthread.h> 120 #include <sys/wapbl.h> 121 122 #include <miscfs/genfs/genfs.h> 123 #include <miscfs/specfs/specdev.h> 124 #include <miscfs/syncfs/syncfs.h> 125 126 #include <uvm/uvm.h> 127 #include <uvm/uvm_readahead.h> 128 #include <uvm/uvm_ddb.h> 129 130 #include <sys/sysctl.h> 131 132 const enum vtype iftovt_tab[16] = { 133 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 134 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, 135 }; 136 const int vttoif_tab[9] = { 137 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 138 S_IFSOCK, S_IFIFO, S_IFMT, 139 }; 140 141 /* 142 * Insq/Remq for the vnode usage lists. 
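 *
 * Before the buffer-list macros below, a minimal sketch (not part of
 * the original file) of the lockless reference rule described in the
 * v_usecount note above: a reference may be added without the
 * interlock only while the count is already non-zero and the VC_XLOCK
 * bit is clear.  vtryget(), later in this file, is the real
 * implementation of this pattern.
 */
#if 0	/* Illustrative sketch only; vp is some vnode_t * already known. */
	u_int use, next;

	for (use = vp->v_usecount;; use = next) {
		if (use == 0 || (use & VC_XLOCK) != 0) {
			/* First reference, or VC_XLOCK set: take v_interlock. */
			break;
		}
		next = atomic_cas_uint(&vp->v_usecount, use, use + 1);
		if (next == use) {
			/* Gained a reference without holding the interlock. */
			break;
		}
	}
#endif
/*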
143 */ 144 #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) 145 #define bufremvn(bp) { \ 146 LIST_REMOVE(bp, b_vnbufs); \ 147 (bp)->b_vnbufs.le_next = NOLIST; \ 148 } 149 150 int doforce = 1; /* 1 => permit forcible unmounting */ 151 int prtactive = 0; /* 1 => print out reclaim of active vnodes */ 152 153 static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list); 154 static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list); 155 static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list); 156 157 struct mntlist mountlist = /* mounted filesystem list */ 158 CIRCLEQ_HEAD_INITIALIZER(mountlist); 159 160 u_int numvnodes; 161 static specificdata_domain_t mount_specificdata_domain; 162 163 static int vrele_pending; 164 static int vrele_gen; 165 static kmutex_t vrele_lock; 166 static kcondvar_t vrele_cv; 167 static lwp_t *vrele_lwp; 168 169 static uint64_t mountgen = 0; 170 static kmutex_t mountgen_lock; 171 172 kmutex_t mountlist_lock; 173 kmutex_t mntid_lock; 174 kmutex_t mntvnode_lock; 175 kmutex_t vnode_free_list_lock; 176 kmutex_t vfs_list_lock; 177 178 static pool_cache_t vnode_cache; 179 180 /* 181 * These define the root filesystem and device. 182 */ 183 struct vnode *rootvnode; 184 struct device *root_device; /* root device */ 185 186 /* 187 * Local declarations. 188 */ 189 190 static void vrele_thread(void *); 191 static void insmntque(vnode_t *, struct mount *); 192 static int getdevvp(dev_t, vnode_t **, enum vtype); 193 static vnode_t *getcleanvnode(void); 194 void vpanic(vnode_t *, const char *); 195 static void vfs_shutdown1(struct lwp *); 196 197 #ifdef DEBUG 198 void printlockedvnodes(void); 199 #endif 200 201 #ifdef DIAGNOSTIC 202 void 203 vpanic(vnode_t *vp, const char *msg) 204 { 205 206 vprint(NULL, vp); 207 panic("%s\n", msg); 208 } 209 #else 210 #define vpanic(vp, msg) /* nothing */ 211 #endif 212 213 void 214 vn_init1(void) 215 { 216 217 vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl", 218 NULL, IPL_NONE, NULL, NULL, NULL); 219 KASSERT(vnode_cache != NULL); 220 221 /* Create deferred release thread. */ 222 mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE); 223 cv_init(&vrele_cv, "vrele"); 224 if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread, 225 NULL, &vrele_lwp, "vrele")) 226 panic("fork vrele"); 227 } 228 229 /* 230 * Initialize the vnode management data structures. 231 */ 232 void 233 vntblinit(void) 234 { 235 236 mutex_init(&mountgen_lock, MUTEX_DEFAULT, IPL_NONE); 237 mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE); 238 mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE); 239 mutex_init(&mntvnode_lock, MUTEX_DEFAULT, IPL_NONE); 240 mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE); 241 mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE); 242 243 mount_specificdata_domain = specificdata_domain_create(); 244 245 /* Initialize the filesystem syncer. */ 246 vn_initialize_syncerd(); 247 vn_init1(); 248 } 249 250 int 251 vfs_drainvnodes(long target, struct lwp *l) 252 { 253 254 while (numvnodes > target) { 255 vnode_t *vp; 256 257 mutex_enter(&vnode_free_list_lock); 258 vp = getcleanvnode(); 259 if (vp == NULL) 260 return EBUSY; /* give up */ 261 ungetnewvnode(vp); 262 } 263 264 return 0; 265 } 266 267 /* 268 * Lookup a mount point by filesystem identifier. 269 * 270 * XXX Needs to add a reference to the mount point. 
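 *
 * A hypothetical caller of vfs_getvfs() (below) might look a mount up
 * by the fsid reported in struct statvfs, as sketched here.  This is
 * illustrative only; per the XXX above no reference is taken, so the
 * caller pins the mount with vfs_busy() before using it.
 */
#if 0	/* Illustrative sketch only; fsid would come from e.g. statvfs. */
	struct mount *mp;

	if ((mp = vfs_getvfs(&fsid)) == NULL)
		return ESRCH;
	if (vfs_busy(mp, NULL) != 0)
		return EBUSY;
	/* ... operate on the file system ... */
	vfs_unbusy(mp, false, NULL);
#endif
/*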
271 */ 272 struct mount * 273 vfs_getvfs(fsid_t *fsid) 274 { 275 struct mount *mp; 276 277 mutex_enter(&mountlist_lock); 278 CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) { 279 if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] && 280 mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) { 281 mutex_exit(&mountlist_lock); 282 return (mp); 283 } 284 } 285 mutex_exit(&mountlist_lock); 286 return ((struct mount *)0); 287 } 288 289 /* 290 * Drop a reference to a mount structure, freeing if the last reference. 291 */ 292 void 293 vfs_destroy(struct mount *mp) 294 { 295 296 if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) { 297 return; 298 } 299 300 /* 301 * Nothing else has visibility of the mount: we can now 302 * free the data structures. 303 */ 304 KASSERT(mp->mnt_refcnt == 0); 305 specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref); 306 rw_destroy(&mp->mnt_unmounting); 307 mutex_destroy(&mp->mnt_updating); 308 mutex_destroy(&mp->mnt_renamelock); 309 if (mp->mnt_op != NULL) { 310 vfs_delref(mp->mnt_op); 311 } 312 kmem_free(mp, sizeof(*mp)); 313 } 314 315 /* 316 * grab a vnode from freelist and clean it. 317 */ 318 vnode_t * 319 getcleanvnode(void) 320 { 321 vnode_t *vp; 322 vnodelst_t *listhd; 323 324 KASSERT(mutex_owned(&vnode_free_list_lock)); 325 326 retry: 327 listhd = &vnode_free_list; 328 try_nextlist: 329 TAILQ_FOREACH(vp, listhd, v_freelist) { 330 /* 331 * It's safe to test v_usecount and v_iflag 332 * without holding the interlock here, since 333 * these vnodes should never appear on the 334 * lists. 335 */ 336 if (vp->v_usecount != 0) { 337 vpanic(vp, "free vnode isn't"); 338 } 339 if ((vp->v_iflag & VI_CLEAN) != 0) { 340 vpanic(vp, "clean vnode on freelist"); 341 } 342 if (vp->v_freelisthd != listhd) { 343 printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd); 344 vpanic(vp, "list head mismatch"); 345 } 346 if (!mutex_tryenter(&vp->v_interlock)) 347 continue; 348 /* 349 * Our lwp might hold the underlying vnode 350 * locked, so don't try to reclaim a VI_LAYER 351 * node if it's locked. 352 */ 353 if ((vp->v_iflag & VI_XLOCK) == 0 && 354 ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) { 355 break; 356 } 357 mutex_exit(&vp->v_interlock); 358 } 359 360 if (vp == NULL) { 361 if (listhd == &vnode_free_list) { 362 listhd = &vnode_hold_list; 363 goto try_nextlist; 364 } 365 mutex_exit(&vnode_free_list_lock); 366 return NULL; 367 } 368 369 /* Remove it from the freelist. */ 370 TAILQ_REMOVE(listhd, vp, v_freelist); 371 vp->v_freelisthd = NULL; 372 mutex_exit(&vnode_free_list_lock); 373 374 /* 375 * The vnode is still associated with a file system, so we must 376 * clean it out before reusing it. We need to add a reference 377 * before doing this. If the vnode gains another reference while 378 * being cleaned out then we lose - retry. 379 */ 380 atomic_add_int(&vp->v_usecount, 1 + VC_XLOCK); 381 vclean(vp, DOCLOSE); 382 KASSERT(vp->v_usecount >= 1 + VC_XLOCK); 383 atomic_add_int(&vp->v_usecount, -VC_XLOCK); 384 if (vp->v_usecount == 1) { 385 /* We're about to dirty it. */ 386 vp->v_iflag &= ~VI_CLEAN; 387 mutex_exit(&vp->v_interlock); 388 if (vp->v_type == VBLK || vp->v_type == VCHR) { 389 spec_node_destroy(vp); 390 } 391 vp->v_type = VNON; 392 } else { 393 /* 394 * Don't return to freelist - the holder of the last 395 * reference will destroy it. 
396 */ 397 vrelel(vp, 0); /* releases vp->v_interlock */ 398 mutex_enter(&vnode_free_list_lock); 399 goto retry; 400 } 401 402 if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 || 403 !TAILQ_EMPTY(&vp->v_uobj.memq)) { 404 vpanic(vp, "cleaned vnode isn't"); 405 } 406 if (vp->v_numoutput != 0) { 407 vpanic(vp, "clean vnode has pending I/O's"); 408 } 409 if ((vp->v_iflag & VI_ONWORKLST) != 0) { 410 vpanic(vp, "clean vnode on syncer list"); 411 } 412 413 return vp; 414 } 415 416 /* 417 * Mark a mount point as busy, and gain a new reference to it. Used to 418 * prevent the file system from being unmounted during critical sections. 419 * 420 * => The caller must hold a pre-existing reference to the mount. 421 * => Will fail if the file system is being unmounted, or is unmounted. 422 */ 423 int 424 vfs_busy(struct mount *mp, struct mount **nextp) 425 { 426 427 KASSERT(mp->mnt_refcnt > 0); 428 429 if (__predict_false(!rw_tryenter(&mp->mnt_unmounting, RW_READER))) { 430 if (nextp != NULL) { 431 KASSERT(mutex_owned(&mountlist_lock)); 432 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 433 } 434 return EBUSY; 435 } 436 if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) { 437 rw_exit(&mp->mnt_unmounting); 438 if (nextp != NULL) { 439 KASSERT(mutex_owned(&mountlist_lock)); 440 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 441 } 442 return ENOENT; 443 } 444 if (nextp != NULL) { 445 mutex_exit(&mountlist_lock); 446 } 447 atomic_inc_uint(&mp->mnt_refcnt); 448 return 0; 449 } 450 451 /* 452 * Unbusy a busy filesystem. 453 * 454 * => If keepref is true, preserve reference added by vfs_busy(). 455 * => If nextp != NULL, acquire mountlist_lock. 456 */ 457 void 458 vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp) 459 { 460 461 KASSERT(mp->mnt_refcnt > 0); 462 463 if (nextp != NULL) { 464 mutex_enter(&mountlist_lock); 465 } 466 rw_exit(&mp->mnt_unmounting); 467 if (!keepref) { 468 vfs_destroy(mp); 469 } 470 if (nextp != NULL) { 471 KASSERT(mutex_owned(&mountlist_lock)); 472 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 473 } 474 } 475 476 struct mount * 477 vfs_mountalloc(struct vfsops *vfsops, struct vnode *vp) 478 { 479 int error; 480 struct mount *mp; 481 482 mp = kmem_zalloc(sizeof(*mp), KM_SLEEP); 483 if (mp == NULL) 484 return NULL; 485 486 mp->mnt_op = vfsops; 487 mp->mnt_refcnt = 1; 488 TAILQ_INIT(&mp->mnt_vnodelist); 489 rw_init(&mp->mnt_unmounting); 490 mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE); 491 mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE); 492 error = vfs_busy(mp, NULL); 493 KASSERT(error == 0); 494 mp->mnt_vnodecovered = vp; 495 mount_initspecific(mp); 496 497 mutex_enter(&mountgen_lock); 498 mp->mnt_gen = mountgen++; 499 mutex_exit(&mountgen_lock); 500 501 return mp; 502 } 503 504 /* 505 * Lookup a filesystem type, and if found allocate and initialize 506 * a mount structure for it. 507 * 508 * Devname is usually updated by mount(8) after booting. 
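 *
 * Before vfs_rootmountalloc() below, an illustration (not part of the
 * original file) of how vfs_busy() and vfs_unbusy() above are normally
 * paired when walking the mount list; sysctl_kern_vnode(), later in
 * this file, uses exactly this pattern.
 */
#if 0	/* Illustrative sketch only. */
	struct mount *mp, *nmp;

	mutex_enter(&mountlist_lock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	    mp = nmp) {
		if (vfs_busy(mp, &nmp)) {
			/* Unmounting: nmp already advanced, lock still held. */
			continue;
		}
		/* ... work on mp; mountlist_lock is not held here ... */
		vfs_unbusy(mp, false, &nmp);
	}
	mutex_exit(&mountlist_lock);
#endif
/*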
509 */ 510 int 511 vfs_rootmountalloc(const char *fstypename, const char *devname, 512 struct mount **mpp) 513 { 514 struct vfsops *vfsp = NULL; 515 struct mount *mp; 516 517 mutex_enter(&vfs_list_lock); 518 LIST_FOREACH(vfsp, &vfs_list, vfs_list) 519 if (!strncmp(vfsp->vfs_name, fstypename, 520 sizeof(mp->mnt_stat.f_fstypename))) 521 break; 522 if (vfsp == NULL) { 523 mutex_exit(&vfs_list_lock); 524 return (ENODEV); 525 } 526 vfsp->vfs_refcount++; 527 mutex_exit(&vfs_list_lock); 528 529 if ((mp = vfs_mountalloc(vfsp, NULL)) == NULL) 530 return ENOMEM; 531 mp->mnt_flag = MNT_RDONLY; 532 (void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, 533 sizeof(mp->mnt_stat.f_fstypename)); 534 mp->mnt_stat.f_mntonname[0] = '/'; 535 mp->mnt_stat.f_mntonname[1] = '\0'; 536 mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] = 537 '\0'; 538 (void)copystr(devname, mp->mnt_stat.f_mntfromname, 539 sizeof(mp->mnt_stat.f_mntfromname) - 1, 0); 540 *mpp = mp; 541 return (0); 542 } 543 544 /* 545 * Routines having to do with the management of the vnode table. 546 */ 547 extern int (**dead_vnodeop_p)(void *); 548 549 /* 550 * Return the next vnode from the free list. 551 */ 552 int 553 getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *), 554 vnode_t **vpp) 555 { 556 struct uvm_object *uobj; 557 static int toggle; 558 vnode_t *vp; 559 int error = 0, tryalloc; 560 561 try_again: 562 if (mp != NULL) { 563 /* 564 * Mark filesystem busy while we're creating a 565 * vnode. If unmount is in progress, this will 566 * fail. 567 */ 568 error = vfs_busy(mp, NULL); 569 if (error) 570 return error; 571 } 572 573 /* 574 * We must choose whether to allocate a new vnode or recycle an 575 * existing one. The criterion for allocating a new one is that 576 * the total number of vnodes is less than the number desired or 577 * there are no vnodes on either free list. Generally we only 578 * want to recycle vnodes that have no buffers associated with 579 * them, so we look first on the vnode_free_list. If it is empty, 580 * we next consider vnodes with referencing buffers on the 581 * vnode_hold_list. The toggle ensures that half the time we 582 * will use a buffer from the vnode_hold_list, and half the time 583 * we will allocate a new one unless the list has grown to twice 584 * the desired size. We are reticent to recycle vnodes from the 585 * vnode_hold_list because we will lose the identity of all its 586 * referencing buffers. 
587 */ 588 589 vp = NULL; 590 591 mutex_enter(&vnode_free_list_lock); 592 593 toggle ^= 1; 594 if (numvnodes > 2 * desiredvnodes) 595 toggle = 0; 596 597 tryalloc = numvnodes < desiredvnodes || 598 (TAILQ_FIRST(&vnode_free_list) == NULL && 599 (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle)); 600 601 if (tryalloc) { 602 numvnodes++; 603 mutex_exit(&vnode_free_list_lock); 604 if ((vp = vnalloc(NULL)) == NULL) { 605 mutex_enter(&vnode_free_list_lock); 606 numvnodes--; 607 } else 608 vp->v_usecount = 1; 609 } 610 611 if (vp == NULL) { 612 vp = getcleanvnode(); 613 if (vp == NULL) { 614 if (mp != NULL) { 615 vfs_unbusy(mp, false, NULL); 616 } 617 if (tryalloc) { 618 printf("WARNING: unable to allocate new " 619 "vnode, retrying...\n"); 620 kpause("newvn", false, hz, NULL); 621 goto try_again; 622 } 623 tablefull("vnode", "increase kern.maxvnodes or NVNODE"); 624 *vpp = 0; 625 return (ENFILE); 626 } 627 vp->v_iflag = 0; 628 vp->v_vflag = 0; 629 vp->v_uflag = 0; 630 vp->v_socket = NULL; 631 } 632 633 KASSERT(vp->v_usecount == 1); 634 KASSERT(vp->v_freelisthd == NULL); 635 KASSERT(LIST_EMPTY(&vp->v_nclist)); 636 KASSERT(LIST_EMPTY(&vp->v_dnclist)); 637 638 vp->v_type = VNON; 639 vp->v_vnlock = &vp->v_lock; 640 vp->v_tag = tag; 641 vp->v_op = vops; 642 insmntque(vp, mp); 643 *vpp = vp; 644 vp->v_data = 0; 645 646 /* 647 * initialize uvm_object within vnode. 648 */ 649 650 uobj = &vp->v_uobj; 651 KASSERT(uobj->pgops == &uvm_vnodeops); 652 KASSERT(uobj->uo_npages == 0); 653 KASSERT(TAILQ_FIRST(&uobj->memq) == NULL); 654 vp->v_size = vp->v_writesize = VSIZENOTSET; 655 656 if (mp != NULL) { 657 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 658 vp->v_vflag |= VV_MPSAFE; 659 vfs_unbusy(mp, true, NULL); 660 } 661 662 return (0); 663 } 664 665 /* 666 * This is really just the reverse of getnewvnode(). Needed for 667 * VFS_VGET functions who may need to push back a vnode in case 668 * of a locking race. 669 */ 670 void 671 ungetnewvnode(vnode_t *vp) 672 { 673 674 KASSERT(vp->v_usecount == 1); 675 KASSERT(vp->v_data == NULL); 676 KASSERT(vp->v_freelisthd == NULL); 677 678 mutex_enter(&vp->v_interlock); 679 vp->v_iflag |= VI_CLEAN; 680 vrelel(vp, 0); 681 } 682 683 /* 684 * Allocate a new, uninitialized vnode. If 'mp' is non-NULL, this is a 685 * marker vnode and we are prepared to wait for the allocation. 686 */ 687 vnode_t * 688 vnalloc(struct mount *mp) 689 { 690 vnode_t *vp; 691 692 vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT)); 693 if (vp == NULL) { 694 return NULL; 695 } 696 697 memset(vp, 0, sizeof(*vp)); 698 UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0); 699 cv_init(&vp->v_cv, "vnode"); 700 /* 701 * done by memset() above. 702 * LIST_INIT(&vp->v_nclist); 703 * LIST_INIT(&vp->v_dnclist); 704 */ 705 706 if (mp != NULL) { 707 vp->v_mount = mp; 708 vp->v_type = VBAD; 709 vp->v_iflag = VI_MARKER; 710 } else { 711 rw_init(&vp->v_lock.vl_lock); 712 } 713 714 return vp; 715 } 716 717 /* 718 * Free an unused, unreferenced vnode. 719 */ 720 void 721 vnfree(vnode_t *vp) 722 { 723 724 KASSERT(vp->v_usecount == 0); 725 726 if ((vp->v_iflag & VI_MARKER) == 0) { 727 rw_destroy(&vp->v_lock.vl_lock); 728 mutex_enter(&vnode_free_list_lock); 729 numvnodes--; 730 mutex_exit(&vnode_free_list_lock); 731 } 732 733 UVM_OBJ_DESTROY(&vp->v_uobj); 734 cv_destroy(&vp->v_cv); 735 pool_cache_put(vnode_cache, vp); 736 } 737 738 /* 739 * Remove a vnode from its freelist. 
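 *
 * Before vremfree() below, a sketch (not part of the original file) of
 * how a file system's VFS_VGET-style routine typically uses
 * getnewvnode() and backs out with ungetnewvnode() when it loses an
 * allocation race.  The myfs_* identifiers and the "raced" test are
 * hypothetical placeholders.
 */
#if 0	/* Illustrative sketch only. */
	vnode_t *vp;
	int error;

	error = getnewvnode(VT_NON, mp, myfs_vnodeop_p, &vp);
	if (error != 0)
		return error;
	/* Look up or create the fs-private node; another thread may win. */
	if (raced) {
		ungetnewvnode(vp);	/* push the unused vnode back */
		goto retry;
	}
	vp->v_type = VREG;
	vp->v_data = myfs_node;		/* hypothetical private data */
	*vpp = vp;
	return 0;
#endif
/*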
740 */ 741 static inline void 742 vremfree(vnode_t *vp) 743 { 744 745 KASSERT(mutex_owned(&vp->v_interlock)); 746 KASSERT(vp->v_usecount == 0); 747 748 /* 749 * Note that the reference count must not change until 750 * the vnode is removed. 751 */ 752 mutex_enter(&vnode_free_list_lock); 753 if (vp->v_holdcnt > 0) { 754 KASSERT(vp->v_freelisthd == &vnode_hold_list); 755 } else { 756 KASSERT(vp->v_freelisthd == &vnode_free_list); 757 } 758 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 759 vp->v_freelisthd = NULL; 760 mutex_exit(&vnode_free_list_lock); 761 } 762 763 /* 764 * Move a vnode from one mount queue to another. 765 */ 766 static void 767 insmntque(vnode_t *vp, struct mount *mp) 768 { 769 struct mount *omp; 770 771 #ifdef DIAGNOSTIC 772 if ((mp != NULL) && 773 (mp->mnt_iflag & IMNT_UNMOUNT) && 774 vp->v_tag != VT_VFS) { 775 panic("insmntque into dying filesystem"); 776 } 777 #endif 778 779 mutex_enter(&mntvnode_lock); 780 /* 781 * Delete from old mount point vnode list, if on one. 782 */ 783 if ((omp = vp->v_mount) != NULL) 784 TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes); 785 /* 786 * Insert into list of vnodes for the new mount point, if 787 * available. The caller must take a reference on the mount 788 * structure and donate to the vnode. 789 */ 790 if ((vp->v_mount = mp) != NULL) 791 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes); 792 mutex_exit(&mntvnode_lock); 793 794 if (omp != NULL) { 795 /* Release reference to old mount. */ 796 vfs_destroy(omp); 797 } 798 } 799 800 /* 801 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or 802 * recycled. 803 */ 804 void 805 vwait(vnode_t *vp, int flags) 806 { 807 808 KASSERT(mutex_owned(&vp->v_interlock)); 809 KASSERT(vp->v_usecount != 0); 810 811 while ((vp->v_iflag & flags) != 0) 812 cv_wait(&vp->v_cv, &vp->v_interlock); 813 } 814 815 /* 816 * Insert a marker vnode into a mount's vnode list, after the 817 * specified vnode. mntvnode_lock must be held. 818 */ 819 void 820 vmark(vnode_t *mvp, vnode_t *vp) 821 { 822 struct mount *mp; 823 824 mp = mvp->v_mount; 825 826 KASSERT(mutex_owned(&mntvnode_lock)); 827 KASSERT((mvp->v_iflag & VI_MARKER) != 0); 828 KASSERT(vp->v_mount == mp); 829 830 TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vp, mvp, v_mntvnodes); 831 } 832 833 /* 834 * Remove a marker vnode from a mount's vnode list, and return 835 * a pointer to the next vnode in the list. mntvnode_lock must 836 * be held. 837 */ 838 vnode_t * 839 vunmark(vnode_t *mvp) 840 { 841 vnode_t *vp; 842 struct mount *mp; 843 844 mp = mvp->v_mount; 845 846 KASSERT(mutex_owned(&mntvnode_lock)); 847 KASSERT((mvp->v_iflag & VI_MARKER) != 0); 848 849 vp = TAILQ_NEXT(mvp, v_mntvnodes); 850 TAILQ_REMOVE(&mp->mnt_vnodelist, mvp, v_mntvnodes); 851 852 KASSERT(vp == NULL || vp->v_mount == mp); 853 854 return vp; 855 } 856 857 /* 858 * Update outstanding I/O count and do wakeup if requested. 859 */ 860 void 861 vwakeup(struct buf *bp) 862 { 863 struct vnode *vp; 864 865 if ((vp = bp->b_vp) == NULL) 866 return; 867 868 KASSERT(bp->b_objlock == &vp->v_interlock); 869 KASSERT(mutex_owned(bp->b_objlock)); 870 871 if (--vp->v_numoutput < 0) 872 panic("vwakeup: neg numoutput, vp %p", vp); 873 if (vp->v_numoutput == 0) 874 cv_broadcast(&vp->v_cv); 875 } 876 877 /* 878 * Flush out and invalidate all buffers associated with a vnode. 879 * Called with the underlying vnode locked, which should prevent new dirty 880 * buffers from being queued. 
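 *
 * Before vinvalbuf() below, a sketch (not part of the original file)
 * of the marker-vnode walk provided by vmark() and vunmark() above;
 * vflush() and sysctl_kern_vnode(), later in this file, iterate a
 * mount's vnode list this way so that mntvnode_lock can be dropped
 * without losing the iteration position.
 */
#if 0	/* Illustrative sketch only. */
	vnode_t *vp, *mvp;

	mvp = vnalloc(mp);	/* marker; does not fail for mp != NULL */
	mutex_enter(&mntvnode_lock);
	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
	    vp = vunmark(mvp)) {
		vmark(mvp, vp);
		if (vp->v_mount != mp || vismarker(vp))
			continue;
		/* ... examine vp; mntvnode_lock may be dropped here ... */
	}
	mutex_exit(&mntvnode_lock);
	vnfree(mvp);
#endif
/*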
881 */ 882 int 883 vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l, 884 bool catch, int slptimeo) 885 { 886 struct buf *bp, *nbp; 887 int error; 888 int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO | 889 (flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0); 890 891 /* XXXUBC this doesn't look at flags or slp* */ 892 mutex_enter(&vp->v_interlock); 893 error = VOP_PUTPAGES(vp, 0, 0, flushflags); 894 if (error) { 895 return error; 896 } 897 898 if (flags & V_SAVE) { 899 error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0); 900 if (error) 901 return (error); 902 KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd)); 903 } 904 905 mutex_enter(&bufcache_lock); 906 restart: 907 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 908 nbp = LIST_NEXT(bp, b_vnbufs); 909 error = bbusy(bp, catch, slptimeo, NULL); 910 if (error != 0) { 911 if (error == EPASSTHROUGH) 912 goto restart; 913 mutex_exit(&bufcache_lock); 914 return (error); 915 } 916 brelsel(bp, BC_INVAL | BC_VFLUSH); 917 } 918 919 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 920 nbp = LIST_NEXT(bp, b_vnbufs); 921 error = bbusy(bp, catch, slptimeo, NULL); 922 if (error != 0) { 923 if (error == EPASSTHROUGH) 924 goto restart; 925 mutex_exit(&bufcache_lock); 926 return (error); 927 } 928 /* 929 * XXX Since there are no node locks for NFS, I believe 930 * there is a slight chance that a delayed write will 931 * occur while sleeping just above, so check for it. 932 */ 933 if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) { 934 #ifdef DEBUG 935 printf("buffer still DELWRI\n"); 936 #endif 937 bp->b_cflags |= BC_BUSY | BC_VFLUSH; 938 mutex_exit(&bufcache_lock); 939 VOP_BWRITE(bp); 940 mutex_enter(&bufcache_lock); 941 goto restart; 942 } 943 brelsel(bp, BC_INVAL | BC_VFLUSH); 944 } 945 946 #ifdef DIAGNOSTIC 947 if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd)) 948 panic("vinvalbuf: flush failed, vp %p", vp); 949 #endif 950 951 mutex_exit(&bufcache_lock); 952 953 return (0); 954 } 955 956 /* 957 * Destroy any in core blocks past the truncation length. 958 * Called with the underlying vnode locked, which should prevent new dirty 959 * buffers from being queued. 960 */ 961 int 962 vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch, int slptimeo) 963 { 964 struct buf *bp, *nbp; 965 int error; 966 voff_t off; 967 968 off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift); 969 mutex_enter(&vp->v_interlock); 970 error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO); 971 if (error) { 972 return error; 973 } 974 975 mutex_enter(&bufcache_lock); 976 restart: 977 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 978 nbp = LIST_NEXT(bp, b_vnbufs); 979 if (bp->b_lblkno < lbn) 980 continue; 981 error = bbusy(bp, catch, slptimeo, NULL); 982 if (error != 0) { 983 if (error == EPASSTHROUGH) 984 goto restart; 985 mutex_exit(&bufcache_lock); 986 return (error); 987 } 988 brelsel(bp, BC_INVAL | BC_VFLUSH); 989 } 990 991 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 992 nbp = LIST_NEXT(bp, b_vnbufs); 993 if (bp->b_lblkno < lbn) 994 continue; 995 error = bbusy(bp, catch, slptimeo, NULL); 996 if (error != 0) { 997 if (error == EPASSTHROUGH) 998 goto restart; 999 mutex_exit(&bufcache_lock); 1000 return (error); 1001 } 1002 brelsel(bp, BC_INVAL | BC_VFLUSH); 1003 } 1004 mutex_exit(&bufcache_lock); 1005 1006 return (0); 1007 } 1008 1009 /* 1010 * Flush all dirty buffers from a vnode. 1011 * Called with the underlying vnode locked, which should prevent new dirty 1012 * buffers from being queued. 
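 *
 * Before vflushbuf() below, a sketch (not part of the original file)
 * of the usual vinvalbuf() call sequence when detaching a vnode: try
 * to save dirty data first and fall back to discarding it, much as
 * vclean(), later in this file, does.
 */
#if 0	/* Illustrative sketch only; vp is the vnode being detached. */
	int error;

	error = vinvalbuf(vp, V_SAVE, NOCRED, curlwp, 0, 0);
	if (error != 0) {
		/* Could not write the data back; discard it instead. */
		error = vinvalbuf(vp, 0, NOCRED, curlwp, 0, 0);
	}
#endif
/*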
1013 */ 1014 void 1015 vflushbuf(struct vnode *vp, int sync) 1016 { 1017 struct buf *bp, *nbp; 1018 int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0); 1019 bool dirty; 1020 1021 mutex_enter(&vp->v_interlock); 1022 (void) VOP_PUTPAGES(vp, 0, 0, flags); 1023 1024 loop: 1025 mutex_enter(&bufcache_lock); 1026 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 1027 nbp = LIST_NEXT(bp, b_vnbufs); 1028 if ((bp->b_cflags & BC_BUSY)) 1029 continue; 1030 if ((bp->b_oflags & BO_DELWRI) == 0) 1031 panic("vflushbuf: not dirty, bp %p", bp); 1032 bp->b_cflags |= BC_BUSY | BC_VFLUSH; 1033 mutex_exit(&bufcache_lock); 1034 /* 1035 * Wait for I/O associated with indirect blocks to complete, 1036 * since there is no way to quickly wait for them below. 1037 */ 1038 if (bp->b_vp == vp || sync == 0) 1039 (void) bawrite(bp); 1040 else 1041 (void) bwrite(bp); 1042 goto loop; 1043 } 1044 mutex_exit(&bufcache_lock); 1045 1046 if (sync == 0) 1047 return; 1048 1049 mutex_enter(&vp->v_interlock); 1050 while (vp->v_numoutput != 0) 1051 cv_wait(&vp->v_cv, &vp->v_interlock); 1052 dirty = !LIST_EMPTY(&vp->v_dirtyblkhd); 1053 mutex_exit(&vp->v_interlock); 1054 1055 if (dirty) { 1056 vprint("vflushbuf: dirty", vp); 1057 goto loop; 1058 } 1059 } 1060 1061 /* 1062 * Create a vnode for a block device. 1063 * Used for root filesystem and swap areas. 1064 * Also used for memory file system special devices. 1065 */ 1066 int 1067 bdevvp(dev_t dev, vnode_t **vpp) 1068 { 1069 1070 return (getdevvp(dev, vpp, VBLK)); 1071 } 1072 1073 /* 1074 * Create a vnode for a character device. 1075 * Used for kernfs and some console handling. 1076 */ 1077 int 1078 cdevvp(dev_t dev, vnode_t **vpp) 1079 { 1080 1081 return (getdevvp(dev, vpp, VCHR)); 1082 } 1083 1084 /* 1085 * Associate a buffer with a vnode. There must already be a hold on 1086 * the vnode. 1087 */ 1088 void 1089 bgetvp(struct vnode *vp, struct buf *bp) 1090 { 1091 1092 KASSERT(bp->b_vp == NULL); 1093 KASSERT(bp->b_objlock == &buffer_lock); 1094 KASSERT(mutex_owned(&vp->v_interlock)); 1095 KASSERT(mutex_owned(&bufcache_lock)); 1096 KASSERT((bp->b_cflags & BC_BUSY) != 0); 1097 KASSERT(!cv_has_waiters(&bp->b_done)); 1098 1099 vholdl(vp); 1100 bp->b_vp = vp; 1101 if (vp->v_type == VBLK || vp->v_type == VCHR) 1102 bp->b_dev = vp->v_rdev; 1103 else 1104 bp->b_dev = NODEV; 1105 1106 /* 1107 * Insert onto list for new vnode. 1108 */ 1109 bufinsvn(bp, &vp->v_cleanblkhd); 1110 bp->b_objlock = &vp->v_interlock; 1111 } 1112 1113 /* 1114 * Disassociate a buffer from a vnode. 1115 */ 1116 void 1117 brelvp(struct buf *bp) 1118 { 1119 struct vnode *vp = bp->b_vp; 1120 1121 KASSERT(vp != NULL); 1122 KASSERT(bp->b_objlock == &vp->v_interlock); 1123 KASSERT(mutex_owned(&vp->v_interlock)); 1124 KASSERT(mutex_owned(&bufcache_lock)); 1125 KASSERT((bp->b_cflags & BC_BUSY) != 0); 1126 KASSERT(!cv_has_waiters(&bp->b_done)); 1127 1128 /* 1129 * Delete from old vnode list, if on one. 1130 */ 1131 if (LIST_NEXT(bp, b_vnbufs) != NOLIST) 1132 bufremvn(bp); 1133 1134 if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_iflag & VI_ONWORKLST) && 1135 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { 1136 vp->v_iflag &= ~VI_WRMAPDIRTY; 1137 vn_syncer_remove_from_worklist(vp); 1138 } 1139 1140 bp->b_objlock = &buffer_lock; 1141 bp->b_vp = NULL; 1142 holdrelel(vp); 1143 } 1144 1145 /* 1146 * Reassign a buffer from one vnode list to another. 1147 * The list reassignment must be within the same vnode. 1148 * Used to assign file specific control information 1149 * (indirect blocks) to the list to which they belong. 
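 *
 * Before reassignbuf() below, a sketch (not part of the original file)
 * of how bdevvp() above is typically used when the kernel needs a
 * vnode for a device known only by its dev_t, for example the root or
 * a swap device; the vnode is released with vrele() when done.
 */
#if 0	/* Illustrative sketch only; "dev" is some dev_t of interest. */
	vnode_t *devvp;

	if (bdevvp(dev, &devvp) != 0)
		panic("cannot get vnode for device");
	/* ... use the VBLK vnode, e.g. open it and read a disklabel ... */
	vrele(devvp);
#endif
/*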
1150 */ 1151 void 1152 reassignbuf(struct buf *bp, struct vnode *vp) 1153 { 1154 struct buflists *listheadp; 1155 int delayx; 1156 1157 KASSERT(mutex_owned(&bufcache_lock)); 1158 KASSERT(bp->b_objlock == &vp->v_interlock); 1159 KASSERT(mutex_owned(&vp->v_interlock)); 1160 KASSERT((bp->b_cflags & BC_BUSY) != 0); 1161 1162 /* 1163 * Delete from old vnode list, if on one. 1164 */ 1165 if (LIST_NEXT(bp, b_vnbufs) != NOLIST) 1166 bufremvn(bp); 1167 1168 /* 1169 * If dirty, put on list of dirty buffers; 1170 * otherwise insert onto list of clean buffers. 1171 */ 1172 if ((bp->b_oflags & BO_DELWRI) == 0) { 1173 listheadp = &vp->v_cleanblkhd; 1174 if (TAILQ_EMPTY(&vp->v_uobj.memq) && 1175 (vp->v_iflag & VI_ONWORKLST) && 1176 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { 1177 vp->v_iflag &= ~VI_WRMAPDIRTY; 1178 vn_syncer_remove_from_worklist(vp); 1179 } 1180 } else { 1181 listheadp = &vp->v_dirtyblkhd; 1182 if ((vp->v_iflag & VI_ONWORKLST) == 0) { 1183 switch (vp->v_type) { 1184 case VDIR: 1185 delayx = dirdelay; 1186 break; 1187 case VBLK: 1188 if (vp->v_specmountpoint != NULL) { 1189 delayx = metadelay; 1190 break; 1191 } 1192 /* fall through */ 1193 default: 1194 delayx = filedelay; 1195 break; 1196 } 1197 if (!vp->v_mount || 1198 (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) 1199 vn_syncer_add_to_worklist(vp, delayx); 1200 } 1201 } 1202 bufinsvn(bp, listheadp); 1203 } 1204 1205 /* 1206 * Create a vnode for a device. 1207 * Used by bdevvp (block device) for root file system etc., 1208 * and by cdevvp (character device) for console and kernfs. 1209 */ 1210 static int 1211 getdevvp(dev_t dev, vnode_t **vpp, enum vtype type) 1212 { 1213 vnode_t *vp; 1214 vnode_t *nvp; 1215 int error; 1216 1217 if (dev == NODEV) { 1218 *vpp = NULL; 1219 return (0); 1220 } 1221 error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp); 1222 if (error) { 1223 *vpp = NULL; 1224 return (error); 1225 } 1226 vp = nvp; 1227 vp->v_type = type; 1228 vp->v_vflag |= VV_MPSAFE; 1229 uvm_vnp_setsize(vp, 0); 1230 spec_node_init(vp, dev); 1231 *vpp = vp; 1232 return (0); 1233 } 1234 1235 /* 1236 * Try to gain a reference to a vnode, without acquiring its interlock. 1237 * The caller must hold a lock that will prevent the vnode from being 1238 * recycled or freed. 1239 */ 1240 bool 1241 vtryget(vnode_t *vp) 1242 { 1243 u_int use, next; 1244 1245 /* 1246 * If the vnode is being freed, don't make life any harder 1247 * for vclean() by adding another reference without waiting. 1248 * This is not strictly necessary, but we'll do it anyway. 1249 */ 1250 if (__predict_false((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0)) { 1251 return false; 1252 } 1253 for (use = vp->v_usecount;; use = next) { 1254 if (use == 0 || __predict_false((use & VC_XLOCK) != 0)) { 1255 /* Need interlock held if first reference. */ 1256 return false; 1257 } 1258 next = atomic_cas_uint(&vp->v_usecount, use, use + 1); 1259 if (__predict_true(next == use)) { 1260 return true; 1261 } 1262 } 1263 } 1264 1265 /* 1266 * Grab a particular vnode from the free list, increment its 1267 * reference count and lock it. If the vnode lock bit is set the 1268 * vnode is being eliminated in vgone. In that case, we can not 1269 * grab the vnode, so the process is awakened when the transition is 1270 * completed, and an error returned to indicate that the vnode is no 1271 * longer usable (possibly having been changed to a new file system type). 
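 *
 * A sketch (not part of the original file) of the usual way vget()
 * below is called from a file system's hash lookup: the vnode is found
 * with its interlock held, the interlock is handed to vget() via
 * LK_INTERLOCK, and the lookup is retried if the vnode was being
 * cleaned out from under us.
 */
#if 0	/* Illustrative sketch only; the hash lookup is a placeholder. */
	vnode_t *vp;

 retry:
	/* ... locate vp in an fs-private hash ... */
	mutex_enter(&vp->v_interlock);
	if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK) != 0)
		goto retry;		/* vnode went away; look again */
	/* vp is now referenced and locked exclusively */
#endif
/*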
1272 */ 1273 int 1274 vget(vnode_t *vp, int flags) 1275 { 1276 int error; 1277 1278 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1279 1280 if ((flags & LK_INTERLOCK) == 0) 1281 mutex_enter(&vp->v_interlock); 1282 1283 /* 1284 * Before adding a reference, we must remove the vnode 1285 * from its freelist. 1286 */ 1287 if (vp->v_usecount == 0) { 1288 vremfree(vp); 1289 vp->v_usecount = 1; 1290 } else { 1291 atomic_inc_uint(&vp->v_usecount); 1292 } 1293 1294 /* 1295 * If the vnode is in the process of being cleaned out for 1296 * another use, we wait for the cleaning to finish and then 1297 * return failure. Cleaning is determined by checking if 1298 * the VI_XLOCK or VI_FREEING flags are set. 1299 */ 1300 if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) { 1301 if ((flags & LK_NOWAIT) != 0) { 1302 vrelel(vp, 0); 1303 return EBUSY; 1304 } 1305 vwait(vp, VI_XLOCK | VI_FREEING); 1306 vrelel(vp, 0); 1307 return ENOENT; 1308 } 1309 if (flags & LK_TYPE_MASK) { 1310 error = vn_lock(vp, flags | LK_INTERLOCK); 1311 if (error != 0) { 1312 vrele(vp); 1313 } 1314 return error; 1315 } 1316 mutex_exit(&vp->v_interlock); 1317 return 0; 1318 } 1319 1320 /* 1321 * vput(), just unlock and vrele() 1322 */ 1323 void 1324 vput(vnode_t *vp) 1325 { 1326 1327 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1328 1329 VOP_UNLOCK(vp, 0); 1330 vrele(vp); 1331 } 1332 1333 /* 1334 * Try to drop reference on a vnode. Abort if we are releasing the 1335 * last reference. Note: this _must_ succeed if not the last reference. 1336 */ 1337 static inline bool 1338 vtryrele(vnode_t *vp) 1339 { 1340 u_int use, next; 1341 1342 for (use = vp->v_usecount;; use = next) { 1343 if (use == 1) { 1344 return false; 1345 } 1346 KASSERT((use & VC_MASK) > 1); 1347 next = atomic_cas_uint(&vp->v_usecount, use, use - 1); 1348 if (__predict_true(next == use)) { 1349 return true; 1350 } 1351 } 1352 } 1353 1354 /* 1355 * Vnode release. If reference count drops to zero, call inactive 1356 * routine and either return to freelist or free to the pool. 1357 */ 1358 void 1359 vrelel(vnode_t *vp, int flags) 1360 { 1361 bool recycle, defer; 1362 int error; 1363 1364 KASSERT(mutex_owned(&vp->v_interlock)); 1365 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1366 KASSERT(vp->v_freelisthd == NULL); 1367 1368 if (__predict_false(vp->v_op == dead_vnodeop_p && 1369 (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) { 1370 vpanic(vp, "dead but not clean"); 1371 } 1372 1373 /* 1374 * If not the last reference, just drop the reference count 1375 * and unlock. 1376 */ 1377 if (vtryrele(vp)) { 1378 vp->v_iflag |= VI_INACTREDO; 1379 mutex_exit(&vp->v_interlock); 1380 return; 1381 } 1382 if (vp->v_usecount <= 0 || vp->v_writecount != 0) { 1383 vpanic(vp, "vrelel: bad ref count"); 1384 } 1385 1386 KASSERT((vp->v_iflag & VI_XLOCK) == 0); 1387 1388 /* 1389 * If not clean, deactivate the vnode, but preserve 1390 * our reference across the call to VOP_INACTIVE(). 1391 */ 1392 retry: 1393 if ((vp->v_iflag & VI_CLEAN) == 0) { 1394 recycle = false; 1395 vp->v_iflag |= VI_INACTNOW; 1396 1397 /* 1398 * XXX This ugly block can be largely eliminated if 1399 * locking is pushed down into the file systems. 1400 */ 1401 if (curlwp == uvm.pagedaemon_lwp) { 1402 /* The pagedaemon can't wait around; defer. */ 1403 defer = true; 1404 } else if (curlwp == vrele_lwp) { 1405 /* We have to try harder. 
*/ 1406 vp->v_iflag &= ~VI_INACTREDO; 1407 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | 1408 LK_RETRY); 1409 if (error != 0) { 1410 /* XXX */ 1411 vpanic(vp, "vrele: unable to lock %p"); 1412 } 1413 defer = false; 1414 } else if ((vp->v_iflag & VI_LAYER) != 0) { 1415 /* 1416 * Acquiring the stack's lock in vclean() even 1417 * for an honest vput/vrele is dangerous because 1418 * our caller may hold other vnode locks; defer. 1419 */ 1420 defer = true; 1421 } else { 1422 /* If we can't acquire the lock, then defer. */ 1423 vp->v_iflag &= ~VI_INACTREDO; 1424 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | 1425 LK_NOWAIT); 1426 if (error != 0) { 1427 defer = true; 1428 mutex_enter(&vp->v_interlock); 1429 } else { 1430 defer = false; 1431 } 1432 } 1433 1434 if (defer) { 1435 /* 1436 * Defer reclaim to the kthread; it's not safe to 1437 * clean it here. We donate it our last reference. 1438 */ 1439 KASSERT(mutex_owned(&vp->v_interlock)); 1440 KASSERT((vp->v_iflag & VI_INACTPEND) == 0); 1441 vp->v_iflag &= ~VI_INACTNOW; 1442 vp->v_iflag |= VI_INACTPEND; 1443 mutex_enter(&vrele_lock); 1444 TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist); 1445 if (++vrele_pending > (desiredvnodes >> 8)) 1446 cv_signal(&vrele_cv); 1447 mutex_exit(&vrele_lock); 1448 mutex_exit(&vp->v_interlock); 1449 return; 1450 } 1451 1452 #ifdef DIAGNOSTIC 1453 if ((vp->v_type == VBLK || vp->v_type == VCHR) && 1454 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) { 1455 vprint("vrelel: missing VOP_CLOSE()", vp); 1456 } 1457 #endif 1458 1459 /* 1460 * The vnode can gain another reference while being 1461 * deactivated. If VOP_INACTIVE() indicates that 1462 * the described file has been deleted, then recycle 1463 * the vnode irrespective of additional references. 1464 * Another thread may be waiting to re-use the on-disk 1465 * inode. 1466 * 1467 * Note that VOP_INACTIVE() will drop the vnode lock. 1468 */ 1469 VOP_INACTIVE(vp, &recycle); 1470 mutex_enter(&vp->v_interlock); 1471 vp->v_iflag &= ~VI_INACTNOW; 1472 if (!recycle) { 1473 if (vtryrele(vp)) { 1474 mutex_exit(&vp->v_interlock); 1475 return; 1476 } 1477 1478 /* 1479 * If we grew another reference while 1480 * VOP_INACTIVE() was underway, retry. 1481 */ 1482 if ((vp->v_iflag & VI_INACTREDO) != 0) { 1483 goto retry; 1484 } 1485 } 1486 1487 /* Take care of space accounting. */ 1488 if (vp->v_iflag & VI_EXECMAP) { 1489 atomic_add_int(&uvmexp.execpages, 1490 -vp->v_uobj.uo_npages); 1491 atomic_add_int(&uvmexp.filepages, 1492 vp->v_uobj.uo_npages); 1493 } 1494 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP); 1495 vp->v_vflag &= ~VV_MAPPED; 1496 1497 /* 1498 * Recycle the vnode if the file is now unused (unlinked), 1499 * otherwise just free it. 1500 */ 1501 if (recycle) { 1502 vclean(vp, DOCLOSE); 1503 } 1504 KASSERT(vp->v_usecount > 0); 1505 } 1506 1507 if (atomic_dec_uint_nv(&vp->v_usecount) != 0) { 1508 /* Gained another reference while being reclaimed. */ 1509 mutex_exit(&vp->v_interlock); 1510 return; 1511 } 1512 1513 if ((vp->v_iflag & VI_CLEAN) != 0) { 1514 /* 1515 * It's clean so destroy it. It isn't referenced 1516 * anywhere since it has been reclaimed. 1517 */ 1518 KASSERT(vp->v_holdcnt == 0); 1519 KASSERT(vp->v_writecount == 0); 1520 mutex_exit(&vp->v_interlock); 1521 insmntque(vp, NULL); 1522 if (vp->v_type == VBLK || vp->v_type == VCHR) { 1523 spec_node_destroy(vp); 1524 } 1525 vnfree(vp); 1526 } else { 1527 /* 1528 * Otherwise, put it back onto the freelist. It 1529 * can't be destroyed while still associated with 1530 * a file system. 
1531 */ 1532 mutex_enter(&vnode_free_list_lock); 1533 if (vp->v_holdcnt > 0) { 1534 vp->v_freelisthd = &vnode_hold_list; 1535 } else { 1536 vp->v_freelisthd = &vnode_free_list; 1537 } 1538 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 1539 mutex_exit(&vnode_free_list_lock); 1540 mutex_exit(&vp->v_interlock); 1541 } 1542 } 1543 1544 void 1545 vrele(vnode_t *vp) 1546 { 1547 1548 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1549 1550 if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) { 1551 return; 1552 } 1553 mutex_enter(&vp->v_interlock); 1554 vrelel(vp, 0); 1555 } 1556 1557 static void 1558 vrele_thread(void *cookie) 1559 { 1560 vnode_t *vp; 1561 1562 for (;;) { 1563 mutex_enter(&vrele_lock); 1564 while (TAILQ_EMPTY(&vrele_list)) { 1565 vrele_gen++; 1566 cv_broadcast(&vrele_cv); 1567 cv_timedwait(&vrele_cv, &vrele_lock, hz); 1568 } 1569 vp = TAILQ_FIRST(&vrele_list); 1570 TAILQ_REMOVE(&vrele_list, vp, v_freelist); 1571 vrele_pending--; 1572 mutex_exit(&vrele_lock); 1573 1574 /* 1575 * If not the last reference, then ignore the vnode 1576 * and look for more work. 1577 */ 1578 mutex_enter(&vp->v_interlock); 1579 KASSERT((vp->v_iflag & VI_INACTPEND) != 0); 1580 vp->v_iflag &= ~VI_INACTPEND; 1581 vrelel(vp, 0); 1582 } 1583 } 1584 1585 /* 1586 * Page or buffer structure gets a reference. 1587 * Called with v_interlock held. 1588 */ 1589 void 1590 vholdl(vnode_t *vp) 1591 { 1592 1593 KASSERT(mutex_owned(&vp->v_interlock)); 1594 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1595 1596 if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) { 1597 mutex_enter(&vnode_free_list_lock); 1598 KASSERT(vp->v_freelisthd == &vnode_free_list); 1599 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 1600 vp->v_freelisthd = &vnode_hold_list; 1601 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 1602 mutex_exit(&vnode_free_list_lock); 1603 } 1604 } 1605 1606 /* 1607 * Page or buffer structure frees a reference. 1608 * Called with v_interlock held. 1609 */ 1610 void 1611 holdrelel(vnode_t *vp) 1612 { 1613 1614 KASSERT(mutex_owned(&vp->v_interlock)); 1615 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1616 1617 if (vp->v_holdcnt <= 0) { 1618 vpanic(vp, "holdrelel: holdcnt vp %p"); 1619 } 1620 1621 vp->v_holdcnt--; 1622 if (vp->v_holdcnt == 0 && vp->v_usecount == 0) { 1623 mutex_enter(&vnode_free_list_lock); 1624 KASSERT(vp->v_freelisthd == &vnode_hold_list); 1625 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 1626 vp->v_freelisthd = &vnode_free_list; 1627 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 1628 mutex_exit(&vnode_free_list_lock); 1629 } 1630 } 1631 1632 /* 1633 * Vnode reference, where a reference is already held by some other 1634 * object (for example, a file structure). 1635 */ 1636 void 1637 vref(vnode_t *vp) 1638 { 1639 1640 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1641 KASSERT(vp->v_usecount != 0); 1642 1643 atomic_inc_uint(&vp->v_usecount); 1644 } 1645 1646 /* 1647 * Remove any vnodes in the vnode table belonging to mount point mp. 1648 * 1649 * If FORCECLOSE is not specified, there should not be any active ones, 1650 * return error if any are found (nb: this is a user error, not a 1651 * system error). If FORCECLOSE is specified, detach any active vnodes 1652 * that are found. 1653 * 1654 * If WRITECLOSE is set, only flush out regular file vnodes open for 1655 * writing. 1656 * 1657 * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped. 
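 *
 * A sketch (not part of the original file) of how a file system's
 * unmount routine typically drives vflush() below: pass FORCECLOSE
 * only for forced unmounts, skip the vnode the file system itself
 * still needs, and treat EBUSY as "active vnodes remain".
 */
#if 0	/* Illustrative sketch only; mntflags and skipvp are placeholders. */
	int error, flags = 0;

	if (mntflags & MNT_FORCE)
		flags |= FORCECLOSE;
	error = vflush(mp, skipvp, flags);	/* skipvp: e.g. the device vnode */
	if (error != 0)
		return error;
#endif
/*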
1658 */ 1659 #ifdef DEBUG 1660 int busyprt = 0; /* print out busy vnodes */ 1661 struct ctldebug debug1 = { "busyprt", &busyprt }; 1662 #endif 1663 1664 static vnode_t * 1665 vflushnext(vnode_t *mvp, int *when) 1666 { 1667 1668 if (hardclock_ticks > *when) { 1669 mutex_exit(&mntvnode_lock); 1670 yield(); 1671 mutex_enter(&mntvnode_lock); 1672 *when = hardclock_ticks + hz / 10; 1673 } 1674 1675 return vunmark(mvp); 1676 } 1677 1678 int 1679 vflush(struct mount *mp, vnode_t *skipvp, int flags) 1680 { 1681 vnode_t *vp, *mvp; 1682 int busy = 0, when = 0, gen; 1683 1684 /* 1685 * First, flush out any vnode references from vrele_list. 1686 */ 1687 mutex_enter(&vrele_lock); 1688 gen = vrele_gen; 1689 while (vrele_pending && gen == vrele_gen) { 1690 cv_broadcast(&vrele_cv); 1691 cv_wait(&vrele_cv, &vrele_lock); 1692 } 1693 mutex_exit(&vrele_lock); 1694 1695 /* Allocate a marker vnode. */ 1696 if ((mvp = vnalloc(mp)) == NULL) 1697 return (ENOMEM); 1698 1699 /* 1700 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() 1701 * and vclean() are called 1702 */ 1703 mutex_enter(&mntvnode_lock); 1704 for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL; 1705 vp = vflushnext(mvp, &when)) { 1706 vmark(mvp, vp); 1707 if (vp->v_mount != mp || vismarker(vp)) 1708 continue; 1709 /* 1710 * Skip over a selected vnode. 1711 */ 1712 if (vp == skipvp) 1713 continue; 1714 mutex_enter(&vp->v_interlock); 1715 /* 1716 * Ignore clean but still referenced vnodes. 1717 */ 1718 if ((vp->v_iflag & VI_CLEAN) != 0) { 1719 mutex_exit(&vp->v_interlock); 1720 continue; 1721 } 1722 /* 1723 * Skip over a vnodes marked VSYSTEM. 1724 */ 1725 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 1726 mutex_exit(&vp->v_interlock); 1727 continue; 1728 } 1729 /* 1730 * If WRITECLOSE is set, only flush out regular file 1731 * vnodes open for writing. 1732 */ 1733 if ((flags & WRITECLOSE) && 1734 (vp->v_writecount == 0 || vp->v_type != VREG)) { 1735 mutex_exit(&vp->v_interlock); 1736 continue; 1737 } 1738 /* 1739 * With v_usecount == 0, all we need to do is clear 1740 * out the vnode data structures and we are done. 1741 */ 1742 if (vp->v_usecount == 0) { 1743 mutex_exit(&mntvnode_lock); 1744 vremfree(vp); 1745 vp->v_usecount = 1; 1746 vclean(vp, DOCLOSE); 1747 vrelel(vp, 0); 1748 mutex_enter(&mntvnode_lock); 1749 continue; 1750 } 1751 /* 1752 * If FORCECLOSE is set, forcibly close the vnode. 1753 * For block or character devices, revert to an 1754 * anonymous device. For all other files, just 1755 * kill them. 1756 */ 1757 if (flags & FORCECLOSE) { 1758 mutex_exit(&mntvnode_lock); 1759 atomic_inc_uint(&vp->v_usecount); 1760 if (vp->v_type != VBLK && vp->v_type != VCHR) { 1761 vclean(vp, DOCLOSE); 1762 vrelel(vp, 0); 1763 } else { 1764 vclean(vp, 0); 1765 vp->v_op = spec_vnodeop_p; /* XXXSMP */ 1766 mutex_exit(&vp->v_interlock); 1767 /* 1768 * The vnode isn't clean, but still resides 1769 * on the mount list. Remove it. XXX This 1770 * is a bit dodgy. 1771 */ 1772 insmntque(vp, NULL); 1773 vrele(vp); 1774 } 1775 mutex_enter(&mntvnode_lock); 1776 continue; 1777 } 1778 #ifdef DEBUG 1779 if (busyprt) 1780 vprint("vflush: busy vnode", vp); 1781 #endif 1782 mutex_exit(&vp->v_interlock); 1783 busy++; 1784 } 1785 mutex_exit(&mntvnode_lock); 1786 vnfree(mvp); 1787 if (busy) 1788 return (EBUSY); 1789 return (0); 1790 } 1791 1792 /* 1793 * Disassociate the underlying file system from a vnode. 1794 * 1795 * Must be called with the interlock held, and will return with it held. 
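 *
 * A sketch (not part of the original file) of the vclean() calling
 * convention: the caller holds v_interlock and a use reference, taking
 * one first if the vnode sat on a freelist, and vrelel() afterwards
 * drops both.  vgone(), vrecycle() and vrevoke() in this file all
 * follow this shape.
 */
#if 0	/* Illustrative sketch only. */
	mutex_enter(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		vremfree(vp);		/* off the freelist before referencing */
		vp->v_usecount = 1;
	}
	vclean(vp, DOCLOSE);
	vrelel(vp, 0);			/* drops the reference and v_interlock */
#endif
/*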
1796 */ 1797 void 1798 vclean(vnode_t *vp, int flags) 1799 { 1800 lwp_t *l = curlwp; 1801 bool recycle, active; 1802 int error; 1803 1804 KASSERT(mutex_owned(&vp->v_interlock)); 1805 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1806 KASSERT(vp->v_usecount != 0); 1807 1808 /* If cleaning is already in progress wait until done and return. */ 1809 if (vp->v_iflag & VI_XLOCK) { 1810 vwait(vp, VI_XLOCK); 1811 return; 1812 } 1813 1814 /* If already clean, nothing to do. */ 1815 if ((vp->v_iflag & VI_CLEAN) != 0) { 1816 return; 1817 } 1818 1819 /* 1820 * Prevent the vnode from being recycled or brought into use 1821 * while we clean it out. 1822 */ 1823 vp->v_iflag |= VI_XLOCK; 1824 if (vp->v_iflag & VI_EXECMAP) { 1825 atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages); 1826 atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages); 1827 } 1828 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP); 1829 active = (vp->v_usecount > 1); 1830 1831 /* XXXAD should not lock vnode under layer */ 1832 VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK); 1833 1834 /* 1835 * Clean out any cached data associated with the vnode. 1836 * If purging an active vnode, it must be closed and 1837 * deactivated before being reclaimed. Note that the 1838 * VOP_INACTIVE will unlock the vnode. 1839 */ 1840 if (flags & DOCLOSE) { 1841 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); 1842 if (error != 0) { 1843 /* XXX, fix vn_start_write's grab of mp and use that. */ 1844 1845 if (wapbl_vphaswapbl(vp)) 1846 WAPBL_DISCARD(wapbl_vptomp(vp)); 1847 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0); 1848 } 1849 KASSERT(error == 0); 1850 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1851 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) { 1852 spec_node_revoke(vp); 1853 } 1854 } 1855 if (active) { 1856 VOP_INACTIVE(vp, &recycle); 1857 } else { 1858 /* 1859 * Any other processes trying to obtain this lock must first 1860 * wait for VI_XLOCK to clear, then call the new lock operation. 1861 */ 1862 VOP_UNLOCK(vp, 0); 1863 } 1864 1865 /* Disassociate the underlying file system from the vnode. */ 1866 if (VOP_RECLAIM(vp)) { 1867 vpanic(vp, "vclean: cannot reclaim"); 1868 } 1869 1870 KASSERT(vp->v_uobj.uo_npages == 0); 1871 if (vp->v_type == VREG && vp->v_ractx != NULL) { 1872 uvm_ra_freectx(vp->v_ractx); 1873 vp->v_ractx = NULL; 1874 } 1875 cache_purge(vp); 1876 1877 /* Done with purge, notify sleepers of the grim news. */ 1878 mutex_enter(&vp->v_interlock); 1879 vp->v_op = dead_vnodeop_p; 1880 vp->v_tag = VT_NON; 1881 vp->v_vnlock = &vp->v_lock; 1882 KNOTE(&vp->v_klist, NOTE_REVOKE); 1883 vp->v_iflag &= ~(VI_XLOCK | VI_FREEING); 1884 vp->v_vflag &= ~VV_LOCKSWORK; 1885 if ((flags & DOCLOSE) != 0) { 1886 vp->v_iflag |= VI_CLEAN; 1887 } 1888 cv_broadcast(&vp->v_cv); 1889 1890 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1891 } 1892 1893 /* 1894 * Recycle an unused vnode to the front of the free list. 1895 * Release the passed interlock if the vnode will be recycled. 1896 */ 1897 int 1898 vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l) 1899 { 1900 1901 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1902 1903 mutex_enter(&vp->v_interlock); 1904 if (vp->v_usecount != 0) { 1905 mutex_exit(&vp->v_interlock); 1906 return (0); 1907 } 1908 if (inter_lkp) 1909 mutex_exit(inter_lkp); 1910 vremfree(vp); 1911 vp->v_usecount = 1; 1912 vclean(vp, DOCLOSE); 1913 vrelel(vp, 0); 1914 return (1); 1915 } 1916 1917 /* 1918 * Eliminate all activity associated with a vnode in preparation for 1919 * reuse. Drops a reference from the vnode. 
1920 */ 1921 void 1922 vgone(vnode_t *vp) 1923 { 1924 1925 mutex_enter(&vp->v_interlock); 1926 vclean(vp, DOCLOSE); 1927 vrelel(vp, 0); 1928 } 1929 1930 /* 1931 * Lookup a vnode by device number. 1932 */ 1933 int 1934 vfinddev(dev_t dev, enum vtype type, vnode_t **vpp) 1935 { 1936 vnode_t *vp; 1937 int rc = 0; 1938 1939 mutex_enter(&device_lock); 1940 for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) { 1941 if (dev != vp->v_rdev || type != vp->v_type) 1942 continue; 1943 *vpp = vp; 1944 rc = 1; 1945 break; 1946 } 1947 mutex_exit(&device_lock); 1948 return (rc); 1949 } 1950 1951 /* 1952 * Revoke all the vnodes corresponding to the specified minor number 1953 * range (endpoints inclusive) of the specified major. 1954 */ 1955 void 1956 vdevgone(int maj, int minl, int minh, enum vtype type) 1957 { 1958 vnode_t *vp, **vpp; 1959 dev_t dev; 1960 int mn; 1961 1962 vp = NULL; /* XXX gcc */ 1963 1964 mutex_enter(&device_lock); 1965 for (mn = minl; mn <= minh; mn++) { 1966 dev = makedev(maj, mn); 1967 vpp = &specfs_hash[SPECHASH(dev)]; 1968 for (vp = *vpp; vp != NULL;) { 1969 mutex_enter(&vp->v_interlock); 1970 if ((vp->v_iflag & VI_CLEAN) != 0 || 1971 dev != vp->v_rdev || type != vp->v_type) { 1972 mutex_exit(&vp->v_interlock); 1973 vp = vp->v_specnext; 1974 continue; 1975 } 1976 mutex_exit(&device_lock); 1977 if (vget(vp, LK_INTERLOCK) == 0) { 1978 VOP_REVOKE(vp, REVOKEALL); 1979 vrele(vp); 1980 } 1981 mutex_enter(&device_lock); 1982 vp = *vpp; 1983 } 1984 } 1985 mutex_exit(&device_lock); 1986 } 1987 1988 /* 1989 * Calculate the total number of references to a special device. 1990 */ 1991 int 1992 vcount(vnode_t *vp) 1993 { 1994 int count; 1995 1996 mutex_enter(&device_lock); 1997 mutex_enter(&vp->v_interlock); 1998 if (vp->v_specnode == NULL) { 1999 count = vp->v_usecount - ((vp->v_iflag & VI_INACTPEND) != 0); 2000 mutex_exit(&vp->v_interlock); 2001 mutex_exit(&device_lock); 2002 return (count); 2003 } 2004 mutex_exit(&vp->v_interlock); 2005 count = vp->v_specnode->sn_dev->sd_opencnt; 2006 mutex_exit(&device_lock); 2007 return (count); 2008 } 2009 2010 /* 2011 * Eliminate all activity associated with the requested vnode 2012 * and with all vnodes aliased to the requested vnode. 2013 */ 2014 void 2015 vrevoke(vnode_t *vp) 2016 { 2017 vnode_t *vq, **vpp; 2018 enum vtype type; 2019 dev_t dev; 2020 2021 KASSERT(vp->v_usecount > 0); 2022 2023 mutex_enter(&vp->v_interlock); 2024 if ((vp->v_iflag & VI_CLEAN) != 0) { 2025 mutex_exit(&vp->v_interlock); 2026 return; 2027 } else if (vp->v_type != VBLK && vp->v_type != VCHR) { 2028 atomic_inc_uint(&vp->v_usecount); 2029 vclean(vp, DOCLOSE); 2030 vrelel(vp, 0); 2031 return; 2032 } else { 2033 dev = vp->v_rdev; 2034 type = vp->v_type; 2035 mutex_exit(&vp->v_interlock); 2036 } 2037 2038 vpp = &specfs_hash[SPECHASH(dev)]; 2039 mutex_enter(&device_lock); 2040 for (vq = *vpp; vq != NULL;) { 2041 /* If clean or being cleaned, then ignore it. 
*/ 2042 mutex_enter(&vq->v_interlock); 2043 if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 || 2044 vq->v_rdev != dev || vq->v_type != type) { 2045 mutex_exit(&vq->v_interlock); 2046 vq = vq->v_specnext; 2047 continue; 2048 } 2049 mutex_exit(&device_lock); 2050 if (vq->v_usecount == 0) { 2051 vremfree(vq); 2052 vq->v_usecount = 1; 2053 } else { 2054 atomic_inc_uint(&vq->v_usecount); 2055 } 2056 vclean(vq, DOCLOSE); 2057 vrelel(vq, 0); 2058 mutex_enter(&device_lock); 2059 vq = *vpp; 2060 } 2061 mutex_exit(&device_lock); 2062 } 2063 2064 /* 2065 * sysctl helper routine to return list of supported fstypes 2066 */ 2067 int 2068 sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS) 2069 { 2070 char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)]; 2071 char *where = oldp; 2072 struct vfsops *v; 2073 size_t needed, left, slen; 2074 int error, first; 2075 2076 if (newp != NULL) 2077 return (EPERM); 2078 if (namelen != 0) 2079 return (EINVAL); 2080 2081 first = 1; 2082 error = 0; 2083 needed = 0; 2084 left = *oldlenp; 2085 2086 sysctl_unlock(); 2087 mutex_enter(&vfs_list_lock); 2088 LIST_FOREACH(v, &vfs_list, vfs_list) { 2089 if (where == NULL) 2090 needed += strlen(v->vfs_name) + 1; 2091 else { 2092 memset(bf, 0, sizeof(bf)); 2093 if (first) { 2094 strncpy(bf, v->vfs_name, sizeof(bf)); 2095 first = 0; 2096 } else { 2097 bf[0] = ' '; 2098 strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1); 2099 } 2100 bf[sizeof(bf)-1] = '\0'; 2101 slen = strlen(bf); 2102 if (left < slen + 1) 2103 break; 2104 v->vfs_refcount++; 2105 mutex_exit(&vfs_list_lock); 2106 /* +1 to copy out the trailing NUL byte */ 2107 error = copyout(bf, where, slen + 1); 2108 mutex_enter(&vfs_list_lock); 2109 v->vfs_refcount--; 2110 if (error) 2111 break; 2112 where += slen; 2113 needed += slen; 2114 left -= slen; 2115 } 2116 } 2117 mutex_exit(&vfs_list_lock); 2118 sysctl_relock(); 2119 *oldlenp = needed; 2120 return (error); 2121 } 2122 2123 2124 int kinfo_vdebug = 1; 2125 int kinfo_vgetfailed; 2126 #define KINFO_VNODESLOP 10 2127 /* 2128 * Dump vnode list (via sysctl). 2129 * Copyout address of vnode followed by vnode. 2130 */ 2131 /* ARGSUSED */ 2132 int 2133 sysctl_kern_vnode(SYSCTLFN_ARGS) 2134 { 2135 char *where = oldp; 2136 size_t *sizep = oldlenp; 2137 struct mount *mp, *nmp; 2138 vnode_t *vp, *mvp, vbuf; 2139 char *bp = where, *savebp; 2140 char *ewhere; 2141 int error; 2142 2143 if (namelen != 0) 2144 return (EOPNOTSUPP); 2145 if (newp != NULL) 2146 return (EPERM); 2147 2148 #define VPTRSZ sizeof(vnode_t *) 2149 #define VNODESZ sizeof(vnode_t) 2150 if (where == NULL) { 2151 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ); 2152 return (0); 2153 } 2154 ewhere = where + *sizep; 2155 2156 sysctl_unlock(); 2157 mutex_enter(&mountlist_lock); 2158 for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; 2159 mp = nmp) { 2160 if (vfs_busy(mp, &nmp)) { 2161 continue; 2162 } 2163 savebp = bp; 2164 /* Allocate a marker vnode. */ 2165 mvp = vnalloc(mp); 2166 /* Should never fail for mp != NULL */ 2167 KASSERT(mvp != NULL); 2168 mutex_enter(&mntvnode_lock); 2169 for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) { 2170 vmark(mvp, vp); 2171 /* 2172 * Check that the vp is still associated with 2173 * this filesystem. RACE: could have been 2174 * recycled onto the same filesystem. 
2175 */ 2176 if (vp->v_mount != mp || vismarker(vp)) 2177 continue; 2178 if (bp + VPTRSZ + VNODESZ > ewhere) { 2179 (void)vunmark(mvp); 2180 mutex_exit(&mntvnode_lock); 2181 vnfree(mvp); 2182 sysctl_relock(); 2183 *sizep = bp - where; 2184 return (ENOMEM); 2185 } 2186 memcpy(&vbuf, vp, VNODESZ); 2187 mutex_exit(&mntvnode_lock); 2188 if ((error = copyout(&vp, bp, VPTRSZ)) || 2189 (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) { 2190 mutex_enter(&mntvnode_lock); 2191 (void)vunmark(mvp); 2192 mutex_exit(&mntvnode_lock); 2193 vnfree(mvp); 2194 sysctl_relock(); 2195 return (error); 2196 } 2197 bp += VPTRSZ + VNODESZ; 2198 mutex_enter(&mntvnode_lock); 2199 } 2200 mutex_exit(&mntvnode_lock); 2201 vnfree(mvp); 2202 vfs_unbusy(mp, false, &nmp); 2203 } 2204 mutex_exit(&mountlist_lock); 2205 sysctl_relock(); 2206 2207 *sizep = bp - where; 2208 return (0); 2209 } 2210 2211 /* 2212 * Remove clean vnodes from a mountpoint's vnode list. 2213 */ 2214 void 2215 vfs_scrubvnlist(struct mount *mp) 2216 { 2217 vnode_t *vp, *nvp; 2218 2219 retry: 2220 mutex_enter(&mntvnode_lock); 2221 for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) { 2222 nvp = TAILQ_NEXT(vp, v_mntvnodes); 2223 mutex_enter(&vp->v_interlock); 2224 if ((vp->v_iflag & VI_CLEAN) != 0) { 2225 TAILQ_REMOVE(&mp->mnt_vnodelist, vp, v_mntvnodes); 2226 vp->v_mount = NULL; 2227 mutex_exit(&mntvnode_lock); 2228 mutex_exit(&vp->v_interlock); 2229 vfs_destroy(mp); 2230 goto retry; 2231 } 2232 mutex_exit(&vp->v_interlock); 2233 } 2234 mutex_exit(&mntvnode_lock); 2235 } 2236 2237 /* 2238 * Check to see if a filesystem is mounted on a block device. 2239 */ 2240 int 2241 vfs_mountedon(vnode_t *vp) 2242 { 2243 vnode_t *vq; 2244 int error = 0; 2245 2246 if (vp->v_type != VBLK) 2247 return ENOTBLK; 2248 if (vp->v_specmountpoint != NULL) 2249 return (EBUSY); 2250 mutex_enter(&device_lock); 2251 for (vq = specfs_hash[SPECHASH(vp->v_rdev)]; vq != NULL; 2252 vq = vq->v_specnext) { 2253 if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) 2254 continue; 2255 if (vq->v_specmountpoint != NULL) { 2256 error = EBUSY; 2257 break; 2258 } 2259 } 2260 mutex_exit(&device_lock); 2261 return (error); 2262 } 2263 2264 /* 2265 * Unmount all file systems. 2266 * We traverse the list in reverse order under the assumption that doing so 2267 * will avoid needing to worry about dependencies. 
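 * (New mounts are appended to the tail of mountlist, so a reverse
 * traversal tends to unmount a file system before the file system it
 * is mounted on.)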
2268 */ 2269 bool 2270 vfs_unmountall(struct lwp *l) 2271 { 2272 printf("unmounting file systems..."); 2273 return vfs_unmountall1(l, true, true); 2274 } 2275 2276 static void 2277 vfs_unmount_print(struct mount *mp, const char *pfx) 2278 { 2279 printf("%sunmounted %s on %s type %s\n", pfx, 2280 mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, 2281 mp->mnt_stat.f_fstypename); 2282 } 2283 2284 bool 2285 vfs_unmount_forceone(struct lwp *l) 2286 { 2287 struct mount *mp, *nmp = NULL; 2288 int error; 2289 2290 CIRCLEQ_FOREACH_REVERSE(mp, &mountlist, mnt_list) { 2291 if (nmp == NULL || mp->mnt_gen > nmp->mnt_gen) 2292 nmp = mp; 2293 } 2294 2295 if (nmp == NULL) 2296 return false; 2297 2298 #ifdef DEBUG 2299 printf("\nforcefully unmounting %s (%s)...", 2300 nmp->mnt_stat.f_mntonname, nmp->mnt_stat.f_mntfromname); 2301 #endif 2302 atomic_inc_uint(&nmp->mnt_refcnt); 2303 if ((error = dounmount(nmp, MNT_FORCE, l)) == 0) { 2304 vfs_unmount_print(nmp, "forcefully "); 2305 return true; 2306 } else 2307 atomic_dec_uint(&nmp->mnt_refcnt); 2308 2309 #ifdef DEBUG 2310 printf("forceful unmount of %s failed with error %d\n", 2311 nmp->mnt_stat.f_mntonname, error); 2312 #endif 2313 2314 return false; 2315 } 2316 2317 bool 2318 vfs_unmountall1(struct lwp *l, bool force, bool verbose) 2319 { 2320 struct mount *mp, *nmp; 2321 bool any_error = false, progress = false; 2322 int error; 2323 2324 for (mp = CIRCLEQ_LAST(&mountlist); 2325 mp != (void *)&mountlist; 2326 mp = nmp) { 2327 nmp = CIRCLEQ_PREV(mp, mnt_list); 2328 #ifdef DEBUG 2329 printf("\nunmounting %p %s (%s)...", 2330 (void *)mp, mp->mnt_stat.f_mntonname, 2331 mp->mnt_stat.f_mntfromname); 2332 #endif 2333 atomic_inc_uint(&mp->mnt_refcnt); 2334 if ((error = dounmount(mp, force ? MNT_FORCE : 0, l)) == 0) { 2335 vfs_unmount_print(mp, ""); 2336 progress = true; 2337 } else { 2338 atomic_dec_uint(&mp->mnt_refcnt); 2339 if (verbose) { 2340 printf("unmount of %s failed with error %d\n", 2341 mp->mnt_stat.f_mntonname, error); 2342 } 2343 any_error = true; 2344 } 2345 } 2346 if (verbose) 2347 printf(" done\n"); 2348 if (any_error && verbose) 2349 printf("WARNING: some file systems would not unmount\n"); 2350 return progress; 2351 } 2352 2353 /* 2354 * Sync and unmount file systems before shutting down. 2355 */ 2356 void 2357 vfs_shutdown(void) 2358 { 2359 struct lwp *l; 2360 2361 /* XXX we're certainly not running in lwp0's context! */ 2362 l = (curlwp == NULL) ? &lwp0 : curlwp; 2363 2364 vfs_shutdown1(l); 2365 } 2366 2367 void 2368 vfs_sync_all(struct lwp *l) 2369 { 2370 printf("syncing disks... "); 2371 2372 /* remove user processes from run queue */ 2373 suspendsched(); 2374 (void) spl0(); 2375 2376 /* avoid coming back this way again if we panic. */ 2377 doing_shutdown = 1; 2378 2379 sys_sync(l, NULL, NULL); 2380 2381 /* Wait for sync to finish. */ 2382 if (buf_syncwait() != 0) { 2383 #if defined(DDB) && defined(DEBUG_HALT_BUSY) 2384 Debugger(); 2385 #endif 2386 printf("giving up\n"); 2387 return; 2388 } else 2389 printf("done\n"); 2390 } 2391 2392 static void 2393 vfs_shutdown1(struct lwp *l) 2394 { 2395 2396 vfs_sync_all(l); 2397 2398 /* 2399 * If we've panic'd, don't make the situation potentially 2400 * worse by unmounting the file systems. 2401 */ 2402 if (panicstr != NULL) 2403 return; 2404 2405 /* Release inodes held by texts before update. */ 2406 #ifdef notdef 2407 vnshutdown(); 2408 #endif 2409 /* Unmount file systems. */ 2410 vfs_unmountall(l); 2411 } 2412 2413 /* 2414 * Mount the root file system. 
If the operator didn't specify a 2415 * file system to use, try all possible file systems until one 2416 * succeeds. 2417 */ 2418 int 2419 vfs_mountroot(void) 2420 { 2421 struct vfsops *v; 2422 int error = ENODEV; 2423 2424 if (root_device == NULL) 2425 panic("vfs_mountroot: root device unknown"); 2426 2427 switch (device_class(root_device)) { 2428 case DV_IFNET: 2429 if (rootdev != NODEV) 2430 panic("vfs_mountroot: rootdev set for DV_IFNET " 2431 "(0x%llx -> %llu,%llu)", 2432 (unsigned long long)rootdev, 2433 (unsigned long long)major(rootdev), 2434 (unsigned long long)minor(rootdev)); 2435 break; 2436 2437 case DV_DISK: 2438 if (rootdev == NODEV) 2439 panic("vfs_mountroot: rootdev not set for DV_DISK"); 2440 if (bdevvp(rootdev, &rootvp)) 2441 panic("vfs_mountroot: can't get vnode for rootdev"); 2442 error = VOP_OPEN(rootvp, FREAD, FSCRED); 2443 if (error) { 2444 printf("vfs_mountroot: can't open root device\n"); 2445 return (error); 2446 } 2447 break; 2448 2449 default: 2450 printf("%s: inappropriate for root file system\n", 2451 device_xname(root_device)); 2452 return (ENODEV); 2453 } 2454 2455 /* 2456 * If user specified a root fs type, use it. Make sure the 2457 * specified type exists and has a mount_root() 2458 */ 2459 if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) { 2460 v = vfs_getopsbyname(rootfstype); 2461 error = EFTYPE; 2462 if (v != NULL) { 2463 if (v->vfs_mountroot != NULL) { 2464 error = (v->vfs_mountroot)(); 2465 } 2466 v->vfs_refcount--; 2467 } 2468 goto done; 2469 } 2470 2471 /* 2472 * Try each file system currently configured into the kernel. 2473 */ 2474 mutex_enter(&vfs_list_lock); 2475 LIST_FOREACH(v, &vfs_list, vfs_list) { 2476 if (v->vfs_mountroot == NULL) 2477 continue; 2478 #ifdef DEBUG 2479 aprint_normal("mountroot: trying %s...\n", v->vfs_name); 2480 #endif 2481 v->vfs_refcount++; 2482 mutex_exit(&vfs_list_lock); 2483 error = (*v->vfs_mountroot)(); 2484 mutex_enter(&vfs_list_lock); 2485 v->vfs_refcount--; 2486 if (!error) { 2487 aprint_normal("root file system type: %s\n", 2488 v->vfs_name); 2489 break; 2490 } 2491 } 2492 mutex_exit(&vfs_list_lock); 2493 2494 if (v == NULL) { 2495 printf("no file system for %s", device_xname(root_device)); 2496 if (device_class(root_device) == DV_DISK) 2497 printf(" (dev 0x%llx)", (unsigned long long)rootdev); 2498 printf("\n"); 2499 error = EFTYPE; 2500 } 2501 2502 done: 2503 if (error && device_class(root_device) == DV_DISK) { 2504 VOP_CLOSE(rootvp, FREAD, FSCRED); 2505 vrele(rootvp); 2506 } 2507 return (error); 2508 } 2509 2510 /* 2511 * Get a new unique fsid 2512 */ 2513 void 2514 vfs_getnewfsid(struct mount *mp) 2515 { 2516 static u_short xxxfs_mntid; 2517 fsid_t tfsid; 2518 int mtype; 2519 2520 mutex_enter(&mntid_lock); 2521 mtype = makefstype(mp->mnt_op->vfs_name); 2522 mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0); 2523 mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype; 2524 mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; 2525 if (xxxfs_mntid == 0) 2526 ++xxxfs_mntid; 2527 tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid); 2528 tfsid.__fsid_val[1] = mtype; 2529 if (!CIRCLEQ_EMPTY(&mountlist)) { 2530 while (vfs_getvfs(&tfsid)) { 2531 tfsid.__fsid_val[0]++; 2532 xxxfs_mntid++; 2533 } 2534 } 2535 mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0]; 2536 mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; 2537 mutex_exit(&mntid_lock); 2538 } 2539 2540 /* 2541 * Make a 'unique' number from a mount type name. 
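 *
 * For example, makefstype("ffs") folds the bytes as
 *	((('f' << 2) ^ 'f') << 2) ^ 's' == 0x78b,
 * so the value depends on both the characters and their order; distinct
 * names usually, though not necessarily, map to distinct values.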
2542 */ 2543 long 2544 makefstype(const char *type) 2545 { 2546 long rv; 2547 2548 for (rv = 0; *type; type++) { 2549 rv <<= 2; 2550 rv ^= *type; 2551 } 2552 return rv; 2553 } 2554 2555 /* 2556 * Set vnode attributes to VNOVAL 2557 */ 2558 void 2559 vattr_null(struct vattr *vap) 2560 { 2561 2562 vap->va_type = VNON; 2563 2564 /* 2565 * Assign individually so that it is safe even if size and 2566 * sign of each member are varied. 2567 */ 2568 vap->va_mode = VNOVAL; 2569 vap->va_nlink = VNOVAL; 2570 vap->va_uid = VNOVAL; 2571 vap->va_gid = VNOVAL; 2572 vap->va_fsid = VNOVAL; 2573 vap->va_fileid = VNOVAL; 2574 vap->va_size = VNOVAL; 2575 vap->va_blocksize = VNOVAL; 2576 vap->va_atime.tv_sec = 2577 vap->va_mtime.tv_sec = 2578 vap->va_ctime.tv_sec = 2579 vap->va_birthtime.tv_sec = VNOVAL; 2580 vap->va_atime.tv_nsec = 2581 vap->va_mtime.tv_nsec = 2582 vap->va_ctime.tv_nsec = 2583 vap->va_birthtime.tv_nsec = VNOVAL; 2584 vap->va_gen = VNOVAL; 2585 vap->va_flags = VNOVAL; 2586 vap->va_rdev = VNOVAL; 2587 vap->va_bytes = VNOVAL; 2588 vap->va_vaflags = 0; 2589 } 2590 2591 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) 2592 #define ARRAY_PRINT(idx, arr) \ 2593 ((unsigned int)(idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN") 2594 2595 const char * const vnode_tags[] = { VNODE_TAGS }; 2596 const char * const vnode_types[] = { VNODE_TYPES }; 2597 const char vnode_flagbits[] = VNODE_FLAGBITS; 2598 2599 /* 2600 * Print out a description of a vnode. 2601 */ 2602 void 2603 vprint(const char *label, struct vnode *vp) 2604 { 2605 struct vnlock *vl; 2606 char bf[96]; 2607 int flag; 2608 2609 vl = (vp->v_vnlock != NULL ? vp->v_vnlock : &vp->v_lock); 2610 flag = vp->v_iflag | vp->v_vflag | vp->v_uflag; 2611 snprintb(bf, sizeof(bf), vnode_flagbits, flag); 2612 2613 if (label != NULL) 2614 printf("%s: ", label); 2615 printf("vnode @ %p, flags (%s)\n\ttag %s(%d), type %s(%d), " 2616 "usecount %d, writecount %d, holdcount %d\n" 2617 "\tfreelisthd %p, mount %p, data %p lock %p recursecnt %d\n", 2618 vp, bf, ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag, 2619 ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type, 2620 vp->v_usecount, vp->v_writecount, vp->v_holdcnt, 2621 vp->v_freelisthd, vp->v_mount, vp->v_data, vl, vl->vl_recursecnt); 2622 if (vp->v_data != NULL) { 2623 printf("\t"); 2624 VOP_PRINT(vp); 2625 } 2626 } 2627 2628 #ifdef DEBUG 2629 /* 2630 * List all of the locked vnodes in the system. 2631 * Called when debugging the kernel. 2632 */ 2633 void 2634 printlockedvnodes(void) 2635 { 2636 struct mount *mp, *nmp; 2637 struct vnode *vp; 2638 2639 printf("Locked vnodes\n"); 2640 mutex_enter(&mountlist_lock); 2641 for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; 2642 mp = nmp) { 2643 if (vfs_busy(mp, &nmp)) { 2644 continue; 2645 } 2646 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 2647 if (VOP_ISLOCKED(vp)) 2648 vprint(NULL, vp); 2649 } 2650 mutex_enter(&mountlist_lock); 2651 vfs_unbusy(mp, false, &nmp); 2652 } 2653 mutex_exit(&mountlist_lock); 2654 } 2655 #endif 2656 2657 /* Deprecated. Kept for KPI compatibility. 
*/ 2658 int 2659 vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid, 2660 mode_t acc_mode, kauth_cred_t cred) 2661 { 2662 2663 #ifdef DIAGNOSTIC 2664 printf("vaccess: deprecated interface used.\n"); 2665 #endif /* DIAGNOSTIC */ 2666 2667 return genfs_can_access(type, file_mode, uid, gid, acc_mode, cred); 2668 } 2669 2670 /* 2671 * Given a file system name, look up the vfsops for that 2672 * file system, or return NULL if file system isn't present 2673 * in the kernel. 2674 */ 2675 struct vfsops * 2676 vfs_getopsbyname(const char *name) 2677 { 2678 struct vfsops *v; 2679 2680 mutex_enter(&vfs_list_lock); 2681 LIST_FOREACH(v, &vfs_list, vfs_list) { 2682 if (strcmp(v->vfs_name, name) == 0) 2683 break; 2684 } 2685 if (v != NULL) 2686 v->vfs_refcount++; 2687 mutex_exit(&vfs_list_lock); 2688 2689 return (v); 2690 } 2691 2692 void 2693 copy_statvfs_info(struct statvfs *sbp, const struct mount *mp) 2694 { 2695 const struct statvfs *mbp; 2696 2697 if (sbp == (mbp = &mp->mnt_stat)) 2698 return; 2699 2700 (void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx)); 2701 sbp->f_fsid = mbp->f_fsid; 2702 sbp->f_owner = mbp->f_owner; 2703 sbp->f_flag = mbp->f_flag; 2704 sbp->f_syncwrites = mbp->f_syncwrites; 2705 sbp->f_asyncwrites = mbp->f_asyncwrites; 2706 sbp->f_syncreads = mbp->f_syncreads; 2707 sbp->f_asyncreads = mbp->f_asyncreads; 2708 (void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare)); 2709 (void)memcpy(sbp->f_fstypename, mbp->f_fstypename, 2710 sizeof(sbp->f_fstypename)); 2711 (void)memcpy(sbp->f_mntonname, mbp->f_mntonname, 2712 sizeof(sbp->f_mntonname)); 2713 (void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname, 2714 sizeof(sbp->f_mntfromname)); 2715 sbp->f_namemax = mbp->f_namemax; 2716 } 2717 2718 int 2719 set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom, 2720 const char *vfsname, struct mount *mp, struct lwp *l) 2721 { 2722 int error; 2723 size_t size; 2724 struct statvfs *sfs = &mp->mnt_stat; 2725 int (*fun)(const void *, void *, size_t, size_t *); 2726 2727 (void)strlcpy(mp->mnt_stat.f_fstypename, vfsname, 2728 sizeof(mp->mnt_stat.f_fstypename)); 2729 2730 if (onp) { 2731 struct cwdinfo *cwdi = l->l_proc->p_cwdi; 2732 fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr; 2733 if (cwdi->cwdi_rdir != NULL) { 2734 size_t len; 2735 char *bp; 2736 char *path = PNBUF_GET(); 2737 2738 bp = path + MAXPATHLEN; 2739 *--bp = '\0'; 2740 rw_enter(&cwdi->cwdi_lock, RW_READER); 2741 error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, 2742 path, MAXPATHLEN / 2, 0, l); 2743 rw_exit(&cwdi->cwdi_lock); 2744 if (error) { 2745 PNBUF_PUT(path); 2746 return error; 2747 } 2748 2749 len = strlen(bp); 2750 if (len > sizeof(sfs->f_mntonname) - 1) 2751 len = sizeof(sfs->f_mntonname) - 1; 2752 (void)strncpy(sfs->f_mntonname, bp, len); 2753 PNBUF_PUT(path); 2754 2755 if (len < sizeof(sfs->f_mntonname) - 1) { 2756 error = (*fun)(onp, &sfs->f_mntonname[len], 2757 sizeof(sfs->f_mntonname) - len - 1, &size); 2758 if (error) 2759 return error; 2760 size += len; 2761 } else { 2762 size = len; 2763 } 2764 } else { 2765 error = (*fun)(onp, &sfs->f_mntonname, 2766 sizeof(sfs->f_mntonname) - 1, &size); 2767 if (error) 2768 return error; 2769 } 2770 (void)memset(sfs->f_mntonname + size, 0, 2771 sizeof(sfs->f_mntonname) - size); 2772 } 2773 2774 if (fromp) { 2775 fun = (ukfrom == UIO_SYSSPACE) ? 
copystr : copyinstr; 2776 error = (*fun)(fromp, sfs->f_mntfromname, 2777 sizeof(sfs->f_mntfromname) - 1, &size); 2778 if (error) 2779 return error; 2780 (void)memset(sfs->f_mntfromname + size, 0, 2781 sizeof(sfs->f_mntfromname) - size); 2782 } 2783 return 0; 2784 } 2785 2786 void 2787 vfs_timestamp(struct timespec *ts) 2788 { 2789 2790 nanotime(ts); 2791 } 2792 2793 time_t rootfstime; /* recorded root fs time, if known */ 2794 void 2795 setrootfstime(time_t t) 2796 { 2797 rootfstime = t; 2798 } 2799 2800 /* 2801 * Sham lock manager for vnodes. This is a temporary measure. 2802 */ 2803 int 2804 vlockmgr(struct vnlock *vl, int flags) 2805 { 2806 2807 KASSERT((flags & ~(LK_CANRECURSE | LK_NOWAIT | LK_TYPE_MASK)) == 0); 2808 2809 switch (flags & LK_TYPE_MASK) { 2810 case LK_SHARED: 2811 if (rw_tryenter(&vl->vl_lock, RW_READER)) { 2812 return 0; 2813 } 2814 if ((flags & LK_NOWAIT) != 0) { 2815 return EBUSY; 2816 } 2817 rw_enter(&vl->vl_lock, RW_READER); 2818 return 0; 2819 2820 case LK_EXCLUSIVE: 2821 if (rw_tryenter(&vl->vl_lock, RW_WRITER)) { 2822 return 0; 2823 } 2824 if ((vl->vl_canrecurse || (flags & LK_CANRECURSE) != 0) && 2825 rw_write_held(&vl->vl_lock)) { 2826 vl->vl_recursecnt++; 2827 return 0; 2828 } 2829 if ((flags & LK_NOWAIT) != 0) { 2830 return EBUSY; 2831 } 2832 rw_enter(&vl->vl_lock, RW_WRITER); 2833 return 0; 2834 2835 case LK_RELEASE: 2836 if (vl->vl_recursecnt != 0) { 2837 KASSERT(rw_write_held(&vl->vl_lock)); 2838 vl->vl_recursecnt--; 2839 return 0; 2840 } 2841 rw_exit(&vl->vl_lock); 2842 return 0; 2843 2844 default: 2845 panic("vlockmgr: flags %x", flags); 2846 } 2847 } 2848 2849 int 2850 vlockstatus(struct vnlock *vl) 2851 { 2852 2853 if (rw_write_held(&vl->vl_lock)) { 2854 return LK_EXCLUSIVE; 2855 } 2856 if (rw_read_held(&vl->vl_lock)) { 2857 return LK_SHARED; 2858 } 2859 return 0; 2860 } 2861 2862 /* 2863 * mount_specific_key_create -- 2864 * Create a key for subsystem mount-specific data. 2865 */ 2866 int 2867 mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor) 2868 { 2869 2870 return (specificdata_key_create(mount_specificdata_domain, keyp, dtor)); 2871 } 2872 2873 /* 2874 * mount_specific_key_delete -- 2875 * Delete a key for subsystem mount-specific data. 2876 */ 2877 void 2878 mount_specific_key_delete(specificdata_key_t key) 2879 { 2880 2881 specificdata_key_delete(mount_specificdata_domain, key); 2882 } 2883 2884 /* 2885 * mount_initspecific -- 2886 * Initialize a mount's specificdata container. 2887 */ 2888 void 2889 mount_initspecific(struct mount *mp) 2890 { 2891 int error; 2892 2893 error = specificdata_init(mount_specificdata_domain, 2894 &mp->mnt_specdataref); 2895 KASSERT(error == 0); 2896 } 2897 2898 /* 2899 * mount_finispecific -- 2900 * Finalize a mount's specificdata container. 2901 */ 2902 void 2903 mount_finispecific(struct mount *mp) 2904 { 2905 2906 specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref); 2907 } 2908 2909 /* 2910 * mount_getspecific -- 2911 * Return mount-specific data corresponding to the specified key. 2912 */ 2913 void * 2914 mount_getspecific(struct mount *mp, specificdata_key_t key) 2915 { 2916 2917 return (specificdata_getspecific(mount_specificdata_domain, 2918 &mp->mnt_specdataref, key)); 2919 } 2920 2921 /* 2922 * mount_setspecific -- 2923 * Set mount-specific data corresponding to the specified key. 
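 *
 * A minimal usage sketch of the mount_*specific interfaces above and
 * below (the key and destructor names are hypothetical):
 *
 *	static specificdata_key_t example_key;
 *
 *	error = mount_specific_key_create(&example_key, example_dtor);
 *	mount_setspecific(mp, example_key, data);
 *	data = mount_getspecific(mp, example_key);
 *	mount_specific_key_delete(example_key);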
2924 */ 2925 void 2926 mount_setspecific(struct mount *mp, specificdata_key_t key, void *data) 2927 { 2928 2929 specificdata_setspecific(mount_specificdata_domain, 2930 &mp->mnt_specdataref, key, data); 2931 } 2932 2933 int 2934 VFS_MOUNT(struct mount *mp, const char *a, void *b, size_t *c) 2935 { 2936 int error; 2937 2938 KERNEL_LOCK(1, NULL); 2939 error = (*(mp->mnt_op->vfs_mount))(mp, a, b, c); 2940 KERNEL_UNLOCK_ONE(NULL); 2941 2942 return error; 2943 } 2944 2945 int 2946 VFS_START(struct mount *mp, int a) 2947 { 2948 int error; 2949 2950 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2951 KERNEL_LOCK(1, NULL); 2952 } 2953 error = (*(mp->mnt_op->vfs_start))(mp, a); 2954 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2955 KERNEL_UNLOCK_ONE(NULL); 2956 } 2957 2958 return error; 2959 } 2960 2961 int 2962 VFS_UNMOUNT(struct mount *mp, int a) 2963 { 2964 int error; 2965 2966 KERNEL_LOCK(1, NULL); 2967 error = (*(mp->mnt_op->vfs_unmount))(mp, a); 2968 KERNEL_UNLOCK_ONE(NULL); 2969 2970 return error; 2971 } 2972 2973 int 2974 VFS_ROOT(struct mount *mp, struct vnode **a) 2975 { 2976 int error; 2977 2978 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2979 KERNEL_LOCK(1, NULL); 2980 } 2981 error = (*(mp->mnt_op->vfs_root))(mp, a); 2982 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2983 KERNEL_UNLOCK_ONE(NULL); 2984 } 2985 2986 return error; 2987 } 2988 2989 int 2990 VFS_QUOTACTL(struct mount *mp, int a, uid_t b, void *c) 2991 { 2992 int error; 2993 2994 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2995 KERNEL_LOCK(1, NULL); 2996 } 2997 error = (*(mp->mnt_op->vfs_quotactl))(mp, a, b, c); 2998 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2999 KERNEL_UNLOCK_ONE(NULL); 3000 } 3001 3002 return error; 3003 } 3004 3005 int 3006 VFS_STATVFS(struct mount *mp, struct statvfs *a) 3007 { 3008 int error; 3009 3010 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3011 KERNEL_LOCK(1, NULL); 3012 } 3013 error = (*(mp->mnt_op->vfs_statvfs))(mp, a); 3014 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3015 KERNEL_UNLOCK_ONE(NULL); 3016 } 3017 3018 return error; 3019 } 3020 3021 int 3022 VFS_SYNC(struct mount *mp, int a, struct kauth_cred *b) 3023 { 3024 int error; 3025 3026 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3027 KERNEL_LOCK(1, NULL); 3028 } 3029 error = (*(mp->mnt_op->vfs_sync))(mp, a, b); 3030 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3031 KERNEL_UNLOCK_ONE(NULL); 3032 } 3033 3034 return error; 3035 } 3036 3037 int 3038 VFS_FHTOVP(struct mount *mp, struct fid *a, struct vnode **b) 3039 { 3040 int error; 3041 3042 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3043 KERNEL_LOCK(1, NULL); 3044 } 3045 error = (*(mp->mnt_op->vfs_fhtovp))(mp, a, b); 3046 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3047 KERNEL_UNLOCK_ONE(NULL); 3048 } 3049 3050 return error; 3051 } 3052 3053 int 3054 VFS_VPTOFH(struct vnode *vp, struct fid *a, size_t *b) 3055 { 3056 int error; 3057 3058 if ((vp->v_vflag & VV_MPSAFE) == 0) { 3059 KERNEL_LOCK(1, NULL); 3060 } 3061 error = (*(vp->v_mount->mnt_op->vfs_vptofh))(vp, a, b); 3062 if ((vp->v_vflag & VV_MPSAFE) == 0) { 3063 KERNEL_UNLOCK_ONE(NULL); 3064 } 3065 3066 return error; 3067 } 3068 3069 int 3070 VFS_SNAPSHOT(struct mount *mp, struct vnode *a, struct timespec *b) 3071 { 3072 int error; 3073 3074 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3075 KERNEL_LOCK(1, NULL); 3076 } 3077 error = (*(mp->mnt_op->vfs_snapshot))(mp, a, b); 3078 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3079 KERNEL_UNLOCK_ONE(NULL); 3080 } 3081 3082 return error; 3083 } 3084 3085 int 3086 VFS_EXTATTRCTL(struct mount *mp, int a, struct vnode *b, int c, const 
char *d) 3087 { 3088 int error; 3089 3090 KERNEL_LOCK(1, NULL); /* XXXSMP check ffs */ 3091 error = (*(mp->mnt_op->vfs_extattrctl))(mp, a, b, c, d); 3092 KERNEL_UNLOCK_ONE(NULL); /* XXX */ 3093 3094 return error; 3095 } 3096 3097 int 3098 VFS_SUSPENDCTL(struct mount *mp, int a) 3099 { 3100 int error; 3101 3102 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3103 KERNEL_LOCK(1, NULL); 3104 } 3105 error = (*(mp->mnt_op->vfs_suspendctl))(mp, a); 3106 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3107 KERNEL_UNLOCK_ONE(NULL); 3108 } 3109 3110 return error; 3111 } 3112 3113 #if defined(DDB) || defined(DEBUGPRINT) 3114 static const char buf_flagbits[] = BUF_FLAGBITS; 3115 3116 void 3117 vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...)) 3118 { 3119 char bf[1024]; 3120 3121 (*pr)(" vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" rawblkno 0x%" 3122 PRIx64 " dev 0x%x\n", 3123 bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_rawblkno, bp->b_dev); 3124 3125 snprintb(bf, sizeof(bf), 3126 buf_flagbits, bp->b_flags | bp->b_oflags | bp->b_cflags); 3127 (*pr)(" error %d flags 0x%s\n", bp->b_error, bf); 3128 3129 (*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n", 3130 bp->b_bufsize, bp->b_bcount, bp->b_resid); 3131 (*pr)(" data %p saveaddr %p\n", 3132 bp->b_data, bp->b_saveaddr); 3133 (*pr)(" iodone %p objlock %p\n", bp->b_iodone, bp->b_objlock); 3134 } 3135 3136 3137 void 3138 vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...)) 3139 { 3140 char bf[256]; 3141 3142 uvm_object_printit(&vp->v_uobj, full, pr); 3143 snprintb(bf, sizeof(bf), 3144 vnode_flagbits, vp->v_iflag | vp->v_vflag | vp->v_uflag); 3145 (*pr)("\nVNODE flags %s\n", bf); 3146 (*pr)("mp %p numoutput %d size 0x%llx writesize 0x%llx\n", 3147 vp->v_mount, vp->v_numoutput, vp->v_size, vp->v_writesize); 3148 3149 (*pr)("data %p writecount %ld holdcnt %ld\n", 3150 vp->v_data, vp->v_writecount, vp->v_holdcnt); 3151 3152 (*pr)("tag %s(%d) type %s(%d) mount %p typedata %p\n", 3153 ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag, 3154 ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type, 3155 vp->v_mount, vp->v_mountedhere); 3156 3157 (*pr)("v_lock %p v_vnlock %p\n", &vp->v_lock, vp->v_vnlock); 3158 3159 if (full) { 3160 struct buf *bp; 3161 3162 (*pr)("clean bufs:\n"); 3163 LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) { 3164 (*pr)(" bp %p\n", bp); 3165 vfs_buf_print(bp, full, pr); 3166 } 3167 3168 (*pr)("dirty bufs:\n"); 3169 LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) { 3170 (*pr)(" bp %p\n", bp); 3171 vfs_buf_print(bp, full, pr); 3172 } 3173 } 3174 } 3175 3176 void 3177 vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...)) 3178 { 3179 char sbuf[256]; 3180 3181 (*pr)("vnodecovered = %p syncer = %p data = %p\n", 3182 mp->mnt_vnodecovered,mp->mnt_syncer,mp->mnt_data); 3183 3184 (*pr)("fs_bshift %d dev_bshift = %d\n", 3185 mp->mnt_fs_bshift,mp->mnt_dev_bshift); 3186 3187 snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_flag); 3188 (*pr)("flag = %s\n", sbuf); 3189 3190 snprintb(sbuf, sizeof(sbuf), __IMNT_FLAG_BITS, mp->mnt_iflag); 3191 (*pr)("iflag = %s\n", sbuf); 3192 3193 (*pr)("refcnt = %d unmounting @ %p updating @ %p\n", mp->mnt_refcnt, 3194 &mp->mnt_unmounting, &mp->mnt_updating); 3195 3196 (*pr)("statvfs cache:\n"); 3197 (*pr)("\tbsize = %lu\n",mp->mnt_stat.f_bsize); 3198 (*pr)("\tfrsize = %lu\n",mp->mnt_stat.f_frsize); 3199 (*pr)("\tiosize = %lu\n",mp->mnt_stat.f_iosize); 3200 3201 (*pr)("\tblocks = %"PRIu64"\n",mp->mnt_stat.f_blocks); 3202 (*pr)("\tbfree = %"PRIu64"\n",mp->mnt_stat.f_bfree); 3203 
(*pr)("\tbavail = %"PRIu64"\n",mp->mnt_stat.f_bavail); 3204 (*pr)("\tbresvd = %"PRIu64"\n",mp->mnt_stat.f_bresvd); 3205 3206 (*pr)("\tfiles = %"PRIu64"\n",mp->mnt_stat.f_files); 3207 (*pr)("\tffree = %"PRIu64"\n",mp->mnt_stat.f_ffree); 3208 (*pr)("\tfavail = %"PRIu64"\n",mp->mnt_stat.f_favail); 3209 (*pr)("\tfresvd = %"PRIu64"\n",mp->mnt_stat.f_fresvd); 3210 3211 (*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n", 3212 mp->mnt_stat.f_fsidx.__fsid_val[0], 3213 mp->mnt_stat.f_fsidx.__fsid_val[1]); 3214 3215 (*pr)("\towner = %"PRIu32"\n",mp->mnt_stat.f_owner); 3216 (*pr)("\tnamemax = %lu\n",mp->mnt_stat.f_namemax); 3217 3218 snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_stat.f_flag); 3219 3220 (*pr)("\tflag = %s\n",sbuf); 3221 (*pr)("\tsyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_syncwrites); 3222 (*pr)("\tasyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_asyncwrites); 3223 (*pr)("\tsyncreads = %" PRIu64 "\n",mp->mnt_stat.f_syncreads); 3224 (*pr)("\tasyncreads = %" PRIu64 "\n",mp->mnt_stat.f_asyncreads); 3225 (*pr)("\tfstypename = %s\n",mp->mnt_stat.f_fstypename); 3226 (*pr)("\tmntonname = %s\n",mp->mnt_stat.f_mntonname); 3227 (*pr)("\tmntfromname = %s\n",mp->mnt_stat.f_mntfromname); 3228 3229 { 3230 int cnt = 0; 3231 struct vnode *vp; 3232 (*pr)("locked vnodes ="); 3233 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 3234 if (VOP_ISLOCKED(vp)) { 3235 if ((++cnt % 6) == 0) { 3236 (*pr)(" %p,\n\t", vp); 3237 } else { 3238 (*pr)(" %p,", vp); 3239 } 3240 } 3241 } 3242 (*pr)("\n"); 3243 } 3244 3245 if (full) { 3246 int cnt = 0; 3247 struct vnode *vp; 3248 (*pr)("all vnodes ="); 3249 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 3250 if (!TAILQ_NEXT(vp, v_mntvnodes)) { 3251 (*pr)(" %p", vp); 3252 } else if ((++cnt % 6) == 0) { 3253 (*pr)(" %p,\n\t", vp); 3254 } else { 3255 (*pr)(" %p,", vp); 3256 } 3257 } 3258 (*pr)("\n", vp); 3259 } 3260 } 3261 #endif /* DDB || DEBUGPRINT */ 3262 3263