1 /* $NetBSD: vfs_subr.c,v 1.370 2009/03/30 16:38:05 yamt Exp $ */ 2 3 /*- 4 * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 /* 34 * Copyright (c) 1989, 1993 35 * The Regents of the University of California. All rights reserved. 36 * (c) UNIX System Laboratories, Inc. 37 * All or some portions of this file are derived from material licensed 38 * to the University of California by American Telephone and Telegraph 39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 40 * the permission of UNIX System Laboratories, Inc. 41 * 42 * Redistribution and use in source and binary forms, with or without 43 * modification, are permitted provided that the following conditions 44 * are met: 45 * 1. Redistributions of source code must retain the above copyright 46 * notice, this list of conditions and the following disclaimer. 47 * 2. Redistributions in binary form must reproduce the above copyright 48 * notice, this list of conditions and the following disclaimer in the 49 * documentation and/or other materials provided with the distribution. 50 * 3. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * SUCH DAMAGE. 65 * 66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 67 */ 68 69 /* 70 * Note on v_usecount and locking: 71 * 72 * At nearly all points it is known that v_usecount could be zero, the 73 * vnode interlock will be held. 74 * 75 * To change v_usecount away from zero, the interlock must be held. To 76 * change from a non-zero value to zero, again the interlock must be 77 * held. 78 * 79 * Changing the usecount from a non-zero value to a non-zero value can 80 * safely be done using atomic operations, without the interlock held. 81 */ 82 83 #include <sys/cdefs.h> 84 __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.370 2009/03/30 16:38:05 yamt Exp $"); 85 86 #include "opt_ddb.h" 87 #include "opt_compat_netbsd.h" 88 #include "opt_compat_43.h" 89 90 #include <sys/param.h> 91 #include <sys/systm.h> 92 #include <sys/conf.h> 93 #include <sys/proc.h> 94 #include <sys/kernel.h> 95 #include <sys/mount.h> 96 #include <sys/fcntl.h> 97 #include <sys/vnode.h> 98 #include <sys/stat.h> 99 #include <sys/namei.h> 100 #include <sys/ucred.h> 101 #include <sys/buf.h> 102 #include <sys/errno.h> 103 #include <sys/kmem.h> 104 #include <sys/syscallargs.h> 105 #include <sys/device.h> 106 #include <sys/filedesc.h> 107 #include <sys/kauth.h> 108 #include <sys/atomic.h> 109 #include <sys/kthread.h> 110 #include <sys/wapbl.h> 111 112 #include <miscfs/specfs/specdev.h> 113 #include <miscfs/syncfs/syncfs.h> 114 115 #include <uvm/uvm.h> 116 #include <uvm/uvm_readahead.h> 117 #include <uvm/uvm_ddb.h> 118 119 #include <sys/sysctl.h> 120 121 const enum vtype iftovt_tab[16] = { 122 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 123 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, 124 }; 125 const int vttoif_tab[9] = { 126 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 127 S_IFSOCK, S_IFIFO, S_IFMT, 128 }; 129 130 /* 131 * Insq/Remq for the vnode usage lists. 132 */ 133 #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) 134 #define bufremvn(bp) { \ 135 LIST_REMOVE(bp, b_vnbufs); \ 136 (bp)->b_vnbufs.le_next = NOLIST; \ 137 } 138 139 int doforce = 1; /* 1 => permit forcible unmounting */ 140 int prtactive = 0; /* 1 => print out reclaim of active vnodes */ 141 142 static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list); 143 static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list); 144 static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list); 145 146 struct mntlist mountlist = /* mounted filesystem list */ 147 CIRCLEQ_HEAD_INITIALIZER(mountlist); 148 149 u_int numvnodes; 150 static specificdata_domain_t mount_specificdata_domain; 151 152 static int vrele_pending; 153 static int vrele_gen; 154 static kmutex_t vrele_lock; 155 static kcondvar_t vrele_cv; 156 static lwp_t *vrele_lwp; 157 158 kmutex_t mountlist_lock; 159 kmutex_t mntid_lock; 160 kmutex_t mntvnode_lock; 161 kmutex_t vnode_free_list_lock; 162 kmutex_t vfs_list_lock; 163 164 static pool_cache_t vnode_cache; 165 166 /* 167 * These define the root filesystem and device. 
168 */ 169 struct vnode *rootvnode; 170 struct device *root_device; /* root device */ 171 172 /* 173 * Local declarations. 174 */ 175 176 static void vrele_thread(void *); 177 static void insmntque(vnode_t *, struct mount *); 178 static int getdevvp(dev_t, vnode_t **, enum vtype); 179 static vnode_t *getcleanvnode(void); 180 void vpanic(vnode_t *, const char *); 181 182 #ifdef DEBUG 183 void printlockedvnodes(void); 184 #endif 185 186 #ifdef DIAGNOSTIC 187 void 188 vpanic(vnode_t *vp, const char *msg) 189 { 190 191 vprint(NULL, vp); 192 panic("%s\n", msg); 193 } 194 #else 195 #define vpanic(vp, msg) /* nothing */ 196 #endif 197 198 void 199 vn_init1(void) 200 { 201 202 vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl", 203 NULL, IPL_NONE, NULL, NULL, NULL); 204 KASSERT(vnode_cache != NULL); 205 206 /* Create deferred release thread. */ 207 mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE); 208 cv_init(&vrele_cv, "vrele"); 209 if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread, 210 NULL, &vrele_lwp, "vrele")) 211 panic("fork vrele"); 212 } 213 214 /* 215 * Initialize the vnode management data structures. 216 */ 217 void 218 vntblinit(void) 219 { 220 221 mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE); 222 mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE); 223 mutex_init(&mntvnode_lock, MUTEX_DEFAULT, IPL_NONE); 224 mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE); 225 mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE); 226 227 mount_specificdata_domain = specificdata_domain_create(); 228 229 /* Initialize the filesystem syncer. */ 230 vn_initialize_syncerd(); 231 vn_init1(); 232 } 233 234 int 235 vfs_drainvnodes(long target, struct lwp *l) 236 { 237 238 while (numvnodes > target) { 239 vnode_t *vp; 240 241 mutex_enter(&vnode_free_list_lock); 242 vp = getcleanvnode(); 243 if (vp == NULL) 244 return EBUSY; /* give up */ 245 ungetnewvnode(vp); 246 } 247 248 return 0; 249 } 250 251 /* 252 * Lookup a mount point by filesystem identifier. 253 * 254 * XXX Needs to add a reference to the mount point. 255 */ 256 struct mount * 257 vfs_getvfs(fsid_t *fsid) 258 { 259 struct mount *mp; 260 261 mutex_enter(&mountlist_lock); 262 CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) { 263 if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] && 264 mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) { 265 mutex_exit(&mountlist_lock); 266 return (mp); 267 } 268 } 269 mutex_exit(&mountlist_lock); 270 return ((struct mount *)0); 271 } 272 273 /* 274 * Drop a reference to a mount structure, freeing if the last reference. 275 */ 276 void 277 vfs_destroy(struct mount *mp) 278 { 279 280 if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) { 281 return; 282 } 283 284 /* 285 * Nothing else has visibility of the mount: we can now 286 * free the data structures. 287 */ 288 KASSERT(mp->mnt_refcnt == 0); 289 specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref); 290 rw_destroy(&mp->mnt_unmounting); 291 mutex_destroy(&mp->mnt_updating); 292 mutex_destroy(&mp->mnt_renamelock); 293 if (mp->mnt_op != NULL) { 294 vfs_delref(mp->mnt_op); 295 } 296 kmem_free(mp, sizeof(*mp)); 297 } 298 299 /* 300 * grab a vnode from freelist and clean it. 
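 *
 * An illustrative sketch, not part of the original file: callers enter
 * vnode_free_list_lock themselves, and the lock has always been released
 * again by the time this function returns, as in vfs_drainvnodes() above:
 *
 *	mutex_enter(&vnode_free_list_lock);
 *	vp = getcleanvnode();
 *	if (vp == NULL)
 *		return EBUSY;
 *	ungetnewvnode(vp);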
301 */ 302 vnode_t * 303 getcleanvnode(void) 304 { 305 vnode_t *vp; 306 vnodelst_t *listhd; 307 308 KASSERT(mutex_owned(&vnode_free_list_lock)); 309 310 retry: 311 listhd = &vnode_free_list; 312 try_nextlist: 313 TAILQ_FOREACH(vp, listhd, v_freelist) { 314 /* 315 * It's safe to test v_usecount and v_iflag 316 * without holding the interlock here, since 317 * these vnodes should never appear on the 318 * lists. 319 */ 320 if (vp->v_usecount != 0) { 321 vpanic(vp, "free vnode isn't"); 322 } 323 if ((vp->v_iflag & VI_CLEAN) != 0) { 324 vpanic(vp, "clean vnode on freelist"); 325 } 326 if (vp->v_freelisthd != listhd) { 327 printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd); 328 vpanic(vp, "list head mismatch"); 329 } 330 if (!mutex_tryenter(&vp->v_interlock)) 331 continue; 332 /* 333 * Our lwp might hold the underlying vnode 334 * locked, so don't try to reclaim a VI_LAYER 335 * node if it's locked. 336 */ 337 if ((vp->v_iflag & VI_XLOCK) == 0 && 338 ((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) { 339 break; 340 } 341 mutex_exit(&vp->v_interlock); 342 } 343 344 if (vp == NULL) { 345 if (listhd == &vnode_free_list) { 346 listhd = &vnode_hold_list; 347 goto try_nextlist; 348 } 349 mutex_exit(&vnode_free_list_lock); 350 return NULL; 351 } 352 353 /* Remove it from the freelist. */ 354 TAILQ_REMOVE(listhd, vp, v_freelist); 355 vp->v_freelisthd = NULL; 356 mutex_exit(&vnode_free_list_lock); 357 358 /* 359 * The vnode is still associated with a file system, so we must 360 * clean it out before reusing it. We need to add a reference 361 * before doing this. If the vnode gains another reference while 362 * being cleaned out then we lose - retry. 363 */ 364 atomic_inc_uint(&vp->v_usecount); 365 vclean(vp, DOCLOSE); 366 if (vp->v_usecount == 1) { 367 /* We're about to dirty it. */ 368 vp->v_iflag &= ~VI_CLEAN; 369 mutex_exit(&vp->v_interlock); 370 if (vp->v_type == VBLK || vp->v_type == VCHR) { 371 spec_node_destroy(vp); 372 } 373 vp->v_type = VNON; 374 } else { 375 /* 376 * Don't return to freelist - the holder of the last 377 * reference will destroy it. 378 */ 379 vrelel(vp, 0); /* releases vp->v_interlock */ 380 mutex_enter(&vnode_free_list_lock); 381 goto retry; 382 } 383 384 if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 || 385 !TAILQ_EMPTY(&vp->v_uobj.memq)) { 386 vpanic(vp, "cleaned vnode isn't"); 387 } 388 if (vp->v_numoutput != 0) { 389 vpanic(vp, "clean vnode has pending I/O's"); 390 } 391 if ((vp->v_iflag & VI_ONWORKLST) != 0) { 392 vpanic(vp, "clean vnode on syncer list"); 393 } 394 395 return vp; 396 } 397 398 /* 399 * Mark a mount point as busy, and gain a new reference to it. Used to 400 * prevent the file system from being unmounted during critical sections. 401 * 402 * => The caller must hold a pre-existing reference to the mount. 403 * => Will fail if the file system is being unmounted, or is unmounted. 
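 *
 * An illustrative sketch, not part of the original file: the mountlist
 * iteration pattern that vfs_busy() and vfs_unbusy() support, modelled
 * on sysctl_kern_vnode() below.  On failure vfs_busy() keeps
 * mountlist_lock held and stores the next mount in *nextp; on success
 * it drops the lock, and passing nextp to vfs_unbusy() retakes it.
 * process_mount() is a hypothetical callback used only here.
 *
 *	struct mount *mp, *nmp;
 *
 *	mutex_enter(&mountlist_lock);
 *	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
 *	    mp = nmp) {
 *		if (vfs_busy(mp, &nmp))
 *			continue;
 *		process_mount(mp);
 *		vfs_unbusy(mp, false, &nmp);
 *	}
 *	mutex_exit(&mountlist_lock);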
404 */ 405 int 406 vfs_busy(struct mount *mp, struct mount **nextp) 407 { 408 409 KASSERT(mp->mnt_refcnt > 0); 410 411 if (__predict_false(!rw_tryenter(&mp->mnt_unmounting, RW_READER))) { 412 if (nextp != NULL) { 413 KASSERT(mutex_owned(&mountlist_lock)); 414 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 415 } 416 return EBUSY; 417 } 418 if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) { 419 rw_exit(&mp->mnt_unmounting); 420 if (nextp != NULL) { 421 KASSERT(mutex_owned(&mountlist_lock)); 422 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 423 } 424 return ENOENT; 425 } 426 if (nextp != NULL) { 427 mutex_exit(&mountlist_lock); 428 } 429 atomic_inc_uint(&mp->mnt_refcnt); 430 return 0; 431 } 432 433 /* 434 * Unbusy a busy filesystem. 435 * 436 * => If keepref is true, preserve reference added by vfs_busy(). 437 * => If nextp != NULL, acquire mountlist_lock. 438 */ 439 void 440 vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp) 441 { 442 443 KASSERT(mp->mnt_refcnt > 0); 444 445 if (nextp != NULL) { 446 mutex_enter(&mountlist_lock); 447 } 448 rw_exit(&mp->mnt_unmounting); 449 if (!keepref) { 450 vfs_destroy(mp); 451 } 452 if (nextp != NULL) { 453 KASSERT(mutex_owned(&mountlist_lock)); 454 *nextp = CIRCLEQ_NEXT(mp, mnt_list); 455 } 456 } 457 458 /* 459 * Lookup a filesystem type, and if found allocate and initialize 460 * a mount structure for it. 461 * 462 * Devname is usually updated by mount(8) after booting. 463 */ 464 int 465 vfs_rootmountalloc(const char *fstypename, const char *devname, 466 struct mount **mpp) 467 { 468 struct vfsops *vfsp = NULL; 469 struct mount *mp; 470 471 mutex_enter(&vfs_list_lock); 472 LIST_FOREACH(vfsp, &vfs_list, vfs_list) 473 if (!strncmp(vfsp->vfs_name, fstypename, 474 sizeof(mp->mnt_stat.f_fstypename))) 475 break; 476 if (vfsp == NULL) { 477 mutex_exit(&vfs_list_lock); 478 return (ENODEV); 479 } 480 vfsp->vfs_refcount++; 481 mutex_exit(&vfs_list_lock); 482 483 mp = kmem_zalloc(sizeof(*mp), KM_SLEEP); 484 if (mp == NULL) 485 return ENOMEM; 486 mp->mnt_refcnt = 1; 487 rw_init(&mp->mnt_unmounting); 488 mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE); 489 mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE); 490 (void)vfs_busy(mp, NULL); 491 TAILQ_INIT(&mp->mnt_vnodelist); 492 mp->mnt_op = vfsp; 493 mp->mnt_flag = MNT_RDONLY; 494 mp->mnt_vnodecovered = NULL; 495 (void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, 496 sizeof(mp->mnt_stat.f_fstypename)); 497 mp->mnt_stat.f_mntonname[0] = '/'; 498 mp->mnt_stat.f_mntonname[1] = '\0'; 499 mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] = 500 '\0'; 501 (void)copystr(devname, mp->mnt_stat.f_mntfromname, 502 sizeof(mp->mnt_stat.f_mntfromname) - 1, 0); 503 mount_initspecific(mp); 504 *mpp = mp; 505 return (0); 506 } 507 508 /* 509 * Routines having to do with the management of the vnode table. 510 */ 511 extern int (**dead_vnodeop_p)(void *); 512 513 /* 514 * Return the next vnode from the free list. 515 */ 516 int 517 getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *), 518 vnode_t **vpp) 519 { 520 struct uvm_object *uobj; 521 static int toggle; 522 vnode_t *vp; 523 int error = 0, tryalloc; 524 525 try_again: 526 if (mp != NULL) { 527 /* 528 * Mark filesystem busy while we're creating a 529 * vnode. If unmount is in progress, this will 530 * fail. 531 */ 532 error = vfs_busy(mp, NULL); 533 if (error) 534 return error; 535 } 536 537 /* 538 * We must choose whether to allocate a new vnode or recycle an 539 * existing one. 
The criterion for allocating a new one is that 540 * the total number of vnodes is less than the number desired or 541 * there are no vnodes on either free list. Generally we only 542 * want to recycle vnodes that have no buffers associated with 543 * them, so we look first on the vnode_free_list. If it is empty, 544 * we next consider vnodes with referencing buffers on the 545 * vnode_hold_list. The toggle ensures that half the time we 546 * will use a buffer from the vnode_hold_list, and half the time 547 * we will allocate a new one unless the list has grown to twice 548 * the desired size. We are reticent to recycle vnodes from the 549 * vnode_hold_list because we will lose the identity of all its 550 * referencing buffers. 551 */ 552 553 vp = NULL; 554 555 mutex_enter(&vnode_free_list_lock); 556 557 toggle ^= 1; 558 if (numvnodes > 2 * desiredvnodes) 559 toggle = 0; 560 561 tryalloc = numvnodes < desiredvnodes || 562 (TAILQ_FIRST(&vnode_free_list) == NULL && 563 (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle)); 564 565 if (tryalloc) { 566 numvnodes++; 567 mutex_exit(&vnode_free_list_lock); 568 if ((vp = vnalloc(NULL)) == NULL) { 569 mutex_enter(&vnode_free_list_lock); 570 numvnodes--; 571 } else 572 vp->v_usecount = 1; 573 } 574 575 if (vp == NULL) { 576 vp = getcleanvnode(); 577 if (vp == NULL) { 578 if (mp != NULL) { 579 vfs_unbusy(mp, false, NULL); 580 } 581 if (tryalloc) { 582 printf("WARNING: unable to allocate new " 583 "vnode, retrying...\n"); 584 kpause("newvn", false, hz, NULL); 585 goto try_again; 586 } 587 tablefull("vnode", "increase kern.maxvnodes or NVNODE"); 588 *vpp = 0; 589 return (ENFILE); 590 } 591 vp->v_iflag = 0; 592 vp->v_vflag = 0; 593 vp->v_uflag = 0; 594 vp->v_socket = NULL; 595 } 596 597 KASSERT(vp->v_usecount == 1); 598 KASSERT(vp->v_freelisthd == NULL); 599 KASSERT(LIST_EMPTY(&vp->v_nclist)); 600 KASSERT(LIST_EMPTY(&vp->v_dnclist)); 601 602 vp->v_type = VNON; 603 vp->v_vnlock = &vp->v_lock; 604 vp->v_tag = tag; 605 vp->v_op = vops; 606 insmntque(vp, mp); 607 *vpp = vp; 608 vp->v_data = 0; 609 610 /* 611 * initialize uvm_object within vnode. 612 */ 613 614 uobj = &vp->v_uobj; 615 KASSERT(uobj->pgops == &uvm_vnodeops); 616 KASSERT(uobj->uo_npages == 0); 617 KASSERT(TAILQ_FIRST(&uobj->memq) == NULL); 618 vp->v_size = vp->v_writesize = VSIZENOTSET; 619 620 if (mp != NULL) { 621 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 622 vp->v_vflag |= VV_MPSAFE; 623 vfs_unbusy(mp, true, NULL); 624 } 625 626 return (0); 627 } 628 629 /* 630 * This is really just the reverse of getnewvnode(). Needed for 631 * VFS_VGET functions who may need to push back a vnode in case 632 * of a locking race. 633 */ 634 void 635 ungetnewvnode(vnode_t *vp) 636 { 637 638 KASSERT(vp->v_usecount == 1); 639 KASSERT(vp->v_data == NULL); 640 KASSERT(vp->v_freelisthd == NULL); 641 642 mutex_enter(&vp->v_interlock); 643 vp->v_iflag |= VI_CLEAN; 644 vrelel(vp, 0); 645 } 646 647 /* 648 * Allocate a new, uninitialized vnode. If 'mp' is non-NULL, this is a 649 * marker vnode and we are prepared to wait for the allocation. 650 */ 651 vnode_t * 652 vnalloc(struct mount *mp) 653 { 654 vnode_t *vp; 655 656 vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT)); 657 if (vp == NULL) { 658 return NULL; 659 } 660 661 memset(vp, 0, sizeof(*vp)); 662 UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0); 663 cv_init(&vp->v_cv, "vnode"); 664 /* 665 * done by memset() above. 
666 * LIST_INIT(&vp->v_nclist); 667 * LIST_INIT(&vp->v_dnclist); 668 */ 669 670 if (mp != NULL) { 671 vp->v_mount = mp; 672 vp->v_type = VBAD; 673 vp->v_iflag = VI_MARKER; 674 } else { 675 rw_init(&vp->v_lock.vl_lock); 676 } 677 678 return vp; 679 } 680 681 /* 682 * Free an unused, unreferenced vnode. 683 */ 684 void 685 vnfree(vnode_t *vp) 686 { 687 688 KASSERT(vp->v_usecount == 0); 689 690 if ((vp->v_iflag & VI_MARKER) == 0) { 691 rw_destroy(&vp->v_lock.vl_lock); 692 mutex_enter(&vnode_free_list_lock); 693 numvnodes--; 694 mutex_exit(&vnode_free_list_lock); 695 } 696 697 UVM_OBJ_DESTROY(&vp->v_uobj); 698 cv_destroy(&vp->v_cv); 699 pool_cache_put(vnode_cache, vp); 700 } 701 702 /* 703 * Remove a vnode from its freelist. 704 */ 705 static inline void 706 vremfree(vnode_t *vp) 707 { 708 709 KASSERT(mutex_owned(&vp->v_interlock)); 710 KASSERT(vp->v_usecount == 0); 711 712 /* 713 * Note that the reference count must not change until 714 * the vnode is removed. 715 */ 716 mutex_enter(&vnode_free_list_lock); 717 if (vp->v_holdcnt > 0) { 718 KASSERT(vp->v_freelisthd == &vnode_hold_list); 719 } else { 720 KASSERT(vp->v_freelisthd == &vnode_free_list); 721 } 722 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 723 vp->v_freelisthd = NULL; 724 mutex_exit(&vnode_free_list_lock); 725 } 726 727 /* 728 * Move a vnode from one mount queue to another. 729 */ 730 static void 731 insmntque(vnode_t *vp, struct mount *mp) 732 { 733 struct mount *omp; 734 735 #ifdef DIAGNOSTIC 736 if ((mp != NULL) && 737 (mp->mnt_iflag & IMNT_UNMOUNT) && 738 vp->v_tag != VT_VFS) { 739 panic("insmntque into dying filesystem"); 740 } 741 #endif 742 743 mutex_enter(&mntvnode_lock); 744 /* 745 * Delete from old mount point vnode list, if on one. 746 */ 747 if ((omp = vp->v_mount) != NULL) 748 TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes); 749 /* 750 * Insert into list of vnodes for the new mount point, if 751 * available. The caller must take a reference on the mount 752 * structure and donate to the vnode. 753 */ 754 if ((vp->v_mount = mp) != NULL) 755 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes); 756 mutex_exit(&mntvnode_lock); 757 758 if (omp != NULL) { 759 /* Release reference to old mount. */ 760 vfs_destroy(omp); 761 } 762 } 763 764 /* 765 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or 766 * recycled. 767 */ 768 void 769 vwait(vnode_t *vp, int flags) 770 { 771 772 KASSERT(mutex_owned(&vp->v_interlock)); 773 KASSERT(vp->v_usecount != 0); 774 775 while ((vp->v_iflag & flags) != 0) 776 cv_wait(&vp->v_cv, &vp->v_interlock); 777 } 778 779 /* 780 * Insert a marker vnode into a mount's vnode list, after the 781 * specified vnode. mntvnode_lock must be held. 782 */ 783 void 784 vmark(vnode_t *mvp, vnode_t *vp) 785 { 786 struct mount *mp; 787 788 mp = mvp->v_mount; 789 790 KASSERT(mutex_owned(&mntvnode_lock)); 791 KASSERT((mvp->v_iflag & VI_MARKER) != 0); 792 KASSERT(vp->v_mount == mp); 793 794 TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vp, mvp, v_mntvnodes); 795 } 796 797 /* 798 * Remove a marker vnode from a mount's vnode list, and return 799 * a pointer to the next vnode in the list. mntvnode_lock must 800 * be held. 
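 *
 * An illustrative sketch, not part of the original file: vmark() and
 * vunmark() are used together to keep a stable iteration point over a
 * mount's vnode list across operations that may drop mntvnode_lock, as
 * in vflush() and sysctl_kern_vnode() below.  process_vnode() is a
 * hypothetical callback used only here.
 *
 *	vnode_t *vp, *mvp;
 *
 *	if ((mvp = vnalloc(mp)) == NULL)
 *		return ENOMEM;
 *	mutex_enter(&mntvnode_lock);
 *	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
 *	    vp = vunmark(mvp)) {
 *		vmark(mvp, vp);
 *		if (vp->v_mount != mp || vismarker(vp))
 *			continue;
 *		process_vnode(vp);
 *	}
 *	mutex_exit(&mntvnode_lock);
 *	vnfree(mvp);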
801 */ 802 vnode_t * 803 vunmark(vnode_t *mvp) 804 { 805 vnode_t *vp; 806 struct mount *mp; 807 808 mp = mvp->v_mount; 809 810 KASSERT(mutex_owned(&mntvnode_lock)); 811 KASSERT((mvp->v_iflag & VI_MARKER) != 0); 812 813 vp = TAILQ_NEXT(mvp, v_mntvnodes); 814 TAILQ_REMOVE(&mp->mnt_vnodelist, mvp, v_mntvnodes); 815 816 KASSERT(vp == NULL || vp->v_mount == mp); 817 818 return vp; 819 } 820 821 /* 822 * Update outstanding I/O count and do wakeup if requested. 823 */ 824 void 825 vwakeup(struct buf *bp) 826 { 827 struct vnode *vp; 828 829 if ((vp = bp->b_vp) == NULL) 830 return; 831 832 KASSERT(bp->b_objlock == &vp->v_interlock); 833 KASSERT(mutex_owned(bp->b_objlock)); 834 835 if (--vp->v_numoutput < 0) 836 panic("vwakeup: neg numoutput, vp %p", vp); 837 if (vp->v_numoutput == 0) 838 cv_broadcast(&vp->v_cv); 839 } 840 841 /* 842 * Flush out and invalidate all buffers associated with a vnode. 843 * Called with the underlying vnode locked, which should prevent new dirty 844 * buffers from being queued. 845 */ 846 int 847 vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l, 848 bool catch, int slptimeo) 849 { 850 struct buf *bp, *nbp; 851 int error; 852 int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO | 853 (flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0); 854 855 /* XXXUBC this doesn't look at flags or slp* */ 856 mutex_enter(&vp->v_interlock); 857 error = VOP_PUTPAGES(vp, 0, 0, flushflags); 858 if (error) { 859 return error; 860 } 861 862 if (flags & V_SAVE) { 863 error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0); 864 if (error) 865 return (error); 866 KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd)); 867 } 868 869 mutex_enter(&bufcache_lock); 870 restart: 871 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 872 nbp = LIST_NEXT(bp, b_vnbufs); 873 error = bbusy(bp, catch, slptimeo, NULL); 874 if (error != 0) { 875 if (error == EPASSTHROUGH) 876 goto restart; 877 mutex_exit(&bufcache_lock); 878 return (error); 879 } 880 brelsel(bp, BC_INVAL | BC_VFLUSH); 881 } 882 883 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 884 nbp = LIST_NEXT(bp, b_vnbufs); 885 error = bbusy(bp, catch, slptimeo, NULL); 886 if (error != 0) { 887 if (error == EPASSTHROUGH) 888 goto restart; 889 mutex_exit(&bufcache_lock); 890 return (error); 891 } 892 /* 893 * XXX Since there are no node locks for NFS, I believe 894 * there is a slight chance that a delayed write will 895 * occur while sleeping just above, so check for it. 896 */ 897 if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) { 898 #ifdef DEBUG 899 printf("buffer still DELWRI\n"); 900 #endif 901 bp->b_cflags |= BC_BUSY | BC_VFLUSH; 902 mutex_exit(&bufcache_lock); 903 VOP_BWRITE(bp); 904 mutex_enter(&bufcache_lock); 905 goto restart; 906 } 907 brelsel(bp, BC_INVAL | BC_VFLUSH); 908 } 909 910 #ifdef DIAGNOSTIC 911 if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd)) 912 panic("vinvalbuf: flush failed, vp %p", vp); 913 #endif 914 915 mutex_exit(&bufcache_lock); 916 917 return (0); 918 } 919 920 /* 921 * Destroy any in core blocks past the truncation length. 922 * Called with the underlying vnode locked, which should prevent new dirty 923 * buffers from being queued. 
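 *
 * An illustrative sketch, not part of the original file: a file
 * system's truncate path would call this with the vnode locked,
 * passing the first logical block to be destroyed ("lastblock" is a
 * hypothetical last block to be kept):
 *
 *	error = vtruncbuf(vp, lastblock + 1, false, 0);
 *	if (error)
 *		return error;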
924 */ 925 int 926 vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch, int slptimeo) 927 { 928 struct buf *bp, *nbp; 929 int error; 930 voff_t off; 931 932 off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift); 933 mutex_enter(&vp->v_interlock); 934 error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO); 935 if (error) { 936 return error; 937 } 938 939 mutex_enter(&bufcache_lock); 940 restart: 941 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 942 nbp = LIST_NEXT(bp, b_vnbufs); 943 if (bp->b_lblkno < lbn) 944 continue; 945 error = bbusy(bp, catch, slptimeo, NULL); 946 if (error != 0) { 947 if (error == EPASSTHROUGH) 948 goto restart; 949 mutex_exit(&bufcache_lock); 950 return (error); 951 } 952 brelsel(bp, BC_INVAL | BC_VFLUSH); 953 } 954 955 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 956 nbp = LIST_NEXT(bp, b_vnbufs); 957 if (bp->b_lblkno < lbn) 958 continue; 959 error = bbusy(bp, catch, slptimeo, NULL); 960 if (error != 0) { 961 if (error == EPASSTHROUGH) 962 goto restart; 963 mutex_exit(&bufcache_lock); 964 return (error); 965 } 966 brelsel(bp, BC_INVAL | BC_VFLUSH); 967 } 968 mutex_exit(&bufcache_lock); 969 970 return (0); 971 } 972 973 /* 974 * Flush all dirty buffers from a vnode. 975 * Called with the underlying vnode locked, which should prevent new dirty 976 * buffers from being queued. 977 */ 978 void 979 vflushbuf(struct vnode *vp, int sync) 980 { 981 struct buf *bp, *nbp; 982 int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0); 983 bool dirty; 984 985 mutex_enter(&vp->v_interlock); 986 (void) VOP_PUTPAGES(vp, 0, 0, flags); 987 988 loop: 989 mutex_enter(&bufcache_lock); 990 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 991 nbp = LIST_NEXT(bp, b_vnbufs); 992 if ((bp->b_cflags & BC_BUSY)) 993 continue; 994 if ((bp->b_oflags & BO_DELWRI) == 0) 995 panic("vflushbuf: not dirty, bp %p", bp); 996 bp->b_cflags |= BC_BUSY | BC_VFLUSH; 997 mutex_exit(&bufcache_lock); 998 /* 999 * Wait for I/O associated with indirect blocks to complete, 1000 * since there is no way to quickly wait for them below. 1001 */ 1002 if (bp->b_vp == vp || sync == 0) 1003 (void) bawrite(bp); 1004 else 1005 (void) bwrite(bp); 1006 goto loop; 1007 } 1008 mutex_exit(&bufcache_lock); 1009 1010 if (sync == 0) 1011 return; 1012 1013 mutex_enter(&vp->v_interlock); 1014 while (vp->v_numoutput != 0) 1015 cv_wait(&vp->v_cv, &vp->v_interlock); 1016 dirty = !LIST_EMPTY(&vp->v_dirtyblkhd); 1017 mutex_exit(&vp->v_interlock); 1018 1019 if (dirty) { 1020 vprint("vflushbuf: dirty", vp); 1021 goto loop; 1022 } 1023 } 1024 1025 /* 1026 * Create a vnode for a block device. 1027 * Used for root filesystem and swap areas. 1028 * Also used for memory file system special devices. 1029 */ 1030 int 1031 bdevvp(dev_t dev, vnode_t **vpp) 1032 { 1033 1034 return (getdevvp(dev, vpp, VBLK)); 1035 } 1036 1037 /* 1038 * Create a vnode for a character device. 1039 * Used for kernfs and some console handling. 1040 */ 1041 int 1042 cdevvp(dev_t dev, vnode_t **vpp) 1043 { 1044 1045 return (getdevvp(dev, vpp, VCHR)); 1046 } 1047 1048 /* 1049 * Associate a buffer with a vnode. There must already be a hold on 1050 * the vnode. 
1051 */ 1052 void 1053 bgetvp(struct vnode *vp, struct buf *bp) 1054 { 1055 1056 KASSERT(bp->b_vp == NULL); 1057 KASSERT(bp->b_objlock == &buffer_lock); 1058 KASSERT(mutex_owned(&vp->v_interlock)); 1059 KASSERT(mutex_owned(&bufcache_lock)); 1060 KASSERT((bp->b_cflags & BC_BUSY) != 0); 1061 KASSERT(!cv_has_waiters(&bp->b_done)); 1062 1063 vholdl(vp); 1064 bp->b_vp = vp; 1065 if (vp->v_type == VBLK || vp->v_type == VCHR) 1066 bp->b_dev = vp->v_rdev; 1067 else 1068 bp->b_dev = NODEV; 1069 1070 /* 1071 * Insert onto list for new vnode. 1072 */ 1073 bufinsvn(bp, &vp->v_cleanblkhd); 1074 bp->b_objlock = &vp->v_interlock; 1075 } 1076 1077 /* 1078 * Disassociate a buffer from a vnode. 1079 */ 1080 void 1081 brelvp(struct buf *bp) 1082 { 1083 struct vnode *vp = bp->b_vp; 1084 1085 KASSERT(vp != NULL); 1086 KASSERT(bp->b_objlock == &vp->v_interlock); 1087 KASSERT(mutex_owned(&vp->v_interlock)); 1088 KASSERT(mutex_owned(&bufcache_lock)); 1089 KASSERT((bp->b_cflags & BC_BUSY) != 0); 1090 KASSERT(!cv_has_waiters(&bp->b_done)); 1091 1092 /* 1093 * Delete from old vnode list, if on one. 1094 */ 1095 if (LIST_NEXT(bp, b_vnbufs) != NOLIST) 1096 bufremvn(bp); 1097 1098 if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_iflag & VI_ONWORKLST) && 1099 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { 1100 vp->v_iflag &= ~VI_WRMAPDIRTY; 1101 vn_syncer_remove_from_worklist(vp); 1102 } 1103 1104 bp->b_objlock = &buffer_lock; 1105 bp->b_vp = NULL; 1106 holdrelel(vp); 1107 } 1108 1109 /* 1110 * Reassign a buffer from one vnode list to another. 1111 * The list reassignment must be within the same vnode. 1112 * Used to assign file specific control information 1113 * (indirect blocks) to the list to which they belong. 1114 */ 1115 void 1116 reassignbuf(struct buf *bp, struct vnode *vp) 1117 { 1118 struct buflists *listheadp; 1119 int delayx; 1120 1121 KASSERT(mutex_owned(&bufcache_lock)); 1122 KASSERT(bp->b_objlock == &vp->v_interlock); 1123 KASSERT(mutex_owned(&vp->v_interlock)); 1124 KASSERT((bp->b_cflags & BC_BUSY) != 0); 1125 1126 /* 1127 * Delete from old vnode list, if on one. 1128 */ 1129 if (LIST_NEXT(bp, b_vnbufs) != NOLIST) 1130 bufremvn(bp); 1131 1132 /* 1133 * If dirty, put on list of dirty buffers; 1134 * otherwise insert onto list of clean buffers. 1135 */ 1136 if ((bp->b_oflags & BO_DELWRI) == 0) { 1137 listheadp = &vp->v_cleanblkhd; 1138 if (TAILQ_EMPTY(&vp->v_uobj.memq) && 1139 (vp->v_iflag & VI_ONWORKLST) && 1140 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { 1141 vp->v_iflag &= ~VI_WRMAPDIRTY; 1142 vn_syncer_remove_from_worklist(vp); 1143 } 1144 } else { 1145 listheadp = &vp->v_dirtyblkhd; 1146 if ((vp->v_iflag & VI_ONWORKLST) == 0) { 1147 switch (vp->v_type) { 1148 case VDIR: 1149 delayx = dirdelay; 1150 break; 1151 case VBLK: 1152 if (vp->v_specmountpoint != NULL) { 1153 delayx = metadelay; 1154 break; 1155 } 1156 /* fall through */ 1157 default: 1158 delayx = filedelay; 1159 break; 1160 } 1161 if (!vp->v_mount || 1162 (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) 1163 vn_syncer_add_to_worklist(vp, delayx); 1164 } 1165 } 1166 bufinsvn(bp, listheadp); 1167 } 1168 1169 /* 1170 * Create a vnode for a device. 1171 * Used by bdevvp (block device) for root file system etc., 1172 * and by cdevvp (character device) for console and kernfs. 
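 *
 * An illustrative sketch, not part of the original file: vfs_mountroot()
 * below uses the block device flavour this way to obtain the root
 * device vnode:
 *
 *	if (bdevvp(rootdev, &rootvp))
 *		panic("vfs_mountroot: can't get vnode for rootdev");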
1173 */ 1174 static int 1175 getdevvp(dev_t dev, vnode_t **vpp, enum vtype type) 1176 { 1177 vnode_t *vp; 1178 vnode_t *nvp; 1179 int error; 1180 1181 if (dev == NODEV) { 1182 *vpp = NULL; 1183 return (0); 1184 } 1185 error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp); 1186 if (error) { 1187 *vpp = NULL; 1188 return (error); 1189 } 1190 vp = nvp; 1191 vp->v_type = type; 1192 vp->v_vflag |= VV_MPSAFE; 1193 uvm_vnp_setsize(vp, 0); 1194 spec_node_init(vp, dev); 1195 *vpp = vp; 1196 return (0); 1197 } 1198 1199 /* 1200 * Try to gain a reference to a vnode, without acquiring its interlock. 1201 * The caller must hold a lock that will prevent the vnode from being 1202 * recycled or freed. 1203 */ 1204 bool 1205 vtryget(vnode_t *vp) 1206 { 1207 u_int use, next; 1208 1209 /* 1210 * If the vnode is being freed, don't make life any harder 1211 * for vclean() by adding another reference without waiting. 1212 * This is not strictly necessary, but we'll do it anyway. 1213 */ 1214 if (__predict_false((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0)) { 1215 return false; 1216 } 1217 for (use = vp->v_usecount;; use = next) { 1218 if (use == 0) { 1219 /* Need interlock held if first reference. */ 1220 return false; 1221 } 1222 next = atomic_cas_uint(&vp->v_usecount, use, use + 1); 1223 if (__predict_true(next == use)) { 1224 return true; 1225 } 1226 } 1227 } 1228 1229 /* 1230 * Grab a particular vnode from the free list, increment its 1231 * reference count and lock it. If the vnode lock bit is set the 1232 * vnode is being eliminated in vgone. In that case, we can not 1233 * grab the vnode, so the process is awakened when the transition is 1234 * completed, and an error returned to indicate that the vnode is no 1235 * longer usable (possibly having been changed to a new file system type). 1236 */ 1237 int 1238 vget(vnode_t *vp, int flags) 1239 { 1240 int error; 1241 1242 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1243 1244 if ((flags & LK_INTERLOCK) == 0) 1245 mutex_enter(&vp->v_interlock); 1246 1247 /* 1248 * Before adding a reference, we must remove the vnode 1249 * from its freelist. 1250 */ 1251 if (vp->v_usecount == 0) { 1252 vremfree(vp); 1253 vp->v_usecount = 1; 1254 } else { 1255 atomic_inc_uint(&vp->v_usecount); 1256 } 1257 1258 /* 1259 * If the vnode is in the process of being cleaned out for 1260 * another use, we wait for the cleaning to finish and then 1261 * return failure. Cleaning is determined by checking if 1262 * the VI_XLOCK or VI_FREEING flags are set. 1263 */ 1264 if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) { 1265 if ((flags & LK_NOWAIT) != 0) { 1266 vrelel(vp, 0); 1267 return EBUSY; 1268 } 1269 vwait(vp, VI_XLOCK | VI_FREEING); 1270 vrelel(vp, 0); 1271 return ENOENT; 1272 } 1273 if (flags & LK_TYPE_MASK) { 1274 error = vn_lock(vp, flags | LK_INTERLOCK); 1275 if (error != 0) { 1276 vrele(vp); 1277 } 1278 return error; 1279 } 1280 mutex_exit(&vp->v_interlock); 1281 return 0; 1282 } 1283 1284 /* 1285 * vput(), just unlock and vrele() 1286 */ 1287 void 1288 vput(vnode_t *vp) 1289 { 1290 1291 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1292 1293 VOP_UNLOCK(vp, 0); 1294 vrele(vp); 1295 } 1296 1297 /* 1298 * Try to drop reference on a vnode. Abort if we are releasing the 1299 * last reference. Note: this _must_ succeed if not the last reference. 
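 *
 * This is the lock-free non-zero to non-zero transition described in
 * the note on v_usecount at the top of this file.  A 1 -> 0 transition
 * must be made with the vnode interlock held, which is why the use == 1
 * case aborts and vrele()/vrelel() fall back to taking the interlock.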
1300 */ 1301 static inline bool 1302 vtryrele(vnode_t *vp) 1303 { 1304 u_int use, next; 1305 1306 for (use = vp->v_usecount;; use = next) { 1307 if (use == 1) { 1308 return false; 1309 } 1310 next = atomic_cas_uint(&vp->v_usecount, use, use - 1); 1311 if (__predict_true(next == use)) { 1312 return true; 1313 } 1314 } 1315 } 1316 1317 /* 1318 * Vnode release. If reference count drops to zero, call inactive 1319 * routine and either return to freelist or free to the pool. 1320 */ 1321 void 1322 vrelel(vnode_t *vp, int flags) 1323 { 1324 bool recycle, defer; 1325 int error; 1326 1327 KASSERT(mutex_owned(&vp->v_interlock)); 1328 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1329 KASSERT(vp->v_freelisthd == NULL); 1330 1331 if (__predict_false(vp->v_op == dead_vnodeop_p && 1332 (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) { 1333 vpanic(vp, "dead but not clean"); 1334 } 1335 1336 /* 1337 * If not the last reference, just drop the reference count 1338 * and unlock. 1339 */ 1340 if (vtryrele(vp)) { 1341 vp->v_iflag |= VI_INACTREDO; 1342 mutex_exit(&vp->v_interlock); 1343 return; 1344 } 1345 if (vp->v_usecount <= 0 || vp->v_writecount != 0) { 1346 vpanic(vp, "vrelel: bad ref count"); 1347 } 1348 1349 KASSERT((vp->v_iflag & VI_XLOCK) == 0); 1350 1351 /* 1352 * If not clean, deactivate the vnode, but preserve 1353 * our reference across the call to VOP_INACTIVE(). 1354 */ 1355 retry: 1356 if ((vp->v_iflag & VI_CLEAN) == 0) { 1357 recycle = false; 1358 vp->v_iflag |= VI_INACTNOW; 1359 1360 /* 1361 * XXX This ugly block can be largely eliminated if 1362 * locking is pushed down into the file systems. 1363 */ 1364 if (curlwp == uvm.pagedaemon_lwp) { 1365 /* The pagedaemon can't wait around; defer. */ 1366 defer = true; 1367 } else if (curlwp == vrele_lwp) { 1368 /* We have to try harder. */ 1369 vp->v_iflag &= ~VI_INACTREDO; 1370 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | 1371 LK_RETRY); 1372 if (error != 0) { 1373 /* XXX */ 1374 vpanic(vp, "vrele: unable to lock %p"); 1375 } 1376 defer = false; 1377 } else if ((vp->v_iflag & VI_LAYER) != 0) { 1378 /* 1379 * Acquiring the stack's lock in vclean() even 1380 * for an honest vput/vrele is dangerous because 1381 * our caller may hold other vnode locks; defer. 1382 */ 1383 defer = true; 1384 } else { 1385 /* If we can't acquire the lock, then defer. */ 1386 vp->v_iflag &= ~VI_INACTREDO; 1387 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | 1388 LK_NOWAIT); 1389 if (error != 0) { 1390 defer = true; 1391 mutex_enter(&vp->v_interlock); 1392 } else { 1393 defer = false; 1394 } 1395 } 1396 1397 if (defer) { 1398 /* 1399 * Defer reclaim to the kthread; it's not safe to 1400 * clean it here. We donate it our last reference. 1401 */ 1402 KASSERT(mutex_owned(&vp->v_interlock)); 1403 KASSERT((vp->v_iflag & VI_INACTPEND) == 0); 1404 vp->v_iflag &= ~VI_INACTNOW; 1405 vp->v_iflag |= VI_INACTPEND; 1406 mutex_enter(&vrele_lock); 1407 TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist); 1408 if (++vrele_pending > (desiredvnodes >> 8)) 1409 cv_signal(&vrele_cv); 1410 mutex_exit(&vrele_lock); 1411 mutex_exit(&vp->v_interlock); 1412 return; 1413 } 1414 1415 #ifdef DIAGNOSTIC 1416 if ((vp->v_type == VBLK || vp->v_type == VCHR) && 1417 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) { 1418 vprint("vrelel: missing VOP_CLOSE()", vp); 1419 } 1420 #endif 1421 1422 /* 1423 * The vnode can gain another reference while being 1424 * deactivated. 
If VOP_INACTIVE() indicates that 1425 * the described file has been deleted, then recycle 1426 * the vnode irrespective of additional references. 1427 * Another thread may be waiting to re-use the on-disk 1428 * inode. 1429 * 1430 * Note that VOP_INACTIVE() will drop the vnode lock. 1431 */ 1432 VOP_INACTIVE(vp, &recycle); 1433 mutex_enter(&vp->v_interlock); 1434 vp->v_iflag &= ~VI_INACTNOW; 1435 if (!recycle) { 1436 if (vtryrele(vp)) { 1437 mutex_exit(&vp->v_interlock); 1438 return; 1439 } 1440 1441 /* 1442 * If we grew another reference while 1443 * VOP_INACTIVE() was underway, retry. 1444 */ 1445 if ((vp->v_iflag & VI_INACTREDO) != 0) { 1446 goto retry; 1447 } 1448 } 1449 1450 /* Take care of space accounting. */ 1451 if (vp->v_iflag & VI_EXECMAP) { 1452 atomic_add_int(&uvmexp.execpages, 1453 -vp->v_uobj.uo_npages); 1454 atomic_add_int(&uvmexp.filepages, 1455 vp->v_uobj.uo_npages); 1456 } 1457 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP); 1458 vp->v_vflag &= ~VV_MAPPED; 1459 1460 /* 1461 * Recycle the vnode if the file is now unused (unlinked), 1462 * otherwise just free it. 1463 */ 1464 if (recycle) { 1465 vclean(vp, DOCLOSE); 1466 } 1467 KASSERT(vp->v_usecount > 0); 1468 } 1469 1470 if (atomic_dec_uint_nv(&vp->v_usecount) != 0) { 1471 /* Gained another reference while being reclaimed. */ 1472 mutex_exit(&vp->v_interlock); 1473 return; 1474 } 1475 1476 if ((vp->v_iflag & VI_CLEAN) != 0) { 1477 /* 1478 * It's clean so destroy it. It isn't referenced 1479 * anywhere since it has been reclaimed. 1480 */ 1481 KASSERT(vp->v_holdcnt == 0); 1482 KASSERT(vp->v_writecount == 0); 1483 mutex_exit(&vp->v_interlock); 1484 insmntque(vp, NULL); 1485 if (vp->v_type == VBLK || vp->v_type == VCHR) { 1486 spec_node_destroy(vp); 1487 } 1488 vnfree(vp); 1489 } else { 1490 /* 1491 * Otherwise, put it back onto the freelist. It 1492 * can't be destroyed while still associated with 1493 * a file system. 1494 */ 1495 mutex_enter(&vnode_free_list_lock); 1496 if (vp->v_holdcnt > 0) { 1497 vp->v_freelisthd = &vnode_hold_list; 1498 } else { 1499 vp->v_freelisthd = &vnode_free_list; 1500 } 1501 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 1502 mutex_exit(&vnode_free_list_lock); 1503 mutex_exit(&vp->v_interlock); 1504 } 1505 } 1506 1507 void 1508 vrele(vnode_t *vp) 1509 { 1510 1511 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1512 1513 if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) { 1514 return; 1515 } 1516 mutex_enter(&vp->v_interlock); 1517 vrelel(vp, 0); 1518 } 1519 1520 static void 1521 vrele_thread(void *cookie) 1522 { 1523 vnode_t *vp; 1524 1525 for (;;) { 1526 mutex_enter(&vrele_lock); 1527 while (TAILQ_EMPTY(&vrele_list)) { 1528 vrele_gen++; 1529 cv_broadcast(&vrele_cv); 1530 cv_timedwait(&vrele_cv, &vrele_lock, hz); 1531 } 1532 vp = TAILQ_FIRST(&vrele_list); 1533 TAILQ_REMOVE(&vrele_list, vp, v_freelist); 1534 vrele_pending--; 1535 mutex_exit(&vrele_lock); 1536 1537 /* 1538 * If not the last reference, then ignore the vnode 1539 * and look for more work. 1540 */ 1541 mutex_enter(&vp->v_interlock); 1542 KASSERT((vp->v_iflag & VI_INACTPEND) != 0); 1543 vp->v_iflag &= ~VI_INACTPEND; 1544 vrelel(vp, 0); 1545 } 1546 } 1547 1548 /* 1549 * Page or buffer structure gets a reference. 1550 * Called with v_interlock held. 
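 *
 * An illustrative sketch, not part of the original file: a caller that
 * does not already own the interlock takes a hold like this (bgetvp()
 * above is entered with the interlock held and calls vholdl() directly;
 * brelvp() drops the hold again with holdrelel()):
 *
 *	mutex_enter(&vp->v_interlock);
 *	vholdl(vp);
 *	mutex_exit(&vp->v_interlock);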
1551 */ 1552 void 1553 vholdl(vnode_t *vp) 1554 { 1555 1556 KASSERT(mutex_owned(&vp->v_interlock)); 1557 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1558 1559 if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) { 1560 mutex_enter(&vnode_free_list_lock); 1561 KASSERT(vp->v_freelisthd == &vnode_free_list); 1562 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 1563 vp->v_freelisthd = &vnode_hold_list; 1564 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 1565 mutex_exit(&vnode_free_list_lock); 1566 } 1567 } 1568 1569 /* 1570 * Page or buffer structure frees a reference. 1571 * Called with v_interlock held. 1572 */ 1573 void 1574 holdrelel(vnode_t *vp) 1575 { 1576 1577 KASSERT(mutex_owned(&vp->v_interlock)); 1578 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1579 1580 if (vp->v_holdcnt <= 0) { 1581 vpanic(vp, "holdrelel: holdcnt vp %p"); 1582 } 1583 1584 vp->v_holdcnt--; 1585 if (vp->v_holdcnt == 0 && vp->v_usecount == 0) { 1586 mutex_enter(&vnode_free_list_lock); 1587 KASSERT(vp->v_freelisthd == &vnode_hold_list); 1588 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 1589 vp->v_freelisthd = &vnode_free_list; 1590 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 1591 mutex_exit(&vnode_free_list_lock); 1592 } 1593 } 1594 1595 /* 1596 * Vnode reference, where a reference is already held by some other 1597 * object (for example, a file structure). 1598 */ 1599 void 1600 vref(vnode_t *vp) 1601 { 1602 1603 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1604 KASSERT(vp->v_usecount != 0); 1605 1606 atomic_inc_uint(&vp->v_usecount); 1607 } 1608 1609 /* 1610 * Remove any vnodes in the vnode table belonging to mount point mp. 1611 * 1612 * If FORCECLOSE is not specified, there should not be any active ones, 1613 * return error if any are found (nb: this is a user error, not a 1614 * system error). If FORCECLOSE is specified, detach any active vnodes 1615 * that are found. 1616 * 1617 * If WRITECLOSE is set, only flush out regular file vnodes open for 1618 * writing. 1619 * 1620 * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped. 1621 */ 1622 #ifdef DEBUG 1623 int busyprt = 0; /* print out busy vnodes */ 1624 struct ctldebug debug1 = { "busyprt", &busyprt }; 1625 #endif 1626 1627 static vnode_t * 1628 vflushnext(vnode_t *mvp, int *when) 1629 { 1630 1631 if (hardclock_ticks > *when) { 1632 mutex_exit(&mntvnode_lock); 1633 yield(); 1634 mutex_enter(&mntvnode_lock); 1635 *when = hardclock_ticks + hz / 10; 1636 } 1637 1638 return vunmark(mvp); 1639 } 1640 1641 int 1642 vflush(struct mount *mp, vnode_t *skipvp, int flags) 1643 { 1644 vnode_t *vp, *mvp; 1645 int busy = 0, when = 0, gen; 1646 1647 /* 1648 * First, flush out any vnode references from vrele_list. 1649 */ 1650 mutex_enter(&vrele_lock); 1651 gen = vrele_gen; 1652 while (vrele_pending && gen == vrele_gen) { 1653 cv_broadcast(&vrele_cv); 1654 cv_wait(&vrele_cv, &vrele_lock); 1655 } 1656 mutex_exit(&vrele_lock); 1657 1658 /* Allocate a marker vnode. */ 1659 if ((mvp = vnalloc(mp)) == NULL) 1660 return (ENOMEM); 1661 1662 /* 1663 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() 1664 * and vclean() are called 1665 */ 1666 mutex_enter(&mntvnode_lock); 1667 for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL; 1668 vp = vflushnext(mvp, &when)) { 1669 vmark(mvp, vp); 1670 if (vp->v_mount != mp || vismarker(vp)) 1671 continue; 1672 /* 1673 * Skip over a selected vnode. 1674 */ 1675 if (vp == skipvp) 1676 continue; 1677 mutex_enter(&vp->v_interlock); 1678 /* 1679 * Ignore clean but still referenced vnodes. 
1680 */ 1681 if ((vp->v_iflag & VI_CLEAN) != 0) { 1682 mutex_exit(&vp->v_interlock); 1683 continue; 1684 } 1685 /* 1686 * Skip over a vnodes marked VSYSTEM. 1687 */ 1688 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 1689 mutex_exit(&vp->v_interlock); 1690 continue; 1691 } 1692 /* 1693 * If WRITECLOSE is set, only flush out regular file 1694 * vnodes open for writing. 1695 */ 1696 if ((flags & WRITECLOSE) && 1697 (vp->v_writecount == 0 || vp->v_type != VREG)) { 1698 mutex_exit(&vp->v_interlock); 1699 continue; 1700 } 1701 /* 1702 * With v_usecount == 0, all we need to do is clear 1703 * out the vnode data structures and we are done. 1704 */ 1705 if (vp->v_usecount == 0) { 1706 mutex_exit(&mntvnode_lock); 1707 vremfree(vp); 1708 vp->v_usecount = 1; 1709 vclean(vp, DOCLOSE); 1710 vrelel(vp, 0); 1711 mutex_enter(&mntvnode_lock); 1712 continue; 1713 } 1714 /* 1715 * If FORCECLOSE is set, forcibly close the vnode. 1716 * For block or character devices, revert to an 1717 * anonymous device. For all other files, just 1718 * kill them. 1719 */ 1720 if (flags & FORCECLOSE) { 1721 mutex_exit(&mntvnode_lock); 1722 atomic_inc_uint(&vp->v_usecount); 1723 if (vp->v_type != VBLK && vp->v_type != VCHR) { 1724 vclean(vp, DOCLOSE); 1725 vrelel(vp, 0); 1726 } else { 1727 vclean(vp, 0); 1728 vp->v_op = spec_vnodeop_p; /* XXXSMP */ 1729 mutex_exit(&vp->v_interlock); 1730 /* 1731 * The vnode isn't clean, but still resides 1732 * on the mount list. Remove it. XXX This 1733 * is a bit dodgy. 1734 */ 1735 insmntque(vp, NULL); 1736 vrele(vp); 1737 } 1738 mutex_enter(&mntvnode_lock); 1739 continue; 1740 } 1741 #ifdef DEBUG 1742 if (busyprt) 1743 vprint("vflush: busy vnode", vp); 1744 #endif 1745 mutex_exit(&vp->v_interlock); 1746 busy++; 1747 } 1748 mutex_exit(&mntvnode_lock); 1749 vnfree(mvp); 1750 if (busy) 1751 return (EBUSY); 1752 return (0); 1753 } 1754 1755 /* 1756 * Disassociate the underlying file system from a vnode. 1757 * 1758 * Must be called with the interlock held, and will return with it held. 1759 */ 1760 void 1761 vclean(vnode_t *vp, int flags) 1762 { 1763 lwp_t *l = curlwp; 1764 bool recycle, active; 1765 int error; 1766 1767 KASSERT(mutex_owned(&vp->v_interlock)); 1768 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1769 KASSERT(vp->v_usecount != 0); 1770 1771 /* If cleaning is already in progress wait until done and return. */ 1772 if (vp->v_iflag & VI_XLOCK) { 1773 vwait(vp, VI_XLOCK); 1774 return; 1775 } 1776 1777 /* If already clean, nothing to do. */ 1778 if ((vp->v_iflag & VI_CLEAN) != 0) { 1779 return; 1780 } 1781 1782 /* 1783 * Prevent the vnode from being recycled or brought into use 1784 * while we clean it out. 1785 */ 1786 vp->v_iflag |= VI_XLOCK; 1787 if (vp->v_iflag & VI_EXECMAP) { 1788 atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages); 1789 atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages); 1790 } 1791 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP); 1792 active = (vp->v_usecount > 1); 1793 1794 /* XXXAD should not lock vnode under layer */ 1795 VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK); 1796 1797 /* 1798 * Clean out any cached data associated with the vnode. 1799 * If purging an active vnode, it must be closed and 1800 * deactivated before being reclaimed. Note that the 1801 * VOP_INACTIVE will unlock the vnode. 1802 */ 1803 if (flags & DOCLOSE) { 1804 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); 1805 if (error != 0) { 1806 /* XXX, fix vn_start_write's grab of mp and use that. 
*/ 1807 1808 if (wapbl_vphaswapbl(vp)) 1809 WAPBL_DISCARD(wapbl_vptomp(vp)); 1810 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0); 1811 } 1812 KASSERT(error == 0); 1813 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1814 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) { 1815 spec_node_revoke(vp); 1816 } 1817 } 1818 if (active) { 1819 VOP_INACTIVE(vp, &recycle); 1820 } else { 1821 /* 1822 * Any other processes trying to obtain this lock must first 1823 * wait for VI_XLOCK to clear, then call the new lock operation. 1824 */ 1825 VOP_UNLOCK(vp, 0); 1826 } 1827 1828 /* Disassociate the underlying file system from the vnode. */ 1829 if (VOP_RECLAIM(vp)) { 1830 vpanic(vp, "vclean: cannot reclaim"); 1831 } 1832 1833 KASSERT(vp->v_uobj.uo_npages == 0); 1834 if (vp->v_type == VREG && vp->v_ractx != NULL) { 1835 uvm_ra_freectx(vp->v_ractx); 1836 vp->v_ractx = NULL; 1837 } 1838 cache_purge(vp); 1839 1840 /* Done with purge, notify sleepers of the grim news. */ 1841 mutex_enter(&vp->v_interlock); 1842 vp->v_op = dead_vnodeop_p; 1843 vp->v_tag = VT_NON; 1844 vp->v_vnlock = &vp->v_lock; 1845 KNOTE(&vp->v_klist, NOTE_REVOKE); 1846 vp->v_iflag &= ~(VI_XLOCK | VI_FREEING); 1847 vp->v_vflag &= ~VV_LOCKSWORK; 1848 if ((flags & DOCLOSE) != 0) { 1849 vp->v_iflag |= VI_CLEAN; 1850 } 1851 cv_broadcast(&vp->v_cv); 1852 1853 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1854 } 1855 1856 /* 1857 * Recycle an unused vnode to the front of the free list. 1858 * Release the passed interlock if the vnode will be recycled. 1859 */ 1860 int 1861 vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l) 1862 { 1863 1864 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1865 1866 mutex_enter(&vp->v_interlock); 1867 if (vp->v_usecount != 0) { 1868 mutex_exit(&vp->v_interlock); 1869 return (0); 1870 } 1871 if (inter_lkp) 1872 mutex_exit(inter_lkp); 1873 vremfree(vp); 1874 vp->v_usecount = 1; 1875 vclean(vp, DOCLOSE); 1876 vrelel(vp, 0); 1877 return (1); 1878 } 1879 1880 /* 1881 * Eliminate all activity associated with a vnode in preparation for 1882 * reuse. Drops a reference from the vnode. 1883 */ 1884 void 1885 vgone(vnode_t *vp) 1886 { 1887 1888 mutex_enter(&vp->v_interlock); 1889 vclean(vp, DOCLOSE); 1890 vrelel(vp, 0); 1891 } 1892 1893 /* 1894 * Lookup a vnode by device number. 1895 */ 1896 int 1897 vfinddev(dev_t dev, enum vtype type, vnode_t **vpp) 1898 { 1899 vnode_t *vp; 1900 int rc = 0; 1901 1902 mutex_enter(&device_lock); 1903 for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) { 1904 if (dev != vp->v_rdev || type != vp->v_type) 1905 continue; 1906 *vpp = vp; 1907 rc = 1; 1908 break; 1909 } 1910 mutex_exit(&device_lock); 1911 return (rc); 1912 } 1913 1914 /* 1915 * Revoke all the vnodes corresponding to the specified minor number 1916 * range (endpoints inclusive) of the specified major. 
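 *
 * An illustrative sketch, not part of the original file: a driver's
 * detach routine typically revokes both device flavours for each of its
 * minors ("bmaj", "cmaj" and "mn" are hypothetical block/character
 * majors and a minor number):
 *
 *	vdevgone(bmaj, mn, mn, VBLK);
 *	vdevgone(cmaj, mn, mn, VCHR);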
1917 */ 1918 void 1919 vdevgone(int maj, int minl, int minh, enum vtype type) 1920 { 1921 vnode_t *vp, **vpp; 1922 dev_t dev; 1923 int mn; 1924 1925 vp = NULL; /* XXX gcc */ 1926 1927 mutex_enter(&device_lock); 1928 for (mn = minl; mn <= minh; mn++) { 1929 dev = makedev(maj, mn); 1930 vpp = &specfs_hash[SPECHASH(dev)]; 1931 for (vp = *vpp; vp != NULL;) { 1932 mutex_enter(&vp->v_interlock); 1933 if ((vp->v_iflag & VI_CLEAN) != 0 || 1934 dev != vp->v_rdev || type != vp->v_type) { 1935 mutex_exit(&vp->v_interlock); 1936 vp = vp->v_specnext; 1937 continue; 1938 } 1939 mutex_exit(&device_lock); 1940 if (vget(vp, LK_INTERLOCK) == 0) { 1941 VOP_REVOKE(vp, REVOKEALL); 1942 vrele(vp); 1943 } 1944 mutex_enter(&device_lock); 1945 vp = *vpp; 1946 } 1947 } 1948 mutex_exit(&device_lock); 1949 } 1950 1951 /* 1952 * Calculate the total number of references to a special device. 1953 */ 1954 int 1955 vcount(vnode_t *vp) 1956 { 1957 int count; 1958 1959 mutex_enter(&device_lock); 1960 mutex_enter(&vp->v_interlock); 1961 if (vp->v_specnode == NULL) { 1962 count = vp->v_usecount - ((vp->v_iflag & VI_INACTPEND) != 0); 1963 mutex_exit(&vp->v_interlock); 1964 mutex_exit(&device_lock); 1965 return (count); 1966 } 1967 mutex_exit(&vp->v_interlock); 1968 count = vp->v_specnode->sn_dev->sd_opencnt; 1969 mutex_exit(&device_lock); 1970 return (count); 1971 } 1972 1973 /* 1974 * Eliminate all activity associated with the requested vnode 1975 * and with all vnodes aliased to the requested vnode. 1976 */ 1977 void 1978 vrevoke(vnode_t *vp) 1979 { 1980 vnode_t *vq, **vpp; 1981 enum vtype type; 1982 dev_t dev; 1983 1984 KASSERT(vp->v_usecount > 0); 1985 1986 mutex_enter(&vp->v_interlock); 1987 if ((vp->v_iflag & VI_CLEAN) != 0) { 1988 mutex_exit(&vp->v_interlock); 1989 return; 1990 } else if (vp->v_type != VBLK && vp->v_type != VCHR) { 1991 atomic_inc_uint(&vp->v_usecount); 1992 vclean(vp, DOCLOSE); 1993 vrelel(vp, 0); 1994 return; 1995 } else { 1996 dev = vp->v_rdev; 1997 type = vp->v_type; 1998 mutex_exit(&vp->v_interlock); 1999 } 2000 2001 vpp = &specfs_hash[SPECHASH(dev)]; 2002 mutex_enter(&device_lock); 2003 for (vq = *vpp; vq != NULL;) { 2004 /* If clean or being cleaned, then ignore it. 
*/ 2005 mutex_enter(&vq->v_interlock); 2006 if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 || 2007 vq->v_rdev != dev || vq->v_type != type) { 2008 mutex_exit(&vq->v_interlock); 2009 vq = vq->v_specnext; 2010 continue; 2011 } 2012 mutex_exit(&device_lock); 2013 if (vq->v_usecount == 0) { 2014 vremfree(vq); 2015 vq->v_usecount = 1; 2016 } else { 2017 atomic_inc_uint(&vq->v_usecount); 2018 } 2019 vclean(vq, DOCLOSE); 2020 vrelel(vq, 0); 2021 mutex_enter(&device_lock); 2022 vq = *vpp; 2023 } 2024 mutex_exit(&device_lock); 2025 } 2026 2027 /* 2028 * sysctl helper routine to return list of supported fstypes 2029 */ 2030 int 2031 sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS) 2032 { 2033 char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)]; 2034 char *where = oldp; 2035 struct vfsops *v; 2036 size_t needed, left, slen; 2037 int error, first; 2038 2039 if (newp != NULL) 2040 return (EPERM); 2041 if (namelen != 0) 2042 return (EINVAL); 2043 2044 first = 1; 2045 error = 0; 2046 needed = 0; 2047 left = *oldlenp; 2048 2049 sysctl_unlock(); 2050 mutex_enter(&vfs_list_lock); 2051 LIST_FOREACH(v, &vfs_list, vfs_list) { 2052 if (where == NULL) 2053 needed += strlen(v->vfs_name) + 1; 2054 else { 2055 memset(bf, 0, sizeof(bf)); 2056 if (first) { 2057 strncpy(bf, v->vfs_name, sizeof(bf)); 2058 first = 0; 2059 } else { 2060 bf[0] = ' '; 2061 strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1); 2062 } 2063 bf[sizeof(bf)-1] = '\0'; 2064 slen = strlen(bf); 2065 if (left < slen + 1) 2066 break; 2067 v->vfs_refcount++; 2068 mutex_exit(&vfs_list_lock); 2069 /* +1 to copy out the trailing NUL byte */ 2070 error = copyout(bf, where, slen + 1); 2071 mutex_enter(&vfs_list_lock); 2072 v->vfs_refcount--; 2073 if (error) 2074 break; 2075 where += slen; 2076 needed += slen; 2077 left -= slen; 2078 } 2079 } 2080 mutex_exit(&vfs_list_lock); 2081 sysctl_relock(); 2082 *oldlenp = needed; 2083 return (error); 2084 } 2085 2086 2087 int kinfo_vdebug = 1; 2088 int kinfo_vgetfailed; 2089 #define KINFO_VNODESLOP 10 2090 /* 2091 * Dump vnode list (via sysctl). 2092 * Copyout address of vnode followed by vnode. 2093 */ 2094 /* ARGSUSED */ 2095 int 2096 sysctl_kern_vnode(SYSCTLFN_ARGS) 2097 { 2098 char *where = oldp; 2099 size_t *sizep = oldlenp; 2100 struct mount *mp, *nmp; 2101 vnode_t *vp, *mvp, vbuf; 2102 char *bp = where, *savebp; 2103 char *ewhere; 2104 int error; 2105 2106 if (namelen != 0) 2107 return (EOPNOTSUPP); 2108 if (newp != NULL) 2109 return (EPERM); 2110 2111 #define VPTRSZ sizeof(vnode_t *) 2112 #define VNODESZ sizeof(vnode_t) 2113 if (where == NULL) { 2114 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ); 2115 return (0); 2116 } 2117 ewhere = where + *sizep; 2118 2119 sysctl_unlock(); 2120 mutex_enter(&mountlist_lock); 2121 for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; 2122 mp = nmp) { 2123 if (vfs_busy(mp, &nmp)) { 2124 continue; 2125 } 2126 savebp = bp; 2127 /* Allocate a marker vnode. */ 2128 if ((mvp = vnalloc(mp)) == NULL) { 2129 sysctl_relock(); 2130 return (ENOMEM); 2131 } 2132 mutex_enter(&mntvnode_lock); 2133 for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) { 2134 vmark(mvp, vp); 2135 /* 2136 * Check that the vp is still associated with 2137 * this filesystem. RACE: could have been 2138 * recycled onto the same filesystem. 
2139 */ 2140 if (vp->v_mount != mp || vismarker(vp)) 2141 continue; 2142 if (bp + VPTRSZ + VNODESZ > ewhere) { 2143 (void)vunmark(mvp); 2144 mutex_exit(&mntvnode_lock); 2145 vnfree(mvp); 2146 sysctl_relock(); 2147 *sizep = bp - where; 2148 return (ENOMEM); 2149 } 2150 memcpy(&vbuf, vp, VNODESZ); 2151 mutex_exit(&mntvnode_lock); 2152 if ((error = copyout(&vp, bp, VPTRSZ)) || 2153 (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) { 2154 mutex_enter(&mntvnode_lock); 2155 (void)vunmark(mvp); 2156 mutex_exit(&mntvnode_lock); 2157 vnfree(mvp); 2158 sysctl_relock(); 2159 return (error); 2160 } 2161 bp += VPTRSZ + VNODESZ; 2162 mutex_enter(&mntvnode_lock); 2163 } 2164 mutex_exit(&mntvnode_lock); 2165 vnfree(mvp); 2166 vfs_unbusy(mp, false, &nmp); 2167 } 2168 mutex_exit(&mountlist_lock); 2169 sysctl_relock(); 2170 2171 *sizep = bp - where; 2172 return (0); 2173 } 2174 2175 /* 2176 * Remove clean vnodes from a mountpoint's vnode list. 2177 */ 2178 void 2179 vfs_scrubvnlist(struct mount *mp) 2180 { 2181 vnode_t *vp, *nvp; 2182 2183 retry: 2184 mutex_enter(&mntvnode_lock); 2185 for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) { 2186 nvp = TAILQ_NEXT(vp, v_mntvnodes); 2187 mutex_enter(&vp->v_interlock); 2188 if ((vp->v_iflag & VI_CLEAN) != 0) { 2189 TAILQ_REMOVE(&mp->mnt_vnodelist, vp, v_mntvnodes); 2190 vp->v_mount = NULL; 2191 mutex_exit(&mntvnode_lock); 2192 mutex_exit(&vp->v_interlock); 2193 vfs_destroy(mp); 2194 goto retry; 2195 } 2196 mutex_exit(&vp->v_interlock); 2197 } 2198 mutex_exit(&mntvnode_lock); 2199 } 2200 2201 /* 2202 * Check to see if a filesystem is mounted on a block device. 2203 */ 2204 int 2205 vfs_mountedon(vnode_t *vp) 2206 { 2207 vnode_t *vq; 2208 int error = 0; 2209 2210 if (vp->v_type != VBLK) 2211 return ENOTBLK; 2212 if (vp->v_specmountpoint != NULL) 2213 return (EBUSY); 2214 mutex_enter(&device_lock); 2215 for (vq = specfs_hash[SPECHASH(vp->v_rdev)]; vq != NULL; 2216 vq = vq->v_specnext) { 2217 if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) 2218 continue; 2219 if (vq->v_specmountpoint != NULL) { 2220 error = EBUSY; 2221 break; 2222 } 2223 } 2224 mutex_exit(&device_lock); 2225 return (error); 2226 } 2227 2228 /* 2229 * Unmount all file systems. 2230 * We traverse the list in reverse order under the assumption that doing so 2231 * will avoid needing to worry about dependencies. 2232 */ 2233 void 2234 vfs_unmountall(struct lwp *l) 2235 { 2236 struct mount *mp, *nmp; 2237 int allerror, error; 2238 2239 printf("unmounting file systems..."); 2240 for (allerror = 0, mp = CIRCLEQ_LAST(&mountlist); 2241 !CIRCLEQ_EMPTY(&mountlist); 2242 mp = nmp) { 2243 nmp = CIRCLEQ_PREV(mp, mnt_list); 2244 #ifdef DEBUG 2245 printf("\nunmounting %s (%s)...", 2246 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname); 2247 #endif 2248 atomic_inc_uint(&mp->mnt_refcnt); 2249 if ((error = dounmount(mp, MNT_FORCE, l)) != 0) { 2250 printf("unmount of %s failed with error %d\n", 2251 mp->mnt_stat.f_mntonname, error); 2252 allerror = 1; 2253 } 2254 } 2255 printf(" done\n"); 2256 if (allerror) 2257 printf("WARNING: some file systems would not unmount\n"); 2258 } 2259 2260 /* 2261 * Sync and unmount file systems before shutting down. 2262 */ 2263 void 2264 vfs_shutdown(void) 2265 { 2266 struct lwp *l; 2267 2268 /* XXX we're certainly not running in lwp0's context! */ 2269 l = curlwp; 2270 if (l == NULL) 2271 l = &lwp0; 2272 2273 printf("syncing disks... 
"); 2274 2275 /* remove user processes from run queue */ 2276 suspendsched(); 2277 (void) spl0(); 2278 2279 /* avoid coming back this way again if we panic. */ 2280 doing_shutdown = 1; 2281 2282 sys_sync(l, NULL, NULL); 2283 2284 /* Wait for sync to finish. */ 2285 if (buf_syncwait() != 0) { 2286 #if defined(DDB) && defined(DEBUG_HALT_BUSY) 2287 Debugger(); 2288 #endif 2289 printf("giving up\n"); 2290 return; 2291 } else 2292 printf("done\n"); 2293 2294 /* 2295 * If we've panic'd, don't make the situation potentially 2296 * worse by unmounting the file systems. 2297 */ 2298 if (panicstr != NULL) 2299 return; 2300 2301 /* Release inodes held by texts before update. */ 2302 #ifdef notdef 2303 vnshutdown(); 2304 #endif 2305 /* Unmount file systems. */ 2306 vfs_unmountall(l); 2307 } 2308 2309 /* 2310 * Mount the root file system. If the operator didn't specify a 2311 * file system to use, try all possible file systems until one 2312 * succeeds. 2313 */ 2314 int 2315 vfs_mountroot(void) 2316 { 2317 struct vfsops *v; 2318 int error = ENODEV; 2319 2320 if (root_device == NULL) 2321 panic("vfs_mountroot: root device unknown"); 2322 2323 switch (device_class(root_device)) { 2324 case DV_IFNET: 2325 if (rootdev != NODEV) 2326 panic("vfs_mountroot: rootdev set for DV_IFNET " 2327 "(0x%llx -> %llu,%llu)", 2328 (unsigned long long)rootdev, 2329 (unsigned long long)major(rootdev), 2330 (unsigned long long)minor(rootdev)); 2331 break; 2332 2333 case DV_DISK: 2334 if (rootdev == NODEV) 2335 panic("vfs_mountroot: rootdev not set for DV_DISK"); 2336 if (bdevvp(rootdev, &rootvp)) 2337 panic("vfs_mountroot: can't get vnode for rootdev"); 2338 error = VOP_OPEN(rootvp, FREAD, FSCRED); 2339 if (error) { 2340 printf("vfs_mountroot: can't open root device\n"); 2341 return (error); 2342 } 2343 break; 2344 2345 default: 2346 printf("%s: inappropriate for root file system\n", 2347 device_xname(root_device)); 2348 return (ENODEV); 2349 } 2350 2351 /* 2352 * If user specified a root fs type, use it. Make sure the 2353 * specified type exists and has a mount_root() 2354 */ 2355 if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) { 2356 v = vfs_getopsbyname(rootfstype); 2357 error = EFTYPE; 2358 if (v != NULL) { 2359 if (v->vfs_mountroot != NULL) { 2360 error = (v->vfs_mountroot)(); 2361 } 2362 v->vfs_refcount--; 2363 } 2364 goto done; 2365 } 2366 2367 /* 2368 * Try each file system currently configured into the kernel. 
2369 */ 2370 mutex_enter(&vfs_list_lock); 2371 LIST_FOREACH(v, &vfs_list, vfs_list) { 2372 if (v->vfs_mountroot == NULL) 2373 continue; 2374 #ifdef DEBUG 2375 aprint_normal("mountroot: trying %s...\n", v->vfs_name); 2376 #endif 2377 v->vfs_refcount++; 2378 mutex_exit(&vfs_list_lock); 2379 error = (*v->vfs_mountroot)(); 2380 mutex_enter(&vfs_list_lock); 2381 v->vfs_refcount--; 2382 if (!error) { 2383 aprint_normal("root file system type: %s\n", 2384 v->vfs_name); 2385 break; 2386 } 2387 } 2388 mutex_exit(&vfs_list_lock); 2389 2390 if (v == NULL) { 2391 printf("no file system for %s", device_xname(root_device)); 2392 if (device_class(root_device) == DV_DISK) 2393 printf(" (dev 0x%llx)", (unsigned long long)rootdev); 2394 printf("\n"); 2395 error = EFTYPE; 2396 } 2397 2398 done: 2399 if (error && device_class(root_device) == DV_DISK) { 2400 VOP_CLOSE(rootvp, FREAD, FSCRED); 2401 vrele(rootvp); 2402 } 2403 return (error); 2404 } 2405 2406 /* 2407 * Get a new unique fsid 2408 */ 2409 void 2410 vfs_getnewfsid(struct mount *mp) 2411 { 2412 static u_short xxxfs_mntid; 2413 fsid_t tfsid; 2414 int mtype; 2415 2416 mutex_enter(&mntid_lock); 2417 mtype = makefstype(mp->mnt_op->vfs_name); 2418 mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0); 2419 mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype; 2420 mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; 2421 if (xxxfs_mntid == 0) 2422 ++xxxfs_mntid; 2423 tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid); 2424 tfsid.__fsid_val[1] = mtype; 2425 if (!CIRCLEQ_EMPTY(&mountlist)) { 2426 while (vfs_getvfs(&tfsid)) { 2427 tfsid.__fsid_val[0]++; 2428 xxxfs_mntid++; 2429 } 2430 } 2431 mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0]; 2432 mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0]; 2433 mutex_exit(&mntid_lock); 2434 } 2435 2436 /* 2437 * Make a 'unique' number from a mount type name. 2438 */ 2439 long 2440 makefstype(const char *type) 2441 { 2442 long rv; 2443 2444 for (rv = 0; *type; type++) { 2445 rv <<= 2; 2446 rv ^= *type; 2447 } 2448 return rv; 2449 } 2450 2451 /* 2452 * Set vnode attributes to VNOVAL 2453 */ 2454 void 2455 vattr_null(struct vattr *vap) 2456 { 2457 2458 vap->va_type = VNON; 2459 2460 /* 2461 * Assign individually so that it is safe even if size and 2462 * sign of each member are varied. 2463 */ 2464 vap->va_mode = VNOVAL; 2465 vap->va_nlink = VNOVAL; 2466 vap->va_uid = VNOVAL; 2467 vap->va_gid = VNOVAL; 2468 vap->va_fsid = VNOVAL; 2469 vap->va_fileid = VNOVAL; 2470 vap->va_size = VNOVAL; 2471 vap->va_blocksize = VNOVAL; 2472 vap->va_atime.tv_sec = 2473 vap->va_mtime.tv_sec = 2474 vap->va_ctime.tv_sec = 2475 vap->va_birthtime.tv_sec = VNOVAL; 2476 vap->va_atime.tv_nsec = 2477 vap->va_mtime.tv_nsec = 2478 vap->va_ctime.tv_nsec = 2479 vap->va_birthtime.tv_nsec = VNOVAL; 2480 vap->va_gen = VNOVAL; 2481 vap->va_flags = VNOVAL; 2482 vap->va_rdev = VNOVAL; 2483 vap->va_bytes = VNOVAL; 2484 vap->va_vaflags = 0; 2485 } 2486 2487 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) 2488 #define ARRAY_PRINT(idx, arr) \ 2489 ((unsigned int)(idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN") 2490 2491 const char * const vnode_tags[] = { VNODE_TAGS }; 2492 const char * const vnode_types[] = { VNODE_TYPES }; 2493 const char vnode_flagbits[] = VNODE_FLAGBITS; 2494 2495 /* 2496 * Print out a description of a vnode. 2497 */ 2498 void 2499 vprint(const char *label, struct vnode *vp) 2500 { 2501 struct vnlock *vl; 2502 char bf[96]; 2503 int flag; 2504 2505 vl = (vp->v_vnlock != NULL ? 
vp->v_vnlock : &vp->v_lock); 2506 flag = vp->v_iflag | vp->v_vflag | vp->v_uflag; 2507 snprintb(bf, sizeof(bf), vnode_flagbits, flag); 2508 2509 if (label != NULL) 2510 printf("%s: ", label); 2511 printf("vnode @ %p, flags (%s)\n\ttag %s(%d), type %s(%d), " 2512 "usecount %d, writecount %d, holdcount %d\n" 2513 "\tfreelisthd %p, mount %p, data %p lock %p recursecnt %d\n", 2514 vp, bf, ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag, 2515 ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type, 2516 vp->v_usecount, vp->v_writecount, vp->v_holdcnt, 2517 vp->v_freelisthd, vp->v_mount, vp->v_data, vl, vl->vl_recursecnt); 2518 if (vp->v_data != NULL) { 2519 printf("\t"); 2520 VOP_PRINT(vp); 2521 } 2522 } 2523 2524 #ifdef DEBUG 2525 /* 2526 * List all of the locked vnodes in the system. 2527 * Called when debugging the kernel. 2528 */ 2529 void 2530 printlockedvnodes(void) 2531 { 2532 struct mount *mp, *nmp; 2533 struct vnode *vp; 2534 2535 printf("Locked vnodes\n"); 2536 mutex_enter(&mountlist_lock); 2537 for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; 2538 mp = nmp) { 2539 if (vfs_busy(mp, &nmp)) { 2540 continue; 2541 } 2542 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 2543 if (VOP_ISLOCKED(vp)) 2544 vprint(NULL, vp); 2545 } 2546 mutex_enter(&mountlist_lock); 2547 vfs_unbusy(mp, false, &nmp); 2548 } 2549 mutex_exit(&mountlist_lock); 2550 } 2551 #endif 2552 2553 /* 2554 * Do the usual access checking. 2555 * file_mode, uid and gid are from the vnode in question, 2556 * while acc_mode and cred are from the VOP_ACCESS parameter list 2557 */ 2558 int 2559 vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid, 2560 mode_t acc_mode, kauth_cred_t cred) 2561 { 2562 mode_t mask; 2563 int error, ismember; 2564 2565 /* 2566 * Super-user always gets read/write access, but execute access depends 2567 * on at least one execute bit being set. 2568 */ 2569 if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL) == 0) { 2570 if ((acc_mode & VEXEC) && type != VDIR && 2571 (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0) 2572 return (EACCES); 2573 return (0); 2574 } 2575 2576 mask = 0; 2577 2578 /* Otherwise, check the owner. */ 2579 if (kauth_cred_geteuid(cred) == uid) { 2580 if (acc_mode & VEXEC) 2581 mask |= S_IXUSR; 2582 if (acc_mode & VREAD) 2583 mask |= S_IRUSR; 2584 if (acc_mode & VWRITE) 2585 mask |= S_IWUSR; 2586 return ((file_mode & mask) == mask ? 0 : EACCES); 2587 } 2588 2589 /* Otherwise, check the groups. */ 2590 error = kauth_cred_ismember_gid(cred, gid, &ismember); 2591 if (error) 2592 return (error); 2593 if (kauth_cred_getegid(cred) == gid || ismember) { 2594 if (acc_mode & VEXEC) 2595 mask |= S_IXGRP; 2596 if (acc_mode & VREAD) 2597 mask |= S_IRGRP; 2598 if (acc_mode & VWRITE) 2599 mask |= S_IWGRP; 2600 return ((file_mode & mask) == mask ? 0 : EACCES); 2601 } 2602 2603 /* Otherwise, check everyone else. */ 2604 if (acc_mode & VEXEC) 2605 mask |= S_IXOTH; 2606 if (acc_mode & VREAD) 2607 mask |= S_IROTH; 2608 if (acc_mode & VWRITE) 2609 mask |= S_IWOTH; 2610 return ((file_mode & mask) == mask ? 0 : EACCES); 2611 } 2612 2613 /* 2614 * Given a file system name, look up the vfsops for that 2615 * file system, or return NULL if file system isn't present 2616 * in the kernel. 
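 * The vfsops returned has had its vfs_refcount bumped; the caller is
 * responsible for dropping that reference when done.  Illustrative
 * sketch only, mirroring the rootfstype branch of vfs_mountroot()
 * above:
 *
 *	struct vfsops *v;
 *
 *	v = vfs_getopsbyname("ffs");
 *	error = EFTYPE;
 *	if (v != NULL) {
 *		if (v->vfs_mountroot != NULL)
 *			error = (*v->vfs_mountroot)();
 *		v->vfs_refcount--;
 *	}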
2617 */ 2618 struct vfsops * 2619 vfs_getopsbyname(const char *name) 2620 { 2621 struct vfsops *v; 2622 2623 mutex_enter(&vfs_list_lock); 2624 LIST_FOREACH(v, &vfs_list, vfs_list) { 2625 if (strcmp(v->vfs_name, name) == 0) 2626 break; 2627 } 2628 if (v != NULL) 2629 v->vfs_refcount++; 2630 mutex_exit(&vfs_list_lock); 2631 2632 return (v); 2633 } 2634 2635 void 2636 copy_statvfs_info(struct statvfs *sbp, const struct mount *mp) 2637 { 2638 const struct statvfs *mbp; 2639 2640 if (sbp == (mbp = &mp->mnt_stat)) 2641 return; 2642 2643 (void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx)); 2644 sbp->f_fsid = mbp->f_fsid; 2645 sbp->f_owner = mbp->f_owner; 2646 sbp->f_flag = mbp->f_flag; 2647 sbp->f_syncwrites = mbp->f_syncwrites; 2648 sbp->f_asyncwrites = mbp->f_asyncwrites; 2649 sbp->f_syncreads = mbp->f_syncreads; 2650 sbp->f_asyncreads = mbp->f_asyncreads; 2651 (void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare)); 2652 (void)memcpy(sbp->f_fstypename, mbp->f_fstypename, 2653 sizeof(sbp->f_fstypename)); 2654 (void)memcpy(sbp->f_mntonname, mbp->f_mntonname, 2655 sizeof(sbp->f_mntonname)); 2656 (void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname, 2657 sizeof(sbp->f_mntfromname)); 2658 sbp->f_namemax = mbp->f_namemax; 2659 } 2660 2661 int 2662 set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom, 2663 const char *vfsname, struct mount *mp, struct lwp *l) 2664 { 2665 int error; 2666 size_t size; 2667 struct statvfs *sfs = &mp->mnt_stat; 2668 int (*fun)(const void *, void *, size_t, size_t *); 2669 2670 (void)strlcpy(mp->mnt_stat.f_fstypename, vfsname, 2671 sizeof(mp->mnt_stat.f_fstypename)); 2672 2673 if (onp) { 2674 struct cwdinfo *cwdi = l->l_proc->p_cwdi; 2675 fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr; 2676 if (cwdi->cwdi_rdir != NULL) { 2677 size_t len; 2678 char *bp; 2679 char *path = PNBUF_GET(); 2680 2681 bp = path + MAXPATHLEN; 2682 *--bp = '\0'; 2683 rw_enter(&cwdi->cwdi_lock, RW_READER); 2684 error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, 2685 path, MAXPATHLEN / 2, 0, l); 2686 rw_exit(&cwdi->cwdi_lock); 2687 if (error) { 2688 PNBUF_PUT(path); 2689 return error; 2690 } 2691 2692 len = strlen(bp); 2693 if (len > sizeof(sfs->f_mntonname) - 1) 2694 len = sizeof(sfs->f_mntonname) - 1; 2695 (void)strncpy(sfs->f_mntonname, bp, len); 2696 PNBUF_PUT(path); 2697 2698 if (len < sizeof(sfs->f_mntonname) - 1) { 2699 error = (*fun)(onp, &sfs->f_mntonname[len], 2700 sizeof(sfs->f_mntonname) - len - 1, &size); 2701 if (error) 2702 return error; 2703 size += len; 2704 } else { 2705 size = len; 2706 } 2707 } else { 2708 error = (*fun)(onp, &sfs->f_mntonname, 2709 sizeof(sfs->f_mntonname) - 1, &size); 2710 if (error) 2711 return error; 2712 } 2713 (void)memset(sfs->f_mntonname + size, 0, 2714 sizeof(sfs->f_mntonname) - size); 2715 } 2716 2717 if (fromp) { 2718 fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr; 2719 error = (*fun)(fromp, sfs->f_mntfromname, 2720 sizeof(sfs->f_mntfromname) - 1, &size); 2721 if (error) 2722 return error; 2723 (void)memset(sfs->f_mntfromname + size, 0, 2724 sizeof(sfs->f_mntfromname) - size); 2725 } 2726 return 0; 2727 } 2728 2729 void 2730 vfs_timestamp(struct timespec *ts) 2731 { 2732 2733 nanotime(ts); 2734 } 2735 2736 time_t rootfstime; /* recorded root fs time, if known */ 2737 void 2738 setrootfstime(time_t t) 2739 { 2740 rootfstime = t; 2741 } 2742 2743 /* 2744 * Sham lock manager for vnodes. This is a temporary measure. 
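 * Only LK_SHARED, LK_EXCLUSIVE and LK_RELEASE are accepted, optionally
 * combined with LK_NOWAIT and LK_CANRECURSE; they are mapped onto the
 * vnlock's rwlock, with recursive exclusive holds counted by hand.
 * Illustrative use (sketch only):
 *
 *	if (vlockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
 *		KASSERT(vlockstatus(&vp->v_lock) == LK_EXCLUSIVE);
 *		vlockmgr(&vp->v_lock, LK_RELEASE);
 *	}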
2745 */ 2746 int 2747 vlockmgr(struct vnlock *vl, int flags) 2748 { 2749 2750 KASSERT((flags & ~(LK_CANRECURSE | LK_NOWAIT | LK_TYPE_MASK)) == 0); 2751 2752 switch (flags & LK_TYPE_MASK) { 2753 case LK_SHARED: 2754 if (rw_tryenter(&vl->vl_lock, RW_READER)) { 2755 return 0; 2756 } 2757 if ((flags & LK_NOWAIT) != 0) { 2758 return EBUSY; 2759 } 2760 rw_enter(&vl->vl_lock, RW_READER); 2761 return 0; 2762 2763 case LK_EXCLUSIVE: 2764 if (rw_tryenter(&vl->vl_lock, RW_WRITER)) { 2765 return 0; 2766 } 2767 if ((vl->vl_canrecurse || (flags & LK_CANRECURSE) != 0) && 2768 rw_write_held(&vl->vl_lock)) { 2769 vl->vl_recursecnt++; 2770 return 0; 2771 } 2772 if ((flags & LK_NOWAIT) != 0) { 2773 return EBUSY; 2774 } 2775 rw_enter(&vl->vl_lock, RW_WRITER); 2776 return 0; 2777 2778 case LK_RELEASE: 2779 if (vl->vl_recursecnt != 0) { 2780 KASSERT(rw_write_held(&vl->vl_lock)); 2781 vl->vl_recursecnt--; 2782 return 0; 2783 } 2784 rw_exit(&vl->vl_lock); 2785 return 0; 2786 2787 default: 2788 panic("vlockmgr: flags %x", flags); 2789 } 2790 } 2791 2792 int 2793 vlockstatus(struct vnlock *vl) 2794 { 2795 2796 if (rw_write_held(&vl->vl_lock)) { 2797 return LK_EXCLUSIVE; 2798 } 2799 if (rw_read_held(&vl->vl_lock)) { 2800 return LK_SHARED; 2801 } 2802 return 0; 2803 } 2804 2805 /* 2806 * mount_specific_key_create -- 2807 * Create a key for subsystem mount-specific data. 2808 */ 2809 int 2810 mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor) 2811 { 2812 2813 return (specificdata_key_create(mount_specificdata_domain, keyp, dtor)); 2814 } 2815 2816 /* 2817 * mount_specific_key_delete -- 2818 * Delete a key for subsystem mount-specific data. 2819 */ 2820 void 2821 mount_specific_key_delete(specificdata_key_t key) 2822 { 2823 2824 specificdata_key_delete(mount_specificdata_domain, key); 2825 } 2826 2827 /* 2828 * mount_initspecific -- 2829 * Initialize a mount's specificdata container. 2830 */ 2831 void 2832 mount_initspecific(struct mount *mp) 2833 { 2834 int error; 2835 2836 error = specificdata_init(mount_specificdata_domain, 2837 &mp->mnt_specdataref); 2838 KASSERT(error == 0); 2839 } 2840 2841 /* 2842 * mount_finispecific -- 2843 * Finalize a mount's specificdata container. 2844 */ 2845 void 2846 mount_finispecific(struct mount *mp) 2847 { 2848 2849 specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref); 2850 } 2851 2852 /* 2853 * mount_getspecific -- 2854 * Return mount-specific data corresponding to the specified key. 2855 */ 2856 void * 2857 mount_getspecific(struct mount *mp, specificdata_key_t key) 2858 { 2859 2860 return (specificdata_getspecific(mount_specificdata_domain, 2861 &mp->mnt_specdataref, key)); 2862 } 2863 2864 /* 2865 * mount_setspecific -- 2866 * Set mount-specific data corresponding to the specified key. 
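 * Together with mount_specific_key_create(), mount_getspecific() and
 * mount_specific_key_delete() above, this gives a subsystem private
 * per-mount storage.  Illustrative use (the key, destructor and data
 * names are hypothetical):
 *
 *	static specificdata_key_t foo_key;
 *
 *	(void)mount_specific_key_create(&foo_key, foo_dtor);
 *	mount_setspecific(mp, foo_key, foo_data);
 *	foo_data = mount_getspecific(mp, foo_key);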
2867 */ 2868 void 2869 mount_setspecific(struct mount *mp, specificdata_key_t key, void *data) 2870 { 2871 2872 specificdata_setspecific(mount_specificdata_domain, 2873 &mp->mnt_specdataref, key, data); 2874 } 2875 2876 int 2877 VFS_MOUNT(struct mount *mp, const char *a, void *b, size_t *c) 2878 { 2879 int error; 2880 2881 KERNEL_LOCK(1, NULL); 2882 error = (*(mp->mnt_op->vfs_mount))(mp, a, b, c); 2883 KERNEL_UNLOCK_ONE(NULL); 2884 2885 return error; 2886 } 2887 2888 int 2889 VFS_START(struct mount *mp, int a) 2890 { 2891 int error; 2892 2893 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2894 KERNEL_LOCK(1, NULL); 2895 } 2896 error = (*(mp->mnt_op->vfs_start))(mp, a); 2897 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2898 KERNEL_UNLOCK_ONE(NULL); 2899 } 2900 2901 return error; 2902 } 2903 2904 int 2905 VFS_UNMOUNT(struct mount *mp, int a) 2906 { 2907 int error; 2908 2909 KERNEL_LOCK(1, NULL); 2910 error = (*(mp->mnt_op->vfs_unmount))(mp, a); 2911 KERNEL_UNLOCK_ONE(NULL); 2912 2913 return error; 2914 } 2915 2916 int 2917 VFS_ROOT(struct mount *mp, struct vnode **a) 2918 { 2919 int error; 2920 2921 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2922 KERNEL_LOCK(1, NULL); 2923 } 2924 error = (*(mp->mnt_op->vfs_root))(mp, a); 2925 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2926 KERNEL_UNLOCK_ONE(NULL); 2927 } 2928 2929 return error; 2930 } 2931 2932 int 2933 VFS_QUOTACTL(struct mount *mp, int a, uid_t b, void *c) 2934 { 2935 int error; 2936 2937 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2938 KERNEL_LOCK(1, NULL); 2939 } 2940 error = (*(mp->mnt_op->vfs_quotactl))(mp, a, b, c); 2941 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2942 KERNEL_UNLOCK_ONE(NULL); 2943 } 2944 2945 return error; 2946 } 2947 2948 int 2949 VFS_STATVFS(struct mount *mp, struct statvfs *a) 2950 { 2951 int error; 2952 2953 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2954 KERNEL_LOCK(1, NULL); 2955 } 2956 error = (*(mp->mnt_op->vfs_statvfs))(mp, a); 2957 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2958 KERNEL_UNLOCK_ONE(NULL); 2959 } 2960 2961 return error; 2962 } 2963 2964 int 2965 VFS_SYNC(struct mount *mp, int a, struct kauth_cred *b) 2966 { 2967 int error; 2968 2969 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2970 KERNEL_LOCK(1, NULL); 2971 } 2972 error = (*(mp->mnt_op->vfs_sync))(mp, a, b); 2973 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2974 KERNEL_UNLOCK_ONE(NULL); 2975 } 2976 2977 return error; 2978 } 2979 2980 int 2981 VFS_FHTOVP(struct mount *mp, struct fid *a, struct vnode **b) 2982 { 2983 int error; 2984 2985 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2986 KERNEL_LOCK(1, NULL); 2987 } 2988 error = (*(mp->mnt_op->vfs_fhtovp))(mp, a, b); 2989 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 2990 KERNEL_UNLOCK_ONE(NULL); 2991 } 2992 2993 return error; 2994 } 2995 2996 int 2997 VFS_VPTOFH(struct vnode *vp, struct fid *a, size_t *b) 2998 { 2999 int error; 3000 3001 if ((vp->v_vflag & VV_MPSAFE) == 0) { 3002 KERNEL_LOCK(1, NULL); 3003 } 3004 error = (*(vp->v_mount->mnt_op->vfs_vptofh))(vp, a, b); 3005 if ((vp->v_vflag & VV_MPSAFE) == 0) { 3006 KERNEL_UNLOCK_ONE(NULL); 3007 } 3008 3009 return error; 3010 } 3011 3012 int 3013 VFS_SNAPSHOT(struct mount *mp, struct vnode *a, struct timespec *b) 3014 { 3015 int error; 3016 3017 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3018 KERNEL_LOCK(1, NULL); 3019 } 3020 error = (*(mp->mnt_op->vfs_snapshot))(mp, a, b); 3021 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3022 KERNEL_UNLOCK_ONE(NULL); 3023 } 3024 3025 return error; 3026 } 3027 3028 int 3029 VFS_EXTATTRCTL(struct mount *mp, int a, struct vnode *b, int c, const 
char *d) 3030 { 3031 int error; 3032 3033 KERNEL_LOCK(1, NULL); /* XXXSMP check ffs */ 3034 error = (*(mp->mnt_op->vfs_extattrctl))(mp, a, b, c, d); 3035 KERNEL_UNLOCK_ONE(NULL); /* XXX */ 3036 3037 return error; 3038 } 3039 3040 int 3041 VFS_SUSPENDCTL(struct mount *mp, int a) 3042 { 3043 int error; 3044 3045 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3046 KERNEL_LOCK(1, NULL); 3047 } 3048 error = (*(mp->mnt_op->vfs_suspendctl))(mp, a); 3049 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 3050 KERNEL_UNLOCK_ONE(NULL); 3051 } 3052 3053 return error; 3054 } 3055 3056 #ifdef DDB 3057 static const char buf_flagbits[] = BUF_FLAGBITS; 3058 3059 void 3060 vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...)) 3061 { 3062 char bf[1024]; 3063 3064 (*pr)(" vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" rawblkno 0x%" 3065 PRIx64 " dev 0x%x\n", 3066 bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_rawblkno, bp->b_dev); 3067 3068 snprintb(bf, sizeof(bf), 3069 buf_flagbits, bp->b_flags | bp->b_oflags | bp->b_cflags); 3070 (*pr)(" error %d flags 0x%s\n", bp->b_error, bf); 3071 3072 (*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n", 3073 bp->b_bufsize, bp->b_bcount, bp->b_resid); 3074 (*pr)(" data %p saveaddr %p\n", 3075 bp->b_data, bp->b_saveaddr); 3076 (*pr)(" iodone %p objlock %p\n", bp->b_iodone, bp->b_objlock); 3077 } 3078 3079 3080 void 3081 vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...)) 3082 { 3083 char bf[256]; 3084 3085 uvm_object_printit(&vp->v_uobj, full, pr); 3086 snprintb(bf, sizeof(bf), 3087 vnode_flagbits, vp->v_iflag | vp->v_vflag | vp->v_uflag); 3088 (*pr)("\nVNODE flags %s\n", bf); 3089 (*pr)("mp %p numoutput %d size 0x%llx writesize 0x%llx\n", 3090 vp->v_mount, vp->v_numoutput, vp->v_size, vp->v_writesize); 3091 3092 (*pr)("data %p writecount %ld holdcnt %ld\n", 3093 vp->v_data, vp->v_writecount, vp->v_holdcnt); 3094 3095 (*pr)("tag %s(%d) type %s(%d) mount %p typedata %p\n", 3096 ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag, 3097 ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type, 3098 vp->v_mount, vp->v_mountedhere); 3099 3100 (*pr)("v_lock %p v_vnlock %p\n", &vp->v_lock, vp->v_vnlock); 3101 3102 if (full) { 3103 struct buf *bp; 3104 3105 (*pr)("clean bufs:\n"); 3106 LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) { 3107 (*pr)(" bp %p\n", bp); 3108 vfs_buf_print(bp, full, pr); 3109 } 3110 3111 (*pr)("dirty bufs:\n"); 3112 LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) { 3113 (*pr)(" bp %p\n", bp); 3114 vfs_buf_print(bp, full, pr); 3115 } 3116 } 3117 } 3118 3119 void 3120 vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...)) 3121 { 3122 char sbuf[256]; 3123 3124 (*pr)("vnodecovered = %p syncer = %p data = %p\n", 3125 mp->mnt_vnodecovered,mp->mnt_syncer,mp->mnt_data); 3126 3127 (*pr)("fs_bshift %d dev_bshift = %d\n", 3128 mp->mnt_fs_bshift,mp->mnt_dev_bshift); 3129 3130 snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_flag); 3131 (*pr)("flag = %s\n", sbuf); 3132 3133 snprintb(sbuf, sizeof(sbuf), __IMNT_FLAG_BITS, mp->mnt_iflag); 3134 (*pr)("iflag = %s\n", sbuf); 3135 3136 (*pr)("refcnt = %d unmounting @ %p updating @ %p\n", mp->mnt_refcnt, 3137 &mp->mnt_unmounting, &mp->mnt_updating); 3138 3139 (*pr)("statvfs cache:\n"); 3140 (*pr)("\tbsize = %lu\n",mp->mnt_stat.f_bsize); 3141 (*pr)("\tfrsize = %lu\n",mp->mnt_stat.f_frsize); 3142 (*pr)("\tiosize = %lu\n",mp->mnt_stat.f_iosize); 3143 3144 (*pr)("\tblocks = %"PRIu64"\n",mp->mnt_stat.f_blocks); 3145 (*pr)("\tbfree = %"PRIu64"\n",mp->mnt_stat.f_bfree); 3146 (*pr)("\tbavail = 
%"PRIu64"\n",mp->mnt_stat.f_bavail); 3147 (*pr)("\tbresvd = %"PRIu64"\n",mp->mnt_stat.f_bresvd); 3148 3149 (*pr)("\tfiles = %"PRIu64"\n",mp->mnt_stat.f_files); 3150 (*pr)("\tffree = %"PRIu64"\n",mp->mnt_stat.f_ffree); 3151 (*pr)("\tfavail = %"PRIu64"\n",mp->mnt_stat.f_favail); 3152 (*pr)("\tfresvd = %"PRIu64"\n",mp->mnt_stat.f_fresvd); 3153 3154 (*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n", 3155 mp->mnt_stat.f_fsidx.__fsid_val[0], 3156 mp->mnt_stat.f_fsidx.__fsid_val[1]); 3157 3158 (*pr)("\towner = %"PRIu32"\n",mp->mnt_stat.f_owner); 3159 (*pr)("\tnamemax = %lu\n",mp->mnt_stat.f_namemax); 3160 3161 snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_stat.f_flag); 3162 3163 (*pr)("\tflag = %s\n",sbuf); 3164 (*pr)("\tsyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_syncwrites); 3165 (*pr)("\tasyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_asyncwrites); 3166 (*pr)("\tsyncreads = %" PRIu64 "\n",mp->mnt_stat.f_syncreads); 3167 (*pr)("\tasyncreads = %" PRIu64 "\n",mp->mnt_stat.f_asyncreads); 3168 (*pr)("\tfstypename = %s\n",mp->mnt_stat.f_fstypename); 3169 (*pr)("\tmntonname = %s\n",mp->mnt_stat.f_mntonname); 3170 (*pr)("\tmntfromname = %s\n",mp->mnt_stat.f_mntfromname); 3171 3172 { 3173 int cnt = 0; 3174 struct vnode *vp; 3175 (*pr)("locked vnodes ="); 3176 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 3177 if (VOP_ISLOCKED(vp)) { 3178 if ((++cnt % 6) == 0) { 3179 (*pr)(" %p,\n\t", vp); 3180 } else { 3181 (*pr)(" %p,", vp); 3182 } 3183 } 3184 } 3185 (*pr)("\n"); 3186 } 3187 3188 if (full) { 3189 int cnt = 0; 3190 struct vnode *vp; 3191 (*pr)("all vnodes ="); 3192 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 3193 if (!TAILQ_NEXT(vp, v_mntvnodes)) { 3194 (*pr)(" %p", vp); 3195 } else if ((++cnt % 6) == 0) { 3196 (*pr)(" %p,\n\t", vp); 3197 } else { 3198 (*pr)(" %p,", vp); 3199 } 3200 } 3201 (*pr)("\n", vp); 3202 } 3203 } 3204 #endif /* DDB */ 3205