/*	$NetBSD: vfs_vnode.c,v 1.25 2013/11/07 09:48:34 hannken Exp $	*/

/*-
 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * The vnode cache subsystem.
 *
 * Life-cycle
 *
 *	Normally, there are two points where new vnodes are created:
 *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 *	starts in one of the following ways:
 *
 *	- Allocation, via getnewvnode(9) and/or vnalloc(9).
 *	- Reclamation of an inactive vnode, via vget(9).
 *
 *	Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
 *	was another, traditional way.  Currently, only the draining thread
 *	recycles vnodes.  This behaviour might be revisited.
 *
 *	The life-cycle ends when the last reference is dropped, usually
 *	in VOP_REMOVE(9).  In such a case, VOP_INACTIVE(9) is called to
 *	inform the file system that the vnode is inactive.  Via this call,
 *	the file system indicates whether the vnode can be recycled
 *	(usually, it checks its own references, e.g. the link count or
 *	whether the file was removed).
 *
 *	Depending on that indication, the vnode can be put onto a free
 *	list (cache), or cleaned via vclean(9), which calls VOP_RECLAIM(9)
 *	to disassociate the underlying file system from the vnode, and is
 *	finally destroyed.
 *
 * Reference counting
 *
 *	A vnode is considered active if its reference count
 *	(vnode_t::v_usecount) is non-zero.  The count is maintained using
 *	the vref(9) and vrele(9) routines, as well as vput(9).  Common
 *	points holding references are, e.g., open files, the current
 *	working directory, mount points, etc.
 *
 * Note on v_usecount and its locking
 *
 *	At nearly all points where it is known that v_usecount could be
 *	zero, vnode_t::v_interlock will be held.  To change v_usecount
 *	away from zero, the interlock must be held.  To change it from a
 *	non-zero value to zero, again the interlock must be held.
 *
 *	Changing the usecount from one non-zero value to another non-zero
 *	value can safely be done using atomic operations, without the
 *	interlock held.
 *
 *	Note: if VI_CLEAN is set, vnode_t::v_interlock will be released
 *	while mntvnode_lock is still held.
 *
 *	See PR 41374.
 */
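
/*
 * Illustrative sketch only (not part of the build), restating the locking
 * rule above in code form.  "vp" stands for any vnode the caller can reach;
 * error handling is abbreviated.
 *
 *	// Already holding a reference (usecount known to be non-zero):
 *	vref(vp);			// atomic, no interlock needed
 *	// ... use vp ...
 *	vrele(vp);
 *
 *	// Possibly at usecount zero (e.g. found on a hash list):
 *	mutex_enter(vp->v_interlock);
 *	error = vget(vp, 0);		// 0 -> 1 step taken under the interlock
 *	// vget() releases the interlock; on success vp is referenced.
 *	vrele(vp);
 */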

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.25 2013/11/07 09:48:34 hannken Exp $");

#define	_VFS_VNODE_PRIVATE

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>
#include <sys/fstrans.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>

/* Flags to vrelel. */
#define	VRELEL_ASYNC_RELE	0x0001	/* Always defer to vrele thread. */

u_int			numvnodes		__cacheline_aligned;

static pool_cache_t	vnode_cache		__read_mostly;

/*
 * There are two free lists: one is for vnodes which have no buffer/page
 * references and one for those which do (i.e. v_holdcnt is non-zero).
 * The vnode recycling mechanism first attempts to look into the former list.
 */
static kmutex_t		vnode_free_list_lock	__cacheline_aligned;
static vnodelst_t	vnode_free_list		__cacheline_aligned;
static vnodelst_t	vnode_hold_list		__cacheline_aligned;
static kcondvar_t	vdrain_cv		__cacheline_aligned;

static vnodelst_t	vrele_list		__cacheline_aligned;
static kmutex_t		vrele_lock		__cacheline_aligned;
static kcondvar_t	vrele_cv		__cacheline_aligned;
static lwp_t *		vrele_lwp		__cacheline_aligned;
static int		vrele_pending		__cacheline_aligned;
static int		vrele_gen		__cacheline_aligned;

static int		cleanvnode(void);
static void		vclean(vnode_t *);
static void		vrelel(vnode_t *, int);
static void		vdrain_thread(void *);
static void		vrele_thread(void *);
static void		vnpanic(vnode_t *, const char *, ...)
    __printflike(2, 3);

/* Routines having to do with the management of the vnode table. */
extern int		(**dead_vnodeop_p)(void *);

void
vfs_vnode_sysinit(void)
{
	int error __diagused;

	vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl",
	    NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vnode_cache != NULL);

	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vrele_list);

	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vdrain_cv, "vdrain");
	cv_init(&vrele_cv, "vrele");
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
	    NULL, NULL, "vdrain");
	KASSERT(error == 0);
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
	    NULL, &vrele_lwp, "vrele");
	KASSERT(error == 0);
}

/*
 * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
 * marker vnode.
 */
vnode_t *
vnalloc(struct mount *mp)
{
	vnode_t *vp;

	vp = pool_cache_get(vnode_cache, PR_WAITOK);
	KASSERT(vp != NULL);

	memset(vp, 0, sizeof(*vp));
	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
	cv_init(&vp->v_cv, "vnode");
	/*
	 * Done by memset() above.
	 *	LIST_INIT(&vp->v_nclist);
	 *	LIST_INIT(&vp->v_dnclist);
	 */

	if (mp != NULL) {
		vp->v_mount = mp;
		vp->v_type = VBAD;
		vp->v_iflag = VI_MARKER;
	} else {
		rw_init(&vp->v_lock);
	}

	return vp;
}

/*
 * Free an unused, unreferenced vnode.
 */
void
vnfree(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 0);

	if ((vp->v_iflag & VI_MARKER) == 0) {
		rw_destroy(&vp->v_lock);
		mutex_enter(&vnode_free_list_lock);
		numvnodes--;
		mutex_exit(&vnode_free_list_lock);
	}

	/*
	 * Note: the vnode interlock will either be freed, or its reference
	 * dropped (if VI_LOCKSHARE was in use).
	 */
	uvm_obj_destroy(&vp->v_uobj, true);
	cv_destroy(&vp->v_cv);
	pool_cache_put(vnode_cache, vp);
}
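
/*
 * Illustrative sketch only (not part of the build): a mount-list iterator
 * typically uses a marker vnode obtained from vnalloc(mp) to keep its place
 * while it drops the list lock, and destroys it with vnfree() when done.
 * The iteration details below are abbreviated and hypothetical.
 *
 *	vnode_t *mvp = vnalloc(mp);	// VI_MARKER is set for us
 *	// ... insert mvp behind the vnode being processed, drop the list
 *	// ... lock, do the work, re-take the lock, resume after mvp ...
 *	vnfree(mvp);
 */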

/*
 * cleanvnode: grab a vnode from freelist, clean and free it.
 *
 * => Releases vnode_free_list_lock.
 */
static int
cleanvnode(void)
{
	vnode_t *vp;
	vnodelst_t *listhd;
	struct mount *mp;

	KASSERT(mutex_owned(&vnode_free_list_lock));

	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		/*
		 * It's safe to test v_usecount and v_iflag
		 * without holding the interlock here, since
		 * these vnodes should never appear on the
		 * lists.
		 */
		KASSERT(vp->v_usecount == 0);
		KASSERT((vp->v_iflag & VI_CLEAN) == 0);
		KASSERT(vp->v_freelisthd == listhd);

		if (!mutex_tryenter(vp->v_interlock))
			continue;
		if ((vp->v_iflag & VI_XLOCK) != 0) {
			mutex_exit(vp->v_interlock);
			continue;
		}
		mp = vp->v_mount;
		if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) {
			mutex_exit(vp->v_interlock);
			continue;
		}
		break;
	}

	if (vp == NULL) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		mutex_exit(&vnode_free_list_lock);
		return EBUSY;
	}

	/* Remove it from the freelist. */
	TAILQ_REMOVE(listhd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);

	KASSERT(vp->v_usecount == 0);

	/*
	 * The vnode is still associated with a file system, so we must
	 * clean it out before freeing it.  We need to add a reference
	 * before doing this.
	 */
	vp->v_usecount = 1;
	vclean(vp);
	vrelel(vp, 0);
	fstrans_done(mp);

	return 0;
}

/*
 * getnewvnode: return a fresh vnode.
 *
 * => Returns referenced vnode, moved into the mount queue.
 * => Shares the interlock specified by 'slock', if it is not NULL.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    kmutex_t *slock, vnode_t **vpp)
{
	struct uvm_object *uobj __diagused;
	vnode_t *vp;
	int error = 0;

	if (mp != NULL) {
		/*
		 * Mark filesystem busy while we are creating a vnode.
		 * If unmount is in progress, this will fail.
		 */
		error = vfs_busy(mp, NULL);
		if (error)
			return error;
	}

	vp = NULL;

	/* Allocate a new vnode. */
	mutex_enter(&vnode_free_list_lock);
	numvnodes++;
	if (numvnodes > desiredvnodes + desiredvnodes / 10)
		cv_signal(&vdrain_cv);
	mutex_exit(&vnode_free_list_lock);
	vp = vnalloc(NULL);

	KASSERT(vp->v_freelisthd == NULL);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));

	/* Initialize vnode. */
	vp->v_usecount = 1;
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	vp->v_data = NULL;

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	/* Share the vnode_t::v_interlock, if requested. */
	if (slock) {
		/* Set the interlock and mark that it is shared. */
		KASSERT(vp->v_mount == NULL);
		mutex_obj_hold(slock);
		uvm_obj_setlock(&vp->v_uobj, slock);
		KASSERT(vp->v_interlock == slock);
		vp->v_iflag |= VI_LOCKSHARE;
	}

	/* Finally, move vnode into the mount queue. */
	vfs_insmntque(vp, mp);

	if (mp != NULL) {
		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
			vp->v_vflag |= VV_MPSAFE;
		vfs_unbusy(mp, true, NULL);
	}

	*vpp = vp;
	return 0;
}
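
/*
 * Illustrative sketch only (not part of the build): the usual shape of a
 * file system's "get vnode for inode" path.  The names example_ops_p,
 * example_inode and example_inode_size are hypothetical.
 *
 *	error = getnewvnode(VT_UFS, mp, example_ops_p, NULL, &vp);
 *	if (error != 0)
 *		return error;
 *	// ... attach the file system's private data ...
 *	vp->v_data = example_inode;
 *	vp->v_type = VREG;
 *	uvm_vnp_setsize(vp, example_inode_size);
 *	*vpp = vp;			// returned referenced
 *
 * On a lookup race the file system can undo this with ungetnewvnode(vp)
 * below, as long as v_data has not yet been set.
 */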

/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 1);
	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_freelisthd == NULL);

	mutex_enter(vp->v_interlock);
	vp->v_iflag |= VI_CLEAN;
	vrelel(vp, 0);
}

/*
 * Helper thread to keep the number of vnodes below desiredvnodes.
 */
static void
vdrain_thread(void *cookie)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	for (;;) {
		cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
		while (numvnodes > desiredvnodes) {
			error = cleanvnode();
			if (error)
				kpause("vndsbusy", false, hz, NULL);
			mutex_enter(&vnode_free_list_lock);
			if (error)
				break;
		}
	}
}

/*
 * Remove a vnode from its freelist.
 */
void
vremfree(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount == 0);

	/*
	 * Note that the reference count must not change until
	 * the vnode is removed.
	 */
	mutex_enter(&vnode_free_list_lock);
	if (vp->v_holdcnt > 0) {
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
	} else {
		KASSERT(vp->v_freelisthd == &vnode_free_list);
	}
	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);
}

/*
 * vget: get a particular vnode from the free list, increment its reference
 * count and lock it.
 *
 * => Must be called with v_interlock held.
 *
 * If VI_XLOCK is set, the vnode is being eliminated in vgone()/vclean().
 * In that case, we cannot grab the vnode, so the process is awakened when
 * the transition is completed, and an error is returned to indicate that
 * the vnode is no longer usable (e.g. it has changed to a new file system
 * type).
 */
int
vget(vnode_t *vp, int flags)
{
	int error = 0;

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT)) == 0);

	/*
	 * Before adding a reference, we must remove the vnode
	 * from its freelist.
	 */
	if (vp->v_usecount == 0) {
		vremfree(vp);
		vp->v_usecount = 1;
	} else {
		atomic_inc_uint(&vp->v_usecount);
	}

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking if
	 * the VI_XLOCK flag is set.
	 */
	if ((vp->v_iflag & VI_XLOCK) != 0) {
		if ((flags & LK_NOWAIT) != 0) {
			vrelel(vp, 0);
			return EBUSY;
		}
		vwait(vp, VI_XLOCK);
		vrelel(vp, 0);
		return ENOENT;
	}

	if ((vp->v_iflag & VI_INACTNOW) != 0) {
		/*
		 * If it is being deactivated, wait for the deactivation
		 * to complete.  Make sure to not return a clean vnode.
		 */
		if ((flags & LK_NOWAIT) != 0) {
			vrelel(vp, 0);
			return EBUSY;
		}
		vwait(vp, VI_INACTNOW);
		if ((vp->v_iflag & VI_CLEAN) != 0) {
			vrelel(vp, 0);
			return ENOENT;
		}
	}

	/*
	 * Ok, we got it in good shape.  Just locking left.
	 */
	KASSERT((vp->v_iflag & VI_CLEAN) == 0);
	mutex_exit(vp->v_interlock);
	if (flags & (LK_EXCLUSIVE | LK_SHARED)) {
		error = vn_lock(vp, flags);
		if (error != 0) {
			vrele(vp);
		}
	}
	return error;
}
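
/*
 * Illustrative sketch only (not part of the build): how a lookup path
 * typically uses vget().  example_hash_lookup() is hypothetical and is
 * assumed to return a vnode with its v_interlock held.
 *
 *	vp = example_hash_lookup(mp, ino);	// returns with v_interlock held
 *	error = vget(vp, LK_EXCLUSIVE);
 *	if (error == ENOENT)
 *		goto retry;			// vnode was reclaimed; look it up again
 *	if (error != 0)
 *		return error;
 *	*vpp = vp;				// referenced and locked
 */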

/*
 * vput: unlock and release the reference.
 */
void
vput(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	VOP_UNLOCK(vp);
	vrele(vp);
}

/*
 * Try to drop reference on a vnode.  Abort if we are releasing the
 * last reference.  Note: this _must_ succeed if not the last reference.
 */
static inline bool
vtryrele(vnode_t *vp)
{
	u_int use, next;

	for (use = vp->v_usecount;; use = next) {
		if (use == 1) {
			return false;
		}
		KASSERT(use > 1);
		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
		if (__predict_true(next == use)) {
			return true;
		}
	}
}

/*
 * Vnode release.  If reference count drops to zero, call inactive
 * routine and either return to freelist or free to the pool.
 */
static void
vrelel(vnode_t *vp, int flags)
{
	bool recycle, defer;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_freelisthd == NULL);

	if (__predict_false(vp->v_op == dead_vnodeop_p &&
	    (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
		vnpanic(vp, "dead but not clean");
	}

	/*
	 * If not the last reference, just drop the reference count
	 * and unlock.
	 */
	if (vtryrele(vp)) {
		vp->v_iflag |= VI_INACTREDO;
		mutex_exit(vp->v_interlock);
		return;
	}
	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
		vnpanic(vp, "%s: bad ref count", __func__);
	}

	KASSERT((vp->v_iflag & VI_XLOCK) == 0);

#ifdef DIAGNOSTIC
	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
		vprint("vrelel: missing VOP_CLOSE()", vp);
	}
#endif

	/*
	 * If not clean, deactivate the vnode, but preserve
	 * our reference across the call to VOP_INACTIVE().
	 */
retry:
	if ((vp->v_iflag & VI_CLEAN) == 0) {
		recycle = false;
		vp->v_iflag |= VI_INACTNOW;

		/*
		 * XXX This ugly block can be largely eliminated if
		 * locking is pushed down into the file systems.
		 *
		 * Defer vnode release to vrele_thread if caller
		 * requests it explicitly.
		 */
		if ((curlwp == uvm.pagedaemon_lwp) ||
		    (flags & VRELEL_ASYNC_RELE) != 0) {
			/* The pagedaemon can't wait around; defer. */
			defer = true;
		} else if (curlwp == vrele_lwp) {
			/*
			 * We have to try harder.  But we can't sleep
			 * with VI_INACTNOW as vget() may be waiting on it.
			 */
			vp->v_iflag &= ~(VI_INACTREDO|VI_INACTNOW);
			cv_broadcast(&vp->v_cv);
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE);
			if (error != 0) {
				/* XXX */
				vnpanic(vp, "%s: unable to lock %p",
				    __func__, vp);
			}
			mutex_enter(vp->v_interlock);
			/*
			 * If we did get another reference while
			 * sleeping, don't try to inactivate it yet.
			 */
			if (__predict_false(vtryrele(vp))) {
				VOP_UNLOCK(vp);
				mutex_exit(vp->v_interlock);
				return;
			}
			vp->v_iflag |= VI_INACTNOW;
			mutex_exit(vp->v_interlock);
			defer = false;
		} else if ((vp->v_iflag & VI_LAYER) != 0) {
			/*
			 * Acquiring the stack's lock in vclean() even
			 * for an honest vput/vrele is dangerous because
			 * our caller may hold other vnode locks; defer.
			 */
			defer = true;
		} else {
			/* If we can't acquire the lock, then defer. */
			vp->v_iflag &= ~VI_INACTREDO;
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
			if (error != 0) {
				defer = true;
				mutex_enter(vp->v_interlock);
			} else {
				defer = false;
			}
		}

		if (defer) {
			/*
			 * Defer reclaim to the kthread; it's not safe to
			 * clean it here.  We donate it our last reference.
			 */
			KASSERT(mutex_owned(vp->v_interlock));
			vp->v_iflag &= ~VI_INACTNOW;
			mutex_enter(&vrele_lock);
			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
			if (++vrele_pending > (desiredvnodes >> 8))
				cv_signal(&vrele_cv);
			mutex_exit(&vrele_lock);
			cv_broadcast(&vp->v_cv);
			mutex_exit(vp->v_interlock);
			return;
		}

		/*
		 * The vnode can gain another reference while being
		 * deactivated.  If VOP_INACTIVE() indicates that
		 * the described file has been deleted, then recycle
		 * the vnode irrespective of additional references.
		 * Another thread may be waiting to re-use the on-disk
		 * inode.
		 *
		 * Note that VOP_INACTIVE() will drop the vnode lock.
		 */
		VOP_INACTIVE(vp, &recycle);
		mutex_enter(vp->v_interlock);
		vp->v_iflag &= ~VI_INACTNOW;
		cv_broadcast(&vp->v_cv);
		if (!recycle) {
			if (vtryrele(vp)) {
				mutex_exit(vp->v_interlock);
				return;
			}

			/*
			 * If we grew another reference while
			 * VOP_INACTIVE() was underway, retry.
			 */
			if ((vp->v_iflag & VI_INACTREDO) != 0) {
				goto retry;
			}
		}

		/* Take care of space accounting. */
		if (vp->v_iflag & VI_EXECMAP) {
			atomic_add_int(&uvmexp.execpages,
			    -vp->v_uobj.uo_npages);
			atomic_add_int(&uvmexp.filepages,
			    vp->v_uobj.uo_npages);
		}
		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
		vp->v_vflag &= ~VV_MAPPED;

		/*
		 * Recycle the vnode if the file is now unused (unlinked),
		 * otherwise just free it.
		 */
		if (recycle) {
			vclean(vp);
		}
		KASSERT(vp->v_usecount > 0);
	}

	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
		/* Gained another reference while being reclaimed. */
		mutex_exit(vp->v_interlock);
		return;
	}

	if ((vp->v_iflag & VI_CLEAN) != 0) {
		/*
		 * It's clean so destroy it.  It isn't referenced
		 * anywhere since it has been reclaimed.
		 */
		KASSERT(vp->v_holdcnt == 0);
		KASSERT(vp->v_writecount == 0);
		mutex_exit(vp->v_interlock);
		vfs_insmntque(vp, NULL);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vnfree(vp);
	} else {
		/*
		 * Otherwise, put it back onto the freelist.  It
		 * can't be destroyed while still associated with
		 * a file system.
		 */
		mutex_enter(&vnode_free_list_lock);
		if (vp->v_holdcnt > 0) {
			vp->v_freelisthd = &vnode_hold_list;
		} else {
			vp->v_freelisthd = &vnode_free_list;
		}
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
		mutex_exit(vp->v_interlock);
	}
}

void
vrele(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, 0);
}
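
/*
 * Illustrative note (not part of the build): a caller that must not risk
 * sleeping on the vnode lock here, for example because it already holds
 * other vnode locks, can drop its reference with vrele_async() below
 * instead; the final VOP_INACTIVE()/vclean() work is then performed by
 * the vrele thread:
 *
 *	vrele_async(vp);	// last reference is handed to the vrele thread
 */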

/*
 * Asynchronous vnode release: the vnode is released in a different context.
 */
void
vrele_async(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, VRELEL_ASYNC_RELE);
}

static void
vrele_thread(void *cookie)
{
	vnode_t *vp;

	for (;;) {
		mutex_enter(&vrele_lock);
		while (TAILQ_EMPTY(&vrele_list)) {
			vrele_gen++;
			cv_broadcast(&vrele_cv);
			cv_timedwait(&vrele_cv, &vrele_lock, hz);
		}
		vp = TAILQ_FIRST(&vrele_list);
		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
		vrele_pending--;
		mutex_exit(&vrele_lock);

		/*
		 * If not the last reference, then ignore the vnode
		 * and look for more work.
		 */
		mutex_enter(vp->v_interlock);
		vrelel(vp, 0);
	}
}

void
vrele_flush(void)
{
	int gen;

	mutex_enter(&vrele_lock);
	gen = vrele_gen;
	while (vrele_pending && gen == vrele_gen) {
		cv_broadcast(&vrele_cv);
		cv_wait(&vrele_cv, &vrele_lock);
	}
	mutex_exit(&vrele_lock);
}

/*
 * Vnode reference, where a reference is already held by some other
 * object (for example, a file structure).
 */
void
vref(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	atomic_inc_uint(&vp->v_usecount);
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
 */
void
vholdl(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_free_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_hold_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt <= 0) {
		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
	}

	vp->v_holdcnt--;
	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_free_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the interlock held, and will return with it held.
 */
static void
vclean(vnode_t *vp)
{
	lwp_t *l = curlwp;
	bool recycle, active, doclose;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	/* If cleaning is already in progress, wait until done and return. */
	if (vp->v_iflag & VI_XLOCK) {
		vwait(vp, VI_XLOCK);
		return;
	}

	/* If already clean, nothing to do. */
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		return;
	}

	/*
	 * Prevent the vnode from being recycled or brought into use
	 * while we clean it out.
	 */
	vp->v_iflag |= VI_XLOCK;
	if (vp->v_iflag & VI_EXECMAP) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
	active = (vp->v_usecount > 1);

	/* XXXAD should not lock vnode under layer */
	mutex_exit(vp->v_interlock);
	VOP_LOCK(vp, LK_EXCLUSIVE);

	doclose = ! (active && vp->v_type == VBLK &&
	    spec_node_getmountedfs(vp) != NULL);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (doclose) {
		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
		if (error != 0) {
			if (wapbl_vphaswapbl(vp))
				WAPBL_DISCARD(wapbl_vptomp(vp));
			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
		}
		KASSERT(error == 0);
		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
			spec_node_revoke(vp);
		}
	}
	if (active) {
		VOP_INACTIVE(vp, &recycle);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VI_XLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp);
	}

	/* Disassociate the underlying file system from the vnode. */
	if (VOP_RECLAIM(vp)) {
		vnpanic(vp, "%s: cannot reclaim", __func__);
	}

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);

	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}

	/* Purge name cache. */
	cache_purge(vp);

	/*
	 * The vnode isn't clean, but still resides on the mount list.
	 * Remove it.  XXX This is a bit dodgy.
	 */
	if (! doclose)
		vfs_insmntque(vp, NULL);

	/* Done with purge, notify sleepers of the grim news. */
	mutex_enter(vp->v_interlock);
	if (doclose) {
		vp->v_op = dead_vnodeop_p;
		vp->v_vflag |= VV_LOCKSWORK;
		vp->v_iflag |= VI_CLEAN;
	} else {
		vp->v_op = spec_vnodeop_p;
		vp->v_vflag &= ~VV_LOCKSWORK;
	}
	vp->v_tag = VT_NON;
	KNOTE(&vp->v_klist, NOTE_REVOKE);
	vp->v_iflag &= ~VI_XLOCK;
	cv_broadcast(&vp->v_cv);

	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vnode_t *vp, kmutex_t *inter_lkp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	mutex_enter(vp->v_interlock);
	if (vp->v_usecount != 0 || (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) != 0) {
		mutex_exit(vp->v_interlock);
		return 0;
	}
	if (inter_lkp) {
		mutex_exit(inter_lkp);
	}
	vremfree(vp);
	vp->v_usecount = 1;
	vclean(vp);
	vrelel(vp, 0);
	return 1;
}
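
/*
 * Illustrative sketch only (not part of the build): a file system that
 * wants to retire any old, unused vnode before reusing an on-disk inode
 * typically does something like the following; example_hash_lookup() and
 * example_hash_lock are hypothetical.
 *
 *	mutex_enter(&example_hash_lock);
 *	vp = example_hash_lookup(mp, ino);
 *	if (vp != NULL && vrecycle(vp, &example_hash_lock))
 *		goto retry;	// vrecycle() released example_hash_lock
 *	mutex_exit(&example_hash_lock);
 */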

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vrevoke(vnode_t *vp)
{
	vnode_t *vq;
	enum vtype type;
	dev_t dev;

	KASSERT(vp->v_usecount > 0);

	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		mutex_exit(vp->v_interlock);
		return;
	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
		atomic_inc_uint(&vp->v_usecount);
		vclean(vp);
		vrelel(vp, 0);
		return;
	} else {
		dev = vp->v_rdev;
		type = vp->v_type;
		mutex_exit(vp->v_interlock);
	}

	while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
		mutex_enter(vq->v_interlock);
		vclean(vq);
		vrelel(vq, 0);
	}
}

/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);
	vclean(vp);
	vrelel(vp, 0);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(struct buf *bp)
{
	vnode_t *vp;

	if ((vp = bp->b_vp) == NULL)
		return;

	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(bp->b_objlock));

	if (--vp->v_numoutput < 0)
		vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
	if (vp->v_numoutput == 0)
		cv_broadcast(&vp->v_cv);
}

/*
 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
 * recycled.
 */
void
vwait(vnode_t *vp, int flags)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount != 0);

	while ((vp->v_iflag & flags) != 0)
		cv_wait(&vp->v_cv, vp->v_interlock);
}

int
vfs_drainvnodes(long target)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	while (numvnodes > target) {
		error = cleanvnode();
		if (error != 0)
			return error;
		mutex_enter(&vnode_free_list_lock);
	}

	mutex_exit(&vnode_free_list_lock);

	return 0;
}

void
vnpanic(vnode_t *vp, const char *fmt, ...)
{
	va_list ap;

#ifdef DIAGNOSTIC
	vprint(NULL, vp);
#endif
	va_start(ap, fmt);
	vpanic(fmt, ap);
	va_end(ap);
}