/*	$NetBSD: vfs_vnode.c,v 1.20 2013/09/21 19:51:33 dholland Exp $	*/

/*-
 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * The vnode cache subsystem.
 *
 * Life-cycle
 *
 *	Normally, there are two points where new vnodes are created:
 *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 *	starts in one of the following ways:
 *
 *	- Allocation, via getnewvnode(9) and/or vnalloc(9).
 *	- Reclamation of an inactive vnode, via vget(9).
 *
 *	Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
 *	was traditionally another way.  Currently, only the draining thread
 *	recycles vnodes.  This behaviour might be revisited.
 *
 *	The life-cycle ends when the last reference is dropped, usually
 *	in VOP_REMOVE(9).  In such a case, VOP_INACTIVE(9) is called to
 *	inform the file system that the vnode is inactive.  Via this call,
 *	the file system indicates whether the vnode can be recycled
 *	(usually, it checks its own references, e.g. the link count and
 *	whether the file was removed).
 *
 *	Depending on that indication, the vnode can be put onto a free list
 *	(cache), or cleaned via vclean(9), which calls VOP_RECLAIM(9) to
 *	disassociate the underlying file system from the vnode, and finally
 *	destroyed.
 *
 * Reference counting
 *
 *	A vnode is considered active if its reference count
 *	(vnode_t::v_usecount) is non-zero.  The count is maintained with
 *	the vref(9), vrele(9) and vput(9) routines.  Common holders of
 *	references are, e.g., open files, the current working directory,
 *	mount points, etc.
 *
 * Note on v_usecount and its locking
 *
 *	At nearly all points where it is known that v_usecount could be
 *	zero, vnode_t::v_interlock will be held.  To change v_usecount
 *	away from zero, the interlock must be held.  To change it from a
 *	non-zero value to zero, again the interlock must be held.
 *
 *	There is a flag bit, VC_XLOCK, embedded in v_usecount.  To raise
 *	v_usecount while the VC_XLOCK bit is set in it, the interlock must
 *	be held.  To modify the VC_XLOCK bit, the interlock must be held.
 *	We always keep the usecount (v_usecount & VC_MASK) non-zero while
 *	the VC_XLOCK bit is set.
 *
 *	Unless the VC_XLOCK bit is set, changing the usecount from one
 *	non-zero value to another non-zero value can safely be done using
 *	atomic operations, without the interlock held.
 *
 *	Even if the VC_XLOCK bit is set, decreasing the usecount to a
 *	non-zero value can be done using atomic operations, without the
 *	interlock held.
 *
 *	Note: if VI_CLEAN is set, vnode_t::v_interlock will be released
 *	while mntvnode_lock is still held.
 *
 *	See PR 41374.
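 *
 *	Illustration only (kept as a comment, not part of the interface):
 *	under the rules above, gaining a reference without the interlock
 *	is a compare-and-swap loop that backs off to the interlock when
 *	the count is zero or VC_XLOCK is set.  vtryget() below is the
 *	real implementation; this is just a sketch of the same pattern:
 *
 *		u_int use, next;
 *
 *		for (use = vp->v_usecount;; use = next) {
 *			if (use == 0 || (use & VC_XLOCK) != 0)
 *				return false;	(take v_interlock instead)
 *			next = atomic_cas_uint(&vp->v_usecount, use, use + 1);
 *			if (next == use)
 *				return true;	(reference gained)
 *		}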
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.20 2013/09/21 19:51:33 dholland Exp $");

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>

u_int			numvnodes		__cacheline_aligned;

static pool_cache_t	vnode_cache		__read_mostly;

/*
 * There are two free lists: one is for vnodes which have no buffer/page
 * references and one for those which do (i.e. v_holdcnt is non-zero).
 * Vnode recycling mechanism first attempts to look into the former list.
 */
static kmutex_t		vnode_free_list_lock	__cacheline_aligned;
static vnodelst_t	vnode_free_list		__cacheline_aligned;
static vnodelst_t	vnode_hold_list		__cacheline_aligned;
static kcondvar_t	vdrain_cv		__cacheline_aligned;

static vnodelst_t	vrele_list		__cacheline_aligned;
static kmutex_t		vrele_lock		__cacheline_aligned;
static kcondvar_t	vrele_cv		__cacheline_aligned;
static lwp_t *		vrele_lwp		__cacheline_aligned;
static int		vrele_pending		__cacheline_aligned;
static int		vrele_gen		__cacheline_aligned;

static int		cleanvnode(void);
static void		vdrain_thread(void *);
static void		vrele_thread(void *);
static void		vnpanic(vnode_t *, const char *, ...)
    __printflike(2, 3);

/* Routines having to do with the management of the vnode table. */
extern int		(**dead_vnodeop_p)(void *);

void
vfs_vnode_sysinit(void)
{
	int error;

	vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl",
	    NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vnode_cache != NULL);

	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vrele_list);

	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vdrain_cv, "vdrain");
	cv_init(&vrele_cv, "vrele");
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
	    NULL, NULL, "vdrain");
	KASSERT(error == 0);
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
	    NULL, &vrele_lwp, "vrele");
	KASSERT(error == 0);
}

/*
 * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
 * marker vnode.
 */
vnode_t *
vnalloc(struct mount *mp)
{
	vnode_t *vp;

	vp = pool_cache_get(vnode_cache, PR_WAITOK);
	KASSERT(vp != NULL);

	memset(vp, 0, sizeof(*vp));
	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
	cv_init(&vp->v_cv, "vnode");
	/*
	 * Done by memset() above.
	 *	LIST_INIT(&vp->v_nclist);
	 *	LIST_INIT(&vp->v_dnclist);
	 */

	if (mp != NULL) {
		vp->v_mount = mp;
		vp->v_type = VBAD;
		vp->v_iflag = VI_MARKER;
	} else {
		rw_init(&vp->v_lock);
	}

	return vp;
}

/*
 * Free an unused, unreferenced vnode.
 */
void
vnfree(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 0);

	if ((vp->v_iflag & VI_MARKER) == 0) {
		rw_destroy(&vp->v_lock);
		mutex_enter(&vnode_free_list_lock);
		numvnodes--;
		mutex_exit(&vnode_free_list_lock);
	}

	/*
	 * Note: the vnode interlock will either be freed, or the reference
	 * dropped (if VI_LOCKSHARE was in use).
	 */
	uvm_obj_destroy(&vp->v_uobj, true);
	cv_destroy(&vp->v_cv);
	pool_cache_put(vnode_cache, vp);
}

/*
 * cleanvnode: grab a vnode from freelist, clean and free it.
 *
 * => Releases vnode_free_list_lock.
 */
static int
cleanvnode(void)
{
	vnode_t *vp;
	vnodelst_t *listhd;

	KASSERT(mutex_owned(&vnode_free_list_lock));
retry:
	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		/*
		 * It's safe to test v_usecount and v_iflag
		 * without holding the interlock here, since
		 * these vnodes should never appear on the
		 * lists.
		 */
		KASSERT(vp->v_usecount == 0);
		KASSERT((vp->v_iflag & VI_CLEAN) == 0);
		KASSERT(vp->v_freelisthd == listhd);

		if (!mutex_tryenter(vp->v_interlock))
			continue;
		if ((vp->v_iflag & VI_XLOCK) == 0)
			break;
		mutex_exit(vp->v_interlock);
	}

	if (vp == NULL) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		mutex_exit(&vnode_free_list_lock);
		return EBUSY;
	}

	/* Remove it from the freelist. */
	TAILQ_REMOVE(listhd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);

	KASSERT(vp->v_usecount == 0);

	/*
	 * The vnode is still associated with a file system, so we must
	 * clean it out before freeing it.  We need to add a reference
	 * before doing this.  If the vnode gains another reference while
	 * being cleaned out then we lose - retry.
	 */
	atomic_add_int(&vp->v_usecount, 1 + VC_XLOCK);
	vclean(vp, DOCLOSE);
	KASSERT(vp->v_usecount >= 1 + VC_XLOCK);
	atomic_add_int(&vp->v_usecount, -VC_XLOCK);
	if (vp->v_usecount > 1) {
		/*
		 * Don't return to freelist - the holder of the last
		 * reference will destroy it.
		 */
		vrelel(vp, 0); /* releases vp->v_interlock */
		mutex_enter(&vnode_free_list_lock);
		goto retry;
	}

	KASSERT((vp->v_iflag & VI_CLEAN) == VI_CLEAN);
	mutex_exit(vp->v_interlock);
	if (vp->v_type == VBLK || vp->v_type == VCHR) {
		spec_node_destroy(vp);
	}
	vp->v_type = VNON;

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);
	KASSERT(TAILQ_EMPTY(&vp->v_uobj.memq));
	KASSERT(vp->v_numoutput == 0);
	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);

	vrele(vp);

	return 0;
}

/*
 * getnewvnode: return a fresh vnode.
 *
 * => Returns referenced vnode, moved into the mount queue.
 * => Shares the interlock specified by 'slock', if it is not NULL.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    kmutex_t *slock, vnode_t **vpp)
{
	struct uvm_object *uobj;
	vnode_t *vp;
	int error = 0;

	if (mp != NULL) {
		/*
		 * Mark filesystem busy while we are creating a vnode.
		 * If unmount is in progress, this will fail.
		 */
		error = vfs_busy(mp, NULL);
		if (error)
			return error;
	}

	vp = NULL;

	/* Allocate a new vnode. */
	mutex_enter(&vnode_free_list_lock);
	numvnodes++;
	if (numvnodes > desiredvnodes + desiredvnodes / 10)
		cv_signal(&vdrain_cv);
	mutex_exit(&vnode_free_list_lock);
	vp = vnalloc(NULL);

	KASSERT(vp->v_freelisthd == NULL);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));

	/* Initialize vnode. */
	vp->v_usecount = 1;
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	vp->v_data = NULL;

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	/* Share the vnode_t::v_interlock, if requested. */
	if (slock) {
		/* Set the interlock and mark that it is shared. */
		KASSERT(vp->v_mount == NULL);
		mutex_obj_hold(slock);
		uvm_obj_setlock(&vp->v_uobj, slock);
		KASSERT(vp->v_interlock == slock);
		vp->v_iflag |= VI_LOCKSHARE;
	}

	/* Finally, move vnode into the mount queue. */
	vfs_insmntque(vp, mp);

	if (mp != NULL) {
		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
			vp->v_vflag |= VV_MPSAFE;
		vfs_unbusy(mp, true, NULL);
	}

	*vpp = vp;
	return 0;
}

/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 1);
	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_freelisthd == NULL);

	mutex_enter(vp->v_interlock);
	vp->v_iflag |= VI_CLEAN;
	vrelel(vp, 0);
}

/*
 * Helper thread to keep the number of vnodes below desiredvnodes.
 */
static void
vdrain_thread(void *cookie)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	for (;;) {
		cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
		while (numvnodes > desiredvnodes) {
			error = cleanvnode();
			if (error)
				kpause("vndsbusy", false, hz, NULL);
			mutex_enter(&vnode_free_list_lock);
			if (error)
				break;
		}
	}
}

/*
 * Remove a vnode from its freelist.
 */
void
vremfree(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount == 0);

	/*
	 * Note that the reference count must not change until
	 * the vnode is removed.
	 */
	mutex_enter(&vnode_free_list_lock);
	if (vp->v_holdcnt > 0) {
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
	} else {
		KASSERT(vp->v_freelisthd == &vnode_free_list);
	}
	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);
}

/*
 * Try to gain a reference to a vnode, without acquiring its interlock.
 * The caller must hold a lock that will prevent the vnode from being
 * recycled or freed.
 */
bool
vtryget(vnode_t *vp)
{
	u_int use, next;

	/*
	 * If the vnode is being freed, don't make life any harder
	 * for vclean() by adding another reference without waiting.
	 * This is not strictly necessary, but we'll do it anyway.
	 */
	if (__predict_false((vp->v_iflag & VI_XLOCK) != 0)) {
		return false;
	}
	for (use = vp->v_usecount;; use = next) {
		if (use == 0 || __predict_false((use & VC_XLOCK) != 0)) {
			/* Need interlock held if first reference. */
			return false;
		}
		next = atomic_cas_uint(&vp->v_usecount, use, use + 1);
		if (__predict_true(next == use)) {
			return true;
		}
	}
}

/*
 * vget: get a particular vnode from the free list, increment its reference
 * count and lock it.
 *
 * => Should be called with v_interlock held.
 *
 * If VI_XLOCK is set, the vnode is being eliminated in vgone()/vclean().
 * In that case, we cannot grab the vnode, so the process is awakened when
 * the transition is completed, and an error returned to indicate that the
 * vnode is no longer usable (e.g. changed to a new file system type).
 */
int
vget(vnode_t *vp, int flags)
{
	int error = 0;

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT)) == 0);

	/*
	 * Before adding a reference, we must remove the vnode
	 * from its freelist.
	 */
	if (vp->v_usecount == 0) {
		vremfree(vp);
		vp->v_usecount = 1;
	} else {
		atomic_inc_uint(&vp->v_usecount);
	}

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking if
	 * the VI_XLOCK flag is set.
	 */
	if ((vp->v_iflag & VI_XLOCK) != 0) {
		if ((flags & LK_NOWAIT) != 0) {
			vrelel(vp, 0);
			return EBUSY;
		}
		vwait(vp, VI_XLOCK);
		vrelel(vp, 0);
		return ENOENT;
	}

	if ((vp->v_iflag & VI_INACTNOW) != 0) {
		/*
		 * If it's being deactivated, wait for it to complete.
		 * Make sure to not return a clean vnode.
		 */
		if ((flags & LK_NOWAIT) != 0) {
			vrelel(vp, 0);
			return EBUSY;
		}
		vwait(vp, VI_INACTNOW);
		if ((vp->v_iflag & VI_CLEAN) != 0) {
			vrelel(vp, 0);
			return ENOENT;
		}
	}

	/*
	 * Ok, we got it in good shape.  Just locking left.
	 */
	KASSERT((vp->v_iflag & VI_CLEAN) == 0);
	mutex_exit(vp->v_interlock);
	if (flags & (LK_EXCLUSIVE | LK_SHARED)) {
		error = vn_lock(vp, flags);
		if (error != 0) {
			vrele(vp);
		}
	}
	return error;
}

/*
 * vput: unlock and release the reference.
 */
void
vput(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	VOP_UNLOCK(vp);
	vrele(vp);
}

/*
 * Try to drop a reference on a vnode.  Abort if we are releasing the
 * last reference.  Note: this _must_ succeed if not the last reference.
 */
static inline bool
vtryrele(vnode_t *vp)
{
	u_int use, next;

	for (use = vp->v_usecount;; use = next) {
		if (use == 1) {
			return false;
		}
		KASSERT((use & VC_MASK) > 1);
		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
		if (__predict_true(next == use)) {
			return true;
		}
	}
}

/*
 * Vnode release.  If reference count drops to zero, call inactive
 * routine and either return to freelist or free to the pool.
 */
void
vrelel(vnode_t *vp, int flags)
{
	bool recycle, defer;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_freelisthd == NULL);

	if (__predict_false(vp->v_op == dead_vnodeop_p &&
	    (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
		vnpanic(vp, "dead but not clean");
	}

	/*
	 * If not the last reference, just drop the reference count
	 * and unlock.
	 */
	if (vtryrele(vp)) {
		vp->v_iflag |= VI_INACTREDO;
		mutex_exit(vp->v_interlock);
		return;
	}
	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
		vnpanic(vp, "%s: bad ref count", __func__);
	}

	KASSERT((vp->v_iflag & VI_XLOCK) == 0);

#ifdef DIAGNOSTIC
	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
		vprint("vrelel: missing VOP_CLOSE()", vp);
	}
#endif

	/*
	 * If not clean, deactivate the vnode, but preserve
	 * our reference across the call to VOP_INACTIVE().
	 */
retry:
	if ((vp->v_iflag & VI_CLEAN) == 0) {
		recycle = false;
		vp->v_iflag |= VI_INACTNOW;

		/*
		 * XXX This ugly block can be largely eliminated if
		 * locking is pushed down into the file systems.
		 *
		 * Defer vnode release to vrele_thread if caller
		 * requests it explicitly.
		 */
		if ((curlwp == uvm.pagedaemon_lwp) ||
		    (flags & VRELEL_ASYNC_RELE) != 0) {
			/* The pagedaemon can't wait around; defer. */
			defer = true;
		} else if (curlwp == vrele_lwp) {
			/*
			 * We have to try harder.  But we can't sleep
			 * with VI_INACTNOW as vget() may be waiting on it.
			 */
			vp->v_iflag &= ~(VI_INACTREDO|VI_INACTNOW);
			cv_broadcast(&vp->v_cv);
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE);
			if (error != 0) {
				/* XXX */
				vnpanic(vp, "%s: unable to lock %p",
				    __func__, vp);
			}
			mutex_enter(vp->v_interlock);
			/*
			 * If we did get another reference while
			 * sleeping, don't try to inactivate it yet.
			 */
			if (__predict_false(vtryrele(vp))) {
				VOP_UNLOCK(vp);
				mutex_exit(vp->v_interlock);
				return;
			}
			vp->v_iflag |= VI_INACTNOW;
			mutex_exit(vp->v_interlock);
			defer = false;
		} else if ((vp->v_iflag & VI_LAYER) != 0) {
			/*
			 * Acquiring the stack's lock in vclean() even
			 * for an honest vput/vrele is dangerous because
			 * our caller may hold other vnode locks; defer.
			 */
			defer = true;
		} else {
			/* If we can't acquire the lock, then defer. */
			vp->v_iflag &= ~VI_INACTREDO;
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
			if (error != 0) {
				defer = true;
				mutex_enter(vp->v_interlock);
			} else {
				defer = false;
			}
		}

		if (defer) {
			/*
			 * Defer reclaim to the kthread; it's not safe to
			 * clean it here.  We donate it our last reference.
			 */
			KASSERT(mutex_owned(vp->v_interlock));
			KASSERT((vp->v_iflag & VI_INACTPEND) == 0);
			vp->v_iflag &= ~VI_INACTNOW;
			vp->v_iflag |= VI_INACTPEND;
			mutex_enter(&vrele_lock);
			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
			if (++vrele_pending > (desiredvnodes >> 8))
				cv_signal(&vrele_cv);
			mutex_exit(&vrele_lock);
			cv_broadcast(&vp->v_cv);
			mutex_exit(vp->v_interlock);
			return;
		}

		/*
		 * The vnode can gain another reference while being
		 * deactivated.  If VOP_INACTIVE() indicates that
		 * the described file has been deleted, then recycle
		 * the vnode irrespective of additional references.
		 * Another thread may be waiting to re-use the on-disk
		 * inode.
		 *
		 * Note that VOP_INACTIVE() will drop the vnode lock.
		 */
		VOP_INACTIVE(vp, &recycle);
		mutex_enter(vp->v_interlock);
		vp->v_iflag &= ~VI_INACTNOW;
		cv_broadcast(&vp->v_cv);
		if (!recycle) {
			if (vtryrele(vp)) {
				mutex_exit(vp->v_interlock);
				return;
			}

			/*
			 * If we grew another reference while
			 * VOP_INACTIVE() was underway, retry.
			 */
			if ((vp->v_iflag & VI_INACTREDO) != 0) {
				goto retry;
			}
		}

		/* Take care of space accounting. */
		if (vp->v_iflag & VI_EXECMAP) {
			atomic_add_int(&uvmexp.execpages,
			    -vp->v_uobj.uo_npages);
			atomic_add_int(&uvmexp.filepages,
			    vp->v_uobj.uo_npages);
		}
		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
		vp->v_vflag &= ~VV_MAPPED;

		/*
		 * Recycle the vnode if the file is now unused (unlinked),
		 * otherwise just free it.
		 */
		if (recycle) {
			vclean(vp, DOCLOSE);
		}
		KASSERT(vp->v_usecount > 0);
	}

	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
		/* Gained another reference while being reclaimed. */
		mutex_exit(vp->v_interlock);
		return;
	}

	if ((vp->v_iflag & VI_CLEAN) != 0) {
		/*
		 * It's clean so destroy it.  It isn't referenced
		 * anywhere since it has been reclaimed.
		 */
		KASSERT(vp->v_holdcnt == 0);
		KASSERT(vp->v_writecount == 0);
		mutex_exit(vp->v_interlock);
		vfs_insmntque(vp, NULL);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vnfree(vp);
	} else {
		/*
		 * Otherwise, put it back onto the freelist.  It
		 * can't be destroyed while still associated with
		 * a file system.
		 */
		mutex_enter(&vnode_free_list_lock);
		if (vp->v_holdcnt > 0) {
			vp->v_freelisthd = &vnode_hold_list;
		} else {
			vp->v_freelisthd = &vnode_free_list;
		}
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
		mutex_exit(vp->v_interlock);
	}
}

void
vrele(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, 0);
}

/*
 * Asynchronous vnode release: the vnode is released in a different context.
 */
void
vrele_async(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, VRELEL_ASYNC_RELE);
}

static void
vrele_thread(void *cookie)
{
	vnode_t *vp;

	for (;;) {
		mutex_enter(&vrele_lock);
		while (TAILQ_EMPTY(&vrele_list)) {
			vrele_gen++;
			cv_broadcast(&vrele_cv);
			cv_timedwait(&vrele_cv, &vrele_lock, hz);
		}
		vp = TAILQ_FIRST(&vrele_list);
		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
		vrele_pending--;
		mutex_exit(&vrele_lock);

		/*
		 * If not the last reference, then ignore the vnode
		 * and look for more work.
		 */
		mutex_enter(vp->v_interlock);
		KASSERT((vp->v_iflag & VI_INACTPEND) != 0);
		vp->v_iflag &= ~VI_INACTPEND;
		vrelel(vp, 0);
	}
}

void
vrele_flush(void)
{
	int gen;

	mutex_enter(&vrele_lock);
	gen = vrele_gen;
	while (vrele_pending && gen == vrele_gen) {
		cv_broadcast(&vrele_cv);
		cv_wait(&vrele_cv, &vrele_lock);
	}
	mutex_exit(&vrele_lock);
}

/*
 * Vnode reference, where a reference is already held by some other
 * object (for example, a file structure).
 */
void
vref(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	atomic_inc_uint(&vp->v_usecount);
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
 */
void
vholdl(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_free_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_hold_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt <= 0) {
		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
	}

	vp->v_holdcnt--;
	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_free_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the interlock held, and will return with it held.
 */
void
vclean(vnode_t *vp, int flags)
{
	lwp_t *l = curlwp;
	bool recycle, active;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	/* If cleaning is already in progress wait until done and return. */
	if (vp->v_iflag & VI_XLOCK) {
		vwait(vp, VI_XLOCK);
		return;
	}

	/* If already clean, nothing to do. */
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		return;
	}

	/*
	 * Prevent the vnode from being recycled or brought into use
	 * while we clean it out.
	 */
	vp->v_iflag |= VI_XLOCK;
	if (vp->v_iflag & VI_EXECMAP) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
	active = (vp->v_usecount & VC_MASK) > 1;

	/* XXXAD should not lock vnode under layer */
	mutex_exit(vp->v_interlock);
	VOP_LOCK(vp, LK_EXCLUSIVE);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (flags & DOCLOSE) {
		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
		if (error != 0) {
			/* XXX, fix vn_start_write's grab of mp and use that. */

			if (wapbl_vphaswapbl(vp))
				WAPBL_DISCARD(wapbl_vptomp(vp));
			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
		}
		KASSERT(error == 0);
		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
			spec_node_revoke(vp);
		}
	}
	if (active) {
		VOP_INACTIVE(vp, &recycle);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VI_XLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp);
	}

	/* Disassociate the underlying file system from the vnode. */
	if (VOP_RECLAIM(vp)) {
		vnpanic(vp, "%s: cannot reclaim", __func__);
	}

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);

	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}

	/* Purge name cache. */
	cache_purge(vp);

	/* Done with purge, notify sleepers of the grim news. */
	mutex_enter(vp->v_interlock);
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	KNOTE(&vp->v_klist, NOTE_REVOKE);
	vp->v_iflag &= ~VI_XLOCK;
	vp->v_vflag &= ~VV_LOCKSWORK;
	if ((flags & DOCLOSE) != 0) {
		vp->v_iflag |= VI_CLEAN;
	}
	cv_broadcast(&vp->v_cv);

	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	mutex_enter(vp->v_interlock);
	if (vp->v_usecount != 0) {
		mutex_exit(vp->v_interlock);
		return 0;
	}
	if (inter_lkp) {
		mutex_exit(inter_lkp);
	}
	vremfree(vp);
	vp->v_usecount = 1;
	vclean(vp, DOCLOSE);
	vrelel(vp, 0);
	return 1;
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vrevoke(vnode_t *vp)
{
	vnode_t *vq;
	enum vtype type;
	dev_t dev;

	KASSERT(vp->v_usecount > 0);

	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		mutex_exit(vp->v_interlock);
		return;
	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
		atomic_inc_uint(&vp->v_usecount);
		vclean(vp, DOCLOSE);
		vrelel(vp, 0);
		return;
	} else {
		dev = vp->v_rdev;
		type = vp->v_type;
		mutex_exit(vp->v_interlock);
	}

	while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
		mutex_enter(vq->v_interlock);
		vclean(vq, DOCLOSE);
		vrelel(vq, 0);
	}
}

/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);
	vclean(vp, DOCLOSE);
	vrelel(vp, 0);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(struct buf *bp)
{
	vnode_t *vp;

	if ((vp = bp->b_vp) == NULL)
		return;

	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(bp->b_objlock));

	if (--vp->v_numoutput < 0)
		vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
	if (vp->v_numoutput == 0)
		cv_broadcast(&vp->v_cv);
}

/*
 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
 * recycled.
 */
void
vwait(vnode_t *vp, int flags)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount != 0);

	while ((vp->v_iflag & flags) != 0)
		cv_wait(&vp->v_cv, vp->v_interlock);
}

int
vfs_drainvnodes(long target)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	while (numvnodes > target) {
		error = cleanvnode();
		if (error != 0)
			return error;
		mutex_enter(&vnode_free_list_lock);
	}

	mutex_exit(&vnode_free_list_lock);

	return 0;
}

void
vnpanic(vnode_t *vp, const char *fmt, ...)
{
	va_list ap;

#ifdef DIAGNOSTIC
	vprint(NULL, vp);
#endif
	va_start(ap, fmt);
	vpanic(fmt, ap);
	va_end(ap);
}