/*	$NetBSD: vfs_vnode.c,v 1.16 2012/10/12 21:10:55 rmind Exp $	*/

/*-
 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * The vnode cache subsystem.
 *
 * Life-cycle
 *
 *	Normally, there are two points where new vnodes are created:
 *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 *	starts in one of the following ways:
 *
 *	- Allocation, via getnewvnode(9) and/or vnalloc(9).
 *	- Reclamation of an inactive vnode, via vget(9).
 *
 *	Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
 *	was another, traditional way.  Currently, only the draining thread
 *	recycles the vnodes.  This behaviour might be revisited.
 *
 *	The life-cycle ends when the last reference is dropped, usually
 *	in VOP_REMOVE(9).  In such a case, VOP_INACTIVE(9) is called to
 *	inform the file system that the vnode is inactive.  Via this call,
 *	the file system indicates whether the vnode can be recycled
 *	(usually, it checks its own references, e.g. the count of links,
 *	and whether the file was removed).
 *
 *	Depending on that indication, the vnode can be put onto a free list
 *	(cache), or cleaned via vclean(9), which calls VOP_RECLAIM(9) to
 *	disassociate the underlying file system from the vnode, and finally
 *	destroyed.
 *
 * Reference counting
 *
 *	A vnode is considered active if its reference count
 *	(vnode_t::v_usecount) is non-zero.  The count is maintained using
 *	the vref(9), vrele(9) and vput(9) routines.  Common points holding
 *	references are, e.g., open files, the current working directory,
 *	mount points, etc.
 *
 * Note on v_usecount and its locking
 *
 *	At nearly all points where it is known that v_usecount could be
 *	zero, the vnode_t::v_interlock will be held.  To change v_usecount
 *	away from zero, the interlock must be held.  To change from a
 *	non-zero value to zero, again the interlock must be held.
 *
 *	There is a flag bit, VC_XLOCK, embedded in v_usecount.  To raise
 *	v_usecount while the VC_XLOCK bit is set in it, the interlock must
 *	be held.  To modify the VC_XLOCK bit, the interlock must be held.
 *	We always keep the usecount (v_usecount & VC_MASK) non-zero while
 *	the VC_XLOCK bit is set.
 *
 *	Unless the VC_XLOCK bit is set, changing the usecount from a
 *	non-zero value to a non-zero value can safely be done using atomic
 *	operations, without the interlock held.
 *
 *	Even if the VC_XLOCK bit is set, decreasing the usecount to a
 *	non-zero value can be done using atomic operations, without the
 *	interlock held.
 *
 *	Note: if VI_CLEAN is set, vnode_t::v_interlock will be released
 *	while mntvnode_lock is still held.
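 *
 *	For example, vtryget() below raises the count with an atomic
 *	compare-and-swap, but refuses (so the caller must take the
 *	interlock) whenever the count is zero or VC_XLOCK is set, while
 *	vtryrele() refuses to drop the last reference without the
 *	interlock but may otherwise decrement lock-free.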
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.16 2012/10/12 21:10:55 rmind Exp $");

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>

u_int			numvnodes		__cacheline_aligned;

static pool_cache_t	vnode_cache		__read_mostly;

/*
 * There are two free lists: one is for vnodes which have no buffer/page
 * references and one for those which do (i.e. v_holdcnt is non-zero).
 * Vnode recycling mechanism first attempts to look into the former list.
 */
static kmutex_t		vnode_free_list_lock	__cacheline_aligned;
static vnodelst_t	vnode_free_list		__cacheline_aligned;
static vnodelst_t	vnode_hold_list		__cacheline_aligned;
static kcondvar_t	vdrain_cv		__cacheline_aligned;

static vnodelst_t	vrele_list		__cacheline_aligned;
static kmutex_t		vrele_lock		__cacheline_aligned;
static kcondvar_t	vrele_cv		__cacheline_aligned;
static lwp_t *		vrele_lwp		__cacheline_aligned;
static int		vrele_pending		__cacheline_aligned;
static int		vrele_gen		__cacheline_aligned;

static int		cleanvnode(void);
static void		vdrain_thread(void *);
static void		vrele_thread(void *);
static void		vnpanic(vnode_t *, const char *, ...)
    __attribute__((__format__(__printf__, 2, 3)));

/* Routines having to do with the management of the vnode table. */
extern int		(**dead_vnodeop_p)(void *);

void
vfs_vnode_sysinit(void)
{
	int error;

	vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl",
	    NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vnode_cache != NULL);

	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vrele_list);

	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vdrain_cv, "vdrain");
	cv_init(&vrele_cv, "vrele");
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
	    NULL, NULL, "vdrain");
	KASSERT(error == 0);
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
	    NULL, &vrele_lwp, "vrele");
	KASSERT(error == 0);
}

/*
 * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
 * marker vnode.
 */
vnode_t *
vnalloc(struct mount *mp)
{
	vnode_t *vp;

	vp = pool_cache_get(vnode_cache, PR_WAITOK);
	KASSERT(vp != NULL);

	memset(vp, 0, sizeof(*vp));
	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
	cv_init(&vp->v_cv, "vnode");
	/*
	 * Done by memset() above.
	 *	LIST_INIT(&vp->v_nclist);
	 *	LIST_INIT(&vp->v_dnclist);
	 */

	if (mp != NULL) {
		vp->v_mount = mp;
		vp->v_type = VBAD;
		vp->v_iflag = VI_MARKER;
	} else {
		rw_init(&vp->v_lock);
	}

	return vp;
}

/*
 * Free an unused, unreferenced vnode.
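 *
 * The caller must guarantee that the last reference has been dropped
 * (v_usecount is zero); for non-marker vnodes this also decrements the
 * global numvnodes count.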
 */
void
vnfree(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 0);

	if ((vp->v_iflag & VI_MARKER) == 0) {
		rw_destroy(&vp->v_lock);
		mutex_enter(&vnode_free_list_lock);
		numvnodes--;
		mutex_exit(&vnode_free_list_lock);
	}

	/*
	 * Note: the vnode interlock will either be freed, or the reference
	 * dropped (if VI_LOCKSHARE was in use).
	 */
	uvm_obj_destroy(&vp->v_uobj, true);
	cv_destroy(&vp->v_cv);
	pool_cache_put(vnode_cache, vp);
}

/*
 * cleanvnode: grab a vnode from freelist, clean and free it.
 *
 * => Releases vnode_free_list_lock.
 */
static int
cleanvnode(void)
{
	vnode_t *vp;
	vnodelst_t *listhd;

	KASSERT(mutex_owned(&vnode_free_list_lock));
retry:
	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		/*
		 * It's safe to test v_usecount and v_iflag
		 * without holding the interlock here, since
		 * these vnodes should never appear on the
		 * lists.
		 */
		KASSERT(vp->v_usecount == 0);
		KASSERT((vp->v_iflag & VI_CLEAN) == 0);
		KASSERT(vp->v_freelisthd == listhd);

		if (!mutex_tryenter(vp->v_interlock))
			continue;
		if ((vp->v_iflag & VI_XLOCK) == 0)
			break;
		mutex_exit(vp->v_interlock);
	}

	if (vp == NULL) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		mutex_exit(&vnode_free_list_lock);
		return EBUSY;
	}

	/* Remove it from the freelist. */
	TAILQ_REMOVE(listhd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);

	KASSERT(vp->v_usecount == 0);

	/*
	 * The vnode is still associated with a file system, so we must
	 * clean it out before freeing it.  We need to add a reference
	 * before doing this.  If the vnode gains another reference while
	 * being cleaned out then we lose - retry.
	 */
	atomic_add_int(&vp->v_usecount, 1 + VC_XLOCK);
	vclean(vp, DOCLOSE);
	KASSERT(vp->v_usecount >= 1 + VC_XLOCK);
	atomic_add_int(&vp->v_usecount, -VC_XLOCK);
	if (vp->v_usecount > 1) {
		/*
		 * Don't return to freelist - the holder of the last
		 * reference will destroy it.
		 */
		vrelel(vp, 0); /* releases vp->v_interlock */
		mutex_enter(&vnode_free_list_lock);
		goto retry;
	}

	KASSERT((vp->v_iflag & VI_CLEAN) == VI_CLEAN);
	mutex_exit(vp->v_interlock);
	if (vp->v_type == VBLK || vp->v_type == VCHR) {
		spec_node_destroy(vp);
	}
	vp->v_type = VNON;

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);
	KASSERT(TAILQ_EMPTY(&vp->v_uobj.memq));
	KASSERT(vp->v_numoutput == 0);
	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);

	vrele(vp);

	return 0;
}

/*
 * getnewvnode: return a fresh vnode.
 *
 * => Returns referenced vnode, moved into the mount queue.
 * => Shares the interlock specified by 'slock', if it is not NULL.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    kmutex_t *slock, vnode_t **vpp)
{
	struct uvm_object *uobj;
	vnode_t *vp;
	int error = 0;

	if (mp != NULL) {
		/*
		 * Mark filesystem busy while we are creating a vnode.
		 * If unmount is in progress, this will fail.
		 */
		error = vfs_busy(mp, NULL);
		if (error)
			return error;
	}

	vp = NULL;

	/* Allocate a new vnode. */
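	/*
	 * The global count is bumped before the allocation; once it is
	 * more than ~10% over desiredvnodes, the vdrain thread is kicked
	 * to start recycling vnodes from the free lists.
	 */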
	mutex_enter(&vnode_free_list_lock);
	numvnodes++;
	if (numvnodes > desiredvnodes + desiredvnodes / 10)
		cv_signal(&vdrain_cv);
	mutex_exit(&vnode_free_list_lock);
	vp = vnalloc(NULL);

	KASSERT(vp->v_freelisthd == NULL);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));

	/* Initialize vnode. */
	vp->v_usecount = 1;
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	vp->v_data = NULL;

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	/* Share the vnode_t::v_interlock, if requested. */
	if (slock) {
		/* Set the interlock and mark that it is shared. */
		KASSERT(vp->v_mount == NULL);
		mutex_obj_hold(slock);
		uvm_obj_setlock(&vp->v_uobj, slock);
		KASSERT(vp->v_interlock == slock);
		vp->v_iflag |= VI_LOCKSHARE;
	}

	/* Finally, move vnode into the mount queue. */
	vfs_insmntque(vp, mp);

	if (mp != NULL) {
		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
			vp->v_vflag |= VV_MPSAFE;
		vfs_unbusy(mp, true, NULL);
	}

	*vpp = vp;
	return 0;
}

/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 1);
	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_freelisthd == NULL);

	mutex_enter(vp->v_interlock);
	vp->v_iflag |= VI_CLEAN;
	vrelel(vp, 0);
}

/*
 * Helper thread to keep the number of vnodes below desiredvnodes.
 */
static void
vdrain_thread(void *cookie)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	for (;;) {
		cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
		while (numvnodes > desiredvnodes) {
			error = cleanvnode();
			if (error)
				kpause("vndsbusy", false, hz, NULL);
			mutex_enter(&vnode_free_list_lock);
			if (error)
				break;
		}
	}
}

/*
 * Remove a vnode from its freelist.
 */
void
vremfree(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount == 0);

	/*
	 * Note that the reference count must not change until
	 * the vnode is removed.
	 */
	mutex_enter(&vnode_free_list_lock);
	if (vp->v_holdcnt > 0) {
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
	} else {
		KASSERT(vp->v_freelisthd == &vnode_free_list);
	}
	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);
}

/*
 * Try to gain a reference to a vnode, without acquiring its interlock.
 * The caller must hold a lock that will prevent the vnode from being
 * recycled or freed.
 */
bool
vtryget(vnode_t *vp)
{
	u_int use, next;

	/*
	 * If the vnode is being freed, don't make life any harder
	 * for vclean() by adding another reference without waiting.
	 * This is not strictly necessary, but we'll do it anyway.
	 */
	if (__predict_false((vp->v_iflag & VI_XLOCK) != 0)) {
		return false;
	}
	for (use = vp->v_usecount;; use = next) {
		if (use == 0 || __predict_false((use & VC_XLOCK) != 0)) {
			/* Need interlock held if first reference. */
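			/*
			 * A 0 -> 1 transition would also have to remove
			 * the vnode from its freelist, which requires
			 * v_interlock (see vget() and vremfree()).
			 */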
			return false;
		}
		next = atomic_cas_uint(&vp->v_usecount, use, use + 1);
		if (__predict_true(next == use)) {
			return true;
		}
	}
}

/*
 * vget: get a particular vnode from the free list, increment its reference
 * count and lock it.
 *
 * => Should be called with v_interlock held.
 *
 * If VI_XLOCK is set, the vnode is being eliminated in vgone()/vclean().
 * In that case, we cannot grab the vnode, so the process is awakened when
 * the transition is completed, and an error returned to indicate that the
 * vnode is no longer usable (e.g. changed to a new file system type).
 */
int
vget(vnode_t *vp, int flags)
{
	int error = 0;

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT)) == 0);

	/*
	 * Before adding a reference, we must remove the vnode
	 * from its freelist.
	 */
	if (vp->v_usecount == 0) {
		vremfree(vp);
		vp->v_usecount = 1;
	} else {
		atomic_inc_uint(&vp->v_usecount);
	}

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking if
	 * the VI_XLOCK flag is set.
	 */
	if ((vp->v_iflag & VI_XLOCK) != 0) {
		if ((flags & LK_NOWAIT) != 0) {
			vrelel(vp, 0);
			return EBUSY;
		}
		vwait(vp, VI_XLOCK);
		vrelel(vp, 0);
		return ENOENT;
	}

	/*
	 * Ok, we got it in good shape.  Just locking left.
	 */
	KASSERT((vp->v_iflag & VI_CLEAN) == 0);
	mutex_exit(vp->v_interlock);
	if (flags & (LK_EXCLUSIVE | LK_SHARED)) {
		error = vn_lock(vp, flags);
		if (error != 0) {
			vrele(vp);
		}
	}
	return error;
}

/*
 * vput: unlock and release the reference.
 */
void
vput(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	VOP_UNLOCK(vp);
	vrele(vp);
}

/*
 * Try to drop reference on a vnode.  Abort if we are releasing the
 * last reference.  Note: this _must_ succeed if not the last reference.
 */
static inline bool
vtryrele(vnode_t *vp)
{
	u_int use, next;

	for (use = vp->v_usecount;; use = next) {
		if (use == 1) {
			return false;
		}
		KASSERT((use & VC_MASK) > 1);
		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
		if (__predict_true(next == use)) {
			return true;
		}
	}
}

/*
 * Vnode release.  If reference count drops to zero, call inactive
 * routine and either return to freelist or free to the pool.
 */
void
vrelel(vnode_t *vp, int flags)
{
	bool recycle, defer;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_freelisthd == NULL);

	if (__predict_false(vp->v_op == dead_vnodeop_p &&
	    (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
		vnpanic(vp, "dead but not clean");
	}

	/*
	 * If not the last reference, just drop the reference count
	 * and unlock.
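	 * VI_INACTREDO records that the count changed here, so that a
	 * concurrent vrelel() which is in the middle of VOP_INACTIVE()
	 * knows to re-check it (see the retry logic below).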
	 */
	if (vtryrele(vp)) {
		vp->v_iflag |= VI_INACTREDO;
		mutex_exit(vp->v_interlock);
		return;
	}
	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
		vnpanic(vp, "%s: bad ref count", __func__);
	}

	KASSERT((vp->v_iflag & VI_XLOCK) == 0);

#ifdef DIAGNOSTIC
	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
		vprint("vrelel: missing VOP_CLOSE()", vp);
	}
#endif

	/*
	 * If not clean, deactivate the vnode, but preserve
	 * our reference across the call to VOP_INACTIVE().
	 */
retry:
	if ((vp->v_iflag & VI_CLEAN) == 0) {
		recycle = false;
		vp->v_iflag |= VI_INACTNOW;

		/*
		 * XXX This ugly block can be largely eliminated if
		 * locking is pushed down into the file systems.
		 *
		 * Defer vnode release to vrele_thread if caller
		 * requests it explicitly.
		 */
		if ((curlwp == uvm.pagedaemon_lwp) ||
		    (flags & VRELEL_ASYNC_RELE) != 0) {
			/* The pagedaemon can't wait around; defer. */
			defer = true;
		} else if (curlwp == vrele_lwp) {
			/* We have to try harder. */
			vp->v_iflag &= ~VI_INACTREDO;
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE);
			if (error != 0) {
				/* XXX */
				vnpanic(vp, "%s: unable to lock %p",
				    __func__, vp);
			}
			defer = false;
		} else if ((vp->v_iflag & VI_LAYER) != 0) {
			/*
			 * Acquiring the stack's lock in vclean() even
			 * for an honest vput/vrele is dangerous because
			 * our caller may hold other vnode locks; defer.
			 */
			defer = true;
		} else {
			/* If we can't acquire the lock, then defer. */
			vp->v_iflag &= ~VI_INACTREDO;
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
			if (error != 0) {
				defer = true;
				mutex_enter(vp->v_interlock);
			} else {
				defer = false;
			}
		}

		if (defer) {
			/*
			 * Defer reclaim to the kthread; it's not safe to
			 * clean it here.  We donate it our last reference.
			 */
			KASSERT(mutex_owned(vp->v_interlock));
			KASSERT((vp->v_iflag & VI_INACTPEND) == 0);
			vp->v_iflag &= ~VI_INACTNOW;
			vp->v_iflag |= VI_INACTPEND;
			mutex_enter(&vrele_lock);
			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
			if (++vrele_pending > (desiredvnodes >> 8))
				cv_signal(&vrele_cv);
			mutex_exit(&vrele_lock);
			mutex_exit(vp->v_interlock);
			return;
		}

		/*
		 * The vnode can gain another reference while being
		 * deactivated.  If VOP_INACTIVE() indicates that
		 * the described file has been deleted, then recycle
		 * the vnode irrespective of additional references.
		 * Another thread may be waiting to re-use the on-disk
		 * inode.
		 *
		 * Note that VOP_INACTIVE() will drop the vnode lock.
		 */
		VOP_INACTIVE(vp, &recycle);
		mutex_enter(vp->v_interlock);
		vp->v_iflag &= ~VI_INACTNOW;
		if (!recycle) {
			if (vtryrele(vp)) {
				mutex_exit(vp->v_interlock);
				return;
			}

			/*
			 * If we grew another reference while
			 * VOP_INACTIVE() was underway, retry.
			 */
			if ((vp->v_iflag & VI_INACTREDO) != 0) {
				goto retry;
			}
		}

		/* Take care of space accounting. */
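		/*
		 * Pages of an executable mapping were accounted as
		 * execpages; move them back to filepages now that the
		 * vnode is no longer in active use, and clear the
		 * text/mapping flags.
		 */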
		if (vp->v_iflag & VI_EXECMAP) {
			atomic_add_int(&uvmexp.execpages,
			    -vp->v_uobj.uo_npages);
			atomic_add_int(&uvmexp.filepages,
			    vp->v_uobj.uo_npages);
		}
		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
		vp->v_vflag &= ~VV_MAPPED;

		/*
		 * Recycle the vnode if the file is now unused (unlinked),
		 * otherwise just free it.
		 */
		if (recycle) {
			vclean(vp, DOCLOSE);
		}
		KASSERT(vp->v_usecount > 0);
	}

	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
		/* Gained another reference while being reclaimed. */
		mutex_exit(vp->v_interlock);
		return;
	}

	if ((vp->v_iflag & VI_CLEAN) != 0) {
		/*
		 * It's clean so destroy it.  It isn't referenced
		 * anywhere since it has been reclaimed.
		 */
		KASSERT(vp->v_holdcnt == 0);
		KASSERT(vp->v_writecount == 0);
		mutex_exit(vp->v_interlock);
		vfs_insmntque(vp, NULL);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vnfree(vp);
	} else {
		/*
		 * Otherwise, put it back onto the freelist.  It
		 * can't be destroyed while still associated with
		 * a file system.
		 */
		mutex_enter(&vnode_free_list_lock);
		if (vp->v_holdcnt > 0) {
			vp->v_freelisthd = &vnode_hold_list;
		} else {
			vp->v_freelisthd = &vnode_free_list;
		}
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
		mutex_exit(vp->v_interlock);
	}
}

void
vrele(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, 0);
}

/*
 * Asynchronous vnode release, vnode is released in different context.
 */
void
vrele_async(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, VRELEL_ASYNC_RELE);
}

static void
vrele_thread(void *cookie)
{
	vnode_t *vp;

	for (;;) {
		mutex_enter(&vrele_lock);
		while (TAILQ_EMPTY(&vrele_list)) {
			vrele_gen++;
			cv_broadcast(&vrele_cv);
			cv_timedwait(&vrele_cv, &vrele_lock, hz);
		}
		vp = TAILQ_FIRST(&vrele_list);
		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
		vrele_pending--;
		mutex_exit(&vrele_lock);

		/*
		 * If not the last reference, then ignore the vnode
		 * and look for more work.
		 */
		mutex_enter(vp->v_interlock);
		KASSERT((vp->v_iflag & VI_INACTPEND) != 0);
		vp->v_iflag &= ~VI_INACTPEND;
		vrelel(vp, 0);
	}
}

void
vrele_flush(void)
{
	int gen;

	mutex_enter(&vrele_lock);
	gen = vrele_gen;
	while (vrele_pending && gen == vrele_gen) {
		cv_broadcast(&vrele_cv);
		cv_wait(&vrele_cv, &vrele_lock);
	}
	mutex_exit(&vrele_lock);
}

/*
 * Vnode reference, where a reference is already held by some other
 * object (for example, a file structure).
 */
void
vref(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	atomic_inc_uint(&vp->v_usecount);
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
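 *
 * The hold count keeps an otherwise unreferenced vnode on vnode_hold_list
 * rather than vnode_free_list, so it is not an early candidate for
 * recycling.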
 */
void
vholdl(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_free_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_hold_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt <= 0) {
		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
	}

	vp->v_holdcnt--;
	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_free_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the interlock held, and will return with it held.
 */
void
vclean(vnode_t *vp, int flags)
{
	lwp_t *l = curlwp;
	bool recycle, active;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	/* If cleaning is already in progress wait until done and return. */
	if (vp->v_iflag & VI_XLOCK) {
		vwait(vp, VI_XLOCK);
		return;
	}

	/* If already clean, nothing to do. */
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		return;
	}

	/*
	 * Prevent the vnode from being recycled or brought into use
	 * while we clean it out.
	 */
	vp->v_iflag |= VI_XLOCK;
	if (vp->v_iflag & VI_EXECMAP) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
	active = (vp->v_usecount & VC_MASK) > 1;

	/* XXXAD should not lock vnode under layer */
	mutex_exit(vp->v_interlock);
	VOP_LOCK(vp, LK_EXCLUSIVE);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (flags & DOCLOSE) {
		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
		if (error != 0) {
			/* XXX, fix vn_start_write's grab of mp and use that. */

			if (wapbl_vphaswapbl(vp))
				WAPBL_DISCARD(wapbl_vptomp(vp));
			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
		}
		KASSERT(error == 0);
		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
			spec_node_revoke(vp);
		}
	}
	if (active) {
		VOP_INACTIVE(vp, &recycle);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VI_XLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp);
	}

	/* Disassociate the underlying file system from the vnode. */
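	/*
	 * VOP_RECLAIM() is expected to release all file system specific
	 * state, leaving v_data NULL and no pages attached to the object
	 * (see the assertions below).
	 */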
	if (VOP_RECLAIM(vp)) {
		vnpanic(vp, "%s: cannot reclaim", __func__);
	}

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);

	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}

	/* Purge name cache. */
	cache_purge(vp);

	/* Done with purge, notify sleepers of the grim news. */
	mutex_enter(vp->v_interlock);
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	KNOTE(&vp->v_klist, NOTE_REVOKE);
	vp->v_iflag &= ~VI_XLOCK;
	vp->v_vflag &= ~VV_LOCKSWORK;
	if ((flags & DOCLOSE) != 0) {
		vp->v_iflag |= VI_CLEAN;
	}
	cv_broadcast(&vp->v_cv);

	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	mutex_enter(vp->v_interlock);
	if (vp->v_usecount != 0) {
		mutex_exit(vp->v_interlock);
		return 0;
	}
	if (inter_lkp) {
		mutex_exit(inter_lkp);
	}
	vremfree(vp);
	vp->v_usecount = 1;
	vclean(vp, DOCLOSE);
	vrelel(vp, 0);
	return 1;
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vrevoke(vnode_t *vp)
{
	vnode_t *vq, **vpp;
	enum vtype type;
	dev_t dev;

	KASSERT(vp->v_usecount > 0);

	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		mutex_exit(vp->v_interlock);
		return;
	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
		atomic_inc_uint(&vp->v_usecount);
		vclean(vp, DOCLOSE);
		vrelel(vp, 0);
		return;
	} else {
		dev = vp->v_rdev;
		type = vp->v_type;
		mutex_exit(vp->v_interlock);
	}

	vpp = &specfs_hash[SPECHASH(dev)];
	mutex_enter(&device_lock);
	for (vq = *vpp; vq != NULL;) {
		/* If clean or being cleaned, then ignore it. */
		mutex_enter(vq->v_interlock);
		if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 ||
		    vq->v_type != type || vq->v_rdev != dev) {
			mutex_exit(vq->v_interlock);
			vq = vq->v_specnext;
			continue;
		}
		mutex_exit(&device_lock);
		if (vq->v_usecount == 0) {
			vremfree(vq);
			vq->v_usecount = 1;
		} else {
			atomic_inc_uint(&vq->v_usecount);
		}
		vclean(vq, DOCLOSE);
		vrelel(vq, 0);
		mutex_enter(&device_lock);
		vq = *vpp;
	}
	mutex_exit(&device_lock);
}

/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);
	vclean(vp, DOCLOSE);
	vrelel(vp, 0);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
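 *
 * Called with the buffer's object lock (the vnode's v_interlock) held
 * when an output buffer completes; wakes threads sleeping on v_cv once
 * v_numoutput reaches zero.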
 */
void
vwakeup(struct buf *bp)
{
	vnode_t *vp;

	if ((vp = bp->b_vp) == NULL)
		return;

	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(bp->b_objlock));

	if (--vp->v_numoutput < 0)
		vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
	if (vp->v_numoutput == 0)
		cv_broadcast(&vp->v_cv);
}

/*
 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
 * recycled.
 */
void
vwait(vnode_t *vp, int flags)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount != 0);

	while ((vp->v_iflag & flags) != 0)
		cv_wait(&vp->v_cv, vp->v_interlock);
}

int
vfs_drainvnodes(long target)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	while (numvnodes > target) {
		error = cleanvnode();
		if (error != 0)
			return error;
		mutex_enter(&vnode_free_list_lock);
	}

	mutex_exit(&vnode_free_list_lock);

	return 0;
}

void
vnpanic(vnode_t *vp, const char *fmt, ...)
{
	va_list ap;

#ifdef DIAGNOSTIC
	vprint(NULL, vp);
#endif
	va_start(ap, fmt);
	vpanic(fmt, ap);
	va_end(ap);
}