/*	$NetBSD: vfs_vnode.c,v 1.8 2011/05/19 03:26:06 rmind Exp $	*/

/*-
 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * The vnode cache subsystem.
 *
 * Life-cycle
 *
 *	Normally, there are two points where new vnodes are created:
 *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 *	starts in one of the following ways:
 *
 *	- Allocation, via getnewvnode(9) and/or vnalloc(9).
 *	- Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9).
 *	- Reclamation of an inactive vnode, via vget(9).
 *
 *	The life-cycle ends when the last reference is dropped, usually
 *	in VOP_REMOVE(9).  In that case, VOP_INACTIVE(9) is called to inform
 *	the file system that the vnode is inactive.  Via this call, the file
 *	system indicates whether the vnode should be recycled (usually the
 *	link count is checked, i.e. whether the file was removed).
 *
 *	Depending on that indication, the vnode is either put onto a free
 *	list (the cache), or cleaned via vclean(9), which calls VOP_RECLAIM(9)
 *	to disassociate the underlying file system from the vnode, and is
 *	finally destroyed.
 *
 * Reference counting
 *
 *	A vnode is considered active if its reference count
 *	(vnode_t::v_usecount) is non-zero.  The count is maintained with the
 *	vref(9) and vrele(9) routines, as well as vput(9).  Common holders
 *	of references are, e.g., open files, the current working directory,
 *	mount points, etc.
 *
 * Note on v_usecount and its locking
 *
 *	At nearly all points where it is known that v_usecount could be
 *	zero, vnode_t::v_interlock will be held.  To change v_usecount away
 *	from zero, the interlock must be held.  To change it from a non-zero
 *	value to zero, the interlock must again be held.
 *
 *	There is a flag bit, VC_XLOCK, embedded in v_usecount.  To raise
 *	v_usecount while the VC_XLOCK bit is set in it, the interlock must
 *	be held.  To modify the VC_XLOCK bit, the interlock must be held.
 *	We always keep the usecount (v_usecount & VC_MASK) non-zero while
 *	the VC_XLOCK bit is set.
 *
 *	Unless the VC_XLOCK bit is set, changing the usecount from a non-zero
 *	value to a non-zero value can safely be done using atomic operations,
 *	without the interlock held.
 *
 *	Even if the VC_XLOCK bit is set, decreasing the usecount to a non-zero
 *	value can be done using atomic operations, without the interlock held.
 *
 *	Note: if VI_CLEAN is set, vnode_t::v_interlock will be released while
 *	mntvnode_lock is still held.
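 *
 * Typical usage (an illustrative sketch only; the locking rules above apply)
 *
 *	An additional reference on an already active vnode is taken with
 *	vref(9) and dropped with vrele(9).  A vnode that may be sitting on
 *	a free list is activated with vget(9), called with the interlock
 *	held, e.g.:
 *
 *		mutex_enter(&vp->v_interlock);
 *		error = vget(vp, LK_EXCLUSIVE);	(releases v_interlock)
 *		if (error == 0) {
 *			use the locked, referenced vnode ...
 *			vput(vp);		(VOP_UNLOCK(9) plus vrele(9))
 *		}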
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.8 2011/05/19 03:26:06 rmind Exp $");

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>

u_int			numvnodes		__cacheline_aligned;

static pool_cache_t	vnode_cache		__read_mostly;
static kmutex_t		vnode_free_list_lock	__cacheline_aligned;

static vnodelst_t	vnode_free_list		__cacheline_aligned;
static vnodelst_t	vnode_hold_list		__cacheline_aligned;
static vnodelst_t	vrele_list		__cacheline_aligned;

static kmutex_t		vrele_lock		__cacheline_aligned;
static kcondvar_t	vrele_cv		__cacheline_aligned;
static lwp_t *		vrele_lwp		__cacheline_aligned;
static int		vrele_pending		__cacheline_aligned;
static int		vrele_gen		__cacheline_aligned;

static vnode_t *	getcleanvnode(void);
static void		vrele_thread(void *);
static void		vpanic(vnode_t *, const char *);

/* Routines having to do with the management of the vnode table. */
extern int		(**dead_vnodeop_p)(void *);

void
vfs_vnode_sysinit(void)
{
	int error;

	vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl",
	    NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vnode_cache != NULL);

	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vrele_list);

	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vrele_cv, "vrele");
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
	    NULL, &vrele_lwp, "vrele");
	KASSERT(error == 0);
}

/*
 * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
 * marker vnode and we are prepared to wait for the allocation.
 */
vnode_t *
vnalloc(struct mount *mp)
{
	vnode_t *vp;

	vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT));
	if (vp == NULL) {
		return NULL;
	}

	memset(vp, 0, sizeof(*vp));
	UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0);
	cv_init(&vp->v_cv, "vnode");
	/*
	 * Done by memset() above.
	 *	LIST_INIT(&vp->v_nclist);
	 *	LIST_INIT(&vp->v_dnclist);
	 */

	if (mp != NULL) {
		vp->v_mount = mp;
		vp->v_type = VBAD;
		vp->v_iflag = VI_MARKER;
	} else {
		rw_init(&vp->v_lock);
	}

	return vp;
}

/*
 * Free an unused, unreferenced vnode.
 */
void
vnfree(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 0);

	if ((vp->v_iflag & VI_MARKER) == 0) {
		rw_destroy(&vp->v_lock);
		mutex_enter(&vnode_free_list_lock);
		numvnodes--;
		mutex_exit(&vnode_free_list_lock);
	}

	UVM_OBJ_DESTROY(&vp->v_uobj);
	cv_destroy(&vp->v_cv);
	pool_cache_put(vnode_cache, vp);
}

/*
 * getcleanvnode: grab a vnode from freelist and clean it.
 *
 * => Releases vnode_free_list_lock.
 * => Returns referenced vnode on success.
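 * => Returns NULL if no suitable vnode was found on either free list.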
 */
static vnode_t *
getcleanvnode(void)
{
	vnode_t *vp;
	vnodelst_t *listhd;

	KASSERT(mutex_owned(&vnode_free_list_lock));
retry:
	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		/*
		 * It's safe to test v_usecount and v_iflag
		 * without holding the interlock here, since
		 * these vnodes should never appear on the
		 * lists.
		 */
		KASSERT(vp->v_usecount == 0);
		KASSERT((vp->v_iflag & VI_CLEAN) == 0);
		KASSERT(vp->v_freelisthd == listhd);

		if (!mutex_tryenter(&vp->v_interlock))
			continue;
		if ((vp->v_iflag & VI_XLOCK) == 0)
			break;
		mutex_exit(&vp->v_interlock);
	}

	if (vp == NULL) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		mutex_exit(&vnode_free_list_lock);
		return NULL;
	}

	/* Remove it from the freelist. */
	TAILQ_REMOVE(listhd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);

	KASSERT(vp->v_usecount == 0);

	/*
	 * The vnode is still associated with a file system, so we must
	 * clean it out before reusing it.  We need to add a reference
	 * before doing this.  If the vnode gains another reference while
	 * being cleaned out then we lose - retry.
	 */
	atomic_add_int(&vp->v_usecount, 1 + VC_XLOCK);
	vclean(vp, DOCLOSE);
	KASSERT(vp->v_usecount >= 1 + VC_XLOCK);
	atomic_add_int(&vp->v_usecount, -VC_XLOCK);
	if (vp->v_usecount == 1) {
		/* We're about to dirty it. */
		vp->v_iflag &= ~VI_CLEAN;
		mutex_exit(&vp->v_interlock);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vp->v_type = VNON;
	} else {
		/*
		 * Don't return to freelist - the holder of the last
		 * reference will destroy it.
		 */
		vrelel(vp, 0); /* releases vp->v_interlock */
		mutex_enter(&vnode_free_list_lock);
		goto retry;
	}

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);
	KASSERT(TAILQ_EMPTY(&vp->v_uobj.memq));
	KASSERT(vp->v_numoutput == 0);
	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);

	return vp;
}

/*
 * getnewvnode: return the next vnode from the free list.
 *
 * => Returns referenced vnode, moved into the mount queue.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    vnode_t **vpp)
{
	struct uvm_object *uobj;
	static int toggle;
	vnode_t *vp;
	int error = 0, tryalloc;

try_again:
	if (mp != NULL) {
		/*
		 * Mark filesystem busy while we are creating a vnode.
		 * If unmount is in progress, this will fail.
		 */
		error = vfs_busy(mp, NULL);
		if (error)
			return error;
	}

	/*
	 * We must choose whether to allocate a new vnode or recycle an
	 * existing one.  The criterion for allocating a new one is that
	 * the total number of vnodes is less than the number desired or
	 * there are no vnodes on either free list.  Generally we only
	 * want to recycle vnodes that have no buffers associated with
	 * them, so we look first on the vnode_free_list.  If it is empty,
	 * we next consider vnodes with referencing buffers on the
	 * vnode_hold_list.  The toggle ensures that half the time we
	 * will use a buffer from the vnode_hold_list, and half the time
	 * we will allocate a new one unless the list has grown to twice
	 * the desired size.
	 * We are reticent to recycle vnodes from the vnode_hold_list
	 * because we will lose the identity of all its referencing buffers.
	 */

	vp = NULL;

	mutex_enter(&vnode_free_list_lock);

	toggle ^= 1;
	if (numvnodes > 2 * desiredvnodes)
		toggle = 0;

	tryalloc = numvnodes < desiredvnodes ||
	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
	    (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));

	if (tryalloc) {
		/* Allocate a new vnode. */
		numvnodes++;
		mutex_exit(&vnode_free_list_lock);
		if ((vp = vnalloc(NULL)) == NULL) {
			mutex_enter(&vnode_free_list_lock);
			numvnodes--;
		} else
			vp->v_usecount = 1;
	}

	if (vp == NULL) {
		/* Recycle and get vnode clean. */
		vp = getcleanvnode();
		if (vp == NULL) {
			if (mp != NULL) {
				vfs_unbusy(mp, false, NULL);
			}
			if (tryalloc) {
				printf("WARNING: unable to allocate new "
				    "vnode, retrying...\n");
				kpause("newvn", false, hz, NULL);
				goto try_again;
			}
			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
			*vpp = NULL;
			return ENFILE;
		}
		vp->v_iflag = 0;
		vp->v_vflag = 0;
		vp->v_uflag = 0;
		vp->v_socket = NULL;
	}

	KASSERT(vp->v_usecount == 1);
	KASSERT(vp->v_freelisthd == NULL);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));

	/* Initialize vnode. */
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	vp->v_data = NULL;

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	/* Finally, move vnode into the mount queue. */
	vfs_insmntque(vp, mp);

	if (mp != NULL) {
		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
			vp->v_vflag |= VV_MPSAFE;
		vfs_unbusy(mp, true, NULL);
	}

	*vpp = vp;
	return 0;
}

/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 1);
	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_freelisthd == NULL);

	mutex_enter(&vp->v_interlock);
	vp->v_iflag |= VI_CLEAN;
	vrelel(vp, 0);
}

/*
 * Remove a vnode from its freelist.
 */
void
vremfree(vnode_t *vp)
{

	KASSERT(mutex_owned(&vp->v_interlock));
	KASSERT(vp->v_usecount == 0);

	/*
	 * Note that the reference count must not change until
	 * the vnode is removed.
	 */
	mutex_enter(&vnode_free_list_lock);
	if (vp->v_holdcnt > 0) {
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
	} else {
		KASSERT(vp->v_freelisthd == &vnode_free_list);
	}
	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);
}

/*
 * Try to gain a reference to a vnode, without acquiring its interlock.
 * The caller must hold a lock that will prevent the vnode from being
 * recycled or freed.
 */
bool
vtryget(vnode_t *vp)
{
	u_int use, next;

	/*
	 * If the vnode is being freed, don't make life any harder
	 * for vclean() by adding another reference without waiting.
	 * This is not strictly necessary, but we'll do it anyway.
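	 *
	 * The reference itself is taken with a compare-and-swap loop, so
	 * no interlock is needed unless the count is zero or the VC_XLOCK
	 * bit is set.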
	 */
	if (__predict_false((vp->v_iflag & VI_XLOCK) != 0)) {
		return false;
	}
	for (use = vp->v_usecount;; use = next) {
		if (use == 0 || __predict_false((use & VC_XLOCK) != 0)) {
			/* Need interlock held if first reference. */
			return false;
		}
		next = atomic_cas_uint(&vp->v_usecount, use, use + 1);
		if (__predict_true(next == use)) {
			return true;
		}
	}
}

/*
 * vget: get a particular vnode from the free list, increment its reference
 * count and lock it.
 *
 * => Should be called with v_interlock held.
 *
 * If VI_XLOCK is set, the vnode is being eliminated in vgone()/vclean().
 * In that case, we cannot grab the vnode, so the process is awakened when
 * the transition is completed, and an error returned to indicate that the
 * vnode is no longer usable (e.g. changed to a new file system type).
 */
int
vget(vnode_t *vp, int flags)
{
	int error = 0;

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(mutex_owned(&vp->v_interlock));
	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT)) == 0);

	/*
	 * Before adding a reference, we must remove the vnode
	 * from its freelist.
	 */
	if (vp->v_usecount == 0) {
		vremfree(vp);
		vp->v_usecount = 1;
	} else {
		atomic_inc_uint(&vp->v_usecount);
	}

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking if
	 * the VI_XLOCK flag is set.
	 */
	if ((vp->v_iflag & VI_XLOCK) != 0) {
		if ((flags & LK_NOWAIT) != 0) {
			vrelel(vp, 0);
			return EBUSY;
		}
		vwait(vp, VI_XLOCK);
		vrelel(vp, 0);
		return ENOENT;
	}

	/*
	 * Ok, we got it in good shape.  Just locking left.
	 */
	KASSERT((vp->v_iflag & VI_CLEAN) == 0);
	mutex_exit(&vp->v_interlock);
	if (flags & (LK_EXCLUSIVE | LK_SHARED)) {
		error = vn_lock(vp, flags);
		if (error != 0) {
			vrele(vp);
		}
	}
	return error;
}

/*
 * vput: unlock and release the reference.
 */
void
vput(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	VOP_UNLOCK(vp);
	vrele(vp);
}

/*
 * Try to drop a reference on a vnode.  Abort if we are releasing the
 * last reference.  Note: this _must_ succeed if not the last reference.
 */
static inline bool
vtryrele(vnode_t *vp)
{
	u_int use, next;

	for (use = vp->v_usecount;; use = next) {
		if (use == 1) {
			return false;
		}
		KASSERT((use & VC_MASK) > 1);
		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
		if (__predict_true(next == use)) {
			return true;
		}
	}
}

/*
 * Vnode release.  If the reference count drops to zero, call the inactive
 * routine and either return the vnode to the freelist or free it to the pool.
 */
void
vrelel(vnode_t *vp, int flags)
{
	bool recycle, defer;
	int error;

	KASSERT(mutex_owned(&vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_freelisthd == NULL);

	if (__predict_false(vp->v_op == dead_vnodeop_p &&
	    (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
		vpanic(vp, "dead but not clean");
	}

	/*
	 * If not the last reference, just drop the reference count
	 * and unlock.
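	 * vtryrele() fails only when the observed count is 1, i.e. when
	 * we are holding the last reference.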
	 */
	if (vtryrele(vp)) {
		vp->v_iflag |= VI_INACTREDO;
		mutex_exit(&vp->v_interlock);
		return;
	}
	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
		vpanic(vp, "vrelel: bad ref count");
	}

	KASSERT((vp->v_iflag & VI_XLOCK) == 0);

	/*
	 * If not clean, deactivate the vnode, but preserve
	 * our reference across the call to VOP_INACTIVE().
	 */
retry:
	if ((vp->v_iflag & VI_CLEAN) == 0) {
		recycle = false;
		vp->v_iflag |= VI_INACTNOW;

		/*
		 * XXX This ugly block can be largely eliminated if
		 * locking is pushed down into the file systems.
		 *
		 * Defer vnode release to vrele_thread if caller
		 * requests it explicitly.
		 */
		if ((curlwp == uvm.pagedaemon_lwp) ||
		    (flags & VRELEL_ASYNC_RELE) != 0) {
			/* The pagedaemon can't wait around; defer. */
			defer = true;
		} else if (curlwp == vrele_lwp) {
			/* We have to try harder. */
			vp->v_iflag &= ~VI_INACTREDO;
			mutex_exit(&vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE);
			if (error != 0) {
				/* XXX */
				vpanic(vp, "vrele: unable to lock the vnode");
			}
			defer = false;
		} else if ((vp->v_iflag & VI_LAYER) != 0) {
			/*
			 * Acquiring the stack's lock in vclean() even
			 * for an honest vput/vrele is dangerous because
			 * our caller may hold other vnode locks; defer.
			 */
			defer = true;
		} else {
			/* If we can't acquire the lock, then defer. */
			vp->v_iflag &= ~VI_INACTREDO;
			mutex_exit(&vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
			if (error != 0) {
				defer = true;
				mutex_enter(&vp->v_interlock);
			} else {
				defer = false;
			}
		}

		if (defer) {
			/*
			 * Defer reclaim to the kthread; it's not safe to
			 * clean it here.  We donate it our last reference.
			 */
			KASSERT(mutex_owned(&vp->v_interlock));
			KASSERT((vp->v_iflag & VI_INACTPEND) == 0);
			vp->v_iflag &= ~VI_INACTNOW;
			vp->v_iflag |= VI_INACTPEND;
			mutex_enter(&vrele_lock);
			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
			if (++vrele_pending > (desiredvnodes >> 8))
				cv_signal(&vrele_cv);
			mutex_exit(&vrele_lock);
			mutex_exit(&vp->v_interlock);
			return;
		}

#ifdef DIAGNOSTIC
		if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
		    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
			vprint("vrelel: missing VOP_CLOSE()", vp);
		}
#endif

		/*
		 * The vnode can gain another reference while being
		 * deactivated.  If VOP_INACTIVE() indicates that
		 * the described file has been deleted, then recycle
		 * the vnode irrespective of additional references.
		 * Another thread may be waiting to re-use the on-disk
		 * inode.
		 *
		 * Note that VOP_INACTIVE() will drop the vnode lock.
		 */
		VOP_INACTIVE(vp, &recycle);
		mutex_enter(&vp->v_interlock);
		vp->v_iflag &= ~VI_INACTNOW;
		if (!recycle) {
			if (vtryrele(vp)) {
				mutex_exit(&vp->v_interlock);
				return;
			}

			/*
			 * If we grew another reference while
			 * VOP_INACTIVE() was underway, retry.
			 */
			if ((vp->v_iflag & VI_INACTREDO) != 0) {
				goto retry;
			}
		}

		/*
		 * Take care of space accounting.
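		 * An executable mapping is going away, so the vnode's pages
		 * are moved from the executable-page counter back to the
		 * plain file-page counter.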
		 */
		if (vp->v_iflag & VI_EXECMAP) {
			atomic_add_int(&uvmexp.execpages,
			    -vp->v_uobj.uo_npages);
			atomic_add_int(&uvmexp.filepages,
			    vp->v_uobj.uo_npages);
		}
		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
		vp->v_vflag &= ~VV_MAPPED;

		/*
		 * Recycle the vnode if the file is now unused (unlinked),
		 * otherwise just free it.
		 */
		if (recycle) {
			vclean(vp, DOCLOSE);
		}
		KASSERT(vp->v_usecount > 0);
	}

	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
		/* Gained another reference while being reclaimed. */
		mutex_exit(&vp->v_interlock);
		return;
	}

	if ((vp->v_iflag & VI_CLEAN) != 0) {
		/*
		 * It's clean so destroy it.  It isn't referenced
		 * anywhere since it has been reclaimed.
		 */
		KASSERT(vp->v_holdcnt == 0);
		KASSERT(vp->v_writecount == 0);
		mutex_exit(&vp->v_interlock);
		vfs_insmntque(vp, NULL);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vnfree(vp);
	} else {
		/*
		 * Otherwise, put it back onto the freelist.  It
		 * can't be destroyed while still associated with
		 * a file system.
		 */
		mutex_enter(&vnode_free_list_lock);
		if (vp->v_holdcnt > 0) {
			vp->v_freelisthd = &vnode_hold_list;
		} else {
			vp->v_freelisthd = &vnode_free_list;
		}
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
		mutex_exit(&vp->v_interlock);
	}
}

void
vrele(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
		return;
	}
	mutex_enter(&vp->v_interlock);
	vrelel(vp, 0);
}

/*
 * Asynchronous vnode release: the vnode is released in a different context.
 */
void
vrele_async(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
		return;
	}
	mutex_enter(&vp->v_interlock);
	vrelel(vp, VRELEL_ASYNC_RELE);
}

static void
vrele_thread(void *cookie)
{
	vnode_t *vp;

	for (;;) {
		mutex_enter(&vrele_lock);
		while (TAILQ_EMPTY(&vrele_list)) {
			vrele_gen++;
			cv_broadcast(&vrele_cv);
			cv_timedwait(&vrele_cv, &vrele_lock, hz);
		}
		vp = TAILQ_FIRST(&vrele_list);
		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
		vrele_pending--;
		mutex_exit(&vrele_lock);

		/*
		 * If not the last reference, then ignore the vnode
		 * and look for more work.
		 */
		mutex_enter(&vp->v_interlock);
		KASSERT((vp->v_iflag & VI_INACTPEND) != 0);
		vp->v_iflag &= ~VI_INACTPEND;
		vrelel(vp, 0);
	}
}

void
vrele_flush(void)
{
	int gen;

	mutex_enter(&vrele_lock);
	gen = vrele_gen;
	while (vrele_pending && gen == vrele_gen) {
		cv_broadcast(&vrele_cv);
		cv_wait(&vrele_cv, &vrele_lock);
	}
	mutex_exit(&vrele_lock);
}

/*
 * Vnode reference, where a reference is already held by some other
 * object (for example, a file structure).
 */
void
vref(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	atomic_inc_uint(&vp->v_usecount);
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
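 *
 * Taking the first hold on an otherwise unreferenced vnode moves it from
 * the free list to the hold list.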
 */
void
vholdl(vnode_t *vp)
{

	KASSERT(mutex_owned(&vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_free_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_hold_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(vnode_t *vp)
{

	KASSERT(mutex_owned(&vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt <= 0) {
		vpanic(vp, "holdrelel: bad holdcnt");
	}

	vp->v_holdcnt--;
	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_free_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the interlock held, and will return with it held.
 */
void
vclean(vnode_t *vp, int flags)
{
	lwp_t *l = curlwp;
	bool recycle, active;
	int error;

	KASSERT(mutex_owned(&vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	/* If cleaning is already in progress, wait until done and return. */
	if (vp->v_iflag & VI_XLOCK) {
		vwait(vp, VI_XLOCK);
		return;
	}

	/* If already clean, nothing to do. */
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		return;
	}

	/*
	 * Prevent the vnode from being recycled or brought into use
	 * while we clean it out.
	 */
	vp->v_iflag |= VI_XLOCK;
	if (vp->v_iflag & VI_EXECMAP) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
	active = (vp->v_usecount & VC_MASK) > 1;

	/* XXXAD should not lock vnode under layer */
	mutex_exit(&vp->v_interlock);
	VOP_LOCK(vp, LK_EXCLUSIVE);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (flags & DOCLOSE) {
		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
		if (error != 0) {
			/* XXX, fix vn_start_write's grab of mp and use that. */

			if (wapbl_vphaswapbl(vp))
				WAPBL_DISCARD(wapbl_vptomp(vp));
			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
		}
		KASSERT(error == 0);
		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
			spec_node_revoke(vp);
		}
	}
	if (active) {
		VOP_INACTIVE(vp, &recycle);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VI_XLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp);
	}

	/*
	 * Disassociate the underlying file system from the vnode.
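	 * VOP_RECLAIM(9) is expected to release the file system specific
	 * data, leaving v_data NULL.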
	 */
	if (VOP_RECLAIM(vp)) {
		vpanic(vp, "vclean: cannot reclaim");
	}

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);

	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}

	/* Purge name cache. */
	cache_purge(vp);

	/* Done with purge, notify sleepers of the grim news. */
	mutex_enter(&vp->v_interlock);
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	KNOTE(&vp->v_klist, NOTE_REVOKE);
	vp->v_iflag &= ~VI_XLOCK;
	vp->v_vflag &= ~VV_LOCKSWORK;
	if ((flags & DOCLOSE) != 0) {
		vp->v_iflag |= VI_CLEAN;
	}
	cv_broadcast(&vp->v_cv);

	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	mutex_enter(&vp->v_interlock);
	if (vp->v_usecount != 0) {
		mutex_exit(&vp->v_interlock);
		return 0;
	}
	if (inter_lkp) {
		mutex_exit(inter_lkp);
	}
	vremfree(vp);
	vp->v_usecount = 1;
	vclean(vp, DOCLOSE);
	vrelel(vp, 0);
	return 1;
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vrevoke(vnode_t *vp)
{
	vnode_t *vq, **vpp;
	enum vtype type;
	dev_t dev;

	KASSERT(vp->v_usecount > 0);

	mutex_enter(&vp->v_interlock);
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		mutex_exit(&vp->v_interlock);
		return;
	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
		atomic_inc_uint(&vp->v_usecount);
		vclean(vp, DOCLOSE);
		vrelel(vp, 0);
		return;
	} else {
		dev = vp->v_rdev;
		type = vp->v_type;
		mutex_exit(&vp->v_interlock);
	}

	vpp = &specfs_hash[SPECHASH(dev)];
	mutex_enter(&device_lock);
	for (vq = *vpp; vq != NULL;) {
		/* If clean or being cleaned, then ignore it. */
		mutex_enter(&vq->v_interlock);
		if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 ||
		    vq->v_rdev != dev || vq->v_type != type) {
			mutex_exit(&vq->v_interlock);
			vq = vq->v_specnext;
			continue;
		}
		mutex_exit(&device_lock);
		if (vq->v_usecount == 0) {
			vremfree(vq);
			vq->v_usecount = 1;
		} else {
			atomic_inc_uint(&vq->v_usecount);
		}
		vclean(vq, DOCLOSE);
		vrelel(vq, 0);
		mutex_enter(&device_lock);
		vq = *vpp;
	}
	mutex_exit(&device_lock);
}

/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{

	mutex_enter(&vp->v_interlock);
	vclean(vp, DOCLOSE);
	vrelel(vp, 0);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
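 * Called with the buffer's object lock held; for a vnode-backed buffer
 * that lock is the vnode's v_interlock.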
 */
void
vwakeup(struct buf *bp)
{
	vnode_t *vp;

	if ((vp = bp->b_vp) == NULL)
		return;

	KASSERT(bp->b_objlock == &vp->v_interlock);
	KASSERT(mutex_owned(bp->b_objlock));

	if (--vp->v_numoutput < 0)
		panic("vwakeup: neg numoutput, vp %p", vp);
	if (vp->v_numoutput == 0)
		cv_broadcast(&vp->v_cv);
}

/*
 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
 * recycled.
 */
void
vwait(vnode_t *vp, int flags)
{

	KASSERT(mutex_owned(&vp->v_interlock));
	KASSERT(vp->v_usecount != 0);

	while ((vp->v_iflag & flags) != 0)
		cv_wait(&vp->v_cv, &vp->v_interlock);
}

int
vfs_drainvnodes(long target)
{

	while (numvnodes > target) {
		vnode_t *vp;

		mutex_enter(&vnode_free_list_lock);
		vp = getcleanvnode();
		if (vp == NULL) {
			return EBUSY;
		}
		ungetnewvnode(vp);
	}
	return 0;
}

static void
vpanic(vnode_t *vp, const char *msg)
{
#ifdef DIAGNOSTIC
	vprint(NULL, vp);
	panic("%s\n", msg);
#endif
}