/*	$NetBSD: vfs_vnode.c,v 1.26 2013/11/23 13:46:22 hannken Exp $	*/

/*-
 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * The vnode cache subsystem.
 *
 * Life-cycle
 *
 *	Normally, there are two points where new vnodes are created:
 *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 *	starts in one of the following ways:
 *
 *	- Allocation, via getnewvnode(9) and/or vnalloc(9).
 *	- Reclamation of an inactive vnode, via vget(9).
 *
 *	Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
 *	was another, traditional way.  Currently, only the draining thread
 *	recycles vnodes.  This behaviour might be revisited.
 *
 *	The life-cycle ends when the last reference is dropped, usually
 *	in VOP_REMOVE(9).  In that case, VOP_INACTIVE(9) is called to inform
 *	the file system that the vnode is inactive.  Via this call, the file
 *	system indicates whether the vnode can be recycled (usually, it
 *	checks its own references, e.g. count of links, whether the file
 *	was removed).
 *
 *	Depending on the indication, the vnode can be put onto a free list
 *	(cache), or cleaned via vclean(9), which calls VOP_RECLAIM(9) to
 *	disassociate the underlying file system from the vnode, and finally
 *	destroyed.
 *
 * Reference counting
 *
 *	A vnode is considered active if its reference count
 *	(vnode_t::v_usecount) is non-zero.  The count is maintained using
 *	the vref(9), vrele(9) and vput(9) routines.  Common points holding
 *	references are e.g. file openings, the current working directory,
 *	mount points, etc.
 *
 * Note on v_usecount and its locking
 *
 *	At nearly all points where it is known that v_usecount could be
 *	zero, the vnode_t::v_interlock will be held.  To change v_usecount
 *	away from zero, the interlock must be held.  To change from a
 *	non-zero value to zero, again the interlock must be held.
 *
 *	Changing the usecount from a non-zero value to a non-zero value can
 *	safely be done using atomic operations, without the interlock held.
 *
 *	Note: if VI_CLEAN is set, vnode_t::v_interlock will be released while
 *	mntvnode_lock is still held.
 *
 *	See PR 41374.
 */

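/*
 * Illustrative sketch only (not part of the implementation): a typical
 * consumer follows the protocol above by performing the zero <-> non-zero
 * transition under the interlock (which vget(9) does on its behalf) and
 * by dropping the reference with vput(9)/vrele(9):
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vget(vp, LK_EXCLUSIVE);
 *	if (error == 0) {
 *		... use the locked, referenced vnode ...
 *		vput(vp);
 *	}
 *
 * vget() consumes the interlock and returns the vnode locked and
 * referenced; vput() unlocks it and drops the reference.
 */
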
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.26 2013/11/23 13:46:22 hannken Exp $");

#define _VFS_VNODE_PRIVATE

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>
#include <sys/fstrans.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>

/* Flags to vrelel. */
#define	VRELEL_ASYNC_RELE	0x0001	/* Always defer to vrele thread. */
#define	VRELEL_CHANGING_SET	0x0002	/* VI_CHANGING set by caller. */

u_int			numvnodes		__cacheline_aligned;

static pool_cache_t	vnode_cache		__read_mostly;

/*
 * There are two free lists: one is for vnodes which have no buffer/page
 * references and one for those which do (i.e. v_holdcnt is non-zero).
 * The vnode recycling mechanism first attempts to look into the former list.
 */
static kmutex_t		vnode_free_list_lock	__cacheline_aligned;
static vnodelst_t	vnode_free_list		__cacheline_aligned;
static vnodelst_t	vnode_hold_list		__cacheline_aligned;
static kcondvar_t	vdrain_cv		__cacheline_aligned;

static vnodelst_t	vrele_list		__cacheline_aligned;
static kmutex_t		vrele_lock		__cacheline_aligned;
static kcondvar_t	vrele_cv		__cacheline_aligned;
static lwp_t *		vrele_lwp		__cacheline_aligned;
static int		vrele_pending		__cacheline_aligned;
static int		vrele_gen		__cacheline_aligned;

static int		cleanvnode(void);
static void		vclean(vnode_t *);
static void		vrelel(vnode_t *, int);
static void		vdrain_thread(void *);
static void		vrele_thread(void *);
static void		vnpanic(vnode_t *, const char *, ...)
    __printflike(2, 3);

/* Routines having to do with the management of the vnode table. */
extern int		(**dead_vnodeop_p)(void *);

void
vfs_vnode_sysinit(void)
{
	int error __diagused;

	vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl",
	    NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vnode_cache != NULL);

	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vrele_list);

	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vdrain_cv, "vdrain");
	cv_init(&vrele_cv, "vrele");
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
	    NULL, NULL, "vdrain");
	KASSERT(error == 0);
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
	    NULL, &vrele_lwp, "vrele");
	KASSERT(error == 0);
}

/*
 * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
 * marker vnode.
 */
vnode_t *
vnalloc(struct mount *mp)
{
	vnode_t *vp;

	vp = pool_cache_get(vnode_cache, PR_WAITOK);
	KASSERT(vp != NULL);

	memset(vp, 0, sizeof(*vp));
	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
	cv_init(&vp->v_cv, "vnode");
	/*
	 * Done by memset() above.
	 *	LIST_INIT(&vp->v_nclist);
	 *	LIST_INIT(&vp->v_dnclist);
	 */

	if (mp != NULL) {
		vp->v_mount = mp;
		vp->v_type = VBAD;
		vp->v_iflag = VI_MARKER;
	} else {
		rw_init(&vp->v_lock);
	}

	return vp;
}

/*
 * Free an unused, unreferenced vnode.
 */
void
vnfree(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 0);

	if ((vp->v_iflag & VI_MARKER) == 0) {
		rw_destroy(&vp->v_lock);
		mutex_enter(&vnode_free_list_lock);
		numvnodes--;
		mutex_exit(&vnode_free_list_lock);
	}

	/*
	 * Note: the vnode interlock will either be freed, or its reference
	 * dropped (if VI_LOCKSHARE was in use).
	 */
	uvm_obj_destroy(&vp->v_uobj, true);
	cv_destroy(&vp->v_cv);
	pool_cache_put(vnode_cache, vp);
}

/*
 * cleanvnode: grab a vnode from the freelist, clean and free it.
 *
 * => Releases vnode_free_list_lock.
 */
static int
cleanvnode(void)
{
	vnode_t *vp;
	vnodelst_t *listhd;
	struct mount *mp;

	KASSERT(mutex_owned(&vnode_free_list_lock));

	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		/*
		 * It's safe to test v_usecount and v_iflag
		 * without holding the interlock here: vnodes
		 * with a non-zero use count or with VI_CLEAN
		 * set should never appear on these lists.
		 */
		KASSERT(vp->v_usecount == 0);
		KASSERT((vp->v_iflag & VI_CLEAN) == 0);
		KASSERT(vp->v_freelisthd == listhd);

		if (!mutex_tryenter(vp->v_interlock))
			continue;
		if ((vp->v_iflag & VI_XLOCK) != 0) {
			mutex_exit(vp->v_interlock);
			continue;
		}
		mp = vp->v_mount;
		if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) {
			mutex_exit(vp->v_interlock);
			continue;
		}
		break;
	}

	if (vp == NULL) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		mutex_exit(&vnode_free_list_lock);
		return EBUSY;
	}

	/* Remove it from the freelist. */
	TAILQ_REMOVE(listhd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);

	KASSERT(vp->v_usecount == 0);

	/*
	 * The vnode is still associated with a file system, so we must
	 * clean it out before freeing it.  We need to add a reference
	 * before doing this.
	 */
	vp->v_usecount = 1;
	KASSERT((vp->v_iflag & VI_CHANGING) == 0);
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
	fstrans_done(mp);

	return 0;
}

/*
 * getnewvnode: return a fresh vnode.
 *
 * => Returns referenced vnode, moved into the mount queue.
 * => Shares the interlock specified by 'slock', if it is not NULL.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    kmutex_t *slock, vnode_t **vpp)
{
	struct uvm_object *uobj __diagused;
	vnode_t *vp;
	int error = 0;

	if (mp != NULL) {
		/*
		 * Mark filesystem busy while we are creating a vnode.
		 * If unmount is in progress, this will fail.
		 */
		error = vfs_busy(mp, NULL);
		if (error)
			return error;
	}

	vp = NULL;

	/* Allocate a new vnode. */
	mutex_enter(&vnode_free_list_lock);
	numvnodes++;
	if (numvnodes > desiredvnodes + desiredvnodes / 10)
		cv_signal(&vdrain_cv);
	mutex_exit(&vnode_free_list_lock);
	vp = vnalloc(NULL);

	KASSERT(vp->v_freelisthd == NULL);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));

	/* Initialize vnode. */
	vp->v_usecount = 1;
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	vp->v_data = NULL;

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	/* Share the vnode_t::v_interlock, if requested. */
	if (slock) {
		/* Set the interlock and mark that it is shared. */
		KASSERT(vp->v_mount == NULL);
		mutex_obj_hold(slock);
		uvm_obj_setlock(&vp->v_uobj, slock);
		KASSERT(vp->v_interlock == slock);
		vp->v_iflag |= VI_LOCKSHARE;
	}

	/* Finally, move vnode into the mount queue. */
	vfs_insmntque(vp, mp);

	if (mp != NULL) {
		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
			vp->v_vflag |= VV_MPSAFE;
		vfs_unbusy(mp, true, NULL);
	}

	*vpp = vp;
	return 0;
}

/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 1);
	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_freelisthd == NULL);

	mutex_enter(vp->v_interlock);
	vp->v_iflag |= VI_CLEAN;
	vrelel(vp, 0);
}

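/*
 * Illustrative sketch only (hypothetical file system code, not part of
 * this file): a VFS_VGET-style routine typically allocates with
 * getnewvnode() and, if it loses a race inserting the node into its own
 * cache, gives the fresh vnode straight back:
 *
 *	error = getnewvnode(VT_NON, mp, example_vnodeop_p, NULL, &vp);
 *	if (error)
 *		return error;
 *	if (example_hash_insert(mp, ino, vp) != 0) {
 *		ungetnewvnode(vp);
 *		goto retry;
 *	}
 *
 * Here "example_vnodeop_p", "example_hash_insert", "ino" and "retry" are
 * placeholders for the file system's own operations vector, node cache
 * and retry logic; VT_NON stands in for the file system's own tag.
 */
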
/*
 * Helper thread to keep the number of vnodes below desiredvnodes.
 */
static void
vdrain_thread(void *cookie)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	for (;;) {
		cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
		while (numvnodes > desiredvnodes) {
			error = cleanvnode();
			if (error)
				kpause("vndsbusy", false, hz, NULL);
			mutex_enter(&vnode_free_list_lock);
			if (error)
				break;
		}
	}
}

/*
 * Remove a vnode from its freelist.
 */
void
vremfree(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount == 0);

	/*
	 * Note that the reference count must not change until
	 * the vnode is removed.
	 */
	mutex_enter(&vnode_free_list_lock);
	if (vp->v_holdcnt > 0) {
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
	} else {
		KASSERT(vp->v_freelisthd == &vnode_free_list);
	}
	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);
}

/*
 * vget: get a particular vnode from the free list, increment its reference
 * count and lock it.
 *
 * => Should be called with v_interlock held.
 *
 * If VI_CHANGING is set, the vnode may be eliminated in vgone()/vclean().
 * In that case, we cannot grab the vnode, so the process is awakened when
 * the transition is completed, and an error returned to indicate that the
 * vnode is no longer usable.
 */
int
vget(vnode_t *vp, int flags)
{
	int error = 0;

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT)) == 0);

	/*
	 * Before adding a reference, we must remove the vnode
	 * from its freelist.
	 */
	if (vp->v_usecount == 0) {
		vremfree(vp);
		vp->v_usecount = 1;
	} else {
		atomic_inc_uint(&vp->v_usecount);
	}

	/*
	 * If the vnode is in the process of changing state we wait
	 * for the change to complete and take care not to return
	 * a clean vnode.
	 */
	if ((vp->v_iflag & VI_CHANGING) != 0) {
		if ((flags & LK_NOWAIT) != 0) {
			vrelel(vp, 0);
			return EBUSY;
		}
		vwait(vp, VI_CHANGING);
		if ((vp->v_iflag & VI_CLEAN) != 0) {
			vrelel(vp, 0);
			return ENOENT;
		}
	}

	/*
	 * Ok, we got it in good shape.  Just locking left.
	 */
	KASSERT((vp->v_iflag & VI_CLEAN) == 0);
	mutex_exit(vp->v_interlock);
	if (flags & (LK_EXCLUSIVE | LK_SHARED)) {
		error = vn_lock(vp, flags);
		if (error != 0) {
			vrele(vp);
		}
	}
	return error;
}

/*
 * vput: unlock and release the reference.
 */
void
vput(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	VOP_UNLOCK(vp);
	vrele(vp);
}

/*
 * Try to drop a reference on a vnode.  Abort if we are releasing the
 * last reference.  Note: this _must_ succeed if not the last reference.
 */
static inline bool
vtryrele(vnode_t *vp)
{
	u_int use, next;

	for (use = vp->v_usecount;; use = next) {
		if (use == 1) {
			return false;
		}
		KASSERT(use > 1);
		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
		if (__predict_true(next == use)) {
			return true;
		}
	}
}

/*
 * Vnode release.  If the reference count drops to zero, call the inactive
 * routine and either return the vnode to the freelist or free it to the
 * pool.
 */
static void
vrelel(vnode_t *vp, int flags)
{
	bool recycle, defer;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_freelisthd == NULL);

	if (__predict_false(vp->v_op == dead_vnodeop_p &&
	    (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
		vnpanic(vp, "dead but not clean");
	}

	/*
	 * If not the last reference, just drop the reference count
	 * and unlock.
	 */
	if (vtryrele(vp)) {
		if ((flags & VRELEL_CHANGING_SET) != 0) {
			KASSERT((vp->v_iflag & VI_CHANGING) != 0);
			vp->v_iflag &= ~VI_CHANGING;
			cv_broadcast(&vp->v_cv);
		}
		mutex_exit(vp->v_interlock);
		return;
	}
	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
		vnpanic(vp, "%s: bad ref count", __func__);
	}

	KASSERT((vp->v_iflag & VI_XLOCK) == 0);
	if ((flags & VRELEL_CHANGING_SET) == 0) {
		KASSERT((vp->v_iflag & VI_CHANGING) == 0);
		vp->v_iflag |= VI_CHANGING;
	}

#ifdef DIAGNOSTIC
	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
		vprint("vrelel: missing VOP_CLOSE()", vp);
	}
#endif

	/*
	 * If not clean, deactivate the vnode, but preserve
	 * our reference across the call to VOP_INACTIVE().
	 */
	if ((vp->v_iflag & VI_CLEAN) == 0) {
		recycle = false;

		/*
		 * XXX This ugly block can be largely eliminated if
		 * locking is pushed down into the file systems.
		 *
		 * Defer vnode release to vrele_thread if caller
		 * requests it explicitly.
		 */
		if ((curlwp == uvm.pagedaemon_lwp) ||
		    (flags & VRELEL_ASYNC_RELE) != 0) {
			/* The pagedaemon can't wait around; defer. */
			defer = true;
		} else if (curlwp == vrele_lwp) {
			/*
			 * We have to try harder.
			 */
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE);
			if (error != 0) {
				/* XXX */
				vnpanic(vp, "%s: unable to lock %p",
				    __func__, vp);
			}
			mutex_enter(vp->v_interlock);
			/*
			 * If we did get another reference while
			 * sleeping, don't try to inactivate it yet.
			 */
			if (__predict_false(vtryrele(vp))) {
				VOP_UNLOCK(vp);
				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
				vp->v_iflag &= ~VI_CHANGING;
				cv_broadcast(&vp->v_cv);
				mutex_exit(vp->v_interlock);
				return;
			}
			mutex_exit(vp->v_interlock);
			defer = false;
		} else if ((vp->v_iflag & VI_LAYER) != 0) {
			/*
			 * Acquiring the stack's lock in vclean() even
			 * for an honest vput/vrele is dangerous because
			 * our caller may hold other vnode locks; defer.
			 */
			defer = true;
		} else {
			/* If we can't acquire the lock, then defer. */
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
			if (error != 0) {
				defer = true;
				mutex_enter(vp->v_interlock);
			} else {
				defer = false;
			}
		}

		if (defer) {
			/*
			 * Defer reclaim to the kthread; it's not safe to
			 * clean it here.  We donate it our last reference.
			 */
			KASSERT(mutex_owned(vp->v_interlock));
			KASSERT((vp->v_iflag & VI_CHANGING) != 0);
			vp->v_iflag &= ~VI_CHANGING;
			mutex_enter(&vrele_lock);
			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
			if (++vrele_pending > (desiredvnodes >> 8))
				cv_signal(&vrele_cv);
			mutex_exit(&vrele_lock);
			cv_broadcast(&vp->v_cv);
			mutex_exit(vp->v_interlock);
			return;
		}

		/*
		 * The vnode can gain another reference while being
		 * deactivated.  If VOP_INACTIVE() indicates that
		 * the described file has been deleted, then recycle
		 * the vnode irrespective of additional references.
		 * Another thread may be waiting to re-use the on-disk
		 * inode.
		 *
		 * Note that VOP_INACTIVE() will drop the vnode lock.
		 */
		VOP_INACTIVE(vp, &recycle);
		mutex_enter(vp->v_interlock);
		if (!recycle) {
			if (vtryrele(vp)) {
				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
				vp->v_iflag &= ~VI_CHANGING;
				cv_broadcast(&vp->v_cv);
				mutex_exit(vp->v_interlock);
				return;
			}
		}

		/* Take care of space accounting. */
		if (vp->v_iflag & VI_EXECMAP) {
			atomic_add_int(&uvmexp.execpages,
			    -vp->v_uobj.uo_npages);
			atomic_add_int(&uvmexp.filepages,
			    vp->v_uobj.uo_npages);
		}
		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
		vp->v_vflag &= ~VV_MAPPED;

		/*
		 * Recycle the vnode if the file is now unused (unlinked),
		 * otherwise just free it.
		 */
		if (recycle) {
			vclean(vp);
		}
		KASSERT(vp->v_usecount > 0);
	}

	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
		/* Gained another reference while being reclaimed. */
		KASSERT((vp->v_iflag & VI_CHANGING) != 0);
		vp->v_iflag &= ~VI_CHANGING;
		cv_broadcast(&vp->v_cv);
		mutex_exit(vp->v_interlock);
		return;
	}

	if ((vp->v_iflag & VI_CLEAN) != 0) {
		/*
		 * It's clean so destroy it.  It isn't referenced
		 * anywhere since it has been reclaimed.
		 */
		KASSERT(vp->v_holdcnt == 0);
		KASSERT(vp->v_writecount == 0);
		mutex_exit(vp->v_interlock);
		vfs_insmntque(vp, NULL);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vnfree(vp);
	} else {
		/*
		 * Otherwise, put it back onto the freelist.  It
		 * can't be destroyed while still associated with
		 * a file system.
		 */
		mutex_enter(&vnode_free_list_lock);
		if (vp->v_holdcnt > 0) {
			vp->v_freelisthd = &vnode_hold_list;
		} else {
			vp->v_freelisthd = &vnode_free_list;
		}
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
		KASSERT((vp->v_iflag & VI_CHANGING) != 0);
		vp->v_iflag &= ~VI_CHANGING;
		cv_broadcast(&vp->v_cv);
		mutex_exit(vp->v_interlock);
	}
}

void
vrele(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, 0);
}

/*
 * Asynchronous vnode release: the vnode is released in a different context.
 */
void
vrele_async(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, VRELEL_ASYNC_RELE);
}

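/*
 * Illustrative note (not part of the implementation): vrele_async() is the
 * variant to use when the caller cannot risk sleeping in VOP_INACTIVE() or
 * taking the vnode lock, for example while holding other locks.  Dropping
 * what might be the last reference then becomes:
 *
 *	vrele_async(vp);
 *
 * and the actual deactivation is performed later by the vrele thread.
 */
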
static void
vrele_thread(void *cookie)
{
	vnode_t *vp;

	for (;;) {
		mutex_enter(&vrele_lock);
		while (TAILQ_EMPTY(&vrele_list)) {
			vrele_gen++;
			cv_broadcast(&vrele_cv);
			cv_timedwait(&vrele_cv, &vrele_lock, hz);
		}
		vp = TAILQ_FIRST(&vrele_list);
		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
		vrele_pending--;
		mutex_exit(&vrele_lock);

		/*
		 * If not the last reference, then ignore the vnode
		 * and look for more work.
		 */
		mutex_enter(vp->v_interlock);
		vrelel(vp, 0);
	}
}

void
vrele_flush(void)
{
	int gen;

	mutex_enter(&vrele_lock);
	gen = vrele_gen;
	while (vrele_pending && gen == vrele_gen) {
		cv_broadcast(&vrele_cv);
		cv_wait(&vrele_cv, &vrele_lock);
	}
	mutex_exit(&vrele_lock);
}

/*
 * Vnode reference, where a reference is already held by some other
 * object (for example, a file structure).
 */
void
vref(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	atomic_inc_uint(&vp->v_usecount);
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
 */
void
vholdl(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_free_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_hold_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt <= 0) {
		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
	}

	vp->v_holdcnt--;
	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_free_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the interlock held, and will return with it held.
 */
static void
vclean(vnode_t *vp)
{
	lwp_t *l = curlwp;
	bool recycle, active, doclose;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	/* If cleaning is already in progress wait until done and return. */
	if (vp->v_iflag & VI_XLOCK) {
		vwait(vp, VI_XLOCK);
		return;
	}

	/* If already clean, nothing to do. */
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		return;
	}

	/*
	 * Prevent the vnode from being recycled or brought into use
	 * while we clean it out.
	 */
	vp->v_iflag |= VI_XLOCK;
	if (vp->v_iflag & VI_EXECMAP) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
	active = (vp->v_usecount > 1);

	/* XXXAD should not lock vnode under layer */
	mutex_exit(vp->v_interlock);
	VOP_LOCK(vp, LK_EXCLUSIVE);

	doclose = ! (active && vp->v_type == VBLK &&
	    spec_node_getmountedfs(vp) != NULL);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (doclose) {
		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
		if (error != 0) {
			if (wapbl_vphaswapbl(vp))
				WAPBL_DISCARD(wapbl_vptomp(vp));
			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
		}
		KASSERT(error == 0);
		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
			spec_node_revoke(vp);
		}
	}
	if (active) {
		VOP_INACTIVE(vp, &recycle);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VI_XLOCK to clear, then call the new lock
		 * operation.
		 */
		VOP_UNLOCK(vp);
	}

	/* Disassociate the underlying file system from the vnode. */
	if (VOP_RECLAIM(vp)) {
		vnpanic(vp, "%s: cannot reclaim", __func__);
	}

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);

	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}

	/* Purge name cache. */
	cache_purge(vp);

	/*
	 * The vnode isn't clean, but still resides on the mount list.
	 * Remove it.  XXX This is a bit dodgy.
	 */
	if (! doclose)
		vfs_insmntque(vp, NULL);

	/* Done with purge, notify sleepers of the grim news. */
	mutex_enter(vp->v_interlock);
	if (doclose) {
		vp->v_op = dead_vnodeop_p;
		vp->v_vflag |= VV_LOCKSWORK;
		vp->v_iflag |= VI_CLEAN;
	} else {
		vp->v_op = spec_vnodeop_p;
		vp->v_vflag &= ~VV_LOCKSWORK;
	}
	vp->v_tag = VT_NON;
	KNOTE(&vp->v_klist, NOTE_REVOKE);
	vp->v_iflag &= ~VI_XLOCK;
	cv_broadcast(&vp->v_cv);

	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vnode_t *vp, kmutex_t *inter_lkp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	mutex_enter(vp->v_interlock);
	if (vp->v_usecount != 0 || (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) != 0) {
		mutex_exit(vp->v_interlock);
		return 0;
	}
	if (inter_lkp) {
		mutex_exit(inter_lkp);
	}
	vremfree(vp);
	vp->v_usecount = 1;
	KASSERT((vp->v_iflag & VI_CHANGING) == 0);
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
	return 1;
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vrevoke(vnode_t *vp)
{
	vnode_t *vq;
	enum vtype type;
	dev_t dev;

	KASSERT(vp->v_usecount > 0);

	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		mutex_exit(vp->v_interlock);
		return;
	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
		atomic_inc_uint(&vp->v_usecount);
		mutex_exit(vp->v_interlock);
		vgone(vp);
		return;
	} else {
		dev = vp->v_rdev;
		type = vp->v_type;
		mutex_exit(vp->v_interlock);
	}

	while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
		vgone(vq);
	}
}

/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_CHANGING) != 0)
		vwait(vp, VI_CHANGING);
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(struct buf *bp)
{
	vnode_t *vp;

	if ((vp = bp->b_vp) == NULL)
		return;

	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(bp->b_objlock));

	if (--vp->v_numoutput < 0)
		vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
	if (vp->v_numoutput == 0)
		cv_broadcast(&vp->v_cv);
}

/*
 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
 * recycled.
 */
void
vwait(vnode_t *vp, int flags)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount != 0);

	while ((vp->v_iflag & flags) != 0)
		cv_wait(&vp->v_cv, vp->v_interlock);
}

int
vfs_drainvnodes(long target)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	while (numvnodes > target) {
		error = cleanvnode();
		if (error != 0)
			return error;
		mutex_enter(&vnode_free_list_lock);
	}

	mutex_exit(&vnode_free_list_lock);

	return 0;
}

void
vnpanic(vnode_t *vp, const char *fmt, ...)
{
	va_list ap;

#ifdef DIAGNOSTIC
	vprint(NULL, vp);
#endif
	va_start(ap, fmt);
	vpanic(fmt, ap);
	va_end(ap);
}