/*	$NetBSD: vfs_vnode.c,v 1.35 2014/03/24 13:42:40 hannken Exp $	*/

/*-
 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * The vnode cache subsystem.
 *
 * Life-cycle
 *
 *	Normally, there are two points where new vnodes are created:
 *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 *	starts in one of the following ways:
 *
 *	- Allocation, via getnewvnode(9) and/or vnalloc(9).
 *	- Reclamation of an inactive vnode, via vget(9).
 *
 *	Recycling a vnode from a free list, via getnewvnode(9) ->
 *	getcleanvnode(9), was another, traditional way.  Currently, only
 *	the draining thread recycles vnodes.  This behaviour might be
 *	revisited.
 *
 *	The life-cycle ends when the last reference is dropped, usually
 *	in VOP_REMOVE(9).  In that case, VOP_INACTIVE(9) is called to
 *	inform the file system that the vnode is inactive.  Via this call,
 *	the file system indicates whether the vnode can be recycled
 *	(usually, it checks its own references, e.g. the link count, or
 *	whether the file was removed).
 *
 *	Depending on that indication, the vnode is either put onto a free
 *	list (the cache), or cleaned via vclean(9), which calls
 *	VOP_RECLAIM(9) to disassociate the underlying file system from the
 *	vnode, and is finally destroyed.
 *
 * Reference counting
 *
 *	A vnode is considered active if its reference count
 *	(vnode_t::v_usecount) is non-zero.  The count is maintained by the
 *	vref(9) and vrele(9) routines, as well as vput(9).  Common points
 *	holding references are e.g. open files, the current working
 *	directory, mount points, etc.
 *
 * Note on v_usecount and its locking
 *
 *	At nearly all points where it is known that v_usecount could be
 *	zero, vnode_t::v_interlock will be held.  To change v_usecount away
 *	from zero, the interlock must be held.  To change from a non-zero
 *	value to zero, again the interlock must be held.
 *
 *	Changing the usecount from a non-zero value to a non-zero value can
 *	safely be done using atomic operations, without the interlock held.
 *
 *	Note: if VI_CLEAN is set, vnode_t::v_interlock will be released while
 *	mntvnode_lock is still held.
 *
 *	See PR 41374.
 */
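
/*
 * A minimal sketch of the rules above, mirroring what vget() and vref()
 * below actually do.  Any transition that may involve a zero count is
 * made under v_interlock; a non-zero -> non-zero transition may be done
 * with atomics alone:
 *
 *	mutex_enter(vp->v_interlock);
 *	if (vp->v_usecount == 0) {
 *		vremfree(vp);
 *		vp->v_usecount = 1;
 *	} else
 *		atomic_inc_uint(&vp->v_usecount);
 *	mutex_exit(vp->v_interlock);
 *
 *	atomic_inc_uint(&vp->v_usecount);
 *
 * where the bare atomic_inc_uint() is only legal while the caller already
 * holds a reference (cf. vref()).
 */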

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.35 2014/03/24 13:42:40 hannken Exp $");

#define _VFS_VNODE_PRIVATE

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>
#include <sys/fstrans.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>

/* Flags to vrelel. */
#define	VRELEL_ASYNC_RELE	0x0001	/* Always defer to vrele thread. */
#define	VRELEL_CHANGING_SET	0x0002	/* VI_CHANGING set by caller. */

u_int			numvnodes		__cacheline_aligned;

static pool_cache_t	vnode_cache		__read_mostly;
static struct mount	*dead_mount;

/*
 * There are two free lists: one is for vnodes which have no buffer/page
 * references and one for those which do (i.e. v_holdcnt is non-zero).
 * The vnode recycling mechanism first attempts to look into the former list.
 */
static kmutex_t		vnode_free_list_lock	__cacheline_aligned;
static vnodelst_t	vnode_free_list		__cacheline_aligned;
static vnodelst_t	vnode_hold_list		__cacheline_aligned;
static kcondvar_t	vdrain_cv		__cacheline_aligned;

static vnodelst_t	vrele_list		__cacheline_aligned;
static kmutex_t		vrele_lock		__cacheline_aligned;
static kcondvar_t	vrele_cv		__cacheline_aligned;
static lwp_t *		vrele_lwp		__cacheline_aligned;
static int		vrele_pending		__cacheline_aligned;
static int		vrele_gen		__cacheline_aligned;

static int		cleanvnode(void);
static void		vclean(vnode_t *);
static void		vrelel(vnode_t *, int);
static void		vdrain_thread(void *);
static void		vrele_thread(void *);
static void		vnpanic(vnode_t *, const char *, ...)
    __printflike(2, 3);
static void		vwait(vnode_t *, int);

/* Routines having to do with the management of the vnode table. */
extern int		(**dead_vnodeop_p)(void *);
extern struct vfsops	dead_vfsops;

void
vfs_vnode_sysinit(void)
{
	int error __diagused;

	vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl",
	    NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vnode_cache != NULL);

	dead_mount = vfs_mountalloc(&dead_vfsops, NULL);
	KASSERT(dead_mount != NULL);
	dead_mount->mnt_iflag = IMNT_MPSAFE;

	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vrele_list);

	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vdrain_cv, "vdrain");
	cv_init(&vrele_cv, "vrele");
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
	    NULL, NULL, "vdrain");
	KASSERT(error == 0);
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
	    NULL, &vrele_lwp, "vrele");
	KASSERT(error == 0);
}

/*
 * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
 * marker vnode.
 */
vnode_t *
vnalloc(struct mount *mp)
{
	vnode_t *vp;

	vp = pool_cache_get(vnode_cache, PR_WAITOK);
	KASSERT(vp != NULL);

	memset(vp, 0, sizeof(*vp));
	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
	cv_init(&vp->v_cv, "vnode");
	/*
	 * Done by memset() above.
	 *	LIST_INIT(&vp->v_nclist);
	 *	LIST_INIT(&vp->v_dnclist);
	 */

	if (mp != NULL) {
		vp->v_mount = mp;
		vp->v_type = VBAD;
		vp->v_iflag = VI_MARKER;
	} else {
		rw_init(&vp->v_lock);
	}

	return vp;
}
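
/*
 * A hedged usage sketch for marker vnodes: iterators over a mount's vnode
 * list elsewhere in the VFS code typically allocate a marker with
 * vnalloc(mp) to remember their position across a lock drop, and release
 * it with vnfree().  The list and entry names below (mnt_vnodelist,
 * v_mntvnodes) live outside this file and are shown only for illustration:
 *
 *	vnode_t *mvp = vnalloc(mp);
 *
 *	mutex_enter(&mntvnode_lock);
 *	TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vp, mvp, v_mntvnodes);
 *	...
 *	TAILQ_REMOVE(&mp->mnt_vnodelist, mvp, v_mntvnodes);
 *	mutex_exit(&mntvnode_lock);
 *	vnfree(mvp);
 */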

/*
 * Free an unused, unreferenced vnode.
 */
void
vnfree(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 0);

	if ((vp->v_iflag & VI_MARKER) == 0) {
		rw_destroy(&vp->v_lock);
		mutex_enter(&vnode_free_list_lock);
		numvnodes--;
		mutex_exit(&vnode_free_list_lock);
	}

	/*
	 * Note: the vnode interlock will either be freed, or its reference
	 * dropped (if VI_LOCKSHARE was in use).
	 */
	uvm_obj_destroy(&vp->v_uobj, true);
	cv_destroy(&vp->v_cv);
	pool_cache_put(vnode_cache, vp);
}

/*
 * cleanvnode: grab a vnode from the freelist, clean and free it.
 *
 * => Releases vnode_free_list_lock.
 */
static int
cleanvnode(void)
{
	vnode_t *vp;
	vnodelst_t *listhd;
	struct mount *mp;

	KASSERT(mutex_owned(&vnode_free_list_lock));

	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		/*
		 * It's safe to test v_usecount and v_iflag
		 * without holding the interlock here, since
		 * these vnodes should never appear on the
		 * lists.
		 */
		KASSERT(vp->v_usecount == 0);
		KASSERT((vp->v_iflag & VI_CLEAN) == 0);
		KASSERT(vp->v_freelisthd == listhd);

		if (!mutex_tryenter(vp->v_interlock))
			continue;
		if ((vp->v_iflag & VI_XLOCK) != 0) {
			mutex_exit(vp->v_interlock);
			continue;
		}
		mp = vp->v_mount;
		if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) {
			mutex_exit(vp->v_interlock);
			continue;
		}
		break;
	}

	if (vp == NULL) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		mutex_exit(&vnode_free_list_lock);
		return EBUSY;
	}

	/* Remove it from the freelist. */
	TAILQ_REMOVE(listhd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);

	KASSERT(vp->v_usecount == 0);

	/*
	 * The vnode is still associated with a file system, so we must
	 * clean it out before freeing it.  We need to add a reference
	 * before doing this.
	 */
	vp->v_usecount = 1;
	KASSERT((vp->v_iflag & VI_CHANGING) == 0);
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
	fstrans_done(mp);

	return 0;
}

/*
 * getnewvnode: return a fresh vnode.
 *
 * => Returns a referenced vnode, moved onto the mount queue.
 * => Shares the interlock specified by 'slock', if it is not NULL.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    kmutex_t *slock, vnode_t **vpp)
{
	struct uvm_object *uobj __diagused;
	vnode_t *vp;
	int error = 0;

	if (mp != NULL) {
		/*
		 * Mark the filesystem busy while we are creating a vnode.
		 * If unmount is in progress, this will fail.
		 */
		error = vfs_busy(mp, NULL);
		if (error)
			return error;
	}

	vp = NULL;

	/* Allocate a new vnode. */
	mutex_enter(&vnode_free_list_lock);
	numvnodes++;
	if (numvnodes > desiredvnodes + desiredvnodes / 10)
		cv_signal(&vdrain_cv);
	mutex_exit(&vnode_free_list_lock);
	vp = vnalloc(NULL);

	KASSERT(vp->v_freelisthd == NULL);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));

	/* Initialize the vnode. */
	vp->v_usecount = 1;
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	vp->v_data = NULL;

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	/* Share the vnode_t::v_interlock, if requested. */
	if (slock) {
		/* Set the interlock and mark that it is shared. */
		KASSERT(vp->v_mount == NULL);
		mutex_obj_hold(slock);
		uvm_obj_setlock(&vp->v_uobj, slock);
		KASSERT(vp->v_interlock == slock);
		vp->v_iflag |= VI_LOCKSHARE;
	}

	/* Finally, move the vnode onto the mount queue. */
	vfs_insmntque(vp, mp);

	if (mp != NULL) {
		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
			vp->v_vflag |= VV_MPSAFE;
		vfs_unbusy(mp, true, NULL);
	}

	*vpp = vp;
	return 0;
}
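
/*
 * A hedged sketch of a typical getnewvnode() caller, e.g. a file system's
 * VFS_VGET or create path (the "myfs" names are hypothetical and only
 * illustrate the shape of the call):
 *
 *	error = getnewvnode(VT_MYFS, mp, myfs_vnodeop_p, NULL, &vp);
 *	if (error != 0)
 *		return error;
 *	mnp = pool_get(&myfs_node_pool, PR_WAITOK);
 *	memset(mnp, 0, sizeof(*mnp));
 *	vp->v_data = mnp;
 *	mnp->mn_vnode = vp;
 *	... read the on-disk node, set vp->v_type accordingly ...
 *	*vpp = vp;
 *	return 0;
 *
 * The vnode returned already carries a reference and sits on the mount's
 * vnode list; the caller only attaches its private data.
 */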

/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 1);
	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_freelisthd == NULL);

	mutex_enter(vp->v_interlock);
	vp->v_iflag |= VI_CLEAN;
	vrelel(vp, 0);
}

/*
 * Helper thread to keep the number of vnodes below desiredvnodes.
 */
static void
vdrain_thread(void *cookie)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	for (;;) {
		cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
		while (numvnodes > desiredvnodes) {
			error = cleanvnode();
			if (error)
				kpause("vndsbusy", false, hz, NULL);
			mutex_enter(&vnode_free_list_lock);
			if (error)
				break;
		}
	}
}

/*
 * Remove a vnode from its freelist.
 */
void
vremfree(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount == 0);

	/*
	 * Note that the reference count must not change until
	 * the vnode is removed.
	 */
	mutex_enter(&vnode_free_list_lock);
	if (vp->v_holdcnt > 0) {
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
	} else {
		KASSERT(vp->v_freelisthd == &vnode_free_list);
	}
	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);
}

/*
 * vget: get a particular vnode from the free list, increment its reference
 * count and lock it.
 *
 * => Should be called with v_interlock held.
 *
 * If VI_CHANGING is set, the vnode may be eliminated in vgone()/vclean().
 * In that case, we cannot grab the vnode, so the process is awakened when
 * the transition is completed, and an error returned to indicate that the
 * vnode is no longer usable.
 */
int
vget(vnode_t *vp, int flags)
{
	int error = 0;

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT)) == 0);

	/*
	 * Before adding a reference, we must remove the vnode
	 * from its freelist.
	 */
	if (vp->v_usecount == 0) {
		vremfree(vp);
		vp->v_usecount = 1;
	} else {
		atomic_inc_uint(&vp->v_usecount);
	}

	/*
	 * If the vnode is in the process of changing state we wait
	 * for the change to complete and take care not to return
	 * a clean vnode.
	 */
	if ((vp->v_iflag & VI_CHANGING) != 0) {
		if ((flags & LK_NOWAIT) != 0) {
			vrelel(vp, 0);
			return EBUSY;
		}
		vwait(vp, VI_CHANGING);
		if ((vp->v_iflag & VI_CLEAN) != 0) {
			vrelel(vp, 0);
			return ENOENT;
		}
	}

	/*
	 * Ok, we got it in good shape.  Just locking left.
	 */
	KASSERT((vp->v_iflag & VI_CLEAN) == 0);
	mutex_exit(vp->v_interlock);
	if (flags & (LK_EXCLUSIVE | LK_SHARED)) {
		error = vn_lock(vp, flags);
		if (error != 0) {
			vrele(vp);
		}
	}
	return error;
}
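
/*
 * The usual vget() calling pattern, derived from the contract above: the
 * caller holds v_interlock, asks for a referenced (and, with LK_EXCLUSIVE
 * or LK_SHARED, locked) vnode, and the interlock is always released on
 * return.  On error the reference has already been dropped for the caller:
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vget(vp, LK_EXCLUSIVE);
 *	if (error != 0)
 *		return error;		(ENOENT: vnode was reclaimed)
 *	... use vp ...
 *	vput(vp);
 */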

/*
 * vput: unlock and release the reference.
 */
void
vput(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	VOP_UNLOCK(vp);
	vrele(vp);
}

/*
 * Try to drop a reference on a vnode.  Abort if we are about to release
 * the last reference.  Note: this _must_ succeed if we are not dropping
 * the last reference.
 */
static inline bool
vtryrele(vnode_t *vp)
{
	u_int use, next;

	for (use = vp->v_usecount;; use = next) {
		if (use == 1) {
			return false;
		}
		KASSERT(use > 1);
		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
		if (__predict_true(next == use)) {
			return true;
		}
	}
}

/*
 * Vnode release.  If the reference count drops to zero, call the inactive
 * routine and either return the vnode to the freelist or free it to the
 * pool.
 */
static void
vrelel(vnode_t *vp, int flags)
{
	bool recycle, defer;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_freelisthd == NULL);

	if (__predict_false(vp->v_op == dead_vnodeop_p &&
	    (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
		vnpanic(vp, "dead but not clean");
	}

	/*
	 * If not the last reference, just drop the reference count
	 * and unlock.
	 */
	if (vtryrele(vp)) {
		if ((flags & VRELEL_CHANGING_SET) != 0) {
			KASSERT((vp->v_iflag & VI_CHANGING) != 0);
			vp->v_iflag &= ~VI_CHANGING;
			cv_broadcast(&vp->v_cv);
		}
		mutex_exit(vp->v_interlock);
		return;
	}
	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
		vnpanic(vp, "%s: bad ref count", __func__);
	}

	KASSERT((vp->v_iflag & VI_XLOCK) == 0);

#ifdef DIAGNOSTIC
	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
		vprint("vrelel: missing VOP_CLOSE()", vp);
	}
#endif

	/*
	 * If not clean, deactivate the vnode, but preserve
	 * our reference across the call to VOP_INACTIVE().
	 */
	if ((vp->v_iflag & VI_CLEAN) == 0) {
		recycle = false;

		/*
		 * XXX This ugly block can be largely eliminated if
		 * locking is pushed down into the file systems.
		 *
		 * Defer vnode release to vrele_thread if caller
		 * requests it explicitly or is the pagedaemon.
		 */
		if ((curlwp == uvm.pagedaemon_lwp) ||
		    (flags & VRELEL_ASYNC_RELE) != 0) {
			defer = true;
		} else if (curlwp == vrele_lwp) {
			/*
			 * We have to try harder.
			 */
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			KASSERT(error == 0);
			mutex_enter(vp->v_interlock);
			defer = false;
		} else {
			/* If we can't acquire the lock, then defer. */
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp,
			    LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
			defer = (error != 0);
			mutex_enter(vp->v_interlock);
		}

		KASSERT(mutex_owned(vp->v_interlock));
		KASSERT(! (curlwp == vrele_lwp && defer));

		if (defer) {
			/*
			 * Defer reclaim to the kthread; it's not safe to
			 * clean it here.  We donate it our last reference.
			 */
			if ((flags & VRELEL_CHANGING_SET) != 0) {
				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
				vp->v_iflag &= ~VI_CHANGING;
				cv_broadcast(&vp->v_cv);
			}
			mutex_enter(&vrele_lock);
			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
			if (++vrele_pending > (desiredvnodes >> 8))
				cv_signal(&vrele_cv);
			mutex_exit(&vrele_lock);
			mutex_exit(vp->v_interlock);
			return;
		}

		/*
		 * If the node got another reference while we
		 * released the interlock, don't try to inactivate it yet.
		 */
		if (__predict_false(vtryrele(vp))) {
			VOP_UNLOCK(vp);
			if ((flags & VRELEL_CHANGING_SET) != 0) {
				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
				vp->v_iflag &= ~VI_CHANGING;
				cv_broadcast(&vp->v_cv);
			}
			mutex_exit(vp->v_interlock);
			return;
		}

		if ((flags & VRELEL_CHANGING_SET) == 0) {
			KASSERT((vp->v_iflag & VI_CHANGING) == 0);
			vp->v_iflag |= VI_CHANGING;
		}
		mutex_exit(vp->v_interlock);

		/*
		 * The vnode can gain another reference while being
		 * deactivated.  If VOP_INACTIVE() indicates that
		 * the described file has been deleted, then recycle
		 * the vnode irrespective of additional references.
		 * Another thread may be waiting to re-use the on-disk
		 * inode.
		 *
		 * Note that VOP_INACTIVE() will drop the vnode lock.
		 */
		VOP_INACTIVE(vp, &recycle);
		mutex_enter(vp->v_interlock);
		if (!recycle) {
			if (vtryrele(vp)) {
				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
				vp->v_iflag &= ~VI_CHANGING;
				cv_broadcast(&vp->v_cv);
				mutex_exit(vp->v_interlock);
				return;
			}
		}

		/* Take care of space accounting. */
		if (vp->v_iflag & VI_EXECMAP) {
			atomic_add_int(&uvmexp.execpages,
			    -vp->v_uobj.uo_npages);
			atomic_add_int(&uvmexp.filepages,
			    vp->v_uobj.uo_npages);
		}
		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
		vp->v_vflag &= ~VV_MAPPED;

		/*
		 * Recycle the vnode if the file is now unused (unlinked),
		 * otherwise just free it.
		 */
		if (recycle) {
			vclean(vp);
		}
		KASSERT(vp->v_usecount > 0);
	} else { /* vnode was already clean */
		if ((flags & VRELEL_CHANGING_SET) == 0) {
			KASSERT((vp->v_iflag & VI_CHANGING) == 0);
			vp->v_iflag |= VI_CHANGING;
		}
	}

	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
		/* Gained another reference while being reclaimed. */
		KASSERT((vp->v_iflag & VI_CHANGING) != 0);
		vp->v_iflag &= ~VI_CHANGING;
		cv_broadcast(&vp->v_cv);
		mutex_exit(vp->v_interlock);
		return;
	}

	if ((vp->v_iflag & VI_CLEAN) != 0) {
		/*
		 * It's clean so destroy it.  It isn't referenced
		 * anywhere since it has been reclaimed.
		 */
		KASSERT(vp->v_holdcnt == 0);
		KASSERT(vp->v_writecount == 0);
		mutex_exit(vp->v_interlock);
		vfs_insmntque(vp, NULL);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vnfree(vp);
	} else {
		/*
		 * Otherwise, put it back onto the freelist.  It
		 * can't be destroyed while still associated with
		 * a file system.
		 */
		mutex_enter(&vnode_free_list_lock);
		if (vp->v_holdcnt > 0) {
			vp->v_freelisthd = &vnode_hold_list;
		} else {
			vp->v_freelisthd = &vnode_free_list;
		}
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
		KASSERT((vp->v_iflag & VI_CHANGING) != 0);
		vp->v_iflag &= ~VI_CHANGING;
		cv_broadcast(&vp->v_cv);
		mutex_exit(vp->v_interlock);
	}
}
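
/*
 * The public release entry points below all funnel into vrelel():
 *
 *	vrele(vp)	drops a reference; may run VOP_INACTIVE() in the
 *			calling context.
 *	vrele_async(vp)	drops a reference; any VOP_INACTIVE() work is
 *			deferred to the vrele kthread.
 *	vput(vp)	(above) is VOP_UNLOCK(vp) followed by vrele(vp).
 */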
void
vrele(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, 0);
}

/*
 * Asynchronous vnode release: the vnode is released in a different
 * context (the vrele kthread).
 */
void
vrele_async(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, VRELEL_ASYNC_RELE);
}

static void
vrele_thread(void *cookie)
{
	vnodelst_t skip_list;
	vnode_t *vp;
	struct mount *mp;

	TAILQ_INIT(&skip_list);

	mutex_enter(&vrele_lock);
	for (;;) {
		while (TAILQ_EMPTY(&vrele_list)) {
			vrele_gen++;
			cv_broadcast(&vrele_cv);
			cv_timedwait(&vrele_cv, &vrele_lock, hz);
			TAILQ_CONCAT(&vrele_list, &skip_list, v_freelist);
		}
		vp = TAILQ_FIRST(&vrele_list);
		mp = vp->v_mount;
		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
		if (fstrans_start_nowait(mp, FSTRANS_LAZY) != 0) {
			TAILQ_INSERT_TAIL(&skip_list, vp, v_freelist);
			continue;
		}
		vrele_pending--;
		mutex_exit(&vrele_lock);

		/*
		 * If not the last reference, then ignore the vnode
		 * and look for more work.
		 */
		mutex_enter(vp->v_interlock);
		vrelel(vp, 0);
		fstrans_done(mp);
		mutex_enter(&vrele_lock);
	}
}

void
vrele_flush(void)
{
	int gen;

	mutex_enter(&vrele_lock);
	gen = vrele_gen;
	while (vrele_pending && gen == vrele_gen) {
		cv_broadcast(&vrele_cv);
		cv_wait(&vrele_cv, &vrele_lock);
	}
	mutex_exit(&vrele_lock);
}

/*
 * Vnode reference, where a reference is already held by some other
 * object (for example, a file structure).
 */
void
vref(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	atomic_inc_uint(&vp->v_usecount);
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
 */
void
vholdl(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_free_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_hold_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt <= 0) {
		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
	}

	vp->v_holdcnt--;
	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_free_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the interlock held, and will return with it held.
 */
static void
vclean(vnode_t *vp)
{
	lwp_t *l = curlwp;
	bool recycle, active, doclose;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	/* If already clean, nothing to do. */
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		return;
	}

	active = (vp->v_usecount > 1);
	doclose = ! (active && vp->v_type == VBLK &&
	    spec_node_getmountedfs(vp) != NULL);
	mutex_exit(vp->v_interlock);

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	/*
	 * Prevent the vnode from being recycled or brought into use
	 * while we clean it out.
	 */
	mutex_enter(vp->v_interlock);
	KASSERT((vp->v_iflag & (VI_XLOCK | VI_CLEAN)) == 0);
	vp->v_iflag |= VI_XLOCK;
	if (vp->v_iflag & VI_EXECMAP) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
	mutex_exit(vp->v_interlock);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (doclose) {
		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
		if (error != 0) {
			if (wapbl_vphaswapbl(vp))
				WAPBL_DISCARD(wapbl_vptomp(vp));
			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
		}
		KASSERT(error == 0);
		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
			spec_node_revoke(vp);
		}
	}
	if (active) {
		VOP_INACTIVE(vp, &recycle);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VI_XLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp);
	}

	/* Disassociate the underlying file system from the vnode. */
	if (VOP_RECLAIM(vp)) {
		vnpanic(vp, "%s: cannot reclaim", __func__);
	}

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);

	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}

	/* Purge name cache. */
	cache_purge(vp);

	/* Move to dead mount. */
	vp->v_vflag &= ~VV_ROOT;
	atomic_inc_uint(&dead_mount->mnt_refcnt);
	vfs_insmntque(vp, dead_mount);

	/* Done with purge, notify sleepers of the grim news. */
	mutex_enter(vp->v_interlock);
	if (doclose) {
		vp->v_op = dead_vnodeop_p;
		vp->v_vflag |= VV_LOCKSWORK;
		vp->v_iflag |= VI_CLEAN;
	} else {
		vp->v_op = spec_vnodeop_p;
		vp->v_vflag &= ~VV_LOCKSWORK;
	}
	vp->v_tag = VT_NON;
	KNOTE(&vp->v_klist, NOTE_REVOKE);
	vp->v_iflag &= ~VI_XLOCK;
	cv_broadcast(&vp->v_cv);

	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}

/*
 * Recycle an unused vnode if the caller holds the last reference.
 */
bool
vrecycle(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_usecount != 1) {
		mutex_exit(vp->v_interlock);
		return false;
	}
	if ((vp->v_iflag & VI_CHANGING) != 0)
		vwait(vp, VI_CHANGING);
	if (vp->v_usecount != 1) {
		mutex_exit(vp->v_interlock);
		return false;
	} else if ((vp->v_iflag & VI_CLEAN) != 0) {
		mutex_exit(vp->v_interlock);
		return true;
	}
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
	return true;
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vrevoke(vnode_t *vp)
{
	vnode_t *vq;
	enum vtype type;
	dev_t dev;

	KASSERT(vp->v_usecount > 0);

	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		mutex_exit(vp->v_interlock);
		return;
	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
		atomic_inc_uint(&vp->v_usecount);
		mutex_exit(vp->v_interlock);
		vgone(vp);
		return;
	} else {
		dev = vp->v_rdev;
		type = vp->v_type;
		mutex_exit(vp->v_interlock);
	}

	while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
		vgone(vq);
	}
}

/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_CHANGING) != 0)
		vwait(vp, VI_CHANGING);
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(struct buf *bp)
{
	vnode_t *vp;

	if ((vp = bp->b_vp) == NULL)
		return;

	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(bp->b_objlock));

	if (--vp->v_numoutput < 0)
		vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
	if (vp->v_numoutput == 0)
		cv_broadcast(&vp->v_cv);
}

/*
 * Test a vnode for being or becoming dead.  Returns one of:
 * EBUSY:  vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
 * ENOENT: vnode is dead.
 * 0:      otherwise.
 *
 * Whenever this function returns a non-zero value all future
 * calls will also return a non-zero value.
 */
int
vdead_check(struct vnode *vp, int flags)
{

	KASSERT(mutex_owned(vp->v_interlock));
	if (ISSET(vp->v_iflag, VI_XLOCK)) {
		if (ISSET(flags, VDEAD_NOWAIT))
			return EBUSY;
		vwait(vp, VI_XLOCK);
		KASSERT(ISSET(vp->v_iflag, VI_CLEAN));
	}
	if (ISSET(vp->v_iflag, VI_CLEAN))
		return ENOENT;
	return 0;
}
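
/*
 * A sketch of the expected vdead_check() usage, derived from the contract
 * above (the actual callers live in the individual file systems; this is
 * illustrative only):
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vdead_check(vp, VDEAD_NOWAIT);
 *	mutex_exit(vp->v_interlock);
 *	if (error != 0)
 *		return error;		(vp is dead or being cleaned)
 *	... safe to use the vnode's file system private data ...
 */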

/*
 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
 * recycled.
 */
static void
vwait(vnode_t *vp, int flags)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount != 0);

	while ((vp->v_iflag & flags) != 0)
		cv_wait(&vp->v_cv, vp->v_interlock);
}

int
vfs_drainvnodes(long target)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	while (numvnodes > target) {
		error = cleanvnode();
		if (error != 0)
			return error;
		mutex_enter(&vnode_free_list_lock);
	}

	mutex_exit(&vnode_free_list_lock);

	return 0;
}

void
vnpanic(vnode_t *vp, const char *fmt, ...)
{
	va_list ap;

#ifdef DIAGNOSTIC
	vprint(NULL, vp);
#endif
	va_start(ap, fmt);
	vpanic(fmt, ap);
	va_end(ap);
}