/*	$NetBSD: vfs_vnode.c,v 1.32 2014/02/27 16:51:38 hannken Exp $	*/

/*-
 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * The vnode cache subsystem.
 *
 * Life-cycle
 *
 *	Normally, there are two points where new vnodes are created:
 *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 *	starts in one of the following ways:
 *
 *	- Allocation, via getnewvnode(9) and/or vnalloc(9).
 *	- Reclamation of an inactive vnode, via vget(9).
 *
 *	Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
 *	was another, traditional way.  Currently, only the draining thread
 *	recycles vnodes.  This behaviour might be revisited.
 *
 *	The life-cycle ends when the last reference is dropped, usually
 *	in VOP_REMOVE(9).  In that case, VOP_INACTIVE(9) is called to inform
 *	the file system that the vnode is inactive.  Via this call, the file
 *	system indicates whether the vnode can be recycled (usually, it checks
 *	its own references, e.g. count of links, whether the file was removed).
 *
 *	Depending on that indication, the vnode can be put onto a free list
 *	(cache), or cleaned via vclean(9), which calls VOP_RECLAIM(9) to
 *	disassociate the underlying file system from the vnode, and finally
 *	destroyed.
 *
 * Reference counting
 *
 *	A vnode is considered active if its reference count
 *	(vnode_t::v_usecount) is non-zero.  The count is maintained by the
 *	vref(9), vrele(9) and vput(9) routines.  Common points holding
 *	references are e.g. open files, the current working directory,
 *	mount points, etc.
 *
 * Note on v_usecount and its locking
 *
 *	At nearly all points where it is known that v_usecount could be
 *	zero, vnode_t::v_interlock will be held.  To change v_usecount away
 *	from zero, the interlock must be held.  To change it from a non-zero
 *	value to zero, again the interlock must be held.
 *
 *	Changing the usecount from one non-zero value to another non-zero
 *	value can safely be done using atomic operations, without the
 *	interlock held.
 *
 *	Note: if VI_CLEAN is set, vnode_t::v_interlock will be released while
 *	mntvnode_lock is still held.
 *
 *	See PR 41374.
 */
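/*
 * Illustrative sketch of the usecount rule above (comment only, not
 * compiled in): a non-zero -> non-zero transition may use a bare atomic,
 * as vtryrele() and vref() below do, while any transition to or from
 * zero must be made under v_interlock, as vget() does:
 *
 *	mutex_enter(vp->v_interlock);
 *	if (vp->v_usecount == 0) {
 *		vremfree(vp);			// leaving zero: interlock required
 *		vp->v_usecount = 1;
 *	} else {
 *		atomic_inc_uint(&vp->v_usecount);
 *	}
 *	mutex_exit(vp->v_interlock);
 */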
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.32 2014/02/27 16:51:38 hannken Exp $");

#define _VFS_VNODE_PRIVATE

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>
#include <sys/fstrans.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>

/* Flags to vrelel. */
#define	VRELEL_ASYNC_RELE	0x0001	/* Always defer to vrele thread. */
#define	VRELEL_CHANGING_SET	0x0002	/* VI_CHANGING set by caller. */

u_int			numvnodes		__cacheline_aligned;

static pool_cache_t	vnode_cache		__read_mostly;
static struct mount	*dead_mount;

/*
 * There are two free lists: one is for vnodes which have no buffer/page
 * references and one for those which do (i.e. v_holdcnt is non-zero).
 * The vnode recycling mechanism first attempts the former list.
 */
static kmutex_t		vnode_free_list_lock	__cacheline_aligned;
static vnodelst_t	vnode_free_list		__cacheline_aligned;
static vnodelst_t	vnode_hold_list		__cacheline_aligned;
static kcondvar_t	vdrain_cv		__cacheline_aligned;

static vnodelst_t	vrele_list		__cacheline_aligned;
static kmutex_t		vrele_lock		__cacheline_aligned;
static kcondvar_t	vrele_cv		__cacheline_aligned;
static lwp_t *		vrele_lwp		__cacheline_aligned;
static int		vrele_pending		__cacheline_aligned;
static int		vrele_gen		__cacheline_aligned;

static int		cleanvnode(void);
static void		vclean(vnode_t *);
static void		vrelel(vnode_t *, int);
static void		vdrain_thread(void *);
static void		vrele_thread(void *);
static void		vnpanic(vnode_t *, const char *, ...)
    __printflike(2, 3);

/* Routines having to do with the management of the vnode table. */
extern int		(**dead_vnodeop_p)(void *);
extern struct vfsops	dead_vfsops;

void
vfs_vnode_sysinit(void)
{
	int error __diagused;

	vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl",
	    NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vnode_cache != NULL);

	dead_mount = vfs_mountalloc(&dead_vfsops, NULL);
	KASSERT(dead_mount != NULL);
	dead_mount->mnt_iflag = IMNT_MPSAFE;

	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vrele_list);

	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vdrain_cv, "vdrain");
	cv_init(&vrele_cv, "vrele");
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
	    NULL, NULL, "vdrain");
	KASSERT(error == 0);
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
	    NULL, &vrele_lwp, "vrele");
	KASSERT(error == 0);
}

/*
 * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
 * marker vnode.
 */
vnode_t *
vnalloc(struct mount *mp)
{
	vnode_t *vp;

	vp = pool_cache_get(vnode_cache, PR_WAITOK);
	KASSERT(vp != NULL);

	memset(vp, 0, sizeof(*vp));
	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
	cv_init(&vp->v_cv, "vnode");
	/*
	 * Done by memset() above.
	 *	LIST_INIT(&vp->v_nclist);
	 *	LIST_INIT(&vp->v_dnclist);
	 */

	if (mp != NULL) {
		vp->v_mount = mp;
		vp->v_type = VBAD;
		vp->v_iflag = VI_MARKER;
	} else {
		rw_init(&vp->v_lock);
	}

	return vp;
}
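/*
 * Illustrative use of marker vnodes (a sketch of the callers' pattern,
 * not code from this file): a marker from vnalloc(mp) has VI_MARKER set
 * and serves as a stable cursor while walking a mount's vnode list,
 * since real vnodes can disappear during the walk:
 *
 *	vnode_t *mvp = vnalloc(mp);	// marker for this mount
 *	...insert mvp after the current vnode, drop locks, do work,
 *	   then resume the iteration from mvp...
 *	vnfree(mvp);
 *
 * Only vnalloc() and vnfree() are routines from this file; the
 * iteration itself lives with the callers.
 */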
/*
 * Free an unused, unreferenced vnode.
 */
void
vnfree(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 0);

	if ((vp->v_iflag & VI_MARKER) == 0) {
		rw_destroy(&vp->v_lock);
		mutex_enter(&vnode_free_list_lock);
		numvnodes--;
		mutex_exit(&vnode_free_list_lock);
	}

	/*
	 * Note: the vnode interlock will either be freed, or its reference
	 * dropped (if VI_LOCKSHARE was in use).
	 */
	uvm_obj_destroy(&vp->v_uobj, true);
	cv_destroy(&vp->v_cv);
	pool_cache_put(vnode_cache, vp);
}

/*
 * cleanvnode: grab a vnode from the freelist, clean and free it.
 *
 * => Releases vnode_free_list_lock.
 */
static int
cleanvnode(void)
{
	vnode_t *vp;
	vnodelst_t *listhd;
	struct mount *mp;

	KASSERT(mutex_owned(&vnode_free_list_lock));

	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		/*
		 * It's safe to test v_usecount and v_iflag
		 * without holding the interlock here, since
		 * vnodes with a non-zero v_usecount, or with
		 * VI_CLEAN set, should never appear on these
		 * lists.
		 */
		KASSERT(vp->v_usecount == 0);
		KASSERT((vp->v_iflag & VI_CLEAN) == 0);
		KASSERT(vp->v_freelisthd == listhd);

		if (!mutex_tryenter(vp->v_interlock))
			continue;
		if ((vp->v_iflag & VI_XLOCK) != 0) {
			mutex_exit(vp->v_interlock);
			continue;
		}
		mp = vp->v_mount;
		if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) {
			mutex_exit(vp->v_interlock);
			continue;
		}
		break;
	}

	if (vp == NULL) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		mutex_exit(&vnode_free_list_lock);
		return EBUSY;
	}

	/* Remove it from the freelist. */
	TAILQ_REMOVE(listhd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);

	KASSERT(vp->v_usecount == 0);

	/*
	 * The vnode is still associated with a file system, so we must
	 * clean it out before freeing it.  We need to add a reference
	 * before doing this.
	 */
	vp->v_usecount = 1;
	KASSERT((vp->v_iflag & VI_CHANGING) == 0);
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
	fstrans_done(mp);

	return 0;
}
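/*
 * Lock-order note on cleanvnode() above (descriptive only): elsewhere
 * the order is v_interlock -> vnode_free_list_lock (see vrelel() and
 * vremfree()), so the scan, which already holds vnode_free_list_lock,
 * may only mutex_tryenter() the interlock and must skip the vnode on
 * failure; blocking there could deadlock:
 *
 *	thread A: holds vp->v_interlock, wants vnode_free_list_lock
 *	thread B: holds vnode_free_list_lock, wants vp->v_interlock
 */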
/*
 * getnewvnode: return a fresh vnode.
 *
 * => Returns a referenced vnode, moved into the mount queue.
 * => Shares the interlock specified by 'slock', if it is not NULL.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    kmutex_t *slock, vnode_t **vpp)
{
	struct uvm_object *uobj __diagused;
	vnode_t *vp;
	int error = 0;

	if (mp != NULL) {
		/*
		 * Mark the filesystem busy while we are creating a vnode.
		 * If unmount is in progress, this will fail.
		 */
		error = vfs_busy(mp, NULL);
		if (error)
			return error;
	}

	vp = NULL;

	/* Allocate a new vnode. */
	mutex_enter(&vnode_free_list_lock);
	numvnodes++;
	if (numvnodes > desiredvnodes + desiredvnodes / 10)
		cv_signal(&vdrain_cv);
	mutex_exit(&vnode_free_list_lock);
	vp = vnalloc(NULL);

	KASSERT(vp->v_freelisthd == NULL);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));

	/* Initialize the vnode. */
	vp->v_usecount = 1;
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	vp->v_data = NULL;

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	/* Share the vnode_t::v_interlock, if requested. */
	if (slock) {
		/* Set the interlock and mark that it is shared. */
		KASSERT(vp->v_mount == NULL);
		mutex_obj_hold(slock);
		uvm_obj_setlock(&vp->v_uobj, slock);
		KASSERT(vp->v_interlock == slock);
		vp->v_iflag |= VI_LOCKSHARE;
	}

	/* Finally, move the vnode into the mount queue. */
	vfs_insmntque(vp, mp);

	if (mp != NULL) {
		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
			vp->v_vflag |= VV_MPSAFE;
		vfs_unbusy(mp, true, NULL);
	}

	*vpp = vp;
	return 0;
}

/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 1);
	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_freelisthd == NULL);

	mutex_enter(vp->v_interlock);
	vp->v_iflag |= VI_CLEAN;
	vrelel(vp, 0);
}

/*
 * Helper thread to keep the number of vnodes below desiredvnodes.
 */
static void
vdrain_thread(void *cookie)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	for (;;) {
		cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
		while (numvnodes > desiredvnodes) {
			error = cleanvnode();
			if (error)
				kpause("vndsbusy", false, hz, NULL);
			mutex_enter(&vnode_free_list_lock);
			if (error)
				break;
		}
	}
}

/*
 * Remove a vnode from its freelist.
 */
void
vremfree(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount == 0);

	/*
	 * Note that the reference count must not change until
	 * the vnode is removed.
	 */
	mutex_enter(&vnode_free_list_lock);
	if (vp->v_holdcnt > 0) {
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
	} else {
		KASSERT(vp->v_freelisthd == &vnode_free_list);
	}
	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);
}

/*
 * vget: get a particular vnode from the free list, increment its reference
 * count and lock it.
 *
 * => Should be called with v_interlock held.
 *
 * If VI_CHANGING is set, the vnode may be eliminated in vgone()/vclean().
 * In that case, we cannot grab the vnode, so the process is awakened when
 * the transition is completed, and an error returned to indicate that the
 * vnode is no longer usable.
 */
int
vget(vnode_t *vp, int flags)
{
	int error = 0;

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT)) == 0);

	/*
	 * Before adding a reference, we must remove the vnode
	 * from its freelist.
	 */
	if (vp->v_usecount == 0) {
		vremfree(vp);
		vp->v_usecount = 1;
	} else {
		atomic_inc_uint(&vp->v_usecount);
	}

	/*
	 * If the vnode is in the process of changing state we wait
	 * for the change to complete and take care not to return
	 * a clean vnode.
	 */
	if ((vp->v_iflag & VI_CHANGING) != 0) {
		if ((flags & LK_NOWAIT) != 0) {
			vrelel(vp, 0);
			return EBUSY;
		}
		vwait(vp, VI_CHANGING);
		if ((vp->v_iflag & VI_CLEAN) != 0) {
			vrelel(vp, 0);
			return ENOENT;
		}
	}

	/*
	 * Ok, we got it in good shape.  Just locking left.
	 */
	KASSERT((vp->v_iflag & VI_CLEAN) == 0);
	mutex_exit(vp->v_interlock);
	if (flags & (LK_EXCLUSIVE | LK_SHARED)) {
		error = vn_lock(vp, flags);
		if (error != 0) {
			vrele(vp);
		}
	}
	return error;
}
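/*
 * Typical calling pattern for vget() above (an illustrative sketch):
 * the caller holds v_interlock, takes a reference and the vnode lock
 * in one step, and need not clean up on error, since vget() has
 * already released both the reference and the interlock:
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vget(vp, LK_EXCLUSIVE);
 *	if (error != 0)
 *		return error;	// vp was dying or already clean
 *	...use vp...
 *	vput(vp);		// unlock and release, see below
 */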
/*
 * vput: unlock and release the reference.
 */
void
vput(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	VOP_UNLOCK(vp);
	vrele(vp);
}

/*
 * Try to drop a reference on a vnode.  Abort if we are releasing the
 * last reference.  Note: this _must_ succeed if not the last reference.
 */
static inline bool
vtryrele(vnode_t *vp)
{
	u_int use, next;

	for (use = vp->v_usecount;; use = next) {
		if (use == 1) {
			return false;
		}
		KASSERT(use > 1);
		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
		if (__predict_true(next == use)) {
			return true;
		}
	}
}

/*
 * Vnode release.  If the reference count drops to zero, call the
 * inactive routine and either return the vnode to the freelist or
 * free it to the pool.
 */
static void
vrelel(vnode_t *vp, int flags)
{
	bool recycle, defer;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_freelisthd == NULL);

	if (__predict_false(vp->v_op == dead_vnodeop_p &&
	    (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
		vnpanic(vp, "dead but not clean");
	}

	/*
	 * If not the last reference, just drop the reference count
	 * and unlock.
	 */
	if (vtryrele(vp)) {
		if ((flags & VRELEL_CHANGING_SET) != 0) {
			KASSERT((vp->v_iflag & VI_CHANGING) != 0);
			vp->v_iflag &= ~VI_CHANGING;
			cv_broadcast(&vp->v_cv);
		}
		mutex_exit(vp->v_interlock);
		return;
	}
	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
		vnpanic(vp, "%s: bad ref count", __func__);
	}

	KASSERT((vp->v_iflag & VI_XLOCK) == 0);

#ifdef DIAGNOSTIC
	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
		vprint("vrelel: missing VOP_CLOSE()", vp);
	}
#endif

	/*
	 * If not clean, deactivate the vnode, but preserve
	 * our reference across the call to VOP_INACTIVE().
	 */
	if ((vp->v_iflag & VI_CLEAN) == 0) {
		recycle = false;

		/*
		 * XXX This ugly block can be largely eliminated if
		 * locking is pushed down into the file systems.
		 *
		 * Defer vnode release to vrele_thread if caller
		 * requests it explicitly or is the pagedaemon.
		 */
		if ((curlwp == uvm.pagedaemon_lwp) ||
		    (flags & VRELEL_ASYNC_RELE) != 0) {
			defer = true;
		} else if (curlwp == vrele_lwp) {
			/*
			 * We have to try harder.
			 */
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			KASSERT(error == 0);
			mutex_enter(vp->v_interlock);
			defer = false;
		} else {
			/* If we can't acquire the lock, then defer. */
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp,
			    LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
			defer = (error != 0);
			mutex_enter(vp->v_interlock);
		}

		KASSERT(mutex_owned(vp->v_interlock));
		KASSERT(! (curlwp == vrele_lwp && defer));

		if (defer) {
			/*
			 * Defer reclaim to the kthread; it's not safe to
			 * clean it here.  We donate it our last reference.
			 */
			if ((flags & VRELEL_CHANGING_SET) != 0) {
				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
				vp->v_iflag &= ~VI_CHANGING;
				cv_broadcast(&vp->v_cv);
			}
			mutex_enter(&vrele_lock);
			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
			if (++vrele_pending > (desiredvnodes >> 8))
				cv_signal(&vrele_cv);
			mutex_exit(&vrele_lock);
			mutex_exit(vp->v_interlock);
			return;
		}

		/*
		 * If the node got another reference while we
		 * released the interlock, don't try to inactivate it yet.
		 */
		if (__predict_false(vtryrele(vp))) {
			VOP_UNLOCK(vp);
			if ((flags & VRELEL_CHANGING_SET) != 0) {
				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
				vp->v_iflag &= ~VI_CHANGING;
				cv_broadcast(&vp->v_cv);
			}
			mutex_exit(vp->v_interlock);
			return;
		}

		if ((flags & VRELEL_CHANGING_SET) == 0) {
			KASSERT((vp->v_iflag & VI_CHANGING) == 0);
			vp->v_iflag |= VI_CHANGING;
		}
		mutex_exit(vp->v_interlock);

		/*
		 * The vnode can gain another reference while being
		 * deactivated.  If VOP_INACTIVE() indicates that
		 * the described file has been deleted, then recycle
		 * the vnode irrespective of additional references.
		 * Another thread may be waiting to re-use the on-disk
		 * inode.
		 *
		 * Note that VOP_INACTIVE() will drop the vnode lock.
		 */
		VOP_INACTIVE(vp, &recycle);
		mutex_enter(vp->v_interlock);
		if (!recycle) {
			if (vtryrele(vp)) {
				KASSERT((vp->v_iflag & VI_CHANGING) != 0);
				vp->v_iflag &= ~VI_CHANGING;
				cv_broadcast(&vp->v_cv);
				mutex_exit(vp->v_interlock);
				return;
			}
		}

		/* Take care of space accounting. */
		if (vp->v_iflag & VI_EXECMAP) {
			atomic_add_int(&uvmexp.execpages,
			    -vp->v_uobj.uo_npages);
			atomic_add_int(&uvmexp.filepages,
			    vp->v_uobj.uo_npages);
		}
		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
		vp->v_vflag &= ~VV_MAPPED;

		/*
		 * Recycle the vnode if the file is now unused (unlinked),
		 * otherwise just free it.
		 */
		if (recycle) {
			vclean(vp);
		}
		KASSERT(vp->v_usecount > 0);
	} else { /* vnode was already clean */
		if ((flags & VRELEL_CHANGING_SET) == 0) {
			KASSERT((vp->v_iflag & VI_CHANGING) == 0);
			vp->v_iflag |= VI_CHANGING;
		}
	}

	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
		/* Gained another reference while being reclaimed. */
		KASSERT((vp->v_iflag & VI_CHANGING) != 0);
		vp->v_iflag &= ~VI_CHANGING;
		cv_broadcast(&vp->v_cv);
		mutex_exit(vp->v_interlock);
		return;
	}

	if ((vp->v_iflag & VI_CLEAN) != 0) {
		/*
		 * It's clean so destroy it.  It isn't referenced
		 * anywhere since it has been reclaimed.
		 */
		KASSERT(vp->v_holdcnt == 0);
		KASSERT(vp->v_writecount == 0);
		mutex_exit(vp->v_interlock);
		vfs_insmntque(vp, NULL);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vnfree(vp);
	} else {
		/*
		 * Otherwise, put it back onto the freelist.  It
		 * can't be destroyed while still associated with
		 * a file system.
		 */
		mutex_enter(&vnode_free_list_lock);
		if (vp->v_holdcnt > 0) {
			vp->v_freelisthd = &vnode_hold_list;
		} else {
			vp->v_freelisthd = &vnode_free_list;
		}
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
		KASSERT((vp->v_iflag & VI_CHANGING) != 0);
		vp->v_iflag &= ~VI_CHANGING;
		cv_broadcast(&vp->v_cv);
		mutex_exit(vp->v_interlock);
	}
}
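/*
 * Summary of the deferral policy in vrelel() above (descriptive only):
 *
 *	caller				lock attempt		action
 *	pagedaemon / VRELEL_ASYNC_RELE	none			always defer
 *	vrele_thread			LK_RETRY (blocking)	never defer
 *	everyone else			LK_NOWAIT (trylock)	defer on failure
 */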
void
vrele(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, 0);
}

/*
 * Asynchronous vnode release: the vnode is released in a different
 * context.
 */
void
vrele_async(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, VRELEL_ASYNC_RELE);
}

static void
vrele_thread(void *cookie)
{
	vnode_t *vp;

	for (;;) {
		mutex_enter(&vrele_lock);
		while (TAILQ_EMPTY(&vrele_list)) {
			vrele_gen++;
			cv_broadcast(&vrele_cv);
			cv_timedwait(&vrele_cv, &vrele_lock, hz);
		}
		vp = TAILQ_FIRST(&vrele_list);
		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
		vrele_pending--;
		mutex_exit(&vrele_lock);

		/*
		 * If not the last reference, then ignore the vnode
		 * and look for more work.
		 */
		mutex_enter(vp->v_interlock);
		vrelel(vp, 0);
	}
}

void
vrele_flush(void)
{
	int gen;

	mutex_enter(&vrele_lock);
	gen = vrele_gen;
	while (vrele_pending && gen == vrele_gen) {
		cv_broadcast(&vrele_cv);
		cv_wait(&vrele_cv, &vrele_lock);
	}
	mutex_exit(&vrele_lock);
}

/*
 * Vnode reference, where a reference is already held by some other
 * object (for example, a file structure).
 */
void
vref(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	atomic_inc_uint(&vp->v_usecount);
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
 */
void
vholdl(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_free_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_hold_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt <= 0) {
		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
	}

	vp->v_holdcnt--;
	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_free_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}
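/*
 * Illustrative sketch of the hold-count protocol above: buffer and page
 * users pin a vnode's identity, not its activity; the hold count only
 * selects which freelist an inactive vnode sits on:
 *
 *	mutex_enter(vp->v_interlock);
 *	vholdl(vp);		// first hold, usecount 0: free -> hold list
 *	mutex_exit(vp->v_interlock);
 *	...buffers or pages reference the vnode...
 *	mutex_enter(vp->v_interlock);
 *	holdrelel(vp);		// last hold, usecount 0: hold -> free list
 *	mutex_exit(vp->v_interlock);
 */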
/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the interlock held, and will return with it held.
 */
static void
vclean(vnode_t *vp)
{
	lwp_t *l = curlwp;
	bool recycle, active, doclose;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	/* If already clean, nothing to do. */
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		return;
	}

	active = (vp->v_usecount > 1);
	doclose = ! (active && vp->v_type == VBLK &&
	    spec_node_getmountedfs(vp) != NULL);
	mutex_exit(vp->v_interlock);

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	/*
	 * Prevent the vnode from being recycled or brought into use
	 * while we clean it out.
	 */
	mutex_enter(vp->v_interlock);
	KASSERT((vp->v_iflag & (VI_XLOCK | VI_CLEAN)) == 0);
	vp->v_iflag |= VI_XLOCK;
	if (vp->v_iflag & VI_EXECMAP) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
	mutex_exit(vp->v_interlock);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (doclose) {
		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
		if (error != 0) {
			if (wapbl_vphaswapbl(vp))
				WAPBL_DISCARD(wapbl_vptomp(vp));
			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
		}
		KASSERT(error == 0);
		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
			spec_node_revoke(vp);
		}
	}
	if (active) {
		VOP_INACTIVE(vp, &recycle);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VI_XLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp);
	}

	/* Disassociate the underlying file system from the vnode. */
	if (VOP_RECLAIM(vp)) {
		vnpanic(vp, "%s: cannot reclaim", __func__);
	}

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);

	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}

	/* Purge name cache. */
	cache_purge(vp);

	/* Move to dead mount. */
	vp->v_vflag &= ~VV_ROOT;
	atomic_inc_uint(&dead_mount->mnt_refcnt);
	vfs_insmntque(vp, dead_mount);

	/* Done with purge, notify sleepers of the grim news. */
	mutex_enter(vp->v_interlock);
	if (doclose) {
		vp->v_op = dead_vnodeop_p;
		vp->v_vflag |= VV_LOCKSWORK;
		vp->v_iflag |= VI_CLEAN;
	} else {
		vp->v_op = spec_vnodeop_p;
		vp->v_vflag &= ~VV_LOCKSWORK;
	}
	vp->v_tag = VT_NON;
	KNOTE(&vp->v_klist, NOTE_REVOKE);
	vp->v_iflag &= ~VI_XLOCK;
	cv_broadcast(&vp->v_cv);

	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}
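/*
 * Note on the state after vclean(): a fully closed vnode ends up in a
 * terminal state, on the dead_mount list with v_op == dead_vnodeop_p,
 * VI_CLEAN set, VI_XLOCK cleared and sleepers woken.  Anyone who slept
 * in vwait() must therefore re-check VI_CLEAN, as vget() above does,
 * before touching the vnode again.
 */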
/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vnode_t *vp, kmutex_t *inter_lkp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	mutex_enter(vp->v_interlock);
	if (vp->v_usecount != 0 || (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) != 0) {
		mutex_exit(vp->v_interlock);
		return 0;
	}
	if (inter_lkp) {
		mutex_exit(inter_lkp);
	}
	vremfree(vp);
	vp->v_usecount = 1;
	KASSERT((vp->v_iflag & VI_CHANGING) == 0);
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
	return 1;
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vrevoke(vnode_t *vp)
{
	vnode_t *vq;
	enum vtype type;
	dev_t dev;

	KASSERT(vp->v_usecount > 0);

	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		mutex_exit(vp->v_interlock);
		return;
	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
		atomic_inc_uint(&vp->v_usecount);
		mutex_exit(vp->v_interlock);
		vgone(vp);
		return;
	} else {
		dev = vp->v_rdev;
		type = vp->v_type;
		mutex_exit(vp->v_interlock);
	}

	while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
		vgone(vq);
	}
}

/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_CHANGING) != 0)
		vwait(vp, VI_CHANGING);
	vp->v_iflag |= VI_CHANGING;
	vclean(vp);
	vrelel(vp, VRELEL_CHANGING_SET);
}

/*
 * Update the outstanding I/O count and do a wakeup if requested.
 */
void
vwakeup(struct buf *bp)
{
	vnode_t *vp;

	if ((vp = bp->b_vp) == NULL)
		return;

	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(bp->b_objlock));

	if (--vp->v_numoutput < 0)
		vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
	if (vp->v_numoutput == 0)
		cv_broadcast(&vp->v_cv);
}

/*
 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
 * recycled.
 */
void
vwait(vnode_t *vp, int flags)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount != 0);

	while ((vp->v_iflag & flags) != 0)
		cv_wait(&vp->v_cv, vp->v_interlock);
}

int
vfs_drainvnodes(long target)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	while (numvnodes > target) {
		error = cleanvnode();
		if (error != 0)
			return error;
		mutex_enter(&vnode_free_list_lock);
	}

	mutex_exit(&vnode_free_list_lock);

	return 0;
}

static void
vnpanic(vnode_t *vp, const char *fmt, ...)
{
	va_list ap;

#ifdef DIAGNOSTIC
	vprint(NULL, vp);
#endif
	va_start(ap, fmt);
	vpanic(fmt, ap);
	va_end(ap);
}