/*	$NetBSD: vfs_vnode.c,v 1.15 2011/12/20 16:49:37 hannken Exp $	*/

/*-
 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * The vnode cache subsystem.
 *
 * Life-cycle
 *
 *	Normally, there are two points where new vnodes are created:
 *	VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 *	starts in one of the following ways:
 *
 *	- Allocation, via getnewvnode(9) and/or vnalloc(9).
 *	- Reclamation of an inactive vnode, via vget(9).
 *
 *	The life-cycle ends when the last reference is dropped, usually
 *	in VOP_REMOVE(9).  In such a case, VOP_INACTIVE(9) is called to
 *	inform the file system that the vnode is inactive.  Via this call,
 *	the file system indicates whether the vnode should be recycled
 *	(usually, the link count is checked, i.e. whether the file was
 *	removed).
 *
 *	Depending on that indication, the vnode is either put onto a free
 *	list (cache), or cleaned via vclean(9), which calls VOP_RECLAIM(9)
 *	to disassociate the underlying file system from the vnode, and
 *	finally destroyed.
 *
 * Reference counting
 *
 *	A vnode is considered active if its reference count
 *	(vnode_t::v_usecount) is non-zero.  The count is maintained using
 *	the vref(9) and vrele(9) routines, as well as vput(9).  Common
 *	points holding references are e.g. open files, the current working
 *	directory, mount points, etc.
 *
 * Note on v_usecount and its locking
 *
 *	At nearly all points where it is known that v_usecount could be
 *	zero, vnode_t::v_interlock will be held.  To change v_usecount away
 *	from zero, the interlock must be held.  To change from a non-zero
 *	value to zero, again the interlock must be held.
 *
 *	There is a flag bit, VC_XLOCK, embedded in v_usecount.  To raise
 *	v_usecount, if the VC_XLOCK bit is set in it, the interlock must
 *	be held.  To modify the VC_XLOCK bit, the interlock must be held.
 *	We always keep the usecount (v_usecount & VC_MASK) non-zero while
 *	the VC_XLOCK bit is set.
 *
 *	Unless the VC_XLOCK bit is set, changing the usecount from a non-zero
 *	value to a non-zero value can safely be done using atomic operations,
 *	without the interlock held.
 *
 *	Even if the VC_XLOCK bit is set, decreasing the usecount to a non-zero
 *	value can be done using atomic operations, without the interlock held.
 *
 *	Note: if VI_CLEAN is set, vnode_t::v_interlock will be released while
 *	mntvnode_lock is still held.
 */
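
/*
 * Illustrative sketch only (not part of this file's interfaces): a typical
 * consumer that already holds v_interlock takes a reference and a lock with
 * vget(9) and drops both with vput(9); additional references held elsewhere
 * are managed with vref(9)/vrele(9).  The error handling shown is a minimal
 * assumption, not a prescription:
 *
 *	mutex_enter(vp->v_interlock);
 *	error = vget(vp, LK_EXCLUSIVE);		(vget drops v_interlock)
 *	if (error == 0) {
 *		...use the locked, referenced vnode...
 *		vput(vp);			(VOP_UNLOCK + vrele)
 *	}
 */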

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.15 2011/12/20 16:49:37 hannken Exp $");

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>

u_int			numvnodes		__cacheline_aligned;

static pool_cache_t	vnode_cache		__read_mostly;
static kmutex_t		vnode_free_list_lock	__cacheline_aligned;

static vnodelst_t	vnode_free_list		__cacheline_aligned;
static vnodelst_t	vnode_hold_list		__cacheline_aligned;
static vnodelst_t	vrele_list		__cacheline_aligned;

static kmutex_t		vrele_lock		__cacheline_aligned;
static kcondvar_t	vrele_cv		__cacheline_aligned;
static lwp_t *		vrele_lwp		__cacheline_aligned;
static int		vrele_pending		__cacheline_aligned;
static int		vrele_gen		__cacheline_aligned;
static kcondvar_t	vdrain_cv		__cacheline_aligned;

static int		cleanvnode(void);
static void		vdrain_thread(void *);
static void		vrele_thread(void *);
static void		vnpanic(vnode_t *, const char *, ...)
    __attribute__((__format__(__printf__, 2, 3)));

/* Routines having to do with the management of the vnode table. */
extern int		(**dead_vnodeop_p)(void *);

void
vfs_vnode_sysinit(void)
{
	int error;

	vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl",
	    NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vnode_cache != NULL);

	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vrele_list);

	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vdrain_cv, "vdrain");
	cv_init(&vrele_cv, "vrele");
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
	    NULL, NULL, "vdrain");
	KASSERT(error == 0);
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
	    NULL, &vrele_lwp, "vrele");
	KASSERT(error == 0);
}

/*
 * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
 * marker vnode.
 */
vnode_t *
vnalloc(struct mount *mp)
{
	vnode_t *vp;

	vp = pool_cache_get(vnode_cache, PR_WAITOK);
	KASSERT(vp != NULL);

	memset(vp, 0, sizeof(*vp));
	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
	cv_init(&vp->v_cv, "vnode");
	/*
	 * Done by memset() above.
	 *	LIST_INIT(&vp->v_nclist);
	 *	LIST_INIT(&vp->v_dnclist);
	 */

	if (mp != NULL) {
		vp->v_mount = mp;
		vp->v_type = VBAD;
		vp->v_iflag = VI_MARKER;
	} else {
		rw_init(&vp->v_lock);
	}

	return vp;
}

/*
 * Free an unused, unreferenced vnode.
 */
void
vnfree(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 0);

	if ((vp->v_iflag & VI_MARKER) == 0) {
		rw_destroy(&vp->v_lock);
		mutex_enter(&vnode_free_list_lock);
		numvnodes--;
		mutex_exit(&vnode_free_list_lock);
	}

	/*
	 * Note: the vnode interlock will either be freed, or its reference
	 * dropped (if VI_LOCKSHARE was in use).
	 */
	uvm_obj_destroy(&vp->v_uobj, true);
	cv_destroy(&vp->v_cv);
	pool_cache_put(vnode_cache, vp);
}

/*
 * cleanvnode: grab a vnode from the freelist, clean and free it.
 *
 * => Releases vnode_free_list_lock.
 */
static int
cleanvnode(void)
{
	vnode_t *vp;
	vnodelst_t *listhd;

	KASSERT(mutex_owned(&vnode_free_list_lock));
retry:
	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		/*
		 * It's safe to test v_usecount and v_iflag
		 * without holding the interlock here, since
		 * referenced or already-clean vnodes should
		 * never appear on these lists.
		 */
		KASSERT(vp->v_usecount == 0);
		KASSERT((vp->v_iflag & VI_CLEAN) == 0);
		KASSERT(vp->v_freelisthd == listhd);

		if (!mutex_tryenter(vp->v_interlock))
			continue;
		if ((vp->v_iflag & VI_XLOCK) == 0)
			break;
		mutex_exit(vp->v_interlock);
	}

	if (vp == NULL) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		mutex_exit(&vnode_free_list_lock);
		return EBUSY;
	}

	/* Remove it from the freelist. */
	TAILQ_REMOVE(listhd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);

	KASSERT(vp->v_usecount == 0);

	/*
	 * The vnode is still associated with a file system, so we must
	 * clean it out before freeing it.  We need to add a reference
	 * before doing this.  If the vnode gains another reference while
	 * being cleaned out then we lose - retry.
	 */
	atomic_add_int(&vp->v_usecount, 1 + VC_XLOCK);
	vclean(vp, DOCLOSE);
	KASSERT(vp->v_usecount >= 1 + VC_XLOCK);
	atomic_add_int(&vp->v_usecount, -VC_XLOCK);
	if (vp->v_usecount > 1) {
		/*
		 * Don't return to freelist - the holder of the last
		 * reference will destroy it.
		 */
		vrelel(vp, 0); /* releases vp->v_interlock */
		mutex_enter(&vnode_free_list_lock);
		goto retry;
	}

	KASSERT((vp->v_iflag & VI_CLEAN) == VI_CLEAN);
	mutex_exit(vp->v_interlock);
	if (vp->v_type == VBLK || vp->v_type == VCHR) {
		spec_node_destroy(vp);
	}
	vp->v_type = VNON;

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);
	KASSERT(TAILQ_EMPTY(&vp->v_uobj.memq));
	KASSERT(vp->v_numoutput == 0);
	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);

	vrele(vp);

	return 0;
}

/*
 * getnewvnode: return a fresh vnode.
 *
 * => Returns referenced vnode, moved into the mount queue.
 * => Shares the interlock specified by 'slock', if it is not NULL.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    kmutex_t *slock, vnode_t **vpp)
{
	struct uvm_object *uobj;
	vnode_t *vp;
	int error = 0;

	if (mp != NULL) {
		/*
		 * Mark filesystem busy while we are creating a vnode.
		 * If unmount is in progress, this will fail.
		 */
		error = vfs_busy(mp, NULL);
		if (error)
			return error;
	}

	vp = NULL;

	/* Allocate a new vnode. */
	mutex_enter(&vnode_free_list_lock);
	numvnodes++;
	if (numvnodes > desiredvnodes + desiredvnodes / 10)
		cv_signal(&vdrain_cv);
	mutex_exit(&vnode_free_list_lock);
	vp = vnalloc(NULL);

	KASSERT(vp->v_freelisthd == NULL);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));

	/* Initialize vnode. */
	vp->v_usecount = 1;
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	vp->v_data = NULL;

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	/* Share the vnode_t::v_interlock, if requested. */
	if (slock) {
		/* Set the interlock and mark that it is shared. */
		KASSERT(vp->v_mount == NULL);
		mutex_obj_hold(slock);
		uvm_obj_setlock(&vp->v_uobj, slock);
		KASSERT(vp->v_interlock == slock);
		vp->v_iflag |= VI_LOCKSHARE;
	}

	/* Finally, move vnode into the mount queue. */
	vfs_insmntque(vp, mp);

	if (mp != NULL) {
		if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
			vp->v_vflag |= VV_MPSAFE;
		vfs_unbusy(mp, true, NULL);
	}

	*vpp = vp;
	return 0;
}
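
/*
 * Illustrative sketch only: a file system's lookup/create path is assumed
 * to obtain a fresh vnode roughly as follows.  The tag, vnodeop vector and
 * inode pool named here are hypothetical and not defined in this file:
 *
 *	error = getnewvnode(VT_EXAMPLE, mp, example_vnodeop_p, NULL, &vp);
 *	if (error != 0)
 *		return error;
 *	ip = pool_get(&example_inode_pool, PR_WAITOK);
 *	...initialize the inode, then tie it to the vnode...
 *	vp->v_data = ip;
 *	vp->v_type = VREG;
 *	*vpp = vp;
 */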

/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vnode_t *vp)
{

	KASSERT(vp->v_usecount == 1);
	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_freelisthd == NULL);

	mutex_enter(vp->v_interlock);
	vp->v_iflag |= VI_CLEAN;
	vrelel(vp, 0);
}

/*
 * Helper thread to keep the number of vnodes below desiredvnodes.
 */
static void
vdrain_thread(void *cookie)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	for (;;) {
		cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
		while (numvnodes > desiredvnodes) {
			error = cleanvnode();
			if (error)
				kpause("vndsbusy", false, hz, NULL);
			mutex_enter(&vnode_free_list_lock);
			if (error)
				break;
		}
	}
}

/*
 * Remove a vnode from its freelist.
 */
void
vremfree(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount == 0);

	/*
	 * Note that the reference count must not change until
	 * the vnode is removed.
	 */
	mutex_enter(&vnode_free_list_lock);
	if (vp->v_holdcnt > 0) {
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
	} else {
		KASSERT(vp->v_freelisthd == &vnode_free_list);
	}
	TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);
}

/*
 * Try to gain a reference to a vnode, without acquiring its interlock.
 * The caller must hold a lock that will prevent the vnode from being
 * recycled or freed.
 */
bool
vtryget(vnode_t *vp)
{
	u_int use, next;

	/*
	 * If the vnode is being freed, don't make life any harder
	 * for vclean() by adding another reference without waiting.
	 * This is not strictly necessary, but we'll do it anyway.
	 */
	if (__predict_false((vp->v_iflag & VI_XLOCK) != 0)) {
		return false;
	}
	for (use = vp->v_usecount;; use = next) {
		if (use == 0 || __predict_false((use & VC_XLOCK) != 0)) {
			/* Need interlock held if first reference. */
			return false;
		}
		next = atomic_cas_uint(&vp->v_usecount, use, use + 1);
		if (__predict_true(next == use)) {
			return true;
		}
	}
}
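
/*
 * Illustrative sketch only: a caller holding some other lock that pins the
 * vnode (the name cache is the usual example) is assumed to try the
 * lock-free path first and fall back to vget() under the interlock:
 *
 *	if (vtryget(vp)) {
 *		...got a reference without touching v_interlock...
 *	} else {
 *		mutex_enter(vp->v_interlock);
 *		...drop the pinning lock before sleeping...
 *		error = vget(vp, 0);
 *	}
 */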

/*
 * vget: get a particular vnode from the free list, increment its reference
 * count and lock it.
 *
 * => Should be called with v_interlock held.
 *
 * If VI_XLOCK is set, the vnode is being eliminated in vgone()/vclean().
 * In that case, we cannot grab the vnode, so the process is awakened when
 * the transition is completed, and an error returned to indicate that the
 * vnode is no longer usable (e.g. changed to a new file system type).
 */
int
vget(vnode_t *vp, int flags)
{
	int error = 0;

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT)) == 0);

	/*
	 * Before adding a reference, we must remove the vnode
	 * from its freelist.
	 */
	if (vp->v_usecount == 0) {
		vremfree(vp);
		vp->v_usecount = 1;
	} else {
		atomic_inc_uint(&vp->v_usecount);
	}

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking if
	 * the VI_XLOCK flag is set.
	 */
	if ((vp->v_iflag & VI_XLOCK) != 0) {
		if ((flags & LK_NOWAIT) != 0) {
			vrelel(vp, 0);
			return EBUSY;
		}
		vwait(vp, VI_XLOCK);
		vrelel(vp, 0);
		return ENOENT;
	}

	/*
	 * Ok, we got it in good shape.  Just locking left.
	 */
	KASSERT((vp->v_iflag & VI_CLEAN) == 0);
	mutex_exit(vp->v_interlock);
	if (flags & (LK_EXCLUSIVE | LK_SHARED)) {
		error = vn_lock(vp, flags);
		if (error != 0) {
			vrele(vp);
		}
	}
	return error;
}

/*
 * vput: unlock and release the reference.
 */
void
vput(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	VOP_UNLOCK(vp);
	vrele(vp);
}

/*
 * Try to drop a reference on a vnode.  Abort if we are releasing the
 * last reference.  Note: this _must_ succeed if not the last reference.
 */
static inline bool
vtryrele(vnode_t *vp)
{
	u_int use, next;

	for (use = vp->v_usecount;; use = next) {
		if (use == 1) {
			return false;
		}
		KASSERT((use & VC_MASK) > 1);
		next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
		if (__predict_true(next == use)) {
			return true;
		}
	}
}

/*
 * Vnode release.  If the reference count drops to zero, call the inactive
 * routine and either return the vnode to the freelist or free it to the
 * pool.
 */
void
vrelel(vnode_t *vp, int flags)
{
	bool recycle, defer;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_freelisthd == NULL);

	if (__predict_false(vp->v_op == dead_vnodeop_p &&
	    (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
		vnpanic(vp, "dead but not clean");
	}

	/*
	 * If not the last reference, just drop the reference count
	 * and unlock.
	 */
	if (vtryrele(vp)) {
		vp->v_iflag |= VI_INACTREDO;
		mutex_exit(vp->v_interlock);
		return;
	}
	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
		vnpanic(vp, "%s: bad ref count", __func__);
	}

	KASSERT((vp->v_iflag & VI_XLOCK) == 0);

#ifdef DIAGNOSTIC
	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
		vprint("vrelel: missing VOP_CLOSE()", vp);
	}
#endif

	/*
	 * If not clean, deactivate the vnode, but preserve
	 * our reference across the call to VOP_INACTIVE().
	 */
retry:
	if ((vp->v_iflag & VI_CLEAN) == 0) {
		recycle = false;
		vp->v_iflag |= VI_INACTNOW;

		/*
		 * XXX This ugly block can be largely eliminated if
		 * locking is pushed down into the file systems.
		 *
		 * Defer vnode release to vrele_thread if caller
		 * requests it explicitly.
		 */
		if ((curlwp == uvm.pagedaemon_lwp) ||
		    (flags & VRELEL_ASYNC_RELE) != 0) {
			/* The pagedaemon can't wait around; defer. */
			defer = true;
		} else if (curlwp == vrele_lwp) {
			/* We have to try harder. */
			vp->v_iflag &= ~VI_INACTREDO;
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE);
			if (error != 0) {
				/* XXX */
				vnpanic(vp, "%s: unable to lock %p",
				    __func__, vp);
			}
			defer = false;
		} else if ((vp->v_iflag & VI_LAYER) != 0) {
			/*
			 * Acquiring the stack's lock in vclean() even
			 * for an honest vput/vrele is dangerous because
			 * our caller may hold other vnode locks; defer.
			 */
			defer = true;
		} else {
			/* If we can't acquire the lock, then defer. */
			vp->v_iflag &= ~VI_INACTREDO;
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
			if (error != 0) {
				defer = true;
				mutex_enter(vp->v_interlock);
			} else {
				defer = false;
			}
		}

		if (defer) {
			/*
			 * Defer reclaim to the kthread; it's not safe to
			 * clean it here.  We donate it our last reference.
			 */
			KASSERT(mutex_owned(vp->v_interlock));
			KASSERT((vp->v_iflag & VI_INACTPEND) == 0);
			vp->v_iflag &= ~VI_INACTNOW;
			vp->v_iflag |= VI_INACTPEND;
			mutex_enter(&vrele_lock);
			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
			if (++vrele_pending > (desiredvnodes >> 8))
				cv_signal(&vrele_cv);
			mutex_exit(&vrele_lock);
			mutex_exit(vp->v_interlock);
			return;
		}

		/*
		 * The vnode can gain another reference while being
		 * deactivated.  If VOP_INACTIVE() indicates that
		 * the described file has been deleted, then recycle
		 * the vnode irrespective of additional references.
		 * Another thread may be waiting to re-use the on-disk
		 * inode.
		 *
		 * Note that VOP_INACTIVE() will drop the vnode lock.
		 */
		VOP_INACTIVE(vp, &recycle);
		mutex_enter(vp->v_interlock);
		vp->v_iflag &= ~VI_INACTNOW;
		if (!recycle) {
			if (vtryrele(vp)) {
				mutex_exit(vp->v_interlock);
				return;
			}

			/*
			 * If we grew another reference while
			 * VOP_INACTIVE() was underway, retry.
			 */
			if ((vp->v_iflag & VI_INACTREDO) != 0) {
				goto retry;
			}
		}

		/* Take care of space accounting. */
		if (vp->v_iflag & VI_EXECMAP) {
			atomic_add_int(&uvmexp.execpages,
			    -vp->v_uobj.uo_npages);
			atomic_add_int(&uvmexp.filepages,
			    vp->v_uobj.uo_npages);
		}
		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
		vp->v_vflag &= ~VV_MAPPED;

		/*
		 * Recycle the vnode if the file is now unused (unlinked),
		 * otherwise just free it.
		 */
		if (recycle) {
			vclean(vp, DOCLOSE);
		}
		KASSERT(vp->v_usecount > 0);
	}

	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
		/* Gained another reference while being reclaimed. */
		mutex_exit(vp->v_interlock);
		return;
	}

	if ((vp->v_iflag & VI_CLEAN) != 0) {
		/*
		 * It's clean so destroy it.  It isn't referenced
		 * anywhere since it has been reclaimed.
		 */
		KASSERT(vp->v_holdcnt == 0);
		KASSERT(vp->v_writecount == 0);
		mutex_exit(vp->v_interlock);
		vfs_insmntque(vp, NULL);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vnfree(vp);
	} else {
		/*
		 * Otherwise, put it back onto the freelist.  It
		 * can't be destroyed while still associated with
		 * a file system.
		 */
		mutex_enter(&vnode_free_list_lock);
		if (vp->v_holdcnt > 0) {
			vp->v_freelisthd = &vnode_hold_list;
		} else {
			vp->v_freelisthd = &vnode_free_list;
		}
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
		mutex_exit(vp->v_interlock);
	}
}

void
vrele(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, 0);
}

/*
 * Asynchronous vnode release: the vnode is released in a different context.
 */
void
vrele_async(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
		return;
	}
	mutex_enter(vp->v_interlock);
	vrelel(vp, VRELEL_ASYNC_RELE);
}

static void
vrele_thread(void *cookie)
{
	vnode_t *vp;

	for (;;) {
		mutex_enter(&vrele_lock);
		while (TAILQ_EMPTY(&vrele_list)) {
			vrele_gen++;
			cv_broadcast(&vrele_cv);
			cv_timedwait(&vrele_cv, &vrele_lock, hz);
		}
		vp = TAILQ_FIRST(&vrele_list);
		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
		vrele_pending--;
		mutex_exit(&vrele_lock);

		/*
		 * If not the last reference, then ignore the vnode
		 * and look for more work.
		 */
		mutex_enter(vp->v_interlock);
		KASSERT((vp->v_iflag & VI_INACTPEND) != 0);
		vp->v_iflag &= ~VI_INACTPEND;
		vrelel(vp, 0);
	}
}

void
vrele_flush(void)
{
	int gen;

	mutex_enter(&vrele_lock);
	gen = vrele_gen;
	while (vrele_pending && gen == vrele_gen) {
		cv_broadcast(&vrele_cv);
		cv_wait(&vrele_cv, &vrele_lock);
	}
	mutex_exit(&vrele_lock);
}

/*
 * Vnode reference, where a reference is already held by some other
 * object (for example, a file structure).
 */
void
vref(vnode_t *vp)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	atomic_inc_uint(&vp->v_usecount);
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
 */
void
vholdl(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_free_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_hold_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(vnode_t *vp)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	if (vp->v_holdcnt <= 0) {
		vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
	}

	vp->v_holdcnt--;
	if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		mutex_enter(&vnode_free_list_lock);
		KASSERT(vp->v_freelisthd == &vnode_hold_list);
		TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
		vp->v_freelisthd = &vnode_free_list;
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
	}
}

/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the interlock held, and will return with it held.
 */
void
vclean(vnode_t *vp, int flags)
{
	lwp_t *l = curlwp;
	bool recycle, active;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((vp->v_iflag & VI_MARKER) == 0);
	KASSERT(vp->v_usecount != 0);

	/* If cleaning is already in progress, wait until done and return. */
	if (vp->v_iflag & VI_XLOCK) {
		vwait(vp, VI_XLOCK);
		return;
	}

	/* If already clean, nothing to do. */
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		return;
	}

	/*
	 * Prevent the vnode from being recycled or brought into use
	 * while we clean it out.
	 */
	vp->v_iflag |= VI_XLOCK;
	if (vp->v_iflag & VI_EXECMAP) {
		atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
		atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
	}
	vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
	active = (vp->v_usecount & VC_MASK) > 1;

	/* XXXAD should not lock vnode under layer */
	mutex_exit(vp->v_interlock);
	VOP_LOCK(vp, LK_EXCLUSIVE);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (flags & DOCLOSE) {
		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
		if (error != 0) {
			/* XXX, fix vn_start_write's grab of mp and use that. */

			if (wapbl_vphaswapbl(vp))
				WAPBL_DISCARD(wapbl_vptomp(vp));
			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
		}
		KASSERT(error == 0);
		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
			spec_node_revoke(vp);
		}
	}
	if (active) {
		VOP_INACTIVE(vp, &recycle);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VI_XLOCK to clear, then call the new lock
		 * operation.
		 */
		VOP_UNLOCK(vp);
	}

	/* Disassociate the underlying file system from the vnode. */
	if (VOP_RECLAIM(vp)) {
		vnpanic(vp, "%s: cannot reclaim", __func__);
	}

	KASSERT(vp->v_data == NULL);
	KASSERT(vp->v_uobj.uo_npages == 0);

	if (vp->v_type == VREG && vp->v_ractx != NULL) {
		uvm_ra_freectx(vp->v_ractx);
		vp->v_ractx = NULL;
	}

	/* Purge name cache. */
	cache_purge(vp);

	/* Done with purge, notify sleepers of the grim news. */
	mutex_enter(vp->v_interlock);
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	KNOTE(&vp->v_klist, NOTE_REVOKE);
	vp->v_iflag &= ~VI_XLOCK;
	vp->v_vflag &= ~VV_LOCKSWORK;
	if ((flags & DOCLOSE) != 0) {
		vp->v_iflag |= VI_CLEAN;
	}
	cv_broadcast(&vp->v_cv);

	KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l)
{

	KASSERT((vp->v_iflag & VI_MARKER) == 0);

	mutex_enter(vp->v_interlock);
	if (vp->v_usecount != 0) {
		mutex_exit(vp->v_interlock);
		return 0;
	}
	if (inter_lkp) {
		mutex_exit(inter_lkp);
	}
	vremfree(vp);
	vp->v_usecount = 1;
	vclean(vp, DOCLOSE);
	vrelel(vp, 0);
	return 1;
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vrevoke(vnode_t *vp)
{
	vnode_t *vq, **vpp;
	enum vtype type;
	dev_t dev;

	KASSERT(vp->v_usecount > 0);

	mutex_enter(vp->v_interlock);
	if ((vp->v_iflag & VI_CLEAN) != 0) {
		mutex_exit(vp->v_interlock);
		return;
	} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
		atomic_inc_uint(&vp->v_usecount);
		vclean(vp, DOCLOSE);
		vrelel(vp, 0);
		return;
	} else {
		dev = vp->v_rdev;
		type = vp->v_type;
		mutex_exit(vp->v_interlock);
	}

	vpp = &specfs_hash[SPECHASH(dev)];
	mutex_enter(&device_lock);
	for (vq = *vpp; vq != NULL;) {
		/* If clean or being cleaned, then ignore it. */
		mutex_enter(vq->v_interlock);
		if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 ||
		    vq->v_type != type || vq->v_rdev != dev) {
			mutex_exit(vq->v_interlock);
			vq = vq->v_specnext;
			continue;
		}
		mutex_exit(&device_lock);
		if (vq->v_usecount == 0) {
			vremfree(vq);
			vq->v_usecount = 1;
		} else {
			atomic_inc_uint(&vq->v_usecount);
		}
		vclean(vq, DOCLOSE);
		vrelel(vq, 0);
		mutex_enter(&device_lock);
		vq = *vpp;
	}
	mutex_exit(&device_lock);
}

/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{

	mutex_enter(vp->v_interlock);
	vclean(vp, DOCLOSE);
	vrelel(vp, 0);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(struct buf *bp)
{
	vnode_t *vp;

	if ((vp = bp->b_vp) == NULL)
		return;

	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(bp->b_objlock));

	if (--vp->v_numoutput < 0)
		vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
	if (vp->v_numoutput == 0)
		cv_broadcast(&vp->v_cv);
}

/*
 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
 * recycled.
 */
void
vwait(vnode_t *vp, int flags)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_usecount != 0);

	while ((vp->v_iflag & flags) != 0)
		cv_wait(&vp->v_cv, vp->v_interlock);
}

int
vfs_drainvnodes(long target)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	while (numvnodes > target) {
		error = cleanvnode();
		if (error != 0)
			return error;
		mutex_enter(&vnode_free_list_lock);
	}

	mutex_exit(&vnode_free_list_lock);

	return 0;
}

static void
vnpanic(vnode_t *vp, const char *fmt, ...)
{
	va_list ap;

#ifdef DIAGNOSTIC
	vprint(NULL, vp);
#endif
	va_start(ap, fmt);
	vpanic(fmt, ap);
	va_end(ap);
}