/*	$NetBSD: vfs_vnode.c,v 1.9 2011/06/12 03:35:57 rmind Exp $	*/

/*-
 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * The vnode cache subsystem.
 *
 * Life-cycle
 *
 *    Normally, there are two points where new vnodes are created:
 *    VOP_CREATE(9) and VOP_LOOKUP(9).  The life-cycle of a vnode
 *    starts in one of the following ways:
 *
 *    - Allocation, via getnewvnode(9) and/or vnalloc(9).
 *    - Recycle from a free list, via getnewvnode(9) -> getcleanvnode(9).
 *    - Reclamation of an inactive vnode, via vget(9).
 *
 *    The life-cycle ends when the last reference is dropped, usually
 *    in VOP_REMOVE(9).  In that case, VOP_INACTIVE(9) is called to inform
 *    the file system that the vnode is now inactive.  Via this call, the
 *    file system indicates whether the vnode should be recycled (usually,
 *    the link count is checked, i.e. whether the file was removed).
 *
 *    Depending on that indication, the vnode can be put onto a free list
 *    (cache), or cleaned via vclean(9), which calls VOP_RECLAIM(9) to
 *    disassociate the underlying file system from the vnode, and finally
 *    destroyed.
 *
 * Reference counting
 *
 *    A vnode is considered active if its reference count
 *    (vnode_t::v_usecount) is non-zero.  The count is maintained using
 *    the vref(9) and vrele(9) routines, as well as vput(9).  Common
 *    points holding references are, e.g., open files, the current
 *    working directory, mount points, etc.
 *
 * Note on v_usecount and its locking
 *
 *    At nearly all points where it is known that v_usecount could be
 *    zero, vnode_t::v_interlock will be held.  To change v_usecount away
 *    from zero, the interlock must be held.  To change it from a non-zero
 *    value to zero, the interlock must again be held.
 *
 *    There is a flag bit, VC_XLOCK, embedded in v_usecount.  To raise
 *    v_usecount while the VC_XLOCK bit is set, the interlock must be
 *    held.  To modify the VC_XLOCK bit, the interlock must be held.
 *    We always keep the usecount (v_usecount & VC_MASK) non-zero while
 *    the VC_XLOCK bit is set.
 *
 *    Unless the VC_XLOCK bit is set, changing the usecount from one
 *    non-zero value to another non-zero value can safely be done using
 *    atomic operations, without the interlock held.
 *
 *    Even if the VC_XLOCK bit is set, decreasing the usecount to a
 *    non-zero value can be done using atomic operations, without the
 *    interlock held.
 *
 *    Note: if VI_CLEAN is set, vnode_t::v_interlock will be released
 *    while mntvnode_lock is still held.
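 *
 * Usage sketch
 *
 *    As an illustration of the rules above, a minimal, hypothetical
 *    consumer-side sequence might look like the one below ("vp" is
 *    assumed to come from some index that keeps it from being freed
 *    before the interlock is taken; error handling is elided):
 *
 *        mutex_enter(vp->v_interlock);
 *        error = vget(vp, LK_EXCLUSIVE);   - adds a reference, drops the
 *                                            interlock and locks the vnode
 *        if (error == 0) {
 *                ... use the vnode ...
 *                vput(vp);                 - VOP_UNLOCK(vp) plus vrele(vp)
 *        }
 *
 *    While a reference is held, additional references can be taken with
 *    vref(9) without the interlock, since v_usecount is already non-zero;
 *    dropping the count back to zero again requires the interlock, which
 *    vrele(9)/vrelel(9) take internally.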
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.9 2011/06/12 03:35:57 rmind Exp $");

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>

u_int                   numvnodes               __cacheline_aligned;

static pool_cache_t     vnode_cache             __read_mostly;
static kmutex_t         vnode_free_list_lock    __cacheline_aligned;

static vnodelst_t       vnode_free_list         __cacheline_aligned;
static vnodelst_t       vnode_hold_list         __cacheline_aligned;
static vnodelst_t       vrele_list              __cacheline_aligned;

static kmutex_t         vrele_lock              __cacheline_aligned;
static kcondvar_t       vrele_cv                __cacheline_aligned;
static lwp_t *          vrele_lwp               __cacheline_aligned;
static int              vrele_pending           __cacheline_aligned;
static int              vrele_gen               __cacheline_aligned;

static vnode_t *        getcleanvnode(void);
static void             vrele_thread(void *);
static void             vpanic(vnode_t *, const char *);

/* Routines having to do with the management of the vnode table. */
extern int              (**dead_vnodeop_p)(void *);

void
vfs_vnode_sysinit(void)
{
        int error;

        vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl",
            NULL, IPL_NONE, NULL, NULL, NULL);
        KASSERT(vnode_cache != NULL);

        mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
        TAILQ_INIT(&vnode_free_list);
        TAILQ_INIT(&vnode_hold_list);
        TAILQ_INIT(&vrele_list);

        mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
        cv_init(&vrele_cv, "vrele");
        error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
            NULL, &vrele_lwp, "vrele");
        KASSERT(error == 0);
}

/*
 * Allocate a new, uninitialized vnode.  If 'mp' is non-NULL, this is a
 * marker vnode and we are prepared to wait for the allocation.
 */
vnode_t *
vnalloc(struct mount *mp)
{
        vnode_t *vp;

        vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT));
        if (vp == NULL) {
                return NULL;
        }

        memset(vp, 0, sizeof(*vp));
        uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
        cv_init(&vp->v_cv, "vnode");
        /*
         * Done by memset() above.
         *      LIST_INIT(&vp->v_nclist);
         *      LIST_INIT(&vp->v_dnclist);
         */

        if (mp != NULL) {
                vp->v_mount = mp;
                vp->v_type = VBAD;
                vp->v_iflag = VI_MARKER;
        } else {
                rw_init(&vp->v_lock);
        }

        return vp;
}

/*
 * Free an unused, unreferenced vnode.
 */
void
vnfree(vnode_t *vp)
{

        KASSERT(vp->v_usecount == 0);

        if ((vp->v_iflag & VI_MARKER) == 0) {
                rw_destroy(&vp->v_lock);
                mutex_enter(&vnode_free_list_lock);
                numvnodes--;
                mutex_exit(&vnode_free_list_lock);
        }

        /*
         * Note: the vnode interlock will either be freed, or its reference
         * dropped (if VI_LOCKSHARE was in use).
         */
        uvm_obj_destroy(&vp->v_uobj, true);
        cv_destroy(&vp->v_cv);
        pool_cache_put(vnode_cache, vp);
}

/*
 * getcleanvnode: grab a vnode from the freelist and clean it.
 *
 * => Releases vnode_free_list_lock.
 * => Returns a referenced vnode on success.
 */
static vnode_t *
getcleanvnode(void)
{
        vnode_t *vp;
        vnodelst_t *listhd;

        KASSERT(mutex_owned(&vnode_free_list_lock));
retry:
        listhd = &vnode_free_list;
try_nextlist:
        TAILQ_FOREACH(vp, listhd, v_freelist) {
                /*
                 * It's safe to test v_usecount and v_iflag
                 * without holding the interlock here, since
                 * vnodes that are in use or already clean
                 * should never appear on these lists.
                 */
                KASSERT(vp->v_usecount == 0);
                KASSERT((vp->v_iflag & VI_CLEAN) == 0);
                KASSERT(vp->v_freelisthd == listhd);

                if (!mutex_tryenter(vp->v_interlock))
                        continue;
                if ((vp->v_iflag & VI_XLOCK) == 0)
                        break;
                mutex_exit(vp->v_interlock);
        }

        if (vp == NULL) {
                if (listhd == &vnode_free_list) {
                        listhd = &vnode_hold_list;
                        goto try_nextlist;
                }
                mutex_exit(&vnode_free_list_lock);
                return NULL;
        }

        /* Remove it from the freelist. */
        TAILQ_REMOVE(listhd, vp, v_freelist);
        vp->v_freelisthd = NULL;
        mutex_exit(&vnode_free_list_lock);

        KASSERT(vp->v_usecount == 0);

        /*
         * The vnode is still associated with a file system, so we must
         * clean it out before reusing it.  We need to add a reference
         * before doing this.  If the vnode gains another reference while
         * being cleaned out, then we lose - retry.
         */
        atomic_add_int(&vp->v_usecount, 1 + VC_XLOCK);
        vclean(vp, DOCLOSE);
        KASSERT(vp->v_usecount >= 1 + VC_XLOCK);
        atomic_add_int(&vp->v_usecount, -VC_XLOCK);
        if (vp->v_usecount == 1) {
                /* We're about to dirty it. */
                vp->v_iflag &= ~VI_CLEAN;
                mutex_exit(vp->v_interlock);
                if (vp->v_type == VBLK || vp->v_type == VCHR) {
                        spec_node_destroy(vp);
                }
                vp->v_type = VNON;
        } else {
                /*
                 * Don't return to the freelist - the holder of the last
                 * reference will destroy it.
                 */
                vrelel(vp, 0); /* releases vp->v_interlock */
                mutex_enter(&vnode_free_list_lock);
                goto retry;
        }

        KASSERT(vp->v_data == NULL);
        KASSERT(vp->v_uobj.uo_npages == 0);
        KASSERT(TAILQ_EMPTY(&vp->v_uobj.memq));
        KASSERT(vp->v_numoutput == 0);
        KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);

        return vp;
}

/*
 * getnewvnode: return the next vnode from the free list.
 *
 * => Returns a referenced vnode, moved into the mount queue.
 * => Shares the interlock specified by 'slock', if it is not NULL.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
    kmutex_t *slock, vnode_t **vpp)
{
        struct uvm_object *uobj;
        static int toggle;
        vnode_t *vp;
        int error = 0, tryalloc;

try_again:
        if (mp != NULL) {
                /*
                 * Mark the filesystem busy while we are creating a vnode.
                 * If unmount is in progress, this will fail.
                 */
                error = vfs_busy(mp, NULL);
                if (error)
                        return error;
        }

        /*
         * We must choose whether to allocate a new vnode or recycle an
         * existing one.  The criterion for allocating a new one is that
         * the total number of vnodes is less than the number desired or
         * there are no vnodes on either free list.  Generally we only
         * want to recycle vnodes that have no buffers associated with
         * them, so we look first on the vnode_free_list.  If it is empty,
         * we next consider vnodes with referencing buffers on the
         * vnode_hold_list.
         * The toggle ensures that half the time we will recycle a vnode
         * from the vnode_hold_list, and half the time we will allocate a
         * new one unless the list has grown to twice the desired size.
         * We are reluctant to recycle vnodes from the vnode_hold_list
         * because we will lose the identity of all their referencing
         * buffers.
         */

        vp = NULL;

        mutex_enter(&vnode_free_list_lock);

        toggle ^= 1;
        if (numvnodes > 2 * desiredvnodes)
                toggle = 0;

        tryalloc = numvnodes < desiredvnodes ||
            (TAILQ_FIRST(&vnode_free_list) == NULL &&
            (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));

        if (tryalloc) {
                /* Allocate a new vnode. */
                numvnodes++;
                mutex_exit(&vnode_free_list_lock);
                if ((vp = vnalloc(NULL)) == NULL) {
                        mutex_enter(&vnode_free_list_lock);
                        numvnodes--;
                } else
                        vp->v_usecount = 1;
        }

        if (vp == NULL) {
                /* Recycle and get a clean vnode. */
                vp = getcleanvnode();
                if (vp == NULL) {
                        if (mp != NULL) {
                                vfs_unbusy(mp, false, NULL);
                        }
                        if (tryalloc) {
                                printf("WARNING: unable to allocate new "
                                    "vnode, retrying...\n");
                                kpause("newvn", false, hz, NULL);
                                goto try_again;
                        }
                        tablefull("vnode", "increase kern.maxvnodes or NVNODE");
                        *vpp = NULL;
                        return ENFILE;
                }
                if ((vp->v_iflag & VI_LOCKSHARE) != 0 || slock) {
                        /* We must remove the vnode from its old mount point. */
                        if (vp->v_mount) {
                                vfs_insmntque(vp, NULL);
                        }
                        /* Allocate a new interlock, if it was shared. */
                        if (vp->v_iflag & VI_LOCKSHARE) {
                                uvm_obj_setlock(&vp->v_uobj, NULL);
                                vp->v_iflag &= ~VI_LOCKSHARE;
                        }
                }
                vp->v_iflag = 0;
                vp->v_vflag = 0;
                vp->v_uflag = 0;
                vp->v_socket = NULL;
        }

        KASSERT(vp->v_usecount == 1);
        KASSERT(vp->v_freelisthd == NULL);
        KASSERT(LIST_EMPTY(&vp->v_nclist));
        KASSERT(LIST_EMPTY(&vp->v_dnclist));

        /* Initialize the vnode. */
        vp->v_type = VNON;
        vp->v_tag = tag;
        vp->v_op = vops;
        vp->v_data = NULL;

        uobj = &vp->v_uobj;
        KASSERT(uobj->pgops == &uvm_vnodeops);
        KASSERT(uobj->uo_npages == 0);
        KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
        vp->v_size = vp->v_writesize = VSIZENOTSET;

        /* Share the vnode_t::v_interlock, if requested. */
        if (slock) {
                /* Set the interlock and mark that it is shared. */
                KASSERT(vp->v_mount == NULL);
                mutex_obj_hold(slock);
                uvm_obj_setlock(&vp->v_uobj, slock);
                KASSERT(vp->v_interlock == slock);
                vp->v_iflag |= VI_LOCKSHARE;
        }

        /* Finally, move the vnode into the mount queue. */
        vfs_insmntque(vp, mp);

        if (mp != NULL) {
                if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
                        vp->v_vflag |= VV_MPSAFE;
                vfs_unbusy(mp, true, NULL);
        }

        *vpp = vp;
        return 0;
}

/*
 * This is really just the reverse of getnewvnode().  Needed for
 * VFS_VGET functions that may need to push back a vnode in case
 * of a locking race.
 */
void
ungetnewvnode(vnode_t *vp)
{

        KASSERT(vp->v_usecount == 1);
        KASSERT(vp->v_data == NULL);
        KASSERT(vp->v_freelisthd == NULL);

        mutex_enter(vp->v_interlock);
        vp->v_iflag |= VI_CLEAN;
        vrelel(vp, 0);
}

/*
 * Remove a vnode from its freelist.
 */
void
vremfree(vnode_t *vp)
{

        KASSERT(mutex_owned(vp->v_interlock));
        KASSERT(vp->v_usecount == 0);

        /*
         * Note that the reference count must not change until
         * the vnode is removed.
         */
        mutex_enter(&vnode_free_list_lock);
        if (vp->v_holdcnt > 0) {
                KASSERT(vp->v_freelisthd == &vnode_hold_list);
        } else {
                KASSERT(vp->v_freelisthd == &vnode_free_list);
        }
        TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
        vp->v_freelisthd = NULL;
        mutex_exit(&vnode_free_list_lock);
}

/*
 * Try to gain a reference to a vnode, without acquiring its interlock.
 * The caller must hold a lock that will prevent the vnode from being
 * recycled or freed.
 */
bool
vtryget(vnode_t *vp)
{
        u_int use, next;

        /*
         * If the vnode is being freed, don't make life any harder
         * for vclean() by adding another reference without waiting.
         * This is not strictly necessary, but we'll do it anyway.
         */
        if (__predict_false((vp->v_iflag & VI_XLOCK) != 0)) {
                return false;
        }
        for (use = vp->v_usecount;; use = next) {
                if (use == 0 || __predict_false((use & VC_XLOCK) != 0)) {
                        /* Need the interlock held if first reference. */
                        return false;
                }
                next = atomic_cas_uint(&vp->v_usecount, use, use + 1);
                if (__predict_true(next == use)) {
                        return true;
                }
        }
}

/*
 * vget: get a particular vnode from the free list, increment its reference
 * count and lock it.
 *
 * => Should be called with v_interlock held.
 *
 * If VI_XLOCK is set, the vnode is being eliminated in vgone()/vclean().
 * In that case, we cannot grab the vnode, so the process is awakened when
 * the transition is completed, and an error returned to indicate that the
 * vnode is no longer usable (e.g. it has been changed to a new file system
 * type).
 */
int
vget(vnode_t *vp, int flags)
{
        int error = 0;

        KASSERT((vp->v_iflag & VI_MARKER) == 0);
        KASSERT(mutex_owned(vp->v_interlock));
        KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT)) == 0);

        /*
         * Before adding a reference, we must remove the vnode
         * from its freelist.
         */
        if (vp->v_usecount == 0) {
                vremfree(vp);
                vp->v_usecount = 1;
        } else {
                atomic_inc_uint(&vp->v_usecount);
        }

        /*
         * If the vnode is in the process of being cleaned out for
         * another use, we wait for the cleaning to finish and then
         * return failure.  Cleaning is determined by checking if
         * the VI_XLOCK flag is set.
         */
        if ((vp->v_iflag & VI_XLOCK) != 0) {
                if ((flags & LK_NOWAIT) != 0) {
                        vrelel(vp, 0);
                        return EBUSY;
                }
                vwait(vp, VI_XLOCK);
                vrelel(vp, 0);
                return ENOENT;
        }

        /*
         * Ok, we got it in good shape.  Just locking left.
         */
        KASSERT((vp->v_iflag & VI_CLEAN) == 0);
        mutex_exit(vp->v_interlock);
        if (flags & (LK_EXCLUSIVE | LK_SHARED)) {
                error = vn_lock(vp, flags);
                if (error != 0) {
                        vrele(vp);
                }
        }
        return error;
}

/*
 * vput: unlock and release the reference.
 */
void
vput(vnode_t *vp)
{

        KASSERT((vp->v_iflag & VI_MARKER) == 0);

        VOP_UNLOCK(vp);
        vrele(vp);
}

/*
 * Try to drop a reference on a vnode.  Abort if we are releasing the
 * last reference.  Note: this _must_ succeed if not the last reference.
 */
static inline bool
vtryrele(vnode_t *vp)
{
        u_int use, next;

        for (use = vp->v_usecount;; use = next) {
                if (use == 1) {
                        return false;
                }
                KASSERT((use & VC_MASK) > 1);
                next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
                if (__predict_true(next == use)) {
                        return true;
                }
        }
}

/*
 * Vnode release.  If the reference count drops to zero, call the inactive
 * routine and either return the vnode to the freelist or free it to the pool.
 */
void
vrelel(vnode_t *vp, int flags)
{
        bool recycle, defer;
        int error;

        KASSERT(mutex_owned(vp->v_interlock));
        KASSERT((vp->v_iflag & VI_MARKER) == 0);
        KASSERT(vp->v_freelisthd == NULL);

        if (__predict_false(vp->v_op == dead_vnodeop_p &&
            (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) {
                vpanic(vp, "dead but not clean");
        }

        /*
         * If not the last reference, just drop the reference count
         * and unlock.
         */
        if (vtryrele(vp)) {
                vp->v_iflag |= VI_INACTREDO;
                mutex_exit(vp->v_interlock);
                return;
        }
        if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
                vpanic(vp, "vrelel: bad ref count");
        }

        KASSERT((vp->v_iflag & VI_XLOCK) == 0);

        /*
         * If not clean, deactivate the vnode, but preserve
         * our reference across the call to VOP_INACTIVE().
         */
retry:
        if ((vp->v_iflag & VI_CLEAN) == 0) {
                recycle = false;
                vp->v_iflag |= VI_INACTNOW;

                /*
                 * XXX This ugly block can be largely eliminated if
                 * locking is pushed down into the file systems.
                 *
                 * Defer vnode release to vrele_thread if the caller
                 * requests it explicitly.
                 */
                if ((curlwp == uvm.pagedaemon_lwp) ||
                    (flags & VRELEL_ASYNC_RELE) != 0) {
                        /* The pagedaemon can't wait around; defer. */
                        defer = true;
                } else if (curlwp == vrele_lwp) {
                        /* We have to try harder. */
                        vp->v_iflag &= ~VI_INACTREDO;
                        mutex_exit(vp->v_interlock);
                        error = vn_lock(vp, LK_EXCLUSIVE);
                        if (error != 0) {
                                /* XXX */
                                vpanic(vp, "vrele: unable to lock %p");
                        }
                        defer = false;
                } else if ((vp->v_iflag & VI_LAYER) != 0) {
                        /*
                         * Acquiring the stack's lock in vclean() even
                         * for an honest vput/vrele is dangerous because
                         * our caller may hold other vnode locks; defer.
                         */
                        defer = true;
                } else {
                        /* If we can't acquire the lock, then defer. */
                        vp->v_iflag &= ~VI_INACTREDO;
                        mutex_exit(vp->v_interlock);
                        error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
                        if (error != 0) {
                                defer = true;
                                mutex_enter(vp->v_interlock);
                        } else {
                                defer = false;
                        }
                }

                if (defer) {
                        /*
                         * Defer reclaim to the kthread; it's not safe to
                         * clean it here.  We donate it our last reference.
                         */
                        KASSERT(mutex_owned(vp->v_interlock));
                        KASSERT((vp->v_iflag & VI_INACTPEND) == 0);
                        vp->v_iflag &= ~VI_INACTNOW;
                        vp->v_iflag |= VI_INACTPEND;
                        mutex_enter(&vrele_lock);
                        TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
                        if (++vrele_pending > (desiredvnodes >> 8))
                                cv_signal(&vrele_cv);
                        mutex_exit(&vrele_lock);
                        mutex_exit(vp->v_interlock);
                        return;
                }

#ifdef DIAGNOSTIC
                if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
                    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
                        vprint("vrelel: missing VOP_CLOSE()", vp);
                }
#endif

                /*
                 * The vnode can gain another reference while being
                 * deactivated.
                 * If VOP_INACTIVE() indicates that the described file
                 * has been deleted, then recycle the vnode irrespective
                 * of additional references.  Another thread may be
                 * waiting to re-use the on-disk inode.
                 *
                 * Note that VOP_INACTIVE() will drop the vnode lock.
                 */
                VOP_INACTIVE(vp, &recycle);
                mutex_enter(vp->v_interlock);
                vp->v_iflag &= ~VI_INACTNOW;
                if (!recycle) {
                        if (vtryrele(vp)) {
                                mutex_exit(vp->v_interlock);
                                return;
                        }

                        /*
                         * If we grew another reference while
                         * VOP_INACTIVE() was underway, retry.
                         */
                        if ((vp->v_iflag & VI_INACTREDO) != 0) {
                                goto retry;
                        }
                }

                /* Take care of space accounting. */
                if (vp->v_iflag & VI_EXECMAP) {
                        atomic_add_int(&uvmexp.execpages,
                            -vp->v_uobj.uo_npages);
                        atomic_add_int(&uvmexp.filepages,
                            vp->v_uobj.uo_npages);
                }
                vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
                vp->v_vflag &= ~VV_MAPPED;

                /*
                 * Recycle the vnode if the file is now unused (unlinked);
                 * otherwise it will be returned to the freelist below.
                 */
                if (recycle) {
                        vclean(vp, DOCLOSE);
                }
                KASSERT(vp->v_usecount > 0);
        }

        if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
                /* Gained another reference while being reclaimed. */
                mutex_exit(vp->v_interlock);
                return;
        }

        if ((vp->v_iflag & VI_CLEAN) != 0) {
                /*
                 * It's clean so destroy it.  It isn't referenced
                 * anywhere since it has been reclaimed.
                 */
                KASSERT(vp->v_holdcnt == 0);
                KASSERT(vp->v_writecount == 0);
                mutex_exit(vp->v_interlock);
                vfs_insmntque(vp, NULL);
                if (vp->v_type == VBLK || vp->v_type == VCHR) {
                        spec_node_destroy(vp);
                }
                vnfree(vp);
        } else {
                /*
                 * Otherwise, put it back onto the freelist.  It
                 * can't be destroyed while still associated with
                 * a file system.
                 */
                mutex_enter(&vnode_free_list_lock);
                if (vp->v_holdcnt > 0) {
                        vp->v_freelisthd = &vnode_hold_list;
                } else {
                        vp->v_freelisthd = &vnode_free_list;
                }
                TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
                mutex_exit(&vnode_free_list_lock);
                mutex_exit(vp->v_interlock);
        }
}

void
vrele(vnode_t *vp)
{

        KASSERT((vp->v_iflag & VI_MARKER) == 0);

        if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
                return;
        }
        mutex_enter(vp->v_interlock);
        vrelel(vp, 0);
}

/*
 * Asynchronous vnode release: the vnode is released in a different context.
 */
void
vrele_async(vnode_t *vp)
{

        KASSERT((vp->v_iflag & VI_MARKER) == 0);

        if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) {
                return;
        }
        mutex_enter(vp->v_interlock);
        vrelel(vp, VRELEL_ASYNC_RELE);
}

static void
vrele_thread(void *cookie)
{
        vnode_t *vp;

        for (;;) {
                mutex_enter(&vrele_lock);
                while (TAILQ_EMPTY(&vrele_list)) {
                        vrele_gen++;
                        cv_broadcast(&vrele_cv);
                        cv_timedwait(&vrele_cv, &vrele_lock, hz);
                }
                vp = TAILQ_FIRST(&vrele_list);
                TAILQ_REMOVE(&vrele_list, vp, v_freelist);
                vrele_pending--;
                mutex_exit(&vrele_lock);

                /*
                 * If not the last reference, then ignore the vnode
                 * and look for more work.
                 */
                mutex_enter(vp->v_interlock);
                KASSERT((vp->v_iflag & VI_INACTPEND) != 0);
                vp->v_iflag &= ~VI_INACTPEND;
                vrelel(vp, 0);
        }
}

void
vrele_flush(void)
{
        int gen;

        mutex_enter(&vrele_lock);
        gen = vrele_gen;
        while (vrele_pending && gen == vrele_gen) {
                cv_broadcast(&vrele_cv);
                cv_wait(&vrele_cv, &vrele_lock);
        }
        mutex_exit(&vrele_lock);
}

/*
 * Vnode reference, where a reference is already held by some other
 * object (for example, a file structure).
 */
void
vref(vnode_t *vp)
{

        KASSERT((vp->v_iflag & VI_MARKER) == 0);
        KASSERT(vp->v_usecount != 0);

        atomic_inc_uint(&vp->v_usecount);
}

/*
 * Page or buffer structure gets a reference.
 * Called with v_interlock held.
 */
void
vholdl(vnode_t *vp)
{

        KASSERT(mutex_owned(vp->v_interlock));
        KASSERT((vp->v_iflag & VI_MARKER) == 0);

        if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
                mutex_enter(&vnode_free_list_lock);
                KASSERT(vp->v_freelisthd == &vnode_free_list);
                TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
                vp->v_freelisthd = &vnode_hold_list;
                TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
                mutex_exit(&vnode_free_list_lock);
        }
}

/*
 * Page or buffer structure frees a reference.
 * Called with v_interlock held.
 */
void
holdrelel(vnode_t *vp)
{

        KASSERT(mutex_owned(vp->v_interlock));
        KASSERT((vp->v_iflag & VI_MARKER) == 0);

        if (vp->v_holdcnt <= 0) {
                vpanic(vp, "holdrelel: holdcnt vp %p");
        }

        vp->v_holdcnt--;
        if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
                mutex_enter(&vnode_free_list_lock);
                KASSERT(vp->v_freelisthd == &vnode_hold_list);
                TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
                vp->v_freelisthd = &vnode_free_list;
                TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
                mutex_exit(&vnode_free_list_lock);
        }
}

/*
 * Disassociate the underlying file system from a vnode.
 *
 * Must be called with the interlock held, and will return with it held.
 */
void
vclean(vnode_t *vp, int flags)
{
        lwp_t *l = curlwp;
        bool recycle, active;
        int error;

        KASSERT(mutex_owned(vp->v_interlock));
        KASSERT((vp->v_iflag & VI_MARKER) == 0);
        KASSERT(vp->v_usecount != 0);

        /* If cleaning is already in progress, wait until it is done and return. */
        if (vp->v_iflag & VI_XLOCK) {
                vwait(vp, VI_XLOCK);
                return;
        }

        /* If already clean, nothing to do. */
        if ((vp->v_iflag & VI_CLEAN) != 0) {
                return;
        }

        /*
         * Prevent the vnode from being recycled or brought into use
         * while we clean it out.
         */
        vp->v_iflag |= VI_XLOCK;
        if (vp->v_iflag & VI_EXECMAP) {
                atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
                atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
        }
        vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
        active = (vp->v_usecount & VC_MASK) > 1;

        /* XXXAD should not lock vnode under layer */
        mutex_exit(vp->v_interlock);
        VOP_LOCK(vp, LK_EXCLUSIVE);

        /*
         * Clean out any cached data associated with the vnode.
         * If purging an active vnode, it must be closed and
         * deactivated before being reclaimed.  Note that
         * VOP_INACTIVE() will unlock the vnode.
         */
        if (flags & DOCLOSE) {
                error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
                if (error != 0) {
                        /* XXX, fix vn_start_write's grab of mp and use that. */

                        if (wapbl_vphaswapbl(vp))
                                WAPBL_DISCARD(wapbl_vptomp(vp));
                        error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
                }
                KASSERT(error == 0);
                KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
                if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
                        spec_node_revoke(vp);
                }
        }
        if (active) {
                VOP_INACTIVE(vp, &recycle);
        } else {
                /*
                 * Any other processes trying to obtain this lock must first
                 * wait for VI_XLOCK to clear, then call the new lock operation.
                 */
                VOP_UNLOCK(vp);
        }

        /* Disassociate the underlying file system from the vnode. */
        if (VOP_RECLAIM(vp)) {
                vpanic(vp, "vclean: cannot reclaim");
        }

        KASSERT(vp->v_data == NULL);
        KASSERT(vp->v_uobj.uo_npages == 0);

        if (vp->v_type == VREG && vp->v_ractx != NULL) {
                uvm_ra_freectx(vp->v_ractx);
                vp->v_ractx = NULL;
        }

        /* Purge the name cache. */
        cache_purge(vp);

        /* Done with the purge, notify sleepers of the grim news. */
        mutex_enter(vp->v_interlock);
        vp->v_op = dead_vnodeop_p;
        vp->v_tag = VT_NON;
        KNOTE(&vp->v_klist, NOTE_REVOKE);
        vp->v_iflag &= ~VI_XLOCK;
        vp->v_vflag &= ~VV_LOCKSWORK;
        if ((flags & DOCLOSE) != 0) {
                vp->v_iflag |= VI_CLEAN;
        }
        cv_broadcast(&vp->v_cv);

        KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}

/*
 * vrecycle: recycle an unused vnode if it is still unreferenced.
 * Releases the passed interlock if the vnode will be recycled.
 * Returns 1 if the vnode was recycled, 0 otherwise.
 */
int
vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l)
{

        KASSERT((vp->v_iflag & VI_MARKER) == 0);

        mutex_enter(vp->v_interlock);
        if (vp->v_usecount != 0) {
                mutex_exit(vp->v_interlock);
                return 0;
        }
        if (inter_lkp) {
                mutex_exit(inter_lkp);
        }
        vremfree(vp);
        vp->v_usecount = 1;
        vclean(vp, DOCLOSE);
        vrelel(vp, 0);
        return 1;
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
void
vrevoke(vnode_t *vp)
{
        vnode_t *vq, **vpp;
        enum vtype type;
        dev_t dev;

        KASSERT(vp->v_usecount > 0);

        mutex_enter(vp->v_interlock);
        if ((vp->v_iflag & VI_CLEAN) != 0) {
                mutex_exit(vp->v_interlock);
                return;
        } else if (vp->v_type != VBLK && vp->v_type != VCHR) {
                atomic_inc_uint(&vp->v_usecount);
                vclean(vp, DOCLOSE);
                vrelel(vp, 0);
                return;
        } else {
                dev = vp->v_rdev;
                type = vp->v_type;
                mutex_exit(vp->v_interlock);
        }

        vpp = &specfs_hash[SPECHASH(dev)];
        mutex_enter(&device_lock);
        for (vq = *vpp; vq != NULL;) {
                /*
                 * If clean or being cleaned, then ignore it.
                 */
                mutex_enter(vq->v_interlock);
                if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 ||
                    vq->v_rdev != dev || vq->v_type != type) {
                        mutex_exit(vq->v_interlock);
                        vq = vq->v_specnext;
                        continue;
                }
                mutex_exit(&device_lock);
                if (vq->v_usecount == 0) {
                        vremfree(vq);
                        vq->v_usecount = 1;
                } else {
                        atomic_inc_uint(&vq->v_usecount);
                }
                vclean(vq, DOCLOSE);
                vrelel(vq, 0);
                mutex_enter(&device_lock);
                vq = *vpp;
        }
        mutex_exit(&device_lock);
}

/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.  Drops a reference from the vnode.
 */
void
vgone(vnode_t *vp)
{

        mutex_enter(vp->v_interlock);
        vclean(vp, DOCLOSE);
        vrelel(vp, 0);
}

/*
 * Update the outstanding I/O count and do a wakeup if requested.
 */
void
vwakeup(struct buf *bp)
{
        vnode_t *vp;

        if ((vp = bp->b_vp) == NULL)
                return;

        KASSERT(bp->b_objlock == vp->v_interlock);
        KASSERT(mutex_owned(bp->b_objlock));

        if (--vp->v_numoutput < 0)
                panic("vwakeup: neg numoutput, vp %p", vp);
        if (vp->v_numoutput == 0)
                cv_broadcast(&vp->v_cv);
}

/*
 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or
 * recycled.
 */
void
vwait(vnode_t *vp, int flags)
{

        KASSERT(mutex_owned(vp->v_interlock));
        KASSERT(vp->v_usecount != 0);

        while ((vp->v_iflag & flags) != 0)
                cv_wait(&vp->v_cv, vp->v_interlock);
}

int
vfs_drainvnodes(long target)
{

        while (numvnodes > target) {
                vnode_t *vp;

                mutex_enter(&vnode_free_list_lock);
                vp = getcleanvnode();
                if (vp == NULL) {
                        return EBUSY;
                }
                ungetnewvnode(vp);
        }
        return 0;
}

static void
vpanic(vnode_t *vp, const char *msg)
{
#ifdef DIAGNOSTIC

        vprint(NULL, vp);
        panic("%s\n", msg);
#endif
}