1 /* $NetBSD: vfs_vnode.c,v 1.76 2017/03/06 10:07:52 hannken Exp $ */ 2 3 /*- 4 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 /* 34 * Copyright (c) 1989, 1993 35 * The Regents of the University of California. All rights reserved. 36 * (c) UNIX System Laboratories, Inc. 
37 * All or some portions of this file are derived from material licensed 38 * to the University of California by American Telephone and Telegraph 39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 40 * the permission of UNIX System Laboratories, Inc. 41 * 42 * Redistribution and use in source and binary forms, with or without 43 * modification, are permitted provided that the following conditions 44 * are met: 45 * 1. Redistributions of source code must retain the above copyright 46 * notice, this list of conditions and the following disclaimer. 47 * 2. Redistributions in binary form must reproduce the above copyright 48 * notice, this list of conditions and the following disclaimer in the 49 * documentation and/or other materials provided with the distribution. 50 * 3. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * SUCH DAMAGE. 65 * 66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 67 */ 68 69 /* 70 * The vnode cache subsystem. 
71 * 72 * Life-cycle 73 * 74 * Normally, there are two points where new vnodes are created: 75 * VOP_CREATE(9) and VOP_LOOKUP(9). The life-cycle of a vnode 76 * starts in one of the following ways: 77 * 78 * - Allocation, via vcache_get(9) or vcache_new(9). 79 * - Reclamation of inactive vnode, via vcache_vget(9). 80 * 81 * Recycle from a free list, via getnewvnode(9) -> getcleanvnode(9) 82 * was another, traditional way. Currently, only the draining thread 83 * recycles the vnodes. This behaviour might be revisited. 84 * 85 * The life-cycle ends when the last reference is dropped, usually 86 * in VOP_REMOVE(9). In such case, VOP_INACTIVE(9) is called to inform 87 * the file system that vnode is inactive. Via this call, file system 88 * indicates whether vnode can be recycled (usually, it checks its own 89 * references, e.g. count of links, whether the file was removed). 90 * 91 * Depending on indication, vnode can be put into a free list (cache), 92 * or cleaned via vcache_reclaim, which calls VOP_RECLAIM(9) to 93 * disassociate underlying file system from the vnode, and finally 94 * destroyed. 95 * 96 * Vnode state 97 * 98 * Vnode is always in one of six states: 99 * - MARKER This is a marker vnode to help list traversal. It 100 * will never change its state. 101 * - LOADING Vnode is associating underlying file system and not 102 * yet ready to use. 103 * - ACTIVE Vnode has associated underlying file system and is 104 * ready to use. 105 * - BLOCKED Vnode is active but cannot get new references. 106 * - RECLAIMING Vnode is disassociating from the underlying file 107 * system. 108 * - RECLAIMED Vnode has disassociated from underlying file system 109 * and is dead. 110 * 111 * Valid state changes are: 112 * LOADING -> ACTIVE 113 * Vnode has been initialised in vcache_get() or 114 * vcache_new() and is ready to use. 115 * ACTIVE -> RECLAIMING 116 * Vnode starts disassociation from underlying file 117 * system in vcache_reclaim(). 
 *	RECLAIMING -> RECLAIMED
 *			Vnode finished disassociation from underlying file
 *			system in vcache_reclaim().
 *	ACTIVE -> BLOCKED
 *			Either vcache_rekey*() is changing the vnode key or
 *			vrelel() is about to call VOP_INACTIVE().
 *	BLOCKED -> ACTIVE
 *			The block condition is over.
 *	LOADING -> RECLAIMED
 *			Either vcache_get() or vcache_new() failed to
 *			associate the underlying file system, or
 *			vcache_rekey*() drops a vnode used as placeholder.
 *
 *	Of these states, LOADING, BLOCKED and RECLAIMING are intermediate,
 *	and it is possible to wait for a state change.
 *
 *	State is protected with v_interlock, with one exception:
 *	to change from LOADING, both v_interlock and vcache_lock must be
 *	held, so it is possible to check "state == LOADING" while holding
 *	only vcache_lock, without v_interlock.  See vcache_get() for details.
 *
 * Reference counting
 *
 *	A vnode is considered active if its reference count
 *	(vnode_t::v_usecount) is non-zero.  It is maintained using the
 *	vref(9), vrele(9) and vput(9) routines.  Common points holding
 *	references are e.g. file openings, current working directory,
 *	mount points, etc.
 *
 * Note on v_usecount and its locking
 *
 *	At nearly all points where it is known that v_usecount could be
 *	zero, the vnode_t::v_interlock will be held.  To change v_usecount
 *	away from zero, the interlock must be held.  To change from a
 *	non-zero value to zero, again the interlock must be held.
 *
 *	Changing the usecount from a non-zero value to a non-zero value can
 *	safely be done using atomic operations, without the interlock held.
155 * 156 */ 157 158 #include <sys/cdefs.h> 159 __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.76 2017/03/06 10:07:52 hannken Exp $"); 160 161 #include <sys/param.h> 162 #include <sys/kernel.h> 163 164 #include <sys/atomic.h> 165 #include <sys/buf.h> 166 #include <sys/conf.h> 167 #include <sys/device.h> 168 #include <sys/hash.h> 169 #include <sys/kauth.h> 170 #include <sys/kmem.h> 171 #include <sys/kthread.h> 172 #include <sys/module.h> 173 #include <sys/mount.h> 174 #include <sys/namei.h> 175 #include <sys/syscallargs.h> 176 #include <sys/sysctl.h> 177 #include <sys/systm.h> 178 #include <sys/vnode_impl.h> 179 #include <sys/wapbl.h> 180 #include <sys/fstrans.h> 181 182 #include <uvm/uvm.h> 183 #include <uvm/uvm_readahead.h> 184 185 /* Flags to vrelel. */ 186 #define VRELEL_ASYNC_RELE 0x0001 /* Always defer to vrele thread. */ 187 188 u_int numvnodes __cacheline_aligned; 189 190 /* 191 * There are three lru lists: one holds vnodes waiting for async release, 192 * one is for vnodes which have no buffer/page references and 193 * one for those which do (i.e. v_holdcnt is non-zero). 
194 */ 195 static vnodelst_t lru_vrele_list __cacheline_aligned; 196 static vnodelst_t lru_free_list __cacheline_aligned; 197 static vnodelst_t lru_hold_list __cacheline_aligned; 198 static kmutex_t vdrain_lock __cacheline_aligned; 199 static kcondvar_t vdrain_cv __cacheline_aligned; 200 static int vdrain_gen; 201 static kcondvar_t vdrain_gen_cv; 202 static bool vdrain_retry; 203 static lwp_t * vdrain_lwp; 204 SLIST_HEAD(hashhead, vnode_impl); 205 static kmutex_t vcache_lock __cacheline_aligned; 206 static kcondvar_t vcache_cv __cacheline_aligned; 207 static u_int vcache_hashsize; 208 static u_long vcache_hashmask; 209 static struct hashhead *vcache_hashtab __cacheline_aligned; 210 static pool_cache_t vcache_pool; 211 static void lru_requeue(vnode_t *, vnodelst_t *); 212 static vnodelst_t * lru_which(vnode_t *); 213 static vnode_impl_t * vcache_alloc(void); 214 static void vcache_free(vnode_impl_t *); 215 static void vcache_init(void); 216 static void vcache_reinit(void); 217 static void vcache_reclaim(vnode_t *); 218 static void vrelel(vnode_t *, int); 219 static void vdrain_thread(void *); 220 static void vnpanic(vnode_t *, const char *, ...) 221 __printflike(2, 3); 222 223 /* Routines having to do with the management of the vnode table. */ 224 extern struct mount *dead_rootmount; 225 extern int (**dead_vnodeop_p)(void *); 226 extern struct vfsops dead_vfsops; 227 228 /* Vnode state operations and diagnostics. 
*/ 229 230 #if defined(DIAGNOSTIC) 231 232 #define VSTATE_GET(vp) \ 233 vstate_assert_get((vp), __func__, __LINE__) 234 #define VSTATE_CHANGE(vp, from, to) \ 235 vstate_assert_change((vp), (from), (to), __func__, __LINE__) 236 #define VSTATE_WAIT_STABLE(vp) \ 237 vstate_assert_wait_stable((vp), __func__, __LINE__) 238 #define VSTATE_ASSERT(vp, state) \ 239 vstate_assert((vp), (state), __func__, __LINE__) 240 241 static void 242 vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line) 243 { 244 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 245 246 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 247 248 if (__predict_true(vip->vi_state == state)) 249 return; 250 vnpanic(vp, "state is %s, expected %s at %s:%d", 251 vstate_name(vip->vi_state), vstate_name(state), func, line); 252 } 253 254 static enum vnode_state 255 vstate_assert_get(vnode_t *vp, const char *func, int line) 256 { 257 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 258 259 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 260 if (vip->vi_state == VS_MARKER) 261 vnpanic(vp, "state is %s at %s:%d", 262 vstate_name(vip->vi_state), func, line); 263 264 return vip->vi_state; 265 } 266 267 static void 268 vstate_assert_wait_stable(vnode_t *vp, const char *func, int line) 269 { 270 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 271 272 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 273 if (vip->vi_state == VS_MARKER) 274 vnpanic(vp, "state is %s at %s:%d", 275 vstate_name(vip->vi_state), func, line); 276 277 while (vip->vi_state != VS_ACTIVE && vip->vi_state != VS_RECLAIMED) 278 cv_wait(&vp->v_cv, vp->v_interlock); 279 280 if (vip->vi_state == VS_MARKER) 281 vnpanic(vp, "state is %s at %s:%d", 282 vstate_name(vip->vi_state), func, line); 283 } 284 285 static void 286 vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to, 287 const char *func, int line) 288 { 289 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 290 291 
KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); 292 if (from == VS_LOADING) 293 KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line); 294 295 if (from == VS_MARKER) 296 vnpanic(vp, "from is %s at %s:%d", 297 vstate_name(from), func, line); 298 if (to == VS_MARKER) 299 vnpanic(vp, "to is %s at %s:%d", 300 vstate_name(to), func, line); 301 if (vip->vi_state != from) 302 vnpanic(vp, "from is %s, expected %s at %s:%d\n", 303 vstate_name(vip->vi_state), vstate_name(from), func, line); 304 if ((from == VS_BLOCKED || to == VS_BLOCKED) && vp->v_usecount != 1) 305 vnpanic(vp, "%s to %s with usecount %d at %s:%d", 306 vstate_name(from), vstate_name(to), vp->v_usecount, 307 func, line); 308 309 vip->vi_state = to; 310 if (from == VS_LOADING) 311 cv_broadcast(&vcache_cv); 312 if (to == VS_ACTIVE || to == VS_RECLAIMED) 313 cv_broadcast(&vp->v_cv); 314 } 315 316 #else /* defined(DIAGNOSTIC) */ 317 318 #define VSTATE_GET(vp) \ 319 (VNODE_TO_VIMPL((vp))->vi_state) 320 #define VSTATE_CHANGE(vp, from, to) \ 321 vstate_change((vp), (from), (to)) 322 #define VSTATE_WAIT_STABLE(vp) \ 323 vstate_wait_stable((vp)) 324 #define VSTATE_ASSERT(vp, state) 325 326 static void 327 vstate_wait_stable(vnode_t *vp) 328 { 329 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 330 331 while (vip->vi_state != VS_ACTIVE && vip->vi_state != VS_RECLAIMED) 332 cv_wait(&vp->v_cv, vp->v_interlock); 333 } 334 335 static void 336 vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to) 337 { 338 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 339 340 vip->vi_state = to; 341 if (from == VS_LOADING) 342 cv_broadcast(&vcache_cv); 343 if (to == VS_ACTIVE || to == VS_RECLAIMED) 344 cv_broadcast(&vp->v_cv); 345 } 346 347 #endif /* defined(DIAGNOSTIC) */ 348 349 void 350 vfs_vnode_sysinit(void) 351 { 352 int error __diagused; 353 354 dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL); 355 KASSERT(dead_rootmount != NULL); 356 dead_rootmount->mnt_iflag = IMNT_MPSAFE; 357 358 
mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE); 359 TAILQ_INIT(&lru_free_list); 360 TAILQ_INIT(&lru_hold_list); 361 TAILQ_INIT(&lru_vrele_list); 362 363 vcache_init(); 364 365 cv_init(&vdrain_cv, "vdrain"); 366 cv_init(&vdrain_gen_cv, "vdrainwt"); 367 error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread, 368 NULL, &vdrain_lwp, "vdrain"); 369 KASSERTMSG((error == 0), "kthread_create(vdrain) failed: %d", error); 370 } 371 372 /* 373 * Allocate a new marker vnode. 374 */ 375 vnode_t * 376 vnalloc_marker(struct mount *mp) 377 { 378 vnode_impl_t *vip; 379 vnode_t *vp; 380 381 vip = pool_cache_get(vcache_pool, PR_WAITOK); 382 memset(vip, 0, sizeof(*vip)); 383 vp = VIMPL_TO_VNODE(vip); 384 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0); 385 vp->v_mount = mp; 386 vp->v_type = VBAD; 387 vip->vi_state = VS_MARKER; 388 389 return vp; 390 } 391 392 /* 393 * Free a marker vnode. 394 */ 395 void 396 vnfree_marker(vnode_t *vp) 397 { 398 vnode_impl_t *vip; 399 400 vip = VNODE_TO_VIMPL(vp); 401 KASSERT(vip->vi_state == VS_MARKER); 402 uvm_obj_destroy(&vp->v_uobj, true); 403 pool_cache_put(vcache_pool, vip); 404 } 405 406 /* 407 * Test a vnode for being a marker vnode. 408 */ 409 bool 410 vnis_marker(vnode_t *vp) 411 { 412 413 return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER); 414 } 415 416 /* 417 * Return the lru list this node should be on. 418 */ 419 static vnodelst_t * 420 lru_which(vnode_t *vp) 421 { 422 423 KASSERT(mutex_owned(vp->v_interlock)); 424 425 if (vp->v_holdcnt > 0) 426 return &lru_hold_list; 427 else 428 return &lru_free_list; 429 } 430 431 /* 432 * Put vnode to end of given list. 433 * Both the current and the new list may be NULL, used on vnode alloc/free. 434 * Adjust numvnodes and signal vdrain thread if there is work. 
435 */ 436 static void 437 lru_requeue(vnode_t *vp, vnodelst_t *listhd) 438 { 439 vnode_impl_t *vip; 440 441 mutex_enter(&vdrain_lock); 442 vip = VNODE_TO_VIMPL(vp); 443 if (vip->vi_lrulisthd != NULL) 444 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); 445 else 446 numvnodes++; 447 vip->vi_lrulisthd = listhd; 448 if (vip->vi_lrulisthd != NULL) 449 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); 450 else 451 numvnodes--; 452 if (numvnodes > desiredvnodes || listhd == &lru_vrele_list) 453 cv_broadcast(&vdrain_cv); 454 mutex_exit(&vdrain_lock); 455 } 456 457 /* 458 * Release deferred vrele vnodes for this mount. 459 * Called with file system suspended. 460 */ 461 void 462 vrele_flush(struct mount *mp) 463 { 464 vnode_impl_t *vip, *marker; 465 466 KASSERT(fstrans_is_owner(mp)); 467 468 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL)); 469 470 mutex_enter(&vdrain_lock); 471 TAILQ_INSERT_HEAD(&lru_vrele_list, marker, vi_lrulist); 472 473 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) { 474 TAILQ_REMOVE(&lru_vrele_list, marker, vi_lrulist); 475 TAILQ_INSERT_AFTER(&lru_vrele_list, vip, marker, vi_lrulist); 476 if (vnis_marker(VIMPL_TO_VNODE(vip))) 477 continue; 478 479 KASSERT(vip->vi_lrulisthd == &lru_vrele_list); 480 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); 481 vip->vi_lrulisthd = &lru_hold_list; 482 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); 483 mutex_exit(&vdrain_lock); 484 485 vrele(VIMPL_TO_VNODE(vip)); 486 487 mutex_enter(&vdrain_lock); 488 } 489 490 TAILQ_REMOVE(&lru_vrele_list, marker, vi_lrulist); 491 mutex_exit(&vdrain_lock); 492 493 vnfree_marker(VIMPL_TO_VNODE(marker)); 494 } 495 496 /* 497 * Reclaim a cached vnode. Used from vdrain_thread only. 498 */ 499 static __inline void 500 vdrain_remove(vnode_t *vp) 501 { 502 struct mount *mp; 503 504 KASSERT(mutex_owned(&vdrain_lock)); 505 506 /* Probe usecount (unlocked). */ 507 if (vp->v_usecount > 0) 508 return; 509 /* Try v_interlock -- we lock the wrong direction! 
*/ 510 if (!mutex_tryenter(vp->v_interlock)) 511 return; 512 /* Probe usecount and state. */ 513 if (vp->v_usecount > 0 || VSTATE_GET(vp) != VS_ACTIVE) { 514 mutex_exit(vp->v_interlock); 515 return; 516 } 517 mp = vp->v_mount; 518 if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) { 519 mutex_exit(vp->v_interlock); 520 return; 521 } 522 vdrain_retry = true; 523 mutex_exit(&vdrain_lock); 524 525 if (vcache_vget(vp) == 0) { 526 if (!vrecycle(vp)) 527 vrele(vp); 528 } 529 fstrans_done(mp); 530 531 mutex_enter(&vdrain_lock); 532 } 533 534 /* 535 * Release a cached vnode. Used from vdrain_thread only. 536 */ 537 static __inline void 538 vdrain_vrele(vnode_t *vp) 539 { 540 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 541 struct mount *mp; 542 543 KASSERT(mutex_owned(&vdrain_lock)); 544 545 mp = vp->v_mount; 546 if (fstrans_start_nowait(mp, FSTRANS_LAZY) != 0) 547 return; 548 549 /* 550 * First remove the vnode from the vrele list. 551 * Put it on the last lru list, the last vrele() 552 * will put it back onto the right list before 553 * its v_usecount reaches zero. 554 */ 555 KASSERT(vip->vi_lrulisthd == &lru_vrele_list); 556 TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); 557 vip->vi_lrulisthd = &lru_hold_list; 558 TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); 559 560 vdrain_retry = true; 561 mutex_exit(&vdrain_lock); 562 563 mutex_enter(vp->v_interlock); 564 vrelel(vp, 0); 565 fstrans_done(mp); 566 567 mutex_enter(&vdrain_lock); 568 } 569 570 /* 571 * Helper thread to keep the number of vnodes below desiredvnodes 572 * and release vnodes from asynchronous vrele. 
573 */ 574 static void 575 vdrain_thread(void *cookie) 576 { 577 vnodelst_t *listhd[] = { 578 &lru_vrele_list, &lru_free_list, &lru_hold_list 579 }; 580 int i; 581 u_int target; 582 vnode_impl_t *vip, *marker; 583 584 marker = VNODE_TO_VIMPL(vnalloc_marker(NULL)); 585 586 mutex_enter(&vdrain_lock); 587 588 for (;;) { 589 vdrain_retry = false; 590 target = desiredvnodes - desiredvnodes/10; 591 592 for (i = 0; i < __arraycount(listhd); i++) { 593 TAILQ_INSERT_HEAD(listhd[i], marker, vi_lrulist); 594 while ((vip = TAILQ_NEXT(marker, vi_lrulist))) { 595 TAILQ_REMOVE(listhd[i], marker, vi_lrulist); 596 TAILQ_INSERT_AFTER(listhd[i], vip, marker, 597 vi_lrulist); 598 if (vnis_marker(VIMPL_TO_VNODE(vip))) 599 continue; 600 if (listhd[i] == &lru_vrele_list) 601 vdrain_vrele(VIMPL_TO_VNODE(vip)); 602 else if (numvnodes < target) 603 break; 604 else 605 vdrain_remove(VIMPL_TO_VNODE(vip)); 606 } 607 TAILQ_REMOVE(listhd[i], marker, vi_lrulist); 608 } 609 610 if (vdrain_retry) { 611 mutex_exit(&vdrain_lock); 612 yield(); 613 mutex_enter(&vdrain_lock); 614 } else { 615 vdrain_gen++; 616 cv_broadcast(&vdrain_gen_cv); 617 cv_wait(&vdrain_cv, &vdrain_lock); 618 } 619 } 620 } 621 622 /* 623 * vput: unlock and release the reference. 624 */ 625 void 626 vput(vnode_t *vp) 627 { 628 629 VOP_UNLOCK(vp); 630 vrele(vp); 631 } 632 633 /* 634 * Try to drop reference on a vnode. Abort if we are releasing the 635 * last reference. Note: this _must_ succeed if not the last reference. 636 */ 637 static inline bool 638 vtryrele(vnode_t *vp) 639 { 640 u_int use, next; 641 642 for (use = vp->v_usecount;; use = next) { 643 if (use == 1) { 644 return false; 645 } 646 KASSERT(use > 1); 647 next = atomic_cas_uint(&vp->v_usecount, use, use - 1); 648 if (__predict_true(next == use)) { 649 return true; 650 } 651 } 652 } 653 654 /* 655 * Vnode release. If reference count drops to zero, call inactive 656 * routine and either return to freelist or free to the pool. 
657 */ 658 static void 659 vrelel(vnode_t *vp, int flags) 660 { 661 bool recycle, defer; 662 int error; 663 664 KASSERT(mutex_owned(vp->v_interlock)); 665 666 if (__predict_false(vp->v_op == dead_vnodeop_p && 667 VSTATE_GET(vp) != VS_RECLAIMED)) { 668 vnpanic(vp, "dead but not clean"); 669 } 670 671 /* 672 * If not the last reference, just drop the reference count 673 * and unlock. 674 */ 675 if (vtryrele(vp)) { 676 mutex_exit(vp->v_interlock); 677 return; 678 } 679 if (vp->v_usecount <= 0 || vp->v_writecount != 0) { 680 vnpanic(vp, "%s: bad ref count", __func__); 681 } 682 683 #ifdef DIAGNOSTIC 684 if ((vp->v_type == VBLK || vp->v_type == VCHR) && 685 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) { 686 vprint("vrelel: missing VOP_CLOSE()", vp); 687 } 688 #endif 689 690 /* 691 * If not clean, deactivate the vnode, but preserve 692 * our reference across the call to VOP_INACTIVE(). 693 */ 694 if (VSTATE_GET(vp) != VS_RECLAIMED) { 695 recycle = false; 696 697 /* 698 * XXX This ugly block can be largely eliminated if 699 * locking is pushed down into the file systems. 700 * 701 * Defer vnode release to vdrain_thread if caller 702 * requests it explicitly or is the pagedaemon. 703 */ 704 if ((curlwp == uvm.pagedaemon_lwp) || 705 (flags & VRELEL_ASYNC_RELE) != 0) { 706 defer = true; 707 } else if (curlwp == vdrain_lwp) { 708 /* 709 * We have to try harder. 710 */ 711 mutex_exit(vp->v_interlock); 712 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 713 KASSERTMSG((error == 0), "vn_lock failed: %d", error); 714 mutex_enter(vp->v_interlock); 715 defer = false; 716 } else { 717 /* If we can't acquire the lock, then defer. */ 718 mutex_exit(vp->v_interlock); 719 error = vn_lock(vp, 720 LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT); 721 defer = (error != 0); 722 mutex_enter(vp->v_interlock); 723 } 724 725 KASSERT(mutex_owned(vp->v_interlock)); 726 KASSERT(! 
(curlwp == vdrain_lwp && defer)); 727 728 if (defer) { 729 /* 730 * Defer reclaim to the kthread; it's not safe to 731 * clean it here. We donate it our last reference. 732 */ 733 lru_requeue(vp, &lru_vrele_list); 734 mutex_exit(vp->v_interlock); 735 return; 736 } 737 738 /* 739 * If the node got another reference while we 740 * released the interlock, don't try to inactivate it yet. 741 */ 742 if (__predict_false(vtryrele(vp))) { 743 VOP_UNLOCK(vp); 744 mutex_exit(vp->v_interlock); 745 return; 746 } 747 VSTATE_CHANGE(vp, VS_ACTIVE, VS_BLOCKED); 748 mutex_exit(vp->v_interlock); 749 750 /* 751 * The vnode must not gain another reference while being 752 * deactivated. If VOP_INACTIVE() indicates that 753 * the described file has been deleted, then recycle 754 * the vnode. 755 * 756 * Note that VOP_INACTIVE() will drop the vnode lock. 757 */ 758 VOP_INACTIVE(vp, &recycle); 759 if (recycle) { 760 /* vcache_reclaim() below will drop the lock. */ 761 if (vn_lock(vp, LK_EXCLUSIVE) != 0) 762 recycle = false; 763 } 764 mutex_enter(vp->v_interlock); 765 VSTATE_CHANGE(vp, VS_BLOCKED, VS_ACTIVE); 766 if (!recycle) { 767 if (vtryrele(vp)) { 768 mutex_exit(vp->v_interlock); 769 return; 770 } 771 } 772 773 /* Take care of space accounting. */ 774 if (vp->v_iflag & VI_EXECMAP) { 775 atomic_add_int(&uvmexp.execpages, 776 -vp->v_uobj.uo_npages); 777 atomic_add_int(&uvmexp.filepages, 778 vp->v_uobj.uo_npages); 779 } 780 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP); 781 vp->v_vflag &= ~VV_MAPPED; 782 783 /* 784 * Recycle the vnode if the file is now unused (unlinked), 785 * otherwise just free it. 786 */ 787 if (recycle) { 788 VSTATE_ASSERT(vp, VS_ACTIVE); 789 vcache_reclaim(vp); 790 } 791 KASSERT(vp->v_usecount > 0); 792 } 793 794 if (atomic_dec_uint_nv(&vp->v_usecount) != 0) { 795 /* Gained another reference while being reclaimed. 
*/ 796 mutex_exit(vp->v_interlock); 797 return; 798 } 799 800 if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) { 801 /* 802 * It's clean so destroy it. It isn't referenced 803 * anywhere since it has been reclaimed. 804 */ 805 vcache_free(VNODE_TO_VIMPL(vp)); 806 } else { 807 /* 808 * Otherwise, put it back onto the freelist. It 809 * can't be destroyed while still associated with 810 * a file system. 811 */ 812 lru_requeue(vp, lru_which(vp)); 813 mutex_exit(vp->v_interlock); 814 } 815 } 816 817 void 818 vrele(vnode_t *vp) 819 { 820 821 if (vtryrele(vp)) { 822 return; 823 } 824 mutex_enter(vp->v_interlock); 825 vrelel(vp, 0); 826 } 827 828 /* 829 * Asynchronous vnode release, vnode is released in different context. 830 */ 831 void 832 vrele_async(vnode_t *vp) 833 { 834 835 if (vtryrele(vp)) { 836 return; 837 } 838 mutex_enter(vp->v_interlock); 839 vrelel(vp, VRELEL_ASYNC_RELE); 840 } 841 842 /* 843 * Vnode reference, where a reference is already held by some other 844 * object (for example, a file structure). 845 */ 846 void 847 vref(vnode_t *vp) 848 { 849 850 KASSERT(vp->v_usecount != 0); 851 852 atomic_inc_uint(&vp->v_usecount); 853 } 854 855 /* 856 * Page or buffer structure gets a reference. 857 * Called with v_interlock held. 858 */ 859 void 860 vholdl(vnode_t *vp) 861 { 862 863 KASSERT(mutex_owned(vp->v_interlock)); 864 865 if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) 866 lru_requeue(vp, lru_which(vp)); 867 } 868 869 /* 870 * Page or buffer structure frees a reference. 871 * Called with v_interlock held. 872 */ 873 void 874 holdrelel(vnode_t *vp) 875 { 876 877 KASSERT(mutex_owned(vp->v_interlock)); 878 879 if (vp->v_holdcnt <= 0) { 880 vnpanic(vp, "%s: holdcnt vp %p", __func__, vp); 881 } 882 883 vp->v_holdcnt--; 884 if (vp->v_holdcnt == 0 && vp->v_usecount == 0) 885 lru_requeue(vp, lru_which(vp)); 886 } 887 888 /* 889 * Recycle an unused vnode if caller holds the last reference. 
890 */ 891 bool 892 vrecycle(vnode_t *vp) 893 { 894 int error __diagused; 895 896 mutex_enter(vp->v_interlock); 897 898 /* Make sure we hold the last reference. */ 899 VSTATE_WAIT_STABLE(vp); 900 if (vp->v_usecount != 1) { 901 mutex_exit(vp->v_interlock); 902 return false; 903 } 904 905 /* If the vnode is already clean we're done. */ 906 if (VSTATE_GET(vp) != VS_ACTIVE) { 907 VSTATE_ASSERT(vp, VS_RECLAIMED); 908 vrelel(vp, 0); 909 return true; 910 } 911 912 /* Prevent further references until the vnode is locked. */ 913 VSTATE_CHANGE(vp, VS_ACTIVE, VS_BLOCKED); 914 mutex_exit(vp->v_interlock); 915 916 /* 917 * On a leaf file system this lock will always succeed as we hold 918 * the last reference and prevent further references. 919 * On layered file systems waiting for the lock would open a can of 920 * deadlocks as the lower vnodes may have other active references. 921 */ 922 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT); 923 924 mutex_enter(vp->v_interlock); 925 VSTATE_CHANGE(vp, VS_BLOCKED, VS_ACTIVE); 926 927 if (error) { 928 mutex_exit(vp->v_interlock); 929 return false; 930 } 931 932 KASSERT(vp->v_usecount == 1); 933 vcache_reclaim(vp); 934 vrelel(vp, 0); 935 936 return true; 937 } 938 939 /* 940 * Eliminate all activity associated with the requested vnode 941 * and with all vnodes aliased to the requested vnode. 
942 */ 943 void 944 vrevoke(vnode_t *vp) 945 { 946 vnode_t *vq; 947 enum vtype type; 948 dev_t dev; 949 950 KASSERT(vp->v_usecount > 0); 951 952 mutex_enter(vp->v_interlock); 953 VSTATE_WAIT_STABLE(vp); 954 if (VSTATE_GET(vp) == VS_RECLAIMED) { 955 mutex_exit(vp->v_interlock); 956 return; 957 } else if (vp->v_type != VBLK && vp->v_type != VCHR) { 958 atomic_inc_uint(&vp->v_usecount); 959 mutex_exit(vp->v_interlock); 960 vgone(vp); 961 return; 962 } else { 963 dev = vp->v_rdev; 964 type = vp->v_type; 965 mutex_exit(vp->v_interlock); 966 } 967 968 while (spec_node_lookup_by_dev(type, dev, &vq) == 0) { 969 vgone(vq); 970 } 971 } 972 973 /* 974 * Eliminate all activity associated with a vnode in preparation for 975 * reuse. Drops a reference from the vnode. 976 */ 977 void 978 vgone(vnode_t *vp) 979 { 980 981 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 982 mutex_enter(vp->v_interlock); 983 VSTATE_WAIT_STABLE(vp); 984 if (VSTATE_GET(vp) == VS_ACTIVE) 985 vcache_reclaim(vp); 986 VSTATE_ASSERT(vp, VS_RECLAIMED); 987 vrelel(vp, 0); 988 } 989 990 static inline uint32_t 991 vcache_hash(const struct vcache_key *key) 992 { 993 uint32_t hash = HASH32_BUF_INIT; 994 995 hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash); 996 hash = hash32_buf(key->vk_key, key->vk_key_len, hash); 997 return hash; 998 } 999 1000 static void 1001 vcache_init(void) 1002 { 1003 1004 vcache_pool = pool_cache_init(sizeof(vnode_impl_t), 0, 0, 0, 1005 "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL); 1006 KASSERT(vcache_pool != NULL); 1007 mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE); 1008 cv_init(&vcache_cv, "vcache"); 1009 vcache_hashsize = desiredvnodes; 1010 vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true, 1011 &vcache_hashmask); 1012 } 1013 1014 static void 1015 vcache_reinit(void) 1016 { 1017 int i; 1018 uint32_t hash; 1019 u_long oldmask, newmask; 1020 struct hashhead *oldtab, *newtab; 1021 vnode_impl_t *vip; 1022 1023 newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask); 
1024 mutex_enter(&vcache_lock); 1025 oldtab = vcache_hashtab; 1026 oldmask = vcache_hashmask; 1027 vcache_hashsize = desiredvnodes; 1028 vcache_hashtab = newtab; 1029 vcache_hashmask = newmask; 1030 for (i = 0; i <= oldmask; i++) { 1031 while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) { 1032 SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash); 1033 hash = vcache_hash(&vip->vi_key); 1034 SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask], 1035 vip, vi_hash); 1036 } 1037 } 1038 mutex_exit(&vcache_lock); 1039 hashdone(oldtab, HASH_SLIST, oldmask); 1040 } 1041 1042 static inline vnode_impl_t * 1043 vcache_hash_lookup(const struct vcache_key *key, uint32_t hash) 1044 { 1045 struct hashhead *hashp; 1046 vnode_impl_t *vip; 1047 1048 KASSERT(mutex_owned(&vcache_lock)); 1049 1050 hashp = &vcache_hashtab[hash & vcache_hashmask]; 1051 SLIST_FOREACH(vip, hashp, vi_hash) { 1052 if (key->vk_mount != vip->vi_key.vk_mount) 1053 continue; 1054 if (key->vk_key_len != vip->vi_key.vk_key_len) 1055 continue; 1056 if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len)) 1057 continue; 1058 return vip; 1059 } 1060 return NULL; 1061 } 1062 1063 /* 1064 * Allocate a new, uninitialized vcache node. 1065 */ 1066 static vnode_impl_t * 1067 vcache_alloc(void) 1068 { 1069 vnode_impl_t *vip; 1070 vnode_t *vp; 1071 1072 vip = pool_cache_get(vcache_pool, PR_WAITOK); 1073 memset(vip, 0, sizeof(*vip)); 1074 1075 rw_init(&vip->vi_lock); 1076 /* SLIST_INIT(&vip->vi_hash); */ 1077 /* LIST_INIT(&vip->vi_nclist); */ 1078 /* LIST_INIT(&vip->vi_dnclist); */ 1079 1080 vp = VIMPL_TO_VNODE(vip); 1081 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0); 1082 cv_init(&vp->v_cv, "vnode"); 1083 1084 vp->v_usecount = 1; 1085 vp->v_type = VNON; 1086 vp->v_size = vp->v_writesize = VSIZENOTSET; 1087 1088 vip->vi_state = VS_LOADING; 1089 1090 lru_requeue(vp, &lru_free_list); 1091 1092 return vip; 1093 } 1094 1095 /* 1096 * Free an unused, unreferenced vcache node. 1097 * v_interlock locked on entry. 
1098 */ 1099 static void 1100 vcache_free(vnode_impl_t *vip) 1101 { 1102 vnode_t *vp; 1103 1104 vp = VIMPL_TO_VNODE(vip); 1105 KASSERT(mutex_owned(vp->v_interlock)); 1106 1107 KASSERT(vp->v_usecount == 0); 1108 KASSERT(vp->v_holdcnt == 0); 1109 KASSERT(vp->v_writecount == 0); 1110 lru_requeue(vp, NULL); 1111 mutex_exit(vp->v_interlock); 1112 1113 vfs_insmntque(vp, NULL); 1114 if (vp->v_type == VBLK || vp->v_type == VCHR) 1115 spec_node_destroy(vp); 1116 1117 rw_destroy(&vip->vi_lock); 1118 uvm_obj_destroy(&vp->v_uobj, true); 1119 cv_destroy(&vp->v_cv); 1120 pool_cache_put(vcache_pool, vip); 1121 } 1122 1123 /* 1124 * Try to get an initial reference on this cached vnode. 1125 * Returns zero on success, ENOENT if the vnode has been reclaimed and 1126 * EBUSY if the vnode state is unstable. 1127 * 1128 * v_interlock locked on entry and unlocked on exit. 1129 */ 1130 int 1131 vcache_tryvget(vnode_t *vp) 1132 { 1133 int error = 0; 1134 1135 KASSERT(mutex_owned(vp->v_interlock)); 1136 1137 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) 1138 error = ENOENT; 1139 else if (__predict_false(VSTATE_GET(vp) != VS_ACTIVE)) 1140 error = EBUSY; 1141 else if (vp->v_usecount == 0) 1142 vp->v_usecount = 1; 1143 else 1144 atomic_inc_uint(&vp->v_usecount); 1145 1146 mutex_exit(vp->v_interlock); 1147 1148 return error; 1149 } 1150 1151 /* 1152 * Try to get an initial reference on this cached vnode. 1153 * Returns zero on success and ENOENT if the vnode has been reclaimed. 1154 * Will wait for the vnode state to be stable. 1155 * 1156 * v_interlock locked on entry and unlocked on exit. 1157 */ 1158 int 1159 vcache_vget(vnode_t *vp) 1160 { 1161 1162 KASSERT(mutex_owned(vp->v_interlock)); 1163 1164 /* Increment hold count to prevent vnode from disappearing. */ 1165 vp->v_holdcnt++; 1166 VSTATE_WAIT_STABLE(vp); 1167 vp->v_holdcnt--; 1168 1169 /* If this was the last reference to a reclaimed vnode free it now. 
*/ 1170 if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) { 1171 if (vp->v_holdcnt == 0 && vp->v_usecount == 0) 1172 vcache_free(VNODE_TO_VIMPL(vp)); 1173 else 1174 mutex_exit(vp->v_interlock); 1175 return ENOENT; 1176 } 1177 VSTATE_ASSERT(vp, VS_ACTIVE); 1178 if (vp->v_usecount == 0) 1179 vp->v_usecount = 1; 1180 else 1181 atomic_inc_uint(&vp->v_usecount); 1182 1183 mutex_exit(vp->v_interlock); 1184 1185 return 0; 1186 } 1187 1188 /* 1189 * Get a vnode / fs node pair by key and return it referenced through vpp. 1190 */ 1191 int 1192 vcache_get(struct mount *mp, const void *key, size_t key_len, 1193 struct vnode **vpp) 1194 { 1195 int error; 1196 uint32_t hash; 1197 const void *new_key; 1198 struct vnode *vp; 1199 struct vcache_key vcache_key; 1200 vnode_impl_t *vip, *new_vip; 1201 1202 new_key = NULL; 1203 *vpp = NULL; 1204 1205 vcache_key.vk_mount = mp; 1206 vcache_key.vk_key = key; 1207 vcache_key.vk_key_len = key_len; 1208 hash = vcache_hash(&vcache_key); 1209 1210 again: 1211 mutex_enter(&vcache_lock); 1212 vip = vcache_hash_lookup(&vcache_key, hash); 1213 1214 /* If found, take a reference or retry. */ 1215 if (__predict_true(vip != NULL)) { 1216 /* 1217 * If the vnode is loading we cannot take the v_interlock 1218 * here as it might change during load (see uvm_obj_setlock()). 1219 * As changing state from VS_LOADING requires both vcache_lock 1220 * and v_interlock it is safe to test with vcache_lock held. 1221 * 1222 * Wait for vnodes changing state from VS_LOADING and retry. 
1223 */ 1224 if (__predict_false(vip->vi_state == VS_LOADING)) { 1225 cv_wait(&vcache_cv, &vcache_lock); 1226 mutex_exit(&vcache_lock); 1227 goto again; 1228 } 1229 vp = VIMPL_TO_VNODE(vip); 1230 mutex_enter(vp->v_interlock); 1231 mutex_exit(&vcache_lock); 1232 error = vcache_vget(vp); 1233 if (error == ENOENT) 1234 goto again; 1235 if (error == 0) 1236 *vpp = vp; 1237 KASSERT((error != 0) == (*vpp == NULL)); 1238 return error; 1239 } 1240 mutex_exit(&vcache_lock); 1241 1242 /* Allocate and initialize a new vcache / vnode pair. */ 1243 error = vfs_busy(mp, NULL); 1244 if (error) 1245 return error; 1246 new_vip = vcache_alloc(); 1247 new_vip->vi_key = vcache_key; 1248 vp = VIMPL_TO_VNODE(new_vip); 1249 mutex_enter(&vcache_lock); 1250 vip = vcache_hash_lookup(&vcache_key, hash); 1251 if (vip == NULL) { 1252 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask], 1253 new_vip, vi_hash); 1254 vip = new_vip; 1255 } 1256 1257 /* If another thread beat us inserting this node, retry. */ 1258 if (vip != new_vip) { 1259 mutex_enter(vp->v_interlock); 1260 VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED); 1261 mutex_exit(&vcache_lock); 1262 vrelel(vp, 0); 1263 vfs_unbusy(mp, false, NULL); 1264 goto again; 1265 } 1266 mutex_exit(&vcache_lock); 1267 1268 /* Load the fs node. Exclusive as new_node is VS_LOADING. 
*/ 1269 error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key); 1270 if (error) { 1271 mutex_enter(&vcache_lock); 1272 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask], 1273 new_vip, vnode_impl, vi_hash); 1274 mutex_enter(vp->v_interlock); 1275 VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED); 1276 mutex_exit(&vcache_lock); 1277 vrelel(vp, 0); 1278 vfs_unbusy(mp, false, NULL); 1279 KASSERT(*vpp == NULL); 1280 return error; 1281 } 1282 KASSERT(new_key != NULL); 1283 KASSERT(memcmp(key, new_key, key_len) == 0); 1284 KASSERT(vp->v_op != NULL); 1285 vfs_insmntque(vp, mp); 1286 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 1287 vp->v_vflag |= VV_MPSAFE; 1288 vfs_unbusy(mp, true, NULL); 1289 1290 /* Finished loading, finalize node. */ 1291 mutex_enter(&vcache_lock); 1292 new_vip->vi_key.vk_key = new_key; 1293 mutex_enter(vp->v_interlock); 1294 VSTATE_CHANGE(vp, VS_LOADING, VS_ACTIVE); 1295 mutex_exit(vp->v_interlock); 1296 mutex_exit(&vcache_lock); 1297 *vpp = vp; 1298 return 0; 1299 } 1300 1301 /* 1302 * Create a new vnode / fs node pair and return it referenced through vpp. 1303 */ 1304 int 1305 vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap, 1306 kauth_cred_t cred, struct vnode **vpp) 1307 { 1308 int error; 1309 uint32_t hash; 1310 struct vnode *vp, *ovp; 1311 vnode_impl_t *vip, *ovip; 1312 1313 *vpp = NULL; 1314 1315 /* Allocate and initialize a new vcache / vnode pair. */ 1316 error = vfs_busy(mp, NULL); 1317 if (error) 1318 return error; 1319 vip = vcache_alloc(); 1320 vip->vi_key.vk_mount = mp; 1321 vp = VIMPL_TO_VNODE(vip); 1322 1323 /* Create and load the fs node. 
*/ 1324 error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, 1325 &vip->vi_key.vk_key_len, &vip->vi_key.vk_key); 1326 if (error) { 1327 mutex_enter(&vcache_lock); 1328 mutex_enter(vp->v_interlock); 1329 VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED); 1330 mutex_exit(&vcache_lock); 1331 vrelel(vp, 0); 1332 vfs_unbusy(mp, false, NULL); 1333 KASSERT(*vpp == NULL); 1334 return error; 1335 } 1336 KASSERT(vip->vi_key.vk_key != NULL); 1337 KASSERT(vp->v_op != NULL); 1338 hash = vcache_hash(&vip->vi_key); 1339 1340 /* Wait for previous instance to be reclaimed, then insert new node. */ 1341 mutex_enter(&vcache_lock); 1342 while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) { 1343 ovp = VIMPL_TO_VNODE(ovip); 1344 mutex_enter(ovp->v_interlock); 1345 mutex_exit(&vcache_lock); 1346 error = vcache_vget(ovp); 1347 KASSERT(error == ENOENT); 1348 mutex_enter(&vcache_lock); 1349 } 1350 SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask], 1351 vip, vi_hash); 1352 mutex_exit(&vcache_lock); 1353 vfs_insmntque(vp, mp); 1354 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 1355 vp->v_vflag |= VV_MPSAFE; 1356 vfs_unbusy(mp, true, NULL); 1357 1358 /* Finished loading, finalize node. */ 1359 mutex_enter(&vcache_lock); 1360 mutex_enter(vp->v_interlock); 1361 VSTATE_CHANGE(vp, VS_LOADING, VS_ACTIVE); 1362 mutex_exit(&vcache_lock); 1363 mutex_exit(vp->v_interlock); 1364 *vpp = vp; 1365 return 0; 1366 } 1367 1368 /* 1369 * Prepare key change: update old cache nodes key and lock new cache node. 1370 * Return an error if the new node already exists. 
1371 */ 1372 int 1373 vcache_rekey_enter(struct mount *mp, struct vnode *vp, 1374 const void *old_key, size_t old_key_len, 1375 const void *new_key, size_t new_key_len) 1376 { 1377 uint32_t old_hash, new_hash; 1378 struct vcache_key old_vcache_key, new_vcache_key; 1379 vnode_impl_t *vip, *new_vip; 1380 struct vnode *new_vp; 1381 1382 old_vcache_key.vk_mount = mp; 1383 old_vcache_key.vk_key = old_key; 1384 old_vcache_key.vk_key_len = old_key_len; 1385 old_hash = vcache_hash(&old_vcache_key); 1386 1387 new_vcache_key.vk_mount = mp; 1388 new_vcache_key.vk_key = new_key; 1389 new_vcache_key.vk_key_len = new_key_len; 1390 new_hash = vcache_hash(&new_vcache_key); 1391 1392 new_vip = vcache_alloc(); 1393 new_vip->vi_key = new_vcache_key; 1394 new_vp = VIMPL_TO_VNODE(new_vip); 1395 1396 /* Insert locked new node used as placeholder. */ 1397 mutex_enter(&vcache_lock); 1398 vip = vcache_hash_lookup(&new_vcache_key, new_hash); 1399 if (vip != NULL) { 1400 mutex_enter(new_vp->v_interlock); 1401 VSTATE_CHANGE(new_vp, VS_LOADING, VS_RECLAIMED); 1402 mutex_exit(&vcache_lock); 1403 vrelel(new_vp, 0); 1404 return EEXIST; 1405 } 1406 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask], 1407 new_vip, vi_hash); 1408 1409 /* Replace old nodes key with the temporary copy. */ 1410 vip = vcache_hash_lookup(&old_vcache_key, old_hash); 1411 KASSERT(vip != NULL); 1412 KASSERT(VIMPL_TO_VNODE(vip) == vp); 1413 KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key); 1414 vip->vi_key = old_vcache_key; 1415 mutex_exit(&vcache_lock); 1416 return 0; 1417 } 1418 1419 /* 1420 * Key change complete: update old node and remove placeholder. 
1421 */ 1422 void 1423 vcache_rekey_exit(struct mount *mp, struct vnode *vp, 1424 const void *old_key, size_t old_key_len, 1425 const void *new_key, size_t new_key_len) 1426 { 1427 uint32_t old_hash, new_hash; 1428 struct vcache_key old_vcache_key, new_vcache_key; 1429 vnode_impl_t *vip, *new_vip; 1430 struct vnode *new_vp; 1431 1432 old_vcache_key.vk_mount = mp; 1433 old_vcache_key.vk_key = old_key; 1434 old_vcache_key.vk_key_len = old_key_len; 1435 old_hash = vcache_hash(&old_vcache_key); 1436 1437 new_vcache_key.vk_mount = mp; 1438 new_vcache_key.vk_key = new_key; 1439 new_vcache_key.vk_key_len = new_key_len; 1440 new_hash = vcache_hash(&new_vcache_key); 1441 1442 mutex_enter(&vcache_lock); 1443 1444 /* Lookup old and new node. */ 1445 vip = vcache_hash_lookup(&old_vcache_key, old_hash); 1446 KASSERT(vip != NULL); 1447 KASSERT(VIMPL_TO_VNODE(vip) == vp); 1448 1449 new_vip = vcache_hash_lookup(&new_vcache_key, new_hash); 1450 KASSERT(new_vip != NULL); 1451 KASSERT(new_vip->vi_key.vk_key_len == new_key_len); 1452 new_vp = VIMPL_TO_VNODE(new_vip); 1453 mutex_enter(new_vp->v_interlock); 1454 VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING); 1455 1456 /* Rekey old node and put it onto its new hashlist. */ 1457 vip->vi_key = new_vcache_key; 1458 if (old_hash != new_hash) { 1459 SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask], 1460 vip, vnode_impl, vi_hash); 1461 SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask], 1462 vip, vi_hash); 1463 } 1464 1465 /* Remove new node used as placeholder. */ 1466 SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask], 1467 new_vip, vnode_impl, vi_hash); 1468 VSTATE_CHANGE(new_vp, VS_LOADING, VS_RECLAIMED); 1469 mutex_exit(&vcache_lock); 1470 vrelel(new_vp, 0); 1471 } 1472 1473 /* 1474 * Disassociate the underlying file system from a vnode. 1475 * 1476 * Must be called with vnode locked and will return unlocked. 1477 * Must be called with the interlock held, and will return with it held. 
1478 */ 1479 static void 1480 vcache_reclaim(vnode_t *vp) 1481 { 1482 lwp_t *l = curlwp; 1483 vnode_impl_t *vip = VNODE_TO_VIMPL(vp); 1484 struct mount *mp = vp->v_mount; 1485 uint32_t hash; 1486 uint8_t temp_buf[64], *temp_key; 1487 size_t temp_key_len; 1488 bool recycle, active; 1489 int error; 1490 1491 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 || 1492 VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 1493 KASSERT(mutex_owned(vp->v_interlock)); 1494 KASSERT(vp->v_usecount != 0); 1495 1496 active = (vp->v_usecount > 1); 1497 temp_key_len = vip->vi_key.vk_key_len; 1498 /* 1499 * Prevent the vnode from being recycled or brought into use 1500 * while we clean it out. 1501 */ 1502 VSTATE_CHANGE(vp, VS_ACTIVE, VS_RECLAIMING); 1503 if (vp->v_iflag & VI_EXECMAP) { 1504 atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages); 1505 atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages); 1506 } 1507 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP); 1508 mutex_exit(vp->v_interlock); 1509 1510 /* Replace the vnode key with a temporary copy. */ 1511 if (vip->vi_key.vk_key_len > sizeof(temp_buf)) { 1512 temp_key = kmem_alloc(temp_key_len, KM_SLEEP); 1513 } else { 1514 temp_key = temp_buf; 1515 } 1516 mutex_enter(&vcache_lock); 1517 memcpy(temp_key, vip->vi_key.vk_key, temp_key_len); 1518 vip->vi_key.vk_key = temp_key; 1519 mutex_exit(&vcache_lock); 1520 1521 fstrans_start(mp, FSTRANS_LAZY); 1522 1523 /* 1524 * Clean out any cached data associated with the vnode. 1525 * If purging an active vnode, it must be closed and 1526 * deactivated before being reclaimed. 
1527 */ 1528 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); 1529 if (error != 0) { 1530 if (wapbl_vphaswapbl(vp)) 1531 WAPBL_DISCARD(wapbl_vptomp(vp)); 1532 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0); 1533 } 1534 KASSERTMSG((error == 0), "vinvalbuf failed: %d", error); 1535 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1536 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) { 1537 spec_node_revoke(vp); 1538 } 1539 1540 /* 1541 * Disassociate the underlying file system from the vnode. 1542 * Note that the VOP_INACTIVE will unlock the vnode. 1543 */ 1544 VOP_INACTIVE(vp, &recycle); 1545 if (VOP_RECLAIM(vp)) { 1546 vnpanic(vp, "%s: cannot reclaim", __func__); 1547 } 1548 1549 KASSERT(vp->v_data == NULL); 1550 KASSERT(vp->v_uobj.uo_npages == 0); 1551 1552 if (vp->v_type == VREG && vp->v_ractx != NULL) { 1553 uvm_ra_freectx(vp->v_ractx); 1554 vp->v_ractx = NULL; 1555 } 1556 1557 /* Purge name cache. */ 1558 cache_purge(vp); 1559 1560 /* Move to dead mount. */ 1561 vp->v_vflag &= ~VV_ROOT; 1562 atomic_inc_uint(&dead_rootmount->mnt_refcnt); 1563 vfs_insmntque(vp, dead_rootmount); 1564 1565 /* Remove from vnode cache. */ 1566 hash = vcache_hash(&vip->vi_key); 1567 mutex_enter(&vcache_lock); 1568 KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash)); 1569 SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask], 1570 vip, vnode_impl, vi_hash); 1571 mutex_exit(&vcache_lock); 1572 if (temp_key != temp_buf) 1573 kmem_free(temp_key, temp_key_len); 1574 1575 /* Done with purge, notify sleepers of the grim news. */ 1576 mutex_enter(vp->v_interlock); 1577 vp->v_op = dead_vnodeop_p; 1578 vp->v_vflag |= VV_LOCKSWORK; 1579 VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED); 1580 vp->v_tag = VT_NON; 1581 KNOTE(&vp->v_klist, NOTE_REVOKE); 1582 1583 fstrans_done(mp); 1584 1585 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1586 } 1587 1588 /* 1589 * Update outstanding I/O count and do wakeup if requested. 
1590 */ 1591 void 1592 vwakeup(struct buf *bp) 1593 { 1594 vnode_t *vp; 1595 1596 if ((vp = bp->b_vp) == NULL) 1597 return; 1598 1599 KASSERT(bp->b_objlock == vp->v_interlock); 1600 KASSERT(mutex_owned(bp->b_objlock)); 1601 1602 if (--vp->v_numoutput < 0) 1603 vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp); 1604 if (vp->v_numoutput == 0) 1605 cv_broadcast(&vp->v_cv); 1606 } 1607 1608 /* 1609 * Test a vnode for being or becoming dead. Returns one of: 1610 * EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only. 1611 * ENOENT: vnode is dead. 1612 * 0: otherwise. 1613 * 1614 * Whenever this function returns a non-zero value all future 1615 * calls will also return a non-zero value. 1616 */ 1617 int 1618 vdead_check(struct vnode *vp, int flags) 1619 { 1620 1621 KASSERT(mutex_owned(vp->v_interlock)); 1622 1623 if (! ISSET(flags, VDEAD_NOWAIT)) 1624 VSTATE_WAIT_STABLE(vp); 1625 1626 if (VSTATE_GET(vp) == VS_RECLAIMING) { 1627 KASSERT(ISSET(flags, VDEAD_NOWAIT)); 1628 return EBUSY; 1629 } else if (VSTATE_GET(vp) == VS_RECLAIMED) { 1630 return ENOENT; 1631 } 1632 1633 return 0; 1634 } 1635 1636 int 1637 vfs_drainvnodes(void) 1638 { 1639 int i, gen; 1640 1641 mutex_enter(&vdrain_lock); 1642 for (i = 0; i < 2; i++) { 1643 gen = vdrain_gen; 1644 while (gen == vdrain_gen) { 1645 cv_broadcast(&vdrain_cv); 1646 cv_wait(&vdrain_gen_cv, &vdrain_lock); 1647 } 1648 } 1649 mutex_exit(&vdrain_lock); 1650 1651 if (numvnodes >= desiredvnodes) 1652 return EBUSY; 1653 1654 if (vcache_hashsize != desiredvnodes) 1655 vcache_reinit(); 1656 1657 return 0; 1658 } 1659 1660 void 1661 vnpanic(vnode_t *vp, const char *fmt, ...) 1662 { 1663 va_list ap; 1664 1665 #ifdef DIAGNOSTIC 1666 vprint(NULL, vp); 1667 #endif 1668 va_start(ap, fmt); 1669 vpanic(fmt, ap); 1670 va_end(ap); 1671 } 1672