/*
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/vfs_lock.c,v 1.24 2006/09/05 00:55:45 dillon Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");

static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */

int freevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD,
	&freevnodes, 0, "");
static int wantfreevnodes = 25;
SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
	&wantfreevnodes, 0, "");
static int minvnodes;
SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
	&minvnodes, 0, "Minimum number of vnodes");

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	minvnodes = desiredvnodes / 4;

	TAILQ_INIT(&vnode_free_list);
}
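
/*
 * Illustrative sketch only, not part of the original file: the tunables
 * above are consumed by allocvnode() below.  The hypothetical helper
 * merely restates the reuse test allocvnode() applies before scanning the
 * free list, i.e. recycling only starts once the freelist and the total
 * vnode count have both grown past their respective minimums.
 */
#if 0
static __inline int
example_should_scan_freelist(void)
{
	return (freevnodes >= wantfreevnodes && numvnodes >= minvnodes);
}
#endif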

/*
 * Inline helper functions.  vbusy() and vfree() must be called while in a
 * critical section.
 *
 * Warning: must be callable if the caller holds a read spinlock to something
 * else, meaning we can't use read spinlocks here.
 */
static __inline
void
__vbusy(struct vnode *vp)
{
	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	freevnodes--;
	vp->v_flag &= ~(VFREE|VAGE);
}

static __inline
void
__vfree(struct vnode *vp)
{
	if (vp->v_flag & (VAGE|VRECLAIMED))
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	freevnodes++;
	vp->v_flag &= ~VAGE;
	vp->v_flag |= VFREE;
}

/*
 * Return 1 if we can immediately place the vnode on the freelist.
 */
static __inline int
vshouldfree(struct vnode *vp, int usecount)
{
	if (vp->v_flag & VFREE)
		return (0);		/* already free */
	if (vp->v_holdcnt != 0 || vp->v_usecount != usecount)
		return (0);		/* other holders */
	if (vp->v_object &&
	    (vp->v_object->ref_count || vp->v_object->resident_page_count)) {
		return (0);
	}
	return (1);
}

/*
 * Add another ref to a vnode.  The vnode must already have at least one
 * ref.
 *
 * NOTE: The vnode may continue to reside on the free list.
 */
void
vref(struct vnode *vp)
{
	KKASSERT(vp->v_usecount > 0 && (vp->v_flag & VINACTIVE) == 0);
	atomic_add_int(&vp->v_usecount, 1);
}

/*
 * Add a ref to a vnode which may not have any refs.  This routine is called
 * from the namecache and vx_get().  If requested, the vnode will be
 * reactivated.
 *
 * Removal of the vnode from the free list is optional.  Since most vnodes
 * are temporary in nature we opt not to do it.  This also means we don't
 * have to deal with lock ordering issues between the freelist and vnode
 * spinlocks.
 *
 * We must acquire the vnode's spinlock to interlock against vrele().
 *
 * vget(), cache_vget(), and cache_vref() reactivate vnodes.  vx_get() does
 * not.
 */
void
vref_initial(struct vnode *vp, int reactivate)
{
	spin_lock_wr(&vp->v_spinlock);
	atomic_add_int(&vp->v_usecount, 1);
	if (reactivate)
		vp->v_flag &= ~VINACTIVE;
	spin_unlock_wr(&vp->v_spinlock);
}

/*
 * Release a ref on the vnode.  Since 0->1 transitions can only be made
 * by vref_initial(), 1->0 transitions will be protected by the spinlock.
 *
 * When handling a 1->0 transition the vnode is guaranteed to not be locked
 * and we can set the exclusive lock atomically while interlocked with our
 * spinlock.  A panic will occur if the lock is held.
 */
void
vrele(struct vnode *vp)
{
	spin_lock_wr(&vp->v_spinlock);
	if (vp->v_usecount > 1) {
		atomic_subtract_int(&vp->v_usecount, 1);
		spin_unlock_wr(&vp->v_spinlock);
		return;
	}
	KKASSERT(vp->v_usecount == 1);

	/*
	 * This is roughly equivalent to obtaining an exclusive
	 * lock, but the spinlock is already held (and remains held
	 * on return) and the lock must be obtainable without
	 * blocking, which it is in a 1->0 transition.
	 */
	lockmgr_setexclusive_interlocked(&vp->v_lock);

	/*
	 * VINACTIVE is interlocked by the spinlock, so we have to re-check
	 * the bit if we release and reacquire the spinlock even though
	 * we are holding the exclusive lockmgr lock throughout.
	 *
	 * VOP_INACTIVE can race other VOPs even though we hold an exclusive
	 * lock.  This is ok.  The ref count of 1 must remain intact through
	 * the VOP_INACTIVE call to avoid a recursion.
	 */
	while ((vp->v_flag & VINACTIVE) == 0 && vp->v_usecount == 1) {
		vp->v_flag |= VINACTIVE;
		spin_unlock_wr(&vp->v_spinlock);
		VOP_INACTIVE(vp);
		spin_lock_wr(&vp->v_spinlock);
	}

	/*
	 * NOTE: v_usecount might no longer be 1
	 */
	atomic_subtract_int(&vp->v_usecount, 1);
	if (vshouldfree(vp, 0))
		__vfree(vp);
	lockmgr_clrexclusive_interlocked(&vp->v_lock);
	/* spinlock unlocked */
}
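
/*
 * Illustrative sketch only, not part of the original file: the expected
 * vref()/vrele() pairing for a caller which already owns a reference.
 * The final vrele() is the 1->0 transition above, which marks the vnode
 * VINACTIVE, calls VOP_INACTIVE(), and may place the vnode back on the
 * free list.  The function name is hypothetical.
 */
#if 0
static void
example_ref_lifecycle(struct vnode *vp)
{
	vref(vp);	/* vp must already have at least one ref */
	/* ... use the vnode; no lock is acquired by vref()/vrele() ... */
	vrele(vp);	/* last ref may deactivate and free-list the vnode */
}
#endif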

/*
 * Hold a vnode, preventing it from being recycled (unless it is already
 * being recycled or has already been recycled).
 *
 * Opting not to remove a vnode from the freelist simply means that
 * allocvnode must do it for us if it finds an unsuitable vnode.
 */
void
vhold(struct vnode *vp)
{
	spin_lock_wr(&vp->v_spinlock);
	atomic_add_int(&vp->v_holdcnt, 1);
	spin_unlock_wr(&vp->v_spinlock);
}

/*
 * Like vrele(), we must atomically place the vnode on the free list if
 * it becomes suitable.  vhold()/vdrop() do not mess with VINACTIVE.
 */
void
vdrop(struct vnode *vp)
{
	KKASSERT(vp->v_holdcnt > 0);
	spin_lock_wr(&vp->v_spinlock);
	atomic_subtract_int(&vp->v_holdcnt, 1);
	if (vshouldfree(vp, 0))
		__vfree(vp);
	spin_unlock_wr(&vp->v_spinlock);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  Only vp->v_lock, the top layer of the VFS, is locked.
 * You must be holding a normal reference in order to be able to safely
 * call vx_lock() and vx_unlock().
 *
 * vx_get() also differs from vget() in that it does not clear the
 * VINACTIVE bit on a vnode.
 */

void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

void
vx_get(struct vnode *vp)
{
	vref_initial(vp, 0);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	vref_initial(vp, 0);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error)
		vrele(vp);
	return(error);
}

void
vx_put(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}
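
/*
 * Illustrative sketch only, not part of the original file: a typical
 * vx_get_nonblock()/vx_put() bracket for reclamation/deactivation style
 * work.  Unlike vget(), the VX functions do not clear VINACTIVE, and the
 * non-blocking variant lets the caller back off instead of sleeping on
 * the lock.  The function name is hypothetical.
 */
#if 0
static int
example_vx_usage(struct vnode *vp)
{
	int error;

	if ((error = vx_get_nonblock(vp)) != 0)
		return (error);	/* lock was contested, ref already dropped */
	/* ... reclamation or deactivation related work on vp ... */
	vx_put(vp);		/* releases the VX lock and the ref */
	return (0);
}
#endif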

/****************************************************************
 *		VNODE ACQUISITION FUNCTIONS			*
 ****************************************************************
 *
 * vget() and vput() access a vnode for the intent of executing an
 * operation other than a reclamation or deactivation.  vget() will ref
 * and lock the vnode, vput() will unlock and deref the vnode.
 * The VOP_*() locking functions are used.
 *
 * CALLING VGET IS MANDATORY PRIOR TO ANY MODIFYING OPERATION ON A VNODE.
 * This is because vget handles the VINACTIVE interlock and is responsible
 * for clearing the bit.  If the bit is not cleared, inode updates may not
 * make it to disk.
 *
 * Special cases: If vget()'s locking operation fails, the vrele() call may
 * cause the vnode to be deactivated (VOP_INACTIVE called).  However, this
 * never occurs if the vnode is in a reclaimed state.  Vnodes in reclaimed
 * states always return an error code of ENOENT.
 *
 * Special cases: vput() will unlock and, if it is the last reference,
 * deactivate the vnode.  The deactivation uses a separate non-layered
 * VX lock after the normal unlock.  XXX make it more efficient.
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	vref_initial(vp, 0);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags)) != 0) {
			vrele(vp);
		} else if (vp->v_flag & VRECLAIMED) {
			vn_unlock(vp);
			vrele(vp);
			error = ENOENT;
		} else {
			vp->v_flag &= ~VINACTIVE;	/* XXX not MP safe */
			error = 0;
		}
	} else {
		panic("vget() called with no lock specified!");
		error = ENOENT;	/* not reached, compiler opt */
	}
	return(error);
}

void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

void
vsetflags(struct vnode *vp, int flags)
{
	crit_enter();
	vp->v_flag |= flags;
	crit_exit();
}

void
vclrflags(struct vnode *vp, int flags)
{
	crit_enter();
	vp->v_flag &= ~flags;
	crit_exit();
}
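
/*
 * Illustrative sketch only, not part of the original file: the mandatory
 * vget()/vput() bracket around a modifying operation.  vget() refs and
 * locks the vnode and clears VINACTIVE; a reclaimed vnode is reported as
 * ENOENT.  The function name is hypothetical.
 */
#if 0
static int
example_modify_vnode(struct vnode *vp)
{
	int error;

	if ((error = vget(vp, LK_EXCLUSIVE)) != 0)
		return (error);	/* e.g. ENOENT if vp was reclaimed */
	/* ... modifying VOP operations on the locked, active vnode ... */
	vput(vp);		/* unlock and drop the ref */
	return (0);
}
#endif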

/*
 * Obtain a new vnode from the freelist, allocating more if necessary.
 * The returned vnode is VX locked & refd.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct thread *td;
	struct vnode *vp;

	/*
	 * Try to reuse vnodes if we hit the max.  This situation only
	 * occurs in certain large-memory (2G+) situations.  We cannot
	 * attempt to directly reclaim vnodes due to nasty recursion
	 * problems.
	 */
	while (numvnodes - freevnodes > desiredvnodes)
		vnlru_proc_wait();

	td = curthread;
	vp = NULL;

	/*
	 * Attempt to reuse a vnode already on the free list, allocating
	 * a new vnode if we can't find one or if we have not yet reached
	 * a good minimum for good LRU performance.
	 */
	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
		int count;

		for (count = 0; count < freevnodes; count++) {
			/*
			 * __VNODESCAN__
			 *
			 * Pull the next vnode off the free list and do some
			 * sanity checks.  Note that regardless of how we
			 * block, if freevnodes is non-zero there had better
			 * be something on the list.
			 */
			vp = TAILQ_FIRST(&vnode_free_list);
			if (vp == NULL)
				panic("getnewvnode: free vnode isn't");

			/* XXX for now */
			KKASSERT(vp->v_flag & VFREE);

			/*
			 * Handle the case where the vnode was pulled off
			 * the free list while we were waiting for the
			 * spinlock.
			 */
			spin_lock_wr(&vp->v_spinlock);
			if ((vp->v_flag & VFREE) == 0) {
				spin_unlock_wr(&vp->v_spinlock);
				vp = NULL;
				continue;
			}

			/*
			 * Lazy removal of the vnode from the freelist if
			 * the vnode has references.
			 */
			if (vp->v_usecount || vp->v_holdcnt) {
				__vbusy(vp);
				spin_unlock_wr(&vp->v_spinlock);
				vp = NULL;
				continue;
			}

			/*
			 * vx_get() equivalent, but atomic with the
			 * spinlock held.  Since 0->1 transitions and the
			 * lockmgr are protected by the spinlock we must
			 * be able to get an exclusive lock without blocking
			 * here.
			 *
			 * Also take the vnode off of the free list and
			 * assert that it is inactive.
			 */
			vp->v_usecount = 1;
			lockmgr_setexclusive_interlocked(&vp->v_lock);
			__vbusy(vp);
			KKASSERT(vp->v_flag & VINACTIVE);

			/*
			 * Reclaim the vnode.  VRECLAIMED will be set
			 * atomically before the spinlock is released
			 * by vgone_interlocked().
			 */
			if ((vp->v_flag & VRECLAIMED) == 0) {
				vgone_interlocked(vp);
				/* spinlock unlocked */
			} else {
				spin_unlock_wr(&vp->v_spinlock);
			}

			/*
			 * We reclaimed the vnode but other claimants may
			 * have referenced it while we were blocked.  We
			 * cannot reuse a vnode until all refs are gone and
			 * the vnode has completed reclamation.
			 */
			KKASSERT(vp->v_flag & VRECLAIMED);
			if (vp->v_usecount != 1 || vp->v_holdcnt) {
				vx_put(vp);
				vp = NULL;
				continue;
			}

			/*
			 * There are no more structural references to the
			 * vnode, referenced or otherwise.  We have a vnode!
			 *
			 * The vnode may have been placed on the free list
			 * while we were blocked.
			 */
			if (vp->v_flag & VFREE)
				__vbusy(vp);
			KKASSERT(vp->v_flag & VINACTIVE);
			break;
		}
	}

	/*
	 * If we have a vp it will be refd and VX locked.
	 */
	if (vp) {
#ifdef INVARIANTS
		if (vp->v_data)
			panic("cleaned vnode isn't");
		if (vp->v_track_read.bk_active + vp->v_track_write.bk_active)
			panic("Clean vnode has pending I/O's");
		KKASSERT(vp->v_mount == NULL);
#endif
		vp->v_flag = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		vp->v_opencount = 0;
		vp->v_writecount = 0;	/* XXX */
		lockreinit(&vp->v_lock, "vnode", lktimeout, lkflags);
		KKASSERT(TAILQ_FIRST(&vp->v_namecache) == NULL);
	} else {
		/*
		 * A brand-new vnode (we could use malloc() here I think) XXX
		 */
		vp = kmalloc(sizeof(struct vnode), M_VNODE, M_WAITOK|M_ZERO);
		lwkt_token_init(&vp->v_pollinfo.vpi_token);
		lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
		ccms_dataspace_init(&vp->v_ccms);
		TAILQ_INIT(&vp->v_namecache);

		/*
		 * Short cut around vfreeing it and looping; just set it up
		 * as if we had pulled a reclaimed vnode off the freelist
		 * and reinitialized it.
		 */
		vp->v_usecount = 1;
		lockmgr(&vp->v_lock, LK_EXCLUSIVE);
		numvnodes++;
	}

	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_ops = NULL;
	vp->v_data = NULL;
	KKASSERT(vp->v_mount == NULL);
	return (vp);
}
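
/*
 * Illustrative sketch only, not part of the original file: allocvnode()
 * hands back a vnode that is already referenced and VX locked, with
 * v_type VNON and no v_ops/v_data attached.  A real consumer, such as the
 * getnewvnode() path, finishes the setup before the vnode is exposed;
 * the function name and the VREG initialization below are hypothetical.
 */
#if 0
static struct vnode *
example_allocvnode_usage(void)
{
	struct vnode *vp;

	vp = allocvnode(0, 0);	/* hypothetical lock timeout/flags of 0 */
	vp->v_type = VREG;	/* hypothetical: caller picks the type */
	/* ... attach v_ops, v_data, and a mount before use ... */
	return (vp);		/* still referenced and VX locked */
}
#endif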