1 /* 2 * Copyright (c) 2003-2020 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1989, 1993, 1995 35 * The Regents of the University of California. All rights reserved. 36 * 37 * This code is derived from software contributed to Berkeley by 38 * Poul-Henning Kamp of the FreeBSD Project. 39 * 40 * Redistribution and use in source and binary forms, with or without 41 * modification, are permitted provided that the following conditions 42 * are met: 43 * 1. Redistributions of source code must retain the above copyright 44 * notice, this list of conditions and the following disclaimer. 45 * 2. Redistributions in binary form must reproduce the above copyright 46 * notice, this list of conditions and the following disclaimer in the 47 * documentation and/or other materials provided with the distribution. 48 * 3. Neither the name of the University nor the names of its contributors 49 * may be used to endorse or promote products derived from this software 50 * without specific prior written permission. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * SUCH DAMAGE. 63 */ 64 65 #include <sys/param.h> 66 #include <sys/systm.h> 67 #include <sys/uio.h> 68 #include <sys/kernel.h> 69 #include <sys/sysctl.h> 70 #include <sys/mount.h> 71 #include <sys/vnode.h> 72 #include <sys/malloc.h> 73 #include <sys/sysproto.h> 74 #include <sys/spinlock.h> 75 #include <sys/proc.h> 76 #include <sys/namei.h> 77 #include <sys/nlookup.h> 78 #include <sys/filedesc.h> 79 #include <sys/fnv_hash.h> 80 #include <sys/globaldata.h> 81 #include <sys/kern_syscall.h> 82 #include <sys/dirent.h> 83 #include <ddb/ddb.h> 84 85 #include <sys/spinlock2.h> 86 87 #define MAX_RECURSION_DEPTH 64 88 89 /* 90 * Random lookups in the cache are accomplished with a hash table using 91 * a hash key of (nc_src_vp, name). Each hash chain has its own spin lock. 92 * 93 * Negative entries may exist and correspond to resolved namecache 94 * structures where nc_vp is NULL. In a negative entry, NCF_WHITEOUT 95 * will be set if the entry corresponds to a whited-out directory entry 96 * (verses simply not finding the entry at all). pcpu_ncache[n].neg_list 97 * is locked via pcpu_ncache[n].neg_spin; 98 * 99 * MPSAFE RULES: 100 * 101 * (1) ncp's typically have at least a nc_refs of 1, and usually 2. One 102 * is applicable to direct lookups via the hash table nchpp or via 103 * nc_list (the two are added or removed together). Removal of the ncp 104 * from the hash table drops this reference. The second is applicable 105 * to vp->v_namecache linkages (or negative list linkages), and removal 106 * of the ncp from these lists drops this reference. 107 * 108 * On the 1->0 transition of nc_refs the ncp can no longer be referenced 109 * and must be destroyed. No other thread should have access to it at 110 * this point so it can be safely locked and freed without any deadlock 111 * fears. 112 * 113 * The 1->0 transition can occur at almost any juncture and so cache_drop() 114 * deals with it directly. 115 * 116 * (2) Once the 1->0 transition occurs, the entity that caused the transition 117 * will be responsible for destroying the ncp. The ncp cannot be on any 118 * list or hash at this time, or be held by anyone other than the caller 119 * responsible for the transition. 120 * 121 * (3) A ncp must be locked in order to modify it. 122 * 123 * (5) ncp locks are ordered, child-to-parent. Child first, then parent. 124 * This may seem backwards but forward-scans use the hash table and thus 125 * can hold the parent unlocked while traversing downward. Deletions, 126 * on the other-hand, tend to propagate bottom-up since the ref on the 127 * is dropped as the children go away. 128 * 129 * (6) Both parent and child must be locked in order to enter the child onto 130 * the parent's nc_list. 131 */ 132 133 /* 134 * Structures associated with name cacheing. 135 */ 136 #define NCHHASH(hash) (&nchashtbl[(hash) & nchash]) 137 #define MINNEG 1024 138 #define MINPOS 1024 139 #define NCMOUNT_NUMCACHE 32768 /* power of 2 */ 140 141 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 142 143 TAILQ_HEAD(nchash_list, namecache); 144 145 /* 146 * Don't cachealign, but at least pad to 32 bytes so entries 147 * don't cross a cache line. 148 */ 149 struct nchash_head { 150 struct nchash_list list; /* 16 bytes */ 151 struct spinlock spin; /* 8 bytes */ 152 long pad01; /* 8 bytes */ 153 }; 154 155 struct ncmount_cache { 156 struct spinlock spin; 157 struct namecache *ncp; 158 struct mount *mp; 159 int isneg; /* if != 0 mp is originator and not target */ 160 int ticks; 161 } __cachealign; 162 163 struct pcpu_ncache { 164 struct spinlock neg_spin; /* for neg_list and neg_count */ 165 struct namecache_list neg_list; 166 long neg_count; 167 long vfscache_negs; 168 long vfscache_count; 169 long vfscache_leafs; 170 long numdefered; 171 } __cachealign; 172 173 __read_mostly static struct nchash_head *nchashtbl; 174 __read_mostly static struct pcpu_ncache *pcpu_ncache; 175 static struct ncmount_cache ncmount_cache[NCMOUNT_NUMCACHE]; 176 177 /* 178 * ncvp_debug - debug cache_fromvp(). This is used by the NFS server 179 * to create the namecache infrastructure leading to a dangling vnode. 180 * 181 * 0 Only errors are reported 182 * 1 Successes are reported 183 * 2 Successes + the whole directory scan is reported 184 * 3 Force the directory scan code run as if the parent vnode did not 185 * have a namecache record, even if it does have one. 186 */ 187 __read_mostly static int ncvp_debug; 188 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0, 189 "Namecache debug level (0-3)"); 190 191 __read_mostly static u_long nchash; /* size of hash table */ 192 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 193 "Size of namecache hash table"); 194 195 __read_mostly static int ncnegflush = 10; /* burst for negative flush */ 196 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0, 197 "Batch flush negative entries"); 198 199 __read_mostly static int ncposflush = 10; /* burst for positive flush */ 200 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0, 201 "Batch flush positive entries"); 202 203 __read_mostly static int ncnegfactor = 16; /* ratio of negative entries */ 204 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 205 "Ratio of namecache negative entries"); 206 207 __read_mostly static int nclockwarn; /* warn on locked entries in ticks */ 208 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0, 209 "Warn on locked namecache entries in ticks"); 210 211 __read_mostly static int ncposlimit; /* number of cache entries allocated */ 212 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0, 213 "Number of cache entries allocated"); 214 215 __read_mostly static int ncp_shared_lock_disable = 0; 216 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW, 217 &ncp_shared_lock_disable, 0, "Disable shared namecache locks"); 218 219 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), 220 "sizeof(struct vnode)"); 221 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), 222 "sizeof(struct namecache)"); 223 224 __read_mostly static int ncmount_cache_enable = 1; 225 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW, 226 &ncmount_cache_enable, 0, "mount point cache"); 227 228 static __inline void _cache_drop(struct namecache *ncp); 229 static int cache_resolve_mp(struct mount *mp); 230 static struct vnode *cache_dvpref(struct namecache *ncp); 231 static void _cache_setunresolved(struct namecache *ncp); 232 static void _cache_cleanneg(long count); 233 static void _cache_cleanpos(long count); 234 static void _cache_cleandefered(void); 235 static void _cache_unlink(struct namecache *ncp); 236 237 /* 238 * The new name cache statistics (these are rolled up globals and not 239 * modified in the critical path, see struct pcpu_ncache). 240 */ 241 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); 242 static long vfscache_negs; 243 SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0, 244 "Number of negative namecache entries"); 245 static long vfscache_count; 246 SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0, 247 "Number of namecaches entries"); 248 static long vfscache_leafs; 249 SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0, 250 "Number of namecaches entries"); 251 static long numdefered; 252 SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0, 253 "Number of cache entries allocated"); 254 255 256 struct nchstats nchstats[SMP_MAXCPU]; 257 /* 258 * Export VFS cache effectiveness statistics to user-land. 259 * 260 * The statistics are left for aggregation to user-land so 261 * neat things can be achieved, like observing per-CPU cache 262 * distribution. 263 */ 264 static int 265 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 266 { 267 struct globaldata *gd; 268 int i, error; 269 270 error = 0; 271 for (i = 0; i < ncpus; ++i) { 272 gd = globaldata_find(i); 273 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats), 274 sizeof(struct nchstats)))) 275 break; 276 } 277 278 return (error); 279 } 280 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD, 281 0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics"); 282 283 static void cache_zap(struct namecache *ncp); 284 285 /* 286 * Cache mount points and namecache records in order to avoid unnecessary 287 * atomic ops on mnt_refs and ncp->refs. This improves concurrent SMP 288 * performance and is particularly important on multi-socket systems to 289 * reduce cache-line ping-ponging. 290 * 291 * Try to keep the pcpu structure within one cache line (~64 bytes). 292 */ 293 #define MNTCACHE_COUNT 5 294 295 struct mntcache { 296 struct mount *mntary[MNTCACHE_COUNT]; 297 struct namecache *ncp1; 298 struct namecache *ncp2; 299 struct nchandle ncdir; 300 int iter; 301 int unused01; 302 } __cachealign; 303 304 static struct mntcache pcpu_mntcache[MAXCPU]; 305 306 static 307 void 308 _cache_mntref(struct mount *mp) 309 { 310 struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid]; 311 int i; 312 313 for (i = 0; i < MNTCACHE_COUNT; ++i) { 314 if (cache->mntary[i] != mp) 315 continue; 316 if (atomic_cmpset_ptr((void *)&cache->mntary[i], mp, NULL)) 317 return; 318 } 319 atomic_add_int(&mp->mnt_refs, 1); 320 } 321 322 static 323 void 324 _cache_mntrel(struct mount *mp) 325 { 326 struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid]; 327 int i; 328 329 for (i = 0; i < MNTCACHE_COUNT; ++i) { 330 if (cache->mntary[i] == NULL) { 331 mp = atomic_swap_ptr((void *)&cache->mntary[i], mp); 332 if (mp == NULL) 333 return; 334 } 335 } 336 i = (int)((uint32_t)++cache->iter % (uint32_t)MNTCACHE_COUNT); 337 mp = atomic_swap_ptr((void *)&cache->mntary[i], mp); 338 if (mp) 339 atomic_add_int(&mp->mnt_refs, -1); 340 } 341 342 /* 343 * Clears all cached mount points on all cpus. This routine should only 344 * be called when we are waiting for a mount to clear, e.g. so we can 345 * unmount. 346 */ 347 void 348 cache_clearmntcache(void) 349 { 350 int n; 351 352 for (n = 0; n < ncpus; ++n) { 353 struct mntcache *cache = &pcpu_mntcache[n]; 354 struct namecache *ncp; 355 struct mount *mp; 356 int i; 357 358 for (i = 0; i < MNTCACHE_COUNT; ++i) { 359 if (cache->mntary[i]) { 360 mp = atomic_swap_ptr( 361 (void *)&cache->mntary[i], NULL); 362 if (mp) 363 atomic_add_int(&mp->mnt_refs, -1); 364 } 365 } 366 367 if (cache->ncp1) { 368 ncp = atomic_swap_ptr((void *)&cache->ncp1, NULL); 369 if (ncp) 370 _cache_drop(ncp); 371 } 372 if (cache->ncp2) { 373 ncp = atomic_swap_ptr((void *)&cache->ncp2, NULL); 374 if (ncp) 375 _cache_drop(ncp); 376 } 377 if (cache->ncdir.ncp) { 378 ncp = atomic_swap_ptr((void *)&cache->ncdir.ncp, NULL); 379 if (ncp) 380 _cache_drop(ncp); 381 } 382 if (cache->ncdir.mount) { 383 mp = atomic_swap_ptr((void *)&cache->ncdir.mount, NULL); 384 if (mp) 385 atomic_add_int(&mp->mnt_refs, -1); 386 } 387 } 388 } 389 390 /* 391 * Namespace locking. The caller must already hold a reference to the 392 * namecache structure in order to lock/unlock it. The controlling entity 393 * in a 1->0 transition does not need to lock the ncp to dispose of it, 394 * as nobody else will have visiblity to it at that point. 395 * 396 * Note that holding a locked namecache structure prevents other threads 397 * from making namespace changes (e.g. deleting or creating), prevents 398 * vnode association state changes by other threads, and prevents the 399 * namecache entry from being resolved or unresolved by other threads. 400 * 401 * An exclusive lock owner has full authority to associate/disassociate 402 * vnodes and resolve/unresolve the locked ncp. 403 * 404 * A shared lock owner only has authority to acquire the underlying vnode, 405 * if any. 406 * 407 * The primary lock field is nc_lockstatus. nc_locktd is set after the 408 * fact (when locking) or cleared prior to unlocking. 409 * 410 * WARNING! Holding a locked ncp will prevent a vnode from being destroyed 411 * or recycled, but it does NOT help you if the vnode had already 412 * initiated a recyclement. If this is important, use cache_get() 413 * rather then cache_lock() (and deal with the differences in the 414 * way the refs counter is handled). Or, alternatively, make an 415 * unconditional call to cache_validate() or cache_resolve() 416 * after cache_lock() returns. 417 */ 418 static __inline 419 void 420 _cache_lock(struct namecache *ncp) 421 { 422 lockmgr(&ncp->nc_lock, LK_EXCLUSIVE); 423 } 424 425 /* 426 * Release a previously acquired lock. 427 * 428 * A concurrent shared-lock acquisition or acquisition/release can 429 * race bit 31 so only drop the ncp if bit 31 was set. 430 */ 431 static __inline 432 void 433 _cache_unlock(struct namecache *ncp) 434 { 435 lockmgr(&ncp->nc_lock, LK_RELEASE); 436 } 437 438 /* 439 * Lock ncp exclusively, non-blocking. Return 0 on success. 440 */ 441 static __inline 442 int 443 _cache_lock_nonblock(struct namecache *ncp) 444 { 445 int error; 446 447 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT); 448 if (__predict_false(error != 0)) { 449 return(EWOULDBLOCK); 450 } 451 return 0; 452 } 453 454 /* 455 * This is a special form of _cache_lock() which only succeeds if 456 * it can get a pristine, non-recursive lock. The caller must have 457 * already ref'd the ncp. 458 * 459 * On success the ncp will be locked, on failure it will not. The 460 * ref count does not change either way. 461 * 462 * We want _cache_lock_special() (on success) to return a definitively 463 * usable vnode or a definitively unresolved ncp. 464 */ 465 static __inline 466 int 467 _cache_lock_special(struct namecache *ncp) 468 { 469 if (_cache_lock_nonblock(ncp) == 0) { 470 if (lockmgr_oneexcl(&ncp->nc_lock)) { 471 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 472 _cache_setunresolved(ncp); 473 return 0; 474 } 475 _cache_unlock(ncp); 476 } 477 return EWOULDBLOCK; 478 } 479 480 /* 481 * Shared lock, guarantees vp held 482 * 483 * The shared lock holds vp on the 0->1 transition. It is possible to race 484 * another shared lock release, preventing the other release from dropping 485 * the vnode and clearing bit 31. 486 * 487 * If it is not set then we are responsible for setting it, and this 488 * responsibility does not race with anyone else. 489 */ 490 static __inline 491 void 492 _cache_lock_shared(struct namecache *ncp) 493 { 494 lockmgr(&ncp->nc_lock, LK_SHARED); 495 } 496 497 #if 0 498 if (didwarn == 0) { 499 didwarn = ticks - nclockwarn; 500 kprintf("[diagnostic] cache_lock_shared: " 501 "%s blocked on %p %08x " 502 "\"%*.*s\"\n", 503 curthread->td_comm, ncp, count, 504 ncp->nc_nlen, ncp->nc_nlen, 505 ncp->nc_name); 506 } 507 if (didwarn) { 508 kprintf("[diagnostic] cache_lock_shared: " 509 "%s unblocked %*.*s after %d secs\n", 510 curthread->td_comm, 511 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 512 (int)(ticks - didwarn) / hz); 513 } 514 #endif 515 516 /* 517 * Shared lock, guarantees vp held. Non-blocking. Returns 0 on success 518 */ 519 static __inline 520 int 521 _cache_lock_shared_nonblock(struct namecache *ncp) 522 { 523 int error; 524 525 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT); 526 if (__predict_false(error != 0)) { 527 return(EWOULDBLOCK); 528 } 529 return 0; 530 } 531 532 /* 533 * This function tries to get a shared lock but will back-off to an 534 * exclusive lock if: 535 * 536 * (1) Some other thread is trying to obtain an exclusive lock 537 * (to prevent the exclusive requester from getting livelocked out 538 * by many shared locks). 539 * 540 * (2) The current thread already owns an exclusive lock (to avoid 541 * deadlocking). 542 * 543 * WARNING! On machines with lots of cores we really want to try hard to 544 * get a shared lock or concurrent path lookups can chain-react 545 * into a very high-latency exclusive lock. 546 * 547 * This is very evident in dsynth's initial scans. 548 */ 549 static __inline 550 int 551 _cache_lock_shared_special(struct namecache *ncp) 552 { 553 /* 554 * Only honor a successful shared lock (returning 0) if there is 555 * no exclusive request pending and the vnode, if present, is not 556 * in a reclaimed state. 557 */ 558 if (_cache_lock_shared_nonblock(ncp) == 0) { 559 if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) { 560 if (ncp->nc_vp == NULL || 561 (ncp->nc_vp->v_flag & VRECLAIMED) == 0) { 562 return(0); 563 } 564 } 565 _cache_unlock(ncp); 566 return(EWOULDBLOCK); 567 } 568 569 /* 570 * Non-blocking shared lock failed. If we already own the exclusive 571 * lock just acquire another exclusive lock (instead of deadlocking). 572 * Otherwise acquire a shared lock. 573 */ 574 if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) { 575 _cache_lock(ncp); 576 return(0); 577 } 578 _cache_lock_shared(ncp); 579 return(0); 580 } 581 582 static __inline 583 int 584 _cache_lockstatus(struct namecache *ncp) 585 { 586 int status; 587 588 status = lockstatus(&ncp->nc_lock, curthread); 589 if (status == 0 || status == LK_EXCLOTHER) 590 status = -1; 591 return status; 592 } 593 594 /* 595 * cache_hold() and cache_drop() prevent the premature deletion of a 596 * namecache entry but do not prevent operations (such as zapping) on 597 * that namecache entry. 598 * 599 * This routine may only be called from outside this source module if 600 * nc_refs is already deterministically at least 1, such as being 601 * associated with e.g. a process, file descriptor, or some other entity. 602 * 603 * Only the above situations, similar situations within this module where 604 * the ref count is deterministically at least 1, or when the ncp is found 605 * via the nchpp (hash table) lookup, can bump nc_refs. 606 * 607 * Very specifically, a ncp found via nc_list CANNOT bump nc_refs. It 608 * can still be removed from the nc_list, however, as long as the caller 609 * can acquire its lock (in the wrong order). 610 * 611 * This is a rare case where callers are allowed to hold a spinlock, 612 * so we can't ourselves. 613 */ 614 static __inline 615 struct namecache * 616 _cache_hold(struct namecache *ncp) 617 { 618 KKASSERT(ncp->nc_refs > 0); 619 atomic_add_int(&ncp->nc_refs, 1); 620 621 return(ncp); 622 } 623 624 /* 625 * Drop a cache entry. 626 * 627 * The 1->0 transition is special and requires the caller to destroy the 628 * entry. It means that the ncp is no longer on a nchpp list (since that 629 * would mean there was stilla ref). The ncp could still be on a nc_list 630 * but will not have any child of its own, again because nc_refs is now 0 631 * and children would have a ref to their parent. 632 * 633 * Once the 1->0 transition is made, nc_refs cannot be incremented again. 634 */ 635 static __inline 636 void 637 _cache_drop(struct namecache *ncp) 638 { 639 if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) { 640 /* 641 * Executed unlocked (no need to lock on last drop) 642 */ 643 _cache_setunresolved(ncp); 644 645 /* 646 * Scrap it. 647 */ 648 ncp->nc_refs = -1; /* safety */ 649 if (ncp->nc_name) 650 kfree(ncp->nc_name, M_VFSCACHE); 651 kfree(ncp, M_VFSCACHE); 652 } 653 } 654 655 /* 656 * Link a new namecache entry to its parent and to the hash table. Be 657 * careful to avoid races if vhold() blocks in the future. 658 * 659 * Both ncp and par must be referenced and locked. The reference is 660 * transfered to the nchpp (and, most notably, NOT to the parent list). 661 * 662 * NOTE: The hash table spinlock is held across this call, we can't do 663 * anything fancy. 664 */ 665 static void 666 _cache_link_parent(struct namecache *ncp, struct namecache *par, 667 struct nchash_head *nchpp) 668 { 669 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 670 671 KKASSERT(ncp->nc_parent == NULL); 672 ncp->nc_parent = par; 673 ncp->nc_head = nchpp; 674 675 /* 676 * Set inheritance flags. Note that the parent flags may be 677 * stale due to getattr potentially not having been run yet 678 * (it gets run during nlookup()'s). 679 */ 680 ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE); 681 if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE)) 682 ncp->nc_flag |= NCF_SF_PNOCACHE; 683 if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE)) 684 ncp->nc_flag |= NCF_UF_PCACHE; 685 686 /* 687 * Add to hash table and parent, adjust accounting 688 */ 689 TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash); 690 atomic_add_long(&pn->vfscache_count, 1); 691 if (TAILQ_EMPTY(&ncp->nc_list)) 692 atomic_add_long(&pn->vfscache_leafs, 1); 693 694 if (TAILQ_EMPTY(&par->nc_list)) { 695 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry); 696 atomic_add_long(&pn->vfscache_leafs, -1); 697 /* 698 * Any vp associated with an ncp which has children must 699 * be held to prevent it from being recycled. 700 */ 701 if (par->nc_vp) 702 vhold(par->nc_vp); 703 } else { 704 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry); 705 } 706 _cache_hold(par); /* add nc_parent ref */ 707 } 708 709 /* 710 * Remove the parent and hash associations from a namecache structure. 711 * Drop the ref-count on the parent. The caller receives the ref 712 * from the ncp's nchpp linkage that was removed and may forward that 713 * ref to a new linkage. 714 715 * The caller usually holds an additional ref * on the ncp so the unlink 716 * cannot be the final drop. XXX should not be necessary now since the 717 * caller receives the ref from the nchpp linkage, assuming the ncp 718 * was linked in the first place. 719 * 720 * ncp must be locked, which means that there won't be any nc_parent 721 * removal races. This routine will acquire a temporary lock on 722 * the parent as well as the appropriate hash chain. 723 */ 724 static void 725 _cache_unlink_parent(struct namecache *ncp) 726 { 727 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 728 struct namecache *par; 729 struct vnode *dropvp; 730 struct nchash_head *nchpp; 731 732 if ((par = ncp->nc_parent) != NULL) { 733 cpu_ccfence(); 734 KKASSERT(ncp->nc_parent == par); 735 736 /* don't add a ref, we drop the nchpp ref later */ 737 _cache_lock(par); 738 nchpp = ncp->nc_head; 739 spin_lock(&nchpp->spin); 740 741 /* 742 * Remove from hash table and parent, adjust accounting 743 */ 744 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash); 745 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 746 atomic_add_long(&pn->vfscache_count, -1); 747 if (TAILQ_EMPTY(&ncp->nc_list)) 748 atomic_add_long(&pn->vfscache_leafs, -1); 749 750 dropvp = NULL; 751 if (TAILQ_EMPTY(&par->nc_list)) { 752 atomic_add_long(&pn->vfscache_leafs, 1); 753 if (par->nc_vp) 754 dropvp = par->nc_vp; 755 } 756 ncp->nc_parent = NULL; 757 ncp->nc_head = NULL; 758 spin_unlock(&nchpp->spin); 759 _cache_unlock(par); 760 _cache_drop(par); /* drop nc_parent ref */ 761 762 /* 763 * We can only safely vdrop with no spinlocks held. 764 */ 765 if (dropvp) 766 vdrop(dropvp); 767 } 768 } 769 770 /* 771 * Allocate a new namecache structure. Most of the code does not require 772 * zero-termination of the string but it makes vop_compat_ncreate() easier. 773 * 774 * The returned ncp will be locked and referenced. The ref is generally meant 775 * to be transfered to the nchpp linkage. 776 */ 777 static struct namecache * 778 cache_alloc(int nlen) 779 { 780 struct namecache *ncp; 781 782 ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO); 783 if (nlen) 784 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK); 785 ncp->nc_nlen = nlen; 786 ncp->nc_flag = NCF_UNRESOLVED; 787 ncp->nc_error = ENOTCONN; /* needs to be resolved */ 788 ncp->nc_refs = 1; 789 TAILQ_INIT(&ncp->nc_list); 790 lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE); 791 lockmgr(&ncp->nc_lock, LK_EXCLUSIVE); 792 793 return(ncp); 794 } 795 796 /* 797 * Can only be called for the case where the ncp has never been 798 * associated with anything (so no spinlocks are needed). 799 */ 800 static void 801 _cache_free(struct namecache *ncp) 802 { 803 KKASSERT(ncp->nc_refs == 1); 804 if (ncp->nc_name) 805 kfree(ncp->nc_name, M_VFSCACHE); 806 kfree(ncp, M_VFSCACHE); 807 } 808 809 /* 810 * [re]initialize a nchandle. 811 */ 812 void 813 cache_zero(struct nchandle *nch) 814 { 815 nch->ncp = NULL; 816 nch->mount = NULL; 817 } 818 819 /* 820 * Ref and deref a nchandle structure (ncp + mp) 821 * 822 * The caller must specify a stable ncp pointer, typically meaning the 823 * ncp is already referenced but this can also occur indirectly through 824 * e.g. holding a lock on a direct child. 825 * 826 * WARNING: Caller may hold an unrelated read spinlock, which means we can't 827 * use read spinlocks here. 828 */ 829 struct nchandle * 830 cache_hold(struct nchandle *nch) 831 { 832 _cache_hold(nch->ncp); 833 _cache_mntref(nch->mount); 834 return(nch); 835 } 836 837 /* 838 * Create a copy of a namecache handle for an already-referenced 839 * entry. 840 */ 841 void 842 cache_copy(struct nchandle *nch, struct nchandle *target) 843 { 844 struct mntcache *cache; 845 struct namecache *ncp; 846 847 *target = *nch; 848 _cache_mntref(target->mount); 849 ncp = target->ncp; 850 851 cache = &pcpu_mntcache[mycpu->gd_cpuid]; 852 if (ncp) { 853 if (ncp == cache->ncp1) { 854 if (atomic_cmpset_ptr((void *)&cache->ncp1, ncp, NULL)) 855 return; 856 } 857 if (ncp == cache->ncp2) { 858 if (atomic_cmpset_ptr((void *)&cache->ncp2, ncp, NULL)) 859 return; 860 } 861 _cache_hold(ncp); 862 } 863 } 864 865 /* 866 * Caller wants to copy the current directory, copy it out from our 867 * pcpu cache if possible (the entire critical path is just two localized 868 * cmpset ops). If the pcpu cache has a snapshot at all it will be a 869 * valid one, so we don't have to lock p->p_fd even though we are loading 870 * two fields. 871 * 872 * This has a limited effect since nlookup must still ref and shlock the 873 * vnode to check perms. We do avoid the per-proc spin-lock though, which 874 * can aid threaded programs. 875 */ 876 void 877 cache_copy_ncdir(struct proc *p, struct nchandle *target) 878 { 879 struct mntcache *cache; 880 881 *target = p->p_fd->fd_ncdir; 882 883 cache = &pcpu_mntcache[mycpu->gd_cpuid]; 884 if (target->ncp == cache->ncdir.ncp && 885 target->mount == cache->ncdir.mount) { 886 if (atomic_cmpset_ptr((void *)&cache->ncdir.ncp, 887 target->ncp, NULL)) { 888 if (atomic_cmpset_ptr((void *)&cache->ncdir.mount, 889 target->mount, NULL)) { 890 /* CRITICAL PATH */ 891 return; 892 } 893 _cache_drop(target->ncp); 894 } 895 } 896 spin_lock_shared(&p->p_fd->fd_spin); 897 cache_copy(&p->p_fd->fd_ncdir, target); 898 spin_unlock_shared(&p->p_fd->fd_spin); 899 } 900 901 void 902 cache_changemount(struct nchandle *nch, struct mount *mp) 903 { 904 _cache_mntref(mp); 905 _cache_mntrel(nch->mount); 906 nch->mount = mp; 907 } 908 909 void 910 cache_drop(struct nchandle *nch) 911 { 912 _cache_mntrel(nch->mount); 913 _cache_drop(nch->ncp); 914 nch->ncp = NULL; 915 nch->mount = NULL; 916 } 917 918 /* 919 * Drop the nchandle, but try to cache the ref to avoid global atomic 920 * ops. This is typically done on the system root and jail root nchandles. 921 */ 922 void 923 cache_drop_and_cache(struct nchandle *nch) 924 { 925 struct mntcache *cache; 926 struct namecache *ncp; 927 928 _cache_mntrel(nch->mount); 929 930 cache = &pcpu_mntcache[mycpu->gd_cpuid]; 931 ncp = nch->ncp; 932 if (cache->ncp1 == NULL) { 933 ncp = atomic_swap_ptr((void *)&cache->ncp1, ncp); 934 if (ncp == NULL) 935 goto done; 936 } 937 if (cache->ncp2 == NULL) { 938 ncp = atomic_swap_ptr((void *)&cache->ncp2, ncp); 939 if (ncp == NULL) 940 goto done; 941 } 942 if (++cache->iter & 1) 943 ncp = atomic_swap_ptr((void *)&cache->ncp2, ncp); 944 else 945 ncp = atomic_swap_ptr((void *)&cache->ncp1, ncp); 946 if (ncp) 947 _cache_drop(ncp); 948 done: 949 nch->ncp = NULL; 950 nch->mount = NULL; 951 } 952 953 /* 954 * We are dropping what the caller believes is the current directory, 955 * unconditionally store it in our pcpu cache. Anything already in 956 * the cache will be discarded. 957 */ 958 void 959 cache_drop_ncdir(struct nchandle *nch) 960 { 961 struct mntcache *cache; 962 963 cache = &pcpu_mntcache[mycpu->gd_cpuid]; 964 nch->ncp = atomic_swap_ptr((void *)&cache->ncdir.ncp, nch->ncp); 965 nch->mount = atomic_swap_ptr((void *)&cache->ncdir.mount, nch->mount); 966 967 if (nch->ncp) 968 _cache_drop(nch->ncp); 969 if (nch->mount) 970 _cache_mntrel(nch->mount); 971 nch->ncp = NULL; 972 nch->mount = NULL; 973 } 974 975 int 976 cache_lockstatus(struct nchandle *nch) 977 { 978 return(_cache_lockstatus(nch->ncp)); 979 } 980 981 void 982 cache_lock(struct nchandle *nch) 983 { 984 _cache_lock(nch->ncp); 985 } 986 987 void 988 cache_lock_maybe_shared(struct nchandle *nch, int excl) 989 { 990 struct namecache *ncp = nch->ncp; 991 992 if (ncp_shared_lock_disable || excl || 993 (ncp->nc_flag & NCF_UNRESOLVED)) { 994 _cache_lock(ncp); 995 } else { 996 _cache_lock_shared(ncp); 997 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 998 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 999 _cache_unlock(ncp); 1000 _cache_lock(ncp); 1001 } 1002 } else { 1003 _cache_unlock(ncp); 1004 _cache_lock(ncp); 1005 } 1006 } 1007 } 1008 1009 /* 1010 * Relock nch1 given an unlocked nch1 and a locked nch2. The caller 1011 * is responsible for checking both for validity on return as they 1012 * may have become invalid. 1013 * 1014 * We have to deal with potential deadlocks here, just ping pong 1015 * the lock until we get it (we will always block somewhere when 1016 * looping so this is not cpu-intensive). 1017 * 1018 * which = 0 nch1 not locked, nch2 is locked 1019 * which = 1 nch1 is locked, nch2 is not locked 1020 */ 1021 void 1022 cache_relock(struct nchandle *nch1, struct ucred *cred1, 1023 struct nchandle *nch2, struct ucred *cred2) 1024 { 1025 int which; 1026 1027 which = 0; 1028 1029 for (;;) { 1030 if (which == 0) { 1031 if (cache_lock_nonblock(nch1) == 0) { 1032 cache_resolve(nch1, cred1); 1033 break; 1034 } 1035 cache_unlock(nch2); 1036 cache_lock(nch1); 1037 cache_resolve(nch1, cred1); 1038 which = 1; 1039 } else { 1040 if (cache_lock_nonblock(nch2) == 0) { 1041 cache_resolve(nch2, cred2); 1042 break; 1043 } 1044 cache_unlock(nch1); 1045 cache_lock(nch2); 1046 cache_resolve(nch2, cred2); 1047 which = 0; 1048 } 1049 } 1050 } 1051 1052 int 1053 cache_lock_nonblock(struct nchandle *nch) 1054 { 1055 return(_cache_lock_nonblock(nch->ncp)); 1056 } 1057 1058 void 1059 cache_unlock(struct nchandle *nch) 1060 { 1061 _cache_unlock(nch->ncp); 1062 } 1063 1064 /* 1065 * ref-and-lock, unlock-and-deref functions. 1066 * 1067 * This function is primarily used by nlookup. Even though cache_lock 1068 * holds the vnode, it is possible that the vnode may have already 1069 * initiated a recyclement. 1070 * 1071 * We want cache_get() to return a definitively usable vnode or a 1072 * definitively unresolved ncp. 1073 */ 1074 static 1075 struct namecache * 1076 _cache_get(struct namecache *ncp) 1077 { 1078 _cache_hold(ncp); 1079 _cache_lock(ncp); 1080 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 1081 _cache_setunresolved(ncp); 1082 return(ncp); 1083 } 1084 1085 /* 1086 * Attempt to obtain a shared lock on the ncp. A shared lock will only 1087 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is 1088 * valid. Otherwise an exclusive lock will be acquired instead. 1089 */ 1090 static 1091 struct namecache * 1092 _cache_get_maybe_shared(struct namecache *ncp, int excl) 1093 { 1094 if (ncp_shared_lock_disable || excl || 1095 (ncp->nc_flag & NCF_UNRESOLVED)) { 1096 return(_cache_get(ncp)); 1097 } 1098 _cache_hold(ncp); 1099 _cache_lock_shared(ncp); 1100 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1101 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1102 _cache_unlock(ncp); 1103 ncp = _cache_get(ncp); 1104 _cache_drop(ncp); 1105 } 1106 } else { 1107 _cache_unlock(ncp); 1108 ncp = _cache_get(ncp); 1109 _cache_drop(ncp); 1110 } 1111 return(ncp); 1112 } 1113 1114 /* 1115 * NOTE: The same nchandle can be passed for both arguments. 1116 */ 1117 void 1118 cache_get(struct nchandle *nch, struct nchandle *target) 1119 { 1120 KKASSERT(nch->ncp->nc_refs > 0); 1121 target->mount = nch->mount; 1122 target->ncp = _cache_get(nch->ncp); 1123 _cache_mntref(target->mount); 1124 } 1125 1126 void 1127 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl) 1128 { 1129 KKASSERT(nch->ncp->nc_refs > 0); 1130 target->mount = nch->mount; 1131 target->ncp = _cache_get_maybe_shared(nch->ncp, excl); 1132 _cache_mntref(target->mount); 1133 } 1134 1135 /* 1136 * Release a held and locked ncp 1137 */ 1138 static __inline 1139 void 1140 _cache_put(struct namecache *ncp) 1141 { 1142 _cache_unlock(ncp); 1143 _cache_drop(ncp); 1144 } 1145 1146 void 1147 cache_put(struct nchandle *nch) 1148 { 1149 _cache_mntrel(nch->mount); 1150 _cache_put(nch->ncp); 1151 nch->ncp = NULL; 1152 nch->mount = NULL; 1153 } 1154 1155 /* 1156 * Resolve an unresolved ncp by associating a vnode with it. If the 1157 * vnode is NULL, a negative cache entry is created. 1158 * 1159 * The ncp should be locked on entry and will remain locked on return. 1160 */ 1161 static 1162 void 1163 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp) 1164 { 1165 KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) && 1166 (_cache_lockstatus(ncp) == LK_EXCLUSIVE) && 1167 ncp->nc_vp == NULL); 1168 1169 if (vp) { 1170 /* 1171 * Any vp associated with an ncp which has children must 1172 * be held. Any vp associated with a locked ncp must be held. 1173 */ 1174 if (!TAILQ_EMPTY(&ncp->nc_list)) 1175 vhold(vp); 1176 spin_lock(&vp->v_spin); 1177 ncp->nc_vp = vp; 1178 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode); 1179 ++vp->v_namecache_count; 1180 _cache_hold(ncp); /* v_namecache assoc */ 1181 spin_unlock(&vp->v_spin); 1182 vhold(vp); /* nc_vp */ 1183 1184 /* 1185 * Set auxiliary flags 1186 */ 1187 switch(vp->v_type) { 1188 case VDIR: 1189 ncp->nc_flag |= NCF_ISDIR; 1190 break; 1191 case VLNK: 1192 ncp->nc_flag |= NCF_ISSYMLINK; 1193 /* XXX cache the contents of the symlink */ 1194 break; 1195 default: 1196 break; 1197 } 1198 1199 ncp->nc_error = 0; 1200 1201 /* 1202 * XXX: this is a hack to work-around the lack of a real pfs vfs 1203 * implementation 1204 */ 1205 if (mp) { 1206 if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0) 1207 vp->v_pfsmp = mp; 1208 } 1209 } else { 1210 /* 1211 * When creating a negative cache hit we set the 1212 * namecache_gen. A later resolve will clean out the 1213 * negative cache hit if the mount point's namecache_gen 1214 * has changed. Used by devfs, could also be used by 1215 * other remote FSs. 1216 */ 1217 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 1218 1219 ncp->nc_vp = NULL; 1220 ncp->nc_negcpu = mycpu->gd_cpuid; 1221 spin_lock(&pn->neg_spin); 1222 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode); 1223 _cache_hold(ncp); /* neg_list assoc */ 1224 ++pn->neg_count; 1225 spin_unlock(&pn->neg_spin); 1226 atomic_add_long(&pn->vfscache_negs, 1); 1227 1228 ncp->nc_error = ENOENT; 1229 if (mp) 1230 VFS_NCPGEN_SET(mp, ncp); 1231 } 1232 ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP); 1233 } 1234 1235 void 1236 cache_setvp(struct nchandle *nch, struct vnode *vp) 1237 { 1238 _cache_setvp(nch->mount, nch->ncp, vp); 1239 } 1240 1241 /* 1242 * Used for NFS 1243 */ 1244 void 1245 cache_settimeout(struct nchandle *nch, int nticks) 1246 { 1247 struct namecache *ncp = nch->ncp; 1248 1249 if ((ncp->nc_timeout = ticks + nticks) == 0) 1250 ncp->nc_timeout = 1; 1251 } 1252 1253 /* 1254 * Disassociate the vnode or negative-cache association and mark a 1255 * namecache entry as unresolved again. Note that the ncp is still 1256 * left in the hash table and still linked to its parent. 1257 * 1258 * The ncp should be locked and refd on entry and will remain locked and refd 1259 * on return. 1260 * 1261 * This routine is normally never called on a directory containing children. 1262 * However, NFS often does just that in its rename() code as a cop-out to 1263 * avoid complex namespace operations. This disconnects a directory vnode 1264 * from its namecache and can cause the OLDAPI and NEWAPI to get out of 1265 * sync. 1266 * 1267 */ 1268 static 1269 void 1270 _cache_setunresolved(struct namecache *ncp) 1271 { 1272 struct vnode *vp; 1273 1274 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1275 ncp->nc_flag |= NCF_UNRESOLVED; 1276 ncp->nc_timeout = 0; 1277 ncp->nc_error = ENOTCONN; 1278 if ((vp = ncp->nc_vp) != NULL) { 1279 spin_lock(&vp->v_spin); 1280 ncp->nc_vp = NULL; 1281 TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode); 1282 --vp->v_namecache_count; 1283 spin_unlock(&vp->v_spin); 1284 1285 /* 1286 * Any vp associated with an ncp with children is 1287 * held by that ncp. Any vp associated with ncp 1288 * is held by that ncp. These conditions must be 1289 * undone when the vp is cleared out from the ncp. 1290 */ 1291 if (!TAILQ_EMPTY(&ncp->nc_list)) 1292 vdrop(vp); 1293 vdrop(vp); 1294 } else { 1295 struct pcpu_ncache *pn; 1296 1297 pn = &pcpu_ncache[ncp->nc_negcpu]; 1298 1299 atomic_add_long(&pn->vfscache_negs, -1); 1300 spin_lock(&pn->neg_spin); 1301 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode); 1302 --pn->neg_count; 1303 spin_unlock(&pn->neg_spin); 1304 } 1305 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK); 1306 _cache_drop(ncp); /* from v_namecache or neg_list */ 1307 } 1308 } 1309 1310 /* 1311 * The cache_nresolve() code calls this function to automatically 1312 * set a resolved cache element to unresolved if it has timed out 1313 * or if it is a negative cache hit and the mount point namecache_gen 1314 * has changed. 1315 */ 1316 static __inline int 1317 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp) 1318 { 1319 /* 1320 * Try to zap entries that have timed out. We have 1321 * to be careful here because locked leafs may depend 1322 * on the vnode remaining intact in a parent, so only 1323 * do this under very specific conditions. 1324 */ 1325 if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 && 1326 TAILQ_EMPTY(&ncp->nc_list)) { 1327 return 1; 1328 } 1329 1330 /* 1331 * If a resolved negative cache hit is invalid due to 1332 * the mount's namecache generation being bumped, zap it. 1333 */ 1334 if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) { 1335 return 1; 1336 } 1337 1338 /* 1339 * Otherwise we are good 1340 */ 1341 return 0; 1342 } 1343 1344 static __inline void 1345 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp) 1346 { 1347 /* 1348 * Already in an unresolved state, nothing to do. 1349 */ 1350 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1351 if (_cache_auto_unresolve_test(mp, ncp)) 1352 _cache_setunresolved(ncp); 1353 } 1354 } 1355 1356 void 1357 cache_setunresolved(struct nchandle *nch) 1358 { 1359 _cache_setunresolved(nch->ncp); 1360 } 1361 1362 /* 1363 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist 1364 * looking for matches. This flag tells the lookup code when it must 1365 * check for a mount linkage and also prevents the directories in question 1366 * from being deleted or renamed. 1367 */ 1368 static 1369 int 1370 cache_clrmountpt_callback(struct mount *mp, void *data) 1371 { 1372 struct nchandle *nch = data; 1373 1374 if (mp->mnt_ncmounton.ncp == nch->ncp) 1375 return(1); 1376 if (mp->mnt_ncmountpt.ncp == nch->ncp) 1377 return(1); 1378 return(0); 1379 } 1380 1381 /* 1382 * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated 1383 * with a mount point. 1384 */ 1385 void 1386 cache_clrmountpt(struct nchandle *nch) 1387 { 1388 int count; 1389 1390 count = mountlist_scan(cache_clrmountpt_callback, nch, 1391 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | 1392 MNTSCAN_NOUNLOCK); 1393 if (count == 0) 1394 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT; 1395 } 1396 1397 /* 1398 * Invalidate portions of the namecache topology given a starting entry. 1399 * The passed ncp is set to an unresolved state and: 1400 * 1401 * The passed ncp must be referenced and locked. The routine may unlock 1402 * and relock ncp several times, and will recheck the children and loop 1403 * to catch races. When done the passed ncp will be returned with the 1404 * reference and lock intact. 1405 * 1406 * CINV_DESTROY - Set a flag in the passed ncp entry indicating 1407 * that the physical underlying nodes have been 1408 * destroyed... as in deleted. For example, when 1409 * a directory is removed. This will cause record 1410 * lookups on the name to no longer be able to find 1411 * the record and tells the resolver to return failure 1412 * rather then trying to resolve through the parent. 1413 * 1414 * The topology itself, including ncp->nc_name, 1415 * remains intact. 1416 * 1417 * This only applies to the passed ncp, if CINV_CHILDREN 1418 * is specified the children are not flagged. 1419 * 1420 * CINV_CHILDREN - Set all children (recursively) to an unresolved 1421 * state as well. 1422 * 1423 * Note that this will also have the side effect of 1424 * cleaning out any unreferenced nodes in the topology 1425 * from the leaves up as the recursion backs out. 1426 * 1427 * Note that the topology for any referenced nodes remains intact, but 1428 * the nodes will be marked as having been destroyed and will be set 1429 * to an unresolved state. 1430 * 1431 * It is possible for cache_inval() to race a cache_resolve(), meaning that 1432 * the namecache entry may not actually be invalidated on return if it was 1433 * revalidated while recursing down into its children. This code guarentees 1434 * that the node(s) will go through an invalidation cycle, but does not 1435 * guarentee that they will remain in an invalidated state. 1436 * 1437 * Returns non-zero if a revalidation was detected during the invalidation 1438 * recursion, zero otherwise. Note that since only the original ncp is 1439 * locked the revalidation ultimately can only indicate that the original ncp 1440 * *MIGHT* no have been reresolved. 1441 * 1442 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we 1443 * have to avoid blowing out the kernel stack. We do this by saving the 1444 * deep namecache node and aborting the recursion, then re-recursing at that 1445 * node using a depth-first algorithm in order to allow multiple deep 1446 * recursions to chain through each other, then we restart the invalidation 1447 * from scratch. 1448 */ 1449 1450 struct cinvtrack { 1451 struct namecache *resume_ncp; 1452 int depth; 1453 }; 1454 1455 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *); 1456 1457 static 1458 int 1459 _cache_inval(struct namecache *ncp, int flags) 1460 { 1461 struct cinvtrack track; 1462 struct namecache *ncp2; 1463 int r; 1464 1465 track.depth = 0; 1466 track.resume_ncp = NULL; 1467 1468 for (;;) { 1469 r = _cache_inval_internal(ncp, flags, &track); 1470 if (track.resume_ncp == NULL) 1471 break; 1472 _cache_unlock(ncp); 1473 while ((ncp2 = track.resume_ncp) != NULL) { 1474 track.resume_ncp = NULL; 1475 _cache_lock(ncp2); 1476 _cache_inval_internal(ncp2, flags & ~CINV_DESTROY, 1477 &track); 1478 /*_cache_put(ncp2);*/ 1479 cache_zap(ncp2); 1480 } 1481 _cache_lock(ncp); 1482 } 1483 return(r); 1484 } 1485 1486 int 1487 cache_inval(struct nchandle *nch, int flags) 1488 { 1489 return(_cache_inval(nch->ncp, flags)); 1490 } 1491 1492 /* 1493 * Helper for _cache_inval(). The passed ncp is refd and locked and 1494 * remains that way on return, but may be unlocked/relocked multiple 1495 * times by the routine. 1496 */ 1497 static int 1498 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track) 1499 { 1500 struct namecache *nextkid; 1501 int rcnt = 0; 1502 1503 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 1504 1505 _cache_setunresolved(ncp); 1506 if (flags & CINV_DESTROY) { 1507 ncp->nc_flag |= NCF_DESTROYED; 1508 ++ncp->nc_generation; 1509 } 1510 1511 while ((flags & CINV_CHILDREN) && 1512 (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL 1513 ) { 1514 struct namecache *kid; 1515 int restart; 1516 1517 restart = 0; 1518 _cache_hold(nextkid); 1519 if (++track->depth > MAX_RECURSION_DEPTH) { 1520 track->resume_ncp = ncp; 1521 _cache_hold(ncp); 1522 ++rcnt; 1523 } 1524 while ((kid = nextkid) != NULL) { 1525 /* 1526 * Parent (ncp) must be locked for the iteration. 1527 */ 1528 nextkid = NULL; 1529 if (kid->nc_parent != ncp) { 1530 _cache_drop(kid); 1531 kprintf("cache_inval_internal restartA %s\n", 1532 ncp->nc_name); 1533 restart = 1; 1534 break; 1535 } 1536 if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL) 1537 _cache_hold(nextkid); 1538 1539 /* 1540 * Parent unlocked for this section to avoid 1541 * deadlocks. Then lock the kid and check for 1542 * races. 1543 */ 1544 _cache_unlock(ncp); 1545 if (track->resume_ncp) { 1546 _cache_drop(kid); 1547 _cache_lock(ncp); 1548 break; 1549 } 1550 _cache_lock(kid); 1551 if (kid->nc_parent != ncp) { 1552 kprintf("cache_inval_internal " 1553 "restartB %s\n", 1554 ncp->nc_name); 1555 restart = 1; 1556 _cache_unlock(kid); 1557 _cache_drop(kid); 1558 _cache_lock(ncp); 1559 break; 1560 } 1561 if ((kid->nc_flag & NCF_UNRESOLVED) == 0 || 1562 TAILQ_FIRST(&kid->nc_list) 1563 ) { 1564 1565 rcnt += _cache_inval_internal(kid, 1566 flags & ~CINV_DESTROY, track); 1567 /*_cache_unlock(kid);*/ 1568 /*_cache_drop(kid);*/ 1569 cache_zap(kid); 1570 } else { 1571 cache_zap(kid); 1572 } 1573 1574 /* 1575 * Relock parent to continue scan 1576 */ 1577 _cache_lock(ncp); 1578 } 1579 if (nextkid) 1580 _cache_drop(nextkid); 1581 --track->depth; 1582 if (restart == 0) 1583 break; 1584 } 1585 1586 /* 1587 * Someone could have gotten in there while ncp was unlocked, 1588 * retry if so. 1589 */ 1590 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 1591 ++rcnt; 1592 return (rcnt); 1593 } 1594 1595 /* 1596 * Invalidate a vnode's namecache associations. To avoid races against 1597 * the resolver we do not invalidate a node which we previously invalidated 1598 * but which was then re-resolved while we were in the invalidation loop. 1599 * 1600 * Returns non-zero if any namecache entries remain after the invalidation 1601 * loop completed. 1602 * 1603 * NOTE: Unlike the namecache topology which guarentees that ncp's will not 1604 * be ripped out of the topology while held, the vnode's v_namecache 1605 * list has no such restriction. NCP's can be ripped out of the list 1606 * at virtually any time if not locked, even if held. 1607 * 1608 * In addition, the v_namecache list itself must be locked via 1609 * the vnode's spinlock. 1610 */ 1611 int 1612 cache_inval_vp(struct vnode *vp, int flags) 1613 { 1614 struct namecache *ncp; 1615 struct namecache *next; 1616 1617 restart: 1618 spin_lock(&vp->v_spin); 1619 ncp = TAILQ_FIRST(&vp->v_namecache); 1620 if (ncp) 1621 _cache_hold(ncp); 1622 while (ncp) { 1623 /* loop entered with ncp held and vp spin-locked */ 1624 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1625 _cache_hold(next); 1626 spin_unlock(&vp->v_spin); 1627 _cache_lock(ncp); 1628 if (ncp->nc_vp != vp) { 1629 kprintf("Warning: cache_inval_vp: race-A detected on " 1630 "%s\n", ncp->nc_name); 1631 _cache_put(ncp); 1632 if (next) 1633 _cache_drop(next); 1634 goto restart; 1635 } 1636 _cache_inval(ncp, flags); 1637 _cache_put(ncp); /* also releases reference */ 1638 ncp = next; 1639 spin_lock(&vp->v_spin); 1640 if (ncp && ncp->nc_vp != vp) { 1641 spin_unlock(&vp->v_spin); 1642 kprintf("Warning: cache_inval_vp: race-B detected on " 1643 "%s\n", ncp->nc_name); 1644 _cache_drop(ncp); 1645 goto restart; 1646 } 1647 } 1648 spin_unlock(&vp->v_spin); 1649 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1650 } 1651 1652 /* 1653 * This routine is used instead of the normal cache_inval_vp() when we 1654 * are trying to recycle otherwise good vnodes. 1655 * 1656 * Return 0 on success, non-zero if not all namecache records could be 1657 * disassociated from the vnode (for various reasons). 1658 */ 1659 int 1660 cache_inval_vp_nonblock(struct vnode *vp) 1661 { 1662 struct namecache *ncp; 1663 struct namecache *next; 1664 1665 spin_lock(&vp->v_spin); 1666 ncp = TAILQ_FIRST(&vp->v_namecache); 1667 if (ncp) 1668 _cache_hold(ncp); 1669 while (ncp) { 1670 /* loop entered with ncp held */ 1671 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1672 _cache_hold(next); 1673 spin_unlock(&vp->v_spin); 1674 if (_cache_lock_nonblock(ncp)) { 1675 _cache_drop(ncp); 1676 if (next) 1677 _cache_drop(next); 1678 goto done; 1679 } 1680 if (ncp->nc_vp != vp) { 1681 kprintf("Warning: cache_inval_vp: race-A detected on " 1682 "%s\n", ncp->nc_name); 1683 _cache_put(ncp); 1684 if (next) 1685 _cache_drop(next); 1686 goto done; 1687 } 1688 _cache_inval(ncp, 0); 1689 _cache_put(ncp); /* also releases reference */ 1690 ncp = next; 1691 spin_lock(&vp->v_spin); 1692 if (ncp && ncp->nc_vp != vp) { 1693 spin_unlock(&vp->v_spin); 1694 kprintf("Warning: cache_inval_vp: race-B detected on " 1695 "%s\n", ncp->nc_name); 1696 _cache_drop(ncp); 1697 goto done; 1698 } 1699 } 1700 spin_unlock(&vp->v_spin); 1701 done: 1702 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1703 } 1704 1705 /* 1706 * Clears the universal directory search 'ok' flag. This flag allows 1707 * nlookup() to bypass normal vnode checks. This flag is a cached flag 1708 * so clearing it simply forces revalidation. 1709 */ 1710 void 1711 cache_inval_wxok(struct vnode *vp) 1712 { 1713 struct namecache *ncp; 1714 1715 spin_lock(&vp->v_spin); 1716 TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) { 1717 if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX)) 1718 atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX); 1719 } 1720 spin_unlock(&vp->v_spin); 1721 } 1722 1723 /* 1724 * The source ncp has been renamed to the target ncp. Both fncp and tncp 1725 * must be locked. The target ncp is destroyed (as a normal rename-over 1726 * would destroy the target file or directory). 1727 * 1728 * Because there may be references to the source ncp we cannot copy its 1729 * contents to the target. Instead the source ncp is relinked as the target 1730 * and the target ncp is removed from the namecache topology. 1731 */ 1732 void 1733 cache_rename(struct nchandle *fnch, struct nchandle *tnch) 1734 { 1735 struct namecache *fncp = fnch->ncp; 1736 struct namecache *tncp = tnch->ncp; 1737 struct namecache *tncp_par; 1738 struct nchash_head *nchpp; 1739 u_int32_t hash; 1740 char *oname; 1741 char *nname; 1742 1743 ++fncp->nc_generation; 1744 ++tncp->nc_generation; 1745 if (tncp->nc_nlen) { 1746 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK); 1747 bcopy(tncp->nc_name, nname, tncp->nc_nlen); 1748 nname[tncp->nc_nlen] = 0; 1749 } else { 1750 nname = NULL; 1751 } 1752 1753 /* 1754 * Rename fncp (unlink) 1755 */ 1756 _cache_unlink_parent(fncp); 1757 oname = fncp->nc_name; 1758 fncp->nc_name = nname; 1759 fncp->nc_nlen = tncp->nc_nlen; 1760 if (oname) 1761 kfree(oname, M_VFSCACHE); 1762 1763 tncp_par = tncp->nc_parent; 1764 _cache_hold(tncp_par); 1765 _cache_lock(tncp_par); 1766 1767 /* 1768 * Rename fncp (relink) 1769 */ 1770 hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT); 1771 hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash); 1772 nchpp = NCHHASH(hash); 1773 1774 spin_lock(&nchpp->spin); 1775 _cache_link_parent(fncp, tncp_par, nchpp); 1776 spin_unlock(&nchpp->spin); 1777 1778 _cache_put(tncp_par); 1779 1780 /* 1781 * Get rid of the overwritten tncp (unlink) 1782 */ 1783 _cache_unlink(tncp); 1784 } 1785 1786 /* 1787 * Perform actions consistent with unlinking a file. The passed-in ncp 1788 * must be locked. 1789 * 1790 * The ncp is marked DESTROYED so it no longer shows up in searches, 1791 * and will be physically deleted when the vnode goes away. 1792 * 1793 * If the related vnode has no refs then we cycle it through vget()/vput() 1794 * to (possibly if we don't have a ref race) trigger a deactivation, 1795 * allowing the VFS to trivially detect and recycle the deleted vnode 1796 * via VOP_INACTIVE(). 1797 * 1798 * NOTE: _cache_rename() will automatically call _cache_unlink() on the 1799 * target ncp. 1800 */ 1801 void 1802 cache_unlink(struct nchandle *nch) 1803 { 1804 _cache_unlink(nch->ncp); 1805 } 1806 1807 static void 1808 _cache_unlink(struct namecache *ncp) 1809 { 1810 struct vnode *vp; 1811 1812 /* 1813 * Causes lookups to fail and allows another ncp with the same 1814 * name to be created under ncp->nc_parent. 1815 */ 1816 ncp->nc_flag |= NCF_DESTROYED; 1817 ++ncp->nc_generation; 1818 1819 /* 1820 * Attempt to trigger a deactivation. Set VREF_FINALIZE to 1821 * force action on the 1->0 transition. 1822 */ 1823 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 1824 (vp = ncp->nc_vp) != NULL) { 1825 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE); 1826 if (VREFCNT(vp) <= 0) { 1827 if (vget(vp, LK_SHARED) == 0) 1828 vput(vp); 1829 } 1830 } 1831 } 1832 1833 /* 1834 * Return non-zero if the nch might be associated with an open and/or mmap()'d 1835 * file. The easy solution is to just return non-zero if the vnode has refs. 1836 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to 1837 * force the reclaim). 1838 */ 1839 int 1840 cache_isopen(struct nchandle *nch) 1841 { 1842 struct vnode *vp; 1843 struct namecache *ncp = nch->ncp; 1844 1845 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 1846 (vp = ncp->nc_vp) != NULL && 1847 VREFCNT(vp)) { 1848 return 1; 1849 } 1850 return 0; 1851 } 1852 1853 1854 /* 1855 * vget the vnode associated with the namecache entry. Resolve the namecache 1856 * entry if necessary. The passed ncp must be referenced and locked. If 1857 * the ncp is resolved it might be locked shared. 1858 * 1859 * lk_type may be LK_SHARED, LK_EXCLUSIVE. A ref'd, possibly locked 1860 * (depending on the passed lk_type) will be returned in *vpp with an error 1861 * of 0, or NULL will be returned in *vpp with a non-0 error code. The 1862 * most typical error is ENOENT, meaning that the ncp represents a negative 1863 * cache hit and there is no vnode to retrieve, but other errors can occur 1864 * too. 1865 * 1866 * The vget() can race a reclaim. If this occurs we re-resolve the 1867 * namecache entry. 1868 * 1869 * There are numerous places in the kernel where vget() is called on a 1870 * vnode while one or more of its namecache entries is locked. Releasing 1871 * a vnode never deadlocks against locked namecache entries (the vnode 1872 * will not get recycled while referenced ncp's exist). This means we 1873 * can safely acquire the vnode. In fact, we MUST NOT release the ncp 1874 * lock when acquiring the vp lock or we might cause a deadlock. 1875 * 1876 * NOTE: The passed-in ncp must be locked exclusively if it is initially 1877 * unresolved. If a reclaim race occurs the passed-in ncp will be 1878 * relocked exclusively before being re-resolved. 1879 */ 1880 int 1881 cache_vget(struct nchandle *nch, struct ucred *cred, 1882 int lk_type, struct vnode **vpp) 1883 { 1884 struct namecache *ncp; 1885 struct vnode *vp; 1886 int error; 1887 1888 ncp = nch->ncp; 1889 again: 1890 vp = NULL; 1891 if (ncp->nc_flag & NCF_UNRESOLVED) 1892 error = cache_resolve(nch, cred); 1893 else 1894 error = 0; 1895 1896 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 1897 error = vget(vp, lk_type); 1898 if (error) { 1899 /* 1900 * VRECLAIM race 1901 * 1902 * The ncp may have been locked shared, we must relock 1903 * it exclusively before we can set it to unresolved. 1904 */ 1905 if (error == ENOENT) { 1906 kprintf("Warning: vnode reclaim race detected " 1907 "in cache_vget on %p (%s)\n", 1908 vp, ncp->nc_name); 1909 _cache_unlock(ncp); 1910 _cache_lock(ncp); 1911 _cache_setunresolved(ncp); 1912 goto again; 1913 } 1914 1915 /* 1916 * Not a reclaim race, some other error. 1917 */ 1918 KKASSERT(ncp->nc_vp == vp); 1919 vp = NULL; 1920 } else { 1921 KKASSERT(ncp->nc_vp == vp); 1922 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 1923 } 1924 } 1925 if (error == 0 && vp == NULL) 1926 error = ENOENT; 1927 *vpp = vp; 1928 return(error); 1929 } 1930 1931 /* 1932 * Similar to cache_vget() but only acquires a ref on the vnode. 1933 * 1934 * NOTE: The passed-in ncp must be locked exclusively if it is initially 1935 * unresolved. If a reclaim race occurs the passed-in ncp will be 1936 * relocked exclusively before being re-resolved. 1937 */ 1938 int 1939 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp) 1940 { 1941 struct namecache *ncp; 1942 struct vnode *vp; 1943 int error; 1944 1945 ncp = nch->ncp; 1946 again: 1947 vp = NULL; 1948 if (ncp->nc_flag & NCF_UNRESOLVED) 1949 error = cache_resolve(nch, cred); 1950 else 1951 error = 0; 1952 1953 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 1954 error = vget(vp, LK_SHARED); 1955 if (error) { 1956 /* 1957 * VRECLAIM race 1958 */ 1959 if (error == ENOENT) { 1960 kprintf("Warning: vnode reclaim race detected " 1961 "in cache_vget on %p (%s)\n", 1962 vp, ncp->nc_name); 1963 _cache_unlock(ncp); 1964 _cache_lock(ncp); 1965 _cache_setunresolved(ncp); 1966 goto again; 1967 } 1968 1969 /* 1970 * Not a reclaim race, some other error. 1971 */ 1972 KKASSERT(ncp->nc_vp == vp); 1973 vp = NULL; 1974 } else { 1975 KKASSERT(ncp->nc_vp == vp); 1976 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 1977 /* caller does not want a lock */ 1978 vn_unlock(vp); 1979 } 1980 } 1981 if (error == 0 && vp == NULL) 1982 error = ENOENT; 1983 *vpp = vp; 1984 return(error); 1985 } 1986 1987 /* 1988 * Return a referenced vnode representing the parent directory of 1989 * ncp. 1990 * 1991 * Because the caller has locked the ncp it should not be possible for 1992 * the parent ncp to go away. However, the parent can unresolve its 1993 * dvp at any time so we must be able to acquire a lock on the parent 1994 * to safely access nc_vp. 1995 * 1996 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock, 1997 * so use vhold()/vdrop() while holding the lock to prevent dvp from 1998 * getting destroyed. 1999 * 2000 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a 2001 * lock on the ncp in question.. 2002 */ 2003 static struct vnode * 2004 cache_dvpref(struct namecache *ncp) 2005 { 2006 struct namecache *par; 2007 struct vnode *dvp; 2008 2009 dvp = NULL; 2010 if ((par = ncp->nc_parent) != NULL) { 2011 _cache_hold(par); 2012 _cache_lock(par); 2013 if ((par->nc_flag & NCF_UNRESOLVED) == 0) { 2014 if ((dvp = par->nc_vp) != NULL) 2015 vhold(dvp); 2016 } 2017 _cache_unlock(par); 2018 if (dvp) { 2019 if (vget(dvp, LK_SHARED) == 0) { 2020 vn_unlock(dvp); 2021 vdrop(dvp); 2022 /* return refd, unlocked dvp */ 2023 } else { 2024 vdrop(dvp); 2025 dvp = NULL; 2026 } 2027 } 2028 _cache_drop(par); 2029 } 2030 return(dvp); 2031 } 2032 2033 /* 2034 * Convert a directory vnode to a namecache record without any other 2035 * knowledge of the topology. This ONLY works with directory vnodes and 2036 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the 2037 * returned ncp (if not NULL) will be held and unlocked. 2038 * 2039 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned. 2040 * If 'makeit' is 1 we attempt to track-down and create the namecache topology 2041 * for dvp. This will fail only if the directory has been deleted out from 2042 * under the caller. 2043 * 2044 * Callers must always check for a NULL return no matter the value of 'makeit'. 2045 * 2046 * To avoid underflowing the kernel stack each recursive call increments 2047 * the makeit variable. 2048 */ 2049 2050 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2051 struct vnode *dvp, char *fakename); 2052 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2053 struct vnode **saved_dvp); 2054 2055 int 2056 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit, 2057 struct nchandle *nch) 2058 { 2059 struct vnode *saved_dvp; 2060 struct vnode *pvp; 2061 char *fakename; 2062 int error; 2063 2064 nch->ncp = NULL; 2065 nch->mount = dvp->v_mount; 2066 saved_dvp = NULL; 2067 fakename = NULL; 2068 2069 /* 2070 * Handle the makeit == 0 degenerate case 2071 */ 2072 if (makeit == 0) { 2073 spin_lock_shared(&dvp->v_spin); 2074 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2075 if (nch->ncp) 2076 cache_hold(nch); 2077 spin_unlock_shared(&dvp->v_spin); 2078 } 2079 2080 /* 2081 * Loop until resolution, inside code will break out on error. 2082 */ 2083 while (makeit) { 2084 /* 2085 * Break out if we successfully acquire a working ncp. 2086 */ 2087 spin_lock_shared(&dvp->v_spin); 2088 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2089 if (nch->ncp) { 2090 cache_hold(nch); 2091 spin_unlock_shared(&dvp->v_spin); 2092 break; 2093 } 2094 spin_unlock_shared(&dvp->v_spin); 2095 2096 /* 2097 * If dvp is the root of its filesystem it should already 2098 * have a namecache pointer associated with it as a side 2099 * effect of the mount, but it may have been disassociated. 2100 */ 2101 if (dvp->v_flag & VROOT) { 2102 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp); 2103 error = cache_resolve_mp(nch->mount); 2104 _cache_put(nch->ncp); 2105 if (ncvp_debug) { 2106 kprintf("cache_fromdvp: resolve root of mount %p error %d", 2107 dvp->v_mount, error); 2108 } 2109 if (error) { 2110 if (ncvp_debug) 2111 kprintf(" failed\n"); 2112 nch->ncp = NULL; 2113 break; 2114 } 2115 if (ncvp_debug) 2116 kprintf(" succeeded\n"); 2117 continue; 2118 } 2119 2120 /* 2121 * If we are recursed too deeply resort to an O(n^2) 2122 * algorithm to resolve the namecache topology. The 2123 * resolved pvp is left referenced in saved_dvp to 2124 * prevent the tree from being destroyed while we loop. 2125 */ 2126 if (makeit > 20) { 2127 error = cache_fromdvp_try(dvp, cred, &saved_dvp); 2128 if (error) { 2129 kprintf("lookupdotdot(longpath) failed %d " 2130 "dvp %p\n", error, dvp); 2131 nch->ncp = NULL; 2132 break; 2133 } 2134 continue; 2135 } 2136 2137 /* 2138 * Get the parent directory and resolve its ncp. 2139 */ 2140 if (fakename) { 2141 kfree(fakename, M_TEMP); 2142 fakename = NULL; 2143 } 2144 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2145 &fakename); 2146 if (error) { 2147 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp); 2148 break; 2149 } 2150 vn_unlock(pvp); 2151 2152 /* 2153 * Reuse makeit as a recursion depth counter. On success 2154 * nch will be fully referenced. 2155 */ 2156 cache_fromdvp(pvp, cred, makeit + 1, nch); 2157 vrele(pvp); 2158 if (nch->ncp == NULL) 2159 break; 2160 2161 /* 2162 * Do an inefficient scan of pvp (embodied by ncp) to look 2163 * for dvp. This will create a namecache record for dvp on 2164 * success. We loop up to recheck on success. 2165 * 2166 * ncp and dvp are both held but not locked. 2167 */ 2168 error = cache_inefficient_scan(nch, cred, dvp, fakename); 2169 if (error) { 2170 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n", 2171 pvp, nch->ncp->nc_name, dvp); 2172 cache_drop(nch); 2173 /* nch was NULLed out, reload mount */ 2174 nch->mount = dvp->v_mount; 2175 break; 2176 } 2177 if (ncvp_debug) { 2178 kprintf("cache_fromdvp: scan %p (%s) succeeded\n", 2179 pvp, nch->ncp->nc_name); 2180 } 2181 cache_drop(nch); 2182 /* nch was NULLed out, reload mount */ 2183 nch->mount = dvp->v_mount; 2184 } 2185 2186 /* 2187 * If nch->ncp is non-NULL it will have been held already. 2188 */ 2189 if (fakename) 2190 kfree(fakename, M_TEMP); 2191 if (saved_dvp) 2192 vrele(saved_dvp); 2193 if (nch->ncp) 2194 return (0); 2195 return (EINVAL); 2196 } 2197 2198 /* 2199 * Go up the chain of parent directories until we find something 2200 * we can resolve into the namecache. This is very inefficient. 2201 */ 2202 static 2203 int 2204 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2205 struct vnode **saved_dvp) 2206 { 2207 struct nchandle nch; 2208 struct vnode *pvp; 2209 int error; 2210 static time_t last_fromdvp_report; 2211 char *fakename; 2212 2213 /* 2214 * Loop getting the parent directory vnode until we get something we 2215 * can resolve in the namecache. 2216 */ 2217 vref(dvp); 2218 nch.mount = dvp->v_mount; 2219 nch.ncp = NULL; 2220 fakename = NULL; 2221 2222 for (;;) { 2223 if (fakename) { 2224 kfree(fakename, M_TEMP); 2225 fakename = NULL; 2226 } 2227 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2228 &fakename); 2229 if (error) { 2230 vrele(dvp); 2231 break; 2232 } 2233 vn_unlock(pvp); 2234 spin_lock_shared(&pvp->v_spin); 2235 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) { 2236 _cache_hold(nch.ncp); 2237 spin_unlock_shared(&pvp->v_spin); 2238 vrele(pvp); 2239 break; 2240 } 2241 spin_unlock_shared(&pvp->v_spin); 2242 if (pvp->v_flag & VROOT) { 2243 nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp); 2244 error = cache_resolve_mp(nch.mount); 2245 _cache_unlock(nch.ncp); 2246 vrele(pvp); 2247 if (error) { 2248 _cache_drop(nch.ncp); 2249 nch.ncp = NULL; 2250 vrele(dvp); 2251 } 2252 break; 2253 } 2254 vrele(dvp); 2255 dvp = pvp; 2256 } 2257 if (error == 0) { 2258 if (last_fromdvp_report != time_uptime) { 2259 last_fromdvp_report = time_uptime; 2260 kprintf("Warning: extremely inefficient path " 2261 "resolution on %s\n", 2262 nch.ncp->nc_name); 2263 } 2264 error = cache_inefficient_scan(&nch, cred, dvp, fakename); 2265 2266 /* 2267 * Hopefully dvp now has a namecache record associated with 2268 * it. Leave it referenced to prevent the kernel from 2269 * recycling the vnode. Otherwise extremely long directory 2270 * paths could result in endless recycling. 2271 */ 2272 if (*saved_dvp) 2273 vrele(*saved_dvp); 2274 *saved_dvp = dvp; 2275 _cache_drop(nch.ncp); 2276 } 2277 if (fakename) 2278 kfree(fakename, M_TEMP); 2279 return (error); 2280 } 2281 2282 /* 2283 * Do an inefficient scan of the directory represented by ncp looking for 2284 * the directory vnode dvp. ncp must be held but not locked on entry and 2285 * will be held on return. dvp must be refd but not locked on entry and 2286 * will remain refd on return. 2287 * 2288 * Why do this at all? Well, due to its stateless nature the NFS server 2289 * converts file handles directly to vnodes without necessarily going through 2290 * the namecache ops that would otherwise create the namecache topology 2291 * leading to the vnode. We could either (1) Change the namecache algorithms 2292 * to allow disconnect namecache records that are re-merged opportunistically, 2293 * or (2) Make the NFS server backtrack and scan to recover a connected 2294 * namecache topology in order to then be able to issue new API lookups. 2295 * 2296 * It turns out that (1) is a huge mess. It takes a nice clean set of 2297 * namecache algorithms and introduces a lot of complication in every subsystem 2298 * that calls into the namecache to deal with the re-merge case, especially 2299 * since we are using the namecache to placehold negative lookups and the 2300 * vnode might not be immediately assigned. (2) is certainly far less 2301 * efficient then (1), but since we are only talking about directories here 2302 * (which are likely to remain cached), the case does not actually run all 2303 * that often and has the supreme advantage of not polluting the namecache 2304 * algorithms. 2305 * 2306 * If a fakename is supplied just construct a namecache entry using the 2307 * fake name. 2308 */ 2309 static int 2310 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2311 struct vnode *dvp, char *fakename) 2312 { 2313 struct nlcomponent nlc; 2314 struct nchandle rncp; 2315 struct dirent *den; 2316 struct vnode *pvp; 2317 struct vattr vat; 2318 struct iovec iov; 2319 struct uio uio; 2320 int blksize; 2321 int eofflag; 2322 int bytes; 2323 char *rbuf; 2324 int error; 2325 2326 vat.va_blocksize = 0; 2327 if ((error = VOP_GETATTR(dvp, &vat)) != 0) 2328 return (error); 2329 cache_lock(nch); 2330 error = cache_vref(nch, cred, &pvp); 2331 cache_unlock(nch); 2332 if (error) 2333 return (error); 2334 if (ncvp_debug) { 2335 kprintf("inefficient_scan of (%p,%s): directory iosize %ld " 2336 "vattr fileid = %lld\n", 2337 nch->ncp, nch->ncp->nc_name, 2338 vat.va_blocksize, 2339 (long long)vat.va_fileid); 2340 } 2341 2342 /* 2343 * Use the supplied fakename if not NULL. Fake names are typically 2344 * not in the actual filesystem hierarchy. This is used by HAMMER 2345 * to glue @@timestamp recursions together. 2346 */ 2347 if (fakename) { 2348 nlc.nlc_nameptr = fakename; 2349 nlc.nlc_namelen = strlen(fakename); 2350 rncp = cache_nlookup(nch, &nlc); 2351 goto done; 2352 } 2353 2354 if ((blksize = vat.va_blocksize) == 0) 2355 blksize = DEV_BSIZE; 2356 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK); 2357 rncp.ncp = NULL; 2358 2359 eofflag = 0; 2360 uio.uio_offset = 0; 2361 again: 2362 iov.iov_base = rbuf; 2363 iov.iov_len = blksize; 2364 uio.uio_iov = &iov; 2365 uio.uio_iovcnt = 1; 2366 uio.uio_resid = blksize; 2367 uio.uio_segflg = UIO_SYSSPACE; 2368 uio.uio_rw = UIO_READ; 2369 uio.uio_td = curthread; 2370 2371 if (ncvp_debug >= 2) 2372 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset); 2373 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL); 2374 if (error == 0) { 2375 den = (struct dirent *)rbuf; 2376 bytes = blksize - uio.uio_resid; 2377 2378 while (bytes > 0) { 2379 if (ncvp_debug >= 2) { 2380 kprintf("cache_inefficient_scan: %*.*s\n", 2381 den->d_namlen, den->d_namlen, 2382 den->d_name); 2383 } 2384 if (den->d_type != DT_WHT && 2385 den->d_ino == vat.va_fileid) { 2386 if (ncvp_debug) { 2387 kprintf("cache_inefficient_scan: " 2388 "MATCHED inode %lld path %s/%*.*s\n", 2389 (long long)vat.va_fileid, 2390 nch->ncp->nc_name, 2391 den->d_namlen, den->d_namlen, 2392 den->d_name); 2393 } 2394 nlc.nlc_nameptr = den->d_name; 2395 nlc.nlc_namelen = den->d_namlen; 2396 rncp = cache_nlookup(nch, &nlc); 2397 KKASSERT(rncp.ncp != NULL); 2398 break; 2399 } 2400 bytes -= _DIRENT_DIRSIZ(den); 2401 den = _DIRENT_NEXT(den); 2402 } 2403 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize) 2404 goto again; 2405 } 2406 kfree(rbuf, M_TEMP); 2407 done: 2408 vrele(pvp); 2409 if (rncp.ncp) { 2410 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) { 2411 _cache_setvp(rncp.mount, rncp.ncp, dvp); 2412 if (ncvp_debug >= 2) { 2413 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n", 2414 nch->ncp->nc_name, rncp.ncp->nc_name, dvp); 2415 } 2416 } else { 2417 if (ncvp_debug >= 2) { 2418 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n", 2419 nch->ncp->nc_name, rncp.ncp->nc_name, dvp, 2420 rncp.ncp->nc_vp); 2421 } 2422 } 2423 if (rncp.ncp->nc_vp == NULL) 2424 error = rncp.ncp->nc_error; 2425 /* 2426 * Release rncp after a successful nlookup. rncp was fully 2427 * referenced. 2428 */ 2429 cache_put(&rncp); 2430 } else { 2431 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n", 2432 dvp, nch->ncp->nc_name); 2433 error = ENOENT; 2434 } 2435 return (error); 2436 } 2437 2438 /* 2439 * This function must be called with the ncp held and locked and will unlock 2440 * and drop it during zapping. 2441 * 2442 * Zap a namecache entry. The ncp is unconditionally set to an unresolved 2443 * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list 2444 * and removes the related reference. If the ncp can be removed, and the 2445 * parent can be zapped non-blocking, this function loops up. 2446 * 2447 * There will be one ref from the caller (which we now own). The only 2448 * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list, 2449 * so possibly 2 refs left. Taking this into account, if there are no 2450 * additional refs and no children, the ncp will be removed from the topology 2451 * and destroyed. 2452 * 2453 * References and/or children may exist if the ncp is in the middle of the 2454 * topology, preventing the ncp from being destroyed. 2455 * 2456 * If nonblock is non-zero and the parent ncp cannot be locked we give up. 2457 * 2458 * This function may return a held (but NOT locked) parent node which the 2459 * caller must drop in a loop. Looping is one way to avoid unbounded recursion 2460 * due to deep namecache trees. 2461 * 2462 * WARNING! For MPSAFE operation this routine must acquire up to three 2463 * spin locks to be able to safely test nc_refs. Lock order is 2464 * very important. 2465 * 2466 * hash spinlock if on hash list 2467 * parent spinlock if child of parent 2468 * (the ncp is unresolved so there is no vnode association) 2469 */ 2470 static void 2471 cache_zap(struct namecache *ncp) 2472 { 2473 struct namecache *par; 2474 struct vnode *dropvp; 2475 struct nchash_head *nchpp; 2476 int refcmp; 2477 int nonblock = 1; /* XXX cleanup */ 2478 2479 again: 2480 /* 2481 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED. 2482 * This gets rid of any vp->v_namecache list or negative list and 2483 * the related ref. 2484 */ 2485 _cache_setunresolved(ncp); 2486 2487 /* 2488 * Try to scrap the entry and possibly tail-recurse on its parent. 2489 * We only scrap unref'd (other then our ref) unresolved entries, 2490 * we do not scrap 'live' entries. 2491 * 2492 * If nc_parent is non NULL we expect 2 references, else just 1. 2493 * If there are more, someone else also holds the ncp and we cannot 2494 * destroy it. 2495 */ 2496 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 2497 KKASSERT(ncp->nc_refs > 0); 2498 2499 /* 2500 * If the ncp is linked to its parent it will also be in the hash 2501 * table. We have to be able to lock the parent and the hash table. 2502 * 2503 * Acquire locks. Note that the parent can't go away while we hold 2504 * a child locked. If nc_parent is present, expect 2 refs instead 2505 * of 1. 2506 */ 2507 nchpp = NULL; 2508 if ((par = ncp->nc_parent) != NULL) { 2509 if (nonblock) { 2510 if (_cache_lock_nonblock(par)) { 2511 /* lock failed */ 2512 ncp->nc_flag |= NCF_DEFEREDZAP; 2513 atomic_add_long( 2514 &pcpu_ncache[mycpu->gd_cpuid].numdefered, 2515 1); 2516 _cache_unlock(ncp); 2517 _cache_drop(ncp); /* caller's ref */ 2518 return; 2519 } 2520 _cache_hold(par); 2521 } else { 2522 _cache_hold(par); 2523 _cache_lock(par); 2524 } 2525 nchpp = ncp->nc_head; 2526 spin_lock(&nchpp->spin); 2527 } 2528 2529 /* 2530 * With the parent and nchpp locked, and the vnode removed 2531 * (no vp->v_namecache), we expect 1 or 2 refs. If there are 2532 * more someone else has a ref and we cannot zap the entry. 2533 * 2534 * one for our hold 2535 * one for our parent link (parent also has one from the linkage) 2536 */ 2537 if (par) 2538 refcmp = 2; 2539 else 2540 refcmp = 1; 2541 2542 /* 2543 * On failure undo the work we've done so far and drop the 2544 * caller's ref and ncp. 2545 */ 2546 if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) { 2547 if (par) { 2548 spin_unlock(&nchpp->spin); 2549 _cache_put(par); 2550 } 2551 _cache_unlock(ncp); 2552 _cache_drop(ncp); 2553 return; 2554 } 2555 2556 /* 2557 * We own all the refs and with the spinlocks held no further 2558 * refs can be acquired by others. 2559 * 2560 * Remove us from the hash list and parent list. We have to 2561 * drop a ref on the parent's vp if the parent's list becomes 2562 * empty. 2563 */ 2564 dropvp = NULL; 2565 if (par) { 2566 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 2567 2568 KKASSERT(nchpp == ncp->nc_head); 2569 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash); 2570 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 2571 atomic_add_long(&pn->vfscache_count, -1); 2572 if (TAILQ_EMPTY(&ncp->nc_list)) 2573 atomic_add_long(&pn->vfscache_leafs, -1); 2574 2575 if (TAILQ_EMPTY(&par->nc_list)) { 2576 atomic_add_long(&pn->vfscache_leafs, 1); 2577 if (par->nc_vp) 2578 dropvp = par->nc_vp; 2579 } 2580 ncp->nc_parent = NULL; 2581 ncp->nc_head = NULL; 2582 spin_unlock(&nchpp->spin); 2583 _cache_drop(par); /* removal of ncp from par->nc_list */ 2584 /*_cache_unlock(par);*/ 2585 } else { 2586 KKASSERT(ncp->nc_head == NULL); 2587 } 2588 2589 /* 2590 * ncp should not have picked up any refs. Physically 2591 * destroy the ncp. 2592 */ 2593 if (ncp->nc_refs != refcmp) { 2594 panic("cache_zap: %p bad refs %d (expected %d)\n", 2595 ncp, ncp->nc_refs, refcmp); 2596 } 2597 /* _cache_unlock(ncp) not required */ 2598 ncp->nc_refs = -1; /* safety */ 2599 if (ncp->nc_name) 2600 kfree(ncp->nc_name, M_VFSCACHE); 2601 kfree(ncp, M_VFSCACHE); 2602 2603 /* 2604 * Delayed drop (we had to release our spinlocks) 2605 */ 2606 if (dropvp) 2607 vdrop(dropvp); 2608 2609 /* 2610 * Loop up if we can recursively clean out the parent. 2611 */ 2612 if (par) { 2613 refcmp = 1; /* ref on parent */ 2614 if (par->nc_parent) /* par->par */ 2615 ++refcmp; 2616 par->nc_flag &= ~NCF_DEFEREDZAP; 2617 if ((par->nc_flag & NCF_UNRESOLVED) && 2618 par->nc_refs == refcmp && 2619 TAILQ_EMPTY(&par->nc_list)) { 2620 ncp = par; 2621 goto again; 2622 } 2623 _cache_unlock(par); 2624 _cache_drop(par); 2625 } 2626 } 2627 2628 /* 2629 * Clean up dangling negative cache and defered-drop entries in the 2630 * namecache. 2631 * 2632 * This routine is called in the critical path and also called from 2633 * vnlru(). When called from vnlru we use a lower limit to try to 2634 * deal with the negative cache before the critical path has to start 2635 * dealing with it. 2636 */ 2637 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t; 2638 2639 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2640 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2641 2642 void 2643 cache_hysteresis(int critpath) 2644 { 2645 long poslimit; 2646 long neglimit = maxvnodes / ncnegfactor; 2647 long xnumcache = vfscache_leafs; 2648 2649 if (critpath == 0) 2650 neglimit = neglimit * 8 / 10; 2651 2652 /* 2653 * Don't cache too many negative hits. We use hysteresis to reduce 2654 * the impact on the critical path. 2655 */ 2656 switch(neg_cache_hysteresis_state[critpath]) { 2657 case CHI_LOW: 2658 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) { 2659 if (critpath) 2660 _cache_cleanneg(ncnegflush); 2661 else 2662 _cache_cleanneg(ncnegflush + 2663 vfscache_negs - neglimit); 2664 neg_cache_hysteresis_state[critpath] = CHI_HIGH; 2665 } 2666 break; 2667 case CHI_HIGH: 2668 if (vfscache_negs > MINNEG * 9 / 10 && 2669 vfscache_negs * 9 / 10 > neglimit 2670 ) { 2671 if (critpath) 2672 _cache_cleanneg(ncnegflush); 2673 else 2674 _cache_cleanneg(ncnegflush + 2675 vfscache_negs * 9 / 10 - 2676 neglimit); 2677 } else { 2678 neg_cache_hysteresis_state[critpath] = CHI_LOW; 2679 } 2680 break; 2681 } 2682 2683 /* 2684 * Don't cache too many positive hits. We use hysteresis to reduce 2685 * the impact on the critical path. 2686 * 2687 * Excessive positive hits can accumulate due to large numbers of 2688 * hardlinks (the vnode cache will not prevent hl ncps from growing 2689 * into infinity). 2690 */ 2691 if ((poslimit = ncposlimit) == 0) 2692 poslimit = maxvnodes * 2; 2693 if (critpath == 0) 2694 poslimit = poslimit * 8 / 10; 2695 2696 switch(pos_cache_hysteresis_state[critpath]) { 2697 case CHI_LOW: 2698 if (xnumcache > poslimit && xnumcache > MINPOS) { 2699 if (critpath) 2700 _cache_cleanpos(ncposflush); 2701 else 2702 _cache_cleanpos(ncposflush + 2703 xnumcache - poslimit); 2704 pos_cache_hysteresis_state[critpath] = CHI_HIGH; 2705 } 2706 break; 2707 case CHI_HIGH: 2708 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) { 2709 if (critpath) 2710 _cache_cleanpos(ncposflush); 2711 else 2712 _cache_cleanpos(ncposflush + 2713 xnumcache - poslimit * 5 / 6); 2714 } else { 2715 pos_cache_hysteresis_state[critpath] = CHI_LOW; 2716 } 2717 break; 2718 } 2719 2720 /* 2721 * Clean out dangling defered-zap ncps which could not be cleanly 2722 * dropped if too many build up. Note that numdefered is 2723 * heuristical. Make sure we are real-time for the current cpu, 2724 * plus the global rollup. 2725 */ 2726 if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) { 2727 _cache_cleandefered(); 2728 } 2729 } 2730 2731 /* 2732 * NEW NAMECACHE LOOKUP API 2733 * 2734 * Lookup an entry in the namecache. The passed par_nch must be referenced 2735 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp 2736 * is ALWAYS returned, eve if the supplied component is illegal. 2737 * 2738 * The resulting namecache entry should be returned to the system with 2739 * cache_put() or cache_unlock() + cache_drop(). 2740 * 2741 * namecache locks are recursive but care must be taken to avoid lock order 2742 * reversals (hence why the passed par_nch must be unlocked). Locking 2743 * rules are to order for parent traversals, not for child traversals. 2744 * 2745 * Nobody else will be able to manipulate the associated namespace (e.g. 2746 * create, delete, rename, rename-target) until the caller unlocks the 2747 * entry. 2748 * 2749 * The returned entry will be in one of three states: positive hit (non-null 2750 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set). 2751 * Unresolved entries must be resolved through the filesystem to associate the 2752 * vnode and/or determine whether a positive or negative hit has occured. 2753 * 2754 * It is not necessary to lock a directory in order to lock namespace under 2755 * that directory. In fact, it is explicitly not allowed to do that. A 2756 * directory is typically only locked when being created, renamed, or 2757 * destroyed. 2758 * 2759 * The directory (par) may be unresolved, in which case any returned child 2760 * will likely also be marked unresolved. Likely but not guarenteed. Since 2761 * the filesystem lookup requires a resolved directory vnode the caller is 2762 * responsible for resolving the namecache chain top-down. This API 2763 * specifically allows whole chains to be created in an unresolved state. 2764 */ 2765 struct nchandle 2766 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc) 2767 { 2768 struct nchandle nch; 2769 struct namecache *ncp; 2770 struct namecache *new_ncp; 2771 struct namecache *rep_ncp; /* reuse a destroyed ncp */ 2772 struct nchash_head *nchpp; 2773 struct mount *mp; 2774 u_int32_t hash; 2775 globaldata_t gd; 2776 int par_locked; 2777 2778 gd = mycpu; 2779 mp = par_nch->mount; 2780 par_locked = 0; 2781 2782 /* 2783 * This is a good time to call it, no ncp's are locked by 2784 * the caller or us. 2785 */ 2786 cache_hysteresis(1); 2787 2788 /* 2789 * Try to locate an existing entry 2790 */ 2791 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 2792 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 2793 new_ncp = NULL; 2794 nchpp = NCHHASH(hash); 2795 restart: 2796 rep_ncp = NULL; 2797 if (new_ncp) 2798 spin_lock(&nchpp->spin); 2799 else 2800 spin_lock_shared(&nchpp->spin); 2801 2802 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 2803 /* 2804 * Break out if we find a matching entry. Note that 2805 * UNRESOLVED entries may match, but DESTROYED entries 2806 * do not. 2807 * 2808 * We may be able to reuse DESTROYED entries that we come 2809 * across, even if the name does not match, as long as 2810 * nc_nlen is correct. 2811 */ 2812 if (ncp->nc_parent == par_nch->ncp && 2813 ncp->nc_nlen == nlc->nlc_namelen) { 2814 if (ncp->nc_flag & NCF_DESTROYED) { 2815 if (ncp->nc_refs == 1 && rep_ncp == NULL) 2816 rep_ncp = ncp; 2817 continue; 2818 } 2819 if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen)) 2820 continue; 2821 _cache_hold(ncp); 2822 if (new_ncp) 2823 spin_unlock(&nchpp->spin); 2824 else 2825 spin_unlock_shared(&nchpp->spin); 2826 if (par_locked) { 2827 _cache_unlock(par_nch->ncp); 2828 par_locked = 0; 2829 } 2830 if (_cache_lock_special(ncp) == 0) { 2831 /* 2832 * Successfully locked but we must re-test 2833 * conditions that might have changed since 2834 * we did not have the lock before. 2835 */ 2836 if (ncp->nc_parent != par_nch->ncp || 2837 ncp->nc_nlen != nlc->nlc_namelen || 2838 bcmp(ncp->nc_name, nlc->nlc_nameptr, 2839 ncp->nc_nlen) || 2840 (ncp->nc_flag & NCF_DESTROYED)) { 2841 _cache_put(ncp); 2842 goto restart; 2843 } 2844 _cache_auto_unresolve(mp, ncp); 2845 if (new_ncp) 2846 _cache_free(new_ncp); 2847 goto found; 2848 } 2849 _cache_get(ncp); /* cycle the lock to block */ 2850 _cache_put(ncp); 2851 _cache_drop(ncp); 2852 goto restart; 2853 } 2854 } 2855 2856 /* 2857 * We failed to locate the entry, try to resurrect a destroyed 2858 * entry that we did find that is already correctly linked into 2859 * nchpp and the parent. We must re-test conditions after 2860 * successfully locking rep_ncp. 2861 * 2862 * This case can occur under heavy loads due to not being able 2863 * to safely lock the parent in cache_zap(). Nominally a repeated 2864 * create/unlink load, but only the namelen needs to match. 2865 */ 2866 if (rep_ncp && new_ncp == NULL) { 2867 if (_cache_lock_nonblock(rep_ncp) == 0) { 2868 _cache_hold(rep_ncp); 2869 if (rep_ncp->nc_parent == par_nch->ncp && 2870 rep_ncp->nc_nlen == nlc->nlc_namelen && 2871 (rep_ncp->nc_flag & NCF_DESTROYED) && 2872 rep_ncp->nc_refs == 2) { 2873 /* 2874 * Update nc_name as reuse as new. 2875 */ 2876 ncp = rep_ncp; 2877 bcopy(nlc->nlc_nameptr, ncp->nc_name, 2878 nlc->nlc_namelen); 2879 spin_unlock_shared(&nchpp->spin); 2880 _cache_setunresolved(ncp); 2881 ncp->nc_flag = NCF_UNRESOLVED; 2882 ncp->nc_error = ENOTCONN; 2883 goto found; 2884 } 2885 _cache_put(rep_ncp); 2886 } 2887 } 2888 2889 /* 2890 * Otherwise create a new entry and add it to the cache. The parent 2891 * ncp must also be locked so we can link into it. 2892 * 2893 * We have to relookup after possibly blocking in kmalloc or 2894 * when locking par_nch. 2895 * 2896 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 2897 * mount case, in which case nc_name will be NULL. 2898 */ 2899 if (new_ncp == NULL) { 2900 spin_unlock_shared(&nchpp->spin); 2901 new_ncp = cache_alloc(nlc->nlc_namelen); 2902 if (nlc->nlc_namelen) { 2903 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 2904 nlc->nlc_namelen); 2905 new_ncp->nc_name[nlc->nlc_namelen] = 0; 2906 } 2907 goto restart; 2908 } 2909 2910 /* 2911 * NOTE! The spinlock is held exclusively here because new_ncp 2912 * is non-NULL. 2913 */ 2914 if (par_locked == 0) { 2915 spin_unlock(&nchpp->spin); 2916 _cache_lock(par_nch->ncp); 2917 par_locked = 1; 2918 goto restart; 2919 } 2920 2921 /* 2922 * Link to parent (requires another ref, the one already in new_ncp 2923 * is what we wil lreturn). 2924 * 2925 * WARNING! We still hold the spinlock. We have to set the hash 2926 * table entry atomically. 2927 */ 2928 ncp = new_ncp; 2929 ++ncp->nc_refs; 2930 _cache_link_parent(ncp, par_nch->ncp, nchpp); 2931 spin_unlock(&nchpp->spin); 2932 _cache_unlock(par_nch->ncp); 2933 /* par_locked = 0 - not used */ 2934 found: 2935 /* 2936 * stats and namecache size management 2937 */ 2938 if (ncp->nc_flag & NCF_UNRESOLVED) 2939 ++gd->gd_nchstats->ncs_miss; 2940 else if (ncp->nc_vp) 2941 ++gd->gd_nchstats->ncs_goodhits; 2942 else 2943 ++gd->gd_nchstats->ncs_neghits; 2944 nch.mount = mp; 2945 nch.ncp = ncp; 2946 _cache_mntref(nch.mount); 2947 2948 return(nch); 2949 } 2950 2951 /* 2952 * Attempt to lookup a namecache entry and return with a shared namecache 2953 * lock. This operates non-blocking. EWOULDBLOCK is returned if excl is 2954 * set or we are unable to lock. 2955 */ 2956 int 2957 cache_nlookup_maybe_shared(struct nchandle *par_nch, struct nlcomponent *nlc, 2958 int excl, struct nchandle *res_nch) 2959 { 2960 struct namecache *ncp; 2961 struct nchash_head *nchpp; 2962 struct mount *mp; 2963 u_int32_t hash; 2964 globaldata_t gd; 2965 2966 /* 2967 * If exclusive requested or shared namecache locks are disabled, 2968 * return failure. 2969 */ 2970 if (ncp_shared_lock_disable || excl) 2971 return(EWOULDBLOCK); 2972 2973 gd = mycpu; 2974 mp = par_nch->mount; 2975 2976 /* 2977 * This is a good time to call it, no ncp's are locked by 2978 * the caller or us. 2979 */ 2980 cache_hysteresis(1); 2981 2982 /* 2983 * Try to locate an existing entry 2984 */ 2985 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 2986 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 2987 nchpp = NCHHASH(hash); 2988 2989 spin_lock_shared(&nchpp->spin); 2990 2991 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 2992 /* 2993 * Break out if we find a matching entry. Note that 2994 * UNRESOLVED entries may match, but DESTROYED entries 2995 * do not. 2996 */ 2997 if (ncp->nc_parent == par_nch->ncp && 2998 ncp->nc_nlen == nlc->nlc_namelen && 2999 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3000 (ncp->nc_flag & NCF_DESTROYED) == 0 3001 ) { 3002 _cache_hold(ncp); 3003 spin_unlock_shared(&nchpp->spin); 3004 3005 if (_cache_lock_shared_special(ncp) == 0) { 3006 if (ncp->nc_parent == par_nch->ncp && 3007 ncp->nc_nlen == nlc->nlc_namelen && 3008 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3009 ncp->nc_nlen) == 0 && 3010 (ncp->nc_flag & NCF_DESTROYED) == 0 && 3011 (ncp->nc_flag & NCF_UNRESOLVED) == 0 && 3012 _cache_auto_unresolve_test(mp, ncp) == 0) { 3013 goto found; 3014 } 3015 _cache_unlock(ncp); 3016 } 3017 _cache_drop(ncp); 3018 return(EWOULDBLOCK); 3019 } 3020 } 3021 3022 /* 3023 * Failure 3024 */ 3025 spin_unlock_shared(&nchpp->spin); 3026 return(EWOULDBLOCK); 3027 3028 /* 3029 * Success 3030 * 3031 * Note that nc_error might be non-zero (e.g ENOENT). 3032 */ 3033 found: 3034 res_nch->mount = mp; 3035 res_nch->ncp = ncp; 3036 ++gd->gd_nchstats->ncs_goodhits; 3037 _cache_mntref(res_nch->mount); 3038 3039 KKASSERT(ncp->nc_error != EWOULDBLOCK); 3040 return(ncp->nc_error); 3041 } 3042 3043 /* 3044 * This is a non-blocking verison of cache_nlookup() used by 3045 * nfs_readdirplusrpc_uio(). It can fail for any reason and 3046 * will return nch.ncp == NULL in that case. 3047 */ 3048 struct nchandle 3049 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc) 3050 { 3051 struct nchandle nch; 3052 struct namecache *ncp; 3053 struct namecache *new_ncp; 3054 struct nchash_head *nchpp; 3055 struct mount *mp; 3056 u_int32_t hash; 3057 globaldata_t gd; 3058 int par_locked; 3059 3060 gd = mycpu; 3061 mp = par_nch->mount; 3062 par_locked = 0; 3063 3064 /* 3065 * Try to locate an existing entry 3066 */ 3067 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3068 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3069 new_ncp = NULL; 3070 nchpp = NCHHASH(hash); 3071 restart: 3072 spin_lock(&nchpp->spin); 3073 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3074 /* 3075 * Break out if we find a matching entry. Note that 3076 * UNRESOLVED entries may match, but DESTROYED entries 3077 * do not. 3078 */ 3079 if (ncp->nc_parent == par_nch->ncp && 3080 ncp->nc_nlen == nlc->nlc_namelen && 3081 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3082 (ncp->nc_flag & NCF_DESTROYED) == 0 3083 ) { 3084 _cache_hold(ncp); 3085 spin_unlock(&nchpp->spin); 3086 if (par_locked) { 3087 _cache_unlock(par_nch->ncp); 3088 par_locked = 0; 3089 } 3090 if (_cache_lock_special(ncp) == 0) { 3091 if (ncp->nc_parent != par_nch->ncp || 3092 ncp->nc_nlen != nlc->nlc_namelen || 3093 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) || 3094 (ncp->nc_flag & NCF_DESTROYED)) { 3095 kprintf("cache_lookup_nonblock: " 3096 "ncp-race %p %*.*s\n", 3097 ncp, 3098 nlc->nlc_namelen, 3099 nlc->nlc_namelen, 3100 nlc->nlc_nameptr); 3101 _cache_unlock(ncp); 3102 _cache_drop(ncp); 3103 goto failed; 3104 } 3105 _cache_auto_unresolve(mp, ncp); 3106 if (new_ncp) { 3107 _cache_free(new_ncp); 3108 new_ncp = NULL; 3109 } 3110 goto found; 3111 } 3112 _cache_drop(ncp); 3113 goto failed; 3114 } 3115 } 3116 3117 /* 3118 * We failed to locate an entry, create a new entry and add it to 3119 * the cache. The parent ncp must also be locked so we 3120 * can link into it. 3121 * 3122 * We have to relookup after possibly blocking in kmalloc or 3123 * when locking par_nch. 3124 * 3125 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3126 * mount case, in which case nc_name will be NULL. 3127 */ 3128 if (new_ncp == NULL) { 3129 spin_unlock(&nchpp->spin); 3130 new_ncp = cache_alloc(nlc->nlc_namelen); 3131 if (nlc->nlc_namelen) { 3132 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3133 nlc->nlc_namelen); 3134 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3135 } 3136 goto restart; 3137 } 3138 if (par_locked == 0) { 3139 spin_unlock(&nchpp->spin); 3140 if (_cache_lock_nonblock(par_nch->ncp) == 0) { 3141 par_locked = 1; 3142 goto restart; 3143 } 3144 goto failed; 3145 } 3146 3147 /* 3148 * Link to parent (requires another ref, the one already in new_ncp 3149 * is what we wil lreturn). 3150 * 3151 * WARNING! We still hold the spinlock. We have to set the hash 3152 * table entry atomically. 3153 */ 3154 ncp = new_ncp; 3155 ++ncp->nc_refs; 3156 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3157 spin_unlock(&nchpp->spin); 3158 _cache_unlock(par_nch->ncp); 3159 /* par_locked = 0 - not used */ 3160 found: 3161 /* 3162 * stats and namecache size management 3163 */ 3164 if (ncp->nc_flag & NCF_UNRESOLVED) 3165 ++gd->gd_nchstats->ncs_miss; 3166 else if (ncp->nc_vp) 3167 ++gd->gd_nchstats->ncs_goodhits; 3168 else 3169 ++gd->gd_nchstats->ncs_neghits; 3170 nch.mount = mp; 3171 nch.ncp = ncp; 3172 _cache_mntref(nch.mount); 3173 3174 return(nch); 3175 failed: 3176 if (new_ncp) { 3177 _cache_free(new_ncp); 3178 new_ncp = NULL; 3179 } 3180 nch.mount = NULL; 3181 nch.ncp = NULL; 3182 return(nch); 3183 } 3184 3185 /* 3186 * The namecache entry is marked as being used as a mount point. 3187 * Locate the mount if it is visible to the caller. The DragonFly 3188 * mount system allows arbitrary loops in the topology and disentangles 3189 * those loops by matching against (mp, ncp) rather than just (ncp). 3190 * This means any given ncp can dive any number of mounts, depending 3191 * on the relative mount (e.g. nullfs) the caller is at in the topology. 3192 * 3193 * We use a very simple frontend cache to reduce SMP conflicts, 3194 * which we have to do because the mountlist scan needs an exclusive 3195 * lock around its ripout info list. Not to mention that there might 3196 * be a lot of mounts. 3197 * 3198 * The hash table is 4-way set-associative and will either return the 3199 * matching slot or the best slot to reuse. 3200 */ 3201 struct findmount_info { 3202 struct mount *result; 3203 struct mount *nch_mount; 3204 struct namecache *nch_ncp; 3205 }; 3206 3207 static 3208 struct ncmount_cache * 3209 ncmount_cache_lookup4(struct mount *mp, struct namecache *ncp) 3210 { 3211 uintptr_t hash; 3212 3213 hash = ((uintptr_t)mp / sizeof(*mp)) * 3214 ((uintptr_t)ncp / sizeof(*ncp)); 3215 hash ^= (uintptr_t)ncp >> 12; 3216 hash ^= (uintptr_t)mp >> 12; 3217 hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~3); 3218 3219 return (&ncmount_cache[hash]); 3220 } 3221 3222 static 3223 struct ncmount_cache * 3224 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp) 3225 { 3226 struct ncmount_cache *ncc; 3227 struct ncmount_cache *best; 3228 uintptr_t hash; 3229 int delta; 3230 int best_delta; 3231 int i; 3232 3233 hash = ((uintptr_t)mp / sizeof(*mp)) * 3234 ((uintptr_t)ncp / sizeof(*ncp)); 3235 hash ^= (uintptr_t)ncp >> 12; 3236 hash ^= (uintptr_t)mp >> 12; 3237 hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~3); 3238 3239 ncc = &ncmount_cache[hash]; 3240 3241 /* 3242 * NOTE: When checking for a ticks overflow implement a slop of 3243 * 2 ticks just to be safe, because ticks is accessed 3244 * non-atomically one CPU can increment it while another 3245 * is still using the old value. 3246 */ 3247 if (ncc->mp == mp && ncc->ncp == ncp) /* 0 */ 3248 return ncc; 3249 delta = (int)(ticks - ncc->ticks); /* beware GCC opts */ 3250 if (delta < -2) /* overflow reset */ 3251 ncc->ticks = ticks; 3252 best = ncc; 3253 best_delta = delta; 3254 3255 for (i = 1; i < 4; ++i) { /* 1, 2, 3 */ 3256 ++ncc; 3257 if (ncc->mp == mp && ncc->ncp == ncp) 3258 return ncc; 3259 delta = (int)(ticks - ncc->ticks); 3260 if (delta < -2) 3261 ncc->ticks = ticks; 3262 if (delta > best_delta) { 3263 best_delta = delta; 3264 best = ncc; 3265 } 3266 } 3267 return best; 3268 } 3269 3270 static 3271 int 3272 cache_findmount_callback(struct mount *mp, void *data) 3273 { 3274 struct findmount_info *info = data; 3275 3276 /* 3277 * Check the mount's mounted-on point against the passed nch. 3278 */ 3279 if (mp->mnt_ncmounton.mount == info->nch_mount && 3280 mp->mnt_ncmounton.ncp == info->nch_ncp 3281 ) { 3282 info->result = mp; 3283 _cache_mntref(mp); 3284 return(-1); 3285 } 3286 return(0); 3287 } 3288 3289 /* 3290 * Find the recursive mountpoint (mp, ncp) -> mtpt 3291 */ 3292 struct mount * 3293 cache_findmount(struct nchandle *nch) 3294 { 3295 struct findmount_info info; 3296 struct ncmount_cache *ncc; 3297 struct mount *mp; 3298 3299 /* 3300 * Fast 3301 */ 3302 if (ncmount_cache_enable == 0) { 3303 ncc = NULL; 3304 goto skip; 3305 } 3306 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3307 if (ncc->ncp == nch->ncp) { 3308 spin_lock_shared(&ncc->spin); 3309 if (ncc->isneg == 0 && 3310 ncc->ncp == nch->ncp && (mp = ncc->mp) != NULL) { 3311 if (mp->mnt_ncmounton.mount == nch->mount && 3312 mp->mnt_ncmounton.ncp == nch->ncp) { 3313 /* 3314 * Cache hit (positive) (avoid dirtying 3315 * the cache line if possible) 3316 */ 3317 if (ncc->ticks != (int)ticks) 3318 ncc->ticks = (int)ticks; 3319 _cache_mntref(mp); 3320 spin_unlock_shared(&ncc->spin); 3321 return(mp); 3322 } 3323 /* else cache miss */ 3324 } 3325 if (ncc->isneg && 3326 ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3327 /* 3328 * Cache hit (negative) (avoid dirtying 3329 * the cache line if possible) 3330 */ 3331 if (ncc->ticks != (int)ticks) 3332 ncc->ticks = (int)ticks; 3333 spin_unlock_shared(&ncc->spin); 3334 return(NULL); 3335 } 3336 spin_unlock_shared(&ncc->spin); 3337 } 3338 skip: 3339 3340 /* 3341 * Slow 3342 */ 3343 info.result = NULL; 3344 info.nch_mount = nch->mount; 3345 info.nch_ncp = nch->ncp; 3346 mountlist_scan(cache_findmount_callback, &info, 3347 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | MNTSCAN_NOUNLOCK); 3348 3349 /* 3350 * Cache the result. 3351 * 3352 * Negative lookups: We cache the originating {ncp,mp}. (mp) is 3353 * only used for pointer comparisons and is not 3354 * referenced (otherwise there would be dangling 3355 * refs). 3356 * 3357 * Positive lookups: We cache the originating {ncp} and the target 3358 * (mp). (mp) is referenced. 3359 * 3360 * Indeterminant: If the match is undergoing an unmount we do 3361 * not cache it to avoid racing cache_unmounting(), 3362 * but still return the match. 3363 */ 3364 if (ncc) { 3365 spin_lock(&ncc->spin); 3366 if (info.result == NULL) { 3367 if (ncc->isneg == 0 && ncc->mp) 3368 _cache_mntrel(ncc->mp); 3369 ncc->ncp = nch->ncp; 3370 ncc->mp = nch->mount; 3371 ncc->isneg = 1; 3372 ncc->ticks = (int)ticks; 3373 spin_unlock(&ncc->spin); 3374 } else if ((info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0) { 3375 if (ncc->isneg == 0 && ncc->mp) 3376 _cache_mntrel(ncc->mp); 3377 _cache_mntref(info.result); 3378 ncc->ncp = nch->ncp; 3379 ncc->mp = info.result; 3380 ncc->isneg = 0; 3381 ncc->ticks = (int)ticks; 3382 spin_unlock(&ncc->spin); 3383 } else { 3384 spin_unlock(&ncc->spin); 3385 } 3386 } 3387 return(info.result); 3388 } 3389 3390 void 3391 cache_dropmount(struct mount *mp) 3392 { 3393 _cache_mntrel(mp); 3394 } 3395 3396 void 3397 cache_ismounting(struct mount *mp) 3398 { 3399 struct nchandle *nch = &mp->mnt_ncmounton; 3400 struct ncmount_cache *ncc; 3401 int i; 3402 3403 ncc = ncmount_cache_lookup4(nch->mount, nch->ncp); 3404 for (i = 0; i < 4; ++i) { 3405 if (ncc->isneg && 3406 ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3407 spin_lock(&ncc->spin); 3408 if (ncc->isneg && 3409 ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3410 ncc->ncp = NULL; 3411 ncc->mp = NULL; 3412 ncc->ticks = (int)ticks - hz * 120; 3413 } 3414 spin_unlock(&ncc->spin); 3415 } 3416 ++ncc; 3417 } 3418 } 3419 3420 void 3421 cache_unmounting(struct mount *mp) 3422 { 3423 struct nchandle *nch = &mp->mnt_ncmounton; 3424 struct ncmount_cache *ncc; 3425 int i; 3426 3427 ncc = ncmount_cache_lookup4(nch->mount, nch->ncp); 3428 for (i = 0; i < 4; ++i) { 3429 if (ncc->isneg == 0 && 3430 ncc->ncp == nch->ncp && ncc->mp == mp) { 3431 spin_lock(&ncc->spin); 3432 if (ncc->isneg == 0 && 3433 ncc->ncp == nch->ncp && ncc->mp == mp) { 3434 _cache_mntrel(mp); 3435 ncc->ncp = NULL; 3436 ncc->mp = NULL; 3437 ncc->ticks = (int)ticks - hz * 120; 3438 } 3439 spin_unlock(&ncc->spin); 3440 } 3441 ++ncc; 3442 } 3443 } 3444 3445 /* 3446 * Resolve an unresolved namecache entry, generally by looking it up. 3447 * The passed ncp must be locked and refd. 3448 * 3449 * Theoretically since a vnode cannot be recycled while held, and since 3450 * the nc_parent chain holds its vnode as long as children exist, the 3451 * direct parent of the cache entry we are trying to resolve should 3452 * have a valid vnode. If not then generate an error that we can 3453 * determine is related to a resolver bug. 3454 * 3455 * However, if a vnode was in the middle of a recyclement when the NCP 3456 * got locked, ncp->nc_vp might point to a vnode that is about to become 3457 * invalid. cache_resolve() handles this case by unresolving the entry 3458 * and then re-resolving it. 3459 * 3460 * Note that successful resolution does not necessarily return an error 3461 * code of 0. If the ncp resolves to a negative cache hit then ENOENT 3462 * will be returned. 3463 */ 3464 int 3465 cache_resolve(struct nchandle *nch, struct ucred *cred) 3466 { 3467 struct namecache *par_tmp; 3468 struct namecache *par; 3469 struct namecache *ncp; 3470 struct nchandle nctmp; 3471 struct mount *mp; 3472 struct vnode *dvp; 3473 int error; 3474 3475 ncp = nch->ncp; 3476 mp = nch->mount; 3477 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 3478 restart: 3479 /* 3480 * If the ncp is already resolved we have nothing to do. However, 3481 * we do want to guarentee that a usable vnode is returned when 3482 * a vnode is present, so make sure it hasn't been reclaimed. 3483 */ 3484 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3485 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 3486 _cache_setunresolved(ncp); 3487 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 3488 return (ncp->nc_error); 3489 } 3490 3491 /* 3492 * If the ncp was destroyed it will never resolve again. This 3493 * can basically only happen when someone is chdir'd into an 3494 * empty directory which is then rmdir'd. We want to catch this 3495 * here and not dive the VFS because the VFS might actually 3496 * have a way to re-resolve the disconnected ncp, which will 3497 * result in inconsistencies in the cdir/nch for proc->p_fd. 3498 */ 3499 if (ncp->nc_flag & NCF_DESTROYED) 3500 return(EINVAL); 3501 3502 /* 3503 * Mount points need special handling because the parent does not 3504 * belong to the same filesystem as the ncp. 3505 */ 3506 if (ncp == mp->mnt_ncmountpt.ncp) 3507 return (cache_resolve_mp(mp)); 3508 3509 /* 3510 * We expect an unbroken chain of ncps to at least the mount point, 3511 * and even all the way to root (but this code doesn't have to go 3512 * past the mount point). 3513 */ 3514 if (ncp->nc_parent == NULL) { 3515 kprintf("EXDEV case 1 %p %*.*s\n", ncp, 3516 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 3517 ncp->nc_error = EXDEV; 3518 return(ncp->nc_error); 3519 } 3520 3521 /* 3522 * The vp's of the parent directories in the chain are held via vhold() 3523 * due to the existance of the child, and should not disappear. 3524 * However, there are cases where they can disappear: 3525 * 3526 * - due to filesystem I/O errors. 3527 * - due to NFS being stupid about tracking the namespace and 3528 * destroys the namespace for entire directories quite often. 3529 * - due to forced unmounts. 3530 * - due to an rmdir (parent will be marked DESTROYED) 3531 * 3532 * When this occurs we have to track the chain backwards and resolve 3533 * it, looping until the resolver catches up to the current node. We 3534 * could recurse here but we might run ourselves out of kernel stack 3535 * so we do it in a more painful manner. This situation really should 3536 * not occur all that often, or if it does not have to go back too 3537 * many nodes to resolve the ncp. 3538 */ 3539 while ((dvp = cache_dvpref(ncp)) == NULL) { 3540 /* 3541 * This case can occur if a process is CD'd into a 3542 * directory which is then rmdir'd. If the parent is marked 3543 * destroyed there is no point trying to resolve it. 3544 */ 3545 if (ncp->nc_parent->nc_flag & NCF_DESTROYED) 3546 return(ENOENT); 3547 par = ncp->nc_parent; 3548 _cache_hold(par); 3549 _cache_lock(par); 3550 while ((par_tmp = par->nc_parent) != NULL && 3551 par_tmp->nc_vp == NULL) { 3552 _cache_hold(par_tmp); 3553 _cache_lock(par_tmp); 3554 _cache_put(par); 3555 par = par_tmp; 3556 } 3557 if (par->nc_parent == NULL) { 3558 kprintf("EXDEV case 2 %*.*s\n", 3559 par->nc_nlen, par->nc_nlen, par->nc_name); 3560 _cache_put(par); 3561 return (EXDEV); 3562 } 3563 /* 3564 * The parent is not set in stone, ref and lock it to prevent 3565 * it from disappearing. Also note that due to renames it 3566 * is possible for our ncp to move and for par to no longer 3567 * be one of its parents. We resolve it anyway, the loop 3568 * will handle any moves. 3569 */ 3570 _cache_get(par); /* additional hold/lock */ 3571 _cache_put(par); /* from earlier hold/lock */ 3572 if (par == nch->mount->mnt_ncmountpt.ncp) { 3573 cache_resolve_mp(nch->mount); 3574 } else if ((dvp = cache_dvpref(par)) == NULL) { 3575 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", par->nc_nlen, par->nc_nlen, par->nc_name); 3576 _cache_put(par); 3577 continue; 3578 } else { 3579 if (par->nc_flag & NCF_UNRESOLVED) { 3580 nctmp.mount = mp; 3581 nctmp.ncp = par; 3582 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 3583 } 3584 vrele(dvp); 3585 } 3586 if ((error = par->nc_error) != 0) { 3587 if (par->nc_error != EAGAIN) { 3588 kprintf("EXDEV case 3 %*.*s error %d\n", 3589 par->nc_nlen, par->nc_nlen, par->nc_name, 3590 par->nc_error); 3591 _cache_put(par); 3592 return(error); 3593 } 3594 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n", 3595 par, par->nc_nlen, par->nc_nlen, par->nc_name); 3596 } 3597 _cache_put(par); 3598 /* loop */ 3599 } 3600 3601 /* 3602 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected 3603 * ncp's and reattach them. If this occurs the original ncp is marked 3604 * EAGAIN to force a relookup. 3605 * 3606 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed 3607 * ncp must already be resolved. 3608 */ 3609 if (dvp) { 3610 nctmp.mount = mp; 3611 nctmp.ncp = ncp; 3612 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 3613 vrele(dvp); 3614 } else { 3615 ncp->nc_error = EPERM; 3616 } 3617 if (ncp->nc_error == EAGAIN) { 3618 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n", 3619 ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 3620 goto restart; 3621 } 3622 return(ncp->nc_error); 3623 } 3624 3625 /* 3626 * Resolve the ncp associated with a mount point. Such ncp's almost always 3627 * remain resolved and this routine is rarely called. NFS MPs tends to force 3628 * re-resolution more often due to its mac-truck-smash-the-namecache 3629 * method of tracking namespace changes. 3630 * 3631 * The semantics for this call is that the passed ncp must be locked on 3632 * entry and will be locked on return. However, if we actually have to 3633 * resolve the mount point we temporarily unlock the entry in order to 3634 * avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of 3635 * the unlock we have to recheck the flags after we relock. 3636 */ 3637 static int 3638 cache_resolve_mp(struct mount *mp) 3639 { 3640 struct namecache *ncp = mp->mnt_ncmountpt.ncp; 3641 struct vnode *vp; 3642 int error; 3643 3644 KKASSERT(mp != NULL); 3645 3646 /* 3647 * If the ncp is already resolved we have nothing to do. However, 3648 * we do want to guarentee that a usable vnode is returned when 3649 * a vnode is present, so make sure it hasn't been reclaimed. 3650 */ 3651 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3652 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 3653 _cache_setunresolved(ncp); 3654 } 3655 3656 if (ncp->nc_flag & NCF_UNRESOLVED) { 3657 _cache_unlock(ncp); 3658 while (vfs_busy(mp, 0)) 3659 ; 3660 error = VFS_ROOT(mp, &vp); 3661 _cache_lock(ncp); 3662 3663 /* 3664 * recheck the ncp state after relocking. 3665 */ 3666 if (ncp->nc_flag & NCF_UNRESOLVED) { 3667 ncp->nc_error = error; 3668 if (error == 0) { 3669 _cache_setvp(mp, ncp, vp); 3670 vput(vp); 3671 } else { 3672 kprintf("[diagnostic] cache_resolve_mp: failed" 3673 " to resolve mount %p err=%d ncp=%p\n", 3674 mp, error, ncp); 3675 _cache_setvp(mp, ncp, NULL); 3676 } 3677 } else if (error == 0) { 3678 vput(vp); 3679 } 3680 vfs_unbusy(mp); 3681 } 3682 return(ncp->nc_error); 3683 } 3684 3685 /* 3686 * Clean out negative cache entries when too many have accumulated. 3687 */ 3688 static void 3689 _cache_cleanneg(long count) 3690 { 3691 struct pcpu_ncache *pn; 3692 struct namecache *ncp; 3693 static uint32_t neg_rover; 3694 uint32_t n; 3695 long vnegs; 3696 3697 n = neg_rover++; /* SMP heuristical, race ok */ 3698 cpu_ccfence(); 3699 n = n % (uint32_t)ncpus; 3700 3701 /* 3702 * Normalize vfscache_negs and count. count is sometimes based 3703 * on vfscache_negs. vfscache_negs is heuristical and can sometimes 3704 * have crazy values. 3705 */ 3706 vnegs = vfscache_negs; 3707 cpu_ccfence(); 3708 if (vnegs <= MINNEG) 3709 vnegs = MINNEG; 3710 if (count < 1) 3711 count = 1; 3712 3713 pn = &pcpu_ncache[n]; 3714 spin_lock(&pn->neg_spin); 3715 count = pn->neg_count * count / vnegs + 1; 3716 spin_unlock(&pn->neg_spin); 3717 3718 /* 3719 * Attempt to clean out the specified number of negative cache 3720 * entries. 3721 */ 3722 while (count > 0) { 3723 spin_lock(&pn->neg_spin); 3724 ncp = TAILQ_FIRST(&pn->neg_list); 3725 if (ncp == NULL) { 3726 spin_unlock(&pn->neg_spin); 3727 break; 3728 } 3729 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode); 3730 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode); 3731 _cache_hold(ncp); 3732 spin_unlock(&pn->neg_spin); 3733 3734 /* 3735 * This can race, so we must re-check that the ncp 3736 * is on the ncneg.list after successfully locking it. 3737 */ 3738 if (_cache_lock_special(ncp) == 0) { 3739 if (ncp->nc_vp == NULL && 3740 (ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3741 cache_zap(ncp); 3742 } else { 3743 _cache_unlock(ncp); 3744 _cache_drop(ncp); 3745 } 3746 } else { 3747 _cache_drop(ncp); 3748 } 3749 --count; 3750 } 3751 } 3752 3753 /* 3754 * Clean out positive cache entries when too many have accumulated. 3755 */ 3756 static void 3757 _cache_cleanpos(long count) 3758 { 3759 static volatile int rover; 3760 struct nchash_head *nchpp; 3761 struct namecache *ncp; 3762 int rover_copy; 3763 3764 /* 3765 * Attempt to clean out the specified number of negative cache 3766 * entries. 3767 */ 3768 while (count > 0) { 3769 rover_copy = ++rover; /* MPSAFEENOUGH */ 3770 cpu_ccfence(); 3771 nchpp = NCHHASH(rover_copy); 3772 3773 if (TAILQ_FIRST(&nchpp->list) == NULL) { 3774 --count; 3775 continue; 3776 } 3777 3778 /* 3779 * Cycle ncp on list, ignore and do not move DUMMY 3780 * ncps. These are temporary list iterators. 3781 * 3782 * We must cycle the ncp to the end of the list to 3783 * ensure that all ncp's have an equal chance of 3784 * being removed. 3785 */ 3786 spin_lock(&nchpp->spin); 3787 ncp = TAILQ_FIRST(&nchpp->list); 3788 while (ncp && (ncp->nc_flag & NCF_DUMMY)) 3789 ncp = TAILQ_NEXT(ncp, nc_hash); 3790 if (ncp) { 3791 TAILQ_REMOVE(&nchpp->list, ncp, nc_hash); 3792 TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash); 3793 _cache_hold(ncp); 3794 } 3795 spin_unlock(&nchpp->spin); 3796 3797 if (ncp) { 3798 if (_cache_lock_special(ncp) == 0) { 3799 cache_zap(ncp); 3800 } else { 3801 _cache_drop(ncp); 3802 } 3803 } 3804 --count; 3805 } 3806 } 3807 3808 /* 3809 * This is a kitchen sink function to clean out ncps which we 3810 * tried to zap from cache_drop() but failed because we were 3811 * unable to acquire the parent lock. 3812 * 3813 * Such entries can also be removed via cache_inval_vp(), such 3814 * as when unmounting. 3815 */ 3816 static void 3817 _cache_cleandefered(void) 3818 { 3819 struct nchash_head *nchpp; 3820 struct namecache *ncp; 3821 struct namecache dummy; 3822 int i; 3823 3824 /* 3825 * Create a list iterator. DUMMY indicates that this is a list 3826 * iterator, DESTROYED prevents matches by lookup functions. 3827 */ 3828 numdefered = 0; 3829 pcpu_ncache[mycpu->gd_cpuid].numdefered = 0; 3830 bzero(&dummy, sizeof(dummy)); 3831 dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY; 3832 dummy.nc_refs = 1; 3833 3834 for (i = 0; i <= nchash; ++i) { 3835 nchpp = &nchashtbl[i]; 3836 3837 spin_lock(&nchpp->spin); 3838 TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash); 3839 ncp = &dummy; 3840 while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) { 3841 if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0) 3842 continue; 3843 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 3844 TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash); 3845 _cache_hold(ncp); 3846 spin_unlock(&nchpp->spin); 3847 if (_cache_lock_nonblock(ncp) == 0) { 3848 ncp->nc_flag &= ~NCF_DEFEREDZAP; 3849 _cache_unlock(ncp); 3850 } 3851 _cache_drop(ncp); 3852 spin_lock(&nchpp->spin); 3853 ncp = &dummy; 3854 } 3855 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); 3856 spin_unlock(&nchpp->spin); 3857 } 3858 } 3859 3860 /* 3861 * Name cache initialization, from vfsinit() when we are booting 3862 */ 3863 void 3864 nchinit(void) 3865 { 3866 struct pcpu_ncache *pn; 3867 globaldata_t gd; 3868 int i; 3869 3870 /* 3871 * Per-cpu accounting and negative hit list 3872 */ 3873 pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus, 3874 M_VFSCACHE, M_WAITOK|M_ZERO); 3875 for (i = 0; i < ncpus; ++i) { 3876 pn = &pcpu_ncache[i]; 3877 TAILQ_INIT(&pn->neg_list); 3878 spin_init(&pn->neg_spin, "ncneg"); 3879 } 3880 3881 /* 3882 * Initialise per-cpu namecache effectiveness statistics. 3883 */ 3884 for (i = 0; i < ncpus; ++i) { 3885 gd = globaldata_find(i); 3886 gd->gd_nchstats = &nchstats[i]; 3887 } 3888 3889 /* 3890 * Create a generous namecache hash table 3891 */ 3892 nchashtbl = hashinit_ext(vfs_inodehashsize(), 3893 sizeof(struct nchash_head), 3894 M_VFSCACHE, &nchash); 3895 for (i = 0; i <= (int)nchash; ++i) { 3896 TAILQ_INIT(&nchashtbl[i].list); 3897 spin_init(&nchashtbl[i].spin, "nchinit_hash"); 3898 } 3899 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) 3900 spin_init(&ncmount_cache[i].spin, "nchinit_cache"); 3901 nclockwarn = 5 * hz; 3902 } 3903 3904 /* 3905 * Called from start_init() to bootstrap the root filesystem. Returns 3906 * a referenced, unlocked namecache record. 3907 */ 3908 void 3909 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp) 3910 { 3911 nch->ncp = cache_alloc(0); 3912 nch->mount = mp; 3913 _cache_mntref(mp); 3914 if (vp) 3915 _cache_setvp(nch->mount, nch->ncp, vp); 3916 } 3917 3918 /* 3919 * vfs_cache_setroot() 3920 * 3921 * Create an association between the root of our namecache and 3922 * the root vnode. This routine may be called several times during 3923 * booting. 3924 * 3925 * If the caller intends to save the returned namecache pointer somewhere 3926 * it must cache_hold() it. 3927 */ 3928 void 3929 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch) 3930 { 3931 struct vnode *ovp; 3932 struct nchandle onch; 3933 3934 ovp = rootvnode; 3935 onch = rootnch; 3936 rootvnode = nvp; 3937 if (nch) 3938 rootnch = *nch; 3939 else 3940 cache_zero(&rootnch); 3941 if (ovp) 3942 vrele(ovp); 3943 if (onch.ncp) 3944 cache_drop(&onch); 3945 } 3946 3947 /* 3948 * XXX OLD API COMPAT FUNCTION. This really messes up the new namecache 3949 * topology and is being removed as quickly as possible. The new VOP_N*() 3950 * API calls are required to make specific adjustments using the supplied 3951 * ncp pointers rather then just bogusly purging random vnodes. 3952 * 3953 * Invalidate all namecache entries to a particular vnode as well as 3954 * any direct children of that vnode in the namecache. This is a 3955 * 'catch all' purge used by filesystems that do not know any better. 3956 * 3957 * Note that the linkage between the vnode and its namecache entries will 3958 * be removed, but the namecache entries themselves might stay put due to 3959 * active references from elsewhere in the system or due to the existance of 3960 * the children. The namecache topology is left intact even if we do not 3961 * know what the vnode association is. Such entries will be marked 3962 * NCF_UNRESOLVED. 3963 */ 3964 void 3965 cache_purge(struct vnode *vp) 3966 { 3967 cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN); 3968 } 3969 3970 static int disablecwd; 3971 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, 3972 "Disable getcwd"); 3973 3974 static u_long numcwdcalls; 3975 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0, 3976 "Number of current directory resolution calls"); 3977 static u_long numcwdfailnf; 3978 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0, 3979 "Number of current directory failures due to lack of file"); 3980 static u_long numcwdfailsz; 3981 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0, 3982 "Number of current directory failures due to large result"); 3983 static u_long numcwdfound; 3984 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0, 3985 "Number of current directory resolution successes"); 3986 3987 /* 3988 * MPALMOSTSAFE 3989 */ 3990 int 3991 sys___getcwd(struct __getcwd_args *uap) 3992 { 3993 u_int buflen; 3994 int error; 3995 char *buf; 3996 char *bp; 3997 3998 if (disablecwd) 3999 return (ENODEV); 4000 4001 buflen = uap->buflen; 4002 if (buflen == 0) 4003 return (EINVAL); 4004 if (buflen > MAXPATHLEN) 4005 buflen = MAXPATHLEN; 4006 4007 buf = kmalloc(buflen, M_TEMP, M_WAITOK); 4008 bp = kern_getcwd(buf, buflen, &error); 4009 if (error == 0) 4010 error = copyout(bp, uap->buf, strlen(bp) + 1); 4011 kfree(buf, M_TEMP); 4012 return (error); 4013 } 4014 4015 char * 4016 kern_getcwd(char *buf, size_t buflen, int *error) 4017 { 4018 struct proc *p = curproc; 4019 char *bp; 4020 int i, slash_prefixed; 4021 struct filedesc *fdp; 4022 struct nchandle nch; 4023 struct namecache *ncp; 4024 4025 numcwdcalls++; 4026 bp = buf; 4027 bp += buflen - 1; 4028 *bp = '\0'; 4029 fdp = p->p_fd; 4030 slash_prefixed = 0; 4031 4032 nch = fdp->fd_ncdir; 4033 ncp = nch.ncp; 4034 if (ncp) 4035 _cache_hold(ncp); 4036 4037 while (ncp && (ncp != fdp->fd_nrdir.ncp || 4038 nch.mount != fdp->fd_nrdir.mount) 4039 ) { 4040 /* 4041 * While traversing upwards if we encounter the root 4042 * of the current mount we have to skip to the mount point 4043 * in the underlying filesystem. 4044 */ 4045 if (ncp == nch.mount->mnt_ncmountpt.ncp) { 4046 nch = nch.mount->mnt_ncmounton; 4047 _cache_drop(ncp); 4048 ncp = nch.ncp; 4049 if (ncp) 4050 _cache_hold(ncp); 4051 continue; 4052 } 4053 4054 /* 4055 * Prepend the path segment 4056 */ 4057 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 4058 if (bp == buf) { 4059 numcwdfailsz++; 4060 *error = ERANGE; 4061 bp = NULL; 4062 goto done; 4063 } 4064 *--bp = ncp->nc_name[i]; 4065 } 4066 if (bp == buf) { 4067 numcwdfailsz++; 4068 *error = ERANGE; 4069 bp = NULL; 4070 goto done; 4071 } 4072 *--bp = '/'; 4073 slash_prefixed = 1; 4074 4075 /* 4076 * Go up a directory. This isn't a mount point so we don't 4077 * have to check again. 4078 */ 4079 while ((nch.ncp = ncp->nc_parent) != NULL) { 4080 if (ncp_shared_lock_disable) 4081 _cache_lock(ncp); 4082 else 4083 _cache_lock_shared(ncp); 4084 if (nch.ncp != ncp->nc_parent) { 4085 _cache_unlock(ncp); 4086 continue; 4087 } 4088 _cache_hold(nch.ncp); 4089 _cache_unlock(ncp); 4090 break; 4091 } 4092 _cache_drop(ncp); 4093 ncp = nch.ncp; 4094 } 4095 if (ncp == NULL) { 4096 numcwdfailnf++; 4097 *error = ENOENT; 4098 bp = NULL; 4099 goto done; 4100 } 4101 if (!slash_prefixed) { 4102 if (bp == buf) { 4103 numcwdfailsz++; 4104 *error = ERANGE; 4105 bp = NULL; 4106 goto done; 4107 } 4108 *--bp = '/'; 4109 } 4110 numcwdfound++; 4111 *error = 0; 4112 done: 4113 if (ncp) 4114 _cache_drop(ncp); 4115 return (bp); 4116 } 4117 4118 /* 4119 * Thus begins the fullpath magic. 4120 * 4121 * The passed nchp is referenced but not locked. 4122 */ 4123 static int disablefullpath; 4124 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, 4125 &disablefullpath, 0, 4126 "Disable fullpath lookups"); 4127 4128 int 4129 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase, 4130 char **retbuf, char **freebuf, int guess) 4131 { 4132 struct nchandle fd_nrdir; 4133 struct nchandle nch; 4134 struct namecache *ncp; 4135 struct mount *mp, *new_mp; 4136 char *bp, *buf; 4137 int slash_prefixed; 4138 int error = 0; 4139 int i; 4140 4141 *retbuf = NULL; 4142 *freebuf = NULL; 4143 4144 buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK); 4145 bp = buf + MAXPATHLEN - 1; 4146 *bp = '\0'; 4147 if (nchbase) 4148 fd_nrdir = *nchbase; 4149 else if (p != NULL) 4150 fd_nrdir = p->p_fd->fd_nrdir; 4151 else 4152 fd_nrdir = rootnch; 4153 slash_prefixed = 0; 4154 nch = *nchp; 4155 ncp = nch.ncp; 4156 if (ncp) 4157 _cache_hold(ncp); 4158 mp = nch.mount; 4159 4160 while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) { 4161 new_mp = NULL; 4162 4163 /* 4164 * If we are asked to guess the upwards path, we do so whenever 4165 * we encounter an ncp marked as a mountpoint. We try to find 4166 * the actual mountpoint by finding the mountpoint with this 4167 * ncp. 4168 */ 4169 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) { 4170 new_mp = mount_get_by_nc(ncp); 4171 } 4172 /* 4173 * While traversing upwards if we encounter the root 4174 * of the current mount we have to skip to the mount point. 4175 */ 4176 if (ncp == mp->mnt_ncmountpt.ncp) { 4177 new_mp = mp; 4178 } 4179 if (new_mp) { 4180 nch = new_mp->mnt_ncmounton; 4181 _cache_drop(ncp); 4182 ncp = nch.ncp; 4183 if (ncp) 4184 _cache_hold(ncp); 4185 mp = nch.mount; 4186 continue; 4187 } 4188 4189 /* 4190 * Prepend the path segment 4191 */ 4192 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 4193 if (bp == buf) { 4194 kfree(buf, M_TEMP); 4195 error = ENOMEM; 4196 goto done; 4197 } 4198 *--bp = ncp->nc_name[i]; 4199 } 4200 if (bp == buf) { 4201 kfree(buf, M_TEMP); 4202 error = ENOMEM; 4203 goto done; 4204 } 4205 *--bp = '/'; 4206 slash_prefixed = 1; 4207 4208 /* 4209 * Go up a directory. This isn't a mount point so we don't 4210 * have to check again. 4211 * 4212 * We can only safely access nc_parent with ncp held locked. 4213 */ 4214 while ((nch.ncp = ncp->nc_parent) != NULL) { 4215 _cache_lock_shared(ncp); 4216 if (nch.ncp != ncp->nc_parent) { 4217 _cache_unlock(ncp); 4218 continue; 4219 } 4220 _cache_hold(nch.ncp); 4221 _cache_unlock(ncp); 4222 break; 4223 } 4224 _cache_drop(ncp); 4225 ncp = nch.ncp; 4226 } 4227 if (ncp == NULL) { 4228 kfree(buf, M_TEMP); 4229 error = ENOENT; 4230 goto done; 4231 } 4232 4233 if (!slash_prefixed) { 4234 if (bp == buf) { 4235 kfree(buf, M_TEMP); 4236 error = ENOMEM; 4237 goto done; 4238 } 4239 *--bp = '/'; 4240 } 4241 *retbuf = bp; 4242 *freebuf = buf; 4243 error = 0; 4244 done: 4245 if (ncp) 4246 _cache_drop(ncp); 4247 return(error); 4248 } 4249 4250 int 4251 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, 4252 char **freebuf, int guess) 4253 { 4254 struct namecache *ncp; 4255 struct nchandle nch; 4256 int error; 4257 4258 *freebuf = NULL; 4259 if (disablefullpath) 4260 return (ENODEV); 4261 4262 if (p == NULL) 4263 return (EINVAL); 4264 4265 /* vn is NULL, client wants us to use p->p_textvp */ 4266 if (vn == NULL) { 4267 if ((vn = p->p_textvp) == NULL) 4268 return (EINVAL); 4269 } 4270 spin_lock_shared(&vn->v_spin); 4271 TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) { 4272 if (ncp->nc_nlen) 4273 break; 4274 } 4275 if (ncp == NULL) { 4276 spin_unlock_shared(&vn->v_spin); 4277 return (EINVAL); 4278 } 4279 _cache_hold(ncp); 4280 spin_unlock_shared(&vn->v_spin); 4281 4282 nch.ncp = ncp; 4283 nch.mount = vn->v_mount; 4284 error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess); 4285 _cache_drop(ncp); 4286 return (error); 4287 } 4288 4289 void 4290 vfscache_rollup_cpu(struct globaldata *gd) 4291 { 4292 struct pcpu_ncache *pn; 4293 long count; 4294 4295 if (pcpu_ncache == NULL) 4296 return; 4297 pn = &pcpu_ncache[gd->gd_cpuid]; 4298 4299 if (pn->vfscache_count) { 4300 count = atomic_swap_long(&pn->vfscache_count, 0); 4301 atomic_add_long(&vfscache_count, count); 4302 } 4303 if (pn->vfscache_leafs) { 4304 count = atomic_swap_long(&pn->vfscache_leafs, 0); 4305 atomic_add_long(&vfscache_leafs, count); 4306 } 4307 if (pn->vfscache_negs) { 4308 count = atomic_swap_long(&pn->vfscache_negs, 0); 4309 atomic_add_long(&vfscache_negs, count); 4310 } 4311 if (pn->numdefered) { 4312 count = atomic_swap_long(&pn->numdefered, 0); 4313 atomic_add_long(&numdefered, count); 4314 } 4315 } 4316