1 /* 2 * Copyright (c) 2003-2020 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1989, 1993, 1995 35 * The Regents of the University of California. All rights reserved. 36 * 37 * This code is derived from software contributed to Berkeley by 38 * Poul-Henning Kamp of the FreeBSD Project. 39 * 40 * Redistribution and use in source and binary forms, with or without 41 * modification, are permitted provided that the following conditions 42 * are met: 43 * 1. Redistributions of source code must retain the above copyright 44 * notice, this list of conditions and the following disclaimer. 45 * 2. Redistributions in binary form must reproduce the above copyright 46 * notice, this list of conditions and the following disclaimer in the 47 * documentation and/or other materials provided with the distribution. 48 * 3. Neither the name of the University nor the names of its contributors 49 * may be used to endorse or promote products derived from this software 50 * without specific prior written permission. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * SUCH DAMAGE. 
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/sysproto.h>
#include <sys/spinlock.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/nlookup.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/globaldata.h>
#include <sys/kern_syscall.h>
#include <sys/dirent.h>
#include <ddb/ddb.h>

#include <sys/spinlock2.h>

#define MAX_RECURSION_DEPTH	64

/*
 * Random lookups in the cache are accomplished with a hash table using
 * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock,
 * but we use the ncp->update counter trick to avoid acquiring any
 * contestable spin-locks during a lookup.
 *
 * Negative entries may exist and correspond to resolved namecache
 * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
 * will be set if the entry corresponds to a whited-out directory entry
 * (versus simply not finding the entry at all).  pcpu_ncache[n].neg_list
 * is locked via pcpu_ncache[n].neg_spin.
 *
 * MPSAFE RULES:
 *
 * (1) ncp's typically have at least a nc_refs of 1, and usually 2.  One
 *     is applicable to direct lookups via the hash table nchpp or via
 *     nc_list (the two are added or removed together).  Removal of the ncp
 *     from the hash table drops this reference.  The second is applicable
 *     to vp->v_namecache linkages (or negative list linkages), and removal
 *     of the ncp from these lists drops this reference.
 *
 *     On the 1->0 transition of nc_refs the ncp can no longer be referenced
 *     and must be destroyed.  No other thread should have access to it at
 *     this point so it can be safely locked and freed without any deadlock
 *     fears.
 *
 *     The 1->0 transition can occur at almost any juncture and so
 *     cache_drop() deals with it directly.
 *
 * (2) Once the 1->0 transition occurs, the entity that caused the transition
 *     will be responsible for destroying the ncp.  The ncp cannot be on any
 *     list or hash at this time, or be held by anyone other than the caller
 *     responsible for the transition.
 *
 * (3) A ncp must be locked in order to modify it.
 *
 * (4) ncp locks are ordered, child-to-parent.  Child first, then parent.
 *     This may seem backwards but forward-scans use the hash table and thus
 *     can hold the parent unlocked while traversing downward.  Deletions,
 *     on the other hand, tend to propagate bottom-up since the ref on the
 *     parent is dropped as the children go away.
 *
 * (5) Both parent and child must be locked in order to enter the child onto
 *     the parent's nc_list.
 */

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash)		(&nchashtbl[(hash) & nchash])
#define MINNEG			1024
#define MINPOS			1024
#define NCMOUNT_NUMCACHE	(16384)	/* power of 2 */
#define NCMOUNT_SET		(8)	/* power of 2 */

MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

TAILQ_HEAD(nchash_list, namecache);

/*
 * Don't cachealign, but at least pad to 32 bytes so entries
 * don't cross a cache line.
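 * (With the layout below that works out to 16 + 8 + 8 = 32 bytes per
 * nchash_head, so two entries share a 64-byte cache line and neither
 * straddles one.)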
151 */ 152 struct nchash_head { 153 struct nchash_list list; /* 16 bytes */ 154 struct spinlock spin; /* 8 bytes */ 155 long pad01; /* 8 bytes */ 156 }; 157 158 struct ncmount_cache { 159 struct spinlock spin; 160 struct namecache *ncp; 161 struct mount *mp; 162 struct mount *mp_target; 163 int isneg; 164 int ticks; 165 long updating; 166 }; 167 168 struct pcpu_ncache { 169 struct spinlock umount_spin; /* cache_findmount/interlock */ 170 struct spinlock neg_spin; /* for neg_list and neg_count */ 171 struct namecache_list neg_list; 172 long neg_count; 173 long vfscache_negs; 174 long vfscache_count; 175 long vfscache_leafs; 176 long numdefered; 177 } __cachealign; 178 179 __read_mostly static struct nchash_head *nchashtbl; 180 __read_mostly static struct pcpu_ncache *pcpu_ncache; 181 static struct ncmount_cache ncmount_cache[NCMOUNT_NUMCACHE]; 182 183 /* 184 * ncvp_debug - debug cache_fromvp(). This is used by the NFS server 185 * to create the namecache infrastructure leading to a dangling vnode. 186 * 187 * 0 Only errors are reported 188 * 1 Successes are reported 189 * 2 Successes + the whole directory scan is reported 190 * 3 Force the directory scan code run as if the parent vnode did not 191 * have a namecache record, even if it does have one. 192 */ 193 __read_mostly static int ncvp_debug; 194 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0, 195 "Namecache debug level (0-3)"); 196 197 __read_mostly static u_long nchash; /* size of hash table */ 198 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 199 "Size of namecache hash table"); 200 201 __read_mostly static int ncnegflush = 10; /* burst for negative flush */ 202 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0, 203 "Batch flush negative entries"); 204 205 __read_mostly static int ncposflush = 10; /* burst for positive flush */ 206 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0, 207 "Batch flush positive entries"); 208 209 __read_mostly static int ncnegfactor = 16; /* ratio of negative entries */ 210 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 211 "Ratio of namecache negative entries"); 212 213 __read_mostly static int nclockwarn; /* warn on locked entries in ticks */ 214 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0, 215 "Warn on locked namecache entries in ticks"); 216 217 __read_mostly static int ncposlimit; /* number of cache entries allocated */ 218 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0, 219 "Number of cache entries allocated"); 220 221 __read_mostly static int ncp_shared_lock_disable = 0; 222 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW, 223 &ncp_shared_lock_disable, 0, "Disable shared namecache locks"); 224 225 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), 226 "sizeof(struct vnode)"); 227 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), 228 "sizeof(struct namecache)"); 229 230 __read_mostly static int ncmount_cache_enable = 1; 231 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW, 232 &ncmount_cache_enable, 0, "mount point cache"); 233 234 static __inline void _cache_drop(struct namecache *ncp); 235 static int cache_resolve_mp(struct mount *mp); 236 static int cache_findmount_callback(struct mount *mp, void *data); 237 static void _cache_setunresolved(struct namecache *ncp); 238 static void _cache_cleanneg(long count); 239 static void _cache_cleanpos(long count); 240 static void 
_cache_cleandefered(void); 241 static void _cache_unlink(struct namecache *ncp); 242 243 /* 244 * The new name cache statistics (these are rolled up globals and not 245 * modified in the critical path, see struct pcpu_ncache). 246 */ 247 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); 248 static long vfscache_negs; 249 SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0, 250 "Number of negative namecache entries"); 251 static long vfscache_count; 252 SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0, 253 "Number of namecaches entries"); 254 static long vfscache_leafs; 255 SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0, 256 "Number of namecaches entries"); 257 static long numdefered; 258 SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0, 259 "Number of cache entries allocated"); 260 261 262 struct nchstats nchstats[SMP_MAXCPU]; 263 /* 264 * Export VFS cache effectiveness statistics to user-land. 265 * 266 * The statistics are left for aggregation to user-land so 267 * neat things can be achieved, like observing per-CPU cache 268 * distribution. 269 */ 270 static int 271 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 272 { 273 struct globaldata *gd; 274 int i, error; 275 276 error = 0; 277 for (i = 0; i < ncpus; ++i) { 278 gd = globaldata_find(i); 279 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats), 280 sizeof(struct nchstats)))) 281 break; 282 } 283 284 return (error); 285 } 286 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD, 287 0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics"); 288 289 static void cache_zap(struct namecache *ncp); 290 291 /* 292 * Cache mount points and namecache records in order to avoid unnecessary 293 * atomic ops on mnt_refs and ncp->refs. This improves concurrent SMP 294 * performance and is particularly important on multi-socket systems to 295 * reduce cache-line ping-ponging. 296 * 297 * Try to keep the pcpu structure within one cache line (~64 bytes). 
298 */ 299 #define MNTCACHE_COUNT 32 /* power of 2, multiple of SET */ 300 #define MNTCACHE_SET 8 /* set associativity */ 301 302 struct mntcache_elm { 303 struct namecache *ncp; 304 struct mount *mp; 305 int ticks; 306 int unused01; 307 }; 308 309 struct mntcache { 310 struct mntcache_elm array[MNTCACHE_COUNT]; 311 } __cachealign; 312 313 static struct mntcache pcpu_mntcache[MAXCPU]; 314 315 static __inline 316 struct mntcache_elm * 317 _cache_mntcache_hash(void *ptr) 318 { 319 struct mntcache_elm *elm; 320 int hv; 321 322 hv = iscsi_crc32(&ptr, sizeof(ptr)) & (MNTCACHE_COUNT - 1); 323 elm = &pcpu_mntcache[mycpu->gd_cpuid].array[hv & ~(MNTCACHE_SET - 1)]; 324 325 return elm; 326 } 327 328 static 329 void 330 _cache_mntref(struct mount *mp) 331 { 332 struct mntcache_elm *elm; 333 struct mount *mpr; 334 int i; 335 336 elm = _cache_mntcache_hash(mp); 337 for (i = 0; i < MNTCACHE_SET; ++i) { 338 if (elm->mp == mp) { 339 mpr = atomic_swap_ptr((void *)&elm->mp, NULL); 340 if (__predict_true(mpr == mp)) 341 return; 342 if (mpr) 343 atomic_add_int(&mpr->mnt_refs, -1); 344 } 345 ++elm; 346 } 347 atomic_add_int(&mp->mnt_refs, 1); 348 } 349 350 static 351 void 352 _cache_mntrel(struct mount *mp) 353 { 354 struct mntcache_elm *elm; 355 struct mntcache_elm *best; 356 struct mount *mpr; 357 int delta1; 358 int delta2; 359 int i; 360 361 elm = _cache_mntcache_hash(mp); 362 best = elm; 363 for (i = 0; i < MNTCACHE_SET; ++i) { 364 if (elm->mp == NULL) { 365 mpr = atomic_swap_ptr((void *)&elm->mp, mp); 366 if (__predict_false(mpr != NULL)) { 367 atomic_add_int(&mpr->mnt_refs, -1); 368 } 369 elm->ticks = ticks; 370 return; 371 } 372 delta1 = ticks - best->ticks; 373 delta2 = ticks - elm->ticks; 374 if (delta2 > delta1 || delta1 < -1 || delta2 < -1) 375 best = elm; 376 ++elm; 377 } 378 mpr = atomic_swap_ptr((void *)&best->mp, mp); 379 best->ticks = ticks; 380 if (mpr) 381 atomic_add_int(&mpr->mnt_refs, -1); 382 } 383 384 /* 385 * Clears all cached mount points on all cpus. This routine should only 386 * be called when we are waiting for a mount to clear, e.g. so we can 387 * unmount. 388 */ 389 void 390 cache_clearmntcache(struct mount *target __unused) 391 { 392 int n; 393 394 for (n = 0; n < ncpus; ++n) { 395 struct mntcache *cache = &pcpu_mntcache[n]; 396 struct mntcache_elm *elm; 397 struct namecache *ncp; 398 struct mount *mp; 399 int i; 400 401 for (i = 0; i < MNTCACHE_COUNT; ++i) { 402 elm = &cache->array[i]; 403 if (elm->mp) { 404 mp = atomic_swap_ptr((void *)&elm->mp, NULL); 405 if (mp) 406 atomic_add_int(&mp->mnt_refs, -1); 407 } 408 if (elm->ncp) { 409 ncp = atomic_swap_ptr((void *)&elm->ncp, NULL); 410 if (ncp) 411 _cache_drop(ncp); 412 } 413 } 414 } 415 } 416 417 /* 418 * Namespace locking. The caller must already hold a reference to the 419 * namecache structure in order to lock/unlock it. The controlling entity 420 * in a 1->0 transition does not need to lock the ncp to dispose of it, 421 * as nobody else will have visiblity to it at that point. 422 * 423 * Note that holding a locked namecache structure prevents other threads 424 * from making namespace changes (e.g. deleting or creating), prevents 425 * vnode association state changes by other threads, and prevents the 426 * namecache entry from being resolved or unresolved by other threads. 427 * 428 * An exclusive lock owner has full authority to associate/disassociate 429 * vnodes and resolve/unresolve the locked ncp. 430 * 431 * A shared lock owner only has authority to acquire the underlying vnode, 432 * if any. 
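 *
 * Illustrative ref-then-lock sketch (hypothetical caller, not part of
 * this file; "some_nch" is a placeholder for an nchandle the caller
 * already has a stable pointer to):
 *
 *	struct nchandle nch;
 *
 *	cache_copy(&some_nch, &nch);	(take our own ref on ncp + mount)
 *	cache_lock(&nch);		(a ref must be held before locking)
 *	... examine or modify nch.ncp ...
 *	cache_unlock(&nch);
 *	cache_drop(&nch);		(drops the ref and NULLs out nch)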
433 * 434 * The primary lock field is nc_lockstatus. nc_locktd is set after the 435 * fact (when locking) or cleared prior to unlocking. 436 * 437 * WARNING! Holding a locked ncp will prevent a vnode from being destroyed 438 * or recycled, but it does NOT help you if the vnode had already 439 * initiated a recyclement. If this is important, use cache_get() 440 * rather then cache_lock() (and deal with the differences in the 441 * way the refs counter is handled). Or, alternatively, make an 442 * unconditional call to cache_validate() or cache_resolve() 443 * after cache_lock() returns. 444 */ 445 static __inline 446 void 447 _cache_lock(struct namecache *ncp) 448 { 449 int didwarn = 0; 450 int error; 451 452 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE); 453 while (__predict_false(error == EWOULDBLOCK)) { 454 if (didwarn == 0) { 455 didwarn = ticks - nclockwarn; 456 kprintf("[diagnostic] cache_lock: " 457 "%s blocked on %p " 458 "\"%*.*s\"\n", 459 curthread->td_comm, ncp, 460 ncp->nc_nlen, ncp->nc_nlen, 461 ncp->nc_name); 462 } 463 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_TIMELOCK); 464 } 465 if (__predict_false(didwarn)) { 466 kprintf("[diagnostic] cache_lock: " 467 "%s unblocked %*.*s after %d secs\n", 468 curthread->td_comm, 469 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 470 (int)(ticks - didwarn) / hz); 471 } 472 } 473 474 /* 475 * Release a previously acquired lock. 476 * 477 * A concurrent shared-lock acquisition or acquisition/release can 478 * race bit 31 so only drop the ncp if bit 31 was set. 479 */ 480 static __inline 481 void 482 _cache_unlock(struct namecache *ncp) 483 { 484 lockmgr(&ncp->nc_lock, LK_RELEASE); 485 } 486 487 /* 488 * Lock ncp exclusively, non-blocking. Return 0 on success. 489 */ 490 static __inline 491 int 492 _cache_lock_nonblock(struct namecache *ncp) 493 { 494 int error; 495 496 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT); 497 if (__predict_false(error != 0)) { 498 return(EWOULDBLOCK); 499 } 500 return 0; 501 } 502 503 /* 504 * This is a special form of _cache_lock() which only succeeds if 505 * it can get a pristine, non-recursive lock. The caller must have 506 * already ref'd the ncp. 507 * 508 * On success the ncp will be locked, on failure it will not. The 509 * ref count does not change either way. 510 * 511 * We want _cache_lock_special() (on success) to return a definitively 512 * usable vnode or a definitively unresolved ncp. 513 */ 514 static __inline 515 int 516 _cache_lock_special(struct namecache *ncp) 517 { 518 if (_cache_lock_nonblock(ncp) == 0) { 519 if (lockmgr_oneexcl(&ncp->nc_lock)) { 520 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 521 _cache_setunresolved(ncp); 522 return 0; 523 } 524 _cache_unlock(ncp); 525 } 526 return EWOULDBLOCK; 527 } 528 529 /* 530 * Shared lock, guarantees vp held 531 * 532 * The shared lock holds vp on the 0->1 transition. It is possible to race 533 * another shared lock release, preventing the other release from dropping 534 * the vnode and clearing bit 31. 535 * 536 * If it is not set then we are responsible for setting it, and this 537 * responsibility does not race with anyone else. 
538 */ 539 static __inline 540 void 541 _cache_lock_shared(struct namecache *ncp) 542 { 543 int didwarn = 0; 544 int error; 545 546 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK); 547 while (__predict_false(error == EWOULDBLOCK)) { 548 if (didwarn == 0) { 549 didwarn = ticks - nclockwarn; 550 kprintf("[diagnostic] cache_lock_shared: " 551 "%s blocked on %p " 552 "\"%*.*s\"\n", 553 curthread->td_comm, ncp, 554 ncp->nc_nlen, ncp->nc_nlen, 555 ncp->nc_name); 556 } 557 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK); 558 } 559 if (__predict_false(didwarn)) { 560 kprintf("[diagnostic] cache_lock_shared: " 561 "%s unblocked %*.*s after %d secs\n", 562 curthread->td_comm, 563 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 564 (int)(ticks - didwarn) / hz); 565 } 566 } 567 568 /* 569 * Shared lock, guarantees vp held. Non-blocking. Returns 0 on success 570 */ 571 static __inline 572 int 573 _cache_lock_shared_nonblock(struct namecache *ncp) 574 { 575 int error; 576 577 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT); 578 if (__predict_false(error != 0)) { 579 return(EWOULDBLOCK); 580 } 581 return 0; 582 } 583 584 /* 585 * This function tries to get a shared lock but will back-off to an 586 * exclusive lock if: 587 * 588 * (1) Some other thread is trying to obtain an exclusive lock 589 * (to prevent the exclusive requester from getting livelocked out 590 * by many shared locks). 591 * 592 * (2) The current thread already owns an exclusive lock (to avoid 593 * deadlocking). 594 * 595 * WARNING! On machines with lots of cores we really want to try hard to 596 * get a shared lock or concurrent path lookups can chain-react 597 * into a very high-latency exclusive lock. 598 * 599 * This is very evident in dsynth's initial scans. 600 */ 601 static __inline 602 int 603 _cache_lock_shared_special(struct namecache *ncp) 604 { 605 /* 606 * Only honor a successful shared lock (returning 0) if there is 607 * no exclusive request pending and the vnode, if present, is not 608 * in a reclaimed state. 609 */ 610 if (_cache_lock_shared_nonblock(ncp) == 0) { 611 if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) { 612 if (ncp->nc_vp == NULL || 613 (ncp->nc_vp->v_flag & VRECLAIMED) == 0) { 614 return(0); 615 } 616 } 617 _cache_unlock(ncp); 618 return(EWOULDBLOCK); 619 } 620 621 /* 622 * Non-blocking shared lock failed. If we already own the exclusive 623 * lock just acquire another exclusive lock (instead of deadlocking). 624 * Otherwise acquire a shared lock. 625 */ 626 if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) { 627 _cache_lock(ncp); 628 return(0); 629 } 630 _cache_lock_shared(ncp); 631 return(0); 632 } 633 634 static __inline 635 int 636 _cache_lockstatus(struct namecache *ncp) 637 { 638 int status; 639 640 status = lockstatus(&ncp->nc_lock, curthread); 641 if (status == 0 || status == LK_EXCLOTHER) 642 status = -1; 643 return status; 644 } 645 646 /* 647 * cache_hold() and cache_drop() prevent the premature deletion of a 648 * namecache entry but do not prevent operations (such as zapping) on 649 * that namecache entry. 650 * 651 * This routine may only be called from outside this source module if 652 * nc_refs is already deterministically at least 1, such as being 653 * associated with e.g. a process, file descriptor, or some other entity. 654 * 655 * Only the above situations, similar situations within this module where 656 * the ref count is deterministically at least 1, or when the ncp is found 657 * via the nchpp (hash table) lookup, can bump nc_refs. 
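 *
 * For instance, the fd_ncdir/fd_nrdir nchandles held in a process's
 * filedesc keep nc_refs deterministically at or above 1 for as long as
 * the filedesc points at them, so refs may safely be added through those
 * handles.  (Illustrative example only, not an exhaustive list.)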
 *
 * Very specifically, a ncp found via nc_list CANNOT bump nc_refs.  It
 * can still be removed from the nc_list, however, as long as the caller
 * can acquire its lock (in the wrong order).
 *
 * This is a rare case where callers are allowed to hold a spinlock,
 * so we can't ourselves.
 */
static __inline
struct namecache *
_cache_hold(struct namecache *ncp)
{
	KKASSERT(ncp->nc_refs > 0);
	atomic_add_int(&ncp->nc_refs, 1);

	return(ncp);
}

/*
 * Drop a cache entry.
 *
 * The 1->0 transition is special and requires the caller to destroy the
 * entry.  It means that the ncp is no longer on a nchpp list (since that
 * would mean there was still a ref).  The ncp could still be on a nc_list
 * but will not have any child of its own, again because nc_refs is now 0
 * and children would have a ref to their parent.
 *
 * Once the 1->0 transition is made, nc_refs cannot be incremented again.
 */
static __inline
void
_cache_drop(struct namecache *ncp)
{
	if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) {
		/*
		 * Executed unlocked (no need to lock on last drop)
		 */
		_cache_setunresolved(ncp);

		/*
		 * Scrap it.
		 */
		ncp->nc_refs = -1;	/* safety */
		if (ncp->nc_name)
			kfree(ncp->nc_name, M_VFSCACHE);
		kfree(ncp, M_VFSCACHE);
	}
}

/*
 * Link a new namecache entry to its parent and to the hash table.  Be
 * careful to avoid races if vhold() blocks in the future.
 *
 * Both ncp and par must be referenced and locked.  The reference is
 * transferred to the nchpp (and, most notably, NOT to the parent list).
 *
 * NOTE: The hash table spinlock is held across this call, we can't do
 *	 anything fancy.
 */
static void
_cache_link_parent(struct namecache *ncp, struct namecache *par,
		   struct nchash_head *nchpp)
{
	struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

	KKASSERT(ncp->nc_parent == NULL);
	ncp->nc_parent = par;
	ncp->nc_head = nchpp;

	/*
	 * Set inheritance flags.  Note that the parent flags may be
	 * stale due to getattr potentially not having been run yet
	 * (it gets run during nlookup()'s).
	 */
	ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
	if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
		ncp->nc_flag |= NCF_SF_PNOCACHE;
	if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
		ncp->nc_flag |= NCF_UF_PCACHE;

	/*
	 * Add to hash table and parent, adjust accounting
	 */
	TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
	atomic_add_long(&pn->vfscache_count, 1);
	if (TAILQ_EMPTY(&ncp->nc_list))
		atomic_add_long(&pn->vfscache_leafs, 1);

	if (TAILQ_EMPTY(&par->nc_list)) {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
		atomic_add_long(&pn->vfscache_leafs, -1);
		/*
		 * Any vp associated with an ncp which has children must
		 * be held to prevent it from being recycled.
		 */
		if (par->nc_vp)
			vhold(par->nc_vp);
	} else {
		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
	}
	_cache_hold(par);			/* add nc_parent ref */
}

/*
 * Remove the parent and hash associations from a namecache structure.
 * Drop the ref-count on the parent.  The caller receives the ref
 * from the ncp's nchpp linkage that was removed and may forward that
 * ref to a new linkage.
 *
 * The caller usually holds an additional ref on the ncp so the unlink
 * cannot be the final drop.
XXX should not be necessary now since the 769 * caller receives the ref from the nchpp linkage, assuming the ncp 770 * was linked in the first place. 771 * 772 * ncp must be locked, which means that there won't be any nc_parent 773 * removal races. This routine will acquire a temporary lock on 774 * the parent as well as the appropriate hash chain. 775 */ 776 static void 777 _cache_unlink_parent(struct namecache *ncp) 778 { 779 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 780 struct namecache *par; 781 struct vnode *dropvp; 782 struct nchash_head *nchpp; 783 784 if ((par = ncp->nc_parent) != NULL) { 785 cpu_ccfence(); 786 KKASSERT(ncp->nc_parent == par); 787 788 /* don't add a ref, we drop the nchpp ref later */ 789 _cache_lock(par); 790 nchpp = ncp->nc_head; 791 spin_lock(&nchpp->spin); 792 793 /* 794 * Remove from hash table and parent, adjust accounting 795 */ 796 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash); 797 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 798 atomic_add_long(&pn->vfscache_count, -1); 799 if (TAILQ_EMPTY(&ncp->nc_list)) 800 atomic_add_long(&pn->vfscache_leafs, -1); 801 802 dropvp = NULL; 803 if (TAILQ_EMPTY(&par->nc_list)) { 804 atomic_add_long(&pn->vfscache_leafs, 1); 805 if (par->nc_vp) 806 dropvp = par->nc_vp; 807 } 808 ncp->nc_parent = NULL; 809 ncp->nc_head = NULL; 810 spin_unlock(&nchpp->spin); 811 _cache_unlock(par); 812 _cache_drop(par); /* drop nc_parent ref */ 813 814 /* 815 * We can only safely vdrop with no spinlocks held. 816 */ 817 if (dropvp) 818 vdrop(dropvp); 819 } 820 } 821 822 /* 823 * Allocate a new namecache structure. Most of the code does not require 824 * zero-termination of the string but it makes vop_compat_ncreate() easier. 825 * 826 * The returned ncp will be locked and referenced. The ref is generally meant 827 * to be transfered to the nchpp linkage. 828 */ 829 static struct namecache * 830 cache_alloc(int nlen) 831 { 832 struct namecache *ncp; 833 834 ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO); 835 if (nlen) 836 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK); 837 ncp->nc_nlen = nlen; 838 ncp->nc_flag = NCF_UNRESOLVED; 839 ncp->nc_error = ENOTCONN; /* needs to be resolved */ 840 ncp->nc_refs = 1; 841 TAILQ_INIT(&ncp->nc_list); 842 lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE); 843 lockmgr(&ncp->nc_lock, LK_EXCLUSIVE); 844 845 return(ncp); 846 } 847 848 /* 849 * Can only be called for the case where the ncp has never been 850 * associated with anything (so no spinlocks are needed). 851 */ 852 static void 853 _cache_free(struct namecache *ncp) 854 { 855 KKASSERT(ncp->nc_refs == 1); 856 if (ncp->nc_name) 857 kfree(ncp->nc_name, M_VFSCACHE); 858 kfree(ncp, M_VFSCACHE); 859 } 860 861 /* 862 * [re]initialize a nchandle. 863 */ 864 void 865 cache_zero(struct nchandle *nch) 866 { 867 nch->ncp = NULL; 868 nch->mount = NULL; 869 } 870 871 /* 872 * Ref and deref a nchandle structure (ncp + mp) 873 * 874 * The caller must specify a stable ncp pointer, typically meaning the 875 * ncp is already referenced but this can also occur indirectly through 876 * e.g. holding a lock on a direct child. 877 * 878 * WARNING: Caller may hold an unrelated read spinlock, which means we can't 879 * use read spinlocks here. 880 */ 881 struct nchandle * 882 cache_hold(struct nchandle *nch) 883 { 884 _cache_hold(nch->ncp); 885 _cache_mntref(nch->mount); 886 return(nch); 887 } 888 889 /* 890 * Create a copy of a namecache handle for an already-referenced 891 * entry. 
892 */ 893 void 894 cache_copy(struct nchandle *nch, struct nchandle *target) 895 { 896 struct namecache *ncp; 897 struct mount *mp; 898 struct mntcache_elm *elm; 899 struct namecache *ncpr; 900 int i; 901 902 ncp = nch->ncp; 903 mp = nch->mount; 904 target->ncp = ncp; 905 target->mount = mp; 906 907 elm = _cache_mntcache_hash(ncp); 908 for (i = 0; i < MNTCACHE_SET; ++i) { 909 if (elm->ncp == ncp) { 910 ncpr = atomic_swap_ptr((void *)&elm->ncp, NULL); 911 if (ncpr == ncp) { 912 _cache_mntref(mp); 913 return; 914 } 915 if (ncpr) 916 _cache_drop(ncpr); 917 } 918 ++elm; 919 } 920 if (ncp) 921 _cache_hold(ncp); 922 _cache_mntref(mp); 923 } 924 925 /* 926 * Drop the nchandle, but try to cache the ref to avoid global atomic 927 * ops. This is typically done on the system root and jail root nchandles. 928 */ 929 void 930 cache_drop_and_cache(struct nchandle *nch, int elmno) 931 { 932 struct mntcache_elm *elm; 933 struct mntcache_elm *best; 934 struct namecache *ncpr; 935 int delta1; 936 int delta2; 937 int i; 938 939 if (elmno > 4) { 940 if (nch->ncp) { 941 _cache_drop(nch->ncp); 942 nch->ncp = NULL; 943 } 944 if (nch->mount) { 945 _cache_mntrel(nch->mount); 946 nch->mount = NULL; 947 } 948 return; 949 } 950 951 elm = _cache_mntcache_hash(nch->ncp); 952 best = elm; 953 for (i = 0; i < MNTCACHE_SET; ++i) { 954 if (elm->ncp == NULL) { 955 ncpr = atomic_swap_ptr((void *)&elm->ncp, nch->ncp); 956 _cache_mntrel(nch->mount); 957 elm->ticks = ticks; 958 nch->mount = NULL; 959 nch->ncp = NULL; 960 if (ncpr) 961 _cache_drop(ncpr); 962 return; 963 } 964 delta1 = ticks - best->ticks; 965 delta2 = ticks - elm->ticks; 966 if (delta2 > delta1 || delta1 < -1 || delta2 < -1) 967 best = elm; 968 ++elm; 969 } 970 ncpr = atomic_swap_ptr((void *)&best->ncp, nch->ncp); 971 _cache_mntrel(nch->mount); 972 best->ticks = ticks; 973 nch->mount = NULL; 974 nch->ncp = NULL; 975 if (ncpr) 976 _cache_drop(ncpr); 977 } 978 979 void 980 cache_changemount(struct nchandle *nch, struct mount *mp) 981 { 982 _cache_mntref(mp); 983 _cache_mntrel(nch->mount); 984 nch->mount = mp; 985 } 986 987 void 988 cache_drop(struct nchandle *nch) 989 { 990 _cache_mntrel(nch->mount); 991 _cache_drop(nch->ncp); 992 nch->ncp = NULL; 993 nch->mount = NULL; 994 } 995 996 int 997 cache_lockstatus(struct nchandle *nch) 998 { 999 return(_cache_lockstatus(nch->ncp)); 1000 } 1001 1002 void 1003 cache_lock(struct nchandle *nch) 1004 { 1005 _cache_lock(nch->ncp); 1006 } 1007 1008 void 1009 cache_lock_maybe_shared(struct nchandle *nch, int excl) 1010 { 1011 struct namecache *ncp = nch->ncp; 1012 1013 if (ncp_shared_lock_disable || excl || 1014 (ncp->nc_flag & NCF_UNRESOLVED)) { 1015 _cache_lock(ncp); 1016 } else { 1017 _cache_lock_shared(ncp); 1018 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1019 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1020 _cache_unlock(ncp); 1021 _cache_lock(ncp); 1022 } 1023 } else { 1024 _cache_unlock(ncp); 1025 _cache_lock(ncp); 1026 } 1027 } 1028 } 1029 1030 /* 1031 * Relock nch1 given an unlocked nch1 and a locked nch2. The caller 1032 * is responsible for checking both for validity on return as they 1033 * may have become invalid. 1034 * 1035 * We have to deal with potential deadlocks here, just ping pong 1036 * the lock until we get it (we will always block somewhere when 1037 * looping so this is not cpu-intensive). 
1038 * 1039 * which = 0 nch1 not locked, nch2 is locked 1040 * which = 1 nch1 is locked, nch2 is not locked 1041 */ 1042 void 1043 cache_relock(struct nchandle *nch1, struct ucred *cred1, 1044 struct nchandle *nch2, struct ucred *cred2) 1045 { 1046 int which; 1047 1048 which = 0; 1049 1050 for (;;) { 1051 if (which == 0) { 1052 if (cache_lock_nonblock(nch1) == 0) { 1053 cache_resolve(nch1, cred1); 1054 break; 1055 } 1056 cache_unlock(nch2); 1057 cache_lock(nch1); 1058 cache_resolve(nch1, cred1); 1059 which = 1; 1060 } else { 1061 if (cache_lock_nonblock(nch2) == 0) { 1062 cache_resolve(nch2, cred2); 1063 break; 1064 } 1065 cache_unlock(nch1); 1066 cache_lock(nch2); 1067 cache_resolve(nch2, cred2); 1068 which = 0; 1069 } 1070 } 1071 } 1072 1073 int 1074 cache_lock_nonblock(struct nchandle *nch) 1075 { 1076 return(_cache_lock_nonblock(nch->ncp)); 1077 } 1078 1079 void 1080 cache_unlock(struct nchandle *nch) 1081 { 1082 _cache_unlock(nch->ncp); 1083 } 1084 1085 /* 1086 * ref-and-lock, unlock-and-deref functions. 1087 * 1088 * This function is primarily used by nlookup. Even though cache_lock 1089 * holds the vnode, it is possible that the vnode may have already 1090 * initiated a recyclement. 1091 * 1092 * We want cache_get() to return a definitively usable vnode or a 1093 * definitively unresolved ncp. 1094 */ 1095 static 1096 struct namecache * 1097 _cache_get(struct namecache *ncp) 1098 { 1099 _cache_hold(ncp); 1100 _cache_lock(ncp); 1101 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 1102 _cache_setunresolved(ncp); 1103 return(ncp); 1104 } 1105 1106 /* 1107 * Attempt to obtain a shared lock on the ncp. A shared lock will only 1108 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is 1109 * valid. Otherwise an exclusive lock will be acquired instead. 1110 */ 1111 static 1112 struct namecache * 1113 _cache_get_maybe_shared(struct namecache *ncp, int excl) 1114 { 1115 if (ncp_shared_lock_disable || excl || 1116 (ncp->nc_flag & NCF_UNRESOLVED)) { 1117 return(_cache_get(ncp)); 1118 } 1119 _cache_hold(ncp); 1120 _cache_lock_shared(ncp); 1121 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1122 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1123 _cache_unlock(ncp); 1124 ncp = _cache_get(ncp); 1125 _cache_drop(ncp); 1126 } 1127 } else { 1128 _cache_unlock(ncp); 1129 ncp = _cache_get(ncp); 1130 _cache_drop(ncp); 1131 } 1132 return(ncp); 1133 } 1134 1135 /* 1136 * NOTE: The same nchandle can be passed for both arguments. 1137 */ 1138 void 1139 cache_get(struct nchandle *nch, struct nchandle *target) 1140 { 1141 KKASSERT(nch->ncp->nc_refs > 0); 1142 target->mount = nch->mount; 1143 target->ncp = _cache_get(nch->ncp); 1144 _cache_mntref(target->mount); 1145 } 1146 1147 void 1148 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl) 1149 { 1150 KKASSERT(nch->ncp->nc_refs > 0); 1151 target->mount = nch->mount; 1152 target->ncp = _cache_get_maybe_shared(nch->ncp, excl); 1153 _cache_mntref(target->mount); 1154 } 1155 1156 /* 1157 * Release a held and locked ncp 1158 */ 1159 static __inline 1160 void 1161 _cache_put(struct namecache *ncp) 1162 { 1163 _cache_unlock(ncp); 1164 _cache_drop(ncp); 1165 } 1166 1167 void 1168 cache_put(struct nchandle *nch) 1169 { 1170 _cache_mntrel(nch->mount); 1171 _cache_put(nch->ncp); 1172 nch->ncp = NULL; 1173 nch->mount = NULL; 1174 } 1175 1176 /* 1177 * Resolve an unresolved ncp by associating a vnode with it. If the 1178 * vnode is NULL, a negative cache entry is created. 
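 *
 * Illustrative sketch of how a VFS nresolve method typically finishes,
 * via the cache_setvp() wrapper below (hypothetical filesystem code;
 * my_fs_lookup() and dvp are placeholders):
 *
 *	error = my_fs_lookup(dvp, ncp->nc_name, ncp->nc_nlen, &vp);
 *	if (error == 0) {
 *		cache_setvp(ap->a_nch, vp);	(positive entry)
 *		vrele(vp);
 *	} else if (error == ENOENT) {
 *		cache_setvp(ap->a_nch, NULL);	(negative entry)
 *	}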
1179 * 1180 * The ncp should be locked on entry and will remain locked on return. 1181 */ 1182 static 1183 void 1184 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp) 1185 { 1186 KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) && 1187 (_cache_lockstatus(ncp) == LK_EXCLUSIVE) && 1188 ncp->nc_vp == NULL); 1189 1190 if (vp) { 1191 /* 1192 * Any vp associated with an ncp which has children must 1193 * be held. Any vp associated with a locked ncp must be held. 1194 */ 1195 if (!TAILQ_EMPTY(&ncp->nc_list)) 1196 vhold(vp); 1197 spin_lock(&vp->v_spin); 1198 ncp->nc_vp = vp; 1199 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode); 1200 ++vp->v_namecache_count; 1201 _cache_hold(ncp); /* v_namecache assoc */ 1202 spin_unlock(&vp->v_spin); 1203 vhold(vp); /* nc_vp */ 1204 1205 /* 1206 * Set auxiliary flags 1207 */ 1208 switch(vp->v_type) { 1209 case VDIR: 1210 ncp->nc_flag |= NCF_ISDIR; 1211 break; 1212 case VLNK: 1213 ncp->nc_flag |= NCF_ISSYMLINK; 1214 /* XXX cache the contents of the symlink */ 1215 break; 1216 default: 1217 break; 1218 } 1219 1220 ncp->nc_error = 0; 1221 1222 /* 1223 * XXX: this is a hack to work-around the lack of a real pfs vfs 1224 * implementation 1225 */ 1226 if (mp) { 1227 if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0) 1228 vp->v_pfsmp = mp; 1229 } 1230 } else { 1231 /* 1232 * When creating a negative cache hit we set the 1233 * namecache_gen. A later resolve will clean out the 1234 * negative cache hit if the mount point's namecache_gen 1235 * has changed. Used by devfs, could also be used by 1236 * other remote FSs. 1237 */ 1238 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 1239 1240 ncp->nc_vp = NULL; 1241 ncp->nc_negcpu = mycpu->gd_cpuid; 1242 spin_lock(&pn->neg_spin); 1243 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode); 1244 _cache_hold(ncp); /* neg_list assoc */ 1245 ++pn->neg_count; 1246 spin_unlock(&pn->neg_spin); 1247 atomic_add_long(&pn->vfscache_negs, 1); 1248 1249 ncp->nc_error = ENOENT; 1250 if (mp) 1251 VFS_NCPGEN_SET(mp, ncp); 1252 } 1253 ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP); 1254 } 1255 1256 void 1257 cache_setvp(struct nchandle *nch, struct vnode *vp) 1258 { 1259 _cache_setvp(nch->mount, nch->ncp, vp); 1260 } 1261 1262 /* 1263 * Used for NFS 1264 */ 1265 void 1266 cache_settimeout(struct nchandle *nch, int nticks) 1267 { 1268 struct namecache *ncp = nch->ncp; 1269 1270 if ((ncp->nc_timeout = ticks + nticks) == 0) 1271 ncp->nc_timeout = 1; 1272 } 1273 1274 /* 1275 * Disassociate the vnode or negative-cache association and mark a 1276 * namecache entry as unresolved again. Note that the ncp is still 1277 * left in the hash table and still linked to its parent. 1278 * 1279 * The ncp should be locked and refd on entry and will remain locked and refd 1280 * on return. 1281 * 1282 * This routine is normally never called on a directory containing children. 1283 * However, NFS often does just that in its rename() code as a cop-out to 1284 * avoid complex namespace operations. This disconnects a directory vnode 1285 * from its namecache and can cause the OLDAPI and NEWAPI to get out of 1286 * sync. 
1287 * 1288 */ 1289 static 1290 void 1291 _cache_setunresolved(struct namecache *ncp) 1292 { 1293 struct vnode *vp; 1294 1295 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1296 ncp->nc_flag |= NCF_UNRESOLVED; 1297 ncp->nc_timeout = 0; 1298 ncp->nc_error = ENOTCONN; 1299 if ((vp = ncp->nc_vp) != NULL) { 1300 spin_lock(&vp->v_spin); 1301 ncp->nc_vp = NULL; 1302 TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode); 1303 --vp->v_namecache_count; 1304 spin_unlock(&vp->v_spin); 1305 1306 /* 1307 * Any vp associated with an ncp with children is 1308 * held by that ncp. Any vp associated with ncp 1309 * is held by that ncp. These conditions must be 1310 * undone when the vp is cleared out from the ncp. 1311 */ 1312 if (!TAILQ_EMPTY(&ncp->nc_list)) 1313 vdrop(vp); 1314 vdrop(vp); 1315 } else { 1316 struct pcpu_ncache *pn; 1317 1318 pn = &pcpu_ncache[ncp->nc_negcpu]; 1319 1320 atomic_add_long(&pn->vfscache_negs, -1); 1321 spin_lock(&pn->neg_spin); 1322 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode); 1323 --pn->neg_count; 1324 spin_unlock(&pn->neg_spin); 1325 } 1326 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK); 1327 _cache_drop(ncp); /* from v_namecache or neg_list */ 1328 } 1329 } 1330 1331 /* 1332 * The cache_nresolve() code calls this function to automatically 1333 * set a resolved cache element to unresolved if it has timed out 1334 * or if it is a negative cache hit and the mount point namecache_gen 1335 * has changed. 1336 */ 1337 static __inline int 1338 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp) 1339 { 1340 /* 1341 * Try to zap entries that have timed out. We have 1342 * to be careful here because locked leafs may depend 1343 * on the vnode remaining intact in a parent, so only 1344 * do this under very specific conditions. 1345 */ 1346 if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 && 1347 TAILQ_EMPTY(&ncp->nc_list)) { 1348 return 1; 1349 } 1350 1351 /* 1352 * If a resolved negative cache hit is invalid due to 1353 * the mount's namecache generation being bumped, zap it. 1354 */ 1355 if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) { 1356 return 1; 1357 } 1358 1359 /* 1360 * Otherwise we are good 1361 */ 1362 return 0; 1363 } 1364 1365 static __inline void 1366 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp) 1367 { 1368 /* 1369 * Already in an unresolved state, nothing to do. 1370 */ 1371 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1372 if (_cache_auto_unresolve_test(mp, ncp)) 1373 _cache_setunresolved(ncp); 1374 } 1375 } 1376 1377 void 1378 cache_setunresolved(struct nchandle *nch) 1379 { 1380 _cache_setunresolved(nch->ncp); 1381 } 1382 1383 /* 1384 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist 1385 * looking for matches. This flag tells the lookup code when it must 1386 * check for a mount linkage and also prevents the directories in question 1387 * from being deleted or renamed. 1388 */ 1389 static 1390 int 1391 cache_clrmountpt_callback(struct mount *mp, void *data) 1392 { 1393 struct nchandle *nch = data; 1394 1395 if (mp->mnt_ncmounton.ncp == nch->ncp) 1396 return(1); 1397 if (mp->mnt_ncmountpt.ncp == nch->ncp) 1398 return(1); 1399 return(0); 1400 } 1401 1402 /* 1403 * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated 1404 * with a mount point. 
 */
void
cache_clrmountpt(struct nchandle *nch)
{
	int count;

	count = mountlist_scan(cache_clrmountpt_callback, nch,
			       MNTSCAN_FORWARD | MNTSCAN_NOBUSY |
			       MNTSCAN_NOUNLOCK);
	if (count == 0)
		nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
}

/*
 * Invalidate portions of the namecache topology given a starting entry.
 * The passed ncp is set to an unresolved state.
 *
 * The passed ncp must be referenced and locked.  The routine may unlock
 * and relock ncp several times, and will recheck the children and loop
 * to catch races.  When done the passed ncp will be returned with the
 * reference and lock intact.
 *
 * CINV_DESTROY		- Set a flag in the passed ncp entry indicating
 *			  that the physical underlying nodes have been
 *			  destroyed... as in deleted.  For example, when
 *			  a directory is removed.  This will cause record
 *			  lookups on the name to no longer be able to find
 *			  the record and tells the resolver to return failure
 *			  rather than trying to resolve through the parent.
 *
 *			  The topology itself, including ncp->nc_name,
 *			  remains intact.
 *
 *			  This only applies to the passed ncp, if CINV_CHILDREN
 *			  is specified the children are not flagged.
 *
 * CINV_CHILDREN	- Set all children (recursively) to an unresolved
 *			  state as well.
 *
 *			  Note that this will also have the side effect of
 *			  cleaning out any unreferenced nodes in the topology
 *			  from the leaves up as the recursion backs out.
 *
 * Note that the topology for any referenced nodes remains intact, but
 * the nodes will be marked as having been destroyed and will be set
 * to an unresolved state.
 *
 * It is possible for cache_inval() to race a cache_resolve(), meaning that
 * the namecache entry may not actually be invalidated on return if it was
 * revalidated while recursing down into its children.  This code guarantees
 * that the node(s) will go through an invalidation cycle, but does not
 * guarantee that they will remain in an invalidated state.
 *
 * Returns non-zero if a revalidation was detected during the invalidation
 * recursion, zero otherwise.  Note that since only the original ncp is
 * locked the revalidation ultimately can only indicate that the original ncp
 * *MIGHT NOT* have been re-resolved.
 *
 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
 * have to avoid blowing out the kernel stack.  We do this by saving the
 * deep namecache node and aborting the recursion, then re-recursing at that
 * node using a depth-first algorithm in order to allow multiple deep
 * recursions to chain through each other, then we restart the invalidation
 * from scratch.
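 *
 * Illustrative invocation (hypothetical caller, not part of this file):
 * destroying a directory would typically be followed by
 *
 *	cache_inval(nch, CINV_DESTROY | CINV_CHILDREN);
 *
 * to mark the node destroyed and force its children back to an
 * unresolved state.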
1469 */ 1470 1471 struct cinvtrack { 1472 struct namecache *resume_ncp; 1473 int depth; 1474 }; 1475 1476 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *); 1477 1478 static 1479 int 1480 _cache_inval(struct namecache *ncp, int flags) 1481 { 1482 struct cinvtrack track; 1483 struct namecache *ncp2; 1484 int r; 1485 1486 track.depth = 0; 1487 track.resume_ncp = NULL; 1488 1489 for (;;) { 1490 r = _cache_inval_internal(ncp, flags, &track); 1491 if (track.resume_ncp == NULL) 1492 break; 1493 _cache_unlock(ncp); 1494 while ((ncp2 = track.resume_ncp) != NULL) { 1495 track.resume_ncp = NULL; 1496 _cache_lock(ncp2); 1497 _cache_inval_internal(ncp2, flags & ~CINV_DESTROY, 1498 &track); 1499 /*_cache_put(ncp2);*/ 1500 cache_zap(ncp2); 1501 } 1502 _cache_lock(ncp); 1503 } 1504 return(r); 1505 } 1506 1507 int 1508 cache_inval(struct nchandle *nch, int flags) 1509 { 1510 return(_cache_inval(nch->ncp, flags)); 1511 } 1512 1513 /* 1514 * Helper for _cache_inval(). The passed ncp is refd and locked and 1515 * remains that way on return, but may be unlocked/relocked multiple 1516 * times by the routine. 1517 */ 1518 static int 1519 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track) 1520 { 1521 struct namecache *nextkid; 1522 int rcnt = 0; 1523 1524 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 1525 1526 _cache_setunresolved(ncp); 1527 if (flags & CINV_DESTROY) { 1528 ncp->nc_flag |= NCF_DESTROYED; 1529 ++ncp->nc_generation; 1530 } 1531 1532 while ((flags & CINV_CHILDREN) && 1533 (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL 1534 ) { 1535 struct namecache *kid; 1536 int restart; 1537 1538 restart = 0; 1539 _cache_hold(nextkid); 1540 if (++track->depth > MAX_RECURSION_DEPTH) { 1541 track->resume_ncp = ncp; 1542 _cache_hold(ncp); 1543 ++rcnt; 1544 } 1545 while ((kid = nextkid) != NULL) { 1546 /* 1547 * Parent (ncp) must be locked for the iteration. 1548 */ 1549 nextkid = NULL; 1550 if (kid->nc_parent != ncp) { 1551 _cache_drop(kid); 1552 kprintf("cache_inval_internal restartA %s\n", 1553 ncp->nc_name); 1554 restart = 1; 1555 break; 1556 } 1557 if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL) 1558 _cache_hold(nextkid); 1559 1560 /* 1561 * Parent unlocked for this section to avoid 1562 * deadlocks. Then lock the kid and check for 1563 * races. 1564 */ 1565 _cache_unlock(ncp); 1566 if (track->resume_ncp) { 1567 _cache_drop(kid); 1568 _cache_lock(ncp); 1569 break; 1570 } 1571 _cache_lock(kid); 1572 if (kid->nc_parent != ncp) { 1573 kprintf("cache_inval_internal " 1574 "restartB %s\n", 1575 ncp->nc_name); 1576 restart = 1; 1577 _cache_unlock(kid); 1578 _cache_drop(kid); 1579 _cache_lock(ncp); 1580 break; 1581 } 1582 if ((kid->nc_flag & NCF_UNRESOLVED) == 0 || 1583 TAILQ_FIRST(&kid->nc_list) 1584 ) { 1585 1586 rcnt += _cache_inval_internal(kid, 1587 flags & ~CINV_DESTROY, track); 1588 /*_cache_unlock(kid);*/ 1589 /*_cache_drop(kid);*/ 1590 cache_zap(kid); 1591 } else { 1592 cache_zap(kid); 1593 } 1594 1595 /* 1596 * Relock parent to continue scan 1597 */ 1598 _cache_lock(ncp); 1599 } 1600 if (nextkid) 1601 _cache_drop(nextkid); 1602 --track->depth; 1603 if (restart == 0) 1604 break; 1605 } 1606 1607 /* 1608 * Someone could have gotten in there while ncp was unlocked, 1609 * retry if so. 1610 */ 1611 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 1612 ++rcnt; 1613 return (rcnt); 1614 } 1615 1616 /* 1617 * Invalidate a vnode's namecache associations. 
To avoid races against 1618 * the resolver we do not invalidate a node which we previously invalidated 1619 * but which was then re-resolved while we were in the invalidation loop. 1620 * 1621 * Returns non-zero if any namecache entries remain after the invalidation 1622 * loop completed. 1623 * 1624 * NOTE: Unlike the namecache topology which guarentees that ncp's will not 1625 * be ripped out of the topology while held, the vnode's v_namecache 1626 * list has no such restriction. NCP's can be ripped out of the list 1627 * at virtually any time if not locked, even if held. 1628 * 1629 * In addition, the v_namecache list itself must be locked via 1630 * the vnode's spinlock. 1631 */ 1632 int 1633 cache_inval_vp(struct vnode *vp, int flags) 1634 { 1635 struct namecache *ncp; 1636 struct namecache *next; 1637 1638 restart: 1639 spin_lock(&vp->v_spin); 1640 ncp = TAILQ_FIRST(&vp->v_namecache); 1641 if (ncp) 1642 _cache_hold(ncp); 1643 while (ncp) { 1644 /* loop entered with ncp held and vp spin-locked */ 1645 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1646 _cache_hold(next); 1647 spin_unlock(&vp->v_spin); 1648 _cache_lock(ncp); 1649 if (ncp->nc_vp != vp) { 1650 kprintf("Warning: cache_inval_vp: race-A detected on " 1651 "%s\n", ncp->nc_name); 1652 _cache_put(ncp); 1653 if (next) 1654 _cache_drop(next); 1655 goto restart; 1656 } 1657 _cache_inval(ncp, flags); 1658 _cache_put(ncp); /* also releases reference */ 1659 ncp = next; 1660 spin_lock(&vp->v_spin); 1661 if (ncp && ncp->nc_vp != vp) { 1662 spin_unlock(&vp->v_spin); 1663 kprintf("Warning: cache_inval_vp: race-B detected on " 1664 "%s\n", ncp->nc_name); 1665 _cache_drop(ncp); 1666 goto restart; 1667 } 1668 } 1669 spin_unlock(&vp->v_spin); 1670 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1671 } 1672 1673 /* 1674 * This routine is used instead of the normal cache_inval_vp() when we 1675 * are trying to recycle otherwise good vnodes. 1676 * 1677 * Return 0 on success, non-zero if not all namecache records could be 1678 * disassociated from the vnode (for various reasons). 1679 */ 1680 int 1681 cache_inval_vp_nonblock(struct vnode *vp) 1682 { 1683 struct namecache *ncp; 1684 struct namecache *next; 1685 1686 spin_lock(&vp->v_spin); 1687 ncp = TAILQ_FIRST(&vp->v_namecache); 1688 if (ncp) 1689 _cache_hold(ncp); 1690 while (ncp) { 1691 /* loop entered with ncp held */ 1692 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1693 _cache_hold(next); 1694 spin_unlock(&vp->v_spin); 1695 if (_cache_lock_nonblock(ncp)) { 1696 _cache_drop(ncp); 1697 if (next) 1698 _cache_drop(next); 1699 goto done; 1700 } 1701 if (ncp->nc_vp != vp) { 1702 kprintf("Warning: cache_inval_vp: race-A detected on " 1703 "%s\n", ncp->nc_name); 1704 _cache_put(ncp); 1705 if (next) 1706 _cache_drop(next); 1707 goto done; 1708 } 1709 _cache_inval(ncp, 0); 1710 _cache_put(ncp); /* also releases reference */ 1711 ncp = next; 1712 spin_lock(&vp->v_spin); 1713 if (ncp && ncp->nc_vp != vp) { 1714 spin_unlock(&vp->v_spin); 1715 kprintf("Warning: cache_inval_vp: race-B detected on " 1716 "%s\n", ncp->nc_name); 1717 _cache_drop(ncp); 1718 goto done; 1719 } 1720 } 1721 spin_unlock(&vp->v_spin); 1722 done: 1723 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1724 } 1725 1726 /* 1727 * Clears the universal directory search 'ok' flag. This flag allows 1728 * nlookup() to bypass normal vnode checks. This flag is a cached flag 1729 * so clearing it simply forces revalidation. 
1730 */ 1731 void 1732 cache_inval_wxok(struct vnode *vp) 1733 { 1734 struct namecache *ncp; 1735 1736 spin_lock(&vp->v_spin); 1737 TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) { 1738 if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX)) 1739 atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX); 1740 } 1741 spin_unlock(&vp->v_spin); 1742 } 1743 1744 /* 1745 * The source ncp has been renamed to the target ncp. Both fncp and tncp 1746 * must be locked. The target ncp is destroyed (as a normal rename-over 1747 * would destroy the target file or directory). 1748 * 1749 * Because there may be references to the source ncp we cannot copy its 1750 * contents to the target. Instead the source ncp is relinked as the target 1751 * and the target ncp is removed from the namecache topology. 1752 */ 1753 void 1754 cache_rename(struct nchandle *fnch, struct nchandle *tnch) 1755 { 1756 struct namecache *fncp = fnch->ncp; 1757 struct namecache *tncp = tnch->ncp; 1758 struct namecache *tncp_par; 1759 struct nchash_head *nchpp; 1760 u_int32_t hash; 1761 char *oname; 1762 char *nname; 1763 1764 ++fncp->nc_generation; 1765 ++tncp->nc_generation; 1766 if (tncp->nc_nlen) { 1767 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK); 1768 bcopy(tncp->nc_name, nname, tncp->nc_nlen); 1769 nname[tncp->nc_nlen] = 0; 1770 } else { 1771 nname = NULL; 1772 } 1773 1774 /* 1775 * Rename fncp (unlink) 1776 */ 1777 _cache_unlink_parent(fncp); 1778 oname = fncp->nc_name; 1779 fncp->nc_name = nname; 1780 fncp->nc_nlen = tncp->nc_nlen; 1781 if (oname) 1782 kfree(oname, M_VFSCACHE); 1783 1784 tncp_par = tncp->nc_parent; 1785 _cache_hold(tncp_par); 1786 _cache_lock(tncp_par); 1787 1788 /* 1789 * Rename fncp (relink) 1790 */ 1791 hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT); 1792 hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash); 1793 nchpp = NCHHASH(hash); 1794 1795 spin_lock(&nchpp->spin); 1796 _cache_link_parent(fncp, tncp_par, nchpp); 1797 spin_unlock(&nchpp->spin); 1798 1799 _cache_put(tncp_par); 1800 1801 /* 1802 * Get rid of the overwritten tncp (unlink) 1803 */ 1804 _cache_unlink(tncp); 1805 } 1806 1807 /* 1808 * Perform actions consistent with unlinking a file. The passed-in ncp 1809 * must be locked. 1810 * 1811 * The ncp is marked DESTROYED so it no longer shows up in searches, 1812 * and will be physically deleted when the vnode goes away. 1813 * 1814 * If the related vnode has no refs then we cycle it through vget()/vput() 1815 * to (possibly if we don't have a ref race) trigger a deactivation, 1816 * allowing the VFS to trivially detect and recycle the deleted vnode 1817 * via VOP_INACTIVE(). 1818 * 1819 * NOTE: _cache_rename() will automatically call _cache_unlink() on the 1820 * target ncp. 1821 */ 1822 void 1823 cache_unlink(struct nchandle *nch) 1824 { 1825 _cache_unlink(nch->ncp); 1826 } 1827 1828 static void 1829 _cache_unlink(struct namecache *ncp) 1830 { 1831 struct vnode *vp; 1832 1833 /* 1834 * Causes lookups to fail and allows another ncp with the same 1835 * name to be created under ncp->nc_parent. 1836 */ 1837 ncp->nc_flag |= NCF_DESTROYED; 1838 ++ncp->nc_generation; 1839 1840 /* 1841 * Attempt to trigger a deactivation. Set VREF_FINALIZE to 1842 * force action on the 1->0 transition. 
1843 */ 1844 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 1845 (vp = ncp->nc_vp) != NULL) { 1846 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE); 1847 if (VREFCNT(vp) <= 0) { 1848 if (vget(vp, LK_SHARED) == 0) 1849 vput(vp); 1850 } 1851 } 1852 } 1853 1854 /* 1855 * Return non-zero if the nch might be associated with an open and/or mmap()'d 1856 * file. The easy solution is to just return non-zero if the vnode has refs. 1857 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to 1858 * force the reclaim). 1859 */ 1860 int 1861 cache_isopen(struct nchandle *nch) 1862 { 1863 struct vnode *vp; 1864 struct namecache *ncp = nch->ncp; 1865 1866 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 1867 (vp = ncp->nc_vp) != NULL && 1868 VREFCNT(vp)) { 1869 return 1; 1870 } 1871 return 0; 1872 } 1873 1874 1875 /* 1876 * vget the vnode associated with the namecache entry. Resolve the namecache 1877 * entry if necessary. The passed ncp must be referenced and locked. If 1878 * the ncp is resolved it might be locked shared. 1879 * 1880 * lk_type may be LK_SHARED, LK_EXCLUSIVE. A ref'd, possibly locked 1881 * (depending on the passed lk_type) will be returned in *vpp with an error 1882 * of 0, or NULL will be returned in *vpp with a non-0 error code. The 1883 * most typical error is ENOENT, meaning that the ncp represents a negative 1884 * cache hit and there is no vnode to retrieve, but other errors can occur 1885 * too. 1886 * 1887 * The vget() can race a reclaim. If this occurs we re-resolve the 1888 * namecache entry. 1889 * 1890 * There are numerous places in the kernel where vget() is called on a 1891 * vnode while one or more of its namecache entries is locked. Releasing 1892 * a vnode never deadlocks against locked namecache entries (the vnode 1893 * will not get recycled while referenced ncp's exist). This means we 1894 * can safely acquire the vnode. In fact, we MUST NOT release the ncp 1895 * lock when acquiring the vp lock or we might cause a deadlock. 1896 * 1897 * NOTE: The passed-in ncp must be locked exclusively if it is initially 1898 * unresolved. If a reclaim race occurs the passed-in ncp will be 1899 * relocked exclusively before being re-resolved. 1900 */ 1901 int 1902 cache_vget(struct nchandle *nch, struct ucred *cred, 1903 int lk_type, struct vnode **vpp) 1904 { 1905 struct namecache *ncp; 1906 struct vnode *vp; 1907 int error; 1908 1909 ncp = nch->ncp; 1910 again: 1911 vp = NULL; 1912 if (ncp->nc_flag & NCF_UNRESOLVED) 1913 error = cache_resolve(nch, cred); 1914 else 1915 error = 0; 1916 1917 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 1918 error = vget(vp, lk_type); 1919 if (error) { 1920 /* 1921 * VRECLAIM race 1922 * 1923 * The ncp may have been locked shared, we must relock 1924 * it exclusively before we can set it to unresolved. 1925 */ 1926 if (error == ENOENT) { 1927 kprintf("Warning: vnode reclaim race detected " 1928 "in cache_vget on %p (%s)\n", 1929 vp, ncp->nc_name); 1930 _cache_unlock(ncp); 1931 _cache_lock(ncp); 1932 _cache_setunresolved(ncp); 1933 goto again; 1934 } 1935 1936 /* 1937 * Not a reclaim race, some other error. 1938 */ 1939 KKASSERT(ncp->nc_vp == vp); 1940 vp = NULL; 1941 } else { 1942 KKASSERT(ncp->nc_vp == vp); 1943 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 1944 } 1945 } 1946 if (error == 0 && vp == NULL) 1947 error = ENOENT; 1948 *vpp = vp; 1949 return(error); 1950 } 1951 1952 /* 1953 * Similar to cache_vget() but only acquires a ref on the vnode. 
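 *
 * Illustrative use (hypothetical caller, not part of this file):
 *
 *	error = cache_vref(&nch, cred, &vp);
 *	if (error == 0) {
 *		... use the referenced, unlocked vp ...
 *		vrele(vp);
 *	}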
1954 * 1955 * NOTE: The passed-in ncp must be locked exclusively if it is initially 1956 * unresolved. If a reclaim race occurs the passed-in ncp will be 1957 * relocked exclusively before being re-resolved. 1958 */ 1959 int 1960 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp) 1961 { 1962 struct namecache *ncp; 1963 struct vnode *vp; 1964 int error; 1965 1966 ncp = nch->ncp; 1967 again: 1968 vp = NULL; 1969 if (ncp->nc_flag & NCF_UNRESOLVED) 1970 error = cache_resolve(nch, cred); 1971 else 1972 error = 0; 1973 1974 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 1975 error = vget(vp, LK_SHARED); 1976 if (error) { 1977 /* 1978 * VRECLAIM race 1979 */ 1980 if (error == ENOENT) { 1981 kprintf("Warning: vnode reclaim race detected " 1982 "in cache_vget on %p (%s)\n", 1983 vp, ncp->nc_name); 1984 _cache_unlock(ncp); 1985 _cache_lock(ncp); 1986 _cache_setunresolved(ncp); 1987 goto again; 1988 } 1989 1990 /* 1991 * Not a reclaim race, some other error. 1992 */ 1993 KKASSERT(ncp->nc_vp == vp); 1994 vp = NULL; 1995 } else { 1996 KKASSERT(ncp->nc_vp == vp); 1997 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 1998 /* caller does not want a lock */ 1999 vn_unlock(vp); 2000 } 2001 } 2002 if (error == 0 && vp == NULL) 2003 error = ENOENT; 2004 *vpp = vp; 2005 return(error); 2006 } 2007 2008 /* 2009 * Return a referenced vnode representing the parent directory of 2010 * ncp. 2011 * 2012 * Because the caller has locked the ncp it should not be possible for 2013 * the parent ncp to go away. However, the parent can unresolve its 2014 * dvp at any time so we must be able to acquire a lock on the parent 2015 * to safely access nc_vp. 2016 * 2017 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock, 2018 * so use vhold()/vdrop() while holding the lock to prevent dvp from 2019 * getting destroyed. 2020 * 2021 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a 2022 * lock on the ncp in question.. 2023 */ 2024 struct vnode * 2025 cache_dvpref(struct namecache *ncp) 2026 { 2027 struct namecache *par; 2028 struct vnode *dvp; 2029 2030 dvp = NULL; 2031 if ((par = ncp->nc_parent) != NULL) { 2032 _cache_hold(par); 2033 _cache_lock(par); 2034 if ((par->nc_flag & NCF_UNRESOLVED) == 0) { 2035 if ((dvp = par->nc_vp) != NULL) 2036 vhold(dvp); 2037 } 2038 _cache_unlock(par); 2039 if (dvp) { 2040 if (vget(dvp, LK_SHARED) == 0) { 2041 vn_unlock(dvp); 2042 vdrop(dvp); 2043 /* return refd, unlocked dvp */ 2044 } else { 2045 vdrop(dvp); 2046 dvp = NULL; 2047 } 2048 } 2049 _cache_drop(par); 2050 } 2051 return(dvp); 2052 } 2053 2054 /* 2055 * Convert a directory vnode to a namecache record without any other 2056 * knowledge of the topology. This ONLY works with directory vnodes and 2057 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the 2058 * returned ncp (if not NULL) will be held and unlocked. 2059 * 2060 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned. 2061 * If 'makeit' is 1 we attempt to track-down and create the namecache topology 2062 * for dvp. This will fail only if the directory has been deleted out from 2063 * under the caller. 2064 * 2065 * Callers must always check for a NULL return no matter the value of 'makeit'. 2066 * 2067 * To avoid underflowing the kernel stack each recursive call increments 2068 * the makeit variable. 
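 *
 * Illustrative sketch only (the dvp and cred shown are assumed to be
 * supplied by the caller, e.g. an NFS file-handle conversion path, with
 * dvp referenced but unlocked as described above):
 *
 *	struct nchandle nch;
 *
 *	if (cache_fromdvp(dvp, cred, 1, &nch) == 0) {
 *		...			(nch.ncp is held, not locked)
 *		cache_drop(&nch);
 *	}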
2069 */ 2070 2071 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2072 struct vnode *dvp, char *fakename); 2073 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2074 struct vnode **saved_dvp); 2075 2076 int 2077 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit, 2078 struct nchandle *nch) 2079 { 2080 struct vnode *saved_dvp; 2081 struct vnode *pvp; 2082 char *fakename; 2083 int error; 2084 2085 nch->ncp = NULL; 2086 nch->mount = dvp->v_mount; 2087 saved_dvp = NULL; 2088 fakename = NULL; 2089 2090 /* 2091 * Handle the makeit == 0 degenerate case 2092 */ 2093 if (makeit == 0) { 2094 spin_lock_shared(&dvp->v_spin); 2095 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2096 if (nch->ncp) 2097 cache_hold(nch); 2098 spin_unlock_shared(&dvp->v_spin); 2099 } 2100 2101 /* 2102 * Loop until resolution, inside code will break out on error. 2103 */ 2104 while (makeit) { 2105 /* 2106 * Break out if we successfully acquire a working ncp. 2107 */ 2108 spin_lock_shared(&dvp->v_spin); 2109 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 2110 if (nch->ncp) { 2111 cache_hold(nch); 2112 spin_unlock_shared(&dvp->v_spin); 2113 break; 2114 } 2115 spin_unlock_shared(&dvp->v_spin); 2116 2117 /* 2118 * If dvp is the root of its filesystem it should already 2119 * have a namecache pointer associated with it as a side 2120 * effect of the mount, but it may have been disassociated. 2121 */ 2122 if (dvp->v_flag & VROOT) { 2123 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp); 2124 error = cache_resolve_mp(nch->mount); 2125 _cache_put(nch->ncp); 2126 if (ncvp_debug) { 2127 kprintf("cache_fromdvp: resolve root of mount %p error %d", 2128 dvp->v_mount, error); 2129 } 2130 if (error) { 2131 if (ncvp_debug) 2132 kprintf(" failed\n"); 2133 nch->ncp = NULL; 2134 break; 2135 } 2136 if (ncvp_debug) 2137 kprintf(" succeeded\n"); 2138 continue; 2139 } 2140 2141 /* 2142 * If we are recursed too deeply resort to an O(n^2) 2143 * algorithm to resolve the namecache topology. The 2144 * resolved pvp is left referenced in saved_dvp to 2145 * prevent the tree from being destroyed while we loop. 2146 */ 2147 if (makeit > 20) { 2148 error = cache_fromdvp_try(dvp, cred, &saved_dvp); 2149 if (error) { 2150 kprintf("lookupdotdot(longpath) failed %d " 2151 "dvp %p\n", error, dvp); 2152 nch->ncp = NULL; 2153 break; 2154 } 2155 continue; 2156 } 2157 2158 /* 2159 * Get the parent directory and resolve its ncp. 2160 */ 2161 if (fakename) { 2162 kfree(fakename, M_TEMP); 2163 fakename = NULL; 2164 } 2165 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2166 &fakename); 2167 if (error) { 2168 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp); 2169 break; 2170 } 2171 vn_unlock(pvp); 2172 2173 /* 2174 * Reuse makeit as a recursion depth counter. On success 2175 * nch will be fully referenced. 2176 */ 2177 cache_fromdvp(pvp, cred, makeit + 1, nch); 2178 vrele(pvp); 2179 if (nch->ncp == NULL) 2180 break; 2181 2182 /* 2183 * Do an inefficient scan of pvp (embodied by ncp) to look 2184 * for dvp. This will create a namecache record for dvp on 2185 * success. We loop up to recheck on success. 2186 * 2187 * ncp and dvp are both held but not locked. 
2188 */ 2189 error = cache_inefficient_scan(nch, cred, dvp, fakename); 2190 if (error) { 2191 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n", 2192 pvp, nch->ncp->nc_name, dvp); 2193 cache_drop(nch); 2194 /* nch was NULLed out, reload mount */ 2195 nch->mount = dvp->v_mount; 2196 break; 2197 } 2198 if (ncvp_debug) { 2199 kprintf("cache_fromdvp: scan %p (%s) succeeded\n", 2200 pvp, nch->ncp->nc_name); 2201 } 2202 cache_drop(nch); 2203 /* nch was NULLed out, reload mount */ 2204 nch->mount = dvp->v_mount; 2205 } 2206 2207 /* 2208 * If nch->ncp is non-NULL it will have been held already. 2209 */ 2210 if (fakename) 2211 kfree(fakename, M_TEMP); 2212 if (saved_dvp) 2213 vrele(saved_dvp); 2214 if (nch->ncp) 2215 return (0); 2216 return (EINVAL); 2217 } 2218 2219 /* 2220 * Go up the chain of parent directories until we find something 2221 * we can resolve into the namecache. This is very inefficient. 2222 */ 2223 static 2224 int 2225 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2226 struct vnode **saved_dvp) 2227 { 2228 struct nchandle nch; 2229 struct vnode *pvp; 2230 int error; 2231 static time_t last_fromdvp_report; 2232 char *fakename; 2233 2234 /* 2235 * Loop getting the parent directory vnode until we get something we 2236 * can resolve in the namecache. 2237 */ 2238 vref(dvp); 2239 nch.mount = dvp->v_mount; 2240 nch.ncp = NULL; 2241 fakename = NULL; 2242 2243 for (;;) { 2244 if (fakename) { 2245 kfree(fakename, M_TEMP); 2246 fakename = NULL; 2247 } 2248 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2249 &fakename); 2250 if (error) { 2251 vrele(dvp); 2252 break; 2253 } 2254 vn_unlock(pvp); 2255 spin_lock_shared(&pvp->v_spin); 2256 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) { 2257 _cache_hold(nch.ncp); 2258 spin_unlock_shared(&pvp->v_spin); 2259 vrele(pvp); 2260 break; 2261 } 2262 spin_unlock_shared(&pvp->v_spin); 2263 if (pvp->v_flag & VROOT) { 2264 nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp); 2265 error = cache_resolve_mp(nch.mount); 2266 _cache_unlock(nch.ncp); 2267 vrele(pvp); 2268 if (error) { 2269 _cache_drop(nch.ncp); 2270 nch.ncp = NULL; 2271 vrele(dvp); 2272 } 2273 break; 2274 } 2275 vrele(dvp); 2276 dvp = pvp; 2277 } 2278 if (error == 0) { 2279 if (last_fromdvp_report != time_uptime) { 2280 last_fromdvp_report = time_uptime; 2281 kprintf("Warning: extremely inefficient path " 2282 "resolution on %s\n", 2283 nch.ncp->nc_name); 2284 } 2285 error = cache_inefficient_scan(&nch, cred, dvp, fakename); 2286 2287 /* 2288 * Hopefully dvp now has a namecache record associated with 2289 * it. Leave it referenced to prevent the kernel from 2290 * recycling the vnode. Otherwise extremely long directory 2291 * paths could result in endless recycling. 2292 */ 2293 if (*saved_dvp) 2294 vrele(*saved_dvp); 2295 *saved_dvp = dvp; 2296 _cache_drop(nch.ncp); 2297 } 2298 if (fakename) 2299 kfree(fakename, M_TEMP); 2300 return (error); 2301 } 2302 2303 /* 2304 * Do an inefficient scan of the directory represented by ncp looking for 2305 * the directory vnode dvp. ncp must be held but not locked on entry and 2306 * will be held on return. dvp must be refd but not locked on entry and 2307 * will remain refd on return. 2308 * 2309 * Why do this at all? Well, due to its stateless nature the NFS server 2310 * converts file handles directly to vnodes without necessarily going through 2311 * the namecache ops that would otherwise create the namecache topology 2312 * leading to the vnode. 
We could either (1) Change the namecache algorithms
2313 * to allow disconnected namecache records that are re-merged opportunistically,
2314 * or (2) Make the NFS server backtrack and scan to recover a connected
2315 * namecache topology in order to then be able to issue new API lookups.
2316 *
2317 * It turns out that (1) is a huge mess. It takes a nice clean set of
2318 * namecache algorithms and introduces a lot of complication in every subsystem
2319 * that calls into the namecache to deal with the re-merge case, especially
2320 * since we are using the namecache to placehold negative lookups and the
2321 * vnode might not be immediately assigned. (2) is certainly far less
2322 * efficient than (1), but since we are only talking about directories here
2323 * (which are likely to remain cached), the case does not actually run all
2324 * that often and has the supreme advantage of not polluting the namecache
2325 * algorithms.
2326 *
2327 * If a fakename is supplied just construct a namecache entry using the
2328 * fake name.
2329 */
2330 static int
2331 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2332 struct vnode *dvp, char *fakename)
2333 {
2334 struct nlcomponent nlc;
2335 struct nchandle rncp;
2336 struct dirent *den;
2337 struct vnode *pvp;
2338 struct vattr vat;
2339 struct iovec iov;
2340 struct uio uio;
2341 int blksize;
2342 int eofflag;
2343 int bytes;
2344 char *rbuf;
2345 int error;
2346
2347 vat.va_blocksize = 0;
2348 if ((error = VOP_GETATTR(dvp, &vat)) != 0)
2349 return (error);
2350 cache_lock(nch);
2351 error = cache_vref(nch, cred, &pvp);
2352 cache_unlock(nch);
2353 if (error)
2354 return (error);
2355 if (ncvp_debug) {
2356 kprintf("inefficient_scan of (%p,%s): directory iosize %ld "
2357 "vattr fileid = %lld\n",
2358 nch->ncp, nch->ncp->nc_name,
2359 vat.va_blocksize,
2360 (long long)vat.va_fileid);
2361 }
2362
2363 /*
2364 * Use the supplied fakename if not NULL. Fake names are typically
2365 * not in the actual filesystem hierarchy. This is used by HAMMER
2366 * to glue @@timestamp recursions together.
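 * (For example, a snapshot-style component such as "@@0x00000001" --
 * the exact form is illustrative here -- has no real directory entry to
 * scan for, so the ncp is simply constructed from the supplied string.)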
2367 */ 2368 if (fakename) { 2369 nlc.nlc_nameptr = fakename; 2370 nlc.nlc_namelen = strlen(fakename); 2371 rncp = cache_nlookup(nch, &nlc); 2372 goto done; 2373 } 2374 2375 if ((blksize = vat.va_blocksize) == 0) 2376 blksize = DEV_BSIZE; 2377 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK); 2378 rncp.ncp = NULL; 2379 2380 eofflag = 0; 2381 uio.uio_offset = 0; 2382 again: 2383 iov.iov_base = rbuf; 2384 iov.iov_len = blksize; 2385 uio.uio_iov = &iov; 2386 uio.uio_iovcnt = 1; 2387 uio.uio_resid = blksize; 2388 uio.uio_segflg = UIO_SYSSPACE; 2389 uio.uio_rw = UIO_READ; 2390 uio.uio_td = curthread; 2391 2392 if (ncvp_debug >= 2) 2393 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset); 2394 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL); 2395 if (error == 0) { 2396 den = (struct dirent *)rbuf; 2397 bytes = blksize - uio.uio_resid; 2398 2399 while (bytes > 0) { 2400 if (ncvp_debug >= 2) { 2401 kprintf("cache_inefficient_scan: %*.*s\n", 2402 den->d_namlen, den->d_namlen, 2403 den->d_name); 2404 } 2405 if (den->d_type != DT_WHT && 2406 den->d_ino == vat.va_fileid) { 2407 if (ncvp_debug) { 2408 kprintf("cache_inefficient_scan: " 2409 "MATCHED inode %lld path %s/%*.*s\n", 2410 (long long)vat.va_fileid, 2411 nch->ncp->nc_name, 2412 den->d_namlen, den->d_namlen, 2413 den->d_name); 2414 } 2415 nlc.nlc_nameptr = den->d_name; 2416 nlc.nlc_namelen = den->d_namlen; 2417 rncp = cache_nlookup(nch, &nlc); 2418 KKASSERT(rncp.ncp != NULL); 2419 break; 2420 } 2421 bytes -= _DIRENT_DIRSIZ(den); 2422 den = _DIRENT_NEXT(den); 2423 } 2424 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize) 2425 goto again; 2426 } 2427 kfree(rbuf, M_TEMP); 2428 done: 2429 vrele(pvp); 2430 if (rncp.ncp) { 2431 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) { 2432 _cache_setvp(rncp.mount, rncp.ncp, dvp); 2433 if (ncvp_debug >= 2) { 2434 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n", 2435 nch->ncp->nc_name, rncp.ncp->nc_name, dvp); 2436 } 2437 } else { 2438 if (ncvp_debug >= 2) { 2439 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n", 2440 nch->ncp->nc_name, rncp.ncp->nc_name, dvp, 2441 rncp.ncp->nc_vp); 2442 } 2443 } 2444 if (rncp.ncp->nc_vp == NULL) 2445 error = rncp.ncp->nc_error; 2446 /* 2447 * Release rncp after a successful nlookup. rncp was fully 2448 * referenced. 2449 */ 2450 cache_put(&rncp); 2451 } else { 2452 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n", 2453 dvp, nch->ncp->nc_name); 2454 error = ENOENT; 2455 } 2456 return (error); 2457 } 2458 2459 /* 2460 * This function must be called with the ncp held and locked and will unlock 2461 * and drop it during zapping. 2462 * 2463 * Zap a namecache entry. The ncp is unconditionally set to an unresolved 2464 * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list 2465 * and removes the related reference. If the ncp can be removed, and the 2466 * parent can be zapped non-blocking, this function loops up. 2467 * 2468 * There will be one ref from the caller (which we now own). The only 2469 * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list, 2470 * so possibly 2 refs left. Taking this into account, if there are no 2471 * additional refs and no children, the ncp will be removed from the topology 2472 * and destroyed. 2473 * 2474 * References and/or children may exist if the ncp is in the middle of the 2475 * topology, preventing the ncp from being destroyed. 2476 * 2477 * If nonblock is non-zero and the parent ncp cannot be locked we give up. 
2478 * 2479 * This function may return a held (but NOT locked) parent node which the 2480 * caller must drop in a loop. Looping is one way to avoid unbounded recursion 2481 * due to deep namecache trees. 2482 * 2483 * WARNING! For MPSAFE operation this routine must acquire up to three 2484 * spin locks to be able to safely test nc_refs. Lock order is 2485 * very important. 2486 * 2487 * hash spinlock if on hash list 2488 * parent spinlock if child of parent 2489 * (the ncp is unresolved so there is no vnode association) 2490 */ 2491 static void 2492 cache_zap(struct namecache *ncp) 2493 { 2494 struct namecache *par; 2495 struct vnode *dropvp; 2496 struct nchash_head *nchpp; 2497 int refcmp; 2498 int nonblock = 1; /* XXX cleanup */ 2499 2500 again: 2501 /* 2502 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED. 2503 * This gets rid of any vp->v_namecache list or negative list and 2504 * the related ref. 2505 */ 2506 _cache_setunresolved(ncp); 2507 2508 /* 2509 * Try to scrap the entry and possibly tail-recurse on its parent. 2510 * We only scrap unref'd (other then our ref) unresolved entries, 2511 * we do not scrap 'live' entries. 2512 * 2513 * If nc_parent is non NULL we expect 2 references, else just 1. 2514 * If there are more, someone else also holds the ncp and we cannot 2515 * destroy it. 2516 */ 2517 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 2518 KKASSERT(ncp->nc_refs > 0); 2519 2520 /* 2521 * If the ncp is linked to its parent it will also be in the hash 2522 * table. We have to be able to lock the parent and the hash table. 2523 * 2524 * Acquire locks. Note that the parent can't go away while we hold 2525 * a child locked. If nc_parent is present, expect 2 refs instead 2526 * of 1. 2527 */ 2528 nchpp = NULL; 2529 if ((par = ncp->nc_parent) != NULL) { 2530 if (nonblock) { 2531 if (_cache_lock_nonblock(par)) { 2532 /* lock failed */ 2533 ncp->nc_flag |= NCF_DEFEREDZAP; 2534 atomic_add_long( 2535 &pcpu_ncache[mycpu->gd_cpuid].numdefered, 2536 1); 2537 _cache_unlock(ncp); 2538 _cache_drop(ncp); /* caller's ref */ 2539 return; 2540 } 2541 _cache_hold(par); 2542 } else { 2543 _cache_hold(par); 2544 _cache_lock(par); 2545 } 2546 nchpp = ncp->nc_head; 2547 spin_lock(&nchpp->spin); 2548 } 2549 2550 /* 2551 * With the parent and nchpp locked, and the vnode removed 2552 * (no vp->v_namecache), we expect 1 or 2 refs. If there are 2553 * more someone else has a ref and we cannot zap the entry. 2554 * 2555 * one for our hold 2556 * one for our parent link (parent also has one from the linkage) 2557 */ 2558 if (par) 2559 refcmp = 2; 2560 else 2561 refcmp = 1; 2562 2563 /* 2564 * On failure undo the work we've done so far and drop the 2565 * caller's ref and ncp. 2566 */ 2567 if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) { 2568 if (par) { 2569 spin_unlock(&nchpp->spin); 2570 _cache_put(par); 2571 } 2572 _cache_unlock(ncp); 2573 _cache_drop(ncp); 2574 return; 2575 } 2576 2577 /* 2578 * We own all the refs and with the spinlocks held no further 2579 * refs can be acquired by others. 2580 * 2581 * Remove us from the hash list and parent list. We have to 2582 * drop a ref on the parent's vp if the parent's list becomes 2583 * empty. 
2584 */ 2585 dropvp = NULL; 2586 if (par) { 2587 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; 2588 2589 KKASSERT(nchpp == ncp->nc_head); 2590 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash); 2591 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 2592 atomic_add_long(&pn->vfscache_count, -1); 2593 if (TAILQ_EMPTY(&ncp->nc_list)) 2594 atomic_add_long(&pn->vfscache_leafs, -1); 2595 2596 if (TAILQ_EMPTY(&par->nc_list)) { 2597 atomic_add_long(&pn->vfscache_leafs, 1); 2598 if (par->nc_vp) 2599 dropvp = par->nc_vp; 2600 } 2601 ncp->nc_parent = NULL; 2602 ncp->nc_head = NULL; 2603 spin_unlock(&nchpp->spin); 2604 _cache_drop(par); /* removal of ncp from par->nc_list */ 2605 /*_cache_unlock(par);*/ 2606 } else { 2607 KKASSERT(ncp->nc_head == NULL); 2608 } 2609 2610 /* 2611 * ncp should not have picked up any refs. Physically 2612 * destroy the ncp. 2613 */ 2614 if (ncp->nc_refs != refcmp) { 2615 panic("cache_zap: %p bad refs %d (expected %d)\n", 2616 ncp, ncp->nc_refs, refcmp); 2617 } 2618 /* _cache_unlock(ncp) not required */ 2619 ncp->nc_refs = -1; /* safety */ 2620 if (ncp->nc_name) 2621 kfree(ncp->nc_name, M_VFSCACHE); 2622 kfree(ncp, M_VFSCACHE); 2623 2624 /* 2625 * Delayed drop (we had to release our spinlocks) 2626 */ 2627 if (dropvp) 2628 vdrop(dropvp); 2629 2630 /* 2631 * Loop up if we can recursively clean out the parent. 2632 */ 2633 if (par) { 2634 refcmp = 1; /* ref on parent */ 2635 if (par->nc_parent) /* par->par */ 2636 ++refcmp; 2637 par->nc_flag &= ~NCF_DEFEREDZAP; 2638 if ((par->nc_flag & NCF_UNRESOLVED) && 2639 par->nc_refs == refcmp && 2640 TAILQ_EMPTY(&par->nc_list)) { 2641 ncp = par; 2642 goto again; 2643 } 2644 _cache_unlock(par); 2645 _cache_drop(par); 2646 } 2647 } 2648 2649 /* 2650 * Clean up dangling negative cache and defered-drop entries in the 2651 * namecache. 2652 * 2653 * This routine is called in the critical path and also called from 2654 * vnlru(). When called from vnlru we use a lower limit to try to 2655 * deal with the negative cache before the critical path has to start 2656 * dealing with it. 2657 */ 2658 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t; 2659 2660 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2661 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2662 2663 void 2664 cache_hysteresis(int critpath) 2665 { 2666 long poslimit; 2667 long neglimit = maxvnodes / ncnegfactor; 2668 long xnumcache = vfscache_leafs; 2669 2670 if (critpath == 0) 2671 neglimit = neglimit * 8 / 10; 2672 2673 /* 2674 * Don't cache too many negative hits. We use hysteresis to reduce 2675 * the impact on the critical path. 2676 */ 2677 switch(neg_cache_hysteresis_state[critpath]) { 2678 case CHI_LOW: 2679 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) { 2680 if (critpath) 2681 _cache_cleanneg(ncnegflush); 2682 else 2683 _cache_cleanneg(ncnegflush + 2684 vfscache_negs - neglimit); 2685 neg_cache_hysteresis_state[critpath] = CHI_HIGH; 2686 } 2687 break; 2688 case CHI_HIGH: 2689 if (vfscache_negs > MINNEG * 9 / 10 && 2690 vfscache_negs * 9 / 10 > neglimit 2691 ) { 2692 if (critpath) 2693 _cache_cleanneg(ncnegflush); 2694 else 2695 _cache_cleanneg(ncnegflush + 2696 vfscache_negs * 9 / 10 - 2697 neglimit); 2698 } else { 2699 neg_cache_hysteresis_state[critpath] = CHI_LOW; 2700 } 2701 break; 2702 } 2703 2704 /* 2705 * Don't cache too many positive hits. We use hysteresis to reduce 2706 * the impact on the critical path. 
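 *
 * As a purely illustrative arithmetic example: with ncposlimit unset (0)
 * and maxvnodes at 100000, the poslimit computed below starts at 200000
 * and is scaled to 160000 for the non-critical path (vnlru); once in the
 * CHI_HIGH state, cleaning continues until the leaf count falls back
 * under 5/6 of the limit.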
2707 *
2708 * Excessive positive hits can accumulate due to large numbers of
2709 * hardlinks (the vnode cache will not prevent hl ncps from growing
2710 * into infinity).
2711 */
2712 if ((poslimit = ncposlimit) == 0)
2713 poslimit = maxvnodes * 2;
2714 if (critpath == 0)
2715 poslimit = poslimit * 8 / 10;
2716
2717 switch(pos_cache_hysteresis_state[critpath]) {
2718 case CHI_LOW:
2719 if (xnumcache > poslimit && xnumcache > MINPOS) {
2720 if (critpath)
2721 _cache_cleanpos(ncposflush);
2722 else
2723 _cache_cleanpos(ncposflush +
2724 xnumcache - poslimit);
2725 pos_cache_hysteresis_state[critpath] = CHI_HIGH;
2726 }
2727 break;
2728 case CHI_HIGH:
2729 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) {
2730 if (critpath)
2731 _cache_cleanpos(ncposflush);
2732 else
2733 _cache_cleanpos(ncposflush +
2734 xnumcache - poslimit * 5 / 6);
2735 } else {
2736 pos_cache_hysteresis_state[critpath] = CHI_LOW;
2737 }
2738 break;
2739 }
2740
2741 /*
2742 * Clean out dangling defered-zap ncps which could not be cleanly
2743 * dropped if too many build up. Note that numdefered is
2744 * heuristical. Make sure we are real-time for the current cpu,
2745 * plus the global rollup.
2746 */
2747 if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) {
2748 _cache_cleandefered();
2749 }
2750 }
2751
2752 /*
2753 * NEW NAMECACHE LOOKUP API
2754 *
2755 * Lookup an entry in the namecache. The passed par_nch must be referenced
2756 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp
2757 * is ALWAYS returned, even if the supplied component is illegal.
2758 *
2759 * The resulting namecache entry should be returned to the system with
2760 * cache_put() or cache_unlock() + cache_drop().
2761 *
2762 * namecache locks are recursive but care must be taken to avoid lock order
2763 * reversals (hence why the passed par_nch must be unlocked). Locking
2764 * rules are to order for parent traversals, not for child traversals.
2765 *
2766 * Nobody else will be able to manipulate the associated namespace (e.g.
2767 * create, delete, rename, rename-target) until the caller unlocks the
2768 * entry.
2769 *
2770 * The returned entry will be in one of three states: positive hit (non-null
2771 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
2772 * Unresolved entries must be resolved through the filesystem to associate the
2773 * vnode and/or determine whether a positive or negative hit has occurred.
2774 *
2775 * It is not necessary to lock a directory in order to lock namespace under
2776 * that directory. In fact, it is explicitly not allowed to do that. A
2777 * directory is typically only locked when being created, renamed, or
2778 * destroyed.
2779 *
2780 * The directory (par) may be unresolved, in which case any returned child
2781 * will likely also be marked unresolved. Likely but not guaranteed. Since
2782 * the filesystem lookup requires a resolved directory vnode the caller is
2783 * responsible for resolving the namecache chain top-down. This API
2784 * specifically allows whole chains to be created in an unresolved state.
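 *
 * A minimal, illustrative calling sketch (par_nch, nlc and cred are
 * assumed to have been set up by the caller, e.g. via the nlookup
 * machinery; error handling is abbreviated):
 *
 *	struct nchandle nch;
 *	struct vnode *vp;
 *
 *	nch = cache_nlookup(&par_nch, &nlc);
 *	if (cache_vget(&nch, cred, LK_SHARED, &vp) == 0) {
 *		...			(vp referenced and share-locked)
 *		vput(vp);
 *	}
 *	cache_put(&nch);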
2785 */ 2786 struct nchandle 2787 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc) 2788 { 2789 struct nchandle nch; 2790 struct namecache *ncp; 2791 struct namecache *new_ncp; 2792 struct namecache *rep_ncp; /* reuse a destroyed ncp */ 2793 struct nchash_head *nchpp; 2794 struct mount *mp; 2795 u_int32_t hash; 2796 globaldata_t gd; 2797 int par_locked; 2798 2799 gd = mycpu; 2800 mp = par_nch->mount; 2801 par_locked = 0; 2802 2803 /* 2804 * This is a good time to call it, no ncp's are locked by 2805 * the caller or us. 2806 */ 2807 cache_hysteresis(1); 2808 2809 /* 2810 * Try to locate an existing entry 2811 */ 2812 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 2813 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 2814 new_ncp = NULL; 2815 nchpp = NCHHASH(hash); 2816 restart: 2817 rep_ncp = NULL; 2818 if (new_ncp) 2819 spin_lock(&nchpp->spin); 2820 else 2821 spin_lock_shared(&nchpp->spin); 2822 2823 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 2824 /* 2825 * Break out if we find a matching entry. Note that 2826 * UNRESOLVED entries may match, but DESTROYED entries 2827 * do not. 2828 * 2829 * We may be able to reuse DESTROYED entries that we come 2830 * across, even if the name does not match, as long as 2831 * nc_nlen is correct and the only hold ref is from the nchpp 2832 * list itself. 2833 */ 2834 if (ncp->nc_parent == par_nch->ncp && 2835 ncp->nc_nlen == nlc->nlc_namelen) { 2836 if (ncp->nc_flag & NCF_DESTROYED) { 2837 if (ncp->nc_refs == 1 && rep_ncp == NULL) 2838 rep_ncp = ncp; 2839 continue; 2840 } 2841 if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen)) 2842 continue; 2843 _cache_hold(ncp); 2844 if (new_ncp) 2845 spin_unlock(&nchpp->spin); 2846 else 2847 spin_unlock_shared(&nchpp->spin); 2848 if (par_locked) { 2849 _cache_unlock(par_nch->ncp); 2850 par_locked = 0; 2851 } 2852 if (_cache_lock_special(ncp) == 0) { 2853 /* 2854 * Successfully locked but we must re-test 2855 * conditions that might have changed since 2856 * we did not have the lock before. 2857 */ 2858 if (ncp->nc_parent != par_nch->ncp || 2859 ncp->nc_nlen != nlc->nlc_namelen || 2860 bcmp(ncp->nc_name, nlc->nlc_nameptr, 2861 ncp->nc_nlen) || 2862 (ncp->nc_flag & NCF_DESTROYED)) { 2863 _cache_put(ncp); 2864 goto restart; 2865 } 2866 _cache_auto_unresolve(mp, ncp); 2867 if (new_ncp) 2868 _cache_free(new_ncp); 2869 goto found; 2870 } 2871 _cache_get(ncp); /* cycle the lock to block */ 2872 _cache_put(ncp); 2873 _cache_drop(ncp); 2874 goto restart; 2875 } 2876 } 2877 2878 /* 2879 * We failed to locate the entry, try to resurrect a destroyed 2880 * entry that we did find that is already correctly linked into 2881 * nchpp and the parent. We must re-test conditions after 2882 * successfully locking rep_ncp. 2883 * 2884 * This case can occur under heavy loads due to not being able 2885 * to safely lock the parent in cache_zap(). Nominally a repeated 2886 * create/unlink load, but only the namelen needs to match. 2887 */ 2888 if (rep_ncp && new_ncp == NULL) { 2889 if (_cache_lock_nonblock(rep_ncp) == 0) { 2890 _cache_hold(rep_ncp); 2891 if (rep_ncp->nc_parent == par_nch->ncp && 2892 rep_ncp->nc_nlen == nlc->nlc_namelen && 2893 (rep_ncp->nc_flag & NCF_DESTROYED) && 2894 rep_ncp->nc_refs == 2) { 2895 /* 2896 * Update nc_name as reuse as new. 
2897 */ 2898 ncp = rep_ncp; 2899 bcopy(nlc->nlc_nameptr, ncp->nc_name, 2900 nlc->nlc_namelen); 2901 spin_unlock_shared(&nchpp->spin); 2902 _cache_setunresolved(ncp); 2903 ncp->nc_flag = NCF_UNRESOLVED; 2904 ncp->nc_error = ENOTCONN; 2905 goto found; 2906 } 2907 _cache_put(rep_ncp); 2908 } 2909 } 2910 2911 /* 2912 * Otherwise create a new entry and add it to the cache. The parent 2913 * ncp must also be locked so we can link into it. 2914 * 2915 * We have to relookup after possibly blocking in kmalloc or 2916 * when locking par_nch. 2917 * 2918 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 2919 * mount case, in which case nc_name will be NULL. 2920 */ 2921 if (new_ncp == NULL) { 2922 spin_unlock_shared(&nchpp->spin); 2923 new_ncp = cache_alloc(nlc->nlc_namelen); 2924 if (nlc->nlc_namelen) { 2925 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 2926 nlc->nlc_namelen); 2927 new_ncp->nc_name[nlc->nlc_namelen] = 0; 2928 } 2929 goto restart; 2930 } 2931 2932 /* 2933 * NOTE! The spinlock is held exclusively here because new_ncp 2934 * is non-NULL. 2935 */ 2936 if (par_locked == 0) { 2937 spin_unlock(&nchpp->spin); 2938 _cache_lock(par_nch->ncp); 2939 par_locked = 1; 2940 goto restart; 2941 } 2942 2943 /* 2944 * Link to parent (requires another ref, the one already in new_ncp 2945 * is what we wil lreturn). 2946 * 2947 * WARNING! We still hold the spinlock. We have to set the hash 2948 * table entry atomically. 2949 */ 2950 ncp = new_ncp; 2951 ++ncp->nc_refs; 2952 _cache_link_parent(ncp, par_nch->ncp, nchpp); 2953 spin_unlock(&nchpp->spin); 2954 _cache_unlock(par_nch->ncp); 2955 /* par_locked = 0 - not used */ 2956 found: 2957 /* 2958 * stats and namecache size management 2959 */ 2960 if (ncp->nc_flag & NCF_UNRESOLVED) 2961 ++gd->gd_nchstats->ncs_miss; 2962 else if (ncp->nc_vp) 2963 ++gd->gd_nchstats->ncs_goodhits; 2964 else 2965 ++gd->gd_nchstats->ncs_neghits; 2966 nch.mount = mp; 2967 nch.ncp = ncp; 2968 _cache_mntref(nch.mount); 2969 2970 return(nch); 2971 } 2972 2973 /* 2974 * Attempt to lookup a namecache entry and return with a shared namecache 2975 * lock. This operates non-blocking. EWOULDBLOCK is returned if excl is 2976 * set or we are unable to lock. 2977 */ 2978 int 2979 cache_nlookup_maybe_shared(struct nchandle *par_nch, 2980 struct nlcomponent *nlc, 2981 int excl, struct nchandle *res_nch) 2982 { 2983 struct namecache *ncp; 2984 struct nchash_head *nchpp; 2985 struct mount *mp; 2986 u_int32_t hash; 2987 globaldata_t gd; 2988 2989 /* 2990 * If exclusive requested or shared namecache locks are disabled, 2991 * return failure. 2992 */ 2993 if (ncp_shared_lock_disable || excl) 2994 return(EWOULDBLOCK); 2995 2996 gd = mycpu; 2997 mp = par_nch->mount; 2998 2999 /* 3000 * This is a good time to call it, no ncp's are locked by 3001 * the caller or us. 3002 */ 3003 cache_hysteresis(1); 3004 3005 /* 3006 * Try to locate an existing entry 3007 */ 3008 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3009 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3010 nchpp = NCHHASH(hash); 3011 3012 spin_lock_shared(&nchpp->spin); 3013 3014 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3015 /* 3016 * Break out if we find a matching entry. Note that 3017 * UNRESOLVED entries may match, but DESTROYED entries 3018 * do not. 
3019 */ 3020 if (ncp->nc_parent == par_nch->ncp && 3021 ncp->nc_nlen == nlc->nlc_namelen && 3022 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3023 (ncp->nc_flag & NCF_DESTROYED) == 0 3024 ) { 3025 _cache_hold(ncp); 3026 spin_unlock_shared(&nchpp->spin); 3027 3028 if (_cache_lock_shared_special(ncp) == 0) { 3029 if (ncp->nc_parent == par_nch->ncp && 3030 ncp->nc_nlen == nlc->nlc_namelen && 3031 bcmp(ncp->nc_name, nlc->nlc_nameptr, 3032 ncp->nc_nlen) == 0 && 3033 (ncp->nc_flag & NCF_DESTROYED) == 0 && 3034 (ncp->nc_flag & NCF_UNRESOLVED) == 0 && 3035 _cache_auto_unresolve_test(mp, ncp) == 0) { 3036 goto found; 3037 } 3038 _cache_unlock(ncp); 3039 } 3040 _cache_drop(ncp); 3041 return(EWOULDBLOCK); 3042 } 3043 } 3044 3045 /* 3046 * Failure 3047 */ 3048 spin_unlock_shared(&nchpp->spin); 3049 return(EWOULDBLOCK); 3050 3051 /* 3052 * Success 3053 * 3054 * Note that nc_error might be non-zero (e.g ENOENT). 3055 */ 3056 found: 3057 res_nch->mount = mp; 3058 res_nch->ncp = ncp; 3059 ++gd->gd_nchstats->ncs_goodhits; 3060 _cache_mntref(res_nch->mount); 3061 3062 KKASSERT(ncp->nc_error != EWOULDBLOCK); 3063 return(ncp->nc_error); 3064 } 3065 3066 /* 3067 * This is a non-blocking verison of cache_nlookup() used by 3068 * nfs_readdirplusrpc_uio(). It can fail for any reason and 3069 * will return nch.ncp == NULL in that case. 3070 */ 3071 struct nchandle 3072 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc) 3073 { 3074 struct nchandle nch; 3075 struct namecache *ncp; 3076 struct namecache *new_ncp; 3077 struct nchash_head *nchpp; 3078 struct mount *mp; 3079 u_int32_t hash; 3080 globaldata_t gd; 3081 int par_locked; 3082 3083 gd = mycpu; 3084 mp = par_nch->mount; 3085 par_locked = 0; 3086 3087 /* 3088 * Try to locate an existing entry 3089 */ 3090 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3091 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3092 new_ncp = NULL; 3093 nchpp = NCHHASH(hash); 3094 restart: 3095 spin_lock(&nchpp->spin); 3096 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3097 /* 3098 * Break out if we find a matching entry. Note that 3099 * UNRESOLVED entries may match, but DESTROYED entries 3100 * do not. 3101 */ 3102 if (ncp->nc_parent == par_nch->ncp && 3103 ncp->nc_nlen == nlc->nlc_namelen && 3104 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3105 (ncp->nc_flag & NCF_DESTROYED) == 0 3106 ) { 3107 _cache_hold(ncp); 3108 spin_unlock(&nchpp->spin); 3109 if (par_locked) { 3110 _cache_unlock(par_nch->ncp); 3111 par_locked = 0; 3112 } 3113 if (_cache_lock_special(ncp) == 0) { 3114 if (ncp->nc_parent != par_nch->ncp || 3115 ncp->nc_nlen != nlc->nlc_namelen || 3116 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) || 3117 (ncp->nc_flag & NCF_DESTROYED)) { 3118 kprintf("cache_lookup_nonblock: " 3119 "ncp-race %p %*.*s\n", 3120 ncp, 3121 nlc->nlc_namelen, 3122 nlc->nlc_namelen, 3123 nlc->nlc_nameptr); 3124 _cache_unlock(ncp); 3125 _cache_drop(ncp); 3126 goto failed; 3127 } 3128 _cache_auto_unresolve(mp, ncp); 3129 if (new_ncp) { 3130 _cache_free(new_ncp); 3131 new_ncp = NULL; 3132 } 3133 goto found; 3134 } 3135 _cache_drop(ncp); 3136 goto failed; 3137 } 3138 } 3139 3140 /* 3141 * We failed to locate an entry, create a new entry and add it to 3142 * the cache. The parent ncp must also be locked so we 3143 * can link into it. 3144 * 3145 * We have to relookup after possibly blocking in kmalloc or 3146 * when locking par_nch. 
3147 * 3148 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 3149 * mount case, in which case nc_name will be NULL. 3150 */ 3151 if (new_ncp == NULL) { 3152 spin_unlock(&nchpp->spin); 3153 new_ncp = cache_alloc(nlc->nlc_namelen); 3154 if (nlc->nlc_namelen) { 3155 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 3156 nlc->nlc_namelen); 3157 new_ncp->nc_name[nlc->nlc_namelen] = 0; 3158 } 3159 goto restart; 3160 } 3161 if (par_locked == 0) { 3162 spin_unlock(&nchpp->spin); 3163 if (_cache_lock_nonblock(par_nch->ncp) == 0) { 3164 par_locked = 1; 3165 goto restart; 3166 } 3167 goto failed; 3168 } 3169 3170 /* 3171 * Link to parent (requires another ref, the one already in new_ncp 3172 * is what we wil lreturn). 3173 * 3174 * WARNING! We still hold the spinlock. We have to set the hash 3175 * table entry atomically. 3176 */ 3177 ncp = new_ncp; 3178 ++ncp->nc_refs; 3179 _cache_link_parent(ncp, par_nch->ncp, nchpp); 3180 spin_unlock(&nchpp->spin); 3181 _cache_unlock(par_nch->ncp); 3182 /* par_locked = 0 - not used */ 3183 found: 3184 /* 3185 * stats and namecache size management 3186 */ 3187 if (ncp->nc_flag & NCF_UNRESOLVED) 3188 ++gd->gd_nchstats->ncs_miss; 3189 else if (ncp->nc_vp) 3190 ++gd->gd_nchstats->ncs_goodhits; 3191 else 3192 ++gd->gd_nchstats->ncs_neghits; 3193 nch.mount = mp; 3194 nch.ncp = ncp; 3195 _cache_mntref(nch.mount); 3196 3197 return(nch); 3198 failed: 3199 if (new_ncp) { 3200 _cache_free(new_ncp); 3201 new_ncp = NULL; 3202 } 3203 nch.mount = NULL; 3204 nch.ncp = NULL; 3205 return(nch); 3206 } 3207 3208 /* 3209 * This version is non-locking. The caller must validate the result 3210 * for parent-to-child continuity. 3211 * 3212 * It can fail for any reason and will return nch.ncp == NULL in that case. 3213 */ 3214 struct nchandle 3215 cache_nlookup_nonlocked(struct nchandle *par_nch, struct nlcomponent *nlc) 3216 { 3217 struct nchandle nch; 3218 struct namecache *ncp; 3219 struct nchash_head *nchpp; 3220 struct mount *mp; 3221 u_int32_t hash; 3222 globaldata_t gd; 3223 3224 gd = mycpu; 3225 mp = par_nch->mount; 3226 3227 /* 3228 * Try to locate an existing entry 3229 */ 3230 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 3231 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 3232 nchpp = NCHHASH(hash); 3233 3234 spin_lock_shared(&nchpp->spin); 3235 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { 3236 /* 3237 * Break out if we find a matching entry. Note that 3238 * UNRESOLVED entries may match, but DESTROYED entries 3239 * do not. 3240 * 3241 * Resolved NFS entries which have timed out fail so the 3242 * caller can rerun with normal locking. 3243 */ 3244 if (ncp->nc_parent == par_nch->ncp && 3245 ncp->nc_nlen == nlc->nlc_namelen && 3246 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 3247 (ncp->nc_flag & NCF_DESTROYED) == 0 3248 ) { 3249 if (_cache_auto_unresolve_test(par_nch->mount, ncp)) 3250 break; 3251 _cache_hold(ncp); 3252 spin_unlock_shared(&nchpp->spin); 3253 goto found; 3254 } 3255 } 3256 spin_unlock_shared(&nchpp->spin); 3257 nch.mount = NULL; 3258 nch.ncp = NULL; 3259 return nch; 3260 found: 3261 /* 3262 * stats and namecache size management 3263 */ 3264 if (ncp->nc_flag & NCF_UNRESOLVED) 3265 ++gd->gd_nchstats->ncs_miss; 3266 else if (ncp->nc_vp) 3267 ++gd->gd_nchstats->ncs_goodhits; 3268 else 3269 ++gd->gd_nchstats->ncs_neghits; 3270 nch.mount = mp; 3271 nch.ncp = ncp; 3272 _cache_mntref(nch.mount); 3273 3274 return(nch); 3275 } 3276 3277 /* 3278 * The namecache entry is marked as being used as a mount point. 
3279 * Locate the mount if it is visible to the caller. The DragonFly 3280 * mount system allows arbitrary loops in the topology and disentangles 3281 * those loops by matching against (mp, ncp) rather than just (ncp). 3282 * This means any given ncp can dive any number of mounts, depending 3283 * on the relative mount (e.g. nullfs) the caller is at in the topology. 3284 * 3285 * We use a very simple frontend cache to reduce SMP conflicts, 3286 * which we have to do because the mountlist scan needs an exclusive 3287 * lock around its ripout info list. Not to mention that there might 3288 * be a lot of mounts. 3289 * 3290 * Because all mounts can potentially be accessed by all cpus, break the cpu's 3291 * down a bit to allow some contention rather than making the cache 3292 * excessively huge. 3293 * 3294 * The hash table is split into per-cpu areas, is 4-way set-associative. 3295 */ 3296 struct findmount_info { 3297 struct mount *result; 3298 struct mount *nch_mount; 3299 struct namecache *nch_ncp; 3300 }; 3301 3302 static __inline 3303 struct ncmount_cache * 3304 ncmount_cache_lookup4(struct mount *mp, struct namecache *ncp) 3305 { 3306 uint32_t hash; 3307 3308 hash = iscsi_crc32(&mp, sizeof(mp)); 3309 hash = iscsi_crc32_ext(&ncp, sizeof(ncp), hash); 3310 hash ^= hash >> 16; 3311 hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1)); 3312 3313 return (&ncmount_cache[hash]); 3314 } 3315 3316 static 3317 struct ncmount_cache * 3318 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp) 3319 { 3320 struct ncmount_cache *ncc; 3321 struct ncmount_cache *best; 3322 int delta; 3323 int best_delta; 3324 int i; 3325 3326 ncc = ncmount_cache_lookup4(mp, ncp); 3327 3328 /* 3329 * NOTE: When checking for a ticks overflow implement a slop of 3330 * 2 ticks just to be safe, because ticks is accessed 3331 * non-atomically one CPU can increment it while another 3332 * is still using the old value. 3333 */ 3334 if (ncc->ncp == ncp && ncc->mp == mp) /* 0 */ 3335 return ncc; 3336 delta = (int)(ticks - ncc->ticks); /* beware GCC opts */ 3337 if (delta < -2) /* overflow reset */ 3338 ncc->ticks = ticks; 3339 best = ncc; 3340 best_delta = delta; 3341 3342 for (i = 1; i < NCMOUNT_SET; ++i) { /* 1, 2, 3 */ 3343 ++ncc; 3344 if (ncc->ncp == ncp && ncc->mp == mp) 3345 return ncc; 3346 delta = (int)(ticks - ncc->ticks); 3347 if (delta < -2) 3348 ncc->ticks = ticks; 3349 if (delta > best_delta) { 3350 best_delta = delta; 3351 best = ncc; 3352 } 3353 } 3354 return best; 3355 } 3356 3357 /* 3358 * pcpu-optimized mount search. Locate the recursive mountpoint, avoid 3359 * doing an expensive mountlist_scan*() if possible. 3360 * 3361 * (mp, ncp) -> mountonpt.k 3362 * 3363 * Returns a referenced mount pointer or NULL 3364 * 3365 * General SMP operation uses a per-cpu umount_spin to interlock unmount 3366 * operations (that is, where the mp_target can be freed out from under us). 3367 * 3368 * Lookups use the ncc->updating counter to validate the contents in order 3369 * to avoid having to obtain the per cache-element spin-lock. In addition, 3370 * the ticks field is only updated when it changes. However, if our per-cpu 3371 * lock fails due to an unmount-in-progress, we fall-back to the 3372 * cache-element's spin-lock. 
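 *
 * Illustrative use only (nch is assumed to reference a covered mount
 * point; a NULL return simply means no mount is stacked on that
 * (mp,ncp) pair):
 *
 *	struct mount *mp;
 *
 *	if ((mp = cache_findmount(&nch)) != NULL) {
 *		...			(mp is referenced)
 *		cache_dropmount(mp);
 *	}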
3373 */ 3374 struct mount * 3375 cache_findmount(struct nchandle *nch) 3376 { 3377 struct findmount_info info; 3378 struct ncmount_cache *ncc; 3379 struct ncmount_cache ncc_copy; 3380 struct mount *target; 3381 struct pcpu_ncache *pcpu; 3382 struct spinlock *spinlk; 3383 long update; 3384 3385 pcpu = pcpu_ncache; 3386 if (ncmount_cache_enable == 0 || pcpu == NULL) { 3387 ncc = NULL; 3388 goto skip; 3389 } 3390 pcpu += mycpu->gd_cpuid; 3391 3392 again: 3393 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3394 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3395 found: 3396 /* 3397 * This is a bit messy for now because we do not yet have 3398 * safe disposal of mount structures. We have to ref 3399 * ncc->mp_target but the 'update' counter only tell us 3400 * whether the cache has changed after the fact. 3401 * 3402 * For now get a per-cpu spinlock that will only contend 3403 * against umount's. This is the best path. If it fails, 3404 * instead of waiting on the umount we fall-back to a 3405 * shared ncc->spin lock, which will generally only cost a 3406 * cache ping-pong. 3407 */ 3408 update = ncc->updating; 3409 if (__predict_true(spin_trylock(&pcpu->umount_spin))) { 3410 spinlk = &pcpu->umount_spin; 3411 } else { 3412 spinlk = &ncc->spin; 3413 spin_lock_shared(spinlk); 3414 } 3415 if (update & 1) { /* update in progress */ 3416 spin_unlock_any(spinlk); 3417 goto skip; 3418 } 3419 ncc_copy = *ncc; 3420 cpu_lfence(); 3421 if (ncc->updating != update) { /* content changed */ 3422 spin_unlock_any(spinlk); 3423 goto again; 3424 } 3425 if (ncc_copy.ncp != nch->ncp || ncc_copy.mp != nch->mount) { 3426 spin_unlock_any(spinlk); 3427 goto again; 3428 } 3429 if (ncc_copy.isneg == 0) { 3430 target = ncc_copy.mp_target; 3431 if (target->mnt_ncmounton.mount == nch->mount && 3432 target->mnt_ncmounton.ncp == nch->ncp) { 3433 /* 3434 * Cache hit (positive) (avoid dirtying 3435 * the cache line if possible) 3436 */ 3437 if (ncc->ticks != (int)ticks) 3438 ncc->ticks = (int)ticks; 3439 _cache_mntref(target); 3440 } 3441 } else { 3442 /* 3443 * Cache hit (negative) (avoid dirtying 3444 * the cache line if possible) 3445 */ 3446 if (ncc->ticks != (int)ticks) 3447 ncc->ticks = (int)ticks; 3448 target = NULL; 3449 } 3450 spin_unlock_any(spinlk); 3451 3452 return target; 3453 } 3454 skip: 3455 3456 /* 3457 * Slow 3458 */ 3459 info.result = NULL; 3460 info.nch_mount = nch->mount; 3461 info.nch_ncp = nch->ncp; 3462 mountlist_scan(cache_findmount_callback, &info, 3463 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | MNTSCAN_NOUNLOCK); 3464 3465 /* 3466 * To reduce multi-re-entry on the cache, relookup in the cache. 3467 * This can still race, obviously, but that's ok. 3468 */ 3469 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3470 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3471 if (info.result) 3472 atomic_add_int(&info.result->mnt_refs, -1); 3473 goto found; 3474 } 3475 3476 /* 3477 * Cache the result. 
3478 */ 3479 if ((info.result == NULL || 3480 (info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0)) { 3481 spin_lock(&ncc->spin); 3482 ++ncc->updating; 3483 cpu_sfence(); 3484 KKASSERT(ncc->updating & 1); 3485 if (ncc->mp != nch->mount) { 3486 if (ncc->mp) 3487 atomic_add_int(&ncc->mp->mnt_refs, -1); 3488 atomic_add_int(&nch->mount->mnt_refs, 1); 3489 ncc->mp = nch->mount; 3490 } 3491 ncc->ncp = nch->ncp; /* ptr compares only, not refd*/ 3492 ncc->ticks = (int)ticks; 3493 3494 if (info.result) { 3495 ncc->isneg = 0; 3496 if (ncc->mp_target != info.result) { 3497 if (ncc->mp_target) 3498 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3499 ncc->mp_target = info.result; 3500 atomic_add_int(&info.result->mnt_refs, 1); 3501 } 3502 } else { 3503 ncc->isneg = 1; 3504 if (ncc->mp_target) { 3505 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3506 ncc->mp_target = NULL; 3507 } 3508 } 3509 cpu_sfence(); 3510 ++ncc->updating; 3511 spin_unlock(&ncc->spin); 3512 } 3513 return(info.result); 3514 } 3515 3516 static 3517 int 3518 cache_findmount_callback(struct mount *mp, void *data) 3519 { 3520 struct findmount_info *info = data; 3521 3522 /* 3523 * Check the mount's mounted-on point against the passed nch. 3524 */ 3525 if (mp->mnt_ncmounton.mount == info->nch_mount && 3526 mp->mnt_ncmounton.ncp == info->nch_ncp 3527 ) { 3528 info->result = mp; 3529 _cache_mntref(mp); 3530 return(-1); 3531 } 3532 return(0); 3533 } 3534 3535 void 3536 cache_dropmount(struct mount *mp) 3537 { 3538 _cache_mntrel(mp); 3539 } 3540 3541 /* 3542 * mp is being mounted, scrap entries matching mp->mnt_ncmounton (positive 3543 * or negative). 3544 * 3545 * A full scan is not required, but for now just do it anyway. 3546 */ 3547 void 3548 cache_ismounting(struct mount *mp) 3549 { 3550 struct ncmount_cache *ncc; 3551 struct mount *ncc_mp; 3552 int i; 3553 3554 if (pcpu_ncache == NULL) 3555 return; 3556 3557 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 3558 ncc = &ncmount_cache[i]; 3559 if (ncc->mp != mp->mnt_ncmounton.mount || 3560 ncc->ncp != mp->mnt_ncmounton.ncp) { 3561 continue; 3562 } 3563 spin_lock(&ncc->spin); 3564 ++ncc->updating; 3565 cpu_sfence(); 3566 KKASSERT(ncc->updating & 1); 3567 if (ncc->mp != mp->mnt_ncmounton.mount || 3568 ncc->ncp != mp->mnt_ncmounton.ncp) { 3569 cpu_sfence(); 3570 ++ncc->updating; 3571 spin_unlock(&ncc->spin); 3572 continue; 3573 } 3574 ncc_mp = ncc->mp; 3575 ncc->ncp = NULL; 3576 ncc->mp = NULL; 3577 if (ncc_mp) 3578 atomic_add_int(&ncc_mp->mnt_refs, -1); 3579 ncc_mp = ncc->mp_target; 3580 ncc->mp_target = NULL; 3581 if (ncc_mp) 3582 atomic_add_int(&ncc_mp->mnt_refs, -1); 3583 ncc->ticks = (int)ticks - hz * 120; 3584 3585 cpu_sfence(); 3586 ++ncc->updating; 3587 spin_unlock(&ncc->spin); 3588 } 3589 3590 /* 3591 * Pre-cache the mount point 3592 */ 3593 ncc = ncmount_cache_lookup(mp->mnt_ncmounton.mount, 3594 mp->mnt_ncmounton.ncp); 3595 3596 spin_lock(&ncc->spin); 3597 ++ncc->updating; 3598 cpu_sfence(); 3599 KKASSERT(ncc->updating & 1); 3600 3601 if (ncc->mp) 3602 atomic_add_int(&ncc->mp->mnt_refs, -1); 3603 atomic_add_int(&mp->mnt_ncmounton.mount->mnt_refs, 1); 3604 ncc->mp = mp->mnt_ncmounton.mount; 3605 ncc->ncp = mp->mnt_ncmounton.ncp; /* ptr compares only */ 3606 ncc->ticks = (int)ticks; 3607 3608 ncc->isneg = 0; 3609 if (ncc->mp_target != mp) { 3610 if (ncc->mp_target) 3611 atomic_add_int(&ncc->mp_target->mnt_refs, -1); 3612 ncc->mp_target = mp; 3613 atomic_add_int(&mp->mnt_refs, 1); 3614 } 3615 ++ncc->updating; 3616 cpu_sfence(); 3617 spin_unlock(&ncc->spin); 3618 } 3619 3620 /* 3621 * Scrap any 
ncmount_cache entries related to mp. Not only do we need to 3622 * scrap entries matching mp->mnt_ncmounton, but we also need to scrap any 3623 * negative hits involving (mp, <any>). 3624 * 3625 * A full scan is required. 3626 */ 3627 void 3628 cache_unmounting(struct mount *mp) 3629 { 3630 struct ncmount_cache *ncc; 3631 struct pcpu_ncache *pcpu; 3632 struct mount *ncc_mp; 3633 int i; 3634 3635 pcpu = pcpu_ncache; 3636 if (pcpu == NULL) 3637 return; 3638 3639 for (i = 0; i < ncpus; ++i) 3640 spin_lock(&pcpu[i].umount_spin); 3641 3642 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) { 3643 ncc = &ncmount_cache[i]; 3644 if (ncc->mp != mp && ncc->mp_target != mp) 3645 continue; 3646 spin_lock(&ncc->spin); 3647 ++ncc->updating; 3648 cpu_sfence(); 3649 3650 if (ncc->mp != mp && ncc->mp_target != mp) { 3651 ++ncc->updating; 3652 cpu_sfence(); 3653 spin_unlock(&ncc->spin); 3654 continue; 3655 } 3656 ncc_mp = ncc->mp; 3657 ncc->ncp = NULL; 3658 ncc->mp = NULL; 3659 if (ncc_mp) 3660 atomic_add_int(&ncc_mp->mnt_refs, -1); 3661 ncc_mp = ncc->mp_target; 3662 ncc->mp_target = NULL; 3663 if (ncc_mp) 3664 atomic_add_int(&ncc_mp->mnt_refs, -1); 3665 ncc->ticks = (int)ticks - hz * 120; 3666 3667 ++ncc->updating; 3668 cpu_sfence(); 3669 spin_unlock(&ncc->spin); 3670 } 3671 3672 for (i = 0; i < ncpus; ++i) 3673 spin_unlock(&pcpu[i].umount_spin); 3674 } 3675 3676 /* 3677 * Resolve an unresolved namecache entry, generally by looking it up. 3678 * The passed ncp must be locked and refd. 3679 * 3680 * Theoretically since a vnode cannot be recycled while held, and since 3681 * the nc_parent chain holds its vnode as long as children exist, the 3682 * direct parent of the cache entry we are trying to resolve should 3683 * have a valid vnode. If not then generate an error that we can 3684 * determine is related to a resolver bug. 3685 * 3686 * However, if a vnode was in the middle of a recyclement when the NCP 3687 * got locked, ncp->nc_vp might point to a vnode that is about to become 3688 * invalid. cache_resolve() handles this case by unresolving the entry 3689 * and then re-resolving it. 3690 * 3691 * Note that successful resolution does not necessarily return an error 3692 * code of 0. If the ncp resolves to a negative cache hit then ENOENT 3693 * will be returned. 3694 */ 3695 int 3696 cache_resolve(struct nchandle *nch, struct ucred *cred) 3697 { 3698 struct namecache *par_tmp; 3699 struct namecache *par; 3700 struct namecache *ncp; 3701 struct nchandle nctmp; 3702 struct mount *mp; 3703 struct vnode *dvp; 3704 int error; 3705 3706 ncp = nch->ncp; 3707 mp = nch->mount; 3708 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 3709 restart: 3710 /* 3711 * If the ncp is already resolved we have nothing to do. However, 3712 * we do want to guarentee that a usable vnode is returned when 3713 * a vnode is present, so make sure it hasn't been reclaimed. 3714 */ 3715 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3716 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 3717 _cache_setunresolved(ncp); 3718 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 3719 return (ncp->nc_error); 3720 } 3721 3722 /* 3723 * If the ncp was destroyed it will never resolve again. This 3724 * can basically only happen when someone is chdir'd into an 3725 * empty directory which is then rmdir'd. We want to catch this 3726 * here and not dive the VFS because the VFS might actually 3727 * have a way to re-resolve the disconnected ncp, which will 3728 * result in inconsistencies in the cdir/nch for proc->p_fd. 
3729 */ 3730 if (ncp->nc_flag & NCF_DESTROYED) 3731 return(EINVAL); 3732 3733 /* 3734 * Mount points need special handling because the parent does not 3735 * belong to the same filesystem as the ncp. 3736 */ 3737 if (ncp == mp->mnt_ncmountpt.ncp) 3738 return (cache_resolve_mp(mp)); 3739 3740 /* 3741 * We expect an unbroken chain of ncps to at least the mount point, 3742 * and even all the way to root (but this code doesn't have to go 3743 * past the mount point). 3744 */ 3745 if (ncp->nc_parent == NULL) { 3746 kprintf("EXDEV case 1 %p %*.*s\n", ncp, 3747 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 3748 ncp->nc_error = EXDEV; 3749 return(ncp->nc_error); 3750 } 3751 3752 /* 3753 * The vp's of the parent directories in the chain are held via vhold() 3754 * due to the existance of the child, and should not disappear. 3755 * However, there are cases where they can disappear: 3756 * 3757 * - due to filesystem I/O errors. 3758 * - due to NFS being stupid about tracking the namespace and 3759 * destroys the namespace for entire directories quite often. 3760 * - due to forced unmounts. 3761 * - due to an rmdir (parent will be marked DESTROYED) 3762 * 3763 * When this occurs we have to track the chain backwards and resolve 3764 * it, looping until the resolver catches up to the current node. We 3765 * could recurse here but we might run ourselves out of kernel stack 3766 * so we do it in a more painful manner. This situation really should 3767 * not occur all that often, or if it does not have to go back too 3768 * many nodes to resolve the ncp. 3769 */ 3770 while ((dvp = cache_dvpref(ncp)) == NULL) { 3771 /* 3772 * This case can occur if a process is CD'd into a 3773 * directory which is then rmdir'd. If the parent is marked 3774 * destroyed there is no point trying to resolve it. 3775 */ 3776 if (ncp->nc_parent->nc_flag & NCF_DESTROYED) 3777 return(ENOENT); 3778 par = ncp->nc_parent; 3779 _cache_hold(par); 3780 _cache_lock(par); 3781 while ((par_tmp = par->nc_parent) != NULL && 3782 par_tmp->nc_vp == NULL) { 3783 _cache_hold(par_tmp); 3784 _cache_lock(par_tmp); 3785 _cache_put(par); 3786 par = par_tmp; 3787 } 3788 if (par->nc_parent == NULL) { 3789 kprintf("EXDEV case 2 %*.*s\n", 3790 par->nc_nlen, par->nc_nlen, par->nc_name); 3791 _cache_put(par); 3792 return (EXDEV); 3793 } 3794 /* 3795 * The parent is not set in stone, ref and lock it to prevent 3796 * it from disappearing. Also note that due to renames it 3797 * is possible for our ncp to move and for par to no longer 3798 * be one of its parents. We resolve it anyway, the loop 3799 * will handle any moves. 
3800 */ 3801 _cache_get(par); /* additional hold/lock */ 3802 _cache_put(par); /* from earlier hold/lock */ 3803 if (par == nch->mount->mnt_ncmountpt.ncp) { 3804 cache_resolve_mp(nch->mount); 3805 } else if ((dvp = cache_dvpref(par)) == NULL) { 3806 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", 3807 par->nc_nlen, par->nc_nlen, par->nc_name); 3808 _cache_put(par); 3809 continue; 3810 } else { 3811 if (par->nc_flag & NCF_UNRESOLVED) { 3812 nctmp.mount = mp; 3813 nctmp.ncp = par; 3814 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 3815 } 3816 vrele(dvp); 3817 } 3818 if ((error = par->nc_error) != 0) { 3819 if (par->nc_error != EAGAIN) { 3820 kprintf("EXDEV case 3 %*.*s error %d\n", 3821 par->nc_nlen, par->nc_nlen, par->nc_name, 3822 par->nc_error); 3823 _cache_put(par); 3824 return(error); 3825 } 3826 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n", 3827 par, par->nc_nlen, par->nc_nlen, par->nc_name); 3828 } 3829 _cache_put(par); 3830 /* loop */ 3831 } 3832 3833 /* 3834 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected 3835 * ncp's and reattach them. If this occurs the original ncp is marked 3836 * EAGAIN to force a relookup. 3837 * 3838 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed 3839 * ncp must already be resolved. 3840 */ 3841 if (dvp) { 3842 nctmp.mount = mp; 3843 nctmp.ncp = ncp; 3844 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 3845 vrele(dvp); 3846 } else { 3847 ncp->nc_error = EPERM; 3848 } 3849 if (ncp->nc_error == EAGAIN) { 3850 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n", 3851 ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 3852 goto restart; 3853 } 3854 return(ncp->nc_error); 3855 } 3856 3857 /* 3858 * Resolve the ncp associated with a mount point. Such ncp's almost always 3859 * remain resolved and this routine is rarely called. NFS MPs tends to force 3860 * re-resolution more often due to its mac-truck-smash-the-namecache 3861 * method of tracking namespace changes. 3862 * 3863 * The semantics for this call is that the passed ncp must be locked on 3864 * entry and will be locked on return. However, if we actually have to 3865 * resolve the mount point we temporarily unlock the entry in order to 3866 * avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of 3867 * the unlock we have to recheck the flags after we relock. 3868 */ 3869 static int 3870 cache_resolve_mp(struct mount *mp) 3871 { 3872 struct namecache *ncp = mp->mnt_ncmountpt.ncp; 3873 struct vnode *vp; 3874 int error; 3875 3876 KKASSERT(mp != NULL); 3877 3878 /* 3879 * If the ncp is already resolved we have nothing to do. However, 3880 * we do want to guarentee that a usable vnode is returned when 3881 * a vnode is present, so make sure it hasn't been reclaimed. 3882 */ 3883 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3884 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 3885 _cache_setunresolved(ncp); 3886 } 3887 3888 if (ncp->nc_flag & NCF_UNRESOLVED) { 3889 _cache_unlock(ncp); 3890 while (vfs_busy(mp, 0)) 3891 ; 3892 error = VFS_ROOT(mp, &vp); 3893 _cache_lock(ncp); 3894 3895 /* 3896 * recheck the ncp state after relocking. 
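 * Another thread may have resolved or otherwise changed the entry while
 * it was unlocked across vfs_busy()/VFS_ROOT() above, so only install
 * the result if the entry is still unresolved.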
		 */
		if (ncp->nc_flag & NCF_UNRESOLVED) {
			ncp->nc_error = error;
			if (error == 0) {
				_cache_setvp(mp, ncp, vp);
				vput(vp);
			} else {
				kprintf("[diagnostic] cache_resolve_mp: failed"
					" to resolve mount %p err=%d ncp=%p\n",
					mp, error, ncp);
				_cache_setvp(mp, ncp, NULL);
			}
		} else if (error == 0) {
			vput(vp);
		}
		vfs_unbusy(mp);
	}
	return(ncp->nc_error);
}

/*
 * Clean out negative cache entries when too many have accumulated.
 */
static void
_cache_cleanneg(long count)
{
	struct pcpu_ncache *pn;
	struct namecache *ncp;
	static uint32_t neg_rover;
	uint32_t n;
	long vnegs;

	n = neg_rover++;	/* SMP heuristic, race ok */
	cpu_ccfence();
	n = n % (uint32_t)ncpus;

	/*
	 * Normalize vfscache_negs and count.  count is sometimes based
	 * on vfscache_negs.  vfscache_negs is heuristic and can sometimes
	 * have crazy values.  (A worked example of this normalization
	 * follows the function.)
	 */
	vnegs = vfscache_negs;
	cpu_ccfence();
	if (vnegs <= MINNEG)
		vnegs = MINNEG;
	if (count < 1)
		count = 1;

	pn = &pcpu_ncache[n];
	spin_lock(&pn->neg_spin);
	count = pn->neg_count * count / vnegs + 1;
	spin_unlock(&pn->neg_spin);

	/*
	 * Attempt to clean out the specified number of negative cache
	 * entries.
	 */
	while (count > 0) {
		spin_lock(&pn->neg_spin);
		ncp = TAILQ_FIRST(&pn->neg_list);
		if (ncp == NULL) {
			spin_unlock(&pn->neg_spin);
			break;
		}
		TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
		TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
		_cache_hold(ncp);
		spin_unlock(&pn->neg_spin);

		/*
		 * This can race, so we must re-check that the ncp
		 * is still a resolved negative entry on the per-cpu
		 * neg_list after successfully locking it.
		 */
		if (_cache_lock_special(ncp) == 0) {
			if (ncp->nc_vp == NULL &&
			    (ncp->nc_flag & NCF_UNRESOLVED) == 0) {
				cache_zap(ncp);
			} else {
				_cache_unlock(ncp);
				_cache_drop(ncp);
			}
		} else {
			_cache_drop(ncp);
		}
		--count;
	}
}
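
/*
 * Worked example of the normalization above (added for clarity; the
 * numbers are illustrative only).  Suppose the global estimate
 * vfscache_negs is 10000, the chosen cpu's pn->neg_count is 1000, and
 * the caller asked for count = 500 entries to be cleaned system-wide.
 * Then:
 *
 *	count = 1000 * 500 / 10000 + 1 = 51
 *
 * That is, this cpu cleans roughly its proportional share of the
 * request, with the "+ 1" guaranteeing forward progress even when the
 * per-cpu list holds only a tiny fraction of the total.
 */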

/*
 * Clean out positive cache entries when too many have accumulated.
 */
static void
_cache_cleanpos(long count)
{
	static volatile int rover;
	struct nchash_head *nchpp;
	struct namecache *ncp;
	int rover_copy;

	/*
	 * Attempt to clean out the specified number of positive cache
	 * entries.
	 */
	while (count > 0) {
		rover_copy = ++rover;	/* MPSAFEENOUGH */
		cpu_ccfence();
		nchpp = NCHHASH(rover_copy);

		if (TAILQ_FIRST(&nchpp->list) == NULL) {
			--count;
			continue;
		}

		/*
		 * Cycle ncp on list, ignore and do not move DUMMY
		 * ncps.  These are temporary list iterators.
		 *
		 * We must cycle the ncp to the end of the list to
		 * ensure that all ncp's have an equal chance of
		 * being removed.
		 */
		spin_lock(&nchpp->spin);
		ncp = TAILQ_FIRST(&nchpp->list);
		while (ncp && (ncp->nc_flag & NCF_DUMMY))
			ncp = TAILQ_NEXT(ncp, nc_hash);
		if (ncp) {
			TAILQ_REMOVE(&nchpp->list, ncp, nc_hash);
			TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash);
			_cache_hold(ncp);
		}
		spin_unlock(&nchpp->spin);

		if (ncp) {
			if (_cache_lock_special(ncp) == 0) {
				cache_zap(ncp);
			} else {
				_cache_drop(ncp);
			}
		}
		--count;
	}
}

/*
 * This is a kitchen sink function to clean out ncps which we
 * tried to zap from cache_drop() but failed because we were
 * unable to acquire the parent lock.
 *
 * Such entries can also be removed via cache_inval_vp(), such
 * as when unmounting.
 */
static void
_cache_cleandefered(void)
{
	struct nchash_head *nchpp;
	struct namecache *ncp;
	struct namecache dummy;
	int i;

	/*
	 * Create a list iterator.  DUMMY indicates that this is a list
	 * iterator, DESTROYED prevents matches by lookup functions.
	 * (A simplified sketch of this marker-based iteration follows
	 * the function.)
	 */
	numdefered = 0;
	pcpu_ncache[mycpu->gd_cpuid].numdefered = 0;
	bzero(&dummy, sizeof(dummy));
	dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY;
	dummy.nc_refs = 1;

	for (i = 0; i <= nchash; ++i) {
		nchpp = &nchashtbl[i];

		spin_lock(&nchpp->spin);
		TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
		ncp = &dummy;
		while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) {
			if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
				continue;
			TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
			TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash);
			_cache_hold(ncp);
			spin_unlock(&nchpp->spin);
			if (_cache_lock_nonblock(ncp) == 0) {
				ncp->nc_flag &= ~NCF_DEFEREDZAP;
				_cache_unlock(ncp);
			}
			_cache_drop(ncp);
			spin_lock(&nchpp->spin);
			ncp = &dummy;
		}
		TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
		spin_unlock(&nchpp->spin);
	}
}
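
/*
 * Simplified sketch (illustrative only, not compiled) of the DUMMY
 * marker technique used above.  A throw-away namecache node is inserted
 * into the hash chain purely as a cursor, so the chain spinlock can be
 * dropped while an interesting entry is processed and the scan can
 * resume from the marker afterwards:
 *
 *	TAILQ_INSERT_HEAD(&list, &dummy, nc_hash);
 *	ncp = &dummy;
 *	while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) {
 *		if (not interesting)
 *			continue;
 *		move &dummy to just after ncp;	(remember our position)
 *		drop the chain spinlock;
 *		process ncp;
 *		reacquire the chain spinlock;
 *		ncp = &dummy;			(resume after the marker)
 *	}
 *	TAILQ_REMOVE(&list, &dummy, nc_hash);
 *
 * The NCF_DUMMY flag tells other scanners (e.g. _cache_cleanpos()) to
 * skip these cursor nodes.
 */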

/*
 * Name cache initialization, from vfsinit() when we are booting
 */
void
nchinit(void)
{
	struct pcpu_ncache *pn;
	globaldata_t gd;
	int i;

	/*
	 * Per-cpu accounting and negative hit list
	 */
	pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus,
			      M_VFSCACHE, M_WAITOK|M_ZERO);
	for (i = 0; i < ncpus; ++i) {
		pn = &pcpu_ncache[i];
		TAILQ_INIT(&pn->neg_list);
		spin_init(&pn->neg_spin, "ncneg");
		spin_init(&pn->umount_spin, "ncumm");
	}

	/*
	 * Initialise per-cpu namecache effectiveness statistics.
	 */
	for (i = 0; i < ncpus; ++i) {
		gd = globaldata_find(i);
		gd->gd_nchstats = &nchstats[i];
	}

	/*
	 * Create a generous namecache hash table
	 */
	nchashtbl = hashinit_ext(vfs_inodehashsize(),
				 sizeof(struct nchash_head),
				 M_VFSCACHE, &nchash);
	for (i = 0; i <= (int)nchash; ++i) {
		TAILQ_INIT(&nchashtbl[i].list);
		spin_init(&nchashtbl[i].spin, "nchinit_hash");
	}
	for (i = 0; i < NCMOUNT_NUMCACHE; ++i)
		spin_init(&ncmount_cache[i].spin, "nchinit_cache");
	nclockwarn = 5 * hz;
}

/*
 * Called from start_init() to bootstrap the root filesystem.  Returns
 * a referenced, unlocked namecache record in *nch.
 */
void
cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
{
	nch->ncp = cache_alloc(0);
	nch->mount = mp;
	_cache_mntref(mp);
	if (vp)
		_cache_setvp(nch->mount, nch->ncp, vp);
}

/*
 * vfs_cache_setroot()
 *
 * Create an association between the root of our namecache and
 * the root vnode.  This routine may be called several times during
 * booting.
 *
 * If the caller intends to save the returned namecache pointer somewhere
 * it must cache_hold() it.
 */
void
vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
{
	struct vnode *ovp;
	struct nchandle onch;

	ovp = rootvnode;
	onch = rootnch;
	rootvnode = nvp;
	if (nch)
		rootnch = *nch;
	else
		cache_zero(&rootnch);
	if (ovp)
		vrele(ovp);
	if (onch.ncp)
		cache_drop(&onch);
}

/*
 * XXX OLD API COMPAT FUNCTION.  This really messes up the new namecache
 * topology and is being removed as quickly as possible.  The new VOP_N*()
 * API calls are required to make specific adjustments using the supplied
 * ncp pointers rather than just bogusly purging random vnodes.
 *
 * Invalidate all namecache entries to a particular vnode as well as
 * any direct children of that vnode in the namecache.  This is a
 * 'catch all' purge used by filesystems that do not know any better.
 *
 * Note that the linkage between the vnode and its namecache entries will
 * be removed, but the namecache entries themselves might stay put due to
 * active references from elsewhere in the system or due to the existence
 * of the children.  The namecache topology is left intact even if we do
 * not know what the vnode association is.  Such entries will be marked
 * NCF_UNRESOLVED.
 */
void
cache_purge(struct vnode *vp)
{
	cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
}

static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable getcwd");

static u_long numcwdcalls;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0,
    "Number of current directory resolution calls");
static u_long numcwdfailnf;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0,
    "Number of current directory failures due to lack of file");
static u_long numcwdfailsz;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0,
    "Number of current directory failures due to large result");
static u_long numcwdfound;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0,
    "Number of current directory resolution successes");

/*
 * MPALMOSTSAFE
 */
int
sys___getcwd(struct __getcwd_args *uap)
{
	u_int buflen;
	int error;
	char *buf;
	char *bp;

	if (disablecwd)
		return (ENODEV);

	buflen = uap->buflen;
	if (buflen == 0)
		return (EINVAL);
	if (buflen > MAXPATHLEN)
		buflen = MAXPATHLEN;

	buf = kmalloc(buflen, M_TEMP, M_WAITOK);
	bp = kern_getcwd(buf, buflen, &error);
	if (error == 0)
		error = copyout(bp, uap->buf, strlen(bp) + 1);
	kfree(buf, M_TEMP);
	return (error);
}

char *
kern_getcwd(char *buf, size_t buflen, int *error)
{
	struct proc *p = curproc;
	char *bp;
	int i, slash_prefixed;
	struct filedesc *fdp;
	struct nchandle nch;
	struct namecache *ncp;

	numcwdcalls++;
	bp = buf;
	bp += buflen - 1;
	*bp = '\0';
	fdp = p->p_fd;
	slash_prefixed = 0;

	nch = fdp->fd_ncdir;
	ncp = nch.ncp;
	if (ncp)
		_cache_hold(ncp);

	while (ncp && (ncp != fdp->fd_nrdir.ncp ||
	       nch.mount != fdp->fd_nrdir.mount)
	) {
		/*
		 * While traversing upwards, if we encounter the root
		 * of the current mount we have to skip to the mount point
		 * in the underlying filesystem.
		 */
		if (ncp == nch.mount->mnt_ncmountpt.ncp) {
			nch = nch.mount->mnt_ncmounton;
			_cache_drop(ncp);
			ncp = nch.ncp;
			if (ncp)
				_cache_hold(ncp);
			continue;
		}

		/*
		 * Prepend the path segment
		 */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				numcwdfailsz++;
				*error = ERANGE;
				bp = NULL;
				goto done;
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			numcwdfailsz++;
			*error = ERANGE;
			bp = NULL;
			goto done;
		}
		*--bp = '/';
		slash_prefixed = 1;

		/*
		 * Go up a directory.  This isn't a mount point so we don't
		 * have to check again.
		 */
		while ((nch.ncp = ncp->nc_parent) != NULL) {
			if (ncp_shared_lock_disable)
				_cache_lock(ncp);
			else
				_cache_lock_shared(ncp);
			if (nch.ncp != ncp->nc_parent) {
				_cache_unlock(ncp);
				continue;
			}
			_cache_hold(nch.ncp);
			_cache_unlock(ncp);
			break;
		}
		_cache_drop(ncp);
		ncp = nch.ncp;
	}
	if (ncp == NULL) {
		numcwdfailnf++;
		*error = ENOENT;
		bp = NULL;
		goto done;
	}
	if (!slash_prefixed) {
		if (bp == buf) {
			numcwdfailsz++;
			*error = ERANGE;
			bp = NULL;
			goto done;
		}
		*--bp = '/';
	}
	numcwdfound++;
	*error = 0;
done:
	if (ncp)
		_cache_drop(ncp);
	return (bp);
}
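
/*
 * Illustrative note (added for clarity, not part of the original code):
 * kern_getcwd() above and cache_fullpath() below both assemble the path
 * right-to-left.  The buffer pointer starts at the last byte and each
 * component is copied backwards in front of what has already been
 * emitted, e.g. for "/usr/local":
 *
 *	[.............\0]	bp at the terminating NUL
 *	[........local\0]	copy "local" backwards
 *	[......./local\0]	prepend '/'
 *	[....usr/local\0]	copy "usr" backwards
 *	[.../usr/local\0]	prepend '/'
 *
 * The pointer returned to the caller is bp, which points into the middle
 * of the allocated buffer, not at its start.
 */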

/*
 * Thus begins the fullpath magic.
 *
 * The passed nchp is referenced but not locked.
 */
static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
    &disablefullpath, 0,
    "Disable fullpath lookups");

int
cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
	       char **retbuf, char **freebuf, int guess)
{
	struct nchandle fd_nrdir;
	struct nchandle nch;
	struct namecache *ncp;
	struct mount *mp, *new_mp;
	char *bp, *buf;
	int slash_prefixed;
	int error = 0;
	int i;

	*retbuf = NULL;
	*freebuf = NULL;

	buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	bp = buf + MAXPATHLEN - 1;
	*bp = '\0';
	if (nchbase)
		fd_nrdir = *nchbase;
	else if (p != NULL)
		fd_nrdir = p->p_fd->fd_nrdir;
	else
		fd_nrdir = rootnch;
	slash_prefixed = 0;
	nch = *nchp;
	ncp = nch.ncp;
	if (ncp)
		_cache_hold(ncp);
	mp = nch.mount;

	while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
		new_mp = NULL;

		/*
		 * If we are asked to guess the upwards path, we do so
		 * whenever we encounter an ncp marked as a mountpoint.
		 * We try to find the actual mountpoint by locating the
		 * mount associated with this ncp.
		 */
		if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
			new_mp = mount_get_by_nc(ncp);
		}
		/*
		 * While traversing upwards, if we encounter the root
		 * of the current mount we have to skip to the mount point.
		 */
		if (ncp == mp->mnt_ncmountpt.ncp) {
			new_mp = mp;
		}
		if (new_mp) {
			nch = new_mp->mnt_ncmounton;
			_cache_drop(ncp);
			ncp = nch.ncp;
			if (ncp)
				_cache_hold(ncp);
			mp = nch.mount;
			continue;
		}

		/*
		 * Prepend the path segment
		 */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				kfree(buf, M_TEMP);
				error = ENOMEM;
				goto done;
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			kfree(buf, M_TEMP);
			error = ENOMEM;
			goto done;
		}
		*--bp = '/';
		slash_prefixed = 1;

		/*
		 * Go up a directory.  This isn't a mount point so we don't
		 * have to check again.
		 *
		 * We can only safely access nc_parent with ncp held locked.
		 */
		while ((nch.ncp = ncp->nc_parent) != NULL) {
			_cache_lock_shared(ncp);
			if (nch.ncp != ncp->nc_parent) {
				_cache_unlock(ncp);
				continue;
			}
			_cache_hold(nch.ncp);
			_cache_unlock(ncp);
			break;
		}
		_cache_drop(ncp);
		ncp = nch.ncp;
	}
	if (ncp == NULL) {
		kfree(buf, M_TEMP);
		error = ENOENT;
		goto done;
	}

	if (!slash_prefixed) {
		if (bp == buf) {
			kfree(buf, M_TEMP);
			error = ENOMEM;
			goto done;
		}
		*--bp = '/';
	}
	*retbuf = bp;
	*freebuf = buf;
	error = 0;
done:
	if (ncp)
		_cache_drop(ncp);
	return(error);
}

int
vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf,
	    char **freebuf, int guess)
{
	struct namecache *ncp;
	struct nchandle nch;
	int error;

	*freebuf = NULL;
	if (disablefullpath)
		return (ENODEV);

	if (p == NULL)
		return (EINVAL);

	/* If vn is NULL the client wants us to use p->p_textvp */
	if (vn == NULL) {
		if ((vn = p->p_textvp) == NULL)
			return (EINVAL);
	}
	spin_lock_shared(&vn->v_spin);
	TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
		if (ncp->nc_nlen)
			break;
	}
	if (ncp == NULL) {
		spin_unlock_shared(&vn->v_spin);
		return (EINVAL);
	}
	_cache_hold(ncp);
	spin_unlock_shared(&vn->v_spin);

	nch.ncp = ncp;
	nch.mount = vn->v_mount;
	error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
	_cache_drop(ncp);
	return (error);
}
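
/*
 * Example (hypothetical sketch, not part of this file): an in-kernel
 * consumer wanting the textual path of a process image would typically
 * call vn_fullpath() with vn == NULL so that p->p_textvp is used, and
 * must free the returned buffer when done:
 *
 *	char *retbuf, *freebuf;
 *
 *	if (vn_fullpath(p, NULL, &retbuf, &freebuf, 0) == 0) {
 *		kprintf("text path: %s\n", retbuf);
 *		kfree(freebuf, M_TEMP);
 *	}
 *
 * retbuf points into freebuf, so only freebuf is passed to kfree().
 */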

void
vfscache_rollup_cpu(struct globaldata *gd)
{
	struct pcpu_ncache *pn;
	long count;

	if (pcpu_ncache == NULL)
		return;
	pn = &pcpu_ncache[gd->gd_cpuid];

	if (pn->vfscache_count) {
		count = atomic_swap_long(&pn->vfscache_count, 0);
		atomic_add_long(&vfscache_count, count);
	}
	if (pn->vfscache_leafs) {
		count = atomic_swap_long(&pn->vfscache_leafs, 0);
		atomic_add_long(&vfscache_leafs, count);
	}
	if (pn->vfscache_negs) {
		count = atomic_swap_long(&pn->vfscache_negs, 0);
		atomic_add_long(&vfscache_negs, count);
	}
	if (pn->numdefered) {
		count = atomic_swap_long(&pn->numdefered, 0);
		atomic_add_long(&numdefered, count);
	}
}
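
/*
 * Illustrative sketch (hypothetical, not part of this file): because the
 * statistics above are accumulated per-cpu and only folded into the
 * globals by vfscache_rollup_cpu(), a consumer wanting reasonably
 * current totals would first roll up every cpu's deltas:
 *
 *	int i;
 *
 *	for (i = 0; i < ncpus; ++i)
 *		vfscache_rollup_cpu(globaldata_find(i));
 *
 * and only then read vfscache_count / vfscache_leafs / vfscache_negs.
 * The swap-then-add pattern (atomic_swap_long() followed by
 * atomic_add_long()) lets each cpu keep updating its local counter
 * without ever losing a delta during the rollup.
 */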