1 /* 2 * Copyright (c) 2003,2004,2009 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1989, 1993, 1995 35 * The Regents of the University of California. All rights reserved. 36 * 37 * This code is derived from software contributed to Berkeley by 38 * Poul-Henning Kamp of the FreeBSD Project. 39 * 40 * Redistribution and use in source and binary forms, with or without 41 * modification, are permitted provided that the following conditions 42 * are met: 43 * 1. Redistributions of source code must retain the above copyright 44 * notice, this list of conditions and the following disclaimer. 45 * 2. Redistributions in binary form must reproduce the above copyright 46 * notice, this list of conditions and the following disclaimer in the 47 * documentation and/or other materials provided with the distribution. 48 * 3. All advertising materials mentioning features or use of this software 49 * must display the following acknowledgement: 50 * This product includes software developed by the University of 51 * California, Berkeley and its contributors. 52 * 4. Neither the name of the University nor the names of its contributors 53 * may be used to endorse or promote products derived from this software 54 * without specific prior written permission. 55 * 56 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 57 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 58 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 59 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 60 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 61 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 62 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 63 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 64 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 65 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 66 * SUCH DAMAGE. 67 */ 68 69 #include <sys/param.h> 70 #include <sys/systm.h> 71 #include <sys/kernel.h> 72 #include <sys/sysctl.h> 73 #include <sys/mount.h> 74 #include <sys/vnode.h> 75 #include <sys/malloc.h> 76 #include <sys/sysproto.h> 77 #include <sys/spinlock.h> 78 #include <sys/proc.h> 79 #include <sys/namei.h> 80 #include <sys/nlookup.h> 81 #include <sys/filedesc.h> 82 #include <sys/fnv_hash.h> 83 #include <sys/globaldata.h> 84 #include <sys/kern_syscall.h> 85 #include <sys/dirent.h> 86 #include <ddb/ddb.h> 87 88 #include <sys/sysref2.h> 89 #include <sys/spinlock2.h> 90 #include <sys/mplock2.h> 91 92 #define MAX_RECURSION_DEPTH 64 93 94 /* 95 * Random lookups in the cache are accomplished with a hash table using 96 * a hash key of (nc_src_vp, name). Each hash chain has its own spin lock. 97 * 98 * Negative entries may exist and correspond to resolved namecache 99 * structures where nc_vp is NULL. In a negative entry, NCF_WHITEOUT 100 * will be set if the entry corresponds to a whited-out directory entry 101 * (verses simply not finding the entry at all). ncneglist is locked 102 * with a global spinlock (ncspin). 103 * 104 * MPSAFE RULES: 105 * 106 * (1) A ncp must be referenced before it can be locked. 107 * 108 * (2) A ncp must be locked in order to modify it. 109 * 110 * (3) ncp locks are always ordered child -> parent. That may seem 111 * backwards but forward scans use the hash table and thus can hold 112 * the parent unlocked when traversing downward. 113 * 114 * This allows insert/rename/delete/dot-dot and other operations 115 * to use ncp->nc_parent links. 116 * 117 * This also prevents a locked up e.g. NFS node from creating a 118 * chain reaction all the way back to the root vnode / namecache. 119 * 120 * (4) parent linkages require both the parent and child to be locked. 121 */ 122 123 /* 124 * Structures associated with name cacheing. 125 */ 126 #define NCHHASH(hash) (&nchashtbl[(hash) & nchash]) 127 #define MINNEG 1024 128 #define MINPOS 1024 129 #define NCMOUNT_NUMCACHE 1009 /* prime number */ 130 131 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); 132 133 LIST_HEAD(nchash_list, namecache); 134 135 struct nchash_head { 136 struct nchash_list list; 137 struct spinlock spin; 138 }; 139 140 struct ncmount_cache { 141 struct spinlock spin; 142 struct namecache *ncp; 143 struct mount *mp; 144 int isneg; /* if != 0 mp is originator and not target */ 145 }; 146 147 static struct nchash_head *nchashtbl; 148 static struct namecache_list ncneglist; 149 static struct spinlock ncspin; 150 static struct ncmount_cache ncmount_cache[NCMOUNT_NUMCACHE]; 151 152 /* 153 * ncvp_debug - debug cache_fromvp(). This is used by the NFS server 154 * to create the namecache infrastructure leading to a dangling vnode. 
155 * 156 * 0 Only errors are reported 157 * 1 Successes are reported 158 * 2 Successes + the whole directory scan is reported 159 * 3 Force the directory scan code run as if the parent vnode did not 160 * have a namecache record, even if it does have one. 161 */ 162 static int ncvp_debug; 163 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0, 164 "Namecache debug level (0-3)"); 165 166 static u_long nchash; /* size of hash table */ 167 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, 168 "Size of namecache hash table"); 169 170 static int ncnegflush = 10; /* burst for negative flush */ 171 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0, 172 "Batch flush negative entries"); 173 174 static int ncposflush = 10; /* burst for positive flush */ 175 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0, 176 "Batch flush positive entries"); 177 178 static int ncnegfactor = 16; /* ratio of negative entries */ 179 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, 180 "Ratio of namecache negative entries"); 181 182 static int nclockwarn; /* warn on locked entries in ticks */ 183 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0, 184 "Warn on locked namecache entries in ticks"); 185 186 static int numdefered; /* number of cache entries allocated */ 187 SYSCTL_INT(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0, 188 "Number of cache entries allocated"); 189 190 static int ncposlimit; /* number of cache entries allocated */ 191 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0, 192 "Number of cache entries allocated"); 193 194 static int ncp_shared_lock_disable = 1; 195 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW, 196 &ncp_shared_lock_disable, 0, "Disable shared namecache locks"); 197 198 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), 199 "sizeof(struct vnode)"); 200 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), 201 "sizeof(struct namecache)"); 202 203 static int ncmount_cache_enable = 1; 204 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW, 205 &ncmount_cache_enable, 0, "mount point cache"); 206 static long ncmount_cache_hit; 207 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_hit, CTLFLAG_RW, 208 &ncmount_cache_hit, 0, "mpcache hits"); 209 static long ncmount_cache_miss; 210 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_miss, CTLFLAG_RW, 211 &ncmount_cache_miss, 0, "mpcache misses"); 212 static long ncmount_cache_overwrite; 213 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_overwrite, CTLFLAG_RW, 214 &ncmount_cache_overwrite, 0, "mpcache entry overwrites"); 215 216 static int cache_resolve_mp(struct mount *mp); 217 static struct vnode *cache_dvpref(struct namecache *ncp); 218 static void _cache_lock(struct namecache *ncp); 219 static void _cache_setunresolved(struct namecache *ncp); 220 static void _cache_cleanneg(int count); 221 static void _cache_cleanpos(int count); 222 static void _cache_cleandefered(void); 223 static void _cache_unlink(struct namecache *ncp); 224 225 /* 226 * The new name cache statistics 227 */ 228 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); 229 static int numneg; 230 SYSCTL_INT(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, 231 "Number of negative namecache entries"); 232 static int numcache; 233 SYSCTL_INT(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, 234 "Number of namecaches entries"); 235 static u_long 
numcalls; 236 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcalls, CTLFLAG_RD, &numcalls, 0, 237 "Number of namecache lookups"); 238 static u_long numchecks; 239 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numchecks, CTLFLAG_RD, &numchecks, 0, 240 "Number of checked entries in namecache lookups"); 241 242 struct nchstats nchstats[SMP_MAXCPU]; 243 /* 244 * Export VFS cache effectiveness statistics to user-land. 245 * 246 * The statistics are left for aggregation to user-land so 247 * neat things can be achieved, like observing per-CPU cache 248 * distribution. 249 */ 250 static int 251 sysctl_nchstats(SYSCTL_HANDLER_ARGS) 252 { 253 struct globaldata *gd; 254 int i, error; 255 256 error = 0; 257 for (i = 0; i < ncpus; ++i) { 258 gd = globaldata_find(i); 259 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats), 260 sizeof(struct nchstats)))) 261 break; 262 } 263 264 return (error); 265 } 266 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD, 267 0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics"); 268 269 static struct namecache *cache_zap(struct namecache *ncp, int nonblock); 270 271 /* 272 * Namespace locking. The caller must already hold a reference to the 273 * namecache structure in order to lock/unlock it. This function prevents 274 * the namespace from being created or destroyed by accessors other then 275 * the lock holder. 276 * 277 * Note that holding a locked namecache structure prevents other threads 278 * from making namespace changes (e.g. deleting or creating), prevents 279 * vnode association state changes by other threads, and prevents the 280 * namecache entry from being resolved or unresolved by other threads. 281 * 282 * An exclusive lock owner has full authority to associate/disassociate 283 * vnodes and resolve/unresolve the locked ncp. 284 * 285 * A shared lock owner only has authority to acquire the underlying vnode, 286 * if any. 287 * 288 * The primary lock field is nc_lockstatus. nc_locktd is set after the 289 * fact (when locking) or cleared prior to unlocking. 290 * 291 * WARNING! Holding a locked ncp will prevent a vnode from being destroyed 292 * or recycled, but it does NOT help you if the vnode had already 293 * initiated a recyclement. If this is important, use cache_get() 294 * rather then cache_lock() (and deal with the differences in the 295 * way the refs counter is handled). Or, alternatively, make an 296 * unconditional call to cache_validate() or cache_resolve() 297 * after cache_lock() returns. 298 */ 299 static 300 void 301 _cache_lock(struct namecache *ncp) 302 { 303 thread_t td; 304 int didwarn; 305 int error; 306 u_int count; 307 308 KKASSERT(ncp->nc_refs != 0); 309 didwarn = 0; 310 td = curthread; 311 312 for (;;) { 313 count = ncp->nc_lockstatus; 314 cpu_ccfence(); 315 316 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) { 317 if (atomic_cmpset_int(&ncp->nc_lockstatus, 318 count, count + 1)) { 319 /* 320 * The vp associated with a locked ncp must 321 * be held to prevent it from being recycled. 322 * 323 * WARNING! If VRECLAIMED is set the vnode 324 * could already be in the middle of a recycle. 325 * Callers must use cache_vref() or 326 * cache_vget() on the locked ncp to 327 * validate the vp or set the cache entry 328 * to unresolved. 329 * 330 * NOTE! vhold() is allowed if we hold a 331 * lock on the ncp (which we do). 
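 *
 *		As a caller-side illustration of the WARNING above (a
 *		sketch, not code taken from this file), a caller that
 *		needs a usable vnode typically re-validates through the
 *		nchandle layer once the lock is obtained:
 *
 *			cache_lock(&nch);
 *			error = cache_vget(&nch, cred, LK_SHARED, &vp);
 *			if (error == 0) {
 *				...use the locked, referenced vp...;
 *				vput(vp);
 *			}
 *			cache_unlock(&nch);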
332 */ 333 ncp->nc_locktd = td; 334 if (ncp->nc_vp) 335 vhold(ncp->nc_vp); 336 break; 337 } 338 /* cmpset failed */ 339 continue; 340 } 341 if (ncp->nc_locktd == td) { 342 KKASSERT((count & NC_SHLOCK_FLAG) == 0); 343 if (atomic_cmpset_int(&ncp->nc_lockstatus, 344 count, count + 1)) { 345 break; 346 } 347 /* cmpset failed */ 348 continue; 349 } 350 tsleep_interlock(&ncp->nc_locktd, 0); 351 if (atomic_cmpset_int(&ncp->nc_lockstatus, count, 352 count | NC_EXLOCK_REQ) == 0) { 353 /* cmpset failed */ 354 continue; 355 } 356 error = tsleep(&ncp->nc_locktd, PINTERLOCKED, 357 "clock", nclockwarn); 358 if (error == EWOULDBLOCK) { 359 if (didwarn == 0) { 360 didwarn = ticks; 361 kprintf("[diagnostic] cache_lock: " 362 "blocked on %p %08x", 363 ncp, count); 364 kprintf(" \"%*.*s\"\n", 365 ncp->nc_nlen, ncp->nc_nlen, 366 ncp->nc_name); 367 } 368 } 369 /* loop */ 370 } 371 if (didwarn) { 372 kprintf("[diagnostic] cache_lock: unblocked %*.*s after " 373 "%d secs\n", 374 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 375 (int)(ticks - didwarn) / hz); 376 } 377 } 378 379 /* 380 * The shared lock works similarly to the exclusive lock except 381 * nc_locktd is left NULL and we need an interlock (VHOLD) to 382 * prevent vhold() races, since the moment our cmpset_int succeeds 383 * another cpu can come in and get its own shared lock. 384 * 385 * A critical section is needed to prevent interruption during the 386 * VHOLD interlock. 387 */ 388 static 389 void 390 _cache_lock_shared(struct namecache *ncp) 391 { 392 int didwarn; 393 int error; 394 u_int count; 395 396 KKASSERT(ncp->nc_refs != 0); 397 didwarn = 0; 398 399 for (;;) { 400 count = ncp->nc_lockstatus; 401 cpu_ccfence(); 402 403 if ((count & ~NC_SHLOCK_REQ) == 0) { 404 crit_enter(); 405 if (atomic_cmpset_int(&ncp->nc_lockstatus, 406 count, 407 (count + 1) | NC_SHLOCK_FLAG | 408 NC_SHLOCK_VHOLD)) { 409 /* 410 * The vp associated with a locked ncp must 411 * be held to prevent it from being recycled. 412 * 413 * WARNING! If VRECLAIMED is set the vnode 414 * could already be in the middle of a recycle. 415 * Callers must use cache_vref() or 416 * cache_vget() on the locked ncp to 417 * validate the vp or set the cache entry 418 * to unresolved. 419 * 420 * NOTE! vhold() is allowed if we hold a 421 * lock on the ncp (which we do). 422 */ 423 if (ncp->nc_vp) 424 vhold(ncp->nc_vp); 425 atomic_clear_int(&ncp->nc_lockstatus, 426 NC_SHLOCK_VHOLD); 427 crit_exit(); 428 break; 429 } 430 /* cmpset failed */ 431 crit_exit(); 432 continue; 433 } 434 435 /* 436 * If already held shared we can just bump the count, but 437 * only allow this if nobody is trying to get the lock 438 * exclusively. 439 * 440 * VHOLD is a bit of a hack. Even though we successfully 441 * added another shared ref, the cpu that got the first 442 * shared ref might not yet have held the vnode. 
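 *
 * To close that window the code below spins until NC_SHLOCK_VHOLD
 * clears (the cpu_pause() loop) before returning with the extra
 * shared reference, so the vnode is guaranteed to be held by the
 * time any shared holder proceeds.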
443 */ 444 if ((count & (NC_EXLOCK_REQ|NC_SHLOCK_FLAG)) == 445 NC_SHLOCK_FLAG) { 446 KKASSERT((count & ~(NC_EXLOCK_REQ | 447 NC_SHLOCK_REQ | 448 NC_SHLOCK_FLAG)) > 0); 449 if (atomic_cmpset_int(&ncp->nc_lockstatus, 450 count, count + 1)) { 451 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD) 452 cpu_pause(); 453 break; 454 } 455 continue; 456 } 457 tsleep_interlock(ncp, 0); 458 if (atomic_cmpset_int(&ncp->nc_lockstatus, count, 459 count | NC_SHLOCK_REQ) == 0) { 460 /* cmpset failed */ 461 continue; 462 } 463 error = tsleep(ncp, PINTERLOCKED, "clocksh", nclockwarn); 464 if (error == EWOULDBLOCK) { 465 if (didwarn == 0) { 466 didwarn = ticks; 467 kprintf("[diagnostic] cache_lock_shared: " 468 "blocked on %p %08x", 469 ncp, count); 470 kprintf(" \"%*.*s\"\n", 471 ncp->nc_nlen, ncp->nc_nlen, 472 ncp->nc_name); 473 } 474 } 475 /* loop */ 476 } 477 if (didwarn) { 478 kprintf("[diagnostic] cache_lock_shared: " 479 "unblocked %*.*s after %d secs\n", 480 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, 481 (int)(ticks - didwarn) / hz); 482 } 483 } 484 485 /* 486 * NOTE: nc_refs may be zero if the ncp is interlocked by circumstance, 487 * such as the case where one of its children is locked. 488 */ 489 static 490 int 491 _cache_lock_nonblock(struct namecache *ncp) 492 { 493 thread_t td; 494 u_int count; 495 496 td = curthread; 497 498 for (;;) { 499 count = ncp->nc_lockstatus; 500 501 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) { 502 if (atomic_cmpset_int(&ncp->nc_lockstatus, 503 count, count + 1)) { 504 /* 505 * The vp associated with a locked ncp must 506 * be held to prevent it from being recycled. 507 * 508 * WARNING! If VRECLAIMED is set the vnode 509 * could already be in the middle of a recycle. 510 * Callers must use cache_vref() or 511 * cache_vget() on the locked ncp to 512 * validate the vp or set the cache entry 513 * to unresolved. 514 * 515 * NOTE! vhold() is allowed if we hold a 516 * lock on the ncp (which we do). 517 */ 518 ncp->nc_locktd = td; 519 if (ncp->nc_vp) 520 vhold(ncp->nc_vp); 521 break; 522 } 523 /* cmpset failed */ 524 continue; 525 } 526 if (ncp->nc_locktd == td) { 527 if (atomic_cmpset_int(&ncp->nc_lockstatus, 528 count, count + 1)) { 529 break; 530 } 531 /* cmpset failed */ 532 continue; 533 } 534 return(EWOULDBLOCK); 535 } 536 return(0); 537 } 538 539 /* 540 * The shared lock works similarly to the exclusive lock except 541 * nc_locktd is left NULL and we need an interlock (VHOLD) to 542 * prevent vhold() races, since the moment our cmpset_int succeeds 543 * another cpu can come in and get its own shared lock. 544 * 545 * A critical section is needed to prevent interruption during the 546 * VHOLD interlock. 547 */ 548 static 549 int 550 _cache_lock_shared_nonblock(struct namecache *ncp) 551 { 552 u_int count; 553 554 for (;;) { 555 count = ncp->nc_lockstatus; 556 557 if ((count & ~NC_SHLOCK_REQ) == 0) { 558 crit_enter(); 559 if (atomic_cmpset_int(&ncp->nc_lockstatus, 560 count, 561 (count + 1) | NC_SHLOCK_FLAG | 562 NC_SHLOCK_VHOLD)) { 563 /* 564 * The vp associated with a locked ncp must 565 * be held to prevent it from being recycled. 566 * 567 * WARNING! If VRECLAIMED is set the vnode 568 * could already be in the middle of a recycle. 569 * Callers must use cache_vref() or 570 * cache_vget() on the locked ncp to 571 * validate the vp or set the cache entry 572 * to unresolved. 573 * 574 * NOTE! vhold() is allowed if we hold a 575 * lock on the ncp (which we do). 
576 */ 577 if (ncp->nc_vp) 578 vhold(ncp->nc_vp); 579 atomic_clear_int(&ncp->nc_lockstatus, 580 NC_SHLOCK_VHOLD); 581 crit_exit(); 582 break; 583 } 584 /* cmpset failed */ 585 crit_exit(); 586 continue; 587 } 588 589 /* 590 * If already held shared we can just bump the count, but 591 * only allow this if nobody is trying to get the lock 592 * exclusively. 593 * 594 * VHOLD is a bit of a hack. Even though we successfully 595 * added another shared ref, the cpu that got the first 596 * shared ref might not yet have held the vnode. 597 */ 598 if ((count & (NC_EXLOCK_REQ|NC_SHLOCK_FLAG)) == 599 NC_SHLOCK_FLAG) { 600 KKASSERT((count & ~(NC_EXLOCK_REQ | 601 NC_SHLOCK_REQ | 602 NC_SHLOCK_FLAG)) > 0); 603 if (atomic_cmpset_int(&ncp->nc_lockstatus, 604 count, count + 1)) { 605 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD) 606 cpu_pause(); 607 break; 608 } 609 continue; 610 } 611 return(EWOULDBLOCK); 612 } 613 return(0); 614 } 615 616 /* 617 * Helper function 618 * 619 * NOTE: nc_refs can be 0 (degenerate case during _cache_drop). 620 * 621 * nc_locktd must be NULLed out prior to nc_lockstatus getting cleared. 622 */ 623 static 624 void 625 _cache_unlock(struct namecache *ncp) 626 { 627 thread_t td __debugvar = curthread; 628 u_int count; 629 u_int ncount; 630 struct vnode *dropvp; 631 632 KKASSERT(ncp->nc_refs >= 0); 633 KKASSERT((ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) > 0); 634 KKASSERT((ncp->nc_lockstatus & NC_SHLOCK_FLAG) || ncp->nc_locktd == td); 635 636 count = ncp->nc_lockstatus; 637 cpu_ccfence(); 638 639 /* 640 * Clear nc_locktd prior to the atomic op (excl lock only) 641 */ 642 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1) 643 ncp->nc_locktd = NULL; 644 dropvp = NULL; 645 646 for (;;) { 647 if ((count & 648 ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ|NC_SHLOCK_FLAG)) == 1) { 649 dropvp = ncp->nc_vp; 650 if (count & NC_EXLOCK_REQ) 651 ncount = count & NC_SHLOCK_REQ; /* cnt->0 */ 652 else 653 ncount = 0; 654 655 if (atomic_cmpset_int(&ncp->nc_lockstatus, 656 count, ncount)) { 657 if (count & NC_EXLOCK_REQ) 658 wakeup(&ncp->nc_locktd); 659 else if (count & NC_SHLOCK_REQ) 660 wakeup(ncp); 661 break; 662 } 663 dropvp = NULL; 664 } else { 665 KKASSERT((count & NC_SHLOCK_VHOLD) == 0); 666 KKASSERT((count & ~(NC_EXLOCK_REQ | 667 NC_SHLOCK_REQ | 668 NC_SHLOCK_FLAG)) > 1); 669 if (atomic_cmpset_int(&ncp->nc_lockstatus, 670 count, count - 1)) { 671 break; 672 } 673 } 674 count = ncp->nc_lockstatus; 675 cpu_ccfence(); 676 } 677 678 /* 679 * Don't actually drop the vp until we successfully clean out 680 * the lock, otherwise we may race another shared lock. 681 */ 682 if (dropvp) 683 vdrop(dropvp); 684 } 685 686 static 687 int 688 _cache_lockstatus(struct namecache *ncp) 689 { 690 if (ncp->nc_locktd == curthread) 691 return(LK_EXCLUSIVE); 692 if (ncp->nc_lockstatus & NC_SHLOCK_FLAG) 693 return(LK_SHARED); 694 return(-1); 695 } 696 697 /* 698 * cache_hold() and cache_drop() prevent the premature deletion of a 699 * namecache entry but do not prevent operations (such as zapping) on 700 * that namecache entry. 701 * 702 * This routine may only be called from outside this source module if 703 * nc_refs is already at least 1. 704 * 705 * This is a rare case where callers are allowed to hold a spinlock, 706 * so we can't ourselves. 707 */ 708 static __inline 709 struct namecache * 710 _cache_hold(struct namecache *ncp) 711 { 712 atomic_add_int(&ncp->nc_refs, 1); 713 return(ncp); 714 } 715 716 /* 717 * Drop a cache entry, taking care to deal with races. 
718 * 719 * For potential 1->0 transitions we must hold the ncp lock to safely 720 * test its flags. An unresolved entry with no children must be zapped 721 * to avoid leaks. 722 * 723 * The call to cache_zap() itself will handle all remaining races and 724 * will decrement the ncp's refs regardless. If we are resolved or 725 * have children nc_refs can safely be dropped to 0 without having to 726 * zap the entry. 727 * 728 * NOTE: cache_zap() will re-check nc_refs and nc_list in a MPSAFE fashion. 729 * 730 * NOTE: cache_zap() may return a non-NULL referenced parent which must 731 * be dropped in a loop. 732 */ 733 static __inline 734 void 735 _cache_drop(struct namecache *ncp) 736 { 737 int refs; 738 739 while (ncp) { 740 KKASSERT(ncp->nc_refs > 0); 741 refs = ncp->nc_refs; 742 743 if (refs == 1) { 744 if (_cache_lock_nonblock(ncp) == 0) { 745 ncp->nc_flag &= ~NCF_DEFEREDZAP; 746 if ((ncp->nc_flag & NCF_UNRESOLVED) && 747 TAILQ_EMPTY(&ncp->nc_list)) { 748 ncp = cache_zap(ncp, 1); 749 continue; 750 } 751 if (atomic_cmpset_int(&ncp->nc_refs, 1, 0)) { 752 _cache_unlock(ncp); 753 break; 754 } 755 _cache_unlock(ncp); 756 } 757 } else { 758 if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1)) 759 break; 760 } 761 cpu_pause(); 762 } 763 } 764 765 /* 766 * Link a new namecache entry to its parent and to the hash table. Be 767 * careful to avoid races if vhold() blocks in the future. 768 * 769 * Both ncp and par must be referenced and locked. 770 * 771 * NOTE: The hash table spinlock is held during this call, we can't do 772 * anything fancy. 773 */ 774 static void 775 _cache_link_parent(struct namecache *ncp, struct namecache *par, 776 struct nchash_head *nchpp) 777 { 778 KKASSERT(ncp->nc_parent == NULL); 779 ncp->nc_parent = par; 780 ncp->nc_head = nchpp; 781 782 /* 783 * Set inheritance flags. Note that the parent flags may be 784 * stale due to getattr potentially not having been run yet 785 * (it gets run during nlookup()'s). 786 */ 787 ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE); 788 if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE)) 789 ncp->nc_flag |= NCF_SF_PNOCACHE; 790 if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE)) 791 ncp->nc_flag |= NCF_UF_PCACHE; 792 793 LIST_INSERT_HEAD(&nchpp->list, ncp, nc_hash); 794 795 if (TAILQ_EMPTY(&par->nc_list)) { 796 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry); 797 /* 798 * Any vp associated with an ncp which has children must 799 * be held to prevent it from being recycled. 800 */ 801 if (par->nc_vp) 802 vhold(par->nc_vp); 803 } else { 804 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry); 805 } 806 } 807 808 /* 809 * Remove the parent and hash associations from a namecache structure. 810 * If this is the last child of the parent the cache_drop(par) will 811 * attempt to recursively zap the parent. 812 * 813 * ncp must be locked. This routine will acquire a temporary lock on 814 * the parent as wlel as the appropriate hash chain. 
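 *
 * Lock ordering here follows MPSAFE rule (3) above: the child ncp is
 * already locked by the caller and the parent is locked second
 * (child -> parent), with the hash chain spinlock taken innermost.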
815 */ 816 static void 817 _cache_unlink_parent(struct namecache *ncp) 818 { 819 struct namecache *par; 820 struct vnode *dropvp; 821 822 if ((par = ncp->nc_parent) != NULL) { 823 KKASSERT(ncp->nc_parent == par); 824 _cache_hold(par); 825 _cache_lock(par); 826 spin_lock(&ncp->nc_head->spin); 827 LIST_REMOVE(ncp, nc_hash); 828 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 829 dropvp = NULL; 830 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list)) 831 dropvp = par->nc_vp; 832 spin_unlock(&ncp->nc_head->spin); 833 ncp->nc_parent = NULL; 834 ncp->nc_head = NULL; 835 _cache_unlock(par); 836 _cache_drop(par); 837 838 /* 839 * We can only safely vdrop with no spinlocks held. 840 */ 841 if (dropvp) 842 vdrop(dropvp); 843 } 844 } 845 846 /* 847 * Allocate a new namecache structure. Most of the code does not require 848 * zero-termination of the string but it makes vop_compat_ncreate() easier. 849 */ 850 static struct namecache * 851 cache_alloc(int nlen) 852 { 853 struct namecache *ncp; 854 855 ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO); 856 if (nlen) 857 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK); 858 ncp->nc_nlen = nlen; 859 ncp->nc_flag = NCF_UNRESOLVED; 860 ncp->nc_error = ENOTCONN; /* needs to be resolved */ 861 ncp->nc_refs = 1; 862 863 TAILQ_INIT(&ncp->nc_list); 864 _cache_lock(ncp); 865 return(ncp); 866 } 867 868 /* 869 * Can only be called for the case where the ncp has never been 870 * associated with anything (so no spinlocks are needed). 871 */ 872 static void 873 _cache_free(struct namecache *ncp) 874 { 875 KKASSERT(ncp->nc_refs == 1 && ncp->nc_lockstatus == 1); 876 if (ncp->nc_name) 877 kfree(ncp->nc_name, M_VFSCACHE); 878 kfree(ncp, M_VFSCACHE); 879 } 880 881 /* 882 * [re]initialize a nchandle. 883 */ 884 void 885 cache_zero(struct nchandle *nch) 886 { 887 nch->ncp = NULL; 888 nch->mount = NULL; 889 } 890 891 /* 892 * Ref and deref a namecache structure. 893 * 894 * The caller must specify a stable ncp pointer, typically meaning the 895 * ncp is already referenced but this can also occur indirectly through 896 * e.g. holding a lock on a direct child. 897 * 898 * WARNING: Caller may hold an unrelated read spinlock, which means we can't 899 * use read spinlocks here. 900 * 901 * MPSAFE if nch is 902 */ 903 struct nchandle * 904 cache_hold(struct nchandle *nch) 905 { 906 _cache_hold(nch->ncp); 907 atomic_add_int(&nch->mount->mnt_refs, 1); 908 return(nch); 909 } 910 911 /* 912 * Create a copy of a namecache handle for an already-referenced 913 * entry. 
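 *
 * Illustrative usage (a sketch, not taken from this file):
 *
 *	struct nchandle copy;
 *
 *	cache_copy(&nch, &copy);
 *	...use copy independently of nch...;
 *	cache_drop(&copy);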
914 * 915 * MPSAFE if nch is 916 */ 917 void 918 cache_copy(struct nchandle *nch, struct nchandle *target) 919 { 920 *target = *nch; 921 if (target->ncp) 922 _cache_hold(target->ncp); 923 atomic_add_int(&nch->mount->mnt_refs, 1); 924 } 925 926 /* 927 * MPSAFE if nch is 928 */ 929 void 930 cache_changemount(struct nchandle *nch, struct mount *mp) 931 { 932 atomic_add_int(&nch->mount->mnt_refs, -1); 933 nch->mount = mp; 934 atomic_add_int(&nch->mount->mnt_refs, 1); 935 } 936 937 void 938 cache_drop(struct nchandle *nch) 939 { 940 atomic_add_int(&nch->mount->mnt_refs, -1); 941 _cache_drop(nch->ncp); 942 nch->ncp = NULL; 943 nch->mount = NULL; 944 } 945 946 int 947 cache_lockstatus(struct nchandle *nch) 948 { 949 return(_cache_lockstatus(nch->ncp)); 950 } 951 952 void 953 cache_lock(struct nchandle *nch) 954 { 955 _cache_lock(nch->ncp); 956 } 957 958 void 959 cache_lock_maybe_shared(struct nchandle *nch, int excl) 960 { 961 struct namecache *ncp = nch->ncp; 962 963 if (ncp_shared_lock_disable || excl || 964 (ncp->nc_flag & NCF_UNRESOLVED)) { 965 _cache_lock(ncp); 966 } else { 967 _cache_lock_shared(ncp); 968 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 969 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 970 _cache_unlock(ncp); 971 _cache_lock(ncp); 972 } 973 } else { 974 _cache_unlock(ncp); 975 _cache_lock(ncp); 976 } 977 } 978 } 979 980 /* 981 * Relock nch1 given an unlocked nch1 and a locked nch2. The caller 982 * is responsible for checking both for validity on return as they 983 * may have become invalid. 984 * 985 * We have to deal with potential deadlocks here, just ping pong 986 * the lock until we get it (we will always block somewhere when 987 * looping so this is not cpu-intensive). 988 * 989 * which = 0 nch1 not locked, nch2 is locked 990 * which = 1 nch1 is locked, nch2 is not locked 991 */ 992 void 993 cache_relock(struct nchandle *nch1, struct ucred *cred1, 994 struct nchandle *nch2, struct ucred *cred2) 995 { 996 int which; 997 998 which = 0; 999 1000 for (;;) { 1001 if (which == 0) { 1002 if (cache_lock_nonblock(nch1) == 0) { 1003 cache_resolve(nch1, cred1); 1004 break; 1005 } 1006 cache_unlock(nch2); 1007 cache_lock(nch1); 1008 cache_resolve(nch1, cred1); 1009 which = 1; 1010 } else { 1011 if (cache_lock_nonblock(nch2) == 0) { 1012 cache_resolve(nch2, cred2); 1013 break; 1014 } 1015 cache_unlock(nch1); 1016 cache_lock(nch2); 1017 cache_resolve(nch2, cred2); 1018 which = 0; 1019 } 1020 } 1021 } 1022 1023 int 1024 cache_lock_nonblock(struct nchandle *nch) 1025 { 1026 return(_cache_lock_nonblock(nch->ncp)); 1027 } 1028 1029 void 1030 cache_unlock(struct nchandle *nch) 1031 { 1032 _cache_unlock(nch->ncp); 1033 } 1034 1035 /* 1036 * ref-and-lock, unlock-and-deref functions. 1037 * 1038 * This function is primarily used by nlookup. Even though cache_lock 1039 * holds the vnode, it is possible that the vnode may have already 1040 * initiated a recyclement. 1041 * 1042 * We want cache_get() to return a definitively usable vnode or a 1043 * definitively unresolved ncp. 1044 */ 1045 static 1046 struct namecache * 1047 _cache_get(struct namecache *ncp) 1048 { 1049 _cache_hold(ncp); 1050 _cache_lock(ncp); 1051 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 1052 _cache_setunresolved(ncp); 1053 return(ncp); 1054 } 1055 1056 /* 1057 * Attempt to obtain a shared lock on the ncp. A shared lock will only 1058 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is 1059 * valid. Otherwise an exclusive lock will be acquired instead. 
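 *
 * In other words, callers that only need to read an already-resolved
 * entry (and acquire its vnode) can run concurrently under the shared
 * lock, while any caller that might have to resolve or unresolve the
 * entry is funneled through the exclusive path.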
1060 */ 1061 static 1062 struct namecache * 1063 _cache_get_maybe_shared(struct namecache *ncp, int excl) 1064 { 1065 if (ncp_shared_lock_disable || excl || 1066 (ncp->nc_flag & NCF_UNRESOLVED)) { 1067 return(_cache_get(ncp)); 1068 } 1069 _cache_hold(ncp); 1070 _cache_lock_shared(ncp); 1071 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1072 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { 1073 _cache_unlock(ncp); 1074 ncp = _cache_get(ncp); 1075 _cache_drop(ncp); 1076 } 1077 } else { 1078 _cache_unlock(ncp); 1079 ncp = _cache_get(ncp); 1080 _cache_drop(ncp); 1081 } 1082 return(ncp); 1083 } 1084 1085 /* 1086 * This is a special form of _cache_lock() which only succeeds if 1087 * it can get a pristine, non-recursive lock. The caller must have 1088 * already ref'd the ncp. 1089 * 1090 * On success the ncp will be locked, on failure it will not. The 1091 * ref count does not change either way. 1092 * 1093 * We want _cache_lock_special() (on success) to return a definitively 1094 * usable vnode or a definitively unresolved ncp. 1095 */ 1096 static int 1097 _cache_lock_special(struct namecache *ncp) 1098 { 1099 if (_cache_lock_nonblock(ncp) == 0) { 1100 if ((ncp->nc_lockstatus & 1101 ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1) { 1102 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 1103 _cache_setunresolved(ncp); 1104 return(0); 1105 } 1106 _cache_unlock(ncp); 1107 } 1108 return(EWOULDBLOCK); 1109 } 1110 1111 static int 1112 _cache_lock_shared_special(struct namecache *ncp) 1113 { 1114 if (_cache_lock_shared_nonblock(ncp) == 0) { 1115 if ((ncp->nc_lockstatus & 1116 ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == (NC_SHLOCK_FLAG | 1)) { 1117 if (ncp->nc_vp == NULL || 1118 (ncp->nc_vp->v_flag & VRECLAIMED) == 0) { 1119 return(0); 1120 } 1121 } 1122 _cache_unlock(ncp); 1123 } 1124 return(EWOULDBLOCK); 1125 } 1126 1127 1128 /* 1129 * NOTE: The same nchandle can be passed for both arguments. 1130 */ 1131 void 1132 cache_get(struct nchandle *nch, struct nchandle *target) 1133 { 1134 KKASSERT(nch->ncp->nc_refs > 0); 1135 target->mount = nch->mount; 1136 target->ncp = _cache_get(nch->ncp); 1137 atomic_add_int(&target->mount->mnt_refs, 1); 1138 } 1139 1140 void 1141 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl) 1142 { 1143 KKASSERT(nch->ncp->nc_refs > 0); 1144 target->mount = nch->mount; 1145 target->ncp = _cache_get_maybe_shared(nch->ncp, excl); 1146 atomic_add_int(&target->mount->mnt_refs, 1); 1147 } 1148 1149 /* 1150 * 1151 */ 1152 static __inline 1153 void 1154 _cache_put(struct namecache *ncp) 1155 { 1156 _cache_unlock(ncp); 1157 _cache_drop(ncp); 1158 } 1159 1160 /* 1161 * 1162 */ 1163 void 1164 cache_put(struct nchandle *nch) 1165 { 1166 atomic_add_int(&nch->mount->mnt_refs, -1); 1167 _cache_put(nch->ncp); 1168 nch->ncp = NULL; 1169 nch->mount = NULL; 1170 } 1171 1172 /* 1173 * Resolve an unresolved ncp by associating a vnode with it. If the 1174 * vnode is NULL, a negative cache entry is created. 1175 * 1176 * The ncp should be locked on entry and will remain locked on return. 1177 */ 1178 static 1179 void 1180 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp) 1181 { 1182 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 1183 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 1184 1185 if (vp != NULL) { 1186 /* 1187 * Any vp associated with an ncp which has children must 1188 * be held. Any vp associated with a locked ncp must be held. 
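		 *
		 * Hence the two vhold() calls below: one because
		 * nc_list is non-empty and one because the caller
		 * currently holds the ncp lock. _cache_setunresolved()
		 * undoes both with vdrop().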
1189 */ 1190 if (!TAILQ_EMPTY(&ncp->nc_list)) 1191 vhold(vp); 1192 spin_lock(&vp->v_spin); 1193 ncp->nc_vp = vp; 1194 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode); 1195 spin_unlock(&vp->v_spin); 1196 if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) 1197 vhold(vp); 1198 1199 /* 1200 * Set auxiliary flags 1201 */ 1202 switch(vp->v_type) { 1203 case VDIR: 1204 ncp->nc_flag |= NCF_ISDIR; 1205 break; 1206 case VLNK: 1207 ncp->nc_flag |= NCF_ISSYMLINK; 1208 /* XXX cache the contents of the symlink */ 1209 break; 1210 default: 1211 break; 1212 } 1213 atomic_add_int(&numcache, 1); 1214 ncp->nc_error = 0; 1215 /* XXX: this is a hack to work-around the lack of a real pfs vfs 1216 * implementation*/ 1217 if (mp != NULL) 1218 if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0) 1219 vp->v_pfsmp = mp; 1220 } else { 1221 /* 1222 * When creating a negative cache hit we set the 1223 * namecache_gen. A later resolve will clean out the 1224 * negative cache hit if the mount point's namecache_gen 1225 * has changed. Used by devfs, could also be used by 1226 * other remote FSs. 1227 */ 1228 ncp->nc_vp = NULL; 1229 spin_lock(&ncspin); 1230 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode); 1231 ++numneg; 1232 spin_unlock(&ncspin); 1233 ncp->nc_error = ENOENT; 1234 if (mp) 1235 VFS_NCPGEN_SET(mp, ncp); 1236 } 1237 ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP); 1238 } 1239 1240 /* 1241 * 1242 */ 1243 void 1244 cache_setvp(struct nchandle *nch, struct vnode *vp) 1245 { 1246 _cache_setvp(nch->mount, nch->ncp, vp); 1247 } 1248 1249 /* 1250 * 1251 */ 1252 void 1253 cache_settimeout(struct nchandle *nch, int nticks) 1254 { 1255 struct namecache *ncp = nch->ncp; 1256 1257 if ((ncp->nc_timeout = ticks + nticks) == 0) 1258 ncp->nc_timeout = 1; 1259 } 1260 1261 /* 1262 * Disassociate the vnode or negative-cache association and mark a 1263 * namecache entry as unresolved again. Note that the ncp is still 1264 * left in the hash table and still linked to its parent. 1265 * 1266 * The ncp should be locked and refd on entry and will remain locked and refd 1267 * on return. 1268 * 1269 * This routine is normally never called on a directory containing children. 1270 * However, NFS often does just that in its rename() code as a cop-out to 1271 * avoid complex namespace operations. This disconnects a directory vnode 1272 * from its namecache and can cause the OLDAPI and NEWAPI to get out of 1273 * sync. 1274 * 1275 */ 1276 static 1277 void 1278 _cache_setunresolved(struct namecache *ncp) 1279 { 1280 struct vnode *vp; 1281 1282 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1283 ncp->nc_flag |= NCF_UNRESOLVED; 1284 ncp->nc_timeout = 0; 1285 ncp->nc_error = ENOTCONN; 1286 if ((vp = ncp->nc_vp) != NULL) { 1287 atomic_add_int(&numcache, -1); 1288 spin_lock(&vp->v_spin); 1289 ncp->nc_vp = NULL; 1290 TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode); 1291 spin_unlock(&vp->v_spin); 1292 1293 /* 1294 * Any vp associated with an ncp with children is 1295 * held by that ncp. Any vp associated with a locked 1296 * ncp is held by that ncp. These conditions must be 1297 * undone when the vp is cleared out from the ncp. 
1298 */ 1299 if (!TAILQ_EMPTY(&ncp->nc_list)) 1300 vdrop(vp); 1301 if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) 1302 vdrop(vp); 1303 } else { 1304 spin_lock(&ncspin); 1305 TAILQ_REMOVE(&ncneglist, ncp, nc_vnode); 1306 --numneg; 1307 spin_unlock(&ncspin); 1308 } 1309 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK); 1310 } 1311 } 1312 1313 /* 1314 * The cache_nresolve() code calls this function to automatically 1315 * set a resolved cache element to unresolved if it has timed out 1316 * or if it is a negative cache hit and the mount point namecache_gen 1317 * has changed. 1318 */ 1319 static __inline int 1320 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp) 1321 { 1322 /* 1323 * Try to zap entries that have timed out. We have 1324 * to be careful here because locked leafs may depend 1325 * on the vnode remaining intact in a parent, so only 1326 * do this under very specific conditions. 1327 */ 1328 if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 && 1329 TAILQ_EMPTY(&ncp->nc_list)) { 1330 return 1; 1331 } 1332 1333 /* 1334 * If a resolved negative cache hit is invalid due to 1335 * the mount's namecache generation being bumped, zap it. 1336 */ 1337 if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) { 1338 return 1; 1339 } 1340 1341 /* 1342 * Otherwise we are good 1343 */ 1344 return 0; 1345 } 1346 1347 static __inline void 1348 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp) 1349 { 1350 /* 1351 * Already in an unresolved state, nothing to do. 1352 */ 1353 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 1354 if (_cache_auto_unresolve_test(mp, ncp)) 1355 _cache_setunresolved(ncp); 1356 } 1357 } 1358 1359 /* 1360 * 1361 */ 1362 void 1363 cache_setunresolved(struct nchandle *nch) 1364 { 1365 _cache_setunresolved(nch->ncp); 1366 } 1367 1368 /* 1369 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist 1370 * looking for matches. This flag tells the lookup code when it must 1371 * check for a mount linkage and also prevents the directories in question 1372 * from being deleted or renamed. 1373 */ 1374 static 1375 int 1376 cache_clrmountpt_callback(struct mount *mp, void *data) 1377 { 1378 struct nchandle *nch = data; 1379 1380 if (mp->mnt_ncmounton.ncp == nch->ncp) 1381 return(1); 1382 if (mp->mnt_ncmountpt.ncp == nch->ncp) 1383 return(1); 1384 return(0); 1385 } 1386 1387 /* 1388 * 1389 */ 1390 void 1391 cache_clrmountpt(struct nchandle *nch) 1392 { 1393 int count; 1394 1395 count = mountlist_scan(cache_clrmountpt_callback, nch, 1396 MNTSCAN_FORWARD|MNTSCAN_NOBUSY); 1397 if (count == 0) 1398 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT; 1399 } 1400 1401 /* 1402 * Invalidate portions of the namecache topology given a starting entry. 1403 * The passed ncp is set to an unresolved state and: 1404 * 1405 * The passed ncp must be referencxed and locked. The routine may unlock 1406 * and relock ncp several times, and will recheck the children and loop 1407 * to catch races. When done the passed ncp will be returned with the 1408 * reference and lock intact. 1409 * 1410 * CINV_DESTROY - Set a flag in the passed ncp entry indicating 1411 * that the physical underlying nodes have been 1412 * destroyed... as in deleted. For example, when 1413 * a directory is removed. This will cause record 1414 * lookups on the name to no longer be able to find 1415 * the record and tells the resolver to return failure 1416 * rather then trying to resolve through the parent. 1417 * 1418 * The topology itself, including ncp->nc_name, 1419 * remains intact. 
1420 * 1421 * This only applies to the passed ncp, if CINV_CHILDREN 1422 * is specified the children are not flagged. 1423 * 1424 * CINV_CHILDREN - Set all children (recursively) to an unresolved 1425 * state as well. 1426 * 1427 * Note that this will also have the side effect of 1428 * cleaning out any unreferenced nodes in the topology 1429 * from the leaves up as the recursion backs out. 1430 * 1431 * Note that the topology for any referenced nodes remains intact, but 1432 * the nodes will be marked as having been destroyed and will be set 1433 * to an unresolved state. 1434 * 1435 * It is possible for cache_inval() to race a cache_resolve(), meaning that 1436 * the namecache entry may not actually be invalidated on return if it was 1437 * revalidated while recursing down into its children. This code guarentees 1438 * that the node(s) will go through an invalidation cycle, but does not 1439 * guarentee that they will remain in an invalidated state. 1440 * 1441 * Returns non-zero if a revalidation was detected during the invalidation 1442 * recursion, zero otherwise. Note that since only the original ncp is 1443 * locked the revalidation ultimately can only indicate that the original ncp 1444 * *MIGHT* no have been reresolved. 1445 * 1446 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we 1447 * have to avoid blowing out the kernel stack. We do this by saving the 1448 * deep namecache node and aborting the recursion, then re-recursing at that 1449 * node using a depth-first algorithm in order to allow multiple deep 1450 * recursions to chain through each other, then we restart the invalidation 1451 * from scratch. 1452 */ 1453 1454 struct cinvtrack { 1455 struct namecache *resume_ncp; 1456 int depth; 1457 }; 1458 1459 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *); 1460 1461 static 1462 int 1463 _cache_inval(struct namecache *ncp, int flags) 1464 { 1465 struct cinvtrack track; 1466 struct namecache *ncp2; 1467 int r; 1468 1469 track.depth = 0; 1470 track.resume_ncp = NULL; 1471 1472 for (;;) { 1473 r = _cache_inval_internal(ncp, flags, &track); 1474 if (track.resume_ncp == NULL) 1475 break; 1476 kprintf("Warning: deep namecache recursion at %s\n", 1477 ncp->nc_name); 1478 _cache_unlock(ncp); 1479 while ((ncp2 = track.resume_ncp) != NULL) { 1480 track.resume_ncp = NULL; 1481 _cache_lock(ncp2); 1482 _cache_inval_internal(ncp2, flags & ~CINV_DESTROY, 1483 &track); 1484 _cache_put(ncp2); 1485 } 1486 _cache_lock(ncp); 1487 } 1488 return(r); 1489 } 1490 1491 int 1492 cache_inval(struct nchandle *nch, int flags) 1493 { 1494 return(_cache_inval(nch->ncp, flags)); 1495 } 1496 1497 /* 1498 * Helper for _cache_inval(). The passed ncp is refd and locked and 1499 * remains that way on return, but may be unlocked/relocked multiple 1500 * times by the routine. 
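 *
 * If the recursion exceeds MAX_RECURSION_DEPTH the routine stores the
 * current ncp in track->resume_ncp (with an extra ref) and unwinds;
 * _cache_inval() then restarts the invalidation from that node, which
 * is how arbitrarily deep topologies are handled without blowing out
 * the kernel stack.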
1501 */ 1502 static int 1503 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track) 1504 { 1505 struct namecache *kid; 1506 struct namecache *nextkid; 1507 int rcnt = 0; 1508 1509 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 1510 1511 _cache_setunresolved(ncp); 1512 if (flags & CINV_DESTROY) 1513 ncp->nc_flag |= NCF_DESTROYED; 1514 if ((flags & CINV_CHILDREN) && 1515 (kid = TAILQ_FIRST(&ncp->nc_list)) != NULL 1516 ) { 1517 _cache_hold(kid); 1518 if (++track->depth > MAX_RECURSION_DEPTH) { 1519 track->resume_ncp = ncp; 1520 _cache_hold(ncp); 1521 ++rcnt; 1522 } 1523 _cache_unlock(ncp); 1524 while (kid) { 1525 if (track->resume_ncp) { 1526 _cache_drop(kid); 1527 break; 1528 } 1529 if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL) 1530 _cache_hold(nextkid); 1531 if ((kid->nc_flag & NCF_UNRESOLVED) == 0 || 1532 TAILQ_FIRST(&kid->nc_list) 1533 ) { 1534 _cache_lock(kid); 1535 rcnt += _cache_inval_internal(kid, flags & ~CINV_DESTROY, track); 1536 _cache_unlock(kid); 1537 } 1538 _cache_drop(kid); 1539 kid = nextkid; 1540 } 1541 --track->depth; 1542 _cache_lock(ncp); 1543 } 1544 1545 /* 1546 * Someone could have gotten in there while ncp was unlocked, 1547 * retry if so. 1548 */ 1549 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 1550 ++rcnt; 1551 return (rcnt); 1552 } 1553 1554 /* 1555 * Invalidate a vnode's namecache associations. To avoid races against 1556 * the resolver we do not invalidate a node which we previously invalidated 1557 * but which was then re-resolved while we were in the invalidation loop. 1558 * 1559 * Returns non-zero if any namecache entries remain after the invalidation 1560 * loop completed. 1561 * 1562 * NOTE: Unlike the namecache topology which guarentees that ncp's will not 1563 * be ripped out of the topology while held, the vnode's v_namecache 1564 * list has no such restriction. NCP's can be ripped out of the list 1565 * at virtually any time if not locked, even if held. 1566 * 1567 * In addition, the v_namecache list itself must be locked via 1568 * the vnode's spinlock. 1569 */ 1570 int 1571 cache_inval_vp(struct vnode *vp, int flags) 1572 { 1573 struct namecache *ncp; 1574 struct namecache *next; 1575 1576 restart: 1577 spin_lock(&vp->v_spin); 1578 ncp = TAILQ_FIRST(&vp->v_namecache); 1579 if (ncp) 1580 _cache_hold(ncp); 1581 while (ncp) { 1582 /* loop entered with ncp held and vp spin-locked */ 1583 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1584 _cache_hold(next); 1585 spin_unlock(&vp->v_spin); 1586 _cache_lock(ncp); 1587 if (ncp->nc_vp != vp) { 1588 kprintf("Warning: cache_inval_vp: race-A detected on " 1589 "%s\n", ncp->nc_name); 1590 _cache_put(ncp); 1591 if (next) 1592 _cache_drop(next); 1593 goto restart; 1594 } 1595 _cache_inval(ncp, flags); 1596 _cache_put(ncp); /* also releases reference */ 1597 ncp = next; 1598 spin_lock(&vp->v_spin); 1599 if (ncp && ncp->nc_vp != vp) { 1600 spin_unlock(&vp->v_spin); 1601 kprintf("Warning: cache_inval_vp: race-B detected on " 1602 "%s\n", ncp->nc_name); 1603 _cache_drop(ncp); 1604 goto restart; 1605 } 1606 } 1607 spin_unlock(&vp->v_spin); 1608 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1609 } 1610 1611 /* 1612 * This routine is used instead of the normal cache_inval_vp() when we 1613 * are trying to recycle otherwise good vnodes. 1614 * 1615 * Return 0 on success, non-zero if not all namecache records could be 1616 * disassociated from the vnode (for various reasons). 
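 *
 * Unlike cache_inval_vp() this variant never sleeps on an ncp lock:
 * if _cache_lock_nonblock() fails the scan simply stops and the
 * remaining records are left attached, which the caller sees as a
 * non-zero return.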
1617 */ 1618 int 1619 cache_inval_vp_nonblock(struct vnode *vp) 1620 { 1621 struct namecache *ncp; 1622 struct namecache *next; 1623 1624 spin_lock(&vp->v_spin); 1625 ncp = TAILQ_FIRST(&vp->v_namecache); 1626 if (ncp) 1627 _cache_hold(ncp); 1628 while (ncp) { 1629 /* loop entered with ncp held */ 1630 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL) 1631 _cache_hold(next); 1632 spin_unlock(&vp->v_spin); 1633 if (_cache_lock_nonblock(ncp)) { 1634 _cache_drop(ncp); 1635 if (next) 1636 _cache_drop(next); 1637 goto done; 1638 } 1639 if (ncp->nc_vp != vp) { 1640 kprintf("Warning: cache_inval_vp: race-A detected on " 1641 "%s\n", ncp->nc_name); 1642 _cache_put(ncp); 1643 if (next) 1644 _cache_drop(next); 1645 goto done; 1646 } 1647 _cache_inval(ncp, 0); 1648 _cache_put(ncp); /* also releases reference */ 1649 ncp = next; 1650 spin_lock(&vp->v_spin); 1651 if (ncp && ncp->nc_vp != vp) { 1652 spin_unlock(&vp->v_spin); 1653 kprintf("Warning: cache_inval_vp: race-B detected on " 1654 "%s\n", ncp->nc_name); 1655 _cache_drop(ncp); 1656 goto done; 1657 } 1658 } 1659 spin_unlock(&vp->v_spin); 1660 done: 1661 return(TAILQ_FIRST(&vp->v_namecache) != NULL); 1662 } 1663 1664 /* 1665 * The source ncp has been renamed to the target ncp. Both fncp and tncp 1666 * must be locked. The target ncp is destroyed (as a normal rename-over 1667 * would destroy the target file or directory). 1668 * 1669 * Because there may be references to the source ncp we cannot copy its 1670 * contents to the target. Instead the source ncp is relinked as the target 1671 * and the target ncp is removed from the namecache topology. 1672 */ 1673 void 1674 cache_rename(struct nchandle *fnch, struct nchandle *tnch) 1675 { 1676 struct namecache *fncp = fnch->ncp; 1677 struct namecache *tncp = tnch->ncp; 1678 struct namecache *tncp_par; 1679 struct nchash_head *nchpp; 1680 u_int32_t hash; 1681 char *oname; 1682 char *nname; 1683 1684 if (tncp->nc_nlen) { 1685 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK); 1686 bcopy(tncp->nc_name, nname, tncp->nc_nlen); 1687 nname[tncp->nc_nlen] = 0; 1688 } else { 1689 nname = NULL; 1690 } 1691 1692 /* 1693 * Rename fncp (unlink) 1694 */ 1695 _cache_unlink_parent(fncp); 1696 oname = fncp->nc_name; 1697 fncp->nc_name = nname; 1698 fncp->nc_nlen = tncp->nc_nlen; 1699 if (oname) 1700 kfree(oname, M_VFSCACHE); 1701 1702 tncp_par = tncp->nc_parent; 1703 _cache_hold(tncp_par); 1704 _cache_lock(tncp_par); 1705 1706 /* 1707 * Rename fncp (relink) 1708 */ 1709 hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT); 1710 hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash); 1711 nchpp = NCHHASH(hash); 1712 1713 spin_lock(&nchpp->spin); 1714 _cache_link_parent(fncp, tncp_par, nchpp); 1715 spin_unlock(&nchpp->spin); 1716 1717 _cache_put(tncp_par); 1718 1719 /* 1720 * Get rid of the overwritten tncp (unlink) 1721 */ 1722 _cache_unlink(tncp); 1723 } 1724 1725 /* 1726 * Perform actions consistent with unlinking a file. The passed-in ncp 1727 * must be locked. 1728 * 1729 * The ncp is marked DESTROYED so it no longer shows up in searches, 1730 * and will be physically deleted when the vnode goes away. 1731 * 1732 * If the related vnode has no refs then we cycle it through vget()/vput() 1733 * to (possibly if we don't have a ref race) trigger a deactivation, 1734 * allowing the VFS to trivially detect and recycle the deleted vnode 1735 * via VOP_INACTIVE(). 1736 * 1737 * NOTE: _cache_rename() will automatically call _cache_unlink() on the 1738 * target ncp. 
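 *
 * As an illustration (not taken from this file), a VFS's nremove or
 * nrmdir code typically calls cache_unlink() on the target nchandle
 * after the underlying file or directory has been removed.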
1739 */ 1740 void 1741 cache_unlink(struct nchandle *nch) 1742 { 1743 _cache_unlink(nch->ncp); 1744 } 1745 1746 static void 1747 _cache_unlink(struct namecache *ncp) 1748 { 1749 struct vnode *vp; 1750 1751 /* 1752 * Causes lookups to fail and allows another ncp with the same 1753 * name to be created under ncp->nc_parent. 1754 */ 1755 ncp->nc_flag |= NCF_DESTROYED; 1756 1757 /* 1758 * Attempt to trigger a deactivation. 1759 */ 1760 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 1761 (vp = ncp->nc_vp) != NULL && 1762 !sysref_isactive(&vp->v_sysref)) { 1763 if (vget(vp, LK_SHARED) == 0) 1764 vput(vp); 1765 } 1766 } 1767 1768 /* 1769 * vget the vnode associated with the namecache entry. Resolve the namecache 1770 * entry if necessary. The passed ncp must be referenced and locked. 1771 * 1772 * lk_type may be LK_SHARED, LK_EXCLUSIVE. A ref'd, possibly locked 1773 * (depending on the passed lk_type) will be returned in *vpp with an error 1774 * of 0, or NULL will be returned in *vpp with a non-0 error code. The 1775 * most typical error is ENOENT, meaning that the ncp represents a negative 1776 * cache hit and there is no vnode to retrieve, but other errors can occur 1777 * too. 1778 * 1779 * The vget() can race a reclaim. If this occurs we re-resolve the 1780 * namecache entry. 1781 * 1782 * There are numerous places in the kernel where vget() is called on a 1783 * vnode while one or more of its namecache entries is locked. Releasing 1784 * a vnode never deadlocks against locked namecache entries (the vnode 1785 * will not get recycled while referenced ncp's exist). This means we 1786 * can safely acquire the vnode. In fact, we MUST NOT release the ncp 1787 * lock when acquiring the vp lock or we might cause a deadlock. 1788 * 1789 * NOTE: The passed-in ncp must be locked exclusively if it is initially 1790 * unresolved. If a reclaim race occurs the passed-in ncp will be 1791 * relocked exclusively before being re-resolved. 1792 */ 1793 int 1794 cache_vget(struct nchandle *nch, struct ucred *cred, 1795 int lk_type, struct vnode **vpp) 1796 { 1797 struct namecache *ncp; 1798 struct vnode *vp; 1799 int error; 1800 1801 ncp = nch->ncp; 1802 again: 1803 vp = NULL; 1804 if (ncp->nc_flag & NCF_UNRESOLVED) 1805 error = cache_resolve(nch, cred); 1806 else 1807 error = 0; 1808 1809 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 1810 error = vget(vp, lk_type); 1811 if (error) { 1812 /* 1813 * VRECLAIM race 1814 */ 1815 if (error == ENOENT) { 1816 kprintf("Warning: vnode reclaim race detected " 1817 "in cache_vget on %p (%s)\n", 1818 vp, ncp->nc_name); 1819 _cache_unlock(ncp); 1820 _cache_lock(ncp); 1821 _cache_setunresolved(ncp); 1822 goto again; 1823 } 1824 1825 /* 1826 * Not a reclaim race, some other error. 1827 */ 1828 KKASSERT(ncp->nc_vp == vp); 1829 vp = NULL; 1830 } else { 1831 KKASSERT(ncp->nc_vp == vp); 1832 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 1833 } 1834 } 1835 if (error == 0 && vp == NULL) 1836 error = ENOENT; 1837 *vpp = vp; 1838 return(error); 1839 } 1840 1841 /* 1842 * Similar to cache_vget() but only acquires a ref on the vnode. 1843 * 1844 * NOTE: The passed-in ncp must be locked exclusively if it is initially 1845 * unresolved. If a reclaim race occurs the passed-in ncp will be 1846 * relocked exclusively before being re-resolved. 
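 *
 * Illustrative caller pattern (a sketch, not from this file):
 *
 *	if (cache_vref(&nch, cred, &vp) == 0) {
 *		...use the referenced, unlocked vp...;
 *		vrele(vp);
 *	}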
1847 */ 1848 int 1849 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp) 1850 { 1851 struct namecache *ncp; 1852 struct vnode *vp; 1853 int error; 1854 1855 ncp = nch->ncp; 1856 again: 1857 vp = NULL; 1858 if (ncp->nc_flag & NCF_UNRESOLVED) 1859 error = cache_resolve(nch, cred); 1860 else 1861 error = 0; 1862 1863 if (error == 0 && (vp = ncp->nc_vp) != NULL) { 1864 error = vget(vp, LK_SHARED); 1865 if (error) { 1866 /* 1867 * VRECLAIM race 1868 */ 1869 if (error == ENOENT) { 1870 kprintf("Warning: vnode reclaim race detected " 1871 "in cache_vget on %p (%s)\n", 1872 vp, ncp->nc_name); 1873 _cache_unlock(ncp); 1874 _cache_lock(ncp); 1875 _cache_setunresolved(ncp); 1876 goto again; 1877 } 1878 1879 /* 1880 * Not a reclaim race, some other error. 1881 */ 1882 KKASSERT(ncp->nc_vp == vp); 1883 vp = NULL; 1884 } else { 1885 KKASSERT(ncp->nc_vp == vp); 1886 KKASSERT((vp->v_flag & VRECLAIMED) == 0); 1887 /* caller does not want a lock */ 1888 vn_unlock(vp); 1889 } 1890 } 1891 if (error == 0 && vp == NULL) 1892 error = ENOENT; 1893 *vpp = vp; 1894 return(error); 1895 } 1896 1897 /* 1898 * Return a referenced vnode representing the parent directory of 1899 * ncp. 1900 * 1901 * Because the caller has locked the ncp it should not be possible for 1902 * the parent ncp to go away. However, the parent can unresolve its 1903 * dvp at any time so we must be able to acquire a lock on the parent 1904 * to safely access nc_vp. 1905 * 1906 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock, 1907 * so use vhold()/vdrop() while holding the lock to prevent dvp from 1908 * getting destroyed. 1909 * 1910 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a 1911 * lock on the ncp in question.. 1912 */ 1913 static struct vnode * 1914 cache_dvpref(struct namecache *ncp) 1915 { 1916 struct namecache *par; 1917 struct vnode *dvp; 1918 1919 dvp = NULL; 1920 if ((par = ncp->nc_parent) != NULL) { 1921 _cache_hold(par); 1922 _cache_lock(par); 1923 if ((par->nc_flag & NCF_UNRESOLVED) == 0) { 1924 if ((dvp = par->nc_vp) != NULL) 1925 vhold(dvp); 1926 } 1927 _cache_unlock(par); 1928 if (dvp) { 1929 if (vget(dvp, LK_SHARED) == 0) { 1930 vn_unlock(dvp); 1931 vdrop(dvp); 1932 /* return refd, unlocked dvp */ 1933 } else { 1934 vdrop(dvp); 1935 dvp = NULL; 1936 } 1937 } 1938 _cache_drop(par); 1939 } 1940 return(dvp); 1941 } 1942 1943 /* 1944 * Convert a directory vnode to a namecache record without any other 1945 * knowledge of the topology. This ONLY works with directory vnodes and 1946 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the 1947 * returned ncp (if not NULL) will be held and unlocked. 1948 * 1949 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned. 1950 * If 'makeit' is 1 we attempt to track-down and create the namecache topology 1951 * for dvp. This will fail only if the directory has been deleted out from 1952 * under the caller. 1953 * 1954 * Callers must always check for a NULL return no matter the value of 'makeit'. 1955 * 1956 * To avoid underflowing the kernel stack each recursive call increments 1957 * the makeit variable. 
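 *
 * Illustrative call from a file-handle-to-vnode path (a sketch, not
 * taken from this file):
 *
 *	struct nchandle nch;
 *
 *	if (cache_fromdvp(dvp, cred, 1, &nch) == 0) {
 *		...nch.ncp is held and unlocked...;
 *		cache_drop(&nch);
 *	}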
1958 */ 1959 1960 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 1961 struct vnode *dvp, char *fakename); 1962 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 1963 struct vnode **saved_dvp); 1964 1965 int 1966 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit, 1967 struct nchandle *nch) 1968 { 1969 struct vnode *saved_dvp; 1970 struct vnode *pvp; 1971 char *fakename; 1972 int error; 1973 1974 nch->ncp = NULL; 1975 nch->mount = dvp->v_mount; 1976 saved_dvp = NULL; 1977 fakename = NULL; 1978 1979 /* 1980 * Handle the makeit == 0 degenerate case 1981 */ 1982 if (makeit == 0) { 1983 spin_lock(&dvp->v_spin); 1984 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 1985 if (nch->ncp) 1986 cache_hold(nch); 1987 spin_unlock(&dvp->v_spin); 1988 } 1989 1990 /* 1991 * Loop until resolution, inside code will break out on error. 1992 */ 1993 while (makeit) { 1994 /* 1995 * Break out if we successfully acquire a working ncp. 1996 */ 1997 spin_lock(&dvp->v_spin); 1998 nch->ncp = TAILQ_FIRST(&dvp->v_namecache); 1999 if (nch->ncp) { 2000 cache_hold(nch); 2001 spin_unlock(&dvp->v_spin); 2002 break; 2003 } 2004 spin_unlock(&dvp->v_spin); 2005 2006 /* 2007 * If dvp is the root of its filesystem it should already 2008 * have a namecache pointer associated with it as a side 2009 * effect of the mount, but it may have been disassociated. 2010 */ 2011 if (dvp->v_flag & VROOT) { 2012 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp); 2013 error = cache_resolve_mp(nch->mount); 2014 _cache_put(nch->ncp); 2015 if (ncvp_debug) { 2016 kprintf("cache_fromdvp: resolve root of mount %p error %d", 2017 dvp->v_mount, error); 2018 } 2019 if (error) { 2020 if (ncvp_debug) 2021 kprintf(" failed\n"); 2022 nch->ncp = NULL; 2023 break; 2024 } 2025 if (ncvp_debug) 2026 kprintf(" succeeded\n"); 2027 continue; 2028 } 2029 2030 /* 2031 * If we are recursed too deeply resort to an O(n^2) 2032 * algorithm to resolve the namecache topology. The 2033 * resolved pvp is left referenced in saved_dvp to 2034 * prevent the tree from being destroyed while we loop. 2035 */ 2036 if (makeit > 20) { 2037 error = cache_fromdvp_try(dvp, cred, &saved_dvp); 2038 if (error) { 2039 kprintf("lookupdotdot(longpath) failed %d " 2040 "dvp %p\n", error, dvp); 2041 nch->ncp = NULL; 2042 break; 2043 } 2044 continue; 2045 } 2046 2047 /* 2048 * Get the parent directory and resolve its ncp. 2049 */ 2050 if (fakename) { 2051 kfree(fakename, M_TEMP); 2052 fakename = NULL; 2053 } 2054 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2055 &fakename); 2056 if (error) { 2057 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp); 2058 break; 2059 } 2060 vn_unlock(pvp); 2061 2062 /* 2063 * Reuse makeit as a recursion depth counter. On success 2064 * nch will be fully referenced. 2065 */ 2066 cache_fromdvp(pvp, cred, makeit + 1, nch); 2067 vrele(pvp); 2068 if (nch->ncp == NULL) 2069 break; 2070 2071 /* 2072 * Do an inefficient scan of pvp (embodied by ncp) to look 2073 * for dvp. This will create a namecache record for dvp on 2074 * success. We loop up to recheck on success. 2075 * 2076 * ncp and dvp are both held but not locked. 
2077 */ 2078 error = cache_inefficient_scan(nch, cred, dvp, fakename); 2079 if (error) { 2080 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n", 2081 pvp, nch->ncp->nc_name, dvp); 2082 cache_drop(nch); 2083 /* nch was NULLed out, reload mount */ 2084 nch->mount = dvp->v_mount; 2085 break; 2086 } 2087 if (ncvp_debug) { 2088 kprintf("cache_fromdvp: scan %p (%s) succeeded\n", 2089 pvp, nch->ncp->nc_name); 2090 } 2091 cache_drop(nch); 2092 /* nch was NULLed out, reload mount */ 2093 nch->mount = dvp->v_mount; 2094 } 2095 2096 /* 2097 * If nch->ncp is non-NULL it will have been held already. 2098 */ 2099 if (fakename) 2100 kfree(fakename, M_TEMP); 2101 if (saved_dvp) 2102 vrele(saved_dvp); 2103 if (nch->ncp) 2104 return (0); 2105 return (EINVAL); 2106 } 2107 2108 /* 2109 * Go up the chain of parent directories until we find something 2110 * we can resolve into the namecache. This is very inefficient. 2111 */ 2112 static 2113 int 2114 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred, 2115 struct vnode **saved_dvp) 2116 { 2117 struct nchandle nch; 2118 struct vnode *pvp; 2119 int error; 2120 static time_t last_fromdvp_report; 2121 char *fakename; 2122 2123 /* 2124 * Loop getting the parent directory vnode until we get something we 2125 * can resolve in the namecache. 2126 */ 2127 vref(dvp); 2128 nch.mount = dvp->v_mount; 2129 nch.ncp = NULL; 2130 fakename = NULL; 2131 2132 for (;;) { 2133 if (fakename) { 2134 kfree(fakename, M_TEMP); 2135 fakename = NULL; 2136 } 2137 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred, 2138 &fakename); 2139 if (error) { 2140 vrele(dvp); 2141 break; 2142 } 2143 vn_unlock(pvp); 2144 spin_lock(&pvp->v_spin); 2145 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) { 2146 _cache_hold(nch.ncp); 2147 spin_unlock(&pvp->v_spin); 2148 vrele(pvp); 2149 break; 2150 } 2151 spin_unlock(&pvp->v_spin); 2152 if (pvp->v_flag & VROOT) { 2153 nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp); 2154 error = cache_resolve_mp(nch.mount); 2155 _cache_unlock(nch.ncp); 2156 vrele(pvp); 2157 if (error) { 2158 _cache_drop(nch.ncp); 2159 nch.ncp = NULL; 2160 vrele(dvp); 2161 } 2162 break; 2163 } 2164 vrele(dvp); 2165 dvp = pvp; 2166 } 2167 if (error == 0) { 2168 if (last_fromdvp_report != time_second) { 2169 last_fromdvp_report = time_second; 2170 kprintf("Warning: extremely inefficient path " 2171 "resolution on %s\n", 2172 nch.ncp->nc_name); 2173 } 2174 error = cache_inefficient_scan(&nch, cred, dvp, fakename); 2175 2176 /* 2177 * Hopefully dvp now has a namecache record associated with 2178 * it. Leave it referenced to prevent the kernel from 2179 * recycling the vnode. Otherwise extremely long directory 2180 * paths could result in endless recycling. 2181 */ 2182 if (*saved_dvp) 2183 vrele(*saved_dvp); 2184 *saved_dvp = dvp; 2185 _cache_drop(nch.ncp); 2186 } 2187 if (fakename) 2188 kfree(fakename, M_TEMP); 2189 return (error); 2190 } 2191 2192 /* 2193 * Do an inefficient scan of the directory represented by ncp looking for 2194 * the directory vnode dvp. ncp must be held but not locked on entry and 2195 * will be held on return. dvp must be refd but not locked on entry and 2196 * will remain refd on return. 2197 * 2198 * Why do this at all? Well, due to its stateless nature the NFS server 2199 * converts file handles directly to vnodes without necessarily going through 2200 * the namecache ops that would otherwise create the namecache topology 2201 * leading to the vnode. 
We could either (1) Change the namecache algorithms 2202 * to allow disconnect namecache records that are re-merged opportunistically, 2203 * or (2) Make the NFS server backtrack and scan to recover a connected 2204 * namecache topology in order to then be able to issue new API lookups. 2205 * 2206 * It turns out that (1) is a huge mess. It takes a nice clean set of 2207 * namecache algorithms and introduces a lot of complication in every subsystem 2208 * that calls into the namecache to deal with the re-merge case, especially 2209 * since we are using the namecache to placehold negative lookups and the 2210 * vnode might not be immediately assigned. (2) is certainly far less 2211 * efficient then (1), but since we are only talking about directories here 2212 * (which are likely to remain cached), the case does not actually run all 2213 * that often and has the supreme advantage of not polluting the namecache 2214 * algorithms. 2215 * 2216 * If a fakename is supplied just construct a namecache entry using the 2217 * fake name. 2218 */ 2219 static int 2220 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred, 2221 struct vnode *dvp, char *fakename) 2222 { 2223 struct nlcomponent nlc; 2224 struct nchandle rncp; 2225 struct dirent *den; 2226 struct vnode *pvp; 2227 struct vattr vat; 2228 struct iovec iov; 2229 struct uio uio; 2230 int blksize; 2231 int eofflag; 2232 int bytes; 2233 char *rbuf; 2234 int error; 2235 2236 vat.va_blocksize = 0; 2237 if ((error = VOP_GETATTR(dvp, &vat)) != 0) 2238 return (error); 2239 cache_lock(nch); 2240 error = cache_vref(nch, cred, &pvp); 2241 cache_unlock(nch); 2242 if (error) 2243 return (error); 2244 if (ncvp_debug) { 2245 kprintf("inefficient_scan: directory iosize %ld " 2246 "vattr fileid = %lld\n", 2247 vat.va_blocksize, 2248 (long long)vat.va_fileid); 2249 } 2250 2251 /* 2252 * Use the supplied fakename if not NULL. Fake names are typically 2253 * not in the actual filesystem hierarchy. This is used by HAMMER 2254 * to glue @@timestamp recursions together. 
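 *
 * (For illustration: such a fake component is a synthesized name
 * beginning with "@@" that encodes a timestamp or transaction id;
 * the exact format is dictated by HAMMER, not by this code.)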
2255 */ 2256 if (fakename) { 2257 nlc.nlc_nameptr = fakename; 2258 nlc.nlc_namelen = strlen(fakename); 2259 rncp = cache_nlookup(nch, &nlc); 2260 goto done; 2261 } 2262 2263 if ((blksize = vat.va_blocksize) == 0) 2264 blksize = DEV_BSIZE; 2265 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK); 2266 rncp.ncp = NULL; 2267 2268 eofflag = 0; 2269 uio.uio_offset = 0; 2270 again: 2271 iov.iov_base = rbuf; 2272 iov.iov_len = blksize; 2273 uio.uio_iov = &iov; 2274 uio.uio_iovcnt = 1; 2275 uio.uio_resid = blksize; 2276 uio.uio_segflg = UIO_SYSSPACE; 2277 uio.uio_rw = UIO_READ; 2278 uio.uio_td = curthread; 2279 2280 if (ncvp_debug >= 2) 2281 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset); 2282 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL); 2283 if (error == 0) { 2284 den = (struct dirent *)rbuf; 2285 bytes = blksize - uio.uio_resid; 2286 2287 while (bytes > 0) { 2288 if (ncvp_debug >= 2) { 2289 kprintf("cache_inefficient_scan: %*.*s\n", 2290 den->d_namlen, den->d_namlen, 2291 den->d_name); 2292 } 2293 if (den->d_type != DT_WHT && 2294 den->d_ino == vat.va_fileid) { 2295 if (ncvp_debug) { 2296 kprintf("cache_inefficient_scan: " 2297 "MATCHED inode %lld path %s/%*.*s\n", 2298 (long long)vat.va_fileid, 2299 nch->ncp->nc_name, 2300 den->d_namlen, den->d_namlen, 2301 den->d_name); 2302 } 2303 nlc.nlc_nameptr = den->d_name; 2304 nlc.nlc_namelen = den->d_namlen; 2305 rncp = cache_nlookup(nch, &nlc); 2306 KKASSERT(rncp.ncp != NULL); 2307 break; 2308 } 2309 bytes -= _DIRENT_DIRSIZ(den); 2310 den = _DIRENT_NEXT(den); 2311 } 2312 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize) 2313 goto again; 2314 } 2315 kfree(rbuf, M_TEMP); 2316 done: 2317 vrele(pvp); 2318 if (rncp.ncp) { 2319 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) { 2320 _cache_setvp(rncp.mount, rncp.ncp, dvp); 2321 if (ncvp_debug >= 2) { 2322 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n", 2323 nch->ncp->nc_name, rncp.ncp->nc_name, dvp); 2324 } 2325 } else { 2326 if (ncvp_debug >= 2) { 2327 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n", 2328 nch->ncp->nc_name, rncp.ncp->nc_name, dvp, 2329 rncp.ncp->nc_vp); 2330 } 2331 } 2332 if (rncp.ncp->nc_vp == NULL) 2333 error = rncp.ncp->nc_error; 2334 /* 2335 * Release rncp after a successful nlookup. rncp was fully 2336 * referenced. 2337 */ 2338 cache_put(&rncp); 2339 } else { 2340 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n", 2341 dvp, nch->ncp->nc_name); 2342 error = ENOENT; 2343 } 2344 return (error); 2345 } 2346 2347 /* 2348 * Zap a namecache entry. The ncp is unconditionally set to an unresolved 2349 * state, which disassociates it from its vnode or ncneglist. 2350 * 2351 * Then, if there are no additional references to the ncp and no children, 2352 * the ncp is removed from the topology and destroyed. 2353 * 2354 * References and/or children may exist if the ncp is in the middle of the 2355 * topology, preventing the ncp from being destroyed. 2356 * 2357 * This function must be called with the ncp held and locked and will unlock 2358 * and drop it during zapping. 2359 * 2360 * If nonblock is non-zero and the parent ncp cannot be locked we give up. 2361 * This case can occur in the cache_drop() path. 2362 * 2363 * This function may returned a held (but NOT locked) parent node which the 2364 * caller must drop. We do this so _cache_drop() can loop, to avoid 2365 * blowing out the kernel stack. 2366 * 2367 * WARNING! 
For MPSAFE operation this routine must acquire up to three 2368 * spin locks to be able to safely test nc_refs. Lock order is 2369 * very important. 2370 * 2371 * hash spinlock if on hash list 2372 * parent spinlock if child of parent 2373 * (the ncp is unresolved so there is no vnode association) 2374 */ 2375 static struct namecache * 2376 cache_zap(struct namecache *ncp, int nonblock) 2377 { 2378 struct namecache *par; 2379 struct vnode *dropvp; 2380 int refs; 2381 2382 /* 2383 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED. 2384 */ 2385 _cache_setunresolved(ncp); 2386 2387 /* 2388 * Try to scrap the entry and possibly tail-recurse on its parent. 2389 * We only scrap unref'd (other then our ref) unresolved entries, 2390 * we do not scrap 'live' entries. 2391 * 2392 * Note that once the spinlocks are acquired if nc_refs == 1 no 2393 * other references are possible. If it isn't, however, we have 2394 * to decrement but also be sure to avoid a 1->0 transition. 2395 */ 2396 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); 2397 KKASSERT(ncp->nc_refs > 0); 2398 2399 /* 2400 * Acquire locks. Note that the parent can't go away while we hold 2401 * a child locked. 2402 */ 2403 if ((par = ncp->nc_parent) != NULL) { 2404 if (nonblock) { 2405 for (;;) { 2406 if (_cache_lock_nonblock(par) == 0) 2407 break; 2408 refs = ncp->nc_refs; 2409 ncp->nc_flag |= NCF_DEFEREDZAP; 2410 ++numdefered; /* MP race ok */ 2411 if (atomic_cmpset_int(&ncp->nc_refs, 2412 refs, refs - 1)) { 2413 _cache_unlock(ncp); 2414 return(NULL); 2415 } 2416 cpu_pause(); 2417 } 2418 _cache_hold(par); 2419 } else { 2420 _cache_hold(par); 2421 _cache_lock(par); 2422 } 2423 spin_lock(&ncp->nc_head->spin); 2424 } 2425 2426 /* 2427 * If someone other then us has a ref or we have children 2428 * we cannot zap the entry. The 1->0 transition and any 2429 * further list operation is protected by the spinlocks 2430 * we have acquired but other transitions are not. 2431 */ 2432 for (;;) { 2433 refs = ncp->nc_refs; 2434 if (refs == 1 && TAILQ_EMPTY(&ncp->nc_list)) 2435 break; 2436 if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1)) { 2437 if (par) { 2438 spin_unlock(&ncp->nc_head->spin); 2439 _cache_put(par); 2440 } 2441 _cache_unlock(ncp); 2442 return(NULL); 2443 } 2444 cpu_pause(); 2445 } 2446 2447 /* 2448 * We are the only ref and with the spinlocks held no further 2449 * refs can be acquired by others. 2450 * 2451 * Remove us from the hash list and parent list. We have to 2452 * drop a ref on the parent's vp if the parent's list becomes 2453 * empty. 2454 */ 2455 dropvp = NULL; 2456 if (par) { 2457 struct nchash_head *nchpp = ncp->nc_head; 2458 2459 KKASSERT(nchpp != NULL); 2460 LIST_REMOVE(ncp, nc_hash); 2461 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); 2462 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list)) 2463 dropvp = par->nc_vp; 2464 ncp->nc_head = NULL; 2465 ncp->nc_parent = NULL; 2466 spin_unlock(&nchpp->spin); 2467 _cache_unlock(par); 2468 } else { 2469 KKASSERT(ncp->nc_head == NULL); 2470 } 2471 2472 /* 2473 * ncp should not have picked up any refs. Physically 2474 * destroy the ncp. 2475 */ 2476 KKASSERT(ncp->nc_refs == 1); 2477 /* _cache_unlock(ncp) not required */ 2478 ncp->nc_refs = -1; /* safety */ 2479 if (ncp->nc_name) 2480 kfree(ncp->nc_name, M_VFSCACHE); 2481 kfree(ncp, M_VFSCACHE); 2482 2483 /* 2484 * Delayed drop (we had to release our spinlocks) 2485 * 2486 * The refed parent (if not NULL) must be dropped. The 2487 * caller is responsible for looping. 
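 *
 * Callers elsewhere in this file follow this pattern (illustrative,
 * see _cache_cleanneg() and _cache_cleanpos()), either dropping the
 * returned parent immediately or feeding it back into another zap:
 *
 *	ncp = cache_zap(ncp, 1);
 *	if (ncp)
 *		_cache_drop(ncp);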
2488 */ 2489 if (dropvp) 2490 vdrop(dropvp); 2491 return(par); 2492 } 2493 2494 /* 2495 * Clean up dangling negative cache and defered-drop entries in the 2496 * namecache. 2497 * 2498 * This routine is called in the critical path and also called from 2499 * vnlru(). When called from vnlru we use a lower limit to try to 2500 * deal with the negative cache before the critical path has to start 2501 * dealing with it. 2502 */ 2503 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t; 2504 2505 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2506 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW }; 2507 2508 void 2509 cache_hysteresis(int critpath) 2510 { 2511 int poslimit; 2512 int neglimit = desiredvnodes / ncnegfactor; 2513 int xnumcache = numcache; 2514 2515 if (critpath == 0) 2516 neglimit = neglimit * 8 / 10; 2517 2518 /* 2519 * Don't cache too many negative hits. We use hysteresis to reduce 2520 * the impact on the critical path. 2521 */ 2522 switch(neg_cache_hysteresis_state[critpath]) { 2523 case CHI_LOW: 2524 if (numneg > MINNEG && numneg > neglimit) { 2525 if (critpath) 2526 _cache_cleanneg(ncnegflush); 2527 else 2528 _cache_cleanneg(ncnegflush + 2529 numneg - neglimit); 2530 neg_cache_hysteresis_state[critpath] = CHI_HIGH; 2531 } 2532 break; 2533 case CHI_HIGH: 2534 if (numneg > MINNEG * 9 / 10 && 2535 numneg * 9 / 10 > neglimit 2536 ) { 2537 if (critpath) 2538 _cache_cleanneg(ncnegflush); 2539 else 2540 _cache_cleanneg(ncnegflush + 2541 numneg * 9 / 10 - neglimit); 2542 } else { 2543 neg_cache_hysteresis_state[critpath] = CHI_LOW; 2544 } 2545 break; 2546 } 2547 2548 /* 2549 * Don't cache too many positive hits. We use hysteresis to reduce 2550 * the impact on the critical path. 2551 * 2552 * Excessive positive hits can accumulate due to large numbers of 2553 * hardlinks (the vnode cache will not prevent hl ncps from growing 2554 * into infinity). 2555 */ 2556 if ((poslimit = ncposlimit) == 0) 2557 poslimit = desiredvnodes * 2; 2558 if (critpath == 0) 2559 poslimit = poslimit * 8 / 10; 2560 2561 switch(pos_cache_hysteresis_state[critpath]) { 2562 case CHI_LOW: 2563 if (xnumcache > poslimit && xnumcache > MINPOS) { 2564 if (critpath) 2565 _cache_cleanpos(ncposflush); 2566 else 2567 _cache_cleanpos(ncposflush + 2568 xnumcache - poslimit); 2569 pos_cache_hysteresis_state[critpath] = CHI_HIGH; 2570 } 2571 break; 2572 case CHI_HIGH: 2573 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) { 2574 if (critpath) 2575 _cache_cleanpos(ncposflush); 2576 else 2577 _cache_cleanpos(ncposflush + 2578 xnumcache - poslimit * 5 / 6); 2579 } else { 2580 pos_cache_hysteresis_state[critpath] = CHI_LOW; 2581 } 2582 break; 2583 } 2584 2585 /* 2586 * Clean out dangling defered-zap ncps which could not 2587 * be cleanly dropped if too many build up. Note 2588 * that numdefered is not an exact number as such ncps 2589 * can be reused and the counter is not handled in a MP 2590 * safe manner by design. 2591 */ 2592 if (numdefered > neglimit) { 2593 _cache_cleandefered(); 2594 } 2595 } 2596 2597 /* 2598 * NEW NAMECACHE LOOKUP API 2599 * 2600 * Lookup an entry in the namecache. The passed par_nch must be referenced 2601 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp 2602 * is ALWAYS returned, eve if the supplied component is illegal. 2603 * 2604 * The resulting namecache entry should be returned to the system with 2605 * cache_put() or cache_unlock() + cache_drop(). 
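 *
 * An illustrative lookup sequence ('par_nch', 'name' and 'cred' are
 * assumed to be supplied by the caller; error handling is
 * abbreviated):
 *
 *	struct nlcomponent nlc;
 *	struct nchandle nch;
 *	int error;
 *
 *	nlc.nlc_nameptr = name;
 *	nlc.nlc_namelen = strlen(name);
 *	nch = cache_nlookup(&par_nch, &nlc);
 *	if (nch.ncp->nc_flag & NCF_UNRESOLVED)
 *		error = cache_resolve(&nch, cred);
 *	else
 *		error = nch.ncp->nc_error;
 *	... on success use nch.ncp and nch.ncp->nc_vp ...
 *	cache_put(&nch);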
2606 * 2607 * namecache locks are recursive but care must be taken to avoid lock order 2608 * reversals (hence why the passed par_nch must be unlocked). Locking 2609 * rules are to order for parent traversals, not for child traversals. 2610 * 2611 * Nobody else will be able to manipulate the associated namespace (e.g. 2612 * create, delete, rename, rename-target) until the caller unlocks the 2613 * entry. 2614 * 2615 * The returned entry will be in one of three states: positive hit (non-null 2616 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set). 2617 * Unresolved entries must be resolved through the filesystem to associate the 2618 * vnode and/or determine whether a positive or negative hit has occured. 2619 * 2620 * It is not necessary to lock a directory in order to lock namespace under 2621 * that directory. In fact, it is explicitly not allowed to do that. A 2622 * directory is typically only locked when being created, renamed, or 2623 * destroyed. 2624 * 2625 * The directory (par) may be unresolved, in which case any returned child 2626 * will likely also be marked unresolved. Likely but not guarenteed. Since 2627 * the filesystem lookup requires a resolved directory vnode the caller is 2628 * responsible for resolving the namecache chain top-down. This API 2629 * specifically allows whole chains to be created in an unresolved state. 2630 */ 2631 struct nchandle 2632 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc) 2633 { 2634 struct nchandle nch; 2635 struct namecache *ncp; 2636 struct namecache *new_ncp; 2637 struct nchash_head *nchpp; 2638 struct mount *mp; 2639 u_int32_t hash; 2640 globaldata_t gd; 2641 int par_locked; 2642 2643 numcalls++; 2644 gd = mycpu; 2645 mp = par_nch->mount; 2646 par_locked = 0; 2647 2648 /* 2649 * This is a good time to call it, no ncp's are locked by 2650 * the caller or us. 2651 */ 2652 cache_hysteresis(1); 2653 2654 /* 2655 * Try to locate an existing entry 2656 */ 2657 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 2658 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 2659 new_ncp = NULL; 2660 nchpp = NCHHASH(hash); 2661 restart: 2662 spin_lock(&nchpp->spin); 2663 LIST_FOREACH(ncp, &nchpp->list, nc_hash) { 2664 numchecks++; 2665 2666 /* 2667 * Break out if we find a matching entry. Note that 2668 * UNRESOLVED entries may match, but DESTROYED entries 2669 * do not. 2670 */ 2671 if (ncp->nc_parent == par_nch->ncp && 2672 ncp->nc_nlen == nlc->nlc_namelen && 2673 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 2674 (ncp->nc_flag & NCF_DESTROYED) == 0 2675 ) { 2676 _cache_hold(ncp); 2677 spin_unlock(&nchpp->spin); 2678 if (par_locked) { 2679 _cache_unlock(par_nch->ncp); 2680 par_locked = 0; 2681 } 2682 if (_cache_lock_special(ncp) == 0) { 2683 _cache_auto_unresolve(mp, ncp); 2684 if (new_ncp) 2685 _cache_free(new_ncp); 2686 goto found; 2687 } 2688 _cache_get(ncp); 2689 _cache_put(ncp); 2690 _cache_drop(ncp); 2691 goto restart; 2692 } 2693 } 2694 2695 /* 2696 * We failed to locate an entry, create a new entry and add it to 2697 * the cache. The parent ncp must also be locked so we 2698 * can link into it. 2699 * 2700 * We have to relookup after possibly blocking in kmalloc or 2701 * when locking par_nch. 2702 * 2703 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 2704 * mount case, in which case nc_name will be NULL. 
2705 */ 2706 if (new_ncp == NULL) { 2707 spin_unlock(&nchpp->spin); 2708 new_ncp = cache_alloc(nlc->nlc_namelen); 2709 if (nlc->nlc_namelen) { 2710 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 2711 nlc->nlc_namelen); 2712 new_ncp->nc_name[nlc->nlc_namelen] = 0; 2713 } 2714 goto restart; 2715 } 2716 if (par_locked == 0) { 2717 spin_unlock(&nchpp->spin); 2718 _cache_lock(par_nch->ncp); 2719 par_locked = 1; 2720 goto restart; 2721 } 2722 2723 /* 2724 * WARNING! We still hold the spinlock. We have to set the hash 2725 * table entry atomically. 2726 */ 2727 ncp = new_ncp; 2728 _cache_link_parent(ncp, par_nch->ncp, nchpp); 2729 spin_unlock(&nchpp->spin); 2730 _cache_unlock(par_nch->ncp); 2731 /* par_locked = 0 - not used */ 2732 found: 2733 /* 2734 * stats and namecache size management 2735 */ 2736 if (ncp->nc_flag & NCF_UNRESOLVED) 2737 ++gd->gd_nchstats->ncs_miss; 2738 else if (ncp->nc_vp) 2739 ++gd->gd_nchstats->ncs_goodhits; 2740 else 2741 ++gd->gd_nchstats->ncs_neghits; 2742 nch.mount = mp; 2743 nch.ncp = ncp; 2744 atomic_add_int(&nch.mount->mnt_refs, 1); 2745 return(nch); 2746 } 2747 2748 /* 2749 * Attempt to lookup a namecache entry and return with a shared namecache 2750 * lock. 2751 */ 2752 int 2753 cache_nlookup_maybe_shared(struct nchandle *par_nch, struct nlcomponent *nlc, 2754 int excl, struct nchandle *res_nch) 2755 { 2756 struct namecache *ncp; 2757 struct nchash_head *nchpp; 2758 struct mount *mp; 2759 u_int32_t hash; 2760 globaldata_t gd; 2761 2762 /* 2763 * If exclusive requested or shared namecache locks are disabled, 2764 * return failure. 2765 */ 2766 if (ncp_shared_lock_disable || excl) 2767 return(EWOULDBLOCK); 2768 2769 numcalls++; 2770 gd = mycpu; 2771 mp = par_nch->mount; 2772 2773 /* 2774 * This is a good time to call it, no ncp's are locked by 2775 * the caller or us. 2776 */ 2777 cache_hysteresis(1); 2778 2779 /* 2780 * Try to locate an existing entry 2781 */ 2782 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 2783 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 2784 nchpp = NCHHASH(hash); 2785 2786 spin_lock(&nchpp->spin); 2787 2788 LIST_FOREACH(ncp, &nchpp->list, nc_hash) { 2789 numchecks++; 2790 2791 /* 2792 * Break out if we find a matching entry. Note that 2793 * UNRESOLVED entries may match, but DESTROYED entries 2794 * do not. 2795 */ 2796 if (ncp->nc_parent == par_nch->ncp && 2797 ncp->nc_nlen == nlc->nlc_namelen && 2798 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 2799 (ncp->nc_flag & NCF_DESTROYED) == 0 2800 ) { 2801 _cache_hold(ncp); 2802 spin_unlock(&nchpp->spin); 2803 if (_cache_lock_shared_special(ncp) == 0) { 2804 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && 2805 (ncp->nc_flag & NCF_DESTROYED) == 0 && 2806 _cache_auto_unresolve_test(mp, ncp) == 0) { 2807 goto found; 2808 } 2809 _cache_unlock(ncp); 2810 } 2811 _cache_drop(ncp); 2812 spin_lock(&nchpp->spin); 2813 break; 2814 } 2815 } 2816 2817 /* 2818 * Failure 2819 */ 2820 spin_unlock(&nchpp->spin); 2821 return(EWOULDBLOCK); 2822 2823 /* 2824 * Success 2825 * 2826 * Note that nc_error might be non-zero (e.g ENOENT). 2827 */ 2828 found: 2829 res_nch->mount = mp; 2830 res_nch->ncp = ncp; 2831 ++gd->gd_nchstats->ncs_goodhits; 2832 atomic_add_int(&res_nch->mount->mnt_refs, 1); 2833 2834 KKASSERT(ncp->nc_error != EWOULDBLOCK); 2835 return(ncp->nc_error); 2836 } 2837 2838 /* 2839 * This is a non-blocking verison of cache_nlookup() used by 2840 * nfs_readdirplusrpc_uio(). It can fail for any reason and 2841 * will return nch.ncp == NULL in that case. 
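 *
 * Callers must be prepared to fall back, e.g. (illustrative):
 *
 *	nch = cache_nlookup_nonblock(&par_nch, &nlc);
 *	if (nch.ncp == NULL) {
 *		... skip the entry or retry with cache_nlookup() ...
 *	} else {
 *		... referenced and locked, as with cache_nlookup() ...
 *		cache_put(&nch);
 *	}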
2842 */ 2843 struct nchandle 2844 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc) 2845 { 2846 struct nchandle nch; 2847 struct namecache *ncp; 2848 struct namecache *new_ncp; 2849 struct nchash_head *nchpp; 2850 struct mount *mp; 2851 u_int32_t hash; 2852 globaldata_t gd; 2853 int par_locked; 2854 2855 numcalls++; 2856 gd = mycpu; 2857 mp = par_nch->mount; 2858 par_locked = 0; 2859 2860 /* 2861 * Try to locate an existing entry 2862 */ 2863 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); 2864 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); 2865 new_ncp = NULL; 2866 nchpp = NCHHASH(hash); 2867 restart: 2868 spin_lock(&nchpp->spin); 2869 LIST_FOREACH(ncp, &nchpp->list, nc_hash) { 2870 numchecks++; 2871 2872 /* 2873 * Break out if we find a matching entry. Note that 2874 * UNRESOLVED entries may match, but DESTROYED entries 2875 * do not. 2876 */ 2877 if (ncp->nc_parent == par_nch->ncp && 2878 ncp->nc_nlen == nlc->nlc_namelen && 2879 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && 2880 (ncp->nc_flag & NCF_DESTROYED) == 0 2881 ) { 2882 _cache_hold(ncp); 2883 spin_unlock(&nchpp->spin); 2884 if (par_locked) { 2885 _cache_unlock(par_nch->ncp); 2886 par_locked = 0; 2887 } 2888 if (_cache_lock_special(ncp) == 0) { 2889 _cache_auto_unresolve(mp, ncp); 2890 if (new_ncp) { 2891 _cache_free(new_ncp); 2892 new_ncp = NULL; 2893 } 2894 goto found; 2895 } 2896 _cache_drop(ncp); 2897 goto failed; 2898 } 2899 } 2900 2901 /* 2902 * We failed to locate an entry, create a new entry and add it to 2903 * the cache. The parent ncp must also be locked so we 2904 * can link into it. 2905 * 2906 * We have to relookup after possibly blocking in kmalloc or 2907 * when locking par_nch. 2908 * 2909 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special 2910 * mount case, in which case nc_name will be NULL. 2911 */ 2912 if (new_ncp == NULL) { 2913 spin_unlock(&nchpp->spin); 2914 new_ncp = cache_alloc(nlc->nlc_namelen); 2915 if (nlc->nlc_namelen) { 2916 bcopy(nlc->nlc_nameptr, new_ncp->nc_name, 2917 nlc->nlc_namelen); 2918 new_ncp->nc_name[nlc->nlc_namelen] = 0; 2919 } 2920 goto restart; 2921 } 2922 if (par_locked == 0) { 2923 spin_unlock(&nchpp->spin); 2924 if (_cache_lock_nonblock(par_nch->ncp) == 0) { 2925 par_locked = 1; 2926 goto restart; 2927 } 2928 goto failed; 2929 } 2930 2931 /* 2932 * WARNING! We still hold the spinlock. We have to set the hash 2933 * table entry atomically. 2934 */ 2935 ncp = new_ncp; 2936 _cache_link_parent(ncp, par_nch->ncp, nchpp); 2937 spin_unlock(&nchpp->spin); 2938 _cache_unlock(par_nch->ncp); 2939 /* par_locked = 0 - not used */ 2940 found: 2941 /* 2942 * stats and namecache size management 2943 */ 2944 if (ncp->nc_flag & NCF_UNRESOLVED) 2945 ++gd->gd_nchstats->ncs_miss; 2946 else if (ncp->nc_vp) 2947 ++gd->gd_nchstats->ncs_goodhits; 2948 else 2949 ++gd->gd_nchstats->ncs_neghits; 2950 nch.mount = mp; 2951 nch.ncp = ncp; 2952 atomic_add_int(&nch.mount->mnt_refs, 1); 2953 return(nch); 2954 failed: 2955 if (new_ncp) { 2956 _cache_free(new_ncp); 2957 new_ncp = NULL; 2958 } 2959 nch.mount = NULL; 2960 nch.ncp = NULL; 2961 return(nch); 2962 } 2963 2964 /* 2965 * The namecache entry is marked as being used as a mount point. 2966 * Locate the mount if it is visible to the caller. The DragonFly 2967 * mount system allows arbitrary loops in the topology and disentangles 2968 * those loops by matching against (mp, ncp) rather than just (ncp). 
2969 * This means any given ncp can dive any number of mounts, depending 2970 * on the relative mount (e.g. nullfs) the caller is at in the topology. 2971 * 2972 * We use a very simple frontend cache to reduce SMP conflicts, 2973 * which we have to do because the mountlist scan needs an exclusive 2974 * lock around its ripout info list. Not to mention that there might 2975 * be a lot of mounts. 2976 */ 2977 struct findmount_info { 2978 struct mount *result; 2979 struct mount *nch_mount; 2980 struct namecache *nch_ncp; 2981 }; 2982 2983 static 2984 struct ncmount_cache * 2985 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp) 2986 { 2987 int hash; 2988 2989 hash = ((int)(intptr_t)mp / sizeof(*mp)) ^ 2990 ((int)(intptr_t)ncp / sizeof(*ncp)); 2991 hash = (hash & 0x7FFFFFFF) % NCMOUNT_NUMCACHE; 2992 return (&ncmount_cache[hash]); 2993 } 2994 2995 static 2996 int 2997 cache_findmount_callback(struct mount *mp, void *data) 2998 { 2999 struct findmount_info *info = data; 3000 3001 /* 3002 * Check the mount's mounted-on point against the passed nch. 3003 */ 3004 if (mp->mnt_ncmounton.mount == info->nch_mount && 3005 mp->mnt_ncmounton.ncp == info->nch_ncp 3006 ) { 3007 info->result = mp; 3008 atomic_add_int(&mp->mnt_refs, 1); 3009 return(-1); 3010 } 3011 return(0); 3012 } 3013 3014 struct mount * 3015 cache_findmount(struct nchandle *nch) 3016 { 3017 struct findmount_info info; 3018 struct ncmount_cache *ncc; 3019 struct mount *mp; 3020 3021 /* 3022 * Fast 3023 */ 3024 if (ncmount_cache_enable == 0) { 3025 ncc = NULL; 3026 goto skip; 3027 } 3028 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3029 if (ncc->ncp == nch->ncp) { 3030 spin_lock_shared(&ncc->spin); 3031 if (ncc->isneg == 0 && 3032 ncc->ncp == nch->ncp && (mp = ncc->mp) != NULL) { 3033 if (mp->mnt_ncmounton.mount == nch->mount && 3034 mp->mnt_ncmounton.ncp == nch->ncp) { 3035 /* 3036 * Cache hit (positive) 3037 */ 3038 atomic_add_int(&mp->mnt_refs, 1); 3039 spin_unlock_shared(&ncc->spin); 3040 ++ncmount_cache_hit; 3041 return(mp); 3042 } 3043 /* else cache miss */ 3044 } 3045 if (ncc->isneg && 3046 ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3047 /* 3048 * Cache hit (negative) 3049 */ 3050 spin_unlock_shared(&ncc->spin); 3051 ++ncmount_cache_hit; 3052 return(NULL); 3053 } 3054 spin_unlock_shared(&ncc->spin); 3055 } 3056 skip: 3057 3058 /* 3059 * Slow 3060 */ 3061 info.result = NULL; 3062 info.nch_mount = nch->mount; 3063 info.nch_ncp = nch->ncp; 3064 mountlist_scan(cache_findmount_callback, &info, 3065 MNTSCAN_FORWARD|MNTSCAN_NOBUSY); 3066 3067 /* 3068 * Cache the result. 3069 * 3070 * Negative lookups: We cache the originating {ncp,mp}. (mp) is 3071 * only used for pointer comparisons and is not 3072 * referenced (otherwise there would be dangling 3073 * refs). 3074 * 3075 * Positive lookups: We cache the originating {ncp} and the target 3076 * (mp). (mp) is referenced. 3077 * 3078 * Indeterminant: If the match is undergoing an unmount we do 3079 * not cache it to avoid racing cache_unmounting(), 3080 * but still return the match. 
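 *
 * In every case where a non-NULL mount is returned (cached or not)
 * the caller owns a reference on it and must release it with
 * cache_dropmount(). Illustrative caller sketch:
 *
 *	if ((mp = cache_findmount(&nch)) != NULL) {
 *		... cross into the mounted filesystem via mp ...
 *		cache_dropmount(mp);
 *	}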
3081 */ 3082 if (ncc) { 3083 spin_lock(&ncc->spin); 3084 if (info.result == NULL) { 3085 if (ncc->isneg == 0 && ncc->mp) 3086 atomic_add_int(&ncc->mp->mnt_refs, -1); 3087 ncc->ncp = nch->ncp; 3088 ncc->mp = nch->mount; 3089 ncc->isneg = 1; 3090 spin_unlock(&ncc->spin); 3091 ++ncmount_cache_overwrite; 3092 } else if ((info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0) { 3093 if (ncc->isneg == 0 && ncc->mp) 3094 atomic_add_int(&ncc->mp->mnt_refs, -1); 3095 atomic_add_int(&info.result->mnt_refs, 1); 3096 ncc->ncp = nch->ncp; 3097 ncc->mp = info.result; 3098 ncc->isneg = 0; 3099 spin_unlock(&ncc->spin); 3100 ++ncmount_cache_overwrite; 3101 } else { 3102 spin_unlock(&ncc->spin); 3103 } 3104 ++ncmount_cache_miss; 3105 } 3106 return(info.result); 3107 } 3108 3109 void 3110 cache_dropmount(struct mount *mp) 3111 { 3112 atomic_add_int(&mp->mnt_refs, -1); 3113 } 3114 3115 void 3116 cache_ismounting(struct mount *mp) 3117 { 3118 struct nchandle *nch = &mp->mnt_ncmounton; 3119 struct ncmount_cache *ncc; 3120 3121 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3122 if (ncc->isneg && 3123 ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3124 spin_lock(&ncc->spin); 3125 if (ncc->isneg && 3126 ncc->ncp == nch->ncp && ncc->mp == nch->mount) { 3127 ncc->ncp = NULL; 3128 ncc->mp = NULL; 3129 } 3130 spin_unlock(&ncc->spin); 3131 } 3132 } 3133 3134 void 3135 cache_unmounting(struct mount *mp) 3136 { 3137 struct nchandle *nch = &mp->mnt_ncmounton; 3138 struct ncmount_cache *ncc; 3139 3140 ncc = ncmount_cache_lookup(nch->mount, nch->ncp); 3141 if (ncc->isneg == 0 && 3142 ncc->ncp == nch->ncp && ncc->mp == mp) { 3143 spin_lock(&ncc->spin); 3144 if (ncc->isneg == 0 && 3145 ncc->ncp == nch->ncp && ncc->mp == mp) { 3146 atomic_add_int(&mp->mnt_refs, -1); 3147 ncc->ncp = NULL; 3148 ncc->mp = NULL; 3149 } 3150 spin_unlock(&ncc->spin); 3151 } 3152 } 3153 3154 /* 3155 * Resolve an unresolved namecache entry, generally by looking it up. 3156 * The passed ncp must be locked and refd. 3157 * 3158 * Theoretically since a vnode cannot be recycled while held, and since 3159 * the nc_parent chain holds its vnode as long as children exist, the 3160 * direct parent of the cache entry we are trying to resolve should 3161 * have a valid vnode. If not then generate an error that we can 3162 * determine is related to a resolver bug. 3163 * 3164 * However, if a vnode was in the middle of a recyclement when the NCP 3165 * got locked, ncp->nc_vp might point to a vnode that is about to become 3166 * invalid. cache_resolve() handles this case by unresolving the entry 3167 * and then re-resolving it. 3168 * 3169 * Note that successful resolution does not necessarily return an error 3170 * code of 0. If the ncp resolves to a negative cache hit then ENOENT 3171 * will be returned. 3172 */ 3173 int 3174 cache_resolve(struct nchandle *nch, struct ucred *cred) 3175 { 3176 struct namecache *par_tmp; 3177 struct namecache *par; 3178 struct namecache *ncp; 3179 struct nchandle nctmp; 3180 struct mount *mp; 3181 struct vnode *dvp; 3182 int error; 3183 3184 ncp = nch->ncp; 3185 mp = nch->mount; 3186 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); 3187 restart: 3188 /* 3189 * If the ncp is already resolved we have nothing to do. However, 3190 * we do want to guarentee that a usable vnode is returned when 3191 * a vnode is present, so make sure it hasn't been reclaimed. 
3192 */ 3193 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3194 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 3195 _cache_setunresolved(ncp); 3196 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) 3197 return (ncp->nc_error); 3198 } 3199 3200 /* 3201 * If the ncp was destroyed it will never resolve again. This 3202 * can basically only happen when someone is chdir'd into an 3203 * empty directory which is then rmdir'd. We want to catch this 3204 * here and not dive the VFS because the VFS might actually 3205 * have a way to re-resolve the disconnected ncp, which will 3206 * result in inconsistencies in the cdir/nch for proc->p_fd. 3207 */ 3208 if (ncp->nc_flag & NCF_DESTROYED) { 3209 kprintf("Warning: cache_resolve: ncp '%s' was unlinked\n", 3210 ncp->nc_name); 3211 return(EINVAL); 3212 } 3213 3214 /* 3215 * Mount points need special handling because the parent does not 3216 * belong to the same filesystem as the ncp. 3217 */ 3218 if (ncp == mp->mnt_ncmountpt.ncp) 3219 return (cache_resolve_mp(mp)); 3220 3221 /* 3222 * We expect an unbroken chain of ncps to at least the mount point, 3223 * and even all the way to root (but this code doesn't have to go 3224 * past the mount point). 3225 */ 3226 if (ncp->nc_parent == NULL) { 3227 kprintf("EXDEV case 1 %p %*.*s\n", ncp, 3228 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 3229 ncp->nc_error = EXDEV; 3230 return(ncp->nc_error); 3231 } 3232 3233 /* 3234 * The vp's of the parent directories in the chain are held via vhold() 3235 * due to the existance of the child, and should not disappear. 3236 * However, there are cases where they can disappear: 3237 * 3238 * - due to filesystem I/O errors. 3239 * - due to NFS being stupid about tracking the namespace and 3240 * destroys the namespace for entire directories quite often. 3241 * - due to forced unmounts. 3242 * - due to an rmdir (parent will be marked DESTROYED) 3243 * 3244 * When this occurs we have to track the chain backwards and resolve 3245 * it, looping until the resolver catches up to the current node. We 3246 * could recurse here but we might run ourselves out of kernel stack 3247 * so we do it in a more painful manner. This situation really should 3248 * not occur all that often, or if it does not have to go back too 3249 * many nodes to resolve the ncp. 3250 */ 3251 while ((dvp = cache_dvpref(ncp)) == NULL) { 3252 /* 3253 * This case can occur if a process is CD'd into a 3254 * directory which is then rmdir'd. If the parent is marked 3255 * destroyed there is no point trying to resolve it. 3256 */ 3257 if (ncp->nc_parent->nc_flag & NCF_DESTROYED) 3258 return(ENOENT); 3259 par = ncp->nc_parent; 3260 _cache_hold(par); 3261 _cache_lock(par); 3262 while ((par_tmp = par->nc_parent) != NULL && 3263 par_tmp->nc_vp == NULL) { 3264 _cache_hold(par_tmp); 3265 _cache_lock(par_tmp); 3266 _cache_put(par); 3267 par = par_tmp; 3268 } 3269 if (par->nc_parent == NULL) { 3270 kprintf("EXDEV case 2 %*.*s\n", 3271 par->nc_nlen, par->nc_nlen, par->nc_name); 3272 _cache_put(par); 3273 return (EXDEV); 3274 } 3275 kprintf("[diagnostic] cache_resolve: had to recurse on %*.*s\n", 3276 par->nc_nlen, par->nc_nlen, par->nc_name); 3277 /* 3278 * The parent is not set in stone, ref and lock it to prevent 3279 * it from disappearing. Also note that due to renames it 3280 * is possible for our ncp to move and for par to no longer 3281 * be one of its parents. We resolve it anyway, the loop 3282 * will handle any moves. 
3283 */ 3284 _cache_get(par); /* additional hold/lock */ 3285 _cache_put(par); /* from earlier hold/lock */ 3286 if (par == nch->mount->mnt_ncmountpt.ncp) { 3287 cache_resolve_mp(nch->mount); 3288 } else if ((dvp = cache_dvpref(par)) == NULL) { 3289 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", par->nc_nlen, par->nc_nlen, par->nc_name); 3290 _cache_put(par); 3291 continue; 3292 } else { 3293 if (par->nc_flag & NCF_UNRESOLVED) { 3294 nctmp.mount = mp; 3295 nctmp.ncp = par; 3296 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 3297 } 3298 vrele(dvp); 3299 } 3300 if ((error = par->nc_error) != 0) { 3301 if (par->nc_error != EAGAIN) { 3302 kprintf("EXDEV case 3 %*.*s error %d\n", 3303 par->nc_nlen, par->nc_nlen, par->nc_name, 3304 par->nc_error); 3305 _cache_put(par); 3306 return(error); 3307 } 3308 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n", 3309 par, par->nc_nlen, par->nc_nlen, par->nc_name); 3310 } 3311 _cache_put(par); 3312 /* loop */ 3313 } 3314 3315 /* 3316 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected 3317 * ncp's and reattach them. If this occurs the original ncp is marked 3318 * EAGAIN to force a relookup. 3319 * 3320 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed 3321 * ncp must already be resolved. 3322 */ 3323 if (dvp) { 3324 nctmp.mount = mp; 3325 nctmp.ncp = ncp; 3326 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred); 3327 vrele(dvp); 3328 } else { 3329 ncp->nc_error = EPERM; 3330 } 3331 if (ncp->nc_error == EAGAIN) { 3332 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n", 3333 ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); 3334 goto restart; 3335 } 3336 return(ncp->nc_error); 3337 } 3338 3339 /* 3340 * Resolve the ncp associated with a mount point. Such ncp's almost always 3341 * remain resolved and this routine is rarely called. NFS MPs tends to force 3342 * re-resolution more often due to its mac-truck-smash-the-namecache 3343 * method of tracking namespace changes. 3344 * 3345 * The semantics for this call is that the passed ncp must be locked on 3346 * entry and will be locked on return. However, if we actually have to 3347 * resolve the mount point we temporarily unlock the entry in order to 3348 * avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of 3349 * the unlock we have to recheck the flags after we relock. 3350 */ 3351 static int 3352 cache_resolve_mp(struct mount *mp) 3353 { 3354 struct namecache *ncp = mp->mnt_ncmountpt.ncp; 3355 struct vnode *vp; 3356 int error; 3357 3358 KKASSERT(mp != NULL); 3359 3360 /* 3361 * If the ncp is already resolved we have nothing to do. However, 3362 * we do want to guarentee that a usable vnode is returned when 3363 * a vnode is present, so make sure it hasn't been reclaimed. 3364 */ 3365 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3366 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) 3367 _cache_setunresolved(ncp); 3368 } 3369 3370 if (ncp->nc_flag & NCF_UNRESOLVED) { 3371 _cache_unlock(ncp); 3372 while (vfs_busy(mp, 0)) 3373 ; 3374 error = VFS_ROOT(mp, &vp); 3375 _cache_lock(ncp); 3376 3377 /* 3378 * recheck the ncp state after relocking. 
3379 */ 3380 if (ncp->nc_flag & NCF_UNRESOLVED) { 3381 ncp->nc_error = error; 3382 if (error == 0) { 3383 _cache_setvp(mp, ncp, vp); 3384 vput(vp); 3385 } else { 3386 kprintf("[diagnostic] cache_resolve_mp: failed" 3387 " to resolve mount %p err=%d ncp=%p\n", 3388 mp, error, ncp); 3389 _cache_setvp(mp, ncp, NULL); 3390 } 3391 } else if (error == 0) { 3392 vput(vp); 3393 } 3394 vfs_unbusy(mp); 3395 } 3396 return(ncp->nc_error); 3397 } 3398 3399 /* 3400 * Clean out negative cache entries when too many have accumulated. 3401 */ 3402 static void 3403 _cache_cleanneg(int count) 3404 { 3405 struct namecache *ncp; 3406 3407 /* 3408 * Attempt to clean out the specified number of negative cache 3409 * entries. 3410 */ 3411 while (count) { 3412 spin_lock(&ncspin); 3413 ncp = TAILQ_FIRST(&ncneglist); 3414 if (ncp == NULL) { 3415 spin_unlock(&ncspin); 3416 break; 3417 } 3418 TAILQ_REMOVE(&ncneglist, ncp, nc_vnode); 3419 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode); 3420 _cache_hold(ncp); 3421 spin_unlock(&ncspin); 3422 3423 /* 3424 * This can race, so we must re-check that the ncp 3425 * is on the ncneglist after successfully locking it. 3426 */ 3427 if (_cache_lock_special(ncp) == 0) { 3428 if (ncp->nc_vp == NULL && 3429 (ncp->nc_flag & NCF_UNRESOLVED) == 0) { 3430 ncp = cache_zap(ncp, 1); 3431 if (ncp) 3432 _cache_drop(ncp); 3433 } else { 3434 kprintf("cache_cleanneg: race avoided\n"); 3435 _cache_unlock(ncp); 3436 } 3437 } else { 3438 _cache_drop(ncp); 3439 } 3440 --count; 3441 } 3442 } 3443 3444 /* 3445 * Clean out positive cache entries when too many have accumulated. 3446 */ 3447 static void 3448 _cache_cleanpos(int count) 3449 { 3450 static volatile int rover; 3451 struct nchash_head *nchpp; 3452 struct namecache *ncp; 3453 int rover_copy; 3454 3455 /* 3456 * Attempt to clean out the specified number of negative cache 3457 * entries. 3458 */ 3459 while (count) { 3460 rover_copy = ++rover; /* MPSAFEENOUGH */ 3461 cpu_ccfence(); 3462 nchpp = NCHHASH(rover_copy); 3463 3464 spin_lock(&nchpp->spin); 3465 ncp = LIST_FIRST(&nchpp->list); 3466 while (ncp && (ncp->nc_flag & NCF_DESTROYED)) 3467 ncp = LIST_NEXT(ncp, nc_hash); 3468 if (ncp) 3469 _cache_hold(ncp); 3470 spin_unlock(&nchpp->spin); 3471 3472 if (ncp) { 3473 if (_cache_lock_special(ncp) == 0) { 3474 ncp = cache_zap(ncp, 1); 3475 if (ncp) 3476 _cache_drop(ncp); 3477 } else { 3478 _cache_drop(ncp); 3479 } 3480 } 3481 --count; 3482 } 3483 } 3484 3485 /* 3486 * This is a kitchen sink function to clean out ncps which we 3487 * tried to zap from cache_drop() but failed because we were 3488 * unable to acquire the parent lock. 3489 * 3490 * Such entries can also be removed via cache_inval_vp(), such 3491 * as when unmounting. 
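 *
 * The scan below keeps its place in each hash chain with a dummy
 * NCF_DESTROYED placeholder entry: the placeholder is inserted after
 * the current position so the chain spinlock can be released while
 * the candidate ncp is locked and its NCF_DEFEREDZAP flag cleared,
 * after which the scan resumes from the placeholder.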
3492 */ 3493 static void 3494 _cache_cleandefered(void) 3495 { 3496 struct nchash_head *nchpp; 3497 struct namecache *ncp; 3498 struct namecache dummy; 3499 int i; 3500 3501 numdefered = 0; 3502 bzero(&dummy, sizeof(dummy)); 3503 dummy.nc_flag = NCF_DESTROYED; 3504 dummy.nc_refs = 1; 3505 3506 for (i = 0; i <= nchash; ++i) { 3507 nchpp = &nchashtbl[i]; 3508 3509 spin_lock(&nchpp->spin); 3510 LIST_INSERT_HEAD(&nchpp->list, &dummy, nc_hash); 3511 ncp = &dummy; 3512 while ((ncp = LIST_NEXT(ncp, nc_hash)) != NULL) { 3513 if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0) 3514 continue; 3515 LIST_REMOVE(&dummy, nc_hash); 3516 LIST_INSERT_AFTER(ncp, &dummy, nc_hash); 3517 _cache_hold(ncp); 3518 spin_unlock(&nchpp->spin); 3519 if (_cache_lock_nonblock(ncp) == 0) { 3520 ncp->nc_flag &= ~NCF_DEFEREDZAP; 3521 _cache_unlock(ncp); 3522 } 3523 _cache_drop(ncp); 3524 spin_lock(&nchpp->spin); 3525 ncp = &dummy; 3526 } 3527 LIST_REMOVE(&dummy, nc_hash); 3528 spin_unlock(&nchpp->spin); 3529 } 3530 } 3531 3532 /* 3533 * Name cache initialization, from vfsinit() when we are booting 3534 */ 3535 void 3536 nchinit(void) 3537 { 3538 int i; 3539 globaldata_t gd; 3540 3541 /* initialise per-cpu namecache effectiveness statistics. */ 3542 for (i = 0; i < ncpus; ++i) { 3543 gd = globaldata_find(i); 3544 gd->gd_nchstats = &nchstats[i]; 3545 } 3546 TAILQ_INIT(&ncneglist); 3547 spin_init(&ncspin); 3548 nchashtbl = hashinit_ext(desiredvnodes / 2, 3549 sizeof(struct nchash_head), 3550 M_VFSCACHE, &nchash); 3551 for (i = 0; i <= (int)nchash; ++i) { 3552 LIST_INIT(&nchashtbl[i].list); 3553 spin_init(&nchashtbl[i].spin); 3554 } 3555 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) 3556 spin_init(&ncmount_cache[i].spin); 3557 nclockwarn = 5 * hz; 3558 } 3559 3560 /* 3561 * Called from start_init() to bootstrap the root filesystem. Returns 3562 * a referenced, unlocked namecache record. 3563 */ 3564 void 3565 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp) 3566 { 3567 nch->ncp = cache_alloc(0); 3568 nch->mount = mp; 3569 atomic_add_int(&mp->mnt_refs, 1); 3570 if (vp) 3571 _cache_setvp(nch->mount, nch->ncp, vp); 3572 } 3573 3574 /* 3575 * vfs_cache_setroot() 3576 * 3577 * Create an association between the root of our namecache and 3578 * the root vnode. This routine may be called several times during 3579 * booting. 3580 * 3581 * If the caller intends to save the returned namecache pointer somewhere 3582 * it must cache_hold() it. 3583 */ 3584 void 3585 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch) 3586 { 3587 struct vnode *ovp; 3588 struct nchandle onch; 3589 3590 ovp = rootvnode; 3591 onch = rootnch; 3592 rootvnode = nvp; 3593 if (nch) 3594 rootnch = *nch; 3595 else 3596 cache_zero(&rootnch); 3597 if (ovp) 3598 vrele(ovp); 3599 if (onch.ncp) 3600 cache_drop(&onch); 3601 } 3602 3603 /* 3604 * XXX OLD API COMPAT FUNCTION. This really messes up the new namecache 3605 * topology and is being removed as quickly as possible. The new VOP_N*() 3606 * API calls are required to make specific adjustments using the supplied 3607 * ncp pointers rather then just bogusly purging random vnodes. 3608 * 3609 * Invalidate all namecache entries to a particular vnode as well as 3610 * any direct children of that vnode in the namecache. This is a 3611 * 'catch all' purge used by filesystems that do not know any better. 
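 *
 * (Typical use is a bare "cache_purge(vp);" from a filesystem's own
 * invalidation or reclaim paths; illustrative only, the exact call
 * sites vary by filesystem.)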
3612 * 3613 * Note that the linkage between the vnode and its namecache entries will 3614 * be removed, but the namecache entries themselves might stay put due to 3615 * active references from elsewhere in the system or due to the existance of 3616 * the children. The namecache topology is left intact even if we do not 3617 * know what the vnode association is. Such entries will be marked 3618 * NCF_UNRESOLVED. 3619 */ 3620 void 3621 cache_purge(struct vnode *vp) 3622 { 3623 cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN); 3624 } 3625 3626 /* 3627 * Flush all entries referencing a particular filesystem. 3628 * 3629 * Since we need to check it anyway, we will flush all the invalid 3630 * entries at the same time. 3631 */ 3632 #if 0 3633 3634 void 3635 cache_purgevfs(struct mount *mp) 3636 { 3637 struct nchash_head *nchpp; 3638 struct namecache *ncp, *nnp; 3639 3640 /* 3641 * Scan hash tables for applicable entries. 3642 */ 3643 for (nchpp = &nchashtbl[nchash]; nchpp >= nchashtbl; nchpp--) { 3644 spin_lock_wr(&nchpp->spin); XXX 3645 ncp = LIST_FIRST(&nchpp->list); 3646 if (ncp) 3647 _cache_hold(ncp); 3648 while (ncp) { 3649 nnp = LIST_NEXT(ncp, nc_hash); 3650 if (nnp) 3651 _cache_hold(nnp); 3652 if (ncp->nc_mount == mp) { 3653 _cache_lock(ncp); 3654 ncp = cache_zap(ncp, 0); 3655 if (ncp) 3656 _cache_drop(ncp); 3657 } else { 3658 _cache_drop(ncp); 3659 } 3660 ncp = nnp; 3661 } 3662 spin_unlock_wr(&nchpp->spin); XXX 3663 } 3664 } 3665 3666 #endif 3667 3668 static int disablecwd; 3669 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, 3670 "Disable getcwd"); 3671 3672 static u_long numcwdcalls; 3673 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0, 3674 "Number of current directory resolution calls"); 3675 static u_long numcwdfailnf; 3676 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0, 3677 "Number of current directory failures due to lack of file"); 3678 static u_long numcwdfailsz; 3679 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0, 3680 "Number of current directory failures due to large result"); 3681 static u_long numcwdfound; 3682 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0, 3683 "Number of current directory resolution successes"); 3684 3685 /* 3686 * MPALMOSTSAFE 3687 */ 3688 int 3689 sys___getcwd(struct __getcwd_args *uap) 3690 { 3691 u_int buflen; 3692 int error; 3693 char *buf; 3694 char *bp; 3695 3696 if (disablecwd) 3697 return (ENODEV); 3698 3699 buflen = uap->buflen; 3700 if (buflen == 0) 3701 return (EINVAL); 3702 if (buflen > MAXPATHLEN) 3703 buflen = MAXPATHLEN; 3704 3705 buf = kmalloc(buflen, M_TEMP, M_WAITOK); 3706 bp = kern_getcwd(buf, buflen, &error); 3707 if (error == 0) 3708 error = copyout(bp, uap->buf, strlen(bp) + 1); 3709 kfree(buf, M_TEMP); 3710 return (error); 3711 } 3712 3713 char * 3714 kern_getcwd(char *buf, size_t buflen, int *error) 3715 { 3716 struct proc *p = curproc; 3717 char *bp; 3718 int i, slash_prefixed; 3719 struct filedesc *fdp; 3720 struct nchandle nch; 3721 struct namecache *ncp; 3722 3723 numcwdcalls++; 3724 bp = buf; 3725 bp += buflen - 1; 3726 *bp = '\0'; 3727 fdp = p->p_fd; 3728 slash_prefixed = 0; 3729 3730 nch = fdp->fd_ncdir; 3731 ncp = nch.ncp; 3732 if (ncp) 3733 _cache_hold(ncp); 3734 3735 while (ncp && (ncp != fdp->fd_nrdir.ncp || 3736 nch.mount != fdp->fd_nrdir.mount) 3737 ) { 3738 /* 3739 * While traversing upwards if we encounter the root 3740 * of the current mount we have to skip to the mount point 
3741 * in the underlying filesystem. 3742 */ 3743 if (ncp == nch.mount->mnt_ncmountpt.ncp) { 3744 nch = nch.mount->mnt_ncmounton; 3745 _cache_drop(ncp); 3746 ncp = nch.ncp; 3747 if (ncp) 3748 _cache_hold(ncp); 3749 continue; 3750 } 3751 3752 /* 3753 * Prepend the path segment 3754 */ 3755 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 3756 if (bp == buf) { 3757 numcwdfailsz++; 3758 *error = ERANGE; 3759 bp = NULL; 3760 goto done; 3761 } 3762 *--bp = ncp->nc_name[i]; 3763 } 3764 if (bp == buf) { 3765 numcwdfailsz++; 3766 *error = ERANGE; 3767 bp = NULL; 3768 goto done; 3769 } 3770 *--bp = '/'; 3771 slash_prefixed = 1; 3772 3773 /* 3774 * Go up a directory. This isn't a mount point so we don't 3775 * have to check again. 3776 */ 3777 while ((nch.ncp = ncp->nc_parent) != NULL) { 3778 if (ncp_shared_lock_disable) 3779 _cache_lock(ncp); 3780 else 3781 _cache_lock_shared(ncp); 3782 if (nch.ncp != ncp->nc_parent) { 3783 _cache_unlock(ncp); 3784 continue; 3785 } 3786 _cache_hold(nch.ncp); 3787 _cache_unlock(ncp); 3788 break; 3789 } 3790 _cache_drop(ncp); 3791 ncp = nch.ncp; 3792 } 3793 if (ncp == NULL) { 3794 numcwdfailnf++; 3795 *error = ENOENT; 3796 bp = NULL; 3797 goto done; 3798 } 3799 if (!slash_prefixed) { 3800 if (bp == buf) { 3801 numcwdfailsz++; 3802 *error = ERANGE; 3803 bp = NULL; 3804 goto done; 3805 } 3806 *--bp = '/'; 3807 } 3808 numcwdfound++; 3809 *error = 0; 3810 done: 3811 if (ncp) 3812 _cache_drop(ncp); 3813 return (bp); 3814 } 3815 3816 /* 3817 * Thus begins the fullpath magic. 3818 * 3819 * The passed nchp is referenced but not locked. 3820 */ 3821 static int disablefullpath; 3822 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, 3823 &disablefullpath, 0, 3824 "Disable fullpath lookups"); 3825 3826 static u_int numfullpathcalls; 3827 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathcalls, CTLFLAG_RD, 3828 &numfullpathcalls, 0, 3829 "Number of full path resolutions in progress"); 3830 static u_int numfullpathfailnf; 3831 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailnf, CTLFLAG_RD, 3832 &numfullpathfailnf, 0, 3833 "Number of full path resolution failures due to lack of file"); 3834 static u_int numfullpathfailsz; 3835 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailsz, CTLFLAG_RD, 3836 &numfullpathfailsz, 0, 3837 "Number of full path resolution failures due to insufficient memory"); 3838 static u_int numfullpathfound; 3839 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfound, CTLFLAG_RD, 3840 &numfullpathfound, 0, 3841 "Number of full path resolution successes"); 3842 3843 int 3844 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase, 3845 char **retbuf, char **freebuf, int guess) 3846 { 3847 struct nchandle fd_nrdir; 3848 struct nchandle nch; 3849 struct namecache *ncp; 3850 struct mount *mp, *new_mp; 3851 char *bp, *buf; 3852 int slash_prefixed; 3853 int error = 0; 3854 int i; 3855 3856 atomic_add_int(&numfullpathcalls, -1); 3857 3858 *retbuf = NULL; 3859 *freebuf = NULL; 3860 3861 buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK); 3862 bp = buf + MAXPATHLEN - 1; 3863 *bp = '\0'; 3864 if (nchbase) 3865 fd_nrdir = *nchbase; 3866 else if (p != NULL) 3867 fd_nrdir = p->p_fd->fd_nrdir; 3868 else 3869 fd_nrdir = rootnch; 3870 slash_prefixed = 0; 3871 nch = *nchp; 3872 ncp = nch.ncp; 3873 if (ncp) 3874 _cache_hold(ncp); 3875 mp = nch.mount; 3876 3877 while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) { 3878 new_mp = NULL; 3879 3880 /* 3881 * If we are asked to guess the upwards path, we do so whenever 3882 * we encounter an ncp marked as a mountpoint. 
We try to find 3883 * the actual mountpoint by finding the mountpoint with this 3884 * ncp. 3885 */ 3886 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) { 3887 new_mp = mount_get_by_nc(ncp); 3888 } 3889 /* 3890 * While traversing upwards if we encounter the root 3891 * of the current mount we have to skip to the mount point. 3892 */ 3893 if (ncp == mp->mnt_ncmountpt.ncp) { 3894 new_mp = mp; 3895 } 3896 if (new_mp) { 3897 nch = new_mp->mnt_ncmounton; 3898 _cache_drop(ncp); 3899 ncp = nch.ncp; 3900 if (ncp) 3901 _cache_hold(ncp); 3902 mp = nch.mount; 3903 continue; 3904 } 3905 3906 /* 3907 * Prepend the path segment 3908 */ 3909 for (i = ncp->nc_nlen - 1; i >= 0; i--) { 3910 if (bp == buf) { 3911 numfullpathfailsz++; 3912 kfree(buf, M_TEMP); 3913 error = ENOMEM; 3914 goto done; 3915 } 3916 *--bp = ncp->nc_name[i]; 3917 } 3918 if (bp == buf) { 3919 numfullpathfailsz++; 3920 kfree(buf, M_TEMP); 3921 error = ENOMEM; 3922 goto done; 3923 } 3924 *--bp = '/'; 3925 slash_prefixed = 1; 3926 3927 /* 3928 * Go up a directory. This isn't a mount point so we don't 3929 * have to check again. 3930 * 3931 * We can only safely access nc_parent with ncp held locked. 3932 */ 3933 while ((nch.ncp = ncp->nc_parent) != NULL) { 3934 _cache_lock(ncp); 3935 if (nch.ncp != ncp->nc_parent) { 3936 _cache_unlock(ncp); 3937 continue; 3938 } 3939 _cache_hold(nch.ncp); 3940 _cache_unlock(ncp); 3941 break; 3942 } 3943 _cache_drop(ncp); 3944 ncp = nch.ncp; 3945 } 3946 if (ncp == NULL) { 3947 numfullpathfailnf++; 3948 kfree(buf, M_TEMP); 3949 error = ENOENT; 3950 goto done; 3951 } 3952 3953 if (!slash_prefixed) { 3954 if (bp == buf) { 3955 numfullpathfailsz++; 3956 kfree(buf, M_TEMP); 3957 error = ENOMEM; 3958 goto done; 3959 } 3960 *--bp = '/'; 3961 } 3962 numfullpathfound++; 3963 *retbuf = bp; 3964 *freebuf = buf; 3965 error = 0; 3966 done: 3967 if (ncp) 3968 _cache_drop(ncp); 3969 return(error); 3970 } 3971 3972 int 3973 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, char **freebuf, 3974 int guess) 3975 { 3976 struct namecache *ncp; 3977 struct nchandle nch; 3978 int error; 3979 3980 *freebuf = NULL; 3981 atomic_add_int(&numfullpathcalls, 1); 3982 if (disablefullpath) 3983 return (ENODEV); 3984 3985 if (p == NULL) 3986 return (EINVAL); 3987 3988 /* vn is NULL, client wants us to use p->p_textvp */ 3989 if (vn == NULL) { 3990 if ((vn = p->p_textvp) == NULL) 3991 return (EINVAL); 3992 } 3993 spin_lock(&vn->v_spin); 3994 TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) { 3995 if (ncp->nc_nlen) 3996 break; 3997 } 3998 if (ncp == NULL) { 3999 spin_unlock(&vn->v_spin); 4000 return (EINVAL); 4001 } 4002 _cache_hold(ncp); 4003 spin_unlock(&vn->v_spin); 4004 4005 atomic_add_int(&numfullpathcalls, -1); 4006 nch.ncp = ncp; 4007 nch.mount = vn->v_mount; 4008 error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess); 4009 _cache_drop(ncp); 4010 return (error); 4011 } 4012
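
/*
 * Illustrative use of vn_fullpath() by a hypothetical caller ('p' and
 * 'vp' are assumed to be a valid process and vnode; error handling is
 * elided). On success *retbuf points into the buffer returned in
 * *freebuf, which the caller must free:
 *
 *	char *retbuf;
 *	char *freebuf;
 *
 *	if (vn_fullpath(p, vp, &retbuf, &freebuf, 0) == 0) {
 *		kprintf("path: %s\n", retbuf);
 *		kfree(freebuf, M_TEMP);
 *	}
 */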