xref: /dflybsd-src/sys/kern/vfs_cache.c (revision bc76a771df54af7e361532b257cecc26227736b4)
1 /*
2  * Copyright (c) 1989, 1993, 1995
3  *	The Regents of the University of California.  All rights reserved.
4  * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com>,
5  *	All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Poul-Henning Kamp of the FreeBSD Project.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
39  * $FreeBSD: src/sys/kern/vfs_cache.c,v 1.42.2.6 2001/10/05 20:07:03 dillon Exp $
40  * $DragonFly: src/sys/kern/vfs_cache.c,v 1.16 2004/04/08 22:00:41 dillon Exp $
41  */
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/sysctl.h>
47 #include <sys/mount.h>
48 #include <sys/vnode.h>
49 #include <sys/malloc.h>
50 #include <sys/sysproto.h>
51 #include <sys/proc.h>
52 #include <sys/namei.h>
53 #include <sys/filedesc.h>
54 #include <sys/fnv_hash.h>
55 #include <sys/globaldata.h>
56 
57 /*
58  * Random lookups in the cache are accomplished with a hash table using
59  * a hash key of (parent ncp, name).
60  *
61  * Negative entries may exist and correspond to structures where nc_vp
62  * is NULL.  In a negative entry, NCF_WHITEOUT will be set if the entry
63  * corresponds to a whited-out directory entry (versus simply not finding the
64  * entry at all).
65  *
66  * Upon reaching the last segment of a path, if the reference is for DELETE,
67  * or NOCACHE is set (rewrite), and the name is located in the cache, it
68  * will be dropped.
69  */
70 
71 /*
72  * Structures associated with name caching.
73  */
74 #define NCHHASH(hash)	(&nchashtbl[(hash) & nchash])
75 #define MINNEG		1024
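
/*
 * Illustrative sketch (not a separate API): a hash chain is selected
 * by folding the name and then the parent ncp pointer, mirroring the
 * computation done in cache_lookup() and cache_enter() below:
 *
 *	hash = fnv_32_buf(name, namelen, FNV1_32_INIT);
 *	hash = fnv_32_buf(&par, sizeof(par), hash);
 *	nchpp = NCHHASH(hash);		(= &nchashtbl[hash & nchash])
 */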
76 
77 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
78 
79 static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
80 static struct namecache_list	ncneglist;		/* negative entries, no vnode */
81 static struct namecache		rootnamecache;		/* Dummy node */
82 
83 static int	nczapcheck;		/* debug: verbose cache operation printfs */
84 SYSCTL_INT(_debug, OID_AUTO, nczapcheck, CTLFLAG_RW, &nczapcheck, 0, "");
85 
86 static u_long	nchash;			/* size of hash table */
87 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
88 
89 static u_long	ncnegfactor = 16;	/* ratio of negative entries */
90 SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
91 
92 static u_long	numneg;		/* number of negative entries allocated */
93 SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
94 
95 static u_long	numcache;		/* number of cache entries allocated */
96 SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
97 
98 static u_long	numunres;		/* number of unresolved entries */
99 SYSCTL_ULONG(_debug, OID_AUTO, numunres, CTLFLAG_RD, &numunres, 0, "");
100 
101 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), "");
102 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), "");
103 
104 /*
105  * The new name cache statistics
106  */
107 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
108 #define STATNODE(mode, name, var) \
109 	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
110 STATNODE(CTLFLAG_RD, numneg, &numneg);
111 STATNODE(CTLFLAG_RD, numcache, &numcache);
112 static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
113 static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
114 static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
115 static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
116 static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
117 static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
118 static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
119 static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
120 static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
121 static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
122 
123 struct nchstats nchstats[SMP_MAXCPU];
124 /*
125  * Export VFS cache effectiveness statistics to user-land.
126  *
127  * The statistics are left for aggregation to user-land so
128  * neat things can be achieved, like observing per-CPU cache
129  * distribution.
130  */
131 static int
132 nchstats_agg(SYSCTL_HANDLER_ARGS)
133 {
134 	struct globaldata *gd;
135 	int i, error;
136 
137 	error = 0;
138 	for (i = 0; i < ncpus; ++i) {
139 		gd = globaldata_find(i);
140 		if ((error = SYSCTL_OUT(req, (void *)gd->gd_nchstats,
141 			sizeof(struct nchstats))))
142 			break;
143 	}
144 
145 	return (error);
146 }
147 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
148   0, 0, nchstats_agg, "S,nchstats", "VFS cache effectiveness statistics");
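
/*
 * Sketch of a hypothetical user-land consumer: read the opaque per-cpu
 * array via sysctl and divide the returned length by the structure
 * size to recover the cpu count:
 *
 *	struct nchstats stats[SMP_MAXCPU];
 *	size_t len = sizeof(stats);
 *	int n;
 *
 *	if (sysctlbyname("vfs.cache.nchstats", stats, &len, NULL, 0) == 0)
 *		n = len / sizeof(struct nchstats);
 */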
149 
150 static void cache_zap(struct namecache *ncp);
151 
152 /*
153  * cache_hold() and cache_drop() prevent the premature deletion of a
154  * namecache entry but do not prevent operations (such as zapping) on
155  * that namecache entry.
156  */
157 static __inline
158 struct namecache *
159 _cache_hold(struct namecache *ncp)
160 {
161 	++ncp->nc_refs;
162 	return(ncp);
163 }
164 
165 static __inline
166 void
167 _cache_drop(struct namecache *ncp)
168 {
169 	KKASSERT(ncp->nc_refs > 0);
170 	if (ncp->nc_refs == 1 &&
171 	    (ncp->nc_flag & NCF_UNRESOLVED) &&
172 	    TAILQ_EMPTY(&ncp->nc_list)
173 	) {
174 		cache_zap(ncp);
175 	} else {
176 		--ncp->nc_refs;
177 	}
178 }
179 
180 struct namecache *
181 cache_hold(struct namecache *ncp)
182 {
183 	return(_cache_hold(ncp));
184 }
185 
186 void
187 cache_drop(struct namecache *ncp)
188 {
189 	_cache_drop(ncp);
190 }
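
/*
 * Typical bracketing by a caller (illustrative sketch only; the
 * cache_zap(cache_hold(ncp)) idiom used throughout this file is the
 * in-tree example):
 *
 *	ncp = cache_hold(ncp);		keep ncp from being freed
 *	... operate on ncp, possibly blocking ...
 *	cache_drop(ncp);		may zap an unresolved, unref'd leaf
 */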
191 
192 static void
193 cache_link_parent(struct namecache *ncp, struct namecache *par)
194 {
195 	KKASSERT(ncp->nc_parent == NULL);
196 	ncp->nc_parent = par;
197 	if (TAILQ_EMPTY(&par->nc_list)) {
198 		if (par->nc_vp)
199 			vhold(par->nc_vp);
200 	}
201 	TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
202 }
203 
204 static void
205 cache_unlink_parent(struct namecache *ncp)
206 {
207 	struct namecache *par;
208 
209 	if ((par = ncp->nc_parent) != NULL) {
210 		ncp->nc_parent = NULL;
211 		par = cache_hold(par);
212 		TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
213 		if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
214 			vdrop(par->nc_vp);
215 		cache_drop(par);
216 	}
217 }
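
/*
 * Note that the two routines above maintain the invariant that a
 * parent's vnode is held while the parent has children: linking the
 * first child does vhold(par->nc_vp), unlinking the last child does
 * vdrop(par->nc_vp).
 */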
218 
219 static struct namecache *
220 cache_alloc(struct vnode *vp)
221 {
222 	struct namecache *ncp;
223 
224 	ncp = malloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
225 	TAILQ_INIT(&ncp->nc_list);
226 	ncp->nc_vp = vp;
227 	if (vp != NULL) {
228 		TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
229 		++numcache;
230 	} else {
231 		TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
232 		++numneg;
233 	}
234 	return(ncp);
235 }
236 
237 /*
238  * Try to destroy a namecache entry.  The entry is disassociated from its
239  * vnode or ncneglist and reverted to an UNRESOLVED state.
240  *
241  * Then, if there are no additional references to the ncp and we can
242  * successfully delete the children, the entry is also removed from the
243  * namecache hashlist / topology.
244  *
245  * References or undeletable children will prevent the entry from being
246  * removed from the topology.  The entry may be revalidated (typically
247  * by cache_enter()) at a later time.  Children remain because:
248  *
249  *	+ we have tried to delete a node rather than a leaf in the topology.
250  *	+ the presence of negative entries (we try to scrap these).
251  *	+ an entry or child has a non-zero ref count and cannot be scrapped.
252  *
253  * This function must be called with the ncp held and will drop the ref
254  * count during zapping.
255  */
256 static void
257 cache_zap(struct namecache *ncp)
258 {
259 	struct namecache *par;
260 	struct vnode *vp;
261 
262 	/*
263 	 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
264 	 */
265 	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
266 		ncp->nc_flag |= NCF_UNRESOLVED;
267 		++numunres;
268 		if ((vp = ncp->nc_vp) != NULL) {
269 			--numcache;
270 			ncp->nc_vp = NULL;	/* safety */
271 			TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
272 			if (!TAILQ_EMPTY(&ncp->nc_list))
273 				vdrop(vp);
274 		} else {
275 			TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
276 			--numneg;
277 		}
278 	}
279 
280 	/*
281 	 * Try to scrap the entry and possibly tail-recurse on its parent.
282 	 * We only scrap unref'd (other than our ref) unresolved entries;
283 	 * we do not scrap 'live' entries.
284 	 */
285 	while (ncp->nc_flag & NCF_UNRESOLVED) {
286 		/*
287 		 * Someone other than us has a ref, stop.
288 		 */
289 		if (ncp->nc_refs > 1)
290 			goto done;
291 
292 		/*
293 		 * We have children, stop.
294 		 */
295 		if (!TAILQ_EMPTY(&ncp->nc_list))
296 			goto done;
297 
298 		/*
299 		 * Ok, we can completely destroy and free this entry.  Sanity
300 		 * check it against our static rootnamecache structure,
301 		 * then remove it from the hash.
302 		 */
303 		KKASSERT(ncp != &rootnamecache);
304 
305 		if (ncp->nc_flag & NCF_HASHED) {
306 			ncp->nc_flag &= ~NCF_HASHED;
307 			LIST_REMOVE(ncp, nc_hash);
308 		}
309 
310 		/*
311 		 * Unlink from its parent and free, then loop on the
312 		 * parent.  XXX temp hack, in stage-3 parent is never NULL
313 		 */
314 		if ((par = ncp->nc_parent) != NULL) {
315 			par = cache_hold(par);
316 			TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
317 			if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
318 				vdrop(par->nc_vp);
319 		}
320 		--numunres;
321 		ncp->nc_refs = -1;	/* safety */
322 		ncp->nc_parent = NULL;	/* safety */
323 		if (ncp->nc_name)
324 			free(ncp->nc_name, M_VFSCACHE);
325 		free(ncp, M_VFSCACHE);
326 		ncp = par;
327 		if (par == NULL)	/* temp hack */
328 			return;		/* temp hack */
329 	}
330 done:
331 	--ncp->nc_refs;
332 }
333 
334 /*
335  * Lookup an entry in the cache
336  *
337  * Lookup is called with dvp pointing to the directory to search,
338  * cnp pointing to the name of the entry being sought.
339  *
340  * If the lookup succeeds, the vnode is returned in *vpp, and a
341  * status of -1 is returned.
342  *
343  * If the lookup determines that the name does not exist (negative caching),
344  * a status of ENOENT is returned.
345  *
346  * If the lookup fails, a status of zero is returned.
347  *
348  * Note that UNRESOLVED entries are ignored.  They are not negative cache
349  * entries.
350  */
351 int
352 cache_lookup(struct vnode *dvp, struct namecache *par, struct vnode **vpp,
353 		struct namecache **ncpp, struct componentname *cnp)
354 {
355 	struct namecache *ncp;
356 	u_int32_t hash;
357 	globaldata_t gd = mycpu;
358 
359 	numcalls++;
360 
361 	/*
362 	 * Obtain the namecache entry associated with dvp, creating one if
363 	 * necessary.  If we have to create one we have insufficient
364 	 * information to hash it or even supply the name, but we still
365 	 * need one so we can link it in.
366 	 *
367 	 * NOTE: in this stage of development, the passed 'par' is
368 	 * almost always NULL.
369 	 */
370 	if (par == NULL) {
371 		if ((par = TAILQ_FIRST(&dvp->v_namecache)) == NULL)
372 			par = cache_alloc(dvp);
373 	}
374 
375 	/*
376 	 * Deal with "." and "..".  In this stage of code development we leave
377 	 * the returned ncpp NULL.  Note that if the namecache is disjoint,
378 	 * we won't find a vnode for "..".
379 	 */
380 	if (cnp->cn_nameptr[0] == '.') {
381 		if (cnp->cn_namelen == 1) {
382 			*vpp = dvp;
383 			dothits++;
384 			numposhits++;	/* include in total statistics */
385 			return (-1);
386 		}
387 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
388 			dotdothits++;
389 			numposhits++;	/* include in total statistics */
390 			if ((cnp->cn_flags & CNP_MAKEENTRY) == 0)
391 				return (0);
392 			if (par->nc_parent == NULL ||
393 			    par->nc_parent->nc_vp == NULL) {
394 				return (0);
395 			}
396 			*vpp = par->nc_parent->nc_vp;
397 			return (-1);
398 		}
399 	}
400 
401 	/*
402 	 * Try to locate an existing entry
403 	 */
404 	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
405 	hash = fnv_32_buf(&par, sizeof(par), hash);
406 	if (nczapcheck > 1)
407 	    printf("DVP %p/%p %08x %*.*s\n", dvp, par, hash, (int)cnp->cn_namelen, (int)cnp->cn_namelen, cnp->cn_nameptr);
408 restart:
409 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
410 		numchecks++;
411 		if (nczapcheck > 1) {
412 		    printf("TEST ncp par=%p %*.*s\n",
413 			ncp->nc_parent, ncp->nc_nlen, ncp->nc_nlen,
414 			ncp->nc_name);
415 		}
416 
417 		/*
418 		 * Zap entries that have timed out.
419 		 */
420 		if (ncp->nc_timeout &&
421 		    (int)(ncp->nc_timeout - ticks) < 0
422 		) {
423 			if (nczapcheck > 1)
424 			    printf("TIMEOUT\n");
425 			cache_zap(cache_hold(ncp));
426 			goto restart;
427 		}
428 
429 		/*
430 		 * Break out if we find a matching entry.  UNRESOLVED entries
431 		 * never match (they are in the middle of being destroyed).
432 		 */
433 		if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
434 		    ncp->nc_parent == par &&
435 		    ncp->nc_nlen == cnp->cn_namelen &&
436 		    bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen) == 0
437 		) {
438 			if (nczapcheck > 1)
439 			    printf("GOOD\n");
440 			cache_hold(ncp);
441 			break;
442 		}
443 	}
444 
445 	/*
446 	 * If we failed to locate an entry, return 0 (indicates failure).
447 	 */
448 	if (ncp == NULL) {
449 		if ((cnp->cn_flags & CNP_MAKEENTRY) == 0) {
450 			nummisszap++;
451 		} else {
452 			nummiss++;
453 		}
454 		gd->gd_nchstats->ncs_miss++;
455 		if (nczapcheck) {
456 		    printf("MISS %p/%p %*.*s/%*.*s\n", dvp, par,
457 			par->nc_nlen, par->nc_nlen, (par->nc_name ? par->nc_name : ""),
458 			(int)cnp->cn_namelen, (int)cnp->cn_namelen, cnp->cn_nameptr);
459 		}
460 		return (0);
461 	}
462 
463 	/*
464 	 * If we found an entry, but we don't want to have one, we zap it.
465 	 */
466 	if ((cnp->cn_flags & CNP_MAKEENTRY) == 0) {
467 		numposzaps++;
468 		gd->gd_nchstats->ncs_badhits++;
469 		cache_zap(ncp);
470 		return (0);
471 	}
472 
473 	/*
474 	 * If the vnode is not NULL then return the positive match.
475 	 */
476 	if (ncp->nc_vp) {
477 		numposhits++;
478 		gd->gd_nchstats->ncs_goodhits++;
479 		*vpp = ncp->nc_vp;
480 		cache_drop(ncp);
481 		return (-1);
482 	}
483 
484 	/*
485 	 * If the vnode is NULL we found a negative match.  If we want to
486 	 * create it, purge the negative match and return failure (as if
487 	 * we hadn't found a match in the first place).
488 	 */
489 	if (cnp->cn_nameiop == NAMEI_CREATE) {
490 		numnegzaps++;
491 		gd->gd_nchstats->ncs_badhits++;
492 		cache_zap(ncp);
493 		return (0);
494 	}
495 
496 	numneghits++;
497 
498 	/*
499 	 * We found a "negative" match, ENOENT notifies client of this match.
500 	 * The nc_flag field records whether this is a whiteout.  Since there
501 	 * is no vnode we can use the vnode tailq link field with ncneglist.
502 	 */
503 	TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
504 	TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
505 	gd->gd_nchstats->ncs_neghits++;
506 	if (ncp->nc_flag & NCF_WHITEOUT)
507 		cnp->cn_flags |= CNP_ISWHITEOUT;
508 	cache_drop(ncp);
509 	return (ENOENT);
510 }
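
/*
 * Sketch of how a caller interprets the status codes above
 * (vfs_cache_lookup() below is the canonical in-tree example):
 *
 *	error = cache_lookup(dvp, par, &vp, &ncp, cnp);
 *
 *	error == -1:		positive hit, vp is valid
 *	error == ENOENT:	negative hit, name known not to exist
 *	error == 0:		miss, fall through to VOP_CACHEDLOOKUP()
 */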
511 
512 /*
513  * Generate a special linkage between the mount point and the root of the
514  * mounted filesystem in order to maintain the namecache topology across
515  * a mount point.  The special linkage has a 0-length name component
516  * and sets NCF_MOUNTPT.
517  */
518 void
519 cache_mount(struct vnode *dvp, struct vnode *tvp)
520 {
521 	struct namecache *ncp;
522 	struct namecache *par;
523 	struct nchashhead *nchpp;
524 	u_int32_t hash;
525 
526 	/*
527 	 * If a linkage already exists we do not have to do anything.
528 	 */
529 	hash = fnv_32_buf("", 0, FNV1_32_INIT);
530 	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
531 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
532 		numchecks++;
533 		if (ncp->nc_vp == tvp &&
534 		    ncp->nc_nlen == 0 &&
535 		    ncp->nc_parent &&
536 		    ncp->nc_parent->nc_vp == dvp
537 		) {
538 			return;
539 		}
540 	}
541 
542 	if ((par = TAILQ_FIRST(&dvp->v_namecache)) == NULL)
543 		par = cache_alloc(dvp);
544 
545 	/*
546 	 * Otherwise create a new linkage.
547 	 */
548 	ncp = cache_alloc(tvp);
549 	ncp->nc_flag = NCF_MOUNTPT;
550 	cache_link_parent(ncp, par);
551 
552 	/*
553 	 * Hash table
554 	 */
555 	hash = fnv_32_buf("", 0, FNV1_32_INIT);
556 	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
557 	nchpp = NCHHASH(hash);
558 	LIST_INSERT_HEAD(nchpp, ncp, nc_hash);
559 
560 	ncp->nc_flag |= NCF_HASHED;
561 }
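
/*
 * Hypothetical mount-time call site (sketch; assumes VFS_ROOT()
 * returns the root vnode of the newly mounted filesystem in tvp):
 *
 *	if (VFS_ROOT(mp, &tvp) == 0)
 *		cache_mount(mp->mnt_vnodecovered, tvp);
 */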
562 
563 /*
564  * Add an entry to the cache.
565  */
566 void
567 cache_enter(struct vnode *dvp, struct namecache *par, struct vnode *vp, struct componentname *cnp)
568 {
569 	struct namecache *ncp;
570 	struct namecache *bpar;
571 	struct nchashhead *nchpp;
572 	u_int32_t hash;
573 	char *name;
574 
575 	/*
576 	 * If the directory has no namecache entry we must associate one with
577 	 * it.  The name of the entry is not known so it isn't hashed.
578 	 */
579 	if (par == NULL) {
580 		if ((par = TAILQ_FIRST(&dvp->v_namecache)) == NULL)
581 			par = cache_alloc(dvp);
582 	}
583 
584 	/*
585 	 * This may be a bit confusing.  "." and ".." are 'virtual' entries.
586 	 * We do not actually create a namecache entry representing either.
587 	 * However, the ".." case is used to link up a potentially disjoint
588 	 * directory with its parent, to disconnect a directory from its
589 	 * parent, or to change an existing linkage that may no longer be
590 	 * correct (as might occur when a subdirectory is renamed).
591 	 */
592 
593 	if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')
594 		return;
595 	if (cnp->cn_namelen == 2 && cnp->cn_nameptr[0] == '.' &&
596 	    cnp->cn_nameptr[1] == '.'
597 	) {
598 		if (vp == NULL) {
599 			if (par->nc_parent)
600 				cache_unlink_parent(par);
601 		} else {
602 			if ((ncp = TAILQ_FIRST(&vp->v_namecache)) == NULL)
603 				ncp = cache_alloc(vp);
604 			cache_hold(par);
605 			if (par->nc_parent)
606 				cache_unlink_parent(par);
607 			cache_link_parent(par, ncp); /* ncp is parent of par */
608 			cache_drop(par);
609 		}
610 		return;
611 	}
612 
613 	/*
614 	 * Locate other entries associated with this vnode and zap them,
615 	 * because the purge code may not be able to find them due to
616 	 * the topology not yet being consistent.  This is a temporary
617 	 * hack.
618 	 */
619 	if (vp) {
620 again:
621 		TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
622 			if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
623 				cache_zap(cache_hold(ncp));
624 				goto again;
625 			}
626 		}
627 	}
628 
629 	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
630 	bpar = par;
631 	hash = fnv_32_buf(&bpar, sizeof(bpar), hash);
632 
633 	if (nczapcheck > 1)
634 	    printf("ENTER %p/%p %08x '%*.*s' %p ", dvp, par, hash, (int)cnp->cn_namelen, (int)cnp->cn_namelen, cnp->cn_nameptr, vp);
635 
636 
637 	name = malloc(cnp->cn_namelen, M_VFSCACHE, M_WAITOK);
638 	ncp = cache_alloc(vp);
639 	if (nczapcheck > 1)
640 	    printf("alloc\n");
641 
642 	/*
643 	 * Set a timeout
644 	 */
645 	if (cnp->cn_flags & CNP_CACHETIMEOUT) {
646 		if ((ncp->nc_timeout = ticks + cnp->cn_timeout) == 0)
647 			ncp->nc_timeout = 1;
648 	}
649 
650 	/*
651 	 * Link up the parent pointer and bump the parent vnode's hold
652 	 * count when we go from 0->1 children.
653 	 */
654 	cache_link_parent(ncp, par);
655 
656 	/*
657 	 * Add to the hash table
658 	 */
659 	ncp->nc_name = name;
660 	ncp->nc_nlen = cnp->cn_namelen;
661 	bcopy(cnp->cn_nameptr, ncp->nc_name, cnp->cn_namelen);
662 	nchpp = NCHHASH(hash);
663 	LIST_INSERT_HEAD(nchpp, ncp, nc_hash);
664 	ncp->nc_flag |= NCF_HASHED;
665 
666 	/*
667 	 * If the target vnode is NULL then this is a negative cache
668 	 * entry; record whether it represents a whiteout.
669 	 */
670 	if (vp == NULL) {
671 		ncp->nc_flag &= ~NCF_WHITEOUT;
672 		if (cnp->cn_flags & CNP_ISWHITEOUT)
673 			ncp->nc_flag |= NCF_WHITEOUT;
674 	}
675 
676 	/*
677 	 * Don't cache too many negative hits
678 	 */
679 	if (numneg > MINNEG && numneg * ncnegfactor > numcache) {
680 		ncp = TAILQ_FIRST(&ncneglist);
681 		KKASSERT(ncp != NULL);
682 		cache_zap(cache_hold(ncp));
683 	}
684 }
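
/*
 * Sketch of how a filesystem lookup routine might feed the cache
 * (hypothetical caller; assumes CNP_MAKEENTRY was requested):
 *
 *	if (found)
 *		cache_enter(dvp, par, vp, cnp);		(positive entry)
 *	else
 *		cache_enter(dvp, par, NULL, cnp);	(negative entry)
 */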
685 
686 /*
687  * Name cache initialization, from vfsinit() when we are booting
688  *
689  * rootnamecache is initialized such that it cannot be recursively deleted.
690  */
691 void
692 nchinit(void)
693 {
694 	int i;
695 	globaldata_t gd;
696 
697 	/* initialize per-cpu namecache effectiveness statistics. */
698 	for (i = 0; i < ncpus; ++i) {
699 		gd = globaldata_find(i);
700 		gd->gd_nchstats = &nchstats[i];
701 	}
702 
703 	TAILQ_INIT(&ncneglist);
704 	nchashtbl = hashinit(desiredvnodes*2, M_VFSCACHE, &nchash);
705 	TAILQ_INIT(&rootnamecache.nc_list);
706 	rootnamecache.nc_flag |= NCF_HASHED | NCF_ROOT | NCF_UNRESOLVED;
707 	rootnamecache.nc_refs = 1;
708 }
709 
710 /*
711  * vfs_cache_setroot()
712  *
713  *	Create an association between the root of our namecache and
714  *	the root vnode.  This routine may be called several times during
715  *	booting.
716  */
717 void
718 vfs_cache_setroot(struct vnode *nvp)
719 {
720 	KKASSERT(rootnamecache.nc_refs > 0);	/* don't accidentally free */
721 	cache_zap(cache_hold(&rootnamecache));
722 
723 	rootnamecache.nc_vp = nvp;
724 	rootnamecache.nc_flag &= ~NCF_UNRESOLVED;
725 	if (nvp) {
726 		++numcache;
727 		if (!TAILQ_EMPTY(&rootnamecache.nc_list))
728 			vhold(nvp);
729 		TAILQ_INSERT_HEAD(&nvp->v_namecache, &rootnamecache, nc_vnode);
730 	} else {
731 		++numneg;
732 		TAILQ_INSERT_TAIL(&ncneglist, &rootnamecache, nc_vnode);
733 		rootnamecache.nc_flag &= ~NCF_WHITEOUT;
734 	}
735 }
736 
737 /*
738  * Invalidate all namecache entries to a particular vnode as well as
739  * any direct children of that vnode in the namecache.  This is a
740  * 'catch all' purge used by filesystems that do not know any better.
741  *
742  * A new vnode v_id is generated.  Note that no vnode will ever have a
743  * v_id of 0.
744  *
745  * Note that the linkage between the vnode and its namecache entries will
746  * be removed, but the namecache entries themselves might stay put due to
747  * active references from elsewhere in the system or due to the existence of
748  * the children.   The namecache topology is left intact even if we do not
749  * know what the vnode association is.  Such entries will be marked
750  * NCF_UNRESOLVED.
751  *
752  * XXX: Only time and the size of v_id prevent this from failing:
753  * XXX: In theory we should hunt down all (struct vnode*, v_id)
754  * XXX: soft references and nuke them, at least on the global
755  * XXX: v_id wraparound.  The period of resistance can be extended
756  * XXX: by incrementing each vnode's v_id individually instead of
757  * XXX: using the global v_id.
758  */
759 void
760 cache_purge(struct vnode *vp)
761 {
762 	static u_long nextid;
763 	struct namecache *ncp;
764 	struct namecache *scan;
765 
766 	/*
767 	 * Disassociate the vnode from its namecache entries along with
768 	 * (for historical reasons) any direct children.
769 	 */
770 	while ((ncp = TAILQ_FIRST(&vp->v_namecache)) != NULL) {
771 		cache_hold(ncp);
772 
773 restart: /* YYY hack, fix me */
774 		TAILQ_FOREACH(scan, &ncp->nc_list, nc_entry) {
775 			if ((scan->nc_flag & NCF_UNRESOLVED) == 0) {
776 				cache_zap(cache_hold(scan));
777 				goto restart;
778 			}
779 		}
780 		cache_zap(ncp);
781 	}
782 
783 	/*
784 	 * Calculate a new unique id for ".." handling
785 	 */
786 	do {
787 		nextid++;
788 	} while (nextid == vp->v_id || nextid == 0);
789 	vp->v_id = nextid;
790 }
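
/*
 * Typical use (sketch): a filesystem purges a vnode when it is
 * reclaimed or its directory entry goes away, after which vp->v_id
 * is a fresh, non-zero capability number:
 *
 *	cache_purge(vp);
 */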
791 
792 /*
793  * Flush all entries referencing a particular filesystem.
794  *
795  * Since we need to check it anyway, we will flush all the invalid
796  * entries at the same time.
797  */
798 void
799 cache_purgevfs(struct mount *mp)
800 {
801 	struct nchashhead *nchpp;
802 	struct namecache *ncp, *nnp;
803 
804 	/*
805 	 * Scan hash tables for applicable entries.
806 	 */
807 	for (nchpp = &nchashtbl[nchash]; nchpp >= nchashtbl; nchpp--) {
808 		ncp = LIST_FIRST(nchpp);
809 		if (ncp)
810 			cache_hold(ncp);
811 		while (ncp) {
812 			nnp = LIST_NEXT(ncp, nc_hash);
813 			if (nnp)
814 				cache_hold(nnp);
815 			if (ncp->nc_vp && ncp->nc_vp->v_mount == mp)
816 				cache_zap(ncp);
817 			else
818 				cache_drop(ncp);
819 			ncp = nnp;
820 		}
821 	}
822 }
823 
824 /*
825  * cache_leaf_test()
826  *
827  *	Test whether the vnode is at a leaf in the namecache tree.
828  *
829  *	Returns 0 if it is a leaf, -1 if it isn't.
830  */
831 int
832 cache_leaf_test(struct vnode *vp)
833 {
834 	struct namecache *scan;
835 	struct namecache *ncp;
836 
837 	TAILQ_FOREACH(scan, &vp->v_namecache, nc_vnode) {
838 		TAILQ_FOREACH(ncp, &scan->nc_list, nc_entry) {
839 			/* YYY && ncp->nc_vp->v_type == VDIR ? */
840 			if (ncp->nc_vp != NULL)
841 				return(-1);
842 		}
843 	}
844 	return(0);
845 }
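
/*
 * Hypothetical policy use (sketch): a vnode recycler can prefer
 * leaves so that interior directory vnodes stay cached:
 *
 *	if (cache_leaf_test(vp) == 0)
 *		... vp has no resolved children, cheaper to recycle ...
 */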
846 
847 /*
848  * Perform canonical checks and cache lookup and pass on to filesystem
849  * through the vop_cachedlookup only if needed.
850  *
851  * vop_lookup_args {
852  *	struct vnode *a_dvp;
853  *	struct namecache *a_par;
854  *	struct vnode **a_vpp;
855  *	struct namecache **a_ncpp;
856  *	struct componentname *a_cnp;
857  * }
858  */
859 int
860 vfs_cache_lookup(struct vop_lookup_args *ap)
861 {
862 	struct vnode *dvp, *vp;
863 	int lockparent;
864 	int error;
865 	struct namecache *par = ap->a_par;
866 	struct vnode **vpp = ap->a_vpp;
867 	struct namecache **ncpp = ap->a_ncpp;
868 	struct componentname *cnp = ap->a_cnp;
869 	struct ucred *cred = cnp->cn_cred;
870 	int flags = cnp->cn_flags;
871 	struct thread *td = cnp->cn_td;
872 	u_long vpid;	/* capability number of vnode */
873 
874 	*vpp = NULL;
875 	if (ncpp)
876 		*ncpp = NULL;
877 	dvp = ap->a_dvp;
878 	lockparent = flags & CNP_LOCKPARENT;
879 
880 	if (dvp->v_type != VDIR)
881 		return (ENOTDIR);
882 
883 	if ((flags & CNP_ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
884 	    (cnp->cn_nameiop == NAMEI_DELETE || cnp->cn_nameiop == NAMEI_RENAME)) {
885 		return (EROFS);
886 	}
887 
888 	error = VOP_ACCESS(dvp, VEXEC, cred, td);
889 
890 	if (error)
891 		return (error);
892 
893 	error = cache_lookup(dvp, par, vpp, ncpp, cnp);
894 
895 	if (!error)
896 		return (VOP_CACHEDLOOKUP(dvp, par, vpp, ncpp, cnp));
897 
898 	if (error == ENOENT)
899 		return (error);
900 
901 	vp = *vpp;
902 	vpid = vp->v_id;
903 	cnp->cn_flags &= ~CNP_PDIRUNLOCK;
904 	if (dvp == vp) {   /* lookup on "." */
905 		VREF(vp);
906 		error = 0;
907 	} else if (flags & CNP_ISDOTDOT) {
908 		VOP_UNLOCK(dvp, NULL, 0, td);
909 		cnp->cn_flags |= CNP_PDIRUNLOCK;
910 		error = vget(vp, NULL, LK_EXCLUSIVE, td);
911 		if (!error && lockparent && (flags & CNP_ISLASTCN)) {
912 			if ((error = vn_lock(dvp, NULL, LK_EXCLUSIVE, td)) == 0)
913 				cnp->cn_flags &= ~CNP_PDIRUNLOCK;
914 		}
915 	} else {
916 		error = vget(vp, NULL, LK_EXCLUSIVE, td);
917 		if (!lockparent || error || !(flags & CNP_ISLASTCN)) {
918 			VOP_UNLOCK(dvp, NULL, 0, td);
919 			cnp->cn_flags |= CNP_PDIRUNLOCK;
920 		}
921 	}
922 	/*
923 	 * Check that the capability number did not change
924 	 * while we were waiting for the lock.
925 	 */
926 	if (!error) {
927 		if (vpid == vp->v_id)
928 			return (0);
929 		vput(vp);
930 		if (lockparent && dvp != vp && (flags & CNP_ISLASTCN)) {
931 			VOP_UNLOCK(dvp, NULL, 0, td);
932 			cnp->cn_flags |= CNP_PDIRUNLOCK;
933 		}
934 	}
935 	if (cnp->cn_flags & CNP_PDIRUNLOCK) {
936 		error = vn_lock(dvp, NULL, LK_EXCLUSIVE, td);
937 		if (error)
938 			return (error);
939 		cnp->cn_flags &= ~CNP_PDIRUNLOCK;
940 	}
941 	return (VOP_CACHEDLOOKUP(dvp, par, vpp, ncpp, cnp));
942 }
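
/*
 * Filesystems typically plug this routine into their lookup vop and
 * supply the real work via the cachedlookup vop, e.g. (hypothetical
 * vnodeopv entry sketch for a filesystem "xxx"):
 *
 *	{ &vop_lookup_desc,		(void *) vfs_cache_lookup },
 *	{ &vop_cachedlookup_desc,	(void *) xxx_lookup },
 */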
943 
944 static int disablecwd;
945 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "");
946 
947 static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
948 static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);
949 static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);
950 static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);
951 static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);
952 static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);
953 
954 int
955 __getcwd(struct __getcwd_args *uap)
956 {
957 	struct proc *p = curproc;
958 	char *bp, *buf;
959 	int error, i, slash_prefixed;
960 	struct filedesc *fdp;
961 	struct namecache *ncp;
962 	struct vnode *vp;
963 
964 	numcwdcalls++;
965 	if (disablecwd)
966 		return (ENODEV);
967 	if (uap->buflen < 2)
968 		return (EINVAL);
969 	if (uap->buflen > MAXPATHLEN)
970 		uap->buflen = MAXPATHLEN;
971 	buf = bp = malloc(uap->buflen, M_TEMP, M_WAITOK);
972 	bp += uap->buflen - 1;
973 	*bp = '\0';
974 	fdp = p->p_fd;
975 	slash_prefixed = 0;
976 	for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
977 		if (vp->v_flag & VROOT) {
978 			if (vp->v_mount == NULL) {	/* forced unmount */
979 				free(buf, M_TEMP);
980 				return (EBADF);
981 			}
982 			vp = vp->v_mount->mnt_vnodecovered;
983 			continue;
984 		}
985 		TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
986 			if (ncp->nc_parent && ncp->nc_parent->nc_vp &&
987 			    ncp->nc_nlen > 0) {
988 				break;
989 			}
990 		}
991 		if (ncp == NULL) {
992 			numcwdfail2++;
993 			free(buf, M_TEMP);
994 			return (ENOENT);
995 		}
996 		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
997 			if (bp == buf) {
998 				numcwdfail4++;
999 				free(buf, M_TEMP);
1000 				return (ENOMEM);
1001 			}
1002 			*--bp = ncp->nc_name[i];
1003 		}
1004 		if (bp == buf) {
1005 			numcwdfail4++;
1006 			free(buf, M_TEMP);
1007 			return (ENOMEM);
1008 		}
1009 		*--bp = '/';
1010 		slash_prefixed = 1;
1011 		vp = ncp->nc_parent->nc_vp;
1012 	}
1013 	if (!slash_prefixed) {
1014 		if (bp == buf) {
1015 			numcwdfail4++;
1016 			free(buf, M_TEMP);
1017 			return (ENOMEM);
1018 		}
1019 		*--bp = '/';
1020 	}
1021 	numcwdfound++;
1022 	error = copyout(bp, uap->buf, strlen(bp) + 1);
1023 	free(buf, M_TEMP);
1024 	return (error);
1025 }
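
/*
 * User-land sketch (illustrative only; the getcwd(3) wrapper in libc
 * is the real consumer of this system call):
 *
 *	char buf[MAXPATHLEN];
 *
 *	if (__getcwd(buf, sizeof(buf)) == 0)
 *		printf("%s\n", buf);
 */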
1026 
1027 /*
1028  * Thus begins the fullpath magic.
1029  */
1030 
1031 #undef STATNODE
1032 #define STATNODE(name)							\
1033 	static u_int name;						\
1034 	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")
1035 
1036 static int disablefullpath;
1037 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
1038     &disablefullpath, 0, "");
1039 
1040 STATNODE(numfullpathcalls);
1041 STATNODE(numfullpathfail1);
1042 STATNODE(numfullpathfail2);
1043 STATNODE(numfullpathfail3);
1044 STATNODE(numfullpathfail4);
1045 STATNODE(numfullpathfound);
1046 
1047 int
1048 textvp_fullpath(struct proc *p, char **retbuf, char **retfreebuf)
1049 {
1050 	char *bp, *buf;
1051 	int i, slash_prefixed;
1052 	struct filedesc *fdp;
1053 	struct namecache *ncp;
1054 	struct vnode *vp, *textvp;
1055 
1056 	numfullpathcalls++;
1057 	if (disablefullpath)
1058 		return (ENODEV);
1059 	textvp = p->p_textvp;
1060 	if (textvp == NULL)
1061 		return (EINVAL);
1062 	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1063 	bp = buf + MAXPATHLEN - 1;
1064 	*bp = '\0';
1065 	fdp = p->p_fd;
1066 	slash_prefixed = 0;
1067 	for (vp = textvp; vp != fdp->fd_rdir && vp != rootvnode;) {
1068 		if (vp->v_flag & VROOT) {
1069 			if (vp->v_mount == NULL) {	/* forced unmount */
1070 				free(buf, M_TEMP);
1071 				return (EBADF);
1072 			}
1073 			vp = vp->v_mount->mnt_vnodecovered;
1074 			continue;
1075 		}
1076 		TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
1077 			if (vp == textvp)
1078 				break;
1079 			if (ncp->nc_parent && ncp->nc_parent->nc_vp &&
1080 			    ncp->nc_nlen > 0) {
1081 				break;
1082 			}
1083 		}
1084 		if (ncp == NULL) {
1085 			numfullpathfail2++;
1086 			free(buf, M_TEMP);
1087 			return (ENOENT);
1088 		}
1089 		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
1090 			if (bp == buf) {
1091 				numfullpathfail4++;
1092 				free(buf, M_TEMP);
1093 				return (ENOMEM);
1094 			}
1095 			*--bp = ncp->nc_name[i];
1096 		}
1097 		if (bp == buf) {
1098 			numfullpathfail4++;
1099 			free(buf, M_TEMP);
1100 			return (ENOMEM);
1101 		}
1102 		*--bp = '/';
1103 		slash_prefixed = 1;
1104 		vp = ncp->nc_parent->nc_vp;
1105 	}
1106 	if (!slash_prefixed) {
1107 		if (bp == buf) {
1108 			numfullpathfail4++;
1109 			free(buf, M_TEMP);
1110 			return (ENOMEM);
1111 		}
1112 		*--bp = '/';
1113 	}
1114 	numfullpathfound++;
1115 	*retbuf = bp;
1116 	*retfreebuf = buf;
1117 	return (0);
1118 }
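
/*
 * Sketch of a procfs-style consumer (hypothetical): resolve the path
 * of a process's text vnode, then release the backing buffer:
 *
 *	char *fullpath, *freepath;
 *
 *	if (textvp_fullpath(p, &fullpath, &freepath) == 0) {
 *		... use fullpath ...
 *		free(freepath, M_TEMP);
 *	}
 */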
1119 
1120