xref: /dflybsd-src/sys/kern/vfs_nlookup.c (revision 78ce1036881a18b24268ba328cde152be2d1979d)
1 /*
2  * Copyright (c) 2004-2020 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * nlookup() is the 'new' namei interface.  Rather than return directory and
36  * leaf vnodes (in various lock states) the new interface instead deals in
37  * namecache records.  Namecache records may represent both a positive or
38  * a negative hit.  The namespace is locked via the namecache record instead
39  * of via the vnode, and only the leaf namecache record (representing the
40  * filename) needs to be locked.
41  *
42  * This greatly improves filesystem parallelism and is a huge simplification
43  * of the API versus the old vnode locking / namei scheme.
44  *
45  * Filesystems must actively control the caching aspects of the namecache,
46  * and since namecache pointers are used as handles they are non-optional
47  * even for filesystems which do not generally wish to cache things.  It is
48  * intended that a separate cache coherency API will be constructed to handle
49  * these issues.
50  */
51 
52 #include "opt_ktrace.h"
53 
54 #include <sys/param.h>
55 #include <sys/systm.h>
56 #include <sys/uio.h>
57 #include <sys/kernel.h>
58 #include <sys/vnode.h>
59 #include <sys/mount.h>
60 #include <sys/filedesc.h>
61 #include <sys/proc.h>
62 #include <sys/namei.h>
63 #include <sys/nlookup.h>
64 #include <sys/malloc.h>
65 #include <sys/stat.h>
66 #include <sys/objcache.h>
67 #include <sys/file.h>
68 #include <sys/kcollect.h>
69 
70 #ifdef KTRACE
71 #include <sys/ktrace.h>
72 #endif
73 
74 static int naccess(struct nchandle *nch, int vmode, struct ucred *cred,
75 		int *stickyp);
76 
77 /*
78  * unmount operations flag NLC_IGNBADDIR in order to allow the
79  * umount to successfully issue a nlookup() on the path in order
80  * to extract the mount point.  Allow certain errors through.
81  */
82 static __inline
83 int
84 keeperror(struct nlookupdata *nd, int error)
85 {
86 	if (error) {
87 		if ((nd->nl_flags & NLC_IGNBADDIR) == 0 ||
88 		   (error != EIO && error != EBADRPC && error != ESTALE)) {
89 			return 1;
90 		}
91 	}
92 	return 0;
93 }
94 
95 /*
96  * Initialize a nlookup() structure, early error return for copyin faults
97  * or a degenerate empty string (which is not allowed).
98  *
99  * The first process proc0's credentials are used if the calling thread
100  * is not associated with a process context.
101  *
102  * MPSAFE
103  */
104 int
105 nlookup_init(struct nlookupdata *nd,
106 	     const char *path, enum uio_seg seg, int flags)
107 {
108     size_t pathlen;
109     struct proc *p;
110     thread_t td;
111     int error;
112 
113     td = curthread;
114     p = td->td_proc;
115 
116     /*
117      * note: the pathlen set by copy*str() includes the terminating \0.
118      */
119     bzero(nd, sizeof(struct nlookupdata));
120     nd->nl_path = objcache_get(namei_oc, M_WAITOK);
121     nd->nl_flags |= NLC_HASBUF;
122     nd->nl_rootopt = 0;
123     if (seg == UIO_SYSSPACE)
124 	error = copystr(path, nd->nl_path, MAXPATHLEN, &pathlen);
125     else
126 	error = copyinstr(path, nd->nl_path, MAXPATHLEN, &pathlen);
127 
128     /*
129      * Don't allow empty pathnames.
130      * POSIX.1 requirement: "" is not a vaild file name.
131      */
132     if (error == 0 && pathlen <= 1)
133 	error = ENOENT;
134 
135     if (error == 0) {
136 	if (p && p->p_fd) {
137 	    if (nd->nl_path[0] == '/') {
138 		    nd->nl_rootopt = 1;
139 		    cache_copy(&p->p_fd->fd_nrdir, &nd->nl_nch);
140 		    cache_copy(&p->p_fd->fd_nrdir, &nd->nl_rootnch);
141 		    if (p->p_fd->fd_njdir.ncp)
142 			cache_copy(&p->p_fd->fd_njdir, &nd->nl_jailnch);
143 		    nd->nl_cred = td->td_ucred;
144 		    nd->nl_flags |= NLC_BORROWCRED;
145 	    } else {
146 		    cache_copy_ncdir(p, &nd->nl_nch);
147 		    cache_copy(&p->p_fd->fd_nrdir, &nd->nl_rootnch);
148 		    if (p->p_fd->fd_njdir.ncp)
149 			cache_copy(&p->p_fd->fd_njdir, &nd->nl_jailnch);
150 		    nd->nl_cred = td->td_ucred;
151 		    nd->nl_flags |= NLC_BORROWCRED | NLC_NCDIR;
152 	    }
153 	} else {
154 	    cache_copy(&rootnch, &nd->nl_nch);
155 	    cache_copy(&nd->nl_nch, &nd->nl_rootnch);
156 	    cache_copy(&nd->nl_nch, &nd->nl_jailnch);
157 	    nd->nl_cred = proc0.p_ucred;
158 	    nd->nl_flags |= NLC_BORROWCRED;
159 	}
160 	nd->nl_td = td;
161 	nd->nl_flags |= flags;
162     } else {
163 	nlookup_done(nd);
164     }
165     return(error);
166 }
167 
168 
169 /*
170  * nlookup_init() for "at" family of syscalls.
171  *
172  * Works similarly to nlookup_init() but if path is relative and fd is not
173  * AT_FDCWD, path is interpreted relative to the directory pointed to by fd.
174  * In this case, the file entry pointed to by fd is ref'ed and returned in
175  * *fpp.
176  *
177  * If the call succeeds, nlookup_done_at() must be called to clean-up the nd
178  * and release the ref to the file entry.
179  */
180 int
181 nlookup_init_at(struct nlookupdata *nd, struct file **fpp, int fd,
182 		const char *path, enum uio_seg seg, int flags)
183 {
184 	struct thread *td = curthread;
185 	struct file* fp;
186 	struct vnode *vp;
187 	int error;
188 
189 	*fpp = NULL;
190 
191 	if  ((error = nlookup_init(nd, path, seg, flags)) != 0) {
192 		return (error);
193 	}
194 
195 	if (nd->nl_path[0] != '/' && fd != AT_FDCWD) {
196 		if ((error = holdvnode(td, fd, &fp)) != 0)
197 			goto done;
198 		vp = (struct vnode*)fp->f_data;
199 		if (vp->v_type != VDIR || fp->f_nchandle.ncp == NULL) {
200 			fdrop(fp);
201 			fp = NULL;
202 			error = ENOTDIR;
203 			goto done;
204 		}
205 		if (nd->nl_flags & NLC_NCDIR) {
206 			cache_drop_ncdir(&nd->nl_nch);
207 			nd->nl_flags &= ~NLC_NCDIR;
208 		} else {
209 			cache_drop(&nd->nl_nch);
210 		}
211 		cache_copy(&fp->f_nchandle, &nd->nl_nch);
212 		*fpp = fp;
213 	}
214 
215 
216 done:
217 	if (error)
218 		nlookup_done(nd);
219 	return (error);
220 
221 }
222 
223 /*
224  * This works similarly to nlookup_init() but does not assume a process
225  * context.  rootnch is always chosen for the root directory and the cred
226  * and starting directory are supplied in arguments.
227  */
228 int
229 nlookup_init_raw(struct nlookupdata *nd,
230 	     const char *path, enum uio_seg seg, int flags,
231 	     struct ucred *cred, struct nchandle *ncstart)
232 {
233     size_t pathlen;
234     thread_t td;
235     int error;
236 
237     td = curthread;
238 
239     bzero(nd, sizeof(struct nlookupdata));
240     nd->nl_path = objcache_get(namei_oc, M_WAITOK);
241     nd->nl_flags |= NLC_HASBUF;
242     if (seg == UIO_SYSSPACE)
243 	error = copystr(path, nd->nl_path, MAXPATHLEN, &pathlen);
244     else
245 	error = copyinstr(path, nd->nl_path, MAXPATHLEN, &pathlen);
246 
247     /*
248      * Don't allow empty pathnames.
249      * POSIX.1 requirement: "" is not a vaild file name.
250      */
251     if (error == 0 && pathlen <= 1)
252 	error = ENOENT;
253 
254     if (error == 0) {
255 	cache_copy(ncstart, &nd->nl_nch);
256 	cache_copy(&rootnch, &nd->nl_rootnch);
257 	cache_copy(&rootnch, &nd->nl_jailnch);
258 	nd->nl_cred = crhold(cred);
259 	nd->nl_td = td;
260 	nd->nl_flags |= flags;
261     } else {
262 	nlookup_done(nd);
263     }
264     return(error);
265 }
266 
267 /*
268  * This works similarly to nlookup_init_raw() but does not rely
269  * on rootnch being initialized yet.
270  */
271 int
272 nlookup_init_root(struct nlookupdata *nd,
273 	     const char *path, enum uio_seg seg, int flags,
274 	     struct ucred *cred, struct nchandle *ncstart,
275 	     struct nchandle *ncroot)
276 {
277     size_t pathlen;
278     thread_t td;
279     int error;
280 
281     td = curthread;
282 
283     bzero(nd, sizeof(struct nlookupdata));
284     nd->nl_path = objcache_get(namei_oc, M_WAITOK);
285     nd->nl_flags |= NLC_HASBUF;
286     if (seg == UIO_SYSSPACE)
287 	error = copystr(path, nd->nl_path, MAXPATHLEN, &pathlen);
288     else
289 	error = copyinstr(path, nd->nl_path, MAXPATHLEN, &pathlen);
290 
291     /*
292      * Don't allow empty pathnames.
293      * POSIX.1 requirement: "" is not a vaild file name.
294      */
295     if (error == 0 && pathlen <= 1)
296 	error = ENOENT;
297 
298     if (error == 0) {
299 	cache_copy(ncstart, &nd->nl_nch);
300 	cache_copy(ncroot, &nd->nl_rootnch);
301 	cache_copy(ncroot, &nd->nl_jailnch);
302 	nd->nl_cred = crhold(cred);
303 	nd->nl_td = td;
304 	nd->nl_flags |= flags;
305     } else {
306 	nlookup_done(nd);
307     }
308     return(error);
309 }
310 
311 #if 0
312 /*
313  * Set a different credential; this credential will be used by future
314  * operations performed on nd.nl_open_vp and nlookupdata structure.
315  */
316 void
317 nlookup_set_cred(struct nlookupdata *nd, struct ucred *cred)
318 {
319 	KKASSERT(nd->nl_cred != NULL);
320 
321 	if (nd->nl_cred != cred) {
322 		cred = crhold(cred);
323 		if ((nd->nl_flags & NLC_BORROWCRED) == 0)
324 			crfree(nd->nl_cred);
325 		nd->nl_flags &= ~NLC_BORROWCRED;
326 		nd->nl_cred = cred;
327 	}
328 }
329 #endif
330 
331 /*
332  * Cleanup a nlookupdata structure after we are through with it.  This may
333  * be called on any nlookupdata structure initialized with nlookup_init().
334  * Calling nlookup_done() is mandatory in all cases except where nlookup_init()
335  * returns an error, even if as a consumer you believe you have taken all
336  * dynamic elements out of the nlookupdata structure.
337  */
338 void
339 nlookup_done(struct nlookupdata *nd)
340 {
341     if (nd->nl_nch.ncp) {
342 	if (nd->nl_flags & NLC_NCPISLOCKED) {
343 	    nd->nl_flags &= ~NLC_NCPISLOCKED;
344 	    cache_unlock(&nd->nl_nch);
345 	}
346 	if (nd->nl_flags & NLC_NCDIR) {
347 		cache_drop_ncdir(&nd->nl_nch);
348 		nd->nl_flags &= ~NLC_NCDIR;
349 	} else {
350 		cache_drop(&nd->nl_nch);	/* NULL's out the nch */
351 	}
352     }
353     if (nd->nl_rootnch.ncp)
354 	cache_drop_and_cache(&nd->nl_rootnch);
355     if (nd->nl_jailnch.ncp)
356 	cache_drop_and_cache(&nd->nl_jailnch);
357     if ((nd->nl_flags & NLC_HASBUF) && nd->nl_path) {
358 	objcache_put(namei_oc, nd->nl_path);
359 	nd->nl_path = NULL;
360     }
361     if (nd->nl_cred) {
362 	if ((nd->nl_flags & NLC_BORROWCRED) == 0)
363 	    crfree(nd->nl_cred);
364 	nd->nl_cred = NULL;
365 	nd->nl_flags &= ~NLC_BORROWCRED;
366     }
367     if (nd->nl_open_vp) {
368 	if (nd->nl_flags & NLC_LOCKVP) {
369 		vn_unlock(nd->nl_open_vp);
370 		nd->nl_flags &= ~NLC_LOCKVP;
371 	}
372 	vn_close(nd->nl_open_vp, nd->nl_vp_fmode, NULL);
373 	nd->nl_open_vp = NULL;
374     }
375     if (nd->nl_dvp) {
376 	vrele(nd->nl_dvp);
377 	nd->nl_dvp = NULL;
378     }
379     nd->nl_flags = 0;	/* clear remaining flags (just clear everything) */
380 }
381 
/*
 * Counterpart of nlookup_done() for a nd that was initialized with
 * nlookup_init_at().  Cleans up the nd and then drops the ref on the
 * file entry, if one was supplied (fp may be NULL).
 */
void
nlookup_done_at(struct nlookupdata *nd, struct file *fp)
{
	nlookup_done(nd);
	if (fp)
		fdrop(fp);
}
393 
394 void
395 nlookup_zero(struct nlookupdata *nd)
396 {
397 	bzero(nd, sizeof(struct nlookupdata));
398 }
399 
400 /*
401  * Simple all-in-one nlookup.  Returns a locked namecache structure or NULL
402  * if an error occured.
403  *
404  * Note that the returned ncp is not checked for permissions, though VEXEC
405  * is checked on the directory path leading up to the result.  The caller
406  * must call naccess() to check the permissions of the returned leaf.
407  */
408 struct nchandle
409 nlookup_simple(const char *str, enum uio_seg seg,
410 	       int niflags, int *error)
411 {
412     struct nlookupdata nd;
413     struct nchandle nch;
414 
415     *error = nlookup_init(&nd, str, seg, niflags);
416     if (*error == 0) {
417 	    if ((*error = nlookup(&nd)) == 0) {
418 		    nch = nd.nl_nch;	/* keep hold ref from structure */
419 		    cache_zero(&nd.nl_nch); /* and NULL out */
420 	    } else {
421 		    cache_zero(&nch);
422 	    }
423 	    nlookup_done(&nd);
424     } else {
425 	    cache_zero(&nch);
426     }
427     return(nch);
428 }
429 
/*
 * Returns non-zero if the path element is the last element, meaning
 * that nothing but (zero or more) '/' characters remain in ptr.
 */
static
int
islastelement(const char *ptr)
{
	const char *scan;

	/* Skip over any slashes */
	for (scan = ptr; *scan == '/'; ++scan)
		;
	return (*scan == '\0');
}
441 
442 /*
443  * Returns non-zero if we need to lock the namecache element
444  * exclusively.  Unless otherwise requested by NLC_SHAREDLOCK,
445  * the last element of the namecache lookup will be locked
446  * exclusively.
447  *
448  * O_CREAT or O_TRUNC need the last element to be locked exlcusively.
449  * Intermediate elements are always locked shared.
450  *
451  * NOTE: Even if we return on-zero, an unresolved namecache record
452  *	 will always be locked exclusively.
453  */
454 static __inline
455 int
456 wantsexcllock(struct nlookupdata *nd, const char *ptr)
457 {
458 	if ((nd->nl_flags & NLC_SHAREDLOCK) == 0)
459 		return(islastelement(ptr));
460 	return 0;
461 }
462 
463 
464 /*
465  * Do a generic nlookup.  Note that the passed nd is not nlookup_done()'d
466  * on return, even if an error occurs.  If no error occurs or NLC_CREATE
467  * is flagged and ENOENT is returned, then the returned nl_nch is always
468  * referenced and locked exclusively.
469  *
470  * WARNING: For any general error other than ENOENT w/NLC_CREATE, the
471  *	    resulting nl_nch may or may not be locked and if locked
472  *	    might be locked either shared or exclusive.
473  *
474  * Intermediate directory elements, including the current directory, require
475  * execute (search) permission.  nlookup does not examine the access
476  * permissions on the returned element.
477  *
478  * If NLC_CREATE is set the last directory must allow node creation,
479  * and an error code of 0 will be returned for a non-existent
480  * target (not ENOENT).
481  *
482  * If NLC_RENAME_DST is set the last directory must allow node deletion,
483  * plus the sticky check is made, and an error code of 0 will be returned
484  * for a non-existent target (not ENOENT).
485  *
486  * If NLC_DELETE is set the last directory must allow node deletion,
487  * plus the sticky check is made.
488  *
489  * If NLC_REFDVP is set nd->nl_dvp will be set to the directory vnode
490  * of the returned entry.  The vnode will be referenced, but not locked,
491  * and will be released by nlookup_done() along with everything else.
492  *
493  * NOTE: As an optimization we attempt to obtain a shared namecache lock
494  *	 on any intermediate elements.  On success, the returned element
495  *	 is ALWAYS locked exclusively.
496  */
497 int
498 nlookup(struct nlookupdata *nd)
499 {
500     globaldata_t gd = mycpu;
501     struct nlcomponent nlc;
502     struct nchandle nch;
503     struct nchandle par;
504     struct nchandle nctmp;
505     struct mount *mp;
506     int wasdotordotdot;
507     char *ptr;
508     char *nptr;
509     int error;
510     int len;
511     int dflags;
512     int hit = 1;
513     int saveflag = nd->nl_flags & ~NLC_NCDIR;
514     boolean_t doretry = FALSE;
515     boolean_t inretry = FALSE;
516 
517 nlookup_start:
518 #ifdef KTRACE
519     if (KTRPOINT(nd->nl_td, KTR_NAMEI))
520 	ktrnamei(nd->nl_td->td_lwp, nd->nl_path);
521 #endif
522     bzero(&nlc, sizeof(nlc));
523 
524     /*
525      * Setup for the loop.  The current working namecache element is
526      * always at least referenced.  We lock it as required, but always
527      * return a locked, resolved namecache entry.
528      */
529     nd->nl_loopcnt = 0;
530     if (nd->nl_dvp) {
531 	vrele(nd->nl_dvp);
532 	nd->nl_dvp = NULL;
533     }
534     ptr = nd->nl_path;
535 
536     /*
537      * Loop on the path components.  At the top of the loop nd->nl_nch
538      * is ref'd and unlocked and represents our current position.
539      */
540     for (;;) {
541 	/*
542 	 * Make sure nl_nch is locked so we can access the vnode, resolution
543 	 * state, etc.
544 	 */
545 	if ((nd->nl_flags & NLC_NCPISLOCKED) == 0) {
546 		nd->nl_flags |= NLC_NCPISLOCKED;
547 		cache_lock_maybe_shared(&nd->nl_nch, wantsexcllock(nd, ptr));
548 	}
549 
550 	/*
551 	 * Check if the root directory should replace the current
552 	 * directory.  This is done at the start of a translation
553 	 * or after a symbolic link has been found.  In other cases
554 	 * ptr will never be pointing at a '/'.
555 	 */
556 	if (*ptr == '/') {
557 	    do {
558 		++ptr;
559 	    } while (*ptr == '/');
560 	    if (nd->nl_rootopt) {
561 		nd->nl_rootopt = 0;
562 	    } else {
563 		cache_unlock(&nd->nl_nch);
564 		cache_get_maybe_shared(&nd->nl_rootnch, &nch,
565 				       wantsexcllock(nd, ptr));
566 		if (nd->nl_flags & NLC_NCDIR) {
567 			cache_drop_ncdir(&nd->nl_nch);
568 			nd->nl_flags &= ~NLC_NCDIR;
569 		} else {
570 			cache_drop(&nd->nl_nch);
571 		}
572 		nd->nl_nch = nch;		/* remains locked */
573 	    }
574 
575 	    /*
576 	     * Fast-track termination.  There is no parent directory of
577 	     * the root in the same mount from the point of view of
578 	     * the caller so return EACCES if NLC_REFDVP is specified,
579 	     * and EEXIST if NLC_CREATE is also specified.
580 	     * e.g. 'rmdir /' or 'mkdir /' are not allowed.
581 	     */
582 	    if (*ptr == 0) {
583 		if (nd->nl_flags & NLC_REFDVP)
584 			error = (nd->nl_flags & NLC_CREATE) ? EEXIST : EACCES;
585 		else
586 			error = 0;
587 		break;
588 	    }
589 	    continue;
590 	}
591 
592 	/*
593 	 * Pre-calculate next path component so we can check whether the
594 	 * current component directory is the last directory in the path
595 	 * or not.
596 	 */
597 	for (nptr = ptr; *nptr && *nptr != '/'; ++nptr)
598 		;
599 
600 	/*
601 	 * Check directory search permissions (nd->nl_nch is locked & refd).
602 	 * This will load dflags to obtain directory-special permissions to
603 	 * be checked along with the last component.
604 	 *
605 	 * We only need to pass-in &dflags for the second-to-last component.
606 	 * Optimize by passing-in NULL for any prior components, which may
607 	 * allow the code to bypass the naccess() call.
608 	 */
609 	dflags = 0;
610 	if (*nptr == '/' || (saveflag & NLC_MODIFYING_MASK) == 0)
611 	    error = naccess(&nd->nl_nch, NLC_EXEC, nd->nl_cred, NULL);
612 	else
613 	    error = naccess(&nd->nl_nch, NLC_EXEC, nd->nl_cred, &dflags);
614 	if (error) {
615 	    if (keeperror(nd, error))
616 		    break;
617 	    error = 0;
618 	}
619 
620 	/*
621 	 * Extract the next (or last) path component.  Path components are
622 	 * limited to 255 characters.
623 	 */
624 	nlc.nlc_nameptr = ptr;
625 	nlc.nlc_namelen = nptr - ptr;
626 	ptr = nptr;
627 	if (nlc.nlc_namelen >= 256) {
628 	    error = ENAMETOOLONG;
629 	    break;
630 	}
631 
632 	/*
633 	 * Lookup the path component in the cache, creating an unresolved
634 	 * entry if necessary.  We have to handle "." and ".." as special
635 	 * cases.
636 	 *
637 	 * When handling ".." we have to detect a traversal back through a
638 	 * mount point.   If we are at the root, ".." just returns the root.
639 	 *
640 	 * When handling "." or ".." we also have to recalculate dflags
641 	 * since our dflags will be for some sub-directory instead of the
642 	 * parent dir.
643 	 *
644 	 * This subsection returns a locked, refd 'nch' unless it errors out,
645 	 * and an unlocked but still ref'd nd->nl_nch.
646 	 *
647 	 * The namecache topology is not allowed to be disconnected, so
648 	 * encountering a NULL parent will generate EINVAL.  This typically
649 	 * occurs when a directory is removed out from under a process.
650 	 *
651 	 * WARNING! The unlocking of nd->nl_nch is sensitive code.
652 	 */
653 	KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
654 
655 	if (nlc.nlc_namelen == 1 && nlc.nlc_nameptr[0] == '.') {
656 	    cache_unlock(&nd->nl_nch);
657 	    nd->nl_flags &= ~NLC_NCPISLOCKED;
658 	    cache_get_maybe_shared(&nd->nl_nch, &nch, wantsexcllock(nd, ptr));
659 	    wasdotordotdot = 1;
660 	} else if (nlc.nlc_namelen == 2 &&
661 		   nlc.nlc_nameptr[0] == '.' && nlc.nlc_nameptr[1] == '.') {
662 	    if (nd->nl_nch.mount == nd->nl_rootnch.mount &&
663 		nd->nl_nch.ncp == nd->nl_rootnch.ncp
664 	    ) {
665 		/*
666 		 * ".." at the root returns the root
667 		 */
668 		cache_unlock(&nd->nl_nch);
669 		nd->nl_flags &= ~NLC_NCPISLOCKED;
670 		cache_get_maybe_shared(&nd->nl_nch, &nch,
671 				       wantsexcllock(nd, ptr));
672 	    } else {
673 		/*
674 		 * Locate the parent ncp.  If we are at the root of a
675 		 * filesystem mount we have to skip to the mounted-on
676 		 * point in the underlying filesystem.
677 		 *
678 		 * Expect the parent to always be good since the
679 		 * mountpoint doesn't go away.  XXX hack.  cache_get()
680 		 * requires the ncp to already have a ref as a safety.
681 		 *
682 		 * However, a process which has been broken out of a chroot
683 		 * will wind up with a NULL parent if it tries to '..' above
684 		 * the real root, deal with the case.  Note that this does
685 		 * not protect us from a jail breakout, it just stops a panic
686 		 * if the jail-broken process tries to '..' past the real
687 		 * root.
688 		 */
689 		nctmp = nd->nl_nch;
690 		while (nctmp.ncp == nctmp.mount->mnt_ncmountpt.ncp) {
691 			nctmp = nctmp.mount->mnt_ncmounton;
692 			if (nctmp.ncp == NULL)
693 				break;
694 		}
695 		if (nctmp.ncp == NULL) {
696 			if (curthread->td_proc) {
697 				kprintf("vfs_nlookup: '..' traverse broke "
698 					"jail: pid %d (%s)\n",
699 					curthread->td_proc->p_pid,
700 					curthread->td_comm);
701 			}
702 			nctmp = nd->nl_rootnch;
703 		} else {
704 			nctmp.ncp = nctmp.ncp->nc_parent;
705 		}
706 		cache_hold(&nctmp);
707 		cache_unlock(&nd->nl_nch);
708 		nd->nl_flags &= ~NLC_NCPISLOCKED;
709 		cache_get_maybe_shared(&nctmp, &nch, wantsexcllock(nd, ptr));
710 		cache_drop(&nctmp);		/* NOTE: zero's nctmp */
711 	    }
712 	    wasdotordotdot = 2;
713 	} else {
714 	    /*
715 	     * Currently downward traversals are in reverse-lock-order,
716 	     * requiring that we release the parent ncp before looking up
717 	     * and locking the child.  However, the parent must remain resolved
718 	     * so for now we also have to vhold() its vnode.
719 	     *
720 	     * Releasing the lock without holding the vnode allows a race
721 	     * whereby the parent ncp's vnode is recycled.  This case can
722 	     * especially occur when maxvnodes is set very low.
723 	     *
724 	     * If we race an unlink or rename the ncp might be marked
725 	     * DESTROYED after resolution, requiring a retry.
726 	     */
727 	    struct vnode *hvp;	/* prevent recyclement */
728 
729 	    if ((hvp = nd->nl_nch.ncp->nc_vp) != NULL)
730 		vhold(hvp);
731 	    cache_unlock(&nd->nl_nch);
732 	    nd->nl_flags &= ~NLC_NCPISLOCKED;
733 
734 	    error = cache_nlookup_maybe_shared(&nd->nl_nch, &nlc,
735 					       wantsexcllock(nd, ptr), &nch);
736 	    if (error == EWOULDBLOCK) {
737 		nch = cache_nlookup(&nd->nl_nch, &nlc);
738 		if (nch.ncp->nc_flag & NCF_UNRESOLVED)
739 		    hit = 0;
740 		for (;;) {
741 		    error = cache_resolve(&nch, nd->nl_cred);
742 		    if (error != EAGAIN &&
743 			(nch.ncp->nc_flag & NCF_DESTROYED) == 0) {
744 			    if (error == ESTALE) {
745 				if (!inretry)
746 				    error = ENOENT;
747 				doretry = TRUE;
748 			    }
749 			    break;
750 		    }
751 		    kprintf("[diagnostic] nlookup: relookup %*.*s\n",
752 			    nch.ncp->nc_nlen, nch.ncp->nc_nlen,
753 			    nch.ncp->nc_name);
754 		    cache_put(&nch);
755 		    nch = cache_nlookup(&nd->nl_nch, &nlc);
756 		}
757 	    }
758 	    if (hvp)
759 		vdrop(hvp);
760 	    wasdotordotdot = 0;
761 	}
762 
763 	/*
764 	 * If the last component was "." or ".." our dflags no longer
765 	 * represents the parent directory and we have to explicitly
766 	 * look it up.
767 	 *
768 	 * Expect the parent to be good since nch is locked.
769 	 */
770 	if (wasdotordotdot && error == 0) {
771 	    dflags = 0;
772 	    if ((par.ncp = nch.ncp->nc_parent) != NULL) {
773 		par.mount = nch.mount;
774 		cache_hold(&par);
775 		cache_lock_maybe_shared(&par, wantsexcllock(nd, ptr));
776 		error = naccess(&par, 0, nd->nl_cred, &dflags);
777 		cache_put(&par);
778 		if (error) {
779 		    if (!keeperror(nd, error))
780 			    error = 0;
781 		}
782 	    }
783 	}
784 
785 	/*
786 	 * [end of subsection]
787 	 *
788 	 * nch is locked and referenced.
789 	 * nd->nl_nch is unlocked and referenced.
790 	 *
791 	 * nl_nch must be unlocked or we could chain lock to the root
792 	 * if a resolve gets stuck (e.g. in NFS).
793 	 */
794 	KKASSERT((nd->nl_flags & NLC_NCPISLOCKED) == 0);
795 
796 	/*
797 	 * Resolve the namespace if necessary.  The ncp returned by
798 	 * cache_nlookup() is referenced and locked.
799 	 *
800 	 * XXX neither '.' nor '..' should return EAGAIN since they were
801 	 * previously resolved and thus cannot be newly created ncp's.
802 	 */
803 	if (nch.ncp->nc_flag & NCF_UNRESOLVED) {
804 	    hit = 0;
805 	    error = cache_resolve(&nch, nd->nl_cred);
806 	    if (error == ESTALE) {
807 		if (!inretry)
808 		    error = ENOENT;
809 		doretry = TRUE;
810 	    }
811 	    KKASSERT(error != EAGAIN);
812 	} else {
813 	    error = nch.ncp->nc_error;
814 	}
815 
816 	/*
817 	 * Early completion.  ENOENT is not an error if this is the last
818 	 * component and NLC_CREATE or NLC_RENAME (rename target) was
819 	 * requested.  Note that ncp->nc_error is left as ENOENT in that
820 	 * case, which we check later on.
821 	 *
822 	 * Also handle invalid '.' or '..' components terminating a path
823 	 * for a create/rename/delete.  The standard requires this and pax
824 	 * pretty stupidly depends on it.
825 	 */
826 	if (islastelement(ptr)) {
827 	    if (error == ENOENT &&
828 		(nd->nl_flags & (NLC_CREATE | NLC_RENAME_DST))
829 	    ) {
830 		if (nd->nl_flags & NLC_NFS_RDONLY) {
831 			error = EROFS;
832 		} else {
833 			error = naccess(&nch, nd->nl_flags | dflags,
834 					nd->nl_cred, NULL);
835 		}
836 	    }
837 	    if (error == 0 && wasdotordotdot &&
838 		(nd->nl_flags & (NLC_CREATE | NLC_DELETE |
839 				 NLC_RENAME_SRC | NLC_RENAME_DST))) {
840 		/*
841 		 * POSIX junk
842 		 */
843 		if (nd->nl_flags & NLC_CREATE)
844 			error = EEXIST;
845 		else if (nd->nl_flags & NLC_DELETE)
846 			error = (wasdotordotdot == 1) ? EINVAL : ENOTEMPTY;
847 		else
848 			error = EINVAL;
849 	    }
850 	}
851 
852 	/*
853 	 * Early completion on error.
854 	 */
855 	if (error) {
856 	    cache_put(&nch);
857 	    break;
858 	}
859 
860 	/*
861 	 * If the element is a symlink and it is either not the last
862 	 * element or it is the last element and we are allowed to
863 	 * follow symlinks, resolve the symlink.
864 	 */
865 	if ((nch.ncp->nc_flag & NCF_ISSYMLINK) &&
866 	    (*ptr || (nd->nl_flags & NLC_FOLLOW))
867 	) {
868 	    if (nd->nl_loopcnt++ >= MAXSYMLINKS) {
869 		error = ELOOP;
870 		cache_put(&nch);
871 		break;
872 	    }
873 	    error = nreadsymlink(nd, &nch, &nlc);
874 	    cache_put(&nch);
875 	    if (error)
876 		break;
877 
878 	    /*
879 	     * Concatenate trailing path elements onto the returned symlink.
880 	     * Note that if the path component (ptr) is not exhausted, it
881 	     * will being with a '/', so we do not have to add another one.
882 	     *
883 	     * The symlink may not be empty.
884 	     */
885 	    len = strlen(ptr);
886 	    if (nlc.nlc_namelen == 0 || nlc.nlc_namelen + len >= MAXPATHLEN) {
887 		error = nlc.nlc_namelen ? ENAMETOOLONG : ENOENT;
888 		objcache_put(namei_oc, nlc.nlc_nameptr);
889 		break;
890 	    }
891 	    bcopy(ptr, nlc.nlc_nameptr + nlc.nlc_namelen, len + 1);
892 	    if (nd->nl_flags & NLC_HASBUF)
893 		objcache_put(namei_oc, nd->nl_path);
894 	    nd->nl_path = nlc.nlc_nameptr;
895 	    nd->nl_flags |= NLC_HASBUF;
896 	    ptr = nd->nl_path;
897 
898 	    /*
899 	     * Go back up to the top to resolve any initial '/'s in the
900 	     * symlink.
901 	     */
902 	    continue;
903 	}
904 
905 	/*
906 	 * If the element is a directory and we are crossing a mount point,
907 	 * Locate the mount.
908 	 */
909 	while ((nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
910 	    (nd->nl_flags & NLC_NOCROSSMOUNT) == 0 &&
911 	    (mp = cache_findmount(&nch)) != NULL
912 	) {
913 	    struct vnode *tdp;
914 	    int vfs_do_busy = 0;
915 
916 	    /*
917 	     * VFS must be busied before the namecache entry is locked,
918 	     * but we don't want to waste time calling vfs_busy() if the
919 	     * mount point is already resolved.
920 	     */
921 again:
922 	    cache_put(&nch);
923 	    if (vfs_do_busy) {
924 		while (vfs_busy(mp, 0)) {
925 		    if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
926 			kprintf("nlookup: warning umount race avoided\n");
927 			cache_dropmount(mp);
928 			error = EBUSY;
929 			vfs_do_busy = 0;
930 			goto double_break;
931 		    }
932 		}
933 	    }
934 	    cache_get_maybe_shared(&mp->mnt_ncmountpt, &nch,
935 				   wantsexcllock(nd, ptr));
936 
937 	    if (nch.ncp->nc_flag & NCF_UNRESOLVED) {
938 		if (vfs_do_busy == 0) {
939 		    vfs_do_busy = 1;
940 		    goto again;
941 		}
942 		error = VFS_ROOT(mp, &tdp);
943 		vfs_unbusy(mp);
944 		vfs_do_busy = 0;
945 		if (keeperror(nd, error)) {
946 		    cache_dropmount(mp);
947 		    break;
948 		}
949 		if (error == 0) {
950 		    cache_setvp(&nch, tdp);
951 		    vput(tdp);
952 		}
953 	    }
954 	    if (vfs_do_busy)
955 		vfs_unbusy(mp);
956 	    cache_dropmount(mp);
957 	}
958 
959 	if (keeperror(nd, error)) {
960 	    cache_put(&nch);
961 double_break:
962 	    break;
963 	}
964 
965 	/*
966 	 * Skip any slashes to get to the next element.  If there
967 	 * are any slashes at all the current element must be a
968 	 * directory or, in the create case, intended to become a directory.
969 	 * If it isn't we break without incrementing ptr and fall through
970 	 * to the failure case below.
971 	 */
972 	while (*ptr == '/') {
973 	    if ((nch.ncp->nc_flag & NCF_ISDIR) == 0 &&
974 		!(nd->nl_flags & NLC_WILLBEDIR)
975 	    ) {
976 		break;
977 	    }
978 	    ++ptr;
979 	}
980 
981 	/*
982 	 * Continuation case: additional elements and the current
983 	 * element is a directory.
984 	 */
985 	if (*ptr && (nch.ncp->nc_flag & NCF_ISDIR)) {
986 	    if (nd->nl_flags & NLC_NCDIR) {
987 		    cache_drop_ncdir(&nd->nl_nch);
988 		    nd->nl_flags &= ~NLC_NCDIR;
989 	    } else {
990 		    cache_drop(&nd->nl_nch);
991 	    }
992 	    cache_unlock(&nch);
993 	    KKASSERT((nd->nl_flags & NLC_NCPISLOCKED) == 0);
994 	    nd->nl_nch = nch;
995 	    continue;
996 	}
997 
998 	/*
999 	 * Failure case: additional elements and the current element
1000 	 * is not a directory
1001 	 */
1002 	if (*ptr) {
1003 	    cache_put(&nch);
1004 	    error = ENOTDIR;
1005 	    break;
1006 	}
1007 
1008 	/*
1009 	 * Successful lookup of last element.
1010 	 *
1011 	 * Check permissions if the target exists.  If the target does not
1012 	 * exist directory permissions were already tested in the early
1013 	 * completion code above.
1014 	 *
1015 	 * nd->nl_flags will be adjusted on return with NLC_APPENDONLY
1016 	 * if the file is marked append-only, and NLC_STICKY if the directory
1017 	 * containing the file is sticky.
1018 	 */
1019 	if (nch.ncp->nc_vp && (nd->nl_flags & NLC_ALLCHKS)) {
1020 	    error = naccess(&nch, nd->nl_flags | dflags,
1021 			    nd->nl_cred, NULL);
1022 	    if (keeperror(nd, error)) {
1023 		cache_put(&nch);
1024 		break;
1025 	    }
1026 	}
1027 
1028 	/*
1029 	 * Termination: no more elements.
1030 	 *
1031 	 * If NLC_REFDVP is set acquire a referenced parent dvp.
1032 	 */
1033 	if (nd->nl_flags & NLC_REFDVP) {
1034 		cache_lock(&nd->nl_nch);
1035 		error = cache_vref(&nd->nl_nch, nd->nl_cred, &nd->nl_dvp);
1036 		cache_unlock(&nd->nl_nch);
1037 		if (keeperror(nd, error)) {
1038 			kprintf("NLC_REFDVP: Cannot ref dvp of %p\n", nch.ncp);
1039 			cache_put(&nch);
1040 			break;
1041 		}
1042 	}
1043 	if (nd->nl_flags & NLC_NCDIR) {
1044 		cache_drop_ncdir(&nd->nl_nch);
1045 		nd->nl_flags &= ~NLC_NCDIR;
1046 	} else {
1047 		cache_drop(&nd->nl_nch);
1048 	}
1049 	nd->nl_nch = nch;
1050 	nd->nl_flags |= NLC_NCPISLOCKED;
1051 	error = 0;
1052 	break;
1053     }
1054 
1055     if (hit)
1056 	++gd->gd_nchstats->ncs_longhits;
1057     else
1058 	++gd->gd_nchstats->ncs_longmiss;
1059 
1060     if (nd->nl_flags & NLC_NCPISLOCKED)
1061 	KKASSERT(cache_lockstatus(&nd->nl_nch) > 0);
1062 
1063     /*
1064      * Retry the whole thing if doretry flag is set, but only once.
1065      * autofs(5) may mount another filesystem under its root directory
1066      * while resolving a path.
1067      */
1068     if (doretry && !inretry) {
1069 	inretry = TRUE;
1070 	nd->nl_flags &= NLC_NCDIR;
1071 	nd->nl_flags |= saveflag;
1072 	goto nlookup_start;
1073     }
1074 
1075     /*
1076      * NOTE: If NLC_CREATE was set the ncp may represent a negative hit
1077      * (ncp->nc_error will be ENOENT), but we will still return an error
1078      * code of 0.
1079      */
1080     return(error);
1081 }
1082 
1083 /*
1084  * Resolve a mount point's glue ncp.  This ncp connects creates the illusion
1085  * of continuity in the namecache tree by connecting the ncp related to the
1086  * vnode under the mount to the ncp related to the mount's root vnode.
1087  *
1088  * If no error occured a locked, ref'd ncp is stored in *ncpp.
1089  */
1090 int
1091 nlookup_mp(struct mount *mp, struct nchandle *nch)
1092 {
1093     struct vnode *vp;
1094     int error;
1095 
1096     error = 0;
1097     cache_get(&mp->mnt_ncmountpt, nch);
1098     if (nch->ncp->nc_flag & NCF_UNRESOLVED) {
1099 	while (vfs_busy(mp, 0))
1100 	    ;
1101 	error = VFS_ROOT(mp, &vp);
1102 	vfs_unbusy(mp);
1103 	if (error) {
1104 	    cache_put(nch);
1105 	} else {
1106 	    cache_setvp(nch, vp);
1107 	    vput(vp);
1108 	}
1109     }
1110     return(error);
1111 }
1112 
1113 /*
1114  * Read the contents of a symlink, allocate a path buffer out of the
1115  * namei_oc and initialize the supplied nlcomponent with the result.
1116  *
1117  * If an error occurs no buffer will be allocated or returned in the nlc.
1118  */
1119 int
1120 nreadsymlink(struct nlookupdata *nd, struct nchandle *nch,
1121 		struct nlcomponent *nlc)
1122 {
1123     struct vnode *vp;
1124     struct iovec aiov;
1125     struct uio auio;
1126     int linklen;
1127     int error;
1128     char *cp;
1129 
1130     nlc->nlc_nameptr = NULL;
1131     nlc->nlc_namelen = 0;
1132     if (nch->ncp->nc_vp == NULL)
1133 	return(ENOENT);
1134     if ((error = cache_vget(nch, nd->nl_cred, LK_SHARED, &vp)) != 0)
1135 	return(error);
1136     cp = objcache_get(namei_oc, M_WAITOK);
1137     aiov.iov_base = cp;
1138     aiov.iov_len = MAXPATHLEN;
1139     auio.uio_iov = &aiov;
1140     auio.uio_iovcnt = 1;
1141     auio.uio_offset = 0;
1142     auio.uio_rw = UIO_READ;
1143     auio.uio_segflg = UIO_SYSSPACE;
1144     auio.uio_td = nd->nl_td;
1145     auio.uio_resid = MAXPATHLEN - 1;
1146     error = VOP_READLINK(vp, &auio, nd->nl_cred);
1147     if (error)
1148 	goto fail;
1149     linklen = MAXPATHLEN - 1 - auio.uio_resid;
1150     if (varsym_enable) {
1151 	linklen = varsymreplace(cp, linklen, MAXPATHLEN - 1);
1152 	if (linklen < 0) {
1153 	    error = ENAMETOOLONG;
1154 	    goto fail;
1155 	}
1156     }
1157     cp[linklen] = 0;
1158     nlc->nlc_nameptr = cp;
1159     nlc->nlc_namelen = linklen;
1160     vput(vp);
1161     return(0);
1162 fail:
1163     objcache_put(namei_oc, cp);
1164     vput(vp);
1165     return(error);
1166 }
1167 
1168 /*
1169  * Check access [XXX cache vattr!] [XXX quota]
1170  *
1171  * Generally check the NLC_* access bits.   All specified bits must pass
1172  * for this function to return 0.
1173  *
1174  * The file does not have to exist when checking NLC_CREATE or NLC_RENAME_DST
1175  * access, otherwise it must exist.  No error is returned in this case.
1176  *
1177  * The file must not exist if NLC_EXCL is specified.
1178  *
1179  * Directory permissions in general are tested for NLC_CREATE if the file
1180  * does not exist, NLC_DELETE if the file does exist, and NLC_RENAME_DST
1181  * whether the file exists or not.
1182  *
1183  * The directory sticky bit is tested for NLC_DELETE and NLC_RENAME_DST,
1184  * the latter is only tested if the target exists.
1185  *
1186  * The passed ncp must be referenced and locked.  If it is already resolved
1187  * it may be locked shared but otherwise should be locked exclusively.
1188  */
1189 
1190 #define S_WXOK_MASK	(S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH)
1191 #define S_XOK_MASK	(S_IXUSR|S_IXGRP|S_IXOTH)
1192 
1193 static int
1194 naccess(struct nchandle *nch, int nflags, struct ucred *cred, int *nflagsp)
1195 {
1196     struct vnode *vp;
1197     struct vattr va;
1198     struct namecache *ncp;
1199     int error;
1200     int cflags;
1201 
1202     KKASSERT(cache_lockstatus(nch) > 0);
1203 
1204     ncp = nch->ncp;
1205     if (ncp->nc_flag & NCF_UNRESOLVED) {
1206 	cache_resolve(nch, cred);
1207 	ncp = nch->ncp;
1208     }
1209     error = ncp->nc_error;
1210 
1211     /*
1212      * Directory permissions checks.  Silently ignore ENOENT if these
1213      * tests pass.  It isn't an error.
1214      *
1215      * We can safely resolve ncp->nc_parent because ncp is currently
1216      * locked.
1217      */
1218     if (nflags & (NLC_CREATE | NLC_DELETE | NLC_RENAME_SRC | NLC_RENAME_DST)) {
1219 	if (((nflags & NLC_CREATE) && ncp->nc_vp == NULL) ||
1220 	    ((nflags & NLC_DELETE) && ncp->nc_vp != NULL) ||
1221 	    ((nflags & NLC_RENAME_SRC) && ncp->nc_vp != NULL) ||
1222 	    (nflags & NLC_RENAME_DST)
1223 	) {
1224 	    struct nchandle par;
1225 
1226 	    if ((par.ncp = ncp->nc_parent) == NULL) {
1227 		if (error != EAGAIN)
1228 			error = EINVAL;
1229 	    } else if (error == 0 || error == ENOENT) {
1230 		par.mount = nch->mount;
1231 		cache_hold(&par);
1232 		cache_lock_maybe_shared(&par, 0);
1233 		error = naccess(&par, NLC_WRITE, cred, NULL);
1234 		cache_put(&par);
1235 	    }
1236 	}
1237     }
1238 
1239     /*
1240      * NLC_EXCL check.  Target file must not exist.
1241      */
1242     if (error == 0 && (nflags & NLC_EXCL) && ncp->nc_vp != NULL)
1243 	error = EEXIST;
1244 
1245     /*
1246      * Try to short-cut the vnode operation for intermediate directory
1247      * components.  This is a major SMP win because it avoids having
1248      * to execute a lot of code for intermediate directory components,
1249      * including shared refs and locks on intermediate directory vnodes.
1250      *
1251      * We can only do this if the caller does not need nflagsp.
1252      */
1253     if (error == 0 && nflagsp == NULL &&
1254 	nflags == NLC_EXEC && (ncp->nc_flag & NCF_WXOK)) {
1255 	return 0;
1256     }
1257 
1258     /*
1259      * Get the vnode attributes so we can do the rest of our checks.
1260      *
1261      * NOTE: We only call naccess_va() if the target exists.
1262      */
1263     if (error == 0) {
1264 	error = cache_vget(nch, cred, LK_SHARED, &vp);
1265 	if (error == ENOENT) {
1266 	    /*
1267 	     * Silently zero-out ENOENT if creating or renaming
1268 	     * (rename target).  It isn't an error.
1269 	     */
1270 	    if (nflags & (NLC_CREATE | NLC_RENAME_DST))
1271 		error = 0;
1272 	} else if (error == 0) {
1273 	    /*
1274 	     * Get the vnode attributes and check for illegal O_TRUNC
1275 	     * requests and read-only mounts.
1276 	     *
1277 	     * NOTE: You can still open devices on read-only mounts for
1278 	     * 	     writing.
1279 	     *
1280 	     * NOTE: creates/deletes/renames are handled by the NLC_WRITE
1281 	     *	     check on the parent directory above.
1282 	     *
1283 	     * XXX cache the va in the namecache or in the vnode
1284 	     */
1285 	    error = VOP_GETATTR(vp, &va);
1286 	    if (error == 0 && (nflags & NLC_TRUNCATE)) {
1287 		switch(va.va_type) {
1288 		case VREG:
1289 		case VDATABASE:
1290 		case VCHR:
1291 		case VBLK:
1292 		case VFIFO:
1293 		    break;
1294 		case VDIR:
1295 		    error = EISDIR;
1296 		    break;
1297 		default:
1298 		    error = EINVAL;
1299 		    break;
1300 		}
1301 	    }
1302 	    if (error == 0 && (nflags & NLC_WRITE) && vp->v_mount &&
1303 		(vp->v_mount->mnt_flag & MNT_RDONLY)
1304 	    ) {
1305 		switch(va.va_type) {
1306 		case VDIR:
1307 		case VLNK:
1308 		case VREG:
1309 		case VDATABASE:
1310 		    error = EROFS;
1311 		    break;
1312 		default:
1313 		    break;
1314 		}
1315 	    }
1316 	    vput(vp);
1317 
1318 	    /*
1319 	     * Check permissions based on file attributes.  The passed
1320 	     * flags (*nflagsp) are modified with feedback based on
1321 	     * special attributes and requirements.
1322 	     */
1323 	    if (error == 0) {
1324 		/*
1325 		 * Adjust the returned (*nflagsp) if non-NULL.
1326 		 */
1327 		if (nflagsp) {
1328 		    if ((va.va_mode & VSVTX) && va.va_uid != cred->cr_uid)
1329 			*nflagsp |= NLC_STICKY;
1330 		    if (va.va_flags & APPEND)
1331 			*nflagsp |= NLC_APPENDONLY;
1332 		    if (va.va_flags & IMMUTABLE)
1333 			*nflagsp |= NLC_IMMUTABLE;
1334 		}
1335 
1336 		/*
1337 		 * NCF_WXOK can be set for world-searchable directories.
1338 		 *
1339 		 * XXX When we implement capabilities this code would also
1340 		 * need a cap check, or only set the flag if there are no
1341 		 * capabilities.
1342 		 */
1343 		cflags = 0;
1344 		if (va.va_type == VDIR &&
1345 		    (va.va_mode & S_WXOK_MASK) == S_WXOK_MASK) {
1346 			cflags |= NCF_WXOK;
1347 		}
1348 		if ((va.va_mode & S_XOK_MASK) == 0)
1349 			cflags |= NCF_NOTX;
1350 
1351 		/*
1352 		 * Track swapcache management flags in the namecache.
1353 		 *
1354 		 * Calculate the flags based on the current vattr info
1355 		 * and recalculate the inherited flags from the parent
1356 		 * (the original cache linkage may have occurred without
1357 		 * getattrs and thus have stale flags).
1358 		 */
1359 		if (va.va_flags & SF_NOCACHE)
1360 			cflags |= NCF_SF_NOCACHE;
1361 		if (va.va_flags & UF_CACHE)
1362 			cflags |= NCF_UF_CACHE;
1363 		if (ncp->nc_parent) {
1364 			if (ncp->nc_parent->nc_flag &
1365 			    (NCF_SF_NOCACHE | NCF_SF_PNOCACHE)) {
1366 				cflags |= NCF_SF_PNOCACHE;
1367 			}
1368 			if (ncp->nc_parent->nc_flag &
1369 			    (NCF_UF_CACHE | NCF_UF_PCACHE)) {
1370 				cflags |= NCF_UF_PCACHE;
1371 			}
1372 		}
1373 
1374 		/*
1375 		 * We're not supposed to update nc_flag when holding a shared
1376 		 * lock, but we allow the case for certain flags.  Note that
1377 		 * holding an exclusive lock allows updating nc_flag without
1378 		 * atomics.  nc_flag is not allowe to be updated at all unless
1379 		 * a shared or exclusive lock is held.
1380 		 */
1381 		atomic_clear_short(&ncp->nc_flag,
1382 				   (NCF_SF_NOCACHE | NCF_UF_CACHE |
1383 				   NCF_SF_PNOCACHE | NCF_UF_PCACHE |
1384 				   NCF_WXOK | NCF_NOTX) & ~cflags);
1385 		atomic_set_short(&ncp->nc_flag, cflags);
1386 
1387 		/*
1388 		 * Process general access.
1389 		 */
1390 		error = naccess_va(&va, nflags, cred);
1391 	    }
1392 	}
1393     }
1394     return(error);
1395 }
1396 
1397 /*
1398  * Check the requested access against the given vattr using cred.
1399  */
1400 int
1401 naccess_va(struct vattr *va, int nflags, struct ucred *cred)
1402 {
1403     int i;
1404     int vmode;
1405 
1406     /*
1407      * Test the immutable bit.  Creations, deletions, renames (source
1408      * or destination) are not allowed.  chown/chmod/other is also not
1409      * allowed but is handled by SETATTR.  Hardlinks to the immutable
1410      * file are allowed.
1411      *
1412      * If the directory is set to immutable then creations, deletions,
1413      * renames (source or dest) and hardlinks to files within the directory
1414      * are not allowed, and regular files opened through the directory may
1415      * not be written to or truncated (unless a special device).
1416      *
1417      * NOTE!  New hardlinks to immutable files work but new hardlinks to
1418      * files, immutable or not, sitting inside an immutable directory are
1419      * not allowed.  As always if the file is hardlinked via some other
1420      * path additional hardlinks may be possible even if the file is marked
1421      * immutable.  The sysop needs to create a closure by checking the hard
1422      * link count.  Once closure is achieved you are good, and security
1423      * scripts should check link counts anyway.
1424      *
1425      * Writes and truncations are only allowed on special devices.
1426      */
1427     if ((va->va_flags & IMMUTABLE) || (nflags & NLC_IMMUTABLE)) {
1428 	if ((nflags & NLC_IMMUTABLE) && (nflags & NLC_HLINK))
1429 	    return (EPERM);
1430 	if (nflags & (NLC_CREATE | NLC_DELETE |
1431 		      NLC_RENAME_SRC | NLC_RENAME_DST)) {
1432 	    return (EPERM);
1433 	}
1434 	if (nflags & (NLC_WRITE | NLC_TRUNCATE)) {
1435 	    switch(va->va_type) {
1436 	    case VDIR:
1437 		return (EISDIR);
1438 	    case VLNK:
1439 	    case VREG:
1440 	    case VDATABASE:
1441 		return (EPERM);
1442 	    default:
1443 		break;
1444 	    }
1445 	}
1446     }
1447 
1448     /*
1449      * Test the no-unlink and append-only bits for opens, rename targets,
1450      * and deletions.  These bits are not tested for creations or
1451      * rename sources.
1452      *
1453      * Unlike FreeBSD we allow a file with APPEND set to be renamed.
1454      * If you do not wish this you must also set NOUNLINK.
1455      *
1456      * If the governing directory is marked APPEND-only it implies
1457      * NOUNLINK for all entries in the directory.
1458      */
1459     if (((va->va_flags & NOUNLINK) || (nflags & NLC_APPENDONLY)) &&
1460 	(nflags & (NLC_DELETE | NLC_RENAME_SRC | NLC_RENAME_DST))
1461     ) {
1462 	return (EPERM);
1463     }
1464 
1465     /*
1466      * A file marked append-only may not be deleted but can be renamed.
1467      */
1468     if ((va->va_flags & APPEND) &&
1469 	(nflags & (NLC_DELETE | NLC_RENAME_DST))
1470     ) {
1471 	return (EPERM);
1472     }
1473 
1474     /*
1475      * A file marked append-only which is opened for writing must also
1476      * be opened O_APPEND.
1477      */
1478     if ((va->va_flags & APPEND) && (nflags & (NLC_OPEN | NLC_TRUNCATE))) {
1479 	if (nflags & NLC_TRUNCATE)
1480 	    return (EPERM);
1481 	if ((nflags & (NLC_OPEN | NLC_WRITE)) == (NLC_OPEN | NLC_WRITE)) {
1482 	    if ((nflags & NLC_APPEND) == 0)
1483 		return (EPERM);
1484 	}
1485     }
1486 
1487     /*
1488      * root gets universal access
1489      */
1490     if (cred->cr_uid == 0)
1491 	return(0);
1492 
1493     /*
1494      * Check owner perms.
1495      *
1496      * If NLC_OWN is set the owner of the file is allowed no matter when
1497      * the owner-mode bits say (utimes).
1498      */
1499     vmode = 0;
1500     if (nflags & NLC_READ)
1501 	vmode |= S_IRUSR;
1502     if (nflags & NLC_WRITE)
1503 	vmode |= S_IWUSR;
1504     if (nflags & NLC_EXEC)
1505 	vmode |= S_IXUSR;
1506 
1507     if (cred->cr_uid == va->va_uid) {
1508 	if ((nflags & NLC_OWN) == 0) {
1509 	    if ((vmode & va->va_mode) != vmode)
1510 		return(EACCES);
1511 	}
1512 	return(0);
1513     }
1514 
1515     /*
1516      * If NLC_STICKY is set only the owner may delete or rename a file.
1517      * This bit is typically set on /tmp.
1518      *
1519      * Note that the NLC_READ/WRITE/EXEC bits are not typically set in
1520      * the specific delete or rename case.  For deletions and renames we
1521      * usually just care about directory permissions, not file permissions.
1522      */
1523     if ((nflags & NLC_STICKY) &&
1524 	(nflags & (NLC_RENAME_SRC | NLC_RENAME_DST | NLC_DELETE))) {
1525 	return(EACCES);
1526     }
1527 
1528     /*
1529      * Check group perms
1530      */
1531     vmode >>= 3;
1532     for (i = 0; i < cred->cr_ngroups; ++i) {
1533 	if (va->va_gid == cred->cr_groups[i]) {
1534 	    if ((vmode & va->va_mode) != vmode)
1535 		return(EACCES);
1536 	    return(0);
1537 	}
1538     }
1539 
1540     /*
1541      * Check world perms
1542      */
1543     vmode >>= 3;
1544     if ((vmode & va->va_mode) != vmode)
1545 	return(EACCES);
1546     return(0);
1547 }
1548 
1549 /*
1550  * Long-term (10-second interval) statistics collection
1551  */
1552 static
1553 uint64_t
1554 collect_nlookup_callback(int n)
1555 {
1556 	static uint64_t last_total;
1557 	uint64_t save;
1558 	uint64_t total;
1559 
1560 	total = 0;
1561 	for (n = 0; n < ncpus; ++n) {
1562 		globaldata_t gd = globaldata_find(n);
1563 		struct nchstats *sp;
1564 
1565 		if ((sp = gd->gd_nchstats) != NULL)
1566 			total += sp->ncs_longhits + sp->ncs_longmiss;
1567 	}
1568 	save = total;
1569 	total = total - last_total;
1570 	last_total = save;
1571 
1572 	return total;
1573 }
1574 
1575 static
1576 void
1577 nlookup_collect_init(void *dummy __unused)
1578 {
1579 	kcollect_register(KCOLLECT_NLOOKUP, "nlookup", collect_nlookup_callback,
1580 			  KCOLLECT_SCALE(KCOLLECT_NLOOKUP_FORMAT, 0));
1581 }
1582 SYSINIT(collect_nlookup, SI_SUB_PROP, SI_ORDER_ANY, nlookup_collect_init, 0);
1583