xref: /netbsd-src/sys/kern/vfs_lookup.c (revision 6a493d6bc668897c91594964a732d38505b70cbb)
1 /*	$NetBSD: vfs_lookup.c,v 1.200 2012/11/18 17:41:53 manu Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * (c) UNIX System Laboratories, Inc.
7  * All or some portions of this file are derived from material licensed
8  * to the University of California by American Telephone and Telegraph
9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10  * the permission of UNIX System Laboratories, Inc.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)vfs_lookup.c	8.10 (Berkeley) 5/27/95
37  */
38 
39 #include <sys/cdefs.h>
40 __KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.200 2012/11/18 17:41:53 manu Exp $");
41 
42 #include "opt_magiclinks.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/kernel.h>
47 #include <sys/syslimits.h>
48 #include <sys/time.h>
49 #include <sys/namei.h>
50 #include <sys/vnode.h>
51 #include <sys/mount.h>
52 #include <sys/errno.h>
53 #include <sys/filedesc.h>
54 #include <sys/hash.h>
55 #include <sys/proc.h>
56 #include <sys/syslog.h>
57 #include <sys/kauth.h>
58 #include <sys/ktrace.h>
59 #include <sys/dirent.h>
60 
/*
 * Fall back to "disabled" when MAGICLINKS has not already been defined
 * (presumably by opt_magiclinks.h, included above -- confirm).
 */
#ifndef MAGICLINKS
#define MAGICLINKS 0
#endif

/*
 * Run-time switch for magic symlink substitution: namei_follow() only
 * calls symlink_magic() when this is non-zero.
 */
int vfs_magiclinks = MAGICLINKS;

/* Compile-time check that MAXNAMLEN and NAME_MAX agree. */
__CTASSERT(MAXNAMLEN == NAME_MAX);
68 
69 /*
70  * Substitute replacement text for 'magic' strings in symlinks.
71  * Returns 0 if successful, and returns non-zero if an error
72  * occurs.  (Currently, the only possible error is running out
73  * of temporary pathname space.)
74  *
75  * Looks for "@<string>" and "@<string>/", where <string> is a
76  * recognized 'magic' string.  Replaces the "@<string>" with the
77  * appropriate replacement text.  (Note that in some cases the
78  * replacement text may have zero length.)
79  *
80  * This would have been table driven, but the variance in
81  * replacement strings (and replacement string lengths) made
82  * that impractical.
83  */
84 #define	VNL(x)							\
85 	(sizeof(x) - 1)
86 
87 #define	VO	'{'
88 #define	VC	'}'
89 
90 #define	MATCH(str)						\
91 	((termchar == '/' && i + VNL(str) == *len) ||		\
92 	 (i + VNL(str) < *len &&				\
93 	  cp[i + VNL(str)] == termchar)) &&			\
94 	!strncmp((str), &cp[i], VNL(str))
95 
96 #define	SUBSTITUTE(m, s, sl)					\
97 	if ((newlen + (sl)) >= MAXPATHLEN)			\
98 		return 1;					\
99 	i += VNL(m);						\
100 	if (termchar != '/')					\
101 		i++;						\
102 	(void)memcpy(&tmp[newlen], (s), (sl));			\
103 	newlen += (sl);						\
104 	change = 1;						\
105 	termchar = '/';
106 
107 static int
108 symlink_magic(struct proc *p, char *cp, size_t *len)
109 {
110 	char *tmp;
111 	size_t change, i, newlen, slen;
112 	char termchar = '/';
113 	char idtmp[11]; /* enough for 32 bit *unsigned* integer */
114 
115 
116 	tmp = PNBUF_GET();
117 	for (change = i = newlen = 0; i < *len; ) {
118 		if (cp[i] != '@') {
119 			tmp[newlen++] = cp[i++];
120 			continue;
121 		}
122 
123 		i++;
124 
125 		/* Check for @{var} syntax. */
126 		if (cp[i] == VO) {
127 			termchar = VC;
128 			i++;
129 		}
130 
131 		/*
132 		 * The following checks should be ordered according
133 		 * to frequency of use.
134 		 */
135 		if (MATCH("machine_arch")) {
136 			slen = VNL(MACHINE_ARCH);
137 			SUBSTITUTE("machine_arch", MACHINE_ARCH, slen);
138 		} else if (MATCH("machine")) {
139 			slen = VNL(MACHINE);
140 			SUBSTITUTE("machine", MACHINE, slen);
141 		} else if (MATCH("hostname")) {
142 			SUBSTITUTE("hostname", hostname, hostnamelen);
143 		} else if (MATCH("osrelease")) {
144 			slen = strlen(osrelease);
145 			SUBSTITUTE("osrelease", osrelease, slen);
146 		} else if (MATCH("emul")) {
147 			slen = strlen(p->p_emul->e_name);
148 			SUBSTITUTE("emul", p->p_emul->e_name, slen);
149 		} else if (MATCH("kernel_ident")) {
150 			slen = strlen(kernel_ident);
151 			SUBSTITUTE("kernel_ident", kernel_ident, slen);
152 		} else if (MATCH("domainname")) {
153 			SUBSTITUTE("domainname", domainname, domainnamelen);
154 		} else if (MATCH("ostype")) {
155 			slen = strlen(ostype);
156 			SUBSTITUTE("ostype", ostype, slen);
157 		} else if (MATCH("uid")) {
158 			slen = snprintf(idtmp, sizeof(idtmp), "%u",
159 			    kauth_cred_geteuid(kauth_cred_get()));
160 			SUBSTITUTE("uid", idtmp, slen);
161 		} else if (MATCH("ruid")) {
162 			slen = snprintf(idtmp, sizeof(idtmp), "%u",
163 			    kauth_cred_getuid(kauth_cred_get()));
164 			SUBSTITUTE("ruid", idtmp, slen);
165 		} else if (MATCH("gid")) {
166 			slen = snprintf(idtmp, sizeof(idtmp), "%u",
167 			    kauth_cred_getegid(kauth_cred_get()));
168 			SUBSTITUTE("gid", idtmp, slen);
169 		} else if (MATCH("rgid")) {
170 			slen = snprintf(idtmp, sizeof(idtmp), "%u",
171 			    kauth_cred_getgid(kauth_cred_get()));
172 			SUBSTITUTE("rgid", idtmp, slen);
173 		} else {
174 			tmp[newlen++] = '@';
175 			if (termchar == VC)
176 				tmp[newlen++] = VO;
177 		}
178 	}
179 
180 	if (change) {
181 		(void)memcpy(cp, tmp, newlen);
182 		*len = newlen;
183 	}
184 	PNBUF_PUT(tmp);
185 
186 	return 0;
187 }
188 
189 #undef VNL
190 #undef VO
191 #undef VC
192 #undef MATCH
193 #undef SUBSTITUTE
194 
195 ////////////////////////////////////////////////////////////
196 
197 /*
198  * Determine the namei hash (for the namecache) for name.
199  * If *ep != NULL, hash from name to ep-1.
200  * If *ep == NULL, hash from name until the first NUL or '/', and
201  * return the location of this termination character in *ep.
202  *
203  * This function returns an equivalent hash to the MI hash32_strn().
204  * The latter isn't used because in the *ep == NULL case, determining
205  * the length of the string to the first NUL or `/' and then calling
206  * hash32_strn() involves unnecessary double-handling of the data.
207  */
208 uint32_t
209 namei_hash(const char *name, const char **ep)
210 {
211 	uint32_t	hash;
212 
213 	hash = HASH32_STR_INIT;
214 	if (*ep != NULL) {
215 		for (; name < *ep; name++)
216 			hash = hash * 33 + *(const uint8_t *)name;
217 	} else {
218 		for (; *name != '\0' && *name != '/'; name++)
219 			hash = hash * 33 + *(const uint8_t *)name;
220 		*ep = name;
221 	}
222 	return (hash + (hash >> 5));
223 }
224 
/*
 * Return the length of the leading path component of NAME, i.e. the
 * number of bytes before the first '/' or the terminating NUL.
 */
static size_t
namei_getcomponent(const char *name)
{
	const char *end;

	for (end = name; *end != '\0' && *end != '/'; end++)
		continue;

	return (size_t)(end - name);
}
240 
241 ////////////////////////////////////////////////////////////
242 
243 /*
244  * Sealed abstraction for pathnames.
245  *
246  * System-call-layer level code that is going to call namei should
247  * first create a pathbuf and adjust all the bells and whistles on it
248  * as needed by context.
249  */
250 
struct pathbuf {
	/* The path text; namei_init() points ni_pnbuf at this buffer. */
	char *pb_path;
	/*
	 * Saved copy of pb_path handed out by pathbuf_stringcopy_get(),
	 * or NULL while no copy is outstanding.
	 */
	char *pb_pathcopy;
	/* Number of pathbuf_stringcopy_get() calls not yet put back. */
	unsigned pb_pathcopyuses;
};
256 
257 static struct pathbuf *
258 pathbuf_create_raw(void)
259 {
260 	struct pathbuf *pb;
261 
262 	pb = kmem_alloc(sizeof(*pb), KM_SLEEP);
263 	if (pb == NULL) {
264 		return NULL;
265 	}
266 	pb->pb_path = PNBUF_GET();
267 	if (pb->pb_path == NULL) {
268 		kmem_free(pb, sizeof(*pb));
269 		return NULL;
270 	}
271 	pb->pb_pathcopy = NULL;
272 	pb->pb_pathcopyuses = 0;
273 	return pb;
274 }
275 
276 void
277 pathbuf_destroy(struct pathbuf *pb)
278 {
279 	KASSERT(pb->pb_pathcopyuses == 0);
280 	KASSERT(pb->pb_pathcopy == NULL);
281 	PNBUF_PUT(pb->pb_path);
282 	kmem_free(pb, sizeof(*pb));
283 }
284 
285 struct pathbuf *
286 pathbuf_assimilate(char *pnbuf)
287 {
288 	struct pathbuf *pb;
289 
290 	pb = kmem_alloc(sizeof(*pb), KM_SLEEP);
291 	if (pb == NULL) {
292 		return NULL;
293 	}
294 	pb->pb_path = pnbuf;
295 	pb->pb_pathcopy = NULL;
296 	pb->pb_pathcopyuses = 0;
297 	return pb;
298 }
299 
300 struct pathbuf *
301 pathbuf_create(const char *path)
302 {
303 	struct pathbuf *pb;
304 	int error;
305 
306 	pb = pathbuf_create_raw();
307 	if (pb == NULL) {
308 		return NULL;
309 	}
310 	error = copystr(path, pb->pb_path, PATH_MAX, NULL);
311 	if (error != 0) {
312 		KASSERT(!"kernel path too long in pathbuf_create");
313 		/* make sure it's null-terminated, just in case */
314 		pb->pb_path[PATH_MAX-1] = '\0';
315 	}
316 	return pb;
317 }
318 
319 int
320 pathbuf_copyin(const char *userpath, struct pathbuf **ret)
321 {
322 	struct pathbuf *pb;
323 	int error;
324 
325 	pb = pathbuf_create_raw();
326 	if (pb == NULL) {
327 		return ENOMEM;
328 	}
329 	error = copyinstr(userpath, pb->pb_path, PATH_MAX, NULL);
330 	if (error) {
331 		pathbuf_destroy(pb);
332 		return error;
333 	}
334 	*ret = pb;
335 	return 0;
336 }
337 
338 /*
339  * XXX should not exist:
340  *   1. whether a pointer is kernel or user should be statically checkable.
341  *   2. copyin should be handled by the upper part of the syscall layer,
342  *      not in here.
343  */
344 int
345 pathbuf_maybe_copyin(const char *path, enum uio_seg seg, struct pathbuf **ret)
346 {
347 	if (seg == UIO_USERSPACE) {
348 		return pathbuf_copyin(path, ret);
349 	} else {
350 		*ret = pathbuf_create(path);
351 		if (*ret == NULL) {
352 			return ENOMEM;
353 		}
354 		return 0;
355 	}
356 }
357 
358 /*
359  * Get a copy of the path buffer as it currently exists. If this is
360  * called after namei starts the results may be arbitrary.
361  */
362 void
363 pathbuf_copystring(const struct pathbuf *pb, char *buf, size_t maxlen)
364 {
365 	strlcpy(buf, pb->pb_path, maxlen);
366 }
367 
368 /*
369  * These two functions allow access to a saved copy of the original
370  * path string. The first copy should be gotten before namei is
371  * called. Each copy that is gotten should be put back.
372  */
373 
374 const char *
375 pathbuf_stringcopy_get(struct pathbuf *pb)
376 {
377 	if (pb->pb_pathcopyuses == 0) {
378 		pb->pb_pathcopy = PNBUF_GET();
379 		strcpy(pb->pb_pathcopy, pb->pb_path);
380 	}
381 	pb->pb_pathcopyuses++;
382 	return pb->pb_pathcopy;
383 }
384 
385 void
386 pathbuf_stringcopy_put(struct pathbuf *pb, const char *str)
387 {
388 	KASSERT(str == pb->pb_pathcopy);
389 	KASSERT(pb->pb_pathcopyuses > 0);
390 	pb->pb_pathcopyuses--;
391 	if (pb->pb_pathcopyuses == 0) {
392 		PNBUF_PUT(pb->pb_pathcopy);
393 		pb->pb_pathcopy = NULL;
394 	}
395 }
396 
397 
398 ////////////////////////////////////////////////////////////
399 
400 /*
401  * namei: convert a pathname into a pointer to a (maybe-locked) vnode,
402  * and maybe also its parent directory vnode, and assorted other guff.
403  * See namei(9) for the interface documentation.
404  *
405  *
406  * The FOLLOW flag is set when symbolic links are to be followed
407  * when they occur at the end of the name translation process.
408  * Symbolic links are always followed for all other pathname
409  * components other than the last.
410  *
411  * The segflg defines whether the name is to be copied from user
412  * space or kernel space.
413  *
414  * Overall outline of namei:
415  *
416  *	copy in name
417  *	get starting directory
418  *	while (!done && !error) {
419  *		call lookup to search path.
420  *		if symbolic link, massage name in buffer and continue
421  *	}
422  */
423 
424 /*
425  * Search a pathname.
426  * This is a very central and rather complicated routine.
427  *
428  * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
429  * The starting directory is passed in. The pathname is descended
430  * until done, or a symbolic link is encountered. The variable ni_more
431  * is clear if the path is completed; it is set to one if a symbolic
432  * link needing interpretation is encountered.
433  *
434  * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
435  * whether the name is to be looked up, created, renamed, or deleted.
436  * When CREATE, RENAME, or DELETE is specified, information usable in
437  * creating, renaming, or deleting a directory entry may be calculated.
438  * If flag has LOCKPARENT or'ed into it, the parent directory is returned
439  * locked.  Otherwise the parent directory is not returned. If the target
440  * of the pathname exists and LOCKLEAF is or'ed into the flag the target
441  * is returned locked, otherwise it is returned unlocked.  When creating
442  * or renaming and LOCKPARENT is specified, the target may not be ".".
443  * When deleting and LOCKPARENT is specified, the target may be ".".
444  *
445  * Overall outline of lookup:
446  *
447  * dirloop:
448  *	identify next component of name at ndp->ni_ptr
449  *	handle degenerate case where name is null string
450  *	if .. and crossing mount points and on mounted filesys, find parent
451  *	call VOP_LOOKUP routine for next component name
452  *	    directory vnode returned in ni_dvp, locked.
453  *	    component vnode returned in ni_vp (if it exists), locked.
454  *	if result vnode is mounted on and crossing mount points,
455  *	    find mounted on vnode
456  *	if more components of name, do next level at dirloop
457  *	return the answer in ni_vp, locked if LOCKLEAF set
458  *	    if LOCKPARENT set, return locked parent in ni_dvp
459  */
460 
461 
462 /*
463  * Internal state for a namei operation.
464  *
 * cnp is always equal to &ndp->ni_cnd.
466  */
struct namei_state {
	struct nameidata *ndp;		/* the caller's nameidata */
	struct componentname *cnp;	/* set to &ndp->ni_cnd by namei_init() */

	int docache;			/* == 0 do not cache last component */
	int rdonly;			/* lookup read-only flag bit */
	/*
	 * Number of '/' characters following the component most recently
	 * parsed by lookup_parsepath(); 0 if there were none.
	 */
	int slashes;

	unsigned attempt_retry:1;	/* true if error allows emul retry */
};
477 
478 
479 /*
480  * Initialize the namei working state.
481  */
482 static void
483 namei_init(struct namei_state *state, struct nameidata *ndp)
484 {
485 	state->ndp = ndp;
486 	state->cnp = &ndp->ni_cnd;
487 	KASSERT((state->cnp->cn_flags & INRELOOKUP) == 0);
488 
489 	state->docache = 0;
490 	state->rdonly = 0;
491 	state->slashes = 0;
492 
493 #ifdef DIAGNOSTIC
494 	if (!state->cnp->cn_cred)
495 		panic("namei: bad cred/proc");
496 	if (state->cnp->cn_nameiop & (~OPMASK))
497 		panic("namei: nameiop contaminated with flags");
498 	if (state->cnp->cn_flags & OPMASK)
499 		panic("namei: flags contaminated with nameiops");
500 #endif
501 
502 	/*
503 	 * The buffer for name translation shall be the one inside the
504 	 * pathbuf.
505 	 */
506 	state->ndp->ni_pnbuf = state->ndp->ni_pathbuf->pb_path;
507 }
508 
509 /*
510  * Clean up the working namei state, leaving things ready for return
511  * from namei.
512  */
513 static void
514 namei_cleanup(struct namei_state *state)
515 {
516 	KASSERT(state->cnp == &state->ndp->ni_cnd);
517 
518 	/* nothing for now */
519 	(void)state;
520 }
521 
522 //////////////////////////////
523 
524 /*
525  * Get the directory context.
526  * Initializes the rootdir and erootdir state and returns a reference
527  * to the starting dir.
528  */
529 static struct vnode *
530 namei_getstartdir(struct namei_state *state)
531 {
532 	struct nameidata *ndp = state->ndp;
533 	struct componentname *cnp = state->cnp;
534 	struct cwdinfo *cwdi;		/* pointer to cwd state */
535 	struct lwp *self = curlwp;	/* thread doing namei() */
536 	struct vnode *rootdir, *erootdir, *curdir, *startdir;
537 
538 	cwdi = self->l_proc->p_cwdi;
539 	rw_enter(&cwdi->cwdi_lock, RW_READER);
540 
541 	/* root dir */
542 	if (cwdi->cwdi_rdir == NULL || (cnp->cn_flags & NOCHROOT)) {
543 		rootdir = rootvnode;
544 	} else {
545 		rootdir = cwdi->cwdi_rdir;
546 	}
547 
548 	/* emulation root dir, if any */
549 	if ((cnp->cn_flags & TRYEMULROOT) == 0) {
550 		/* if we don't want it, don't fetch it */
551 		erootdir = NULL;
552 	} else if (cnp->cn_flags & EMULROOTSET) {
553 		/* explicitly set emulroot; "/../" doesn't override this */
554 		erootdir = ndp->ni_erootdir;
555 	} else if (!strncmp(ndp->ni_pnbuf, "/../", 4)) {
556 		/* explicit reference to real rootdir */
557 		erootdir = NULL;
558 	} else {
559 		/* may be null */
560 		erootdir = cwdi->cwdi_edir;
561 	}
562 
563 	/* current dir */
564 	curdir = cwdi->cwdi_cdir;
565 
566 	if (ndp->ni_pnbuf[0] != '/') {
567 		if (ndp->ni_atdir != NULL) {
568 			startdir = ndp->ni_atdir;
569 		} else {
570 			startdir = curdir;
571 		}
572 		erootdir = NULL;
573 	} else if (cnp->cn_flags & TRYEMULROOT && erootdir != NULL) {
574 		startdir = erootdir;
575 	} else {
576 		startdir = rootdir;
577 		erootdir = NULL;
578 	}
579 
580 	state->ndp->ni_rootdir = rootdir;
581 	state->ndp->ni_erootdir = erootdir;
582 
583 	/*
584 	 * Get a reference to the start dir so we can safely unlock cwdi.
585 	 *
586 	 * XXX: should we hold references to rootdir and erootdir while
587 	 * we're running? What happens if a multithreaded process chroots
588 	 * during namei?
589 	 */
590 	vref(startdir);
591 
592 	rw_exit(&cwdi->cwdi_lock);
593 	return startdir;
594 }
595 
596 /*
597  * Get the directory context for the nfsd case, in parallel to
598  * getstartdir. Initializes the rootdir and erootdir state and
599  * returns a reference to the passed-in starting dir.
600  */
601 static struct vnode *
602 namei_getstartdir_for_nfsd(struct namei_state *state)
603 {
604 	KASSERT(state->ndp->ni_atdir != NULL);
605 
606 	/* always use the real root, and never set an emulation root */
607 	state->ndp->ni_rootdir = rootvnode;
608 	state->ndp->ni_erootdir = NULL;
609 
610 	vref(state->ndp->ni_atdir);
611 	return state->ndp->ni_atdir;
612 }
613 
614 
615 /*
616  * Ktrace the namei operation.
617  */
618 static void
619 namei_ktrace(struct namei_state *state)
620 {
621 	struct nameidata *ndp = state->ndp;
622 	struct componentname *cnp = state->cnp;
623 	struct lwp *self = curlwp;	/* thread doing namei() */
624 	const char *emul_path;
625 
626 	if (ktrpoint(KTR_NAMEI)) {
627 		if (ndp->ni_erootdir != NULL) {
628 			/*
629 			 * To make any sense, the trace entry need to have the
630 			 * text of the emulation path prepended.
631 			 * Usually we can get this from the current process,
632 			 * but when called from emul_find_interp() it is only
633 			 * in the exec_package - so we get it passed in ni_next
634 			 * (this is a hack).
635 			 */
636 			if (cnp->cn_flags & EMULROOTSET)
637 				emul_path = ndp->ni_next;
638 			else
639 				emul_path = self->l_proc->p_emul->e_path;
640 			ktrnamei2(emul_path, strlen(emul_path),
641 			    ndp->ni_pnbuf, ndp->ni_pathlen);
642 		} else
643 			ktrnamei(ndp->ni_pnbuf, ndp->ni_pathlen);
644 	}
645 }
646 
647 /*
648  * Start up namei. Find the root dir and cwd, establish the starting
649  * directory for lookup, and lock it. Also calls ktrace when
650  * appropriate.
651  */
652 static int
653 namei_start(struct namei_state *state, int isnfsd,
654 	    struct vnode **startdir_ret)
655 {
656 	struct nameidata *ndp = state->ndp;
657 	struct vnode *startdir;
658 
659 	/* length includes null terminator (was originally from copyinstr) */
660 	ndp->ni_pathlen = strlen(ndp->ni_pnbuf) + 1;
661 
662 	/*
663 	 * POSIX.1 requirement: "" is not a valid file name.
664 	 */
665 	if (ndp->ni_pathlen == 1) {
666 		return ENOENT;
667 	}
668 
669 	ndp->ni_loopcnt = 0;
670 
671 	/* Get starting directory, set up root, and ktrace. */
672 	if (isnfsd) {
673 		startdir = namei_getstartdir_for_nfsd(state);
674 		/* no ktrace */
675 	} else {
676 		startdir = namei_getstartdir(state);
677 		namei_ktrace(state);
678 	}
679 
680 	/* NDAT may feed us with a non directory namei_getstartdir */
681 	if (startdir->v_type != VDIR)
682 		return ENOTDIR;
683 
684 	vn_lock(startdir, LK_EXCLUSIVE | LK_RETRY);
685 
686 	*startdir_ret = startdir;
687 	return 0;
688 }
689 
690 /*
691  * Check for being at a symlink that we're going to follow.
692  */
693 static inline int
694 namei_atsymlink(struct namei_state *state, struct vnode *foundobj)
695 {
696 	return (foundobj->v_type == VLNK) &&
697 		(state->cnp->cn_flags & (FOLLOW|REQUIREDIR));
698 }
699 
/*
 * Follow a symlink.
 *
 * Reads the target of the symlink foundobj, splices it in front of
 * the not-yet-parsed remainder of the path in ni_pnbuf, and restarts
 * parsing from the beginning of that buffer.  If the resulting path
 * is absolute, the search directory is replaced by the emulation root
 * or the root directory.
 *
 * Updates searchdir. inhibitmagic causes magic symlinks to not be
 * interpreted; this is used by nfsd.
 *
 * Both searchdir and foundobj must be exclusively locked on entry.
 * On success returns 0 and stores the (exclusively locked) directory
 * to continue from in *newsearchdir_ret.  Otherwise returns ELOOP if
 * too many links have been followed, ENOENT for an empty link,
 * ENAMETOOLONG if the combined path does not fit, or the error from
 * VOP_ACCESS()/VOP_READLINK(); *newsearchdir_ret is not written then.
 *
 * Unlocks foundobj on success (ugh)
 */
static inline int
namei_follow(struct namei_state *state, int inhibitmagic,
	     struct vnode *searchdir, struct vnode *foundobj,
	     struct vnode **newsearchdir_ret)
{
	struct nameidata *ndp = state->ndp;
	struct componentname *cnp = state->cnp;

	struct lwp *self = curlwp;	/* thread doing namei() */
	struct iovec aiov;		/* uio for reading symbolic links */
	struct uio auio;
	char *cp;			/* pointer into pathname argument */
	size_t linklen;
	int error;

	KASSERT(VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE);
	KASSERT(VOP_ISLOCKED(foundobj) == LK_EXCLUSIVE);
	/* Bound the number of links followed in one namei operation. */
	if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
		return ELOOP;
	}
	/* On MNT_SYMPERM mounts, following the link requires VEXEC on it. */
	if (foundobj->v_mount->mnt_flag & MNT_SYMPERM) {
		error = VOP_ACCESS(foundobj, VEXEC, cnp->cn_cred);
		if (error != 0)
			return error;
	}

	/* FUTURE: fix this to not use a second buffer */
	/* Read the link text into a scratch pathname buffer. */
	cp = PNBUF_GET();
	aiov.iov_base = cp;
	aiov.iov_len = MAXPATHLEN;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_rw = UIO_READ;
	auio.uio_resid = MAXPATHLEN;
	UIO_SETUP_SYSSPACE(&auio);
	error = VOP_READLINK(foundobj, &auio, cnp->cn_cred);
	if (error) {
		PNBUF_PUT(cp);
		return error;
	}
	/* Whatever was not left over in the uio is the link's length. */
	linklen = MAXPATHLEN - auio.uio_resid;
	if (linklen == 0) {
		PNBUF_PUT(cp);
		return ENOENT;
	}

	/*
	 * Do symlink substitution, if appropriate, and
	 * check length for potential overflow.
	 *
	 * Inhibit symlink substitution for nfsd.
	 * XXX: This is how it was before; is that a bug or a feature?
	 */
	if ((!inhibitmagic && vfs_magiclinks &&
	     symlink_magic(self->l_proc, cp, &linklen)) ||
	    (linklen + ndp->ni_pathlen >= MAXPATHLEN)) {
		PNBUF_PUT(cp);
		return ENAMETOOLONG;
	}
	/* Append the unparsed remainder of the path to the link text. */
	if (ndp->ni_pathlen > 1) {
		/* includes a null-terminator */
		memcpy(cp + linklen, ndp->ni_next, ndp->ni_pathlen);
	} else {
		cp[linklen] = '\0';
	}
	/* The combined string becomes the new contents of the path buffer. */
	ndp->ni_pathlen += linklen;
	memcpy(ndp->ni_pnbuf, cp, ndp->ni_pathlen);
	PNBUF_PUT(cp);

	/* we're now starting from the beginning of the buffer again */
	cnp->cn_nameptr = ndp->ni_pnbuf;

	/* must unlock this before relocking searchdir */
	VOP_UNLOCK(foundobj);

	/*
	 * Check if root directory should replace current directory.
	 */
	if (ndp->ni_pnbuf[0] == '/') {
		/* Drop the lock and reference on the old search directory. */
		vput(searchdir);
		/* Keep absolute symbolic links inside emulation root */
		searchdir = ndp->ni_erootdir;
		/*
		 * Without an emulation root, or when the link starts with
		 * "/../", continue from the root directory instead and
		 * forget the emulation root.
		 */
		if (searchdir == NULL ||
		    (ndp->ni_pnbuf[1] == '.'
		     && ndp->ni_pnbuf[2] == '.'
		     && ndp->ni_pnbuf[3] == '/')) {
			ndp->ni_erootdir = NULL;
			searchdir = ndp->ni_rootdir;
		}
		vref(searchdir);
		vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
		/* Skip the leading slashes, keeping ni_pathlen in step. */
		while (cnp->cn_nameptr[0] == '/') {
			cnp->cn_nameptr++;
			ndp->ni_pathlen--;
		}
	}

	*newsearchdir_ret = searchdir;
	KASSERT(VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE);
	return 0;
}
810 
811 //////////////////////////////
812 
813 /*
814  * Inspect the leading path component and update the state accordingly.
815  */
816 static int
817 lookup_parsepath(struct namei_state *state)
818 {
819 	const char *cp;			/* pointer into pathname argument */
820 
821 	struct componentname *cnp = state->cnp;
822 	struct nameidata *ndp = state->ndp;
823 
824 	KASSERT(cnp == &ndp->ni_cnd);
825 
826 	/*
827 	 * Search a new directory.
828 	 *
829 	 * The last component of the filename is left accessible via
830 	 * cnp->cn_nameptr for callers that need the name. Callers needing
831 	 * the name set the SAVENAME flag. When done, they assume
832 	 * responsibility for freeing the pathname buffer.
833 	 *
834 	 * At this point, our only vnode state is that the search dir
835 	 * is held and locked.
836 	 */
837 	cnp->cn_consume = 0;
838 	cnp->cn_namelen = namei_getcomponent(cnp->cn_nameptr);
839 	cp = cnp->cn_nameptr + cnp->cn_namelen;
840 	if (cnp->cn_namelen > KERNEL_NAME_MAX) {
841 		return ENAMETOOLONG;
842 	}
843 #ifdef NAMEI_DIAGNOSTIC
844 	{ char c = *cp;
845 	*(char *)cp = '\0';
846 	printf("{%s}: ", cnp->cn_nameptr);
847 	*(char *)cp = c; }
848 #endif /* NAMEI_DIAGNOSTIC */
849 	ndp->ni_pathlen -= cnp->cn_namelen;
850 	ndp->ni_next = cp;
851 	/*
852 	 * If this component is followed by a slash, then move the pointer to
853 	 * the next component forward, and remember that this component must be
854 	 * a directory.
855 	 */
856 	if (*cp == '/') {
857 		do {
858 			cp++;
859 		} while (*cp == '/');
860 		state->slashes = cp - ndp->ni_next;
861 		ndp->ni_pathlen -= state->slashes;
862 		ndp->ni_next = cp;
863 		cnp->cn_flags |= REQUIREDIR;
864 	} else {
865 		state->slashes = 0;
866 		cnp->cn_flags &= ~REQUIREDIR;
867 	}
868 	/*
869 	 * We do special processing on the last component, whether or not it's
870 	 * a directory.  Cache all intervening lookups, but not the final one.
871 	 */
872 	if (*cp == '\0') {
873 		if (state->docache)
874 			cnp->cn_flags |= MAKEENTRY;
875 		else
876 			cnp->cn_flags &= ~MAKEENTRY;
877 		cnp->cn_flags |= ISLASTCN;
878 	} else {
879 		cnp->cn_flags |= MAKEENTRY;
880 		cnp->cn_flags &= ~ISLASTCN;
881 	}
882 	if (cnp->cn_namelen == 2 &&
883 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
884 		cnp->cn_flags |= ISDOTDOT;
885 	else
886 		cnp->cn_flags &= ~ISDOTDOT;
887 
888 	return 0;
889 }
890 
891 /*
892  * Call VOP_LOOKUP for a single lookup; return a new search directory
893  * (used when crossing mountpoints up or searching union mounts down) and
894  * the found object, which for create operations may be NULL on success.
895  */
896 static int
897 lookup_once(struct namei_state *state,
898 	    struct vnode *searchdir,
899 	    struct vnode **newsearchdir_ret,
900 	    struct vnode **foundobj_ret)
901 {
902 	struct vnode *tmpvn;		/* scratch vnode */
903 	struct vnode *foundobj;		/* result */
904 	struct mount *mp;		/* mount table entry */
905 	struct lwp *l = curlwp;
906 	int error;
907 
908 	struct componentname *cnp = state->cnp;
909 	struct nameidata *ndp = state->ndp;
910 
911 	KASSERT(cnp == &ndp->ni_cnd);
912 	KASSERT(VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE);
913 	*newsearchdir_ret = searchdir;
914 
915 	/*
916 	 * Handle "..": two special cases.
917 	 * 1. If at root directory (e.g. after chroot)
918 	 *    or at absolute root directory
919 	 *    then ignore it so can't get out.
920 	 * 1a. If at the root of the emulation filesystem go to the real
921 	 *    root. So "/../<path>" is always absolute.
922 	 * 1b. If we have somehow gotten out of a jail, warn
923 	 *    and also ignore it so we can't get farther out.
924 	 * 2. If this vnode is the root of a mounted
925 	 *    filesystem, then replace it with the
926 	 *    vnode which was mounted on so we take the
927 	 *    .. in the other file system.
928 	 */
929 	if (cnp->cn_flags & ISDOTDOT) {
930 		struct proc *p = l->l_proc;
931 
932 		for (;;) {
933 			if (searchdir == ndp->ni_rootdir ||
934 			    searchdir == rootvnode) {
935 				foundobj = searchdir;
936 				vref(foundobj);
937 				*foundobj_ret = foundobj;
938 				error = 0;
939 				goto done;
940 			}
941 			if (ndp->ni_rootdir != rootvnode) {
942 				int retval;
943 
944 				VOP_UNLOCK(searchdir);
945 				retval = vn_isunder(searchdir, ndp->ni_rootdir, l);
946 				vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
947 				if (!retval) {
948 				    /* Oops! We got out of jail! */
949 				    log(LOG_WARNING,
950 					"chrooted pid %d uid %d (%s) "
951 					"detected outside of its chroot\n",
952 					p->p_pid, kauth_cred_geteuid(l->l_cred),
953 					p->p_comm);
954 				    /* Put us at the jail root. */
955 				    vput(searchdir);
956 				    searchdir = NULL;
957 				    foundobj = ndp->ni_rootdir;
958 				    vref(foundobj);
959 				    vref(foundobj);
960 				    vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
961 				    *newsearchdir_ret = foundobj;
962 				    *foundobj_ret = foundobj;
963 				    error = 0;
964 				    goto done;
965 				}
966 			}
967 			if ((searchdir->v_vflag & VV_ROOT) == 0 ||
968 			    (cnp->cn_flags & NOCROSSMOUNT))
969 				break;
970 			tmpvn = searchdir;
971 			searchdir = searchdir->v_mount->mnt_vnodecovered;
972 			vref(searchdir);
973 			vput(tmpvn);
974 			vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
975 			*newsearchdir_ret = searchdir;
976 		}
977 	}
978 
979 	/*
980 	 * We now have a segment name to search for, and a directory to search.
981 	 * Our vnode state here is that "searchdir" is held and locked.
982 	 */
983 unionlookup:
984 	foundobj = NULL;
985 	error = VOP_LOOKUP(searchdir, &foundobj, cnp);
986 
987 	if (error != 0) {
988 #ifdef DIAGNOSTIC
989 		if (foundobj != NULL)
990 			panic("leaf `%s' should be empty", cnp->cn_nameptr);
991 #endif /* DIAGNOSTIC */
992 #ifdef NAMEI_DIAGNOSTIC
993 		printf("not found\n");
994 #endif /* NAMEI_DIAGNOSTIC */
995 		if ((error == ENOENT) &&
996 		    (searchdir->v_vflag & VV_ROOT) &&
997 		    (searchdir->v_mount->mnt_flag & MNT_UNION)) {
998 			tmpvn = searchdir;
999 			searchdir = searchdir->v_mount->mnt_vnodecovered;
1000 			vref(searchdir);
1001 			vput(tmpvn);
1002 			vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
1003 			*newsearchdir_ret = searchdir;
1004 			goto unionlookup;
1005 		}
1006 
1007 		if (error != EJUSTRETURN)
1008 			goto done;
1009 
1010 		/*
1011 		 * If this was not the last component, or there were trailing
1012 		 * slashes, and we are not going to create a directory,
1013 		 * then the name must exist.
1014 		 */
1015 		if ((cnp->cn_flags & (REQUIREDIR | CREATEDIR)) == REQUIREDIR) {
1016 			error = ENOENT;
1017 			goto done;
1018 		}
1019 
1020 		/*
1021 		 * If creating and at end of pathname, then can consider
1022 		 * allowing file to be created.
1023 		 */
1024 		if (state->rdonly) {
1025 			error = EROFS;
1026 			goto done;
1027 		}
1028 
1029 		/*
1030 		 * We return success and a NULL foundobj to indicate
1031 		 * that the entry doesn't currently exist, leaving a
1032 		 * pointer to the (normally, locked) directory vnode
1033 		 * as searchdir.
1034 		 */
1035 		*foundobj_ret = NULL;
1036 		error = 0;
1037 		goto done;
1038 	}
1039 #ifdef NAMEI_DIAGNOSTIC
1040 	printf("found\n");
1041 #endif /* NAMEI_DIAGNOSTIC */
1042 
1043 	/*
1044 	 * Take into account any additional components consumed by the
1045 	 * underlying filesystem.  This will include any trailing slashes after
1046 	 * the last component consumed.
1047 	 */
1048 	if (cnp->cn_consume > 0) {
1049 		ndp->ni_pathlen -= cnp->cn_consume - state->slashes;
1050 		ndp->ni_next += cnp->cn_consume - state->slashes;
1051 		cnp->cn_consume = 0;
1052 		if (ndp->ni_next[0] == '\0')
1053 			cnp->cn_flags |= ISLASTCN;
1054 	}
1055 
1056 	/*
1057 	 * "foundobj" and "searchdir" are both locked and held,
1058 	 * and may be the same vnode.
1059 	 */
1060 
1061 	/*
1062 	 * Check to see if the vnode has been mounted on;
1063 	 * if so find the root of the mounted file system.
1064 	 */
1065 	while (foundobj->v_type == VDIR &&
1066 	       (mp = foundobj->v_mountedhere) != NULL &&
1067 	       (cnp->cn_flags & NOCROSSMOUNT) == 0) {
1068 		error = vfs_busy(mp, NULL);
1069 		if (error != 0) {
1070 			if (searchdir != foundobj) {
1071 				vput(foundobj);
1072 			} else {
1073 				vrele(foundobj);
1074 			}
1075 			goto done;
1076 		}
1077 		if (searchdir != foundobj) {
1078 			VOP_UNLOCK(searchdir);
1079 		}
1080 		vput(foundobj);
1081 		error = VFS_ROOT(mp, &foundobj);
1082 		vfs_unbusy(mp, false, NULL);
1083 		if (error) {
1084 			vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
1085 			goto done;
1086 		}
1087 		/*
1088 		 * avoid locking vnodes from two filesystems because it's
1089 		 * prone to deadlock.  eg. when using puffs.
1090 		 * also, it isn't a good idea to propagate slowness of a
1091 		 * filesystem up to the root directory.
1092 		 * for now, only handle the common case.  (ie. foundobj is VDIR)
1093 		 */
1094 		if (foundobj->v_type == VDIR) {
1095 			vrele(searchdir);
1096 			*newsearchdir_ret = searchdir = foundobj;
1097 			vref(searchdir);
1098 		} else {
1099 			VOP_UNLOCK(foundobj);
1100 			vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
1101 			vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
1102 		}
1103 	}
1104 
1105 	*foundobj_ret = foundobj;
1106 	error = 0;
1107 done:
1108 	KASSERT(VOP_ISLOCKED(*newsearchdir_ret) == LK_EXCLUSIVE);
1109 	/*
1110 	 * *foundobj_ret is valid only if error == 0.
1111 	 */
1112 	KASSERT(error != 0 || *foundobj_ret == NULL ||
1113 	    VOP_ISLOCKED(*foundobj_ret) == LK_EXCLUSIVE);
1114 	return error;
1115 }
1116 
1117 //////////////////////////////
1118 
1119 /*
1120  * Do a complete path search from a single root directory.
1121  * (This is called up to twice if TRYEMULROOT is in effect.)
      *
      * The start directory comes from namei_start().  Each pass of the
      * main loop parses one component (lookup_parsepath), looks it up
      * (lookup_once) and then either follows a symlink, stops at the
      * last component, or descends into the object that was just found.
      *
      * neverfollow:  if nonzero, a symlink reported by namei_atsymlink()
      *               fails with EINVAL instead of being followed.
      * inhibitmagic: passed through to namei_follow().
      * isnfsd:       passed through to namei_start().
      *
      * Returns 0 with ndp->ni_vp set to the object found (NULL when
      * lookup_once succeeded without returning one) and ndp->ni_dvp set
      * to the parent directory when LOCKPARENT is set and a parent is
      * available, otherwise NULL.  The found object is unlocked before
      * returning unless LOCKLEAF is set.
      *
      * On failure returns an errno value with both ni_dvp and ni_vp set
      * to NULL.  Several (not all) failure paths also set
      * state->attempt_retry, which namei_tryemulroot() consults.
1122  */
1123 static int
1124 namei_oneroot(struct namei_state *state,
1125 	 int neverfollow, int inhibitmagic, int isnfsd)
1126 {
1127 	struct nameidata *ndp = state->ndp;
1128 	struct componentname *cnp = state->cnp;
1129 	struct vnode *searchdir, *foundobj;
1130 	int error;
1131 
1132 	error = namei_start(state, isnfsd, &searchdir);
1133 	if (error) {
1134 		ndp->ni_dvp = NULL;
1135 		ndp->ni_vp = NULL;
1136 		return error;
1137 	}
1138 	KASSERT(searchdir->v_type == VDIR);
1139 
1140 	/*
1141 	 * Setup: break out flag bits into variables.
      	 * docache ends up nonzero unless NOCACHE is set, and is
      	 * forced to zero for DELETE.
1142 	 */
1143 	state->docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
1144 	if (cnp->cn_nameiop == DELETE)
1145 		state->docache = 0;
1146 	state->rdonly = cnp->cn_flags & RDONLY;
1147 
1148 	/*
1149 	 * Keep going until we run out of path components.
1150 	 */
1151 	cnp->cn_nameptr = ndp->ni_pnbuf;
1152 
1153 	/* drop leading slashes (already used them to choose startdir) */
1154 	while (cnp->cn_nameptr[0] == '/') {
1155 		cnp->cn_nameptr++;
1156 		ndp->ni_pathlen--;
1157 	}
1158 	/* was it just "/"? */
1159 	if (cnp->cn_nameptr[0] == '\0') {
1160 		foundobj = searchdir;
1161 		searchdir = NULL;	/* the result is the start directory; no parent is reported */
1162 		cnp->cn_flags |= ISLASTCN;
1163 
1164 		/* bleh */
1165 		goto skiploop;
1166 	}
1167 
1168 	for (;;) {
1169 
1170 		/*
1171 		 * If the directory we're on is unmounted, bail out.
1172 		 * XXX: should this also check if it's unlinked?
1173 		 * XXX: yes it should... but how?
1174 		 */
1175 		if (searchdir->v_mount == NULL) {
1176 			vput(searchdir);
1177 			ndp->ni_dvp = NULL;
1178 			ndp->ni_vp = NULL;
1179 			return (ENOENT);
1180 		}
1181 
1182 		/*
1183 		 * Look up the next path component.
1184 		 * (currently, this may consume more than one)
1185 		 */
1186 
1187 		/* There should be no slashes here. */
1188 		KASSERT(cnp->cn_nameptr[0] != '/');
1189 
1190 		/* and we shouldn't have looped around if we were done */
1191 		KASSERT(cnp->cn_nameptr[0] != '\0');
1192 
1193 		error = lookup_parsepath(state);
1194 		if (error) {
1195 			vput(searchdir);
1196 			ndp->ni_dvp = NULL;
1197 			ndp->ni_vp = NULL;
1198 			state->attempt_retry = 1;
1199 			return (error);
1200 		}
1201 
1202 		error = lookup_once(state, searchdir, &searchdir, &foundobj);
1203 		if (error) {
1204 			vput(searchdir);
1205 			ndp->ni_dvp = NULL;
1206 			ndp->ni_vp = NULL;
1207 			/*
1208 			 * Note that if we're doing TRYEMULROOT we can
1209 			 * retry with the normal root. Where this is
1210 			 * currently set matches previous practice,
1211 			 * but the previous practice didn't make much
1212 			 * sense and somebody should sit down and
1213 			 * figure out which cases should cause retry
1214 			 * and which shouldn't. XXX.
1215 			 */
1216 			state->attempt_retry = 1;
1217 			return (error);
1218 		}
1219 
1220 		if (foundobj == NULL) {
1221 			/*
1222 			 * Success with no object returned means we're
1223 			 * creating something and it isn't already
1224 			 * there. Break out of the main loop now so
1225 			 * the code below doesn't have to test for
1226 			 * foundobj == NULL.
1227 			 */
1228 			break;
1229 		}
1230 
1231 		/*
1232 		 * Check for symbolic link. If we've reached one,
1233 		 * follow it, unless we aren't supposed to. Back up
1234 		 * over any slashes that we skipped, as we will need
1235 		 * them again.
1236 		 */
1237 		if (namei_atsymlink(state, foundobj)) {
1238 			ndp->ni_pathlen += state->slashes;
1239 			ndp->ni_next -= state->slashes;
1240 			if (neverfollow) {
1241 				error = EINVAL;
1242 			} else {
1243 				/*
1244 				 * dholland 20110410: if we're at a
1245 				 * union mount it might make sense to
1246 				 * use the top of the union stack here
1247 				 * rather than the layer we found the
1248 				 * symlink in. (FUTURE)
1249 				 */
1250 				error = namei_follow(state, inhibitmagic,
1251 						     searchdir, foundobj,
1252 						     &searchdir);
1253 			}
1254 			if (error) {
1255 				KASSERT(searchdir != foundobj);
1256 				vput(searchdir);
1257 				vput(foundobj);
1258 				ndp->ni_dvp = NULL;
1259 				ndp->ni_vp = NULL;
1260 				return error;
1261 			}
1262 			/* namei_follow unlocks it (ugh) so rele, not put */
1263 			vrele(foundobj);
1264 			foundobj = NULL;
1265 
1266 			/*
1267 			 * If we followed a symlink to `/' and there
1268 			 * are no more components after the symlink,
1269 			 * we're done with the loop and what we found
1270 			 * is the searchdir.
1271 			 */
1272 			if (cnp->cn_nameptr[0] == '\0') {
1273 				foundobj = searchdir;
1274 				searchdir = NULL;
1275 				cnp->cn_flags |= ISLASTCN;
1276 				break;
1277 			}
1278 
1279 			continue;
1280 		}
1281 
1282 		/*
1283 		 * Not a symbolic link.
1284 		 *
1285 		 * Check for directory, if the component was
1286 		 * followed by a series of slashes.
1287 		 */
1288 		if ((foundobj->v_type != VDIR) &&
1289 		    (cnp->cn_flags & REQUIREDIR)) {
1290 			if (searchdir == foundobj) {
1291 				vrele(searchdir);
1292 			} else {
1293 				vput(searchdir);
1294 			}
1295 			vput(foundobj);
1296 			ndp->ni_dvp = NULL;
1297 			ndp->ni_vp = NULL;
1298 			state->attempt_retry = 1;
1299 			return ENOTDIR;
1300 		}
1301 
1302 		/*
1303 		 * Stop if we've reached the last component.
1304 		 */
1305 		if (cnp->cn_flags & ISLASTCN) {
1306 			break;
1307 		}
1308 
1309 		/*
1310 		 * Continue with the next component.
      		 * The object just found becomes the directory to
      		 * search; when both are the same vnode only the extra
      		 * reference is dropped, otherwise the old directory is
      		 * released with vput().
1311 		 */
1312 		cnp->cn_nameptr = ndp->ni_next;
1313 		if (searchdir == foundobj) {
1314 			vrele(searchdir);
1315 		} else {
1316 			vput(searchdir);
1317 		}
1318 		searchdir = foundobj;
1319 		foundobj = NULL;
1320 	}
1321 
1322  skiploop:	/* entered by goto for an all-slashes path, or by leaving the loop */
1323 
1324 	if (foundobj != NULL) {
1325 		if (foundobj == ndp->ni_erootdir) {
1326 			/*
1327 			 * We are about to return the emulation root.
1328 			 * This isn't a good idea because code might
1329 			 * repeatedly lookup ".." until the file
1330 			 * matches that returned for "/" and loop
1331 			 * forever.  So convert it to the real root.
1332 			 */
1333 			if (searchdir != NULL) {
1334 				if (searchdir == foundobj)
1335 					vrele(searchdir);
1336 				else
1337 					vput(searchdir);
1338 				searchdir = NULL;
1339 			}
1340 			vput(foundobj);
1341 			foundobj = ndp->ni_rootdir;
1342 			vref(foundobj);
1343 			vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
1344 		}
1345 
1346 		/*
1347 		 * If the caller requested the parent node (i.e. it's
1348 		 * a CREATE, DELETE, or RENAME), and we don't have one
1349 		 * (because this is the root directory, or we crossed
1350 		 * a mount point), then we must fail.
1351 		 */
1352 		if (cnp->cn_nameiop != LOOKUP &&
1353 		    (searchdir == NULL ||
1354 		     searchdir->v_mount != foundobj->v_mount)) {
1355 			if (searchdir) {
1356 				vput(searchdir);
1357 			}
1358 			vput(foundobj);
1359 			foundobj = NULL;
1360 			ndp->ni_dvp = NULL;
1361 			ndp->ni_vp = NULL;
1362 			state->attempt_retry = 1;
1363 
1364 			switch (cnp->cn_nameiop) {
1365 			    case CREATE:
1366 				return EEXIST;
1367 			    case DELETE:
1368 			    case RENAME:
1369 				return EBUSY;
1370 			    default:
1371 				break;
1372 			}
1373 			panic("Invalid nameiop\n");	/* only reached if nameiop is none of CREATE/DELETE/RENAME */
1374 		}
1375 
1376 		/*
1377 		 * Disallow directory write attempts on read-only lookups.
1378 		 * Prefers EEXIST over EROFS for the CREATE case.
1379 		 */
1380 		if (state->rdonly &&
1381 		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
1382 			if (searchdir) {
1383 				if (foundobj != searchdir) {
1384 					vput(searchdir);
1385 				} else {
1386 					vrele(searchdir);
1387 				}
1388 				searchdir = NULL;
1389 			}
1390 			vput(foundobj);
1391 			foundobj = NULL;
1392 			ndp->ni_dvp = NULL;
1393 			ndp->ni_vp = NULL;
1394 			state->attempt_retry = 1;
1395 			return EROFS;
1396 		}
1397 		if ((cnp->cn_flags & LOCKLEAF) == 0) {
1398 			/*
1399 			 * Note: if LOCKPARENT but not LOCKLEAF is
1400 			 * set, and searchdir == foundobj, this code
1401 			 * necessarily unlocks the parent as well as
1402 			 * the leaf. That is, just because you specify
1403 			 * LOCKPARENT doesn't mean you necessarily get
1404 			 * a locked parent vnode. The code in
1405 			 * vfs_syscalls.c, and possibly elsewhere,
1406 			 * that uses this combination "knows" this, so
1407 			 * it can't be safely changed. Feh. XXX
1408 			 */
1409 			VOP_UNLOCK(foundobj);
1410 		}
1411 	}
1412 
1413 	/*
1414 	 * Done.
1415 	 */
1416 
1417 	/*
1418 	 * If LOCKPARENT is not set, the parent directory isn't returned.
1419 	 */
1420 	if ((cnp->cn_flags & LOCKPARENT) == 0 && searchdir != NULL) {
1421 		if (searchdir == foundobj) {
1422 			vrele(searchdir);
1423 		} else {
1424 			vput(searchdir);
1425 		}
1426 		searchdir = NULL;
1427 	}
1428 
1429 	ndp->ni_dvp = searchdir;
1430 	ndp->ni_vp = foundobj;
1431 	return 0;
1432 }
1433 
1434 /*
1435  * Do namei; wrapper layer that handles TRYEMULROOT.
1436  */
1437 static int
1438 namei_tryemulroot(struct namei_state *state,
1439 	 int neverfollow, int inhibitmagic, int isnfsd)
1440 {
1441 	int error;
1442 
1443 	struct nameidata *ndp = state->ndp;
1444 	struct componentname *cnp = state->cnp;
1445 	const char *savepath = NULL;
1446 
1447 	KASSERT(cnp == &ndp->ni_cnd);
1448 
1449 	if (cnp->cn_flags & TRYEMULROOT) {
1450 		savepath = pathbuf_stringcopy_get(ndp->ni_pathbuf);
1451 	}
1452 
1453     emul_retry:
1454 	state->attempt_retry = 0;
1455 
1456 	error = namei_oneroot(state, neverfollow, inhibitmagic, isnfsd);
1457 	if (error) {
1458 		/*
1459 		 * Once namei has started up, the existence of ni_erootdir
1460 		 * tells us whether we're working from an emulation root.
1461 		 * The TRYEMULROOT flag isn't necessarily authoritative.
1462 		 */
1463 		if (ndp->ni_erootdir != NULL && state->attempt_retry) {
1464 			/* Retry the whole thing using the normal root */
1465 			cnp->cn_flags &= ~TRYEMULROOT;
1466 			state->attempt_retry = 0;
1467 
1468 			/* kinda gross */
1469 			strcpy(ndp->ni_pathbuf->pb_path, savepath);
1470 			pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath);
1471 			savepath = NULL;
1472 
1473 			goto emul_retry;
1474 		}
1475 	}
1476 	if (savepath != NULL) {
1477 		pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath);
1478 	}
1479 	return error;
1480 }
1481 
1482 /*
1483  * External interface.
1484  */
1485 int
1486 namei(struct nameidata *ndp)
1487 {
1488 	struct namei_state state;
1489 	int error;
1490 
1491 	namei_init(&state, ndp);
1492 	error = namei_tryemulroot(&state,
1493 				  0/*!neverfollow*/, 0/*!inhibitmagic*/,
1494 				  0/*isnfsd*/);
1495 	namei_cleanup(&state);
1496 
1497 	if (error) {
1498 		/* make sure no stray refs leak out */
1499 		KASSERT(ndp->ni_dvp == NULL);
1500 		KASSERT(ndp->ni_vp == NULL);
1501 	}
1502 
1503 	return error;
1504 }
1505 
1506 ////////////////////////////////////////////////////////////
1507 
1508 /*
1509  * External interface used by nfsd. This is basically different from
1510  * namei only in that it has the ability to pass in the "current
1511  * directory", and uses an extra flag "neverfollow" for which there's
1512  * no physical flag defined in namei.h. (There used to be a cut&paste
1513  * copy of about half of namei in nfsd to allow these minor
1514  * adjustments to exist.)
1515  *
1516  * XXX: the namei interface should be adjusted so nfsd can just use
1517  * ordinary namei().
1518  */
1519 int
1520 lookup_for_nfsd(struct nameidata *ndp, struct vnode *forcecwd, int neverfollow)
1521 {
1522 	struct namei_state state;
1523 	int error;
1524 
1525 	KASSERT(ndp->ni_atdir == NULL);
1526 	ndp->ni_atdir = forcecwd;
1527 
1528 	namei_init(&state, ndp);
1529 	error = namei_tryemulroot(&state,
1530 				  neverfollow, 1/*inhibitmagic*/, 1/*isnfsd*/);
1531 	namei_cleanup(&state);
1532 
1533 	if (error) {
1534 		/* make sure no stray refs leak out */
1535 		KASSERT(ndp->ni_dvp == NULL);
1536 		KASSERT(ndp->ni_vp == NULL);
1537 	}
1538 
1539 	return error;
1540 }
1541 
1542 /*
1543  * A second external interface used by nfsd. This turns out to be a
1544  * single lookup used by the WebNFS code (ha!) to get "index.html" or
1545  * equivalent when asked for a directory. It should eventually evolve
1546  * into some kind of namei_once() call; for the time being it's kind
1547  * of a mess. XXX.
1548  *
1549  * dholland 20110109: I don't think it works, and I don't think it
1550  * worked before I started hacking and slashing either, and I doubt
1551  * anyone will ever notice.
1552  */
1553 
1554 /*
1555  * Internals. This calls lookup_once() after setting up the assorted
1556  * pieces of state the way they ought to be.
1557  */
1558 static int
1559 do_lookup_for_nfsd_index(struct namei_state *state)
1560 {
1561 	int error = 0;
1562 
1563 	struct componentname *cnp = state->cnp;
1564 	struct nameidata *ndp = state->ndp;
1565 	struct vnode *startdir;
1566 	struct vnode *foundobj;
1567 	const char *cp;			/* pointer into pathname argument */
1568 
1569 	KASSERT(cnp == &ndp->ni_cnd);
1570 
1571 	startdir = state->ndp->ni_atdir;
1572 
1573 	cnp->cn_nameptr = ndp->ni_pnbuf;
1574 	state->docache = 1;
1575 	state->rdonly = cnp->cn_flags & RDONLY;
1576 	ndp->ni_dvp = NULL;
1577 
1578 	cnp->cn_consume = 0;
1579 	cnp->cn_namelen = namei_getcomponent(cnp->cn_nameptr);
1580 	cp = cnp->cn_nameptr + cnp->cn_namelen;
1581 	KASSERT(cnp->cn_namelen <= KERNEL_NAME_MAX);
1582 	ndp->ni_pathlen -= cnp->cn_namelen;
1583 	ndp->ni_next = cp;
1584 	state->slashes = 0;
1585 	cnp->cn_flags &= ~REQUIREDIR;
1586 	cnp->cn_flags |= MAKEENTRY|ISLASTCN;
1587 
1588 	if (cnp->cn_namelen == 2 &&
1589 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
1590 		cnp->cn_flags |= ISDOTDOT;
1591 	else
1592 		cnp->cn_flags &= ~ISDOTDOT;
1593 
1594 	/*
1595 	 * Because lookup_once can change the startdir, we need our
1596 	 * own reference to it to avoid consuming the caller's.
1597 	 */
1598 	vref(startdir);
1599 	vn_lock(startdir, LK_EXCLUSIVE | LK_RETRY);
1600 	error = lookup_once(state, startdir, &startdir, &foundobj);
1601 	if (error == 0 && startdir == foundobj) {
1602 		vrele(startdir);
1603 	} else {
1604 		vput(startdir);
1605 	}
1606 	if (error) {
1607 		goto bad;
1608 	}
1609 	ndp->ni_vp = foundobj;
1610 
1611 	if (foundobj == NULL) {
1612 		return 0;
1613 	}
1614 
1615 	KASSERT((cnp->cn_flags & LOCKPARENT) == 0);
1616 	if ((cnp->cn_flags & LOCKLEAF) == 0) {
1617 		VOP_UNLOCK(foundobj);
1618 	}
1619 	return (0);
1620 
1621 bad:
1622 	ndp->ni_vp = NULL;
1623 	return (error);
1624 }
1625 
1626 /*
1627  * External interface. The partitioning between this function and the
1628  * above isn't very clear - the above function exists mostly so code
1629  * that uses "state->" can be shuffled around without having to change
1630  * it to "state.".
1631  */
1632 int
1633 lookup_for_nfsd_index(struct nameidata *ndp, struct vnode *startdir)
1634 {
1635 	struct namei_state state;
1636 	int error;
1637 
1638 	KASSERT(ndp->ni_atdir == NULL);
1639 	ndp->ni_atdir = startdir;
1640 
1641 	/*
1642 	 * Note: the name sent in here (is not|should not be) allowed
1643 	 * to contain a slash.
1644 	 */
1645 	if (strlen(ndp->ni_pathbuf->pb_path) > KERNEL_NAME_MAX) {
1646 		return ENAMETOOLONG;
1647 	}
1648 	if (strchr(ndp->ni_pathbuf->pb_path, '/')) {
1649 		return EINVAL;
1650 	}
1651 
1652 	ndp->ni_pathlen = strlen(ndp->ni_pathbuf->pb_path) + 1;
1653 	ndp->ni_pnbuf = NULL;
1654 	ndp->ni_cnd.cn_nameptr = NULL;
1655 
1656 	namei_init(&state, ndp);
1657 	error = do_lookup_for_nfsd_index(&state);
1658 	namei_cleanup(&state);
1659 
1660 	return error;
1661 }
1662 
1663 ////////////////////////////////////////////////////////////
1664 
1665 /*
1666  * Reacquire a path name component.
1667  * dvp is locked on entry and exit.
1668  * *vpp is locked on exit unless it's NULL.
1669  */
1670 int
1671 relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int dummy)
1672 {
1673 	int rdonly;			/* lookup read-only flag bit */
1674 	int error = 0;
1675 #ifdef DEBUG
1676 	size_t newlen;			/* DEBUG: check name len */
1677 	const char *cp;			/* DEBUG: check name ptr */
1678 #endif /* DEBUG */
1679 
1680 	(void)dummy;
1681 
1682 	/*
1683 	 * Setup: break out flag bits into variables.
1684 	 */
1685 	rdonly = cnp->cn_flags & RDONLY;
1686 
1687 	/*
1688 	 * Search a new directory.
1689 	 *
1690 	 * The cn_hash value is for use by vfs_cache.
1691 	 * The last component of the filename is left accessible via
1692 	 * cnp->cn_nameptr for callers that need the name. Callers needing
1693 	 * the name set the SAVENAME flag. When done, they assume
1694 	 * responsibility for freeing the pathname buffer.
1695 	 */
1696 #ifdef DEBUG
1697 #if 0
1698 	cp = NULL;
1699 	newhash = namei_hash(cnp->cn_nameptr, &cp);
1700 	if ((uint32_t)newhash != (uint32_t)cnp->cn_hash)
1701 		panic("relookup: bad hash");
1702 #endif
1703 	newlen = namei_getcomponent(cnp->cn_nameptr);
1704 	if (cnp->cn_namelen != newlen)
1705 		panic("relookup: bad len");
1706 	cp = cnp->cn_nameptr + cnp->cn_namelen;
1707 	while (*cp == '/')
1708 		cp++;
1709 	if (*cp != 0)
1710 		panic("relookup: not last component");
1711 #endif /* DEBUG */
1712 
1713 	/*
1714 	 * Check for degenerate name (e.g. / or "")
1715 	 * which is a way of talking about a directory,
1716 	 * e.g. like "/." or ".".
1717 	 */
1718 	if (cnp->cn_nameptr[0] == '\0')
1719 		panic("relookup: null name");
1720 
1721 	if (cnp->cn_flags & ISDOTDOT)
1722 		panic("relookup: lookup on dot-dot");
1723 
1724 	/*
1725 	 * We now have a segment name to search for, and a directory to search.
1726 	 */
1727 	*vpp = NULL;
1728 	cnp->cn_flags |= INRELOOKUP;
1729 	error = VOP_LOOKUP(dvp, vpp, cnp);
1730 	cnp->cn_flags &= ~INRELOOKUP;
1731 	if ((error) != 0) {
1732 #ifdef DIAGNOSTIC
1733 		if (*vpp != NULL)
1734 			panic("leaf `%s' should be empty", cnp->cn_nameptr);
1735 #endif
1736 		if (error != EJUSTRETURN)
1737 			goto bad;
1738 	}
1739 
1740 #ifdef DIAGNOSTIC
1741 	/*
1742 	 * Check for symbolic link
1743 	 */
1744 	if (*vpp && (*vpp)->v_type == VLNK && (cnp->cn_flags & FOLLOW))
1745 		panic("relookup: symlink found");
1746 #endif
1747 
1748 	/*
1749 	 * Check for read-only lookups.
1750 	 */
1751 	if (rdonly && cnp->cn_nameiop != LOOKUP) {
1752 		error = EROFS;
1753 		if (*vpp) {
1754 			vput(*vpp);
1755 		}
1756 		goto bad;
1757 	}
1758 	return (0);
1759 
1760 bad:
1761 	*vpp = NULL;
1762 	return (error);
1763 }
1764 
1765 /*
1766  * namei_simple - simple forms of namei.
1767  *
1768  * These are wrappers to allow the simple case callers of namei to be
1769  * left alone while everything else changes under them.
1770  */
1771 
1772 /* Flags */
1773 struct namei_simple_flags_type {
1774 	int dummy;
1775 };
1776 static const struct namei_simple_flags_type ns_nn, ns_nt, ns_fn, ns_ft;
1777 const namei_simple_flags_t NSM_NOFOLLOW_NOEMULROOT = &ns_nn;
1778 const namei_simple_flags_t NSM_NOFOLLOW_TRYEMULROOT = &ns_nt;
1779 const namei_simple_flags_t NSM_FOLLOW_NOEMULROOT = &ns_fn;
1780 const namei_simple_flags_t NSM_FOLLOW_TRYEMULROOT = &ns_ft;
1781 
1782 static
1783 int
1784 namei_simple_convert_flags(namei_simple_flags_t sflags)
1785 {
1786 	if (sflags == NSM_NOFOLLOW_NOEMULROOT)
1787 		return NOFOLLOW | 0;
1788 	if (sflags == NSM_NOFOLLOW_TRYEMULROOT)
1789 		return NOFOLLOW | TRYEMULROOT;
1790 	if (sflags == NSM_FOLLOW_NOEMULROOT)
1791 		return FOLLOW | 0;
1792 	if (sflags == NSM_FOLLOW_TRYEMULROOT)
1793 		return FOLLOW | TRYEMULROOT;
1794 	panic("namei_simple_convert_flags: bogus sflags\n");
1795 	return 0;
1796 }
1797 
1798 int
1799 namei_simple_kernel(const char *path, namei_simple_flags_t sflags,
1800 	struct vnode **vp_ret)
1801 {
1802 	return nameiat_simple_kernel(NULL, path, sflags, vp_ret);
1803 }
1804 
1805 int
1806 nameiat_simple_kernel(struct vnode *dvp, const char *path,
1807 	namei_simple_flags_t sflags, struct vnode **vp_ret)
1808 {
1809 	struct nameidata nd;
1810 	struct pathbuf *pb;
1811 	int err;
1812 
1813 	pb = pathbuf_create(path);
1814 	if (pb == NULL) {
1815 		return ENOMEM;
1816 	}
1817 
1818 	NDINIT(&nd,
1819 		LOOKUP,
1820 		namei_simple_convert_flags(sflags),
1821 		pb);
1822 
1823 	if (dvp != NULL)
1824 		NDAT(&nd, dvp);
1825 
1826 	err = namei(&nd);
1827 	if (err != 0) {
1828 		pathbuf_destroy(pb);
1829 		return err;
1830 	}
1831 	*vp_ret = nd.ni_vp;
1832 	pathbuf_destroy(pb);
1833 	return 0;
1834 }
1835 
1836 int
1837 namei_simple_user(const char *path, namei_simple_flags_t sflags,
1838 	struct vnode **vp_ret)
1839 {
1840 	return nameiat_simple_user(NULL, path, sflags, vp_ret);
1841 }
1842 
1843 int
1844 nameiat_simple_user(struct vnode *dvp, const char *path,
1845 	namei_simple_flags_t sflags, struct vnode **vp_ret)
1846 {
1847 	struct pathbuf *pb;
1848 	struct nameidata nd;
1849 	int err;
1850 
1851 	err = pathbuf_copyin(path, &pb);
1852 	if (err) {
1853 		return err;
1854 	}
1855 
1856 	NDINIT(&nd,
1857 		LOOKUP,
1858 		namei_simple_convert_flags(sflags),
1859 		pb);
1860 
1861 	if (dvp != NULL)
1862 		NDAT(&nd, dvp);
1863 
1864 	err = namei(&nd);
1865 	if (err != 0) {
1866 		pathbuf_destroy(pb);
1867 		return err;
1868 	}
1869 	*vp_ret = nd.ni_vp;
1870 	pathbuf_destroy(pb);
1871 	return 0;
1872 }
1873