xref: /netbsd-src/sys/kern/vfs_lookup.c (revision b7b7574d3bf8eeb51a1fa3977b59142ec6434a55)
1 /*	$NetBSD: vfs_lookup.c,v 1.201 2014/02/07 15:29:22 hannken Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * (c) UNIX System Laboratories, Inc.
7  * All or some portions of this file are derived from material licensed
8  * to the University of California by American Telephone and Telegraph
9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10  * the permission of UNIX System Laboratories, Inc.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)vfs_lookup.c	8.10 (Berkeley) 5/27/95
37  */
38 
39 #include <sys/cdefs.h>
40 __KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.201 2014/02/07 15:29:22 hannken Exp $");
41 
42 #include "opt_magiclinks.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/kernel.h>
47 #include <sys/syslimits.h>
48 #include <sys/time.h>
49 #include <sys/namei.h>
50 #include <sys/vnode.h>
51 #include <sys/mount.h>
52 #include <sys/errno.h>
53 #include <sys/filedesc.h>
54 #include <sys/hash.h>
55 #include <sys/proc.h>
56 #include <sys/syslog.h>
57 #include <sys/kauth.h>
58 #include <sys/ktrace.h>
59 #include <sys/dirent.h>
60 
61 #ifndef MAGICLINKS
62 #define MAGICLINKS 0
63 #endif
64 
65 int vfs_magiclinks = MAGICLINKS;
66 
67 __CTASSERT(MAXNAMLEN == NAME_MAX);
68 
69 /*
70  * Substitute replacement text for 'magic' strings in symlinks.
71  * Returns 0 if successful, and returns non-zero if an error
72  * occurs.  (Currently, the only possible error is running out
73  * of temporary pathname space.)
74  *
75  * Looks for "@<string>" and "@<string>/", where <string> is a
76  * recognized 'magic' string.  Replaces the "@<string>" with the
77  * appropriate replacement text.  (Note that in some cases the
78  * replacement text may have zero length.)
79  *
80  * This would have been table driven, but the variance in
81  * replacement strings (and replacement string lengths) made
82  * that impractical.
83  */
84 #define	VNL(x)							\
85 	(sizeof(x) - 1)
86 
87 #define	VO	'{'
88 #define	VC	'}'
89 
90 #define	MATCH(str)						\
91 	((termchar == '/' && i + VNL(str) == *len) ||		\
92 	 (i + VNL(str) < *len &&				\
93 	  cp[i + VNL(str)] == termchar)) &&			\
94 	!strncmp((str), &cp[i], VNL(str))
95 
96 #define	SUBSTITUTE(m, s, sl)					\
97 	if ((newlen + (sl)) >= MAXPATHLEN)			\
98 		return 1;					\
99 	i += VNL(m);						\
100 	if (termchar != '/')					\
101 		i++;						\
102 	(void)memcpy(&tmp[newlen], (s), (sl));			\
103 	newlen += (sl);						\
104 	change = 1;						\
105 	termchar = '/';
106 
107 static int
108 symlink_magic(struct proc *p, char *cp, size_t *len)
109 {
110 	char *tmp;
111 	size_t change, i, newlen, slen;
112 	char termchar = '/';
113 	char idtmp[11]; /* enough for 32 bit *unsigned* integer */
114 
115 
116 	tmp = PNBUF_GET();
117 	for (change = i = newlen = 0; i < *len; ) {
118 		if (cp[i] != '@') {
119 			tmp[newlen++] = cp[i++];
120 			continue;
121 		}
122 
123 		i++;
124 
125 		/* Check for @{var} syntax. */
126 		if (cp[i] == VO) {
127 			termchar = VC;
128 			i++;
129 		}
130 
131 		/*
132 		 * The following checks should be ordered according
133 		 * to frequency of use.
134 		 */
135 		if (MATCH("machine_arch")) {
136 			slen = VNL(MACHINE_ARCH);
137 			SUBSTITUTE("machine_arch", MACHINE_ARCH, slen);
138 		} else if (MATCH("machine")) {
139 			slen = VNL(MACHINE);
140 			SUBSTITUTE("machine", MACHINE, slen);
141 		} else if (MATCH("hostname")) {
142 			SUBSTITUTE("hostname", hostname, hostnamelen);
143 		} else if (MATCH("osrelease")) {
144 			slen = strlen(osrelease);
145 			SUBSTITUTE("osrelease", osrelease, slen);
146 		} else if (MATCH("emul")) {
147 			slen = strlen(p->p_emul->e_name);
148 			SUBSTITUTE("emul", p->p_emul->e_name, slen);
149 		} else if (MATCH("kernel_ident")) {
150 			slen = strlen(kernel_ident);
151 			SUBSTITUTE("kernel_ident", kernel_ident, slen);
152 		} else if (MATCH("domainname")) {
153 			SUBSTITUTE("domainname", domainname, domainnamelen);
154 		} else if (MATCH("ostype")) {
155 			slen = strlen(ostype);
156 			SUBSTITUTE("ostype", ostype, slen);
157 		} else if (MATCH("uid")) {
158 			slen = snprintf(idtmp, sizeof(idtmp), "%u",
159 			    kauth_cred_geteuid(kauth_cred_get()));
160 			SUBSTITUTE("uid", idtmp, slen);
161 		} else if (MATCH("ruid")) {
162 			slen = snprintf(idtmp, sizeof(idtmp), "%u",
163 			    kauth_cred_getuid(kauth_cred_get()));
164 			SUBSTITUTE("ruid", idtmp, slen);
165 		} else if (MATCH("gid")) {
166 			slen = snprintf(idtmp, sizeof(idtmp), "%u",
167 			    kauth_cred_getegid(kauth_cred_get()));
168 			SUBSTITUTE("gid", idtmp, slen);
169 		} else if (MATCH("rgid")) {
170 			slen = snprintf(idtmp, sizeof(idtmp), "%u",
171 			    kauth_cred_getgid(kauth_cred_get()));
172 			SUBSTITUTE("rgid", idtmp, slen);
173 		} else {
174 			tmp[newlen++] = '@';
175 			if (termchar == VC)
176 				tmp[newlen++] = VO;
177 		}
178 	}
179 
180 	if (change) {
181 		(void)memcpy(cp, tmp, newlen);
182 		*len = newlen;
183 	}
184 	PNBUF_PUT(tmp);
185 
186 	return 0;
187 }
188 
189 #undef VNL
190 #undef VO
191 #undef VC
192 #undef MATCH
193 #undef SUBSTITUTE
194 
195 ////////////////////////////////////////////////////////////
196 
197 /*
198  * Determine the namei hash (for the namecache) for name.
199  * If *ep != NULL, hash from name to ep-1.
200  * If *ep == NULL, hash from name until the first NUL or '/', and
201  * return the location of this termination character in *ep.
202  *
203  * This function returns an equivalent hash to the MI hash32_strn().
204  * The latter isn't used because in the *ep == NULL case, determining
205  * the length of the string to the first NUL or `/' and then calling
206  * hash32_strn() involves unnecessary double-handling of the data.
207  */
208 uint32_t
209 namei_hash(const char *name, const char **ep)
210 {
211 	uint32_t	hash;
212 
213 	hash = HASH32_STR_INIT;
214 	if (*ep != NULL) {
215 		for (; name < *ep; name++)
216 			hash = hash * 33 + *(const uint8_t *)name;
217 	} else {
218 		for (; *name != '\0' && *name != '/'; name++)
219 			hash = hash * 33 + *(const uint8_t *)name;
220 		*ep = name;
221 	}
222 	return (hash + (hash >> 5));
223 }
224 
/*
 * Return the length of the first path component of NAME, i.e. the
 * number of characters before the first '/' or the terminating NUL.
 * The result is 0 for an empty string or one that begins with '/'.
 */
static size_t
namei_getcomponent(const char *name)
{
	const char *end;

	for (end = name; *end != '/' && *end != '\0'; end++)
		continue;
	return (size_t)(end - name);
}
240 
241 ////////////////////////////////////////////////////////////
242 
/*
 * Sealed abstraction for pathnames.
 *
 * System-call-layer code that is going to call namei should first
 * create a pathbuf and adjust all the bells and whistles on it as
 * needed by context.
 */

struct pathbuf {
	char *pb_path;			/* path string; becomes namei's ni_pnbuf */
	char *pb_pathcopy;		/* saved copy of pb_path, NULL when unused */
	unsigned int pb_pathcopyuses;	/* outstanding gets of pb_pathcopy */
};
256 
257 static struct pathbuf *
258 pathbuf_create_raw(void)
259 {
260 	struct pathbuf *pb;
261 
262 	pb = kmem_alloc(sizeof(*pb), KM_SLEEP);
263 	if (pb == NULL) {
264 		return NULL;
265 	}
266 	pb->pb_path = PNBUF_GET();
267 	if (pb->pb_path == NULL) {
268 		kmem_free(pb, sizeof(*pb));
269 		return NULL;
270 	}
271 	pb->pb_pathcopy = NULL;
272 	pb->pb_pathcopyuses = 0;
273 	return pb;
274 }
275 
276 void
277 pathbuf_destroy(struct pathbuf *pb)
278 {
279 	KASSERT(pb->pb_pathcopyuses == 0);
280 	KASSERT(pb->pb_pathcopy == NULL);
281 	PNBUF_PUT(pb->pb_path);
282 	kmem_free(pb, sizeof(*pb));
283 }
284 
285 struct pathbuf *
286 pathbuf_assimilate(char *pnbuf)
287 {
288 	struct pathbuf *pb;
289 
290 	pb = kmem_alloc(sizeof(*pb), KM_SLEEP);
291 	if (pb == NULL) {
292 		return NULL;
293 	}
294 	pb->pb_path = pnbuf;
295 	pb->pb_pathcopy = NULL;
296 	pb->pb_pathcopyuses = 0;
297 	return pb;
298 }
299 
300 struct pathbuf *
301 pathbuf_create(const char *path)
302 {
303 	struct pathbuf *pb;
304 	int error;
305 
306 	pb = pathbuf_create_raw();
307 	if (pb == NULL) {
308 		return NULL;
309 	}
310 	error = copystr(path, pb->pb_path, PATH_MAX, NULL);
311 	if (error != 0) {
312 		KASSERT(!"kernel path too long in pathbuf_create");
313 		/* make sure it's null-terminated, just in case */
314 		pb->pb_path[PATH_MAX-1] = '\0';
315 	}
316 	return pb;
317 }
318 
319 int
320 pathbuf_copyin(const char *userpath, struct pathbuf **ret)
321 {
322 	struct pathbuf *pb;
323 	int error;
324 
325 	pb = pathbuf_create_raw();
326 	if (pb == NULL) {
327 		return ENOMEM;
328 	}
329 	error = copyinstr(userpath, pb->pb_path, PATH_MAX, NULL);
330 	if (error) {
331 		pathbuf_destroy(pb);
332 		return error;
333 	}
334 	*ret = pb;
335 	return 0;
336 }
337 
338 /*
339  * XXX should not exist:
340  *   1. whether a pointer is kernel or user should be statically checkable.
341  *   2. copyin should be handled by the upper part of the syscall layer,
342  *      not in here.
343  */
344 int
345 pathbuf_maybe_copyin(const char *path, enum uio_seg seg, struct pathbuf **ret)
346 {
347 	if (seg == UIO_USERSPACE) {
348 		return pathbuf_copyin(path, ret);
349 	} else {
350 		*ret = pathbuf_create(path);
351 		if (*ret == NULL) {
352 			return ENOMEM;
353 		}
354 		return 0;
355 	}
356 }
357 
358 /*
359  * Get a copy of the path buffer as it currently exists. If this is
360  * called after namei starts the results may be arbitrary.
361  */
362 void
363 pathbuf_copystring(const struct pathbuf *pb, char *buf, size_t maxlen)
364 {
365 	strlcpy(buf, pb->pb_path, maxlen);
366 }
367 
368 /*
369  * These two functions allow access to a saved copy of the original
370  * path string. The first copy should be gotten before namei is
371  * called. Each copy that is gotten should be put back.
372  */
373 
374 const char *
375 pathbuf_stringcopy_get(struct pathbuf *pb)
376 {
377 	if (pb->pb_pathcopyuses == 0) {
378 		pb->pb_pathcopy = PNBUF_GET();
379 		strcpy(pb->pb_pathcopy, pb->pb_path);
380 	}
381 	pb->pb_pathcopyuses++;
382 	return pb->pb_pathcopy;
383 }
384 
385 void
386 pathbuf_stringcopy_put(struct pathbuf *pb, const char *str)
387 {
388 	KASSERT(str == pb->pb_pathcopy);
389 	KASSERT(pb->pb_pathcopyuses > 0);
390 	pb->pb_pathcopyuses--;
391 	if (pb->pb_pathcopyuses == 0) {
392 		PNBUF_PUT(pb->pb_pathcopy);
393 		pb->pb_pathcopy = NULL;
394 	}
395 }
396 
397 
398 ////////////////////////////////////////////////////////////
399 
400 /*
401  * namei: convert a pathname into a pointer to a (maybe-locked) vnode,
402  * and maybe also its parent directory vnode, and assorted other guff.
403  * See namei(9) for the interface documentation.
404  *
405  *
406  * The FOLLOW flag is set when symbolic links are to be followed
407  * when they occur at the end of the name translation process.
408  * Symbolic links are always followed for all other pathname
409  * components other than the last.
410  *
411  * The segflg defines whether the name is to be copied from user
412  * space or kernel space.
413  *
414  * Overall outline of namei:
415  *
416  *	copy in name
417  *	get starting directory
418  *	while (!done && !error) {
419  *		call lookup to search path.
420  *		if symbolic link, massage name in buffer and continue
421  *	}
422  */
423 
424 /*
425  * Search a pathname.
426  * This is a very central and rather complicated routine.
427  *
428  * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
429  * The starting directory is passed in. The pathname is descended
430  * until done, or a symbolic link is encountered. The variable ni_more
431  * is clear if the path is completed; it is set to one if a symbolic
432  * link needing interpretation is encountered.
433  *
434  * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
435  * whether the name is to be looked up, created, renamed, or deleted.
436  * When CREATE, RENAME, or DELETE is specified, information usable in
437  * creating, renaming, or deleting a directory entry may be calculated.
438  * If flag has LOCKPARENT or'ed into it, the parent directory is returned
439  * locked.  Otherwise the parent directory is not returned. If the target
440  * of the pathname exists and LOCKLEAF is or'ed into the flag the target
441  * is returned locked, otherwise it is returned unlocked.  When creating
442  * or renaming and LOCKPARENT is specified, the target may not be ".".
443  * When deleting and LOCKPARENT is specified, the target may be ".".
444  *
445  * Overall outline of lookup:
446  *
447  * dirloop:
448  *	identify next component of name at ndp->ni_ptr
449  *	handle degenerate case where name is null string
450  *	if .. and crossing mount points and on mounted filesys, find parent
451  *	call VOP_LOOKUP routine for next component name
452  *	    directory vnode returned in ni_dvp, locked.
453  *	    component vnode returned in ni_vp (if it exists), locked.
454  *	if result vnode is mounted on and crossing mount points,
455  *	    find mounted on vnode
456  *	if more components of name, do next level at dirloop
457  *	return the answer in ni_vp, locked if LOCKLEAF set
458  *	    if LOCKPARENT set, return locked parent in ni_dvp
459  */
460 
461 
/*
 * Internal state for a namei operation.
 *
 * cnp is always equal to &ndp->ni_cnd.
 */
struct namei_state {
	struct nameidata *ndp;		/* the lookup request being processed */
	struct componentname *cnp;	/* shortcut for &ndp->ni_cnd */

	int docache;			/* == 0 do not cache last component */
	int rdonly;			/* lookup read-only flag bit */
	int slashes;			/* slashes following the current component */

	unsigned int attempt_retry : 1;	/* true if error allows emul retry */
};
477 
478 
479 /*
480  * Initialize the namei working state.
481  */
482 static void
483 namei_init(struct namei_state *state, struct nameidata *ndp)
484 {
485 	state->ndp = ndp;
486 	state->cnp = &ndp->ni_cnd;
487 	KASSERT((state->cnp->cn_flags & INRELOOKUP) == 0);
488 
489 	state->docache = 0;
490 	state->rdonly = 0;
491 	state->slashes = 0;
492 
493 #ifdef DIAGNOSTIC
494 	if (!state->cnp->cn_cred)
495 		panic("namei: bad cred/proc");
496 	if (state->cnp->cn_nameiop & (~OPMASK))
497 		panic("namei: nameiop contaminated with flags");
498 	if (state->cnp->cn_flags & OPMASK)
499 		panic("namei: flags contaminated with nameiops");
500 #endif
501 
502 	/*
503 	 * The buffer for name translation shall be the one inside the
504 	 * pathbuf.
505 	 */
506 	state->ndp->ni_pnbuf = state->ndp->ni_pathbuf->pb_path;
507 }
508 
509 /*
510  * Clean up the working namei state, leaving things ready for return
511  * from namei.
512  */
513 static void
514 namei_cleanup(struct namei_state *state)
515 {
516 	KASSERT(state->cnp == &state->ndp->ni_cnd);
517 
518 	/* nothing for now */
519 	(void)state;
520 }
521 
522 //////////////////////////////
523 
524 /*
525  * Get the directory context.
526  * Initializes the rootdir and erootdir state and returns a reference
527  * to the starting dir.
528  */
529 static struct vnode *
530 namei_getstartdir(struct namei_state *state)
531 {
532 	struct nameidata *ndp = state->ndp;
533 	struct componentname *cnp = state->cnp;
534 	struct cwdinfo *cwdi;		/* pointer to cwd state */
535 	struct lwp *self = curlwp;	/* thread doing namei() */
536 	struct vnode *rootdir, *erootdir, *curdir, *startdir;
537 
538 	cwdi = self->l_proc->p_cwdi;
539 	rw_enter(&cwdi->cwdi_lock, RW_READER);
540 
541 	/* root dir */
542 	if (cwdi->cwdi_rdir == NULL || (cnp->cn_flags & NOCHROOT)) {
543 		rootdir = rootvnode;
544 	} else {
545 		rootdir = cwdi->cwdi_rdir;
546 	}
547 
548 	/* emulation root dir, if any */
549 	if ((cnp->cn_flags & TRYEMULROOT) == 0) {
550 		/* if we don't want it, don't fetch it */
551 		erootdir = NULL;
552 	} else if (cnp->cn_flags & EMULROOTSET) {
553 		/* explicitly set emulroot; "/../" doesn't override this */
554 		erootdir = ndp->ni_erootdir;
555 	} else if (!strncmp(ndp->ni_pnbuf, "/../", 4)) {
556 		/* explicit reference to real rootdir */
557 		erootdir = NULL;
558 	} else {
559 		/* may be null */
560 		erootdir = cwdi->cwdi_edir;
561 	}
562 
563 	/* current dir */
564 	curdir = cwdi->cwdi_cdir;
565 
566 	if (ndp->ni_pnbuf[0] != '/') {
567 		if (ndp->ni_atdir != NULL) {
568 			startdir = ndp->ni_atdir;
569 		} else {
570 			startdir = curdir;
571 		}
572 		erootdir = NULL;
573 	} else if (cnp->cn_flags & TRYEMULROOT && erootdir != NULL) {
574 		startdir = erootdir;
575 	} else {
576 		startdir = rootdir;
577 		erootdir = NULL;
578 	}
579 
580 	state->ndp->ni_rootdir = rootdir;
581 	state->ndp->ni_erootdir = erootdir;
582 
583 	/*
584 	 * Get a reference to the start dir so we can safely unlock cwdi.
585 	 *
586 	 * XXX: should we hold references to rootdir and erootdir while
587 	 * we're running? What happens if a multithreaded process chroots
588 	 * during namei?
589 	 */
590 	vref(startdir);
591 
592 	rw_exit(&cwdi->cwdi_lock);
593 	return startdir;
594 }
595 
596 /*
597  * Get the directory context for the nfsd case, in parallel to
598  * getstartdir. Initializes the rootdir and erootdir state and
599  * returns a reference to the passed-in starting dir.
600  */
601 static struct vnode *
602 namei_getstartdir_for_nfsd(struct namei_state *state)
603 {
604 	KASSERT(state->ndp->ni_atdir != NULL);
605 
606 	/* always use the real root, and never set an emulation root */
607 	state->ndp->ni_rootdir = rootvnode;
608 	state->ndp->ni_erootdir = NULL;
609 
610 	vref(state->ndp->ni_atdir);
611 	return state->ndp->ni_atdir;
612 }
613 
614 
615 /*
616  * Ktrace the namei operation.
617  */
618 static void
619 namei_ktrace(struct namei_state *state)
620 {
621 	struct nameidata *ndp = state->ndp;
622 	struct componentname *cnp = state->cnp;
623 	struct lwp *self = curlwp;	/* thread doing namei() */
624 	const char *emul_path;
625 
626 	if (ktrpoint(KTR_NAMEI)) {
627 		if (ndp->ni_erootdir != NULL) {
628 			/*
629 			 * To make any sense, the trace entry need to have the
630 			 * text of the emulation path prepended.
631 			 * Usually we can get this from the current process,
632 			 * but when called from emul_find_interp() it is only
633 			 * in the exec_package - so we get it passed in ni_next
634 			 * (this is a hack).
635 			 */
636 			if (cnp->cn_flags & EMULROOTSET)
637 				emul_path = ndp->ni_next;
638 			else
639 				emul_path = self->l_proc->p_emul->e_path;
640 			ktrnamei2(emul_path, strlen(emul_path),
641 			    ndp->ni_pnbuf, ndp->ni_pathlen);
642 		} else
643 			ktrnamei(ndp->ni_pnbuf, ndp->ni_pathlen);
644 	}
645 }
646 
647 /*
648  * Start up namei. Find the root dir and cwd, establish the starting
649  * directory for lookup, and lock it. Also calls ktrace when
650  * appropriate.
651  */
652 static int
653 namei_start(struct namei_state *state, int isnfsd,
654 	    struct vnode **startdir_ret)
655 {
656 	struct nameidata *ndp = state->ndp;
657 	struct vnode *startdir;
658 
659 	/* length includes null terminator (was originally from copyinstr) */
660 	ndp->ni_pathlen = strlen(ndp->ni_pnbuf) + 1;
661 
662 	/*
663 	 * POSIX.1 requirement: "" is not a valid file name.
664 	 */
665 	if (ndp->ni_pathlen == 1) {
666 		return ENOENT;
667 	}
668 
669 	ndp->ni_loopcnt = 0;
670 
671 	/* Get starting directory, set up root, and ktrace. */
672 	if (isnfsd) {
673 		startdir = namei_getstartdir_for_nfsd(state);
674 		/* no ktrace */
675 	} else {
676 		startdir = namei_getstartdir(state);
677 		namei_ktrace(state);
678 	}
679 
680 	/* NDAT may feed us with a non directory namei_getstartdir */
681 	if (startdir->v_type != VDIR)
682 		return ENOTDIR;
683 
684 	vn_lock(startdir, LK_EXCLUSIVE | LK_RETRY);
685 
686 	*startdir_ret = startdir;
687 	return 0;
688 }
689 
690 /*
691  * Check for being at a symlink that we're going to follow.
692  */
693 static inline int
694 namei_atsymlink(struct namei_state *state, struct vnode *foundobj)
695 {
696 	return (foundobj->v_type == VLNK) &&
697 		(state->cnp->cn_flags & (FOLLOW|REQUIREDIR));
698 }
699 
700 /*
701  * Follow a symlink.
702  *
703  * Updates searchdir. inhibitmagic causes magic symlinks to not be
704  * interpreted; this is used by nfsd.
705  *
706  * Unlocks foundobj on success (ugh)
707  */
708 static inline int
709 namei_follow(struct namei_state *state, int inhibitmagic,
710 	     struct vnode *searchdir, struct vnode *foundobj,
711 	     struct vnode **newsearchdir_ret)
712 {
713 	struct nameidata *ndp = state->ndp;
714 	struct componentname *cnp = state->cnp;
715 
716 	struct lwp *self = curlwp;	/* thread doing namei() */
717 	struct iovec aiov;		/* uio for reading symbolic links */
718 	struct uio auio;
719 	char *cp;			/* pointer into pathname argument */
720 	size_t linklen;
721 	int error;
722 
723 	KASSERT(VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE);
724 	KASSERT(VOP_ISLOCKED(foundobj) == LK_EXCLUSIVE);
725 	if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
726 		return ELOOP;
727 	}
728 	if (foundobj->v_mount->mnt_flag & MNT_SYMPERM) {
729 		error = VOP_ACCESS(foundobj, VEXEC, cnp->cn_cred);
730 		if (error != 0)
731 			return error;
732 	}
733 
734 	/* FUTURE: fix this to not use a second buffer */
735 	cp = PNBUF_GET();
736 	aiov.iov_base = cp;
737 	aiov.iov_len = MAXPATHLEN;
738 	auio.uio_iov = &aiov;
739 	auio.uio_iovcnt = 1;
740 	auio.uio_offset = 0;
741 	auio.uio_rw = UIO_READ;
742 	auio.uio_resid = MAXPATHLEN;
743 	UIO_SETUP_SYSSPACE(&auio);
744 	error = VOP_READLINK(foundobj, &auio, cnp->cn_cred);
745 	if (error) {
746 		PNBUF_PUT(cp);
747 		return error;
748 	}
749 	linklen = MAXPATHLEN - auio.uio_resid;
750 	if (linklen == 0) {
751 		PNBUF_PUT(cp);
752 		return ENOENT;
753 	}
754 
755 	/*
756 	 * Do symlink substitution, if appropriate, and
757 	 * check length for potential overflow.
758 	 *
759 	 * Inhibit symlink substitution for nfsd.
760 	 * XXX: This is how it was before; is that a bug or a feature?
761 	 */
762 	if ((!inhibitmagic && vfs_magiclinks &&
763 	     symlink_magic(self->l_proc, cp, &linklen)) ||
764 	    (linklen + ndp->ni_pathlen >= MAXPATHLEN)) {
765 		PNBUF_PUT(cp);
766 		return ENAMETOOLONG;
767 	}
768 	if (ndp->ni_pathlen > 1) {
769 		/* includes a null-terminator */
770 		memcpy(cp + linklen, ndp->ni_next, ndp->ni_pathlen);
771 	} else {
772 		cp[linklen] = '\0';
773 	}
774 	ndp->ni_pathlen += linklen;
775 	memcpy(ndp->ni_pnbuf, cp, ndp->ni_pathlen);
776 	PNBUF_PUT(cp);
777 
778 	/* we're now starting from the beginning of the buffer again */
779 	cnp->cn_nameptr = ndp->ni_pnbuf;
780 
781 	/* must unlock this before relocking searchdir */
782 	VOP_UNLOCK(foundobj);
783 
784 	/*
785 	 * Check if root directory should replace current directory.
786 	 */
787 	if (ndp->ni_pnbuf[0] == '/') {
788 		vput(searchdir);
789 		/* Keep absolute symbolic links inside emulation root */
790 		searchdir = ndp->ni_erootdir;
791 		if (searchdir == NULL ||
792 		    (ndp->ni_pnbuf[1] == '.'
793 		     && ndp->ni_pnbuf[2] == '.'
794 		     && ndp->ni_pnbuf[3] == '/')) {
795 			ndp->ni_erootdir = NULL;
796 			searchdir = ndp->ni_rootdir;
797 		}
798 		vref(searchdir);
799 		vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
800 		while (cnp->cn_nameptr[0] == '/') {
801 			cnp->cn_nameptr++;
802 			ndp->ni_pathlen--;
803 		}
804 	}
805 
806 	*newsearchdir_ret = searchdir;
807 	KASSERT(VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE);
808 	return 0;
809 }
810 
811 //////////////////////////////
812 
813 /*
814  * Inspect the leading path component and update the state accordingly.
815  */
816 static int
817 lookup_parsepath(struct namei_state *state)
818 {
819 	const char *cp;			/* pointer into pathname argument */
820 
821 	struct componentname *cnp = state->cnp;
822 	struct nameidata *ndp = state->ndp;
823 
824 	KASSERT(cnp == &ndp->ni_cnd);
825 
826 	/*
827 	 * Search a new directory.
828 	 *
829 	 * The last component of the filename is left accessible via
830 	 * cnp->cn_nameptr for callers that need the name. Callers needing
831 	 * the name set the SAVENAME flag. When done, they assume
832 	 * responsibility for freeing the pathname buffer.
833 	 *
834 	 * At this point, our only vnode state is that the search dir
835 	 * is held and locked.
836 	 */
837 	cnp->cn_consume = 0;
838 	cnp->cn_namelen = namei_getcomponent(cnp->cn_nameptr);
839 	cp = cnp->cn_nameptr + cnp->cn_namelen;
840 	if (cnp->cn_namelen > KERNEL_NAME_MAX) {
841 		return ENAMETOOLONG;
842 	}
843 #ifdef NAMEI_DIAGNOSTIC
844 	{ char c = *cp;
845 	*(char *)cp = '\0';
846 	printf("{%s}: ", cnp->cn_nameptr);
847 	*(char *)cp = c; }
848 #endif /* NAMEI_DIAGNOSTIC */
849 	ndp->ni_pathlen -= cnp->cn_namelen;
850 	ndp->ni_next = cp;
851 	/*
852 	 * If this component is followed by a slash, then move the pointer to
853 	 * the next component forward, and remember that this component must be
854 	 * a directory.
855 	 */
856 	if (*cp == '/') {
857 		do {
858 			cp++;
859 		} while (*cp == '/');
860 		state->slashes = cp - ndp->ni_next;
861 		ndp->ni_pathlen -= state->slashes;
862 		ndp->ni_next = cp;
863 		cnp->cn_flags |= REQUIREDIR;
864 	} else {
865 		state->slashes = 0;
866 		cnp->cn_flags &= ~REQUIREDIR;
867 	}
868 	/*
869 	 * We do special processing on the last component, whether or not it's
870 	 * a directory.  Cache all intervening lookups, but not the final one.
871 	 */
872 	if (*cp == '\0') {
873 		if (state->docache)
874 			cnp->cn_flags |= MAKEENTRY;
875 		else
876 			cnp->cn_flags &= ~MAKEENTRY;
877 		cnp->cn_flags |= ISLASTCN;
878 	} else {
879 		cnp->cn_flags |= MAKEENTRY;
880 		cnp->cn_flags &= ~ISLASTCN;
881 	}
882 	if (cnp->cn_namelen == 2 &&
883 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
884 		cnp->cn_flags |= ISDOTDOT;
885 	else
886 		cnp->cn_flags &= ~ISDOTDOT;
887 
888 	return 0;
889 }
890 
891 /*
892  * Call VOP_LOOKUP for a single lookup; return a new search directory
893  * (used when crossing mountpoints up or searching union mounts down) and
894  * the found object, which for create operations may be NULL on success.
895  */
896 static int
897 lookup_once(struct namei_state *state,
898 	    struct vnode *searchdir,
899 	    struct vnode **newsearchdir_ret,
900 	    struct vnode **foundobj_ret)
901 {
902 	struct vnode *tmpvn;		/* scratch vnode */
903 	struct vnode *foundobj;		/* result */
904 	struct mount *mp;		/* mount table entry */
905 	struct lwp *l = curlwp;
906 	int error;
907 
908 	struct componentname *cnp = state->cnp;
909 	struct nameidata *ndp = state->ndp;
910 
911 	KASSERT(cnp == &ndp->ni_cnd);
912 	KASSERT(VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE);
913 	*newsearchdir_ret = searchdir;
914 
915 	/*
916 	 * Handle "..": two special cases.
917 	 * 1. If at root directory (e.g. after chroot)
918 	 *    or at absolute root directory
919 	 *    then ignore it so can't get out.
920 	 * 1a. If at the root of the emulation filesystem go to the real
921 	 *    root. So "/../<path>" is always absolute.
922 	 * 1b. If we have somehow gotten out of a jail, warn
923 	 *    and also ignore it so we can't get farther out.
924 	 * 2. If this vnode is the root of a mounted
925 	 *    filesystem, then replace it with the
926 	 *    vnode which was mounted on so we take the
927 	 *    .. in the other file system.
928 	 */
929 	if (cnp->cn_flags & ISDOTDOT) {
930 		struct proc *p = l->l_proc;
931 
932 		for (;;) {
933 			if (searchdir == ndp->ni_rootdir ||
934 			    searchdir == rootvnode) {
935 				foundobj = searchdir;
936 				vref(foundobj);
937 				*foundobj_ret = foundobj;
938 				error = 0;
939 				goto done;
940 			}
941 			if (ndp->ni_rootdir != rootvnode) {
942 				int retval;
943 
944 				VOP_UNLOCK(searchdir);
945 				retval = vn_isunder(searchdir, ndp->ni_rootdir, l);
946 				vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
947 				if (!retval) {
948 				    /* Oops! We got out of jail! */
949 				    log(LOG_WARNING,
950 					"chrooted pid %d uid %d (%s) "
951 					"detected outside of its chroot\n",
952 					p->p_pid, kauth_cred_geteuid(l->l_cred),
953 					p->p_comm);
954 				    /* Put us at the jail root. */
955 				    vput(searchdir);
956 				    searchdir = NULL;
957 				    foundobj = ndp->ni_rootdir;
958 				    vref(foundobj);
959 				    vref(foundobj);
960 				    vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
961 				    *newsearchdir_ret = foundobj;
962 				    *foundobj_ret = foundobj;
963 				    error = 0;
964 				    goto done;
965 				}
966 			}
967 			if ((searchdir->v_vflag & VV_ROOT) == 0 ||
968 			    (cnp->cn_flags & NOCROSSMOUNT))
969 				break;
970 			tmpvn = searchdir;
971 			searchdir = searchdir->v_mount->mnt_vnodecovered;
972 			vref(searchdir);
973 			vput(tmpvn);
974 			vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
975 			*newsearchdir_ret = searchdir;
976 		}
977 	}
978 
979 	/*
980 	 * We now have a segment name to search for, and a directory to search.
981 	 * Our vnode state here is that "searchdir" is held and locked.
982 	 */
983 unionlookup:
984 	foundobj = NULL;
985 	error = VOP_LOOKUP(searchdir, &foundobj, cnp);
986 
987 	if (error != 0) {
988 #ifdef DIAGNOSTIC
989 		if (foundobj != NULL)
990 			panic("leaf `%s' should be empty", cnp->cn_nameptr);
991 #endif /* DIAGNOSTIC */
992 #ifdef NAMEI_DIAGNOSTIC
993 		printf("not found\n");
994 #endif /* NAMEI_DIAGNOSTIC */
995 		if ((error == ENOENT) &&
996 		    (searchdir->v_vflag & VV_ROOT) &&
997 		    (searchdir->v_mount->mnt_flag & MNT_UNION)) {
998 			tmpvn = searchdir;
999 			searchdir = searchdir->v_mount->mnt_vnodecovered;
1000 			vref(searchdir);
1001 			vput(tmpvn);
1002 			vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
1003 			*newsearchdir_ret = searchdir;
1004 			goto unionlookup;
1005 		}
1006 
1007 		if (error != EJUSTRETURN)
1008 			goto done;
1009 
1010 		/*
1011 		 * If this was not the last component, or there were trailing
1012 		 * slashes, and we are not going to create a directory,
1013 		 * then the name must exist.
1014 		 */
1015 		if ((cnp->cn_flags & (REQUIREDIR | CREATEDIR)) == REQUIREDIR) {
1016 			error = ENOENT;
1017 			goto done;
1018 		}
1019 
1020 		/*
1021 		 * If creating and at end of pathname, then can consider
1022 		 * allowing file to be created.
1023 		 */
1024 		if (state->rdonly) {
1025 			error = EROFS;
1026 			goto done;
1027 		}
1028 
1029 		/*
1030 		 * We return success and a NULL foundobj to indicate
1031 		 * that the entry doesn't currently exist, leaving a
1032 		 * pointer to the (normally, locked) directory vnode
1033 		 * as searchdir.
1034 		 */
1035 		*foundobj_ret = NULL;
1036 		error = 0;
1037 		goto done;
1038 	}
1039 #ifdef NAMEI_DIAGNOSTIC
1040 	printf("found\n");
1041 #endif /* NAMEI_DIAGNOSTIC */
1042 
1043 	/*
1044 	 * Take into account any additional components consumed by the
1045 	 * underlying filesystem.  This will include any trailing slashes after
1046 	 * the last component consumed.
1047 	 */
1048 	if (cnp->cn_consume > 0) {
1049 		ndp->ni_pathlen -= cnp->cn_consume - state->slashes;
1050 		ndp->ni_next += cnp->cn_consume - state->slashes;
1051 		cnp->cn_consume = 0;
1052 		if (ndp->ni_next[0] == '\0')
1053 			cnp->cn_flags |= ISLASTCN;
1054 	}
1055 
1056 	/*
1057 	 * "searchdir" is locked and held, "foundobj" is held,
1058 	 * they may be the same vnode.
1059 	 */
1060 	if (searchdir != foundobj) {
1061 		if (cnp->cn_flags & ISDOTDOT)
1062 			VOP_UNLOCK(searchdir);
1063 		error = vn_lock(foundobj, LK_EXCLUSIVE);
1064 		if (cnp->cn_flags & ISDOTDOT)
1065 			vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
1066 		if (error != 0) {
1067 			vrele(foundobj);
1068 			goto done;
1069 		}
1070 	}
1071 
1072 	/*
1073 	 * Check to see if the vnode has been mounted on;
1074 	 * if so find the root of the mounted file system.
1075 	 */
1076 	while (foundobj->v_type == VDIR &&
1077 	       (mp = foundobj->v_mountedhere) != NULL &&
1078 	       (cnp->cn_flags & NOCROSSMOUNT) == 0) {
1079 		error = vfs_busy(mp, NULL);
1080 		if (error != 0) {
1081 			if (searchdir != foundobj) {
1082 				vput(foundobj);
1083 			} else {
1084 				vrele(foundobj);
1085 			}
1086 			goto done;
1087 		}
1088 		if (searchdir != foundobj) {
1089 			VOP_UNLOCK(searchdir);
1090 		}
1091 		vput(foundobj);
1092 		error = VFS_ROOT(mp, &foundobj);
1093 		vfs_unbusy(mp, false, NULL);
1094 		if (error) {
1095 			vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
1096 			goto done;
1097 		}
1098 		/*
1099 		 * avoid locking vnodes from two filesystems because it's
1100 		 * prone to deadlock.  eg. when using puffs.
1101 		 * also, it isn't a good idea to propagate slowness of a
1102 		 * filesystem up to the root directory.
1103 		 * for now, only handle the common case.  (ie. foundobj is VDIR)
1104 		 */
1105 		if (foundobj->v_type == VDIR) {
1106 			vrele(searchdir);
1107 			*newsearchdir_ret = searchdir = foundobj;
1108 			vref(searchdir);
1109 		} else {
1110 			VOP_UNLOCK(foundobj);
1111 			vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
1112 			vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
1113 		}
1114 	}
1115 
1116 	*foundobj_ret = foundobj;
1117 	error = 0;
1118 done:
1119 	KASSERT(VOP_ISLOCKED(*newsearchdir_ret) == LK_EXCLUSIVE);
1120 	/*
1121 	 * *foundobj_ret is valid only if error == 0.
1122 	 */
1123 	KASSERT(error != 0 || *foundobj_ret == NULL ||
1124 	    VOP_ISLOCKED(*foundobj_ret) == LK_EXCLUSIVE);
1125 	return error;
1126 }
1127 
1128 //////////////////////////////
1129 
1130 /*
1131  * Do a complete path search from a single root directory.
1132  * (This is called up to twice if TRYEMULROOT is in effect.)
 *
 * The starting directory comes from namei_start(); the path in
 * ndp->ni_pnbuf is then consumed one component at a time with
 * lookup_parsepath() and lookup_once(), following symbolic links via
 * namei_follow() when namei_atsymlink() says to.
 *
 * Arguments:
 *	state		- lookup state; state->ndp and state->cnp are the
 *			  nameidata and componentname being worked on.
 *	neverfollow	- if nonzero, arriving at a symlink that would
 *			  otherwise be followed fails with EINVAL.
 *	inhibitmagic	- passed through to namei_follow().
 *	isnfsd		- passed through to namei_start().
 *
 * On success returns 0 with ndp->ni_vp set to the object found (NULL
 * if lookup_once() reported success without an object) and ndp->ni_dvp
 * set to the directory that was searched, or NULL if LOCKPARENT was
 * not requested or there is no such directory.  The found object is
 * unlocked before return unless LOCKLEAF is set.
 *
 * On failure returns an errno value with both ndp->ni_dvp and
 * ndp->ni_vp set to NULL.  Some (not all) failure paths also set
 * state->attempt_retry, which namei_tryemulroot() consults to decide
 * whether to run the search again.
1133  */
1134 static int
1135 namei_oneroot(struct namei_state *state,
1136 	 int neverfollow, int inhibitmagic, int isnfsd)
1137 {
1138 	struct nameidata *ndp = state->ndp;
1139 	struct componentname *cnp = state->cnp;
1140 	struct vnode *searchdir, *foundobj;
1141 	int error;
1142
1143 	error = namei_start(state, isnfsd, &searchdir);
1144 	if (error) {
1145 		ndp->ni_dvp = NULL;
1146 		ndp->ni_vp = NULL;
1147 		return error;
1148 	}
1149 	KASSERT(searchdir->v_type == VDIR);
1150
1151 	/*
1152 	 * Setup: break out flag bits into variables.
	 *
	 * docache ends up nonzero only when NOCACHE is clear, and is
	 * always forced off for DELETE.
1153 	 */
1154 	state->docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
1155 	if (cnp->cn_nameiop == DELETE)
1156 		state->docache = 0;
1157 	state->rdonly = cnp->cn_flags & RDONLY;
1158
1159 	/*
1160 	 * Keep going until we run out of path components.
1161 	 */
1162 	cnp->cn_nameptr = ndp->ni_pnbuf;
1163
1164 	/* drop leading slashes (already used them to choose startdir) */
1165 	while (cnp->cn_nameptr[0] == '/') {
1166 		cnp->cn_nameptr++;
1167 		ndp->ni_pathlen--;
1168 	}
1169 	/* was it just "/"? */
1170 	if (cnp->cn_nameptr[0] == '\0') {
		/* The start directory itself is the result; no parent. */
1171 		foundobj = searchdir;
1172 		searchdir = NULL;
1173 		cnp->cn_flags |= ISLASTCN;
1174
1175 		/* bleh */
1176 		goto skiploop;
1177 	}
1178
1179 	for (;;) {
1180
1181 		/*
1182 		 * If the directory we're on is unmounted, bail out.
1183 		 * XXX: should this also check if it's unlinked?
1184 		 * XXX: yes it should... but how?
1185 		 */
1186 		if (searchdir->v_mount == NULL) {
1187 			vput(searchdir);
1188 			ndp->ni_dvp = NULL;
1189 			ndp->ni_vp = NULL;
1190 			return (ENOENT);
1191 		}
1192
1193 		/*
1194 		 * Look up the next path component.
1195 		 * (currently, this may consume more than one)
1196 		 */
1197
1198 		/* There should be no slashes here. */
1199 		KASSERT(cnp->cn_nameptr[0] != '/');
1200
1201 		/* and we shouldn't have looped around if we were done */
1202 		KASSERT(cnp->cn_nameptr[0] != '\0');
1203
1204 		error = lookup_parsepath(state);
1205 		if (error) {
1206 			vput(searchdir);
1207 			ndp->ni_dvp = NULL;
1208 			ndp->ni_vp = NULL;
1209 			state->attempt_retry = 1;
1210 			return (error);
1211 		}
1212
		/*
		 * searchdir is passed both by value and by address:
		 * lookup_once may hand back a different search directory
		 * through its third argument.
		 */
1213 		error = lookup_once(state, searchdir, &searchdir, &foundobj);
1214 		if (error) {
1215 			vput(searchdir);
1216 			ndp->ni_dvp = NULL;
1217 			ndp->ni_vp = NULL;
1218 			/*
1219 			 * Note that if we're doing TRYEMULROOT we can
1220 			 * retry with the normal root. Where this is
1221 			 * currently set matches previous practice,
1222 			 * but the previous practice didn't make much
1223 			 * sense and somebody should sit down and
1224 			 * figure out which cases should cause retry
1225 			 * and which shouldn't. XXX.
1226 			 */
1227 			state->attempt_retry = 1;
1228 			return (error);
1229 		}
1230
1231 		if (foundobj == NULL) {
1232 			/*
1233 			 * Success with no object returned means we're
1234 			 * creating something and it isn't already
1235 			 * there. Break out of the main loop now so
1236 			 * the code below doesn't have to test for
1237 			 * foundobj == NULL.
1238 			 */
1239 			break;
1240 		}
1241
1242 		/*
1243 		 * Check for symbolic link. If we've reached one,
1244 		 * follow it, unless we aren't supposed to. Back up
1245 		 * over any slashes that we skipped, as we will need
1246 		 * them again.
1247 		 */
1248 		if (namei_atsymlink(state, foundobj)) {
1249 			ndp->ni_pathlen += state->slashes;
1250 			ndp->ni_next -= state->slashes;
1251 			if (neverfollow) {
1252 				error = EINVAL;
1253 			} else {
1254 				/*
1255 				 * dholland 20110410: if we're at a
1256 				 * union mount it might make sense to
1257 				 * use the top of the union stack here
1258 				 * rather than the layer we found the
1259 				 * symlink in. (FUTURE)
1260 				 */
1261 				error = namei_follow(state, inhibitmagic,
1262 						     searchdir, foundobj,
1263 						     &searchdir);
1264 			}
1265 			if (error) {
1266 				KASSERT(searchdir != foundobj);
1267 				vput(searchdir);
1268 				vput(foundobj);
1269 				ndp->ni_dvp = NULL;
1270 				ndp->ni_vp = NULL;
1271 				return error;
1272 			}
1273 			/* namei_follow unlocks it (ugh) so rele, not put */
1274 			vrele(foundobj);
1275 			foundobj = NULL;
1276
1277 			/*
1278 			 * If we followed a symlink to `/' and there
1279 			 * are no more components after the symlink,
1280 			 * we're done with the loop and what we found
1281 			 * is the searchdir.
1282 			 */
1283 			if (cnp->cn_nameptr[0] == '\0') {
1284 				foundobj = searchdir;
1285 				searchdir = NULL;
1286 				cnp->cn_flags |= ISLASTCN;
1287 				break;
1288 			}
1289
1290 			continue;
1291 		}
1292
1293 		/*
1294 		 * Not a symbolic link.
1295 		 *
1296 		 * Check for directory, if the component was
1297 		 * followed by a series of slashes.
1298 		 */
1299 		if ((foundobj->v_type != VDIR) &&
1300 		    (cnp->cn_flags & REQUIREDIR)) {
1301 			if (searchdir == foundobj) {
1302 				vrele(searchdir);
1303 			} else {
1304 				vput(searchdir);
1305 			}
1306 			vput(foundobj);
1307 			ndp->ni_dvp = NULL;
1308 			ndp->ni_vp = NULL;
1309 			state->attempt_retry = 1;
1310 			return ENOTDIR;
1311 		}
1312
1313 		/*
1314 		 * Stop if we've reached the last component.
1315 		 */
1316 		if (cnp->cn_flags & ISLASTCN) {
1317 			break;
1318 		}
1319
1320 		/*
1321 		 * Continue with the next component.
		 * The object just found becomes the directory to search.
1322 		 */
1323 		cnp->cn_nameptr = ndp->ni_next;
1324 		if (searchdir == foundobj) {
1325 			vrele(searchdir);
1326 		} else {
1327 			vput(searchdir);
1328 		}
1329 		searchdir = foundobj;
1330 		foundobj = NULL;
1331 	}
1332
1333  skiploop:
1334
1335 	if (foundobj != NULL) {
1336 		if (foundobj == ndp->ni_erootdir) {
1337 			/*
1338 			 * We are about to return the emulation root.
1339 			 * This isn't a good idea because code might
1340 			 * repeatedly lookup ".." until the file
1341 			 * matches that returned for "/" and loop
1342 			 * forever.  So convert it to the real root.
1343 			 */
1344 			if (searchdir != NULL) {
1345 				if (searchdir == foundobj)
1346 					vrele(searchdir);
1347 				else
1348 					vput(searchdir);
1349 				searchdir = NULL;
1350 			}
1351 			vput(foundobj);
1352 			foundobj = ndp->ni_rootdir;
1353 			vref(foundobj);
1354 			vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
1355 		}
1356
1357 		/*
1358 		 * If the caller requested the parent node (i.e. it's
1359 		 * a CREATE, DELETE, or RENAME), and we don't have one
1360 		 * (because this is the root directory, or we crossed
1361 		 * a mount point), then we must fail.
1362 		 */
1363 		if (cnp->cn_nameiop != LOOKUP &&
1364 		    (searchdir == NULL ||
1365 		     searchdir->v_mount != foundobj->v_mount)) {
1366 			if (searchdir) {
1367 				vput(searchdir);
1368 			}
1369 			vput(foundobj);
1370 			foundobj = NULL;
1371 			ndp->ni_dvp = NULL;
1372 			ndp->ni_vp = NULL;
1373 			state->attempt_retry = 1;
1374
1375 			switch (cnp->cn_nameiop) {
1376 			    case CREATE:
1377 				return EEXIST;
1378 			    case DELETE:
1379 			    case RENAME:
1380 				return EBUSY;
1381 			    default:
1382 				break;
1383 			}
			/*
			 * Only reached when cn_nameiop is none of LOOKUP,
			 * CREATE, DELETE or RENAME.
			 */
1384 			panic("Invalid nameiop\n");
1385 		}
1386
1387 		/*
1388 		 * Disallow directory write attempts on read-only lookups.
1389 		 * Prefers EEXIST over EROFS for the CREATE case.
1390 		 */
1391 		if (state->rdonly &&
1392 		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
1393 			if (searchdir) {
1394 				if (foundobj != searchdir) {
1395 					vput(searchdir);
1396 				} else {
1397 					vrele(searchdir);
1398 				}
1399 				searchdir = NULL;
1400 			}
1401 			vput(foundobj);
1402 			foundobj = NULL;
1403 			ndp->ni_dvp = NULL;
1404 			ndp->ni_vp = NULL;
1405 			state->attempt_retry = 1;
1406 			return EROFS;
1407 		}
1408 		if ((cnp->cn_flags & LOCKLEAF) == 0) {
1409 			/*
1410 			 * Note: if LOCKPARENT but not LOCKLEAF is
1411 			 * set, and searchdir == foundobj, this code
1412 			 * necessarily unlocks the parent as well as
1413 			 * the leaf. That is, just because you specify
1414 			 * LOCKPARENT doesn't mean you necessarily get
1415 			 * a locked parent vnode. The code in
1416 			 * vfs_syscalls.c, and possibly elsewhere,
1417 			 * that uses this combination "knows" this, so
1418 			 * it can't be safely changed. Feh. XXX
1419 			 */
1420 			VOP_UNLOCK(foundobj);
1421 		}
1422 	}
1423
1424 	/*
1425 	 * Done.
1426 	 */
1427
1428 	/*
1429 	 * If LOCKPARENT is not set, the parent directory isn't returned.
1430 	 */
1431 	if ((cnp->cn_flags & LOCKPARENT) == 0 && searchdir != NULL) {
1432 		if (searchdir == foundobj) {
1433 			vrele(searchdir);
1434 		} else {
1435 			vput(searchdir);
1436 		}
1437 		searchdir = NULL;
1438 	}
1439
	/* Publish the results to the caller's nameidata. */
1440 	ndp->ni_dvp = searchdir;
1441 	ndp->ni_vp = foundobj;
1442 	return 0;
1443 }
1444 
1445 /*
1446  * Do namei; wrapper layer that handles TRYEMULROOT.
1447  */
1448 static int
1449 namei_tryemulroot(struct namei_state *state,
1450 	 int neverfollow, int inhibitmagic, int isnfsd)
1451 {
1452 	int error;
1453 
1454 	struct nameidata *ndp = state->ndp;
1455 	struct componentname *cnp = state->cnp;
1456 	const char *savepath = NULL;
1457 
1458 	KASSERT(cnp == &ndp->ni_cnd);
1459 
1460 	if (cnp->cn_flags & TRYEMULROOT) {
1461 		savepath = pathbuf_stringcopy_get(ndp->ni_pathbuf);
1462 	}
1463 
1464     emul_retry:
1465 	state->attempt_retry = 0;
1466 
1467 	error = namei_oneroot(state, neverfollow, inhibitmagic, isnfsd);
1468 	if (error) {
1469 		/*
1470 		 * Once namei has started up, the existence of ni_erootdir
1471 		 * tells us whether we're working from an emulation root.
1472 		 * The TRYEMULROOT flag isn't necessarily authoritative.
1473 		 */
1474 		if (ndp->ni_erootdir != NULL && state->attempt_retry) {
1475 			/* Retry the whole thing using the normal root */
1476 			cnp->cn_flags &= ~TRYEMULROOT;
1477 			state->attempt_retry = 0;
1478 
1479 			/* kinda gross */
1480 			strcpy(ndp->ni_pathbuf->pb_path, savepath);
1481 			pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath);
1482 			savepath = NULL;
1483 
1484 			goto emul_retry;
1485 		}
1486 	}
1487 	if (savepath != NULL) {
1488 		pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath);
1489 	}
1490 	return error;
1491 }
1492 
1493 /*
1494  * External interface.
1495  */
1496 int
1497 namei(struct nameidata *ndp)
1498 {
1499 	struct namei_state state;
1500 	int error;
1501 
1502 	namei_init(&state, ndp);
1503 	error = namei_tryemulroot(&state,
1504 				  0/*!neverfollow*/, 0/*!inhibitmagic*/,
1505 				  0/*isnfsd*/);
1506 	namei_cleanup(&state);
1507 
1508 	if (error) {
1509 		/* make sure no stray refs leak out */
1510 		KASSERT(ndp->ni_dvp == NULL);
1511 		KASSERT(ndp->ni_vp == NULL);
1512 	}
1513 
1514 	return error;
1515 }
1516 
1517 ////////////////////////////////////////////////////////////
1518 
1519 /*
1520  * External interface used by nfsd. This is basically different from
1521  * namei only in that it has the ability to pass in the "current
1522  * directory", and uses an extra flag "neverfollow" for which there's
1523  * no physical flag defined in namei.h. (There used to be a cut&paste
1524  * copy of about half of namei in nfsd to allow these minor
1525  * adjustments to exist.)
1526  *
1527  * XXX: the namei interface should be adjusted so nfsd can just use
1528  * ordinary namei().
1529  */
1530 int
1531 lookup_for_nfsd(struct nameidata *ndp, struct vnode *forcecwd, int neverfollow)
1532 {
1533 	struct namei_state state;
1534 	int error;
1535 
1536 	KASSERT(ndp->ni_atdir == NULL);
1537 	ndp->ni_atdir = forcecwd;
1538 
1539 	namei_init(&state, ndp);
1540 	error = namei_tryemulroot(&state,
1541 				  neverfollow, 1/*inhibitmagic*/, 1/*isnfsd*/);
1542 	namei_cleanup(&state);
1543 
1544 	if (error) {
1545 		/* make sure no stray refs leak out */
1546 		KASSERT(ndp->ni_dvp == NULL);
1547 		KASSERT(ndp->ni_vp == NULL);
1548 	}
1549 
1550 	return error;
1551 }
1552 
1553 /*
1554  * A second external interface used by nfsd. This turns out to be a
1555  * single lookup used by the WebNFS code (ha!) to get "index.html" or
1556  * equivalent when asked for a directory. It should eventually evolve
1557  * into some kind of namei_once() call; for the time being it's kind
1558  * of a mess. XXX.
1559  *
1560  * dholland 20110109: I don't think it works, and I don't think it
1561  * worked before I started hacking and slashing either, and I doubt
1562  * anyone will ever notice.
1563  */
1564 
1565 /*
1566  * Internals. This calls lookup_once() after setting up the assorted
1567  * pieces of state the way they ought to be.
1568  */
1569 static int
1570 do_lookup_for_nfsd_index(struct namei_state *state)
1571 {
1572 	int error = 0;
1573 
1574 	struct componentname *cnp = state->cnp;
1575 	struct nameidata *ndp = state->ndp;
1576 	struct vnode *startdir;
1577 	struct vnode *foundobj;
1578 	const char *cp;			/* pointer into pathname argument */
1579 
1580 	KASSERT(cnp == &ndp->ni_cnd);
1581 
1582 	startdir = state->ndp->ni_atdir;
1583 
1584 	cnp->cn_nameptr = ndp->ni_pnbuf;
1585 	state->docache = 1;
1586 	state->rdonly = cnp->cn_flags & RDONLY;
1587 	ndp->ni_dvp = NULL;
1588 
1589 	cnp->cn_consume = 0;
1590 	cnp->cn_namelen = namei_getcomponent(cnp->cn_nameptr);
1591 	cp = cnp->cn_nameptr + cnp->cn_namelen;
1592 	KASSERT(cnp->cn_namelen <= KERNEL_NAME_MAX);
1593 	ndp->ni_pathlen -= cnp->cn_namelen;
1594 	ndp->ni_next = cp;
1595 	state->slashes = 0;
1596 	cnp->cn_flags &= ~REQUIREDIR;
1597 	cnp->cn_flags |= MAKEENTRY|ISLASTCN;
1598 
1599 	if (cnp->cn_namelen == 2 &&
1600 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
1601 		cnp->cn_flags |= ISDOTDOT;
1602 	else
1603 		cnp->cn_flags &= ~ISDOTDOT;
1604 
1605 	/*
1606 	 * Because lookup_once can change the startdir, we need our
1607 	 * own reference to it to avoid consuming the caller's.
1608 	 */
1609 	vref(startdir);
1610 	vn_lock(startdir, LK_EXCLUSIVE | LK_RETRY);
1611 	error = lookup_once(state, startdir, &startdir, &foundobj);
1612 	if (error == 0 && startdir == foundobj) {
1613 		vrele(startdir);
1614 	} else {
1615 		vput(startdir);
1616 	}
1617 	if (error) {
1618 		goto bad;
1619 	}
1620 	ndp->ni_vp = foundobj;
1621 
1622 	if (foundobj == NULL) {
1623 		return 0;
1624 	}
1625 
1626 	KASSERT((cnp->cn_flags & LOCKPARENT) == 0);
1627 	if ((cnp->cn_flags & LOCKLEAF) == 0) {
1628 		VOP_UNLOCK(foundobj);
1629 	}
1630 	return (0);
1631 
1632 bad:
1633 	ndp->ni_vp = NULL;
1634 	return (error);
1635 }
1636 
1637 /*
1638  * External interface. The partitioning between this function and the
1639  * above isn't very clear - the above function exists mostly so code
1640  * that uses "state->" can be shuffled around without having to change
1641  * it to "state.".
1642  */
1643 int
1644 lookup_for_nfsd_index(struct nameidata *ndp, struct vnode *startdir)
1645 {
1646 	struct namei_state state;
1647 	int error;
1648 
1649 	KASSERT(ndp->ni_atdir == NULL);
1650 	ndp->ni_atdir = startdir;
1651 
1652 	/*
1653 	 * Note: the name sent in here (is not|should not be) allowed
1654 	 * to contain a slash.
1655 	 */
1656 	if (strlen(ndp->ni_pathbuf->pb_path) > KERNEL_NAME_MAX) {
1657 		return ENAMETOOLONG;
1658 	}
1659 	if (strchr(ndp->ni_pathbuf->pb_path, '/')) {
1660 		return EINVAL;
1661 	}
1662 
1663 	ndp->ni_pathlen = strlen(ndp->ni_pathbuf->pb_path) + 1;
1664 	ndp->ni_pnbuf = NULL;
1665 	ndp->ni_cnd.cn_nameptr = NULL;
1666 
1667 	namei_init(&state, ndp);
1668 	error = do_lookup_for_nfsd_index(&state);
1669 	namei_cleanup(&state);
1670 
1671 	return error;
1672 }
1673 
1674 ////////////////////////////////////////////////////////////
1675 
1676 /*
1677  * Reacquire a path name component.
1678  * dvp is locked on entry and exit.
1679  * *vpp is locked on exit unless it's NULL.
1680  */
1681 int
1682 relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int dummy)
1683 {
1684 	int rdonly;			/* lookup read-only flag bit */
1685 	int error = 0;
1686 #ifdef DEBUG
1687 	size_t newlen;			/* DEBUG: check name len */
1688 	const char *cp;			/* DEBUG: check name ptr */
1689 #endif /* DEBUG */
1690 
1691 	(void)dummy;
1692 
1693 	/*
1694 	 * Setup: break out flag bits into variables.
1695 	 */
1696 	rdonly = cnp->cn_flags & RDONLY;
1697 
1698 	/*
1699 	 * Search a new directory.
1700 	 *
1701 	 * The cn_hash value is for use by vfs_cache.
1702 	 * The last component of the filename is left accessible via
1703 	 * cnp->cn_nameptr for callers that need the name. Callers needing
1704 	 * the name set the SAVENAME flag. When done, they assume
1705 	 * responsibility for freeing the pathname buffer.
1706 	 */
1707 #ifdef DEBUG
1708 #if 0
1709 	cp = NULL;
1710 	newhash = namei_hash(cnp->cn_nameptr, &cp);
1711 	if ((uint32_t)newhash != (uint32_t)cnp->cn_hash)
1712 		panic("relookup: bad hash");
1713 #endif
1714 	newlen = namei_getcomponent(cnp->cn_nameptr);
1715 	if (cnp->cn_namelen != newlen)
1716 		panic("relookup: bad len");
1717 	cp = cnp->cn_nameptr + cnp->cn_namelen;
1718 	while (*cp == '/')
1719 		cp++;
1720 	if (*cp != 0)
1721 		panic("relookup: not last component");
1722 #endif /* DEBUG */
1723 
1724 	/*
1725 	 * Check for degenerate name (e.g. / or "")
1726 	 * which is a way of talking about a directory,
1727 	 * e.g. like "/." or ".".
1728 	 */
1729 	if (cnp->cn_nameptr[0] == '\0')
1730 		panic("relookup: null name");
1731 
1732 	if (cnp->cn_flags & ISDOTDOT)
1733 		panic("relookup: lookup on dot-dot");
1734 
1735 	/*
1736 	 * We now have a segment name to search for, and a directory to search.
1737 	 */
1738 	*vpp = NULL;
1739 	cnp->cn_flags |= INRELOOKUP;
1740 	error = VOP_LOOKUP(dvp, vpp, cnp);
1741 	cnp->cn_flags &= ~INRELOOKUP;
1742 	if ((error) != 0) {
1743 #ifdef DIAGNOSTIC
1744 		if (*vpp != NULL)
1745 			panic("leaf `%s' should be empty", cnp->cn_nameptr);
1746 #endif
1747 		if (error != EJUSTRETURN)
1748 			goto bad;
1749 	}
1750 
1751 #ifdef DIAGNOSTIC
1752 	/*
1753 	 * Check for symbolic link
1754 	 */
1755 	if (*vpp && (*vpp)->v_type == VLNK && (cnp->cn_flags & FOLLOW))
1756 		panic("relookup: symlink found");
1757 #endif
1758 
1759 	/*
1760 	 * Check for read-only lookups.
1761 	 */
1762 	if (rdonly && cnp->cn_nameiop != LOOKUP) {
1763 		error = EROFS;
1764 		if (*vpp) {
1765 			vrele(*vpp);
1766 		}
1767 		goto bad;
1768 	}
1769 	/*
1770 	 * Lock result.
1771 	 */
1772 	if (*vpp && *vpp != dvp) {
1773 		error = vn_lock(*vpp, LK_EXCLUSIVE);
1774 		if (error != 0) {
1775 			vrele(*vpp);
1776 			goto bad;
1777 		}
1778 	}
1779 	return (0);
1780 
1781 bad:
1782 	*vpp = NULL;
1783 	return (error);
1784 }
1785 
1786 /*
1787  * namei_simple - simple forms of namei.
1788  *
1789  * These are wrappers to allow the simple case callers of namei to be
1790  * left alone while everything else changes under them.
1791  */
1792 
1793 /* Flags */
1794 struct namei_simple_flags_type {
1795 	int dummy;
1796 };
1797 static const struct namei_simple_flags_type ns_nn, ns_nt, ns_fn, ns_ft;
1798 const namei_simple_flags_t NSM_NOFOLLOW_NOEMULROOT = &ns_nn;
1799 const namei_simple_flags_t NSM_NOFOLLOW_TRYEMULROOT = &ns_nt;
1800 const namei_simple_flags_t NSM_FOLLOW_NOEMULROOT = &ns_fn;
1801 const namei_simple_flags_t NSM_FOLLOW_TRYEMULROOT = &ns_ft;
1802 
1803 static
1804 int
1805 namei_simple_convert_flags(namei_simple_flags_t sflags)
1806 {
1807 	if (sflags == NSM_NOFOLLOW_NOEMULROOT)
1808 		return NOFOLLOW | 0;
1809 	if (sflags == NSM_NOFOLLOW_TRYEMULROOT)
1810 		return NOFOLLOW | TRYEMULROOT;
1811 	if (sflags == NSM_FOLLOW_NOEMULROOT)
1812 		return FOLLOW | 0;
1813 	if (sflags == NSM_FOLLOW_TRYEMULROOT)
1814 		return FOLLOW | TRYEMULROOT;
1815 	panic("namei_simple_convert_flags: bogus sflags\n");
1816 	return 0;
1817 }
1818 
1819 int
1820 namei_simple_kernel(const char *path, namei_simple_flags_t sflags,
1821 	struct vnode **vp_ret)
1822 {
1823 	return nameiat_simple_kernel(NULL, path, sflags, vp_ret);
1824 }
1825 
1826 int
1827 nameiat_simple_kernel(struct vnode *dvp, const char *path,
1828 	namei_simple_flags_t sflags, struct vnode **vp_ret)
1829 {
1830 	struct nameidata nd;
1831 	struct pathbuf *pb;
1832 	int err;
1833 
1834 	pb = pathbuf_create(path);
1835 	if (pb == NULL) {
1836 		return ENOMEM;
1837 	}
1838 
1839 	NDINIT(&nd,
1840 		LOOKUP,
1841 		namei_simple_convert_flags(sflags),
1842 		pb);
1843 
1844 	if (dvp != NULL)
1845 		NDAT(&nd, dvp);
1846 
1847 	err = namei(&nd);
1848 	if (err != 0) {
1849 		pathbuf_destroy(pb);
1850 		return err;
1851 	}
1852 	*vp_ret = nd.ni_vp;
1853 	pathbuf_destroy(pb);
1854 	return 0;
1855 }
1856 
1857 int
1858 namei_simple_user(const char *path, namei_simple_flags_t sflags,
1859 	struct vnode **vp_ret)
1860 {
1861 	return nameiat_simple_user(NULL, path, sflags, vp_ret);
1862 }
1863 
1864 int
1865 nameiat_simple_user(struct vnode *dvp, const char *path,
1866 	namei_simple_flags_t sflags, struct vnode **vp_ret)
1867 {
1868 	struct pathbuf *pb;
1869 	struct nameidata nd;
1870 	int err;
1871 
1872 	err = pathbuf_copyin(path, &pb);
1873 	if (err) {
1874 		return err;
1875 	}
1876 
1877 	NDINIT(&nd,
1878 		LOOKUP,
1879 		namei_simple_convert_flags(sflags),
1880 		pb);
1881 
1882 	if (dvp != NULL)
1883 		NDAT(&nd, dvp);
1884 
1885 	err = namei(&nd);
1886 	if (err != 0) {
1887 		pathbuf_destroy(pb);
1888 		return err;
1889 	}
1890 	*vp_ret = nd.ni_vp;
1891 	pathbuf_destroy(pb);
1892 	return 0;
1893 }
1894