xref: /netbsd-src/sys/kern/vfs_lookup.c (revision fb5eed702691094bd687fbf1ded189c87457cd35)
1 /*	$NetBSD: vfs_lookup.c,v 1.229 2021/06/29 22:39:21 dholland Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * (c) UNIX System Laboratories, Inc.
7  * All or some portions of this file are derived from material licensed
8  * to the University of California by American Telephone and Telegraph
9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10  * the permission of UNIX System Laboratories, Inc.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)vfs_lookup.c	8.10 (Berkeley) 5/27/95
37  */
38 
39 #include <sys/cdefs.h>
40 __KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.229 2021/06/29 22:39:21 dholland Exp $");
41 
42 #ifdef _KERNEL_OPT
43 #include "opt_magiclinks.h"
44 #endif
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/kernel.h>
49 #include <sys/syslimits.h>
50 #include <sys/time.h>
51 #include <sys/namei.h>
52 #include <sys/vnode.h>
53 #include <sys/vnode_impl.h>
54 #include <sys/mount.h>
55 #include <sys/errno.h>
56 #include <sys/filedesc.h>
57 #include <sys/hash.h>
58 #include <sys/proc.h>
59 #include <sys/syslog.h>
60 #include <sys/kauth.h>
61 #include <sys/ktrace.h>
62 #include <sys/dirent.h>
63 
64 #ifndef MAGICLINKS
65 #define MAGICLINKS 0
66 #endif
67 
68 int vfs_magiclinks = MAGICLINKS;
69 
70 __CTASSERT(MAXNAMLEN == NAME_MAX);
71 
72 /*
73  * Substitute replacement text for 'magic' strings in symlinks.
74  * Returns 0 if successful, and returns non-zero if an error
75  * occurs.  (Currently, the only possible error is running out
76  * of temporary pathname space.)
77  *
78  * Looks for "@<string>" and "@<string>/", where <string> is a
79  * recognized 'magic' string.  Replaces the "@<string>" with the
80  * appropriate replacement text.  (Note that in some cases the
81  * replacement text may have zero length.)
82  *
83  * This would have been table driven, but the variance in
84  * replacement strings (and replacement string lengths) made
85  * that impractical.
86  */
87 #define	VNL(x)							\
88 	(sizeof(x) - 1)
89 
90 #define	VO	'{'
91 #define	VC	'}'
92 
93 #define	MATCH(str)						\
94 	((termchar == '/' && i + VNL(str) == *len) ||		\
95 	 (i + VNL(str) < *len &&				\
96 	  cp[i + VNL(str)] == termchar)) &&			\
97 	!strncmp((str), &cp[i], VNL(str))
98 
99 #define	SUBSTITUTE(m, s, sl)					\
100 	if ((newlen + (sl)) >= MAXPATHLEN)			\
101 		return 1;					\
102 	i += VNL(m);						\
103 	if (termchar != '/')					\
104 		i++;						\
105 	(void)memcpy(&tmp[newlen], (s), (sl));			\
106 	newlen += (sl);						\
107 	change = 1;						\
108 	termchar = '/';
109 
110 static int
111 symlink_magic(struct proc *p, char *cp, size_t *len)
112 {
113 	char *tmp;
114 	size_t change, i, newlen, slen;
115 	char termchar = '/';
116 	char idtmp[11]; /* enough for 32 bit *unsigned* integer */
117 
118 
119 	tmp = PNBUF_GET();
120 	for (change = i = newlen = 0; i < *len; ) {
121 		if (cp[i] != '@') {
122 			tmp[newlen++] = cp[i++];
123 			continue;
124 		}
125 
126 		i++;
127 
128 		/* Check for @{var} syntax. */
129 		if (cp[i] == VO) {
130 			termchar = VC;
131 			i++;
132 		}
133 
134 		/*
135 		 * The following checks should be ordered according
136 		 * to frequency of use.
137 		 */
138 		if (MATCH("machine_arch")) {
139 			slen = VNL(MACHINE_ARCH);
140 			SUBSTITUTE("machine_arch", MACHINE_ARCH, slen);
141 		} else if (MATCH("machine")) {
142 			slen = VNL(MACHINE);
143 			SUBSTITUTE("machine", MACHINE, slen);
144 		} else if (MATCH("hostname")) {
145 			SUBSTITUTE("hostname", hostname, hostnamelen);
146 		} else if (MATCH("osrelease")) {
147 			slen = strlen(osrelease);
148 			SUBSTITUTE("osrelease", osrelease, slen);
149 		} else if (MATCH("emul")) {
150 			slen = strlen(p->p_emul->e_name);
151 			SUBSTITUTE("emul", p->p_emul->e_name, slen);
152 		} else if (MATCH("kernel_ident")) {
153 			slen = strlen(kernel_ident);
154 			SUBSTITUTE("kernel_ident", kernel_ident, slen);
155 		} else if (MATCH("domainname")) {
156 			SUBSTITUTE("domainname", domainname, domainnamelen);
157 		} else if (MATCH("ostype")) {
158 			slen = strlen(ostype);
159 			SUBSTITUTE("ostype", ostype, slen);
160 		} else if (MATCH("uid")) {
161 			slen = snprintf(idtmp, sizeof(idtmp), "%u",
162 			    kauth_cred_geteuid(kauth_cred_get()));
163 			SUBSTITUTE("uid", idtmp, slen);
164 		} else if (MATCH("ruid")) {
165 			slen = snprintf(idtmp, sizeof(idtmp), "%u",
166 			    kauth_cred_getuid(kauth_cred_get()));
167 			SUBSTITUTE("ruid", idtmp, slen);
168 		} else if (MATCH("gid")) {
169 			slen = snprintf(idtmp, sizeof(idtmp), "%u",
170 			    kauth_cred_getegid(kauth_cred_get()));
171 			SUBSTITUTE("gid", idtmp, slen);
172 		} else if (MATCH("rgid")) {
173 			slen = snprintf(idtmp, sizeof(idtmp), "%u",
174 			    kauth_cred_getgid(kauth_cred_get()));
175 			SUBSTITUTE("rgid", idtmp, slen);
176 		} else {
177 			tmp[newlen++] = '@';
178 			if (termchar == VC)
179 				tmp[newlen++] = VO;
180 		}
181 	}
182 
183 	if (change) {
184 		(void)memcpy(cp, tmp, newlen);
185 		*len = newlen;
186 	}
187 	PNBUF_PUT(tmp);
188 
189 	return 0;
190 }
191 
192 #undef VNL
193 #undef VO
194 #undef VC
195 #undef MATCH
196 #undef SUBSTITUTE
197 
198 ////////////////////////////////////////////////////////////
199 
200 /*
201  * Determine the namei hash (for the namecache) for name.
202  * If *ep != NULL, hash from name to ep-1.
203  * If *ep == NULL, hash from name until the first NUL or '/', and
204  * return the location of this termination character in *ep.
205  *
206  * This function returns an equivalent hash to the MI hash32_strn().
207  * The latter isn't used because in the *ep == NULL case, determining
208  * the length of the string to the first NUL or `/' and then calling
209  * hash32_strn() involves unnecessary double-handling of the data.
210  */
211 uint32_t
212 namei_hash(const char *name, const char **ep)
213 {
214 	uint32_t	hash;
215 
216 	hash = HASH32_STR_INIT;
217 	if (*ep != NULL) {
218 		for (; name < *ep; name++)
219 			hash = hash * 33 + *(const uint8_t *)name;
220 	} else {
221 		for (; *name != '\0' && *name != '/'; name++)
222 			hash = hash * 33 + *(const uint8_t *)name;
223 		*ep = name;
224 	}
225 	return (hash + (hash >> 5));
226 }
227 
228 ////////////////////////////////////////////////////////////
229 
230 /*
231  * Sealed abstraction for pathnames.
232  *
233  * System-call-layer level code that is going to call namei should
234  * first create a pathbuf and adjust all the bells and whistles on it
235  * as needed by context.
236  */
237 
struct pathbuf {
	/*
	 * The path string itself.  Filled by copystr()/copyinstr() or
	 * handed in via pathbuf_assimilate(); released with PNBUF_PUT()
	 * in pathbuf_destroy().  namei_init() uses this same buffer as
	 * its working ni_pnbuf.
	 */
	char *pb_path;
	/*
	 * Saved copy of pb_path, created on the first
	 * pathbuf_stringcopy_get() and freed by the last matching
	 * pathbuf_stringcopy_put().  NULL while no copy is outstanding.
	 */
	char *pb_pathcopy;
	/* Number of pathbuf_stringcopy_get() calls not yet put back. */
	unsigned pb_pathcopyuses;
};
243 
244 static struct pathbuf *
245 pathbuf_create_raw(void)
246 {
247 	struct pathbuf *pb;
248 
249 	pb = kmem_alloc(sizeof(*pb), KM_SLEEP);
250 	pb->pb_path = PNBUF_GET();
251 	if (pb->pb_path == NULL) {
252 		kmem_free(pb, sizeof(*pb));
253 		return NULL;
254 	}
255 	pb->pb_pathcopy = NULL;
256 	pb->pb_pathcopyuses = 0;
257 	return pb;
258 }
259 
260 void
261 pathbuf_destroy(struct pathbuf *pb)
262 {
263 	KASSERT(pb->pb_pathcopyuses == 0);
264 	KASSERT(pb->pb_pathcopy == NULL);
265 	PNBUF_PUT(pb->pb_path);
266 	kmem_free(pb, sizeof(*pb));
267 }
268 
269 struct pathbuf *
270 pathbuf_assimilate(char *pnbuf)
271 {
272 	struct pathbuf *pb;
273 
274 	pb = kmem_alloc(sizeof(*pb), KM_SLEEP);
275 	pb->pb_path = pnbuf;
276 	pb->pb_pathcopy = NULL;
277 	pb->pb_pathcopyuses = 0;
278 	return pb;
279 }
280 
281 struct pathbuf *
282 pathbuf_create(const char *path)
283 {
284 	struct pathbuf *pb;
285 	int error;
286 
287 	pb = pathbuf_create_raw();
288 	if (pb == NULL) {
289 		return NULL;
290 	}
291 	error = copystr(path, pb->pb_path, PATH_MAX, NULL);
292 	if (error != 0) {
293 		KASSERT(!"kernel path too long in pathbuf_create");
294 		/* make sure it's null-terminated, just in case */
295 		pb->pb_path[PATH_MAX-1] = '\0';
296 	}
297 	return pb;
298 }
299 
300 int
301 pathbuf_copyin(const char *userpath, struct pathbuf **ret)
302 {
303 	struct pathbuf *pb;
304 	int error;
305 
306 	pb = pathbuf_create_raw();
307 	if (pb == NULL) {
308 		return ENOMEM;
309 	}
310 	error = copyinstr(userpath, pb->pb_path, PATH_MAX, NULL);
311 	if (error) {
312 		pathbuf_destroy(pb);
313 		return error;
314 	}
315 	*ret = pb;
316 	return 0;
317 }
318 
319 /*
320  * XXX should not exist:
321  *   1. whether a pointer is kernel or user should be statically checkable.
322  *   2. copyin should be handled by the upper part of the syscall layer,
323  *      not in here.
324  */
325 int
326 pathbuf_maybe_copyin(const char *path, enum uio_seg seg, struct pathbuf **ret)
327 {
328 	if (seg == UIO_USERSPACE) {
329 		return pathbuf_copyin(path, ret);
330 	} else {
331 		*ret = pathbuf_create(path);
332 		if (*ret == NULL) {
333 			return ENOMEM;
334 		}
335 		return 0;
336 	}
337 }
338 
339 /*
340  * Get a copy of the path buffer as it currently exists. If this is
341  * called after namei starts the results may be arbitrary.
342  */
343 void
344 pathbuf_copystring(const struct pathbuf *pb, char *buf, size_t maxlen)
345 {
346 	strlcpy(buf, pb->pb_path, maxlen);
347 }
348 
349 /*
350  * These two functions allow access to a saved copy of the original
351  * path string. The first copy should be gotten before namei is
352  * called. Each copy that is gotten should be put back.
353  */
354 
355 const char *
356 pathbuf_stringcopy_get(struct pathbuf *pb)
357 {
358 	if (pb->pb_pathcopyuses == 0) {
359 		pb->pb_pathcopy = PNBUF_GET();
360 		strcpy(pb->pb_pathcopy, pb->pb_path);
361 	}
362 	pb->pb_pathcopyuses++;
363 	return pb->pb_pathcopy;
364 }
365 
366 void
367 pathbuf_stringcopy_put(struct pathbuf *pb, const char *str)
368 {
369 	KASSERT(str == pb->pb_pathcopy);
370 	KASSERT(pb->pb_pathcopyuses > 0);
371 	pb->pb_pathcopyuses--;
372 	if (pb->pb_pathcopyuses == 0) {
373 		PNBUF_PUT(pb->pb_pathcopy);
374 		pb->pb_pathcopy = NULL;
375 	}
376 }
377 
378 
379 ////////////////////////////////////////////////////////////
380 
381 /*
382  * namei: convert a pathname into a pointer to a (maybe-locked) vnode,
383  * and maybe also its parent directory vnode, and assorted other guff.
384  * See namei(9) for the interface documentation.
385  *
386  *
387  * The FOLLOW flag is set when symbolic links are to be followed
388  * when they occur at the end of the name translation process.
389  * Symbolic links are always followed for all other pathname
390  * components other than the last.
391  *
392  * The segflg defines whether the name is to be copied from user
393  * space or kernel space.
394  *
395  * Overall outline of namei:
396  *
397  *	copy in name
398  *	get starting directory
399  *	while (!done && !error) {
400  *		call lookup to search path.
401  *		if symbolic link, massage name in buffer and continue
402  *	}
403  */
404 
405 /*
406  * Search a pathname.
407  * This is a very central and rather complicated routine.
408  *
409  * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
410  * The starting directory is passed in. The pathname is descended
411  * until done, or a symbolic link is encountered. The variable ni_more
412  * is clear if the path is completed; it is set to one if a symbolic
413  * link needing interpretation is encountered.
414  *
415  * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
416  * whether the name is to be looked up, created, renamed, or deleted.
417  * When CREATE, RENAME, or DELETE is specified, information usable in
418  * creating, renaming, or deleting a directory entry may be calculated.
419  * If flag has LOCKPARENT or'ed into it, the parent directory is returned
420  * locked.  Otherwise the parent directory is not returned. If the target
421  * of the pathname exists and LOCKLEAF is or'ed into the flag the target
422  * is returned locked, otherwise it is returned unlocked.  When creating
423  * or renaming and LOCKPARENT is specified, the target may not be ".".
424  * When deleting and LOCKPARENT is specified, the target may be ".".
425  *
426  * Overall outline of lookup:
427  *
428  * dirloop:
429  *	identify next component of name at ndp->ni_ptr
430  *	handle degenerate case where name is null string
431  *	if .. and crossing mount points and on mounted filesys, find parent
432  *	call VOP_LOOKUP routine for next component name
433  *	    directory vnode returned in ni_dvp, locked.
434  *	    component vnode returned in ni_vp (if it exists), locked.
435  *	if result vnode is mounted on and crossing mount points,
436  *	    find mounted on vnode
437  *	if more components of name, do next level at dirloop
438  *	return the answer in ni_vp, locked if LOCKLEAF set
439  *	    if LOCKPARENT set, return locked parent in ni_dvp
440  */
441 
442 
/*
 * Internal state for a namei operation.
 *
 * cnp is always equal to &ndp->ni_cnd.
 */
struct namei_state {
	struct nameidata *ndp;		/* the caller's nameidata */
	struct componentname *cnp;	/* shorthand for &ndp->ni_cnd */

	int docache;			/* == 0 do not cache last component */
	int rdonly;			/* lookup read-only flag bit */
	int slashes;			/* count of '/' characters following
					   the current component, as set by
					   lookup_parsepath() */

	unsigned attempt_retry:1;	/* true if error allows emul retry */
	unsigned root_referenced:1;	/* true if ndp->ni_rootdir and
					     ndp->ni_erootdir were referenced */
};
460 
461 
462 /*
463  * Initialize the namei working state.
464  */
465 static void
466 namei_init(struct namei_state *state, struct nameidata *ndp)
467 {
468 
469 	state->ndp = ndp;
470 	state->cnp = &ndp->ni_cnd;
471 
472 	state->docache = 0;
473 	state->rdonly = 0;
474 	state->slashes = 0;
475 
476 	state->root_referenced = 0;
477 
478 	KASSERTMSG((state->cnp->cn_cred != NULL), "namei: bad cred/proc");
479 	KASSERTMSG(((state->cnp->cn_nameiop & (~OPMASK)) == 0),
480 	    "namei: nameiop contaminated with flags: %08"PRIx32,
481 	    state->cnp->cn_nameiop);
482 	KASSERTMSG(((state->cnp->cn_flags & OPMASK) == 0),
483 	    "name: flags contaminated with nameiops: %08"PRIx32,
484 	    state->cnp->cn_flags);
485 
486 	/*
487 	 * The buffer for name translation shall be the one inside the
488 	 * pathbuf.
489 	 */
490 	state->ndp->ni_pnbuf = state->ndp->ni_pathbuf->pb_path;
491 }
492 
493 /*
494  * Clean up the working namei state, leaving things ready for return
495  * from namei.
496  */
497 static void
498 namei_cleanup(struct namei_state *state)
499 {
500 	KASSERT(state->cnp == &state->ndp->ni_cnd);
501 
502 	if (state->root_referenced) {
503 		if (state->ndp->ni_rootdir != NULL)
504 			vrele(state->ndp->ni_rootdir);
505 		if (state->ndp->ni_erootdir != NULL)
506 			vrele(state->ndp->ni_erootdir);
507 	}
508 }
509 
510 //////////////////////////////
511 
512 /*
513  * Get the directory context.
514  * Initializes the rootdir and erootdir state and returns a reference
515  * to the starting dir.
516  */
517 static struct vnode *
518 namei_getstartdir(struct namei_state *state)
519 {
520 	struct nameidata *ndp = state->ndp;
521 	struct componentname *cnp = state->cnp;
522 	struct cwdinfo *cwdi;		/* pointer to cwd state */
523 	struct lwp *self = curlwp;	/* thread doing namei() */
524 	struct vnode *rootdir, *erootdir, *curdir, *startdir;
525 
526 	if (state->root_referenced) {
527 		if (state->ndp->ni_rootdir != NULL)
528 			vrele(state->ndp->ni_rootdir);
529 		if (state->ndp->ni_erootdir != NULL)
530 			vrele(state->ndp->ni_erootdir);
531 		state->root_referenced = 0;
532 	}
533 
534 	cwdi = self->l_proc->p_cwdi;
535 	rw_enter(&cwdi->cwdi_lock, RW_READER);
536 
537 	/* root dir */
538 	if (cwdi->cwdi_rdir == NULL || (cnp->cn_flags & NOCHROOT)) {
539 		rootdir = rootvnode;
540 	} else {
541 		rootdir = cwdi->cwdi_rdir;
542 	}
543 
544 	/* emulation root dir, if any */
545 	if ((cnp->cn_flags & TRYEMULROOT) == 0) {
546 		/* if we don't want it, don't fetch it */
547 		erootdir = NULL;
548 	} else if (cnp->cn_flags & EMULROOTSET) {
549 		/* explicitly set emulroot; "/../" doesn't override this */
550 		erootdir = ndp->ni_erootdir;
551 	} else if (!strncmp(ndp->ni_pnbuf, "/../", 4)) {
552 		/* explicit reference to real rootdir */
553 		erootdir = NULL;
554 	} else {
555 		/* may be null */
556 		erootdir = cwdi->cwdi_edir;
557 	}
558 
559 	/* current dir */
560 	curdir = cwdi->cwdi_cdir;
561 
562 	if (ndp->ni_pnbuf[0] != '/') {
563 		if (ndp->ni_atdir != NULL) {
564 			startdir = ndp->ni_atdir;
565 		} else {
566 			startdir = curdir;
567 		}
568 		erootdir = NULL;
569 	} else if (cnp->cn_flags & TRYEMULROOT && erootdir != NULL) {
570 		startdir = erootdir;
571 	} else {
572 		startdir = rootdir;
573 		erootdir = NULL;
574 	}
575 
576 	state->ndp->ni_rootdir = rootdir;
577 	state->ndp->ni_erootdir = erootdir;
578 
579 	/*
580 	 * Get a reference to the start dir so we can safely unlock cwdi.
581 	 *
582 	 * Must hold references to rootdir and erootdir while we're running.
583 	 * A multithreaded process may chroot during namei.
584 	 */
585 	if (startdir != NULL)
586 		vref(startdir);
587 	if (state->ndp->ni_rootdir != NULL)
588 		vref(state->ndp->ni_rootdir);
589 	if (state->ndp->ni_erootdir != NULL)
590 		vref(state->ndp->ni_erootdir);
591 	state->root_referenced = 1;
592 
593 	rw_exit(&cwdi->cwdi_lock);
594 	return startdir;
595 }
596 
597 /*
598  * Get the directory context for the nfsd case, in parallel to
599  * getstartdir. Initializes the rootdir and erootdir state and
600  * returns a reference to the passed-in starting dir.
601  */
602 static struct vnode *
603 namei_getstartdir_for_nfsd(struct namei_state *state)
604 {
605 	KASSERT(state->ndp->ni_atdir != NULL);
606 
607 	/* always use the real root, and never set an emulation root */
608 	if (rootvnode == NULL) {
609 		return NULL;
610 	}
611 	state->ndp->ni_rootdir = rootvnode;
612 	state->ndp->ni_erootdir = NULL;
613 
614 	vref(state->ndp->ni_atdir);
615 	KASSERT(! state->root_referenced);
616 	vref(state->ndp->ni_rootdir);
617 	state->root_referenced = 1;
618 	return state->ndp->ni_atdir;
619 }
620 
621 
622 /*
623  * Ktrace the namei operation.
624  */
625 static void
626 namei_ktrace(struct namei_state *state)
627 {
628 	struct nameidata *ndp = state->ndp;
629 	struct componentname *cnp = state->cnp;
630 	struct lwp *self = curlwp;	/* thread doing namei() */
631 	const char *emul_path;
632 
633 	if (ktrpoint(KTR_NAMEI)) {
634 		if (ndp->ni_erootdir != NULL) {
635 			/*
636 			 * To make any sense, the trace entry need to have the
637 			 * text of the emulation path prepended.
638 			 * Usually we can get this from the current process,
639 			 * but when called from emul_find_interp() it is only
640 			 * in the exec_package - so we get it passed in ni_next
641 			 * (this is a hack).
642 			 */
643 			if (cnp->cn_flags & EMULROOTSET)
644 				emul_path = ndp->ni_next;
645 			else
646 				emul_path = self->l_proc->p_emul->e_path;
647 			ktrnamei2(emul_path, strlen(emul_path),
648 			    ndp->ni_pnbuf, ndp->ni_pathlen);
649 		} else
650 			ktrnamei(ndp->ni_pnbuf, ndp->ni_pathlen);
651 	}
652 }
653 
654 /*
655  * Start up namei. Find the root dir and cwd, establish the starting
656  * directory for lookup, and lock it. Also calls ktrace when
657  * appropriate.
658  */
659 static int
660 namei_start(struct namei_state *state, int isnfsd,
661 	    struct vnode **startdir_ret)
662 {
663 	struct nameidata *ndp = state->ndp;
664 	struct vnode *startdir;
665 
666 	/* length includes null terminator (was originally from copyinstr) */
667 	ndp->ni_pathlen = strlen(ndp->ni_pnbuf) + 1;
668 
669 	/*
670 	 * POSIX.1 requirement: "" is not a valid file name.
671 	 */
672 	if (ndp->ni_pathlen == 1) {
673 		ndp->ni_erootdir = NULL;
674 		return ENOENT;
675 	}
676 
677 	ndp->ni_loopcnt = 0;
678 
679 	/* Get starting directory, set up root, and ktrace. */
680 	if (isnfsd) {
681 		startdir = namei_getstartdir_for_nfsd(state);
682 		/* no ktrace */
683 	} else {
684 		startdir = namei_getstartdir(state);
685 		namei_ktrace(state);
686 	}
687 
688 	if (startdir == NULL) {
689 		return ENOENT;
690 	}
691 
692 	/* NDAT may feed us with a non directory namei_getstartdir */
693 	if (startdir->v_type != VDIR) {
694 		vrele(startdir);
695 		return ENOTDIR;
696 	}
697 
698 	*startdir_ret = startdir;
699 	return 0;
700 }
701 
702 /*
703  * Check for being at a symlink that we're going to follow.
704  */
705 static inline int
706 namei_atsymlink(struct namei_state *state, struct vnode *foundobj)
707 {
708 	return (foundobj->v_type == VLNK) &&
709 		(state->cnp->cn_flags & (FOLLOW|REQUIREDIR));
710 }
711 
/*
 * Follow a symlink.
 *
 * Reads the target of foundobj, splices it in front of the unparsed
 * remainder of the path (ndp->ni_next), and restarts parsing from the
 * beginning of ndp->ni_pnbuf.
 *
 * Updates searchdir: for a relative link it is returned unchanged in
 * *newsearchdir_ret; for an absolute link the reference on searchdir is
 * dropped and a referenced root (or emulation root) is returned
 * instead. inhibitmagic causes magic symlinks to not be interpreted;
 * this is used by nfsd.
 *
 * foundobj is locked here and unlocked again before returning, both on
 * success and on failure.
 *
 * Returns 0 on success, ELOOP after too many links, ENOENT for an
 * empty link, ENAMETOOLONG if the result does not fit, or an error
 * from VOP_ACCESS/VOP_READLINK. *newsearchdir_ret is only written on
 * success.
 */
static inline int
namei_follow(struct namei_state *state, int inhibitmagic,
	     struct vnode *searchdir, struct vnode *foundobj,
	     struct vnode **newsearchdir_ret)
{
	struct nameidata *ndp = state->ndp;
	struct componentname *cnp = state->cnp;

	struct lwp *self = curlwp;	/* thread doing namei() */
	struct iovec aiov;		/* uio for reading symbolic links */
	struct uio auio;
	char *cp;			/* pointer into pathname argument */
	size_t linklen;
	int error;

	/* Bound the number of links followed in one namei call. */
	if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
		return ELOOP;
	}

	vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
	/* On MNT_SYMPERM mounts, following requires VEXEC on the link. */
	if (foundobj->v_mount->mnt_flag & MNT_SYMPERM) {
		error = VOP_ACCESS(foundobj, VEXEC, cnp->cn_cred);
		if (error != 0) {
			VOP_UNLOCK(foundobj);
			return error;
		}
	}

	/* FUTURE: fix this to not use a second buffer */
	cp = PNBUF_GET();
	/* Read up to MAXPATHLEN bytes of link text into cp. */
	aiov.iov_base = cp;
	aiov.iov_len = MAXPATHLEN;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_rw = UIO_READ;
	auio.uio_resid = MAXPATHLEN;
	UIO_SETUP_SYSSPACE(&auio);
	error = VOP_READLINK(foundobj, &auio, cnp->cn_cred);
	VOP_UNLOCK(foundobj);
	if (error) {
		PNBUF_PUT(cp);
		return error;
	}
	/* Bytes actually read; the text in cp is not NUL-terminated here. */
	linklen = MAXPATHLEN - auio.uio_resid;
	if (linklen == 0) {
		PNBUF_PUT(cp);
		return ENOENT;
	}

	/*
	 * Do symlink substitution, if appropriate, and
	 * check length for potential overflow.
	 *
	 * Inhibit symlink substitution for nfsd.
	 * XXX: This is how it was before; is that a bug or a feature?
	 */
	if ((!inhibitmagic && vfs_magiclinks &&
	     symlink_magic(self->l_proc, cp, &linklen)) ||
	    (linklen + ndp->ni_pathlen >= MAXPATHLEN)) {
		PNBUF_PUT(cp);
		return ENAMETOOLONG;
	}
	/* Append whatever part of the original path is still unparsed. */
	if (ndp->ni_pathlen > 1) {
		/* includes a null-terminator */
		memcpy(cp + linklen, ndp->ni_next, ndp->ni_pathlen);
	} else {
		cp[linklen] = '\0';
	}
	ndp->ni_pathlen += linklen;
	/* Install the combined string as the new working path. */
	memcpy(ndp->ni_pnbuf, cp, ndp->ni_pathlen);
	PNBUF_PUT(cp);

	/* we're now starting from the beginning of the buffer again */
	cnp->cn_nameptr = ndp->ni_pnbuf;

	/*
	 * Check if root directory should replace current directory.
	 */
	if (ndp->ni_pnbuf[0] == '/') {
		vrele(searchdir);
		/* Keep absolute symbolic links inside emulation root */
		searchdir = ndp->ni_erootdir;
		/* ...unless there is none, or the link begins with "/../". */
		if (searchdir == NULL ||
		    (ndp->ni_pnbuf[1] == '.'
		     && ndp->ni_pnbuf[2] == '.'
		     && ndp->ni_pnbuf[3] == '/')) {
			ndp->ni_erootdir = NULL;
			searchdir = ndp->ni_rootdir;
		}
		vref(searchdir);
		/* Skip the leading slashes, keeping ni_pathlen in step. */
		while (cnp->cn_nameptr[0] == '/') {
			cnp->cn_nameptr++;
			ndp->ni_pathlen--;
		}
	}

	*newsearchdir_ret = searchdir;
	return 0;
}
820 
821 //////////////////////////////
822 
823 /*
824  * Inspect the leading path component and update the state accordingly.
825  */
826 static int
827 lookup_parsepath(struct namei_state *state, struct vnode *searchdir)
828 {
829 	const char *cp;			/* pointer into pathname argument */
830 	int error;
831 
832 	struct componentname *cnp = state->cnp;
833 	struct nameidata *ndp = state->ndp;
834 
835 	KASSERT(cnp == &ndp->ni_cnd);
836 
837 	/*
838 	 * Search a new directory.
839 	 *
840 	 * The last component of the filename is left accessible via
841 	 * cnp->cn_nameptr for callers that need the name. Callers needing
842 	 * the name set the SAVENAME flag. When done, they assume
843 	 * responsibility for freeing the pathname buffer.
844 	 *
845 	 * At this point, our only vnode state is that the search dir
846 	 * is held.
847 	 */
848 	error = VOP_PARSEPATH(searchdir, cnp->cn_nameptr, &cnp->cn_namelen);
849 	if (error) {
850 		return error;
851 	}
852 	cp = cnp->cn_nameptr + cnp->cn_namelen;
853 	if (cnp->cn_namelen > KERNEL_NAME_MAX) {
854 		return ENAMETOOLONG;
855 	}
856 #ifdef NAMEI_DIAGNOSTIC
857 	{ char c = *cp;
858 	*(char *)cp = '\0';
859 	printf("{%s}: ", cnp->cn_nameptr);
860 	*(char *)cp = c; }
861 #endif /* NAMEI_DIAGNOSTIC */
862 	ndp->ni_pathlen -= cnp->cn_namelen;
863 	ndp->ni_next = cp;
864 	/*
865 	 * If this component is followed by a slash, then move the pointer to
866 	 * the next component forward, and remember that this component must be
867 	 * a directory.
868 	 */
869 	if (*cp == '/') {
870 		do {
871 			cp++;
872 		} while (*cp == '/');
873 		state->slashes = cp - ndp->ni_next;
874 		ndp->ni_pathlen -= state->slashes;
875 		ndp->ni_next = cp;
876 		cnp->cn_flags |= REQUIREDIR;
877 	} else {
878 		state->slashes = 0;
879 		cnp->cn_flags &= ~REQUIREDIR;
880 	}
881 	/*
882 	 * We do special processing on the last component, whether or not it's
883 	 * a directory.  Cache all intervening lookups, but not the final one.
884 	 */
885 	if (*cp == '\0') {
886 		if (state->docache)
887 			cnp->cn_flags |= MAKEENTRY;
888 		else
889 			cnp->cn_flags &= ~MAKEENTRY;
890 		cnp->cn_flags |= ISLASTCN;
891 	} else {
892 		cnp->cn_flags |= MAKEENTRY;
893 		cnp->cn_flags &= ~ISLASTCN;
894 	}
895 	if (cnp->cn_namelen == 2 &&
896 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
897 		cnp->cn_flags |= ISDOTDOT;
898 	else
899 		cnp->cn_flags &= ~ISDOTDOT;
900 
901 	return 0;
902 }
903 
/*
 * Take care of crossing a mounted-on vnode: while *foundobj_ret is a
 * directory with a file system mounted on it, replace it with the root
 * vnode of that file system.
 *
 * searchdir is unlocked for the duration.  If the walk ends on a
 * directory of the mounted file system, searchdir is released and
 * *searchdir_ret is set to NULL; otherwise, on success, searchdir is
 * re-locked with the lock type it had on entry.  *searchdir_locked is
 * updated to match.
 *
 * On error, the found object is vrele'd and *foundobj_ret is set to
 * NULL; searchdir is not re-locked.
 */
static int
lookup_crossmount(struct namei_state *state,
		  struct vnode **searchdir_ret,
		  struct vnode **foundobj_ret,
		  bool *searchdir_locked)
{
	struct componentname *cnp = state->cnp;
	struct vnode *foundobj, *vp;
	struct vnode *searchdir;
	struct mount *mp;
	int error, lktype;

	searchdir = *searchdir_ret;
	foundobj = *foundobj_ret;
	error = 0;

	KASSERT((cnp->cn_flags & NOCROSSMOUNT) == 0);

	/* First, unlock searchdir (oof). */
	if (*searchdir_locked) {
		KASSERT(searchdir != NULL);
		/* Remember how it was locked so it can be re-locked below. */
		lktype = VOP_ISLOCKED(searchdir);
		VOP_UNLOCK(searchdir);
		*searchdir_locked = false;
	} else {
		lktype = LK_NONE;
	}

	/*
	 * Do an unlocked check to see if the vnode has been mounted on; if
	 * so find the root of the mounted file system.
	 */
	while (foundobj->v_type == VDIR &&
	    (mp = foundobj->v_mountedhere) != NULL &&
	    (cnp->cn_flags & NOCROSSMOUNT) == 0) {
		KASSERTMSG(searchdir != foundobj, "same vn %p", searchdir);

		/*
		 * Try the namecache first.  If that doesn't work, do
		 * it the hard way.
		 */
		if (cache_lookup_mount(foundobj, &vp)) {
			/* Cache hit: swap the covered vnode for the root. */
			vrele(foundobj);
			foundobj = vp;
		} else {
			/* First get the vnode stable. */
			error = vn_lock(foundobj, LK_SHARED);
			if (error != 0) {
				vrele(foundobj);
				foundobj = NULL;
				break;
			}

			/*
			 * Check to see if something is still mounted on it.
			 */
			if ((mp = foundobj->v_mountedhere) == NULL) {
				/* No longer mounted on; foundobj stands. */
				VOP_UNLOCK(foundobj);
				break;
			}

			/*
			 * Get a reference to the mountpoint, and unlock
			 * foundobj.
			 */
			error = vfs_busy(mp);
			VOP_UNLOCK(foundobj);
			if (error != 0) {
				vrele(foundobj);
				foundobj = NULL;
				break;
			}

			/*
			 * Now get a reference on the root vnode.
			 * XXX Future - maybe allow only VDIR here.
			 */
			error = VFS_ROOT(mp, LK_NONE, &vp);

			/*
			 * If successful, enter it into the cache while
			 * holding the mount busy (competing with unmount).
			 */
			if (error == 0) {
				cache_enter_mount(foundobj, vp);
			}

			/* Finally, drop references to foundobj & mountpoint. */
			vrele(foundobj);
			vfs_unbusy(mp);
			if (error) {
				foundobj = NULL;
				break;
			}
			foundobj = vp;
		}

		/*
		 * Avoid locking vnodes from two filesystems because
		 * it's prone to deadlock, e.g. when using puffs.
		 * Also, it isn't a good idea to propagate slowness of
		 * a filesystem up to the root directory. For now,
		 * only handle the common case, where foundobj is
		 * VDIR.
		 *
		 * In this case set searchdir to null to avoid using
		 * it again. It is not correct to set searchdir ==
		 * foundobj here as that will confuse the caller.
		 * (See PR 40740.)
		 */
		if (searchdir == NULL) {
			/* already been here once; do nothing further */
		} else if (foundobj->v_type == VDIR) {
			vrele(searchdir);
			*searchdir_ret = searchdir = NULL;
			/* Nothing left to re-lock after the loop. */
			lktype = LK_NONE;
		}
	}

	/* If searchdir is still around, re-lock it. */
	if (error == 0 && lktype != LK_NONE) {
		vn_lock(searchdir, lktype | LK_RETRY);
		*searchdir_locked = true;
	}
	*foundobj_ret = foundobj;
	return error;
}
1035 
1036 /*
1037  * Determine the desired locking mode for the directory of a lookup.
1038  */
1039 static int
1040 lookup_lktype(struct vnode *searchdir, struct componentname *cnp)
1041 {
1042 
1043 	/*
1044 	 * If the file system supports VOP_LOOKUP() with a shared lock, and
1045 	 * we are not making any modifications (nameiop LOOKUP) or this is
1046 	 * not the last component then get a shared lock.  Where we can't do
1047 	 * fast-forwarded lookups (for example with layered file systems)
1048 	 * then this is the fallback for reducing lock contention.
1049 	 */
1050 	if ((searchdir->v_mount->mnt_iflag & IMNT_SHRLOOKUP) != 0 &&
1051 	    (cnp->cn_nameiop == LOOKUP || (cnp->cn_flags & ISLASTCN) == 0)) {
1052 		return LK_SHARED;
1053 	} else {
1054 		return LK_EXCLUSIVE;
1055 	}
1056 }
1057 
1058 /*
1059  * Call VOP_LOOKUP for a single lookup; return a new search directory
1060  * (used when crossing mountpoints up or searching union mounts down) and
1061  * the found object, which for create operations may be NULL on success.
1062  *
1063  * Note that the new search directory may be null, which means the
1064  * searchdir was unlocked and released. This happens in the common case
1065  * when crossing a mount point downwards, in order to avoid coupling
1066  * locks between different file system volumes. Importantly, this can
1067  * happen even if the call fails. (XXX: this is gross and should be
1068  * tidied somehow.)
 *
 * Arguments:
 *   state                   - lookup state; supplies the component name
 *                             (state->cnp) and the nameidata (state->ndp).
 *   searchdir               - directory to search.  It is held (referenced)
 *                             on entry; this function takes the lock itself.
 *   newsearchdir_ret        - out: the directory the caller should carry on
 *                             with.  Starts out as searchdir and is updated
 *                             every time searchdir is replaced below.
 *   foundobj_ret            - out: the vnode that was found, or NULL when
 *                             VOP_LOOKUP answered EJUSTRETURN and the entry
 *                             may be created.  Not written on error paths.
 *   newsearchdir_locked_ret - out: whether the search directory is locked
 *                             on return.  Always written.
 *
 * Returns 0 on success or an errno value.
1069  */
1070 static int
1071 lookup_once(struct namei_state *state,
1072 	    struct vnode *searchdir,
1073 	    struct vnode **newsearchdir_ret,
1074 	    struct vnode **foundobj_ret,
1075 	    bool *newsearchdir_locked_ret)
1076 {
1077 	struct vnode *tmpvn;		/* scratch vnode */
1078 	struct vnode *foundobj;		/* result */
1079 	struct lwp *l = curlwp;
1080 	bool searchdir_locked = false;
1081 	int error, lktype;
1082 
1083 	struct componentname *cnp = state->cnp;
1084 	struct nameidata *ndp = state->ndp;
1085 
1086 	KASSERT(cnp == &ndp->ni_cnd);
1087 	*newsearchdir_ret = searchdir;
1088 
1089 	/*
1090 	 * Handle "..": two special cases.
1091 	 * 1. If at root directory (e.g. after chroot)
1092 	 *    or at absolute root directory
1093 	 *    then ignore it so can't get out.
1094 	 * 1a. If at the root of the emulation filesystem go to the real
1095 	 *    root. So "/../<path>" is always absolute.
1096 	 * 1b. If we have somehow gotten out of a jail, warn
1097 	 *    and also ignore it so we can't get farther out.
1098 	 * 2. If this vnode is the root of a mounted
1099 	 *    filesystem, then replace it with the
1100 	 *    vnode which was mounted on so we take the
1101 	 *    .. in the other file system.
1102 	 */
1103 	if (cnp->cn_flags & ISDOTDOT) {
1104 		struct proc *p = l->l_proc;
1105 
1106 		for (;;) {
1107 			if (searchdir == ndp->ni_rootdir ||
1108 			    searchdir == rootvnode) {
				/* Case 1: ".." at a root resolves to itself. */
1109 				foundobj = searchdir;
1110 				vref(foundobj);
1111 				*foundobj_ret = foundobj;
1112 				if (cnp->cn_flags & LOCKPARENT) {
1113 					lktype = lookup_lktype(searchdir, cnp);
1114 					vn_lock(searchdir, lktype | LK_RETRY);
1115 					searchdir_locked = true;
1116 				}
1117 				error = 0;
1118 				goto done;
1119 			}
1120 			if (ndp->ni_rootdir != rootvnode) {
1121 				int retval;
1122 
1123 				retval = vn_isunder(searchdir, ndp->ni_rootdir, l);
1124 				if (!retval) {
1125 				    /* Oops! We got out of jail! */
1126 				    log(LOG_WARNING,
1127 					"chrooted pid %d uid %d (%s) "
1128 					"detected outside of its chroot\n",
1129 					p->p_pid, kauth_cred_geteuid(l->l_cred),
1130 					p->p_comm);
1131 				    /* Put us at the jail root. */
1132 				    vrele(searchdir);
1133 				    searchdir = NULL;
1134 				    foundobj = ndp->ni_rootdir;
				    /*
				     * Two references: the vnode is handed
				     * back through both return pointers.
				     */
1135 				    vref(foundobj);
1136 				    vref(foundobj);
1137 				    *newsearchdir_ret = foundobj;
1138 				    *foundobj_ret = foundobj;
1139 				    error = 0;
1140 				    goto done;
1141 				}
1142 			}
1143 			if ((searchdir->v_vflag & VV_ROOT) == 0 ||
1144 			    (cnp->cn_flags & NOCROSSMOUNT))
1145 				break;
			/* Case 2: step up to the vnode this mount covers. */
1146 			tmpvn = searchdir;
1147 			searchdir = searchdir->v_mount->mnt_vnodecovered;
1148 			vref(searchdir);
1149 			vrele(tmpvn);
1150 			*newsearchdir_ret = searchdir;
1151 		}
1152 	}
1153 
1154 	lktype = lookup_lktype(searchdir, cnp);
1155 
1156 	/*
1157 	 * We now have a segment name to search for, and a directory to search.
1158 	 * Our vnode state here is that "searchdir" is held.
1159 	 */
1160 unionlookup:
1161 	foundobj = NULL;
1162 	if (!searchdir_locked) {
1163 		vn_lock(searchdir, lktype | LK_RETRY);
1164 		searchdir_locked = true;
1165 	}
1166 	error = VOP_LOOKUP(searchdir, &foundobj, cnp);
1167 
1168 	if (error != 0) {
1169 		KASSERTMSG((foundobj == NULL),
1170 		    "leaf `%s' should be empty but is %p",
1171 		    cnp->cn_nameptr, foundobj);
1172 #ifdef NAMEI_DIAGNOSTIC
1173 		printf("not found\n");
1174 #endif /* NAMEI_DIAGNOSTIC */
1175 
1176 		/*
1177 		 * If ENOLCK, the file system needs us to retry the lookup
1178 		 * with an exclusive lock.  It's likely nothing was found in
1179 		 * cache and/or modifications need to be made.
1180 		 */
1181 		if (error == ENOLCK) {
1182 			KASSERT(VOP_ISLOCKED(searchdir) == LK_SHARED);
1183 			KASSERT(searchdir_locked);
			/*
			 * Try to upgrade the lock in place with LK_NOWAIT;
			 * if that fails, unlock so that the retry below
			 * takes the lock afresh in exclusive mode.
			 */
1184 			if (vn_lock(searchdir, LK_UPGRADE | LK_NOWAIT)) {
1185 				VOP_UNLOCK(searchdir);
1186 				searchdir_locked = false;
1187 			}
1188 			lktype = LK_EXCLUSIVE;
1189 			goto unionlookup;
1190 		}
1191 
1192 		if ((error == ENOENT) &&
1193 		    (searchdir->v_vflag & VV_ROOT) &&
1194 		    (searchdir->v_mount->mnt_flag & MNT_UNION)) {
			/*
			 * Nothing by that name in the root of a union
			 * mount: drop this directory and retry in the
			 * directory the mount covers.
			 */
1195 			tmpvn = searchdir;
1196 			searchdir = searchdir->v_mount->mnt_vnodecovered;
1197 			vref(searchdir);
1198 			vput(tmpvn);
1199 			searchdir_locked = false;
1200 			*newsearchdir_ret = searchdir;
1201 			goto unionlookup;
1202 		}
1203 
1204 		if (error != EJUSTRETURN)
1205 			goto done;
1206 
1207 		/*
1208 		 * If this was not the last component, or there were trailing
1209 		 * slashes, and we are not going to create a directory,
1210 		 * then the name must exist.
1211 		 */
1212 		if ((cnp->cn_flags & (REQUIREDIR | CREATEDIR)) == REQUIREDIR) {
1213 			error = ENOENT;
1214 			goto done;
1215 		}
1216 
1217 		/*
1218 		 * If creating and at end of pathname, then can consider
1219 		 * allowing file to be created.
1220 		 */
1221 		if (state->rdonly) {
1222 			error = EROFS;
1223 			goto done;
1224 		}
1225 
1226 		/*
1227 		 * We return success and a NULL foundobj to indicate
1228 		 * that the entry doesn't currently exist, leaving a
1229 		 * pointer to the (normally, locked) directory vnode
1230 		 * as searchdir.
1231 		 */
1232 		*foundobj_ret = NULL;
1233 		error = 0;
1234 		goto done;
1235 	}
1236 #ifdef NAMEI_DIAGNOSTIC
1237 	printf("found\n");
1238 #endif /* NAMEI_DIAGNOSTIC */
1239 
1240 	/* Unlock, unless the caller needs the parent locked. */
1241 	if (searchdir != NULL) {
1242 		KASSERT(searchdir_locked);
1243 		if ((cnp->cn_flags & (ISLASTCN | LOCKPARENT)) !=
1244 		    (ISLASTCN | LOCKPARENT)) {
1245 		    	VOP_UNLOCK(searchdir);
1246 		    	searchdir_locked = false;
1247 		}
1248 	} else {
1249 		KASSERT(!searchdir_locked);
1250 	}
1251 
1252 	*foundobj_ret = foundobj;
1253 	error = 0;
1254 done:
	/* Every exit path reports the final lock state to the caller. */
1255 	*newsearchdir_locked_ret = searchdir_locked;
1256 	return error;
1257 }
1258 
1259 /*
1260  * Parse out the first path name component that we need to consider.
1261  *
1262  * While doing this, attempt to use the name cache to fast-forward through
1263  * as many "easy" to find components of the path as possible.
1264  *
1265  * We use the namecache's node locks to form a chain, and avoid as many
1266  * vnode references and locks as possible.  In the ideal case, only the
1267  * final vnode will have its reference count adjusted and lock taken.
 *
 * Arguments:
 *   state         - lookup state; supplies the component name
 *                   (state->cnp) and the nameidata (state->ndp).
 *   searchdir_ret - in: the referenced directory to start from.
 *                   out: the directory the walk stopped in; left as
 *                   passed in if the walk did not move or was rolled
 *                   back, and set to NULL (reference released) when the
 *                   last component was settled here ("terminal").
 *   foundobj_ret  - out: the referenced vnode that was found; NULL
 *                   whenever a nonzero error is returned.
 *
 * Returns 0 when a vnode was found, ENOENT for a negative cache hit,
 * EOPNOTSUPP when the name cache cannot give an answer and the caller
 * should fall back to lookup_once(), or an error from lookup_parsepath().
1268  */
1269 static int
1270 lookup_fastforward(struct namei_state *state, struct vnode **searchdir_ret,
1271 		   struct vnode **foundobj_ret)
1272 {
1273 	struct componentname *cnp = state->cnp;
1274 	struct nameidata *ndp = state->ndp;
1275 	krwlock_t *plock;
1276 	struct vnode *foundobj, *searchdir;
1277 	int error, error2;
1278 	size_t oldpathlen;
1279 	const char *oldnameptr;
1280 	bool terminal;
1281 
1282 	/*
1283 	 * Eat as many path name components as possible before giving up and
1284 	 * letting lookup_once() handle it.  Remember the starting point in
1285 	 * case we can't get vnode references and need to roll back.
1286 	 */
1287 	plock = NULL;
1288 	searchdir = *searchdir_ret;
1289 	oldnameptr = cnp->cn_nameptr;
1290 	oldpathlen = ndp->ni_pathlen;
1291 	terminal = false;
1292 	for (;;) {
1293 		foundobj = NULL;
1294 
1295 		/*
1296 		 * Get the next component name.  There should be no slashes
1297 		 * here, and we shouldn't have looped around if we were
1298 		 * done.
1299 		 */
1300 		KASSERT(cnp->cn_nameptr[0] != '/');
1301 		KASSERT(cnp->cn_nameptr[0] != '\0');
1302 		if ((error = lookup_parsepath(state, searchdir)) != 0) {
1303 			break;
1304 		}
1305 
1306 		/*
1307 		 * Can't deal with DOTDOT lookups if NOCROSSMOUNT or the
1308 		 * lookup is chrooted.
1309 		 */
1310 		if ((cnp->cn_flags & ISDOTDOT) != 0) {
1311 			if ((searchdir->v_vflag & VV_ROOT) != 0 &&
1312 			    (cnp->cn_flags & NOCROSSMOUNT)) {
1313 			    	error = EOPNOTSUPP;
1314 				break;
1315 			}
1316 			if (ndp->ni_rootdir != rootvnode) {
1317 			    	error = EOPNOTSUPP;
1318 				break;
1319 			}
1320 		}
1321 
1322 		/*
1323 		 * Can't deal with last component when modifying; this needs
1324 		 * searchdir locked and VOP_LOOKUP() called (which can and
1325 		 * does modify state, despite the name).  NB: this case means
1326 		 * terminal is never set true when LOCKPARENT.
1327 		 */
1328 		if ((cnp->cn_flags & ISLASTCN) != 0) {
1329 			if (cnp->cn_nameiop != LOOKUP ||
1330 			    (cnp->cn_flags & LOCKPARENT) != 0) {
1331 				error = EOPNOTSUPP;
1332 				break;
1333 			}
1334 		}
1335 
1336 		/*
1337 		 * Good, now look for it in cache.  cache_lookup_linked()
1338 		 * will fail if there's nothing there, or if there's no
1339 		 * ownership info for the directory, or if the user doesn't
1340 		 * have permission to look up files in this directory.
1341 		 */
1342 		if (!cache_lookup_linked(searchdir, cnp->cn_nameptr,
1343 		    cnp->cn_namelen, &foundobj, &plock, cnp->cn_cred)) {
1344 			error = EOPNOTSUPP;
1345 			break;
1346 		}
1347 		KASSERT(plock != NULL && rw_lock_held(plock));
1348 
1349 		/*
1350 		 * Scored a hit.  Negative is good too (ENOENT).  If there's
1351 		 * a '-o union' mount here, punt and let lookup_once() deal
1352 		 * with it.
1353 		 */
1354 		if (foundobj == NULL) {
1355 			if ((searchdir->v_vflag & VV_ROOT) != 0 &&
1356 			    (searchdir->v_mount->mnt_flag & MNT_UNION) != 0) {
1357 			    	error = EOPNOTSUPP;
1358 			} else {
1359 				error = ENOENT;
1360 				terminal = ((cnp->cn_flags & ISLASTCN) != 0);
1361 			}
1362 			break;
1363 		}
1364 
1365 		/*
1366 		 * Stop and get a hold on the vnode if we've encountered
1367 		 * something other than a directory.
1368 		 */
1369 		if (foundobj->v_type != VDIR) {
1370 			error = vcache_tryvget(foundobj);
1371 			if (error != 0) {
1372 				foundobj = NULL;
1373 				error = EOPNOTSUPP;
1374 			} else {
1375 				terminal = (foundobj->v_type != VLNK &&
1376 				    (cnp->cn_flags & ISLASTCN) != 0);
1377 			}
1378 			break;
1379 		}
1380 
1381 		/*
1382 		 * Try to cross mountpoints, bearing in mind that they can
1383 		 * be stacked.  If at any point we can't go further, stop
1384 		 * and try to get a reference on the vnode.  If we are able
1385 		 * to get a ref then lookup_crossmount() will take care of
1386 		 * it, otherwise we'll fall through to lookup_once().
1387 		 */
1388 		if (foundobj->v_mountedhere != NULL) {
1389 			while (foundobj->v_mountedhere != NULL &&
1390 			    (cnp->cn_flags & NOCROSSMOUNT) == 0 &&
1391 			    cache_cross_mount(&foundobj, &plock)) {
1392 				KASSERT(foundobj != NULL);
1393 				KASSERT(foundobj->v_type == VDIR);
1394 			}
1395 			if (foundobj->v_mountedhere != NULL) {
1396 				error = vcache_tryvget(foundobj);
1397 				if (error != 0) {
1398 					foundobj = NULL;
1399 					error = EOPNOTSUPP;
1400 				}
1401 				break;
1402 			} else {
				/*
				 * All mounts were crossed, so the old
				 * searchdir is no longer the parent of
				 * foundobj; forget it.
				 */
1403 				searchdir = NULL;
1404 			}
1405 		}
1406 
1407 		/*
1408 		 * Time to stop if we found the last component & traversed
1409 		 * all mounts.
1410 		 */
1411 		if ((cnp->cn_flags & ISLASTCN) != 0) {
1412 			error = vcache_tryvget(foundobj);
1413 			if (error != 0) {
1414 				foundobj = NULL;
1415 				error = EOPNOTSUPP;
1416 			} else {
1417 				terminal = (foundobj->v_type != VLNK);
1418 			}
1419 			break;
1420 		}
1421 
1422 		/*
1423 		 * Otherwise, we're still in business.  Set the found VDIR
1424 		 * vnode as the search dir for the next component and
1425 		 * continue on to it.
1426 		 */
1427 		cnp->cn_nameptr = ndp->ni_next;
1428 		searchdir = foundobj;
1429 	}
1430 
1431 	if (terminal) {
1432 		/*
1433 		 * If we exited the loop above having successfully located
1434 		 * the last component with a zero error code, and it's not a
1435 		 * symbolic link, then the parent directory is not needed.
1436 		 * Release reference to the starting parent and make the
1437 		 * terminal parent disappear into thin air.
		 *
		 * (terminal is also set for a negative cache hit on the
		 * last component, in which case error is ENOENT.)
1438 		 */
1439 		KASSERT(plock != NULL);
1440 		rw_exit(plock);
1441 		vrele(*searchdir_ret);
1442 		*searchdir_ret = NULL;
1443 	} else if (searchdir != *searchdir_ret) {
1444 		/*
1445 		 * Otherwise we need to return the parent.  If we ended up
1446 		 * with a new search dir, ref it before dropping the
1447 		 * namecache's lock.  The lock prevents both searchdir and
1448 		 * foundobj from disappearing.  If we can't ref the new
1449 		 * searchdir, we have a bit of a problem.  Roll back the
1450 		 * fastforward to the beginning and let lookup_once() take
1451 		 * care of it.
1452 		 */
1453 		if (searchdir == NULL) {
1454 			/*
1455 			 * It's possible for searchdir to be NULL in the
1456 			 * case of a root vnode being reclaimed while
1457 			 * trying to cross a mount.
1458 			 */
1459 			error2 = EOPNOTSUPP;
1460 		} else {
1461 			error2 = vcache_tryvget(searchdir);
1462 		}
1463 		KASSERT(plock != NULL);
1464 		rw_exit(plock);
1465 		if (__predict_true(error2 == 0)) {
1466 			/* Returning new searchdir, and maybe new foundobj. */
1467 			vrele(*searchdir_ret);
1468 			*searchdir_ret = searchdir;
1469 		} else {
1470 			/* Returning nothing. */
1471 			if (foundobj != NULL) {
1472 				vrele(foundobj);
1473 				foundobj = NULL;
1474 			}
			/* Rewind the path cursor to where we started. */
1475 			cnp->cn_nameptr = oldnameptr;
1476 			ndp->ni_pathlen = oldpathlen;
1477 			if (searchdir == NULL) {
1478 				error = EOPNOTSUPP;
1479 			} else {
1480 				error = lookup_parsepath(state, searchdir);
1481 				if (error == 0) {
1482 					error = EOPNOTSUPP;
1483 				}
1484 			}
1485 		}
1486 	} else if (plock != NULL) {
1487 		/* Drop any namecache lock still held. */
1488 		rw_exit(plock);
1489 	}
1490 
1491 	KASSERT(error == 0 ? foundobj != NULL : foundobj == NULL);
1492 	*foundobj_ret = foundobj;
1493 	return error;
1494 }
1495 
1496 //////////////////////////////
1497 
1498 /*
1499  * Do a complete path search from a single root directory.
1500  * (This is called up to twice if TRYEMULROOT is in effect.)
 *
 * Arguments:
 *   state        - lookup state; supplies the component name (state->cnp)
 *                  and the nameidata (state->ndp).
 *   neverfollow  - if nonzero, fail with EINVAL instead of following
 *                  when namei_atsymlink() reports a symlink to follow.
 *   inhibitmagic - passed through to namei_follow().
 *   isnfsd       - passed through to namei_start().
 *
 * On success returns 0 with the result in ndp->ni_vp and the parent
 * directory in ndp->ni_dvp; the parent is only returned when LOCKPARENT
 * is set, and either pointer may be NULL.  On failure returns an errno
 * value with both pointers set to NULL.  Most failure paths also set
 * state->attempt_retry so the caller may retry from the normal root.
1501  */
1502 static int
1503 namei_oneroot(struct namei_state *state,
1504 	 int neverfollow, int inhibitmagic, int isnfsd)
1505 {
1506 	struct nameidata *ndp = state->ndp;
1507 	struct componentname *cnp = state->cnp;
1508 	struct vnode *searchdir, *foundobj;
1509 	bool searchdir_locked = false;
1510 	int error;
1511 
1512 	error = namei_start(state, isnfsd, &searchdir);
1513 	if (error) {
1514 		ndp->ni_dvp = NULL;
1515 		ndp->ni_vp = NULL;
1516 		return error;
1517 	}
1518 	KASSERT(searchdir->v_type == VDIR);
1519 
1520 	/*
1521 	 * Setup: break out flag bits into variables.
1522 	 */
1523 	state->docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
1524 	if (cnp->cn_nameiop == DELETE)
1525 		state->docache = 0;
1526 	state->rdonly = cnp->cn_flags & RDONLY;
1527 
1528 	/*
1529 	 * Keep going until we run out of path components.
1530 	 */
1531 	cnp->cn_nameptr = ndp->ni_pnbuf;
1532 
1533 	/* drop leading slashes (already used them to choose startdir) */
1534 	while (cnp->cn_nameptr[0] == '/') {
1535 		cnp->cn_nameptr++;
1536 		ndp->ni_pathlen--;
1537 	}
1538 	/* was it just "/"? */
1539 	if (cnp->cn_nameptr[0] == '\0') {
1540 		foundobj = searchdir;
1541 		searchdir = NULL;
1542 		cnp->cn_flags |= ISLASTCN;
1543 
1544 		/* bleh */
1545 		goto skiploop;
1546 	}
1547 
1548 	for (;;) {
1549 		KASSERT(searchdir != NULL);
1550 		KASSERT(!searchdir_locked);
1551 
1552 		/*
1553 		 * Parse out the first path name component that we need to
1554 		 * consider.  While doing this, attempt to use the name
1555 		 * cache to fast-forward through as many "easy" to find
1556 		 * components of the path as possible.
1557 		 */
1558 		error = lookup_fastforward(state, &searchdir, &foundobj);
1559 
1560 		/*
1561 		 * If we didn't get a good answer from the namecache, then
1562 		 * go directly to the file system.
1563 		 */
1564 		if (error == EOPNOTSUPP) {
1565 			error = lookup_once(state, searchdir, &searchdir,
1566 			    &foundobj, &searchdir_locked);
1567 		}
1568 
1569 		/*
1570 		 * If the vnode we found is mounted on, then cross the mount
1571 		 * and get the root vnode in foundobj.  If this encounters
1572 		 * an error, it will dispose of foundobj, but searchdir is
1573 		 * untouched.
1574 		 */
1575 		if (error == 0 && foundobj != NULL &&
1576 		    foundobj->v_type == VDIR &&
1577 		    foundobj->v_mountedhere != NULL &&
1578 		    (cnp->cn_flags & NOCROSSMOUNT) == 0) {
1579 		    	error = lookup_crossmount(state, &searchdir,
1580 		    	    &foundobj, &searchdir_locked);
1581 		}
1582 
1583 		if (error) {
1584 			if (searchdir != NULL) {
1585 				if (searchdir_locked) {
1586 					searchdir_locked = false;
1587 					vput(searchdir);
1588 				} else {
1589 					vrele(searchdir);
1590 				}
1591 			}
1592 			ndp->ni_dvp = NULL;
1593 			ndp->ni_vp = NULL;
1594 			/*
1595 			 * Note that if we're doing TRYEMULROOT we can
1596 			 * retry with the normal root. Where this is
1597 			 * currently set matches previous practice,
1598 			 * but the previous practice didn't make much
1599 			 * sense and somebody should sit down and
1600 			 * figure out which cases should cause retry
1601 			 * and which shouldn't. XXX.
1602 			 */
1603 			state->attempt_retry = 1;
1604 			return (error);
1605 		}
1606 
1607 		if (foundobj == NULL) {
1608 			/*
1609 			 * Success with no object returned means we're
1610 			 * creating something and it isn't already
1611 			 * there. Break out of the main loop now so
1612 			 * the code below doesn't have to test for
1613 			 * foundobj == NULL.
1614 			 */
1615 			/* lookup_once can't have dropped the searchdir */
1616 			KASSERT(searchdir != NULL ||
1617 			    (cnp->cn_flags & ISLASTCN) != 0);
1618 			break;
1619 		}
1620 
1621 		/*
1622 		 * Check for symbolic link. If we've reached one,
1623 		 * follow it, unless we aren't supposed to. Back up
1624 		 * over any slashes that we skipped, as we will need
1625 		 * them again.
1626 		 */
1627 		if (namei_atsymlink(state, foundobj)) {
1628 			/* Don't need searchdir locked any more. */
1629 			if (searchdir_locked) {
1630 				searchdir_locked = false;
1631 				VOP_UNLOCK(searchdir);
1632 			}
1633 			ndp->ni_pathlen += state->slashes;
1634 			ndp->ni_next -= state->slashes;
1635 			if (neverfollow) {
1636 				error = EINVAL;
1637 			} else if (searchdir == NULL) {
1638 				/*
1639 				 * dholland 20160410: lookup_once only
1640 				 * drops searchdir if it crossed a
1641 				 * mount point. Therefore, if we get
1642 				 * here it means we crossed a mount
1643 				 * point to a mounted filesystem whose
1644 				 * root vnode is a symlink. In theory
1645 				 * we could continue at this point by
1646 				 * using the pre-crossing searchdir
1647 				 * (e.g. just take out an extra
1648 				 * reference on it before calling
1649 				 * lookup_once so we still have it),
1650 				 * but this will make an ugly mess and
1651 				 * it should never happen in practice
1652 				 * as only badly broken filesystems
1653 				 * have non-directory root vnodes. (I
1654 				 * have seen this sort of thing with
1655 				 * NFS occasionally but even then it
1656 				 * means something's badly wrong.)
1657 				 */
1658 				error = ENOTDIR;
1659 			} else {
1660 				/*
1661 				 * dholland 20110410: if we're at a
1662 				 * union mount it might make sense to
1663 				 * use the top of the union stack here
1664 				 * rather than the layer we found the
1665 				 * symlink in. (FUTURE)
1666 				 */
1667 				error = namei_follow(state, inhibitmagic,
1668 						     searchdir, foundobj,
1669 						     &searchdir);
1670 			}
1671 			if (error) {
				/* Note: attempt_retry is not set on this path. */
1672 				KASSERT(searchdir != foundobj);
1673 				if (searchdir != NULL) {
1674 					vrele(searchdir);
1675 				}
1676 				vrele(foundobj);
1677 				ndp->ni_dvp = NULL;
1678 				ndp->ni_vp = NULL;
1679 				return error;
1680 			}
1681 			vrele(foundobj);
1682 			foundobj = NULL;
1683 
1684 			/*
1685 			 * If we followed a symlink to `/' and there
1686 			 * are no more components after the symlink,
1687 			 * we're done with the loop and what we found
1688 			 * is the searchdir.
1689 			 */
1690 			if (cnp->cn_nameptr[0] == '\0') {
1691 				KASSERT(searchdir != NULL);
1692 				foundobj = searchdir;
1693 				searchdir = NULL;
1694 				cnp->cn_flags |= ISLASTCN;
1695 				break;
1696 			}
1697 
1698 			continue;
1699 		}
1700 
1701 		/*
1702 		 * Not a symbolic link.
1703 		 *
1704 		 * Check for directory, if the component was
1705 		 * followed by a series of slashes.
1706 		 */
1707 		if ((foundobj->v_type != VDIR) &&
1708 		    (cnp->cn_flags & REQUIREDIR)) {
1709 			KASSERT(foundobj != searchdir);
1710 			if (searchdir) {
1711 				if (searchdir_locked) {
1712 					searchdir_locked = false;
1713 					vput(searchdir);
1714 				} else {
1715 					vrele(searchdir);
1716 				}
1717 			} else {
1718 				KASSERT(!searchdir_locked);
1719 			}
1720 			vrele(foundobj);
1721 			ndp->ni_dvp = NULL;
1722 			ndp->ni_vp = NULL;
1723 			state->attempt_retry = 1;
1724 			return ENOTDIR;
1725 		}
1726 
1727 		/*
1728 		 * Stop if we've reached the last component.
1729 		 */
1730 		if (cnp->cn_flags & ISLASTCN) {
1731 			break;
1732 		}
1733 
1734 		/*
1735 		 * Continue with the next component.
		 * The object just found becomes the new search directory
		 * and the old one is released.
1736 		 */
1737 		cnp->cn_nameptr = ndp->ni_next;
1738 		if (searchdir != NULL) {
1739 			if (searchdir_locked) {
1740 				searchdir_locked = false;
1741 				vput(searchdir);
1742 			} else {
1743 				vrele(searchdir);
1744 			}
1745 		}
1746 		searchdir = foundobj;
1747 		foundobj = NULL;
1748 	}
1749 
1750 	KASSERT((cnp->cn_flags & LOCKPARENT) == 0 || searchdir == NULL ||
1751 	    VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE);
1752 
1753  skiploop:
1754 
1755 	if (foundobj != NULL) {
1756 		if (foundobj == ndp->ni_erootdir) {
1757 			/*
1758 			 * We are about to return the emulation root.
1759 			 * This isn't a good idea because code might
1760 			 * repeatedly lookup ".." until the file
1761 			 * matches that returned for "/" and loop
1762 			 * forever.  So convert it to the real root.
1763 			 */
1764 			if (searchdir != NULL) {
1765 				if (searchdir_locked) {
1766 					vput(searchdir);
1767 					searchdir_locked = false;
1768 				} else {
1769 					vrele(searchdir);
1770 				}
1771 				searchdir = NULL;
1772 			}
1773 			vrele(foundobj);
1774 			foundobj = ndp->ni_rootdir;
1775 			vref(foundobj);
1776 		}
1777 
1778 		/*
1779 		 * If the caller requested the parent node (i.e. it's
1780 		 * a CREATE, DELETE, or RENAME), and we don't have one
1781 		 * (because this is the root directory, or we crossed
1782 		 * a mount point), then we must fail.
1783 		 *
1784 		 * 20210604 dholland when NONEXCLHACK is set (open
1785 		 * with O_CREAT but not O_EXCL) skip this logic. Since
1786 		 * we have a foundobj, open will not be creating, so
1787 		 * it doesn't actually need or use the searchdir, so
1788 		 * it's ok to return it even if it's on a different
1789 		 * volume, and it's also ok to return NULL; by setting
1790 		 * NONEXCLHACK the open code promises to cope with
1791 		 * those cases correctly. (That is, it should do what
1792 		 * it would do anyway, that is, just release the
1793 		 * searchdir, except not crash if it's null.) This is
1794 		 * needed because otherwise opening mountpoints with
1795 		 * O_CREAT but not O_EXCL fails... which is a silly
1796 		 * thing to do but ought to work. (This whole issue
1797 		 * came to light because 3rd party code wanted to open
1798 		 * certain procfs nodes with O_CREAT for some 3rd
1799 		 * party reason, and it failed.)
1800 		 *
1801 		 * Note that NONEXCLHACK is properly a different
1802 		 * nameiop (it is partway between LOOKUP and CREATE)
1803 		 * but it was stuffed in as a flag instead to make the
1804 		 * resulting patch less invasive for pullup. Blah.
1805 		 */
1806 		if (cnp->cn_nameiop != LOOKUP &&
1807 		    (searchdir == NULL ||
1808 		     searchdir->v_mount != foundobj->v_mount) &&
1809 		    (cnp->cn_flags & NONEXCLHACK) == 0) {
1810 			if (searchdir) {
1811 				if (searchdir_locked) {
1812 					vput(searchdir);
1813 					searchdir_locked = false;
1814 				} else {
1815 					vrele(searchdir);
1816 				}
1817 				searchdir = NULL;
1818 			}
1819 			vrele(foundobj);
1820 			foundobj = NULL;
1821 			ndp->ni_dvp = NULL;
1822 			ndp->ni_vp = NULL;
1823 			state->attempt_retry = 1;
1824 
1825 			switch (cnp->cn_nameiop) {
1826 			    case CREATE:
1827 				return EEXIST;
1828 			    case DELETE:
1829 			    case RENAME:
1830 				return EBUSY;
1831 			    default:
1832 				break;
1833 			}
1834 			panic("Invalid nameiop\n");
1835 		}
1836 
1837 		/*
1838 		 * Disallow directory write attempts on read-only lookups.
1839 		 * Prefers EEXIST over EROFS for the CREATE case.
1840 		 */
1841 		if (state->rdonly &&
1842 		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
1843 			if (searchdir) {
1844 				if (searchdir_locked) {
1845 					vput(searchdir);
1846 					searchdir_locked = false;
1847 				} else {
1848 					vrele(searchdir);
1849 				}
1850 				searchdir = NULL;
1851 			}
1852 			vrele(foundobj);
1853 			foundobj = NULL;
1854 			ndp->ni_dvp = NULL;
1855 			ndp->ni_vp = NULL;
1856 			state->attempt_retry = 1;
1857 			return EROFS;
1858 		}
1859 
1860 		/* Lock the leaf node if requested. */
1861 		if ((cnp->cn_flags & (LOCKLEAF | LOCKPARENT)) == LOCKPARENT &&
1862 		    searchdir == foundobj) {
1863 			/*
1864 			 * Note: if LOCKPARENT but not LOCKLEAF is
1865 			 * set, and searchdir == foundobj, this code
1866 			 * necessarily unlocks the parent as well as
1867 			 * the leaf. That is, just because you specify
1868 			 * LOCKPARENT doesn't mean you necessarily get
1869 			 * a locked parent vnode. The code in
1870 			 * vfs_syscalls.c, and possibly elsewhere,
1871 			 * that uses this combination "knows" this, so
1872 			 * it can't be safely changed. Feh. XXX
1873 			 */
1874 			KASSERT(searchdir_locked);
1875 		    	VOP_UNLOCK(searchdir);
1876 		    	searchdir_locked = false;
1877 		} else if ((cnp->cn_flags & LOCKLEAF) != 0 &&
1878 		    (searchdir != foundobj ||
1879 		    (cnp->cn_flags & LOCKPARENT) == 0)) {
1880 			const int lktype = (cnp->cn_flags & LOCKSHARED) != 0 ?
1881 			    LK_SHARED : LK_EXCLUSIVE;
1882 			vn_lock(foundobj, lktype | LK_RETRY);
1883 		}
1884 	}
1885 
1886 	/*
1887 	 * Done.
1888 	 */
1889 
1890 	/*
1891 	 * If LOCKPARENT is not set, the parent directory isn't returned.
1892 	 */
1893 	if ((cnp->cn_flags & LOCKPARENT) == 0 && searchdir != NULL) {
1894 		vrele(searchdir);
1895 		searchdir = NULL;
1896 	}
1897 
1898 	ndp->ni_dvp = searchdir;
1899 	ndp->ni_vp = foundobj;
1900 	return 0;
1901 }
1902 
1903 /*
1904  * Do namei; wrapper layer that handles TRYEMULROOT.
1905  */
1906 static int
1907 namei_tryemulroot(struct namei_state *state,
1908 	 int neverfollow, int inhibitmagic, int isnfsd)
1909 {
1910 	int error;
1911 
1912 	struct nameidata *ndp = state->ndp;
1913 	struct componentname *cnp = state->cnp;
1914 	const char *savepath = NULL;
1915 
1916 	KASSERT(cnp == &ndp->ni_cnd);
1917 
1918 	if (cnp->cn_flags & TRYEMULROOT) {
1919 		savepath = pathbuf_stringcopy_get(ndp->ni_pathbuf);
1920 	}
1921 
1922     emul_retry:
1923 	state->attempt_retry = 0;
1924 
1925 	error = namei_oneroot(state, neverfollow, inhibitmagic, isnfsd);
1926 	if (error) {
1927 		/*
1928 		 * Once namei has started up, the existence of ni_erootdir
1929 		 * tells us whether we're working from an emulation root.
1930 		 * The TRYEMULROOT flag isn't necessarily authoritative.
1931 		 */
1932 		if (ndp->ni_erootdir != NULL && state->attempt_retry) {
1933 			/* Retry the whole thing using the normal root */
1934 			cnp->cn_flags &= ~TRYEMULROOT;
1935 			state->attempt_retry = 0;
1936 
1937 			/* kinda gross */
1938 			strcpy(ndp->ni_pathbuf->pb_path, savepath);
1939 			pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath);
1940 			savepath = NULL;
1941 
1942 			goto emul_retry;
1943 		}
1944 	}
1945 	if (savepath != NULL) {
1946 		pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath);
1947 	}
1948 	return error;
1949 }
1950 
1951 /*
1952  * External interface.
1953  */
1954 int
1955 namei(struct nameidata *ndp)
1956 {
1957 	struct namei_state state;
1958 	int error;
1959 
1960 	namei_init(&state, ndp);
1961 	error = namei_tryemulroot(&state,
1962 				  0/*!neverfollow*/, 0/*!inhibitmagic*/,
1963 				  0/*isnfsd*/);
1964 	namei_cleanup(&state);
1965 
1966 	if (error) {
1967 		/* make sure no stray refs leak out */
1968 		KASSERT(ndp->ni_dvp == NULL);
1969 		KASSERT(ndp->ni_vp == NULL);
1970 	}
1971 
1972 	return error;
1973 }
1974 
1975 ////////////////////////////////////////////////////////////
1976 
1977 /*
1978  * External interface used by nfsd. This is basically different from
1979  * namei only in that it has the ability to pass in the "current
1980  * directory", and uses an extra flag "neverfollow" for which there's
1981  * no physical flag defined in namei.h. (There used to be a cut&paste
1982  * copy of about half of namei in nfsd to allow these minor
1983  * adjustments to exist.)
1984  *
1985  * XXX: the namei interface should be adjusted so nfsd can just use
1986  * ordinary namei().
1987  */
1988 int
1989 lookup_for_nfsd(struct nameidata *ndp, struct vnode *forcecwd, int neverfollow)
1990 {
1991 	struct namei_state state;
1992 	int error;
1993 
1994 	KASSERT(ndp->ni_atdir == NULL);
1995 	ndp->ni_atdir = forcecwd;
1996 
1997 	namei_init(&state, ndp);
1998 	error = namei_tryemulroot(&state,
1999 				  neverfollow, 1/*inhibitmagic*/, 1/*isnfsd*/);
2000 	namei_cleanup(&state);
2001 
2002 	if (error) {
2003 		/* make sure no stray refs leak out */
2004 		KASSERT(ndp->ni_dvp == NULL);
2005 		KASSERT(ndp->ni_vp == NULL);
2006 	}
2007 
2008 	return error;
2009 }
2010 
2011 /*
2012  * A second external interface used by nfsd. This turns out to be a
2013  * single lookup used by the WebNFS code (ha!) to get "index.html" or
2014  * equivalent when asked for a directory. It should eventually evolve
2015  * into some kind of namei_once() call; for the time being it's kind
2016  * of a mess. XXX.
2017  *
2018  * dholland 20110109: I don't think it works, and I don't think it
2019  * worked before I started hacking and slashing either, and I doubt
2020  * anyone will ever notice.
2021  */
2022 
2023 /*
2024  * Internals. This calls lookup_once() after setting up the assorted
2025  * pieces of state the way they ought to be.
 *
 * The directory to search is taken from state->ndp->ni_atdir and the
 * name from ndp->ni_pnbuf, which is treated as a single, final component
 * (ISLASTCN is forced on and REQUIREDIR off).  The result, or NULL, is
 * stored in ndp->ni_vp, locked exclusively if LOCKLEAF is set;
 * ndp->ni_dvp is set to NULL.
 *
 * Returns 0 or an errno value.  If VOP_PARSEPATH() fails the function
 * returns early and ndp->ni_vp is not written.
2026  */
2027 static int
2028 do_lookup_for_nfsd_index(struct namei_state *state)
2029 {
2030 	int error;
2031 
2032 	struct componentname *cnp = state->cnp;
2033 	struct nameidata *ndp = state->ndp;
2034 	struct vnode *startdir;
2035 	struct vnode *foundobj;
2036 	bool startdir_locked;
2037 	const char *cp;			/* pointer into pathname argument */
2038 
2039 	KASSERT(cnp == &ndp->ni_cnd);
2040 
2041 	startdir = state->ndp->ni_atdir;
2042 
2043 	cnp->cn_nameptr = ndp->ni_pnbuf;
2044 	state->docache = 1;
2045 	state->rdonly = cnp->cn_flags & RDONLY;
2046 	ndp->ni_dvp = NULL;
2047 
2048 	error = VOP_PARSEPATH(startdir, cnp->cn_nameptr, &cnp->cn_namelen);
2049 	if (error) {
2050 		return error;
2051 	}
2052 
	/* Consume the component and mark it as the last one. */
2053 	cp = cnp->cn_nameptr + cnp->cn_namelen;
2054 	KASSERT(cnp->cn_namelen <= KERNEL_NAME_MAX);
2055 	ndp->ni_pathlen -= cnp->cn_namelen;
2056 	ndp->ni_next = cp;
2057 	state->slashes = 0;
2058 	cnp->cn_flags &= ~REQUIREDIR;
2059 	cnp->cn_flags |= MAKEENTRY|ISLASTCN;
2060 
2061 	if (cnp->cn_namelen == 2 &&
2062 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
2063 		cnp->cn_flags |= ISDOTDOT;
2064 	else
2065 		cnp->cn_flags &= ~ISDOTDOT;
2066 
2067 	/*
2068 	 * Because lookup_once can change the startdir, we need our
2069 	 * own reference to it to avoid consuming the caller's.
2070 	 */
2071 	vref(startdir);
2072 	error = lookup_once(state, startdir, &startdir, &foundobj,
2073 	    &startdir_locked);
2074 
	/* The parent is never returned here, so drop its lock now. */
2075 	KASSERT((cnp->cn_flags & LOCKPARENT) == 0);
2076 	if (startdir_locked) {
2077 		VOP_UNLOCK(startdir);
2078 		startdir_locked = false;
2079 	}
2080 
2081 	/*
2082 	 * If the vnode we found is mounted on, then cross the mount and get
2083 	 * the root vnode in foundobj.  If this encounters an error, it will
2084 	 * dispose of foundobj, but searchdir is untouched.
2085 	 */
2086 	if (error == 0 && foundobj != NULL &&
2087 	    foundobj->v_type == VDIR &&
2088 	    foundobj->v_mountedhere != NULL &&
2089 	    (cnp->cn_flags & NOCROSSMOUNT) == 0) {
2090 		error = lookup_crossmount(state, &startdir, &foundobj,
2091 		    &startdir_locked);
2092 	}
2093 
2094 	/* Now toss startdir and see if we have an error. */
2095 	if (startdir != NULL)
2096 		vrele(startdir);
2097 	if (error)
2098 		foundobj = NULL;
2099 	else if (foundobj != NULL && (cnp->cn_flags & LOCKLEAF) != 0)
2100 		vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
2101 
2102 	ndp->ni_vp = foundobj;
2103 	return (error);
2104 }
2105 
2106 /*
2107  * External interface. The partitioning between this function and the
2108  * above isn't very clear - the above function exists mostly so code
2109  * that uses "state->" can be shuffled around without having to change
2110  * it to "state.".
2111  */
2112 int
2113 lookup_for_nfsd_index(struct nameidata *ndp, struct vnode *startdir)
2114 {
2115 	struct namei_state state;
2116 	int error;
2117 
2118 	KASSERT(ndp->ni_atdir == NULL);
2119 	ndp->ni_atdir = startdir;
2120 
2121 	/*
2122 	 * Note: the name sent in here (is not|should not be) allowed
2123 	 * to contain a slash.
2124 	 */
2125 	if (strlen(ndp->ni_pathbuf->pb_path) > KERNEL_NAME_MAX) {
2126 		return ENAMETOOLONG;
2127 	}
2128 	if (strchr(ndp->ni_pathbuf->pb_path, '/')) {
2129 		return EINVAL;
2130 	}
2131 
2132 	ndp->ni_pathlen = strlen(ndp->ni_pathbuf->pb_path) + 1;
2133 	ndp->ni_pnbuf = NULL;
2134 	ndp->ni_cnd.cn_nameptr = NULL;
2135 
2136 	namei_init(&state, ndp);
2137 	error = do_lookup_for_nfsd_index(&state);
2138 	namei_cleanup(&state);
2139 
2140 	return error;
2141 }
2142 
2143 ////////////////////////////////////////////////////////////
2144 
2145 /*
2146  * Reacquire a path name component.
2147  * dvp is locked on entry and exit.
2148  * *vpp is locked on exit unless it's NULL.
2149  */
2150 int
2151 relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int dummy)
2152 {
2153 	int rdonly;			/* lookup read-only flag bit */
2154 	int error = 0;
2155 #ifdef DEBUG
2156 	size_t newlen;			/* DEBUG: check name len */
2157 	const char *cp;			/* DEBUG: check name ptr */
2158 #endif /* DEBUG */
2159 
2160 	(void)dummy;
2161 
2162 	/*
2163 	 * Setup: break out flag bits into variables.
2164 	 */
2165 	rdonly = cnp->cn_flags & RDONLY;
2166 
2167 	/*
2168 	 * Search a new directory.
2169 	 *
2170 	 * The cn_hash value is for use by vfs_cache.
2171 	 * The last component of the filename is left accessible via
2172 	 * cnp->cn_nameptr for callers that need the name. Callers needing
2173 	 * the name set the SAVENAME flag. When done, they assume
2174 	 * responsibility for freeing the pathname buffer.
2175 	 */
2176 #ifdef DEBUG
2177 #if 0
2178 	cp = NULL;
2179 	newhash = namei_hash(cnp->cn_nameptr, &cp);
2180 	if ((uint32_t)newhash != (uint32_t)cnp->cn_hash)
2181 		panic("relookup: bad hash");
2182 #endif
2183 	error = VOP_PARSEPATH(dvp, cnp->cn_nameptr, &newlen);
2184 	if (error) {
2185 		panic("relookup: parsepath failed with error %d", error);
2186 	}
2187 	if (cnp->cn_namelen != newlen)
2188 		panic("relookup: bad len");
2189 	cp = cnp->cn_nameptr + cnp->cn_namelen;
2190 	while (*cp == '/')
2191 		cp++;
2192 	if (*cp != 0)
2193 		panic("relookup: not last component");
2194 #endif /* DEBUG */
2195 
2196 	/*
2197 	 * Check for degenerate name (e.g. / or "")
2198 	 * which is a way of talking about a directory,
2199 	 * e.g. like "/." or ".".
2200 	 */
2201 	if (cnp->cn_nameptr[0] == '\0')
2202 		panic("relookup: null name");
2203 
2204 	if (cnp->cn_flags & ISDOTDOT)
2205 		panic("relookup: lookup on dot-dot");
2206 
2207 	/*
2208 	 * We now have a segment name to search for, and a directory to search.
2209 	 */
2210 	*vpp = NULL;
2211 	error = VOP_LOOKUP(dvp, vpp, cnp);
2212 	if ((error) != 0) {
2213 		KASSERTMSG((*vpp == NULL),
2214 		    "leaf `%s' should be empty but is %p",
2215 		    cnp->cn_nameptr, *vpp);
2216 		if (error != EJUSTRETURN)
2217 			goto bad;
2218 	}
2219 
2220 	/*
2221 	 * Check for symbolic link
2222 	 */
2223 	KASSERTMSG((*vpp == NULL || (*vpp)->v_type != VLNK ||
2224 		(cnp->cn_flags & FOLLOW) == 0),
2225 	    "relookup: symlink found");
2226 
2227 	/*
2228 	 * Check for read-only lookups.
2229 	 */
2230 	if (rdonly && cnp->cn_nameiop != LOOKUP) {
2231 		error = EROFS;
2232 		if (*vpp) {
2233 			vrele(*vpp);
2234 		}
2235 		goto bad;
2236 	}
2237 	/*
2238 	 * Lock result.
2239 	 */
2240 	if (*vpp && *vpp != dvp) {
2241 		error = vn_lock(*vpp, LK_EXCLUSIVE);
2242 		if (error != 0) {
2243 			vrele(*vpp);
2244 			goto bad;
2245 		}
2246 	}
2247 	return (0);
2248 
2249 bad:
2250 	*vpp = NULL;
2251 	return (error);
2252 }
2253 
2254 /*
2255  * namei_simple - simple forms of namei.
2256  *
2257  * These are wrappers to allow the simple case callers of namei to be
2258  * left alone while everything else changes under them.
2259  */
2260 
2261 /* Flags */
2262 struct namei_simple_flags_type {
2263 	int dummy;
2264 };
2265 static const struct namei_simple_flags_type ns_nn, ns_nt, ns_fn, ns_ft;
2266 const namei_simple_flags_t NSM_NOFOLLOW_NOEMULROOT = &ns_nn;
2267 const namei_simple_flags_t NSM_NOFOLLOW_TRYEMULROOT = &ns_nt;
2268 const namei_simple_flags_t NSM_FOLLOW_NOEMULROOT = &ns_fn;
2269 const namei_simple_flags_t NSM_FOLLOW_TRYEMULROOT = &ns_ft;
2270 
2271 static
2272 int
2273 namei_simple_convert_flags(namei_simple_flags_t sflags)
2274 {
2275 	if (sflags == NSM_NOFOLLOW_NOEMULROOT)
2276 		return NOFOLLOW | 0;
2277 	if (sflags == NSM_NOFOLLOW_TRYEMULROOT)
2278 		return NOFOLLOW | TRYEMULROOT;
2279 	if (sflags == NSM_FOLLOW_NOEMULROOT)
2280 		return FOLLOW | 0;
2281 	if (sflags == NSM_FOLLOW_TRYEMULROOT)
2282 		return FOLLOW | TRYEMULROOT;
2283 	panic("namei_simple_convert_flags: bogus sflags\n");
2284 	return 0;
2285 }
2286 
2287 int
2288 namei_simple_kernel(const char *path, namei_simple_flags_t sflags,
2289 	struct vnode **vp_ret)
2290 {
2291 	return nameiat_simple_kernel(NULL, path, sflags, vp_ret);
2292 }
2293 
2294 int
2295 nameiat_simple_kernel(struct vnode *dvp, const char *path,
2296 	namei_simple_flags_t sflags, struct vnode **vp_ret)
2297 {
2298 	struct nameidata nd;
2299 	struct pathbuf *pb;
2300 	int err;
2301 
2302 	pb = pathbuf_create(path);
2303 	if (pb == NULL) {
2304 		return ENOMEM;
2305 	}
2306 
2307 	NDINIT(&nd,
2308 		LOOKUP,
2309 		namei_simple_convert_flags(sflags),
2310 		pb);
2311 
2312 	if (dvp != NULL)
2313 		NDAT(&nd, dvp);
2314 
2315 	err = namei(&nd);
2316 	if (err != 0) {
2317 		pathbuf_destroy(pb);
2318 		return err;
2319 	}
2320 	*vp_ret = nd.ni_vp;
2321 	pathbuf_destroy(pb);
2322 	return 0;
2323 }
2324 
2325 int
2326 namei_simple_user(const char *path, namei_simple_flags_t sflags,
2327 	struct vnode **vp_ret)
2328 {
2329 	return nameiat_simple_user(NULL, path, sflags, vp_ret);
2330 }
2331 
2332 int
2333 nameiat_simple_user(struct vnode *dvp, const char *path,
2334 	namei_simple_flags_t sflags, struct vnode **vp_ret)
2335 {
2336 	struct pathbuf *pb;
2337 	struct nameidata nd;
2338 	int err;
2339 
2340 	err = pathbuf_copyin(path, &pb);
2341 	if (err) {
2342 		return err;
2343 	}
2344 
2345 	NDINIT(&nd,
2346 		LOOKUP,
2347 		namei_simple_convert_flags(sflags),
2348 		pb);
2349 
2350 	if (dvp != NULL)
2351 		NDAT(&nd, dvp);
2352 
2353 	err = namei(&nd);
2354 	if (err != 0) {
2355 		pathbuf_destroy(pb);
2356 		return err;
2357 	}
2358 	*vp_ret = nd.ni_vp;
2359 	pathbuf_destroy(pb);
2360 	return 0;
2361 }
2362