xref: /csrg-svn/sys/kern/vfs_lookup.c (revision 26306)
1 /*
2  * Copyright (c) 1982 Regents of the University of California.
3  * All rights reserved.  The Berkeley software License Agreement
4  * specifies the terms and conditions for redistribution.
5  *
6  *	@(#)vfs_lookup.c	6.27 (Berkeley) 02/20/86
7  */
8 
9 #include "param.h"
10 #include "systm.h"
11 #include "inode.h"
12 #include "fs.h"
13 #include "mount.h"
14 #include "dir.h"
15 #include "user.h"
16 #include "buf.h"
17 #include "conf.h"
18 #include "uio.h"
19 #include "kernel.h"
20 
21 struct	buf *blkatoff();
22 struct	buf *freenamebuf;
23 int	dirchk = 0;
24 
25 /*
26  * Structures associated with name cacheing.
27  */
28 #define	NCHHASH		32	/* size of hash table */
29 
30 #if	((NCHHASH)&((NCHHASH)-1)) != 0
31 #define	NHASH(h, i, d)	((unsigned)((h) + (i) + 13 * (int)(d)) % (NCHHASH))
32 #else
33 #define	NHASH(h, i, d)	((unsigned)((h) + (i) + 13 * (int)(d)) & ((NCHHASH)-1))
34 #endif
35 
36 union nchash {
37 	union	nchash *nch_head[2];
38 	struct	namecache *nch_chain[2];
39 } nchash[NCHHASH];
40 #define	nch_forw	nch_chain[0]
41 #define	nch_back	nch_chain[1]
42 
43 struct	namecache *nchhead, **nchtail;	/* LRU chain pointers */
44 struct	nchstats nchstats;		/* cache effectiveness statistics */
45 
46 /*
47  * Convert a pathname into a pointer to a locked inode,
48  * with side effects usable in creating and removing files.
49  * This is a very central and rather complicated routine.
50  *
51  * The segflg defines whether the name is to be copied from user
52  * space or kernel space.
53  *
54  * The flag argument is (LOOKUP, CREATE, DELETE) depending on whether
55  * the name is to be (looked up, created, deleted).  If flag has
56  * LOCKPARENT or'ed into it and the target of the pathname exists,
57  * namei returns both the target and its parent directory locked.
58  * If the file system is not maintained in a strict tree hierarchy,
59  * this can result in a deadlock situation.  When creating and
60  * LOCKPARENT is specified, the target may not be ".".  When deleting
61  * and LOCKPARENT is specified, the target may be ".", but the caller
62  * must check to insure it does an irele and iput instead of two iputs.
63  *
64  * The FOLLOW flag is set when symbolic links are to be followed
65  * when they occur at the end of the name translation process.
66  *
67  * Name caching works as follows:
68  *
69  * Names found by directory scans are retained in a cache
70  * for future reference.  It is managed LRU, so frequently
71  * used names will hang around.  Cache is indexed by hash value
72  * obtained from (ino,dev,name) where ino & dev refer to the
73  * directory containing name.
74  *
75  * For simplicity (and economy of storage), names longer than
76  * some (small) maximum length are not cached, they occur
77  * infrequently in any case, and are almost never of interest.
78  *
79  * Upon reaching the last segment of a path, if the reference
80  * is for DELETE, or NOCACHE is set (rewrite), and the
81  * name is located in the cache, it will be dropped.
82  *
83  * We must be sure never to enter the name ".." into the cache
84  * because of the extremely kludgey way that rename() alters
85  * ".." in a situation like
86  * 	mv a/x b/x
87  * where x is a directory, and x/.. is the ".." in question.
88  *
89  * Overall outline of namei:
90  *
91  *	copy in name
92  *	get starting directory
93  * dirloop:
94  *	check accessibility of directory
95  * dirloop2:
96  *	copy next component of name to ndp->ni_dent
97  *	handle degenerate case where name is null string
98  *	look for name in cache, if found, then if at end of path
99  *	  and deleting or creating, drop it, else to haveino
100  *	search for name in directory, to found or notfound
101  * notfound:
102  *	if creating, return locked directory, leaving info on avail. slots
103  *	else return error
104  * found:
105  *	if at end of path and deleting, return information to allow delete
106  *	if at end of path and rewriting (create and LOCKPARENT), lock target
107  *	  inode and return info to allow rewrite
108  *	if .. and on mounted filesys, look in mount table for parent
109  *	if not at end, if neither creating nor deleting, add name to cache
110  * haveino:
111  *	if symbolic link, massage name in buffer and continue at dirloop
112  *	if more components of name, do next level at dirloop
113  *	return the answer as locked inode
114  *
115  * NOTE: (LOOKUP | LOCKPARENT) currently returns the parent inode,
116  *	 but unlocked.
117  */
118 struct inode *
119 namei(ndp)
120 	register struct nameidata *ndp;
121 {
122 	register char *cp;		/* pointer into pathname argument */
123 /* these variables refer to things which must be freed or unlocked */
124 	register struct inode *dp = 0;	/* the directory we are searching */
125 	register struct namecache *ncp;	/* cache slot for entry */
126 	register struct fs *fs;		/* file system that directory is in */
127 	register struct buf *bp = 0;	/* a buffer of directory entries */
128 	register struct direct *ep;	/* the current directory entry */
129 	int entryoffsetinblock;		/* offset of ep in bp's buffer */
130 	register struct buf *nbp;	/* buffer storing path name argument */
131 /* these variables hold information about the search for a slot */
132 	enum {NONE, COMPACT, FOUND} slotstatus;
133 	int slotoffset = -1;		/* offset of area with free space */
134 	int slotsize;			/* size of area at slotoffset */
135 	int slotfreespace;		/* amount of space free in slot */
136 	int slotneeded;			/* size of the entry we're seeking */
137 /* */
138 	int numdirpasses;		/* strategy for directory search */
139 	int endsearch;			/* offset to end directory search */
140 	int prevoff;			/* ndp->ni_offset of previous entry */
141 	int nlink = 0;			/* number of symbolic links taken */
142 	struct inode *pdp;		/* saved dp during symlink work */
143 	int error, i;
144 	int lockparent;
145 	int docache;			/* == 0 do not cache last component */
146 	int makeentry;			/* != 0 if name to be added to cache */
147 	unsigned hash;			/* value of name hash for entry */
148 	union nchash *nhp;		/* cache chain head for entry */
149 	int isdotdot;			/* != 0 if current name is ".." */
150 	int flag;			/* op ie, LOOKUP, CREATE, or DELETE */
151 	off_t enduseful;		/* pointer past last used dir slot */
152 
153 	lockparent = ndp->ni_nameiop & LOCKPARENT;
154 	docache = (ndp->ni_nameiop & NOCACHE) ^ NOCACHE;
155 	flag = ndp->ni_nameiop &~ (LOCKPARENT|NOCACHE|FOLLOW);
156 	if (flag == DELETE || lockparent)
157 		docache = 0;
158 	/*
159 	 * Get a buffer for the name to be translated, and copy the
160 	 * name into the buffer.
161 	 */
162 	nbp = freenamebuf;
163 	if (nbp == NULL)
164 		nbp = geteblk(MAXPATHLEN);
165 	else
166 		freenamebuf = nbp->av_forw;
167 	if (ndp->ni_segflg == UIO_SYSSPACE)
168 		error = copystr(ndp->ni_dirp, nbp->b_un.b_addr, MAXPATHLEN,
169 		    (u_int *)0);
170 	else
171 		error = copyinstr(ndp->ni_dirp, nbp->b_un.b_addr, MAXPATHLEN,
172 		    (u_int *)0);
173 	if (error) {
174 		u.u_error = error;
175 		goto bad;
176 	}
177 
178 	/*
179 	 * Get starting directory.
180 	 */
181 	cp = nbp->b_un.b_addr;
182 	if (*cp == '/') {
183 		while (*cp == '/')
184 			cp++;
185 		if ((dp = u.u_rdir) == NULL)
186 			dp = rootdir;
187 	} else
188 		dp = u.u_cdir;
189 	fs = dp->i_fs;
190 	ILOCK(dp);
191 	dp->i_count++;
192 	ndp->ni_pdir = (struct inode *)0xc0000000;		/* illegal */
193 	ndp->ni_endoff = 0;
194 
195 	/*
196 	 * We come to dirloop to search a new directory.
197 	 * The directory must be locked so that it can be
198 	 * iput, and fs must be already set to dp->i_fs.
199 	 */
200 dirloop:
201 	/*
202 	 * Check accessiblity of directory.
203 	 */
204 	if ((dp->i_mode&IFMT) != IFDIR) {
205 		u.u_error = ENOTDIR;
206 		goto bad;
207 	}
208 	if (access(dp, IEXEC))
209 		goto bad;
210 
211 dirloop2:
212 	/*
213 	 * Copy next component of name to ndp->ni_dent.
214 	 */
215 	hash = 0;
216 	for (i = 0; *cp != 0 && *cp != '/'; cp++) {
217 		if (i >= MAXNAMLEN) {
218 			u.u_error = ENAMETOOLONG;
219 			goto bad;
220 		}
221 		if (*cp & 0200)
222 			if ((*cp&0377) == ('/'|0200) || flag != DELETE) {
223 				u.u_error = EINVAL;
224 				goto bad;
225 			}
226 		ndp->ni_dent.d_name[i++] = *cp;
227 		hash += (unsigned char)*cp * i;
228 	}
229 	ndp->ni_dent.d_namlen = i;
230 	ndp->ni_dent.d_name[i] = '\0';
231 	isdotdot = (i == 2 &&
232 		ndp->ni_dent.d_name[0] == '.' && ndp->ni_dent.d_name[1] == '.');
233 	makeentry = 1;
234 	if (*cp == '\0' && docache == 0)
235 		makeentry = 0;
236 
237 	/*
238 	 * Check for degenerate name (e.g. / or "")
239 	 * which is a way of talking about a directory,
240 	 * e.g. like "/." or ".".
241 	 */
242 	if (ndp->ni_dent.d_name[0] == '\0') {
243 		if (flag != LOOKUP || lockparent) {
244 			u.u_error = EISDIR;
245 			goto bad;
246 		}
247 		nbp->av_forw = freenamebuf;
248 		freenamebuf = nbp;
249 		return (dp);
250 	}
251 
252 	/*
253 	 * We now have a segment name to search for, and a directory to search.
254 	 *
255 	 * Before tediously performing a linear scan of the directory,
256 	 * check the name cache to see if the directory/name pair
257 	 * we are looking for is known already.  We don't do this
258 	 * if the segment name is long, simply so the cache can avoid
259 	 * holding long names (which would either waste space, or
260 	 * add greatly to the complexity).
261 	 */
262 	if (ndp->ni_dent.d_namlen > NCHNAMLEN) {
263 		nchstats.ncs_long++;
264 		makeentry = 0;
265 	} else {
266 		nhp = &nchash[NHASH(hash, dp->i_number, dp->i_dev)];
267 		for (ncp = nhp->nch_forw; ncp != (struct namecache *)nhp;
268 		    ncp = ncp->nc_forw) {
269 			if (ncp->nc_ino == dp->i_number &&
270 			    ncp->nc_dev == dp->i_dev &&
271 			    ncp->nc_nlen == ndp->ni_dent.d_namlen &&
272 			    !bcmp(ncp->nc_name, ndp->ni_dent.d_name,
273 				(unsigned)ncp->nc_nlen))
274 				break;
275 		}
276 		if (ncp == (struct namecache *)nhp) {
277 			nchstats.ncs_miss++;
278 			ncp = NULL;
279 		} else {
280 			if (ncp->nc_id != ncp->nc_ip->i_id)
281 				nchstats.ncs_falsehits++;
282 			else if (!makeentry)
283 				nchstats.ncs_badhits++;
284 			else {
285 				/*
286 				 * move this slot to end of LRU
287 				 * chain, if not already there
288 				 */
289 				if (ncp->nc_nxt) {
290 					/* remove from LRU chain */
291 					*ncp->nc_prev = ncp->nc_nxt;
292 					ncp->nc_nxt->nc_prev = ncp->nc_prev;
293 
294 					/* and replace at end of it */
295 					ncp->nc_nxt = NULL;
296 					ncp->nc_prev = nchtail;
297 					*nchtail = ncp;
298 					nchtail = &ncp->nc_nxt;
299 				}
300 
301 				/*
302 				 * Get the next inode in the path.
303 				 * See comment above other `IUNLOCK' code for
304 				 * an explaination of the locking protocol.
305 				 */
306 				pdp = dp;
307 				if (!isdotdot || dp != u.u_rdir)
308 					dp = ncp->nc_ip;
309 				if (dp == NULL)
310 					panic("nami: null cache ino");
311 				if (pdp == dp)
312 					dp->i_count++;
313 				else if (isdotdot) {
314 					IUNLOCK(pdp);
315 					igrab(dp);
316 				} else {
317 					igrab(dp);
318 					IUNLOCK(pdp);
319 				}
320 
321 				/*
322 				 * Verify that the inode that we got
323 				 * did not change while we were waiting
324 				 * for it to be locked.
325 				 */
326 				if (ncp->nc_id != ncp->nc_ip->i_id) {
327 					iput(dp);
328 					ILOCK(pdp);
329 					dp = pdp;
330 					nchstats.ncs_falsehits++;
331 				} else {
332 					ndp->ni_dent.d_ino = dp->i_number;
333 					/* ni_dent.d_reclen is garbage ... */
334 					nchstats.ncs_goodhits++;
335 					goto haveino;
336 				}
337 			}
338 
339 			/*
340 			 * Last component and we are renaming or deleting,
341 			 * the cache entry is invalid, or otherwise don't
342 			 * want cache entry to exist.
343 			 */
344 			/* remove from LRU chain */
345 			*ncp->nc_prev = ncp->nc_nxt;
346 			if (ncp->nc_nxt)
347 				ncp->nc_nxt->nc_prev = ncp->nc_prev;
348 			else
349 				nchtail = ncp->nc_prev;
350 			remque(ncp);		/* remove from hash chain */
351 			/* insert at head of LRU list (first to grab) */
352 			ncp->nc_nxt = nchhead;
353 			ncp->nc_prev = &nchhead;
354 			nchhead->nc_prev = &ncp->nc_nxt;
355 			nchhead = ncp;
356 			/* and make a dummy hash chain */
357 			ncp->nc_forw = ncp;
358 			ncp->nc_back = ncp;
359 			ncp = NULL;
360 		}
361 	}
362 
363 	/*
364 	 * Suppress search for slots unless creating
365 	 * file and at end of pathname, in which case
366 	 * we watch for a place to put the new file in
367 	 * case it doesn't already exist.
368 	 */
369 	slotstatus = FOUND;
370 	if (flag == CREATE && *cp == 0) {
371 		slotstatus = NONE;
372 		slotfreespace = 0;
373 		slotneeded = DIRSIZ(&ndp->ni_dent);
374 	}
375 	/*
376 	 * If this is the same directory that this process
377 	 * previously searched, pick up where we last left off.
378 	 * We cache only lookups as these are the most common
379 	 * and have the greatest payoff. Caching CREATE has little
380 	 * benefit as it usually must search the entire directory
381 	 * to determine that the entry does not exist. Caching the
382 	 * location of the last DELETE has not reduced profiling time
383 	 * and hence has been removed in the interest of simplicity.
384 	 */
385 	if (flag != LOOKUP || dp->i_number != u.u_ncache.nc_inumber ||
386 	    dp->i_dev != u.u_ncache.nc_dev) {
387 		ndp->ni_offset = 0;
388 		numdirpasses = 1;
389 	} else {
390 		if ((dp->i_flag & ICHG) || dp->i_ctime >= u.u_ncache.nc_time) {
391 			if (u.u_ncache.nc_prevoffset > dp->i_size)
392 				u.u_ncache.nc_prevoffset = 0;
393 			else
394 				u.u_ncache.nc_prevoffset &= ~(DIRBLKSIZ - 1);
395 			u.u_ncache.nc_time = time.tv_sec;
396 		}
397 		ndp->ni_offset = u.u_ncache.nc_prevoffset;
398 		entryoffsetinblock = blkoff(fs, ndp->ni_offset);
399 		if (entryoffsetinblock != 0) {
400 			bp = blkatoff(dp, ndp->ni_offset, (char **)0);
401 			if (bp == 0)
402 				goto bad;
403 		}
404 		numdirpasses = 2;
405 		nchstats.ncs_2passes++;
406 	}
407 	endsearch = roundup(dp->i_size, DIRBLKSIZ);
408 	enduseful = 0;
409 
410 searchloop:
411 	while (ndp->ni_offset < endsearch) {
412 		/*
413 		 * If offset is on a block boundary,
414 		 * read the next directory block.
415 		 * Release previous if it exists.
416 		 */
417 		if (blkoff(fs, ndp->ni_offset) == 0) {
418 			if (bp != NULL)
419 				brelse(bp);
420 			bp = blkatoff(dp, ndp->ni_offset, (char **)0);
421 			if (bp == 0)
422 				goto bad;
423 			entryoffsetinblock = 0;
424 		}
425 		/*
426 		 * If still looking for a slot, and at a DIRBLKSIZE
427 		 * boundary, have to start looking for free space again.
428 		 */
429 		if (slotstatus == NONE &&
430 		    (entryoffsetinblock&(DIRBLKSIZ-1)) == 0) {
431 			slotoffset = -1;
432 			slotfreespace = 0;
433 		}
434 		/*
435 		 * Get pointer to next entry.
436 		 * Full validation checks are slow, so we only check
437 		 * enough to insure forward progress through the
438 		 * directory. Complete checks can be run by patching
439 		 * "dirchk" to be true.
440 		 */
441 		ep = (struct direct *)(bp->b_un.b_addr + entryoffsetinblock);
442 		if (ep->d_reclen == 0 ||
443 		    dirchk && dirbadentry(ep, entryoffsetinblock)) {
444 			dirbad(dp, ndp->ni_offset, "mangled entry");
445 			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
446 			ndp->ni_offset += i;
447 			entryoffsetinblock += i;
448 			continue;
449 		}
450 
451 		/*
452 		 * If an appropriate sized slot has not yet been found,
453 		 * check to see if one is available. Also accumulate space
454 		 * in the current block so that we can determine if
455 		 * compaction is viable.
456 		 */
457 		if (slotstatus != FOUND) {
458 			int size = ep->d_reclen;
459 
460 			if (ep->d_ino != 0)
461 				size -= DIRSIZ(ep);
462 			if (size > 0) {
463 				if (size >= slotneeded) {
464 					slotstatus = FOUND;
465 					slotoffset = ndp->ni_offset;
466 					slotsize = ep->d_reclen;
467 				} else if (slotstatus == NONE) {
468 					slotfreespace += size;
469 					if (slotoffset == -1)
470 						slotoffset = ndp->ni_offset;
471 					if (slotfreespace >= slotneeded) {
472 						slotstatus = COMPACT;
473 						slotsize = ndp->ni_offset +
474 						      ep->d_reclen - slotoffset;
475 					}
476 				}
477 			}
478 		}
479 
480 		/*
481 		 * Check for a name match.
482 		 */
483 		if (ep->d_ino) {
484 			if (ep->d_namlen == ndp->ni_dent.d_namlen &&
485 			    !bcmp(ndp->ni_dent.d_name, ep->d_name,
486 				(unsigned)ep->d_namlen))
487 				goto found;
488 		}
489 		prevoff = ndp->ni_offset;
490 		ndp->ni_offset += ep->d_reclen;
491 		entryoffsetinblock += ep->d_reclen;
492 		if (ep->d_ino)
493 			enduseful = ndp->ni_offset;
494 	}
495 /* notfound: */
496 	/*
497 	 * If we started in the middle of the directory and failed
498 	 * to find our target, we must check the beginning as well.
499 	 */
500 	if (numdirpasses == 2) {
501 		numdirpasses--;
502 		ndp->ni_offset = 0;
503 		endsearch = u.u_ncache.nc_prevoffset;
504 		goto searchloop;
505 	}
506 	/*
507 	 * If creating, and at end of pathname and current
508 	 * directory has not been removed, then can consider
509 	 * allowing file to be created.
510 	 */
511 	if (flag == CREATE && *cp == 0 && dp->i_nlink != 0) {
512 		/*
513 		 * Access for write is interpreted as allowing
514 		 * creation of files in the directory.
515 		 */
516 		if (access(dp, IWRITE))
517 			goto bad;
518 		/*
519 		 * Return an indication of where the new directory
520 		 * entry should be put.  If we didn't find a slot,
521 		 * then set ndp->ni_count to 0 indicating that the new
522 		 * slot belongs at the end of the directory. If we found
523 		 * a slot, then the new entry can be put in the range
524 		 * [ndp->ni_offset .. ndp->ni_offset + ndp->ni_count)
525 		 */
526 		if (slotstatus == NONE) {
527 			ndp->ni_offset = roundup(dp->i_size, DIRBLKSIZ);
528 			ndp->ni_count = 0;
529 			enduseful = ndp->ni_offset;
530 		} else {
531 			ndp->ni_offset = slotoffset;
532 			ndp->ni_count = slotsize;
533 			if (enduseful < slotoffset + slotsize)
534 				enduseful = slotoffset + slotsize;
535 		}
536 		ndp->ni_endoff = roundup(enduseful, DIRBLKSIZ);
537 		dp->i_flag |= IUPD|ICHG;
538 		if (bp)
539 			brelse(bp);
540 		nbp->av_forw = freenamebuf;
541 		freenamebuf = nbp;
542 		/*
543 		 * We return with the directory locked, so that
544 		 * the parameters we set up above will still be
545 		 * valid if we actually decide to do a direnter().
546 		 * We return NULL to indicate that the entry doesn't
547 		 * currently exist, leaving a pointer to the (locked)
548 		 * directory inode in ndp->ni_pdir.
549 		 */
550 		ndp->ni_pdir = dp;
551 		return (NULL);
552 	}
553 	u.u_error = ENOENT;
554 	goto bad;
555 found:
556 	if (numdirpasses == 2)
557 		nchstats.ncs_pass2++;
558 	/*
559 	 * Check that directory length properly reflects presence
560 	 * of this entry.
561 	 */
562 	if (entryoffsetinblock + DIRSIZ(ep) > dp->i_size) {
563 		dirbad(dp, ndp->ni_offset, "i_size too small");
564 		dp->i_size = entryoffsetinblock + DIRSIZ(ep);
565 		dp->i_flag |= IUPD|ICHG;
566 	}
567 
568 	/*
569 	 * Found component in pathname.
570 	 * If the final component of path name, save information
571 	 * in the cache as to where the entry was found.
572 	 */
573 	if (*cp == '\0' && flag == LOOKUP) {
574 		u.u_ncache.nc_prevoffset = ndp->ni_offset;
575 		u.u_ncache.nc_inumber = dp->i_number;
576 		u.u_ncache.nc_dev = dp->i_dev;
577 		u.u_ncache.nc_time = time.tv_sec;
578 	}
579 	/*
580 	 * Save directory entry's inode number and reclen in ndp->ni_dent,
581 	 * and release directory buffer.
582 	 */
583 	ndp->ni_dent.d_ino = ep->d_ino;
584 	ndp->ni_dent.d_reclen = ep->d_reclen;
585 	brelse(bp);
586 	bp = NULL;
587 
588 	/*
589 	 * If deleting, and at end of pathname, return
590 	 * parameters which can be used to remove file.
591 	 * If the lockparent flag isn't set, we return only
592 	 * the directory (in ndp->ni_pdir), otherwise we go
593 	 * on and lock the inode, being careful with ".".
594 	 */
595 	if (flag == DELETE && *cp == 0) {
596 		/*
597 		 * Write access to directory required to delete files.
598 		 */
599 		if (access(dp, IWRITE))
600 			goto bad;
601 		ndp->ni_pdir = dp;		/* for dirremove() */
602 		/*
603 		 * Return pointer to current entry in ndp->ni_offset,
604 		 * and distance past previous entry (if there
605 		 * is a previous entry in this block) in ndp->ni_count.
606 		 * Save directory inode pointer in ndp->ni_pdir for dirremove().
607 		 */
608 		if ((ndp->ni_offset&(DIRBLKSIZ-1)) == 0)
609 			ndp->ni_count = 0;
610 		else
611 			ndp->ni_count = ndp->ni_offset - prevoff;
612 		if (lockparent) {
613 			if (dp->i_number == ndp->ni_dent.d_ino)
614 				dp->i_count++;
615 			else {
616 				dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
617 				if (dp == NULL) {
618 					iput(ndp->ni_pdir);
619 					goto bad;
620 				}
621 				/*
622 				 * If directory is "sticky", then user must own
623 				 * the directory, or the file in it, else he
624 				 * may not delete it (unless he's root). This
625 				 * implements append-only directories.
626 				 */
627 				if ((ndp->ni_pdir->i_mode & ISVTX) &&
628 				    u.u_uid != 0 &&
629 				    u.u_uid != ndp->ni_pdir->i_uid &&
630 				    dp->i_uid != u.u_uid) {
631 					iput(ndp->ni_pdir);
632 					u.u_error = EPERM;
633 					goto bad;
634 				}
635 			}
636 		}
637 		nbp->av_forw = freenamebuf;
638 		freenamebuf = nbp;
639 		return (dp);
640 	}
641 
642 	/*
643 	 * Special handling for ".." allowing chdir out of mounted
644 	 * file system: indirect .. in root inode to reevaluate
645 	 * in directory file system was mounted on.
646 	 */
647 	if (isdotdot) {
648 		if (dp == u.u_rdir) {
649 			ndp->ni_dent.d_ino = dp->i_number;
650 			makeentry = 0;
651 		} else if (ndp->ni_dent.d_ino == ROOTINO &&
652 		   dp->i_number == ROOTINO) {
653 			for (i = 1; i < NMOUNT; i++)
654 			if (mount[i].m_bufp != NULL &&
655 			   mount[i].m_dev == dp->i_dev) {
656 				iput(dp);
657 				dp = mount[i].m_inodp;
658 				ILOCK(dp);
659 				dp->i_count++;
660 				fs = dp->i_fs;
661 				cp -= 2;     /* back over .. */
662 				goto dirloop2;
663 			}
664 		}
665 	}
666 
667 	/*
668 	 * If rewriting (rename), return the inode and the
669 	 * information required to rewrite the present directory
670 	 * Must get inode of directory entry to verify it's a
671 	 * regular file, or empty directory.
672 	 */
673 	if ((flag == CREATE && lockparent) && *cp == 0) {
674 		if (access(dp, IWRITE))
675 			goto bad;
676 		ndp->ni_pdir = dp;		/* for dirrewrite() */
677 		/*
678 		 * Careful about locking second inode.
679 		 * This can only occur if the target is ".".
680 		 */
681 		if (dp->i_number == ndp->ni_dent.d_ino) {
682 			u.u_error = EISDIR;		/* XXX */
683 			goto bad;
684 		}
685 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
686 		if (dp == NULL) {
687 			iput(ndp->ni_pdir);
688 			goto bad;
689 		}
690 		nbp->av_forw = freenamebuf;
691 		freenamebuf = nbp;
692 		return (dp);
693 	}
694 
695 	/*
696 	 * Check for symbolic link, which may require us to massage the
697 	 * name before we continue translation.  We do not `iput' the
698 	 * directory because we may need it again if the symbolic link
699 	 * is relative to the current directory.  Instead we save it
700 	 * unlocked as "pdp".  We must get the target inode before unlocking
701 	 * the directory to insure that the inode will not be removed
702 	 * before we get it.  We prevent deadlock by always fetching
703 	 * inodes from the root, moving down the directory tree. Thus
704 	 * when following backward pointers ".." we must unlock the
705 	 * parent directory before getting the requested directory.
706 	 * There is a potential race condition here if both the current
707 	 * and parent directories are removed before the `iget' for the
708 	 * inode associated with ".." returns.  We hope that this occurs
709 	 * infrequently since we cannot avoid this race condition without
710 	 * implementing a sophisticated deadlock detection algorithm.
711 	 * Note also that this simple deadlock detection scheme will not
712 	 * work if the file system has any hard links other than ".."
713 	 * that point backwards in the directory structure.
714 	 */
715 	pdp = dp;
716 	if (isdotdot) {
717 		IUNLOCK(pdp);	/* race to get the inode */
718 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
719 		if (dp == NULL)
720 			goto bad2;
721 	} else if (dp->i_number == ndp->ni_dent.d_ino) {
722 		dp->i_count++;	/* we want ourself, ie "." */
723 	} else {
724 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
725 		IUNLOCK(pdp);
726 		if (dp == NULL)
727 			goto bad2;
728 	}
729 
730 	/*
731 	 * Insert name into cache if appropriate.
732 	 */
733 	if (makeentry) {
734 		if (ncp != NULL)
735 			panic("nami: duplicating cache");
736 		/*
737 		 * Free the cache slot at head of lru chain.
738 		 */
739 		if (ncp = nchhead) {
740 			/* remove from lru chain */
741 			*ncp->nc_prev = ncp->nc_nxt;
742 			if (ncp->nc_nxt)
743 				ncp->nc_nxt->nc_prev = ncp->nc_prev;
744 			else
745 				nchtail = ncp->nc_prev;
746 			remque(ncp);		/* remove from old hash chain */
747 			/* grab the inode we just found */
748 			ncp->nc_ip = dp;
749 			/* fill in cache info */
750 			ncp->nc_ino = pdp->i_number;	/* parents inum */
751 			ncp->nc_dev = pdp->i_dev;	/* & device */
752 			ncp->nc_idev = dp->i_dev;	/* our device */
753 			ncp->nc_id = dp->i_id;		/* identifier */
754 			ncp->nc_nlen = ndp->ni_dent.d_namlen;
755 			bcopy(ndp->ni_dent.d_name, ncp->nc_name,
756 			    (unsigned)ncp->nc_nlen);
757 			/* link at end of lru chain */
758 			ncp->nc_nxt = NULL;
759 			ncp->nc_prev = nchtail;
760 			*nchtail = ncp;
761 			nchtail = &ncp->nc_nxt;
762 			/* and insert on hash chain */
763 			insque(ncp, nhp);
764 		}
765 	}
766 
767 haveino:
768 	fs = dp->i_fs;
769 
770 	/*
771 	 * Check for symbolic link
772 	 */
773 	if ((dp->i_mode & IFMT) == IFLNK &&
774 	    ((ndp->ni_nameiop & FOLLOW) || *cp == '/')) {
775 		u_int pathlen = strlen(cp) + 1;
776 
777 		if (dp->i_size + pathlen >= MAXPATHLEN - 1) {
778 			u.u_error = ENAMETOOLONG;
779 			goto bad2;
780 		}
781 		if (++nlink > MAXSYMLINKS) {
782 			u.u_error = ELOOP;
783 			goto bad2;
784 		}
785 		ovbcopy(cp, nbp->b_un.b_addr + dp->i_size, pathlen);
786 		u.u_error =
787 		    rdwri(UIO_READ, dp, nbp->b_un.b_addr, (int)dp->i_size,
788 			0, 1, (int *)0);
789 		if (u.u_error)
790 			goto bad2;
791 		cp = nbp->b_un.b_addr;
792 		iput(dp);
793 		if (*cp == '/') {
794 			irele(pdp);
795 			while (*cp == '/')
796 				cp++;
797 			if ((dp = u.u_rdir) == NULL)
798 				dp = rootdir;
799 			ILOCK(dp);
800 			dp->i_count++;
801 		} else {
802 			dp = pdp;
803 			ILOCK(dp);
804 		}
805 		fs = dp->i_fs;
806 		goto dirloop;
807 	}
808 
809 	/*
810 	 * Not a symbolic link.  If more pathname,
811 	 * continue at next component, else return.
812 	 */
813 	if (*cp == '/') {
814 		while (*cp == '/')
815 			cp++;
816 		irele(pdp);
817 		goto dirloop;
818 	}
819 	nbp->av_forw = freenamebuf;
820 	freenamebuf = nbp;
821 	if (lockparent)
822 		ndp->ni_pdir = pdp;
823 	else
824 		irele(pdp);
825 	return (dp);
826 bad2:
827 	irele(pdp);
828 bad:
829 	if (bp)
830 		brelse(bp);
831 	if (dp)
832 		iput(dp);
833 	nbp->av_forw = freenamebuf;
834 	freenamebuf = nbp;
835 	return (NULL);
836 }
837 
838 
839 dirbad(ip, offset, how)
840 	struct inode *ip;
841 	off_t offset;
842 	char *how;
843 {
844 
845 	printf("%s: bad dir ino %d at offset %d: %s\n",
846 	    ip->i_fs->fs_fsmnt, ip->i_number, offset, how);
847 }
848 
849 /*
850  * Do consistency checking on a directory entry:
851  *	record length must be multiple of 4
852  *	entry must fit in rest of its DIRBLKSIZ block
853  *	record must be large enough to contain entry
854  *	name is not longer than MAXNAMLEN
855  *	name must be as long as advertised, and null terminated
856  */
857 dirbadentry(ep, entryoffsetinblock)
858 	register struct direct *ep;
859 	int entryoffsetinblock;
860 {
861 	register int i;
862 
863 	if ((ep->d_reclen & 0x3) != 0 ||
864 	    ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) ||
865 	    ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN)
866 		return (1);
867 	for (i = 0; i < ep->d_namlen; i++)
868 		if (ep->d_name[i] == '\0')
869 			return (1);
870 	return (ep->d_name[i]);
871 }
872 
873 /*
874  * Write a directory entry after a call to namei, using the parameters
875  * which it left in the u. area.  The argument ip is the inode which
876  * the new directory entry will refer to.  The u. area field ndp->ni_pdir is
877  * a pointer to the directory to be written, which was left locked by
878  * namei.  Remaining parameters (ndp->ni_offset, ndp->ni_count) indicate
879  * how the space for the new entry is to be gotten.
880  */
881 direnter(ip, ndp)
882 	struct inode *ip;
883 	register struct nameidata *ndp;
884 {
885 	register struct direct *ep, *nep;
886 	register struct inode *dp = ndp->ni_pdir;
887 	struct buf *bp;
888 	int loc, spacefree, error = 0;
889 	u_int dsize;
890 	int newentrysize;
891 	char *dirbuf;
892 
893 	ndp->ni_dent.d_ino = ip->i_number;
894 	newentrysize = DIRSIZ(&ndp->ni_dent);
895 	if (ndp->ni_count == 0) {
896 		/*
897 		 * If ndp->ni_count is 0, then namei could find no space in the
898 		 * directory. In this case ndp->ni_offset will be on a directory
899 		 * block boundary and we will write the new entry into a fresh
900 		 * block.
901 		 */
902 		if (ndp->ni_offset&(DIRBLKSIZ-1))
903 			panic("wdir: newblk");
904 		ndp->ni_dent.d_reclen = DIRBLKSIZ;
905 		error = rdwri(UIO_WRITE, dp, (caddr_t)&ndp->ni_dent,
906 		    newentrysize, ndp->ni_offset, 1, (int *)0);
907 		if (DIRBLKSIZ > dp->i_fs->fs_fsize)
908 			panic("wdir: blksize"); /* XXX - should grow w/bmap() */
909 		else
910 			dp->i_size = roundup(dp->i_size, DIRBLKSIZ);
911 		iput(dp);
912 		return (error);
913 	}
914 
915 	/*
916 	 * If ndp->ni_count is non-zero, then namei found space for the new
917 	 * entry in the range ndp->ni_offset to ndp->ni_offset + ndp->ni_count.
918 	 * in the directory.  To use this space, we may have to compact
919 	 * the entries located there, by copying them together towards
920 	 * the beginning of the block, leaving the free space in
921 	 * one usable chunk at the end.
922 	 */
923 
924 	/*
925 	 * Increase size of directory if entry eats into new space.
926 	 * This should never push the size past a new multiple of
927 	 * DIRBLKSIZE.
928 	 *
929 	 * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN.
930 	 */
931 	if (ndp->ni_offset + ndp->ni_count > dp->i_size)
932 		dp->i_size = ndp->ni_offset + ndp->ni_count;
933 	/*
934 	 * Get the block containing the space for the new directory
935 	 * entry.  Should return error by result instead of u.u_error.
936 	 */
937 	bp = blkatoff(dp, ndp->ni_offset, (char **)&dirbuf);
938 	if (bp == 0) {
939 		iput(dp);
940 		return (u.u_error);
941 	}
942 	/*
943 	 * Find space for the new entry.  In the simple case, the
944 	 * entry at offset base will have the space.  If it does
945 	 * not, then namei arranged that compacting the region
946 	 * ndp->ni_offset to ndp->ni_offset+ndp->ni_count would yield the space.
947 	 */
948 	ep = (struct direct *)dirbuf;
949 	dsize = DIRSIZ(ep);
950 	spacefree = ep->d_reclen - dsize;
951 	for (loc = ep->d_reclen; loc < ndp->ni_count; ) {
952 		nep = (struct direct *)(dirbuf + loc);
953 		if (ep->d_ino) {
954 			/* trim the existing slot */
955 			ep->d_reclen = dsize;
956 			ep = (struct direct *)((char *)ep + dsize);
957 		} else {
958 			/* overwrite; nothing there; header is ours */
959 			spacefree += dsize;
960 		}
961 		dsize = DIRSIZ(nep);
962 		spacefree += nep->d_reclen - dsize;
963 		loc += nep->d_reclen;
964 		bcopy((caddr_t)nep, (caddr_t)ep, dsize);
965 	}
966 	/*
967 	 * Update the pointer fields in the previous entry (if any),
968 	 * copy in the new entry, and write out the block.
969 	 */
970 	if (ep->d_ino == 0) {
971 		if (spacefree + dsize < newentrysize)
972 			panic("wdir: compact1");
973 		ndp->ni_dent.d_reclen = spacefree + dsize;
974 	} else {
975 		if (spacefree < newentrysize)
976 			panic("wdir: compact2");
977 		ndp->ni_dent.d_reclen = spacefree;
978 		ep->d_reclen = dsize;
979 		ep = (struct direct *)((char *)ep + dsize);
980 	}
981 	bcopy((caddr_t)&ndp->ni_dent, (caddr_t)ep, (u_int)newentrysize);
982 	bwrite(bp);
983 	dp->i_flag |= IUPD|ICHG;
984 	if (ndp->ni_endoff && ndp->ni_endoff < dp->i_size)
985 		itrunc(dp, (u_long)ndp->ni_endoff);
986 	iput(dp);
987 	return (error);
988 }
989 
990 /*
991  * Remove a directory entry after a call to namei, using the
992  * parameters which it left in the u. area.  The u. entry
993  * ni_offset contains the offset into the directory of the
994  * entry to be eliminated.  The ni_count field contains the
995  * size of the previous record in the directory.  If this
996  * is 0, the first entry is being deleted, so we need only
997  * zero the inode number to mark the entry as free.  If the
998  * entry isn't the first in the directory, we must reclaim
999  * the space of the now empty record by adding the record size
1000  * to the size of the previous entry.
1001  */
1002 dirremove(ndp)
1003 	register struct nameidata *ndp;
1004 {
1005 	register struct inode *dp = ndp->ni_pdir;
1006 	register struct buf *bp;
1007 	struct direct *ep;
1008 
1009 	if (ndp->ni_count == 0) {
1010 		/*
1011 		 * First entry in block: set d_ino to zero.
1012 		 */
1013 		ndp->ni_dent.d_ino = 0;
1014 		(void) rdwri(UIO_WRITE, dp, (caddr_t)&ndp->ni_dent,
1015 		    (int)DIRSIZ(&ndp->ni_dent), ndp->ni_offset, 1, (int *)0);
1016 	} else {
1017 		/*
1018 		 * Collapse new free space into previous entry.
1019 		 */
1020 		bp = blkatoff(dp, (int)(ndp->ni_offset - ndp->ni_count),
1021 			(char **)&ep);
1022 		if (bp == 0)
1023 			return (0);
1024 		ep->d_reclen += ndp->ni_dent.d_reclen;
1025 		bwrite(bp);
1026 		dp->i_flag |= IUPD|ICHG;
1027 	}
1028 	return (1);
1029 }
1030 
1031 /*
1032  * Rewrite an existing directory entry to point at the inode
1033  * supplied.  The parameters describing the directory entry are
1034  * set up by a call to namei.
1035  */
1036 dirrewrite(dp, ip, ndp)
1037 	struct inode *dp, *ip;
1038 	struct nameidata *ndp;
1039 {
1040 
1041 	ndp->ni_dent.d_ino = ip->i_number;
1042 	u.u_error = rdwri(UIO_WRITE, dp, (caddr_t)&ndp->ni_dent,
1043 		(int)DIRSIZ(&ndp->ni_dent), ndp->ni_offset, 1, (int *)0);
1044 	iput(dp);
1045 }
1046 
1047 /*
1048  * Return buffer with contents of block "offset"
1049  * from the beginning of directory "ip".  If "res"
1050  * is non-zero, fill it in with a pointer to the
1051  * remaining space in the directory.
1052  */
1053 struct buf *
1054 blkatoff(ip, offset, res)
1055 	struct inode *ip;
1056 	off_t offset;
1057 	char **res;
1058 {
1059 	register struct fs *fs = ip->i_fs;
1060 	daddr_t lbn = lblkno(fs, offset);
1061 	int bsize = blksize(fs, ip, lbn);
1062 	register struct buf *bp;
1063 	daddr_t bn;
1064 
1065 	bn = bmap(ip, lbn, B_READ, bsize);
1066 	if (u.u_error)
1067 		return (0);
1068 	if (bn == (daddr_t)-1) {
1069 		dirbad(ip, offset, "hole in dir");
1070 		return (0);
1071 	}
1072 	bp = bread(ip->i_dev, fsbtodb(fs, bn), bsize);
1073 	if (bp->b_flags & B_ERROR) {
1074 		brelse(bp);
1075 		return (0);
1076 	}
1077 	if (res)
1078 		*res = bp->b_un.b_addr + blkoff(fs, offset);
1079 	return (bp);
1080 }
1081 
1082 /*
1083  * Check if a directory is empty or not.
1084  * Inode supplied must be locked.
1085  *
1086  * Using a struct dirtemplate here is not precisely
1087  * what we want, but better than using a struct direct.
1088  *
1089  * NB: does not handle corrupted directories.
1090  */
1091 dirempty(ip, parentino)
1092 	register struct inode *ip;
1093 	ino_t parentino;
1094 {
1095 	register off_t off;
1096 	struct dirtemplate dbuf;
1097 	register struct direct *dp = (struct direct *)&dbuf;
1098 	int error, count;
1099 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
1100 
1101 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
1102 		error = rdwri(UIO_READ, ip, (caddr_t)dp, MINDIRSIZ,
1103 		    off, 1, &count);
1104 		/*
1105 		 * Since we read MINDIRSIZ, residual must
1106 		 * be 0 unless we're at end of file.
1107 		 */
1108 		if (error || count != 0)
1109 			return (0);
1110 		/* avoid infinite loops */
1111 		if (dp->d_reclen == 0)
1112 			return (0);
1113 		/* skip empty entries */
1114 		if (dp->d_ino == 0)
1115 			continue;
1116 		/* accept only "." and ".." */
1117 		if (dp->d_namlen > 2)
1118 			return (0);
1119 		if (dp->d_name[0] != '.')
1120 			return (0);
1121 		/*
1122 		 * At this point d_namlen must be 1 or 2.
1123 		 * 1 implies ".", 2 implies ".." if second
1124 		 * char is also "."
1125 		 */
1126 		if (dp->d_namlen == 1)
1127 			continue;
1128 		if (dp->d_name[1] == '.' && dp->d_ino == parentino)
1129 			continue;
1130 		return (0);
1131 	}
1132 	return (1);
1133 }
1134 
1135 /*
1136  * Check if source directory is in the path of the target directory.
1137  * Target is supplied locked, source is unlocked.
1138  * The target is always iput() before returning.
1139  */
1140 checkpath(source, target)
1141 	struct inode *source, *target;
1142 {
1143 	struct dirtemplate dirbuf;
1144 	register struct inode *ip;
1145 	int error = 0;
1146 
1147 	ip = target;
1148 	if (ip->i_number == source->i_number) {
1149 		error = EEXIST;
1150 		goto out;
1151 	}
1152 	if (ip->i_number == ROOTINO)
1153 		goto out;
1154 
1155 	for (;;) {
1156 		if ((ip->i_mode&IFMT) != IFDIR) {
1157 			error = ENOTDIR;
1158 			break;
1159 		}
1160 		error = rdwri(UIO_READ, ip, (caddr_t)&dirbuf,
1161 			sizeof (struct dirtemplate), (off_t)0, 1, (int *)0);
1162 		if (error != 0)
1163 			break;
1164 		if (dirbuf.dotdot_namlen != 2 ||
1165 		    dirbuf.dotdot_name[0] != '.' ||
1166 		    dirbuf.dotdot_name[1] != '.') {
1167 			error = ENOTDIR;
1168 			break;
1169 		}
1170 		if (dirbuf.dotdot_ino == source->i_number) {
1171 			error = EINVAL;
1172 			break;
1173 		}
1174 		if (dirbuf.dotdot_ino == ROOTINO)
1175 			break;
1176 		iput(ip);
1177 		ip = iget(ip->i_dev, ip->i_fs, dirbuf.dotdot_ino);
1178 		if (ip == NULL) {
1179 			error = u.u_error;
1180 			break;
1181 		}
1182 	}
1183 
1184 out:
1185 	if (error == ENOTDIR)
1186 		printf("checkpath: .. not a directory\n");
1187 	if (ip != NULL)
1188 		iput(ip);
1189 	return (error);
1190 }
1191 
1192 /*
1193  * Name cache initialization, from main() when we are booting
1194  */
1195 nchinit()
1196 {
1197 	register union nchash *nchp;
1198 	register struct namecache *ncp;
1199 
1200 	nchhead = 0;
1201 	nchtail = &nchhead;
1202 	for (ncp = namecache; ncp < &namecache[nchsize]; ncp++) {
1203 		ncp->nc_forw = ncp;			/* hash chain */
1204 		ncp->nc_back = ncp;
1205 		ncp->nc_nxt = NULL;			/* lru chain */
1206 		*nchtail = ncp;
1207 		ncp->nc_prev = nchtail;
1208 		nchtail = &ncp->nc_nxt;
1209 		/* all else is zero already */
1210 	}
1211 	for (nchp = nchash; nchp < &nchash[NCHHASH]; nchp++) {
1212 		nchp->nch_head[0] = nchp;
1213 		nchp->nch_head[1] = nchp;
1214 	}
1215 }
1216 
1217 /*
1218  * Cache flush, called when filesys is umounted to
1219  * remove entries that would now be invalid
1220  *
1221  * The line "nxtcp = nchhead" near the end is to avoid potential problems
1222  * if the cache lru chain is modified while we are dumping the
1223  * inode.  This makes the algorithm O(n^2), but do you think I care?
1224  */
1225 nchinval(dev)
1226 	register dev_t dev;
1227 {
1228 	register struct namecache *ncp, *nxtcp;
1229 
1230 	for (ncp = nchhead; ncp; ncp = nxtcp) {
1231 		nxtcp = ncp->nc_nxt;
1232 		if (ncp->nc_ip == NULL ||
1233 		    (ncp->nc_idev != dev && ncp->nc_dev != dev))
1234 			continue;
1235 		/* free the resources we had */
1236 		ncp->nc_idev = NODEV;
1237 		ncp->nc_dev = NODEV;
1238 		ncp->nc_id = NULL;
1239 		ncp->nc_ino = 0;
1240 		ncp->nc_ip = NULL;
1241 		remque(ncp);		/* remove entry from its hash chain */
1242 		ncp->nc_forw = ncp;	/* and make a dummy one */
1243 		ncp->nc_back = ncp;
1244 		/* delete this entry from LRU chain */
1245 		*ncp->nc_prev = nxtcp;
1246 		if (nxtcp)
1247 			nxtcp->nc_prev = ncp->nc_prev;
1248 		else
1249 			nchtail = ncp->nc_prev;
1250 		/* cause rescan of list, it may have altered */
1251 		nxtcp = nchhead;
1252 		/* put the now-free entry at head of LRU */
1253 		ncp->nc_nxt = nxtcp;
1254 		ncp->nc_prev = &nchhead;
1255 		nxtcp->nc_prev = &ncp->nc_nxt;
1256 		nchhead = ncp;
1257 	}
1258 }
1259 
1260 /*
1261  * Name cache invalidation of all entries.
1262  */
1263 cacheinvalall()
1264 {
1265 	register struct namecache *ncp;
1266 
1267 	for (ncp = namecache; ncp < &namecache[nchsize]; ncp++)
1268 		ncp->nc_id = 0;
1269 }
1270