xref: /csrg-svn/sys/kern/vfs_lookup.c (revision 27268)
1 /*
2  * Copyright (c) 1982 Regents of the University of California.
3  * All rights reserved.  The Berkeley software License Agreement
4  * specifies the terms and conditions for redistribution.
5  *
6  *	@(#)vfs_lookup.c	6.29 (Berkeley) 04/23/86
7  */
8 
9 #include "param.h"
10 #include "systm.h"
11 #include "inode.h"
12 #include "fs.h"
13 #include "mount.h"
14 #include "dir.h"
15 #include "user.h"
16 #include "buf.h"
17 #include "conf.h"
18 #include "uio.h"
19 #include "kernel.h"
20 
21 struct	buf *blkatoff();
22 struct	buf *freenamebuf;
23 int	dirchk = 0;
24 
25 /*
26  * Structures associated with name cacheing.
27  */
28 #define	NCHHASH		32	/* size of hash table */
29 
30 #if	((NCHHASH)&((NCHHASH)-1)) != 0
31 #define	NHASH(h, i, d)	((unsigned)((h) + (i) + 13 * (int)(d)) % (NCHHASH))
32 #else
33 #define	NHASH(h, i, d)	((unsigned)((h) + (i) + 13 * (int)(d)) & ((NCHHASH)-1))
34 #endif
35 
36 union nchash {
37 	union	nchash *nch_head[2];
38 	struct	namecache *nch_chain[2];
39 } nchash[NCHHASH];
40 #define	nch_forw	nch_chain[0]
41 #define	nch_back	nch_chain[1]
42 
43 struct	namecache *nchhead, **nchtail;	/* LRU chain pointers */
44 struct	nchstats nchstats;		/* cache effectiveness statistics */
45 
46 /*
47  * Convert a pathname into a pointer to a locked inode.
48  * This is a very central and rather complicated routine.
49  * If the file system is not maintained in a strict tree hierarchy,
50  * this can result in a deadlock situation (see comments in code below).
51  *
52  * The flag argument is LOOKUP, CREATE, or DELETE depending on whether
53  * the name is to be looked up, created, or deleted. When CREATE or
54  * DELETE is specified, information usable in creating or deleteing a
55  * directory entry is also calculated. If flag has LOCKPARENT or'ed
56  * into it and the target of the pathname exists, namei returns both
57  * the target and its parent directory locked. When creating and
58  * LOCKPARENT is specified, the target may not be ".".  When deleting
59  * and LOCKPARENT is specified, the target may be ".", but the caller
60  * must check to insure it does an irele and iput instead of two iputs.
61  *
62  * The FOLLOW flag is set when symbolic links are to be followed
63  * when they occur at the end of the name translation process.
64  * Symbolic links are always followed for all other pathname
65  * components other than the last.
66  *
67  * The segflg defines whether the name is to be copied from user
68  * space or kernel space.
69  *
70  * Name caching works as follows:
71  *
72  * Names found by directory scans are retained in a cache
73  * for future reference.  It is managed LRU, so frequently
74  * used names will hang around.  Cache is indexed by hash value
75  * obtained from (ino,dev,name) where ino & dev refer to the
76  * directory containing name.
77  *
78  * For simplicity (and economy of storage), names longer than
79  * a maximum length of NCHNAMLEN are not cached; they occur
80  * infrequently in any case, and are almost never of interest.
81  *
82  * Upon reaching the last segment of a path, if the reference
83  * is for DELETE, or NOCACHE is set (rewrite), and the
84  * name is located in the cache, it will be dropped.
85  *
86  * Overall outline of namei:
87  *
88  *	copy in name
89  *	get starting directory
90  * dirloop:
91  *	check accessibility of directory
92  * dirloop2:
93  *	copy next component of name to ndp->ni_dent
94  *	handle degenerate case where name is null string
95  *	look for name in cache, if found, then if at end of path
96  *	  and deleting or creating, drop it, else to haveino
97  *	search for name in directory, to found or notfound
98  * notfound:
99  *	if creating, return locked directory, leaving info on avail. slots
100  *	else return error
101  * found:
102  *	if at end of path and deleting, return information to allow delete
103  *	if at end of path and rewriting (CREATE and LOCKPARENT), lock target
104  *	  inode and return info to allow rewrite
105  *	if .. and on mounted filesys, look in mount table for parent
106  *	if not at end, add name to cache; if at end and neither creating
107  *	  nor deleting, add name to cache
108  * haveino:
109  *	if symbolic link, massage name in buffer and continue at dirloop
110  *	if more components of name, do next level at dirloop
111  *	return the answer as locked inode
112  *
113  * NOTE: (LOOKUP | LOCKPARENT) currently returns the parent inode,
114  *	 but unlocked.
115  */
116 struct inode *
117 namei(ndp)
118 	register struct nameidata *ndp;
119 {
120 	register char *cp;		/* pointer into pathname argument */
121 /* these variables refer to things which must be freed or unlocked */
122 	register struct inode *dp = 0;	/* the directory we are searching */
123 	register struct namecache *ncp;	/* cache slot for entry */
124 	register struct fs *fs;		/* file system that directory is in */
125 	register struct buf *bp = 0;	/* a buffer of directory entries */
126 	register struct direct *ep;	/* the current directory entry */
127 	int entryoffsetinblock;		/* offset of ep in bp's buffer */
128 	register struct buf *nbp;	/* buffer storing path name argument */
129 /* these variables hold information about the search for a slot */
130 	enum {NONE, COMPACT, FOUND} slotstatus;
131 	int slotoffset = -1;		/* offset of area with free space */
132 	int slotsize;			/* size of area at slotoffset */
133 	int slotfreespace;		/* amount of space free in slot */
134 	int slotneeded;			/* size of the entry we're seeking */
135 /* */
136 	int numdirpasses;		/* strategy for directory search */
137 	int endsearch;			/* offset to end directory search */
138 	int prevoff;			/* ndp->ni_offset of previous entry */
139 	int nlink = 0;			/* number of symbolic links taken */
140 	struct inode *pdp;		/* saved dp during symlink work */
141 	int error, i;
142 	int lockparent;
143 	int docache;			/* == 0 do not cache last component */
144 	int makeentry;			/* != 0 if name to be added to cache */
145 	unsigned hash;			/* value of name hash for entry */
146 	union nchash *nhp;		/* cache chain head for entry */
147 	int isdotdot;			/* != 0 if current name is ".." */
148 	int flag;			/* op ie, LOOKUP, CREATE, or DELETE */
149 	off_t enduseful;		/* pointer past last used dir slot */
150 
151 	lockparent = ndp->ni_nameiop & LOCKPARENT;
152 	docache = (ndp->ni_nameiop & NOCACHE) ^ NOCACHE;
153 	flag = ndp->ni_nameiop &~ (LOCKPARENT|NOCACHE|FOLLOW);
154 	if (flag == DELETE || lockparent)
155 		docache = 0;
156 	/*
157 	 * Get a buffer for the name to be translated, and copy the
158 	 * name into the buffer.
159 	 */
160 	nbp = freenamebuf;
161 	if (nbp == NULL)
162 		nbp = geteblk(MAXPATHLEN);
163 	else
164 		freenamebuf = nbp->av_forw;
165 	if (ndp->ni_segflg == UIO_SYSSPACE)
166 		error = copystr(ndp->ni_dirp, nbp->b_un.b_addr, MAXPATHLEN,
167 		    (u_int *)0);
168 	else
169 		error = copyinstr(ndp->ni_dirp, nbp->b_un.b_addr, MAXPATHLEN,
170 		    (u_int *)0);
171 	if (error) {
172 		u.u_error = error;
173 		goto bad;
174 	}
175 
176 	/*
177 	 * Get starting directory.
178 	 */
179 	cp = nbp->b_un.b_addr;
180 	if (*cp == '/') {
181 		while (*cp == '/')
182 			cp++;
183 		if ((dp = u.u_rdir) == NULL)
184 			dp = rootdir;
185 	} else
186 		dp = u.u_cdir;
187 	fs = dp->i_fs;
188 	ILOCK(dp);
189 	dp->i_count++;
190 	ndp->ni_pdir = (struct inode *)0xc0000000;		/* illegal */
191 	ndp->ni_endoff = 0;
192 
193 	/*
194 	 * We come to dirloop to search a new directory.
195 	 * The directory must be locked so that it can be
196 	 * iput, and fs must be already set to dp->i_fs.
197 	 */
198 dirloop:
199 	/*
200 	 * Check accessiblity of directory.
201 	 */
202 	if ((dp->i_mode&IFMT) != IFDIR) {
203 		u.u_error = ENOTDIR;
204 		goto bad;
205 	}
206 	if (access(dp, IEXEC))
207 		goto bad;
208 
209 dirloop2:
210 	/*
211 	 * Copy next component of name to ndp->ni_dent.
212 	 */
213 	hash = 0;
214 	for (i = 0; *cp != 0 && *cp != '/'; cp++) {
215 		if (i >= MAXNAMLEN) {
216 			u.u_error = ENAMETOOLONG;
217 			goto bad;
218 		}
219 		if (*cp & 0200)
220 			if ((*cp&0377) == ('/'|0200) || flag != DELETE) {
221 				u.u_error = EINVAL;
222 				goto bad;
223 			}
224 		ndp->ni_dent.d_name[i++] = *cp;
225 		hash += (unsigned char)*cp * i;
226 	}
227 	ndp->ni_dent.d_namlen = i;
228 	ndp->ni_dent.d_name[i] = '\0';
229 	isdotdot = (i == 2 &&
230 		ndp->ni_dent.d_name[0] == '.' && ndp->ni_dent.d_name[1] == '.');
231 	makeentry = 1;
232 	if (*cp == '\0' && docache == 0)
233 		makeentry = 0;
234 
235 	/*
236 	 * Check for degenerate name (e.g. / or "")
237 	 * which is a way of talking about a directory,
238 	 * e.g. like "/." or ".".
239 	 */
240 	if (ndp->ni_dent.d_name[0] == '\0') {
241 		if (flag != LOOKUP || lockparent) {
242 			u.u_error = EISDIR;
243 			goto bad;
244 		}
245 		nbp->av_forw = freenamebuf;
246 		freenamebuf = nbp;
247 		return (dp);
248 	}
249 
250 	/*
251 	 * We now have a segment name to search for, and a directory to search.
252 	 *
253 	 * Before tediously performing a linear scan of the directory,
254 	 * check the name cache to see if the directory/name pair
255 	 * we are looking for is known already.  We don't do this
256 	 * if the segment name is long, simply so the cache can avoid
257 	 * holding long names (which would either waste space, or
258 	 * add greatly to the complexity).
259 	 */
260 	if (ndp->ni_dent.d_namlen > NCHNAMLEN) {
261 		nchstats.ncs_long++;
262 		makeentry = 0;
263 	} else {
264 		nhp = &nchash[NHASH(hash, dp->i_number, dp->i_dev)];
265 		for (ncp = nhp->nch_forw; ncp != (struct namecache *)nhp;
266 		    ncp = ncp->nc_forw) {
267 			if (ncp->nc_ino == dp->i_number &&
268 			    ncp->nc_dev == dp->i_dev &&
269 			    ncp->nc_nlen == ndp->ni_dent.d_namlen &&
270 			    !bcmp(ncp->nc_name, ndp->ni_dent.d_name,
271 				(unsigned)ncp->nc_nlen))
272 				break;
273 		}
274 		if (ncp == (struct namecache *)nhp) {
275 			nchstats.ncs_miss++;
276 			ncp = NULL;
277 		} else {
278 			if (ncp->nc_id != ncp->nc_ip->i_id)
279 				nchstats.ncs_falsehits++;
280 			else if (!makeentry)
281 				nchstats.ncs_badhits++;
282 			else {
283 				/*
284 				 * move this slot to end of LRU
285 				 * chain, if not already there
286 				 */
287 				if (ncp->nc_nxt) {
288 					/* remove from LRU chain */
289 					*ncp->nc_prev = ncp->nc_nxt;
290 					ncp->nc_nxt->nc_prev = ncp->nc_prev;
291 
292 					/* and replace at end of it */
293 					ncp->nc_nxt = NULL;
294 					ncp->nc_prev = nchtail;
295 					*nchtail = ncp;
296 					nchtail = &ncp->nc_nxt;
297 				}
298 
299 				/*
300 				 * Get the next inode in the path.
301 				 * See comment above other `IUNLOCK' code for
302 				 * an explaination of the locking protocol.
303 				 */
304 				pdp = dp;
305 				if (!isdotdot || dp != u.u_rdir)
306 					dp = ncp->nc_ip;
307 				if (dp == NULL)
308 					panic("nami: null cache ino");
309 				if (pdp == dp)
310 					dp->i_count++;
311 				else if (isdotdot) {
312 					IUNLOCK(pdp);
313 					igrab(dp);
314 				} else {
315 					igrab(dp);
316 					IUNLOCK(pdp);
317 				}
318 
319 				/*
320 				 * Verify that the inode that we got
321 				 * did not change while we were waiting
322 				 * for it to be locked.
323 				 */
324 				if (ncp->nc_id != ncp->nc_ip->i_id) {
325 					iput(dp);
326 					ILOCK(pdp);
327 					dp = pdp;
328 					nchstats.ncs_falsehits++;
329 				} else {
330 					ndp->ni_dent.d_ino = dp->i_number;
331 					/* ni_dent.d_reclen is garbage ... */
332 					nchstats.ncs_goodhits++;
333 					goto haveino;
334 				}
335 			}
336 
337 			/*
338 			 * Last component and we are renaming or deleting,
339 			 * the cache entry is invalid, or otherwise don't
340 			 * want cache entry to exist.
341 			 */
342 			/* remove from LRU chain */
343 			*ncp->nc_prev = ncp->nc_nxt;
344 			if (ncp->nc_nxt)
345 				ncp->nc_nxt->nc_prev = ncp->nc_prev;
346 			else
347 				nchtail = ncp->nc_prev;
348 			remque(ncp);		/* remove from hash chain */
349 			/* insert at head of LRU list (first to grab) */
350 			ncp->nc_nxt = nchhead;
351 			ncp->nc_prev = &nchhead;
352 			nchhead->nc_prev = &ncp->nc_nxt;
353 			nchhead = ncp;
354 			/* and make a dummy hash chain */
355 			ncp->nc_forw = ncp;
356 			ncp->nc_back = ncp;
357 			ncp = NULL;
358 		}
359 	}
360 
361 	/*
362 	 * Suppress search for slots unless creating
363 	 * file and at end of pathname, in which case
364 	 * we watch for a place to put the new file in
365 	 * case it doesn't already exist.
366 	 */
367 	slotstatus = FOUND;
368 	if (flag == CREATE && *cp == 0) {
369 		slotstatus = NONE;
370 		slotfreespace = 0;
371 		slotneeded = DIRSIZ(&ndp->ni_dent);
372 	}
373 	/*
374 	 * If this is the same directory that this process
375 	 * previously searched, pick up where we last left off.
376 	 * We cache only lookups as these are the most common
377 	 * and have the greatest payoff. Caching CREATE has little
378 	 * benefit as it usually must search the entire directory
379 	 * to determine that the entry does not exist. Caching the
380 	 * location of the last DELETE has not reduced profiling time
381 	 * and hence has been removed in the interest of simplicity.
382 	 */
383 	if (flag != LOOKUP || dp->i_number != u.u_ncache.nc_inumber ||
384 	    dp->i_dev != u.u_ncache.nc_dev) {
385 		ndp->ni_offset = 0;
386 		numdirpasses = 1;
387 	} else {
388 		if ((dp->i_flag & (ICHG|IMOD)) ||
389 		    dp->i_ctime >= u.u_ncache.nc_time) {
390 			if (u.u_ncache.nc_prevoffset > dp->i_size)
391 				u.u_ncache.nc_prevoffset = 0;
392 			else
393 				u.u_ncache.nc_prevoffset &= ~(DIRBLKSIZ - 1);
394 			u.u_ncache.nc_time = time.tv_sec;
395 		}
396 		ndp->ni_offset = u.u_ncache.nc_prevoffset;
397 		entryoffsetinblock = blkoff(fs, ndp->ni_offset);
398 		if (entryoffsetinblock != 0) {
399 			bp = blkatoff(dp, ndp->ni_offset, (char **)0);
400 			if (bp == 0)
401 				goto bad;
402 		}
403 		numdirpasses = 2;
404 		nchstats.ncs_2passes++;
405 	}
406 	endsearch = roundup(dp->i_size, DIRBLKSIZ);
407 	enduseful = 0;
408 
409 searchloop:
410 	while (ndp->ni_offset < endsearch) {
411 		/*
412 		 * If offset is on a block boundary,
413 		 * read the next directory block.
414 		 * Release previous if it exists.
415 		 */
416 		if (blkoff(fs, ndp->ni_offset) == 0) {
417 			if (bp != NULL)
418 				brelse(bp);
419 			bp = blkatoff(dp, ndp->ni_offset, (char **)0);
420 			if (bp == 0)
421 				goto bad;
422 			entryoffsetinblock = 0;
423 		}
424 		/*
425 		 * If still looking for a slot, and at a DIRBLKSIZE
426 		 * boundary, have to start looking for free space again.
427 		 */
428 		if (slotstatus == NONE &&
429 		    (entryoffsetinblock&(DIRBLKSIZ-1)) == 0) {
430 			slotoffset = -1;
431 			slotfreespace = 0;
432 		}
433 		/*
434 		 * Get pointer to next entry.
435 		 * Full validation checks are slow, so we only check
436 		 * enough to insure forward progress through the
437 		 * directory. Complete checks can be run by patching
438 		 * "dirchk" to be true.
439 		 */
440 		ep = (struct direct *)(bp->b_un.b_addr + entryoffsetinblock);
441 		if (ep->d_reclen == 0 ||
442 		    dirchk && dirbadentry(ep, entryoffsetinblock)) {
443 			dirbad(dp, ndp->ni_offset, "mangled entry");
444 			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
445 			ndp->ni_offset += i;
446 			entryoffsetinblock += i;
447 			continue;
448 		}
449 
450 		/*
451 		 * If an appropriate sized slot has not yet been found,
452 		 * check to see if one is available. Also accumulate space
453 		 * in the current block so that we can determine if
454 		 * compaction is viable.
455 		 */
456 		if (slotstatus != FOUND) {
457 			int size = ep->d_reclen;
458 
459 			if (ep->d_ino != 0)
460 				size -= DIRSIZ(ep);
461 			if (size > 0) {
462 				if (size >= slotneeded) {
463 					slotstatus = FOUND;
464 					slotoffset = ndp->ni_offset;
465 					slotsize = ep->d_reclen;
466 				} else if (slotstatus == NONE) {
467 					slotfreespace += size;
468 					if (slotoffset == -1)
469 						slotoffset = ndp->ni_offset;
470 					if (slotfreespace >= slotneeded) {
471 						slotstatus = COMPACT;
472 						slotsize = ndp->ni_offset +
473 						      ep->d_reclen - slotoffset;
474 					}
475 				}
476 			}
477 		}
478 
479 		/*
480 		 * Check for a name match.
481 		 */
482 		if (ep->d_ino) {
483 			if (ep->d_namlen == ndp->ni_dent.d_namlen &&
484 			    !bcmp(ndp->ni_dent.d_name, ep->d_name,
485 				(unsigned)ep->d_namlen))
486 				goto found;
487 		}
488 		prevoff = ndp->ni_offset;
489 		ndp->ni_offset += ep->d_reclen;
490 		entryoffsetinblock += ep->d_reclen;
491 		if (ep->d_ino)
492 			enduseful = ndp->ni_offset;
493 	}
494 /* notfound: */
495 	/*
496 	 * If we started in the middle of the directory and failed
497 	 * to find our target, we must check the beginning as well.
498 	 */
499 	if (numdirpasses == 2) {
500 		numdirpasses--;
501 		ndp->ni_offset = 0;
502 		endsearch = u.u_ncache.nc_prevoffset;
503 		goto searchloop;
504 	}
505 	/*
506 	 * If creating, and at end of pathname and current
507 	 * directory has not been removed, then can consider
508 	 * allowing file to be created.
509 	 */
510 	if (flag == CREATE && *cp == 0 && dp->i_nlink != 0) {
511 		/*
512 		 * Access for write is interpreted as allowing
513 		 * creation of files in the directory.
514 		 */
515 		if (access(dp, IWRITE))
516 			goto bad;
517 		/*
518 		 * Return an indication of where the new directory
519 		 * entry should be put.  If we didn't find a slot,
520 		 * then set ndp->ni_count to 0 indicating that the new
521 		 * slot belongs at the end of the directory. If we found
522 		 * a slot, then the new entry can be put in the range
523 		 * [ndp->ni_offset .. ndp->ni_offset + ndp->ni_count)
524 		 */
525 		if (slotstatus == NONE) {
526 			ndp->ni_offset = roundup(dp->i_size, DIRBLKSIZ);
527 			ndp->ni_count = 0;
528 			enduseful = ndp->ni_offset;
529 		} else {
530 			ndp->ni_offset = slotoffset;
531 			ndp->ni_count = slotsize;
532 			if (enduseful < slotoffset + slotsize)
533 				enduseful = slotoffset + slotsize;
534 		}
535 		ndp->ni_endoff = roundup(enduseful, DIRBLKSIZ);
536 		dp->i_flag |= IUPD|ICHG;
537 		if (bp)
538 			brelse(bp);
539 		nbp->av_forw = freenamebuf;
540 		freenamebuf = nbp;
541 		/*
542 		 * We return with the directory locked, so that
543 		 * the parameters we set up above will still be
544 		 * valid if we actually decide to do a direnter().
545 		 * We return NULL to indicate that the entry doesn't
546 		 * currently exist, leaving a pointer to the (locked)
547 		 * directory inode in ndp->ni_pdir.
548 		 */
549 		ndp->ni_pdir = dp;
550 		return (NULL);
551 	}
552 	u.u_error = ENOENT;
553 	goto bad;
554 found:
555 	if (numdirpasses == 2)
556 		nchstats.ncs_pass2++;
557 	/*
558 	 * Check that directory length properly reflects presence
559 	 * of this entry.
560 	 */
561 	if (entryoffsetinblock + DIRSIZ(ep) > dp->i_size) {
562 		dirbad(dp, ndp->ni_offset, "i_size too small");
563 		dp->i_size = entryoffsetinblock + DIRSIZ(ep);
564 		dp->i_flag |= IUPD|ICHG;
565 	}
566 
567 	/*
568 	 * Found component in pathname.
569 	 * If the final component of path name, save information
570 	 * in the cache as to where the entry was found.
571 	 */
572 	if (*cp == '\0' && flag == LOOKUP) {
573 		u.u_ncache.nc_prevoffset = ndp->ni_offset;
574 		u.u_ncache.nc_inumber = dp->i_number;
575 		u.u_ncache.nc_dev = dp->i_dev;
576 		u.u_ncache.nc_time = time.tv_sec;
577 	}
578 	/*
579 	 * Save directory entry's inode number and reclen in ndp->ni_dent,
580 	 * and release directory buffer.
581 	 */
582 	ndp->ni_dent.d_ino = ep->d_ino;
583 	ndp->ni_dent.d_reclen = ep->d_reclen;
584 	brelse(bp);
585 	bp = NULL;
586 
587 	/*
588 	 * If deleting, and at end of pathname, return
589 	 * parameters which can be used to remove file.
590 	 * If the lockparent flag isn't set, we return only
591 	 * the directory (in ndp->ni_pdir), otherwise we go
592 	 * on and lock the inode, being careful with ".".
593 	 */
594 	if (flag == DELETE && *cp == 0) {
595 		/*
596 		 * Write access to directory required to delete files.
597 		 */
598 		if (access(dp, IWRITE))
599 			goto bad;
600 		ndp->ni_pdir = dp;		/* for dirremove() */
601 		/*
602 		 * Return pointer to current entry in ndp->ni_offset,
603 		 * and distance past previous entry (if there
604 		 * is a previous entry in this block) in ndp->ni_count.
605 		 * Save directory inode pointer in ndp->ni_pdir for dirremove().
606 		 */
607 		if ((ndp->ni_offset&(DIRBLKSIZ-1)) == 0)
608 			ndp->ni_count = 0;
609 		else
610 			ndp->ni_count = ndp->ni_offset - prevoff;
611 		if (lockparent) {
612 			if (dp->i_number == ndp->ni_dent.d_ino)
613 				dp->i_count++;
614 			else {
615 				dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
616 				if (dp == NULL) {
617 					iput(ndp->ni_pdir);
618 					goto bad;
619 				}
620 				/*
621 				 * If directory is "sticky", then user must own
622 				 * the directory, or the file in it, else he
623 				 * may not delete it (unless he's root). This
624 				 * implements append-only directories.
625 				 */
626 				if ((ndp->ni_pdir->i_mode & ISVTX) &&
627 				    u.u_uid != 0 &&
628 				    u.u_uid != ndp->ni_pdir->i_uid &&
629 				    dp->i_uid != u.u_uid) {
630 					iput(ndp->ni_pdir);
631 					u.u_error = EPERM;
632 					goto bad;
633 				}
634 			}
635 		}
636 		nbp->av_forw = freenamebuf;
637 		freenamebuf = nbp;
638 		return (dp);
639 	}
640 
641 	/*
642 	 * Special handling for ".." allowing chdir out of mounted
643 	 * file system: indirect .. in root inode to reevaluate
644 	 * in directory file system was mounted on.
645 	 */
646 	if (isdotdot) {
647 		if (dp == u.u_rdir) {
648 			ndp->ni_dent.d_ino = dp->i_number;
649 			makeentry = 0;
650 		} else if (ndp->ni_dent.d_ino == ROOTINO &&
651 		   dp->i_number == ROOTINO) {
652 			for (i = 1; i < NMOUNT; i++)
653 			if (mount[i].m_bufp != NULL &&
654 			   mount[i].m_dev == dp->i_dev) {
655 				iput(dp);
656 				dp = mount[i].m_inodp;
657 				ILOCK(dp);
658 				dp->i_count++;
659 				fs = dp->i_fs;
660 				cp -= 2;     /* back over .. */
661 				goto dirloop2;
662 			}
663 		}
664 	}
665 
666 	/*
667 	 * If rewriting (rename), return the inode and the
668 	 * information required to rewrite the present directory
669 	 * Must get inode of directory entry to verify it's a
670 	 * regular file, or empty directory.
671 	 */
672 	if ((flag == CREATE && lockparent) && *cp == 0) {
673 		if (access(dp, IWRITE))
674 			goto bad;
675 		ndp->ni_pdir = dp;		/* for dirrewrite() */
676 		/*
677 		 * Careful about locking second inode.
678 		 * This can only occur if the target is ".".
679 		 */
680 		if (dp->i_number == ndp->ni_dent.d_ino) {
681 			u.u_error = EISDIR;		/* XXX */
682 			goto bad;
683 		}
684 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
685 		if (dp == NULL) {
686 			iput(ndp->ni_pdir);
687 			goto bad;
688 		}
689 		nbp->av_forw = freenamebuf;
690 		freenamebuf = nbp;
691 		return (dp);
692 	}
693 
694 	/*
695 	 * Check for symbolic link, which may require us to massage the
696 	 * name before we continue translation.  We do not `iput' the
697 	 * directory because we may need it again if the symbolic link
698 	 * is relative to the current directory.  Instead we save it
699 	 * unlocked as "pdp".  We must get the target inode before unlocking
700 	 * the directory to insure that the inode will not be removed
701 	 * before we get it.  We prevent deadlock by always fetching
702 	 * inodes from the root, moving down the directory tree. Thus
703 	 * when following backward pointers ".." we must unlock the
704 	 * parent directory before getting the requested directory.
705 	 * There is a potential race condition here if both the current
706 	 * and parent directories are removed before the `iget' for the
707 	 * inode associated with ".." returns.  We hope that this occurs
708 	 * infrequently since we cannot avoid this race condition without
709 	 * implementing a sophisticated deadlock detection algorithm.
710 	 * Note also that this simple deadlock detection scheme will not
711 	 * work if the file system has any hard links other than ".."
712 	 * that point backwards in the directory structure.
713 	 */
714 	pdp = dp;
715 	if (isdotdot) {
716 		IUNLOCK(pdp);	/* race to get the inode */
717 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
718 		if (dp == NULL)
719 			goto bad2;
720 	} else if (dp->i_number == ndp->ni_dent.d_ino) {
721 		dp->i_count++;	/* we want ourself, ie "." */
722 	} else {
723 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
724 		IUNLOCK(pdp);
725 		if (dp == NULL)
726 			goto bad2;
727 	}
728 
729 	/*
730 	 * Insert name into cache if appropriate.
731 	 */
732 	if (makeentry) {
733 		if (ncp != NULL)
734 			panic("nami: duplicating cache");
735 		/*
736 		 * Free the cache slot at head of lru chain.
737 		 */
738 		if (ncp = nchhead) {
739 			/* remove from lru chain */
740 			*ncp->nc_prev = ncp->nc_nxt;
741 			if (ncp->nc_nxt)
742 				ncp->nc_nxt->nc_prev = ncp->nc_prev;
743 			else
744 				nchtail = ncp->nc_prev;
745 			remque(ncp);		/* remove from old hash chain */
746 			/* grab the inode we just found */
747 			ncp->nc_ip = dp;
748 			/* fill in cache info */
749 			ncp->nc_ino = pdp->i_number;	/* parents inum */
750 			ncp->nc_dev = pdp->i_dev;	/* & device */
751 			ncp->nc_idev = dp->i_dev;	/* our device */
752 			ncp->nc_id = dp->i_id;		/* identifier */
753 			ncp->nc_nlen = ndp->ni_dent.d_namlen;
754 			bcopy(ndp->ni_dent.d_name, ncp->nc_name,
755 			    (unsigned)ncp->nc_nlen);
756 			/* link at end of lru chain */
757 			ncp->nc_nxt = NULL;
758 			ncp->nc_prev = nchtail;
759 			*nchtail = ncp;
760 			nchtail = &ncp->nc_nxt;
761 			/* and insert on hash chain */
762 			insque(ncp, nhp);
763 		}
764 	}
765 
766 haveino:
767 	fs = dp->i_fs;
768 
769 	/*
770 	 * Check for symbolic link
771 	 */
772 	if ((dp->i_mode & IFMT) == IFLNK &&
773 	    ((ndp->ni_nameiop & FOLLOW) || *cp == '/')) {
774 		u_int pathlen = strlen(cp) + 1;
775 
776 		if (dp->i_size + pathlen >= MAXPATHLEN - 1) {
777 			u.u_error = ENAMETOOLONG;
778 			goto bad2;
779 		}
780 		if (++nlink > MAXSYMLINKS) {
781 			u.u_error = ELOOP;
782 			goto bad2;
783 		}
784 		ovbcopy(cp, nbp->b_un.b_addr + dp->i_size, pathlen);
785 		u.u_error =
786 		    rdwri(UIO_READ, dp, nbp->b_un.b_addr, (int)dp->i_size,
787 			(off_t)0, 1, (int *)0);
788 		if (u.u_error)
789 			goto bad2;
790 		cp = nbp->b_un.b_addr;
791 		iput(dp);
792 		if (*cp == '/') {
793 			irele(pdp);
794 			while (*cp == '/')
795 				cp++;
796 			if ((dp = u.u_rdir) == NULL)
797 				dp = rootdir;
798 			ILOCK(dp);
799 			dp->i_count++;
800 		} else {
801 			dp = pdp;
802 			ILOCK(dp);
803 		}
804 		fs = dp->i_fs;
805 		goto dirloop;
806 	}
807 
808 	/*
809 	 * Not a symbolic link.  If more pathname,
810 	 * continue at next component, else return.
811 	 */
812 	if (*cp == '/') {
813 		while (*cp == '/')
814 			cp++;
815 		irele(pdp);
816 		goto dirloop;
817 	}
818 	nbp->av_forw = freenamebuf;
819 	freenamebuf = nbp;
820 	if (lockparent)
821 		ndp->ni_pdir = pdp;
822 	else
823 		irele(pdp);
824 	return (dp);
825 bad2:
826 	irele(pdp);
827 bad:
828 	if (bp)
829 		brelse(bp);
830 	if (dp)
831 		iput(dp);
832 	nbp->av_forw = freenamebuf;
833 	freenamebuf = nbp;
834 	return (NULL);
835 }
836 
837 
838 dirbad(ip, offset, how)
839 	struct inode *ip;
840 	off_t offset;
841 	char *how;
842 {
843 
844 	printf("%s: bad dir ino %d at offset %d: %s\n",
845 	    ip->i_fs->fs_fsmnt, ip->i_number, offset, how);
846 }
847 
848 /*
849  * Do consistency checking on a directory entry:
850  *	record length must be multiple of 4
851  *	entry must fit in rest of its DIRBLKSIZ block
852  *	record must be large enough to contain entry
853  *	name is not longer than MAXNAMLEN
854  *	name must be as long as advertised, and null terminated
855  */
856 dirbadentry(ep, entryoffsetinblock)
857 	register struct direct *ep;
858 	int entryoffsetinblock;
859 {
860 	register int i;
861 
862 	if ((ep->d_reclen & 0x3) != 0 ||
863 	    ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) ||
864 	    ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN)
865 		return (1);
866 	for (i = 0; i < ep->d_namlen; i++)
867 		if (ep->d_name[i] == '\0')
868 			return (1);
869 	return (ep->d_name[i]);
870 }
871 
872 /*
873  * Write a directory entry after a call to namei, using the parameters
874  * which it left in the u. area.  The argument ip is the inode which
875  * the new directory entry will refer to.  The u. area field ndp->ni_pdir is
876  * a pointer to the directory to be written, which was left locked by
877  * namei.  Remaining parameters (ndp->ni_offset, ndp->ni_count) indicate
878  * how the space for the new entry is to be gotten.
879  */
880 direnter(ip, ndp)
881 	struct inode *ip;
882 	register struct nameidata *ndp;
883 {
884 	register struct direct *ep, *nep;
885 	register struct inode *dp = ndp->ni_pdir;
886 	struct buf *bp;
887 	int loc, spacefree, error = 0;
888 	u_int dsize;
889 	int newentrysize;
890 	char *dirbuf;
891 
892 	ndp->ni_dent.d_ino = ip->i_number;
893 	newentrysize = DIRSIZ(&ndp->ni_dent);
894 	if (ndp->ni_count == 0) {
895 		/*
896 		 * If ndp->ni_count is 0, then namei could find no space in the
897 		 * directory. In this case ndp->ni_offset will be on a directory
898 		 * block boundary and we will write the new entry into a fresh
899 		 * block.
900 		 */
901 		if (ndp->ni_offset&(DIRBLKSIZ-1))
902 			panic("wdir: newblk");
903 		ndp->ni_dent.d_reclen = DIRBLKSIZ;
904 		error = rdwri(UIO_WRITE, dp, (caddr_t)&ndp->ni_dent,
905 		    newentrysize, ndp->ni_offset, 1, (int *)0);
906 		if (DIRBLKSIZ > dp->i_fs->fs_fsize)
907 			panic("wdir: blksize"); /* XXX - should grow w/bmap() */
908 		else
909 			dp->i_size = roundup(dp->i_size, DIRBLKSIZ);
910 		iput(dp);
911 		return (error);
912 	}
913 
914 	/*
915 	 * If ndp->ni_count is non-zero, then namei found space for the new
916 	 * entry in the range ndp->ni_offset to ndp->ni_offset + ndp->ni_count.
917 	 * in the directory.  To use this space, we may have to compact
918 	 * the entries located there, by copying them together towards
919 	 * the beginning of the block, leaving the free space in
920 	 * one usable chunk at the end.
921 	 */
922 
923 	/*
924 	 * Increase size of directory if entry eats into new space.
925 	 * This should never push the size past a new multiple of
926 	 * DIRBLKSIZE.
927 	 *
928 	 * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN.
929 	 */
930 	if (ndp->ni_offset + ndp->ni_count > dp->i_size)
931 		dp->i_size = ndp->ni_offset + ndp->ni_count;
932 	/*
933 	 * Get the block containing the space for the new directory
934 	 * entry.  Should return error by result instead of u.u_error.
935 	 */
936 	bp = blkatoff(dp, ndp->ni_offset, (char **)&dirbuf);
937 	if (bp == 0) {
938 		iput(dp);
939 		return (u.u_error);
940 	}
941 	/*
942 	 * Find space for the new entry.  In the simple case, the
943 	 * entry at offset base will have the space.  If it does
944 	 * not, then namei arranged that compacting the region
945 	 * ndp->ni_offset to ndp->ni_offset+ndp->ni_count would yield the space.
946 	 */
947 	ep = (struct direct *)dirbuf;
948 	dsize = DIRSIZ(ep);
949 	spacefree = ep->d_reclen - dsize;
950 	for (loc = ep->d_reclen; loc < ndp->ni_count; ) {
951 		nep = (struct direct *)(dirbuf + loc);
952 		if (ep->d_ino) {
953 			/* trim the existing slot */
954 			ep->d_reclen = dsize;
955 			ep = (struct direct *)((char *)ep + dsize);
956 		} else {
957 			/* overwrite; nothing there; header is ours */
958 			spacefree += dsize;
959 		}
960 		dsize = DIRSIZ(nep);
961 		spacefree += nep->d_reclen - dsize;
962 		loc += nep->d_reclen;
963 		bcopy((caddr_t)nep, (caddr_t)ep, dsize);
964 	}
965 	/*
966 	 * Update the pointer fields in the previous entry (if any),
967 	 * copy in the new entry, and write out the block.
968 	 */
969 	if (ep->d_ino == 0) {
970 		if (spacefree + dsize < newentrysize)
971 			panic("wdir: compact1");
972 		ndp->ni_dent.d_reclen = spacefree + dsize;
973 	} else {
974 		if (spacefree < newentrysize)
975 			panic("wdir: compact2");
976 		ndp->ni_dent.d_reclen = spacefree;
977 		ep->d_reclen = dsize;
978 		ep = (struct direct *)((char *)ep + dsize);
979 	}
980 	bcopy((caddr_t)&ndp->ni_dent, (caddr_t)ep, (u_int)newentrysize);
981 	bwrite(bp);
982 	dp->i_flag |= IUPD|ICHG;
983 	if (ndp->ni_endoff && ndp->ni_endoff < dp->i_size)
984 		itrunc(dp, (u_long)ndp->ni_endoff);
985 	iput(dp);
986 	return (error);
987 }
988 
989 /*
990  * Remove a directory entry after a call to namei, using the
991  * parameters which it left in the u. area.  The u. entry
992  * ni_offset contains the offset into the directory of the
993  * entry to be eliminated.  The ni_count field contains the
994  * size of the previous record in the directory.  If this
995  * is 0, the first entry is being deleted, so we need only
996  * zero the inode number to mark the entry as free.  If the
997  * entry isn't the first in the directory, we must reclaim
998  * the space of the now empty record by adding the record size
999  * to the size of the previous entry.
1000  */
1001 dirremove(ndp)
1002 	register struct nameidata *ndp;
1003 {
1004 	register struct inode *dp = ndp->ni_pdir;
1005 	register struct buf *bp;
1006 	struct direct *ep;
1007 
1008 	if (ndp->ni_count == 0) {
1009 		/*
1010 		 * First entry in block: set d_ino to zero.
1011 		 */
1012 		ndp->ni_dent.d_ino = 0;
1013 		(void) rdwri(UIO_WRITE, dp, (caddr_t)&ndp->ni_dent,
1014 		    (int)DIRSIZ(&ndp->ni_dent), ndp->ni_offset, 1, (int *)0);
1015 	} else {
1016 		/*
1017 		 * Collapse new free space into previous entry.
1018 		 */
1019 		bp = blkatoff(dp, ndp->ni_offset - ndp->ni_count, (char **)&ep);
1020 		if (bp == 0)
1021 			return (0);
1022 		ep->d_reclen += ndp->ni_dent.d_reclen;
1023 		bwrite(bp);
1024 		dp->i_flag |= IUPD|ICHG;
1025 	}
1026 	return (1);
1027 }
1028 
1029 /*
1030  * Rewrite an existing directory entry to point at the inode
1031  * supplied.  The parameters describing the directory entry are
1032  * set up by a call to namei.
1033  */
1034 dirrewrite(dp, ip, ndp)
1035 	struct inode *dp, *ip;
1036 	struct nameidata *ndp;
1037 {
1038 
1039 	ndp->ni_dent.d_ino = ip->i_number;
1040 	u.u_error = rdwri(UIO_WRITE, dp, (caddr_t)&ndp->ni_dent,
1041 		(int)DIRSIZ(&ndp->ni_dent), ndp->ni_offset, 1, (int *)0);
1042 	iput(dp);
1043 }
1044 
1045 /*
1046  * Return buffer with contents of block "offset"
1047  * from the beginning of directory "ip".  If "res"
1048  * is non-zero, fill it in with a pointer to the
1049  * remaining space in the directory.
1050  */
1051 struct buf *
1052 blkatoff(ip, offset, res)
1053 	struct inode *ip;
1054 	off_t offset;
1055 	char **res;
1056 {
1057 	register struct fs *fs = ip->i_fs;
1058 	daddr_t lbn = lblkno(fs, offset);
1059 	int bsize = blksize(fs, ip, lbn);
1060 	register struct buf *bp;
1061 	daddr_t bn;
1062 
1063 	bn = bmap(ip, lbn, B_READ, bsize);
1064 	if (u.u_error)
1065 		return (0);
1066 	if (bn == (daddr_t)-1) {
1067 		dirbad(ip, offset, "hole in dir");
1068 		return (0);
1069 	}
1070 	bp = bread(ip->i_dev, fsbtodb(fs, bn), bsize);
1071 	if (bp->b_flags & B_ERROR) {
1072 		brelse(bp);
1073 		return (0);
1074 	}
1075 	if (res)
1076 		*res = bp->b_un.b_addr + blkoff(fs, offset);
1077 	return (bp);
1078 }
1079 
1080 /*
1081  * Check if a directory is empty or not.
1082  * Inode supplied must be locked.
1083  *
1084  * Using a struct dirtemplate here is not precisely
1085  * what we want, but better than using a struct direct.
1086  *
1087  * NB: does not handle corrupted directories.
1088  */
1089 dirempty(ip, parentino)
1090 	register struct inode *ip;
1091 	ino_t parentino;
1092 {
1093 	register off_t off;
1094 	struct dirtemplate dbuf;
1095 	register struct direct *dp = (struct direct *)&dbuf;
1096 	int error, count;
1097 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
1098 
1099 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
1100 		error = rdwri(UIO_READ, ip, (caddr_t)dp, MINDIRSIZ,
1101 		    off, 1, &count);
1102 		/*
1103 		 * Since we read MINDIRSIZ, residual must
1104 		 * be 0 unless we're at end of file.
1105 		 */
1106 		if (error || count != 0)
1107 			return (0);
1108 		/* avoid infinite loops */
1109 		if (dp->d_reclen == 0)
1110 			return (0);
1111 		/* skip empty entries */
1112 		if (dp->d_ino == 0)
1113 			continue;
1114 		/* accept only "." and ".." */
1115 		if (dp->d_namlen > 2)
1116 			return (0);
1117 		if (dp->d_name[0] != '.')
1118 			return (0);
1119 		/*
1120 		 * At this point d_namlen must be 1 or 2.
1121 		 * 1 implies ".", 2 implies ".." if second
1122 		 * char is also "."
1123 		 */
1124 		if (dp->d_namlen == 1)
1125 			continue;
1126 		if (dp->d_name[1] == '.' && dp->d_ino == parentino)
1127 			continue;
1128 		return (0);
1129 	}
1130 	return (1);
1131 }
1132 
1133 /*
1134  * Check if source directory is in the path of the target directory.
1135  * Target is supplied locked, source is unlocked.
1136  * The target is always iput() before returning.
1137  */
1138 checkpath(source, target)
1139 	struct inode *source, *target;
1140 {
1141 	struct dirtemplate dirbuf;
1142 	register struct inode *ip;
1143 	int error = 0;
1144 
1145 	ip = target;
1146 	if (ip->i_number == source->i_number) {
1147 		error = EEXIST;
1148 		goto out;
1149 	}
1150 	if (ip->i_number == ROOTINO)
1151 		goto out;
1152 
1153 	for (;;) {
1154 		if ((ip->i_mode&IFMT) != IFDIR) {
1155 			error = ENOTDIR;
1156 			break;
1157 		}
1158 		error = rdwri(UIO_READ, ip, (caddr_t)&dirbuf,
1159 			sizeof (struct dirtemplate), (off_t)0, 1, (int *)0);
1160 		if (error != 0)
1161 			break;
1162 		if (dirbuf.dotdot_namlen != 2 ||
1163 		    dirbuf.dotdot_name[0] != '.' ||
1164 		    dirbuf.dotdot_name[1] != '.') {
1165 			error = ENOTDIR;
1166 			break;
1167 		}
1168 		if (dirbuf.dotdot_ino == source->i_number) {
1169 			error = EINVAL;
1170 			break;
1171 		}
1172 		if (dirbuf.dotdot_ino == ROOTINO)
1173 			break;
1174 		iput(ip);
1175 		ip = iget(ip->i_dev, ip->i_fs, dirbuf.dotdot_ino);
1176 		if (ip == NULL) {
1177 			error = u.u_error;
1178 			break;
1179 		}
1180 	}
1181 
1182 out:
1183 	if (error == ENOTDIR)
1184 		printf("checkpath: .. not a directory\n");
1185 	if (ip != NULL)
1186 		iput(ip);
1187 	return (error);
1188 }
1189 
1190 /*
1191  * Name cache initialization, from main() when we are booting
1192  */
1193 nchinit()
1194 {
1195 	register union nchash *nchp;
1196 	register struct namecache *ncp;
1197 
1198 	nchhead = 0;
1199 	nchtail = &nchhead;
1200 	for (ncp = namecache; ncp < &namecache[nchsize]; ncp++) {
1201 		ncp->nc_forw = ncp;			/* hash chain */
1202 		ncp->nc_back = ncp;
1203 		ncp->nc_nxt = NULL;			/* lru chain */
1204 		*nchtail = ncp;
1205 		ncp->nc_prev = nchtail;
1206 		nchtail = &ncp->nc_nxt;
1207 		/* all else is zero already */
1208 	}
1209 	for (nchp = nchash; nchp < &nchash[NCHHASH]; nchp++) {
1210 		nchp->nch_head[0] = nchp;
1211 		nchp->nch_head[1] = nchp;
1212 	}
1213 }
1214 
1215 /*
1216  * Cache flush, called when filesys is umounted to
1217  * remove entries that would now be invalid
1218  *
1219  * The line "nxtcp = nchhead" near the end is to avoid potential problems
1220  * if the cache lru chain is modified while we are dumping the
1221  * inode.  This makes the algorithm O(n^2), but do you think I care?
1222  */
1223 nchinval(dev)
1224 	register dev_t dev;
1225 {
1226 	register struct namecache *ncp, *nxtcp;
1227 
1228 	for (ncp = nchhead; ncp; ncp = nxtcp) {
1229 		nxtcp = ncp->nc_nxt;
1230 		if (ncp->nc_ip == NULL ||
1231 		    (ncp->nc_idev != dev && ncp->nc_dev != dev))
1232 			continue;
1233 		/* free the resources we had */
1234 		ncp->nc_idev = NODEV;
1235 		ncp->nc_dev = NODEV;
1236 		ncp->nc_id = NULL;
1237 		ncp->nc_ino = 0;
1238 		ncp->nc_ip = NULL;
1239 		remque(ncp);		/* remove entry from its hash chain */
1240 		ncp->nc_forw = ncp;	/* and make a dummy one */
1241 		ncp->nc_back = ncp;
1242 		/* delete this entry from LRU chain */
1243 		*ncp->nc_prev = nxtcp;
1244 		if (nxtcp)
1245 			nxtcp->nc_prev = ncp->nc_prev;
1246 		else
1247 			nchtail = ncp->nc_prev;
1248 		/* cause rescan of list, it may have altered */
1249 		nxtcp = nchhead;
1250 		/* put the now-free entry at head of LRU */
1251 		ncp->nc_nxt = nxtcp;
1252 		ncp->nc_prev = &nchhead;
1253 		nxtcp->nc_prev = &ncp->nc_nxt;
1254 		nchhead = ncp;
1255 	}
1256 }
1257 
1258 /*
1259  * Name cache invalidation of all entries.
1260  */
1261 cacheinvalall()
1262 {
1263 	register struct namecache *ncp;
1264 
1265 	for (ncp = namecache; ncp < &namecache[nchsize]; ncp++)
1266 		ncp->nc_id = 0;
1267 }
1268