xref: /csrg-svn/sys/kern/vfs_lookup.c (revision 18027)
1 /*	vfs_lookup.c	6.18	85/02/20	*/
2 
3 #include "param.h"
4 #include "systm.h"
5 #include "inode.h"
6 #include "fs.h"
7 #include "mount.h"
8 #include "dir.h"
9 #include "user.h"
10 #include "buf.h"
11 #include "conf.h"
12 #include "uio.h"
13 #include "kernel.h"
14 
15 struct	buf *blkatoff();
16 struct	buf *freenamebuf;
17 int	dirchk = 0;
18 
19 /*
20  * Structures associated with name cacheing.
21  */
22 #define	NCHHASH		32	/* size of hash table */
23 
24 #if	((NCHHASH)&((NCHHASH)-1)) != 0
25 #define	NHASH(h, i, d)	((unsigned)((h) + (i) + 13 * (int)(d)) % (NCHHASH))
26 #else
27 #define	NHASH(h, i, d)	((unsigned)((h) + (i) + 13 * (int)(d)) & ((NCHHASH)-1))
28 #endif
29 
30 union	nchash	{
31 	union	nchash	*nch_head[2];
32 	struct	nch	*nch_chain[2];
33 } nchash[NCHHASH];
34 #define	nch_forw	nch_chain[0]
35 #define	nch_back	nch_chain[1]
36 
37 struct	nch	*nchhead, **nchtail;	/* LRU chain pointers */
38 struct	nchstats nchstats;		/* cache effectiveness statistics */
39 
40 /*
41  * Convert a pathname into a pointer to a locked inode,
42  * with side effects usable in creating and removing files.
43  * This is a very central and rather complicated routine.
44  *
45  * The segflg defines whether the name is to be copied from user
46  * space or kernel space.
47  *
48  * The flag argument is (LOOKUP, CREATE, DELETE) depending on whether
49  * the name is to be (looked up, created, deleted).  If flag has
50  * LOCKPARENT or'ed into it and the target of the pathname exists,
51  * namei returns both the target and its parent directory locked.
52  * If the file system is not maintained in a strict tree hierarchy,
53  * this can result in a deadlock situation.  When creating and
54  * LOCKPARENT is specified, the target may not be ".".  When deleting
55  * and LOCKPARENT is specified, the target may be ".", but the caller
56  * must check to insure it does an irele and iput instead of two iputs.
57  *
58  * The FOLLOW flag is set when symbolic links are to be followed
59  * when they occur at the end of the name translation process.
60  *
61  * Name caching works as follows:
62  *
63  *	names found by directory scans are retained in a cache
64  *	for future reference.  It is managed LRU, so frequently
65  *	used names will hang around.  Cache is indexed by hash value
66  *	obtained from (ino,dev,name) where ino & dev refer to the
67  *	directory containing name.
68  *
69  *	For simplicity (and economy of storage), names longer than
70  *	some (small) maximum length are not cached, they occur
71  *	infrequently in any case, and are almost never of interest.
72  *
73  *	Upon reaching the last segment of a path, if the reference
74  *	is for DELETE, or NOCACHE is set (rewrite), and the
75  *	name is located in the cache, it will be dropped.
76  *
77  *	We must be sure never to enter the name ".." into the cache
78  *	because of the extremely kludgey way that rename() alters
79  *	".." in a situation like
80  *		mv a/x b/x
81  *	where x is a directory, and x/.. is the ".." in question.
82  *
83  * Overall outline of namei:
84  *
85  *	copy in name
86  *	get starting directory
87  * dirloop:
88  *	check accessibility of directory
89  * dirloop2:
90  *	copy next component of name to ndp->ni_dent
91  *	handle degenerate case where name is null string
92  *	look for name in cache, if found, then if at end of path
93  *	  and deleting or creating, drop it, else to haveino
94  *	search for name in directory, to found or notfound
95  * notfound:
96  *	if creating, return locked directory, leaving info on avail. slots
97  *	else return error
98  * found:
99  *	if at end of path and deleting, return information to allow delete
100  *	if at end of path and rewriting (create and LOCKPARENT), lock target
101  *	  inode and return info to allow rewrite
102  *	if .. and on mounted filesys, look in mount table for parent
103  *	if not at end, if neither creating nor deleting, add name to cache
104  * haveino:
105  *	if symbolic link, massage name in buffer and continue at dirloop
106  *	if more components of name, do next level at dirloop
107  *	return the answer as locked inode
108  *
109  * NOTE: (LOOKUP | LOCKPARENT) currently returns the parent inode,
110  *	 but unlocked.
111  */
112 struct inode *
113 namei(ndp)
114 	register struct nameidata *ndp;
115 {
116 	register char *cp;		/* pointer into pathname argument */
117 /* these variables refer to things which must be freed or unlocked */
118 	register struct inode *dp = 0;	/* the directory we are searching */
119 	register struct nch *ncp;	/* cache slot for entry */
120 	register struct fs *fs;		/* file system that directory is in */
121 	register struct buf *bp = 0;	/* a buffer of directory entries */
122 	register struct direct *ep;	/* the current directory entry */
123 	int entryoffsetinblock;		/* offset of ep in bp's buffer */
124 	register struct buf *nbp;	/* buffer storing path name argument */
125 /* these variables hold information about the search for a slot */
126 	enum {NONE, COMPACT, FOUND} slotstatus;
127 	int slotoffset = -1;		/* offset of area with free space */
128 	int slotsize;			/* size of area at slotoffset */
129 	int slotfreespace;		/* amount of space free in slot */
130 	int slotneeded;			/* size of the entry we're seeking */
131 /* */
132 	int numdirpasses;		/* strategy for directory search */
133 	int endsearch;			/* offset to end directory search */
134 	int prevoff;			/* ndp->ni_offset of previous entry */
135 	int nlink = 0;			/* number of symbolic links taken */
136 	struct inode *pdp;		/* saved dp during symlink work */
137 	int error, i;
138 	int lockparent;
139 	int docache;
140 	unsigned hash;			/* value of name hash for entry */
141 	union nchash *nhp;		/* cache chain head for entry */
142 	int isdotdot;			/* != 0 if current name is ".." */
143 	int flag;			/* op ie, LOOKUP, CREATE, or DELETE */
144 	off_t enduseful;		/* pointer past last used dir slot */
145 
146 	lockparent = ndp->ni_nameiop & LOCKPARENT;
147 	docache = (ndp->ni_nameiop & NOCACHE) ^ NOCACHE;
148 	flag = ndp->ni_nameiop &~ (LOCKPARENT|NOCACHE|FOLLOW);
149 	if (flag == DELETE)
150 		docache = 0;
151 	/*
152 	 * Get a buffer for the name to be translated, and copy the
153 	 * name into the buffer.
154 	 */
155 	nbp = freenamebuf;
156 	if (nbp == NULL)
157 		nbp = geteblk(MAXPATHLEN);
158 	else
159 		freenamebuf = nbp->av_forw;
160 	if (ndp->ni_segflg == UIO_SYSSPACE)
161 		error = copystr(ndp->ni_dirp, nbp->b_un.b_addr, MAXPATHLEN,
162 		    (u_int *)0);
163 	else
164 		error = copyinstr(ndp->ni_dirp, nbp->b_un.b_addr, MAXPATHLEN,
165 		    (u_int *)0);
166 	if (error) {
167 		u.u_error = error;
168 		goto bad;
169 	}
170 
171 	/*
172 	 * Get starting directory.
173 	 */
174 	cp = nbp->b_un.b_addr;
175 	if (*cp == '/') {
176 		while (*cp == '/')
177 			cp++;
178 		if ((dp = u.u_rdir) == NULL)
179 			dp = rootdir;
180 	} else
181 		dp = u.u_cdir;
182 	fs = dp->i_fs;
183 	ILOCK(dp);
184 	dp->i_count++;
185 	ndp->ni_pdir = (struct inode *)0xc0000000;		/* illegal */
186 	ndp->ni_endoff = 0;
187 
188 	/*
189 	 * We come to dirloop to search a new directory.
190 	 * The directory must be locked so that it can be
191 	 * iput, and fs must be already set to dp->i_fs.
192 	 */
193 dirloop:
194 	/*
195 	 * Check accessiblity of directory.
196 	 */
197 	if ((dp->i_mode&IFMT) != IFDIR) {
198 		u.u_error = ENOTDIR;
199 		goto bad;
200 	}
201 	if (access(dp, IEXEC))
202 		goto bad;
203 
204 dirloop2:
205 	/*
206 	 * Copy next component of name to ndp->ni_dent.
207 	 */
208 	hash = 0;
209 	for (i = 0; *cp != 0 && *cp != '/'; cp++) {
210 		if (i >= MAXNAMLEN) {
211 			u.u_error = ENOENT;
212 			goto bad;
213 		}
214 		if ((*cp&0377) == ('/'|0200) || (*cp&0200) && flag != DELETE) {
215 			u.u_error = EPERM;
216 			goto bad;
217 		}
218 		ndp->ni_dent.d_name[i++] = *cp;
219 		hash += (unsigned char)*cp * i;
220 	}
221 	ndp->ni_dent.d_namlen = i;
222 	ndp->ni_dent.d_name[i] = '\0';
223 	isdotdot = (i == 2 &&
224 		ndp->ni_dent.d_name[0] == '.' && ndp->ni_dent.d_name[1] == '.');
225 
226 	/*
227 	 * Check for degenerate name (e.g. / or "")
228 	 * which is a way of talking about a directory,
229 	 * e.g. like "/." or ".".
230 	 */
231 	if (ndp->ni_dent.d_name[0] == '\0') {
232 		if (flag != LOOKUP || lockparent) {
233 			u.u_error = EISDIR;
234 			goto bad;
235 		}
236 		nbp->av_forw = freenamebuf;
237 		freenamebuf = nbp;
238 		return (dp);
239 	}
240 
241 	/*
242 	 * We now have a segment name to search for, and a directory to search.
243 	 *
244 	 * Before tediously performing a linear scan of the directory,
245 	 * check the name cache to see if the directory/name pair
246 	 * we are looking for is known already.  We don't do this
247 	 * if the segment name is long, simply so the cache can avoid
248 	 * holding long names (which would either waste space, or
249 	 * add greatly to the complexity).
250 	 */
251 	if (ndp->ni_dent.d_namlen > NCHNAMLEN) {
252 		nchstats.ncs_long++;
253 		docache = 0;
254 	} else {
255 		nhp = &nchash[NHASH(hash, dp->i_number, dp->i_dev)];
256 		for (ncp = nhp->nch_forw; ncp != (struct nch *)nhp;
257 		    ncp = ncp->nc_forw) {
258 			if (ncp->nc_ino == dp->i_number &&
259 			    ncp->nc_dev == dp->i_dev &&
260 			    ncp->nc_nlen == ndp->ni_dent.d_namlen &&
261 			    !bcmp(ncp->nc_name, ndp->ni_dent.d_name,
262 				ncp->nc_nlen))
263 				break;
264 		}
265 
266 		if (ncp == (struct nch *)nhp) {
267 			nchstats.ncs_miss++;
268 			ncp = NULL;
269 		} else {
270 			if (ncp->nc_id != ncp->nc_ip->i_id) {
271 				nchstats.ncs_falsehits++;
272 			} else if (*cp == '\0' && !docache) {
273 				nchstats.ncs_badhits++;
274 			} else {
275 
276 					/*
277 					 * move this slot to end of LRU
278 					 * chain, if not already there
279 					 */
280 				if (ncp->nc_nxt) {
281 						/* remove from LRU chain */
282 					*ncp->nc_prev = ncp->nc_nxt;
283 					ncp->nc_nxt->nc_prev = ncp->nc_prev;
284 
285 						/* and replace at end of it */
286 					ncp->nc_nxt = NULL;
287 					ncp->nc_prev = nchtail;
288 					*nchtail = ncp;
289 					nchtail = &ncp->nc_nxt;
290 				}
291 
292 				/*
293 				 * Get the next inode in the path.
294 				 * See comment above other `IUNLOCK' code for
295 				 * an explaination of the locking protocol.
296 				 */
297 				pdp = dp;
298 				dp = ncp->nc_ip;
299 				if (dp == NULL)
300 					panic("nami: null cache ino");
301 				if (pdp == dp)
302 					dp->i_count++;
303 				else {
304 					if (isdotdot) {
305 						IUNLOCK(pdp);
306 						igrab(dp);
307 					} else {
308 						igrab(dp);
309 						IUNLOCK(pdp);
310 					}
311 				}
312 
313 				/*
314 				 * Verify that the inode that we got
315 				 * did not change while we were waiting
316 				 * for it to be locked.
317 				 */
318 				if (ncp->nc_id != ncp->nc_ip->i_id) {
319 					iput(dp);
320 					ILOCK(pdp);
321 					dp = pdp;
322 					nchstats.ncs_falsehits++;
323 				} else {
324 					ndp->ni_dent.d_ino = dp->i_number;
325 					/* ni_dent.d_reclen is garbage ... */
326 					nchstats.ncs_goodhits++;
327 					goto haveino;
328 				}
329 			}
330 
331 			/*
332 			 * Last component and we are renaming or deleting,
333 			 * the cache entry is invalid, or otherwise don't
334 			 * want cache entry to exist.
335 			 */
336 
337 				/* remove from LRU chain */
338 			*ncp->nc_prev = ncp->nc_nxt;
339 			if (ncp->nc_nxt)
340 				ncp->nc_nxt->nc_prev = ncp->nc_prev;
341 			else
342 				nchtail = ncp->nc_prev;
343 
344 				/* remove from hash chain */
345 			remque(ncp);
346 
347 				/* insert at head of LRU list (first to grab) */
348 			ncp->nc_nxt = nchhead;
349 			ncp->nc_prev = &nchhead;
350 			nchhead->nc_prev = &ncp->nc_nxt;
351 			nchhead = ncp;
352 
353 				/* and make a dummy hash chain */
354 			ncp->nc_forw = ncp;
355 			ncp->nc_back = ncp;
356 
357 			ncp = NULL;
358 		}
359 	}
360 
361 	/*
362 	 * Suppress search for slots unless creating
363 	 * file and at end of pathname, in which case
364 	 * we watch for a place to put the new file in
365 	 * case it doesn't already exist.
366 	 */
367 	slotstatus = FOUND;
368 	if (flag == CREATE && *cp == 0) {
369 		slotstatus = NONE;
370 		slotfreespace = 0;
371 		slotneeded = DIRSIZ(&ndp->ni_dent);
372 	}
373 	/*
374 	 * If this is the same directory that this process
375 	 * previously searched, pick up where we last left off.
376 	 * We cache only lookups as these are the most common
377 	 * and have the greatest payoff. Caching CREATE has little
378 	 * benefit as it usually must search the entire directory
379 	 * to determine that the entry does not exist. Caching the
380 	 * location of the last DELETE has not reduced profiling time
381 	 * and hence has been removed in the interest of simplicity.
382 	 */
383 	if (flag != LOOKUP || dp->i_number != u.u_ncache.nc_inumber ||
384 	    dp->i_dev != u.u_ncache.nc_dev) {
385 		ndp->ni_offset = 0;
386 		numdirpasses = 1;
387 	} else {
388 		if ((dp->i_flag & ICHG) || dp->i_ctime >= u.u_ncache.nc_time) {
389 			if (u.u_ncache.nc_prevoffset > dp->i_size)
390 				u.u_ncache.nc_prevoffset = 0;
391 			else
392 				u.u_ncache.nc_prevoffset &= ~(DIRBLKSIZ - 1);
393 			u.u_ncache.nc_time = time.tv_sec;
394 		}
395 		ndp->ni_offset = u.u_ncache.nc_prevoffset;
396 		entryoffsetinblock = blkoff(fs, ndp->ni_offset);
397 		if (entryoffsetinblock != 0) {
398 			bp = blkatoff(dp, ndp->ni_offset, (char **)0);
399 			if (bp == 0)
400 				goto bad;
401 		}
402 		numdirpasses = 2;
403 		nchstats.ncs_2passes++;
404 	}
405 	endsearch = roundup(dp->i_size, DIRBLKSIZ);
406 	enduseful = 0;
407 
408 searchloop:
409 	while (ndp->ni_offset < endsearch) {
410 		/*
411 		 * If offset is on a block boundary,
412 		 * read the next directory block.
413 		 * Release previous if it exists.
414 		 */
415 		if (blkoff(fs, ndp->ni_offset) == 0) {
416 			if (bp != NULL)
417 				brelse(bp);
418 			bp = blkatoff(dp, ndp->ni_offset, (char **)0);
419 			if (bp == 0)
420 				goto bad;
421 			entryoffsetinblock = 0;
422 		}
423 
424 		/*
425 		 * If still looking for a slot, and at a DIRBLKSIZE
426 		 * boundary, have to start looking for free space again.
427 		 */
428 		if (slotstatus == NONE &&
429 		    (entryoffsetinblock&(DIRBLKSIZ-1)) == 0) {
430 			slotoffset = -1;
431 			slotfreespace = 0;
432 		}
433 
434 		/*
435 		 * Get pointer to next entry.
436 		 * Full validation checks are slow, so we only check
437 		 * enough to insure forward progress through the
438 		 * directory. Complete checks can be run by patching
439 		 * "dirchk" to be true.
440 		 */
441 		ep = (struct direct *)(bp->b_un.b_addr + entryoffsetinblock);
442 		if (ep->d_reclen <= 0 ||
443 		    dirchk && dirbadentry(ep, entryoffsetinblock)) {
444 			dirbad(dp, ndp->ni_offset, "mangled entry");
445 			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
446 			ndp->ni_offset += i;
447 			entryoffsetinblock += i;
448 			continue;
449 		}
450 
451 		/*
452 		 * If an appropriate sized slot has not yet been found,
453 		 * check to see if one is available. Also accumulate space
454 		 * in the current block so that we can determine if
455 		 * compaction is viable.
456 		 */
457 		if (slotstatus != FOUND) {
458 			int size = ep->d_reclen;
459 
460 			if (ep->d_ino != 0)
461 				size -= DIRSIZ(ep);
462 			if (size > 0) {
463 				if (size >= slotneeded) {
464 					slotstatus = FOUND;
465 					slotoffset = ndp->ni_offset;
466 					slotsize = ep->d_reclen;
467 				} else if (slotstatus == NONE) {
468 					slotfreespace += size;
469 					if (slotoffset == -1)
470 						slotoffset = ndp->ni_offset;
471 					if (slotfreespace >= slotneeded) {
472 						slotstatus = COMPACT;
473 						slotsize = ndp->ni_offset +
474 						      ep->d_reclen - slotoffset;
475 					}
476 				}
477 			}
478 		}
479 
480 		/*
481 		 * Check for a name match.
482 		 */
483 		if (ep->d_ino) {
484 			if (ep->d_namlen == ndp->ni_dent.d_namlen &&
485 			    !bcmp(ndp->ni_dent.d_name, ep->d_name,
486 				ep->d_namlen))
487 				goto found;
488 		}
489 		prevoff = ndp->ni_offset;
490 		ndp->ni_offset += ep->d_reclen;
491 		entryoffsetinblock += ep->d_reclen;
492 		if (ep->d_ino)
493 			enduseful = ndp->ni_offset;
494 	}
495 /* notfound: */
496 	/*
497 	 * If we started in the middle of the directory and failed
498 	 * to find our target, we must check the beginning as well.
499 	 */
500 	if (numdirpasses == 2) {
501 		numdirpasses--;
502 		ndp->ni_offset = 0;
503 		endsearch = u.u_ncache.nc_prevoffset;
504 		goto searchloop;
505 	}
506 	/*
507 	 * If creating, and at end of pathname and current
508 	 * directory has not been removed, then can consider
509 	 * allowing file to be created.
510 	 */
511 	if (flag == CREATE && *cp == 0 && dp->i_nlink != 0) {
512 		/*
513 		 * Access for write is interpreted as allowing
514 		 * creation of files in the directory.
515 		 */
516 		if (access(dp, IWRITE))
517 			goto bad;
518 		/*
519 		 * Return an indication of where the new directory
520 		 * entry should be put.  If we didn't find a slot,
521 		 * then set ndp->ni_count to 0 indicating that the new
522 		 * slot belongs at the end of the directory. If we found
523 		 * a slot, then the new entry can be put in the range
524 		 * [ndp->ni_offset .. ndp->ni_offset + ndp->ni_count)
525 		 */
526 		if (slotstatus == NONE) {
527 			ndp->ni_offset = roundup(dp->i_size, DIRBLKSIZ);
528 			ndp->ni_count = 0;
529 			enduseful = ndp->ni_offset;
530 		} else {
531 			ndp->ni_offset = slotoffset;
532 			ndp->ni_count = slotsize;
533 			if (enduseful < slotoffset + slotsize)
534 				enduseful = slotoffset + slotsize;
535 		}
536 		ndp->ni_endoff = roundup(enduseful, DIRBLKSIZ);
537 		dp->i_flag |= IUPD|ICHG;
538 		if (bp)
539 			brelse(bp);
540 		nbp->av_forw = freenamebuf;
541 		freenamebuf = nbp;
542 		/*
543 		 * We return with the directory locked, so that
544 		 * the parameters we set up above will still be
545 		 * valid if we actually decide to do a direnter().
546 		 * We return NULL to indicate that the entry doesn't
547 		 * currently exist, leaving a pointer to the (locked)
548 		 * directory inode in ndp->ni_pdir.
549 		 */
550 		ndp->ni_pdir = dp;
551 		return (NULL);
552 	}
553 	u.u_error = ENOENT;
554 	goto bad;
555 found:
556 	if (numdirpasses == 2)
557 		nchstats.ncs_pass2++;
558 	/*
559 	 * Check that directory length properly reflects presence
560 	 * of this entry.
561 	 */
562 	if (entryoffsetinblock + DIRSIZ(ep) > dp->i_size) {
563 		dirbad(dp, ndp->ni_offset, "i_size too small");
564 		dp->i_size = entryoffsetinblock + DIRSIZ(ep);
565 		dp->i_flag |= IUPD|ICHG;
566 	}
567 
568 	/*
569 	 * Found component in pathname.
570 	 * If the final component of path name, save information
571 	 * in the cache as to where the entry was found.
572 	 */
573 	if (*cp == '\0' && flag == LOOKUP) {
574 		u.u_ncache.nc_prevoffset = ndp->ni_offset;
575 		u.u_ncache.nc_inumber = dp->i_number;
576 		u.u_ncache.nc_dev = dp->i_dev;
577 		u.u_ncache.nc_time = time.tv_sec;
578 	}
579 	/*
580 	 * Save directory entry in ndp->ni_dent,
581 	 * and release directory buffer.
582 	 */
583 	bcopy((caddr_t)ep, (caddr_t)&ndp->ni_dent, (u_int)DIRSIZ(ep));
584 	brelse(bp);
585 	bp = NULL;
586 
587 	/*
588 	 * If deleting, and at end of pathname, return
589 	 * parameters which can be used to remove file.
590 	 * If the lockparent flag isn't set, we return only
591 	 * the directory (in ndp->ni_pdir), otherwise we go
592 	 * on and lock the inode, being careful with ".".
593 	 */
594 	if (flag == DELETE && *cp == 0) {
595 		/*
596 		 * Write access to directory required to delete files.
597 		 */
598 		if (access(dp, IWRITE))
599 			goto bad;
600 		ndp->ni_pdir = dp;		/* for dirremove() */
601 		/*
602 		 * Return pointer to current entry in ndp->ni_offset,
603 		 * and distance past previous entry (if there
604 		 * is a previous entry in this block) in ndp->ni_count.
605 		 * Save directory inode pointer in ndp->ni_pdir for dirremove().
606 		 */
607 		if ((ndp->ni_offset&(DIRBLKSIZ-1)) == 0)
608 			ndp->ni_count = 0;
609 		else
610 			ndp->ni_count = ndp->ni_offset - prevoff;
611 		if (lockparent) {
612 			if (dp->i_number == ndp->ni_dent.d_ino)
613 				dp->i_count++;
614 			else {
615 				dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
616 				if (dp == NULL) {
617 					iput(ndp->ni_pdir);
618 					goto bad;
619 				}
620 				/*
621 				 * If directory is "sticky", then user must own
622 				 * the directory, or the file in it, else he
623 				 * may not delete it (unless he's root). This
624 				 * implements append-only directories.
625 				 */
626 				if ((ndp->ni_pdir->i_mode & ISVTX) &&
627 				    u.u_uid != 0 &&
628 				    u.u_uid != ndp->ni_pdir->i_uid &&
629 				    dp->i_uid != u.u_uid) {
630 					iput(ndp->ni_pdir);
631 					u.u_error = EPERM;
632 					goto bad;
633 				}
634 			}
635 		}
636 		nbp->av_forw = freenamebuf;
637 		freenamebuf = nbp;
638 		return (dp);
639 	}
640 
641 	/*
642 	 * Special handling for ".." allowing chdir out of mounted
643 	 * file system: indirect .. in root inode to reevaluate
644 	 * in directory file system was mounted on.
645 	 */
646 	if (isdotdot) {
647 		if (dp == u.u_rdir)
648 			ndp->ni_dent.d_ino = dp->i_number;
649 		else if (ndp->ni_dent.d_ino == ROOTINO &&
650 		   dp->i_number == ROOTINO) {
651 			for (i = 1; i < NMOUNT; i++)
652 			if (mount[i].m_bufp != NULL &&
653 			   mount[i].m_dev == dp->i_dev) {
654 				iput(dp);
655 				dp = mount[i].m_inodp;
656 				ILOCK(dp);
657 				dp->i_count++;
658 				fs = dp->i_fs;
659 				cp -= 2;     /* back over .. */
660 				goto dirloop2;
661 			}
662 		}
663 	}
664 
665 	/*
666 	 * If rewriting (rename), return the inode and the
667 	 * information required to rewrite the present directory
668 	 * Must get inode of directory entry to verify it's a
669 	 * regular file, or empty directory.
670 	 */
671 	if ((flag == CREATE && lockparent) && *cp == 0) {
672 		if (access(dp, IWRITE))
673 			goto bad;
674 		ndp->ni_pdir = dp;		/* for dirrewrite() */
675 		/*
676 		 * Careful about locking second inode.
677 		 * This can only occur if the target is ".".
678 		 */
679 		if (dp->i_number == ndp->ni_dent.d_ino) {
680 			u.u_error = EISDIR;		/* XXX */
681 			goto bad;
682 		}
683 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
684 		if (dp == NULL) {
685 			iput(ndp->ni_pdir);
686 			goto bad;
687 		}
688 		nbp->av_forw = freenamebuf;
689 		freenamebuf = nbp;
690 		return (dp);
691 	}
692 
693 	/*
694 	 * Check for symbolic link, which may require us to massage the
695 	 * name before we continue translation.  We do not `iput' the
696 	 * directory because we may need it again if the symbolic link
697 	 * is relative to the current directory.  Instead we save it
698 	 * unlocked as "pdp".  We must get the target inode before unlocking
699 	 * the directory to insure that the inode will not be removed
700 	 * before we get it.  We prevent deadlock by always fetching
701 	 * inodes from the root, moving down the directory tree. Thus
702 	 * when following backward pointers ".." we must unlock the
703 	 * parent directory before getting the requested directory.
704 	 * There is a potential race condition here if both the current
705 	 * and parent directories are removed before the `iget' for the
706 	 * inode associated with ".." returns.  We hope that this occurs
707 	 * infrequently since we cannot avoid this race condition without
708 	 * implementing a sophisticated deadlock detection algorithm.
709 	 * Note also that this simple deadlock detection scheme will not
710 	 * work if the file system has any hard links other than ".."
711 	 * that point backwards in the directory structure.
712 	 */
713 	pdp = dp;
714 	if (isdotdot) {
715 		IUNLOCK(pdp);	/* race to get the inode */
716 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
717 		if (dp == NULL)
718 			goto bad2;
719 	} else if (dp->i_number == ndp->ni_dent.d_ino) {
720 		dp->i_count++;	/* we want ourself, ie "." */
721 	} else {
722 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
723 		IUNLOCK(pdp);
724 		if (dp == NULL)
725 			goto bad2;
726 	}
727 
728 	/*
729 	 * insert name into cache (if we want it, and it isn't "." or "..")
730 	 *
731 	 * all other cases where making a cache entry would be wrong
732 	 * have already departed from the code sequence somewhere above.
733 	 */
734 	if (docache) {
735 		if (ncp != NULL)
736 			panic("nami: duplicating cache");
737 
738 			/*
739 			 * free the cache slot at head of lru chain
740 			 */
741 		if (ncp = nchhead) {
742 				/* remove from lru chain */
743 			*ncp->nc_prev = ncp->nc_nxt;
744 			if (ncp->nc_nxt)
745 				ncp->nc_nxt->nc_prev = ncp->nc_prev;
746 			else
747 				nchtail = ncp->nc_prev;
748 
749 				/* remove from old hash chain */
750 			remque(ncp);
751 
752 				/* grab the inode we just found */
753 			ncp->nc_ip = dp;
754 
755 				/* fill in cache info */
756 			ncp->nc_ino = pdp->i_number;	/* parents inum */
757 			ncp->nc_dev = pdp->i_dev;	/* & device */
758 			ncp->nc_idev = dp->i_dev;	/* our device */
759 			ncp->nc_id = dp->i_id;		/* identifier */
760 			ncp->nc_nlen = ndp->ni_dent.d_namlen;
761 			bcopy(ndp->ni_dent.d_name, ncp->nc_name, ncp->nc_nlen);
762 
763 				/* link at end of lru chain */
764 			ncp->nc_nxt = NULL;
765 			ncp->nc_prev = nchtail;
766 			*nchtail = ncp;
767 			nchtail = &ncp->nc_nxt;
768 
769 				/* and insert on hash chain */
770 			insque(ncp, nhp);
771 		}
772 	}
773 
774 haveino:
775 	fs = dp->i_fs;
776 
777 	/*
778 	 * Check for symbolic link
779 	 */
780 	if ((dp->i_mode & IFMT) == IFLNK &&
781 	    ((ndp->ni_nameiop & FOLLOW) || *cp == '/')) {
782 		u_int pathlen = strlen(cp) + 1;
783 
784 		if (dp->i_size + pathlen >= MAXPATHLEN - 1 ||
785 		    ++nlink > MAXSYMLINKS) {
786 			u.u_error = ELOOP;
787 			goto bad2;
788 		}
789 		ovbcopy(cp, nbp->b_un.b_addr + dp->i_size, pathlen);
790 		u.u_error =
791 		    rdwri(UIO_READ, dp, nbp->b_un.b_addr, (int)dp->i_size,
792 			0, 1, (int *)0);
793 		if (u.u_error)
794 			goto bad2;
795 		cp = nbp->b_un.b_addr;
796 		iput(dp);
797 		if (*cp == '/') {
798 			irele(pdp);
799 			while (*cp == '/')
800 				cp++;
801 			if ((dp = u.u_rdir) == NULL)
802 				dp = rootdir;
803 			ILOCK(dp);
804 			dp->i_count++;
805 		} else {
806 			dp = pdp;
807 			ILOCK(dp);
808 		}
809 		fs = dp->i_fs;
810 		goto dirloop;
811 	}
812 
813 	/*
814 	 * Not a symbolic link.  If more pathname,
815 	 * continue at next component, else return.
816 	 */
817 	if (*cp == '/') {
818 		while (*cp == '/')
819 			cp++;
820 		irele(pdp);
821 		goto dirloop;
822 	}
823 	nbp->av_forw = freenamebuf;
824 	freenamebuf = nbp;
825 	if (lockparent)
826 		ndp->ni_pdir = pdp;
827 	else
828 		irele(pdp);
829 	return (dp);
830 bad2:
831 	irele(pdp);
832 bad:
833 	if (bp)
834 		brelse(bp);
835 	if (dp)
836 		iput(dp);
837 	nbp->av_forw = freenamebuf;
838 	freenamebuf = nbp;
839 	return (NULL);
840 }
841 
842 
843 dirbad(ip, offset, how)
844 	struct inode *ip;
845 	off_t offset;
846 	char *how;
847 {
848 
849 	printf("%s: bad dir ino %d at offset %d: %s\n",
850 	    ip->i_fs->fs_fsmnt, ip->i_number, offset, how);
851 }
852 
853 /*
854  * Do consistency checking on a directory entry:
855  *	record length must be multiple of 4
856  *	record length must not be non-negative
857  *	entry must fit in rest of its DIRBLKSIZ block
858  *	record must be large enough to contain entry
859  *	name is not longer than MAXNAMLEN
860  *	name must be as long as advertised, and null terminated
861  */
862 dirbadentry(ep, entryoffsetinblock)
863 	register struct direct *ep;
864 	int entryoffsetinblock;
865 {
866 	register int i;
867 
868 	if ((ep->d_reclen & 0x3) != 0 || ep->d_reclen <= 0 ||
869 	    ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) ||
870 	    ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN)
871 		return (1);
872 	for (i = 0; i < ep->d_namlen; i++)
873 		if (ep->d_name[i] == '\0')
874 			return (1);
875 	return (ep->d_name[i]);
876 }
877 
878 /*
879  * Write a directory entry after a call to namei, using the parameters
880  * which it left in the u. area.  The argument ip is the inode which
881  * the new directory entry will refer to.  The u. area field ndp->ni_pdir is
882  * a pointer to the directory to be written, which was left locked by
883  * namei.  Remaining parameters (ndp->ni_offset, ndp->ni_count) indicate
884  * how the space for the new entry is to be gotten.
885  */
886 direnter(ip, ndp)
887 	struct inode *ip;
888 	register struct nameidata *ndp;
889 {
890 	register struct direct *ep, *nep;
891 	register struct inode *dp = ndp->ni_pdir;
892 	struct buf *bp;
893 	int loc, spacefree, error = 0;
894 	u_int dsize;
895 	int newentrysize;
896 	char *dirbuf;
897 
898 	ndp->ni_dent.d_ino = ip->i_number;
899 	newentrysize = DIRSIZ(&ndp->ni_dent);
900 	if (ndp->ni_count == 0) {
901 		/*
902 		 * If ndp->ni_count is 0, then namei could find no space in the
903 		 * directory. In this case ndp->ni_offset will be on a directory
904 		 * block boundary and we will write the new entry into a fresh
905 		 * block.
906 		 */
907 		if (ndp->ni_offset&(DIRBLKSIZ-1))
908 			panic("wdir: newblk");
909 		ndp->ni_dent.d_reclen = DIRBLKSIZ;
910 		error = rdwri(UIO_WRITE, dp, (caddr_t)&ndp->ni_dent,
911 		    newentrysize, ndp->ni_offset, 1, (int *)0);
912 		iput(dp);
913 		return (error);
914 	}
915 
916 	/*
917 	 * If ndp->ni_count is non-zero, then namei found space for the new
918 	 * entry in the range ndp->ni_offset to ndp->ni_offset + ndp->ni_count.
919 	 * in the directory.  To use this space, we may have to compact
920 	 * the entries located there, by copying them together towards
921 	 * the beginning of the block, leaving the free space in
922 	 * one usable chunk at the end.
923 	 */
924 
925 	/*
926 	 * Increase size of directory if entry eats into new space.
927 	 * This should never push the size past a new multiple of
928 	 * DIRBLKSIZE.
929 	 */
930 	if (ndp->ni_offset + ndp->ni_count > dp->i_size)
931 		dp->i_size = ndp->ni_offset + ndp->ni_count;
932 
933 	/*
934 	 * Get the block containing the space for the new directory
935 	 * entry.  Should return error by result instead of u.u_error.
936 	 */
937 	bp = blkatoff(dp, ndp->ni_offset, (char **)&dirbuf);
938 	if (bp == 0) {
939 		iput(dp);
940 		return (u.u_error);
941 	}
942 
943 	/*
944 	 * Find space for the new entry.  In the simple case, the
945 	 * entry at offset base will have the space.  If it does
946 	 * not, then namei arranged that compacting the region
947 	 * ndp->ni_offset to ndp->ni_offset+ndp->ni_count would yield the space.
948 	 */
949 	ep = (struct direct *)dirbuf;
950 	dsize = DIRSIZ(ep);
951 	spacefree = ep->d_reclen - dsize;
952 	for (loc = ep->d_reclen; loc < ndp->ni_count; ) {
953 		nep = (struct direct *)(dirbuf + loc);
954 		if (ep->d_ino) {
955 			/* trim the existing slot */
956 			ep->d_reclen = dsize;
957 			ep = (struct direct *)((char *)ep + dsize);
958 		} else {
959 			/* overwrite; nothing there; header is ours */
960 			spacefree += dsize;
961 		}
962 		dsize = DIRSIZ(nep);
963 		spacefree += nep->d_reclen - dsize;
964 		loc += nep->d_reclen;
965 		bcopy((caddr_t)nep, (caddr_t)ep, dsize);
966 	}
967 	/*
968 	 * Update the pointer fields in the previous entry (if any),
969 	 * copy in the new entry, and write out the block.
970 	 */
971 	if (ep->d_ino == 0) {
972 		if (spacefree + dsize < newentrysize)
973 			panic("wdir: compact1");
974 		ndp->ni_dent.d_reclen = spacefree + dsize;
975 	} else {
976 		if (spacefree < newentrysize)
977 			panic("wdir: compact2");
978 		ndp->ni_dent.d_reclen = spacefree;
979 		ep->d_reclen = dsize;
980 		ep = (struct direct *)((char *)ep + dsize);
981 	}
982 	bcopy((caddr_t)&ndp->ni_dent, (caddr_t)ep, (u_int)newentrysize);
983 	bwrite(bp);
984 	dp->i_flag |= IUPD|ICHG;
985 	if (ndp->ni_endoff && ndp->ni_endoff < dp->i_size)
986 		itrunc(dp, ndp->ni_endoff);
987 	iput(dp);
988 	return (error);
989 }
990 
991 /*
992  * Remove a directory entry after a call to namei, using the
993  * parameters which it left in the u. area.  The u. entry
994  * ni_offset contains the offset into the directory of the
995  * entry to be eliminated.  The ni_count field contains the
996  * size of the previous record in the directory.  If this
997  * is 0, the first entry is being deleted, so we need only
998  * zero the inode number to mark the entry as free.  If the
999  * entry isn't the first in the directory, we must reclaim
1000  * the space of the now empty record by adding the record size
1001  * to the size of the previous entry.
1002  */
1003 dirremove(ndp)
1004 	register struct nameidata *ndp;
1005 {
1006 	register struct inode *dp = ndp->ni_pdir;
1007 	register struct buf *bp;
1008 	struct direct *ep;
1009 
1010 	if (ndp->ni_count == 0) {
1011 		/*
1012 		 * First entry in block: set d_ino to zero.
1013 		 */
1014 		ndp->ni_dent.d_ino = 0;
1015 		(void) rdwri(UIO_WRITE, dp, (caddr_t)&ndp->ni_dent,
1016 		    (int)DIRSIZ(&ndp->ni_dent), ndp->ni_offset, 1, (int *)0);
1017 	} else {
1018 		/*
1019 		 * Collapse new free space into previous entry.
1020 		 */
1021 		bp = blkatoff(dp, (int)(ndp->ni_offset - ndp->ni_count),
1022 			(char **)&ep);
1023 		if (bp == 0)
1024 			return (0);
1025 		ep->d_reclen += ndp->ni_dent.d_reclen;
1026 		bwrite(bp);
1027 		dp->i_flag |= IUPD|ICHG;
1028 	}
1029 	return (1);
1030 }
1031 
1032 /*
1033  * Rewrite an existing directory entry to point at the inode
1034  * supplied.  The parameters describing the directory entry are
1035  * set up by a call to namei.
1036  */
1037 dirrewrite(dp, ip, ndp)
1038 	struct inode *dp, *ip;
1039 	struct nameidata *ndp;
1040 {
1041 
1042 	ndp->ni_dent.d_ino = ip->i_number;
1043 	u.u_error = rdwri(UIO_WRITE, dp, (caddr_t)&ndp->ni_dent,
1044 		(int)DIRSIZ(&ndp->ni_dent), ndp->ni_offset, 1, (int *)0);
1045 	iput(dp);
1046 }
1047 
1048 /*
1049  * Return buffer with contents of block "offset"
1050  * from the beginning of directory "ip".  If "res"
1051  * is non-zero, fill it in with a pointer to the
1052  * remaining space in the directory.
1053  */
1054 struct buf *
1055 blkatoff(ip, offset, res)
1056 	struct inode *ip;
1057 	off_t offset;
1058 	char **res;
1059 {
1060 	register struct fs *fs = ip->i_fs;
1061 	daddr_t lbn = lblkno(fs, offset);
1062 	int base = blkoff(fs, offset);
1063 	int bsize = blksize(fs, ip, lbn);
1064 	daddr_t bn = fsbtodb(fs, bmap(ip, lbn, B_WRITE, base, bsize));
1065 	register struct buf *bp;
1066 
1067 	if (u.u_error)
1068 		return (0);
1069 	bp = bread(ip->i_dev, bn, bsize);
1070 	if (bp->b_flags & B_ERROR) {
1071 		brelse(bp);
1072 		return (0);
1073 	}
1074 	if (res)
1075 		*res = bp->b_un.b_addr + base;
1076 	return (bp);
1077 }
1078 
1079 /*
1080  * Check if a directory is empty or not.
1081  * Inode supplied must be locked.
1082  *
1083  * Using a struct dirtemplate here is not precisely
1084  * what we want, but better than using a struct direct.
1085  *
1086  * NB: does not handle corrupted directories.
1087  */
1088 dirempty(ip, parentino)
1089 	register struct inode *ip;
1090 	ino_t parentino;
1091 {
1092 	register off_t off;
1093 	struct dirtemplate dbuf;
1094 	register struct direct *dp = (struct direct *)&dbuf;
1095 	int error, count;
1096 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
1097 
1098 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
1099 		error = rdwri(UIO_READ, ip, (caddr_t)dp, MINDIRSIZ,
1100 		    off, 1, &count);
1101 		/*
1102 		 * Since we read MINDIRSIZ, residual must
1103 		 * be 0 unless we're at end of file.
1104 		 */
1105 		if (error || count != 0)
1106 			return (0);
1107 		/* skip empty entries */
1108 		if (dp->d_ino == 0)
1109 			continue;
1110 		/* accept only "." and ".." */
1111 		if (dp->d_namlen > 2)
1112 			return (0);
1113 		if (dp->d_name[0] != '.')
1114 			return (0);
1115 		/*
1116 		 * At this point d_namlen must be 1 or 2.
1117 		 * 1 implies ".", 2 implies ".." if second
1118 		 * char is also "."
1119 		 */
1120 		if (dp->d_namlen == 1)
1121 			continue;
1122 		if (dp->d_name[1] == '.' && dp->d_ino == parentino)
1123 			continue;
1124 		return (0);
1125 	}
1126 	return (1);
1127 }
1128 
1129 /*
1130  * Check if source directory is in the path of the target directory.
1131  * Target is supplied locked, source is unlocked.
1132  * The target is always iput() before returning.
1133  */
1134 checkpath(source, target)
1135 	struct inode *source, *target;
1136 {
1137 	struct dirtemplate dirbuf;
1138 	register struct inode *ip;
1139 	int error = 0;
1140 
1141 	ip = target;
1142 	if (ip->i_number == source->i_number) {
1143 		error = EEXIST;
1144 		goto out;
1145 	}
1146 	if (ip->i_number == ROOTINO)
1147 		goto out;
1148 
1149 	for (;;) {
1150 		if ((ip->i_mode&IFMT) != IFDIR) {
1151 			error = ENOTDIR;
1152 			break;
1153 		}
1154 		error = rdwri(UIO_READ, ip, (caddr_t)&dirbuf,
1155 			sizeof (struct dirtemplate), (off_t)0, 1, (int *)0);
1156 		if (error != 0)
1157 			break;
1158 		if (dirbuf.dotdot_namlen != 2 ||
1159 		    dirbuf.dotdot_name[0] != '.' ||
1160 		    dirbuf.dotdot_name[1] != '.') {
1161 			error = ENOTDIR;
1162 			break;
1163 		}
1164 		if (dirbuf.dotdot_ino == source->i_number) {
1165 			error = EINVAL;
1166 			break;
1167 		}
1168 		if (dirbuf.dotdot_ino == ROOTINO)
1169 			break;
1170 		iput(ip);
1171 		ip = iget(ip->i_dev, ip->i_fs, dirbuf.dotdot_ino);
1172 		if (ip == NULL) {
1173 			error = u.u_error;
1174 			break;
1175 		}
1176 	}
1177 
1178 out:
1179 	if (error == ENOTDIR)
1180 		printf("checkpath: .. not a directory\n");
1181 	if (ip != NULL)
1182 		iput(ip);
1183 	return (error);
1184 }
1185 
1186 /*
1187  * Name cache initialization, from main() when we are booting
1188  */
1189 nchinit()
1190 {
1191 	register union nchash *nchp;
1192 	register struct nch *ncp;
1193 
1194 	nchhead = 0;
1195 	nchtail = &nchhead;
1196 
1197 	for (ncp = nch; ncp < &nch[nchsize]; ncp++) {
1198 		ncp->nc_forw = ncp;			/* hash chain */
1199 		ncp->nc_back = ncp;
1200 
1201 		ncp->nc_nxt = NULL;			/* lru chain */
1202 		*nchtail = ncp;
1203 		ncp->nc_prev = nchtail;
1204 		nchtail = &ncp->nc_nxt;
1205 
1206 		/* all else is zero already */
1207 	}
1208 
1209 	for (nchp = nchash; nchp < &nchash[NCHHASH]; nchp++) {
1210 		nchp->nch_head[0] = nchp;
1211 		nchp->nch_head[1] = nchp;
1212 	}
1213 }
1214 
1215 /*
1216  * Cache flush, called when filesys is umounted to
1217  * remove entries that would now be invalid
1218  *
1219  * The line "nxtcp = nchhead" near the end is to avoid potential problems
1220  * if the cache lru chain is modified while we are dumping the
1221  * inode.  This makes the algorithm O(n^2), but do you think I care?
1222  */
1223 nchinval(dev)
1224 	register dev_t dev;
1225 {
1226 	register struct nch *ncp, *nxtcp;
1227 
1228 	for (ncp = nchhead; ncp; ncp = nxtcp) {
1229 		nxtcp = ncp->nc_nxt;
1230 
1231 		if (ncp->nc_ip == NULL ||
1232 		    (ncp->nc_idev != dev && ncp->nc_dev != dev))
1233 			continue;
1234 
1235 			/* free the resources we had */
1236 		ncp->nc_idev = NODEV;
1237 		ncp->nc_dev = NODEV;
1238 		ncp->nc_id = NULL;
1239 		ncp->nc_ino = 0;
1240 		ncp->nc_ip = NULL;
1241 
1242 
1243 			/* remove the entry from its hash chain */
1244 		remque(ncp);
1245 			/* and make a dummy one */
1246 		ncp->nc_forw = ncp;
1247 		ncp->nc_back = ncp;
1248 
1249 			/* delete this entry from LRU chain */
1250 		*ncp->nc_prev = nxtcp;
1251 		if (nxtcp)
1252 			nxtcp->nc_prev = ncp->nc_prev;
1253 		else
1254 			nchtail = ncp->nc_prev;
1255 
1256 			/* cause rescan of list, it may have altered */
1257 		nxtcp = nchhead;
1258 			/* put the now-free entry at head of LRU */
1259 		ncp->nc_nxt = nxtcp;
1260 		ncp->nc_prev = &nchhead;
1261 		nxtcp->nc_prev = &ncp->nc_nxt;
1262 		nchhead = ncp;
1263 	}
1264 }
1265 
1266 /*
1267  * Name cache invalidation of all entries.
1268  */
1269 cacheinvalall()
1270 {
1271 	register struct nch *ncp;
1272 
1273 	for (ncp = nch; ncp < &nch[nchsize]; ncp++) {
1274 		ncp->nc_id = 0;
1275 	}
1276 }
1277