xref: /csrg-svn/sys/kern/vfs_lookup.c (revision 17100)
1 /*	vfs_lookup.c	6.15	84/08/29	*/
2 
3 #include "param.h"
4 #include "systm.h"
5 #include "inode.h"
6 #include "fs.h"
7 #include "mount.h"
8 #include "dir.h"
9 #include "user.h"
10 #include "buf.h"
11 #include "conf.h"
12 #include "uio.h"
13 #include "kernel.h"
14 
15 struct	buf *blkatoff();
16 struct	buf *freenamebuf;
17 int	dirchk = 0;
18 
19 /*
20  * Structures associated with name cacheing.
21  */
22 #define	NCHHASH		32	/* size of hash table */
23 
24 #if	((NCHHASH)&((NCHHASH)-1)) != 0
25 #define	NHASH(h, i, d)	((unsigned)((h) + (i) + 13 * (int)(d)) % (NCHHASH))
26 #else
27 #define	NHASH(h, i, d)	((unsigned)((h) + (i) + 13 * (int)(d)) & ((NCHHASH)-1))
28 #endif
29 
30 union	nchash	{
31 	union	nchash	*nch_head[2];
32 	struct	nch	*nch_chain[2];
33 } nchash[NCHHASH];
34 #define	nch_forw	nch_chain[0]
35 #define	nch_back	nch_chain[1]
36 
37 struct	nch	*nchhead, **nchtail;	/* LRU chain pointers */
38 struct	nchstats nchstats;		/* cache effectiveness statistics */
39 
40 /*
41  * Convert a pathname into a pointer to a locked inode,
42  * with side effects usable in creating and removing files.
43  * This is a very central and rather complicated routine.
44  *
45  * The segflg defines whether the name is to be copied from user
46  * space or kernel space.
47  *
48  * The flag argument is (LOOKUP, CREATE, DELETE) depending on whether
49  * the name is to be (looked up, created, deleted).  If flag has
50  * LOCKPARENT or'ed into it and the target of the pathname exists,
51  * namei returns both the target and its parent directory locked.
52  * If the file system is not maintained in a strict tree hierarchy,
53  * this can result in a deadlock situation.  When creating and
54  * LOCKPARENT is specified, the target may not be ".".  When deleting
55  * and LOCKPARENT is specified, the target may be ".", but the caller
56  * must check to insure it does an irele and iput instead of two iputs.
57  *
58  * The FOLLOW flag is set when symbolic links are to be followed
59  * when they occur at the end of the name translation process.
60  *
61  * Name caching works as follows:
62  *
63  *	names found by directory scans are retained in a cache
64  *	for future reference.  It is managed LRU, so frequently
65  *	used names will hang around.  Cache is indexed by hash value
66  *	obtained from (ino,dev,name) where ino & dev refer to the
67  *	directory containing name.
68  *
69  *	For simplicity (and economy of storage), names longer than
70  *	some (small) maximum length are not cached, they occur
71  *	infrequently in any case, and are almost never of interest.
72  *
73  *	Upon reaching the last segment of a path, if the reference
74  *	is for DELETE, or NOCACHE is set (rewrite), and the
75  *	name is located in the cache, it will be dropped.
76  *
77  *	We must be sure never to enter the name ".." into the cache
78  *	because of the extremely kludgey way that rename() alters
79  *	".." in a situation like
80  *		mv a/x b/x
81  *	where x is a directory, and x/.. is the ".." in question.
82  *
83  * Overall outline of namei:
84  *
85  *	copy in name
86  *	get starting directory
87  * dirloop:
88  *	check accessibility of directory
89  * dirloop2:
90  *	copy next component of name to ndp->ni_dent
91  *	handle degenerate case where name is null string
92  *	look for name in cache, if found, then if at end of path
93  *	  and deleting or creating, drop it, else to haveino
94  *	search for name in directory, to found or notfound
95  * notfound:
96  *	if creating, return locked directory, leaving info on avail. slots
97  *	else return error
98  * found:
99  *	if at end of path and deleting, return information to allow delete
100  *	if at end of path and rewriting (create and LOCKPARENT), lock target
101  *	  inode and return info to allow rewrite
102  *	if .. and on mounted filesys, look in mount table for parent
103  *	if not at end, if neither creating nor deleting, add name to cache
104  * haveino:
105  *	if symbolic link, massage name in buffer and continue at dirloop
106  *	if more components of name, do next level at dirloop
107  *	return the answer as locked inode
108  *
109  * NOTE: (LOOKUP | LOCKPARENT) currently returns the parent inode,
110  *	 but unlocked.
111  */
112 struct inode *
113 namei(ndp)
114 	register struct nameidata *ndp;
115 {
116 	register char *cp;		/* pointer into pathname argument */
117 /* these variables refer to things which must be freed or unlocked */
118 	register struct inode *dp = 0;	/* the directory we are searching */
119 	register struct nch *ncp;	/* cache slot for entry */
120 	register struct fs *fs;		/* file system that directory is in */
121 	register struct buf *bp = 0;	/* a buffer of directory entries */
122 	register struct direct *ep;	/* the current directory entry */
123 	int entryoffsetinblock;		/* offset of ep in bp's buffer */
124 	register struct buf *nbp;	/* buffer storing path name argument */
125 /* these variables hold information about the search for a slot */
126 	enum {NONE, COMPACT, FOUND} slotstatus;
127 	int slotoffset = -1;		/* offset of area with free space */
128 	int slotsize;			/* size of area at slotoffset */
129 	int slotfreespace;		/* amount of space free in slot */
130 	int slotneeded;			/* size of the entry we're seeking */
131 /* */
132 	int numdirpasses;		/* strategy for directory search */
133 	int endsearch;			/* offset to end directory search */
134 	int prevoff;			/* ndp->ni_offset of previous entry */
135 	int nlink = 0;			/* number of symbolic links taken */
136 	struct inode *pdp;		/* saved dp during symlink work */
137 	int error, i;
138 	int lockparent;
139 	int docache;
140 	unsigned hash;			/* value of name hash for entry */
141 	union nchash *nhp;		/* cache chain head for entry */
142 	int isdotdot;			/* != 0 if current name is ".." */
143 	int flag;			/* op ie, LOOKUP, CREATE, or DELETE */
144 
145 	lockparent = ndp->ni_nameiop & LOCKPARENT;
146 	docache = (ndp->ni_nameiop & NOCACHE) ^ NOCACHE;
147 	flag = ndp->ni_nameiop &~ (LOCKPARENT|NOCACHE|FOLLOW);
148 	if (flag == DELETE)
149 		docache = 0;
150 	/*
151 	 * Get a buffer for the name to be translated, and copy the
152 	 * name into the buffer.
153 	 */
154 	nbp = freenamebuf;
155 	if (nbp == NULL)
156 		nbp = geteblk(MAXPATHLEN);
157 	else
158 		freenamebuf = nbp->av_forw;
159 	if (ndp->ni_segflg == UIO_SYSSPACE)
160 		error = copystr(ndp->ni_dirp, nbp->b_un.b_addr, MAXPATHLEN,
161 		    (u_int *)0);
162 	else
163 		error = copyinstr(ndp->ni_dirp, nbp->b_un.b_addr, MAXPATHLEN,
164 		    (u_int *)0);
165 	if (error) {
166 		u.u_error = error;
167 		goto bad;
168 	}
169 
170 	/*
171 	 * Get starting directory.
172 	 */
173 	cp = nbp->b_un.b_addr;
174 	if (*cp == '/') {
175 		while (*cp == '/')
176 			cp++;
177 		if ((dp = u.u_rdir) == NULL)
178 			dp = rootdir;
179 	} else
180 		dp = u.u_cdir;
181 	fs = dp->i_fs;
182 	ILOCK(dp);
183 	dp->i_count++;
184 	ndp->ni_pdir = (struct inode *)0xc0000000;		/* illegal */
185 
186 	/*
187 	 * We come to dirloop to search a new directory.
188 	 * The directory must be locked so that it can be
189 	 * iput, and fs must be already set to dp->i_fs.
190 	 */
191 dirloop:
192 	/*
193 	 * Check accessiblity of directory.
194 	 */
195 	if ((dp->i_mode&IFMT) != IFDIR) {
196 		u.u_error = ENOTDIR;
197 		goto bad;
198 	}
199 	if (access(dp, IEXEC))
200 		goto bad;
201 
202 dirloop2:
203 	/*
204 	 * Copy next component of name to ndp->ni_dent.
205 	 */
206 	hash = 0;
207 	for (i = 0; *cp != 0 && *cp != '/'; cp++) {
208 		if (i >= MAXNAMLEN) {
209 			u.u_error = ENOENT;
210 			goto bad;
211 		}
212 		if ((*cp&0377) == ('/'|0200) || (*cp&0200) && flag != DELETE) {
213 			u.u_error = EPERM;
214 			goto bad;
215 		}
216 		ndp->ni_dent.d_name[i++] = *cp;
217 		hash += (unsigned char)*cp * i;
218 	}
219 	ndp->ni_dent.d_namlen = i;
220 	ndp->ni_dent.d_name[i] = '\0';
221 	isdotdot = (i == 2 &&
222 		ndp->ni_dent.d_name[0] == '.' && ndp->ni_dent.d_name[1] == '.');
223 
224 	/*
225 	 * Check for degenerate name (e.g. / or "")
226 	 * which is a way of talking about a directory,
227 	 * e.g. like "/." or ".".
228 	 */
229 	if (ndp->ni_dent.d_name[0] == '\0') {
230 		if (flag != LOOKUP || lockparent) {
231 			u.u_error = EISDIR;
232 			goto bad;
233 		}
234 		nbp->av_forw = freenamebuf;
235 		freenamebuf = nbp;
236 		return (dp);
237 	}
238 
239 	/*
240 	 * We now have a segment name to search for, and a directory to search.
241 	 *
242 	 * Before tediously performing a linear scan of the directory,
243 	 * check the name cache to see if the directory/name pair
244 	 * we are looking for is known already.  We don't do this
245 	 * if the segment name is long, simply so the cache can avoid
246 	 * holding long names (which would either waste space, or
247 	 * add greatly to the complexity).
248 	 */
249 	if (ndp->ni_dent.d_namlen > NCHNAMLEN) {
250 		nchstats.ncs_long++;
251 		docache = 0;
252 	} else {
253 		nhp = &nchash[NHASH(hash, dp->i_number, dp->i_dev)];
254 		for (ncp = nhp->nch_forw; ncp != (struct nch *)nhp;
255 		    ncp = ncp->nc_forw) {
256 			if (ncp->nc_ino == dp->i_number &&
257 			    ncp->nc_dev == dp->i_dev &&
258 			    ncp->nc_nlen == ndp->ni_dent.d_namlen &&
259 			    !bcmp(ncp->nc_name, ndp->ni_dent.d_name,
260 				ncp->nc_nlen))
261 				break;
262 		}
263 
264 		if (ncp == (struct nch *)nhp) {
265 			nchstats.ncs_miss++;
266 			ncp = NULL;
267 		} else {
268 			if (ncp->nc_id != ncp->nc_ip->i_id) {
269 				nchstats.ncs_falsehits++;
270 			} else if (*cp == '\0' && !docache) {
271 				nchstats.ncs_badhits++;
272 			} else {
273 
274 					/*
275 					 * move this slot to end of LRU
276 					 * chain, if not already there
277 					 */
278 				if (ncp->nc_nxt) {
279 						/* remove from LRU chain */
280 					*ncp->nc_prev = ncp->nc_nxt;
281 					ncp->nc_nxt->nc_prev = ncp->nc_prev;
282 
283 						/* and replace at end of it */
284 					ncp->nc_nxt = NULL;
285 					ncp->nc_prev = nchtail;
286 					*nchtail = ncp;
287 					nchtail = &ncp->nc_nxt;
288 				}
289 
290 				/*
291 				 * Get the next inode in the path.
292 				 * See comment above other `IUNLOCK' code for
293 				 * an explaination of the locking protocol.
294 				 */
295 				pdp = dp;
296 				dp = ncp->nc_ip;
297 				if (dp == NULL)
298 					panic("nami: null cache ino");
299 				if (pdp == dp)
300 					dp->i_count++;
301 				else {
302 					if (isdotdot) {
303 						IUNLOCK(pdp);
304 						igrab(dp);
305 					} else {
306 						igrab(dp);
307 						IUNLOCK(pdp);
308 					}
309 				}
310 
311 				/*
312 				 * Verify that the inode that we got
313 				 * did not change while we were waiting
314 				 * for it to be locked.
315 				 */
316 				if (ncp->nc_id != ncp->nc_ip->i_id) {
317 					iput(dp);
318 					ILOCK(pdp);
319 					dp = pdp;
320 					nchstats.ncs_falsehits++;
321 				} else {
322 					ndp->ni_dent.d_ino = dp->i_number;
323 					/* ni_dent.d_reclen is garbage ... */
324 					nchstats.ncs_goodhits++;
325 					goto haveino;
326 				}
327 			}
328 
329 			/*
330 			 * Last component and we are renaming or deleting,
331 			 * the cache entry is invalid, or otherwise don't
332 			 * want cache entry to exist.
333 			 */
334 
335 				/* remove from LRU chain */
336 			*ncp->nc_prev = ncp->nc_nxt;
337 			if (ncp->nc_nxt)
338 				ncp->nc_nxt->nc_prev = ncp->nc_prev;
339 			else
340 				nchtail = ncp->nc_prev;
341 
342 				/* remove from hash chain */
343 			remque(ncp);
344 
345 				/* insert at head of LRU list (first to grab) */
346 			ncp->nc_nxt = nchhead;
347 			ncp->nc_prev = &nchhead;
348 			nchhead->nc_prev = &ncp->nc_nxt;
349 			nchhead = ncp;
350 
351 				/* and make a dummy hash chain */
352 			ncp->nc_forw = ncp;
353 			ncp->nc_back = ncp;
354 
355 			ncp = NULL;
356 		}
357 	}
358 
359 	/*
360 	 * Suppress search for slots unless creating
361 	 * file and at end of pathname, in which case
362 	 * we watch for a place to put the new file in
363 	 * case it doesn't already exist.
364 	 */
365 	slotstatus = FOUND;
366 	if (flag == CREATE && *cp == 0) {
367 		slotstatus = NONE;
368 		slotfreespace = 0;
369 		slotneeded = DIRSIZ(&ndp->ni_dent);
370 	}
371 	/*
372 	 * If this is the same directory that this process
373 	 * previously searched, pick up where we last left off.
374 	 * We cache only lookups as these are the most common
375 	 * and have the greatest payoff. Caching CREATE has little
376 	 * benefit as it usually must search the entire directory
377 	 * to determine that the entry does not exist. Caching the
378 	 * location of the last DELETE has not reduced profiling time
379 	 * and hence has been removed in the interest of simplicity.
380 	 */
381 	if (flag != LOOKUP || dp->i_number != u.u_ncache.nc_inumber ||
382 	    dp->i_dev != u.u_ncache.nc_dev) {
383 		ndp->ni_offset = 0;
384 		numdirpasses = 1;
385 	} else {
386 		if ((dp->i_flag & ICHG) || dp->i_ctime >= u.u_ncache.nc_time) {
387 			u.u_ncache.nc_prevoffset &= ~(DIRBLKSIZ - 1);
388 			u.u_ncache.nc_time = time.tv_sec;
389 		}
390 		ndp->ni_offset = u.u_ncache.nc_prevoffset;
391 		entryoffsetinblock = blkoff(fs, ndp->ni_offset);
392 		if (entryoffsetinblock != 0) {
393 			bp = blkatoff(dp, ndp->ni_offset, (char **)0);
394 			if (bp == 0)
395 				goto bad;
396 		}
397 		numdirpasses = 2;
398 		nchstats.ncs_2passes++;
399 	}
400 	endsearch = roundup(dp->i_size, DIRBLKSIZ);
401 
402 searchloop:
403 	while (ndp->ni_offset < endsearch) {
404 		/*
405 		 * If offset is on a block boundary,
406 		 * read the next directory block.
407 		 * Release previous if it exists.
408 		 */
409 		if (blkoff(fs, ndp->ni_offset) == 0) {
410 			if (bp != NULL)
411 				brelse(bp);
412 			bp = blkatoff(dp, ndp->ni_offset, (char **)0);
413 			if (bp == 0)
414 				goto bad;
415 			entryoffsetinblock = 0;
416 		}
417 
418 		/*
419 		 * If still looking for a slot, and at a DIRBLKSIZE
420 		 * boundary, have to start looking for free space again.
421 		 */
422 		if (slotstatus == NONE &&
423 		    (entryoffsetinblock&(DIRBLKSIZ-1)) == 0) {
424 			slotoffset = -1;
425 			slotfreespace = 0;
426 		}
427 
428 		/*
429 		 * Get pointer to next entry.
430 		 * Full validation checks are slow, so we only check
431 		 * enough to insure forward progress through the
432 		 * directory. Complete checks can be run by patching
433 		 * "dirchk" to be true.
434 		 */
435 		ep = (struct direct *)(bp->b_un.b_addr + entryoffsetinblock);
436 		if (ep->d_reclen <= 0 ||
437 		    dirchk && dirbadentry(ep, entryoffsetinblock)) {
438 			dirbad(dp, ndp->ni_offset, "mangled entry");
439 			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
440 			ndp->ni_offset += i;
441 			entryoffsetinblock += i;
442 			continue;
443 		}
444 
445 		/*
446 		 * If an appropriate sized slot has not yet been found,
447 		 * check to see if one is available. Also accumulate space
448 		 * in the current block so that we can determine if
449 		 * compaction is viable.
450 		 */
451 		if (slotstatus != FOUND) {
452 			int size = ep->d_reclen;
453 
454 			if (ep->d_ino != 0)
455 				size -= DIRSIZ(ep);
456 			if (size > 0) {
457 				if (size >= slotneeded) {
458 					slotstatus = FOUND;
459 					slotoffset = ndp->ni_offset;
460 					slotsize = ep->d_reclen;
461 				} else if (slotstatus == NONE) {
462 					slotfreespace += size;
463 					if (slotoffset == -1)
464 						slotoffset = ndp->ni_offset;
465 					if (slotfreespace >= slotneeded) {
466 						slotstatus = COMPACT;
467 						slotsize = ndp->ni_offset +
468 						      ep->d_reclen - slotoffset;
469 					}
470 				}
471 			}
472 		}
473 
474 		/*
475 		 * Check for a name match.
476 		 */
477 		if (ep->d_ino) {
478 			if (ep->d_namlen == ndp->ni_dent.d_namlen &&
479 			    !bcmp(ndp->ni_dent.d_name, ep->d_name,
480 				ep->d_namlen))
481 				goto found;
482 		}
483 		prevoff = ndp->ni_offset;
484 		ndp->ni_offset += ep->d_reclen;
485 		entryoffsetinblock += ep->d_reclen;
486 	}
487 /* notfound: */
488 	/*
489 	 * If we started in the middle of the directory and failed
490 	 * to find our target, we must check the beginning as well.
491 	 */
492 	if (numdirpasses == 2) {
493 		numdirpasses--;
494 		ndp->ni_offset = 0;
495 		endsearch = u.u_ncache.nc_prevoffset;
496 		goto searchloop;
497 	}
498 	/*
499 	 * If creating, and at end of pathname and current
500 	 * directory has not been removed, then can consider
501 	 * allowing file to be created.
502 	 */
503 	if (flag == CREATE && *cp == 0 && dp->i_nlink != 0) {
504 		/*
505 		 * Access for write is interpreted as allowing
506 		 * creation of files in the directory.
507 		 */
508 		if (access(dp, IWRITE))
509 			goto bad;
510 		/*
511 		 * Return an indication of where the new directory
512 		 * entry should be put.  If we didn't find a slot,
513 		 * then set ndp->ni_count to 0 indicating that the new
514 		 * slot belongs at the end of the directory. If we found
515 		 * a slot, then the new entry can be put in the range
516 		 * [ndp->ni_offset .. ndp->ni_offset + ndp->ni_count)
517 		 */
518 		if (slotstatus == NONE) {
519 			ndp->ni_offset = roundup(dp->i_size, DIRBLKSIZ);
520 			ndp->ni_count = 0;
521 		} else {
522 			ndp->ni_offset = slotoffset;
523 			ndp->ni_count = slotsize;
524 		}
525 		dp->i_flag |= IUPD|ICHG;
526 		if (bp)
527 			brelse(bp);
528 		nbp->av_forw = freenamebuf;
529 		freenamebuf = nbp;
530 		/*
531 		 * We return with the directory locked, so that
532 		 * the parameters we set up above will still be
533 		 * valid if we actually decide to do a direnter().
534 		 * We return NULL to indicate that the entry doesn't
535 		 * currently exist, leaving a pointer to the (locked)
536 		 * directory inode in ndp->ni_pdir.
537 		 */
538 		ndp->ni_pdir = dp;
539 		return (NULL);
540 	}
541 	u.u_error = ENOENT;
542 	goto bad;
543 found:
544 	if (numdirpasses == 2)
545 		nchstats.ncs_pass2++;
546 	/*
547 	 * Check that directory length properly reflects presence
548 	 * of this entry.
549 	 */
550 	if (entryoffsetinblock + DIRSIZ(ep) > dp->i_size) {
551 		dirbad(dp, ndp->ni_offset, "i_size too small");
552 		dp->i_size = entryoffsetinblock + DIRSIZ(ep);
553 		dp->i_flag |= IUPD|ICHG;
554 	}
555 
556 	/*
557 	 * Found component in pathname.
558 	 * If the final component of path name, save information
559 	 * in the cache as to where the entry was found.
560 	 */
561 	if (*cp == '\0' && flag == LOOKUP) {
562 		u.u_ncache.nc_prevoffset = ndp->ni_offset;
563 		u.u_ncache.nc_inumber = dp->i_number;
564 		u.u_ncache.nc_dev = dp->i_dev;
565 		u.u_ncache.nc_time = time.tv_sec;
566 	}
567 	/*
568 	 * Save directory entry in ndp->ni_dent,
569 	 * and release directory buffer.
570 	 */
571 	bcopy((caddr_t)ep, (caddr_t)&ndp->ni_dent, (u_int)DIRSIZ(ep));
572 	brelse(bp);
573 	bp = NULL;
574 
575 	/*
576 	 * If deleting, and at end of pathname, return
577 	 * parameters which can be used to remove file.
578 	 * If the lockparent flag isn't set, we return only
579 	 * the directory (in ndp->ni_pdir), otherwise we go
580 	 * on and lock the inode, being careful with ".".
581 	 */
582 	if (flag == DELETE && *cp == 0) {
583 		/*
584 		 * Write access to directory required to delete files.
585 		 */
586 		if (access(dp, IWRITE))
587 			goto bad;
588 		ndp->ni_pdir = dp;		/* for dirremove() */
589 		/*
590 		 * Return pointer to current entry in ndp->ni_offset,
591 		 * and distance past previous entry (if there
592 		 * is a previous entry in this block) in ndp->ni_count.
593 		 * Save directory inode pointer in ndp->ni_pdir for dirremove().
594 		 */
595 		if ((ndp->ni_offset&(DIRBLKSIZ-1)) == 0)
596 			ndp->ni_count = 0;
597 		else
598 			ndp->ni_count = ndp->ni_offset - prevoff;
599 		if (lockparent) {
600 			if (dp->i_number == ndp->ni_dent.d_ino)
601 				dp->i_count++;
602 			else {
603 				dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
604 				if (dp == NULL) {
605 					iput(ndp->ni_pdir);
606 					goto bad;
607 				}
608 				/*
609 				 * If directory is "sticky", then user must own
610 				 * the directory, or the file in it, else he
611 				 * may not delete it (unless he's root). This
612 				 * implements append-only directories.
613 				 */
614 				if ((ndp->ni_pdir->i_mode & ISVTX) &&
615 				    u.u_uid != 0 &&
616 				    u.u_uid != ndp->ni_pdir->i_uid &&
617 				    dp->i_uid != u.u_uid) {
618 					iput(ndp->ni_pdir);
619 					u.u_error = EPERM;
620 					goto bad;
621 				}
622 			}
623 		}
624 		nbp->av_forw = freenamebuf;
625 		freenamebuf = nbp;
626 		return (dp);
627 	}
628 
629 	/*
630 	 * Special handling for ".." allowing chdir out of mounted
631 	 * file system: indirect .. in root inode to reevaluate
632 	 * in directory file system was mounted on.
633 	 */
634 	if (isdotdot) {
635 		if (dp == u.u_rdir)
636 			ndp->ni_dent.d_ino = dp->i_number;
637 		else if (ndp->ni_dent.d_ino == ROOTINO &&
638 		   dp->i_number == ROOTINO) {
639 			for (i = 1; i < NMOUNT; i++)
640 			if (mount[i].m_bufp != NULL &&
641 			   mount[i].m_dev == dp->i_dev) {
642 				iput(dp);
643 				dp = mount[i].m_inodp;
644 				ILOCK(dp);
645 				dp->i_count++;
646 				fs = dp->i_fs;
647 				cp -= 2;     /* back over .. */
648 				goto dirloop2;
649 			}
650 		}
651 	}
652 
653 	/*
654 	 * If rewriting (rename), return the inode and the
655 	 * information required to rewrite the present directory
656 	 * Must get inode of directory entry to verify it's a
657 	 * regular file, or empty directory.
658 	 */
659 	if ((flag == CREATE && lockparent) && *cp == 0) {
660 		if (access(dp, IWRITE))
661 			goto bad;
662 		ndp->ni_pdir = dp;		/* for dirrewrite() */
663 		/*
664 		 * Careful about locking second inode.
665 		 * This can only occur if the target is ".".
666 		 */
667 		if (dp->i_number == ndp->ni_dent.d_ino) {
668 			u.u_error = EISDIR;		/* XXX */
669 			goto bad;
670 		}
671 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
672 		if (dp == NULL) {
673 			iput(ndp->ni_pdir);
674 			goto bad;
675 		}
676 		nbp->av_forw = freenamebuf;
677 		freenamebuf = nbp;
678 		return (dp);
679 	}
680 
681 	/*
682 	 * Check for symbolic link, which may require us to massage the
683 	 * name before we continue translation.  We do not `iput' the
684 	 * directory because we may need it again if the symbolic link
685 	 * is relative to the current directory.  Instead we save it
686 	 * unlocked as "pdp".  We must get the target inode before unlocking
687 	 * the directory to insure that the inode will not be removed
688 	 * before we get it.  We prevent deadlock by always fetching
689 	 * inodes from the root, moving down the directory tree. Thus
690 	 * when following backward pointers ".." we must unlock the
691 	 * parent directory before getting the requested directory.
692 	 * There is a potential race condition here if both the current
693 	 * and parent directories are removed before the `iget' for the
694 	 * inode associated with ".." returns.  We hope that this occurs
695 	 * infrequently since we cannot avoid this race condition without
696 	 * implementing a sophisticated deadlock detection algorithm.
697 	 * Note also that this simple deadlock detection scheme will not
698 	 * work if the file system has any hard links other than ".."
699 	 * that point backwards in the directory structure.
700 	 */
701 	pdp = dp;
702 	if (isdotdot) {
703 		IUNLOCK(pdp);	/* race to get the inode */
704 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
705 		if (dp == NULL)
706 			goto bad2;
707 	} else if (dp->i_number == ndp->ni_dent.d_ino) {
708 		dp->i_count++;	/* we want ourself, ie "." */
709 	} else {
710 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
711 		IUNLOCK(pdp);
712 		if (dp == NULL)
713 			goto bad2;
714 	}
715 
716 	/*
717 	 * insert name into cache (if we want it, and it isn't "." or "..")
718 	 *
719 	 * all other cases where making a cache entry would be wrong
720 	 * have already departed from the code sequence somewhere above.
721 	 */
722 	if (docache) {
723 		if (ncp != NULL)
724 			panic("nami: duplicating cache");
725 
726 			/*
727 			 * free the cache slot at head of lru chain
728 			 */
729 		if (ncp = nchhead) {
730 				/* remove from lru chain */
731 			*ncp->nc_prev = ncp->nc_nxt;
732 			if (ncp->nc_nxt)
733 				ncp->nc_nxt->nc_prev = ncp->nc_prev;
734 			else
735 				nchtail = ncp->nc_prev;
736 
737 				/* remove from old hash chain */
738 			remque(ncp);
739 
740 				/* grab the inode we just found */
741 			ncp->nc_ip = dp;
742 
743 				/* fill in cache info */
744 			ncp->nc_ino = pdp->i_number;	/* parents inum */
745 			ncp->nc_dev = pdp->i_dev;	/* & device */
746 			ncp->nc_idev = dp->i_dev;	/* our device */
747 			ncp->nc_id = dp->i_id;		/* identifier */
748 			ncp->nc_nlen = ndp->ni_dent.d_namlen;
749 			bcopy(ndp->ni_dent.d_name, ncp->nc_name, ncp->nc_nlen);
750 
751 				/* link at end of lru chain */
752 			ncp->nc_nxt = NULL;
753 			ncp->nc_prev = nchtail;
754 			*nchtail = ncp;
755 			nchtail = &ncp->nc_nxt;
756 
757 				/* and insert on hash chain */
758 			insque(ncp, nhp);
759 		}
760 	}
761 
762 haveino:
763 	fs = dp->i_fs;
764 
765 	/*
766 	 * Check for symbolic link
767 	 */
768 	if ((dp->i_mode & IFMT) == IFLNK &&
769 	    ((ndp->ni_nameiop & FOLLOW) || *cp == '/')) {
770 		u_int pathlen = strlen(cp) + 1;
771 
772 		if (dp->i_size + pathlen >= MAXPATHLEN - 1 ||
773 		    ++nlink > MAXSYMLINKS) {
774 			u.u_error = ELOOP;
775 			goto bad2;
776 		}
777 		ovbcopy(cp, nbp->b_un.b_addr + dp->i_size, pathlen);
778 		u.u_error =
779 		    rdwri(UIO_READ, dp, nbp->b_un.b_addr, (int)dp->i_size,
780 			0, 1, (int *)0);
781 		if (u.u_error)
782 			goto bad2;
783 		cp = nbp->b_un.b_addr;
784 		iput(dp);
785 		if (*cp == '/') {
786 			irele(pdp);
787 			while (*cp == '/')
788 				cp++;
789 			if ((dp = u.u_rdir) == NULL)
790 				dp = rootdir;
791 			ILOCK(dp);
792 			dp->i_count++;
793 		} else {
794 			dp = pdp;
795 			ILOCK(dp);
796 		}
797 		fs = dp->i_fs;
798 		goto dirloop;
799 	}
800 
801 	/*
802 	 * Not a symbolic link.  If more pathname,
803 	 * continue at next component, else return.
804 	 */
805 	if (*cp == '/') {
806 		while (*cp == '/')
807 			cp++;
808 		irele(pdp);
809 		goto dirloop;
810 	}
811 	nbp->av_forw = freenamebuf;
812 	freenamebuf = nbp;
813 	if (lockparent)
814 		ndp->ni_pdir = pdp;
815 	else
816 		irele(pdp);
817 	return (dp);
818 bad2:
819 	irele(pdp);
820 bad:
821 	if (bp)
822 		brelse(bp);
823 	if (dp)
824 		iput(dp);
825 	nbp->av_forw = freenamebuf;
826 	freenamebuf = nbp;
827 	return (NULL);
828 }
829 
830 
831 dirbad(ip, offset, how)
832 	struct inode *ip;
833 	off_t offset;
834 	char *how;
835 {
836 
837 	printf("%s: bad dir ino %d at offset %d: %s\n",
838 	    ip->i_fs->fs_fsmnt, ip->i_number, offset, how);
839 }
840 
841 /*
842  * Do consistency checking on a directory entry:
843  *	record length must be multiple of 4
844  *	record length must not be non-negative
845  *	entry must fit in rest of its DIRBLKSIZ block
846  *	record must be large enough to contain entry
847  *	name is not longer than MAXNAMLEN
848  *	name must be as long as advertised, and null terminated
849  */
850 dirbadentry(ep, entryoffsetinblock)
851 	register struct direct *ep;
852 	int entryoffsetinblock;
853 {
854 	register int i;
855 
856 	if ((ep->d_reclen & 0x3) != 0 || ep->d_reclen <= 0 ||
857 	    ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) ||
858 	    ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN)
859 		return (1);
860 	for (i = 0; i < ep->d_namlen; i++)
861 		if (ep->d_name[i] == '\0')
862 			return (1);
863 	return (ep->d_name[i]);
864 }
865 
866 /*
867  * Write a directory entry after a call to namei, using the parameters
868  * which it left in the u. area.  The argument ip is the inode which
869  * the new directory entry will refer to.  The u. area field ndp->ni_pdir is
870  * a pointer to the directory to be written, which was left locked by
871  * namei.  Remaining parameters (ndp->ni_offset, ndp->ni_count) indicate
872  * how the space for the new entry is to be gotten.
873  */
874 direnter(ip, ndp)
875 	struct inode *ip;
876 	register struct nameidata *ndp;
877 {
878 	register struct direct *ep, *nep;
879 	struct buf *bp;
880 	int loc, spacefree, error = 0;
881 	u_int dsize;
882 	int newentrysize;
883 	char *dirbuf;
884 
885 	ndp->ni_dent.d_ino = ip->i_number;
886 	newentrysize = DIRSIZ(&ndp->ni_dent);
887 	if (ndp->ni_count == 0) {
888 		/*
889 		 * If ndp->ni_count is 0, then namei could find no space in the
890 		 * directory. In this case ndp->ni_offset will be on a directory
891 		 * block boundary and we will write the new entry into a fresh
892 		 * block.
893 		 */
894 		if (ndp->ni_offset&(DIRBLKSIZ-1))
895 			panic("wdir: newblk");
896 		ndp->ni_dent.d_reclen = DIRBLKSIZ;
897 		error = rdwri(UIO_WRITE, ndp->ni_pdir, (caddr_t)&ndp->ni_dent,
898 		    newentrysize, ndp->ni_offset, 1, (int *)0);
899 		iput(ndp->ni_pdir);
900 		return (error);
901 	}
902 
903 	/*
904 	 * If ndp->ni_count is non-zero, then namei found space for the new
905 	 * entry in the range ndp->ni_offset to ndp->ni_offset + ndp->ni_count.
906 	 * in the directory.  To use this space, we may have to compact
907 	 * the entries located there, by copying them together towards
908 	 * the beginning of the block, leaving the free space in
909 	 * one usable chunk at the end.
910 	 */
911 
912 	/*
913 	 * Increase size of directory if entry eats into new space.
914 	 * This should never push the size past a new multiple of
915 	 * DIRBLKSIZE.
916 	 */
917 	if (ndp->ni_offset + ndp->ni_count > ndp->ni_pdir->i_size)
918 		ndp->ni_pdir->i_size = ndp->ni_offset + ndp->ni_count;
919 
920 	/*
921 	 * Get the block containing the space for the new directory
922 	 * entry.  Should return error by result instead of u.u_error.
923 	 */
924 	bp = blkatoff(ndp->ni_pdir, ndp->ni_offset, (char **)&dirbuf);
925 	if (bp == 0) {
926 		iput(ndp->ni_pdir);
927 		return (u.u_error);
928 	}
929 
930 	/*
931 	 * Find space for the new entry.  In the simple case, the
932 	 * entry at offset base will have the space.  If it does
933 	 * not, then namei arranged that compacting the region
934 	 * ndp->ni_offset to ndp->ni_offset+ndp->ni_count would yield the space.
935 	 */
936 	ep = (struct direct *)dirbuf;
937 	dsize = DIRSIZ(ep);
938 	spacefree = ep->d_reclen - dsize;
939 	for (loc = ep->d_reclen; loc < ndp->ni_count; ) {
940 		nep = (struct direct *)(dirbuf + loc);
941 		if (ep->d_ino) {
942 			/* trim the existing slot */
943 			ep->d_reclen = dsize;
944 			ep = (struct direct *)((char *)ep + dsize);
945 		} else {
946 			/* overwrite; nothing there; header is ours */
947 			spacefree += dsize;
948 		}
949 		dsize = DIRSIZ(nep);
950 		spacefree += nep->d_reclen - dsize;
951 		loc += nep->d_reclen;
952 		bcopy((caddr_t)nep, (caddr_t)ep, dsize);
953 	}
954 	/*
955 	 * Update the pointer fields in the previous entry (if any),
956 	 * copy in the new entry, and write out the block.
957 	 */
958 	if (ep->d_ino == 0) {
959 		if (spacefree + dsize < newentrysize)
960 			panic("wdir: compact1");
961 		ndp->ni_dent.d_reclen = spacefree + dsize;
962 	} else {
963 		if (spacefree < newentrysize)
964 			panic("wdir: compact2");
965 		ndp->ni_dent.d_reclen = spacefree;
966 		ep->d_reclen = dsize;
967 		ep = (struct direct *)((char *)ep + dsize);
968 	}
969 	bcopy((caddr_t)&ndp->ni_dent, (caddr_t)ep, (u_int)newentrysize);
970 	bwrite(bp);
971 	ndp->ni_pdir->i_flag |= IUPD|ICHG;
972 	iput(ndp->ni_pdir);
973 	return (error);
974 }
975 
976 /*
977  * Remove a directory entry after a call to namei, using the
978  * parameters which it left in the u. area.  The u. entry
979  * ni_offset contains the offset into the directory of the
980  * entry to be eliminated.  The ni_count field contains the
981  * size of the previous record in the directory.  If this
982  * is 0, the first entry is being deleted, so we need only
983  * zero the inode number to mark the entry as free.  If the
984  * entry isn't the first in the directory, we must reclaim
985  * the space of the now empty record by adding the record size
986  * to the size of the previous entry.
987  */
988 dirremove(ndp)
989 	register struct nameidata *ndp;
990 {
991 	register struct inode *dp = ndp->ni_pdir;
992 	register struct buf *bp;
993 	struct direct *ep;
994 
995 	if (ndp->ni_count == 0) {
996 		/*
997 		 * First entry in block: set d_ino to zero.
998 		 */
999 		ndp->ni_dent.d_ino = 0;
1000 		(void) rdwri(UIO_WRITE, dp, (caddr_t)&ndp->ni_dent,
1001 		    (int)DIRSIZ(&ndp->ni_dent), ndp->ni_offset, 1, (int *)0);
1002 	} else {
1003 		/*
1004 		 * Collapse new free space into previous entry.
1005 		 */
1006 		bp = blkatoff(dp, (int)(ndp->ni_offset - ndp->ni_count),
1007 			(char **)&ep);
1008 		if (bp == 0)
1009 			return (0);
1010 		ep->d_reclen += ndp->ni_dent.d_reclen;
1011 		bwrite(bp);
1012 		dp->i_flag |= IUPD|ICHG;
1013 	}
1014 	return (1);
1015 }
1016 
1017 /*
1018  * Rewrite an existing directory entry to point at the inode
1019  * supplied.  The parameters describing the directory entry are
1020  * set up by a call to namei.
1021  */
1022 dirrewrite(dp, ip, ndp)
1023 	struct inode *dp, *ip;
1024 	struct nameidata *ndp;
1025 {
1026 
1027 	ndp->ni_dent.d_ino = ip->i_number;
1028 	u.u_error = rdwri(UIO_WRITE, dp, (caddr_t)&ndp->ni_dent,
1029 		(int)DIRSIZ(&ndp->ni_dent), ndp->ni_offset, 1, (int *)0);
1030 	iput(dp);
1031 }
1032 
1033 /*
1034  * Return buffer with contents of block "offset"
1035  * from the beginning of directory "ip".  If "res"
1036  * is non-zero, fill it in with a pointer to the
1037  * remaining space in the directory.
1038  */
1039 struct buf *
1040 blkatoff(ip, offset, res)
1041 	struct inode *ip;
1042 	off_t offset;
1043 	char **res;
1044 {
1045 	register struct fs *fs = ip->i_fs;
1046 	daddr_t lbn = lblkno(fs, offset);
1047 	int base = blkoff(fs, offset);
1048 	int bsize = blksize(fs, ip, lbn);
1049 	daddr_t bn = fsbtodb(fs, bmap(ip, lbn, B_WRITE, base, bsize));
1050 	register struct buf *bp;
1051 
1052 	if (u.u_error)
1053 		return (0);
1054 	bp = bread(ip->i_dev, bn, bsize);
1055 	if (bp->b_flags & B_ERROR) {
1056 		brelse(bp);
1057 		return (0);
1058 	}
1059 	if (res)
1060 		*res = bp->b_un.b_addr + base;
1061 	return (bp);
1062 }
1063 
1064 /*
1065  * Check if a directory is empty or not.
1066  * Inode supplied must be locked.
1067  *
1068  * Using a struct dirtemplate here is not precisely
1069  * what we want, but better than using a struct direct.
1070  *
1071  * NB: does not handle corrupted directories.
1072  */
1073 dirempty(ip, parentino)
1074 	register struct inode *ip;
1075 	ino_t parentino;
1076 {
1077 	register off_t off;
1078 	struct dirtemplate dbuf;
1079 	register struct direct *dp = (struct direct *)&dbuf;
1080 	int error, count;
1081 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
1082 
1083 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
1084 		error = rdwri(UIO_READ, ip, (caddr_t)dp, MINDIRSIZ,
1085 		    off, 1, &count);
1086 		/*
1087 		 * Since we read MINDIRSIZ, residual must
1088 		 * be 0 unless we're at end of file.
1089 		 */
1090 		if (error || count != 0)
1091 			return (0);
1092 		/* skip empty entries */
1093 		if (dp->d_ino == 0)
1094 			continue;
1095 		/* accept only "." and ".." */
1096 		if (dp->d_namlen > 2)
1097 			return (0);
1098 		if (dp->d_name[0] != '.')
1099 			return (0);
1100 		/*
1101 		 * At this point d_namlen must be 1 or 2.
1102 		 * 1 implies ".", 2 implies ".." if second
1103 		 * char is also "."
1104 		 */
1105 		if (dp->d_namlen == 1)
1106 			continue;
1107 		if (dp->d_name[1] == '.' && dp->d_ino == parentino)
1108 			continue;
1109 		return (0);
1110 	}
1111 	return (1);
1112 }
1113 
1114 /*
1115  * Check if source directory is in the path of the target directory.
1116  * Target is supplied locked, source is unlocked.
1117  * The target is always iput() before returning.
1118  */
1119 checkpath(source, target)
1120 	struct inode *source, *target;
1121 {
1122 	struct dirtemplate dirbuf;
1123 	register struct inode *ip;
1124 	int error = 0;
1125 
1126 	ip = target;
1127 	if (ip->i_number == source->i_number) {
1128 		error = EEXIST;
1129 		goto out;
1130 	}
1131 	if (ip->i_number == ROOTINO)
1132 		goto out;
1133 
1134 	for (;;) {
1135 		if ((ip->i_mode&IFMT) != IFDIR) {
1136 			error = ENOTDIR;
1137 			break;
1138 		}
1139 		error = rdwri(UIO_READ, ip, (caddr_t)&dirbuf,
1140 			sizeof (struct dirtemplate), (off_t)0, 1, (int *)0);
1141 		if (error != 0)
1142 			break;
1143 		if (dirbuf.dotdot_namlen != 2 ||
1144 		    dirbuf.dotdot_name[0] != '.' ||
1145 		    dirbuf.dotdot_name[1] != '.') {
1146 			error = ENOTDIR;
1147 			break;
1148 		}
1149 		if (dirbuf.dotdot_ino == source->i_number) {
1150 			error = EINVAL;
1151 			break;
1152 		}
1153 		if (dirbuf.dotdot_ino == ROOTINO)
1154 			break;
1155 		iput(ip);
1156 		ip = iget(ip->i_dev, ip->i_fs, dirbuf.dotdot_ino);
1157 		if (ip == NULL) {
1158 			error = u.u_error;
1159 			break;
1160 		}
1161 	}
1162 
1163 out:
1164 	if (error == ENOTDIR)
1165 		printf("checkpath: .. not a directory\n");
1166 	if (ip != NULL)
1167 		iput(ip);
1168 	return (error);
1169 }
1170 
1171 /*
1172  * Name cache initialization, from main() when we are booting
1173  */
1174 nchinit()
1175 {
1176 	register union nchash *nchp;
1177 	register struct nch *ncp;
1178 
1179 	nchhead = 0;
1180 	nchtail = &nchhead;
1181 
1182 	for (ncp = nch; ncp < &nch[nchsize]; ncp++) {
1183 		ncp->nc_forw = ncp;			/* hash chain */
1184 		ncp->nc_back = ncp;
1185 
1186 		ncp->nc_nxt = NULL;			/* lru chain */
1187 		*nchtail = ncp;
1188 		ncp->nc_prev = nchtail;
1189 		nchtail = &ncp->nc_nxt;
1190 
1191 		/* all else is zero already */
1192 	}
1193 
1194 	for (nchp = nchash; nchp < &nchash[NCHHASH]; nchp++) {
1195 		nchp->nch_head[0] = nchp;
1196 		nchp->nch_head[1] = nchp;
1197 	}
1198 }
1199 
1200 /*
1201  * Cache flush, called when filesys is umounted to
1202  * remove entries that would now be invalid
1203  *
1204  * The line "nxtcp = nchhead" near the end is to avoid potential problems
1205  * if the cache lru chain is modified while we are dumping the
1206  * inode.  This makes the algorithm O(n^2), but do you think I care?
1207  */
1208 nchinval(dev)
1209 	register dev_t dev;
1210 {
1211 	register struct nch *ncp, *nxtcp;
1212 
1213 	for (ncp = nchhead; ncp; ncp = nxtcp) {
1214 		nxtcp = ncp->nc_nxt;
1215 
1216 		if (ncp->nc_ip == NULL ||
1217 		    (ncp->nc_idev != dev && ncp->nc_dev != dev))
1218 			continue;
1219 
1220 			/* free the resources we had */
1221 		ncp->nc_idev = NODEV;
1222 		ncp->nc_dev = NODEV;
1223 		ncp->nc_id = NULL;
1224 		ncp->nc_ino = 0;
1225 		ncp->nc_ip = NULL;
1226 
1227 
1228 			/* remove the entry from its hash chain */
1229 		remque(ncp);
1230 			/* and make a dummy one */
1231 		ncp->nc_forw = ncp;
1232 		ncp->nc_back = ncp;
1233 
1234 			/* delete this entry from LRU chain */
1235 		*ncp->nc_prev = nxtcp;
1236 		if (nxtcp)
1237 			nxtcp->nc_prev = ncp->nc_prev;
1238 		else
1239 			nchtail = ncp->nc_prev;
1240 
1241 			/* cause rescan of list, it may have altered */
1242 		nxtcp = nchhead;
1243 			/* put the now-free entry at head of LRU */
1244 		ncp->nc_nxt = nxtcp;
1245 		ncp->nc_prev = &nchhead;
1246 		nxtcp->nc_prev = &ncp->nc_nxt;
1247 		nchhead = ncp;
1248 	}
1249 }
1250