xref: /csrg-svn/sys/kern/vfs_lookup.c (revision 18663)
1 /*	vfs_lookup.c	6.22	85/04/19	*/
2 
3 #include "param.h"
4 #include "systm.h"
5 #include "inode.h"
6 #include "fs.h"
7 #include "mount.h"
8 #include "dir.h"
9 #include "user.h"
10 #include "buf.h"
11 #include "conf.h"
12 #include "uio.h"
13 #include "kernel.h"
14 
15 struct	buf *blkatoff();
16 struct	buf *freenamebuf;
17 int	dirchk = 0;
18 
19 /*
20  * Structures associated with name cacheing.
21  */
22 #define	NCHHASH		32	/* size of hash table */
23 
24 #if	((NCHHASH)&((NCHHASH)-1)) != 0
25 #define	NHASH(h, i, d)	((unsigned)((h) + (i) + 13 * (int)(d)) % (NCHHASH))
26 #else
27 #define	NHASH(h, i, d)	((unsigned)((h) + (i) + 13 * (int)(d)) & ((NCHHASH)-1))
28 #endif
29 
30 union	nchash	{
31 	union	nchash	*nch_head[2];
32 	struct	nch	*nch_chain[2];
33 } nchash[NCHHASH];
34 #define	nch_forw	nch_chain[0]
35 #define	nch_back	nch_chain[1]
36 
37 struct	nch	*nchhead, **nchtail;	/* LRU chain pointers */
38 struct	nchstats nchstats;		/* cache effectiveness statistics */
39 
40 /*
41  * Convert a pathname into a pointer to a locked inode,
42  * with side effects usable in creating and removing files.
43  * This is a very central and rather complicated routine.
44  *
45  * The segflg defines whether the name is to be copied from user
46  * space or kernel space.
47  *
48  * The flag argument is (LOOKUP, CREATE, DELETE) depending on whether
49  * the name is to be (looked up, created, deleted).  If flag has
50  * LOCKPARENT or'ed into it and the target of the pathname exists,
51  * namei returns both the target and its parent directory locked.
52  * If the file system is not maintained in a strict tree hierarchy,
53  * this can result in a deadlock situation.  When creating and
54  * LOCKPARENT is specified, the target may not be ".".  When deleting
55  * and LOCKPARENT is specified, the target may be ".", but the caller
56  * must check to insure it does an irele and iput instead of two iputs.
57  *
58  * The FOLLOW flag is set when symbolic links are to be followed
59  * when they occur at the end of the name translation process.
60  *
61  * Name caching works as follows:
62  *
63  *	names found by directory scans are retained in a cache
64  *	for future reference.  It is managed LRU, so frequently
65  *	used names will hang around.  Cache is indexed by hash value
66  *	obtained from (ino,dev,name) where ino & dev refer to the
67  *	directory containing name.
68  *
69  *	For simplicity (and economy of storage), names longer than
70  *	some (small) maximum length are not cached, they occur
71  *	infrequently in any case, and are almost never of interest.
72  *
73  *	Upon reaching the last segment of a path, if the reference
74  *	is for DELETE, or NOCACHE is set (rewrite), and the
75  *	name is located in the cache, it will be dropped.
76  *
77  *	We must be sure never to enter the name ".." into the cache
78  *	because of the extremely kludgey way that rename() alters
79  *	".." in a situation like
80  *		mv a/x b/x
81  *	where x is a directory, and x/.. is the ".." in question.
82  *
83  * Overall outline of namei:
84  *
85  *	copy in name
86  *	get starting directory
87  * dirloop:
88  *	check accessibility of directory
89  * dirloop2:
90  *	copy next component of name to ndp->ni_dent
91  *	handle degenerate case where name is null string
92  *	look for name in cache, if found, then if at end of path
93  *	  and deleting or creating, drop it, else to haveino
94  *	search for name in directory, to found or notfound
95  * notfound:
96  *	if creating, return locked directory, leaving info on avail. slots
97  *	else return error
98  * found:
99  *	if at end of path and deleting, return information to allow delete
100  *	if at end of path and rewriting (create and LOCKPARENT), lock target
101  *	  inode and return info to allow rewrite
102  *	if .. and on mounted filesys, look in mount table for parent
103  *	if not at end, if neither creating nor deleting, add name to cache
104  * haveino:
105  *	if symbolic link, massage name in buffer and continue at dirloop
106  *	if more components of name, do next level at dirloop
107  *	return the answer as locked inode
108  *
109  * NOTE: (LOOKUP | LOCKPARENT) currently returns the parent inode,
110  *	 but unlocked.
111  */
112 struct inode *
113 namei(ndp)
114 	register struct nameidata *ndp;
115 {
116 	register char *cp;		/* pointer into pathname argument */
117 /* these variables refer to things which must be freed or unlocked */
118 	register struct inode *dp = 0;	/* the directory we are searching */
119 	register struct nch *ncp;	/* cache slot for entry */
120 	register struct fs *fs;		/* file system that directory is in */
121 	register struct buf *bp = 0;	/* a buffer of directory entries */
122 	register struct direct *ep;	/* the current directory entry */
123 	int entryoffsetinblock;		/* offset of ep in bp's buffer */
124 	register struct buf *nbp;	/* buffer storing path name argument */
125 /* these variables hold information about the search for a slot */
126 	enum {NONE, COMPACT, FOUND} slotstatus;
127 	int slotoffset = -1;		/* offset of area with free space */
128 	int slotsize;			/* size of area at slotoffset */
129 	int slotfreespace;		/* amount of space free in slot */
130 	int slotneeded;			/* size of the entry we're seeking */
131 /* */
132 	int numdirpasses;		/* strategy for directory search */
133 	int endsearch;			/* offset to end directory search */
134 	int prevoff;			/* ndp->ni_offset of previous entry */
135 	int nlink = 0;			/* number of symbolic links taken */
136 	struct inode *pdp;		/* saved dp during symlink work */
137 	int error, i;
138 	int lockparent;
139 	int docache;			/* == 0 do not cache last component */
140 	int makeentry;			/* != 0 if name to be added to cache */
141 	unsigned hash;			/* value of name hash for entry */
142 	union nchash *nhp;		/* cache chain head for entry */
143 	int isdotdot;			/* != 0 if current name is ".." */
144 	int flag;			/* op ie, LOOKUP, CREATE, or DELETE */
145 	off_t enduseful;		/* pointer past last used dir slot */
146 
147 	lockparent = ndp->ni_nameiop & LOCKPARENT;
148 	docache = (ndp->ni_nameiop & NOCACHE) ^ NOCACHE;
149 	flag = ndp->ni_nameiop &~ (LOCKPARENT|NOCACHE|FOLLOW);
150 	if (flag == DELETE || lockparent)
151 		docache = 0;
152 	/*
153 	 * Get a buffer for the name to be translated, and copy the
154 	 * name into the buffer.
155 	 */
156 	nbp = freenamebuf;
157 	if (nbp == NULL)
158 		nbp = geteblk(MAXPATHLEN);
159 	else
160 		freenamebuf = nbp->av_forw;
161 	if (ndp->ni_segflg == UIO_SYSSPACE)
162 		error = copystr(ndp->ni_dirp, nbp->b_un.b_addr, MAXPATHLEN,
163 		    (u_int *)0);
164 	else
165 		error = copyinstr(ndp->ni_dirp, nbp->b_un.b_addr, MAXPATHLEN,
166 		    (u_int *)0);
167 	if (error) {
168 		u.u_error = error;
169 		goto bad;
170 	}
171 
172 	/*
173 	 * Get starting directory.
174 	 */
175 	cp = nbp->b_un.b_addr;
176 	if (*cp == '/') {
177 		while (*cp == '/')
178 			cp++;
179 		if ((dp = u.u_rdir) == NULL)
180 			dp = rootdir;
181 	} else
182 		dp = u.u_cdir;
183 	fs = dp->i_fs;
184 	ILOCK(dp);
185 	dp->i_count++;
186 	ndp->ni_pdir = (struct inode *)0xc0000000;		/* illegal */
187 	ndp->ni_endoff = 0;
188 
189 	/*
190 	 * We come to dirloop to search a new directory.
191 	 * The directory must be locked so that it can be
192 	 * iput, and fs must be already set to dp->i_fs.
193 	 */
194 dirloop:
195 	/*
196 	 * Check accessiblity of directory.
197 	 */
198 	if ((dp->i_mode&IFMT) != IFDIR) {
199 		u.u_error = ENOTDIR;
200 		goto bad;
201 	}
202 	if (access(dp, IEXEC))
203 		goto bad;
204 
205 dirloop2:
206 	/*
207 	 * Copy next component of name to ndp->ni_dent.
208 	 */
209 	hash = 0;
210 	for (i = 0; *cp != 0 && *cp != '/'; cp++) {
211 		if (i >= MAXNAMLEN) {
212 			u.u_error = ENOENT;
213 			goto bad;
214 		}
215 		if ((*cp&0377) == ('/'|0200) || (*cp&0200) && flag != DELETE) {
216 			u.u_error = EPERM;
217 			goto bad;
218 		}
219 		ndp->ni_dent.d_name[i++] = *cp;
220 		hash += (unsigned char)*cp * i;
221 	}
222 	ndp->ni_dent.d_namlen = i;
223 	ndp->ni_dent.d_name[i] = '\0';
224 	isdotdot = (i == 2 &&
225 		ndp->ni_dent.d_name[0] == '.' && ndp->ni_dent.d_name[1] == '.');
226 	makeentry = 1;
227 	if (*cp == '\0' && docache == 0)
228 		makeentry = 0;
229 
230 	/*
231 	 * Check for degenerate name (e.g. / or "")
232 	 * which is a way of talking about a directory,
233 	 * e.g. like "/." or ".".
234 	 */
235 	if (ndp->ni_dent.d_name[0] == '\0') {
236 		if (flag != LOOKUP || lockparent) {
237 			u.u_error = EISDIR;
238 			goto bad;
239 		}
240 		nbp->av_forw = freenamebuf;
241 		freenamebuf = nbp;
242 		return (dp);
243 	}
244 
245 	/*
246 	 * We now have a segment name to search for, and a directory to search.
247 	 *
248 	 * Before tediously performing a linear scan of the directory,
249 	 * check the name cache to see if the directory/name pair
250 	 * we are looking for is known already.  We don't do this
251 	 * if the segment name is long, simply so the cache can avoid
252 	 * holding long names (which would either waste space, or
253 	 * add greatly to the complexity).
254 	 */
255 	if (ndp->ni_dent.d_namlen > NCHNAMLEN) {
256 		nchstats.ncs_long++;
257 		makeentry = 0;
258 	} else {
259 		nhp = &nchash[NHASH(hash, dp->i_number, dp->i_dev)];
260 		for (ncp = nhp->nch_forw; ncp != (struct nch *)nhp;
261 		    ncp = ncp->nc_forw) {
262 			if (ncp->nc_ino == dp->i_number &&
263 			    ncp->nc_dev == dp->i_dev &&
264 			    ncp->nc_nlen == ndp->ni_dent.d_namlen &&
265 			    !bcmp(ncp->nc_name, ndp->ni_dent.d_name,
266 				ncp->nc_nlen))
267 				break;
268 		}
269 
270 		if (ncp == (struct nch *)nhp) {
271 			nchstats.ncs_miss++;
272 			ncp = NULL;
273 		} else {
274 			if (ncp->nc_id != ncp->nc_ip->i_id) {
275 				nchstats.ncs_falsehits++;
276 			} else if (!makeentry) {
277 				nchstats.ncs_badhits++;
278 			} else {
279 
280 					/*
281 					 * move this slot to end of LRU
282 					 * chain, if not already there
283 					 */
284 				if (ncp->nc_nxt) {
285 						/* remove from LRU chain */
286 					*ncp->nc_prev = ncp->nc_nxt;
287 					ncp->nc_nxt->nc_prev = ncp->nc_prev;
288 
289 						/* and replace at end of it */
290 					ncp->nc_nxt = NULL;
291 					ncp->nc_prev = nchtail;
292 					*nchtail = ncp;
293 					nchtail = &ncp->nc_nxt;
294 				}
295 
296 				/*
297 				 * Get the next inode in the path.
298 				 * See comment above other `IUNLOCK' code for
299 				 * an explaination of the locking protocol.
300 				 */
301 				pdp = dp;
302 				if (!isdotdot || dp != u.u_rdir)
303 					dp = ncp->nc_ip;
304 				if (dp == NULL)
305 					panic("nami: null cache ino");
306 				if (pdp == dp) {
307 					dp->i_count++;
308 				} else if (isdotdot) {
309 					IUNLOCK(pdp);
310 					igrab(dp);
311 				} else {
312 					igrab(dp);
313 					IUNLOCK(pdp);
314 				}
315 
316 				/*
317 				 * Verify that the inode that we got
318 				 * did not change while we were waiting
319 				 * for it to be locked.
320 				 */
321 				if (ncp->nc_id != ncp->nc_ip->i_id) {
322 					iput(dp);
323 					ILOCK(pdp);
324 					dp = pdp;
325 					nchstats.ncs_falsehits++;
326 				} else {
327 					ndp->ni_dent.d_ino = dp->i_number;
328 					/* ni_dent.d_reclen is garbage ... */
329 					nchstats.ncs_goodhits++;
330 					goto haveino;
331 				}
332 			}
333 
334 			/*
335 			 * Last component and we are renaming or deleting,
336 			 * the cache entry is invalid, or otherwise don't
337 			 * want cache entry to exist.
338 			 */
339 
340 				/* remove from LRU chain */
341 			*ncp->nc_prev = ncp->nc_nxt;
342 			if (ncp->nc_nxt)
343 				ncp->nc_nxt->nc_prev = ncp->nc_prev;
344 			else
345 				nchtail = ncp->nc_prev;
346 
347 				/* remove from hash chain */
348 			remque(ncp);
349 
350 				/* insert at head of LRU list (first to grab) */
351 			ncp->nc_nxt = nchhead;
352 			ncp->nc_prev = &nchhead;
353 			nchhead->nc_prev = &ncp->nc_nxt;
354 			nchhead = ncp;
355 
356 				/* and make a dummy hash chain */
357 			ncp->nc_forw = ncp;
358 			ncp->nc_back = ncp;
359 
360 			ncp = NULL;
361 		}
362 	}
363 
364 	/*
365 	 * Suppress search for slots unless creating
366 	 * file and at end of pathname, in which case
367 	 * we watch for a place to put the new file in
368 	 * case it doesn't already exist.
369 	 */
370 	slotstatus = FOUND;
371 	if (flag == CREATE && *cp == 0) {
372 		slotstatus = NONE;
373 		slotfreespace = 0;
374 		slotneeded = DIRSIZ(&ndp->ni_dent);
375 	}
376 	/*
377 	 * If this is the same directory that this process
378 	 * previously searched, pick up where we last left off.
379 	 * We cache only lookups as these are the most common
380 	 * and have the greatest payoff. Caching CREATE has little
381 	 * benefit as it usually must search the entire directory
382 	 * to determine that the entry does not exist. Caching the
383 	 * location of the last DELETE has not reduced profiling time
384 	 * and hence has been removed in the interest of simplicity.
385 	 */
386 	if (flag != LOOKUP || dp->i_number != u.u_ncache.nc_inumber ||
387 	    dp->i_dev != u.u_ncache.nc_dev) {
388 		ndp->ni_offset = 0;
389 		numdirpasses = 1;
390 	} else {
391 		if ((dp->i_flag & ICHG) || dp->i_ctime >= u.u_ncache.nc_time) {
392 			if (u.u_ncache.nc_prevoffset > dp->i_size)
393 				u.u_ncache.nc_prevoffset = 0;
394 			else
395 				u.u_ncache.nc_prevoffset &= ~(DIRBLKSIZ - 1);
396 			u.u_ncache.nc_time = time.tv_sec;
397 		}
398 		ndp->ni_offset = u.u_ncache.nc_prevoffset;
399 		entryoffsetinblock = blkoff(fs, ndp->ni_offset);
400 		if (entryoffsetinblock != 0) {
401 			bp = blkatoff(dp, ndp->ni_offset, (char **)0);
402 			if (bp == 0)
403 				goto bad;
404 		}
405 		numdirpasses = 2;
406 		nchstats.ncs_2passes++;
407 	}
408 	endsearch = roundup(dp->i_size, DIRBLKSIZ);
409 	enduseful = 0;
410 
411 searchloop:
412 	while (ndp->ni_offset < endsearch) {
413 		/*
414 		 * If offset is on a block boundary,
415 		 * read the next directory block.
416 		 * Release previous if it exists.
417 		 */
418 		if (blkoff(fs, ndp->ni_offset) == 0) {
419 			if (bp != NULL)
420 				brelse(bp);
421 			bp = blkatoff(dp, ndp->ni_offset, (char **)0);
422 			if (bp == 0)
423 				goto bad;
424 			entryoffsetinblock = 0;
425 		}
426 
427 		/*
428 		 * If still looking for a slot, and at a DIRBLKSIZE
429 		 * boundary, have to start looking for free space again.
430 		 */
431 		if (slotstatus == NONE &&
432 		    (entryoffsetinblock&(DIRBLKSIZ-1)) == 0) {
433 			slotoffset = -1;
434 			slotfreespace = 0;
435 		}
436 
437 		/*
438 		 * Get pointer to next entry.
439 		 * Full validation checks are slow, so we only check
440 		 * enough to insure forward progress through the
441 		 * directory. Complete checks can be run by patching
442 		 * "dirchk" to be true.
443 		 */
444 		ep = (struct direct *)(bp->b_un.b_addr + entryoffsetinblock);
445 		if (ep->d_reclen <= 0 ||
446 		    dirchk && dirbadentry(ep, entryoffsetinblock)) {
447 			dirbad(dp, ndp->ni_offset, "mangled entry");
448 			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
449 			ndp->ni_offset += i;
450 			entryoffsetinblock += i;
451 			continue;
452 		}
453 
454 		/*
455 		 * If an appropriate sized slot has not yet been found,
456 		 * check to see if one is available. Also accumulate space
457 		 * in the current block so that we can determine if
458 		 * compaction is viable.
459 		 */
460 		if (slotstatus != FOUND) {
461 			int size = ep->d_reclen;
462 
463 			if (ep->d_ino != 0)
464 				size -= DIRSIZ(ep);
465 			if (size > 0) {
466 				if (size >= slotneeded) {
467 					slotstatus = FOUND;
468 					slotoffset = ndp->ni_offset;
469 					slotsize = ep->d_reclen;
470 				} else if (slotstatus == NONE) {
471 					slotfreespace += size;
472 					if (slotoffset == -1)
473 						slotoffset = ndp->ni_offset;
474 					if (slotfreespace >= slotneeded) {
475 						slotstatus = COMPACT;
476 						slotsize = ndp->ni_offset +
477 						      ep->d_reclen - slotoffset;
478 					}
479 				}
480 			}
481 		}
482 
483 		/*
484 		 * Check for a name match.
485 		 */
486 		if (ep->d_ino) {
487 			if (ep->d_namlen == ndp->ni_dent.d_namlen &&
488 			    !bcmp(ndp->ni_dent.d_name, ep->d_name,
489 				ep->d_namlen))
490 				goto found;
491 		}
492 		prevoff = ndp->ni_offset;
493 		ndp->ni_offset += ep->d_reclen;
494 		entryoffsetinblock += ep->d_reclen;
495 		if (ep->d_ino)
496 			enduseful = ndp->ni_offset;
497 	}
498 /* notfound: */
499 	/*
500 	 * If we started in the middle of the directory and failed
501 	 * to find our target, we must check the beginning as well.
502 	 */
503 	if (numdirpasses == 2) {
504 		numdirpasses--;
505 		ndp->ni_offset = 0;
506 		endsearch = u.u_ncache.nc_prevoffset;
507 		goto searchloop;
508 	}
509 	/*
510 	 * If creating, and at end of pathname and current
511 	 * directory has not been removed, then can consider
512 	 * allowing file to be created.
513 	 */
514 	if (flag == CREATE && *cp == 0 && dp->i_nlink != 0) {
515 		/*
516 		 * Access for write is interpreted as allowing
517 		 * creation of files in the directory.
518 		 */
519 		if (access(dp, IWRITE))
520 			goto bad;
521 		/*
522 		 * Return an indication of where the new directory
523 		 * entry should be put.  If we didn't find a slot,
524 		 * then set ndp->ni_count to 0 indicating that the new
525 		 * slot belongs at the end of the directory. If we found
526 		 * a slot, then the new entry can be put in the range
527 		 * [ndp->ni_offset .. ndp->ni_offset + ndp->ni_count)
528 		 */
529 		if (slotstatus == NONE) {
530 			ndp->ni_offset = roundup(dp->i_size, DIRBLKSIZ);
531 			ndp->ni_count = 0;
532 			enduseful = ndp->ni_offset;
533 		} else {
534 			ndp->ni_offset = slotoffset;
535 			ndp->ni_count = slotsize;
536 			if (enduseful < slotoffset + slotsize)
537 				enduseful = slotoffset + slotsize;
538 		}
539 		ndp->ni_endoff = roundup(enduseful, DIRBLKSIZ);
540 		dp->i_flag |= IUPD|ICHG;
541 		if (bp)
542 			brelse(bp);
543 		nbp->av_forw = freenamebuf;
544 		freenamebuf = nbp;
545 		/*
546 		 * We return with the directory locked, so that
547 		 * the parameters we set up above will still be
548 		 * valid if we actually decide to do a direnter().
549 		 * We return NULL to indicate that the entry doesn't
550 		 * currently exist, leaving a pointer to the (locked)
551 		 * directory inode in ndp->ni_pdir.
552 		 */
553 		ndp->ni_pdir = dp;
554 		return (NULL);
555 	}
556 	u.u_error = ENOENT;
557 	goto bad;
558 found:
559 	if (numdirpasses == 2)
560 		nchstats.ncs_pass2++;
561 	/*
562 	 * Check that directory length properly reflects presence
563 	 * of this entry.
564 	 */
565 	if (entryoffsetinblock + DIRSIZ(ep) > dp->i_size) {
566 		dirbad(dp, ndp->ni_offset, "i_size too small");
567 		dp->i_size = entryoffsetinblock + DIRSIZ(ep);
568 		dp->i_flag |= IUPD|ICHG;
569 	}
570 
571 	/*
572 	 * Found component in pathname.
573 	 * If the final component of path name, save information
574 	 * in the cache as to where the entry was found.
575 	 */
576 	if (*cp == '\0' && flag == LOOKUP) {
577 		u.u_ncache.nc_prevoffset = ndp->ni_offset;
578 		u.u_ncache.nc_inumber = dp->i_number;
579 		u.u_ncache.nc_dev = dp->i_dev;
580 		u.u_ncache.nc_time = time.tv_sec;
581 	}
582 	/*
583 	 * Save directory entry's inode number and reclen in ndp->ni_dent,
584 	 * and release directory buffer.
585 	 */
586 	ndp->ni_dent.d_ino = ep->d_ino;
587 	ndp->ni_dent.d_reclen = ep->d_reclen;
588 	brelse(bp);
589 	bp = NULL;
590 
591 	/*
592 	 * If deleting, and at end of pathname, return
593 	 * parameters which can be used to remove file.
594 	 * If the lockparent flag isn't set, we return only
595 	 * the directory (in ndp->ni_pdir), otherwise we go
596 	 * on and lock the inode, being careful with ".".
597 	 */
598 	if (flag == DELETE && *cp == 0) {
599 		/*
600 		 * Write access to directory required to delete files.
601 		 */
602 		if (access(dp, IWRITE))
603 			goto bad;
604 		ndp->ni_pdir = dp;		/* for dirremove() */
605 		/*
606 		 * Return pointer to current entry in ndp->ni_offset,
607 		 * and distance past previous entry (if there
608 		 * is a previous entry in this block) in ndp->ni_count.
609 		 * Save directory inode pointer in ndp->ni_pdir for dirremove().
610 		 */
611 		if ((ndp->ni_offset&(DIRBLKSIZ-1)) == 0)
612 			ndp->ni_count = 0;
613 		else
614 			ndp->ni_count = ndp->ni_offset - prevoff;
615 		if (lockparent) {
616 			if (dp->i_number == ndp->ni_dent.d_ino)
617 				dp->i_count++;
618 			else {
619 				dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
620 				if (dp == NULL) {
621 					iput(ndp->ni_pdir);
622 					goto bad;
623 				}
624 				/*
625 				 * If directory is "sticky", then user must own
626 				 * the directory, or the file in it, else he
627 				 * may not delete it (unless he's root). This
628 				 * implements append-only directories.
629 				 */
630 				if ((ndp->ni_pdir->i_mode & ISVTX) &&
631 				    u.u_uid != 0 &&
632 				    u.u_uid != ndp->ni_pdir->i_uid &&
633 				    dp->i_uid != u.u_uid) {
634 					iput(ndp->ni_pdir);
635 					u.u_error = EPERM;
636 					goto bad;
637 				}
638 			}
639 		}
640 		nbp->av_forw = freenamebuf;
641 		freenamebuf = nbp;
642 		return (dp);
643 	}
644 
645 	/*
646 	 * Special handling for ".." allowing chdir out of mounted
647 	 * file system: indirect .. in root inode to reevaluate
648 	 * in directory file system was mounted on.
649 	 */
650 	if (isdotdot) {
651 		if (dp == u.u_rdir) {
652 			ndp->ni_dent.d_ino = dp->i_number;
653 			makeentry = 0;
654 		} else if (ndp->ni_dent.d_ino == ROOTINO &&
655 		   dp->i_number == ROOTINO) {
656 			for (i = 1; i < NMOUNT; i++)
657 			if (mount[i].m_bufp != NULL &&
658 			   mount[i].m_dev == dp->i_dev) {
659 				iput(dp);
660 				dp = mount[i].m_inodp;
661 				ILOCK(dp);
662 				dp->i_count++;
663 				fs = dp->i_fs;
664 				cp -= 2;     /* back over .. */
665 				goto dirloop2;
666 			}
667 		}
668 	}
669 
670 	/*
671 	 * If rewriting (rename), return the inode and the
672 	 * information required to rewrite the present directory
673 	 * Must get inode of directory entry to verify it's a
674 	 * regular file, or empty directory.
675 	 */
676 	if ((flag == CREATE && lockparent) && *cp == 0) {
677 		if (access(dp, IWRITE))
678 			goto bad;
679 		ndp->ni_pdir = dp;		/* for dirrewrite() */
680 		/*
681 		 * Careful about locking second inode.
682 		 * This can only occur if the target is ".".
683 		 */
684 		if (dp->i_number == ndp->ni_dent.d_ino) {
685 			u.u_error = EISDIR;		/* XXX */
686 			goto bad;
687 		}
688 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
689 		if (dp == NULL) {
690 			iput(ndp->ni_pdir);
691 			goto bad;
692 		}
693 		nbp->av_forw = freenamebuf;
694 		freenamebuf = nbp;
695 		return (dp);
696 	}
697 
698 	/*
699 	 * Check for symbolic link, which may require us to massage the
700 	 * name before we continue translation.  We do not `iput' the
701 	 * directory because we may need it again if the symbolic link
702 	 * is relative to the current directory.  Instead we save it
703 	 * unlocked as "pdp".  We must get the target inode before unlocking
704 	 * the directory to insure that the inode will not be removed
705 	 * before we get it.  We prevent deadlock by always fetching
706 	 * inodes from the root, moving down the directory tree. Thus
707 	 * when following backward pointers ".." we must unlock the
708 	 * parent directory before getting the requested directory.
709 	 * There is a potential race condition here if both the current
710 	 * and parent directories are removed before the `iget' for the
711 	 * inode associated with ".." returns.  We hope that this occurs
712 	 * infrequently since we cannot avoid this race condition without
713 	 * implementing a sophisticated deadlock detection algorithm.
714 	 * Note also that this simple deadlock detection scheme will not
715 	 * work if the file system has any hard links other than ".."
716 	 * that point backwards in the directory structure.
717 	 */
718 	pdp = dp;
719 	if (isdotdot) {
720 		IUNLOCK(pdp);	/* race to get the inode */
721 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
722 		if (dp == NULL)
723 			goto bad2;
724 	} else if (dp->i_number == ndp->ni_dent.d_ino) {
725 		dp->i_count++;	/* we want ourself, ie "." */
726 	} else {
727 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
728 		IUNLOCK(pdp);
729 		if (dp == NULL)
730 			goto bad2;
731 	}
732 
733 	/*
734 	 * insert name into cache (if we want it, and it isn't "." or "..")
735 	 *
736 	 * all other cases where making a cache entry would be wrong
737 	 * have already departed from the code sequence somewhere above.
738 	 */
739 	if (makeentry) {
740 		if (ncp != NULL)
741 			panic("nami: duplicating cache");
742 
743 			/*
744 			 * free the cache slot at head of lru chain
745 			 */
746 		if (ncp = nchhead) {
747 				/* remove from lru chain */
748 			*ncp->nc_prev = ncp->nc_nxt;
749 			if (ncp->nc_nxt)
750 				ncp->nc_nxt->nc_prev = ncp->nc_prev;
751 			else
752 				nchtail = ncp->nc_prev;
753 
754 				/* remove from old hash chain */
755 			remque(ncp);
756 
757 				/* grab the inode we just found */
758 			ncp->nc_ip = dp;
759 
760 				/* fill in cache info */
761 			ncp->nc_ino = pdp->i_number;	/* parents inum */
762 			ncp->nc_dev = pdp->i_dev;	/* & device */
763 			ncp->nc_idev = dp->i_dev;	/* our device */
764 			ncp->nc_id = dp->i_id;		/* identifier */
765 			ncp->nc_nlen = ndp->ni_dent.d_namlen;
766 			bcopy(ndp->ni_dent.d_name, ncp->nc_name, ncp->nc_nlen);
767 
768 				/* link at end of lru chain */
769 			ncp->nc_nxt = NULL;
770 			ncp->nc_prev = nchtail;
771 			*nchtail = ncp;
772 			nchtail = &ncp->nc_nxt;
773 
774 				/* and insert on hash chain */
775 			insque(ncp, nhp);
776 		}
777 	}
778 
779 haveino:
780 	fs = dp->i_fs;
781 
782 	/*
783 	 * Check for symbolic link
784 	 */
785 	if ((dp->i_mode & IFMT) == IFLNK &&
786 	    ((ndp->ni_nameiop & FOLLOW) || *cp == '/')) {
787 		u_int pathlen = strlen(cp) + 1;
788 
789 		if (dp->i_size + pathlen >= MAXPATHLEN - 1 ||
790 		    ++nlink > MAXSYMLINKS) {
791 			u.u_error = ELOOP;
792 			goto bad2;
793 		}
794 		ovbcopy(cp, nbp->b_un.b_addr + dp->i_size, pathlen);
795 		u.u_error =
796 		    rdwri(UIO_READ, dp, nbp->b_un.b_addr, (int)dp->i_size,
797 			0, 1, (int *)0);
798 		if (u.u_error)
799 			goto bad2;
800 		cp = nbp->b_un.b_addr;
801 		iput(dp);
802 		if (*cp == '/') {
803 			irele(pdp);
804 			while (*cp == '/')
805 				cp++;
806 			if ((dp = u.u_rdir) == NULL)
807 				dp = rootdir;
808 			ILOCK(dp);
809 			dp->i_count++;
810 		} else {
811 			dp = pdp;
812 			ILOCK(dp);
813 		}
814 		fs = dp->i_fs;
815 		goto dirloop;
816 	}
817 
818 	/*
819 	 * Not a symbolic link.  If more pathname,
820 	 * continue at next component, else return.
821 	 */
822 	if (*cp == '/') {
823 		while (*cp == '/')
824 			cp++;
825 		irele(pdp);
826 		goto dirloop;
827 	}
828 	nbp->av_forw = freenamebuf;
829 	freenamebuf = nbp;
830 	if (lockparent)
831 		ndp->ni_pdir = pdp;
832 	else
833 		irele(pdp);
834 	return (dp);
835 bad2:
836 	irele(pdp);
837 bad:
838 	if (bp)
839 		brelse(bp);
840 	if (dp)
841 		iput(dp);
842 	nbp->av_forw = freenamebuf;
843 	freenamebuf = nbp;
844 	return (NULL);
845 }
846 
847 
848 dirbad(ip, offset, how)
849 	struct inode *ip;
850 	off_t offset;
851 	char *how;
852 {
853 
854 	printf("%s: bad dir ino %d at offset %d: %s\n",
855 	    ip->i_fs->fs_fsmnt, ip->i_number, offset, how);
856 }
857 
858 /*
859  * Do consistency checking on a directory entry:
860  *	record length must be multiple of 4
861  *	record length must not be non-negative
862  *	entry must fit in rest of its DIRBLKSIZ block
863  *	record must be large enough to contain entry
864  *	name is not longer than MAXNAMLEN
865  *	name must be as long as advertised, and null terminated
866  */
867 dirbadentry(ep, entryoffsetinblock)
868 	register struct direct *ep;
869 	int entryoffsetinblock;
870 {
871 	register int i;
872 
873 	if ((ep->d_reclen & 0x3) != 0 || ep->d_reclen <= 0 ||
874 	    ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) ||
875 	    ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN)
876 		return (1);
877 	for (i = 0; i < ep->d_namlen; i++)
878 		if (ep->d_name[i] == '\0')
879 			return (1);
880 	return (ep->d_name[i]);
881 }
882 
883 /*
884  * Write a directory entry after a call to namei, using the parameters
885  * which it left in the u. area.  The argument ip is the inode which
886  * the new directory entry will refer to.  The u. area field ndp->ni_pdir is
887  * a pointer to the directory to be written, which was left locked by
888  * namei.  Remaining parameters (ndp->ni_offset, ndp->ni_count) indicate
889  * how the space for the new entry is to be gotten.
890  */
891 direnter(ip, ndp)
892 	struct inode *ip;
893 	register struct nameidata *ndp;
894 {
895 	register struct direct *ep, *nep;
896 	register struct inode *dp = ndp->ni_pdir;
897 	struct buf *bp;
898 	int loc, spacefree, error = 0;
899 	u_int dsize;
900 	int newentrysize;
901 	char *dirbuf;
902 
903 	ndp->ni_dent.d_ino = ip->i_number;
904 	newentrysize = DIRSIZ(&ndp->ni_dent);
905 	if (ndp->ni_count == 0) {
906 		/*
907 		 * If ndp->ni_count is 0, then namei could find no space in the
908 		 * directory. In this case ndp->ni_offset will be on a directory
909 		 * block boundary and we will write the new entry into a fresh
910 		 * block.
911 		 */
912 		if (ndp->ni_offset&(DIRBLKSIZ-1))
913 			panic("wdir: newblk");
914 		ndp->ni_dent.d_reclen = DIRBLKSIZ;
915 		error = rdwri(UIO_WRITE, dp, (caddr_t)&ndp->ni_dent,
916 		    newentrysize, ndp->ni_offset, 1, (int *)0);
917 		if (DIRBLKSIZ > dp->i_fs->fs_fsize)
918 			panic("wdir: blksize"); /* XXX - should grow w/bmap() */
919 		else
920 			dp->i_size = roundup(dp->i_size, DIRBLKSIZ);
921 		iput(dp);
922 		return (error);
923 	}
924 
925 	/*
926 	 * If ndp->ni_count is non-zero, then namei found space for the new
927 	 * entry in the range ndp->ni_offset to ndp->ni_offset + ndp->ni_count.
928 	 * in the directory.  To use this space, we may have to compact
929 	 * the entries located there, by copying them together towards
930 	 * the beginning of the block, leaving the free space in
931 	 * one usable chunk at the end.
932 	 */
933 
934 	/*
935 	 * Increase size of directory if entry eats into new space.
936 	 * This should never push the size past a new multiple of
937 	 * DIRBLKSIZE.
938 	 *
939 	 * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN.
940 	 */
941 	if (ndp->ni_offset + ndp->ni_count > dp->i_size)
942 		dp->i_size = ndp->ni_offset + ndp->ni_count;
943 
944 	/*
945 	 * Get the block containing the space for the new directory
946 	 * entry.  Should return error by result instead of u.u_error.
947 	 */
948 	bp = blkatoff(dp, ndp->ni_offset, (char **)&dirbuf);
949 	if (bp == 0) {
950 		iput(dp);
951 		return (u.u_error);
952 	}
953 
954 	/*
955 	 * Find space for the new entry.  In the simple case, the
956 	 * entry at offset base will have the space.  If it does
957 	 * not, then namei arranged that compacting the region
958 	 * ndp->ni_offset to ndp->ni_offset+ndp->ni_count would yield the space.
959 	 */
960 	ep = (struct direct *)dirbuf;
961 	dsize = DIRSIZ(ep);
962 	spacefree = ep->d_reclen - dsize;
963 	for (loc = ep->d_reclen; loc < ndp->ni_count; ) {
964 		nep = (struct direct *)(dirbuf + loc);
965 		if (ep->d_ino) {
966 			/* trim the existing slot */
967 			ep->d_reclen = dsize;
968 			ep = (struct direct *)((char *)ep + dsize);
969 		} else {
970 			/* overwrite; nothing there; header is ours */
971 			spacefree += dsize;
972 		}
973 		dsize = DIRSIZ(nep);
974 		spacefree += nep->d_reclen - dsize;
975 		loc += nep->d_reclen;
976 		bcopy((caddr_t)nep, (caddr_t)ep, dsize);
977 	}
978 	/*
979 	 * Update the pointer fields in the previous entry (if any),
980 	 * copy in the new entry, and write out the block.
981 	 */
982 	if (ep->d_ino == 0) {
983 		if (spacefree + dsize < newentrysize)
984 			panic("wdir: compact1");
985 		ndp->ni_dent.d_reclen = spacefree + dsize;
986 	} else {
987 		if (spacefree < newentrysize)
988 			panic("wdir: compact2");
989 		ndp->ni_dent.d_reclen = spacefree;
990 		ep->d_reclen = dsize;
991 		ep = (struct direct *)((char *)ep + dsize);
992 	}
993 	bcopy((caddr_t)&ndp->ni_dent, (caddr_t)ep, (u_int)newentrysize);
994 	bwrite(bp);
995 	dp->i_flag |= IUPD|ICHG;
996 	if (ndp->ni_endoff && ndp->ni_endoff < dp->i_size)
997 		itrunc(dp, ndp->ni_endoff);
998 	iput(dp);
999 	return (error);
1000 }
1001 
1002 /*
1003  * Remove a directory entry after a call to namei, using the
1004  * parameters which it left in the u. area.  The u. entry
1005  * ni_offset contains the offset into the directory of the
1006  * entry to be eliminated.  The ni_count field contains the
1007  * size of the previous record in the directory.  If this
1008  * is 0, the first entry is being deleted, so we need only
1009  * zero the inode number to mark the entry as free.  If the
1010  * entry isn't the first in the directory, we must reclaim
1011  * the space of the now empty record by adding the record size
1012  * to the size of the previous entry.
1013  */
1014 dirremove(ndp)
1015 	register struct nameidata *ndp;
1016 {
1017 	register struct inode *dp = ndp->ni_pdir;
1018 	register struct buf *bp;
1019 	struct direct *ep;
1020 
1021 	if (ndp->ni_count == 0) {
1022 		/*
1023 		 * First entry in block: set d_ino to zero.
1024 		 */
1025 		ndp->ni_dent.d_ino = 0;
1026 		(void) rdwri(UIO_WRITE, dp, (caddr_t)&ndp->ni_dent,
1027 		    (int)DIRSIZ(&ndp->ni_dent), ndp->ni_offset, 1, (int *)0);
1028 	} else {
1029 		/*
1030 		 * Collapse new free space into previous entry.
1031 		 */
1032 		bp = blkatoff(dp, (int)(ndp->ni_offset - ndp->ni_count),
1033 			(char **)&ep);
1034 		if (bp == 0)
1035 			return (0);
1036 		ep->d_reclen += ndp->ni_dent.d_reclen;
1037 		bwrite(bp);
1038 		dp->i_flag |= IUPD|ICHG;
1039 	}
1040 	return (1);
1041 }
1042 
1043 /*
1044  * Rewrite an existing directory entry to point at the inode
1045  * supplied.  The parameters describing the directory entry are
1046  * set up by a call to namei.
1047  */
1048 dirrewrite(dp, ip, ndp)
1049 	struct inode *dp, *ip;
1050 	struct nameidata *ndp;
1051 {
1052 
1053 	ndp->ni_dent.d_ino = ip->i_number;
1054 	u.u_error = rdwri(UIO_WRITE, dp, (caddr_t)&ndp->ni_dent,
1055 		(int)DIRSIZ(&ndp->ni_dent), ndp->ni_offset, 1, (int *)0);
1056 	iput(dp);
1057 }
1058 
1059 /*
1060  * Return buffer with contents of block "offset"
1061  * from the beginning of directory "ip".  If "res"
1062  * is non-zero, fill it in with a pointer to the
1063  * remaining space in the directory.
1064  */
1065 struct buf *
1066 blkatoff(ip, offset, res)
1067 	struct inode *ip;
1068 	off_t offset;
1069 	char **res;
1070 {
1071 	register struct fs *fs = ip->i_fs;
1072 	daddr_t lbn = lblkno(fs, offset);
1073 	int bsize = blksize(fs, ip, lbn);
1074 	register struct buf *bp;
1075 	daddr_t bn;
1076 
1077 	bn = bmap(ip, lbn, B_READ, bsize);
1078 	if (u.u_error)
1079 		return (0);
1080 	if (bn == (daddr_t)-1) {
1081 		dirbad(ip, offset, "hole in dir");
1082 		return (0);
1083 	}
1084 	bp = bread(ip->i_dev, fsbtodb(fs, bn), bsize);
1085 	if (bp->b_flags & B_ERROR) {
1086 		brelse(bp);
1087 		return (0);
1088 	}
1089 	if (res)
1090 		*res = bp->b_un.b_addr + blkoff(fs, offset);
1091 	return (bp);
1092 }
1093 
1094 /*
1095  * Check if a directory is empty or not.
1096  * Inode supplied must be locked.
1097  *
1098  * Using a struct dirtemplate here is not precisely
1099  * what we want, but better than using a struct direct.
1100  *
1101  * NB: does not handle corrupted directories.
1102  */
1103 dirempty(ip, parentino)
1104 	register struct inode *ip;
1105 	ino_t parentino;
1106 {
1107 	register off_t off;
1108 	struct dirtemplate dbuf;
1109 	register struct direct *dp = (struct direct *)&dbuf;
1110 	int error, count;
1111 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
1112 
1113 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
1114 		if (dp->d_reclen <= 0)
1115 			return (0);
1116 		error = rdwri(UIO_READ, ip, (caddr_t)dp, MINDIRSIZ,
1117 		    off, 1, &count);
1118 		/*
1119 		 * Since we read MINDIRSIZ, residual must
1120 		 * be 0 unless we're at end of file.
1121 		 */
1122 		if (error || count != 0)
1123 			return (0);
1124 		/* skip empty entries */
1125 		if (dp->d_ino == 0)
1126 			continue;
1127 		/* accept only "." and ".." */
1128 		if (dp->d_namlen > 2)
1129 			return (0);
1130 		if (dp->d_name[0] != '.')
1131 			return (0);
1132 		/*
1133 		 * At this point d_namlen must be 1 or 2.
1134 		 * 1 implies ".", 2 implies ".." if second
1135 		 * char is also "."
1136 		 */
1137 		if (dp->d_namlen == 1)
1138 			continue;
1139 		if (dp->d_name[1] == '.' && dp->d_ino == parentino)
1140 			continue;
1141 		return (0);
1142 	}
1143 	return (1);
1144 }
1145 
1146 /*
1147  * Check if source directory is in the path of the target directory.
1148  * Target is supplied locked, source is unlocked.
1149  * The target is always iput() before returning.
1150  */
1151 checkpath(source, target)
1152 	struct inode *source, *target;
1153 {
1154 	struct dirtemplate dirbuf;
1155 	register struct inode *ip;
1156 	int error = 0;
1157 
1158 	ip = target;
1159 	if (ip->i_number == source->i_number) {
1160 		error = EEXIST;
1161 		goto out;
1162 	}
1163 	if (ip->i_number == ROOTINO)
1164 		goto out;
1165 
1166 	for (;;) {
1167 		if ((ip->i_mode&IFMT) != IFDIR) {
1168 			error = ENOTDIR;
1169 			break;
1170 		}
1171 		error = rdwri(UIO_READ, ip, (caddr_t)&dirbuf,
1172 			sizeof (struct dirtemplate), (off_t)0, 1, (int *)0);
1173 		if (error != 0)
1174 			break;
1175 		if (dirbuf.dotdot_namlen != 2 ||
1176 		    dirbuf.dotdot_name[0] != '.' ||
1177 		    dirbuf.dotdot_name[1] != '.') {
1178 			error = ENOTDIR;
1179 			break;
1180 		}
1181 		if (dirbuf.dotdot_ino == source->i_number) {
1182 			error = EINVAL;
1183 			break;
1184 		}
1185 		if (dirbuf.dotdot_ino == ROOTINO)
1186 			break;
1187 		iput(ip);
1188 		ip = iget(ip->i_dev, ip->i_fs, dirbuf.dotdot_ino);
1189 		if (ip == NULL) {
1190 			error = u.u_error;
1191 			break;
1192 		}
1193 	}
1194 
1195 out:
1196 	if (error == ENOTDIR)
1197 		printf("checkpath: .. not a directory\n");
1198 	if (ip != NULL)
1199 		iput(ip);
1200 	return (error);
1201 }
1202 
1203 /*
1204  * Name cache initialization, from main() when we are booting
1205  */
1206 nchinit()
1207 {
1208 	register union nchash *nchp;
1209 	register struct nch *ncp;
1210 
1211 	nchhead = 0;
1212 	nchtail = &nchhead;
1213 
1214 	for (ncp = nch; ncp < &nch[nchsize]; ncp++) {
1215 		ncp->nc_forw = ncp;			/* hash chain */
1216 		ncp->nc_back = ncp;
1217 
1218 		ncp->nc_nxt = NULL;			/* lru chain */
1219 		*nchtail = ncp;
1220 		ncp->nc_prev = nchtail;
1221 		nchtail = &ncp->nc_nxt;
1222 
1223 		/* all else is zero already */
1224 	}
1225 
1226 	for (nchp = nchash; nchp < &nchash[NCHHASH]; nchp++) {
1227 		nchp->nch_head[0] = nchp;
1228 		nchp->nch_head[1] = nchp;
1229 	}
1230 }
1231 
1232 /*
1233  * Cache flush, called when filesys is umounted to
1234  * remove entries that would now be invalid
1235  *
1236  * The line "nxtcp = nchhead" near the end is to avoid potential problems
1237  * if the cache lru chain is modified while we are dumping the
1238  * inode.  This makes the algorithm O(n^2), but do you think I care?
1239  */
1240 nchinval(dev)
1241 	register dev_t dev;
1242 {
1243 	register struct nch *ncp, *nxtcp;
1244 
1245 	for (ncp = nchhead; ncp; ncp = nxtcp) {
1246 		nxtcp = ncp->nc_nxt;
1247 
1248 		if (ncp->nc_ip == NULL ||
1249 		    (ncp->nc_idev != dev && ncp->nc_dev != dev))
1250 			continue;
1251 
1252 			/* free the resources we had */
1253 		ncp->nc_idev = NODEV;
1254 		ncp->nc_dev = NODEV;
1255 		ncp->nc_id = NULL;
1256 		ncp->nc_ino = 0;
1257 		ncp->nc_ip = NULL;
1258 
1259 
1260 			/* remove the entry from its hash chain */
1261 		remque(ncp);
1262 			/* and make a dummy one */
1263 		ncp->nc_forw = ncp;
1264 		ncp->nc_back = ncp;
1265 
1266 			/* delete this entry from LRU chain */
1267 		*ncp->nc_prev = nxtcp;
1268 		if (nxtcp)
1269 			nxtcp->nc_prev = ncp->nc_prev;
1270 		else
1271 			nchtail = ncp->nc_prev;
1272 
1273 			/* cause rescan of list, it may have altered */
1274 		nxtcp = nchhead;
1275 			/* put the now-free entry at head of LRU */
1276 		ncp->nc_nxt = nxtcp;
1277 		ncp->nc_prev = &nchhead;
1278 		nxtcp->nc_prev = &ncp->nc_nxt;
1279 		nchhead = ncp;
1280 	}
1281 }
1282 
1283 /*
1284  * Name cache invalidation of all entries.
1285  */
1286 cacheinvalall()
1287 {
1288 	register struct nch *ncp;
1289 
1290 	for (ncp = nch; ncp < &nch[nchsize]; ncp++) {
1291 		ncp->nc_id = 0;
1292 	}
1293 }
1294