xref: /csrg-svn/sys/kern/vfs_lookup.c (revision 37582)
1 /*
2  * Copyright (c) 1982, 1986 Regents of the University of California.
3  * All rights reserved.  The Berkeley software License Agreement
4  * specifies the terms and conditions for redistribution.
5  *
6  *	@(#)vfs_lookup.c	7.6 (Berkeley) 05/01/89
7  */
8 
9 #include "param.h"
10 #include "systm.h"
11 #include "inode.h"
12 #include "fs.h"
13 #include "mount.h"
14 #include "dir.h"
15 #include "user.h"
16 #include "buf.h"
17 #include "conf.h"
18 #include "uio.h"
19 #include "kernel.h"
20 #include "malloc.h"
21 #ifdef KTRACE
22 #include "proc.h"
23 #include "ktrace.h"
24 #endif
25 
26 struct	buf *blkatoff();
27 int	dirchk = 1;
28 
29 /*
30  * Structures associated with name cacheing.
31  */
32 #define	NCHHASH		32	/* size of hash table */
33 
34 #if	((NCHHASH)&((NCHHASH)-1)) != 0
35 #define	NHASH(h, i, d)	((unsigned)((h) + (i) + 13 * (int)(d)) % (NCHHASH))
36 #else
37 #define	NHASH(h, i, d)	((unsigned)((h) + (i) + 13 * (int)(d)) & ((NCHHASH)-1))
38 #endif
39 
40 union nchash {
41 	union	nchash *nch_head[2];
42 	struct	namecache *nch_chain[2];
43 } nchash[NCHHASH];
44 #define	nch_forw	nch_chain[0]
45 #define	nch_back	nch_chain[1]
46 
47 struct	namecache *nchhead, **nchtail;	/* LRU chain pointers */
48 struct	nchstats nchstats;		/* cache effectiveness statistics */
49 
50 /*
51  * Convert a pathname into a pointer to a locked inode.
52  * This is a very central and rather complicated routine.
53  * If the file system is not maintained in a strict tree hierarchy,
54  * this can result in a deadlock situation (see comments in code below).
55  *
56  * The flag argument is LOOKUP, CREATE, or DELETE depending on whether
57  * the name is to be looked up, created, or deleted. When CREATE or
58  * DELETE is specified, information usable in creating or deleteing a
59  * directory entry is also calculated. If flag has LOCKPARENT or'ed
60  * into it and the target of the pathname exists, namei returns both
61  * the target and its parent directory locked. When creating and
62  * LOCKPARENT is specified, the target may not be ".".  When deleting
63  * and LOCKPARENT is specified, the target may be ".", but the caller
64  * must check to insure it does an irele and iput instead of two iputs.
65  *
66  * The FOLLOW flag is set when symbolic links are to be followed
67  * when they occur at the end of the name translation process.
68  * Symbolic links are always followed for all other pathname
69  * components other than the last.
70  *
71  * The segflg defines whether the name is to be copied from user
72  * space or kernel space.
73  *
74  * Name caching works as follows:
75  *
76  * Names found by directory scans are retained in a cache
77  * for future reference.  It is managed LRU, so frequently
78  * used names will hang around.  Cache is indexed by hash value
79  * obtained from (ino,dev,name) where ino & dev refer to the
80  * directory containing name.
81  *
82  * For simplicity (and economy of storage), names longer than
83  * a maximum length of NCHNAMLEN are not cached; they occur
84  * infrequently in any case, and are almost never of interest.
85  *
86  * Upon reaching the last segment of a path, if the reference
87  * is for DELETE, or NOCACHE is set (rewrite), and the
88  * name is located in the cache, it will be dropped.
89  *
90  * Overall outline of namei:
91  *
92  *	copy in name
93  *	get starting directory
94  * dirloop:
95  *	check accessibility of directory
96  * dirloop2:
97  *	copy next component of name to ndp->ni_dent
98  *	handle degenerate case where name is null string
99  *	look for name in cache, if found, then if at end of path
100  *	  and deleting or creating, drop it, else to haveino
101  *	search for name in directory, to found or notfound
102  * notfound:
103  *	if creating, return locked directory, leaving info on avail. slots
104  *	else return error
105  * found:
106  *	if at end of path and deleting, return information to allow delete
107  *	if at end of path and rewriting (CREATE and LOCKPARENT), lock target
108  *	  inode and return info to allow rewrite
109  *	if .. and on mounted filesys, look in mount table for parent
110  *	if not at end, add name to cache; if at end and neither creating
111  *	  nor deleting, add name to cache
112  * haveino:
113  *	if symbolic link, massage name in buffer and continue at dirloop
114  *	if more components of name, do next level at dirloop
115  *	return the answer as locked inode
116  *
117  * NOTE: (LOOKUP | LOCKPARENT) currently returns the parent inode,
118  *	 but unlocked.
119  */
120 struct inode *
121 namei(ndp)
122 	register struct nameidata *ndp;
123 {
124 	register char *cp;		/* pointer into pathname argument */
125 /* these variables refer to things which must be freed or unlocked */
126 	register struct inode *dp = 0;	/* the directory we are searching */
127 	register struct namecache *ncp;	/* cache slot for entry */
128 	register struct fs *fs;		/* file system that directory is in */
129 	register struct buf *bp = 0;	/* a buffer of directory entries */
130 	register struct direct *ep;	/* the current directory entry */
131 	int entryoffsetinblock;		/* offset of ep in bp's buffer */
132 	register caddr_t nbp;		/* buffer storing path name argument */
133 /* these variables hold information about the search for a slot */
134 	enum {NONE, COMPACT, FOUND} slotstatus;
135 	int slotoffset = -1;		/* offset of area with free space */
136 	int slotsize;			/* size of area at slotoffset */
137 	int slotfreespace;		/* amount of space free in slot */
138 	int slotneeded;			/* size of the entry we're seeking */
139 /* */
140 	int numdirpasses;		/* strategy for directory search */
141 	int endsearch;			/* offset to end directory search */
142 	int prevoff;			/* ndp->ni_offset of previous entry */
143 	int nlink = 0;			/* number of symbolic links taken */
144 	struct inode *pdp;		/* saved dp during symlink work */
145 	int error, i;
146 	int lockparent;
147 	int docache;			/* == 0 do not cache last component */
148 	int makeentry;			/* != 0 if name to be added to cache */
149 	unsigned hash;			/* value of name hash for entry */
150 	union nchash *nhp;		/* cache chain head for entry */
151 	int isdotdot;			/* != 0 if current name is ".." */
152 	int flag;			/* op ie, LOOKUP, CREATE, or DELETE */
153 	off_t enduseful;		/* pointer past last used dir slot */
154 
155 	lockparent = ndp->ni_nameiop & LOCKPARENT;
156 	docache = (ndp->ni_nameiop & NOCACHE) ^ NOCACHE;
157 	flag = ndp->ni_nameiop &~ (LOCKPARENT|NOCACHE|FOLLOW);
158 	if (flag == DELETE || lockparent)
159 		docache = 0;
160 	/*
161 	 * Get a buffer for the name to be translated, and copy the
162 	 * name into the buffer.
163 	 */
164 	MALLOC(nbp, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK);
165 	if (ndp->ni_segflg == UIO_SYSSPACE)
166 		error = copystr(ndp->ni_dirp, nbp, MAXPATHLEN, (u_int *)0);
167 	else
168 		error = copyinstr(ndp->ni_dirp, nbp, MAXPATHLEN, (u_int *)0);
169 	if (error) {
170 		u.u_error = error;
171 		goto bad;
172 	}
173 #ifdef KTRACE
174 	if (KTRPOINT(u.u_procp, KTR_NAMEI))
175 		ktrnamei(u.u_procp->p_tracep, nbp);
176 #endif
177 
178 	/*
179 	 * Get starting directory.
180 	 */
181 	cp = nbp;
182 	if (*cp == '/') {
183 		while (*cp == '/')
184 			cp++;
185 		if ((dp = u.u_rdir) == NULL)
186 			dp = rootdir;
187 	} else
188 		dp = u.u_cdir;
189 	fs = dp->i_fs;
190 	ILOCK(dp);
191 	dp->i_count++;
192 	ndp->ni_pdir = (struct inode *)0xc0000000;		/* illegal */
193 	ndp->ni_endoff = 0;
194 
195 	/*
196 	 * We come to dirloop to search a new directory.
197 	 * The directory must be locked so that it can be
198 	 * iput, and fs must be already set to dp->i_fs.
199 	 */
200 dirloop:
201 	/*
202 	 * Check accessiblity of directory.
203 	 */
204 	if ((dp->i_mode&IFMT) != IFDIR) {
205 		u.u_error = ENOTDIR;
206 		goto bad;
207 	}
208 	if (access(dp, IEXEC))
209 		goto bad;
210 
211 dirloop2:
212 	/*
213 	 * Copy next component of name to ndp->ni_dent.
214 	 */
215 	hash = 0;
216 	for (i = 0; *cp != 0 && *cp != '/'; cp++) {
217 		if (i >= MAXNAMLEN) {
218 			u.u_error = ENAMETOOLONG;
219 			goto bad;
220 		}
221 		if (*cp & 0200)
222 			if ((*cp&0377) == ('/'|0200) || flag != DELETE) {
223 				u.u_error = EINVAL;
224 				goto bad;
225 			}
226 		ndp->ni_dent.d_name[i++] = *cp;
227 		hash += (unsigned char)*cp * i;
228 	}
229 	ndp->ni_dent.d_namlen = i;
230 	ndp->ni_dent.d_name[i] = '\0';
231 	isdotdot = (i == 2 &&
232 		ndp->ni_dent.d_name[0] == '.' && ndp->ni_dent.d_name[1] == '.');
233 	makeentry = 1;
234 	if (*cp == '\0' && docache == 0)
235 		makeentry = 0;
236 
237 	/*
238 	 * Check for degenerate name (e.g. / or "")
239 	 * which is a way of talking about a directory,
240 	 * e.g. like "/." or ".".
241 	 */
242 	if (ndp->ni_dent.d_name[0] == '\0') {
243 		if (flag != LOOKUP || lockparent) {
244 			u.u_error = EISDIR;
245 			goto bad;
246 		}
247 		FREE(nbp, M_NAMEI);
248 		return (dp);
249 	}
250 
251 	/*
252 	 * Special handling for ".." allowing chdir out of mounted
253 	 * file system: indirect .. in root inode to reevaluate
254 	 * in directory file system was mounted on.
255 	 */
256 	if (isdotdot) {
257 		for (;;) {
258 			if (dp == u.u_rdir || dp == rootdir) {
259 				ndp->ni_dent.d_ino = dp->i_number;
260 				pdp = dp;
261 				dp->i_count++;
262 				goto haveino;
263 			}
264 			if (dp->i_number != ROOTINO)
265 				break;
266 			for (i = 1; i < NMOUNT; i++) {
267 				if (mount[i].m_fs != NULL &&
268 				    mount[i].m_dev == dp->i_dev) {
269 					iput(dp);
270 					dp = mount[i].m_inodp;
271 					ILOCK(dp);
272 					dp->i_count++;
273 					fs = dp->i_fs;
274 					break;
275 				}
276 			}
277 		}
278 	}
279 
280 	/*
281 	 * We now have a segment name to search for, and a directory to search.
282 	 *
283 	 * Before tediously performing a linear scan of the directory,
284 	 * check the name cache to see if the directory/name pair
285 	 * we are looking for is known already.  We don't do this
286 	 * if the segment name is long, simply so the cache can avoid
287 	 * holding long names (which would either waste space, or
288 	 * add greatly to the complexity).
289 	 */
290 	if (ndp->ni_dent.d_namlen > NCHNAMLEN) {
291 		nchstats.ncs_long++;
292 		makeentry = 0;
293 	} else {
294 		nhp = &nchash[NHASH(hash, dp->i_number, dp->i_dev)];
295 		for (ncp = nhp->nch_forw; ncp != (struct namecache *)nhp;
296 		    ncp = ncp->nc_forw) {
297 			if (ncp->nc_ino == dp->i_number &&
298 			    ncp->nc_dev == dp->i_dev &&
299 			    ncp->nc_nlen == ndp->ni_dent.d_namlen &&
300 			    !bcmp(ncp->nc_name, ndp->ni_dent.d_name,
301 				(unsigned)ncp->nc_nlen))
302 				break;
303 		}
304 		if (ncp == (struct namecache *)nhp) {
305 			nchstats.ncs_miss++;
306 			ncp = NULL;
307 		} else {
308 			if (ncp->nc_id != ncp->nc_ip->i_id)
309 				nchstats.ncs_falsehits++;
310 			else if (!makeentry)
311 				nchstats.ncs_badhits++;
312 			else {
313 				/*
314 				 * move this slot to end of LRU
315 				 * chain, if not already there
316 				 */
317 				if (ncp->nc_nxt) {
318 					/* remove from LRU chain */
319 					*ncp->nc_prev = ncp->nc_nxt;
320 					ncp->nc_nxt->nc_prev = ncp->nc_prev;
321 
322 					/* and replace at end of it */
323 					ncp->nc_nxt = NULL;
324 					ncp->nc_prev = nchtail;
325 					*nchtail = ncp;
326 					nchtail = &ncp->nc_nxt;
327 				}
328 
329 				/*
330 				 * Get the next inode in the path.
331 				 * See comment above other `IUNLOCK' code for
332 				 * an explaination of the locking protocol.
333 				 */
334 				pdp = dp;
335 				if (!isdotdot || dp != u.u_rdir)
336 					dp = ncp->nc_ip;
337 				if (dp == NULL)
338 					panic("namei: null cache ino");
339 				if (pdp == dp)
340 					dp->i_count++;
341 				else if (isdotdot) {
342 					IUNLOCK(pdp);
343 					igrab(dp);
344 				} else {
345 					igrab(dp);
346 					IUNLOCK(pdp);
347 				}
348 
349 				/*
350 				 * Verify that the inode that we got
351 				 * did not change while we were waiting
352 				 * for it to be locked.
353 				 */
354 				if (ncp->nc_id != ncp->nc_ip->i_id) {
355 					iput(dp);
356 					ILOCK(pdp);
357 					dp = pdp;
358 					nchstats.ncs_falsehits++;
359 				} else {
360 					ndp->ni_dent.d_ino = dp->i_number;
361 					/* ni_dent.d_reclen is garbage ... */
362 					nchstats.ncs_goodhits++;
363 					goto haveino;
364 				}
365 			}
366 
367 			/*
368 			 * Last component and we are renaming or deleting,
369 			 * the cache entry is invalid, or otherwise don't
370 			 * want cache entry to exist.
371 			 */
372 			/* remove from LRU chain */
373 			*ncp->nc_prev = ncp->nc_nxt;
374 			if (ncp->nc_nxt)
375 				ncp->nc_nxt->nc_prev = ncp->nc_prev;
376 			else
377 				nchtail = ncp->nc_prev;
378 			remque(ncp);		/* remove from hash chain */
379 			/* insert at head of LRU list (first to grab) */
380 			ncp->nc_nxt = nchhead;
381 			ncp->nc_prev = &nchhead;
382 			nchhead->nc_prev = &ncp->nc_nxt;
383 			nchhead = ncp;
384 			/* and make a dummy hash chain */
385 			ncp->nc_forw = ncp;
386 			ncp->nc_back = ncp;
387 			ncp = NULL;
388 		}
389 	}
390 
391 	/*
392 	 * Suppress search for slots unless creating
393 	 * file and at end of pathname, in which case
394 	 * we watch for a place to put the new file in
395 	 * case it doesn't already exist.
396 	 */
397 	slotstatus = FOUND;
398 	if (flag == CREATE && *cp == 0) {
399 		slotstatus = NONE;
400 		slotfreespace = 0;
401 		slotneeded = DIRSIZ(&ndp->ni_dent);
402 	}
403 	/*
404 	 * If this is the same directory that this process
405 	 * previously searched, pick up where we last left off.
406 	 * We cache only lookups as these are the most common
407 	 * and have the greatest payoff. Caching CREATE has little
408 	 * benefit as it usually must search the entire directory
409 	 * to determine that the entry does not exist. Caching the
410 	 * location of the last DELETE has not reduced profiling time
411 	 * and hence has been removed in the interest of simplicity.
412 	 */
413 	if (flag != LOOKUP || dp->i_number != u.u_ncache.nc_inumber ||
414 	    dp->i_dev != u.u_ncache.nc_dev) {
415 		ndp->ni_offset = 0;
416 		numdirpasses = 1;
417 	} else {
418 		if (u.u_ncache.nc_prevoffset > dp->i_size)
419 			u.u_ncache.nc_prevoffset = 0;
420 		ndp->ni_offset = u.u_ncache.nc_prevoffset;
421 		entryoffsetinblock = blkoff(fs, ndp->ni_offset);
422 		if (entryoffsetinblock != 0) {
423 			bp = blkatoff(dp, ndp->ni_offset, (char **)0);
424 			if (bp == 0)
425 				goto bad;
426 		}
427 		numdirpasses = 2;
428 		nchstats.ncs_2passes++;
429 	}
430 	endsearch = roundup(dp->i_size, DIRBLKSIZ);
431 	enduseful = 0;
432 
433 #ifdef NAMEI_DIAGNOSTIC
434 	printf("{%s}:\n", ndp->ni_dent.d_name);
435 #endif
436 searchloop:
437 	while (ndp->ni_offset < endsearch) {
438 		/*
439 		 * If offset is on a block boundary,
440 		 * read the next directory block.
441 		 * Release previous if it exists.
442 		 */
443 		if (blkoff(fs, ndp->ni_offset) == 0) {
444 			if (bp != NULL)
445 				brelse(bp);
446 			bp = blkatoff(dp, ndp->ni_offset, (char **)0);
447 			if (bp == 0)
448 				goto bad;
449 			entryoffsetinblock = 0;
450 		}
451 		/*
452 		 * If still looking for a slot, and at a DIRBLKSIZE
453 		 * boundary, have to start looking for free space again.
454 		 */
455 		if (slotstatus == NONE &&
456 		    (entryoffsetinblock&(DIRBLKSIZ-1)) == 0) {
457 			slotoffset = -1;
458 			slotfreespace = 0;
459 		}
460 		/*
461 		 * Get pointer to next entry.
462 		 * Full validation checks are slow, so we only check
463 		 * enough to insure forward progress through the
464 		 * directory. Complete checks can be run by patching
465 		 * "dirchk" to be true.
466 		 */
467 		ep = (struct direct *)(bp->b_un.b_addr + entryoffsetinblock);
468 		if (ep->d_reclen == 0 ||
469 		    dirchk && dirbadentry(ep, entryoffsetinblock)) {
470 			dirbad(dp, ndp->ni_offset, "mangled entry");
471 			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
472 			ndp->ni_offset += i;
473 			entryoffsetinblock += i;
474 			continue;
475 		}
476 
477 		/*
478 		 * If an appropriate sized slot has not yet been found,
479 		 * check to see if one is available. Also accumulate space
480 		 * in the current block so that we can determine if
481 		 * compaction is viable.
482 		 */
483 		if (slotstatus != FOUND) {
484 			int size = ep->d_reclen;
485 
486 			if (ep->d_ino != 0)
487 				size -= DIRSIZ(ep);
488 			if (size > 0) {
489 				if (size >= slotneeded) {
490 					slotstatus = FOUND;
491 					slotoffset = ndp->ni_offset;
492 					slotsize = ep->d_reclen;
493 				} else if (slotstatus == NONE) {
494 					slotfreespace += size;
495 					if (slotoffset == -1)
496 						slotoffset = ndp->ni_offset;
497 					if (slotfreespace >= slotneeded) {
498 						slotstatus = COMPACT;
499 						slotsize = ndp->ni_offset +
500 						      ep->d_reclen - slotoffset;
501 					}
502 				}
503 			}
504 		}
505 
506 		/*
507 		 * Check for a name match.
508 		 */
509 		if (ep->d_ino) {
510 #ifdef NAMEI_DIAGNOSTIC
511 			printf("{%s} ", ep->d_name);
512 #endif
513 			if (ep->d_namlen == ndp->ni_dent.d_namlen &&
514 			    !bcmp(ndp->ni_dent.d_name, ep->d_name,
515 				(unsigned)ep->d_namlen))
516 				goto found;
517 		}
518 		prevoff = ndp->ni_offset;
519 		ndp->ni_offset += ep->d_reclen;
520 		entryoffsetinblock += ep->d_reclen;
521 		if (ep->d_ino)
522 			enduseful = ndp->ni_offset;
523 	}
524 #ifdef NAMEI_DIAGNOSTIC
525 	printf("\nnotfound\n");
526 #endif
527 /* notfound: */
528 	/*
529 	 * If we started in the middle of the directory and failed
530 	 * to find our target, we must check the beginning as well.
531 	 */
532 	if (numdirpasses == 2) {
533 		numdirpasses--;
534 		ndp->ni_offset = 0;
535 		endsearch = u.u_ncache.nc_prevoffset;
536 		goto searchloop;
537 	}
538 	/*
539 	 * If creating, and at end of pathname and current
540 	 * directory has not been removed, then can consider
541 	 * allowing file to be created.
542 	 */
543 	if (flag == CREATE && *cp == 0 && dp->i_nlink != 0) {
544 		/*
545 		 * Access for write is interpreted as allowing
546 		 * creation of files in the directory.
547 		 */
548 		if (access(dp, IWRITE))
549 			goto bad;
550 		/*
551 		 * Return an indication of where the new directory
552 		 * entry should be put.  If we didn't find a slot,
553 		 * then set ndp->ni_count to 0 indicating that the new
554 		 * slot belongs at the end of the directory. If we found
555 		 * a slot, then the new entry can be put in the range
556 		 * [ndp->ni_offset .. ndp->ni_offset + ndp->ni_count)
557 		 */
558 		if (slotstatus == NONE) {
559 			ndp->ni_offset = roundup(dp->i_size, DIRBLKSIZ);
560 			ndp->ni_count = 0;
561 			enduseful = ndp->ni_offset;
562 		} else {
563 			ndp->ni_offset = slotoffset;
564 			ndp->ni_count = slotsize;
565 			if (enduseful < slotoffset + slotsize)
566 				enduseful = slotoffset + slotsize;
567 		}
568 		ndp->ni_endoff = roundup(enduseful, DIRBLKSIZ);
569 		dp->i_flag |= IUPD|ICHG;
570 		if (bp)
571 			brelse(bp);
572 		FREE(nbp, M_NAMEI);
573 		/*
574 		 * We return with the directory locked, so that
575 		 * the parameters we set up above will still be
576 		 * valid if we actually decide to do a direnter().
577 		 * We return NULL to indicate that the entry doesn't
578 		 * currently exist, leaving a pointer to the (locked)
579 		 * directory inode in ndp->ni_pdir.
580 		 */
581 		ndp->ni_pdir = dp;
582 		return (NULL);
583 	}
584 	u.u_error = ENOENT;
585 	goto bad;
586 found:
587 #ifdef NAMEI_DIAGNOSTIC
588 	printf("\nfound\n");
589 #endif
590 	if (numdirpasses == 2)
591 		nchstats.ncs_pass2++;
592 	/*
593 	 * Check that directory length properly reflects presence
594 	 * of this entry.
595 	 */
596 	if (entryoffsetinblock + DIRSIZ(ep) > dp->i_size) {
597 		dirbad(dp, ndp->ni_offset, "i_size too small");
598 		dp->i_size = entryoffsetinblock + DIRSIZ(ep);
599 		dp->i_flag |= IUPD|ICHG;
600 	}
601 
602 	/*
603 	 * Found component in pathname.
604 	 * If the final component of path name, save information
605 	 * in the cache as to where the entry was found.
606 	 */
607 	if (*cp == '\0' && flag == LOOKUP) {
608 		u.u_ncache.nc_prevoffset = ndp->ni_offset &~ (DIRBLKSIZ - 1);
609 		u.u_ncache.nc_inumber = dp->i_number;
610 		u.u_ncache.nc_dev = dp->i_dev;
611 	}
612 	/*
613 	 * Save directory entry's inode number and reclen in ndp->ni_dent,
614 	 * and release directory buffer.
615 	 */
616 	ndp->ni_dent.d_ino = ep->d_ino;
617 	ndp->ni_dent.d_reclen = ep->d_reclen;
618 	brelse(bp);
619 	bp = NULL;
620 
621 	/*
622 	 * If deleting, and at end of pathname, return
623 	 * parameters which can be used to remove file.
624 	 * If the lockparent flag isn't set, we return only
625 	 * the directory (in ndp->ni_pdir), otherwise we go
626 	 * on and lock the inode, being careful with ".".
627 	 */
628 	if (flag == DELETE && *cp == 0) {
629 		/*
630 		 * Write access to directory required to delete files.
631 		 */
632 		if (access(dp, IWRITE))
633 			goto bad;
634 		ndp->ni_pdir = dp;		/* for dirremove() */
635 		/*
636 		 * Return pointer to current entry in ndp->ni_offset,
637 		 * and distance past previous entry (if there
638 		 * is a previous entry in this block) in ndp->ni_count.
639 		 * Save directory inode pointer in ndp->ni_pdir for dirremove().
640 		 */
641 		if ((ndp->ni_offset&(DIRBLKSIZ-1)) == 0)
642 			ndp->ni_count = 0;
643 		else
644 			ndp->ni_count = ndp->ni_offset - prevoff;
645 		if (lockparent) {
646 			if (dp->i_number == ndp->ni_dent.d_ino)
647 				dp->i_count++;
648 			else {
649 				dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
650 				if (dp == NULL) {
651 					iput(ndp->ni_pdir);
652 					goto bad;
653 				}
654 				/*
655 				 * If directory is "sticky", then user must own
656 				 * the directory, or the file in it, else he
657 				 * may not delete it (unless he's root). This
658 				 * implements append-only directories.
659 				 */
660 				if ((ndp->ni_pdir->i_mode & ISVTX) &&
661 				    u.u_uid != 0 &&
662 				    u.u_uid != ndp->ni_pdir->i_uid &&
663 				    dp->i_uid != u.u_uid) {
664 					iput(ndp->ni_pdir);
665 					u.u_error = EPERM;
666 					goto bad;
667 				}
668 			}
669 		}
670 		FREE(nbp, M_NAMEI);
671 		return (dp);
672 	}
673 
674 	/*
675 	 * If rewriting (rename), return the inode and the
676 	 * information required to rewrite the present directory
677 	 * Must get inode of directory entry to verify it's a
678 	 * regular file, or empty directory.
679 	 */
680 	if ((flag == CREATE && lockparent) && *cp == 0) {
681 		if (access(dp, IWRITE))
682 			goto bad;
683 		ndp->ni_pdir = dp;		/* for dirrewrite() */
684 		/*
685 		 * Careful about locking second inode.
686 		 * This can only occur if the target is ".".
687 		 */
688 		if (dp->i_number == ndp->ni_dent.d_ino) {
689 			u.u_error = EISDIR;		/* XXX */
690 			goto bad;
691 		}
692 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
693 		if (dp == NULL) {
694 			iput(ndp->ni_pdir);
695 			goto bad;
696 		}
697 		FREE(nbp, M_NAMEI);
698 		return (dp);
699 	}
700 
701 	/*
702 	 * Check for symbolic link, which may require us to massage the
703 	 * name before we continue translation.  We do not `iput' the
704 	 * directory because we may need it again if the symbolic link
705 	 * is relative to the current directory.  Instead we save it
706 	 * unlocked as "pdp".  We must get the target inode before unlocking
707 	 * the directory to insure that the inode will not be removed
708 	 * before we get it.  We prevent deadlock by always fetching
709 	 * inodes from the root, moving down the directory tree. Thus
710 	 * when following backward pointers ".." we must unlock the
711 	 * parent directory before getting the requested directory.
712 	 * There is a potential race condition here if both the current
713 	 * and parent directories are removed before the `iget' for the
714 	 * inode associated with ".." returns.  We hope that this occurs
715 	 * infrequently since we cannot avoid this race condition without
716 	 * implementing a sophisticated deadlock detection algorithm.
717 	 * Note also that this simple deadlock detection scheme will not
718 	 * work if the file system has any hard links other than ".."
719 	 * that point backwards in the directory structure.
720 	 */
721 	pdp = dp;
722 	if (isdotdot) {
723 		IUNLOCK(pdp);	/* race to get the inode */
724 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
725 		if (dp == NULL)
726 			goto bad2;
727 	} else if (dp->i_number == ndp->ni_dent.d_ino) {
728 		dp->i_count++;	/* we want ourself, ie "." */
729 	} else {
730 		dp = iget(dp->i_dev, fs, ndp->ni_dent.d_ino);
731 		IUNLOCK(pdp);
732 		if (dp == NULL)
733 			goto bad2;
734 	}
735 
736 	/*
737 	 * Insert name into cache if appropriate.
738 	 */
739 	if (makeentry) {
740 		if (ncp != NULL)
741 			panic("namei: duplicating cache");
742 		/*
743 		 * Free the cache slot at head of lru chain.
744 		 */
745 		if (ncp = nchhead) {
746 			/* remove from lru chain */
747 			*ncp->nc_prev = ncp->nc_nxt;
748 			if (ncp->nc_nxt)
749 				ncp->nc_nxt->nc_prev = ncp->nc_prev;
750 			else
751 				nchtail = ncp->nc_prev;
752 			remque(ncp);		/* remove from old hash chain */
753 			/* grab the inode we just found */
754 			ncp->nc_ip = dp;
755 			/* fill in cache info */
756 			ncp->nc_ino = pdp->i_number;	/* parents inum */
757 			ncp->nc_dev = pdp->i_dev;	/* & device */
758 			ncp->nc_idev = dp->i_dev;	/* our device */
759 			ncp->nc_id = dp->i_id;		/* identifier */
760 			ncp->nc_nlen = ndp->ni_dent.d_namlen;
761 			bcopy(ndp->ni_dent.d_name, ncp->nc_name,
762 			    (unsigned)ncp->nc_nlen);
763 			/* link at end of lru chain */
764 			ncp->nc_nxt = NULL;
765 			ncp->nc_prev = nchtail;
766 			*nchtail = ncp;
767 			nchtail = &ncp->nc_nxt;
768 			/* and insert on hash chain */
769 			insque(ncp, nhp);
770 		}
771 	}
772 
773 haveino:
774 	fs = dp->i_fs;
775 
776 	/*
777 	 * Check for symbolic link
778 	 */
779 	if ((dp->i_mode & IFMT) == IFLNK &&
780 	    ((ndp->ni_nameiop & FOLLOW) || *cp == '/')) {
781 		u_int pathlen = strlen(cp) + 1;
782 
783 		if (dp->i_size + pathlen >= MAXPATHLEN - 1) {
784 			u.u_error = ENAMETOOLONG;
785 			goto bad2;
786 		}
787 		if (++nlink > MAXSYMLINKS) {
788 			u.u_error = ELOOP;
789 			goto bad2;
790 		}
791 		ovbcopy(cp, nbp + dp->i_size, pathlen);
792 		u.u_error =
793 		    rdwri(UIO_READ, dp, nbp, (int)dp->i_size,
794 			(off_t)0, 1, (int *)0);
795 		if (u.u_error)
796 			goto bad2;
797 		cp = nbp;
798 		iput(dp);
799 		if (*cp == '/') {
800 			irele(pdp);
801 			while (*cp == '/')
802 				cp++;
803 			if ((dp = u.u_rdir) == NULL)
804 				dp = rootdir;
805 			ILOCK(dp);
806 			dp->i_count++;
807 		} else {
808 			dp = pdp;
809 			ILOCK(dp);
810 		}
811 		fs = dp->i_fs;
812 		goto dirloop;
813 	}
814 
815 	/*
816 	 * Not a symbolic link.  If more pathname,
817 	 * continue at next component, else return.
818 	 */
819 	if (*cp == '/') {
820 		while (*cp == '/')
821 			cp++;
822 		irele(pdp);
823 		goto dirloop;
824 	}
825 	FREE(nbp, M_NAMEI);
826 	if (lockparent)
827 		ndp->ni_pdir = pdp;
828 	else
829 		irele(pdp);
830 	return (dp);
831 bad2:
832 	irele(pdp);
833 bad:
834 	if (bp)
835 		brelse(bp);
836 	if (dp)
837 		iput(dp);
838 	FREE(nbp, M_NAMEI);
839 	return (NULL);
840 }
841 
842 
843 dirbad(ip, offset, how)
844 	struct inode *ip;
845 	off_t offset;
846 	char *how;
847 {
848 
849 	printf("%s: bad dir ino %d at offset %d: %s\n",
850 	    ip->i_fs->fs_fsmnt, ip->i_number, offset, how);
851 }
852 
853 /*
854  * Do consistency checking on a directory entry:
855  *	record length must be multiple of 4
856  *	entry must fit in rest of its DIRBLKSIZ block
857  *	record must be large enough to contain entry
858  *	name is not longer than MAXNAMLEN
859  *	name must be as long as advertised, and null terminated
860  */
861 dirbadentry(ep, entryoffsetinblock)
862 	register struct direct *ep;
863 	int entryoffsetinblock;
864 {
865 	register int i;
866 
867 	if ((ep->d_reclen & 0x3) != 0 ||
868 	    ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) ||
869 	    ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN)
870 		return (1);
871 	for (i = 0; i < ep->d_namlen; i++)
872 		if (ep->d_name[i] == '\0')
873 			return (1);
874 	return (ep->d_name[i]);
875 }
876 
877 /*
878  * Write a directory entry after a call to namei, using the parameters
879  * which it left in the u. area.  The argument ip is the inode which
880  * the new directory entry will refer to.  The u. area field ndp->ni_pdir is
881  * a pointer to the directory to be written, which was left locked by
882  * namei.  Remaining parameters (ndp->ni_offset, ndp->ni_count) indicate
883  * how the space for the new entry is to be gotten.
884  */
885 direnter(ip, ndp)
886 	struct inode *ip;
887 	register struct nameidata *ndp;
888 {
889 	register struct direct *ep, *nep;
890 	register struct inode *dp = ndp->ni_pdir;
891 	struct buf *bp;
892 	int loc, spacefree, error = 0;
893 	u_int dsize;
894 	int newentrysize;
895 	char *dirbuf;
896 
897 	ndp->ni_dent.d_ino = ip->i_number;
898 	newentrysize = DIRSIZ(&ndp->ni_dent);
899 	if (ndp->ni_count == 0) {
900 		/*
901 		 * If ndp->ni_count is 0, then namei could find no space in the
902 		 * directory. In this case ndp->ni_offset will be on a directory
903 		 * block boundary and we will write the new entry into a fresh
904 		 * block.
905 		 */
906 		if (ndp->ni_offset&(DIRBLKSIZ-1))
907 			panic("wdir: newblk");
908 		ndp->ni_dent.d_reclen = DIRBLKSIZ;
909 		error = rdwri(UIO_WRITE, dp, (caddr_t)&ndp->ni_dent,
910 		    newentrysize, ndp->ni_offset, 1, (int *)0);
911 		if (DIRBLKSIZ > dp->i_fs->fs_fsize)
912 			panic("wdir: blksize"); /* XXX - should grow w/bmap() */
913 		else
914 			dp->i_size = roundup(dp->i_size, DIRBLKSIZ);
915 		iput(dp);
916 		return (error);
917 	}
918 
919 	/*
920 	 * If ndp->ni_count is non-zero, then namei found space for the new
921 	 * entry in the range ndp->ni_offset to ndp->ni_offset + ndp->ni_count.
922 	 * in the directory.  To use this space, we may have to compact
923 	 * the entries located there, by copying them together towards
924 	 * the beginning of the block, leaving the free space in
925 	 * one usable chunk at the end.
926 	 */
927 
928 	/*
929 	 * Increase size of directory if entry eats into new space.
930 	 * This should never push the size past a new multiple of
931 	 * DIRBLKSIZE.
932 	 *
933 	 * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN.
934 	 */
935 	if (ndp->ni_offset + ndp->ni_count > dp->i_size)
936 		dp->i_size = ndp->ni_offset + ndp->ni_count;
937 	/*
938 	 * Get the block containing the space for the new directory
939 	 * entry.  Should return error by result instead of u.u_error.
940 	 */
941 	bp = blkatoff(dp, ndp->ni_offset, (char **)&dirbuf);
942 	if (bp == 0) {
943 		iput(dp);
944 		return (u.u_error);
945 	}
946 	/*
947 	 * Find space for the new entry.  In the simple case, the
948 	 * entry at offset base will have the space.  If it does
949 	 * not, then namei arranged that compacting the region
950 	 * ndp->ni_offset to ndp->ni_offset+ndp->ni_count would yield the space.
951 	 */
952 	ep = (struct direct *)dirbuf;
953 	dsize = DIRSIZ(ep);
954 	spacefree = ep->d_reclen - dsize;
955 	for (loc = ep->d_reclen; loc < ndp->ni_count; ) {
956 		nep = (struct direct *)(dirbuf + loc);
957 		if (ep->d_ino) {
958 			/* trim the existing slot */
959 			ep->d_reclen = dsize;
960 			ep = (struct direct *)((char *)ep + dsize);
961 		} else {
962 			/* overwrite; nothing there; header is ours */
963 			spacefree += dsize;
964 		}
965 		dsize = DIRSIZ(nep);
966 		spacefree += nep->d_reclen - dsize;
967 		loc += nep->d_reclen;
968 		bcopy((caddr_t)nep, (caddr_t)ep, dsize);
969 	}
970 	/*
971 	 * Update the pointer fields in the previous entry (if any),
972 	 * copy in the new entry, and write out the block.
973 	 */
974 	if (ep->d_ino == 0) {
975 		if (spacefree + dsize < newentrysize)
976 			panic("wdir: compact1");
977 		ndp->ni_dent.d_reclen = spacefree + dsize;
978 	} else {
979 		if (spacefree < newentrysize)
980 			panic("wdir: compact2");
981 		ndp->ni_dent.d_reclen = spacefree;
982 		ep->d_reclen = dsize;
983 		ep = (struct direct *)((char *)ep + dsize);
984 	}
985 	bcopy((caddr_t)&ndp->ni_dent, (caddr_t)ep, (u_int)newentrysize);
986 	bwrite(bp);
987 	dp->i_flag |= IUPD|ICHG;
988 	if (ndp->ni_endoff && ndp->ni_endoff < dp->i_size)
989 		itrunc(dp, (u_long)ndp->ni_endoff);
990 	iput(dp);
991 	return (error);
992 }
993 
994 /*
995  * Remove a directory entry after a call to namei, using the
996  * parameters which it left in the u. area.  The u. entry
997  * ni_offset contains the offset into the directory of the
998  * entry to be eliminated.  The ni_count field contains the
999  * size of the previous record in the directory.  If this
1000  * is 0, the first entry is being deleted, so we need only
1001  * zero the inode number to mark the entry as free.  If the
1002  * entry isn't the first in the directory, we must reclaim
1003  * the space of the now empty record by adding the record size
1004  * to the size of the previous entry.
1005  */
1006 dirremove(ndp)
1007 	register struct nameidata *ndp;
1008 {
1009 	register struct inode *dp = ndp->ni_pdir;
1010 	register struct buf *bp;
1011 	struct direct *ep;
1012 
1013 	if (ndp->ni_count == 0) {
1014 		/*
1015 		 * First entry in block: set d_ino to zero.
1016 		 */
1017 		ndp->ni_dent.d_ino = 0;
1018 		(void) rdwri(UIO_WRITE, dp, (caddr_t)&ndp->ni_dent,
1019 		    (int)DIRSIZ(&ndp->ni_dent), ndp->ni_offset, 1, (int *)0);
1020 	} else {
1021 		/*
1022 		 * Collapse new free space into previous entry.
1023 		 */
1024 		bp = blkatoff(dp, ndp->ni_offset - ndp->ni_count, (char **)&ep);
1025 		if (bp == 0)
1026 			return (0);
1027 		ep->d_reclen += ndp->ni_dent.d_reclen;
1028 		bwrite(bp);
1029 		dp->i_flag |= IUPD|ICHG;
1030 	}
1031 	return (1);
1032 }
1033 
1034 /*
1035  * Rewrite an existing directory entry to point at the inode
1036  * supplied.  The parameters describing the directory entry are
1037  * set up by a call to namei.
1038  */
1039 dirrewrite(dp, ip, ndp)
1040 	struct inode *dp, *ip;
1041 	struct nameidata *ndp;
1042 {
1043 
1044 	ndp->ni_dent.d_ino = ip->i_number;
1045 	u.u_error = rdwri(UIO_WRITE, dp, (caddr_t)&ndp->ni_dent,
1046 		(int)DIRSIZ(&ndp->ni_dent), ndp->ni_offset, 1, (int *)0);
1047 	iput(dp);
1048 }
1049 
1050 /*
1051  * Return buffer with contents of block "offset"
1052  * from the beginning of directory "ip".  If "res"
1053  * is non-zero, fill it in with a pointer to the
1054  * remaining space in the directory.
1055  */
1056 struct buf *
1057 blkatoff(ip, offset, res)
1058 	struct inode *ip;
1059 	off_t offset;
1060 	char **res;
1061 {
1062 	register struct fs *fs = ip->i_fs;
1063 	daddr_t lbn = lblkno(fs, offset);
1064 	int bsize = blksize(fs, ip, lbn);
1065 	register struct buf *bp;
1066 	daddr_t bn;
1067 
1068 	bn = bmap(ip, lbn, B_READ, bsize);
1069 	if (u.u_error)
1070 		return (0);
1071 	if (bn == (daddr_t)-1) {
1072 		dirbad(ip, offset, "hole in dir");
1073 		return (0);
1074 	}
1075 	bp = bread(ip->i_dev, fsbtodb(fs, bn), bsize);
1076 	if (bp->b_flags & B_ERROR) {
1077 		brelse(bp);
1078 		return (0);
1079 	}
1080 	if (res)
1081 		*res = bp->b_un.b_addr + blkoff(fs, offset);
1082 	return (bp);
1083 }
1084 
1085 /*
1086  * Check if a directory is empty or not.
1087  * Inode supplied must be locked.
1088  *
1089  * Using a struct dirtemplate here is not precisely
1090  * what we want, but better than using a struct direct.
1091  *
1092  * NB: does not handle corrupted directories.
1093  */
1094 dirempty(ip, parentino)
1095 	register struct inode *ip;
1096 	ino_t parentino;
1097 {
1098 	register off_t off;
1099 	struct dirtemplate dbuf;
1100 	register struct direct *dp = (struct direct *)&dbuf;
1101 	int error, count;
1102 #define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)
1103 
1104 	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
1105 		error = rdwri(UIO_READ, ip, (caddr_t)dp, MINDIRSIZ,
1106 		    off, 1, &count);
1107 		/*
1108 		 * Since we read MINDIRSIZ, residual must
1109 		 * be 0 unless we're at end of file.
1110 		 */
1111 		if (error || count != 0)
1112 			return (0);
1113 		/* avoid infinite loops */
1114 		if (dp->d_reclen == 0)
1115 			return (0);
1116 		/* skip empty entries */
1117 		if (dp->d_ino == 0)
1118 			continue;
1119 		/* accept only "." and ".." */
1120 		if (dp->d_namlen > 2)
1121 			return (0);
1122 		if (dp->d_name[0] != '.')
1123 			return (0);
1124 		/*
1125 		 * At this point d_namlen must be 1 or 2.
1126 		 * 1 implies ".", 2 implies ".." if second
1127 		 * char is also "."
1128 		 */
1129 		if (dp->d_namlen == 1)
1130 			continue;
1131 		if (dp->d_name[1] == '.' && dp->d_ino == parentino)
1132 			continue;
1133 		return (0);
1134 	}
1135 	return (1);
1136 }
1137 
1138 /*
1139  * Check if source directory is in the path of the target directory.
1140  * Target is supplied locked, source is unlocked.
1141  * The target is always iput() before returning.
1142  */
1143 checkpath(source, target)
1144 	struct inode *source, *target;
1145 {
1146 	struct dirtemplate dirbuf;
1147 	register struct inode *ip;
1148 	int error = 0;
1149 
1150 	ip = target;
1151 	if (ip->i_number == source->i_number) {
1152 		error = EEXIST;
1153 		goto out;
1154 	}
1155 	if (ip->i_number == ROOTINO)
1156 		goto out;
1157 
1158 	for (;;) {
1159 		if ((ip->i_mode&IFMT) != IFDIR) {
1160 			error = ENOTDIR;
1161 			break;
1162 		}
1163 		error = rdwri(UIO_READ, ip, (caddr_t)&dirbuf,
1164 			sizeof (struct dirtemplate), (off_t)0, 1, (int *)0);
1165 		if (error != 0)
1166 			break;
1167 		if (dirbuf.dotdot_namlen != 2 ||
1168 		    dirbuf.dotdot_name[0] != '.' ||
1169 		    dirbuf.dotdot_name[1] != '.') {
1170 			error = ENOTDIR;
1171 			break;
1172 		}
1173 		if (dirbuf.dotdot_ino == source->i_number) {
1174 			error = EINVAL;
1175 			break;
1176 		}
1177 		if (dirbuf.dotdot_ino == ROOTINO)
1178 			break;
1179 		iput(ip);
1180 		ip = iget(ip->i_dev, ip->i_fs, dirbuf.dotdot_ino);
1181 		if (ip == NULL) {
1182 			error = u.u_error;
1183 			break;
1184 		}
1185 	}
1186 
1187 out:
1188 	if (error == ENOTDIR)
1189 		printf("checkpath: .. not a directory\n");
1190 	if (ip != NULL)
1191 		iput(ip);
1192 	return (error);
1193 }
1194 
1195 /*
1196  * Name cache initialization, from main() when we are booting
1197  */
1198 nchinit()
1199 {
1200 	register union nchash *nchp;
1201 	register struct namecache *ncp;
1202 
1203 	nchhead = 0;
1204 	nchtail = &nchhead;
1205 	for (ncp = namecache; ncp < &namecache[nchsize]; ncp++) {
1206 		ncp->nc_forw = ncp;			/* hash chain */
1207 		ncp->nc_back = ncp;
1208 		ncp->nc_nxt = NULL;			/* lru chain */
1209 		*nchtail = ncp;
1210 		ncp->nc_prev = nchtail;
1211 		nchtail = &ncp->nc_nxt;
1212 		/* all else is zero already */
1213 	}
1214 	for (nchp = nchash; nchp < &nchash[NCHHASH]; nchp++) {
1215 		nchp->nch_head[0] = nchp;
1216 		nchp->nch_head[1] = nchp;
1217 	}
1218 }
1219 
1220 /*
1221  * Cache flush, called when filesys is umounted to
1222  * remove entries that would now be invalid
1223  *
1224  * The line "nxtcp = nchhead" near the end is to avoid potential problems
1225  * if the cache lru chain is modified while we are dumping the
1226  * inode.  This makes the algorithm O(n^2), but do you think I care?
1227  */
1228 nchinval(dev)
1229 	register dev_t dev;
1230 {
1231 	register struct namecache *ncp, *nxtcp;
1232 
1233 	for (ncp = nchhead; ncp; ncp = nxtcp) {
1234 		nxtcp = ncp->nc_nxt;
1235 		if (ncp->nc_ip == NULL ||
1236 		    (ncp->nc_idev != dev && ncp->nc_dev != dev))
1237 			continue;
1238 		/* free the resources we had */
1239 		ncp->nc_idev = NODEV;
1240 		ncp->nc_dev = NODEV;
1241 		ncp->nc_id = NULL;
1242 		ncp->nc_ino = 0;
1243 		ncp->nc_ip = NULL;
1244 		remque(ncp);		/* remove entry from its hash chain */
1245 		ncp->nc_forw = ncp;	/* and make a dummy one */
1246 		ncp->nc_back = ncp;
1247 		/* delete this entry from LRU chain */
1248 		*ncp->nc_prev = nxtcp;
1249 		if (nxtcp)
1250 			nxtcp->nc_prev = ncp->nc_prev;
1251 		else
1252 			nchtail = ncp->nc_prev;
1253 		/* cause rescan of list, it may have altered */
1254 		nxtcp = nchhead;
1255 		/* put the now-free entry at head of LRU */
1256 		ncp->nc_nxt = nxtcp;
1257 		ncp->nc_prev = &nchhead;
1258 		nxtcp->nc_prev = &ncp->nc_nxt;
1259 		nchhead = ncp;
1260 	}
1261 }
1262 
1263 /*
1264  * Name cache invalidation of all entries.
1265  */
1266 cacheinvalall()
1267 {
1268 	register struct namecache *ncp;
1269 
1270 	for (ncp = namecache; ncp < &namecache[nchsize]; ncp++)
1271 		ncp->nc_id = 0;
1272 }
1273