xref: /netbsd-src/sys/fs/efs/efs_subr.c (revision b1c86f5f087524e68db12794ee9c3e3da1ab17a0)
1 /*	$NetBSD: efs_subr.c,v 1.7 2008/05/16 09:21:59 hannken Exp $	*/
2 
3 /*
4  * Copyright (c) 2006 Stephen M. Rumble <rumble@ephemeral.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/cdefs.h>
20 __KERNEL_RCSID(0, "$NetBSD: efs_subr.c,v 1.7 2008/05/16 09:21:59 hannken Exp $");
21 
22 #include <sys/param.h>
23 #include <sys/kauth.h>
24 #include <sys/lwp.h>
25 #include <sys/proc.h>
26 #include <sys/buf.h>
27 #include <sys/mount.h>
28 #include <sys/vnode.h>
29 #include <sys/namei.h>
30 #include <sys/stat.h>
31 #include <sys/malloc.h>
32 
33 #include <miscfs/genfs/genfs_node.h>
34 
35 #include <fs/efs/efs.h>
36 #include <fs/efs/efs_sb.h>
37 #include <fs/efs/efs_dir.h>
38 #include <fs/efs/efs_genfs.h>
39 #include <fs/efs/efs_mount.h>
40 #include <fs/efs/efs_extent.h>
41 #include <fs/efs/efs_dinode.h>
42 #include <fs/efs/efs_inode.h>
43 #include <fs/efs/efs_subr.h>
44 
45 struct pool efs_inode_pool;
46 
47 /*
48  * Calculate a checksum for the provided superblock in __host byte order__.
49  *
50  * At some point SGI changed the checksum algorithm slightly, which can be
51  * enabled with the 'new' flag.
52  *
53  * Presumably this change occured on or before 24 Oct 1988 (around IRIX 3.1),
54  * so we're pretty unlikely to ever actually see an old checksum. Further, it
55  * means that EFS_NEWMAGIC filesystems (IRIX >= 3.3) must match the new
56  * checksum whereas EFS_MAGIC filesystems could potentially use either
57  * algorithm.
58  *
59  * See comp.sys.sgi <1991Aug9.050838.16876@odin.corp.sgi.com>
60  */
61 int32_t
62 efs_sb_checksum(struct efs_sb *esb, int new)
63 {
64 	int i;
65 	int32_t cksum;
66 	uint16_t *sbarray = (uint16_t *)esb;
67 
68 	KASSERT((EFS_SB_CHECKSUM_SIZE % 2) == 0);
69 
70 	for (i = cksum = 0; i < (EFS_SB_CHECKSUM_SIZE / 2); i++) {
71 		cksum ^= be16toh(sbarray[i]);
72 		cksum  = (cksum << 1) | (new && cksum < 0);
73 	}
74 
75 	return (cksum);
76 }
77 
78 /*
79  * Determine if the superblock is valid.
80  *
81  * Returns 0 if valid, else invalid. If invalid, 'why' is set to an
82  * explanation.
83  */
84 int
85 efs_sb_validate(struct efs_sb *esb, const char **why)
86 {
87 	uint32_t ocksum, ncksum;
88 
89 	*why = NULL;
90 
91 	if (be32toh(esb->sb_magic) != EFS_SB_MAGIC &&
92 	    be32toh(esb->sb_magic) != EFS_SB_NEWMAGIC) {
93 		*why = "sb_magic invalid";
94 		return (1);
95 	}
96 
97 	ocksum = htobe32(efs_sb_checksum(esb, 0));
98 	ncksum = htobe32(efs_sb_checksum(esb, 1));
99 	if (esb->sb_checksum != ocksum && esb->sb_checksum != ncksum) {
100 		*why = "sb_checksum invalid";
101 		return (1);
102 	}
103 
104 	if (be32toh(esb->sb_size) > EFS_SIZE_MAX) {
105 		*why = "sb_size > EFS_SIZE_MAX";
106 		return (1);
107 	}
108 
109 	if (be32toh(esb->sb_firstcg) <= EFS_BB_BITMAP) {
110 		*why = "sb_firstcg <= EFS_BB_BITMAP";
111 		return (1);
112 	}
113 
114 	/* XXX - add better sb consistency checks here */
115 	if (esb->sb_cgfsize == 0 ||
116 	    esb->sb_cgisize == 0 ||
117 	    esb->sb_ncg == 0 ||
118 	    esb->sb_bmsize == 0) {
119 		*why = "something bad happened";
120 		return (1);
121 	}
122 
123 	return (0);
124 }
125 
126 /*
127  * Determine the basic block offset and inode index within that block, given
128  * the inode 'ino' and filesystem parameters _in host byte order_. The inode
129  * will live at byte address 'bboff' * EFS_BB_SIZE + 'index' * EFS_DINODE_SIZE.
130  */
131 void
132 efs_locate_inode(ino_t ino, struct efs_sb *sbp, uint32_t *bboff, int *index)
133 {
134 	uint32_t cgfsize, firstcg;
135 	uint16_t cgisize;
136 
137 	cgisize = be16toh(sbp->sb_cgisize);
138 	cgfsize = be32toh(sbp->sb_cgfsize);
139 	firstcg = be32toh(sbp->sb_firstcg),
140 
141 	*bboff = firstcg + ((ino / (cgisize * EFS_DINODES_PER_BB)) * cgfsize) +
142 	    ((ino % (cgisize * EFS_DINODES_PER_BB)) / EFS_DINODES_PER_BB);
143 	*index = ino & (EFS_DINODES_PER_BB - 1);
144 }
145 
146 /*
147  * Read in an inode from disk.
148  *
149  * We actually take in four inodes at a time. Hopefully these will stick
150  * around in the buffer cache and get used without going to disk.
151  *
152  * Returns 0 on success.
153  */
154 int
155 efs_read_inode(struct efs_mount *emp, ino_t ino, struct lwp *l,
156     struct efs_dinode *di)
157 {
158 	struct efs_sb *sbp;
159 	struct buf *bp;
160 	int index, err;
161 	uint32_t bboff;
162 
163 	sbp = &emp->em_sb;
164 	efs_locate_inode(ino, sbp, &bboff, &index);
165 
166 	err = efs_bread(emp, bboff, l, &bp);
167 	if (err) {
168 		brelse(bp, 0);
169 		return (err);
170 	}
171 	memcpy(di, ((struct efs_dinode *)bp->b_data) + index, sizeof(*di));
172 	brelse(bp, 0);
173 
174 	return (0);
175 }
176 
177 /*
178  * Perform a read from our device handling the potential DEV_BSIZE
179  * messiness (although as of 19.2.2006, all ports appear to use 512) as
180  * we as EFS block sizing.
181  *
182  * bboff: basic block offset
183  *
184  * Returns 0 on success.
185  */
186 int
187 efs_bread(struct efs_mount *emp, uint32_t bboff, struct lwp *l, struct buf **bp)
188 {
189 	KASSERT(bboff < EFS_SIZE_MAX);
190 
191 	return (bread(emp->em_devvp, (daddr_t)bboff * (EFS_BB_SIZE / DEV_BSIZE),
192 	    EFS_BB_SIZE, (l == NULL) ? NOCRED : l->l_cred, 0, bp));
193 }
194 
195 /*
196  * Synchronise the in-core, host ordered and typed inode fields with their
197  * corresponding on-disk, EFS ordered and typed copies.
198  *
199  * This is the inverse of efs_dinode_sync_inode(), and should be called when
200  * an inode is loaded from disk.
201  */
202 void
203 efs_sync_dinode_to_inode(struct efs_inode *ei)
204 {
205 
206 	ei->ei_mode		= be16toh(ei->ei_di.di_mode);	/*same as nbsd*/
207 	ei->ei_nlink		= be16toh(ei->ei_di.di_nlink);
208 	ei->ei_uid		= be16toh(ei->ei_di.di_uid);
209 	ei->ei_gid		= be16toh(ei->ei_di.di_gid);
210 	ei->ei_size		= be32toh(ei->ei_di.di_size);
211 	ei->ei_atime		= be32toh(ei->ei_di.di_atime);
212 	ei->ei_mtime		= be32toh(ei->ei_di.di_mtime);
213 	ei->ei_ctime		= be32toh(ei->ei_di.di_ctime);
214 	ei->ei_gen		= be32toh(ei->ei_di.di_gen);
215 	ei->ei_numextents 	= be16toh(ei->ei_di.di_numextents);
216 	ei->ei_version		= ei->ei_di.di_version;
217 }
218 
219 /*
220  * Synchronise the on-disk, EFS ordered and typed inode fields with their
221  * corresponding in-core, host ordered and typed copies.
222  *
223  * This is the inverse of efs_inode_sync_dinode(), and should be called before
224  * an inode is flushed to disk.
225  */
226 void
227 efs_sync_inode_to_dinode(struct efs_inode *ei)
228 {
229 
230 	panic("readonly -- no need to call me");
231 }
232 
233 #ifdef DIAGNOSTIC
234 /*
235  * Ensure that the in-core inode's host cached fields match its on-disk copy.
236  *
237  * Returns 0 if they match.
238  */
239 static int
240 efs_is_inode_synced(struct efs_inode *ei)
241 {
242 	int s;
243 
244 	s = 0;
245 	/* XXX -- see above remarks about assumption */
246 	s += (ei->ei_mode	!= be16toh(ei->ei_di.di_mode));
247 	s += (ei->ei_nlink	!= be16toh(ei->ei_di.di_nlink));
248 	s += (ei->ei_uid	!= be16toh(ei->ei_di.di_uid));
249 	s += (ei->ei_gid	!= be16toh(ei->ei_di.di_gid));
250 	s += (ei->ei_size	!= be32toh(ei->ei_di.di_size));
251 	s += (ei->ei_atime	!= be32toh(ei->ei_di.di_atime));
252 	s += (ei->ei_mtime	!= be32toh(ei->ei_di.di_mtime));
253 	s += (ei->ei_ctime	!= be32toh(ei->ei_di.di_ctime));
254 	s += (ei->ei_gen	!= be32toh(ei->ei_di.di_gen));
255 	s += (ei->ei_numextents	!= be16toh(ei->ei_di.di_numextents));
256 	s += (ei->ei_version	!= ei->ei_di.di_version);
257 
258 	return (s);
259 }
260 #endif
261 
262 /*
263  * Given an efs_dirblk structure and a componentname to search for, return the
264  * corresponding inode if it is found.
265  *
266  * Returns 0 on success.
267  */
268 static int
269 efs_dirblk_lookup(struct efs_dirblk *dir, struct componentname *cn,
270     ino_t *inode)
271 {
272 	struct efs_dirent *de;
273 	int i, slot, offset;
274 
275 	KASSERT(cn->cn_namelen <= EFS_DIRENT_NAMELEN_MAX);
276 
277 	slot = offset = 0;
278 
279 	for (i = 0; i < dir->db_slots; i++) {
280 		offset = EFS_DIRENT_OFF_EXPND(dir->db_space[i]);
281 
282 		if (offset == EFS_DIRBLK_SLOT_FREE)
283 			continue;
284 
285 		de = (struct efs_dirent *)((char *)dir + offset);
286 		if (de->de_namelen == cn->cn_namelen &&
287 		   (strncmp(cn->cn_nameptr, de->de_name, cn->cn_namelen) == 0)){
288 			slot = i;
289 			break;
290 		}
291 	}
292 	if (i == dir->db_slots)
293 		return (ENOENT);
294 
295 	KASSERT(slot < offset && offset < EFS_DIRBLK_SPACE_SIZE);
296 	de = (struct efs_dirent *)((char *)dir + offset);
297 	*inode = be32toh(de->de_inumber);
298 
299 	return (0);
300 }
301 
302 /*
303  * Given an extent descriptor that represents a directory, look up
304  * componentname within its efs_dirblk's. If it is found, return the
305  * corresponding inode in 'ino'.
306  *
307  * Returns 0 on success.
308  */
309 static int
310 efs_extent_lookup(struct efs_mount *emp, struct efs_extent *ex,
311     struct componentname *cn, ino_t *ino)
312 {
313 	struct efs_dirblk *db;
314 	struct buf *bp;
315 	int i, err;
316 
317 	/*
318 	 * Read in each of the dirblks until we find our entry.
319 	 * If we don't, return ENOENT.
320 	 */
321 	for (i = 0; i < ex->ex_length; i++) {
322 		err = efs_bread(emp, ex->ex_bn + i, NULL, &bp);
323 		if (err) {
324 			printf("efs: warning: invalid extent descriptor\n");
325 			brelse(bp, 0);
326 			return (err);
327 		}
328 
329 		db = (struct efs_dirblk *)bp->b_data;
330 		if (efs_dirblk_lookup(db, cn, ino) == 0) {
331 			brelse(bp, 0);
332 			return (0);
333 		}
334 		brelse(bp, 0);
335 	}
336 
337 	return (ENOENT);
338 }
339 
340 /*
341  * Given the provided in-core inode, look up the pathname requested. If
342  * we find it, 'ino' reflects its corresponding on-disk inode number.
343  *
344  * Returns 0 on success.
345  */
346 int
347 efs_inode_lookup(struct efs_mount *emp, struct efs_inode *ei,
348     struct componentname *cn, ino_t *ino)
349 {
350 	struct efs_extent ex;
351 	struct efs_extent_iterator exi;
352 	int ret;
353 
354 	KASSERT(VOP_ISLOCKED(ei->ei_vp));
355 	KASSERT(efs_is_inode_synced(ei) == 0);
356 	KASSERT((ei->ei_mode & S_IFMT) == S_IFDIR);
357 
358 	efs_extent_iterator_init(&exi, ei, 0);
359 	while ((ret = efs_extent_iterator_next(&exi, &ex)) == 0) {
360 		if (efs_extent_lookup(emp, &ex, cn, ino) == 0) {
361 			return (0);
362 		}
363 	}
364 
365 	return ((ret == -1) ? ENOENT : ret);
366 }
367 
368 /*
369  * Convert on-disk extent structure to in-core format.
370  */
371 void
372 efs_dextent_to_extent(struct efs_dextent *dex, struct efs_extent *ex)
373 {
374 
375 	KASSERT(dex != NULL && ex != NULL);
376 
377 	ex->ex_magic	= dex->ex_bytes[0];
378 	ex->ex_bn	= be32toh(dex->ex_words[0]) & 0x00ffffff;
379 	ex->ex_length	= dex->ex_bytes[4];
380 	ex->ex_offset	= be32toh(dex->ex_words[1]) & 0x00ffffff;
381 }
382 
383 /*
384  * Convert in-core extent format to on-disk structure.
385  */
386 void
387 efs_extent_to_dextent(struct efs_extent *ex, struct efs_dextent *dex)
388 {
389 
390 	KASSERT(ex != NULL && dex != NULL);
391 	KASSERT(ex->ex_magic == EFS_EXTENT_MAGIC);
392 	KASSERT((ex->ex_bn & ~EFS_EXTENT_BN_MASK) == 0);
393 	KASSERT((ex->ex_offset & ~EFS_EXTENT_OFFSET_MASK) == 0);
394 
395 	dex->ex_words[0] = htobe32(ex->ex_bn);
396 	dex->ex_bytes[0] = ex->ex_magic;
397 	dex->ex_words[1] = htobe32(ex->ex_offset);
398 	dex->ex_bytes[4] = ex->ex_length;
399 }
400 
401 /*
402  * Initialise an extent iterator.
403  *
404  * If start_hint is non-0, attempt to set up the iterator beginning with the
405  * extent descriptor in which the start_hint'th byte exists. Callers must not
406  * expect success (this is simply an optimisation), so we reserve the right
407  * to start from the beginning.
408  */
409 void
410 efs_extent_iterator_init(struct efs_extent_iterator *exi, struct efs_inode *eip,
411     off_t start_hint)
412 {
413 	struct efs_extent ex, ex2;
414 	struct buf *bp;
415 	struct efs_mount *emp = VFSTOEFS(eip->ei_vp->v_mount);
416 	off_t offset, length, next;
417 	int i, err, numextents, numinextents;
418 	int hi, lo, mid;
419 	int indir;
420 
421 	exi->exi_eip	= eip;
422 	exi->exi_next	= 0;
423 	exi->exi_dnext	= 0;
424 	exi->exi_innext	= 0;
425 
426 	if (start_hint == 0)
427 		return;
428 
429 	/* force iterator to end if hint is too big */
430 	if (start_hint >= eip->ei_size) {
431 		exi->exi_next = eip->ei_numextents;
432 		return;
433 	}
434 
435 	/*
436 	 * Use start_hint to jump to the right extent descriptor. We'll
437 	 * iterate over the 12 indirect extents because it's cheap, then
438 	 * bring the appropriate vector into core and binary search it.
439 	 */
440 
441 	/*
442 	 * Handle the small file case separately first...
443 	 */
444 	if (eip->ei_numextents <= EFS_DIRECTEXTENTS) {
445 		for (i = 0; i < eip->ei_numextents; i++) {
446 			efs_dextent_to_extent(&eip->ei_di.di_extents[i], &ex);
447 
448 			offset = ex.ex_offset * EFS_BB_SIZE;
449 			length = ex.ex_length * EFS_BB_SIZE;
450 
451 			if (start_hint >= offset &&
452 			    start_hint < (offset + length)) {
453 				exi->exi_next = exi->exi_dnext = i;
454 				return;
455 			}
456 		}
457 
458 		/* shouldn't get here, no? */
459 		EFS_DPRINTF(("efs_extent_iterator_init: bad direct extents\n"));
460 		return;
461 	}
462 
463 	/*
464 	 * Now do the large files with indirect extents...
465 	 *
466 	 * The first indirect extent's ex_offset field contains the
467 	 * number of indirect extents used.
468 	 */
469 	efs_dextent_to_extent(&eip->ei_di.di_extents[0], &ex);
470 
471 	numinextents = ex.ex_offset;
472 	if (numinextents < 1 || numinextents >= EFS_DIRECTEXTENTS) {
473 		EFS_DPRINTF(("efs_extent_iterator_init: bad ex.ex_offset\n"));
474 		return;
475 	}
476 
477 	next = 0;
478 	indir = -1;
479 	numextents = 0;
480 	for (i = 0; i < numinextents; i++) {
481 		efs_dextent_to_extent(&eip->ei_di.di_extents[i], &ex);
482 
483 		err = efs_bread(emp, ex.ex_bn, NULL, &bp);
484 		if (err) {
485 			brelse(bp, 0);
486 			return;
487 		}
488 
489 		efs_dextent_to_extent((struct efs_dextent *)bp->b_data, &ex2);
490 		brelse(bp, 0);
491 
492 		offset = ex2.ex_offset * EFS_BB_SIZE;
493 
494 		if (offset > start_hint) {
495 			indir = MAX(0, i - 1);
496 			break;
497 		}
498 
499 		/* number of extents prior to this indirect vector of extents */
500 		next += numextents;
501 
502 		/* number of extents within this indirect vector of extents */
503 		numextents = ex.ex_length * EFS_EXTENTS_PER_BB;
504 		numextents = MIN(numextents, eip->ei_numextents - next);
505 	}
506 
507 	/*
508 	 * We hit the end, so assume it's in the last extent.
509 	 */
510 	if (indir == -1)
511 		indir = numinextents - 1;
512 
513 	/*
514 	 * Binary search to find our desired direct extent.
515 	 */
516 	lo = 0;
517 	mid = 0;
518 	hi = numextents - 1;
519 	efs_dextent_to_extent(&eip->ei_di.di_extents[indir], &ex);
520 	while (lo <= hi) {
521 		int bboff;
522 		int index;
523 
524 		mid = (lo + hi) / 2;
525 
526 		bboff = mid / EFS_EXTENTS_PER_BB;
527 		index = mid % EFS_EXTENTS_PER_BB;
528 
529 		err = efs_bread(emp, ex.ex_bn + bboff, NULL, &bp);
530 		if (err) {
531 			brelse(bp, 0);
532 			EFS_DPRINTF(("efs_extent_iterator_init: bsrch read\n"));
533 			return;
534 		}
535 
536 		efs_dextent_to_extent((struct efs_dextent *)bp->b_data + index,
537 		    &ex2);
538 		brelse(bp, 0);
539 
540 		offset = ex2.ex_offset * EFS_BB_SIZE;
541 		length = ex2.ex_length * EFS_BB_SIZE;
542 
543 		if (start_hint >= offset && start_hint < (offset + length))
544 			break;
545 
546 		if (start_hint < offset)
547 			hi = mid - 1;
548 		else
549 			lo = mid + 1;
550 	}
551 
552 	/*
553 	 * This is bad. Either the hint is bogus (which shouldn't
554 	 * happen) or the extent list must be screwed up. We
555 	 * have to abort.
556 	 */
557 	if (lo > hi) {
558 		EFS_DPRINTF(("efs_extent_iterator_init: bsearch "
559 		    "failed to find extent\n"));
560 		return;
561 	}
562 
563 	exi->exi_next	= next + mid;
564 	exi->exi_dnext	= indir;
565 	exi->exi_innext	= mid;
566 }
567 
568 /*
569  * Return the next EFS extent.
570  *
571  * Returns 0 if another extent was iterated, -1 if we've exhausted all
572  * extents, or an error number. If 'exi' is non-NULL, the next extent is
573  * written to it (should it exist).
574  */
575 int
576 efs_extent_iterator_next(struct efs_extent_iterator *exi,
577     struct efs_extent *exp)
578 {
579 	struct efs_extent ex;
580 	struct efs_dextent *dexp;
581 	struct efs_inode *eip = exi->exi_eip;
582 	struct buf *bp;
583 	int err, bboff, index;
584 
585 	if (exi->exi_next++ >= eip->ei_numextents)
586 		return (-1);
587 
588 	/* direct or indirect extents? */
589 	if (eip->ei_numextents <= EFS_DIRECTEXTENTS) {
590 		if (exp != NULL) {
591 			dexp = &eip->ei_di.di_extents[exi->exi_dnext++];
592 			efs_dextent_to_extent(dexp, exp);
593 		}
594 	} else {
595 		efs_dextent_to_extent(
596 		    &eip->ei_di.di_extents[exi->exi_dnext], &ex);
597 
598 		bboff	= exi->exi_innext / EFS_EXTENTS_PER_BB;
599 		index	= exi->exi_innext % EFS_EXTENTS_PER_BB;
600 
601 		err = efs_bread(VFSTOEFS(eip->ei_vp->v_mount),
602 		    ex.ex_bn + bboff, NULL, &bp);
603 		if (err) {
604 			EFS_DPRINTF(("efs_extent_iterator_next: "
605 			    "efs_bread failed: %d\n", err));
606 			brelse(bp, 0);
607 			return (err);
608 		}
609 
610 		if (exp != NULL) {
611 			dexp = (struct efs_dextent *)bp->b_data + index;
612 			efs_dextent_to_extent(dexp, exp);
613 		}
614 		brelse(bp, 0);
615 
616 		bboff = exi->exi_innext++ / EFS_EXTENTS_PER_BB;
617 		if (bboff >= ex.ex_length) {
618 			exi->exi_innext = 0;
619 			exi->exi_dnext++;
620 		}
621 	}
622 
623 	return (0);
624 }
625