xref: /netbsd-src/sys/ufs/lfs/lfs_rfw.c (revision 9fc453562f6ebe8eabdfd51e21ae0a0058906d4f)
1 /*	$NetBSD: lfs_rfw.c,v 1.36 2020/09/05 16:30:13 riastradh Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Konrad E. Schroder <perseant@hhhh.org>.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.36 2020/09/05 16:30:13 riastradh Exp $");
34 
35 #if defined(_KERNEL_OPT)
36 #include "opt_quota.h"
37 #endif
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/namei.h>
42 #include <sys/proc.h>
43 #include <sys/kernel.h>
44 #include <sys/vnode.h>
45 #include <sys/mount.h>
46 #include <sys/kthread.h>
47 #include <sys/buf.h>
48 #include <sys/device.h>
49 #include <sys/file.h>
50 #include <sys/disklabel.h>
51 #include <sys/ioctl.h>
52 #include <sys/errno.h>
53 #include <sys/malloc.h>
54 #include <sys/pool.h>
55 #include <sys/socket.h>
56 #include <sys/syslog.h>
57 #include <sys/sysctl.h>
58 #include <sys/conf.h>
59 #include <sys/kauth.h>
60 
61 #include <miscfs/specfs/specdev.h>
62 
63 #include <ufs/lfs/ulfs_quotacommon.h>
64 #include <ufs/lfs/ulfs_inode.h>
65 #include <ufs/lfs/ulfsmount.h>
66 #include <ufs/lfs/ulfs_extern.h>
67 
68 #include <uvm/uvm_extern.h>
69 
70 #include <ufs/lfs/lfs.h>
71 #include <ufs/lfs/lfs_accessors.h>
72 #include <ufs/lfs/lfs_kernel.h>
73 #include <ufs/lfs/lfs_extern.h>
74 
75 #include <miscfs/genfs/genfs.h>
76 #include <miscfs/genfs/genfs_node.h>
77 
78 /*
79  * Roll-forward code.
80  */
81 static daddr_t check_segsum(struct lfs *, daddr_t, u_int64_t,
82     kauth_cred_t, int, int *, struct lwp *);
83 
84 extern int lfs_do_rfw;
85 
86 /*
87  * Allocate a particular inode with a particular version number, freeing
88  * any previous versions of this inode that may have gone before.
89  * Used by the roll-forward code.
90  *
91  * XXX this function does not have appropriate locking to be used on a live fs;
92  * XXX but something similar could probably be used for an "undelete" call.
93  *
94  * Called with the Ifile inode locked.
95  */
96 int
lfs_rf_valloc(struct lfs * fs,ino_t ino,int vers,struct lwp * l,struct vnode ** vpp)97 lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l,
98 	      struct vnode **vpp)
99 {
100 	struct vattr va;
101 	struct vnode *vp;
102 	struct inode *ip;
103 	int error;
104 
105 	ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */
106 
107 	/*
108 	 * First, just try a vget. If the version number is the one we want,
109 	 * we don't have to do anything else.  If the version number is wrong,
110 	 * take appropriate action.
111 	 */
112 	error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp);
113 	if (error == 0) {
114 		DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n", ino, vp));
115 
116 		*vpp = vp;
117 		ip = VTOI(vp);
118 		if (ip->i_gen == vers)
119 			return 0;
120 		else if (ip->i_gen < vers) {
121 			lfs_truncate(vp, (off_t)0, 0, NOCRED);
122 			ip->i_gen = vers;
123 			lfs_dino_setgen(fs, ip->i_din, vers);
124 			LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
125 			return 0;
126 		} else {
127 			DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n",
128 			       ino, vers, lfs_dino_getgen(fs, ip->i_din)));
129 			vput(vp);
130 			*vpp = NULLVP;
131 			return EEXIST;
132 		}
133 	}
134 
135 	/* Not found, create as regular file. */
136 	vattr_null(&va);
137 	va.va_type = VREG;
138 	va.va_mode = 0;
139 	va.va_fileid = ino;
140 	va.va_gen = vers;
141 	error = vcache_new(fs->lfs_ivnode->v_mount, NULL, &va, NOCRED, NULL,
142 	    &vp);
143 	if (error)
144 		return error;
145 	error = vn_lock(vp, LK_EXCLUSIVE);
146 	if (error) {
147 		vrele(vp);
148 		*vpp = NULLVP;
149 		return error;
150 	}
151 	ip = VTOI(vp);
152 	ip->i_nlink = 1;
153 	lfs_dino_setnlink(fs, ip->i_din, 1);
154 	*vpp = vp;
155 	return 0;
156 }
157 
158 /*
159  * Load the appropriate indirect block, and change the appropriate pointer.
160  * Mark the block dirty.  Do segment and avail accounting.
161  */
162 static int
update_meta(struct lfs * fs,ino_t ino,int vers,daddr_t lbn,daddr_t ndaddr,size_t size,struct lwp * l)163 update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn,
164 	    daddr_t ndaddr, size_t size, struct lwp *l)
165 {
166 	int error;
167 	struct vnode *vp;
168 	struct inode *ip;
169 #ifdef DEBUG
170 	daddr_t odaddr;
171 	struct indir a[ULFS_NIADDR];
172 	int num;
173 	int i;
174 #endif /* DEBUG */
175 	struct buf *bp;
176 	SEGUSE *sup;
177 
178 	KASSERT(lbn >= 0);	/* no indirect blocks */
179 
180 	if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp)) != 0) {
181 		DLOG((DLOG_RF, "update_meta: ino %d: lfs_rf_valloc"
182 		      " returned %d\n", ino, error));
183 		return error;
184 	}
185 
186 	if ((error = lfs_balloc(vp, (lbn << lfs_sb_getbshift(fs)), size,
187 				NOCRED, 0, &bp)) != 0) {
188 		vput(vp);
189 		return (error);
190 	}
191 	/* No need to write, the block is already on disk */
192 	if (bp->b_oflags & BO_DELWRI) {
193 		LFS_UNLOCK_BUF(bp);
194 		lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount));
195 		/* XXX should this wake up fs->lfs_availsleep? */
196 	}
197 	brelse(bp, BC_INVAL);
198 
199 	/*
200 	 * Extend the file, if it is not large enough already.
201 	 * XXX this is not exactly right, we don't know how much of the
202 	 * XXX last block is actually used.  We hope that an inode will
203 	 * XXX appear later to give the correct size.
204 	 */
205 	ip = VTOI(vp);
206 	if (ip->i_size <= (lbn << lfs_sb_getbshift(fs))) {
207 		u_int64_t newsize;
208 
209 		if (lbn < ULFS_NDADDR) {
210 			newsize = (lbn << lfs_sb_getbshift(fs)) +
211 				(size - lfs_sb_getfsize(fs)) + 1;
212 		} else {
213 			newsize = (lbn << lfs_sb_getbshift(fs)) + 1;
214 		}
215 		lfs_dino_setsize(fs, ip->i_din, newsize);
216 
217 		if (ip->i_size < newsize) {
218 			ip->i_size = newsize;
219 			/*
220 			 * tell vm our new size for the case the inode won't
221 			 * appear later.
222 			 */
223 			uvm_vnp_setsize(vp, newsize);
224 		}
225 	}
226 
227 	lfs_update_single(fs, NULL, vp, lbn, ndaddr, size);
228 
229 	LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp);
230 	sup->su_nbytes += size;
231 	LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp);
232 
233 	/* differences here should be due to UNWRITTEN indirect blocks. */
234 	KASSERT((lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR &&
235 	    ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din)) ||
236 	    ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din));
237 
238 #ifdef DEBUG
239 	/* Now look again to make sure it worked */
240 	ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL);
241 	for (i = num; i > 0; i--) {
242 		if (!a[i].in_exists)
243 			panic("update_meta: absent %d lv indirect block", i);
244 	}
245 	if (LFS_DBTOFSB(fs, odaddr) != ndaddr)
246 		DLOG((DLOG_RF, "update_meta: failed setting ino %d lbn %"
247 		      PRId64 " to %" PRId64 "\n", ino, lbn, ndaddr));
248 #endif /* DEBUG */
249 	vput(vp);
250 	return 0;
251 }
252 
253 /*
254  * Copy some the fields of the dinode as needed by update_inoblk().
255  */
256 static void
update_inoblk_copy_dinode(struct lfs * fs,union lfs_dinode * dstu,const union lfs_dinode * srcu)257 update_inoblk_copy_dinode(struct lfs *fs,
258     union lfs_dinode *dstu, const union lfs_dinode *srcu)
259 {
260 	if (fs->lfs_is64) {
261 		struct lfs64_dinode *dst = &dstu->u_64;
262 		const struct lfs64_dinode *src = &srcu->u_64;
263 		unsigned i;
264 
265 		/*
266 		 * Copy everything but the block pointers and di_blocks.
267 		 * XXX what about di_extb?
268 		 */
269 		dst->di_mode = src->di_mode;
270 		dst->di_nlink = src->di_nlink;
271 		dst->di_uid = src->di_uid;
272 		dst->di_gid = src->di_gid;
273 		dst->di_blksize = src->di_blksize;
274 		dst->di_size = src->di_size;
275 		dst->di_atime = src->di_atime;
276 		dst->di_mtime = src->di_mtime;
277 		dst->di_ctime = src->di_ctime;
278 		dst->di_birthtime = src->di_birthtime;
279 		dst->di_mtimensec = src->di_mtimensec;
280 		dst->di_atimensec = src->di_atimensec;
281 		dst->di_ctimensec = src->di_ctimensec;
282 		dst->di_birthnsec = src->di_birthnsec;
283 		dst->di_gen = src->di_gen;
284 		dst->di_kernflags = src->di_kernflags;
285 		dst->di_flags = src->di_flags;
286 		dst->di_extsize = src->di_extsize;
287 		dst->di_modrev = src->di_modrev;
288 		dst->di_inumber = src->di_inumber;
289 		for (i = 0; i < __arraycount(src->di_spare); i++) {
290 			dst->di_spare[i] = src->di_spare[i];
291 		}
292 	} else {
293 		struct lfs32_dinode *dst = &dstu->u_32;
294 		const struct lfs32_dinode *src = &srcu->u_32;
295 
296 		/* Get mode, link count, size, and times */
297 		memcpy(dst, src, offsetof(struct lfs32_dinode, di_db[0]));
298 
299 		/* Then the rest, except di_blocks */
300 		dst->di_flags = src->di_flags;
301 		dst->di_gen = src->di_gen;
302 		dst->di_uid = src->di_uid;
303 		dst->di_gid = src->di_gid;
304 		dst->di_modrev = src->di_modrev;
305 	}
306 }
307 
308 static int
update_inoblk(struct lfs * fs,daddr_t offset,kauth_cred_t cred,struct lwp * l)309 update_inoblk(struct lfs *fs, daddr_t offset, kauth_cred_t cred,
310 	      struct lwp *l)
311 {
312 	struct vnode *devvp, *vp;
313 	struct inode *ip;
314 	union lfs_dinode *dip;
315 	struct buf *dbp, *ibp;
316 	int error;
317 	daddr_t daddr;
318 	IFILE *ifp;
319 	SEGUSE *sup;
320 	unsigned i, num;
321 
322 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
323 
324 	/*
325 	 * Get the inode, update times and perms.
326 	 * DO NOT update disk blocks, we do that separately.
327 	 */
328 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
329 	    0, &dbp);
330 	if (error) {
331 		DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error));
332 		return error;
333 	}
334 	num = LFS_INOPB(fs);
335 	for (i = num; i-- > 0; ) {
336 		dip = DINO_IN_BLOCK(fs, dbp->b_data, i);
337 		if (lfs_dino_getinumber(fs, dip) > LFS_IFILE_INUM) {
338 			error = lfs_rf_valloc(fs, lfs_dino_getinumber(fs, dip),
339 					      lfs_dino_getgen(fs, dip),
340 					      l, &vp);
341 			if (error) {
342 				DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc"
343 				      " returned %d\n", error));
344 				continue;
345 			}
346 			ip = VTOI(vp);
347 			if (lfs_dino_getsize(fs, dip) != ip->i_size)
348 				lfs_truncate(vp, lfs_dino_getsize(fs, dip), 0,
349 					     NOCRED);
350 			update_inoblk_copy_dinode(fs, ip->i_din, dip);
351 
352 			ip->i_flags = lfs_dino_getflags(fs, dip);
353 			ip->i_gen = lfs_dino_getgen(fs, dip);
354 			ip->i_uid = lfs_dino_getuid(fs, dip);
355 			ip->i_gid = lfs_dino_getgid(fs, dip);
356 
357 			ip->i_mode = lfs_dino_getmode(fs, dip);
358 			ip->i_nlink = lfs_dino_getnlink(fs, dip);
359 			ip->i_size = lfs_dino_getsize(fs, dip);
360 
361 			LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
362 
363 			/* Re-initialize to get type right */
364 			ulfs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p,
365 				  &vp);
366 			vput(vp);
367 
368 			/* Record change in location */
369 			LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp);
370 			daddr = lfs_if_getdaddr(fs, ifp);
371 			lfs_if_setdaddr(fs, ifp, LFS_DBTOFSB(fs, dbp->b_blkno));
372 			error = LFS_BWRITE_LOG(ibp); /* Ifile */
373 			/* And do segment accounting */
374 			if (lfs_dtosn(fs, daddr) != lfs_dtosn(fs, LFS_DBTOFSB(fs, dbp->b_blkno))) {
375 				if (daddr > 0) {
376 					LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, daddr),
377 						     ibp);
378 					sup->su_nbytes -= DINOSIZE(fs);
379 					LFS_WRITESEGENTRY(sup, fs,
380 							  lfs_dtosn(fs, daddr),
381 							  ibp);
382 				}
383 				LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, LFS_DBTOFSB(fs, dbp->b_blkno)),
384 					     ibp);
385 				sup->su_nbytes += DINOSIZE(fs);
386 				LFS_WRITESEGENTRY(sup, fs,
387 						  lfs_dtosn(fs, LFS_DBTOFSB(fs, dbp->b_blkno)),
388 						  ibp);
389 			}
390 		}
391 	}
392 	brelse(dbp, BC_AGE);
393 
394 	return 0;
395 }
396 
397 #define CHECK_CKSUM   0x0001  /* Check the checksum to make sure it's valid */
398 #define CHECK_UPDATE  0x0002  /* Update Ifile for new data blocks / inodes */
399 
400 static daddr_t
check_segsum(struct lfs * fs,daddr_t offset,u_int64_t nextserial,kauth_cred_t cred,int flags,int * pseg_flags,struct lwp * l)401 check_segsum(struct lfs *fs, daddr_t offset, u_int64_t nextserial,
402 	     kauth_cred_t cred, int flags, int *pseg_flags, struct lwp *l)
403 {
404 	struct vnode *devvp;
405 	struct buf *bp, *dbp;
406 	int error, nblocks = 0, ninos, i, j; /* XXX: gcc */
407 	SEGSUM *ssp;
408 	u_long *dp = NULL, *datap = NULL; /* XXX u_int32_t */
409 	daddr_t oldoffset;
410 	IINFO *iip;
411 	FINFO *fip;
412 	SEGUSE *sup;
413 	size_t size;
414 	uint32_t datasum, foundsum;
415 
416 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
417 	/*
418 	 * If the segment has a superblock and we're at the top
419 	 * of the segment, skip the superblock.
420 	 */
421 	if (lfs_sntod(fs, lfs_dtosn(fs, offset)) == offset) {
422 		LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
423 		if (sup->su_flags & SEGUSE_SUPERBLOCK)
424 			offset += lfs_btofsb(fs, LFS_SBPAD);
425 		brelse(bp, 0);
426 	}
427 
428 	/* Read in the segment summary */
429 	error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getsumsize(fs),
430 	    0, &bp);
431 	if (error)
432 		return -1;
433 
434 	/* Check summary checksum */
435 	ssp = (SEGSUM *)bp->b_data;
436 	if (flags & CHECK_CKSUM) {
437 		size_t sumstart;
438 
439 		sumstart = lfs_ss_getsumstart(fs);
440 		if (lfs_ss_getsumsum(fs, ssp) !=
441 		    cksum((char *)ssp + sumstart,
442 			  lfs_sb_getsumsize(fs) - sumstart)) {
443 			DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n", offset));
444 			offset = -1;
445 			goto err1;
446 		}
447 		if (lfs_ss_getnfinfo(fs, ssp) == 0 &&
448 		    lfs_ss_getninos(fs, ssp) == 0) {
449 			DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n", offset));
450 			offset = -1;
451 			goto err1;
452 		}
453 		if (lfs_ss_getcreate(fs, ssp) < lfs_sb_gettstamp(fs)) {
454 			DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset));
455 			offset = -1;
456 			goto err1;
457 		}
458 	}
459 	if (lfs_sb_getversion(fs) > 1) {
460 		if (lfs_ss_getserial(fs, ssp) != nextserial) {
461 			DLOG((DLOG_RF, "Unexpected serial number at 0x%" PRIx64
462 			      "\n", offset));
463 			offset = -1;
464 			goto err1;
465 		}
466 		if (lfs_ss_getident(fs, ssp) != lfs_sb_getident(fs)) {
467 			DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%"
468 			      PRIx64 "\n", lfs_ss_getident(fs, ssp),
469 			      lfs_sb_getident(fs), offset));
470 			offset = -1;
471 			goto err1;
472 		}
473 	}
474 	if (pseg_flags)
475 		*pseg_flags = lfs_ss_getflags(fs, ssp);
476 	oldoffset = offset;
477 	offset += lfs_btofsb(fs, lfs_sb_getsumsize(fs));
478 
479 	ninos = howmany(lfs_ss_getninos(fs, ssp), LFS_INOPB(fs));
480 	iip = SEGSUM_IINFOSTART(fs, bp->b_data);
481 	if (flags & CHECK_CKSUM) {
482 		/* Count blocks */
483 		nblocks = 0;
484 		fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)bp->b_data);
485 		for (i = 0; i < lfs_ss_getnfinfo(fs, ssp); ++i) {
486 			nblocks += lfs_fi_getnblocks(fs, fip);
487 			if (lfs_fi_getnblocks(fs, fip) <= 0)
488 				break;
489 			fip = NEXT_FINFO(fs, fip);
490 		}
491 		nblocks += ninos;
492 		/* Create the sum array */
493 		datap = dp = malloc(nblocks * sizeof(u_long),
494 				    M_SEGMENT, M_WAITOK);
495 	}
496 
497 	/* Handle individual blocks */
498 	fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)bp->b_data);
499 	for (i = 0; i < lfs_ss_getnfinfo(fs, ssp) || ninos; ++i) {
500 		/* Inode block? */
501 		if (ninos && lfs_ii_getblock(fs, iip) == offset) {
502 			if (flags & CHECK_CKSUM) {
503 				/* Read in the head and add to the buffer */
504 				error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getbsize(fs),
505 					      0, &dbp);
506 				if (error) {
507 					offset = -1;
508 					goto err2;
509 				}
510 				/* XXX this can't be right, on-disk u_long? */
511 				(*dp++) = ((u_long *)(dbp->b_data))[0];
512 				brelse(dbp, BC_AGE);
513 			}
514 			if (flags & CHECK_UPDATE) {
515 				if ((error = update_inoblk(fs, offset, cred, l))
516 				    != 0) {
517 					offset = -1;
518 					goto err2;
519 				}
520 			}
521 			offset += lfs_btofsb(fs, lfs_sb_getibsize(fs));
522 			iip = NEXTLOWER_IINFO(fs, iip);
523 			--ninos;
524 			--i; /* compensate for ++i in loop header */
525 			continue;
526 		}
527 		size = lfs_sb_getbsize(fs);
528 		for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) {
529 			if (j == lfs_fi_getnblocks(fs, fip) - 1)
530 				size = lfs_fi_getlastlength(fs, fip);
531 			if (flags & CHECK_CKSUM) {
532 				error = bread(devvp, LFS_FSBTODB(fs, offset), size,
533 				    0, &dbp);
534 				if (error) {
535 					offset = -1;
536 					goto err2;
537 				}
538 				(*dp++) = ((u_long *)(dbp->b_data))[0];
539 				brelse(dbp, BC_AGE);
540 			}
541 			/* Account for and update any direct blocks */
542 			if ((flags & CHECK_UPDATE) &&
543 			   lfs_fi_getino(fs, fip) > LFS_IFILE_INUM &&
544 			   lfs_fi_getblock(fs, fip, j) >= 0) {
545 				update_meta(fs, lfs_fi_getino(fs, fip),
546 					    lfs_fi_getversion(fs, fip),
547 					    lfs_fi_getblock(fs, fip, j),
548 					    offset, size, l);
549 			}
550 			offset += lfs_btofsb(fs, size);
551 		}
552 		fip = NEXT_FINFO(fs, fip);
553 	}
554 	/* Checksum the array, compare */
555 	datasum = lfs_ss_getdatasum(fs, ssp);
556 	foundsum = cksum(datap, nblocks * sizeof(u_long));
557 	if ((flags & CHECK_CKSUM) && datasum != foundsum) {
558 		DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64
559 		      " (wanted %x got %x)\n",
560 		      offset, datasum, foundsum));
561 		offset = -1;
562 		goto err2;
563 	}
564 
565 	/* If we're at the end of the segment, move to the next */
566 	if (lfs_dtosn(fs, offset + lfs_btofsb(fs, lfs_sb_getsumsize(fs) + lfs_sb_getbsize(fs))) !=
567 	   lfs_dtosn(fs, offset)) {
568 		if (lfs_dtosn(fs, offset) == lfs_dtosn(fs, lfs_ss_getnext(fs, ssp))) {
569 			offset = -1;
570 			goto err2;
571 		}
572 		offset = lfs_ss_getnext(fs, ssp);
573 		DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64
574 		       " -> segment %d\n", offset, lfs_dtosn(fs,offset)));
575 	}
576 
577 	if (flags & CHECK_UPDATE) {
578 		lfs_sb_subavail(fs, offset - oldoffset);
579 		/* Don't clog the buffer queue */
580 		mutex_enter(&lfs_lock);
581 		if (locked_queue_count > LFS_MAX_BUFS ||
582 		    locked_queue_bytes > LFS_MAX_BYTES) {
583 			lfs_flush(fs, SEGM_CKP, 0);
584 		}
585 		mutex_exit(&lfs_lock);
586 	}
587 
588     err2:
589 	if (flags & CHECK_CKSUM)
590 		free(datap, M_SEGMENT);
591     err1:
592 	brelse(bp, BC_AGE);
593 
594 	/* XXX should we update the serial number even for bad psegs? */
595 	if ((flags & CHECK_UPDATE) && offset > 0 && lfs_sb_getversion(fs) > 1)
596 		lfs_sb_setserial(fs, nextserial);
597 	return offset;
598 }
599 
600 void
lfs_roll_forward(struct lfs * fs,struct mount * mp,struct lwp * l)601 lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l)
602 {
603 	int flags, dirty;
604 	daddr_t offset, oldoffset, lastgoodpseg;
605 	int sn, curseg, do_rollforward;
606 	struct proc *p;
607 	kauth_cred_t cred;
608 	SEGUSE *sup;
609 	struct buf *bp;
610 
611 	p = l ? l->l_proc : NULL;
612 	cred = p ? p->p_cred : NOCRED;
613 
614 	/*
615 	 * Roll forward.
616 	 *
617 	 * We don't roll forward for v1 filesystems, because
618 	 * of the danger that the clock was turned back between the last
619 	 * checkpoint and crash.  This would roll forward garbage.
620 	 *
621 	 * v2 filesystems don't have this problem because they use a
622 	 * monotonically increasing serial number instead of a timestamp.
623 	 */
624 	do_rollforward = (!(lfs_sb_getpflags(fs) & LFS_PF_CLEAN) &&
625 			  lfs_do_rfw && lfs_sb_getversion(fs) > 1 && p != NULL);
626 	if (do_rollforward) {
627 		u_int64_t nextserial;
628 		/*
629 		 * Phase I: Find the address of the last good partial
630 		 * segment that was written after the checkpoint.  Mark
631 		 * the segments in question dirty, so they won't be
632 		 * reallocated.
633 		 */
634 		lastgoodpseg = oldoffset = offset = lfs_sb_getoffset(fs);
635 		flags = 0x0;
636 		DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%"
637 		      PRIx64 "\n", offset));
638 		LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
639 		if (!(sup->su_flags & SEGUSE_DIRTY))
640 			lfs_sb_subnclean(fs, 1);
641 		sup->su_flags |= SEGUSE_DIRTY;
642 		LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
643 		nextserial = lfs_sb_getserial(fs) + 1;
644 		while ((offset = check_segsum(fs, offset, nextserial,
645 		    cred, CHECK_CKSUM, &flags, l)) > 0) {
646 			nextserial++;
647 			if (lfs_sntod(fs, oldoffset) != lfs_sntod(fs, offset)) {
648 				LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, oldoffset),
649 					     bp);
650 				if (!(sup->su_flags & SEGUSE_DIRTY))
651 					lfs_sb_subnclean(fs, 1);
652 				sup->su_flags |= SEGUSE_DIRTY;
653 				LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, oldoffset),
654 					     bp);
655 			}
656 
657 			DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%"
658 			      PRIx64 "\n", offset));
659 			if (flags & SS_DIROP) {
660 				DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%"
661 				      PRIx64 "\n", oldoffset));
662 				if (!(flags & SS_CONT)) {
663 				     DLOG((DLOG_RF, "lfs_mountfs: dirops end "
664 					   "at 0x%" PRIx64 "\n", oldoffset));
665 				}
666 			}
667 			if (!(flags & SS_CONT))
668 				lastgoodpseg = offset;
669 			oldoffset = offset;
670 		}
671 		if (flags & SS_CONT) {
672 			DLOG((DLOG_RF, "LFS roll forward: warning: incomplete "
673 			      "dirops discarded\n"));
674 		}
675 		DLOG((DLOG_RF, "LFS roll forward phase 1: completed: "
676 		      "lastgoodpseg=0x%" PRIx64 "\n", lastgoodpseg));
677 		oldoffset = lfs_sb_getoffset(fs);
678 		if (lfs_sb_getoffset(fs) != lastgoodpseg) {
679 			/* Don't overwrite what we're trying to preserve */
680 			offset = lfs_sb_getoffset(fs);
681 			lfs_sb_setoffset(fs, lastgoodpseg);
682 			lfs_sb_setcurseg(fs, lfs_sntod(fs, lfs_dtosn(fs, lfs_sb_getoffset(fs))));
683 			for (sn = curseg = lfs_dtosn(fs, lfs_sb_getcurseg(fs));;) {
684 				sn = (sn + 1) % lfs_sb_getnseg(fs);
685 				if (sn == curseg)
686 					panic("lfs_mountfs: no clean segments");
687 				LFS_SEGENTRY(sup, fs, sn, bp);
688 				dirty = (sup->su_flags & SEGUSE_DIRTY);
689 				brelse(bp, 0);
690 				if (!dirty)
691 					break;
692 			}
693 			lfs_sb_setnextseg(fs, lfs_sntod(fs, sn));
694 
695 			/*
696 			 * Phase II: Roll forward from the first superblock.
697 			 */
698 			while (offset != lastgoodpseg) {
699 				DLOG((DLOG_RF, "LFS roll forward phase 2: 0x%"
700 				      PRIx64 "\n", offset));
701 				offset = check_segsum(fs, offset,
702 				    lfs_sb_getserial(fs) + 1, cred, CHECK_UPDATE,
703 				    NULL, l);
704 			}
705 
706 			/*
707 			 * Finish: flush our changes to disk.
708 			 */
709 			lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
710 			DLOG((DLOG_RF, "lfs_mountfs: roll forward ",
711 			      "recovered %jd blocks\n",
712 			      (intmax_t)(lastgoodpseg - oldoffset)));
713 		}
714 		DLOG((DLOG_RF, "LFS roll forward complete\n"));
715 	}
716 }
717