xref: /netbsd-src/sys/ufs/ffs/ffs_snapshot.c (revision c34236556bea94afcaca1782d7d228301edc3ea0)
1 /*	$NetBSD: ffs_snapshot.c,v 1.143 2016/10/28 20:38:12 jdolecek Exp $	*/
2 
3 /*
4  * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
5  *
6  * Further information about snapshots can be obtained from:
7  *
8  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
9  *	1614 Oxford Street		mckusick@mckusick.com
10  *	Berkeley, CA 94709-1608		+1-510-843-9542
11  *	USA
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  *
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  *
23  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
24  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
25  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
27  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
36  *
37  *	from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
38  */
39 
40 #include <sys/cdefs.h>
41 __KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.143 2016/10/28 20:38:12 jdolecek Exp $");
42 
43 #if defined(_KERNEL_OPT)
44 #include "opt_ffs.h"
45 #include "opt_quota.h"
46 #endif
47 
48 #include <sys/param.h>
49 #include <sys/kernel.h>
50 #include <sys/systm.h>
51 #include <sys/conf.h>
52 #include <sys/buf.h>
53 #include <sys/proc.h>
54 #include <sys/namei.h>
55 #include <sys/sched.h>
56 #include <sys/stat.h>
57 #include <sys/malloc.h>
58 #include <sys/mount.h>
59 #include <sys/resource.h>
60 #include <sys/resourcevar.h>
61 #include <sys/vnode.h>
62 #include <sys/kauth.h>
63 #include <sys/fstrans.h>
64 #include <sys/wapbl.h>
65 
66 #include <miscfs/specfs/specdev.h>
67 
68 #include <ufs/ufs/quota.h>
69 #include <ufs/ufs/ufsmount.h>
70 #include <ufs/ufs/inode.h>
71 #include <ufs/ufs/ufs_extern.h>
72 #include <ufs/ufs/ufs_bswap.h>
73 #include <ufs/ufs/ufs_wapbl.h>
74 
75 #include <ufs/ffs/fs.h>
76 #include <ufs/ffs/ffs_extern.h>
77 
78 #include <uvm/uvm.h>
79 
80 TAILQ_HEAD(inodelst, inode);			/* List of active snapshots */
81 
/*
 * Per-mount snapshot state.  One instance is allocated by
 * ffs_snapshot_init() and stored in ump->um_snapinfo; it is released
 * again by ffs_snapshot_fini().  ffs_snapshot() holds si_lock while it
 * changes si_snapshots, si_snapblklist and si_gen.
 */
82 struct snap_info {
83 	kmutex_t si_lock;			/* Lock this snapinfo */
84 	kmutex_t si_snaplock;			/* Snapshot vnode common lock */
85 	lwp_t *si_owner;			/* Snaplock owner */
86 	struct inodelst si_snapshots;		/* List of active snapshots */
87 	daddr_t *si_snapblklist;		/* Snapshot block hints list */
88 	uint32_t si_gen;			/* Incremented on change */
89 };
90 
91 #if !defined(FFS_NO_SNAPSHOT)
92 typedef int (*acctfunc_t)
93     (struct vnode *, void *, int, int, struct fs *, daddr_t, int);
94 
95 static int snapshot_setup(struct mount *, struct vnode *);
96 static int snapshot_copyfs(struct mount *, struct vnode *, void **);
97 static int snapshot_expunge(struct mount *, struct vnode *,
98     struct fs *, daddr_t *, daddr_t **);
99 static int snapshot_expunge_snap(struct mount *, struct vnode *,
100     struct fs *, daddr_t);
101 static int snapshot_writefs(struct mount *, struct vnode *, void *);
102 static int cgaccount(struct vnode *, int, int *);
103 static int cgaccount1(int, struct vnode *, void *, int);
104 static int expunge(struct vnode *, struct inode *, struct fs *,
105     acctfunc_t, int);
106 static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
107     daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int);
108 static int fullacct(struct vnode *, void *, int, int, struct fs *,
109     daddr_t, int);
110 static int snapacct(struct vnode *, void *, int, int, struct fs *,
111     daddr_t, int);
112 static int mapacct(struct vnode *, void *, int, int, struct fs *,
113     daddr_t, int);
114 #endif /* !defined(FFS_NO_SNAPSHOT) */
115 
116 static int ffs_copyonwrite(void *, struct buf *, bool);
117 static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
118 static int rwfsblk(struct vnode *, int, void *, daddr_t);
119 static int syncsnap(struct vnode *);
120 static int wrsnapblk(struct vnode *, void *, daddr_t);
121 #if !defined(FFS_NO_SNAPSHOT)
122 static int blocks_in_journal(struct fs *);
123 #endif
124 
125 static inline bool is_active_snapshot(struct snap_info *, struct inode *);
126 static inline daddr_t db_get(struct inode *, int);
127 static inline void db_assign(struct inode *, int, daddr_t);
128 static inline daddr_t ib_get(struct inode *, int);
129 static inline daddr_t idb_get(struct inode *, void *, int);
130 static inline void idb_assign(struct inode *, void *, int, daddr_t);
131 
132 #ifdef DEBUG
133 static int snapdebug = 0;
134 #endif
135 
136 int
137 ffs_snapshot_init(struct ufsmount *ump)
138 {
139 	struct snap_info *si;
140 
141 	si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
142 	if (si == NULL)
143 		return ENOMEM;
144 
145 	TAILQ_INIT(&si->si_snapshots);
146 	mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
147 	mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
148 	si->si_owner = NULL;
149 	si->si_gen = 0;
150 	si->si_snapblklist = NULL;
151 
152 	return 0;
153 }
154 
155 void
156 ffs_snapshot_fini(struct ufsmount *ump)
157 {
158 	struct snap_info *si;
159 
160 	si = ump->um_snapinfo;
161 	ump->um_snapinfo = NULL;
162 
163 	KASSERT(TAILQ_EMPTY(&si->si_snapshots));
164 	mutex_destroy(&si->si_lock);
165 	mutex_destroy(&si->si_snaplock);
166 	KASSERT(si->si_snapblklist == NULL);
167 	kmem_free(si, sizeof(*si));
168 }
169 
170 /*
171  * Create a snapshot file and initialize it for the filesystem.
172  * Vnode is locked on entry and return.
 *
 * mp is the mount to snapshot and vp the vnode that becomes the snapshot
 * file.  If ctime is not NULL it receives the time of the snapshot.
 *
 * If vp already is a snapshot the function returns 0 (reporting the inode
 * mtime through ctime), or EINVAL if that snapshot is still flagged
 * SF_SNAPINVAL.  ENOSPC is returned when the superblock has no free
 * fs_snapinum slot.  Otherwise the file is prepared and the cylinder
 * group maps are copied a first time while the file system is still
 * active; the remaining work (second cylinder group pass, superblock
 * copy, expunging, block lists) is done with the file system suspended.
 *
 * On success with a non-zero link count an extra reference is taken on
 * vp with vref().  On every failure that reaches the "out" label an
 * attempt is made to truncate the file back to zero length.
 *
 * Kernels built with FFS_NO_SNAPSHOT always return EOPNOTSUPP.
173  */
174 int
175 ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
176 {
177 #if defined(FFS_NO_SNAPSHOT)
178 	return EOPNOTSUPP;
179 }
180 #else /* defined(FFS_NO_SNAPSHOT) */
181 	bool suspended = false;
182 	int error, redo = 0, snaploc;
183 	void *sbbuf = NULL;
184 	daddr_t *snaplist = NULL, snaplistsize = 0;
185 	struct buf *bp, *nbp;
186 	struct fs *copy_fs = NULL;
187 	struct fs *fs = VFSTOUFS(mp)->um_fs;
188 	struct inode *ip = VTOI(vp);
189 	struct lwp *l = curlwp;
190 	struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
191 	struct timespec ts;
192 	struct timeval starttime;
193 #ifdef DEBUG
194 	struct timeval endtime;
195 #endif
196 	struct vnode *devvp = ip->i_devvp;
197 
198 	/*
199 	 * If the vnode already is a snapshot, return.
200 	 */
201 	if ((ip->i_flags & SF_SNAPSHOT)) {
202 		if ((ip->i_flags & SF_SNAPINVAL))
203 			return EINVAL;
204 		if (ctime) {
205 			ctime->tv_sec = DIP(ip, mtime);
206 			ctime->tv_nsec = DIP(ip, mtimensec);
207 		}
208 		return 0;
209 	}
210 	/*
211 	 * Check for free snapshot slot in the superblock.
212 	 */
213 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
214 		if (fs->fs_snapinum[snaploc] == 0)
215 			break;
216 	if (snaploc == FSMAXSNAP)
217 		return (ENOSPC);
218 	/*
219 	 * Prepare the vnode to become a snapshot.
220 	 */
221 	error = snapshot_setup(mp, vp);
222 	if (error)
223 		goto out;
224 
225 	/*
226 	 * Copy all the cylinder group maps. Although the
227 	 * filesystem is still active, we hope that only a few
228 	 * cylinder groups will change between now and when we
229 	 * suspend operations. Thus, we will be able to quickly
230 	 * touch up the few cylinder groups that changed during
231 	 * the suspension period.
232 	 */
233 	error = cgaccount(vp, 1, NULL);
234 	if (error)
235 		goto out;
236 
237 	/*
238 	 * snapshot is now valid
239 	 */
240 	ip->i_flags &= ~SF_SNAPINVAL;
241 	DIP_ASSIGN(ip, flags, ip->i_flags);
242 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
243 
244 	/*
245 	 * Ensure that the snapshot is completely on disk.
246 	 * Since we have marked it as a snapshot it is safe to
247 	 * unlock it as no process will be allowed to write to it.
248 	 */
249 	error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
250 	if (error)
251 		goto out;
	/* The vnode lock is dropped across vfs_suspend() and retaken after. */
252 	VOP_UNLOCK(vp);
253 	/*
254 	 * All allocations are done, so we can now suspend the filesystem.
255 	 */
256 	error = vfs_suspend(vp->v_mount, 0);
257 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
258 	if (error)
259 		goto out;
	/* From here on the "out" path has to resume the file system. */
260 	suspended = true;
261 	getmicrotime(&starttime);
262 	/*
263 	 * First, copy all the cylinder group maps that have changed.
264 	 */
265 	error = cgaccount(vp, 2, &redo);
266 	if (error)
267 		goto out;
268 	/*
269 	 * Create a copy of the superblock and its summary information.
270 	 */
271 	error = snapshot_copyfs(mp, vp, &sbbuf);
272 	if (error)
273 		goto out;
	/* The superblock copy sits at its in-block offset inside sbbuf. */
274 	copy_fs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
275 	/*
276 	 * Expunge unlinked files from our view.
277 	 */
278 	error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
279 	if (error)
280 		goto out;
281 	/*
282 	 * Record snapshot inode. Since this is the newest snapshot,
283 	 * it must be placed at the end of the list.
284 	 */
	/* An already unlinked snapshot file gets no superblock slot. */
285 	if (ip->i_nlink > 0)
286 		fs->fs_snapinum[snaploc] = ip->i_number;
287 
288 	mutex_enter(&si->si_lock);
289 	if (is_active_snapshot(si, ip))
290 		panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
291 	TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
292 	if (TAILQ_FIRST(&si->si_snapshots) == ip) {
293 		/*
294 		 * If this is the first snapshot on this filesystem, put the
295 		 * preliminary list in place and establish the cow handler.
296 		 */
297 		si->si_snapblklist = snaplist;
298 		fscow_establish(mp, ffs_copyonwrite, devvp);
299 	}
300 	si->si_gen++;
301 	mutex_exit(&si->si_lock);
302 
303 	vp->v_vflag |= VV_SYSTEM;
304 	/*
305 	 * Set the mtime to the time the snapshot has been taken.
306 	 */
307 	TIMEVAL_TO_TIMESPEC(&starttime, &ts);
308 	if (ctime)
309 		*ctime = ts;
310 	DIP_ASSIGN(ip, mtime, ts.tv_sec);
311 	DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
312 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
313 	/*
314 	 * Copy allocation information from all snapshots and then
315 	 * expunge them from our view.
316 	 */
317 	error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize);
318 	if (error)
319 		goto out;
320 	/*
321 	 * Write the superblock and its summary information to the snapshot.
322 	 */
323 	error = snapshot_writefs(mp, vp, sbbuf);
324 	if (error)
325 		goto out;
326 	/*
327 	 * We're nearly done, ensure that the snapshot is completely on disk.
328 	 */
329 	error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
330 	if (error)
331 		goto out;
332 	/*
333 	 * Invalidate and free all pages on the snapshot vnode.
334 	 * We will read and write through the buffercache.
335 	 */
	/*
	 * NOTE(review): v_interlock is taken here and never released in
	 * this function -- presumably VOP_PUTPAGES() drops it on every
	 * path; confirm against the VOP_PUTPAGES contract.
	 */
336 	mutex_enter(vp->v_interlock);
337 	error = VOP_PUTPAGES(vp, 0, 0,
338 		    PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE);
339 	if (error)
340 		goto out;
341 	/*
342 	 * Invalidate short ( < fs_bsize ) buffers.  We will always read
343 	 * full size buffers later.
344 	 */
345 	mutex_enter(&bufcache_lock);
346 	KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
347 	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
348 		nbp = LIST_NEXT(bp, b_vnbufs);
349 		if (bp->b_bcount == fs->fs_bsize)
350 			continue;
351 		error = bbusy(bp, false, 0, NULL);
352 		if (error != 0) {
			/* EPASSTHROUGH: rescan from the head of the list. */
353 			if (error == EPASSTHROUGH) {
354 				nbp = LIST_FIRST(&vp->v_cleanblkhd);
355 				continue;
356 			}
			/* Any other error ends the scan and is returned. */
357 			break;
358 		}
359 		brelsel(bp, BC_INVAL | BC_VFLUSH);
360 	}
361 	mutex_exit(&bufcache_lock);
362 
	/*
	 * Common exit for success and failure: release the temporary
	 * buffers, settle the block hint list, resume the file system if it
	 * was suspended and undo the snapshot on error.
	 */
363 out:
	/* sbbuf is only non-NULL once snapshot_copyfs() succeeded. */
364 	if (sbbuf != NULL) {
365 		free(copy_fs->fs_csp, M_UFSMNT);
366 		free(sbbuf, M_UFSMNT);
367 	}
	/* Drop the active cylinder group map allocated by cgaccount(). */
368 	if (fs->fs_active != NULL) {
369 		free(fs->fs_active, M_DEVBUF);
370 		fs->fs_active = NULL;
371 	}
372 
373 	mutex_enter(&si->si_lock);
	/*
	 * The preliminary list is always freed; if it was installed as the
	 * hint list above it is unhooked first.
	 */
374 	if (snaplist != NULL) {
375 		if (si->si_snapblklist == snaplist)
376 			si->si_snapblklist = NULL;
377 		free(snaplist, M_UFSMNT);
378 	}
379 	if (error) {
		/* Give the superblock slot back. */
380 		fs->fs_snapinum[snaploc] = 0;
381 	} else {
382 		/*
383 		 * As this is the newest list, it is the most inclusive, so
384 		 * should replace the previous list.
385 		 */
386 		si->si_snapblklist = ip->i_snapblklist;
387 	}
388 	si->si_gen++;
389 	mutex_exit(&si->si_lock);
390 
391 	if (suspended) {
		/* As for vfs_suspend(), resume with the vnode unlocked. */
392 		VOP_UNLOCK(vp);
393 		vfs_resume(vp->v_mount);
394 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
395 #ifdef DEBUG
396 		getmicrotime(&endtime);
397 		timersub(&endtime, &starttime, &endtime);
398 		printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
399 		    mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
400 		    endtime.tv_usec / 1000, redo, fs->fs_ncg);
401 #endif
402 	}
403 	if (error) {
404 		if (UFS_WAPBL_BEGIN(mp) == 0) {
405 			/*
406 			 * We depend on ffs_truncate() to call ffs_snapremove()
407 			 * before it may return an error. On failed
408 			 * ffs_truncate() we have normal file with leaked
409 			 * (meta-) data, but no snapshot to use.
410 			 */
411 			(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
412 			UFS_WAPBL_END(mp);
413 		}
	/* A linked, valid snapshot keeps an extra reference on its vnode. */
414 	} else if (ip->i_nlink > 0)
415 		vref(vp);
416 	return (error);
417 }
418 
419 /*
420  * Prepare vnode to become a snapshot.
421  */
422 static int
423 snapshot_setup(struct mount *mp, struct vnode *vp)
424 {
425 	int error, n, len, loc, cg;
426 	daddr_t blkno, numblks;
427 	struct buf *ibp, *nbp;
428 	struct fs *fs = VFSTOUFS(mp)->um_fs;
429 	struct lwp *l = curlwp;
430 	const int wbreak = blocks_in_journal(fs)/8;
431 	struct inode *ip = VTOI(vp);
432 
433 	/*
434 	 * Check mount, readonly reference and owner.
435 	 */
436 	if (vp->v_mount != mp)
437 		return EXDEV;
438 	if (vp->v_writecount != 0)
439 		return EBUSY;
440 	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_SNAPSHOT,
441 	    0, mp, vp, NULL);
442 	if (error)
443 		return EACCES;
444 
445 	if (vp->v_size != 0) {
446 		/*
447 		 * Must completely truncate the file here. Allocated
448 		 * blocks on a snapshot mean that block has been copied
449 		 * on write, see ffs_copyonwrite() testing "blkno != 0"
450 		 */
451 		error = ufs_truncate_retry(vp, 0, NOCRED);
452 		if (error)
453 			return error;
454 	}
455 
456 	/* Change inode to snapshot type file. */
457 	error = UFS_WAPBL_BEGIN(mp);
458 	if (error)
459 		return error;
460 #if defined(QUOTA) || defined(QUOTA2)
461 	/* shapshot inodes are not accounted in quotas */
462 	chkiq(ip, -1, l->l_cred, 0);
463 #endif
464 	ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL);
465 	DIP_ASSIGN(ip, flags, ip->i_flags);
466 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
467 	ffs_update(vp, NULL, NULL, UPDATE_WAIT);
468 	UFS_WAPBL_END(mp);
469 
470 	KASSERT(ip->i_flags & SF_SNAPSHOT);
471 	/*
472 	 * Write an empty list of preallocated blocks to the end of
473 	 * the snapshot to set size to at least that of the filesystem.
474 	 */
475 	numblks = howmany(fs->fs_size, fs->fs_frag);
476 	blkno = 1;
477 	blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
478 	error = vn_rdwr(UIO_WRITE, vp,
479 	    (void *)&blkno, sizeof(blkno), ffs_lblktosize(fs, (off_t)numblks),
480 	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
481 	if (error)
482 		return error;
483 	/*
484 	 * Preallocate critical data structures so that we can copy
485 	 * them in without further allocation after we suspend all
486 	 * operations on the filesystem. We would like to just release
487 	 * the allocated buffers without writing them since they will
488 	 * be filled in below once we are ready to go, but this upsets
489 	 * the soft update code, so we go ahead and write the new buffers.
490 	 *
491 	 * Allocate all indirect blocks and mark all of them as not
492 	 * needing to be copied.
493 	 */
494 	error = UFS_WAPBL_BEGIN(mp);
495 	if (error)
496 		return error;
497 	for (blkno = UFS_NDADDR, n = 0; blkno < numblks; blkno += FFS_NINDIR(fs)) {
498 		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
499 		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
500 		if (error)
501 			goto out;
502 		brelse(ibp, 0);
503 		if (wbreak > 0 && (++n % wbreak) == 0) {
504 			UFS_WAPBL_END(mp);
505 			error = UFS_WAPBL_BEGIN(mp);
506 			if (error)
507 				return error;
508 		}
509 	}
510 	/*
511 	 * Allocate copies for the superblock and its summary information.
512 	 */
513 	error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred,
514 	    0, &nbp);
515 	if (error)
516 		goto out;
517 	bawrite(nbp);
518 	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
519 	len = howmany(fs->fs_cssize, fs->fs_bsize);
520 	for (loc = 0; loc < len; loc++) {
521 		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(blkno + loc)),
522 		    fs->fs_bsize, l->l_cred, 0, &nbp);
523 		if (error)
524 			goto out;
525 		bawrite(nbp);
526 		if (wbreak > 0 && (++n % wbreak) == 0) {
527 			UFS_WAPBL_END(mp);
528 			error = UFS_WAPBL_BEGIN(mp);
529 			if (error)
530 				return error;
531 		}
532 	}
533 	/*
534 	 * Allocate all cylinder group blocks.
535 	 */
536 	for (cg = 0; cg < fs->fs_ncg; cg++) {
537 		error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
538 		    fs->fs_bsize, l->l_cred, 0, &nbp);
539 		if (error)
540 			goto out;
541 		bawrite(nbp);
542 		if (wbreak > 0 && (++n % wbreak) == 0) {
543 			UFS_WAPBL_END(mp);
544 			error = UFS_WAPBL_BEGIN(mp);
545 			if (error)
546 				return error;
547 		}
548 	}
549 
550 out:
551 	UFS_WAPBL_END(mp);
552 	return error;
553 }
554 
555 /*
556  * Create a copy of the superblock and its summary information.
557  * It is up to the caller to free copyfs and copy_fs->fs_csp.
558  */
559 static int
560 snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf)
561 {
562 	int error, i, len, loc, size;
563 	void *space;
564 	int32_t *lp;
565 	struct buf *bp;
566 	struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
567 	struct vnode *devvp = VTOI(vp)->i_devvp;
568 
569 	/*
570 	 * Grab a copy of the superblock and its summary information.
571 	 * We delay writing it until the suspension is released below.
572 	 */
573 	*sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
574 	loc = ffs_blkoff(fs, fs->fs_sblockloc);
575 	if (loc > 0)
576 		memset(*sbbuf, 0, loc);
577 	copyfs = (struct fs *)((char *)(*sbbuf) + loc);
578 	memcpy(copyfs, fs, fs->fs_sbsize);
579 	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
580 	if (fs->fs_sbsize < size)
581 		memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0,
582 		    size - fs->fs_sbsize);
583 	size = ffs_blkroundup(fs, fs->fs_cssize);
584 	if (fs->fs_contigsumsize > 0)
585 		size += fs->fs_ncg * sizeof(int32_t);
586 	space = malloc(size, M_UFSMNT, M_WAITOK);
587 	copyfs->fs_csp = space;
588 	memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize);
589 	space = (char *)space + fs->fs_cssize;
590 	loc = howmany(fs->fs_cssize, fs->fs_fsize);
591 	i = fs->fs_frag - loc % fs->fs_frag;
592 	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
593 	if (len > 0) {
594 		if ((error = bread(devvp, FFS_FSBTODB(fs, fs->fs_csaddr + loc),
595 		    len, 0, &bp)) != 0) {
596 			free(copyfs->fs_csp, M_UFSMNT);
597 			free(*sbbuf, M_UFSMNT);
598 			*sbbuf = NULL;
599 			return error;
600 		}
601 		memcpy(space, bp->b_data, (u_int)len);
602 		space = (char *)space + len;
603 		brelse(bp, BC_INVAL | BC_NOCACHE);
604 	}
605 	if (fs->fs_contigsumsize > 0) {
606 		copyfs->fs_maxcluster = lp = space;
607 		for (i = 0; i < fs->fs_ncg; i++)
608 			*lp++ = fs->fs_contigsumsize;
609 	}
610 	if (mp->mnt_wapbl)
611 		copyfs->fs_flags &= ~FS_DOWAPBL;
612 	return 0;
613 }
614 
/*
 * Arguments for snapshot_expunge_selector().  snapshot_expunge() fills
 * one in and hands it to vfs_vnode_iterator_next() as opaque pointer.
 */
615 struct snapshot_expunge_ctx {
616 	struct vnode *logvp;	/* In-filesystem journal vnode or NULL */
617 	struct lwp *l;		/* Caller, supplies the credentials */
618 	struct vnode *vp;	/* Snapshot vnode being created */
619 	struct fs *copy_fs;	/* Superblock copy of the snapshot */
620 };
621 
622 static bool
623 snapshot_expunge_selector(void *cl, struct vnode *xvp)
624 {
625 	struct vattr vat;
626 	struct snapshot_expunge_ctx *c = cl;
627 	struct inode *xp;
628 
629 	xp = VTOI(xvp);
630 	if (xvp->v_type == VNON || VTOI(xvp) == NULL ||
631 	    (xp->i_flags & SF_SNAPSHOT))
632 		return false;
633 #ifdef DEBUG
634 	if (snapdebug)
635 		vprint("ffs_snapshot: busy vnode", xvp);
636 #endif
637 
638 	if (xvp == c->logvp)
639 		return true;
640 
641 	if (VOP_GETATTR(xvp, &vat, c->l->l_cred) == 0 &&
642 	    vat.va_nlink > 0)
643 		return false;
644 
645 	if (ffs_checkfreefile(c->copy_fs, c->vp, xp->i_number))
646 		return false;
647 
648 	return true;
649 }
650 
651 /*
652  * We must check for active files that have been unlinked (e.g., with a zero
653  * link count). We have to expunge all trace of these files from the snapshot
654  * so that they are not reclaimed prematurely by fsck or unnecessarily dumped.
655  * Note that we skip unlinked snapshot files as they will be handled separately.
656  * Calculate the snapshot list size and create a preliminary list.
657  */
658 static int
659 snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs,
660     daddr_t *snaplistsize, daddr_t **snaplist)
661 {
662 	int cg, error = 0, len, loc;
663 	daddr_t blkno, *blkp;
664 	struct fs *fs = VFSTOUFS(mp)->um_fs;
665 	struct inode *xp;
666 	struct lwp *l = curlwp;
667 	struct vnode *logvp = NULL, *xvp;
668 	struct vnode_iterator *marker;
669 	struct snapshot_expunge_ctx ctx;
670 
671 	*snaplist = NULL;
672 	/*
673 	 * Get the log inode if any.
674 	 */
675 	if ((fs->fs_flags & FS_DOWAPBL) &&
676 	    fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
677 		error = VFS_VGET(mp,
678 		    fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp);
679 		if (error)
680 			goto out;
681 	}
682 	/*
683 	 * We also calculate the needed size for the snapshot list.
684 	 */
685 	*snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
686 	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
687 
688 	vfs_vnode_iterator_init(mp, &marker);
689 	ctx.logvp = logvp;
690 	ctx.l = l;
691 	ctx.vp = vp;
692 	ctx.copy_fs = copy_fs;
693 	while ((xvp = vfs_vnode_iterator_next(marker, snapshot_expunge_selector,
694 	    &ctx)))
695 	{
696 		/*
697 		 * If there is a fragment, clear it here.
698 		 */
699 		xp = VTOI(xvp);
700 		blkno = 0;
701 		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
702 		if (loc < UFS_NDADDR) {
703 			len = ffs_fragroundup(fs, ffs_blkoff(fs, xp->i_size));
704 			if (len > 0 && len < fs->fs_bsize) {
705 				error = UFS_WAPBL_BEGIN(mp);
706 				if (error) {
707 					vrele(xvp);
708 					vfs_vnode_iterator_destroy(marker);
709 					goto out;
710 				}
711 				ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc),
712 				    len, xp->i_number);
713 				blkno = db_get(xp, loc);
714 				db_assign(xp, loc, 0);
715 				UFS_WAPBL_END(mp);
716 			}
717 		}
718 		*snaplistsize += 1;
719 		error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
720 		if (blkno)
721 			db_assign(xp, loc, blkno);
722 		if (!error) {
723 			error = UFS_WAPBL_BEGIN(mp);
724 			if (!error) {
725 				error = ffs_freefile_snap(copy_fs, vp,
726 				    xp->i_number, xp->i_mode);
727 				UFS_WAPBL_END(mp);
728 			}
729 		}
730 		vrele(xvp);
731 		if (error) {
732 			vfs_vnode_iterator_destroy(marker);
733 			goto out;
734 		}
735 	}
736 	vfs_vnode_iterator_destroy(marker);
737 
738 	/*
739 	 * Create a preliminary list of preallocated snapshot blocks.
740 	 */
741 	*snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
742 	blkp = &(*snaplist)[1];
743 	*blkp++ = ffs_lblkno(fs, fs->fs_sblockloc);
744 	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
745 	for (cg = 0; cg < fs->fs_ncg; cg++) {
746 		if (ffs_fragstoblks(fs, cgtod(fs, cg)) > blkno)
747 			break;
748 		*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
749 	}
750 	len = howmany(fs->fs_cssize, fs->fs_bsize);
751 	for (loc = 0; loc < len; loc++)
752 		*blkp++ = blkno + loc;
753 	for (; cg < fs->fs_ncg; cg++)
754 		*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
755 	(*snaplist)[0] = blkp - &(*snaplist)[0];
756 
757 out:
758 	if (logvp != NULL)
759 		vput(logvp);
760 	if (error && *snaplist != NULL) {
761 		free(*snaplist, M_UFSMNT);
762 		*snaplist = NULL;
763 	}
764 
765 	return error;
766 }
767 
768 /*
769  * Copy allocation information from all the snapshots in this snapshot and
770  * then expunge them from its view. Also, collect the list of allocated
771  * blocks in i_snapblklist.
772  */
773 static int
774 snapshot_expunge_snap(struct mount *mp, struct vnode *vp,
775     struct fs *copy_fs, daddr_t snaplistsize)
776 {
777 	int error = 0, i;
778 	daddr_t numblks, *snaplist = NULL;
779 	struct fs *fs = VFSTOUFS(mp)->um_fs;
780 	struct inode *ip = VTOI(vp), *xp;
781 	struct lwp *l = curlwp;
782 	struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
783 
784 	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
785 		if (xp != ip) {
786 			error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
787 			if (error)
788 				break;
789 		}
790 		if (xp->i_nlink != 0)
791 			continue;
792 		error = UFS_WAPBL_BEGIN(mp);
793 		if (error)
794 			break;
795 		error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode);
796 		UFS_WAPBL_END(mp);
797 		if (error)
798 			break;
799 	}
800 	if (error)
801 		goto out;
802 	/*
803 	 * Allocate space for the full list of preallocated snapshot blocks.
804 	 */
805 	snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
806 	ip->i_snapblklist = &snaplist[1];
807 	/*
808 	 * Expunge the blocks used by the snapshots from the set of
809 	 * blocks marked as used in the snapshot bitmaps. Also, collect
810 	 * the list of allocated blocks in i_snapblklist.
811 	 */
812 	error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
813 	if (error)
814 		goto out;
815 	if (snaplistsize < ip->i_snapblklist - snaplist)
816 		panic("ffs_snapshot: list too small");
817 	snaplistsize = ip->i_snapblklist - snaplist;
818 	snaplist[0] = snaplistsize;
819 	ip->i_snapblklist = &snaplist[0];
820 	/*
821 	 * Write out the list of allocated blocks to the end of the snapshot.
822 	 */
823 	numblks = howmany(fs->fs_size, fs->fs_frag);
824 	for (i = 0; i < snaplistsize; i++)
825 		snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
826 	error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist,
827 	    snaplistsize * sizeof(daddr_t), ffs_lblktosize(fs, (off_t)numblks),
828 	    UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL);
829 	for (i = 0; i < snaplistsize; i++)
830 		snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
831 out:
832 	if (error && snaplist != NULL) {
833 		free(snaplist, M_UFSMNT);
834 		ip->i_snapblklist = NULL;
835 	}
836 	return error;
837 }
838 
839 /*
840  * Write the superblock and its summary information to the snapshot.
841  * Make sure, the first UFS_NDADDR blocks get copied to the snapshot.
842  */
843 static int
844 snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf)
845 {
846 	int error, len, loc;
847 	void *space;
848 	daddr_t blkno;
849 	struct buf *bp;
850 	struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
851 	struct inode *ip = VTOI(vp);
852 	struct lwp *l = curlwp;
853 
854 	copyfs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
855 
856 	/*
857 	 * Write the superblock and its summary information
858 	 * to the snapshot.
859 	 */
860 	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
861 	len = howmany(fs->fs_cssize, fs->fs_bsize);
862 	space = copyfs->fs_csp;
863 #ifdef FFS_EI
864 	if (UFS_FSNEEDSWAP(fs)) {
865 		ffs_sb_swap(copyfs, copyfs);
866 		ffs_csum_swap(space, space, fs->fs_cssize);
867 	}
868 #endif
869 	error = UFS_WAPBL_BEGIN(mp);
870 	if (error)
871 		return error;
872 	for (loc = 0; loc < len; loc++) {
873 		error = bread(vp, blkno + loc, fs->fs_bsize,
874 		    B_MODIFY, &bp);
875 		if (error) {
876 			break;
877 		}
878 		memcpy(bp->b_data, space, fs->fs_bsize);
879 		space = (char *)space + fs->fs_bsize;
880 		bawrite(bp);
881 	}
882 	if (error)
883 		goto out;
884 	error = bread(vp, ffs_lblkno(fs, fs->fs_sblockloc),
885 	    fs->fs_bsize, B_MODIFY, &bp);
886 	if (error) {
887 		goto out;
888 	} else {
889 		memcpy(bp->b_data, sbbuf, fs->fs_bsize);
890 		bawrite(bp);
891 	}
892 	/*
893 	 * Copy the first UFS_NDADDR blocks to the snapshot so
894 	 * ffs_copyonwrite() and ffs_snapblkfree() will always work on
895 	 * indirect blocks.
896 	 */
897 	for (loc = 0; loc < UFS_NDADDR; loc++) {
898 		if (db_get(ip, loc) != 0)
899 			continue;
900 		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)loc),
901 		    fs->fs_bsize, l->l_cred, 0, &bp);
902 		if (error)
903 			break;
904 		error = rwfsblk(vp, B_READ, bp->b_data, loc);
905 		if (error) {
906 			brelse(bp, 0);
907 			break;
908 		}
909 		bawrite(bp);
910 	}
911 
912 out:
913 	UFS_WAPBL_END(mp);
914 	return error;
915 }
916 
917 /*
918  * Copy all cylinder group maps.
919  */
920 static int
921 cgaccount(struct vnode *vp, int passno, int *redo)
922 {
923 	int cg, error = 0;
924 	struct buf *nbp;
925 	struct fs *fs = VTOI(vp)->i_fs;
926 
927 	if (redo != NULL)
928 		*redo = 0;
929 	if (passno == 1)
930 		fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY),
931 		    M_DEVBUF, M_WAITOK | M_ZERO);
932 	for (cg = 0; cg < fs->fs_ncg; cg++) {
933 		if (passno == 2 && ACTIVECG_ISSET(fs, cg))
934 			continue;
935 
936 		if (redo != NULL)
937 			*redo += 1;
938 		error = UFS_WAPBL_BEGIN(vp->v_mount);
939 		if (error)
940 			return error;
941 		error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
942 		    fs->fs_bsize, curlwp->l_cred, 0, &nbp);
943 		if (error) {
944 			UFS_WAPBL_END(vp->v_mount);
945 			break;
946 		}
947 		error = cgaccount1(cg, vp, nbp->b_data, passno);
948 		bawrite(nbp);
949 		UFS_WAPBL_END(vp->v_mount);
950 		if (error)
951 			break;
952 	}
953 	return error;
954 }
955 
956 /*
957  * Copy a cylinder group map. All the unallocated blocks are marked
958  * BLK_NOCOPY so that the snapshot knows that it need not copy them
959  * if they are later written. If passno is one, then this is a first
960  * pass, so only setting needs to be done. If passno is 2, then this
961  * is a revision to a previous pass which must be undone as the
962  * replacement pass is done.
963  */
964 static int
965 cgaccount1(int cg, struct vnode *vp, void *data, int passno)
966 {
967 	struct buf *bp, *ibp;
968 	struct inode *ip;
969 	struct cg *cgp;
970 	struct fs *fs;
971 	struct lwp *l = curlwp;
972 	daddr_t base, numblks;
973 	int error, len, loc, ns __unused, indiroff;
974 
975 	ip = VTOI(vp);
976 	fs = ip->i_fs;
977 	ns = UFS_FSNEEDSWAP(fs);
978 	error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
979 		(int)fs->fs_cgsize, 0, &bp);
980 	if (error) {
981 		return (error);
982 	}
983 	cgp = (struct cg *)bp->b_data;
984 	if (!cg_chkmagic(cgp, ns)) {
985 		brelse(bp, 0);
986 		return (EIO);
987 	}
988 	ACTIVECG_SET(fs, cg);
989 
990 	memcpy(data, bp->b_data, fs->fs_cgsize);
991 	brelse(bp, 0);
992 	if (fs->fs_cgsize < fs->fs_bsize)
993 		memset((char *)data + fs->fs_cgsize, 0,
994 		    fs->fs_bsize - fs->fs_cgsize);
995 	numblks = howmany(fs->fs_size, fs->fs_frag);
996 	len = howmany(fs->fs_fpg, fs->fs_frag);
997 	base = cg * fs->fs_fpg / fs->fs_frag;
998 	if (base + len >= numblks)
999 		len = numblks - base - 1;
1000 	loc = 0;
1001 	if (base < UFS_NDADDR) {
1002 		for ( ; loc < UFS_NDADDR; loc++) {
1003 			if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
1004 				db_assign(ip, loc, BLK_NOCOPY);
1005 			else if (db_get(ip, loc) == BLK_NOCOPY) {
1006 				if (passno == 2)
1007 					db_assign(ip, loc, 0);
1008 				else if (passno == 1)
1009 					panic("ffs_snapshot: lost direct block");
1010 			}
1011 		}
1012 	}
1013 	if ((error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(base + loc)),
1014 	    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
1015 		return (error);
1016 	indiroff = (base + loc - UFS_NDADDR) % FFS_NINDIR(fs);
1017 	for ( ; loc < len; loc++, indiroff++) {
1018 		if (indiroff >= FFS_NINDIR(fs)) {
1019 			bawrite(ibp);
1020 			if ((error = ffs_balloc(vp,
1021 			    ffs_lblktosize(fs, (off_t)(base + loc)),
1022 			    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
1023 				return (error);
1024 			indiroff = 0;
1025 		}
1026 		if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
1027 			idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
1028 		else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
1029 			if (passno == 2)
1030 				idb_assign(ip, ibp->b_data, indiroff, 0);
1031 			else if (passno == 1)
1032 				panic("ffs_snapshot: lost indirect block");
1033 		}
1034 	}
1035 	bdwrite(ibp);
1036 	return (0);
1037 }
1038 
1039 /*
1040  * Before expunging a snapshot inode, note all the
1041  * blocks that it claims with BLK_SNAP so that fsck will
1042  * be able to account for those blocks properly and so
1043  * that this snapshot knows that it need not copy them
1044  * if the other snapshot holding them is freed.
1045  */
1046 static int
1047 expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
1048     acctfunc_t acctfunc, int expungetype)
1049 {
1050 	int i, error, ns __unused;
1051 	daddr_t lbn, rlbn;
1052 	daddr_t len, blkno, numblks, blksperindir;
1053 	struct ufs1_dinode *dip1;
1054 	struct ufs2_dinode *dip2;
1055 	struct lwp *l = curlwp;
1056 	void *bap;
1057 	struct buf *bp;
1058 	struct mount *mp;
1059 
1060 	ns = UFS_FSNEEDSWAP(fs);
1061 	mp = snapvp->v_mount;
1062 
1063 	error = UFS_WAPBL_BEGIN(mp);
1064 	if (error)
1065 		return error;
1066 	/*
1067 	 * Prepare to expunge the inode. If its inode block has not
1068 	 * yet been copied, then allocate and fill the copy.
1069 	 */
1070 	lbn = ffs_fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
1071 	error = snapblkaddr(snapvp, lbn, &blkno);
1072 	if (error)
1073 		return error;
1074 	if (blkno != 0) {
1075 		error = bread(snapvp, lbn, fs->fs_bsize,
1076 		    B_MODIFY, &bp);
1077 	} else {
1078 		error = ffs_balloc(snapvp, ffs_lblktosize(fs, (off_t)lbn),
1079 		    fs->fs_bsize, l->l_cred, 0, &bp);
1080 		if (! error)
1081 			error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
1082 	}
1083 	if (error) {
1084 		UFS_WAPBL_END(mp);
1085 		return error;
1086 	}
1087 	/*
1088 	 * Set a snapshot inode to be a zero length file, regular files
1089 	 * or unlinked snapshots to be completely unallocated.
1090 	 */
1091 	if (fs->fs_magic == FS_UFS1_MAGIC) {
1092 		dip1 = (struct ufs1_dinode *)bp->b_data +
1093 		    ino_to_fsbo(fs, cancelip->i_number);
1094 		if (cancelip->i_flags & SF_SNAPSHOT) {
1095 			dip1->di_flags =
1096 			    ufs_rw32(ufs_rw32(dip1->di_flags, ns) |
1097 			    SF_SNAPINVAL, ns);
1098 		}
1099 		if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
1100 			dip1->di_mode = 0;
1101 		dip1->di_size = 0;
1102 		dip1->di_blocks = 0;
1103 		memset(&dip1->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int32_t));
1104 	} else {
1105 		dip2 = (struct ufs2_dinode *)bp->b_data +
1106 		    ino_to_fsbo(fs, cancelip->i_number);
1107 		if (cancelip->i_flags & SF_SNAPSHOT) {
1108 			dip2->di_flags =
1109 			    ufs_rw32(ufs_rw32(dip2->di_flags, ns) |
1110 			    SF_SNAPINVAL, ns);
1111 		}
1112 		if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
1113 			dip2->di_mode = 0;
1114 		dip2->di_size = 0;
1115 		dip2->di_blocks = 0;
1116 		memset(&dip2->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int64_t));
1117 	}
1118 	bdwrite(bp);
1119 	UFS_WAPBL_END(mp);
1120 	/*
1121 	 * Now go through and expunge all the blocks in the file
1122 	 * using the function requested.
1123 	 */
1124 	numblks = howmany(cancelip->i_size, fs->fs_bsize);
1125 	if (fs->fs_magic == FS_UFS1_MAGIC)
1126 		bap = &cancelip->i_ffs1_db[0];
1127 	else
1128 		bap = &cancelip->i_ffs2_db[0];
1129 	error = (*acctfunc)(snapvp, bap, 0, UFS_NDADDR, fs, 0, expungetype);
1130 	if (error)
1131 		return (error);
1132 	if (fs->fs_magic == FS_UFS1_MAGIC)
1133 		bap = &cancelip->i_ffs1_ib[0];
1134 	else
1135 		bap = &cancelip->i_ffs2_ib[0];
1136 	error = (*acctfunc)(snapvp, bap, 0, UFS_NIADDR, fs, -1, expungetype);
1137 	if (error)
1138 		return (error);
1139 	blksperindir = 1;
1140 	lbn = -UFS_NDADDR;
1141 	len = numblks - UFS_NDADDR;
1142 	rlbn = UFS_NDADDR;
1143 	for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
1144 		error = indiracct(snapvp, ITOV(cancelip), i,
1145 		    ib_get(cancelip, i), lbn, rlbn, len,
1146 		    blksperindir, fs, acctfunc, expungetype);
1147 		if (error)
1148 			return (error);
1149 		blksperindir *= FFS_NINDIR(fs);
1150 		lbn -= blksperindir + 1;
1151 		len -= blksperindir;
1152 		rlbn += blksperindir;
1153 	}
1154 	return (0);
1155 }
1156 
1157 /*
1158  * Descend an indirect block chain for vnode cancelvp accounting for all
1159  * its indirect blocks in snapvp.
1160  */
1161 static int
1162 indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level,
1163     daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks,
1164     daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype)
1165 {
1166 	int error, num, i;
1167 	daddr_t subblksperindir;
1168 	struct indir indirs[UFS_NIADDR + 2];
1169 	daddr_t last;
1170 	void *bap;
1171 	struct buf *bp;
1172 
1173 	if (blkno == 0) {
1174 		if (expungetype == BLK_NOCOPY)
1175 			return (0);
1176 		panic("indiracct: missing indir");
1177 	}
1178 	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
1179 		return (error);
1180 	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
1181 		panic("indiracct: botched params");
1182 	/*
1183 	 * We have to expand bread here since it will deadlock looking
1184 	 * up the block number for any blocks that are not in the cache.
1185 	 */
1186 	error = ffs_getblk(cancelvp, lbn, FFS_FSBTODB(fs, blkno), fs->fs_bsize,
1187 	    false, &bp);
1188 	if (error)
1189 		return error;
1190 	if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
1191 	    rwfsblk(bp->b_vp, B_READ, bp->b_data, ffs_fragstoblks(fs, blkno)))) {
1192 		brelse(bp, 0);
1193 		return (error);
1194 	}
1195 	/*
1196 	 * Account for the block pointers in this indirect block.
1197 	 */
1198 	last = howmany(remblks, blksperindir);
1199 	if (last > FFS_NINDIR(fs))
1200 		last = FFS_NINDIR(fs);
1201 	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO);
1202 	memcpy((void *)bap, bp->b_data, fs->fs_bsize);
1203 	brelse(bp, 0);
1204 	error = (*acctfunc)(snapvp, bap, 0, last,
1205 	    fs, level == 0 ? rlbn : -1, expungetype);
1206 	if (error || level == 0)
1207 		goto out;
1208 	/*
1209 	 * Account for the block pointers in each of the indirect blocks
1210 	 * in the levels below us.
1211 	 */
1212 	subblksperindir = blksperindir / FFS_NINDIR(fs);
1213 	for (lbn++, level--, i = 0; i < last; i++) {
1214 		error = indiracct(snapvp, cancelvp, level,
1215 		    idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks,
1216 		    subblksperindir, fs, acctfunc, expungetype);
1217 		if (error)
1218 			goto out;
1219 		rlbn += blksperindir;
1220 		lbn -= blksperindir;
1221 		remblks -= blksperindir;
1222 	}
1223 out:
1224 	free(bap, M_DEVBUF);
1225 	return (error);
1226 }
1227 
1228 /*
1229  * Do both snap accounting and map accounting.
1230  */
1231 static int
1232 fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
1233     struct fs *fs, daddr_t lblkno,
1234     int exptype /* BLK_SNAP or BLK_NOCOPY */)
1235 {
1236 	int error;
1237 
1238 	if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)))
1239 		return (error);
1240 	return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype));
1241 }
1242 
1243 /*
1244  * Identify a set of blocks allocated in a snapshot inode.
1245  */
1246 static int
1247 snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
1248     struct fs *fs, daddr_t lblkno,
1249     int expungetype /* BLK_SNAP or BLK_NOCOPY */)
1250 {
1251 	struct inode *ip = VTOI(vp);
1252 	struct lwp *l = curlwp;
1253 	struct mount *mp = vp->v_mount;
1254 	daddr_t blkno;
1255 	daddr_t lbn;
1256 	struct buf *ibp;
1257 	int error, n;
1258 	const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
1259 
1260 	error = UFS_WAPBL_BEGIN(mp);
1261 	if (error)
1262 		return error;
1263 	for ( n = 0; oldblkp < lastblkp; oldblkp++) {
1264 		blkno = idb_get(ip, bap, oldblkp);
1265 		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
1266 			continue;
1267 		lbn = ffs_fragstoblks(fs, blkno);
1268 		if (lbn < UFS_NDADDR) {
1269 			blkno = db_get(ip, lbn);
1270 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
1271 		} else {
1272 			error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
1273 			    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
1274 			if (error)
1275 				break;
1276 			blkno = idb_get(ip, ibp->b_data,
1277 			    (lbn - UFS_NDADDR) % FFS_NINDIR(fs));
1278 		}
1279 		/*
1280 		 * If we are expunging a snapshot vnode and we
1281 		 * find a block marked BLK_NOCOPY, then it is
1282 		 * one that has been allocated to this snapshot after
1283 		 * we took our current snapshot and can be ignored.
1284 		 */
1285 		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
1286 			if (lbn >= UFS_NDADDR)
1287 				brelse(ibp, 0);
1288 		} else {
1289 			if (blkno != 0)
1290 				panic("snapacct: bad block");
1291 			if (lbn < UFS_NDADDR)
1292 				db_assign(ip, lbn, expungetype);
1293 			else {
1294 				idb_assign(ip, ibp->b_data,
1295 				    (lbn - UFS_NDADDR) % FFS_NINDIR(fs), expungetype);
1296 				bdwrite(ibp);
1297 			}
1298 		}
1299 		if (wbreak > 0 && (++n % wbreak) == 0) {
1300 			UFS_WAPBL_END(mp);
1301 			error = UFS_WAPBL_BEGIN(mp);
1302 			if (error)
1303 				return error;
1304 		}
1305 	}
1306 	UFS_WAPBL_END(mp);
1307 	return error;
1308 }
1309 
1310 /*
1311  * Account for a set of blocks allocated in a snapshot inode.
1312  */
1313 static int
1314 mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
1315     struct fs *fs, daddr_t lblkno, int expungetype)
1316 {
1317 	daddr_t blkno;
1318 	struct inode *ip;
1319 	struct mount *mp = vp->v_mount;
1320 	ino_t inum;
1321 	int acctit, error, n;
1322 	const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
1323 
1324 	error = UFS_WAPBL_BEGIN(mp);
1325 	if (error)
1326 		return error;
1327 	ip = VTOI(vp);
1328 	inum = ip->i_number;
1329 	if (lblkno == -1)
1330 		acctit = 0;
1331 	else
1332 		acctit = 1;
1333 	for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) {
1334 		blkno = idb_get(ip, bap, oldblkp);
1335 		if (blkno == 0 || blkno == BLK_NOCOPY)
1336 			continue;
1337 		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1338 			*ip->i_snapblklist++ = lblkno;
1339 		if (blkno == BLK_SNAP)
1340 			blkno = ffs_blkstofrags(fs, lblkno);
1341 		ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum);
1342 		if (wbreak > 0 && (++n % wbreak) == 0) {
1343 			UFS_WAPBL_END(mp);
1344 			error = UFS_WAPBL_BEGIN(mp);
1345 			if (error)
1346 				return error;
1347 		}
1348 	}
1349 	UFS_WAPBL_END(mp);
1350 	return (0);
1351 }
1352 
1353 /*
1354  * Number of blocks that fit into the journal or zero if not logging.
1355  */
1356 static int
1357 blocks_in_journal(struct fs *fs)
1358 {
1359 	off_t bpj;
1360 
1361 	if ((fs->fs_flags & FS_DOWAPBL) == 0)
1362 		return 0;
1363 	bpj = 1;
1364 	if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
1365 		switch (fs->fs_journal_location) {
1366 		case UFS_WAPBL_JOURNALLOC_END_PARTITION:
1367 			bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]*
1368 			    fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
1369 			break;
1370 		case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
1371 			bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]*
1372 			    fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
1373 			break;
1374 		}
1375 	}
1376 	bpj /= fs->fs_bsize;
1377 	return (bpj > 0 ? bpj : 1);
1378 }
1379 #endif /* defined(FFS_NO_SNAPSHOT) */
1380 
1381 /*
1382  * Decrement extra reference on snapshot when last name is removed.
1383  * It will not be freed until the last open reference goes away.
1384  */
1385 void
1386 ffs_snapgone(struct vnode *vp)
1387 {
1388 	struct inode *xp, *ip = VTOI(vp);
1389 	struct mount *mp = spec_node_getmountedfs(ip->i_devvp);
1390 	struct fs *fs;
1391 	struct snap_info *si;
1392 	int snaploc;
1393 
1394 	si = VFSTOUFS(mp)->um_snapinfo;
1395 
1396 	/*
1397 	 * Find snapshot in incore list.
1398 	 */
1399 	mutex_enter(&si->si_lock);
1400 	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
1401 		if (xp == ip)
1402 			break;
1403 	mutex_exit(&si->si_lock);
1404 	if (xp != NULL)
1405 		vrele(ITOV(ip));
1406 #ifdef DEBUG
1407 	else if (snapdebug)
1408 		printf("ffs_snapgone: lost snapshot vnode %llu\n",
1409 		    (unsigned long long)ip->i_number);
1410 #endif
1411 	/*
1412 	 * Delete snapshot inode from superblock. Keep list dense.
1413 	 */
1414 	mutex_enter(&si->si_lock);
1415 	fs = ip->i_fs;
1416 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
1417 		if (fs->fs_snapinum[snaploc] == ip->i_number)
1418 			break;
1419 	if (snaploc < FSMAXSNAP) {
1420 		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
1421 			if (fs->fs_snapinum[snaploc] == 0)
1422 				break;
1423 			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
1424 		}
1425 		fs->fs_snapinum[snaploc - 1] = 0;
1426 	}
1427 	si->si_gen++;
1428 	mutex_exit(&si->si_lock);
1429 }
1430 
1431 /*
1432  * Prepare a snapshot file for being removed.
1433  */
1434 void
1435 ffs_snapremove(struct vnode *vp)
1436 {
1437 	struct inode *ip = VTOI(vp), *xp;
1438 	struct vnode *devvp = ip->i_devvp;
1439 	struct fs *fs = ip->i_fs;
1440 	struct mount *mp = spec_node_getmountedfs(devvp);
1441 	struct buf *ibp;
1442 	struct snap_info *si;
1443 	struct lwp *l = curlwp;
1444 	daddr_t numblks, blkno, dblk;
1445 	int error, loc, last;
1446 
1447 	si = VFSTOUFS(mp)->um_snapinfo;
1448 	/*
1449 	 * If active, delete from incore list (this snapshot may
1450 	 * already have been in the process of being deleted, so
1451 	 * would not have been active).
1452 	 *
1453 	 * Clear copy-on-write flag if last snapshot.
1454 	 */
1455 	mutex_enter(&si->si_snaplock);
1456 	mutex_enter(&si->si_lock);
1457 	if (is_active_snapshot(si, ip)) {
1458 		TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
1459 		if (TAILQ_FIRST(&si->si_snapshots) != 0) {
1460 			/* Roll back the list of preallocated blocks. */
1461 			xp = TAILQ_LAST(&si->si_snapshots, inodelst);
1462 			si->si_snapblklist = xp->i_snapblklist;
1463 			si->si_gen++;
1464 			mutex_exit(&si->si_lock);
1465 			mutex_exit(&si->si_snaplock);
1466 		} else {
1467 			si->si_snapblklist = 0;
1468 			si->si_gen++;
1469 			mutex_exit(&si->si_lock);
1470 			mutex_exit(&si->si_snaplock);
1471 			fscow_disestablish(mp, ffs_copyonwrite, devvp);
1472 		}
1473 		if (ip->i_snapblklist != NULL) {
1474 			free(ip->i_snapblklist, M_UFSMNT);
1475 			ip->i_snapblklist = NULL;
1476 		}
1477 	} else {
1478 		mutex_exit(&si->si_lock);
1479 		mutex_exit(&si->si_snaplock);
1480 	}
1481 	/*
1482 	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
1483 	 * snapshots that want them (see ffs_snapblkfree below).
1484 	 */
1485 	for (blkno = 1; blkno < UFS_NDADDR; blkno++) {
1486 		dblk = db_get(ip, blkno);
1487 		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1488 			db_assign(ip, blkno, 0);
1489 		else if ((dblk == ffs_blkstofrags(fs, blkno) &&
1490 		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
1491 		     ip->i_number))) {
1492 			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
1493 			db_assign(ip, blkno, 0);
1494 		}
1495 	}
1496 	numblks = howmany(ip->i_size, fs->fs_bsize);
1497 	for (blkno = UFS_NDADDR; blkno < numblks; blkno += FFS_NINDIR(fs)) {
1498 		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
1499 		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
1500 		if (error)
1501 			continue;
1502 		if (fs->fs_size - blkno > FFS_NINDIR(fs))
1503 			last = FFS_NINDIR(fs);
1504 		else
1505 			last = fs->fs_size - blkno;
1506 		for (loc = 0; loc < last; loc++) {
1507 			dblk = idb_get(ip, ibp->b_data, loc);
1508 			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1509 				idb_assign(ip, ibp->b_data, loc, 0);
1510 			else if (dblk == ffs_blkstofrags(fs, blkno) &&
1511 			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
1512 			    fs->fs_bsize, ip->i_number)) {
1513 				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
1514 				idb_assign(ip, ibp->b_data, loc, 0);
1515 			}
1516 		}
1517 		bawrite(ibp);
1518 		UFS_WAPBL_END(mp);
1519 		error = UFS_WAPBL_BEGIN(mp);
1520 		KASSERT(error == 0);
1521 	}
1522 	/*
1523 	 * Clear snapshot flag and drop reference.
1524 	 */
1525 	ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL);
1526 	DIP_ASSIGN(ip, flags, ip->i_flags);
1527 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
1528 #if defined(QUOTA) || defined(QUOTA2)
1529 	chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE);
1530 	chkiq(ip, 1, l->l_cred, FORCE);
1531 #endif
1532 }
1533 
1534 /*
1535  * Notification that a block is being freed. Return zero if the free
1536  * should be allowed to proceed. Return non-zero if the snapshot file
1537  * wants to claim the block. The block will be claimed if it is an
1538  * uncopied part of one of the snapshots. It will be freed if it is
1539  * either a BLK_NOCOPY or has already been copied in all of the snapshots.
1540  * If a fragment is being freed, then all snapshots that care about
1541  * it must make a copy since a snapshot file can only claim full sized
1542  * blocks. Note that if more than one snapshot file maps the block,
1543  * we can pick one at random to claim it. Since none of the snapshots
1544  * can change, we are assurred that they will all see the same unmodified
1545  * image. When deleting a snapshot file (see ffs_snapremove above), we
1546  * must push any of these claimed blocks to one of the other snapshots
1547  * that maps it. These claimed blocks are easily identified as they will
1548  * have a block number equal to their logical block number within the
1549  * snapshot. A copied block can never have this property because they
1550  * must always have been allocated from a BLK_NOCOPY location.
1551  */
1552 int
1553 ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
1554     long size, ino_t inum)
1555 {
1556 	struct mount *mp = spec_node_getmountedfs(devvp);
1557 	struct buf *ibp;
1558 	struct inode *ip;
1559 	struct vnode *vp = NULL;
1560 	struct snap_info *si;
1561 	void *saved_data = NULL;
1562 	daddr_t lbn;
1563 	daddr_t blkno;
1564 	uint32_t gen;
1565 	int indiroff = 0, error = 0, claimedblk = 0;
1566 
1567 	si = VFSTOUFS(mp)->um_snapinfo;
1568 	lbn = ffs_fragstoblks(fs, bno);
1569 	mutex_enter(&si->si_snaplock);
1570 	mutex_enter(&si->si_lock);
1571 	si->si_owner = curlwp;
1572 
1573 retry:
1574 	gen = si->si_gen;
1575 	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
1576 		vp = ITOV(ip);
1577 		/*
1578 		 * Lookup block being written.
1579 		 */
1580 		if (lbn < UFS_NDADDR) {
1581 			blkno = db_get(ip, lbn);
1582 		} else {
1583 			mutex_exit(&si->si_lock);
1584 			error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
1585 			    fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
1586 			if (error) {
1587 				mutex_enter(&si->si_lock);
1588 				break;
1589 			}
1590 			indiroff = (lbn - UFS_NDADDR) % FFS_NINDIR(fs);
1591 			blkno = idb_get(ip, ibp->b_data, indiroff);
1592 			mutex_enter(&si->si_lock);
1593 			if (gen != si->si_gen) {
1594 				brelse(ibp, 0);
1595 				goto retry;
1596 			}
1597 		}
1598 		/*
1599 		 * Check to see if block needs to be copied.
1600 		 */
1601 		if (blkno == 0) {
1602 			/*
1603 			 * A block that we map is being freed. If it has not
1604 			 * been claimed yet, we will claim or copy it (below).
1605 			 */
1606 			claimedblk = 1;
1607 		} else if (blkno == BLK_SNAP) {
1608 			/*
1609 			 * No previous snapshot claimed the block,
1610 			 * so it will be freed and become a BLK_NOCOPY
1611 			 * (don't care) for us.
1612 			 */
1613 			if (claimedblk)
1614 				panic("snapblkfree: inconsistent block type");
1615 			if (lbn < UFS_NDADDR) {
1616 				db_assign(ip, lbn, BLK_NOCOPY);
1617 				ip->i_flag |= IN_CHANGE | IN_UPDATE;
1618 			} else {
1619 				idb_assign(ip, ibp->b_data, indiroff,
1620 				    BLK_NOCOPY);
1621 				mutex_exit(&si->si_lock);
1622 				if (ip->i_nlink > 0)
1623 					bwrite(ibp);
1624 				else
1625 					bdwrite(ibp);
1626 				mutex_enter(&si->si_lock);
1627 				if (gen != si->si_gen)
1628 					goto retry;
1629 			}
1630 			continue;
1631 		} else /* BLK_NOCOPY or default */ {
1632 			/*
1633 			 * If the snapshot has already copied the block
1634 			 * (default), or does not care about the block,
1635 			 * it is not needed.
1636 			 */
1637 			if (lbn >= UFS_NDADDR)
1638 				brelse(ibp, 0);
1639 			continue;
1640 		}
1641 		/*
1642 		 * If this is a full size block, we will just grab it
1643 		 * and assign it to the snapshot inode. Otherwise we
1644 		 * will proceed to copy it. See explanation for this
1645 		 * routine as to why only a single snapshot needs to
1646 		 * claim this block.
1647 		 */
1648 		if (size == fs->fs_bsize) {
1649 #ifdef DEBUG
1650 			if (snapdebug)
1651 				printf("%s %llu lbn %" PRId64
1652 				    "from inum %llu\n",
1653 				    "Grabonremove: snapino",
1654 				    (unsigned long long)ip->i_number,
1655 				    lbn, (unsigned long long)inum);
1656 #endif
1657 			mutex_exit(&si->si_lock);
1658 			if (lbn < UFS_NDADDR) {
1659 				db_assign(ip, lbn, bno);
1660 			} else {
1661 				idb_assign(ip, ibp->b_data, indiroff, bno);
1662 				if (ip->i_nlink > 0)
1663 					bwrite(ibp);
1664 				else
1665 					bdwrite(ibp);
1666 			}
1667 			DIP_ADD(ip, blocks, btodb(size));
1668 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
1669 			if (ip->i_nlink > 0 && mp->mnt_wapbl)
1670 				error = syncsnap(vp);
1671 			else
1672 				error = 0;
1673 			mutex_enter(&si->si_lock);
1674 			si->si_owner = NULL;
1675 			mutex_exit(&si->si_lock);
1676 			mutex_exit(&si->si_snaplock);
1677 			return (error == 0);
1678 		}
1679 		if (lbn >= UFS_NDADDR)
1680 			brelse(ibp, 0);
1681 #ifdef DEBUG
1682 		if (snapdebug)
1683 			printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
1684 			    "Copyonremove: snapino ",
1685 			    (unsigned long long)ip->i_number,
1686 			    lbn, "for inum", (unsigned long long)inum, size);
1687 #endif
1688 		/*
1689 		 * If we have already read the old block contents, then
1690 		 * simply copy them to the new block. Note that we need
1691 		 * to synchronously write snapshots that have not been
1692 		 * unlinked, and hence will be visible after a crash,
1693 		 * to ensure their integrity.
1694 		 */
1695 		mutex_exit(&si->si_lock);
1696 		if (saved_data == NULL) {
1697 			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
1698 			error = rwfsblk(vp, B_READ, saved_data, lbn);
1699 			if (error) {
1700 				free(saved_data, M_UFSMNT);
1701 				saved_data = NULL;
1702 				mutex_enter(&si->si_lock);
1703 				break;
1704 			}
1705 		}
1706 		error = wrsnapblk(vp, saved_data, lbn);
1707 		if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
1708 			error = syncsnap(vp);
1709 		mutex_enter(&si->si_lock);
1710 		if (error)
1711 			break;
1712 		if (gen != si->si_gen)
1713 			goto retry;
1714 	}
1715 	si->si_owner = NULL;
1716 	mutex_exit(&si->si_lock);
1717 	mutex_exit(&si->si_snaplock);
1718 	if (saved_data)
1719 		free(saved_data, M_UFSMNT);
1720 	/*
1721 	 * If we have been unable to allocate a block in which to do
1722 	 * the copy, then return non-zero so that the fragment will
1723 	 * not be freed. Although space will be lost, the snapshot
1724 	 * will stay consistent.
1725 	 */
1726 	return (error);
1727 }
1728 
1729 /*
1730  * Associate snapshot files when mounting.
1731  */
1732 void
1733 ffs_snapshot_mount(struct mount *mp)
1734 {
1735 	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1736 	struct fs *fs = VFSTOUFS(mp)->um_fs;
1737 	struct lwp *l = curlwp;
1738 	struct vnode *vp;
1739 	struct inode *ip, *xp;
1740 	struct snap_info *si;
1741 	daddr_t snaplistsize, *snapblklist;
1742 	int i, error, ns __unused, snaploc, loc;
1743 
1744 	/*
1745 	 * No persistent snapshots on apple ufs file systems.
1746 	 */
1747 	if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
1748 		return;
1749 
1750 	si = VFSTOUFS(mp)->um_snapinfo;
1751 	ns = UFS_FSNEEDSWAP(fs);
1752 	/*
1753 	 * XXX The following needs to be set before ffs_truncate or
1754 	 * VOP_READ can be called.
1755 	 */
1756 	mp->mnt_stat.f_iosize = fs->fs_bsize;
1757 	/*
1758 	 * Process each snapshot listed in the superblock.
1759 	 */
1760 	vp = NULL;
1761 	mutex_enter(&si->si_lock);
1762 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
1763 		if (fs->fs_snapinum[snaploc] == 0)
1764 			break;
1765 		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
1766 		    &vp)) != 0) {
1767 			printf("ffs_snapshot_mount: vget failed %d\n", error);
1768 			continue;
1769 		}
1770 		ip = VTOI(vp);
1771 		if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) !=
1772 		    SF_SNAPSHOT) {
1773 			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
1774 			    fs->fs_snapinum[snaploc]);
1775 			vput(vp);
1776 			vp = NULL;
1777 			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
1778 				if (fs->fs_snapinum[loc] == 0)
1779 					break;
1780 				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
1781 			}
1782 			fs->fs_snapinum[loc - 1] = 0;
1783 			snaploc--;
1784 			continue;
1785 		}
1786 
1787 		/*
1788 		 * Read the block hints list. Use an empty list on
1789 		 * read errors.
1790 		 */
1791 		error = vn_rdwr(UIO_READ, vp,
1792 		    (void *)&snaplistsize, sizeof(snaplistsize),
1793 		    ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1794 		    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1795 		    l->l_cred, NULL, NULL);
1796 		if (error) {
1797 			printf("ffs_snapshot_mount: read_1 failed %d\n", error);
1798 			snaplistsize = 1;
1799 		} else
1800 			snaplistsize = ufs_rw64(snaplistsize, ns);
1801 		snapblklist = malloc(
1802 		    snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
1803 		if (error)
1804 			snapblklist[0] = 1;
1805 		else {
1806 			error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
1807 			    snaplistsize * sizeof(daddr_t),
1808 			    ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1809 			    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1810 			    l->l_cred, NULL, NULL);
1811 			for (i = 0; i < snaplistsize; i++)
1812 				snapblklist[i] = ufs_rw64(snapblklist[i], ns);
1813 			if (error) {
1814 				printf("ffs_snapshot_mount: read_2 failed %d\n",
1815 				    error);
1816 				snapblklist[0] = 1;
1817 			}
1818 		}
1819 		ip->i_snapblklist = &snapblklist[0];
1820 
1821 		/*
1822 		 * Link it onto the active snapshot list.
1823 		 */
1824 		if (is_active_snapshot(si, ip))
1825 			panic("ffs_snapshot_mount: %"PRIu64" already on list",
1826 			    ip->i_number);
1827 		else
1828 			TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
1829 		vp->v_vflag |= VV_SYSTEM;
1830 		VOP_UNLOCK(vp);
1831 	}
1832 	/*
1833 	 * No usable snapshots found.
1834 	 */
1835 	if (vp == NULL) {
1836 		mutex_exit(&si->si_lock);
1837 		return;
1838 	}
1839 	/*
1840 	 * Attach the block hints list. We always want to
1841 	 * use the list from the newest snapshot.
1842 	*/
1843 	xp = TAILQ_LAST(&si->si_snapshots, inodelst);
1844 	si->si_snapblklist = xp->i_snapblklist;
1845 	fscow_establish(mp, ffs_copyonwrite, devvp);
1846 	si->si_gen++;
1847 	mutex_exit(&si->si_lock);
1848 }
1849 
1850 /*
1851  * Disassociate snapshot files when unmounting.
1852  */
1853 void
1854 ffs_snapshot_unmount(struct mount *mp)
1855 {
1856 	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1857 	struct inode *xp;
1858 	struct vnode *vp = NULL;
1859 	struct snap_info *si;
1860 
1861 	si = VFSTOUFS(mp)->um_snapinfo;
1862 	mutex_enter(&si->si_lock);
1863 	while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
1864 		vp = ITOV(xp);
1865 		TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
1866 		if (xp->i_snapblklist == si->si_snapblklist)
1867 			si->si_snapblklist = NULL;
1868 		free(xp->i_snapblklist, M_UFSMNT);
1869 		if (xp->i_nlink > 0) {
1870 			si->si_gen++;
1871 			mutex_exit(&si->si_lock);
1872 			vrele(vp);
1873 			mutex_enter(&si->si_lock);
1874 		}
1875 	}
1876 	si->si_gen++;
1877 	mutex_exit(&si->si_lock);
1878 	if (vp)
1879 		fscow_disestablish(mp, ffs_copyonwrite, devvp);
1880 }
1881 
1882 /*
1883  * Check for need to copy block that is about to be written,
1884  * copying the block if necessary.
1885  */
1886 static int
1887 ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
1888 {
1889 	struct fs *fs;
1890 	struct inode *ip;
1891 	struct vnode *devvp = v, *vp = NULL;
1892 	struct mount *mp = spec_node_getmountedfs(devvp);
1893 	struct snap_info *si;
1894 	void *saved_data = NULL;
1895 	daddr_t lbn, blkno, *snapblklist;
1896 	uint32_t gen;
1897 	int lower, upper, mid, snapshot_locked = 0, error = 0;
1898 
1899 	/*
1900 	 * Check for valid snapshots.
1901 	 */
1902 	si = VFSTOUFS(mp)->um_snapinfo;
1903 	mutex_enter(&si->si_lock);
1904 	ip = TAILQ_FIRST(&si->si_snapshots);
1905 	if (ip == NULL) {
1906 		mutex_exit(&si->si_lock);
1907 		return 0;
1908 	}
1909 	/*
1910 	 * First check to see if it is after the file system,
1911 	 * in the journal or in the preallocated list.
1912 	 * By doing these checks we avoid several potential deadlocks.
1913 	 */
1914 	fs = ip->i_fs;
1915 	lbn = ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno));
1916 	if (bp->b_blkno >= FFS_FSBTODB(fs, fs->fs_size)) {
1917 		mutex_exit(&si->si_lock);
1918 		return 0;
1919 	}
1920 	if ((fs->fs_flags & FS_DOWAPBL) &&
1921 	    fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
1922 		off_t blk_off, log_start, log_end;
1923 
1924 		log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] *
1925 		    fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
1926 		log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] *
1927 		    fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
1928 		blk_off = dbtob(bp->b_blkno);
1929 		if (blk_off >= log_start && blk_off < log_end) {
1930 			mutex_exit(&si->si_lock);
1931 			return 0;
1932 		}
1933 	}
1934 	snapblklist = si->si_snapblklist;
1935 	upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
1936 	lower = 1;
1937 	while (lower <= upper) {
1938 		mid = (lower + upper) / 2;
1939 		if (snapblklist[mid] == lbn)
1940 			break;
1941 		if (snapblklist[mid] < lbn)
1942 			lower = mid + 1;
1943 		else
1944 			upper = mid - 1;
1945 	}
1946 	if (lower <= upper) {
1947 		mutex_exit(&si->si_lock);
1948 		return 0;
1949 	}
1950 	/*
1951 	 * Not in the precomputed list, so check the snapshots.
1952 	 */
1953 	 if (si->si_owner != curlwp) {
1954 		if (!mutex_tryenter(&si->si_snaplock)) {
1955 			mutex_exit(&si->si_lock);
1956 			mutex_enter(&si->si_snaplock);
1957 			mutex_enter(&si->si_lock);
1958 		}
1959 		si->si_owner = curlwp;
1960 		snapshot_locked = 1;
1961 	 }
1962 	 if (data_valid && bp->b_bcount == fs->fs_bsize)
1963 		saved_data = bp->b_data;
1964 retry:
1965 	gen = si->si_gen;
1966 	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
1967 		vp = ITOV(ip);
1968 		/*
1969 		 * We ensure that everything of our own that needs to be
1970 		 * copied will be done at the time that ffs_snapshot is
1971 		 * called. Thus we can skip the check here which can
1972 		 * deadlock in doing the lookup in ffs_balloc.
1973 		 */
1974 		if (bp->b_vp == vp)
1975 			continue;
1976 		/*
1977 		 * Check to see if block needs to be copied.
1978 		 */
1979 		if (lbn < UFS_NDADDR) {
1980 			blkno = db_get(ip, lbn);
1981 		} else {
1982 			mutex_exit(&si->si_lock);
1983 			blkno = 0; /* XXX: GCC */
1984 			if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
1985 				mutex_enter(&si->si_lock);
1986 				break;
1987 			}
1988 			mutex_enter(&si->si_lock);
1989 			if (gen != si->si_gen)
1990 				goto retry;
1991 		}
1992 #ifdef DIAGNOSTIC
1993 		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
1994 			panic("ffs_copyonwrite: bad copy block");
1995 #endif
1996 		if (blkno != 0)
1997 			continue;
1998 
1999 		if (curlwp == uvm.pagedaemon_lwp) {
2000 			error = ENOMEM;
2001 			break;
2002 		}
2003 		/* Only one level of recursion allowed. */
2004 		KASSERT(snapshot_locked);
2005 		/*
2006 		 * Allocate the block into which to do the copy. Since
2007 		 * multiple processes may all try to copy the same block,
2008 		 * we have to recheck our need to do a copy if we sleep
2009 		 * waiting for the lock.
2010 		 *
2011 		 * Because all snapshots on a filesystem share a single
2012 		 * lock, we ensure that we will never be in competition
2013 		 * with another process to allocate a block.
2014 		 */
2015 #ifdef DEBUG
2016 		if (snapdebug) {
2017 			printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
2018 			    (unsigned long long)ip->i_number, lbn);
2019 			if (bp->b_vp == devvp)
2020 				printf("fs metadata");
2021 			else
2022 				printf("inum %llu", (unsigned long long)
2023 				    VTOI(bp->b_vp)->i_number);
2024 			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
2025 		}
2026 #endif
2027 		/*
2028 		 * If we have already read the old block contents, then
2029 		 * simply copy them to the new block. Note that we need
2030 		 * to synchronously write snapshots that have not been
2031 		 * unlinked, and hence will be visible after a crash,
2032 		 * to ensure their integrity.
2033 		 */
2034 		mutex_exit(&si->si_lock);
2035 		if (saved_data == NULL) {
2036 			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
2037 			error = rwfsblk(vp, B_READ, saved_data, lbn);
2038 			if (error) {
2039 				free(saved_data, M_UFSMNT);
2040 				saved_data = NULL;
2041 				mutex_enter(&si->si_lock);
2042 				break;
2043 			}
2044 		}
2045 		error = wrsnapblk(vp, saved_data, lbn);
2046 		if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
2047 			error = syncsnap(vp);
2048 		mutex_enter(&si->si_lock);
2049 		if (error)
2050 			break;
2051 		if (gen != si->si_gen)
2052 			goto retry;
2053 	}
2054 	/*
2055 	 * Note that we need to synchronously write snapshots that
2056 	 * have not been unlinked, and hence will be visible after
2057 	 * a crash, to ensure their integrity.
2058 	 */
2059 	if (snapshot_locked) {
2060 		si->si_owner = NULL;
2061 		mutex_exit(&si->si_lock);
2062 		mutex_exit(&si->si_snaplock);
2063 	} else
2064 		mutex_exit(&si->si_lock);
2065 	if (saved_data && saved_data != bp->b_data)
2066 		free(saved_data, M_UFSMNT);
2067 	return error;
2068 }
2069 
2070 /*
2071  * Read from a snapshot.
2072  */
2073 int
2074 ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
2075 {
2076 	struct inode *ip = VTOI(vp);
2077 	struct fs *fs = ip->i_fs;
2078 	struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
2079 	struct buf *bp;
2080 	daddr_t lbn, nextlbn;
2081 	off_t fsbytes, bytesinfile;
2082 	long size, xfersize, blkoffset;
2083 	int error;
2084 
2085 	fstrans_start(vp->v_mount, FSTRANS_SHARED);
2086 	mutex_enter(&si->si_snaplock);
2087 
2088 	if (ioflag & IO_ALTSEMANTICS)
2089 		fsbytes = ip->i_size;
2090 	else
2091 		fsbytes = ffs_lfragtosize(fs, fs->fs_size);
2092 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
2093 		bytesinfile = fsbytes - uio->uio_offset;
2094 		if (bytesinfile <= 0)
2095 			break;
2096 		lbn = ffs_lblkno(fs, uio->uio_offset);
2097 		nextlbn = lbn + 1;
2098 		size = fs->fs_bsize;
2099 		blkoffset = ffs_blkoff(fs, uio->uio_offset);
2100 		xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
2101 		    bytesinfile);
2102 
2103 		if (ffs_lblktosize(fs, nextlbn + 1) >= fsbytes) {
2104 			if (ffs_lblktosize(fs, lbn) + size > fsbytes)
2105 				size = ffs_fragroundup(fs,
2106 				    fsbytes - ffs_lblktosize(fs, lbn));
2107 			error = bread(vp, lbn, size, 0, &bp);
2108 		} else {
2109 			int nextsize = fs->fs_bsize;
2110 			error = breadn(vp, lbn,
2111 			    size, &nextlbn, &nextsize, 1, 0, &bp);
2112 		}
2113 		if (error)
2114 			break;
2115 
2116 		/*
2117 		 * We should only get non-zero b_resid when an I/O error
2118 		 * has occurred, which should cause us to break above.
2119 		 * However, if the short read did not cause an error,
2120 		 * then we want to ensure that we do not uiomove bad
2121 		 * or uninitialized data.
2122 		 */
2123 		size -= bp->b_resid;
2124 		if (size < blkoffset + xfersize) {
2125 			xfersize = size - blkoffset;
2126 			if (xfersize <= 0)
2127 				break;
2128 		}
2129 		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
2130 		if (error)
2131 			break;
2132 		brelse(bp, BC_AGE);
2133 	}
2134 	if (bp != NULL)
2135 		brelse(bp, BC_AGE);
2136 
2137 	mutex_exit(&si->si_snaplock);
2138 	fstrans_done(vp->v_mount);
2139 	return error;
2140 }
2141 
2142 /*
2143  * Lookup a snapshots data block address.
2144  * Simpler than UFS_BALLOC() as we know all metadata is already allocated
2145  * and safe even for the pagedaemon where we cannot bread().
2146  */
2147 static int
2148 snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
2149 {
2150 	struct indir indirs[UFS_NIADDR + 2];
2151 	struct inode *ip = VTOI(vp);
2152 	struct fs *fs = ip->i_fs;
2153 	struct buf *bp;
2154 	int error, num;
2155 
2156 	KASSERT(lbn >= 0);
2157 
2158 	if (lbn < UFS_NDADDR) {
2159 		*res = db_get(ip, lbn);
2160 		return 0;
2161 	}
2162 	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
2163 		return error;
2164 	if (curlwp == uvm.pagedaemon_lwp) {
2165 		mutex_enter(&bufcache_lock);
2166 		bp = incore(vp, indirs[num-1].in_lbn);
2167 		if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
2168 			*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
2169 			error = 0;
2170 		} else
2171 			error = ENOMEM;
2172 		mutex_exit(&bufcache_lock);
2173 		return error;
2174 	}
2175 	error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, 0, &bp);
2176 	if (error == 0) {
2177 		*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
2178 		brelse(bp, 0);
2179 	}
2180 
2181 	return error;
2182 }
2183 
2184 /*
2185  * Read or write the specified block of the filesystem vp resides on
2186  * from or to the disk bypassing the buffer cache.
2187  */
2188 static int
2189 rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn)
2190 {
2191 	int error;
2192 	struct inode *ip = VTOI(vp);
2193 	struct fs *fs = ip->i_fs;
2194 	struct buf *nbp;
2195 
2196 	nbp = getiobuf(NULL, true);
2197 	nbp->b_flags = flags;
2198 	nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
2199 	nbp->b_error = 0;
2200 	nbp->b_data = data;
2201 	nbp->b_blkno = nbp->b_rawblkno = FFS_FSBTODB(fs, ffs_blkstofrags(fs, lbn));
2202 	nbp->b_proc = NULL;
2203 	nbp->b_dev = ip->i_devvp->v_rdev;
2204 	SET(nbp->b_cflags, BC_BUSY);	/* mark buffer busy */
2205 
2206 	bdev_strategy(nbp);
2207 
2208 	error = biowait(nbp);
2209 
2210 	putiobuf(nbp);
2211 
2212 	return error;
2213 }
2214 
2215 /*
2216  * Write all dirty buffers to disk and invalidate them.
2217  */
2218 static int
2219 syncsnap(struct vnode *vp)
2220 {
2221 	int error;
2222 	buf_t *bp;
2223 	struct fs *fs = VTOI(vp)->i_fs;
2224 
2225 	mutex_enter(&bufcache_lock);
2226 	while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
2227 		error = bbusy(bp, false, 0, NULL);
2228 		if (error == EPASSTHROUGH)
2229 			continue;
2230 		else if (error != 0) {
2231 			mutex_exit(&bufcache_lock);
2232 			return error;
2233 		}
2234 		KASSERT(bp->b_bcount == fs->fs_bsize);
2235 		mutex_exit(&bufcache_lock);
2236 		error = rwfsblk(vp, B_WRITE, bp->b_data,
2237 		    ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno)));
2238 		brelse(bp, BC_INVAL | BC_VFLUSH);
2239 		if (error)
2240 			return error;
2241 		mutex_enter(&bufcache_lock);
2242 	}
2243 	mutex_exit(&bufcache_lock);
2244 
2245 	return 0;
2246 }
2247 
2248 /*
2249  * Write the specified block to a snapshot.
2250  */
2251 static int
2252 wrsnapblk(struct vnode *vp, void *data, daddr_t lbn)
2253 {
2254 	struct inode *ip = VTOI(vp);
2255 	struct fs *fs = ip->i_fs;
2256 	struct buf *bp;
2257 	int error;
2258 
2259 	error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn), fs->fs_bsize,
2260 	    FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp);
2261 	if (error)
2262 		return error;
2263 	memcpy(bp->b_data, data, fs->fs_bsize);
2264 	if (ip->i_nlink > 0)
2265 		error = bwrite(bp);
2266 	else
2267 		bawrite(bp);
2268 
2269 	return error;
2270 }
2271 
2272 /*
2273  * Check if this inode is present on the active snapshot list.
2274  * Must be called with snapinfo locked.
2275  */
2276 static inline bool
2277 is_active_snapshot(struct snap_info *si, struct inode *ip)
2278 {
2279 	struct inode *xp;
2280 
2281 	KASSERT(mutex_owned(&si->si_lock));
2282 
2283 	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
2284 		if (xp == ip)
2285 			return true;
2286 	return false;
2287 }
2288 
/*
 * Get/put a disk block address from/to the inode or a buffer containing
 * disk addresses. These helpers take care of the fs type (UFS1/UFS2) and
 * of byte swapping. These functions should go into a global include.
 */
2294 static inline daddr_t
2295 db_get(struct inode *ip, int loc)
2296 {
2297 	if (ip->i_ump->um_fstype == UFS1)
2298 		return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
2299 	else
2300 		return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
2301 }
2302 
2303 static inline void
2304 db_assign(struct inode *ip, int loc, daddr_t val)
2305 {
2306 	if (ip->i_ump->um_fstype == UFS1)
2307 		ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2308 	else
2309 		ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2310 }
2311 
2312 __unused static inline daddr_t
2313 ib_get(struct inode *ip, int loc)
2314 {
2315 	if (ip->i_ump->um_fstype == UFS1)
2316 		return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
2317 	else
2318 		return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
2319 }
2320 
2321 static inline daddr_t
2322 idb_get(struct inode *ip, void *bf, int loc)
2323 {
2324 	if (ip->i_ump->um_fstype == UFS1)
2325 		return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2326 	else
2327 		return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2328 }
2329 
2330 static inline void
2331 idb_assign(struct inode *ip, void *bf, int loc, daddr_t val)
2332 {
2333 	if (ip->i_ump->um_fstype == UFS1)
2334 		((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2335 	else
2336 		((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2337 }
2338