1 /* 2 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * Further information about snapshots can be obtained from: 5 * 6 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 7 * 1614 Oxford Street mckusick@mckusick.com 8 * Berkeley, CA 94709-1608 +1-510-843-9542 9 * USA 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 34 * 35 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp 36 */ 37 38 #include <sys/cdefs.h> 39 __KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.4 2004/06/20 18:55:58 hannken Exp $"); 40 41 #include <sys/param.h> 42 #include <sys/kernel.h> 43 #include <sys/systm.h> 44 #include <sys/conf.h> 45 #include <sys/buf.h> 46 #include <sys/proc.h> 47 #include <sys/namei.h> 48 #include <sys/sched.h> 49 #include <sys/stat.h> 50 #include <sys/malloc.h> 51 #include <sys/mount.h> 52 #include <sys/resource.h> 53 #include <sys/resourcevar.h> 54 #include <sys/vnode.h> 55 56 #include <miscfs/specfs/specdev.h> 57 58 #include <ufs/ufs/quota.h> 59 #include <ufs/ufs/ufsmount.h> 60 #include <ufs/ufs/inode.h> 61 #include <ufs/ufs/ufs_extern.h> 62 #include <ufs/ufs/ufs_bswap.h> 63 64 #include <ufs/ffs/fs.h> 65 #include <ufs/ffs/ffs_extern.h> 66 67 /* FreeBSD -> NetBSD conversion */ 68 #define KERNCRED proc0.p_ucred 69 #define ufs1_daddr_t int32_t 70 #define ufs2_daddr_t int64_t 71 #define ufs_lbn_t daddr_t 72 #define VI_MTX(v) (&(v)->v_interlock) 73 #define VI_LOCK(v) simple_lock(&(v)->v_interlock) 74 #define VI_UNLOCK(v) simple_unlock(&(v)->v_interlock) 75 #define MNT_ILOCK(v) simple_lock(&mntvnode_slock) 76 #define MNT_IUNLOCK(v) simple_unlock(&mntvnode_slock) 77 78 static int cgaccount(int, struct vnode *, caddr_t, int); 79 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, 80 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 81 ufs_lbn_t, int), int); 82 static int indiracct_ufs1(struct vnode *, struct vnode *, int, 83 ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 84 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 85 ufs_lbn_t, int), int); 86 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 87 struct fs *, ufs_lbn_t, int); 88 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 89 struct fs *, ufs_lbn_t, int); 90 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 91 struct fs *, ufs_lbn_t, int); 92 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, 93 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 94 ufs_lbn_t, int), int); 95 static int indiracct_ufs2(struct vnode *, struct vnode *, int, 96 ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 97 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 98 ufs_lbn_t, int), int); 99 static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 100 struct fs *, ufs_lbn_t, int); 101 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 102 struct fs *, ufs_lbn_t, int); 103 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 104 struct fs *, ufs_lbn_t, int); 105 static int ffs_copyonwrite(void *, struct buf *); 106 static int readfsblk(struct vnode *, caddr_t, ufs2_daddr_t); 107 static int readvnblk(struct vnode *, caddr_t, ufs2_daddr_t); 108 static int writevnblk(struct vnode *, caddr_t, ufs2_daddr_t); 109 static inline int cow_enter(void); 110 static inline void cow_leave(int); 111 static inline ufs2_daddr_t db_get(struct inode *, int); 112 static inline void db_assign(struct inode *, int, ufs2_daddr_t); 113 static inline ufs2_daddr_t idb_get(struct inode *, caddr_t, int); 114 static inline void idb_assign(struct inode *, caddr_t, int, ufs2_daddr_t); 115 116 #ifdef DEBUG 117 static int snapdebug = 0; 118 #endif 119 120 /* 121 * Create a snapshot file and initialize it for the filesystem. 122 * Vnode is locked on entry and return. 123 */ 124 int 125 ffs_snapshot(mp, vp, ctime) 126 struct mount *mp; 127 struct vnode *vp; 128 struct timespec *ctime; 129 { 130 ufs2_daddr_t numblks, blkno, *blkp, snaplistsize = 0, *snapblklist; 131 int error, ns, cg, snaploc; 132 int i, size, len, loc; 133 int flag = mp->mnt_flag; 134 struct timeval starttime; 135 #ifdef DEBUG 136 struct timeval endtime; 137 #endif 138 struct timespec ts; 139 long redo = 0; 140 int32_t *lp; 141 void *space; 142 caddr_t cgbuf; 143 struct ufsmount *ump = VFSTOUFS(mp); 144 struct fs *copy_fs = NULL, *fs = ump->um_fs; 145 struct proc *p = curproc; 146 struct inode *ip, *xp; 147 struct buf *bp, *ibp; 148 struct vattr vat; 149 struct vnode *xvp, *nvp, *devvp; 150 struct vop_vfree_args args; 151 152 ns = UFS_FSNEEDSWAP(fs); 153 /* 154 * Need to serialize access to snapshot code per filesystem. 155 */ 156 /* 157 * If the vnode already is a snapshot, return. 158 */ 159 if (VTOI(vp)->i_flags & SF_SNAPSHOT) { 160 if (ctime) { 161 ctime->tv_sec = DIP(VTOI(vp), mtime); 162 ctime->tv_nsec = DIP(VTOI(vp), mtimensec); 163 } 164 return 0; 165 } 166 /* 167 * Check mount and check for exclusive reference. 168 */ 169 if (vp->v_mount != mp) 170 return EXDEV; 171 if (vp->v_usecount != 1 || vp->v_writecount != 0) 172 return EBUSY; 173 if (vp->v_size != 0) { 174 error = VOP_TRUNCATE(vp, 0, 0, NOCRED, p); 175 if (error) 176 return error; 177 } 178 /* 179 * Assign a snapshot slot in the superblock. 180 */ 181 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 182 if (fs->fs_snapinum[snaploc] == 0) 183 break; 184 if (snaploc == FSMAXSNAP) 185 return (ENOSPC); 186 ip = VTOI(vp); 187 devvp = ip->i_devvp; 188 /* 189 * Allocate and copy the last block contents so as to be able 190 * to set size to that of the filesystem. 191 */ 192 numblks = howmany(fs->fs_size, fs->fs_frag); 193 cgbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 194 if ((error = readfsblk(vp, cgbuf, numblks - 1)) != 0) 195 goto out; 196 error = vn_rdwr(UIO_WRITE, vp, 197 cgbuf, fs->fs_bsize, lblktosize(fs, (off_t)(numblks - 1)), 198 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, p); 199 if (error) 200 goto out; 201 /* 202 * Preallocate critical data structures so that we can copy 203 * them in without further allocation after we suspend all 204 * operations on the filesystem. We would like to just release 205 * the allocated buffers without writing them since they will 206 * be filled in below once we are ready to go, but this upsets 207 * the soft update code, so we go ahead and write the new buffers. 208 * 209 * Allocate all indirect blocks and mark all of them as not 210 * needing to be copied. 211 */ 212 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 213 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), 214 fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp); 215 if (error) 216 goto out; 217 bwrite(ibp); 218 } 219 /* 220 * Allocate copies for the superblock and its summary information. 221 */ 222 bzero(cgbuf, fs->fs_bsize); 223 blkno = lblkno(fs, fs->fs_sblockloc); 224 for (loc = 0; loc < howmany(fs->fs_sbsize, fs->fs_bsize); loc++) 225 if ((error = writevnblk(vp, cgbuf, blkno + loc)) != 0) 226 goto out; 227 blkno = fragstoblks(fs, fs->fs_csaddr); 228 for (loc = 0; loc < howmany(fs->fs_cssize, fs->fs_bsize); loc++) 229 if ((error = writevnblk(vp, cgbuf, blkno + loc)) != 0) 230 goto out; 231 /* 232 * Allocate all cylinder group blocks. 233 */ 234 for (cg = 0; cg < fs->fs_ncg; cg++) 235 if ((error = writevnblk(vp, cgbuf, 236 fragstoblks(fs, cgtod(fs, cg)))) != 0) 237 goto out; 238 /* 239 * Copy all the cylinder group maps. Although the 240 * filesystem is still active, we hope that only a few 241 * cylinder groups will change between now and when we 242 * suspend operations. Thus, we will be able to quickly 243 * touch up the few cylinder groups that changed during 244 * the suspension period. 245 */ 246 len = howmany(fs->fs_ncg, NBBY); 247 MALLOC(fs->fs_active, u_char *, len, M_DEVBUF, M_WAITOK | M_ZERO); 248 for (cg = 0; cg < fs->fs_ncg; cg++) { 249 if ((error = cgaccount(cg, vp, cgbuf, 1)) != 0) 250 goto out; 251 if ((error = writevnblk(vp, cgbuf, 252 fragstoblks(fs, cgtod(fs, cg)))) != 0) 253 goto out; 254 } 255 /* 256 * Change inode to snapshot type file. 257 */ 258 ip->i_flags |= SF_SNAPSHOT; 259 DIP_ASSIGN(ip, flags, ip->i_flags); 260 ip->i_flag |= IN_CHANGE | IN_UPDATE; 261 /* 262 * Ensure that the snapshot is completely on disk. 263 * Since we have marked it as a snapshot it is safe to 264 * unlock it as no process will be allowed to write to it. 265 */ 266 if ((error = VOP_FSYNC(vp, KERNCRED, FSYNC_WAIT, 0, 0, p)) != 0) 267 goto out; 268 VOP_UNLOCK(vp, 0); 269 /* 270 * All allocations are done, so we can now snapshot the system. 271 * 272 * Suspend operation on filesystem. 273 */ 274 if ((error = vfs_write_suspend(vp->v_mount, PUSER|PCATCH, 0)) != 0) { 275 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 276 goto out; 277 } 278 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 279 microtime(&starttime); 280 /* 281 * First, copy all the cylinder group maps that have changed. 282 */ 283 for (cg = 0; cg < fs->fs_ncg; cg++) { 284 if (ACTIVECG_ISSET(fs, cg)) 285 continue; 286 redo++; 287 if ((error = cgaccount(cg, vp, cgbuf, 2)) != 0) 288 goto out1; 289 if ((error = writevnblk(vp, cgbuf, 290 fragstoblks(fs, cgtod(fs, cg)))) != 0) 291 goto out1; 292 } 293 /* 294 * Grab a copy of the superblock and its summary information. 295 * We delay writing it until the suspension is released below. 296 */ 297 loc = blkoff(fs, fs->fs_sblockloc); 298 if (loc > 0) 299 bzero(&cgbuf[0], loc); 300 copy_fs = (struct fs *)(cgbuf + loc); 301 bcopy(fs, copy_fs, fs->fs_sbsize); 302 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 303 if (fs->fs_sbsize < size) 304 bzero(&cgbuf[loc + fs->fs_sbsize], size - fs->fs_sbsize); 305 size = blkroundup(fs, fs->fs_cssize); 306 if (fs->fs_contigsumsize > 0) 307 size += fs->fs_ncg * sizeof(int32_t); 308 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 309 copy_fs->fs_csp = space; 310 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 311 (char *)space += fs->fs_cssize; 312 loc = howmany(fs->fs_cssize, fs->fs_fsize); 313 i = fs->fs_frag - loc % fs->fs_frag; 314 len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; 315 if (len > 0) { 316 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 317 len, KERNCRED, &bp)) != 0) { 318 brelse(bp); 319 free(copy_fs->fs_csp, M_UFSMNT); 320 goto out1; 321 } 322 bcopy(bp->b_data, space, (u_int)len); 323 (char *)space += len; 324 bp->b_flags |= B_INVAL | B_NOCACHE; 325 brelse(bp); 326 } 327 if (fs->fs_contigsumsize > 0) { 328 copy_fs->fs_maxcluster = lp = space; 329 for (i = 0; i < fs->fs_ncg; i++) 330 *lp++ = fs->fs_contigsumsize; 331 } 332 /* 333 * We must check for active files that have been unlinked 334 * (e.g., with a zero link count). We have to expunge all 335 * trace of these files from the snapshot so that they are 336 * not reclaimed prematurely by fsck or unnecessarily dumped. 337 * We turn off the MNTK_SUSPENDED flag to avoid a panic from 338 * spec_strategy about writing on a suspended filesystem. 339 * Note that we skip unlinked snapshot files as they will 340 * be handled separately below. 341 * 342 * We also calculate the needed size for the snapshot list. 343 */ 344 snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 345 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 346 MNT_ILOCK(mp); 347 loop: 348 for (xvp = LIST_FIRST(&mp->mnt_vnodelist); xvp; xvp = nvp) { 349 /* 350 * Make sure this vnode wasn't reclaimed in getnewvnode(). 351 * Start over if it has (it won't be on the list anymore). 352 */ 353 if (xvp->v_mount != mp) 354 goto loop; 355 nvp = LIST_NEXT(xvp, v_mntvnodes); 356 VI_LOCK(xvp); 357 MNT_IUNLOCK(mp); 358 if ((xvp->v_flag & VXLOCK) || 359 xvp->v_usecount == 0 || xvp->v_type == VNON || 360 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { 361 VI_UNLOCK(xvp); 362 MNT_ILOCK(mp); 363 continue; 364 } 365 if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) { 366 MNT_ILOCK(mp); 367 goto loop; 368 } 369 #ifdef DEBUG 370 if (snapdebug) 371 vprint("ffs_snapshot: busy vnode", xvp); 372 #endif 373 if (VOP_GETATTR(xvp, &vat, p->p_ucred, p) == 0 && 374 vat.va_nlink > 0) { 375 VOP_UNLOCK(xvp, 0); 376 MNT_ILOCK(mp); 377 continue; 378 } 379 xp = VTOI(xvp); 380 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 381 VOP_UNLOCK(xvp, 0); 382 MNT_ILOCK(mp); 383 continue; 384 } 385 /* 386 * If there is a fragment, clear it here. 387 */ 388 blkno = 0; 389 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 390 if (loc < NDADDR) { 391 len = fragroundup(fs, blkoff(fs, xp->i_size)); 392 if (len < fs->fs_bsize) { 393 ffs_blkfree(copy_fs, vp, db_get(xp, loc), 394 len, xp->i_number); 395 blkno = db_get(xp, loc); 396 db_assign(xp, loc, 0); 397 } 398 } 399 snaplistsize += 1; 400 if (xp->i_ump->um_fstype == UFS1) 401 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 402 BLK_NOCOPY); 403 else 404 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 405 BLK_NOCOPY); 406 if (blkno) 407 db_assign(xp, loc, blkno); 408 if (!error) { 409 args.a_pvp = vp; 410 args.a_ino = xp->i_number; 411 args.a_mode = xp->i_mode; 412 error = ffs_freefile(&args); 413 } 414 VOP_UNLOCK(xvp, 0); 415 if (error) { 416 free(copy_fs->fs_csp, M_UFSMNT); 417 goto out1; 418 } 419 MNT_ILOCK(mp); 420 } 421 MNT_IUNLOCK(mp); 422 /* 423 * If there already exist snapshots on this filesystem, grab a 424 * reference to their shared lock. If this is the first snapshot 425 * on this filesystem, we need to allocate a lock for the snapshots 426 * to share. In either case, acquire the snapshot lock and give 427 * up our original private lock. 428 */ 429 VI_LOCK(devvp); 430 if ((xp = TAILQ_FIRST(&ump->um_snapshots)) != NULL) { 431 struct lock *lkp; 432 433 lkp = ITOV(xp)->v_vnlock; 434 VI_UNLOCK(devvp); 435 VI_LOCK(vp); 436 vp->v_vnlock = lkp; 437 } else { 438 struct lock *lkp; 439 440 VI_UNLOCK(devvp); 441 MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT, 442 M_WAITOK); 443 lockinit(lkp, PVFS, "snaplk", 0, LK_CANRECURSE); 444 VI_LOCK(vp); 445 vp->v_vnlock = lkp; 446 } 447 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY); 448 transferlockers(&vp->v_lock, vp->v_vnlock); 449 lockmgr(&vp->v_lock, LK_RELEASE, NULL); 450 /* 451 * If this is the first snapshot on this filesystem, then we need 452 * to allocate the space for the list of preallocated snapshot blocks. 453 * This list will be refined below, but this preliminary one will 454 * keep us out of deadlock until the full one is ready. 455 */ 456 if (xp == NULL) { 457 MALLOC(snapblklist, ufs2_daddr_t *, 458 snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK); 459 blkp = &snapblklist[1]; 460 *blkp++ = ufs_rw64(lblkno(fs, fs->fs_sblockloc), ns); 461 blkno = fragstoblks(fs, fs->fs_csaddr); 462 for (cg = 0; cg < fs->fs_ncg; cg++) { 463 if (fragstoblks(fs, cgtod(fs, cg) > blkno)) 464 break; 465 *blkp++ = ufs_rw64(fragstoblks(fs, cgtod(fs, cg)), ns); 466 } 467 len = howmany(fs->fs_cssize, fs->fs_bsize); 468 for (loc = 0; loc < len; loc++) 469 *blkp++ = ufs_rw64(blkno + loc, ns); 470 for (; cg < fs->fs_ncg; cg++) 471 *blkp++ = ufs_rw64(fragstoblks(fs, cgtod(fs, cg)), ns); 472 snapblklist[0] = ufs_rw64(blkp - snapblklist, ns); 473 VI_LOCK(devvp); 474 if (ump->um_snapblklist != NULL) 475 panic("ffs_snapshot: non-empty list"); 476 ump->um_snapblklist = snapblklist; 477 ump->um_snaplistsize = blkp - snapblklist; 478 VI_UNLOCK(devvp); 479 } 480 /* 481 * Record snapshot inode. Since this is the newest snapshot, 482 * it must be placed at the end of the list. 483 */ 484 VI_LOCK(devvp); 485 fs->fs_snapinum[snaploc] = ip->i_number; 486 if (ip->i_nextsnap.tqe_prev != 0) 487 panic("ffs_snapshot: %d already on list", ip->i_number); 488 TAILQ_INSERT_TAIL(&ump->um_snapshots, ip, i_nextsnap); 489 VI_UNLOCK(devvp); 490 if (xp == NULL) 491 vn_cow_establish(devvp, ffs_copyonwrite, devvp); 492 vp->v_flag |= VSYSTEM; 493 out1: 494 /* 495 * Resume operation on filesystem. 496 */ 497 vfs_write_resume(vp->v_mount); 498 /* 499 * Set the mtime to the time the snapshot has been taken. 500 */ 501 TIMEVAL_TO_TIMESPEC(&starttime, &ts); 502 if (ctime) 503 *ctime = ts; 504 DIP_ASSIGN(ip, mtime, ts.tv_sec); 505 DIP_ASSIGN(ip, mtimensec, ts.tv_nsec); 506 ip->i_flag |= IN_CHANGE | IN_UPDATE; 507 508 #ifdef DEBUG 509 if (starttime.tv_sec > 0) { 510 microtime(&endtime); 511 timersub(&endtime, &starttime, &endtime); 512 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 513 vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, 514 endtime.tv_usec / 1000, redo, fs->fs_ncg); 515 } 516 #endif 517 if (error) 518 goto out; 519 /* 520 * Copy allocation information from all the snapshots in 521 * this snapshot and then expunge them from its view. 522 */ 523 TAILQ_FOREACH(xp, &ump->um_snapshots, i_nextsnap) { 524 if (xp == ip) 525 break; 526 if (xp->i_ump->um_fstype == UFS1) 527 error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 528 BLK_SNAP); 529 else 530 error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 531 BLK_SNAP); 532 if (error) { 533 fs->fs_snapinum[snaploc] = 0; 534 goto done; 535 } 536 } 537 /* 538 * Allocate space for the full list of preallocated snapshot blocks. 539 */ 540 MALLOC(snapblklist, ufs2_daddr_t *, snaplistsize * sizeof(ufs2_daddr_t), 541 M_UFSMNT, M_WAITOK); 542 ip->i_snapblklist = &snapblklist[1]; 543 /* 544 * Expunge the blocks used by the snapshots from the set of 545 * blocks marked as used in the snapshot bitmaps. Also, collect 546 * the list of allocated blocks in i_snapblklist. 547 */ 548 if (ip->i_ump->um_fstype == UFS1) 549 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); 550 else 551 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); 552 if (error) { 553 fs->fs_snapinum[snaploc] = 0; 554 FREE(snapblklist, M_UFSMNT); 555 goto done; 556 } 557 if (snaplistsize < ip->i_snapblklist - snapblklist) 558 panic("ffs_snapshot: list too small"); 559 snaplistsize = ip->i_snapblklist - snapblklist; 560 snapblklist[0] = ufs_rw64(snaplistsize, ns); 561 ip->i_snapblklist = 0; 562 /* 563 * Write out the list of allocated blocks to the end of the snapshot. 564 */ 565 error = vn_rdwr(UIO_WRITE, vp, 566 (caddr_t)snapblklist, snaplistsize*sizeof(ufs2_daddr_t), ip->i_size, 567 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, p); 568 if (error) { 569 fs->fs_snapinum[snaploc] = 0; 570 FREE(snapblklist, M_UFSMNT); 571 goto done; 572 } 573 /* 574 * Write the superblock and its summary information 575 * to the snapshot. 576 */ 577 blkno = fragstoblks(fs, fs->fs_csaddr); 578 len = howmany(fs->fs_cssize, fs->fs_bsize); 579 space = copy_fs->fs_csp; 580 if (ns) { 581 ffs_sb_swap(copy_fs, copy_fs); 582 ffs_csum_swap(space, space, fs->fs_cssize); 583 } 584 for (loc = 0; loc < len; loc++) { 585 if ((error = writevnblk(vp, space, blkno + loc)) != 0) { 586 fs->fs_snapinum[snaploc] = 0; 587 FREE(snapblklist, M_UFSMNT); 588 goto done; 589 } 590 space = (char *)space + fs->fs_bsize; 591 } 592 /* 593 * As this is the newest list, it is the most inclusive, so 594 * should replace the previous list. 595 */ 596 VI_LOCK(devvp); 597 space = ump->um_snapblklist; 598 ump->um_snapblklist = snapblklist; 599 ump->um_snaplistsize = snaplistsize; 600 VI_UNLOCK(devvp); 601 if (space != NULL) 602 FREE(space, M_UFSMNT); 603 done: 604 free(copy_fs->fs_csp, M_UFSMNT); 605 blkno = lblkno(fs, fs->fs_sblockloc); 606 if (error == 0 && (error = writevnblk(vp, cgbuf, blkno)) != 0) 607 fs->fs_snapinum[snaploc] = 0; 608 out: 609 /* 610 * All block address modifications are done. Invalidate and free 611 * all pages on the snapshot vnode. Those coming from read ahead 612 * are no longer valid. 613 */ 614 if (!error) { 615 simple_lock(&vp->v_interlock); 616 error = VOP_PUTPAGES(vp, 0, 0, 617 PGO_ALLPAGES|PGO_CLEANIT|PGO_SYNCIO|PGO_FREE); 618 } 619 if (cgbuf) 620 free(cgbuf, M_UFSMNT); 621 if (fs->fs_active != 0) { 622 FREE(fs->fs_active, M_DEVBUF); 623 fs->fs_active = 0; 624 } 625 mp->mnt_flag = flag; 626 if (error) 627 (void) VOP_TRUNCATE(vp, (off_t)0, 0, NOCRED, p); 628 else 629 vref(vp); 630 return (error); 631 } 632 633 /* 634 * Copy a cylinder group map. All the unallocated blocks are marked 635 * BLK_NOCOPY so that the snapshot knows that it need not copy them 636 * if they are later written. If passno is one, then this is a first 637 * pass, so only setting needs to be done. If passno is 2, then this 638 * is a revision to a previous pass which must be undone as the 639 * replacement pass is done. 640 */ 641 static int 642 cgaccount(cg, vp, data, passno) 643 int cg; 644 struct vnode *vp; 645 caddr_t data; 646 int passno; 647 { 648 struct buf *bp, *ibp; 649 struct inode *ip; 650 struct cg *cgp; 651 struct fs *fs; 652 ufs2_daddr_t base, numblks; 653 int error, len, loc, ns, indiroff; 654 655 ip = VTOI(vp); 656 fs = ip->i_fs; 657 ns = UFS_FSNEEDSWAP(fs); 658 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 659 (int)fs->fs_cgsize, KERNCRED, &bp); 660 if (error) { 661 brelse(bp); 662 return (error); 663 } 664 cgp = (struct cg *)bp->b_data; 665 if (!cg_chkmagic(cgp, ns)) { 666 brelse(bp); 667 return (EIO); 668 } 669 ACTIVECG_SET(fs, cg); 670 671 bcopy(bp->b_data, data, fs->fs_cgsize); 672 brelse(bp); 673 if (fs->fs_cgsize < fs->fs_bsize) 674 bzero(&data[fs->fs_cgsize], 675 fs->fs_bsize - fs->fs_cgsize); 676 numblks = howmany(fs->fs_size, fs->fs_frag); 677 len = howmany(fs->fs_fpg, fs->fs_frag); 678 base = cg * fs->fs_fpg / fs->fs_frag; 679 if (base + len >= numblks) 680 len = numblks - base - 1; 681 loc = 0; 682 if (base < NDADDR) { 683 for ( ; loc < NDADDR; loc++) { 684 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 685 db_assign(ip, loc, BLK_NOCOPY); 686 else if (db_get(ip, loc) == BLK_NOCOPY) { 687 if (passno == 2) 688 db_assign(ip, loc, 0); 689 else if (passno == 1) 690 panic("ffs_snapshot: lost direct block"); 691 } 692 } 693 } 694 if ((error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), 695 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0) 696 return (error); 697 indiroff = (base + loc - NDADDR) % NINDIR(fs); 698 for ( ; loc < len; loc++, indiroff++) { 699 if (indiroff >= NINDIR(fs)) { 700 bwrite(ibp); 701 if ((error = VOP_BALLOC(vp, 702 lblktosize(fs, (off_t)(base + loc)), 703 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0) 704 return (error); 705 indiroff = 0; 706 } 707 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 708 idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY); 709 else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) { 710 if (passno == 2) 711 idb_assign(ip, ibp->b_data, indiroff, 0); 712 else if (passno == 1) 713 panic("ffs_snapshot: lost indirect block"); 714 } 715 } 716 bwrite(ibp); 717 return (0); 718 } 719 720 /* 721 * Before expunging a snapshot inode, note all the 722 * blocks that it claims with BLK_SNAP so that fsck will 723 * be able to account for those blocks properly and so 724 * that this snapshot knows that it need not copy them 725 * if the other snapshot holding them is freed. This code 726 * is reproduced once each for UFS1 and UFS2. 727 */ 728 static int 729 expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype) 730 struct vnode *snapvp; 731 struct inode *cancelip; 732 struct fs *fs; 733 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 734 struct fs *, ufs_lbn_t, int); 735 int expungetype; 736 { 737 int i, s, error, ns, indiroff; 738 ufs_lbn_t lbn, rlbn; 739 ufs2_daddr_t len, blkno, numblks, blksperindir; 740 struct ufs1_dinode *dip; 741 struct buf *bp; 742 caddr_t buf; 743 744 ns = UFS_FSNEEDSWAP(fs); 745 /* 746 * Prepare to expunge the inode. If its inode block has not 747 * yet been copied, then allocate and fill the copy. 748 */ 749 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 750 blkno = 0; 751 if (lbn < NDADDR) { 752 blkno = db_get(VTOI(snapvp), lbn); 753 } else { 754 s = cow_enter(); 755 error = VOP_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 756 fs->fs_bsize, KERNCRED, B_METAONLY, &bp); 757 cow_leave(s); 758 if (error) 759 return (error); 760 indiroff = (lbn - NDADDR) % NINDIR(fs); 761 blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff); 762 brelse(bp); 763 } 764 buf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 765 if (blkno != 0) 766 error = readvnblk(snapvp, buf, lbn); 767 else 768 error = readfsblk(snapvp, buf, lbn); 769 if (error) { 770 free(buf, M_UFSMNT); 771 return error; 772 } 773 /* 774 * Set a snapshot inode to be a zero length file, regular files 775 * to be completely unallocated. 776 */ 777 dip = (struct ufs1_dinode *)buf + ino_to_fsbo(fs, cancelip->i_number); 778 if (expungetype == BLK_NOCOPY) 779 dip->di_mode = 0; 780 dip->di_size = 0; 781 dip->di_blocks = 0; 782 dip->di_flags = 783 ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns); 784 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); 785 error = writevnblk(snapvp, buf, lbn); 786 free(buf, M_UFSMNT); 787 if (error) 788 return error; 789 /* 790 * Now go through and expunge all the blocks in the file 791 * using the function requested. 792 */ 793 numblks = howmany(cancelip->i_size, fs->fs_bsize); 794 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_db[0], 795 &cancelip->i_ffs1_db[NDADDR], fs, 0, expungetype))) 796 return (error); 797 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_ib[0], 798 &cancelip->i_ffs1_ib[NIADDR], fs, -1, expungetype))) 799 return (error); 800 blksperindir = 1; 801 lbn = -NDADDR; 802 len = numblks - NDADDR; 803 rlbn = NDADDR; 804 for (i = 0; len > 0 && i < NIADDR; i++) { 805 error = indiracct_ufs1(snapvp, ITOV(cancelip), i, 806 ufs_rw32(cancelip->i_ffs1_ib[i], ns), lbn, rlbn, len, 807 blksperindir, fs, acctfunc, expungetype); 808 if (error) 809 return (error); 810 blksperindir *= NINDIR(fs); 811 lbn -= blksperindir + 1; 812 len -= blksperindir; 813 rlbn += blksperindir; 814 } 815 return (0); 816 } 817 818 /* 819 * Descend an indirect block chain for vnode cancelvp accounting for all 820 * its indirect blocks in snapvp. 821 */ 822 static int 823 indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 824 blksperindir, fs, acctfunc, expungetype) 825 struct vnode *snapvp; 826 struct vnode *cancelvp; 827 int level; 828 ufs1_daddr_t blkno; 829 ufs_lbn_t lbn; 830 ufs_lbn_t rlbn; 831 ufs_lbn_t remblks; 832 ufs_lbn_t blksperindir; 833 struct fs *fs; 834 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 835 struct fs *, ufs_lbn_t, int); 836 int expungetype; 837 { 838 int error, ns, num, i; 839 ufs_lbn_t subblksperindir; 840 struct indir indirs[NIADDR + 2]; 841 ufs1_daddr_t last, *bap; 842 struct buf *bp; 843 844 ns = UFS_FSNEEDSWAP(fs); 845 846 if (blkno == 0) { 847 if (expungetype == BLK_NOCOPY) 848 return (0); 849 panic("indiracct_ufs1: missing indir"); 850 } 851 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 852 return (error); 853 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 854 panic("indiracct_ufs1: botched params"); 855 /* 856 * We have to expand bread here since it will deadlock looking 857 * up the block number for any blocks that are not in the cache. 858 */ 859 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); 860 bp->b_blkno = fsbtodb(fs, blkno); 861 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 862 (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) { 863 brelse(bp); 864 return (error); 865 } 866 /* 867 * Account for the block pointers in this indirect block. 868 */ 869 last = howmany(remblks, blksperindir); 870 if (last > NINDIR(fs)) 871 last = NINDIR(fs); 872 MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); 873 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 874 brelse(bp); 875 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 876 level == 0 ? rlbn : -1, expungetype); 877 if (error || level == 0) 878 goto out; 879 /* 880 * Account for the block pointers in each of the indirect blocks 881 * in the levels below us. 882 */ 883 subblksperindir = blksperindir / NINDIR(fs); 884 for (lbn++, level--, i = 0; i < last; i++) { 885 error = indiracct_ufs1(snapvp, cancelvp, level, 886 ufs_rw32(bap[i], ns), lbn, rlbn, remblks, subblksperindir, 887 fs, acctfunc, expungetype); 888 if (error) 889 goto out; 890 rlbn += blksperindir; 891 lbn -= blksperindir; 892 remblks -= blksperindir; 893 } 894 out: 895 FREE(bap, M_DEVBUF); 896 return (error); 897 } 898 899 /* 900 * Do both snap accounting and map accounting. 901 */ 902 static int 903 fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) 904 struct vnode *vp; 905 ufs1_daddr_t *oldblkp, *lastblkp; 906 struct fs *fs; 907 ufs_lbn_t lblkno; 908 int exptype; /* BLK_SNAP or BLK_NOCOPY */ 909 { 910 int error; 911 912 if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 913 return (error); 914 return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 915 } 916 917 /* 918 * Identify a set of blocks allocated in a snapshot inode. 919 */ 920 static int 921 snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 922 struct vnode *vp; 923 ufs1_daddr_t *oldblkp, *lastblkp; 924 struct fs *fs; 925 ufs_lbn_t lblkno; 926 int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 927 { 928 struct inode *ip = VTOI(vp); 929 ufs1_daddr_t blkno, *blkp; 930 ufs_lbn_t lbn; 931 struct buf *ibp; 932 int error, ns; 933 934 ns = UFS_FSNEEDSWAP(fs); 935 936 for ( ; oldblkp < lastblkp; oldblkp++) { 937 blkno = ufs_rw32(*oldblkp, ns); 938 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 939 continue; 940 lbn = fragstoblks(fs, blkno); 941 if (lbn < NDADDR) { 942 blkp = &ip->i_ffs1_db[lbn]; 943 ip->i_flag |= IN_CHANGE | IN_UPDATE; 944 } else { 945 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 946 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 947 if (error) 948 return (error); 949 blkp = &((ufs1_daddr_t *)(ibp->b_data)) 950 [(lbn - NDADDR) % NINDIR(fs)]; 951 } 952 /* 953 * If we are expunging a snapshot vnode and we 954 * find a block marked BLK_NOCOPY, then it is 955 * one that has been allocated to this snapshot after 956 * we took our current snapshot and can be ignored. 957 */ 958 blkno = ufs_rw32(*blkp, ns); 959 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { 960 if (lbn >= NDADDR) 961 brelse(ibp); 962 } else { 963 if (blkno != 0) 964 panic("snapacct_ufs1: bad block"); 965 *blkp = ufs_rw32(expungetype, ns); 966 if (lbn >= NDADDR) 967 bwrite(ibp); 968 } 969 } 970 return (0); 971 } 972 973 /* 974 * Account for a set of blocks allocated in a snapshot inode. 975 */ 976 static int 977 mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 978 struct vnode *vp; 979 ufs1_daddr_t *oldblkp, *lastblkp; 980 struct fs *fs; 981 ufs_lbn_t lblkno; 982 int expungetype; 983 { 984 ufs1_daddr_t blkno; 985 struct inode *ip; 986 ino_t inum; 987 int acctit, ns; 988 989 ns = UFS_FSNEEDSWAP(fs); 990 ip = VTOI(vp); 991 inum = ip->i_number; 992 if (lblkno == -1) 993 acctit = 0; 994 else 995 acctit = 1; 996 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 997 blkno = ufs_rw32(*oldblkp, ns); 998 if (blkno == 0 || blkno == BLK_NOCOPY) 999 continue; 1000 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1001 *ip->i_snapblklist++ = ufs_rw64(lblkno, ns); 1002 if (blkno == BLK_SNAP) 1003 blkno = blkstofrags(fs, lblkno); 1004 ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum); 1005 } 1006 return (0); 1007 } 1008 1009 /* 1010 * Before expunging a snapshot inode, note all the 1011 * blocks that it claims with BLK_SNAP so that fsck will 1012 * be able to account for those blocks properly and so 1013 * that this snapshot knows that it need not copy them 1014 * if the other snapshot holding them is freed. This code 1015 * is reproduced once each for UFS1 and UFS2. 1016 */ 1017 static int 1018 expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype) 1019 struct vnode *snapvp; 1020 struct inode *cancelip; 1021 struct fs *fs; 1022 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1023 struct fs *, ufs_lbn_t, int); 1024 int expungetype; 1025 { 1026 int i, s, error, ns, indiroff; 1027 ufs_lbn_t lbn, rlbn; 1028 ufs2_daddr_t len, blkno, numblks, blksperindir; 1029 struct ufs2_dinode *dip; 1030 struct buf *bp; 1031 caddr_t buf; 1032 1033 ns = UFS_FSNEEDSWAP(fs); 1034 /* 1035 * Prepare to expunge the inode. If its inode block has not 1036 * yet been copied, then allocate and fill the copy. 1037 */ 1038 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1039 blkno = 0; 1040 if (lbn < NDADDR) { 1041 blkno = db_get(VTOI(snapvp), lbn); 1042 } else { 1043 s = cow_enter(); 1044 error = VOP_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 1045 fs->fs_bsize, KERNCRED, B_METAONLY, &bp); 1046 cow_leave(s); 1047 if (error) 1048 return (error); 1049 indiroff = (lbn - NDADDR) % NINDIR(fs); 1050 blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff); 1051 brelse(bp); 1052 } 1053 buf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1054 if (blkno != 0) 1055 error = readvnblk(snapvp, buf, lbn); 1056 else 1057 error = readfsblk(snapvp, buf, lbn); 1058 if (error) { 1059 free(buf, M_UFSMNT); 1060 return error; 1061 } 1062 /* 1063 * Set a snapshot inode to be a zero length file, regular files 1064 * to be completely unallocated. 1065 */ 1066 dip = (struct ufs2_dinode *)buf + ino_to_fsbo(fs, cancelip->i_number); 1067 if (expungetype == BLK_NOCOPY) 1068 dip->di_mode = 0; 1069 dip->di_size = 0; 1070 dip->di_blocks = 0; 1071 dip->di_flags = 1072 ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns); 1073 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); 1074 error = writevnblk(snapvp, buf, lbn); 1075 free(buf, M_UFSMNT); 1076 if (error) 1077 return error; 1078 /* 1079 * Now go through and expunge all the blocks in the file 1080 * using the function requested. 1081 */ 1082 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1083 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_db[0], 1084 &cancelip->i_ffs2_db[NDADDR], fs, 0, expungetype))) 1085 return (error); 1086 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_ib[0], 1087 &cancelip->i_ffs2_ib[NIADDR], fs, -1, expungetype))) 1088 return (error); 1089 blksperindir = 1; 1090 lbn = -NDADDR; 1091 len = numblks - NDADDR; 1092 rlbn = NDADDR; 1093 for (i = 0; len > 0 && i < NIADDR; i++) { 1094 error = indiracct_ufs2(snapvp, ITOV(cancelip), i, 1095 ufs_rw64(cancelip->i_ffs2_ib[i], ns), lbn, rlbn, len, 1096 blksperindir, fs, acctfunc, expungetype); 1097 if (error) 1098 return (error); 1099 blksperindir *= NINDIR(fs); 1100 lbn -= blksperindir + 1; 1101 len -= blksperindir; 1102 rlbn += blksperindir; 1103 } 1104 return (0); 1105 } 1106 1107 /* 1108 * Descend an indirect block chain for vnode cancelvp accounting for all 1109 * its indirect blocks in snapvp. 1110 */ 1111 static int 1112 indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 1113 blksperindir, fs, acctfunc, expungetype) 1114 struct vnode *snapvp; 1115 struct vnode *cancelvp; 1116 int level; 1117 ufs2_daddr_t blkno; 1118 ufs_lbn_t lbn; 1119 ufs_lbn_t rlbn; 1120 ufs_lbn_t remblks; 1121 ufs_lbn_t blksperindir; 1122 struct fs *fs; 1123 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1124 struct fs *, ufs_lbn_t, int); 1125 int expungetype; 1126 { 1127 int error, ns, num, i; 1128 ufs_lbn_t subblksperindir; 1129 struct indir indirs[NIADDR + 2]; 1130 ufs2_daddr_t last, *bap; 1131 struct buf *bp; 1132 1133 ns = UFS_FSNEEDSWAP(fs); 1134 1135 if (blkno == 0) { 1136 if (expungetype == BLK_NOCOPY) 1137 return (0); 1138 panic("indiracct_ufs2: missing indir"); 1139 } 1140 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1141 return (error); 1142 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1143 panic("indiracct_ufs2: botched params"); 1144 /* 1145 * We have to expand bread here since it will deadlock looking 1146 * up the block number for any blocks that are not in the cache. 1147 */ 1148 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); 1149 bp->b_blkno = fsbtodb(fs, blkno); 1150 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 1151 (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) { 1152 brelse(bp); 1153 return (error); 1154 } 1155 /* 1156 * Account for the block pointers in this indirect block. 1157 */ 1158 last = howmany(remblks, blksperindir); 1159 if (last > NINDIR(fs)) 1160 last = NINDIR(fs); 1161 MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); 1162 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 1163 brelse(bp); 1164 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 1165 level == 0 ? rlbn : -1, expungetype); 1166 if (error || level == 0) 1167 goto out; 1168 /* 1169 * Account for the block pointers in each of the indirect blocks 1170 * in the levels below us. 1171 */ 1172 subblksperindir = blksperindir / NINDIR(fs); 1173 for (lbn++, level--, i = 0; i < last; i++) { 1174 error = indiracct_ufs2(snapvp, cancelvp, level, 1175 ufs_rw64(bap[i], ns), lbn, rlbn, remblks, subblksperindir, 1176 fs, acctfunc, expungetype); 1177 if (error) 1178 goto out; 1179 rlbn += blksperindir; 1180 lbn -= blksperindir; 1181 remblks -= blksperindir; 1182 } 1183 out: 1184 FREE(bap, M_DEVBUF); 1185 return (error); 1186 } 1187 1188 /* 1189 * Do both snap accounting and map accounting. 1190 */ 1191 static int 1192 fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) 1193 struct vnode *vp; 1194 ufs2_daddr_t *oldblkp, *lastblkp; 1195 struct fs *fs; 1196 ufs_lbn_t lblkno; 1197 int exptype; /* BLK_SNAP or BLK_NOCOPY */ 1198 { 1199 int error; 1200 1201 if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 1202 return (error); 1203 return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 1204 } 1205 1206 /* 1207 * Identify a set of blocks allocated in a snapshot inode. 1208 */ 1209 static int 1210 snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1211 struct vnode *vp; 1212 ufs2_daddr_t *oldblkp, *lastblkp; 1213 struct fs *fs; 1214 ufs_lbn_t lblkno; 1215 int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 1216 { 1217 struct inode *ip = VTOI(vp); 1218 ufs2_daddr_t blkno, *blkp; 1219 ufs_lbn_t lbn; 1220 struct buf *ibp; 1221 int error, ns; 1222 1223 ns = UFS_FSNEEDSWAP(fs); 1224 1225 for ( ; oldblkp < lastblkp; oldblkp++) { 1226 blkno = ufs_rw64(*oldblkp, ns); 1227 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1228 continue; 1229 lbn = fragstoblks(fs, blkno); 1230 if (lbn < NDADDR) { 1231 blkp = &ip->i_ffs2_db[lbn]; 1232 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1233 } else { 1234 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1235 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1236 if (error) 1237 return (error); 1238 blkp = &((ufs2_daddr_t *)(ibp->b_data)) 1239 [(lbn - NDADDR) % NINDIR(fs)]; 1240 } 1241 /* 1242 * If we are expunging a snapshot vnode and we 1243 * find a block marked BLK_NOCOPY, then it is 1244 * one that has been allocated to this snapshot after 1245 * we took our current snapshot and can be ignored. 1246 */ 1247 blkno = ufs_rw64(*blkp, ns); 1248 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { 1249 if (lbn >= NDADDR) 1250 brelse(ibp); 1251 } else { 1252 if (blkno != 0) 1253 panic("snapacct_ufs2: bad block"); 1254 *blkp = ufs_rw64(expungetype, ns); 1255 if (lbn >= NDADDR) 1256 bwrite(ibp); 1257 } 1258 } 1259 return (0); 1260 } 1261 1262 /* 1263 * Account for a set of blocks allocated in a snapshot inode. 1264 */ 1265 static int 1266 mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1267 struct vnode *vp; 1268 ufs2_daddr_t *oldblkp, *lastblkp; 1269 struct fs *fs; 1270 ufs_lbn_t lblkno; 1271 int expungetype; 1272 { 1273 ufs2_daddr_t blkno; 1274 struct inode *ip; 1275 ino_t inum; 1276 int acctit, ns; 1277 1278 ns = UFS_FSNEEDSWAP(fs); 1279 ip = VTOI(vp); 1280 inum = ip->i_number; 1281 if (lblkno == -1) 1282 acctit = 0; 1283 else 1284 acctit = 1; 1285 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 1286 blkno = ufs_rw64(*oldblkp, ns); 1287 if (blkno == 0 || blkno == BLK_NOCOPY) 1288 continue; 1289 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1290 *ip->i_snapblklist++ = ufs_rw64(lblkno, ns); 1291 if (blkno == BLK_SNAP) 1292 blkno = blkstofrags(fs, lblkno); 1293 ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum); 1294 } 1295 return (0); 1296 } 1297 1298 /* 1299 * Decrement extra reference on snapshot when last name is removed. 1300 * It will not be freed until the last open reference goes away. 1301 */ 1302 void 1303 ffs_snapgone(ip) 1304 struct inode *ip; 1305 { 1306 struct ufsmount *ump = VFSTOUFS(ip->i_devvp->v_specmountpoint); 1307 struct inode *xp; 1308 struct fs *fs; 1309 int snaploc; 1310 1311 /* 1312 * Find snapshot in incore list. 1313 */ 1314 TAILQ_FOREACH(xp, &ump->um_snapshots, i_nextsnap) 1315 if (xp == ip) 1316 break; 1317 if (xp != NULL) 1318 vrele(ITOV(ip)); 1319 #ifdef DEBUG 1320 else if (snapdebug) 1321 printf("ffs_snapgone: lost snapshot vnode %d\n", 1322 ip->i_number); 1323 #endif 1324 /* 1325 * Delete snapshot inode from superblock. Keep list dense. 1326 */ 1327 fs = ip->i_fs; 1328 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 1329 if (fs->fs_snapinum[snaploc] == ip->i_number) 1330 break; 1331 if (snaploc < FSMAXSNAP) { 1332 for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { 1333 if (fs->fs_snapinum[snaploc] == 0) 1334 break; 1335 fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; 1336 } 1337 fs->fs_snapinum[snaploc - 1] = 0; 1338 } 1339 } 1340 1341 /* 1342 * Prepare a snapshot file for being removed. 1343 */ 1344 void 1345 ffs_snapremove(vp) 1346 struct vnode *vp; 1347 { 1348 struct inode *ip = VTOI(vp); 1349 struct vnode *devvp = ip->i_devvp; 1350 struct fs *fs = ip->i_fs; 1351 struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint); 1352 struct lock *lkp; 1353 struct buf *ibp; 1354 ufs2_daddr_t numblks, blkno, dblk, *snapblklist; 1355 int error, ns, loc, last; 1356 1357 ns = UFS_FSNEEDSWAP(fs); 1358 /* 1359 * If active, delete from incore list (this snapshot may 1360 * already have been in the process of being deleted, so 1361 * would not have been active). 1362 * 1363 * Clear copy-on-write flag if last snapshot. 1364 */ 1365 if (ip->i_nextsnap.tqe_prev != 0) { 1366 VI_LOCK(devvp); 1367 lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE, 1368 VI_MTX(devvp)); 1369 VI_LOCK(devvp); 1370 TAILQ_REMOVE(&ump->um_snapshots, ip, i_nextsnap); 1371 ip->i_nextsnap.tqe_prev = 0; 1372 lkp = vp->v_vnlock; 1373 vp->v_vnlock = &vp->v_lock; 1374 lockmgr(lkp, LK_RELEASE, NULL); 1375 if (TAILQ_FIRST(&ump->um_snapshots) != 0) { 1376 VI_UNLOCK(devvp); 1377 } else { 1378 snapblklist = ump->um_snapblklist; 1379 ump->um_snapblklist = 0; 1380 ump->um_snaplistsize = 0; 1381 lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp)); 1382 lockmgr(lkp, LK_RELEASE, NULL); 1383 vn_cow_disestablish(devvp, ffs_copyonwrite, devvp); 1384 FREE(lkp, M_UFSMNT); 1385 FREE(snapblklist, M_UFSMNT); 1386 } 1387 } 1388 /* 1389 * Clear all BLK_NOCOPY fields. Pass any block claims to other 1390 * snapshots that want them (see ffs_snapblkfree below). 1391 */ 1392 for (blkno = 1; blkno < NDADDR; blkno++) { 1393 dblk = db_get(ip, blkno); 1394 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1395 db_assign(ip, blkno, 0); 1396 else if ((dblk == blkstofrags(fs, blkno) && 1397 ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, 1398 ip->i_number))) { 1399 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); 1400 db_assign(ip, blkno, 0); 1401 } 1402 } 1403 numblks = howmany(ip->i_size, fs->fs_bsize); 1404 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 1405 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), 1406 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1407 if (error) 1408 continue; 1409 if (fs->fs_size - blkno > NINDIR(fs)) 1410 last = NINDIR(fs); 1411 else 1412 last = fs->fs_size - blkno; 1413 for (loc = 0; loc < last; loc++) { 1414 dblk = idb_get(ip, ibp->b_data, loc); 1415 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1416 idb_assign(ip, ibp->b_data, loc, 0); 1417 else if (dblk == blkstofrags(fs, blkno) && 1418 ffs_snapblkfree(fs, ip->i_devvp, dblk, 1419 fs->fs_bsize, ip->i_number)) { 1420 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); 1421 idb_assign(ip, ibp->b_data, loc, 0); 1422 } 1423 } 1424 bwrite(ibp); 1425 } 1426 /* 1427 * Clear snapshot flag and drop reference. 1428 */ 1429 ip->i_flags &= ~SF_SNAPSHOT; 1430 DIP_ASSIGN(ip, flags, ip->i_flags); 1431 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1432 } 1433 1434 /* 1435 * Notification that a block is being freed. Return zero if the free 1436 * should be allowed to proceed. Return non-zero if the snapshot file 1437 * wants to claim the block. The block will be claimed if it is an 1438 * uncopied part of one of the snapshots. It will be freed if it is 1439 * either a BLK_NOCOPY or has already been copied in all of the snapshots. 1440 * If a fragment is being freed, then all snapshots that care about 1441 * it must make a copy since a snapshot file can only claim full sized 1442 * blocks. Note that if more than one snapshot file maps the block, 1443 * we can pick one at random to claim it. Since none of the snapshots 1444 * can change, we are assurred that they will all see the same unmodified 1445 * image. When deleting a snapshot file (see ffs_snapremove above), we 1446 * must push any of these claimed blocks to one of the other snapshots 1447 * that maps it. These claimed blocks are easily identified as they will 1448 * have a block number equal to their logical block number within the 1449 * snapshot. A copied block can never have this property because they 1450 * must always have been allocated from a BLK_NOCOPY location. 1451 */ 1452 int 1453 ffs_snapblkfree(fs, devvp, bno, size, inum) 1454 struct fs *fs; 1455 struct vnode *devvp; 1456 ufs2_daddr_t bno; 1457 long size; 1458 ino_t inum; 1459 { 1460 struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint); 1461 struct buf *ibp; 1462 struct inode *ip; 1463 struct vnode *vp = NULL, *saved_vp = NULL; 1464 caddr_t saved_data = NULL; 1465 ufs_lbn_t lbn; 1466 ufs2_daddr_t blkno; 1467 int s, indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0; 1468 1469 lbn = fragstoblks(fs, bno); 1470 retry: 1471 VI_LOCK(devvp); 1472 TAILQ_FOREACH(ip, &ump->um_snapshots, i_nextsnap) { 1473 vp = ITOV(ip); 1474 /* 1475 * Lookup block being written. 1476 */ 1477 if (lbn < NDADDR) { 1478 blkno = db_get(ip, lbn); 1479 } else { 1480 if (snapshot_locked == 0 && 1481 lockmgr(vp->v_vnlock, 1482 LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1483 VI_MTX(devvp)) != 0) 1484 goto retry; 1485 snapshot_locked = 1; 1486 s = cow_enter(); 1487 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1488 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1489 cow_leave(s); 1490 if (error) 1491 break; 1492 indiroff = (lbn - NDADDR) % NINDIR(fs); 1493 blkno = idb_get(ip, ibp->b_data, indiroff); 1494 } 1495 /* 1496 * Check to see if block needs to be copied. 1497 */ 1498 if (blkno == 0) { 1499 /* 1500 * A block that we map is being freed. If it has not 1501 * been claimed yet, we will claim or copy it (below). 1502 */ 1503 claimedblk = 1; 1504 } else if (blkno == BLK_SNAP) { 1505 /* 1506 * No previous snapshot claimed the block, 1507 * so it will be freed and become a BLK_NOCOPY 1508 * (don't care) for us. 1509 */ 1510 if (claimedblk) 1511 panic("snapblkfree: inconsistent block type"); 1512 if (snapshot_locked == 0 && 1513 lockmgr(vp->v_vnlock, 1514 LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, 1515 VI_MTX(devvp)) != 0) { 1516 if (lbn >= NDADDR) 1517 brelse(ibp); 1518 vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL); 1519 goto retry; 1520 } 1521 snapshot_locked = 1; 1522 if (lbn < NDADDR) { 1523 db_assign(ip, lbn, BLK_NOCOPY); 1524 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1525 } else { 1526 idb_assign(ip, ibp->b_data, indiroff, 1527 BLK_NOCOPY); 1528 bwrite(ibp); 1529 } 1530 continue; 1531 } else /* BLK_NOCOPY or default */ { 1532 /* 1533 * If the snapshot has already copied the block 1534 * (default), or does not care about the block, 1535 * it is not needed. 1536 */ 1537 if (lbn >= NDADDR) 1538 brelse(ibp); 1539 continue; 1540 } 1541 /* 1542 * If this is a full size block, we will just grab it 1543 * and assign it to the snapshot inode. Otherwise we 1544 * will proceed to copy it. See explanation for this 1545 * routine as to why only a single snapshot needs to 1546 * claim this block. 1547 */ 1548 if (snapshot_locked == 0 && 1549 lockmgr(vp->v_vnlock, 1550 LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, 1551 VI_MTX(devvp)) != 0) { 1552 if (lbn >= NDADDR) 1553 brelse(ibp); 1554 vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL); 1555 goto retry; 1556 } 1557 snapshot_locked = 1; 1558 if (size == fs->fs_bsize) { 1559 #ifdef DEBUG 1560 if (snapdebug) 1561 printf("%s %d lbn %" PRId64 " from inum %d\n", 1562 "Grabonremove: snapino", ip->i_number, 1563 lbn, inum); 1564 #endif 1565 if (lbn < NDADDR) { 1566 db_assign(ip, lbn, bno); 1567 } else { 1568 idb_assign(ip, ibp->b_data, indiroff, bno); 1569 bwrite(ibp); 1570 } 1571 DIP_ADD(ip, blocks, btodb(size)); 1572 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1573 VOP_UNLOCK(vp, 0); 1574 return (1); 1575 } 1576 if (lbn >= NDADDR) 1577 brelse(ibp); 1578 #ifdef DEBUG 1579 if (snapdebug) 1580 printf("%s%d lbn %" PRId64 " %s %d size %ld\n", 1581 "Copyonremove: snapino ", ip->i_number, 1582 lbn, "for inum", inum, size); 1583 #endif 1584 /* 1585 * If we have already read the old block contents, then 1586 * simply copy them to the new block. Note that we need 1587 * to synchronously write snapshots that have not been 1588 * unlinked, and hence will be visible after a crash, 1589 * to ensure their integrity. 1590 */ 1591 if (saved_data) { 1592 error = writevnblk(vp, saved_data, lbn); 1593 if (error) 1594 break; 1595 continue; 1596 } 1597 /* 1598 * Otherwise, read the old block contents into the buffer. 1599 */ 1600 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1601 saved_vp = vp; 1602 if ((error = readfsblk(vp, saved_data, lbn)) != 0) { 1603 free(saved_data, M_UFSMNT); 1604 saved_data = NULL; 1605 break; 1606 } 1607 } 1608 /* 1609 * Note that we need to synchronously write snapshots that 1610 * have not been unlinked, and hence will be visible after 1611 * a crash, to ensure their integrity. 1612 */ 1613 if (saved_data) { 1614 error = writevnblk(saved_vp, saved_data, lbn); 1615 free(saved_data, M_UFSMNT); 1616 } 1617 /* 1618 * If we have been unable to allocate a block in which to do 1619 * the copy, then return non-zero so that the fragment will 1620 * not be freed. Although space will be lost, the snapshot 1621 * will stay consistent. 1622 */ 1623 if (snapshot_locked) 1624 VOP_UNLOCK(vp, 0); 1625 else 1626 VI_UNLOCK(devvp); 1627 return (error); 1628 } 1629 1630 /* 1631 * Associate snapshot files when mounting. 1632 */ 1633 void 1634 ffs_snapshot_mount(mp) 1635 struct mount *mp; 1636 { 1637 struct ufsmount *ump = VFSTOUFS(mp); 1638 struct vnode *devvp = ump->um_devvp; 1639 struct fs *fs = ump->um_fs; 1640 struct proc *p = curproc; 1641 struct vnode *vp; 1642 struct inode *ip, *xp; 1643 ufs2_daddr_t snaplistsize, *snapblklist; 1644 int error, ns, snaploc, loc; 1645 1646 ns = UFS_FSNEEDSWAP(fs); 1647 /* 1648 * XXX The following needs to be set before VOP_TRUNCATE or 1649 * VOP_READ can be called. 1650 */ 1651 mp->mnt_stat.f_iosize = fs->fs_bsize; 1652 /* 1653 * Process each snapshot listed in the superblock. 1654 */ 1655 vp = NULL; 1656 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { 1657 if (fs->fs_snapinum[snaploc] == 0) 1658 break; 1659 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], 1660 &vp)) != 0) { 1661 printf("ffs_snapshot_mount: vget failed %d\n", error); 1662 continue; 1663 } 1664 ip = VTOI(vp); 1665 if ((ip->i_flags & SF_SNAPSHOT) == 0) { 1666 printf("ffs_snapshot_mount: non-snapshot inode %d\n", 1667 fs->fs_snapinum[snaploc]); 1668 vput(vp); 1669 vp = NULL; 1670 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { 1671 if (fs->fs_snapinum[loc] == 0) 1672 break; 1673 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; 1674 } 1675 fs->fs_snapinum[loc - 1] = 0; 1676 snaploc--; 1677 continue; 1678 } 1679 /* 1680 * If there already exist snapshots on this filesystem, grab a 1681 * reference to their shared lock. If this is the first snapshot 1682 * on this filesystem, we need to allocate a lock for the 1683 * snapshots to share. In either case, acquire the snapshot 1684 * lock and give up our original private lock. 1685 */ 1686 VI_LOCK(devvp); 1687 if ((xp = TAILQ_FIRST(&ump->um_snapshots)) != NULL) { 1688 struct lock *lkp; 1689 1690 lkp = ITOV(xp)->v_vnlock; 1691 VI_UNLOCK(devvp); 1692 VI_LOCK(vp); 1693 vp->v_vnlock = lkp; 1694 } else { 1695 struct lock *lkp; 1696 1697 VI_UNLOCK(devvp); 1698 MALLOC(lkp, struct lock *, sizeof(struct lock), 1699 M_UFSMNT, M_WAITOK); 1700 lockinit(lkp, PVFS, "snaplk", 0, LK_CANRECURSE); 1701 VI_LOCK(vp); 1702 vp->v_vnlock = lkp; 1703 } 1704 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY); 1705 transferlockers(&vp->v_lock, vp->v_vnlock); 1706 lockmgr(&vp->v_lock, LK_RELEASE, NULL); 1707 /* 1708 * Link it onto the active snapshot list. 1709 */ 1710 VI_LOCK(devvp); 1711 if (ip->i_nextsnap.tqe_prev != 0) 1712 panic("ffs_snapshot_mount: %d already on list", 1713 ip->i_number); 1714 else 1715 TAILQ_INSERT_TAIL(&ump->um_snapshots, ip, i_nextsnap); 1716 vp->v_flag |= VSYSTEM; 1717 VI_UNLOCK(devvp); 1718 VOP_UNLOCK(vp, 0); 1719 } 1720 /* 1721 * No usable snapshots found. 1722 */ 1723 if (vp == NULL) 1724 return; 1725 /* 1726 * Allocate the space for the block hints list. We always want to 1727 * use the list from the newest snapshot. 1728 */ 1729 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1730 error = vn_rdwr(UIO_READ, vp, 1731 (caddr_t)&snaplistsize, sizeof(snaplistsize), 1732 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1733 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, p); 1734 if (error) { 1735 printf("ffs_snapshot_mount: read_1 failed %d\n", error); 1736 VOP_UNLOCK(vp, 0); 1737 return; 1738 } 1739 snaplistsize = ufs_rw64(snaplistsize, ns); 1740 MALLOC(snapblklist, ufs2_daddr_t *, snaplistsize * sizeof(ufs2_daddr_t), 1741 M_UFSMNT, M_WAITOK); 1742 error = vn_rdwr(UIO_READ, vp, 1743 (caddr_t)snapblklist, snaplistsize * sizeof(ufs2_daddr_t), 1744 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1745 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, p); 1746 if (error) { 1747 printf("ffs_snapshot_mount: read_2 failed %d\n", error); 1748 VOP_UNLOCK(vp, 0); 1749 FREE(snapblklist, M_UFSMNT); 1750 return; 1751 } 1752 VOP_UNLOCK(vp, 0); 1753 VI_LOCK(devvp); 1754 ump->um_snaplistsize = snaplistsize; 1755 ump->um_snapblklist = snapblklist; 1756 VI_UNLOCK(devvp); 1757 vn_cow_establish(devvp, ffs_copyonwrite, devvp); 1758 } 1759 1760 /* 1761 * Disassociate snapshot files when unmounting. 1762 */ 1763 void 1764 ffs_snapshot_unmount(mp) 1765 struct mount *mp; 1766 { 1767 struct ufsmount *ump = VFSTOUFS(mp); 1768 struct vnode *devvp = ump->um_devvp; 1769 struct lock *lkp = NULL; 1770 struct inode *xp; 1771 struct vnode *vp; 1772 1773 VI_LOCK(devvp); 1774 while ((xp = TAILQ_FIRST(&ump->um_snapshots)) != 0) { 1775 vp = ITOV(xp); 1776 lkp = vp->v_vnlock; 1777 vp->v_vnlock = &vp->v_lock; 1778 TAILQ_REMOVE(&ump->um_snapshots, xp, i_nextsnap); 1779 xp->i_nextsnap.tqe_prev = 0; 1780 if (xp->i_ffs_effnlink > 0) { 1781 VI_UNLOCK(devvp); 1782 vrele(vp); 1783 VI_LOCK(devvp); 1784 } 1785 } 1786 if (ump->um_snapblklist != NULL) { 1787 FREE(ump->um_snapblklist, M_UFSMNT); 1788 ump->um_snapblklist = NULL; 1789 ump->um_snaplistsize = 0; 1790 } 1791 VI_UNLOCK(devvp); 1792 if (lkp != NULL) { 1793 vn_cow_disestablish(devvp, ffs_copyonwrite, devvp); 1794 FREE(lkp, M_UFSMNT); 1795 } 1796 } 1797 1798 /* 1799 * Check for need to copy block that is about to be written, 1800 * copying the block if necessary. 1801 */ 1802 static int 1803 ffs_copyonwrite(v, bp) 1804 void *v; 1805 struct buf *bp; 1806 { 1807 struct buf *ibp; 1808 struct fs *fs; 1809 struct inode *ip; 1810 struct vnode *devvp = v, *vp = 0, *saved_vp = NULL; 1811 struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint); 1812 caddr_t saved_data = NULL; 1813 ufs2_daddr_t lbn, blkno, *snapblklist; 1814 int lower, upper, mid, s, ns, indiroff, snapshot_locked = 0, error = 0; 1815 1816 /* 1817 * Check for valid snapshots. 1818 */ 1819 VI_LOCK(devvp); 1820 ip = TAILQ_FIRST(&ump->um_snapshots); 1821 if (ip == NULL) { 1822 VI_UNLOCK(devvp); 1823 return 0; 1824 } 1825 /* 1826 * First check to see if it is in the preallocated list. 1827 * By doing this check we avoid several potential deadlocks. 1828 */ 1829 fs = ip->i_fs; 1830 ns = UFS_FSNEEDSWAP(fs); 1831 lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); 1832 snapblklist = ump->um_snapblklist; 1833 upper = ump->um_snaplistsize - 1; 1834 lower = 1; 1835 while (lower <= upper) { 1836 mid = (lower + upper) / 2; 1837 if (ufs_rw64(snapblklist[mid], ns) == lbn) 1838 break; 1839 if (ufs_rw64(snapblklist[mid], ns) < lbn) 1840 lower = mid + 1; 1841 else 1842 upper = mid - 1; 1843 } 1844 if (lower <= upper) { 1845 VI_UNLOCK(devvp); 1846 return 0; 1847 } 1848 /* 1849 * Not in the precomputed list, so check the snapshots. 1850 */ 1851 retry: 1852 TAILQ_FOREACH(ip, &ump->um_snapshots, i_nextsnap) { 1853 vp = ITOV(ip); 1854 /* 1855 * We ensure that everything of our own that needs to be 1856 * copied will be done at the time that ffs_snapshot is 1857 * called. Thus we can skip the check here which can 1858 * deadlock in doing the lookup in VOP_BALLOC. 1859 */ 1860 if (bp->b_vp == vp) 1861 continue; 1862 /* 1863 * Check to see if block needs to be copied. We do not have 1864 * to hold the snapshot lock while doing this lookup as it 1865 * will never require any additional allocations for the 1866 * snapshot inode. 1867 */ 1868 if (lbn < NDADDR) { 1869 blkno = db_get(ip, lbn); 1870 } else { 1871 if (snapshot_locked == 0 && 1872 lockmgr(vp->v_vnlock, 1873 LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1874 VI_MTX(devvp)) != 0) { 1875 VI_LOCK(devvp); 1876 goto retry; 1877 } 1878 snapshot_locked = 1; 1879 s = cow_enter(); 1880 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1881 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1882 cow_leave(s); 1883 if (error) 1884 break; 1885 indiroff = (lbn - NDADDR) % NINDIR(fs); 1886 blkno = idb_get(ip, ibp->b_data, indiroff); 1887 brelse(ibp); 1888 } 1889 #ifdef DIAGNOSTIC 1890 if (blkno == BLK_SNAP && bp->b_lblkno >= 0) 1891 panic("ffs_copyonwrite: bad copy block"); 1892 #endif 1893 if (blkno != 0) 1894 continue; 1895 #ifdef DIAGNOSTIC 1896 if (curlwp->l_flag & L_COWINPROGRESS) 1897 printf("ffs_copyonwrite: recursive call\n"); 1898 #endif 1899 /* 1900 * Allocate the block into which to do the copy. Since 1901 * multiple processes may all try to copy the same block, 1902 * we have to recheck our need to do a copy if we sleep 1903 * waiting for the lock. 1904 * 1905 * Because all snapshots on a filesystem share a single 1906 * lock, we ensure that we will never be in competition 1907 * with another process to allocate a block. 1908 */ 1909 if (snapshot_locked == 0 && 1910 lockmgr(vp->v_vnlock, 1911 LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1912 VI_MTX(devvp)) != 0) { 1913 VI_LOCK(devvp); 1914 goto retry; 1915 } 1916 snapshot_locked = 1; 1917 #ifdef DEBUG 1918 if (snapdebug) { 1919 printf("Copyonwrite: snapino %d lbn %" PRId64 " for ", 1920 ip->i_number, lbn); 1921 if (bp->b_vp == devvp) 1922 printf("fs metadata"); 1923 else 1924 printf("inum %d", VTOI(bp->b_vp)->i_number); 1925 printf(" lblkno %" PRId64 "\n", bp->b_lblkno); 1926 } 1927 #endif 1928 /* 1929 * If we have already read the old block contents, then 1930 * simply copy them to the new block. Note that we need 1931 * to synchronously write snapshots that have not been 1932 * unlinked, and hence will be visible after a crash, 1933 * to ensure their integrity. 1934 */ 1935 if (saved_data) { 1936 error = writevnblk(vp, saved_data, lbn); 1937 if (error) 1938 break; 1939 continue; 1940 } 1941 /* 1942 * Otherwise, read the old block contents into the buffer. 1943 */ 1944 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1945 saved_vp = vp; 1946 if ((error = readfsblk(vp, saved_data, lbn)) != 0) { 1947 free(saved_data, M_UFSMNT); 1948 saved_data = NULL; 1949 break; 1950 } 1951 } 1952 /* 1953 * Note that we need to synchronously write snapshots that 1954 * have not been unlinked, and hence will be visible after 1955 * a crash, to ensure their integrity. 1956 */ 1957 if (saved_data) { 1958 error = writevnblk(saved_vp, saved_data, lbn); 1959 free(saved_data, M_UFSMNT); 1960 } 1961 if (snapshot_locked) 1962 VOP_UNLOCK(vp, 0); 1963 else 1964 VI_UNLOCK(devvp); 1965 return error; 1966 } 1967 1968 /* 1969 * Read the specified block from disk. Vp is usually a snapshot vnode. 1970 */ 1971 static int 1972 readfsblk(vp, data, lbn) 1973 struct vnode *vp; 1974 caddr_t data; 1975 ufs2_daddr_t lbn; 1976 { 1977 int s, error; 1978 struct inode *ip = VTOI(vp); 1979 struct fs *fs = ip->i_fs; 1980 struct buf *nbp; 1981 1982 s = splbio(); 1983 nbp = pool_get(&bufpool, PR_WAITOK); 1984 splx(s); 1985 1986 BUF_INIT(nbp); 1987 nbp->b_flags = B_READ; 1988 nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize; 1989 nbp->b_error = 0; 1990 nbp->b_data = data; 1991 nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn)); 1992 nbp->b_proc = NULL; 1993 nbp->b_dev = ip->i_devvp->v_rdev; 1994 nbp->b_vp = NULLVP; 1995 1996 DEV_STRATEGY(nbp); 1997 1998 error = biowait(nbp); 1999 2000 s = splbio(); 2001 pool_put(&bufpool, nbp); 2002 splx(s); 2003 2004 return error; 2005 } 2006 2007 /* 2008 * Read the specified block. Bypass UBC to prevent deadlocks. 2009 */ 2010 static int 2011 readvnblk(vp, data, lbn) 2012 struct vnode *vp; 2013 caddr_t data; 2014 ufs2_daddr_t lbn; 2015 { 2016 int error; 2017 daddr_t bn; 2018 off_t offset; 2019 struct inode *ip = VTOI(vp); 2020 struct fs *fs = ip->i_fs; 2021 2022 error = VOP_BMAP(vp, lbn, NULL, &bn, NULL); 2023 if (error) 2024 return error; 2025 2026 if (bn != (daddr_t)-1) { 2027 offset = dbtob(bn); 2028 simple_lock(&vp->v_interlock); 2029 error = VOP_PUTPAGES(vp, trunc_page(offset), 2030 round_page(offset+fs->fs_bsize), 2031 PGO_CLEANIT|PGO_SYNCIO|PGO_FREE); 2032 if (error) 2033 return error; 2034 2035 return readfsblk(vp, data, fragstoblks(fs, dbtofsb(fs, bn))); 2036 } 2037 2038 bzero(data, fs->fs_bsize); 2039 2040 return 0; 2041 } 2042 2043 /* 2044 * Write the specified block. Bypass UBC to prevent deadlocks. 2045 */ 2046 static int 2047 writevnblk(vp, data, lbn) 2048 struct vnode *vp; 2049 caddr_t data; 2050 ufs2_daddr_t lbn; 2051 { 2052 int s, error; 2053 off_t offset; 2054 struct buf *bp; 2055 struct inode *ip = VTOI(vp); 2056 struct fs *fs = ip->i_fs; 2057 2058 offset = lblktosize(fs, (off_t)lbn); 2059 s = cow_enter(); 2060 simple_lock(&vp->v_interlock); 2061 error = VOP_PUTPAGES(vp, trunc_page(offset), 2062 round_page(offset+fs->fs_bsize), PGO_CLEANIT|PGO_SYNCIO|PGO_FREE); 2063 if (error == 0) 2064 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 2065 fs->fs_bsize, KERNCRED, B_SYNC, &bp); 2066 cow_leave(s); 2067 if (error) 2068 return error; 2069 2070 bcopy(data, bp->b_data, fs->fs_bsize); 2071 bp->b_flags |= B_NOCACHE; 2072 2073 return bwrite(bp); 2074 } 2075 2076 /* 2077 * Set/reset lwp's L_COWINPROGRESS flag. 2078 * May be called recursive. 2079 */ 2080 static inline int 2081 cow_enter(void) 2082 { 2083 struct lwp *l = curlwp; 2084 2085 if (l->l_flag & L_COWINPROGRESS) { 2086 return 0; 2087 } else { 2088 l->l_flag |= L_COWINPROGRESS; 2089 return L_COWINPROGRESS; 2090 } 2091 } 2092 2093 static inline void 2094 cow_leave(int flag) 2095 { 2096 struct lwp *l = curlwp; 2097 2098 l->l_flag &= ~flag; 2099 } 2100 2101 /* 2102 * Get/Put direct block from inode or buffer containing disk addresses. Take 2103 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go 2104 * into a global include. 2105 */ 2106 static inline ufs2_daddr_t 2107 db_get(struct inode *ip, int loc) 2108 { 2109 if (ip->i_ump->um_fstype == UFS1) 2110 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip)); 2111 else 2112 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip)); 2113 } 2114 2115 static inline void 2116 db_assign(struct inode *ip, int loc, ufs2_daddr_t val) 2117 { 2118 if (ip->i_ump->um_fstype == UFS1) 2119 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2120 else 2121 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2122 } 2123 2124 static inline ufs2_daddr_t 2125 idb_get(struct inode *ip, caddr_t buf, int loc) 2126 { 2127 if (ip->i_ump->um_fstype == UFS1) 2128 return ufs_rw32(((ufs1_daddr_t *)(buf))[loc], 2129 UFS_IPNEEDSWAP(ip)); 2130 else 2131 return ufs_rw64(((ufs2_daddr_t *)(buf))[loc], 2132 UFS_IPNEEDSWAP(ip)); 2133 } 2134 2135 static inline void 2136 idb_assign(struct inode *ip, caddr_t buf, int loc, ufs2_daddr_t val) 2137 { 2138 if (ip->i_ump->um_fstype == UFS1) 2139 ((ufs1_daddr_t *)(buf))[loc] = 2140 ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2141 else 2142 ((ufs2_daddr_t *)(buf))[loc] = 2143 ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2144 } 2145