xref: /netbsd-src/sys/dev/fss.c (revision e5548b402ae4c44fb816de42c7bba9581ce23ef5)
1 /*	$NetBSD: fss.c,v 1.18 2005/12/11 12:20:53 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 2003 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Juergen Hannken-Illjes.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the NetBSD
21  *	Foundation, Inc. and its contributors.
22  * 4. Neither the name of The NetBSD Foundation nor the names of its
23  *    contributors may be used to endorse or promote products derived
24  *    from this software without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  * POSSIBILITY OF SUCH DAMAGE.
37  */
38 
39 /*
40  * File system snapshot disk driver.
41  *
42  * Block/character interface to the snapshot of a mounted file system.
43  */
44 
45 #include <sys/cdefs.h>
46 __KERNEL_RCSID(0, "$NetBSD: fss.c,v 1.18 2005/12/11 12:20:53 christos Exp $");
47 
48 #include "fss.h"
49 
50 #include <sys/param.h>
51 #include <sys/systm.h>
52 #include <sys/namei.h>
53 #include <sys/proc.h>
54 #include <sys/errno.h>
55 #include <sys/buf.h>
56 #include <sys/malloc.h>
57 #include <sys/ioctl.h>
58 #include <sys/disklabel.h>
59 #include <sys/device.h>
60 #include <sys/disk.h>
61 #include <sys/stat.h>
62 #include <sys/mount.h>
63 #include <sys/vnode.h>
64 #include <sys/file.h>
65 #include <sys/uio.h>
66 #include <sys/conf.h>
67 #include <sys/kthread.h>
68 
69 #include <miscfs/specfs/specdev.h>
70 
71 #include <dev/fssvar.h>
72 
73 #include <machine/stdarg.h>
74 
#ifdef DEBUG
#define FSS_STATISTICS
#endif

/*
 * Optional per-unit statistics, only compiled in when FSS_STATISTICS
 * is defined (implied by DEBUG).  The counters are indexed by the
 * softc's sc_unit.  Without FSS_STATISTICS the update macros expand
 * to nothing and their arguments are not evaluated.
 */
#ifdef FSS_STATISTICS
struct fss_stat {
	u_int64_t	cow_calls;	/* copy-on-write hook invocations */
	u_int64_t	cow_copied;	/* cluster reads started */
	u_int64_t	cow_cache_full;	/* waits for a free cache slot */
	u_int64_t	indir_read;	/* indirect clusters read */
	u_int64_t	indir_write;	/* indirect clusters written */
};

static struct fss_stat fss_stat[NFSS];

/*
 * The macro arguments are parenthesized so that any expression
 * yielding a softc pointer (or any value expression) may be passed.
 */
#define FSS_STAT_INC(sc, field)	\
			do { \
				fss_stat[(sc)->sc_unit].field++; \
			} while (0)
#define FSS_STAT_SET(sc, field, value) \
			do { \
				fss_stat[(sc)->sc_unit].field = (value); \
			} while (0)
#define FSS_STAT_ADD(sc, field, value) \
			do { \
				fss_stat[(sc)->sc_unit].field += (value); \
			} while (0)
#define FSS_STAT_VAL(sc, field) (fss_stat[(sc)->sc_unit].field)
#define FSS_STAT_CLEAR(sc) \
			do { \
				memset(&fss_stat[(sc)->sc_unit], 0, \
				    sizeof(struct fss_stat)); \
			} while (0)
#else /* FSS_STATISTICS */
#define FSS_STAT_INC(sc, field)
#define FSS_STAT_SET(sc, field, value)
#define FSS_STAT_ADD(sc, field, value)
#define FSS_STAT_CLEAR(sc)
#endif /* FSS_STATISTICS */
114 
115 static struct fss_softc fss_softc[NFSS];
116 
117 void fssattach(int);
118 
119 dev_type_open(fss_open);
120 dev_type_close(fss_close);
121 dev_type_read(fss_read);
122 dev_type_write(fss_write);
123 dev_type_ioctl(fss_ioctl);
124 dev_type_strategy(fss_strategy);
125 dev_type_dump(fss_dump);
126 dev_type_size(fss_size);
127 
128 static int fss_copy_on_write(void *, struct buf *);
129 static inline void fss_error(struct fss_softc *, const char *, ...);
130 static int fss_create_files(struct fss_softc *, struct fss_set *,
131     off_t *, struct lwp *);
132 static int fss_create_snapshot(struct fss_softc *, struct fss_set *,
133     struct lwp *);
134 static int fss_delete_snapshot(struct fss_softc *, struct lwp *);
135 static int fss_softc_alloc(struct fss_softc *);
136 static void fss_softc_free(struct fss_softc *);
137 static void fss_cluster_iodone(struct buf *);
138 static void fss_read_cluster(struct fss_softc *, u_int32_t);
139 static void fss_bs_thread(void *);
140 static int fss_bs_io(struct fss_softc *, fss_io_type,
141     u_int32_t, off_t, int, caddr_t);
142 static u_int32_t *fss_bs_indir(struct fss_softc *, u_int32_t);
143 
144 const struct bdevsw fss_bdevsw = {
145 	fss_open, fss_close, fss_strategy, fss_ioctl,
146 	fss_dump, fss_size, D_DISK
147 };
148 
149 const struct cdevsw fss_cdevsw = {
150 	fss_open, fss_close, fss_read, fss_write, fss_ioctl,
151 	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
152 };
153 
154 void
155 fssattach(int num)
156 {
157 	int i;
158 	struct fss_softc *sc;
159 
160 	for (i = 0; i < NFSS; i++) {
161 		sc = &fss_softc[i];
162 		sc->sc_unit = i;
163 		sc->sc_bdev = NODEV;
164 		simple_lock_init(&sc->sc_slock);
165 		bufq_alloc(&sc->sc_bufq, "fcfs", 0);
166 	}
167 }
168 
169 int
170 fss_open(dev_t dev, int flags, int mode, struct lwp *l)
171 {
172 	int s, mflag;
173 	struct fss_softc *sc;
174 
175 	mflag = (mode == S_IFCHR ? FSS_CDEV_OPEN : FSS_BDEV_OPEN);
176 
177 	if ((sc = FSS_DEV_TO_SOFTC(dev)) == NULL)
178 		return ENODEV;
179 
180 	FSS_LOCK(sc, s);
181 
182 	sc->sc_flags |= mflag;
183 
184 	FSS_UNLOCK(sc, s);
185 
186 	return 0;
187 }
188 
189 int
190 fss_close(dev_t dev, int flags, int mode, struct lwp *l)
191 {
192 	int s, mflag, error;
193 	struct fss_softc *sc;
194 
195 	mflag = (mode == S_IFCHR ? FSS_CDEV_OPEN : FSS_BDEV_OPEN);
196 
197 	if ((sc = FSS_DEV_TO_SOFTC(dev)) == NULL)
198 		return ENODEV;
199 
200 	FSS_LOCK(sc, s);
201 
202 	if ((sc->sc_flags & (FSS_CDEV_OPEN|FSS_BDEV_OPEN)) == mflag) {
203 		if ((sc->sc_uflags & FSS_UNCONFIG_ON_CLOSE) != 0 &&
204 		    (sc->sc_flags & FSS_ACTIVE) != 0) {
205 			FSS_UNLOCK(sc, s);
206 			error = fss_ioctl(dev, FSSIOCCLR, NULL, FWRITE, l);
207 			if (error)
208 				return error;
209 			FSS_LOCK(sc, s);
210 		}
211 		sc->sc_uflags &= ~FSS_UNCONFIG_ON_CLOSE;
212 	}
213 
214 	sc->sc_flags &= ~mflag;
215 
216 	FSS_UNLOCK(sc, s);
217 
218 	return 0;
219 }
220 
221 void
222 fss_strategy(struct buf *bp)
223 {
224 	int s;
225 	struct fss_softc *sc;
226 
227 	sc = FSS_DEV_TO_SOFTC(bp->b_dev);
228 
229 	FSS_LOCK(sc, s);
230 
231 	if ((bp->b_flags & B_READ) != B_READ ||
232 	    sc == NULL || !FSS_ISVALID(sc)) {
233 
234 		FSS_UNLOCK(sc, s);
235 
236 		bp->b_error = (sc == NULL ? ENODEV : EROFS);
237 		bp->b_flags |= B_ERROR;
238 		bp->b_resid = bp->b_bcount;
239 		biodone(bp);
240 		return;
241 	}
242 
243 	bp->b_rawblkno = bp->b_blkno;
244 	BUFQ_PUT(sc->sc_bufq, bp);
245 	wakeup(&sc->sc_bs_proc);
246 
247 	FSS_UNLOCK(sc, s);
248 }
249 
250 int
251 fss_read(dev_t dev, struct uio *uio, int flags)
252 {
253 	return physio(fss_strategy, NULL, dev, B_READ, minphys, uio);
254 }
255 
256 int
257 fss_write(dev_t dev, struct uio *uio, int flags)
258 {
259 	return physio(fss_strategy, NULL, dev, B_WRITE, minphys, uio);
260 }
261 
262 int
263 fss_ioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct lwp *l)
264 {
265 	int s, error;
266 	struct fss_softc *sc;
267 	struct fss_set *fss = (struct fss_set *)data;
268 	struct fss_get *fsg = (struct fss_get *)data;
269 
270 	if ((sc = FSS_DEV_TO_SOFTC(dev)) == NULL)
271 		return ENODEV;
272 
273 	FSS_LOCK(sc, s);
274 	while ((sc->sc_flags & FSS_EXCL) == FSS_EXCL) {
275 		error = ltsleep(sc, PRIBIO|PCATCH, "fsslock", 0, &sc->sc_slock);
276 		if (error) {
277 			FSS_UNLOCK(sc, s);
278 			return error;
279 		}
280 	}
281 	sc->sc_flags |= FSS_EXCL;
282 	FSS_UNLOCK(sc, s);
283 
284 	switch (cmd) {
285 	case FSSIOCSET:
286 		if ((flag & FWRITE) == 0)
287 			error = EPERM;
288 		else if ((sc->sc_flags & FSS_ACTIVE) != 0)
289 			error = EBUSY;
290 		else
291 			error = fss_create_snapshot(sc, fss, l);
292 		break;
293 
294 	case FSSIOCCLR:
295 		if ((flag & FWRITE) == 0)
296 			error = EPERM;
297 		else if ((sc->sc_flags & FSS_ACTIVE) == 0)
298 			error = ENXIO;
299 		else
300 			error = fss_delete_snapshot(sc, l);
301 		break;
302 
303 	case FSSIOCGET:
304 		switch (sc->sc_flags & (FSS_PERSISTENT | FSS_ACTIVE)) {
305 		case FSS_ACTIVE:
306 			memcpy(fsg->fsg_mount, sc->sc_mntname, MNAMELEN);
307 			fsg->fsg_csize = FSS_CLSIZE(sc);
308 			fsg->fsg_time = sc->sc_time;
309 			fsg->fsg_mount_size = sc->sc_clcount;
310 			fsg->fsg_bs_size = sc->sc_clnext;
311 			error = 0;
312 			break;
313 		case FSS_PERSISTENT | FSS_ACTIVE:
314 			memcpy(fsg->fsg_mount, sc->sc_mntname, MNAMELEN);
315 			fsg->fsg_csize = 0;
316 			fsg->fsg_time = sc->sc_time;
317 			fsg->fsg_mount_size = 0;
318 			fsg->fsg_bs_size = 0;
319 			error = 0;
320 			break;
321 		default:
322 			error = ENXIO;
323 			break;
324 		}
325 		break;
326 
327 	case FSSIOFSET:
328 		sc->sc_uflags = *(int *)data;
329 		error = 0;
330 		break;
331 
332 	case FSSIOFGET:
333 		*(int *)data = sc->sc_uflags;
334 		error = 0;
335 		break;
336 
337 	default:
338 		error = EINVAL;
339 		break;
340 	}
341 
342 	FSS_LOCK(sc, s);
343 	sc->sc_flags &= ~FSS_EXCL;
344 	FSS_UNLOCK(sc, s);
345 	wakeup(sc);
346 
347 	return error;
348 }
349 
/*
 * Size entry point of the block device switch.
 * Always returns -1, whatever device is passed.
 */
int
fss_size(dev_t dev)
{
	const int nosize = -1;

	return nosize;
}
355 
356 int
357 fss_dump(dev_t dev, daddr_t blkno, caddr_t va, size_t size)
358 {
359 	return EROFS;
360 }
361 
362 /*
363  * An error occurred reading or writing the snapshot or backing store.
364  * If it is the first error log to console.
365  * The caller holds the simplelock.
366  */
367 static inline void
368 fss_error(struct fss_softc *sc, const char *fmt, ...)
369 {
370 	va_list ap;
371 
372 	if ((sc->sc_flags & (FSS_ACTIVE|FSS_ERROR)) == FSS_ACTIVE) {
373 		va_start(ap, fmt);
374 		printf("fss%d: snapshot invalid: ", sc->sc_unit);
375 		vprintf(fmt, ap);
376 		printf("\n");
377 		va_end(ap);
378 	}
379 	if ((sc->sc_flags & FSS_ACTIVE) == FSS_ACTIVE)
380 		sc->sc_flags |= FSS_ERROR;
381 }
382 
383 /*
384  * Allocate the variable sized parts of the softc and
385  * fork the kernel thread.
386  *
387  * The fields sc_clcount, sc_clshift, sc_cache_size and sc_indir_size
388  * must be initialized.
389  */
390 static int
391 fss_softc_alloc(struct fss_softc *sc)
392 {
393 	int i, len, error;
394 
395 	len = (sc->sc_clcount+NBBY-1)/NBBY;
396 	sc->sc_copied = malloc(len, M_TEMP, M_ZERO|M_WAITOK|M_CANFAIL);
397 	if (sc->sc_copied == NULL)
398 		return(ENOMEM);
399 
400 	len = sc->sc_cache_size*sizeof(struct fss_cache);
401 	sc->sc_cache = malloc(len, M_TEMP, M_ZERO|M_WAITOK|M_CANFAIL);
402 	if (sc->sc_cache == NULL)
403 		return(ENOMEM);
404 
405 	len = FSS_CLSIZE(sc);
406 	for (i = 0; i < sc->sc_cache_size; i++) {
407 		sc->sc_cache[i].fc_type = FSS_CACHE_FREE;
408 		sc->sc_cache[i].fc_softc = sc;
409 		sc->sc_cache[i].fc_xfercount = 0;
410 		sc->sc_cache[i].fc_data = malloc(len, M_TEMP,
411 		    M_WAITOK|M_CANFAIL);
412 		if (sc->sc_cache[i].fc_data == NULL)
413 			return(ENOMEM);
414 	}
415 
416 	len = (sc->sc_indir_size+NBBY-1)/NBBY;
417 	sc->sc_indir_valid = malloc(len, M_TEMP, M_ZERO|M_WAITOK|M_CANFAIL);
418 	if (sc->sc_indir_valid == NULL)
419 		return(ENOMEM);
420 
421 	len = FSS_CLSIZE(sc);
422 	sc->sc_indir_data = malloc(len, M_TEMP, M_ZERO|M_WAITOK|M_CANFAIL);
423 	if (sc->sc_indir_data == NULL)
424 		return(ENOMEM);
425 
426 	if ((error = kthread_create1(fss_bs_thread, sc, &sc->sc_bs_proc,
427 	    "fssbs%d", sc->sc_unit)) != 0)
428 		return error;
429 
430 	sc->sc_flags |= FSS_BS_THREAD;
431 	return 0;
432 }
433 
434 /*
435  * Free the variable sized parts of the softc.
436  */
437 static void
438 fss_softc_free(struct fss_softc *sc)
439 {
440 	int s, i;
441 
442 	if ((sc->sc_flags & FSS_BS_THREAD) != 0) {
443 		FSS_LOCK(sc, s);
444 		sc->sc_flags &= ~FSS_BS_THREAD;
445 		wakeup(&sc->sc_bs_proc);
446 		while (sc->sc_bs_proc != NULL)
447 			ltsleep(&sc->sc_bs_proc, PRIBIO, "fssthread", 0,
448 			    &sc->sc_slock);
449 		FSS_UNLOCK(sc, s);
450 	}
451 
452 	if (sc->sc_copied != NULL)
453 		free(sc->sc_copied, M_TEMP);
454 	sc->sc_copied = NULL;
455 
456 	if (sc->sc_cache != NULL) {
457 		for (i = 0; i < sc->sc_cache_size; i++)
458 			if (sc->sc_cache[i].fc_data != NULL)
459 				free(sc->sc_cache[i].fc_data, M_TEMP);
460 		free(sc->sc_cache, M_TEMP);
461 	}
462 	sc->sc_cache = NULL;
463 
464 	if (sc->sc_indir_valid != NULL)
465 		free(sc->sc_indir_valid, M_TEMP);
466 	sc->sc_indir_valid = NULL;
467 
468 	if (sc->sc_indir_data != NULL)
469 		free(sc->sc_indir_data, M_TEMP);
470 	sc->sc_indir_data = NULL;
471 }
472 
473 /*
474  * Check if an unmount is ok. If forced, set this snapshot into ERROR state.
475  */
476 int
477 fss_umount_hook(struct mount *mp, int forced)
478 {
479 	int i, s;
480 
481 	for (i = 0; i < NFSS; i++) {
482 		FSS_LOCK(&fss_softc[i], s);
483 		if ((fss_softc[i].sc_flags & FSS_ACTIVE) != 0 &&
484 		    fss_softc[i].sc_mount == mp) {
485 			if (forced)
486 				fss_error(&fss_softc[i], "forced unmount");
487 			else {
488 				FSS_UNLOCK(&fss_softc[i], s);
489 				return EBUSY;
490 			}
491 		}
492 		FSS_UNLOCK(&fss_softc[i], s);
493 	}
494 
495 	return 0;
496 }
497 
498 /*
499  * A buffer is written to the snapshotted block device. Copy to
500  * backing store if needed.
501  */
502 static int
503 fss_copy_on_write(void *v, struct buf *bp)
504 {
505 	int s;
506 	u_int32_t cl, ch, c;
507 	struct fss_softc *sc = v;
508 
509 	FSS_LOCK(sc, s);
510 	if (!FSS_ISVALID(sc)) {
511 		FSS_UNLOCK(sc, s);
512 		return 0;
513 	}
514 
515 	FSS_UNLOCK(sc, s);
516 
517 	FSS_STAT_INC(sc, cow_calls);
518 
519 	cl = FSS_BTOCL(sc, dbtob(bp->b_blkno));
520 	ch = FSS_BTOCL(sc, dbtob(bp->b_blkno)+bp->b_bcount-1);
521 
522 	for (c = cl; c <= ch; c++)
523 		fss_read_cluster(sc, c);
524 
525 	return 0;
526 }
527 
528 /*
529  * Lookup and open needed files.
530  *
531  * For file system internal snapshot initializes sc_mntname, sc_mount,
532  * sc_bs_vp and sc_time.
533  *
534  * Otherwise returns dev and size of the underlying block device.
535  * Initializes sc_mntname, sc_mount_vp, sc_bdev, sc_bs_vp and sc_mount
536  */
537 static int
538 fss_create_files(struct fss_softc *sc, struct fss_set *fss,
539     off_t *bsize, struct lwp *l)
540 {
541 	int error, bits, fsbsize;
542 	struct timespec ts;
543 	struct partinfo dpart;
544 	struct vattr va;
545 	struct nameidata nd;
546 
547 	/*
548 	 * Get the mounted file system.
549 	 */
550 
551 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, fss->fss_mount, l);
552 	if ((error = namei(&nd)) != 0)
553 		return error;
554 
555 	if ((nd.ni_vp->v_flag & VROOT) != VROOT) {
556 		vrele(nd.ni_vp);
557 		return EINVAL;
558 	}
559 
560 	sc->sc_mount = nd.ni_vp->v_mount;
561 	memcpy(sc->sc_mntname, sc->sc_mount->mnt_stat.f_mntonname, MNAMELEN);
562 
563 	vrele(nd.ni_vp);
564 
565 	/*
566 	 * Check for file system internal snapshot.
567 	 */
568 
569 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, fss->fss_bstore, l);
570 	if ((error = namei(&nd)) != 0)
571 		return error;
572 
573 	if (nd.ni_vp->v_type == VREG && nd.ni_vp->v_mount == sc->sc_mount) {
574 		vrele(nd.ni_vp);
575 		sc->sc_flags |= FSS_PERSISTENT;
576 
577 		NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, fss->fss_bstore, l);
578 		if ((error = vn_open(&nd, FREAD, 0)) != 0)
579 			return error;
580 		sc->sc_bs_vp = nd.ni_vp;
581 
582 		fsbsize = sc->sc_bs_vp->v_mount->mnt_stat.f_iosize;
583 		bits = sizeof(sc->sc_bs_bshift)*NBBY;
584 		for (sc->sc_bs_bshift = 1; sc->sc_bs_bshift < bits;
585 		    sc->sc_bs_bshift++)
586 			if (FSS_FSBSIZE(sc) == fsbsize)
587 				break;
588 		if (sc->sc_bs_bshift >= bits) {
589 			VOP_UNLOCK(sc->sc_bs_vp, 0);
590 			return EINVAL;
591 		}
592 
593 		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
594 		sc->sc_clshift = 0;
595 
596 		error = VFS_SNAPSHOT(sc->sc_mount, sc->sc_bs_vp, &ts);
597 		TIMESPEC_TO_TIMEVAL(&sc->sc_time, &ts);
598 
599 		VOP_UNLOCK(sc->sc_bs_vp, 0);
600 
601 		return error;
602 	}
603 	vrele(nd.ni_vp);
604 
605 	/*
606 	 * Get the block device it is mounted on.
607 	 */
608 
609 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE,
610 	    sc->sc_mount->mnt_stat.f_mntfromname, l);
611 	if ((error = namei(&nd)) != 0)
612 		return error;
613 
614 	if (nd.ni_vp->v_type != VBLK) {
615 		vrele(nd.ni_vp);
616 		return EINVAL;
617 	}
618 
619 	error = VOP_IOCTL(nd.ni_vp, DIOCGPART, &dpart, FREAD,
620 	    l->l_proc->p_ucred, l);
621 	if (error) {
622 		vrele(nd.ni_vp);
623 		return error;
624 	}
625 
626 	sc->sc_mount_vp = nd.ni_vp;
627 	sc->sc_bdev = nd.ni_vp->v_rdev;
628 	*bsize = (off_t)dpart.disklab->d_secsize*dpart.part->p_size;
629 	vrele(nd.ni_vp);
630 
631 	/*
632 	 * Get the backing store
633 	 */
634 
635 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, fss->fss_bstore, l);
636 	if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0)
637 		return error;
638 	VOP_UNLOCK(nd.ni_vp, 0);
639 
640 	sc->sc_bs_vp = nd.ni_vp;
641 
642 	if (nd.ni_vp->v_type != VREG && nd.ni_vp->v_type != VCHR)
643 		return EINVAL;
644 
645 	if (sc->sc_bs_vp->v_type == VREG) {
646 		error = VOP_GETATTR(sc->sc_bs_vp, &va, l->l_proc->p_ucred, l);
647 		if (error != 0)
648 			return error;
649 		sc->sc_bs_size = va.va_size;
650 		fsbsize = sc->sc_bs_vp->v_mount->mnt_stat.f_iosize;
651 		if (fsbsize & (fsbsize-1))	/* No power of two */
652 			return EINVAL;
653 		for (sc->sc_bs_bshift = 1; sc->sc_bs_bshift < 32;
654 		    sc->sc_bs_bshift++)
655 			if (FSS_FSBSIZE(sc) == fsbsize)
656 				break;
657 		if (sc->sc_bs_bshift >= 32)
658 			return EINVAL;
659 		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
660 	} else {
661 		sc->sc_bs_bshift = DEV_BSHIFT;
662 		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
663 	}
664 
665 	/*
666 	 * As all IO to from/to the backing store goes through
667 	 * VOP_STRATEGY() clean the buffer cache to prevent
668 	 * cache incoherencies.
669 	 */
670 	if ((error = vinvalbuf(sc->sc_bs_vp, V_SAVE, l->l_proc->p_ucred, l, 0, 0)) != 0)
671 		return error;
672 
673 	return 0;
674 }
675 
676 /*
677  * Create a snapshot.
678  */
679 static int
680 fss_create_snapshot(struct fss_softc *sc, struct fss_set *fss, struct lwp *l)
681 {
682 	int len, error;
683 	u_int32_t csize;
684 	off_t bsize;
685 
686 	/*
687 	 * Open needed files.
688 	 */
689 	if ((error = fss_create_files(sc, fss, &bsize, l)) != 0)
690 		goto bad;
691 
692 	if (sc->sc_flags & FSS_PERSISTENT) {
693 		fss_softc_alloc(sc);
694 		sc->sc_flags |= FSS_ACTIVE;
695 		return 0;
696 	}
697 
698 	/*
699 	 * Set cluster size. Must be a power of two and
700 	 * a multiple of backing store block size.
701 	 */
702 	if (fss->fss_csize <= 0)
703 		csize = MAXPHYS;
704 	else
705 		csize = fss->fss_csize;
706 	if (bsize/csize > FSS_CLUSTER_MAX)
707 		csize = bsize/FSS_CLUSTER_MAX+1;
708 
709 	for (sc->sc_clshift = sc->sc_bs_bshift; sc->sc_clshift < 32;
710 	    sc->sc_clshift++)
711 		if (FSS_CLSIZE(sc) >= csize)
712 			break;
713 	if (sc->sc_clshift >= 32) {
714 		error = EINVAL;
715 		goto bad;
716 	}
717 	sc->sc_clmask = FSS_CLSIZE(sc)-1;
718 
719 	/*
720 	 * Set number of cache slots.
721 	 */
722 	if (FSS_CLSIZE(sc) <= 8192)
723 		sc->sc_cache_size = 32;
724 	else if (FSS_CLSIZE(sc) <= 65536)
725 		sc->sc_cache_size = 8;
726 	else
727 		sc->sc_cache_size = 4;
728 
729 	/*
730 	 * Set number of clusters and size of last cluster.
731 	 */
732 	sc->sc_clcount = FSS_BTOCL(sc, bsize-1)+1;
733 	sc->sc_clresid = FSS_CLOFF(sc, bsize-1)+1;
734 
735 	/*
736 	 * Set size of indirect table.
737 	 */
738 	len = sc->sc_clcount*sizeof(u_int32_t);
739 	sc->sc_indir_size = FSS_BTOCL(sc, len)+1;
740 	sc->sc_clnext = sc->sc_indir_size;
741 	sc->sc_indir_cur = 0;
742 
743 	if ((error = fss_softc_alloc(sc)) != 0)
744 		goto bad;
745 
746 	/*
747 	 * Activate the snapshot.
748 	 */
749 
750 	if ((error = vfs_write_suspend(sc->sc_mount, PUSER|PCATCH, 0)) != 0)
751 		goto bad;
752 
753 	microtime(&sc->sc_time);
754 
755 	if (error == 0)
756 		error = vn_cow_establish(sc->sc_mount_vp,
757 		    fss_copy_on_write, sc);
758 	if (error == 0)
759 		sc->sc_flags |= FSS_ACTIVE;
760 
761 	vfs_write_resume(sc->sc_mount);
762 
763 	if (error != 0)
764 		goto bad;
765 
766 #ifdef DEBUG
767 	printf("fss%d: %s snapshot active\n", sc->sc_unit, sc->sc_mntname);
768 	printf("fss%d: %u clusters of %u, %u cache slots, %u indir clusters\n",
769 	    sc->sc_unit, sc->sc_clcount, FSS_CLSIZE(sc),
770 	    sc->sc_cache_size, sc->sc_indir_size);
771 #endif
772 
773 	return 0;
774 
775 bad:
776 	fss_softc_free(sc);
777 	if (sc->sc_bs_vp != NULL) {
778 		if (sc->sc_flags & FSS_PERSISTENT)
779 			vn_close(sc->sc_bs_vp, FREAD, l->l_proc->p_ucred, l);
780 		else
781 			vn_close(sc->sc_bs_vp, FREAD|FWRITE, l->l_proc->p_ucred, l);
782 	}
783 	sc->sc_bs_vp = NULL;
784 
785 	return error;
786 }
787 
788 /*
789  * Delete a snapshot.
790  */
791 static int
792 fss_delete_snapshot(struct fss_softc *sc, struct lwp *l)
793 {
794 	int s;
795 
796 	if ((sc->sc_flags & FSS_PERSISTENT) == 0)
797 		vn_cow_disestablish(sc->sc_mount_vp, fss_copy_on_write, sc);
798 
799 	FSS_LOCK(sc, s);
800 	sc->sc_flags &= ~(FSS_ACTIVE|FSS_ERROR);
801 	sc->sc_mount = NULL;
802 	sc->sc_bdev = NODEV;
803 	FSS_UNLOCK(sc, s);
804 
805 	fss_softc_free(sc);
806 	if (sc->sc_flags & FSS_PERSISTENT)
807 		vn_close(sc->sc_bs_vp, FREAD, l->l_proc->p_ucred, l);
808 	else
809 		vn_close(sc->sc_bs_vp, FREAD|FWRITE, l->l_proc->p_ucred, l);
810 	sc->sc_bs_vp = NULL;
811 	sc->sc_flags &= ~FSS_PERSISTENT;
812 
813 	FSS_STAT_CLEAR(sc);
814 
815 	return 0;
816 }
817 
818 /*
819  * A read from the snapshotted block device has completed.
820  */
821 static void
822 fss_cluster_iodone(struct buf *bp)
823 {
824 	int s;
825 	struct fss_cache *scp = bp->b_private;
826 
827 	FSS_LOCK(scp->fc_softc, s);
828 
829 	if (bp->b_flags & B_EINTR)
830 		fss_error(scp->fc_softc, "fs read interrupted");
831 	if (bp->b_flags & B_ERROR)
832 		fss_error(scp->fc_softc, "fs read error %d", bp->b_error);
833 
834 	if (bp->b_vp != NULL)
835 		brelvp(bp);
836 
837 	if (--scp->fc_xfercount == 0)
838 		wakeup(&scp->fc_data);
839 
840 	FSS_UNLOCK(scp->fc_softc, s);
841 
842 	s = splbio();
843 	pool_put(&bufpool, bp);
844 	splx(s);
845 }
846 
847 /*
848  * Read a cluster from the snapshotted block device to the cache.
849  */
850 static void
851 fss_read_cluster(struct fss_softc *sc, u_int32_t cl)
852 {
853 	int s, todo, len;
854 	caddr_t addr;
855 	daddr_t dblk;
856 	struct buf *bp;
857 	struct fss_cache *scp, *scl;
858 
859 	/*
860 	 * Get a free cache slot.
861 	 */
862 	scl = sc->sc_cache+sc->sc_cache_size;
863 
864 	FSS_LOCK(sc, s);
865 
866 restart:
867 	if (isset(sc->sc_copied, cl) || !FSS_ISVALID(sc)) {
868 		FSS_UNLOCK(sc, s);
869 		return;
870 	}
871 
872 	for (scp = sc->sc_cache; scp < scl; scp++)
873 		if (scp->fc_type != FSS_CACHE_FREE &&
874 		    scp->fc_cluster == cl) {
875 			ltsleep(&scp->fc_type, PRIBIO, "cowwait2", 0,
876 			    &sc->sc_slock);
877 			goto restart;
878 		}
879 
880 	for (scp = sc->sc_cache; scp < scl; scp++)
881 		if (scp->fc_type == FSS_CACHE_FREE) {
882 			scp->fc_type = FSS_CACHE_BUSY;
883 			scp->fc_cluster = cl;
884 			break;
885 		}
886 	if (scp >= scl) {
887 		FSS_STAT_INC(sc, cow_cache_full);
888 		ltsleep(&sc->sc_cache, PRIBIO, "cowwait3", 0, &sc->sc_slock);
889 		goto restart;
890 	}
891 
892 	FSS_UNLOCK(sc, s);
893 
894 	/*
895 	 * Start the read.
896 	 */
897 	FSS_STAT_INC(sc, cow_copied);
898 
899 	dblk = btodb(FSS_CLTOB(sc, cl));
900 	addr = scp->fc_data;
901 	if (cl == sc->sc_clcount-1) {
902 		todo = sc->sc_clresid;
903 		memset(addr+todo, 0, FSS_CLSIZE(sc)-todo);
904 	} else
905 		todo = FSS_CLSIZE(sc);
906 	while (todo > 0) {
907 		len = todo;
908 		if (len > MAXPHYS)
909 			len = MAXPHYS;
910 
911 		s = splbio();
912 		bp = pool_get(&bufpool, PR_WAITOK);
913 		splx(s);
914 
915 		BUF_INIT(bp);
916 		bp->b_flags = B_READ|B_CALL;
917 		bp->b_bcount = len;
918 		bp->b_bufsize = bp->b_bcount;
919 		bp->b_error = 0;
920 		bp->b_data = addr;
921 		bp->b_blkno = bp->b_rawblkno = dblk;
922 		bp->b_proc = NULL;
923 		bp->b_dev = sc->sc_bdev;
924 		bp->b_vp = NULLVP;
925 		bp->b_private = scp;
926 		bp->b_iodone = fss_cluster_iodone;
927 
928 		DEV_STRATEGY(bp);
929 
930 		FSS_LOCK(sc, s);
931 		scp->fc_xfercount++;
932 		FSS_UNLOCK(sc, s);
933 
934 		dblk += btodb(len);
935 		addr += len;
936 		todo -= len;
937 	}
938 
939 	/*
940 	 * Wait for all read requests to complete.
941 	 */
942 	FSS_LOCK(sc, s);
943 	while (scp->fc_xfercount > 0)
944 		ltsleep(&scp->fc_data, PRIBIO, "cowwait", 0, &sc->sc_slock);
945 
946 	scp->fc_type = FSS_CACHE_VALID;
947 	setbit(sc->sc_copied, scp->fc_cluster);
948 	FSS_UNLOCK(sc, s);
949 
950 	wakeup(&sc->sc_bs_proc);
951 }
952 
953 /*
954  * Read/write clusters from/to backing store.
955  * For persistent snapshots must be called with cl == 0. off is the
956  * offset into the snapshot.
957  */
958 static int
959 fss_bs_io(struct fss_softc *sc, fss_io_type rw,
960     u_int32_t cl, off_t off, int len, caddr_t data)
961 {
962 	int error;
963 
964 	off += FSS_CLTOB(sc, cl);
965 
966 	vn_lock(sc->sc_bs_vp, LK_EXCLUSIVE|LK_RETRY);
967 
968 	error = vn_rdwr((rw == FSS_READ ? UIO_READ : UIO_WRITE), sc->sc_bs_vp,
969 	    data, len, off, UIO_SYSSPACE, IO_UNIT|IO_NODELOCKED,
970 	    sc->sc_bs_proc->p_ucred, NULL, NULL);
971 	if (error == 0) {
972 		simple_lock(&sc->sc_bs_vp->v_interlock);
973 		error = VOP_PUTPAGES(sc->sc_bs_vp, trunc_page(off),
974 		    round_page(off+len), PGO_CLEANIT|PGO_SYNCIO|PGO_FREE);
975 	}
976 
977 	VOP_UNLOCK(sc->sc_bs_vp, 0);
978 
979 	return error;
980 }
981 
982 /*
983  * Get a pointer to the indirect slot for this cluster.
984  */
985 static u_int32_t *
986 fss_bs_indir(struct fss_softc *sc, u_int32_t cl)
987 {
988 	u_int32_t icl;
989 	int ioff;
990 
991 	icl = cl/(FSS_CLSIZE(sc)/sizeof(u_int32_t));
992 	ioff = cl%(FSS_CLSIZE(sc)/sizeof(u_int32_t));
993 
994 	if (sc->sc_indir_cur == icl)
995 		return &sc->sc_indir_data[ioff];
996 
997 	if (sc->sc_indir_dirty) {
998 		FSS_STAT_INC(sc, indir_write);
999 		if (fss_bs_io(sc, FSS_WRITE, sc->sc_indir_cur, 0,
1000 		    FSS_CLSIZE(sc), (caddr_t)sc->sc_indir_data) != 0)
1001 			return NULL;
1002 		setbit(sc->sc_indir_valid, sc->sc_indir_cur);
1003 	}
1004 
1005 	sc->sc_indir_dirty = 0;
1006 	sc->sc_indir_cur = icl;
1007 
1008 	if (isset(sc->sc_indir_valid, sc->sc_indir_cur)) {
1009 		FSS_STAT_INC(sc, indir_read);
1010 		if (fss_bs_io(sc, FSS_READ, sc->sc_indir_cur, 0,
1011 		    FSS_CLSIZE(sc), (caddr_t)sc->sc_indir_data) != 0)
1012 			return NULL;
1013 	} else
1014 		memset(sc->sc_indir_data, 0, FSS_CLSIZE(sc));
1015 
1016 	return &sc->sc_indir_data[ioff];
1017 }
1018 
1019 /*
1020  * The kernel thread (one for every active snapshot).
1021  *
1022  * After wakeup it cleans the cache and runs the I/O requests.
1023  */
1024 static void
1025 fss_bs_thread(void *arg)
1026 {
1027 	int error, len, nfreed, nio, s;
1028 	long off;
1029 	caddr_t addr;
1030 	u_int32_t c, cl, ch, *indirp;
1031 	struct buf *bp, *nbp;
1032 	struct fss_softc *sc;
1033 	struct fss_cache *scp, *scl;
1034 
1035 	sc = arg;
1036 
1037 	scl = sc->sc_cache+sc->sc_cache_size;
1038 
1039 	s = splbio();
1040 	nbp = pool_get(&bufpool, PR_WAITOK);
1041 	splx(s);
1042 
1043 	nfreed = nio = 1;		/* Dont sleep the first time */
1044 
1045 	FSS_LOCK(sc, s);
1046 
1047 	for (;;) {
1048 		if (nfreed == 0 && nio == 0)
1049 			ltsleep(&sc->sc_bs_proc, PVM-1, "fssbs", 0,
1050 			    &sc->sc_slock);
1051 
1052 		if ((sc->sc_flags & FSS_BS_THREAD) == 0) {
1053 			sc->sc_bs_proc = NULL;
1054 			wakeup(&sc->sc_bs_proc);
1055 
1056 			FSS_UNLOCK(sc, s);
1057 
1058 			s = splbio();
1059 			pool_put(&bufpool, nbp);
1060 			splx(s);
1061 #ifdef FSS_STATISTICS
1062 			if ((sc->sc_flags & FSS_PERSISTENT) == 0) {
1063 				printf("fss%d: cow called %" PRId64 " times,"
1064 				    " copied %" PRId64 " clusters,"
1065 				    " cache full %" PRId64 " times\n",
1066 				    sc->sc_unit,
1067 				    FSS_STAT_VAL(sc, cow_calls),
1068 				    FSS_STAT_VAL(sc, cow_copied),
1069 				    FSS_STAT_VAL(sc, cow_cache_full));
1070 				printf("fss%d: %" PRId64 " indir reads,"
1071 				    " %" PRId64 " indir writes\n",
1072 				    sc->sc_unit,
1073 				    FSS_STAT_VAL(sc, indir_read),
1074 				    FSS_STAT_VAL(sc, indir_write));
1075 			}
1076 #endif /* FSS_STATISTICS */
1077 			kthread_exit(0);
1078 		}
1079 
1080 		/*
1081 		 * Process I/O requests (persistent)
1082 		 */
1083 
1084 		if (sc->sc_flags & FSS_PERSISTENT) {
1085 			nfreed = nio = 0;
1086 
1087 			if ((bp = BUFQ_GET(sc->sc_bufq)) == NULL)
1088 				continue;
1089 
1090 			nio++;
1091 
1092 			if (FSS_ISVALID(sc)) {
1093 				FSS_UNLOCK(sc, s);
1094 
1095 				error = fss_bs_io(sc, FSS_READ, 0,
1096 				    dbtob(bp->b_blkno), bp->b_bcount,
1097 				    bp->b_data);
1098 
1099 				FSS_LOCK(sc, s);
1100 			} else
1101 				error = ENXIO;
1102 
1103 			if (error) {
1104 				bp->b_error = error;
1105 				bp->b_flags |= B_ERROR;
1106 				bp->b_resid = bp->b_bcount;
1107 			}
1108 			biodone(bp);
1109 
1110 			continue;
1111 		}
1112 
1113 		/*
1114 		 * Clean the cache
1115 		 */
1116 		nfreed = 0;
1117 		for (scp = sc->sc_cache; scp < scl; scp++) {
1118 			if (scp->fc_type != FSS_CACHE_VALID)
1119 				continue;
1120 
1121 			FSS_UNLOCK(sc, s);
1122 
1123 			indirp = fss_bs_indir(sc, scp->fc_cluster);
1124 			if (indirp != NULL) {
1125 				error = fss_bs_io(sc, FSS_WRITE, sc->sc_clnext,
1126 				    0, FSS_CLSIZE(sc), scp->fc_data);
1127 			} else
1128 				error = EIO;
1129 
1130 			FSS_LOCK(sc, s);
1131 
1132 			if (error == 0) {
1133 				*indirp = sc->sc_clnext++;
1134 				sc->sc_indir_dirty = 1;
1135 			} else
1136 				fss_error(sc, "write bs error %d", error);
1137 
1138 			scp->fc_type = FSS_CACHE_FREE;
1139 			nfreed++;
1140 			wakeup(&scp->fc_type);
1141 		}
1142 
1143 		if (nfreed)
1144 			wakeup(&sc->sc_cache);
1145 
1146 		/*
1147 		 * Process I/O requests
1148 		 */
1149 		nio = 0;
1150 
1151 		if ((bp = BUFQ_GET(sc->sc_bufq)) == NULL)
1152 			continue;
1153 
1154 		nio++;
1155 
1156 		if (!FSS_ISVALID(sc)) {
1157 			bp->b_error = ENXIO;
1158 			bp->b_flags |= B_ERROR;
1159 			bp->b_resid = bp->b_bcount;
1160 			biodone(bp);
1161 			continue;
1162 		}
1163 
1164 		/*
1165 		 * First read from the snapshotted block device.
1166 		 * XXX Split to only read those parts that have not
1167 		 * been saved to backing store?
1168 		 */
1169 
1170 		FSS_UNLOCK(sc, s);
1171 
1172 		BUF_INIT(nbp);
1173 		nbp->b_flags = B_READ;
1174 		nbp->b_bcount = bp->b_bcount;
1175 		nbp->b_bufsize = bp->b_bcount;
1176 		nbp->b_error = 0;
1177 		nbp->b_data = bp->b_data;
1178 		nbp->b_blkno = nbp->b_rawblkno = bp->b_blkno;
1179 		nbp->b_proc = bp->b_proc;
1180 		nbp->b_dev = sc->sc_bdev;
1181 		nbp->b_vp = NULLVP;
1182 
1183 		DEV_STRATEGY(nbp);
1184 
1185 		if (biowait(nbp) != 0) {
1186 			bp->b_resid = bp->b_bcount;
1187 			bp->b_error = nbp->b_error;
1188 			bp->b_flags |= B_ERROR;
1189 			biodone(bp);
1190 			continue;
1191 		}
1192 
1193 		cl = FSS_BTOCL(sc, dbtob(bp->b_blkno));
1194 		off = FSS_CLOFF(sc, dbtob(bp->b_blkno));
1195 		ch = FSS_BTOCL(sc, dbtob(bp->b_blkno)+bp->b_bcount-1);
1196 		bp->b_resid = bp->b_bcount;
1197 		addr = bp->b_data;
1198 
1199 		FSS_LOCK(sc, s);
1200 
1201 		/*
1202 		 * Replace those parts that have been saved to backing store.
1203 		 */
1204 
1205 		for (c = cl; c <= ch;
1206 		    c++, off = 0, bp->b_resid -= len, addr += len) {
1207 			len = FSS_CLSIZE(sc)-off;
1208 			if (len > bp->b_resid)
1209 				len = bp->b_resid;
1210 
1211 			if (isclr(sc->sc_copied, c))
1212 				continue;
1213 
1214 			FSS_UNLOCK(sc, s);
1215 
1216 			indirp = fss_bs_indir(sc, c);
1217 
1218 			FSS_LOCK(sc, s);
1219 
1220 			if (indirp == NULL || *indirp == 0) {
1221 				/*
1222 				 * Not on backing store. Either in cache
1223 				 * or hole in the snapshotted block device.
1224 				 */
1225 				for (scp = sc->sc_cache; scp < scl; scp++)
1226 					if (scp->fc_type == FSS_CACHE_VALID &&
1227 					    scp->fc_cluster == c)
1228 						break;
1229 				if (scp < scl)
1230 					memcpy(addr, scp->fc_data+off, len);
1231 				else
1232 					memset(addr, 0, len);
1233 				continue;
1234 			}
1235 			/*
1236 			 * Read from backing store.
1237 			 */
1238 
1239 			FSS_UNLOCK(sc, s);
1240 
1241 			if ((error = fss_bs_io(sc, FSS_READ, *indirp,
1242 			    off, len, addr)) != 0) {
1243 				bp->b_resid = bp->b_bcount;
1244 				bp->b_error = error;
1245 				bp->b_flags |= B_ERROR;
1246 				break;
1247 			}
1248 
1249 			FSS_LOCK(sc, s);
1250 
1251 		}
1252 
1253 		biodone(bp);
1254 	}
1255 }
1256