xref: /onnv-gate/usr/src/uts/common/io/fssnap.c (revision 10542:ef3706982293)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
51925Srsb  * Common Development and Distribution License (the "License").
61925Srsb  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
22*10542SFrank.Batschulat@Sun.COM  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate 
270Sstevel@tonic-gate #include <sys/debug.h>
280Sstevel@tonic-gate #include <sys/types.h>
290Sstevel@tonic-gate #include <sys/file.h>
300Sstevel@tonic-gate #include <sys/errno.h>
310Sstevel@tonic-gate #include <sys/uio.h>
320Sstevel@tonic-gate #include <sys/open.h>
330Sstevel@tonic-gate #include <sys/cred.h>
340Sstevel@tonic-gate #include <sys/kmem.h>
350Sstevel@tonic-gate #include <sys/conf.h>
360Sstevel@tonic-gate #include <sys/cmn_err.h>
370Sstevel@tonic-gate #include <sys/modctl.h>
380Sstevel@tonic-gate #include <sys/disp.h>
390Sstevel@tonic-gate #include <sys/atomic.h>
400Sstevel@tonic-gate #include <sys/filio.h>
410Sstevel@tonic-gate #include <sys/stat.h> /* needed for S_IFBLK and S_IFCHR */
420Sstevel@tonic-gate #include <sys/kstat.h>
430Sstevel@tonic-gate 
440Sstevel@tonic-gate #include <sys/ddi.h>
450Sstevel@tonic-gate #include <sys/devops.h>
460Sstevel@tonic-gate #include <sys/sunddi.h>
474582Scth #include <sys/esunddi.h>
480Sstevel@tonic-gate #include <sys/priv_names.h>
490Sstevel@tonic-gate 
500Sstevel@tonic-gate #include <sys/fssnap.h>
510Sstevel@tonic-gate #include <sys/fssnap_if.h>
520Sstevel@tonic-gate 
530Sstevel@tonic-gate /*
540Sstevel@tonic-gate  * This module implements the file system snapshot code, which provides a
550Sstevel@tonic-gate  * point-in-time image of a file system for the purposes of online backup.
560Sstevel@tonic-gate  * There are essentially two parts to this project: the driver half and the
570Sstevel@tonic-gate  * file system half.  The driver half is a pseudo device driver called
580Sstevel@tonic-gate  * "fssnap" that represents the snapshot.  Each snapshot is assigned a
590Sstevel@tonic-gate  * number that corresponds to the minor number of the device, and a control
600Sstevel@tonic-gate  * device with a high minor number is used to initiate snapshot creation and
610Sstevel@tonic-gate  * deletion.  For all practical purposes the driver half acts like a
620Sstevel@tonic-gate  * read-only disk device whose contents are exactly the same as the master
630Sstevel@tonic-gate  * file system at the time the snapshot was created.
640Sstevel@tonic-gate  *
650Sstevel@tonic-gate  * The file system half provides interfaces necessary for performing the
660Sstevel@tonic-gate  * file system dependent operations required to create and delete snapshots
670Sstevel@tonic-gate  * and a special driver strategy routine that must always be used by the file
680Sstevel@tonic-gate  * system for snapshots to work correctly.
690Sstevel@tonic-gate  *
700Sstevel@tonic-gate  * When a snapshot is to be created, the user utility will send an ioctl to
710Sstevel@tonic-gate  * the control device of the driver half specifying the file system to be
720Sstevel@tonic-gate  * snapshotted, the file descriptor of a backing-store file which is used to
730Sstevel@tonic-gate  * hold old data before it is overwritten, and other snapshot parameters.
740Sstevel@tonic-gate  * This ioctl is passed on to the file system specified in the original
750Sstevel@tonic-gate  * ioctl request.  The file system is expected to be able to flush
760Sstevel@tonic-gate  * everything out to make the file system consistent and lock it to ensure
770Sstevel@tonic-gate  * no changes occur while the snapshot is being created.  It then calls
780Sstevel@tonic-gate  * fssnap_create() to create state for a new snapshot, from which an opaque
790Sstevel@tonic-gate  * handle is returned with the snapshot locked.  Next, the file system must
800Sstevel@tonic-gate  * populate the "candidate bitmap", which tells the snapshot code which
810Sstevel@tonic-gate  * "chunks" should be considered for copy-on-write (a chunk is the unit of
820Sstevel@tonic-gate  * granularity used for copy-on-write, which is independent of the device
830Sstevel@tonic-gate  * and file system block sizes).  This is typically done by scanning the
840Sstevel@tonic-gate  * file system allocation bitmaps to determine which chunks contain
850Sstevel@tonic-gate  * allocated blocks in the file system at the time the snapshot was created.
860Sstevel@tonic-gate  * If a chunk has no allocated blocks, it does not need to be copied before
870Sstevel@tonic-gate  * being written to.  Once the candidate bitmap is populated with
880Sstevel@tonic-gate  * fssnap_set_candidate(), the file system calls fssnap_create_done() to
890Sstevel@tonic-gate  * complete the snapshot creation and unlock the snapshot.  The file system
900Sstevel@tonic-gate  * may now be unlocked and modifications to it resumed.
910Sstevel@tonic-gate  *
920Sstevel@tonic-gate  * Once a snapshot is created, the file system must perform all writes
930Sstevel@tonic-gate  * through a special strategy routine, fssnap_strategy().  This strategy
940Sstevel@tonic-gate  * routine determines whether the chunks contained by the write must be
950Sstevel@tonic-gate  * copied before being overwritten by consulting the candidate bitmap
960Sstevel@tonic-gate  * described above, and the "hastrans bitmap" which tells it whether the chunk
970Sstevel@tonic-gate  * has been copied already or not.  If the chunk is a candidate but has not
980Sstevel@tonic-gate  * been copied, it reads the old data in and adds it to a queue.  The
990Sstevel@tonic-gate  * old data can then be overwritten with the new data.  An asynchronous
1000Sstevel@tonic-gate  * task queue is dispatched for each old chunk read in which writes the old
1010Sstevel@tonic-gate  * data to the backing file specified at snapshot creation time.  The
1020Sstevel@tonic-gate  * backing file is a sparse file the same size as the file system that
1030Sstevel@tonic-gate  * contains the old data at the offset that data originally had in the
1040Sstevel@tonic-gate  * file system.  If the queue containing in-memory chunks gets too large,
1050Sstevel@tonic-gate  * writes to the file system may be throttled by a semaphore until the
1060Sstevel@tonic-gate  * task queues have a chance to push some of the chunks to the backing file.
1070Sstevel@tonic-gate  *
1080Sstevel@tonic-gate  * With the candidate bitmap, the hastrans bitmap, the data on the master
1090Sstevel@tonic-gate  * file system, and the old data in memory and in the backing file, the
1100Sstevel@tonic-gate  * snapshot pseudo-driver can piece together the original file system
1110Sstevel@tonic-gate  * information to satisfy read requests.  If the requested chunk is not a
1120Sstevel@tonic-gate  * candidate, it returns a zeroed buffer.  If the chunk is a candidate but
1130Sstevel@tonic-gate  * has not been copied it reads it from the master file system.  If it is a
1140Sstevel@tonic-gate  * candidate and has been copied, it either copies the data from the
1150Sstevel@tonic-gate  * in-memory queue or it reads it in from the backing file.  The result is
1160Sstevel@tonic-gate  * a replication of the original file system that can be backed up, mounted,
1170Sstevel@tonic-gate  * or manipulated by other file system utilities that work on a read-only
1180Sstevel@tonic-gate  * device.
1190Sstevel@tonic-gate  *
1200Sstevel@tonic-gate  * This module is divided into three roughly logical sections:
1210Sstevel@tonic-gate  *
1220Sstevel@tonic-gate  *     - The snapshot driver, which is a character/block driver
1230Sstevel@tonic-gate  *       representing the snapshot itself.  These routines are
1240Sstevel@tonic-gate  *       prefixed with "snap_".
1250Sstevel@tonic-gate  *
1260Sstevel@tonic-gate  *     - The library routines that are defined in fssnap_if.h that
1270Sstevel@tonic-gate  *       are used by file systems that use this snapshot implementation.
1280Sstevel@tonic-gate  *       These functions are prefixed with "fssnap_" and are called through
1290Sstevel@tonic-gate  *       a function vector from the file system.
1300Sstevel@tonic-gate  *
1310Sstevel@tonic-gate  *     - The helper routines used by the snapshot driver and the fssnap
1320Sstevel@tonic-gate  *       library routines for managing the translation table and other
1330Sstevel@tonic-gate  *       useful functions.  These routines are all static and are
1340Sstevel@tonic-gate  *       prefixed with either "fssnap_" or "transtbl_" if they
1350Sstevel@tonic-gate  *       are specifically used for translation table activities.
1360Sstevel@tonic-gate  */
1370Sstevel@tonic-gate 
1380Sstevel@tonic-gate static dev_info_t		*fssnap_dip = NULL;
1390Sstevel@tonic-gate static struct snapshot_id	*snapshot = NULL;
1400Sstevel@tonic-gate static struct snapshot_id	snap_ctl;
1410Sstevel@tonic-gate static int			num_snapshots = 0;
1420Sstevel@tonic-gate static kmutex_t			snapshot_mutex;
1430Sstevel@tonic-gate static char			snapname[] = SNAP_NAME;
1440Sstevel@tonic-gate 
1450Sstevel@tonic-gate /* "tunable" parameters */
1460Sstevel@tonic-gate static int		fssnap_taskq_nthreads = FSSNAP_TASKQ_THREADS;
1470Sstevel@tonic-gate static uint_t		fssnap_max_mem_chunks = FSSNAP_MAX_MEM_CHUNKS;
1480Sstevel@tonic-gate static int		fssnap_taskq_maxtasks = FSSNAP_TASKQ_MAXTASKS;
1490Sstevel@tonic-gate 
1500Sstevel@tonic-gate /* static function prototypes */
1510Sstevel@tonic-gate 
1520Sstevel@tonic-gate /* snapshot driver */
1530Sstevel@tonic-gate static int snap_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
1540Sstevel@tonic-gate static int snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
1550Sstevel@tonic-gate static int snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
1560Sstevel@tonic-gate static int snap_open(dev_t *devp, int flag, int otyp, cred_t *cred);
1570Sstevel@tonic-gate static int snap_close(dev_t dev, int flag, int otyp, cred_t *cred);
1580Sstevel@tonic-gate static int snap_strategy(struct buf *bp);
1590Sstevel@tonic-gate static int snap_read(dev_t dev, struct uio *uiop, cred_t *credp);
1600Sstevel@tonic-gate static int snap_print(dev_t dev, char *str);
1610Sstevel@tonic-gate static int snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
1620Sstevel@tonic-gate     cred_t *credp, int *rvalp);
1630Sstevel@tonic-gate static int snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
1640Sstevel@tonic-gate     int flags, char *name, caddr_t valuep, int *lengthp);
1650Sstevel@tonic-gate static int snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk,
1660Sstevel@tonic-gate     int offset, int len, char *buffer);
1670Sstevel@tonic-gate 
1680Sstevel@tonic-gate 
1690Sstevel@tonic-gate /* fssnap interface implementations (see fssnap_if.h) */
1700Sstevel@tonic-gate static void fssnap_strategy_impl(void *, struct buf *);
1710Sstevel@tonic-gate static void *fssnap_create_impl(chunknumber_t, uint_t, u_offset_t,
1720Sstevel@tonic-gate     struct vnode *, int, struct vnode **, char *, u_offset_t);
1730Sstevel@tonic-gate static void fssnap_set_candidate_impl(void *, chunknumber_t);
1740Sstevel@tonic-gate static int fssnap_is_candidate_impl(void *, u_offset_t);
1750Sstevel@tonic-gate static int fssnap_create_done_impl(void *);
1760Sstevel@tonic-gate static int fssnap_delete_impl(void *);
1770Sstevel@tonic-gate 
1780Sstevel@tonic-gate /* fssnap interface support routines */
1790Sstevel@tonic-gate static int  fssnap_translate(struct snapshot_id **, struct buf *);
1800Sstevel@tonic-gate static void fssnap_write_taskq(void *);
1810Sstevel@tonic-gate static void fssnap_create_kstats(snapshot_id_t *, int, const char *,
1820Sstevel@tonic-gate     const char *);
1830Sstevel@tonic-gate static int  fssnap_update_kstat_num(kstat_t *, int);
1840Sstevel@tonic-gate static void fssnap_delete_kstats(struct cow_info *);
1850Sstevel@tonic-gate 
1860Sstevel@tonic-gate /* translation table prototypes */
1870Sstevel@tonic-gate static cow_map_node_t *transtbl_add(cow_map_t *, chunknumber_t, caddr_t);
1880Sstevel@tonic-gate static cow_map_node_t *transtbl_get(cow_map_t *, chunknumber_t);
1890Sstevel@tonic-gate static void transtbl_delete(cow_map_t *, cow_map_node_t *);
1900Sstevel@tonic-gate static void transtbl_free(cow_map_t *);
1910Sstevel@tonic-gate 
1920Sstevel@tonic-gate static kstat_t *fssnap_highwater_kstat;
1930Sstevel@tonic-gate 
1940Sstevel@tonic-gate /* ************************************************************************ */
1950Sstevel@tonic-gate 
1960Sstevel@tonic-gate /* Device and Module Structures */
1970Sstevel@tonic-gate 
1980Sstevel@tonic-gate static struct cb_ops snap_cb_ops = {
1990Sstevel@tonic-gate 	snap_open,
2000Sstevel@tonic-gate 	snap_close,
2010Sstevel@tonic-gate 	snap_strategy,
2020Sstevel@tonic-gate 	snap_print,
2030Sstevel@tonic-gate 	nodev,		/* no snap_dump */
2040Sstevel@tonic-gate 	snap_read,
2050Sstevel@tonic-gate 	nodev,		/* no snap_write */
2060Sstevel@tonic-gate 	snap_ioctl,
2070Sstevel@tonic-gate 	nodev,		/* no snap_devmap */
2080Sstevel@tonic-gate 	nodev,		/* no snap_mmap   */
2090Sstevel@tonic-gate 	nodev,		/* no snap_segmap */
2100Sstevel@tonic-gate 	nochpoll,
2110Sstevel@tonic-gate 	snap_prop_op,
2120Sstevel@tonic-gate 	NULL,		/* streamtab */
2130Sstevel@tonic-gate 	D_64BIT | D_NEW | D_MP, /* driver compatibility */
2140Sstevel@tonic-gate 	CB_REV,
2150Sstevel@tonic-gate 	nodev,		/* async I/O read entry point */
2160Sstevel@tonic-gate 	nodev		/* async I/O write entry point */
2170Sstevel@tonic-gate };
2180Sstevel@tonic-gate 
2190Sstevel@tonic-gate static struct dev_ops snap_ops = {
2200Sstevel@tonic-gate 	DEVO_REV,
2210Sstevel@tonic-gate 	0,			/* ref count */
2220Sstevel@tonic-gate 	snap_getinfo,
2230Sstevel@tonic-gate 	nulldev,		/* snap_identify obsolete */
2240Sstevel@tonic-gate 	nulldev,		/* no snap_probe */
2250Sstevel@tonic-gate 	snap_attach,
2260Sstevel@tonic-gate 	snap_detach,
2270Sstevel@tonic-gate 	nodev,			/* no snap_reset */
2280Sstevel@tonic-gate 	&snap_cb_ops,
2290Sstevel@tonic-gate 	(struct bus_ops *)NULL,
2307656SSherry.Moore@Sun.COM 	nulldev,		/* no snap_power() */
2317656SSherry.Moore@Sun.COM 	ddi_quiesce_not_needed,		/* quiesce */
2320Sstevel@tonic-gate };
2330Sstevel@tonic-gate 
2340Sstevel@tonic-gate extern struct mod_ops mod_driverops;
2350Sstevel@tonic-gate 
2360Sstevel@tonic-gate static struct modldrv md = {
2370Sstevel@tonic-gate 	&mod_driverops, /* Type of module. This is a driver */
2387656SSherry.Moore@Sun.COM 	"snapshot driver", 	/* Name of the module */
2390Sstevel@tonic-gate 	&snap_ops,
2400Sstevel@tonic-gate };
2410Sstevel@tonic-gate 
2420Sstevel@tonic-gate static struct modlinkage ml = {
2430Sstevel@tonic-gate 	MODREV_1,
2440Sstevel@tonic-gate 	&md,
2450Sstevel@tonic-gate 	NULL
2460Sstevel@tonic-gate };
2470Sstevel@tonic-gate 
2480Sstevel@tonic-gate static void *statep;
2490Sstevel@tonic-gate 
2500Sstevel@tonic-gate int
_init(void)2510Sstevel@tonic-gate _init(void)
2520Sstevel@tonic-gate {
2530Sstevel@tonic-gate 	int	error;
2540Sstevel@tonic-gate 	kstat_t	*ksp;
2550Sstevel@tonic-gate 	kstat_named_t	*ksdata;
2560Sstevel@tonic-gate 
2570Sstevel@tonic-gate 	error = ddi_soft_state_init(&statep, sizeof (struct snapshot_id *), 1);
2580Sstevel@tonic-gate 	if (error) {
2590Sstevel@tonic-gate 		cmn_err(CE_WARN, "_init: failed to init ddi_soft_state.");
2600Sstevel@tonic-gate 		return (error);
2610Sstevel@tonic-gate 	}
2620Sstevel@tonic-gate 
2630Sstevel@tonic-gate 	error = mod_install(&ml);
2640Sstevel@tonic-gate 
2650Sstevel@tonic-gate 	if (error) {
2660Sstevel@tonic-gate 		cmn_err(CE_WARN, "_init: failed to mod_install.");
2670Sstevel@tonic-gate 		ddi_soft_state_fini(&statep);
2680Sstevel@tonic-gate 		return (error);
2690Sstevel@tonic-gate 	}
2700Sstevel@tonic-gate 
2710Sstevel@tonic-gate 	/*
2720Sstevel@tonic-gate 	 * Fill in the snapshot operations vector for file systems
2730Sstevel@tonic-gate 	 * (defined in fssnap_if.c)
2740Sstevel@tonic-gate 	 */
2750Sstevel@tonic-gate 
2760Sstevel@tonic-gate 	snapops.fssnap_create = fssnap_create_impl;
2770Sstevel@tonic-gate 	snapops.fssnap_set_candidate = fssnap_set_candidate_impl;
2780Sstevel@tonic-gate 	snapops.fssnap_is_candidate = fssnap_is_candidate_impl;
2790Sstevel@tonic-gate 	snapops.fssnap_create_done = fssnap_create_done_impl;
2800Sstevel@tonic-gate 	snapops.fssnap_delete = fssnap_delete_impl;
2810Sstevel@tonic-gate 	snapops.fssnap_strategy = fssnap_strategy_impl;
2820Sstevel@tonic-gate 
2830Sstevel@tonic-gate 	mutex_init(&snapshot_mutex, NULL, MUTEX_DEFAULT, NULL);
2840Sstevel@tonic-gate 
2850Sstevel@tonic-gate 	/*
2860Sstevel@tonic-gate 	 * Initialize the fssnap highwater kstat
2870Sstevel@tonic-gate 	 */
2880Sstevel@tonic-gate 	ksp = kstat_create(snapname, 0, FSSNAP_KSTAT_HIGHWATER, "misc",
2890Sstevel@tonic-gate 	    KSTAT_TYPE_NAMED, 1, 0);
2900Sstevel@tonic-gate 	if (ksp != NULL) {
2910Sstevel@tonic-gate 		ksdata = (kstat_named_t *)ksp->ks_data;
2920Sstevel@tonic-gate 		kstat_named_init(ksdata, FSSNAP_KSTAT_HIGHWATER,
2930Sstevel@tonic-gate 		    KSTAT_DATA_UINT32);
2940Sstevel@tonic-gate 		ksdata->value.ui32 = 0;
2950Sstevel@tonic-gate 		kstat_install(ksp);
2960Sstevel@tonic-gate 	} else {
2970Sstevel@tonic-gate 		cmn_err(CE_WARN, "_init: failed to create highwater kstat.");
2980Sstevel@tonic-gate 	}
2990Sstevel@tonic-gate 	fssnap_highwater_kstat = ksp;
3000Sstevel@tonic-gate 
3010Sstevel@tonic-gate 	return (0);
3020Sstevel@tonic-gate }
3030Sstevel@tonic-gate 
3040Sstevel@tonic-gate int
_info(struct modinfo * modinfop)3050Sstevel@tonic-gate _info(struct modinfo *modinfop)
3060Sstevel@tonic-gate {
3070Sstevel@tonic-gate 	return (mod_info(&ml, modinfop));
3080Sstevel@tonic-gate }
3090Sstevel@tonic-gate 
3100Sstevel@tonic-gate int
_fini(void)3110Sstevel@tonic-gate _fini(void)
3120Sstevel@tonic-gate {
3130Sstevel@tonic-gate 	int	error;
3140Sstevel@tonic-gate 
3150Sstevel@tonic-gate 	error = mod_remove(&ml);
3160Sstevel@tonic-gate 	if (error)
3170Sstevel@tonic-gate 		return (error);
3180Sstevel@tonic-gate 	ddi_soft_state_fini(&statep);
3190Sstevel@tonic-gate 
3200Sstevel@tonic-gate 	/*
3210Sstevel@tonic-gate 	 * delete the fssnap highwater kstat
3220Sstevel@tonic-gate 	 */
3230Sstevel@tonic-gate 	kstat_delete(fssnap_highwater_kstat);
3240Sstevel@tonic-gate 
3250Sstevel@tonic-gate 	mutex_destroy(&snapshot_mutex);
3260Sstevel@tonic-gate 
3270Sstevel@tonic-gate 	/* Clear out the file system operations vector */
3280Sstevel@tonic-gate 	snapops.fssnap_create = NULL;
3290Sstevel@tonic-gate 	snapops.fssnap_set_candidate = NULL;
3300Sstevel@tonic-gate 	snapops.fssnap_create_done = NULL;
3310Sstevel@tonic-gate 	snapops.fssnap_delete = NULL;
3320Sstevel@tonic-gate 	snapops.fssnap_strategy = NULL;
3330Sstevel@tonic-gate 
3340Sstevel@tonic-gate 	return (0);
3350Sstevel@tonic-gate }
3360Sstevel@tonic-gate 
3370Sstevel@tonic-gate /* ************************************************************************ */
3380Sstevel@tonic-gate 
3390Sstevel@tonic-gate /*
3400Sstevel@tonic-gate  * Snapshot Driver Routines
3410Sstevel@tonic-gate  *
3420Sstevel@tonic-gate  * This section implements the snapshot character and block drivers.  The
3430Sstevel@tonic-gate  * device will appear to be a consistent read-only file system to
3440Sstevel@tonic-gate  * applications that wish to back it up or mount it.  The snapshot driver
3450Sstevel@tonic-gate  * communicates with the file system through the translation table, which
3460Sstevel@tonic-gate  * tells the snapshot driver where to find the data necessary to piece
3470Sstevel@tonic-gate  * together the frozen file system.  The data may either be on the master
3480Sstevel@tonic-gate  * device (no translation exists), in memory (a translation exists but has
3490Sstevel@tonic-gate  * not been flushed to the backing store), or in the backing store file.
3505331Samw  * The read request may require the snapshot driver to retrieve data from
3510Sstevel@tonic-gate  * several different places and piece it together to look like a single
3520Sstevel@tonic-gate  * contiguous read.
3530Sstevel@tonic-gate  *
3540Sstevel@tonic-gate  * The device minor number corresponds to the snapshot number in the list of
3550Sstevel@tonic-gate  * snapshot identifiers.  The soft state for each minor number is simply a
3560Sstevel@tonic-gate  * pointer to the snapshot id, which holds all of the snapshot state.  One
3570Sstevel@tonic-gate  * minor number is designated as the control device.  All snapshot create
3580Sstevel@tonic-gate  * and delete requests go through the control device to ensure this module
3590Sstevel@tonic-gate  * is properly loaded and attached before the file system starts calling
3600Sstevel@tonic-gate  * routines defined here.
3610Sstevel@tonic-gate  */
3620Sstevel@tonic-gate 
3630Sstevel@tonic-gate 
3640Sstevel@tonic-gate /*
3650Sstevel@tonic-gate  * snap_getinfo() - snapshot driver getinfo(9E) routine
3660Sstevel@tonic-gate  *
3670Sstevel@tonic-gate  */
3680Sstevel@tonic-gate /*ARGSUSED*/
3690Sstevel@tonic-gate static int
snap_getinfo(dev_info_t * dip,ddi_info_cmd_t infocmd,void * arg,void ** result)3700Sstevel@tonic-gate snap_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
3710Sstevel@tonic-gate {
3720Sstevel@tonic-gate 	switch (infocmd) {
3730Sstevel@tonic-gate 	case DDI_INFO_DEVT2DEVINFO:
3740Sstevel@tonic-gate 		*result = fssnap_dip;
3750Sstevel@tonic-gate 		return (DDI_SUCCESS);
3760Sstevel@tonic-gate 	case DDI_INFO_DEVT2INSTANCE:
3770Sstevel@tonic-gate 		*result = 0;	/* we only have one instance */
3780Sstevel@tonic-gate 		return (DDI_SUCCESS);
3790Sstevel@tonic-gate 	}
3800Sstevel@tonic-gate 	return (DDI_FAILURE);
3810Sstevel@tonic-gate }
3820Sstevel@tonic-gate 
3830Sstevel@tonic-gate /*
3840Sstevel@tonic-gate  * snap_attach() - snapshot driver attach(9E) routine
3850Sstevel@tonic-gate  *
3860Sstevel@tonic-gate  *    sets up snapshot control device and control state.  The control state
3870Sstevel@tonic-gate  *    is a pointer to an "anonymous" snapshot_id for tracking opens and closes
3880Sstevel@tonic-gate  */
3890Sstevel@tonic-gate static int
snap_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)3900Sstevel@tonic-gate snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3910Sstevel@tonic-gate {
3920Sstevel@tonic-gate 	int			error;
3930Sstevel@tonic-gate 
3940Sstevel@tonic-gate 	switch (cmd) {
3950Sstevel@tonic-gate 	case DDI_ATTACH:
3960Sstevel@tonic-gate 		/* create the control device */
3970Sstevel@tonic-gate 		error = ddi_create_priv_minor_node(dip, SNAP_CTL_NODE, S_IFCHR,
3980Sstevel@tonic-gate 		    SNAP_CTL_MINOR, DDI_PSEUDO, PRIVONLY_DEV,
3990Sstevel@tonic-gate 		    PRIV_SYS_CONFIG, PRIV_SYS_CONFIG, 0666);
4000Sstevel@tonic-gate 		if (error == DDI_FAILURE) {
4010Sstevel@tonic-gate 			return (DDI_FAILURE);
4020Sstevel@tonic-gate 		}
4030Sstevel@tonic-gate 
4040Sstevel@tonic-gate 		rw_init(&snap_ctl.sid_rwlock, NULL, RW_DEFAULT, NULL);
4050Sstevel@tonic-gate 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
4060Sstevel@tonic-gate 		fssnap_dip = dip;
4070Sstevel@tonic-gate 		snap_ctl.sid_snapnumber = SNAP_CTL_MINOR;
4080Sstevel@tonic-gate 		/* the control sid is not linked into the snapshot list */
4090Sstevel@tonic-gate 		snap_ctl.sid_next = NULL;
4100Sstevel@tonic-gate 		snap_ctl.sid_cowinfo = NULL;
4110Sstevel@tonic-gate 		snap_ctl.sid_flags = 0;
4120Sstevel@tonic-gate 		rw_exit(&snap_ctl.sid_rwlock);
4130Sstevel@tonic-gate 		ddi_report_dev(dip);
4140Sstevel@tonic-gate 
4150Sstevel@tonic-gate 		return (DDI_SUCCESS);
4160Sstevel@tonic-gate 	case DDI_PM_RESUME:
4170Sstevel@tonic-gate 		return (DDI_SUCCESS);
4180Sstevel@tonic-gate 
4190Sstevel@tonic-gate 	case DDI_RESUME:
4200Sstevel@tonic-gate 		return (DDI_SUCCESS);
4210Sstevel@tonic-gate 
4220Sstevel@tonic-gate 	default:
4230Sstevel@tonic-gate 		return (DDI_FAILURE);
4240Sstevel@tonic-gate 	}
4250Sstevel@tonic-gate }
4260Sstevel@tonic-gate 
4270Sstevel@tonic-gate /*
4280Sstevel@tonic-gate  * snap_detach() - snapshot driver detach(9E) routine
4290Sstevel@tonic-gate  *
4300Sstevel@tonic-gate  *    destroys snapshot control device and control state.  If any snapshots
4310Sstevel@tonic-gate  *    are active (ie. num_snapshots != 0), the device will refuse to detach.
4320Sstevel@tonic-gate  */
4330Sstevel@tonic-gate static int
snap_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)4340Sstevel@tonic-gate snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
4350Sstevel@tonic-gate {
4360Sstevel@tonic-gate 	struct snapshot_id *sidp, *sidnextp;
4370Sstevel@tonic-gate 
4380Sstevel@tonic-gate 	switch (cmd) {
4390Sstevel@tonic-gate 	case DDI_DETACH:
4400Sstevel@tonic-gate 		/* do not detach if the device is active */
4410Sstevel@tonic-gate 		mutex_enter(&snapshot_mutex);
4420Sstevel@tonic-gate 		if ((num_snapshots != 0) ||
4430Sstevel@tonic-gate 		    ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0)) {
4440Sstevel@tonic-gate 			mutex_exit(&snapshot_mutex);
4450Sstevel@tonic-gate 			return (DDI_FAILURE);
4460Sstevel@tonic-gate 		}
4470Sstevel@tonic-gate 
4480Sstevel@tonic-gate 		/* free up the snapshot list */
4490Sstevel@tonic-gate 		for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
4500Sstevel@tonic-gate 			ASSERT(SID_AVAILABLE(sidp) &&
4510Sstevel@tonic-gate 			    !RW_LOCK_HELD(&sidp->sid_rwlock));
4520Sstevel@tonic-gate 			sidnextp = sidp->sid_next;
4530Sstevel@tonic-gate 			rw_destroy(&sidp->sid_rwlock);
4540Sstevel@tonic-gate 			kmem_free(sidp, sizeof (struct snapshot_id));
4550Sstevel@tonic-gate 		}
4560Sstevel@tonic-gate 		snapshot = NULL;
4570Sstevel@tonic-gate 
4580Sstevel@tonic-gate 		/* delete the control device */
4590Sstevel@tonic-gate 		ddi_remove_minor_node(dip, SNAP_CTL_NODE);
4600Sstevel@tonic-gate 		fssnap_dip = NULL;
4610Sstevel@tonic-gate 
4620Sstevel@tonic-gate 		ASSERT((snap_ctl.sid_flags & SID_CHAR_BUSY) == 0);
4630Sstevel@tonic-gate 		rw_destroy(&snap_ctl.sid_rwlock);
4640Sstevel@tonic-gate 		mutex_exit(&snapshot_mutex);
4650Sstevel@tonic-gate 
4660Sstevel@tonic-gate 		return (DDI_SUCCESS);
4670Sstevel@tonic-gate 
4680Sstevel@tonic-gate 	default:
4690Sstevel@tonic-gate 		return (DDI_FAILURE);
4700Sstevel@tonic-gate 	}
4710Sstevel@tonic-gate }
4720Sstevel@tonic-gate 
4730Sstevel@tonic-gate /*
4740Sstevel@tonic-gate  * snap_open() - snapshot driver open(9E) routine
4750Sstevel@tonic-gate  *
4760Sstevel@tonic-gate  *     marks the snapshot id as busy so it will not be recycled when deleted
4770Sstevel@tonic-gate  *     until the snapshot is closed.
4780Sstevel@tonic-gate  */
4790Sstevel@tonic-gate /* ARGSUSED */
4800Sstevel@tonic-gate static int
snap_open(dev_t * devp,int flag,int otyp,cred_t * cred)4810Sstevel@tonic-gate snap_open(dev_t *devp, int flag, int otyp, cred_t *cred)
4820Sstevel@tonic-gate {
4830Sstevel@tonic-gate 	minor_t	minor;
4840Sstevel@tonic-gate 	struct snapshot_id **sidpp, *sidp;
4850Sstevel@tonic-gate 
4860Sstevel@tonic-gate 	/* snapshots are read-only */
4870Sstevel@tonic-gate 	if (flag & FWRITE)
4880Sstevel@tonic-gate 		return (EROFS);
4890Sstevel@tonic-gate 
4900Sstevel@tonic-gate 	minor = getminor(*devp);
4910Sstevel@tonic-gate 
4920Sstevel@tonic-gate 	if (minor == SNAP_CTL_MINOR) {
4930Sstevel@tonic-gate 		/* control device must be opened exclusively */
4940Sstevel@tonic-gate 		if (((flag & FEXCL) != FEXCL) || (otyp != OTYP_CHR))
4950Sstevel@tonic-gate 			return (EINVAL);
4960Sstevel@tonic-gate 
4970Sstevel@tonic-gate 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
4980Sstevel@tonic-gate 		if ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0) {
4990Sstevel@tonic-gate 			rw_exit(&snap_ctl.sid_rwlock);
5000Sstevel@tonic-gate 			return (EBUSY);
5010Sstevel@tonic-gate 		}
5020Sstevel@tonic-gate 
5030Sstevel@tonic-gate 		snap_ctl.sid_flags |= SID_CHAR_BUSY;
5040Sstevel@tonic-gate 		rw_exit(&snap_ctl.sid_rwlock);
5050Sstevel@tonic-gate 
5060Sstevel@tonic-gate 		return (0);
5070Sstevel@tonic-gate 	}
5080Sstevel@tonic-gate 
5090Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
5100Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL)
5110Sstevel@tonic-gate 		return (ENXIO);
5120Sstevel@tonic-gate 	sidp = *sidpp;
5130Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
5140Sstevel@tonic-gate 
5150Sstevel@tonic-gate 	if ((flag & FEXCL) && SID_BUSY(sidp)) {
5160Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
5170Sstevel@tonic-gate 		return (EAGAIN);
5180Sstevel@tonic-gate 	}
5190Sstevel@tonic-gate 
5200Sstevel@tonic-gate 	ASSERT(sidpp != NULL && sidp != NULL);
5210Sstevel@tonic-gate 	/* check to see if this snapshot has been killed on us */
5220Sstevel@tonic-gate 	if (SID_INACTIVE(sidp)) {
5230Sstevel@tonic-gate 		cmn_err(CE_WARN, "snap_open: snapshot %d does not exist.",
5240Sstevel@tonic-gate 		    minor);
5250Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
5260Sstevel@tonic-gate 		return (ENXIO);
5270Sstevel@tonic-gate 	}
5280Sstevel@tonic-gate 
5290Sstevel@tonic-gate 	switch (otyp) {
5300Sstevel@tonic-gate 	case OTYP_CHR:
5310Sstevel@tonic-gate 		sidp->sid_flags |= SID_CHAR_BUSY;
5320Sstevel@tonic-gate 		break;
5330Sstevel@tonic-gate 	case OTYP_BLK:
5340Sstevel@tonic-gate 		sidp->sid_flags |= SID_BLOCK_BUSY;
5350Sstevel@tonic-gate 		break;
5360Sstevel@tonic-gate 	default:
5370Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
5380Sstevel@tonic-gate 		return (EINVAL);
5390Sstevel@tonic-gate 	}
5400Sstevel@tonic-gate 
5410Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
5420Sstevel@tonic-gate 
5430Sstevel@tonic-gate 	/*
5440Sstevel@tonic-gate 	 * at this point if a valid snapshot was found then it has
5450Sstevel@tonic-gate 	 * been marked busy and we can use it.
5460Sstevel@tonic-gate 	 */
5470Sstevel@tonic-gate 	return (0);
5480Sstevel@tonic-gate }
5490Sstevel@tonic-gate 
5500Sstevel@tonic-gate /*
5510Sstevel@tonic-gate  * snap_close() - snapshot driver close(9E) routine
5520Sstevel@tonic-gate  *
5530Sstevel@tonic-gate  *    unsets the busy bits in the snapshot id.  If the snapshot has been
5540Sstevel@tonic-gate  *    deleted while the snapshot device was open, the close call will clean
5550Sstevel@tonic-gate  *    up the remaining state information.
5560Sstevel@tonic-gate  */
5570Sstevel@tonic-gate /* ARGSUSED */
5580Sstevel@tonic-gate static int
snap_close(dev_t dev,int flag,int otyp,cred_t * cred)5590Sstevel@tonic-gate snap_close(dev_t dev, int flag, int otyp, cred_t *cred)
5600Sstevel@tonic-gate {
5610Sstevel@tonic-gate 	struct snapshot_id	**sidpp, *sidp;
5620Sstevel@tonic-gate 	minor_t			minor;
5630Sstevel@tonic-gate 	char			name[20];
5640Sstevel@tonic-gate 
5650Sstevel@tonic-gate 	minor = getminor(dev);
5660Sstevel@tonic-gate 
5670Sstevel@tonic-gate 	/* if this is the control device, close it and return */
5680Sstevel@tonic-gate 	if (minor == SNAP_CTL_MINOR) {
5690Sstevel@tonic-gate 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
5700Sstevel@tonic-gate 		snap_ctl.sid_flags &= ~(SID_CHAR_BUSY);
5710Sstevel@tonic-gate 		rw_exit(&snap_ctl.sid_rwlock);
5720Sstevel@tonic-gate 		return (0);
5730Sstevel@tonic-gate 	}
5740Sstevel@tonic-gate 
5750Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
5760Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL) {
5770Sstevel@tonic-gate 		cmn_err(CE_WARN, "snap_close: could not find state for "
5780Sstevel@tonic-gate 		    "snapshot %d.", minor);
5790Sstevel@tonic-gate 		return (ENXIO);
5800Sstevel@tonic-gate 	}
5810Sstevel@tonic-gate 	sidp = *sidpp;
5820Sstevel@tonic-gate 	mutex_enter(&snapshot_mutex);
5830Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
5840Sstevel@tonic-gate 
5850Sstevel@tonic-gate 	/* Mark the snapshot as not being busy anymore */
5860Sstevel@tonic-gate 	switch (otyp) {
5870Sstevel@tonic-gate 	case OTYP_CHR:
5880Sstevel@tonic-gate 		sidp->sid_flags &= ~(SID_CHAR_BUSY);
5890Sstevel@tonic-gate 		break;
5900Sstevel@tonic-gate 	case OTYP_BLK:
5910Sstevel@tonic-gate 		sidp->sid_flags &= ~(SID_BLOCK_BUSY);
5920Sstevel@tonic-gate 		break;
5930Sstevel@tonic-gate 	default:
5940Sstevel@tonic-gate 		mutex_exit(&snapshot_mutex);
5950Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
5960Sstevel@tonic-gate 		return (EINVAL);
5970Sstevel@tonic-gate 	}
5980Sstevel@tonic-gate 
5990Sstevel@tonic-gate 	if (SID_AVAILABLE(sidp)) {
6000Sstevel@tonic-gate 		/*
6010Sstevel@tonic-gate 		 * if this is the last close on a snapshot that has been
6020Sstevel@tonic-gate 		 * deleted, then free up the soft state.  The snapdelete
6030Sstevel@tonic-gate 		 * ioctl does not free this when the device is in use so
6040Sstevel@tonic-gate 		 * we do it here after the last reference goes away.
6050Sstevel@tonic-gate 		 */
6060Sstevel@tonic-gate 
6070Sstevel@tonic-gate 		/* remove the device nodes */
6080Sstevel@tonic-gate 		ASSERT(fssnap_dip != NULL);
6090Sstevel@tonic-gate 		(void) snprintf(name, sizeof (name), "%d",
6100Sstevel@tonic-gate 		    sidp->sid_snapnumber);
6110Sstevel@tonic-gate 		ddi_remove_minor_node(fssnap_dip, name);
6120Sstevel@tonic-gate 		(void) snprintf(name, sizeof (name), "%d,raw",
6130Sstevel@tonic-gate 		    sidp->sid_snapnumber);
6140Sstevel@tonic-gate 		ddi_remove_minor_node(fssnap_dip, name);
6150Sstevel@tonic-gate 
6160Sstevel@tonic-gate 		/* delete the state structure */
6170Sstevel@tonic-gate 		ddi_soft_state_free(statep, sidp->sid_snapnumber);
6180Sstevel@tonic-gate 		num_snapshots--;
6190Sstevel@tonic-gate 	}
6200Sstevel@tonic-gate 
6210Sstevel@tonic-gate 	mutex_exit(&snapshot_mutex);
6220Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
6230Sstevel@tonic-gate 
6240Sstevel@tonic-gate 	return (0);
6250Sstevel@tonic-gate }
6260Sstevel@tonic-gate 
6270Sstevel@tonic-gate /*
6280Sstevel@tonic-gate  * snap_read() - snapshot driver read(9E) routine
6290Sstevel@tonic-gate  *
6300Sstevel@tonic-gate  *    reads data from the snapshot by calling snap_strategy() through physio()
6310Sstevel@tonic-gate  */
6320Sstevel@tonic-gate /* ARGSUSED */
6330Sstevel@tonic-gate static int
snap_read(dev_t dev,struct uio * uiop,cred_t * credp)6340Sstevel@tonic-gate snap_read(dev_t dev, struct uio *uiop, cred_t *credp)
6350Sstevel@tonic-gate {
6360Sstevel@tonic-gate 	minor_t		minor;
6370Sstevel@tonic-gate 	struct snapshot_id **sidpp;
6380Sstevel@tonic-gate 
6390Sstevel@tonic-gate 	minor = getminor(dev);
6400Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
6410Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL) {
6420Sstevel@tonic-gate 		cmn_err(CE_WARN,
6430Sstevel@tonic-gate 		    "snap_read: could not find state for snapshot %d.", minor);
6440Sstevel@tonic-gate 		return (ENXIO);
6450Sstevel@tonic-gate 	}
6460Sstevel@tonic-gate 	return (physio(snap_strategy, NULL, dev, B_READ, minphys, uiop));
6470Sstevel@tonic-gate }
6480Sstevel@tonic-gate 
6490Sstevel@tonic-gate /*
6500Sstevel@tonic-gate  * snap_strategy() - snapshot driver strategy(9E) routine
6510Sstevel@tonic-gate  *
6520Sstevel@tonic-gate  *    cycles through each chunk in the requested buffer and calls
6530Sstevel@tonic-gate  *    snap_getchunk() on each chunk to retrieve it from the appropriate
6540Sstevel@tonic-gate  *    place.  Once all of the parts are put together the requested buffer
6550Sstevel@tonic-gate  *    is returned.  The snapshot driver is read-only, so a write is invalid.
6560Sstevel@tonic-gate  */
6570Sstevel@tonic-gate static int
snap_strategy(struct buf * bp)6580Sstevel@tonic-gate snap_strategy(struct buf *bp)
6590Sstevel@tonic-gate {
6600Sstevel@tonic-gate 	struct snapshot_id **sidpp, *sidp;
6610Sstevel@tonic-gate 	minor_t		minor;
6620Sstevel@tonic-gate 	chunknumber_t	chunk;
6630Sstevel@tonic-gate 	int		off, len;
6640Sstevel@tonic-gate 	u_longlong_t	reqptr;
6650Sstevel@tonic-gate 	int		error = 0;
6660Sstevel@tonic-gate 	size_t		chunksz;
6670Sstevel@tonic-gate 	caddr_t		buf;
6680Sstevel@tonic-gate 
6690Sstevel@tonic-gate 	/* snapshot device is read-only */
6700Sstevel@tonic-gate 	if (bp->b_flags & B_WRITE) {
6710Sstevel@tonic-gate 		bioerror(bp, EROFS);
6720Sstevel@tonic-gate 		bp->b_resid = bp->b_bcount;
6730Sstevel@tonic-gate 		biodone(bp);
6740Sstevel@tonic-gate 		return (0);
6750Sstevel@tonic-gate 	}
6760Sstevel@tonic-gate 
6770Sstevel@tonic-gate 	minor = getminor(bp->b_edev);
6780Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
6790Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL) {
6800Sstevel@tonic-gate 		cmn_err(CE_WARN,
6810Sstevel@tonic-gate 		    "snap_strategy: could not find state for snapshot %d.",
6820Sstevel@tonic-gate 		    minor);
6830Sstevel@tonic-gate 		bioerror(bp, ENXIO);
6840Sstevel@tonic-gate 		bp->b_resid = bp->b_bcount;
6850Sstevel@tonic-gate 		biodone(bp);
6860Sstevel@tonic-gate 		return (0);
6870Sstevel@tonic-gate 	}
6880Sstevel@tonic-gate 	sidp = *sidpp;
6890Sstevel@tonic-gate 	ASSERT(sidp);
6900Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_READER);
6910Sstevel@tonic-gate 
6920Sstevel@tonic-gate 	if (SID_INACTIVE(sidp)) {
6930Sstevel@tonic-gate 		bioerror(bp, ENXIO);
6940Sstevel@tonic-gate 		bp->b_resid = bp->b_bcount;
6950Sstevel@tonic-gate 		biodone(bp);
6960Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
6970Sstevel@tonic-gate 		return (0);
6980Sstevel@tonic-gate 	}
6990Sstevel@tonic-gate 
7000Sstevel@tonic-gate 	if (bp->b_flags & (B_PAGEIO|B_PHYS))
7010Sstevel@tonic-gate 		bp_mapin(bp);
7020Sstevel@tonic-gate 
7030Sstevel@tonic-gate 	bp->b_resid = bp->b_bcount;
7040Sstevel@tonic-gate 	ASSERT(bp->b_un.b_addr);
7050Sstevel@tonic-gate 	buf = bp->b_un.b_addr;
7060Sstevel@tonic-gate 
7070Sstevel@tonic-gate 	chunksz = sidp->sid_cowinfo->cow_map.cmap_chunksz;
7080Sstevel@tonic-gate 
7090Sstevel@tonic-gate 	/* reqptr is the current DEV_BSIZE offset into the device */
7100Sstevel@tonic-gate 	/* chunk is the chunk containing reqptr */
7110Sstevel@tonic-gate 	/* len is the length of the request (in the current chunk) in bytes */
7120Sstevel@tonic-gate 	/* off is the byte offset into the current chunk */
7130Sstevel@tonic-gate 	reqptr = bp->b_lblkno;
7140Sstevel@tonic-gate 	while (bp->b_resid > 0) {
7150Sstevel@tonic-gate 		chunk = dbtocowchunk(&sidp->sid_cowinfo->cow_map, reqptr);
7160Sstevel@tonic-gate 		off = (reqptr % (chunksz >> DEV_BSHIFT)) << DEV_BSHIFT;
7170Sstevel@tonic-gate 		len = min(chunksz - off, bp->b_resid);
7180Sstevel@tonic-gate 		ASSERT((off + len) <= chunksz);
7190Sstevel@tonic-gate 
7200Sstevel@tonic-gate 		if ((error = snap_getchunk(sidp, chunk, off, len, buf)) != 0) {
7210Sstevel@tonic-gate 			/*
7220Sstevel@tonic-gate 			 * EINVAL means the user tried to go out of range.
7230Sstevel@tonic-gate 			 * Anything else means it's likely that we're
7240Sstevel@tonic-gate 			 * confused.
7250Sstevel@tonic-gate 			 */
7260Sstevel@tonic-gate 			if (error != EINVAL) {
7270Sstevel@tonic-gate 				cmn_err(CE_WARN, "snap_strategy: error "
7280Sstevel@tonic-gate 				    "calling snap_getchunk, chunk = %llu, "
7290Sstevel@tonic-gate 				    "offset = %d, len = %d, resid = %lu, "
7300Sstevel@tonic-gate 				    "error = %d.",
7310Sstevel@tonic-gate 				    chunk, off, len, bp->b_resid, error);
7320Sstevel@tonic-gate 			}
7330Sstevel@tonic-gate 			bioerror(bp, error);
7340Sstevel@tonic-gate 			biodone(bp);
7350Sstevel@tonic-gate 			rw_exit(&sidp->sid_rwlock);
7360Sstevel@tonic-gate 			return (0);
7370Sstevel@tonic-gate 		}
7380Sstevel@tonic-gate 		bp->b_resid -= len;
7390Sstevel@tonic-gate 		reqptr += (len >> DEV_BSHIFT);
7400Sstevel@tonic-gate 		buf += len;
7410Sstevel@tonic-gate 	}
7420Sstevel@tonic-gate 
7430Sstevel@tonic-gate 	ASSERT(bp->b_resid == 0);
7440Sstevel@tonic-gate 	biodone(bp);
7450Sstevel@tonic-gate 
7460Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
7470Sstevel@tonic-gate 	return (0);
7480Sstevel@tonic-gate }
7490Sstevel@tonic-gate 
7500Sstevel@tonic-gate /*
7510Sstevel@tonic-gate  * snap_getchunk() - helper function for snap_strategy()
7520Sstevel@tonic-gate  *
7530Sstevel@tonic-gate  *    gets the requested data from the appropriate place and fills in the
7540Sstevel@tonic-gate  *    buffer.  chunk is the chunk number of the request, offset is the
7550Sstevel@tonic-gate  *    offset into that chunk and must be less than the chunk size.  len is
7560Sstevel@tonic-gate  *    the length of the request starting at offset, and must not exceed a
7570Sstevel@tonic-gate  *    chunk boundary.  buffer is the address to copy the data to.  len
7580Sstevel@tonic-gate  *    bytes are copied into the buffer starting at the location specified.
7590Sstevel@tonic-gate  *
7600Sstevel@tonic-gate  *    A chunk is located according to the following algorithm:
7610Sstevel@tonic-gate  *        - If the chunk does not have a translation or is not a candidate
7620Sstevel@tonic-gate  *          for translation, it is read straight from the master device.
7630Sstevel@tonic-gate  *        - If the chunk does have a translation, then it is either on
7640Sstevel@tonic-gate  *          disk or in memory:
7650Sstevel@tonic-gate  *            o If it is in memory the requested data is simply copied out
7660Sstevel@tonic-gate  *              of the in-memory buffer.
7670Sstevel@tonic-gate  *            o If it is in the backing store, it is read from there.
7680Sstevel@tonic-gate  *
7690Sstevel@tonic-gate  *    This function does the real work of the snapshot driver.
7700Sstevel@tonic-gate  */
7710Sstevel@tonic-gate static int
snap_getchunk(struct snapshot_id * sidp,chunknumber_t chunk,int offset,int len,char * buffer)7720Sstevel@tonic-gate snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk, int offset,
7730Sstevel@tonic-gate     int len, char *buffer)
7740Sstevel@tonic-gate {
7750Sstevel@tonic-gate 	cow_map_t	*cmap = &sidp->sid_cowinfo->cow_map;
7760Sstevel@tonic-gate 	cow_map_node_t	*cmn;
7770Sstevel@tonic-gate 	struct buf	*snapbuf;
7780Sstevel@tonic-gate 	int		error = 0;
7790Sstevel@tonic-gate 	char		*newbuffer;
7800Sstevel@tonic-gate 	int		newlen = 0;
7810Sstevel@tonic-gate 	int		partial = 0;
7820Sstevel@tonic-gate 
7830Sstevel@tonic-gate 	ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
7840Sstevel@tonic-gate 	ASSERT(offset + len <= cmap->cmap_chunksz);
7850Sstevel@tonic-gate 
7860Sstevel@tonic-gate 	/*
7870Sstevel@tonic-gate 	 * Check if the chunk number is out of range and if so bail out
7880Sstevel@tonic-gate 	 */
7890Sstevel@tonic-gate 	if (chunk >= (cmap->cmap_bmsize * NBBY)) {
7900Sstevel@tonic-gate 		return (EINVAL);
7910Sstevel@tonic-gate 	}
7920Sstevel@tonic-gate 
7930Sstevel@tonic-gate 	/*
7940Sstevel@tonic-gate 	 * If the chunk is not a candidate for translation, then the chunk
7950Sstevel@tonic-gate 	 * was not allocated when the snapshot was taken.  Since it does
7960Sstevel@tonic-gate 	 * not contain data associated with this snapshot, just return a
7970Sstevel@tonic-gate 	 * zero buffer instead.
7980Sstevel@tonic-gate 	 */
7990Sstevel@tonic-gate 	if (isclr(cmap->cmap_candidate, chunk)) {
8000Sstevel@tonic-gate 		bzero(buffer, len);
8010Sstevel@tonic-gate 		return (0);
8020Sstevel@tonic-gate 	}
8030Sstevel@tonic-gate 
8040Sstevel@tonic-gate 	/*
8050Sstevel@tonic-gate 	 * if the chunk is a candidate for translation but a
8060Sstevel@tonic-gate 	 * translation does not exist, then read through to the
8070Sstevel@tonic-gate 	 * original file system.  The rwlock is held until the read
8080Sstevel@tonic-gate 	 * completes if it hasn't been translated to make sure the
8090Sstevel@tonic-gate 	 * file system does not translate the block before we
8100Sstevel@tonic-gate 	 * access it. If it has already been translated we don't
8110Sstevel@tonic-gate 	 * need the lock, because the translation will never go away.
8120Sstevel@tonic-gate 	 */
8130Sstevel@tonic-gate 	rw_enter(&cmap->cmap_rwlock, RW_READER);
8140Sstevel@tonic-gate 	if (isclr(cmap->cmap_hastrans, chunk)) {
8150Sstevel@tonic-gate 		snapbuf = getrbuf(KM_SLEEP);
8160Sstevel@tonic-gate 		/*
8170Sstevel@tonic-gate 		 * Reading into the buffer saves having to do a copy,
8180Sstevel@tonic-gate 		 * but gets tricky if the request size is not a
8190Sstevel@tonic-gate 		 * multiple of DEV_BSIZE.  However, we are filling the
8200Sstevel@tonic-gate 		 * buffer left to right, so future reads will write
8210Sstevel@tonic-gate 		 * over any extra data we might have read.
8220Sstevel@tonic-gate 		 */
8230Sstevel@tonic-gate 
8240Sstevel@tonic-gate 		partial = len % DEV_BSIZE;
8250Sstevel@tonic-gate 
8260Sstevel@tonic-gate 		snapbuf->b_bcount = len;
8270Sstevel@tonic-gate 		snapbuf->b_lblkno = lbtodb(chunk * cmap->cmap_chunksz + offset);
8280Sstevel@tonic-gate 		snapbuf->b_un.b_addr = buffer;
8290Sstevel@tonic-gate 
8300Sstevel@tonic-gate 		snapbuf->b_iodone = NULL;
8310Sstevel@tonic-gate 		snapbuf->b_proc = NULL;		/* i.e. the kernel */
8320Sstevel@tonic-gate 		snapbuf->b_flags = B_READ | B_BUSY;
8330Sstevel@tonic-gate 		snapbuf->b_edev = sidp->sid_fvp->v_vfsp->vfs_dev;
8340Sstevel@tonic-gate 
8350Sstevel@tonic-gate 		if (partial) {
8360Sstevel@tonic-gate 			/*
8370Sstevel@tonic-gate 			 * Partial block read in progress.
8380Sstevel@tonic-gate 			 * This is bad as modules further down the line
8390Sstevel@tonic-gate 			 * assume buf's are exact multiples of DEV_BSIZE
8400Sstevel@tonic-gate 			 * and we end up with fewer, or zero, bytes read.
8410Sstevel@tonic-gate 			 * To get round this we need to round up to the
8420Sstevel@tonic-gate 			 * nearest full block read and then return only
8430Sstevel@tonic-gate 			 * len bytes.
8440Sstevel@tonic-gate 			 */
8450Sstevel@tonic-gate 			newlen = (len - partial) + DEV_BSIZE;
8460Sstevel@tonic-gate 			newbuffer = kmem_alloc(newlen, KM_SLEEP);
8470Sstevel@tonic-gate 
8480Sstevel@tonic-gate 			snapbuf->b_bcount = newlen;
8490Sstevel@tonic-gate 			snapbuf->b_un.b_addr = newbuffer;
8500Sstevel@tonic-gate 		}
8510Sstevel@tonic-gate 
8520Sstevel@tonic-gate 		(void) bdev_strategy(snapbuf);
8530Sstevel@tonic-gate 		(void) biowait(snapbuf);
8540Sstevel@tonic-gate 
8550Sstevel@tonic-gate 		error = geterror(snapbuf);
8560Sstevel@tonic-gate 
8570Sstevel@tonic-gate 		if (partial) {
8580Sstevel@tonic-gate 			/*
8590Sstevel@tonic-gate 			 * Partial block read. Now we need to bcopy the
8600Sstevel@tonic-gate 			 * correct number of bytes back into the
8610Sstevel@tonic-gate 			 * supplied buffer, and tidy up our temp
8620Sstevel@tonic-gate 			 * buffer.
8630Sstevel@tonic-gate 			 */
8640Sstevel@tonic-gate 			bcopy(newbuffer, buffer, len);
8650Sstevel@tonic-gate 			kmem_free(newbuffer, newlen);
8660Sstevel@tonic-gate 		}
8670Sstevel@tonic-gate 
8680Sstevel@tonic-gate 		freerbuf(snapbuf);
8690Sstevel@tonic-gate 		rw_exit(&cmap->cmap_rwlock);
8700Sstevel@tonic-gate 
8710Sstevel@tonic-gate 		return (error);
8720Sstevel@tonic-gate 	}
8730Sstevel@tonic-gate 
8740Sstevel@tonic-gate 	/*
8750Sstevel@tonic-gate 	 * finally, if the chunk is a candidate for translation and it
8760Sstevel@tonic-gate 	 * has been translated, then we clone the chunk of the buffer
8770Sstevel@tonic-gate 	 * that was copied aside by the file system.
8780Sstevel@tonic-gate 	 * The cmap_rwlock does not need to be held after we know the
8790Sstevel@tonic-gate 	 * data has already been copied. Once a chunk has been copied
8800Sstevel@tonic-gate 	 * to the backing file, it is stable read only data.
8810Sstevel@tonic-gate 	 */
8820Sstevel@tonic-gate 	cmn = transtbl_get(cmap, chunk);
8830Sstevel@tonic-gate 
8840Sstevel@tonic-gate 	/* check whether the data is in memory or in the backing file */
8850Sstevel@tonic-gate 	if (cmn != NULL) {
8860Sstevel@tonic-gate 		ASSERT(cmn->cmn_buf);
8870Sstevel@tonic-gate 		/* already in memory */
8880Sstevel@tonic-gate 		bcopy(cmn->cmn_buf + offset, buffer, len);
8890Sstevel@tonic-gate 		rw_exit(&cmap->cmap_rwlock);
8900Sstevel@tonic-gate 	} else {
8910Sstevel@tonic-gate 		ssize_t resid = len;
8920Sstevel@tonic-gate 		int	bf_index;
8930Sstevel@tonic-gate 		/*
8940Sstevel@tonic-gate 		 * can cause deadlock with writer if we don't drop the
8950Sstevel@tonic-gate 		 * cmap_rwlock before trying to get the backing store file
8960Sstevel@tonic-gate 		 * vnode rwlock.
8970Sstevel@tonic-gate 		 */
8980Sstevel@tonic-gate 		rw_exit(&cmap->cmap_rwlock);
8990Sstevel@tonic-gate 
9000Sstevel@tonic-gate 		bf_index = chunk / cmap->cmap_chunksperbf;
9010Sstevel@tonic-gate 
9020Sstevel@tonic-gate 		/* read buffer from backing file */
9030Sstevel@tonic-gate 		error = vn_rdwr(UIO_READ,
9040Sstevel@tonic-gate 		    (sidp->sid_cowinfo->cow_backfile_array)[bf_index],
9050Sstevel@tonic-gate 		    buffer, len, ((chunk % cmap->cmap_chunksperbf) *
9060Sstevel@tonic-gate 		    cmap->cmap_chunksz) + offset, UIO_SYSSPACE, 0,
9070Sstevel@tonic-gate 		    RLIM64_INFINITY, kcred, &resid);
9080Sstevel@tonic-gate 	}
9090Sstevel@tonic-gate 
9100Sstevel@tonic-gate 	return (error);
9110Sstevel@tonic-gate }
9120Sstevel@tonic-gate 
9130Sstevel@tonic-gate /*
9140Sstevel@tonic-gate  * snap_print() - snapshot driver print(9E) routine
9150Sstevel@tonic-gate  *
9160Sstevel@tonic-gate  *    prints the device identification string.
9170Sstevel@tonic-gate  */
9180Sstevel@tonic-gate static int
snap_print(dev_t dev,char * str)9190Sstevel@tonic-gate snap_print(dev_t dev, char *str)
9200Sstevel@tonic-gate {
9210Sstevel@tonic-gate 	struct snapshot_id **sidpp;
9220Sstevel@tonic-gate 	minor_t		minor;
9230Sstevel@tonic-gate 
9240Sstevel@tonic-gate 	minor = getminor(dev);
9250Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
9260Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL) {
9270Sstevel@tonic-gate 		cmn_err(CE_WARN,
9280Sstevel@tonic-gate 		    "snap_print: could not find state for snapshot %d.", minor);
9290Sstevel@tonic-gate 		return (ENXIO);
9300Sstevel@tonic-gate 	}
9310Sstevel@tonic-gate 
9320Sstevel@tonic-gate 	cmn_err(CE_NOTE, "snap_print: snapshot %d: %s",  minor, str);
9330Sstevel@tonic-gate 
9340Sstevel@tonic-gate 	return (0);
9350Sstevel@tonic-gate }
9360Sstevel@tonic-gate 
9370Sstevel@tonic-gate /*
9380Sstevel@tonic-gate  * snap_prop_op() - snapshot driver prop_op(9E) routine
9390Sstevel@tonic-gate  *
9400Sstevel@tonic-gate  *    get 32-bit and 64-bit values for size (character driver) and nblocks
9410Sstevel@tonic-gate  *    (block driver).
9420Sstevel@tonic-gate  */
9430Sstevel@tonic-gate static int
snap_prop_op(dev_t dev,dev_info_t * dip,ddi_prop_op_t prop_op,int flags,char * name,caddr_t valuep,int * lengthp)9440Sstevel@tonic-gate snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
9450Sstevel@tonic-gate     int flags, char *name, caddr_t valuep, int *lengthp)
9460Sstevel@tonic-gate {
9474582Scth 	int		minor;
9480Sstevel@tonic-gate 	struct snapshot_id **sidpp;
9490Sstevel@tonic-gate 	dev_t		mdev;
9504582Scth 	dev_info_t	*mdip;
9514582Scth 	int		error;
9520Sstevel@tonic-gate 
9530Sstevel@tonic-gate 	minor = getminor(dev);
9540Sstevel@tonic-gate 
955*10542SFrank.Batschulat@Sun.COM 	/*
956*10542SFrank.Batschulat@Sun.COM 	 * If this is the control device just check for .conf properties,
957*10542SFrank.Batschulat@Sun.COM 	 * if the wildcard DDI_DEV_T_ANY was passed in via the dev_t
958*10542SFrank.Batschulat@Sun.COM 	 * just fall back to the defaults.
959*10542SFrank.Batschulat@Sun.COM 	 */
960*10542SFrank.Batschulat@Sun.COM 	if ((minor == SNAP_CTL_MINOR) || (dev == DDI_DEV_T_ANY))
9610Sstevel@tonic-gate 		return (ddi_prop_op(dev, dip, prop_op, flags, name,
9624582Scth 		    valuep, lengthp));
9634582Scth 
9640Sstevel@tonic-gate 	/* check to see if there is a master device plumbed */
9650Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
9660Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL) {
9670Sstevel@tonic-gate 		cmn_err(CE_WARN,
9680Sstevel@tonic-gate 		    "snap_prop_op: could not find state for "
9690Sstevel@tonic-gate 		    "snapshot %d.", minor);
9700Sstevel@tonic-gate 		return (DDI_PROP_NOT_FOUND);
9710Sstevel@tonic-gate 	}
9720Sstevel@tonic-gate 
9730Sstevel@tonic-gate 	if (((*sidpp)->sid_fvp == NULL) || ((*sidpp)->sid_fvp->v_vfsp == NULL))
9740Sstevel@tonic-gate 		return (ddi_prop_op(dev, dip, prop_op, flags, name,
9754582Scth 		    valuep, lengthp));
9764582Scth 
9774582Scth 	/* hold master device and pass operation down */
9780Sstevel@tonic-gate 	mdev = (*sidpp)->sid_fvp->v_vfsp->vfs_dev;
9794582Scth 	if (mdip = e_ddi_hold_devi_by_dev(mdev, 0)) {
9800Sstevel@tonic-gate 
9814582Scth 		/* get size information from the master device. */
9824582Scth 		error = cdev_prop_op(mdev, mdip,
9834582Scth 		    prop_op, flags, name, valuep, lengthp);
9844582Scth 		ddi_release_devi(mdip);
9854582Scth 		if (error == DDI_PROP_SUCCESS)
9864582Scth 			return (error);
9870Sstevel@tonic-gate 	}
9880Sstevel@tonic-gate 
9894582Scth 	/* master device did not service the request, try framework */
9904582Scth 	return (ddi_prop_op(dev, dip, prop_op, flags, name, valuep, lengthp));
9910Sstevel@tonic-gate 
9920Sstevel@tonic-gate }
9930Sstevel@tonic-gate 
9940Sstevel@tonic-gate /*
9950Sstevel@tonic-gate  * snap_ioctl() - snapshot driver ioctl(9E) routine
9960Sstevel@tonic-gate  *
9970Sstevel@tonic-gate  *    only applies to the control device.  The control device accepts two
9980Sstevel@tonic-gate  *    ioctl requests: create a snapshot or delete a snapshot.  In either
9990Sstevel@tonic-gate  *    case, the vnode for the requested file system is extracted, and the
10000Sstevel@tonic-gate  *    request is passed on to the file system via the same ioctl.  The file
10010Sstevel@tonic-gate  *    system is responsible for doing the things necessary for creating or
10020Sstevel@tonic-gate  *    destroying a snapshot, including any file system specific operations
10030Sstevel@tonic-gate  *    that must be performed as well as setting up and deleting the snapshot
10040Sstevel@tonic-gate  *    state through the fssnap interfaces.
10050Sstevel@tonic-gate  */
10060Sstevel@tonic-gate static int
snap_ioctl(dev_t dev,int cmd,intptr_t arg,int mode,cred_t * credp,int * rvalp)10070Sstevel@tonic-gate snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
10080Sstevel@tonic-gate int *rvalp)
10090Sstevel@tonic-gate {
10100Sstevel@tonic-gate 	minor_t	minor;
10110Sstevel@tonic-gate 	int error = 0;
10120Sstevel@tonic-gate 
10130Sstevel@tonic-gate 	minor = getminor(dev);
10140Sstevel@tonic-gate 
10150Sstevel@tonic-gate 	if (minor != SNAP_CTL_MINOR) {
10160Sstevel@tonic-gate 		return (EINVAL);
10170Sstevel@tonic-gate 	}
10180Sstevel@tonic-gate 
10190Sstevel@tonic-gate 	switch (cmd) {
10200Sstevel@tonic-gate 	case _FIOSNAPSHOTCREATE:
10210Sstevel@tonic-gate 	{
10220Sstevel@tonic-gate 		struct fiosnapcreate	fc;
10230Sstevel@tonic-gate 		struct file		*fp;
10240Sstevel@tonic-gate 		struct vnode		*vp;
10250Sstevel@tonic-gate 
10260Sstevel@tonic-gate 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
10270Sstevel@tonic-gate 			return (EFAULT);
10280Sstevel@tonic-gate 
10290Sstevel@tonic-gate 		/* get vnode for file system mount point */
10300Sstevel@tonic-gate 		if ((fp = getf(fc.rootfiledesc)) == NULL)
10310Sstevel@tonic-gate 			return (EBADF);
10320Sstevel@tonic-gate 
10330Sstevel@tonic-gate 		ASSERT(fp->f_vnode);
10340Sstevel@tonic-gate 		vp = fp->f_vnode;
10350Sstevel@tonic-gate 		VN_HOLD(vp);
10360Sstevel@tonic-gate 		releasef(fc.rootfiledesc);
10370Sstevel@tonic-gate 
10380Sstevel@tonic-gate 		/* pass ioctl request to file system */
10395331Samw 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
10400Sstevel@tonic-gate 		VN_RELE(vp);
10410Sstevel@tonic-gate 		break;
10420Sstevel@tonic-gate 	}
10430Sstevel@tonic-gate 	case _FIOSNAPSHOTCREATE_MULTI:
10440Sstevel@tonic-gate 	{
10450Sstevel@tonic-gate 		struct fiosnapcreate_multi	fc;
10460Sstevel@tonic-gate 		struct file		*fp;
10470Sstevel@tonic-gate 		struct vnode		*vp;
10480Sstevel@tonic-gate 
10490Sstevel@tonic-gate 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
10500Sstevel@tonic-gate 			return (EFAULT);
10510Sstevel@tonic-gate 
10520Sstevel@tonic-gate 		/* get vnode for file system mount point */
10530Sstevel@tonic-gate 		if ((fp = getf(fc.rootfiledesc)) == NULL)
10540Sstevel@tonic-gate 			return (EBADF);
10550Sstevel@tonic-gate 
10560Sstevel@tonic-gate 		ASSERT(fp->f_vnode);
10570Sstevel@tonic-gate 		vp = fp->f_vnode;
10580Sstevel@tonic-gate 		VN_HOLD(vp);
10590Sstevel@tonic-gate 		releasef(fc.rootfiledesc);
10600Sstevel@tonic-gate 
10610Sstevel@tonic-gate 		/* pass ioctl request to file system */
10625331Samw 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
10630Sstevel@tonic-gate 		VN_RELE(vp);
10640Sstevel@tonic-gate 		break;
10650Sstevel@tonic-gate 	}
10660Sstevel@tonic-gate 	case _FIOSNAPSHOTDELETE:
10670Sstevel@tonic-gate 	{
10680Sstevel@tonic-gate 		major_t			major;
10690Sstevel@tonic-gate 		struct fiosnapdelete	fc;
10700Sstevel@tonic-gate 		snapshot_id_t		*sidp = NULL;
10710Sstevel@tonic-gate 		snapshot_id_t		*sidnextp = NULL;
10720Sstevel@tonic-gate 		struct file		*fp = NULL;
10730Sstevel@tonic-gate 		struct vnode		*vp = NULL;
10740Sstevel@tonic-gate 		struct vfs 		*vfsp = NULL;
10750Sstevel@tonic-gate 		vfsops_t		*vfsops = EIO_vfsops;
10760Sstevel@tonic-gate 
10770Sstevel@tonic-gate 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
10780Sstevel@tonic-gate 			return (EFAULT);
10790Sstevel@tonic-gate 
10800Sstevel@tonic-gate 		/* get vnode for file system mount point */
10810Sstevel@tonic-gate 		if ((fp = getf(fc.rootfiledesc)) == NULL)
10820Sstevel@tonic-gate 			return (EBADF);
10830Sstevel@tonic-gate 
10840Sstevel@tonic-gate 		ASSERT(fp->f_vnode);
10850Sstevel@tonic-gate 		vp = fp->f_vnode;
10860Sstevel@tonic-gate 		VN_HOLD(vp);
10870Sstevel@tonic-gate 		releasef(fc.rootfiledesc);
10880Sstevel@tonic-gate 		/*
10890Sstevel@tonic-gate 		 * Test for two formats of delete and set correct minor/vp:
10900Sstevel@tonic-gate 		 * pseudo device:
10910Sstevel@tonic-gate 		 * fssnap -d [/dev/fssnap/x]
10920Sstevel@tonic-gate 		 * or
10930Sstevel@tonic-gate 		 * mount point:
10940Sstevel@tonic-gate 		 * fssnap -d [/mntpt]
10950Sstevel@tonic-gate 		 * Note that minor is verified to be equal to SNAP_CTL_MINOR
10960Sstevel@tonic-gate 		 * at this point which is an invalid minor number.
10970Sstevel@tonic-gate 		 */
10980Sstevel@tonic-gate 		ASSERT(fssnap_dip != NULL);
10990Sstevel@tonic-gate 		major = ddi_driver_major(fssnap_dip);
11000Sstevel@tonic-gate 		mutex_enter(&snapshot_mutex);
11010Sstevel@tonic-gate 		for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
11020Sstevel@tonic-gate 			rw_enter(&sidp->sid_rwlock, RW_READER);
11030Sstevel@tonic-gate 			sidnextp = sidp->sid_next;
11040Sstevel@tonic-gate 			/* pseudo device: */
11050Sstevel@tonic-gate 			if (major == getmajor(vp->v_rdev)) {
11060Sstevel@tonic-gate 				minor = getminor(vp->v_rdev);
11070Sstevel@tonic-gate 				if (sidp->sid_snapnumber == (uint_t)minor &&
11080Sstevel@tonic-gate 				    sidp->sid_fvp) {
11090Sstevel@tonic-gate 					VN_RELE(vp);
11100Sstevel@tonic-gate 					vp = sidp->sid_fvp;
11110Sstevel@tonic-gate 					VN_HOLD(vp);
11120Sstevel@tonic-gate 					rw_exit(&sidp->sid_rwlock);
11130Sstevel@tonic-gate 					break;
11140Sstevel@tonic-gate 				}
11150Sstevel@tonic-gate 			/* Mount point: */
11160Sstevel@tonic-gate 			} else {
11170Sstevel@tonic-gate 				if (sidp->sid_fvp == vp) {
11180Sstevel@tonic-gate 					minor = sidp->sid_snapnumber;
11190Sstevel@tonic-gate 					rw_exit(&sidp->sid_rwlock);
11200Sstevel@tonic-gate 					break;
11210Sstevel@tonic-gate 				}
11220Sstevel@tonic-gate 			}
11230Sstevel@tonic-gate 			rw_exit(&sidp->sid_rwlock);
11240Sstevel@tonic-gate 		}
11250Sstevel@tonic-gate 		mutex_exit(&snapshot_mutex);
11260Sstevel@tonic-gate 		/* Verify minor got set correctly above */
11270Sstevel@tonic-gate 		if (minor == SNAP_CTL_MINOR) {
11280Sstevel@tonic-gate 			VN_RELE(vp);
11290Sstevel@tonic-gate 			return (EINVAL);
11300Sstevel@tonic-gate 		}
11310Sstevel@tonic-gate 		dev = makedevice(major, minor);
11320Sstevel@tonic-gate 		/*
11330Sstevel@tonic-gate 		 * Create dummy vfs entry
11340Sstevel@tonic-gate 		 * to use as a locking semaphore across the IOCTL
11350Sstevel@tonic-gate 		 * for mount in progress cases...
11360Sstevel@tonic-gate 		 */
11375331Samw 		vfsp = vfs_alloc(KM_SLEEP);
11380Sstevel@tonic-gate 		VFS_INIT(vfsp, vfsops, NULL);
11391925Srsb 		VFS_HOLD(vfsp);
11400Sstevel@tonic-gate 		vfs_addmip(dev, vfsp);
11410Sstevel@tonic-gate 		if ((vfs_devmounting(dev, vfsp)) ||
11420Sstevel@tonic-gate 		    (vfs_devismounted(dev))) {
11430Sstevel@tonic-gate 			vfs_delmip(vfsp);
11441925Srsb 			VFS_RELE(vfsp);
11450Sstevel@tonic-gate 			VN_RELE(vp);
11460Sstevel@tonic-gate 			return (EBUSY);
11470Sstevel@tonic-gate 		}
11480Sstevel@tonic-gate 		/*
11490Sstevel@tonic-gate 		 * Nobody mounted but do not release mount in progress lock
11500Sstevel@tonic-gate 		 * until IOCTL complete to prohibit a mount sneaking
11510Sstevel@tonic-gate 		 * in
11520Sstevel@tonic-gate 		 */
11535331Samw 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
11540Sstevel@tonic-gate 		vfs_delmip(vfsp);
11551925Srsb 		VFS_RELE(vfsp);
11560Sstevel@tonic-gate 		VN_RELE(vp);
11570Sstevel@tonic-gate 		break;
11580Sstevel@tonic-gate 	}
11590Sstevel@tonic-gate 	default:
11600Sstevel@tonic-gate 		cmn_err(CE_WARN, "snap_ioctl: Invalid ioctl cmd %d, minor %d.",
11610Sstevel@tonic-gate 		    cmd, minor);
11620Sstevel@tonic-gate 		return (EINVAL);
11630Sstevel@tonic-gate 	}
11640Sstevel@tonic-gate 
11650Sstevel@tonic-gate 	return (error);
11660Sstevel@tonic-gate }
11670Sstevel@tonic-gate 
11680Sstevel@tonic-gate 
11690Sstevel@tonic-gate /* ************************************************************************ */
11700Sstevel@tonic-gate 
11710Sstevel@tonic-gate /*
11720Sstevel@tonic-gate  * Translation Table Routines
11730Sstevel@tonic-gate  *
11740Sstevel@tonic-gate  *    These support routines implement a simple doubly linked list
11750Sstevel@tonic-gate  *    to keep track of chunks that are currently in memory.  The maximum
11760Sstevel@tonic-gate  *    size of the list is determined by the fssnap_max_mem_chunks variable.
11770Sstevel@tonic-gate  *    The cmap_rwlock is used to protect the linkage of the list.
11780Sstevel@tonic-gate  */
11790Sstevel@tonic-gate 
11800Sstevel@tonic-gate /*
11810Sstevel@tonic-gate  * transtbl_add() - add a node to the translation table
11820Sstevel@tonic-gate  *
11830Sstevel@tonic-gate  *    allocates a new node and points it at the buffer passed in.  The node
11840Sstevel@tonic-gate  *    is added to the beginning of the doubly linked list and the head of
11850Sstevel@tonic-gate  *    the list is moved.  The cmap_rwlock must be held as a writer through
11860Sstevel@tonic-gate  *    this operation.
11870Sstevel@tonic-gate  */
11880Sstevel@tonic-gate static cow_map_node_t *
transtbl_add(cow_map_t * cmap,chunknumber_t chunk,caddr_t buf)11890Sstevel@tonic-gate transtbl_add(cow_map_t *cmap, chunknumber_t chunk, caddr_t buf)
11900Sstevel@tonic-gate {
11910Sstevel@tonic-gate 	cow_map_node_t	*cmnode;
11920Sstevel@tonic-gate 
11930Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
11940Sstevel@tonic-gate 
11950Sstevel@tonic-gate 	cmnode = kmem_alloc(sizeof (cow_map_node_t), KM_SLEEP);
11960Sstevel@tonic-gate 
11970Sstevel@tonic-gate 	/*
11980Sstevel@tonic-gate 	 * insert new translations at the beginning so cmn_table is always
11990Sstevel@tonic-gate 	 * the first node.
12000Sstevel@tonic-gate 	 */
12010Sstevel@tonic-gate 	cmnode->cmn_chunk = chunk;
12020Sstevel@tonic-gate 	cmnode->cmn_buf = buf;
12030Sstevel@tonic-gate 	cmnode->cmn_prev = NULL;
12040Sstevel@tonic-gate 	cmnode->cmn_next = cmap->cmap_table;
12050Sstevel@tonic-gate 	if (cmnode->cmn_next)
12060Sstevel@tonic-gate 		cmnode->cmn_next->cmn_prev = cmnode;
12070Sstevel@tonic-gate 	cmap->cmap_table = cmnode;
12080Sstevel@tonic-gate 
12090Sstevel@tonic-gate 	return (cmnode);
12100Sstevel@tonic-gate }
12110Sstevel@tonic-gate 
12120Sstevel@tonic-gate /*
12130Sstevel@tonic-gate  * transtbl_get() - look up a node in the translation table
12140Sstevel@tonic-gate  *
12150Sstevel@tonic-gate  *    called by the snapshot driver to find data that has been translated.
12160Sstevel@tonic-gate  *    The lookup is done by the chunk number, and the node is returned.
12170Sstevel@tonic-gate  *    If the node was not found, NULL is returned.
12180Sstevel@tonic-gate  */
12190Sstevel@tonic-gate static cow_map_node_t *
transtbl_get(cow_map_t * cmap,chunknumber_t chunk)12200Sstevel@tonic-gate transtbl_get(cow_map_t *cmap, chunknumber_t chunk)
12210Sstevel@tonic-gate {
12220Sstevel@tonic-gate 	cow_map_node_t *cmn;
12230Sstevel@tonic-gate 
12240Sstevel@tonic-gate 	ASSERT(RW_READ_HELD(&cmap->cmap_rwlock));
12250Sstevel@tonic-gate 	ASSERT(cmap);
12260Sstevel@tonic-gate 
12270Sstevel@tonic-gate 	/* search the translation table */
12280Sstevel@tonic-gate 	for (cmn = cmap->cmap_table; cmn != NULL; cmn = cmn->cmn_next) {
12290Sstevel@tonic-gate 		if (cmn->cmn_chunk == chunk)
12300Sstevel@tonic-gate 			return (cmn);
12310Sstevel@tonic-gate 	}
12320Sstevel@tonic-gate 
12330Sstevel@tonic-gate 	/* not found */
12340Sstevel@tonic-gate 	return (NULL);
12350Sstevel@tonic-gate }
12360Sstevel@tonic-gate 
12370Sstevel@tonic-gate /*
12380Sstevel@tonic-gate  * transtbl_delete() - delete a node from the translation table
12390Sstevel@tonic-gate  *
12400Sstevel@tonic-gate  *    called when a node's data has been written out to disk.  The
12410Sstevel@tonic-gate  *    cmap_rwlock must be held as a writer for this operation.  If the node
12420Sstevel@tonic-gate  *    being deleted is the head of the list, then the head is moved to the
12430Sstevel@tonic-gate  *    next node.  Both the node's data and the node itself are freed.
12440Sstevel@tonic-gate  */
12450Sstevel@tonic-gate static void
transtbl_delete(cow_map_t * cmap,cow_map_node_t * cmn)12460Sstevel@tonic-gate transtbl_delete(cow_map_t *cmap, cow_map_node_t *cmn)
12470Sstevel@tonic-gate {
12480Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
12490Sstevel@tonic-gate 	ASSERT(cmn);
12500Sstevel@tonic-gate 	ASSERT(cmap->cmap_table);
12510Sstevel@tonic-gate 
12520Sstevel@tonic-gate 	/* if the head of the list is being deleted, then move the head up */
12530Sstevel@tonic-gate 	if (cmap->cmap_table == cmn) {
12540Sstevel@tonic-gate 		ASSERT(cmn->cmn_prev == NULL);
12550Sstevel@tonic-gate 		cmap->cmap_table = cmn->cmn_next;
12560Sstevel@tonic-gate 	}
12570Sstevel@tonic-gate 
12580Sstevel@tonic-gate 
12590Sstevel@tonic-gate 	/* make previous node's next pointer skip over current node */
12600Sstevel@tonic-gate 	if (cmn->cmn_prev != NULL) {
12610Sstevel@tonic-gate 		ASSERT(cmn->cmn_prev->cmn_next == cmn);
12620Sstevel@tonic-gate 		cmn->cmn_prev->cmn_next = cmn->cmn_next;
12630Sstevel@tonic-gate 	}
12640Sstevel@tonic-gate 
12650Sstevel@tonic-gate 	/* make next node's previous pointer skip over current node */
12660Sstevel@tonic-gate 	if (cmn->cmn_next != NULL) {
12670Sstevel@tonic-gate 		ASSERT(cmn->cmn_next->cmn_prev == cmn);
12680Sstevel@tonic-gate 		cmn->cmn_next->cmn_prev = cmn->cmn_prev;
12690Sstevel@tonic-gate 	}
12700Sstevel@tonic-gate 
12710Sstevel@tonic-gate 	/* free the data and the node */
12720Sstevel@tonic-gate 	ASSERT(cmn->cmn_buf);
12730Sstevel@tonic-gate 	kmem_free(cmn->cmn_buf, cmap->cmap_chunksz);
12740Sstevel@tonic-gate 	kmem_free(cmn, sizeof (cow_map_node_t));
12750Sstevel@tonic-gate }
12760Sstevel@tonic-gate 
12770Sstevel@tonic-gate /*
12780Sstevel@tonic-gate  * transtbl_free() - free the entire translation table
12790Sstevel@tonic-gate  *
12800Sstevel@tonic-gate  *    called when the snapshot is deleted.  This frees all of the nodes in
12810Sstevel@tonic-gate  *    the translation table (but not the bitmaps).
12820Sstevel@tonic-gate  */
12830Sstevel@tonic-gate static void
transtbl_free(cow_map_t * cmap)12840Sstevel@tonic-gate transtbl_free(cow_map_t *cmap)
12850Sstevel@tonic-gate {
12860Sstevel@tonic-gate 	cow_map_node_t	*curnode;
12870Sstevel@tonic-gate 	cow_map_node_t	*tempnode;
12880Sstevel@tonic-gate 
12890Sstevel@tonic-gate 	for (curnode = cmap->cmap_table; curnode != NULL; curnode = tempnode) {
12900Sstevel@tonic-gate 		tempnode = curnode->cmn_next;
12910Sstevel@tonic-gate 
12920Sstevel@tonic-gate 		kmem_free(curnode->cmn_buf, cmap->cmap_chunksz);
12930Sstevel@tonic-gate 		kmem_free(curnode, sizeof (cow_map_node_t));
12940Sstevel@tonic-gate 	}
12950Sstevel@tonic-gate }
12960Sstevel@tonic-gate 
12970Sstevel@tonic-gate 
12980Sstevel@tonic-gate /* ************************************************************************ */
12990Sstevel@tonic-gate 
/*
 * Interface Implementation Routines
 *
 * The following functions implement snapshot interface routines that are
 * called by the file system to create, delete, and use a snapshot.  The
 * interfaces are defined in fssnap_if.c and are filled in by this driver
 * when it is loaded.  This technique allows the file system to depend on
 * the interface module without having to load the full implementation and
 * snapshot device drivers.
 */
13100Sstevel@tonic-gate 
13110Sstevel@tonic-gate /*
13120Sstevel@tonic-gate  * fssnap_strategy_impl() - strategy routine called by the file system
13130Sstevel@tonic-gate  *
13140Sstevel@tonic-gate  *    called by the file system to handle copy-on-write when necessary.  All
13150Sstevel@tonic-gate  *    reads and writes that the file system performs should go through this
13160Sstevel@tonic-gate  *    function.  If the file system calls the underlying device's strategy
13170Sstevel@tonic-gate  *    routine without going through fssnap_strategy() (eg. by calling
13180Sstevel@tonic-gate  *    bdev_strategy()), the snapshot may not be consistent.
13190Sstevel@tonic-gate  *
13200Sstevel@tonic-gate  *    This function starts by doing significant sanity checking to insure
13210Sstevel@tonic-gate  *    the snapshot was not deleted out from under it or deleted and then
13220Sstevel@tonic-gate  *    recreated.  To do this, it checks the actual pointer passed into it
13230Sstevel@tonic-gate  *    (ie. the handle held by the file system).  NOTE that the parameter is
13240Sstevel@tonic-gate  *    a POINTER TO A POINTER to the snapshot id.  Once the snapshot id is
13250Sstevel@tonic-gate  *    locked, it knows things are ok and that this snapshot is really for
13260Sstevel@tonic-gate  *    this file system.
13270Sstevel@tonic-gate  *
13280Sstevel@tonic-gate  *    If the request is a write, fssnap_translate() is called to determine
13290Sstevel@tonic-gate  *    whether a copy-on-write is required.  If it is a read, the read is
13300Sstevel@tonic-gate  *    simply passed on to the underlying device.
13310Sstevel@tonic-gate  */
13320Sstevel@tonic-gate static void
fssnap_strategy_impl(void * snapshot_id,buf_t * bp)13330Sstevel@tonic-gate fssnap_strategy_impl(void *snapshot_id, buf_t *bp)
13340Sstevel@tonic-gate {
13350Sstevel@tonic-gate 	struct snapshot_id **sidpp;
13360Sstevel@tonic-gate 	struct snapshot_id *sidp;
13370Sstevel@tonic-gate 	int error;
13380Sstevel@tonic-gate 
13390Sstevel@tonic-gate 	/* read requests are always passed through */
13400Sstevel@tonic-gate 	if (bp->b_flags & B_READ) {
13410Sstevel@tonic-gate 		(void) bdev_strategy(bp);
13420Sstevel@tonic-gate 		return;
13430Sstevel@tonic-gate 	}
13440Sstevel@tonic-gate 
13450Sstevel@tonic-gate 	/*
13460Sstevel@tonic-gate 	 * Because we were not able to take the snapshot read lock BEFORE
13470Sstevel@tonic-gate 	 * checking for a snapshot back in the file system, things may have
13480Sstevel@tonic-gate 	 * drastically changed out from under us.  For instance, the snapshot
13490Sstevel@tonic-gate 	 * may have been deleted, deleted and recreated, or worse yet, deleted
13500Sstevel@tonic-gate 	 * for this file system but now the snapshot number is in use by another
13510Sstevel@tonic-gate 	 * file system.
13520Sstevel@tonic-gate 	 *
13530Sstevel@tonic-gate 	 * Having a pointer to the file system's snapshot id pointer allows us
13540Sstevel@tonic-gate 	 * to sanity check most of this, though it assumes the file system is
13550Sstevel@tonic-gate 	 * keeping track of a pointer to the snapshot_id somewhere.
13560Sstevel@tonic-gate 	 */
13570Sstevel@tonic-gate 	sidpp = (struct snapshot_id **)snapshot_id;
13580Sstevel@tonic-gate 	sidp = *sidpp;
13590Sstevel@tonic-gate 
13600Sstevel@tonic-gate 	/*
13610Sstevel@tonic-gate 	 * if this file system's snapshot was disabled, just pass the
13620Sstevel@tonic-gate 	 * request through.
13630Sstevel@tonic-gate 	 */
13640Sstevel@tonic-gate 	if (sidp == NULL) {
13650Sstevel@tonic-gate 		(void) bdev_strategy(bp);
13660Sstevel@tonic-gate 		return;
13670Sstevel@tonic-gate 	}
13680Sstevel@tonic-gate 
13690Sstevel@tonic-gate 	/*
13700Sstevel@tonic-gate 	 * Once we have the reader lock the snapshot will not magically go
13710Sstevel@tonic-gate 	 * away.  But things may have changed on us before this so double check.
13720Sstevel@tonic-gate 	 */
13730Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_READER);
13740Sstevel@tonic-gate 
13750Sstevel@tonic-gate 	/*
13760Sstevel@tonic-gate 	 * if an error was founds somewhere the DELETE flag will be
13770Sstevel@tonic-gate 	 * set to indicate the snapshot should be deleted and no new
13780Sstevel@tonic-gate 	 * translations should occur.
13790Sstevel@tonic-gate 	 */
13800Sstevel@tonic-gate 	if (sidp->sid_flags & SID_DELETE) {
13810Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
13820Sstevel@tonic-gate 		(void) fssnap_delete_impl(sidpp);
13830Sstevel@tonic-gate 		(void) bdev_strategy(bp);
13840Sstevel@tonic-gate 		return;
13850Sstevel@tonic-gate 	}
13860Sstevel@tonic-gate 
13870Sstevel@tonic-gate 	/*
13880Sstevel@tonic-gate 	 * If the file system is no longer pointing to the snapshot we were
13890Sstevel@tonic-gate 	 * called with, then it should not attempt to translate this buffer as
13900Sstevel@tonic-gate 	 * it may be going to a snapshot for a different file system.
13910Sstevel@tonic-gate 	 * Even if the file system snapshot pointer is still the same, the
13920Sstevel@tonic-gate 	 * snapshot may have been disabled before we got the reader lock.
13930Sstevel@tonic-gate 	 */
13940Sstevel@tonic-gate 	if (sidp != *sidpp || SID_INACTIVE(sidp)) {
13950Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
13960Sstevel@tonic-gate 		(void) bdev_strategy(bp);
13970Sstevel@tonic-gate 		return;
13980Sstevel@tonic-gate 	}
13990Sstevel@tonic-gate 
14000Sstevel@tonic-gate 	/*
14010Sstevel@tonic-gate 	 * At this point we're sure the snapshot will not go away while the
14020Sstevel@tonic-gate 	 * reader lock is held, and we are reasonably certain that we are
14030Sstevel@tonic-gate 	 * writing to the correct snapshot.
14040Sstevel@tonic-gate 	 */
14050Sstevel@tonic-gate 	if ((error = fssnap_translate(sidpp, bp)) != 0) {
14060Sstevel@tonic-gate 		/*
14070Sstevel@tonic-gate 		 * fssnap_translate can release the reader lock if it
14080Sstevel@tonic-gate 		 * has to wait for a semaphore.  In this case it is possible
14090Sstevel@tonic-gate 		 * for the snapshot to be deleted in this time frame.  If this
14100Sstevel@tonic-gate 		 * happens just sent the buf thru to the filesystems device.
14110Sstevel@tonic-gate 		 */
14120Sstevel@tonic-gate 		if (sidp != *sidpp || SID_INACTIVE(sidp)) {
14130Sstevel@tonic-gate 			rw_exit(&sidp->sid_rwlock);
14140Sstevel@tonic-gate 			(void) bdev_strategy(bp);
14150Sstevel@tonic-gate 			return;
14160Sstevel@tonic-gate 		}
14170Sstevel@tonic-gate 		bioerror(bp, error);
14180Sstevel@tonic-gate 		biodone(bp);
14190Sstevel@tonic-gate 	}
14200Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
14210Sstevel@tonic-gate }
14220Sstevel@tonic-gate 
14230Sstevel@tonic-gate /*
14240Sstevel@tonic-gate  * fssnap_translate() - helper function for fssnap_strategy()
14250Sstevel@tonic-gate  *
14260Sstevel@tonic-gate  *    performs the actual copy-on-write for write requests, if required.
14270Sstevel@tonic-gate  *    This function does the real work of the file system side of things.
14280Sstevel@tonic-gate  *
14290Sstevel@tonic-gate  *    It first checks the candidate bitmap to quickly determine whether any
14300Sstevel@tonic-gate  *    action is necessary.  If the candidate bitmap indicates the chunk was
14310Sstevel@tonic-gate  *    allocated when the snapshot was created, then it checks to see whether
14320Sstevel@tonic-gate  *    a translation already exists.  If a translation already exists then no
14330Sstevel@tonic-gate  *    action is required.  If the chunk is a candidate for copy-on-write,
14340Sstevel@tonic-gate  *    and a translation does not already exist, then the chunk is read in
14350Sstevel@tonic-gate  *    and a node is added to the translation table.
14360Sstevel@tonic-gate  *
14370Sstevel@tonic-gate  *    Once all of the chunks in the request range have been copied (if they
14380Sstevel@tonic-gate  *    needed to be), then the original request can be satisfied and the old
14390Sstevel@tonic-gate  *    data can be overwritten.
14400Sstevel@tonic-gate  */
14410Sstevel@tonic-gate static int
fssnap_translate(struct snapshot_id ** sidpp,struct buf * wbp)14420Sstevel@tonic-gate fssnap_translate(struct snapshot_id **sidpp, struct buf *wbp)
14430Sstevel@tonic-gate {
14440Sstevel@tonic-gate 	snapshot_id_t	*sidp = *sidpp;
14450Sstevel@tonic-gate 	struct buf	*oldbp;	/* buffer to store old data in */
14460Sstevel@tonic-gate 	struct cow_info	*cowp = sidp->sid_cowinfo;
14470Sstevel@tonic-gate 	cow_map_t	*cmap = &cowp->cow_map;
14480Sstevel@tonic-gate 	cow_map_node_t	*cmn;
14490Sstevel@tonic-gate 	chunknumber_t	cowchunk, startchunk, endchunk;
14500Sstevel@tonic-gate 	int		error;
14510Sstevel@tonic-gate 	int	throttle_write = 0;
14520Sstevel@tonic-gate 
14530Sstevel@tonic-gate 	/* make sure the snapshot is active */
14540Sstevel@tonic-gate 	ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
14550Sstevel@tonic-gate 
14560Sstevel@tonic-gate 	startchunk = dbtocowchunk(cmap, wbp->b_lblkno);
14570Sstevel@tonic-gate 	endchunk   = dbtocowchunk(cmap, wbp->b_lblkno +
14580Sstevel@tonic-gate 	    ((wbp->b_bcount-1) >> DEV_BSHIFT));
14590Sstevel@tonic-gate 
14600Sstevel@tonic-gate 	/*
14610Sstevel@tonic-gate 	 * Do not throttle the writes of the fssnap taskq thread and
14620Sstevel@tonic-gate 	 * the log roll (trans_roll) thread. Furthermore the writes to
14630Sstevel@tonic-gate 	 * the on-disk log are also not subject to throttling.
14640Sstevel@tonic-gate 	 * The fssnap_write_taskq thread's write can block on the throttling
14650Sstevel@tonic-gate 	 * semaphore which leads to self-deadlock as this same thread
14660Sstevel@tonic-gate 	 * releases the throttling semaphore after completing the IO.
14670Sstevel@tonic-gate 	 * If the trans_roll thread's write is throttled then we can deadlock
14680Sstevel@tonic-gate 	 * because the fssnap_taskq_thread which releases the throttling
14690Sstevel@tonic-gate 	 * semaphore can block waiting for log space which can only be
14700Sstevel@tonic-gate 	 * released by the trans_roll thread.
14710Sstevel@tonic-gate 	 */
14720Sstevel@tonic-gate 
14730Sstevel@tonic-gate 	throttle_write = !(taskq_member(cowp->cow_taskq, curthread) ||
14744582Scth 	    tsd_get(bypass_snapshot_throttle_key));
14750Sstevel@tonic-gate 
14760Sstevel@tonic-gate 	/*
14770Sstevel@tonic-gate 	 * Iterate through all chunks covered by this write and perform the
14780Sstevel@tonic-gate 	 * copy-aside if necessary.  Once all chunks have been safely
14790Sstevel@tonic-gate 	 * stowed away, the new data may be written in a single sweep.
14800Sstevel@tonic-gate 	 *
14810Sstevel@tonic-gate 	 * For each chunk in the range, the following sequence is performed:
14820Sstevel@tonic-gate 	 *	- Is the chunk a candidate for translation?
14830Sstevel@tonic-gate 	 *		o If not, then no translation is necessary, continue
14840Sstevel@tonic-gate 	 *	- If it is a candidate, then does it already have a translation?
14850Sstevel@tonic-gate 	 *		o If so, then no translation is necessary, continue
14860Sstevel@tonic-gate 	 *	- If it is a candidate, but does not yet have a translation,
14870Sstevel@tonic-gate 	 *	  then read the old data and schedule an asynchronous taskq
14880Sstevel@tonic-gate 	 *	  to write the old data to the backing file.
14890Sstevel@tonic-gate 	 *
14900Sstevel@tonic-gate 	 * Once this has been performed over the entire range of chunks, then
14910Sstevel@tonic-gate 	 * it is safe to overwrite the data that is there.
14920Sstevel@tonic-gate 	 *
14930Sstevel@tonic-gate 	 * Note that no lock is required to check the candidate bitmap because
14940Sstevel@tonic-gate 	 * it never changes once the snapshot is created.  The reader lock is
14950Sstevel@tonic-gate 	 * taken to check the hastrans bitmap since it may change.  If it
14960Sstevel@tonic-gate 	 * turns out a copy is required, then the lock is upgraded to a
14970Sstevel@tonic-gate 	 * writer, and the bitmap is re-checked as it may have changed while
14980Sstevel@tonic-gate 	 * the lock was released.  Finally, the write lock is held while
14990Sstevel@tonic-gate 	 * reading the old data to make sure it is not translated out from
15000Sstevel@tonic-gate 	 * under us.
15010Sstevel@tonic-gate 	 *
15020Sstevel@tonic-gate 	 * This locking mechanism should be sufficient to handle multiple
15030Sstevel@tonic-gate 	 * threads writing to overlapping chunks simultaneously.
15040Sstevel@tonic-gate 	 */
15050Sstevel@tonic-gate 	for (cowchunk = startchunk; cowchunk <= endchunk; cowchunk++) {
15060Sstevel@tonic-gate 		/*
15070Sstevel@tonic-gate 		 * If the cowchunk is outside of the range of our
15080Sstevel@tonic-gate 		 * candidate maps, then simply break out of the
15090Sstevel@tonic-gate 		 * loop and pass the I/O through to bdev_strategy.
15100Sstevel@tonic-gate 		 * This would occur if the file system has grown
15110Sstevel@tonic-gate 		 * larger since the snapshot was taken.
15120Sstevel@tonic-gate 		 */
15130Sstevel@tonic-gate 		if (cowchunk >= (cmap->cmap_bmsize * NBBY))
15140Sstevel@tonic-gate 			break;
15150Sstevel@tonic-gate 
15160Sstevel@tonic-gate 		/*
15170Sstevel@tonic-gate 		 * If no disk blocks were allocated in this chunk when the
15180Sstevel@tonic-gate 		 * snapshot was created then no copy-on-write will be
15190Sstevel@tonic-gate 		 * required.  Since this bitmap is read-only no locks are
15200Sstevel@tonic-gate 		 * necessary.
15210Sstevel@tonic-gate 		 */
15220Sstevel@tonic-gate 		if (isclr(cmap->cmap_candidate, cowchunk)) {
15230Sstevel@tonic-gate 			continue;
15240Sstevel@tonic-gate 		}
15250Sstevel@tonic-gate 
15260Sstevel@tonic-gate 		/*
15270Sstevel@tonic-gate 		 * If a translation already exists, the data can be written
15280Sstevel@tonic-gate 		 * through since the old data has already been saved off.
15290Sstevel@tonic-gate 		 */
15300Sstevel@tonic-gate 		if (isset(cmap->cmap_hastrans, cowchunk)) {
15310Sstevel@tonic-gate 			continue;
15320Sstevel@tonic-gate 		}
15330Sstevel@tonic-gate 
15340Sstevel@tonic-gate 
15350Sstevel@tonic-gate 		/*
15360Sstevel@tonic-gate 		 * Throttle translations if there are too many outstanding
15370Sstevel@tonic-gate 		 * chunks in memory.  The semaphore is sema_v'd by the taskq.
15380Sstevel@tonic-gate 		 *
15390Sstevel@tonic-gate 		 * You can't keep the sid_rwlock if you would go to sleep.
15400Sstevel@tonic-gate 		 * This will result in deadlock when someone tries to delete
15410Sstevel@tonic-gate 		 * the snapshot (wants the sid_rwlock as a writer, but can't
15420Sstevel@tonic-gate 		 * get it).
15430Sstevel@tonic-gate 		 */
15440Sstevel@tonic-gate 		if (throttle_write) {
15450Sstevel@tonic-gate 			if (sema_tryp(&cmap->cmap_throttle_sem) == 0) {
15460Sstevel@tonic-gate 				rw_exit(&sidp->sid_rwlock);
15470Sstevel@tonic-gate 				atomic_add_32(&cmap->cmap_waiters, 1);
15480Sstevel@tonic-gate 				sema_p(&cmap->cmap_throttle_sem);
15490Sstevel@tonic-gate 				atomic_add_32(&cmap->cmap_waiters, -1);
15500Sstevel@tonic-gate 				rw_enter(&sidp->sid_rwlock, RW_READER);
15510Sstevel@tonic-gate 
15520Sstevel@tonic-gate 			/*
15530Sstevel@tonic-gate 			 * Now since we released the sid_rwlock the state may
15540Sstevel@tonic-gate 			 * have transitioned underneath us. so check that again.
15550Sstevel@tonic-gate 			 */
15560Sstevel@tonic-gate 				if (sidp != *sidpp || SID_INACTIVE(sidp)) {
15570Sstevel@tonic-gate 					sema_v(&cmap->cmap_throttle_sem);
15580Sstevel@tonic-gate 					return (ENXIO);
15590Sstevel@tonic-gate 				}
15600Sstevel@tonic-gate 			}
15610Sstevel@tonic-gate 		}
15620Sstevel@tonic-gate 
15630Sstevel@tonic-gate 		/*
15640Sstevel@tonic-gate 		 * Acquire the lock as a writer and check to see if a
15650Sstevel@tonic-gate 		 * translation has been added in the meantime.
15660Sstevel@tonic-gate 		 */
15670Sstevel@tonic-gate 		rw_enter(&cmap->cmap_rwlock, RW_WRITER);
15680Sstevel@tonic-gate 		if (isset(cmap->cmap_hastrans, cowchunk)) {
15690Sstevel@tonic-gate 			if (throttle_write)
15700Sstevel@tonic-gate 				sema_v(&cmap->cmap_throttle_sem);
15710Sstevel@tonic-gate 			rw_exit(&cmap->cmap_rwlock);
15720Sstevel@tonic-gate 			continue; /* go to the next chunk */
15730Sstevel@tonic-gate 		}
15740Sstevel@tonic-gate 
15750Sstevel@tonic-gate 		/*
15760Sstevel@tonic-gate 		 * read a full chunk of data from the requested offset rounded
15770Sstevel@tonic-gate 		 * down to the nearest chunk size.
15780Sstevel@tonic-gate 		 */
15790Sstevel@tonic-gate 		oldbp = getrbuf(KM_SLEEP);
15800Sstevel@tonic-gate 		oldbp->b_lblkno = cowchunktodb(cmap, cowchunk);
15810Sstevel@tonic-gate 		oldbp->b_edev = wbp->b_edev;
15820Sstevel@tonic-gate 		oldbp->b_bcount = cmap->cmap_chunksz;
15830Sstevel@tonic-gate 		oldbp->b_bufsize = cmap->cmap_chunksz;
15840Sstevel@tonic-gate 		oldbp->b_iodone = NULL;
15850Sstevel@tonic-gate 		oldbp->b_proc = NULL;
15860Sstevel@tonic-gate 		oldbp->b_flags = B_READ;
15870Sstevel@tonic-gate 		oldbp->b_un.b_addr = kmem_alloc(cmap->cmap_chunksz, KM_SLEEP);
15880Sstevel@tonic-gate 
15890Sstevel@tonic-gate 		(void) bdev_strategy(oldbp);
15900Sstevel@tonic-gate 		(void) biowait(oldbp);
15910Sstevel@tonic-gate 
15920Sstevel@tonic-gate 		/*
15930Sstevel@tonic-gate 		 * It's ok to bail in the middle of translating the range
15940Sstevel@tonic-gate 		 * because the extra copy-asides will not hurt anything
15950Sstevel@tonic-gate 		 * (except by using extra space in the backing store).
15960Sstevel@tonic-gate 		 */
15970Sstevel@tonic-gate 		if ((error = geterror(oldbp)) != 0) {
15980Sstevel@tonic-gate 			cmn_err(CE_WARN, "fssnap_translate: error reading "
15990Sstevel@tonic-gate 			    "old data for snapshot %d, chunk %llu, disk block "
16000Sstevel@tonic-gate 			    "%lld, size %lu, error %d.", sidp->sid_snapnumber,
16010Sstevel@tonic-gate 			    cowchunk, oldbp->b_lblkno, oldbp->b_bcount, error);
16020Sstevel@tonic-gate 			kmem_free(oldbp->b_un.b_addr, cmap->cmap_chunksz);
16030Sstevel@tonic-gate 			freerbuf(oldbp);
16040Sstevel@tonic-gate 			rw_exit(&cmap->cmap_rwlock);
16050Sstevel@tonic-gate 			if (throttle_write)
16060Sstevel@tonic-gate 				sema_v(&cmap->cmap_throttle_sem);
16070Sstevel@tonic-gate 			return (error);
16080Sstevel@tonic-gate 		}
16090Sstevel@tonic-gate 
16100Sstevel@tonic-gate 		/*
16110Sstevel@tonic-gate 		 * add the node to the translation table and save a reference
16120Sstevel@tonic-gate 		 * to pass to the taskq for writing out to the backing file
16130Sstevel@tonic-gate 		 */
16140Sstevel@tonic-gate 		cmn = transtbl_add(cmap, cowchunk, oldbp->b_un.b_addr);
16150Sstevel@tonic-gate 		freerbuf(oldbp);
16160Sstevel@tonic-gate 
16170Sstevel@tonic-gate 		/*
16180Sstevel@tonic-gate 		 * Add a reference to the snapshot id so the lower level
16190Sstevel@tonic-gate 		 * processing (ie. the taskq) can get back to the state
16200Sstevel@tonic-gate 		 * information.
16210Sstevel@tonic-gate 		 */
16220Sstevel@tonic-gate 		cmn->cmn_sid = sidp;
16230Sstevel@tonic-gate 		cmn->release_sem = throttle_write;
16240Sstevel@tonic-gate 		setbit(cmap->cmap_hastrans, cowchunk);
16250Sstevel@tonic-gate 
16260Sstevel@tonic-gate 		rw_exit(&cmap->cmap_rwlock);
16270Sstevel@tonic-gate 
16280Sstevel@tonic-gate 		/*
16290Sstevel@tonic-gate 		 * schedule the asynchronous write to the backing file
16300Sstevel@tonic-gate 		 */
16310Sstevel@tonic-gate 		if (cowp->cow_backfile_array != NULL)
16320Sstevel@tonic-gate 			(void) taskq_dispatch(cowp->cow_taskq,
16330Sstevel@tonic-gate 			    fssnap_write_taskq, cmn, TQ_SLEEP);
16340Sstevel@tonic-gate 	}
16350Sstevel@tonic-gate 
16360Sstevel@tonic-gate 	/*
16370Sstevel@tonic-gate 	 * Write new data in place of the old data.  At this point all of the
16380Sstevel@tonic-gate 	 * chunks touched by this write have been copied aside and so the new
16390Sstevel@tonic-gate 	 * data can be written out all at once.
16400Sstevel@tonic-gate 	 */
16410Sstevel@tonic-gate 	(void) bdev_strategy(wbp);
16420Sstevel@tonic-gate 
16430Sstevel@tonic-gate 	return (0);
16440Sstevel@tonic-gate }
16450Sstevel@tonic-gate 
16460Sstevel@tonic-gate /*
16470Sstevel@tonic-gate  * fssnap_write_taskq() - write in-memory translations to the backing file
16480Sstevel@tonic-gate  *
16490Sstevel@tonic-gate  *    writes in-memory translations to the backing file asynchronously.  A
16500Sstevel@tonic-gate  *    task is dispatched each time a new translation is created.  The task
16510Sstevel@tonic-gate  *    writes the data to the backing file and removes it from the memory
16520Sstevel@tonic-gate  *    list. The throttling semaphore is released only if the particular
16530Sstevel@tonic-gate  *    translation was throttled in fssnap_translate.
16540Sstevel@tonic-gate  */
16550Sstevel@tonic-gate static void
fssnap_write_taskq(void * arg)16560Sstevel@tonic-gate fssnap_write_taskq(void *arg)
16570Sstevel@tonic-gate {
16580Sstevel@tonic-gate 	cow_map_node_t	*cmn = (cow_map_node_t *)arg;
16590Sstevel@tonic-gate 	snapshot_id_t	*sidp = cmn->cmn_sid;
16600Sstevel@tonic-gate 	cow_info_t	*cowp = sidp->sid_cowinfo;
16610Sstevel@tonic-gate 	cow_map_t	*cmap = &cowp->cow_map;
16620Sstevel@tonic-gate 	int		error;
16630Sstevel@tonic-gate 	int		bf_index;
16640Sstevel@tonic-gate 	int		release_sem = cmn->release_sem;
16650Sstevel@tonic-gate 
16660Sstevel@tonic-gate 	/*
16670Sstevel@tonic-gate 	 * The sid_rwlock does not need to be held here because the taskqs
16680Sstevel@tonic-gate 	 * are destroyed explicitly by fssnap_delete (with the sid_rwlock
16690Sstevel@tonic-gate 	 * held as a writer).  taskq_destroy() will flush all of the tasks
16700Sstevel@tonic-gate 	 * out before fssnap_delete frees up all of the structures.
16710Sstevel@tonic-gate 	 */
16720Sstevel@tonic-gate 
16730Sstevel@tonic-gate 	/* if the snapshot was disabled from under us, drop the request. */
16740Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_READER);
16750Sstevel@tonic-gate 	if (SID_INACTIVE(sidp)) {
16760Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
16770Sstevel@tonic-gate 		if (release_sem)
16780Sstevel@tonic-gate 			sema_v(&cmap->cmap_throttle_sem);
16790Sstevel@tonic-gate 		return;
16800Sstevel@tonic-gate 	}
16810Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
16820Sstevel@tonic-gate 
16830Sstevel@tonic-gate 	atomic_add_64((uint64_t *)&cmap->cmap_nchunks, 1);
16840Sstevel@tonic-gate 
16850Sstevel@tonic-gate 	if ((cmap->cmap_maxsize != 0) &&
16860Sstevel@tonic-gate 	    ((cmap->cmap_nchunks * cmap->cmap_chunksz) > cmap->cmap_maxsize)) {
16870Sstevel@tonic-gate 		cmn_err(CE_WARN, "fssnap_write_taskq: snapshot %d (%s) has "
16880Sstevel@tonic-gate 		    "reached the maximum backing file size specified (%llu "
16890Sstevel@tonic-gate 		    "bytes) and will be deleted.", sidp->sid_snapnumber,
16900Sstevel@tonic-gate 		    (char *)cowp->cow_kstat_mntpt->ks_data,
16910Sstevel@tonic-gate 		    cmap->cmap_maxsize);
16920Sstevel@tonic-gate 		if (release_sem)
16930Sstevel@tonic-gate 			sema_v(&cmap->cmap_throttle_sem);
16940Sstevel@tonic-gate 		atomic_or_uint(&sidp->sid_flags, SID_DELETE);
16950Sstevel@tonic-gate 		return;
16960Sstevel@tonic-gate 	}
16970Sstevel@tonic-gate 
16980Sstevel@tonic-gate 	/* perform the write */
16990Sstevel@tonic-gate 	bf_index = cmn->cmn_chunk / cmap->cmap_chunksperbf;
17000Sstevel@tonic-gate 
17010Sstevel@tonic-gate 	if (error = vn_rdwr(UIO_WRITE, (cowp->cow_backfile_array)[bf_index],
17020Sstevel@tonic-gate 	    cmn->cmn_buf, cmap->cmap_chunksz,
17030Sstevel@tonic-gate 	    (cmn->cmn_chunk % cmap->cmap_chunksperbf) * cmap->cmap_chunksz,
17040Sstevel@tonic-gate 	    UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, (ssize_t *)NULL)) {
17050Sstevel@tonic-gate 		cmn_err(CE_WARN, "fssnap_write_taskq: error writing to "
17060Sstevel@tonic-gate 		    "backing file.  DELETING SNAPSHOT %d, backing file path "
17070Sstevel@tonic-gate 		    "%s, offset %llu bytes, error %d.", sidp->sid_snapnumber,
17080Sstevel@tonic-gate 		    (char *)cowp->cow_kstat_bfname->ks_data,
17090Sstevel@tonic-gate 		    cmn->cmn_chunk * cmap->cmap_chunksz, error);
17100Sstevel@tonic-gate 		if (release_sem)
17110Sstevel@tonic-gate 			sema_v(&cmap->cmap_throttle_sem);
17120Sstevel@tonic-gate 		atomic_or_uint(&sidp->sid_flags, SID_DELETE);
17130Sstevel@tonic-gate 		return;
17140Sstevel@tonic-gate 	}
17150Sstevel@tonic-gate 
17160Sstevel@tonic-gate 	/*
17170Sstevel@tonic-gate 	 * now remove the node and buffer from memory
17180Sstevel@tonic-gate 	 */
17190Sstevel@tonic-gate 	rw_enter(&cmap->cmap_rwlock, RW_WRITER);
17200Sstevel@tonic-gate 	transtbl_delete(cmap, cmn);
17210Sstevel@tonic-gate 	rw_exit(&cmap->cmap_rwlock);
17220Sstevel@tonic-gate 
17230Sstevel@tonic-gate 	/* Allow more translations */
17240Sstevel@tonic-gate 	if (release_sem)
17250Sstevel@tonic-gate 		sema_v(&cmap->cmap_throttle_sem);
17260Sstevel@tonic-gate 
17270Sstevel@tonic-gate }
17280Sstevel@tonic-gate 
/*
 * fssnap_create_impl() - called from the file system to create a new snapshot
 *
 *    allocates and initializes the structures needed for a new snapshot.
 *    This is called by the file system when it receives an ioctl request to
 *    create a new snapshot.  An unused snapshot identifier is either found
 *    or created, and eventually returned as the opaque handle the file
 *    system will use to identify this snapshot.  The snapshot number
 *    associated with the snapshot identifier is the same as the minor
 *    number for the snapshot device that is used to access that snapshot.
 *
 *    The snapshot can not be used until the candidate bitmap is populated
 *    by the file system (see fssnap_set_candidate_impl()), and the file
 *    system finishes the setup process by calling fssnap_create_done().
 *    Nearly all of the snapshot locks are held for the duration of the
 *    create, and are not released until fssnap_create_done() is called.
 */
17460Sstevel@tonic-gate static void *
fssnap_create_impl(chunknumber_t nchunks,uint_t chunksz,u_offset_t maxsize,struct vnode * fsvp,int backfilecount,struct vnode ** bfvpp,char * backpath,u_offset_t max_backfile_size)17470Sstevel@tonic-gate fssnap_create_impl(chunknumber_t nchunks, uint_t chunksz, u_offset_t maxsize,
17480Sstevel@tonic-gate     struct vnode *fsvp, int backfilecount, struct vnode **bfvpp, char *backpath,
17490Sstevel@tonic-gate     u_offset_t max_backfile_size)
17500Sstevel@tonic-gate {
17510Sstevel@tonic-gate 	refstr_t *mountpoint;
17520Sstevel@tonic-gate 	char taskqname[50];
17530Sstevel@tonic-gate 	struct cow_info *cowp;
17540Sstevel@tonic-gate 	struct cow_map	*cmap;
17550Sstevel@tonic-gate 	struct snapshot_id *sidp;
17560Sstevel@tonic-gate 	int lastsnap;
17570Sstevel@tonic-gate 
17580Sstevel@tonic-gate 	/*
17590Sstevel@tonic-gate 	 * Sanity check the parameters we care about
17600Sstevel@tonic-gate 	 * (we don't care about the informational parameters)
17610Sstevel@tonic-gate 	 */
17620Sstevel@tonic-gate 	if ((nchunks == 0) ||
17630Sstevel@tonic-gate 	    ((chunksz % DEV_BSIZE) != 0) ||
17640Sstevel@tonic-gate 	    (bfvpp == NULL)) {
17650Sstevel@tonic-gate 		return (NULL);
17660Sstevel@tonic-gate 	}
17670Sstevel@tonic-gate 
17680Sstevel@tonic-gate 	/*
17690Sstevel@tonic-gate 	 * Look for unused snapshot identifiers.  Snapshot ids are never
17700Sstevel@tonic-gate 	 * freed, but deleted snapshot ids will be recycled as needed.
17710Sstevel@tonic-gate 	 */
17720Sstevel@tonic-gate 	mutex_enter(&snapshot_mutex);
17730Sstevel@tonic-gate 
17740Sstevel@tonic-gate findagain:
17750Sstevel@tonic-gate 	lastsnap = 0;
17760Sstevel@tonic-gate 	for (sidp = snapshot; sidp != NULL; sidp = sidp->sid_next) {
17770Sstevel@tonic-gate 		if (sidp->sid_snapnumber > lastsnap)
17780Sstevel@tonic-gate 			lastsnap = sidp->sid_snapnumber;
17790Sstevel@tonic-gate 
17800Sstevel@tonic-gate 		/*
17810Sstevel@tonic-gate 		 * The sid_rwlock is taken as a reader initially so that
17820Sstevel@tonic-gate 		 * activity on each snapshot is not stalled while searching
17830Sstevel@tonic-gate 		 * for a free snapshot id.
17840Sstevel@tonic-gate 		 */
17850Sstevel@tonic-gate 		rw_enter(&sidp->sid_rwlock, RW_READER);
17860Sstevel@tonic-gate 
17870Sstevel@tonic-gate 		/*
17880Sstevel@tonic-gate 		 * If the snapshot has been deleted and nobody is using the
17890Sstevel@tonic-gate 		 * snapshot device than we can reuse this snapshot_id.  If
17900Sstevel@tonic-gate 		 * the snapshot is marked to be deleted (SID_DELETE), then
17910Sstevel@tonic-gate 		 * it hasn't been deleted yet so don't reuse it.
17920Sstevel@tonic-gate 		 */
17930Sstevel@tonic-gate 		if (SID_AVAILABLE(sidp))
17940Sstevel@tonic-gate 			break; /* This spot is unused, so take it */
17950Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
17960Sstevel@tonic-gate 	}
17970Sstevel@tonic-gate 
17980Sstevel@tonic-gate 	/*
17990Sstevel@tonic-gate 	 * add a new snapshot identifier if there are no deleted
18000Sstevel@tonic-gate 	 * entries.  Since it doesn't matter what order the entries
18010Sstevel@tonic-gate 	 * are in we can just add it to the beginning of the list.
18020Sstevel@tonic-gate 	 */
18030Sstevel@tonic-gate 	if (sidp) {
18040Sstevel@tonic-gate 		if (rw_tryupgrade(&sidp->sid_rwlock) == 0) {
18050Sstevel@tonic-gate 			/* someone else grabbed it as a writer, try again */
18060Sstevel@tonic-gate 			rw_exit(&sidp->sid_rwlock);
18070Sstevel@tonic-gate 			goto findagain;
18080Sstevel@tonic-gate 		}
18090Sstevel@tonic-gate 	} else {
18100Sstevel@tonic-gate 		/* Create a new node if we didn't find an unused one */
18110Sstevel@tonic-gate 		sidp = kmem_alloc(sizeof (struct snapshot_id), KM_SLEEP);
18120Sstevel@tonic-gate 		rw_init(&sidp->sid_rwlock, NULL, RW_DEFAULT, NULL);
18130Sstevel@tonic-gate 		rw_enter(&sidp->sid_rwlock, RW_WRITER);
18140Sstevel@tonic-gate 		sidp->sid_snapnumber = (snapshot == NULL) ? 0 : lastsnap + 1;
18150Sstevel@tonic-gate 		sidp->sid_cowinfo = NULL;
18160Sstevel@tonic-gate 		sidp->sid_flags = 0;
18170Sstevel@tonic-gate 		sidp->sid_next = snapshot;
18180Sstevel@tonic-gate 		snapshot = sidp;
18190Sstevel@tonic-gate 	}
18200Sstevel@tonic-gate 
18210Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
18220Sstevel@tonic-gate 	ASSERT(sidp->sid_cowinfo == NULL);
18230Sstevel@tonic-gate 	ASSERT(sidp->sid_snapnumber <= (lastsnap + 1));
18240Sstevel@tonic-gate 
18250Sstevel@tonic-gate 	sidp->sid_flags |= SID_CREATING;
18260Sstevel@tonic-gate 	/* The root vnode is held until snap_delete_impl() is called */
18270Sstevel@tonic-gate 	VN_HOLD(fsvp);
18280Sstevel@tonic-gate 	sidp->sid_fvp = fsvp;
18290Sstevel@tonic-gate 	num_snapshots++;
18300Sstevel@tonic-gate 
18310Sstevel@tonic-gate 	/* allocate and initialize structures */
18320Sstevel@tonic-gate 
18330Sstevel@tonic-gate 	cowp = kmem_zalloc(sizeof (struct cow_info), KM_SLEEP);
18340Sstevel@tonic-gate 
18350Sstevel@tonic-gate 	cowp->cow_backfile_array = bfvpp;
18360Sstevel@tonic-gate 	cowp->cow_backcount = backfilecount;
18370Sstevel@tonic-gate 	cowp->cow_backfile_sz = max_backfile_size;
18380Sstevel@tonic-gate 
18390Sstevel@tonic-gate 	/*
18400Sstevel@tonic-gate 	 * Initialize task queues for this snapshot.  Only a small number
18410Sstevel@tonic-gate 	 * of threads are required because they will be serialized on the
18420Sstevel@tonic-gate 	 * backing file's reader/writer lock anyway.
18430Sstevel@tonic-gate 	 */
18440Sstevel@tonic-gate 	(void) snprintf(taskqname, sizeof (taskqname), "%s_taskq_%d", snapname,
18450Sstevel@tonic-gate 	    sidp->sid_snapnumber);
18460Sstevel@tonic-gate 	cowp->cow_taskq = taskq_create(taskqname, fssnap_taskq_nthreads,
18470Sstevel@tonic-gate 	    minclsyspri, 1,  fssnap_taskq_maxtasks, 0);
18480Sstevel@tonic-gate 
18490Sstevel@tonic-gate 	/* don't allow tasks to start until after everything is ready */
18500Sstevel@tonic-gate 	taskq_suspend(cowp->cow_taskq);
18510Sstevel@tonic-gate 
18520Sstevel@tonic-gate 	/* initialize translation table */
18530Sstevel@tonic-gate 	cmap = &cowp->cow_map;
18540Sstevel@tonic-gate 	rw_init(&cmap->cmap_rwlock, NULL, RW_DEFAULT, NULL);
18550Sstevel@tonic-gate 	rw_enter(&cmap->cmap_rwlock, RW_WRITER);
18560Sstevel@tonic-gate 
18570Sstevel@tonic-gate 	sema_init(&cmap->cmap_throttle_sem, fssnap_max_mem_chunks, NULL,
18580Sstevel@tonic-gate 	    SEMA_DEFAULT, NULL);
18590Sstevel@tonic-gate 
18600Sstevel@tonic-gate 	cmap->cmap_chunksz = chunksz;
18610Sstevel@tonic-gate 	cmap->cmap_maxsize = maxsize;
18620Sstevel@tonic-gate 	cmap->cmap_chunksperbf = max_backfile_size / chunksz;
18630Sstevel@tonic-gate 
18640Sstevel@tonic-gate 	/*
18650Sstevel@tonic-gate 	 * allocate one bit per chunk for the bitmaps, round up
18660Sstevel@tonic-gate 	 */
18670Sstevel@tonic-gate 	cmap->cmap_bmsize = (nchunks + (NBBY - 1)) / NBBY;
18680Sstevel@tonic-gate 	cmap->cmap_hastrans  = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
18690Sstevel@tonic-gate 	cmap->cmap_candidate = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
18700Sstevel@tonic-gate 
18710Sstevel@tonic-gate 	sidp->sid_cowinfo = cowp;
18720Sstevel@tonic-gate 
18730Sstevel@tonic-gate 	/* initialize kstats for this snapshot */
18740Sstevel@tonic-gate 	mountpoint = vfs_getmntpoint(fsvp->v_vfsp);
18750Sstevel@tonic-gate 	fssnap_create_kstats(sidp, sidp->sid_snapnumber,
18760Sstevel@tonic-gate 	    refstr_value(mountpoint), backpath);
18770Sstevel@tonic-gate 	refstr_rele(mountpoint);
18780Sstevel@tonic-gate 
18790Sstevel@tonic-gate 	mutex_exit(&snapshot_mutex);
18800Sstevel@tonic-gate 
18810Sstevel@tonic-gate 	/*
18820Sstevel@tonic-gate 	 * return with snapshot id rwlock held as a writer until
18830Sstevel@tonic-gate 	 * fssnap_create_done is called
18840Sstevel@tonic-gate 	 */
18850Sstevel@tonic-gate 	return (sidp);
18860Sstevel@tonic-gate }
18870Sstevel@tonic-gate 
18880Sstevel@tonic-gate /*
18890Sstevel@tonic-gate  * fssnap_set_candidate_impl() - mark a chunk as a candidate for copy-on-write
18900Sstevel@tonic-gate  *
18910Sstevel@tonic-gate  *    sets a bit in the candidate bitmap that indicates that a chunk is a
18920Sstevel@tonic-gate  *    candidate for copy-on-write.  Typically, chunks that are allocated on
18930Sstevel@tonic-gate  *    the file system at the time the snapshot is taken are candidates,
18940Sstevel@tonic-gate  *    while chunks that have no allocated data do not need to be copied.
18950Sstevel@tonic-gate  *    Chunks containing metadata must be marked as candidates as well.
18960Sstevel@tonic-gate  */
18970Sstevel@tonic-gate static void
fssnap_set_candidate_impl(void * snapshot_id,chunknumber_t chunknumber)18980Sstevel@tonic-gate fssnap_set_candidate_impl(void *snapshot_id, chunknumber_t chunknumber)
18990Sstevel@tonic-gate {
19000Sstevel@tonic-gate 	struct snapshot_id	*sid = snapshot_id;
19010Sstevel@tonic-gate 	struct cow_info *cowp = sid->sid_cowinfo;
19020Sstevel@tonic-gate 	struct cow_map	*cmap = &cowp->cow_map;
19030Sstevel@tonic-gate 
19040Sstevel@tonic-gate 	/* simple bitmap operation for now */
19050Sstevel@tonic-gate 	ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
19060Sstevel@tonic-gate 	setbit(cmap->cmap_candidate, chunknumber);
19070Sstevel@tonic-gate }
19080Sstevel@tonic-gate 
19090Sstevel@tonic-gate /*
19100Sstevel@tonic-gate  * fssnap_is_candidate_impl() - check whether a chunk is a candidate
19110Sstevel@tonic-gate  *
19120Sstevel@tonic-gate  *    returns 0 if the chunk is not a candidate and 1 if the chunk is a
19130Sstevel@tonic-gate  *    candidate.  This can be used by the file system to change behavior for
19140Sstevel@tonic-gate  *    chunks that might induce a copy-on-write.  The offset is specified in
19150Sstevel@tonic-gate  *    bytes since the chunk size may not be known by the file system.
19160Sstevel@tonic-gate  */
19170Sstevel@tonic-gate static int
fssnap_is_candidate_impl(void * snapshot_id,u_offset_t off)19180Sstevel@tonic-gate fssnap_is_candidate_impl(void *snapshot_id, u_offset_t off)
19190Sstevel@tonic-gate {
19200Sstevel@tonic-gate 	struct snapshot_id	*sid = snapshot_id;
19210Sstevel@tonic-gate 	struct cow_info *cowp = sid->sid_cowinfo;
19220Sstevel@tonic-gate 	struct cow_map	*cmap = &cowp->cow_map;
19230Sstevel@tonic-gate 	ulong_t chunknumber = off / cmap->cmap_chunksz;
19240Sstevel@tonic-gate 
19250Sstevel@tonic-gate 	/* simple bitmap operation for now */
19260Sstevel@tonic-gate 	ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
19270Sstevel@tonic-gate 	return (isset(cmap->cmap_candidate, chunknumber));
19280Sstevel@tonic-gate }
19290Sstevel@tonic-gate 
19300Sstevel@tonic-gate /*
19310Sstevel@tonic-gate  * fssnap_create_done_impl() - complete the snapshot setup process
19320Sstevel@tonic-gate  *
19330Sstevel@tonic-gate  *    called when the file system is done populating the candidate bitmap
19340Sstevel@tonic-gate  *    and it is ready to start using the snapshot.  This routine releases
19350Sstevel@tonic-gate  *    the snapshot locks, allows taskq tasks to start processing, and
19360Sstevel@tonic-gate  *    creates the device minor nodes associated with the snapshot.
19370Sstevel@tonic-gate  */
19380Sstevel@tonic-gate static int
fssnap_create_done_impl(void * snapshot_id)19390Sstevel@tonic-gate fssnap_create_done_impl(void *snapshot_id)
19400Sstevel@tonic-gate {
19410Sstevel@tonic-gate 	struct snapshot_id	**sidpp, *sidp = snapshot_id;
19420Sstevel@tonic-gate 	struct cow_info		*cowp;
19430Sstevel@tonic-gate 	struct cow_map		*cmap;
19440Sstevel@tonic-gate 	int			snapnumber = -1;
19450Sstevel@tonic-gate 	char			name[20];
19460Sstevel@tonic-gate 
19470Sstevel@tonic-gate 	/* sid rwlock and cmap rwlock should be taken from fssnap_create */
19480Sstevel@tonic-gate 	ASSERT(sidp);
19490Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
19500Sstevel@tonic-gate 	ASSERT(sidp->sid_cowinfo);
19510Sstevel@tonic-gate 
19520Sstevel@tonic-gate 	cowp = sidp->sid_cowinfo;
19530Sstevel@tonic-gate 	cmap = &cowp->cow_map;
19540Sstevel@tonic-gate 
19550Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
19560Sstevel@tonic-gate 
19570Sstevel@tonic-gate 	sidp->sid_flags &= ~(SID_CREATING | SID_DISABLED);
19580Sstevel@tonic-gate 	snapnumber = sidp->sid_snapnumber;
19590Sstevel@tonic-gate 
19600Sstevel@tonic-gate 	/* allocate state structure and find new snapshot id */
19610Sstevel@tonic-gate 	if (ddi_soft_state_zalloc(statep, snapnumber) != DDI_SUCCESS) {
19620Sstevel@tonic-gate 		cmn_err(CE_WARN,
19630Sstevel@tonic-gate 		    "snap_ioctl: create: could not allocate "
19640Sstevel@tonic-gate 		    "state for snapshot %d.", snapnumber);
19650Sstevel@tonic-gate 		snapnumber = -1;
19660Sstevel@tonic-gate 		goto out;
19670Sstevel@tonic-gate 	}
19680Sstevel@tonic-gate 
19690Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, snapnumber);
19700Sstevel@tonic-gate 	*sidpp = sidp;
19710Sstevel@tonic-gate 
19720Sstevel@tonic-gate 	/* create minor node based on snapshot number */
19730Sstevel@tonic-gate 	ASSERT(fssnap_dip != NULL);
19740Sstevel@tonic-gate 	(void) snprintf(name, sizeof (name), "%d", snapnumber);
19750Sstevel@tonic-gate 	if (ddi_create_minor_node(fssnap_dip, name, S_IFBLK,
19760Sstevel@tonic-gate 	    snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
19770Sstevel@tonic-gate 		cmn_err(CE_WARN, "snap_ioctl: could not create "
19780Sstevel@tonic-gate 		    "block minor node for snapshot %d.", snapnumber);
19790Sstevel@tonic-gate 		snapnumber = -1;
19800Sstevel@tonic-gate 		goto out;
19810Sstevel@tonic-gate 	}
19820Sstevel@tonic-gate 
19830Sstevel@tonic-gate 	(void) snprintf(name, sizeof (name), "%d,raw", snapnumber);
19840Sstevel@tonic-gate 	if (ddi_create_minor_node(fssnap_dip, name, S_IFCHR,
19850Sstevel@tonic-gate 	    snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
19860Sstevel@tonic-gate 		cmn_err(CE_WARN, "snap_ioctl: could not create "
19870Sstevel@tonic-gate 		    "character minor node for snapshot %d.", snapnumber);
19880Sstevel@tonic-gate 		snapnumber = -1;
19890Sstevel@tonic-gate 	}
19900Sstevel@tonic-gate 
19910Sstevel@tonic-gate out:
19920Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
19930Sstevel@tonic-gate 	rw_exit(&cmap->cmap_rwlock);
19940Sstevel@tonic-gate 
19950Sstevel@tonic-gate 	/* let the taskq threads start processing */
19960Sstevel@tonic-gate 	taskq_resume(cowp->cow_taskq);
19970Sstevel@tonic-gate 
19980Sstevel@tonic-gate 	return (snapnumber);
19990Sstevel@tonic-gate }
20000Sstevel@tonic-gate 
20010Sstevel@tonic-gate /*
20020Sstevel@tonic-gate  * fssnap_delete_impl() - delete a snapshot
20030Sstevel@tonic-gate  *
20040Sstevel@tonic-gate  *    used when a snapshot is no longer needed.  This is called by the file
20050Sstevel@tonic-gate  *    system when it receives an ioctl request to delete a snapshot.  It is
20060Sstevel@tonic-gate  *    also called internally when error conditions such as disk full, errors
20070Sstevel@tonic-gate  *    writing to the backing file, or backing file maxsize exceeded occur.
20080Sstevel@tonic-gate  *    If the snapshot device is busy when the delete request is received,
20090Sstevel@tonic-gate  *    all state will be deleted except for the soft state and device files
20100Sstevel@tonic-gate  *    associated with the snapshot; they will be deleted when the snapshot
20110Sstevel@tonic-gate  *    device is closed.
20120Sstevel@tonic-gate  *
20130Sstevel@tonic-gate  *    NOTE this function takes a POINTER TO A POINTER to the snapshot id,
20140Sstevel@tonic-gate  *    and expects to be able to set the handle held by the file system to
20150Sstevel@tonic-gate  *    NULL.  This depends on the file system checking that variable for NULL
20160Sstevel@tonic-gate  *    before calling fssnap_strategy().
20170Sstevel@tonic-gate  */
20180Sstevel@tonic-gate static int
fssnap_delete_impl(void * snapshot_id)20190Sstevel@tonic-gate fssnap_delete_impl(void *snapshot_id)
20200Sstevel@tonic-gate {
20210Sstevel@tonic-gate 	struct snapshot_id	**sidpp = (struct snapshot_id **)snapshot_id;
20220Sstevel@tonic-gate 	struct snapshot_id	*sidp;
20230Sstevel@tonic-gate 	struct snapshot_id	**statesidpp;
20240Sstevel@tonic-gate 	struct cow_info		*cowp;
20250Sstevel@tonic-gate 	struct cow_map		*cmap;
20260Sstevel@tonic-gate 	char			name[20];
20270Sstevel@tonic-gate 	int			snapnumber = -1;
20280Sstevel@tonic-gate 	vnode_t			**vpp;
20290Sstevel@tonic-gate 
20300Sstevel@tonic-gate 	/*
20310Sstevel@tonic-gate 	 * sidp is guaranteed to be valid if sidpp is valid because
20320Sstevel@tonic-gate 	 * the snapshot list is append-only.
20330Sstevel@tonic-gate 	 */
20340Sstevel@tonic-gate 	if (sidpp == NULL) {
20350Sstevel@tonic-gate 		return (-1);
20360Sstevel@tonic-gate 	}
20370Sstevel@tonic-gate 
20380Sstevel@tonic-gate 	sidp = *sidpp;
20390Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
20400Sstevel@tonic-gate 
20410Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
20420Sstevel@tonic-gate 
20430Sstevel@tonic-gate 	/*
20440Sstevel@tonic-gate 	 * double check that the snapshot is still valid for THIS file system
20450Sstevel@tonic-gate 	 */
20460Sstevel@tonic-gate 	if (*sidpp == NULL) {
20470Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
20480Sstevel@tonic-gate 		return (-1);
20490Sstevel@tonic-gate 	}
20500Sstevel@tonic-gate 
20510Sstevel@tonic-gate 	/*
20520Sstevel@tonic-gate 	 * Now we know the snapshot is still valid and will not go away
20530Sstevel@tonic-gate 	 * because we have the write lock.  Once the state is transitioned
20540Sstevel@tonic-gate 	 * to "disabling", the sid_rwlock can be released.  Any pending I/O
20550Sstevel@tonic-gate 	 * waiting for the lock as a reader will check for this state and
20560Sstevel@tonic-gate 	 * abort without touching data that may be getting freed.
20570Sstevel@tonic-gate 	 */
20580Sstevel@tonic-gate 	sidp->sid_flags |= SID_DISABLING;
20590Sstevel@tonic-gate 	if (sidp->sid_flags & SID_DELETE) {
20600Sstevel@tonic-gate 		cmn_err(CE_WARN, "Snapshot %d automatically deleted.",
20610Sstevel@tonic-gate 		    sidp->sid_snapnumber);
20620Sstevel@tonic-gate 		sidp->sid_flags &= ~(SID_DELETE);
20630Sstevel@tonic-gate 	}
20640Sstevel@tonic-gate 
20650Sstevel@tonic-gate 
20660Sstevel@tonic-gate 	/*
20670Sstevel@tonic-gate 	 * This is pointing into file system specific data!  The assumption is
20680Sstevel@tonic-gate 	 * that fssnap_strategy() gets called from the file system based on
20690Sstevel@tonic-gate 	 * whether this reference to the snapshot_id is NULL or not.  So
20700Sstevel@tonic-gate 	 * setting this to NULL should disable snapshots for the file system.
20710Sstevel@tonic-gate 	 */
20720Sstevel@tonic-gate 	*sidpp = NULL;
20730Sstevel@tonic-gate 
20740Sstevel@tonic-gate 	/* remove cowinfo */
20750Sstevel@tonic-gate 	cowp = sidp->sid_cowinfo;
20760Sstevel@tonic-gate 	if (cowp == NULL) {
20770Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
20780Sstevel@tonic-gate 		return (-1);
20790Sstevel@tonic-gate 	}
20800Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
20810Sstevel@tonic-gate 
20820Sstevel@tonic-gate 	/* destroy task queues first so they don't reference freed data. */
20830Sstevel@tonic-gate 	if (cowp->cow_taskq) {
20840Sstevel@tonic-gate 		taskq_destroy(cowp->cow_taskq);
20850Sstevel@tonic-gate 		cowp->cow_taskq = NULL;
20860Sstevel@tonic-gate 	}
20870Sstevel@tonic-gate 
20880Sstevel@tonic-gate 	if (cowp->cow_backfile_array != NULL) {
20890Sstevel@tonic-gate 		for (vpp = cowp->cow_backfile_array; *vpp; vpp++)
20900Sstevel@tonic-gate 			VN_RELE(*vpp);
20910Sstevel@tonic-gate 		kmem_free(cowp->cow_backfile_array,
20920Sstevel@tonic-gate 		    (cowp->cow_backcount + 1) * sizeof (vnode_t *));
20930Sstevel@tonic-gate 		cowp->cow_backfile_array = NULL;
20940Sstevel@tonic-gate 	}
20950Sstevel@tonic-gate 
20960Sstevel@tonic-gate 	sidp->sid_cowinfo = NULL;
20970Sstevel@tonic-gate 
20980Sstevel@tonic-gate 	/* remove cmap */
20990Sstevel@tonic-gate 	cmap = &cowp->cow_map;
21000Sstevel@tonic-gate 	ASSERT(cmap);
21010Sstevel@tonic-gate 
21020Sstevel@tonic-gate 	if (cmap->cmap_candidate)
21030Sstevel@tonic-gate 		kmem_free(cmap->cmap_candidate, cmap->cmap_bmsize);
21040Sstevel@tonic-gate 
21050Sstevel@tonic-gate 	if (cmap->cmap_hastrans)
21060Sstevel@tonic-gate 		kmem_free(cmap->cmap_hastrans, cmap->cmap_bmsize);
21070Sstevel@tonic-gate 
21080Sstevel@tonic-gate 	if (cmap->cmap_table)
21090Sstevel@tonic-gate 		transtbl_free(&cowp->cow_map);
21100Sstevel@tonic-gate 
21110Sstevel@tonic-gate 	rw_destroy(&cmap->cmap_rwlock);
21120Sstevel@tonic-gate 
21130Sstevel@tonic-gate 	while (cmap->cmap_waiters) {
21140Sstevel@tonic-gate 		sema_p(&cmap->cmap_throttle_sem);
21150Sstevel@tonic-gate 		sema_v(&cmap->cmap_throttle_sem);
21160Sstevel@tonic-gate 	}
21170Sstevel@tonic-gate 	sema_destroy(&cmap->cmap_throttle_sem);
21180Sstevel@tonic-gate 
21190Sstevel@tonic-gate 	/* remove kstats */
21200Sstevel@tonic-gate 	fssnap_delete_kstats(cowp);
21210Sstevel@tonic-gate 
21220Sstevel@tonic-gate 	kmem_free(cowp, sizeof (struct cow_info));
21230Sstevel@tonic-gate 
21240Sstevel@tonic-gate 	statesidpp = ddi_get_soft_state(statep, sidp->sid_snapnumber);
21250Sstevel@tonic-gate 	if (statesidpp == NULL || *statesidpp == NULL) {
21260Sstevel@tonic-gate 		cmn_err(CE_WARN,
21270Sstevel@tonic-gate 		    "fssnap_delete_impl: could not find state for snapshot %d.",
21280Sstevel@tonic-gate 		    sidp->sid_snapnumber);
21290Sstevel@tonic-gate 	}
21300Sstevel@tonic-gate 	ASSERT(*statesidpp == sidp);
21310Sstevel@tonic-gate 
21320Sstevel@tonic-gate 	/*
21330Sstevel@tonic-gate 	 * Leave the node in the list marked DISABLED so it can be reused
21340Sstevel@tonic-gate 	 * and avoid many race conditions.  Return the snapshot number
21350Sstevel@tonic-gate 	 * that was deleted.
21360Sstevel@tonic-gate 	 */
21370Sstevel@tonic-gate 	mutex_enter(&snapshot_mutex);
21380Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
21390Sstevel@tonic-gate 	sidp->sid_flags &= ~(SID_DISABLING);
21400Sstevel@tonic-gate 	sidp->sid_flags |= SID_DISABLED;
21410Sstevel@tonic-gate 	VN_RELE(sidp->sid_fvp);
21420Sstevel@tonic-gate 	sidp->sid_fvp = NULL;
21430Sstevel@tonic-gate 	snapnumber = sidp->sid_snapnumber;
21440Sstevel@tonic-gate 
21450Sstevel@tonic-gate 	/*
21460Sstevel@tonic-gate 	 * If the snapshot is not busy, free the device info now.  Otherwise
21470Sstevel@tonic-gate 	 * the device nodes are freed in snap_close() when the device is
21480Sstevel@tonic-gate 	 * closed.  The sid will not be reused until the device is not busy.
21490Sstevel@tonic-gate 	 */
21500Sstevel@tonic-gate 	if (SID_AVAILABLE(sidp)) {
21510Sstevel@tonic-gate 		/* remove the device nodes */
21520Sstevel@tonic-gate 		ASSERT(fssnap_dip != NULL);
21530Sstevel@tonic-gate 		(void) snprintf(name, sizeof (name), "%d",
21540Sstevel@tonic-gate 		    sidp->sid_snapnumber);
21550Sstevel@tonic-gate 		ddi_remove_minor_node(fssnap_dip, name);
21560Sstevel@tonic-gate 		(void) snprintf(name, sizeof (name), "%d,raw",
21570Sstevel@tonic-gate 		    sidp->sid_snapnumber);
21580Sstevel@tonic-gate 		ddi_remove_minor_node(fssnap_dip, name);
21590Sstevel@tonic-gate 
21600Sstevel@tonic-gate 		/* delete the state structure */
21610Sstevel@tonic-gate 		ddi_soft_state_free(statep, sidp->sid_snapnumber);
21620Sstevel@tonic-gate 		num_snapshots--;
21630Sstevel@tonic-gate 	}
21640Sstevel@tonic-gate 
21650Sstevel@tonic-gate 	mutex_exit(&snapshot_mutex);
21660Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
21670Sstevel@tonic-gate 
21680Sstevel@tonic-gate 	return (snapnumber);
21690Sstevel@tonic-gate }
21700Sstevel@tonic-gate 
21710Sstevel@tonic-gate /*
21720Sstevel@tonic-gate  * fssnap_create_kstats() - allocate and initialize snapshot kstats
21730Sstevel@tonic-gate  *
21740Sstevel@tonic-gate  */
21750Sstevel@tonic-gate static void
fssnap_create_kstats(snapshot_id_t * sidp,int snapnum,const char * mountpoint,const char * backfilename)21760Sstevel@tonic-gate fssnap_create_kstats(snapshot_id_t *sidp, int snapnum,
21770Sstevel@tonic-gate     const char *mountpoint, const char *backfilename)
21780Sstevel@tonic-gate {
21790Sstevel@tonic-gate 	kstat_t *num, *mntpoint, *bfname;
21800Sstevel@tonic-gate 	kstat_named_t *hw;
21810Sstevel@tonic-gate 	struct cow_info *cowp = sidp->sid_cowinfo;
21820Sstevel@tonic-gate 	struct cow_kstat_num *stats;
21830Sstevel@tonic-gate 
21840Sstevel@tonic-gate 	/* update the high water mark */
21850Sstevel@tonic-gate 	if (fssnap_highwater_kstat == NULL) {
21860Sstevel@tonic-gate 		cmn_err(CE_WARN, "fssnap_create_kstats: failed to lookup "
21870Sstevel@tonic-gate 		    "high water mark kstat.");
21880Sstevel@tonic-gate 		return;
21890Sstevel@tonic-gate 	}
21900Sstevel@tonic-gate 
21910Sstevel@tonic-gate 	hw = (kstat_named_t *)fssnap_highwater_kstat->ks_data;
21920Sstevel@tonic-gate 	if (hw->value.ui32 < snapnum)
21930Sstevel@tonic-gate 		hw->value.ui32 = snapnum;
21940Sstevel@tonic-gate 
21950Sstevel@tonic-gate 	/* initialize the mount point kstat */
21960Sstevel@tonic-gate 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_MNTPT);
21970Sstevel@tonic-gate 
21980Sstevel@tonic-gate 	if (mountpoint != NULL) {
21990Sstevel@tonic-gate 		mntpoint = kstat_create(snapname, snapnum, FSSNAP_KSTAT_MNTPT,
22000Sstevel@tonic-gate 		    "misc", KSTAT_TYPE_RAW, strlen(mountpoint) + 1, 0);
22010Sstevel@tonic-gate 		if (mntpoint == NULL) {
22020Sstevel@tonic-gate 			cowp->cow_kstat_mntpt = NULL;
22030Sstevel@tonic-gate 			cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
22040Sstevel@tonic-gate 			    "create mount point kstat");
22050Sstevel@tonic-gate 		} else {
22060Sstevel@tonic-gate 			(void) strncpy(mntpoint->ks_data, mountpoint,
22070Sstevel@tonic-gate 			    strlen(mountpoint));
22080Sstevel@tonic-gate 			cowp->cow_kstat_mntpt = mntpoint;
22090Sstevel@tonic-gate 			kstat_install(mntpoint);
22100Sstevel@tonic-gate 		}
22110Sstevel@tonic-gate 	} else {
22120Sstevel@tonic-gate 		cowp->cow_kstat_mntpt = NULL;
22130Sstevel@tonic-gate 		cmn_err(CE_WARN, "fssnap_create_kstats: mount point not "
22140Sstevel@tonic-gate 		    "specified.");
22150Sstevel@tonic-gate 	}
22160Sstevel@tonic-gate 
22170Sstevel@tonic-gate 	/* initialize the backing file kstat */
22180Sstevel@tonic-gate 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_BFNAME);
22190Sstevel@tonic-gate 
22200Sstevel@tonic-gate 	if (backfilename == NULL) {
22210Sstevel@tonic-gate 		cowp->cow_kstat_bfname = NULL;
22220Sstevel@tonic-gate 	} else {
22230Sstevel@tonic-gate 		bfname = kstat_create(snapname, snapnum, FSSNAP_KSTAT_BFNAME,
22240Sstevel@tonic-gate 		    "misc", KSTAT_TYPE_RAW, strlen(backfilename) + 1, 0);
22250Sstevel@tonic-gate 		if (bfname != NULL) {
22260Sstevel@tonic-gate 			(void) strncpy(bfname->ks_data, backfilename,
22270Sstevel@tonic-gate 			    strlen(backfilename));
22280Sstevel@tonic-gate 			cowp->cow_kstat_bfname = bfname;
22290Sstevel@tonic-gate 			kstat_install(bfname);
22300Sstevel@tonic-gate 		} else {
22310Sstevel@tonic-gate 			cowp->cow_kstat_bfname = NULL;
22320Sstevel@tonic-gate 			cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
22330Sstevel@tonic-gate 			    "create backing file name kstat");
22340Sstevel@tonic-gate 		}
22350Sstevel@tonic-gate 	}
22360Sstevel@tonic-gate 
22370Sstevel@tonic-gate 	/* initialize numeric kstats */
22380Sstevel@tonic-gate 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_NUM);
22390Sstevel@tonic-gate 
22400Sstevel@tonic-gate 	num = kstat_create(snapname, snapnum, FSSNAP_KSTAT_NUM,
22410Sstevel@tonic-gate 	    "misc", KSTAT_TYPE_NAMED,
22420Sstevel@tonic-gate 	    sizeof (struct cow_kstat_num) / sizeof (kstat_named_t),
22430Sstevel@tonic-gate 	    0);
22440Sstevel@tonic-gate 	if (num == NULL) {
22450Sstevel@tonic-gate 		cmn_err(CE_WARN, "fssnap_create_kstats: failed to create "
22460Sstevel@tonic-gate 		    "numeric kstats");
22470Sstevel@tonic-gate 		cowp->cow_kstat_num = NULL;
22480Sstevel@tonic-gate 		return;
22490Sstevel@tonic-gate 	}
22500Sstevel@tonic-gate 
22510Sstevel@tonic-gate 	cowp->cow_kstat_num = num;
22520Sstevel@tonic-gate 	stats = num->ks_data;
22530Sstevel@tonic-gate 	num->ks_update = fssnap_update_kstat_num;
22540Sstevel@tonic-gate 	num->ks_private = sidp;
22550Sstevel@tonic-gate 
22560Sstevel@tonic-gate 	kstat_named_init(&stats->ckn_state, FSSNAP_KSTAT_NUM_STATE,
22570Sstevel@tonic-gate 	    KSTAT_DATA_INT32);
22580Sstevel@tonic-gate 	kstat_named_init(&stats->ckn_bfsize, FSSNAP_KSTAT_NUM_BFSIZE,
22590Sstevel@tonic-gate 	    KSTAT_DATA_UINT64);
22600Sstevel@tonic-gate 	kstat_named_init(&stats->ckn_maxsize, FSSNAP_KSTAT_NUM_MAXSIZE,
22610Sstevel@tonic-gate 	    KSTAT_DATA_UINT64);
22620Sstevel@tonic-gate 	kstat_named_init(&stats->ckn_createtime, FSSNAP_KSTAT_NUM_CREATETIME,
22630Sstevel@tonic-gate 	    KSTAT_DATA_LONG);
22640Sstevel@tonic-gate 	kstat_named_init(&stats->ckn_chunksize, FSSNAP_KSTAT_NUM_CHUNKSIZE,
22650Sstevel@tonic-gate 	    KSTAT_DATA_UINT32);
22660Sstevel@tonic-gate 
22670Sstevel@tonic-gate 	/* initialize the static kstats */
22680Sstevel@tonic-gate 	stats->ckn_chunksize.value.ui32 = cowp->cow_map.cmap_chunksz;
22690Sstevel@tonic-gate 	stats->ckn_maxsize.value.ui64 = cowp->cow_map.cmap_maxsize;
22700Sstevel@tonic-gate 	stats->ckn_createtime.value.l = gethrestime_sec();
22710Sstevel@tonic-gate 
22720Sstevel@tonic-gate 	kstat_install(num);
22730Sstevel@tonic-gate }
22740Sstevel@tonic-gate 
22750Sstevel@tonic-gate /*
22760Sstevel@tonic-gate  * fssnap_update_kstat_num() - update a numerical snapshot kstat value
22770Sstevel@tonic-gate  *
22780Sstevel@tonic-gate  */
22790Sstevel@tonic-gate int
fssnap_update_kstat_num(kstat_t * ksp,int rw)22800Sstevel@tonic-gate fssnap_update_kstat_num(kstat_t *ksp, int rw)
22810Sstevel@tonic-gate {
22820Sstevel@tonic-gate 	snapshot_id_t *sidp = (snapshot_id_t *)ksp->ks_private;
22830Sstevel@tonic-gate 	struct cow_info *cowp = sidp->sid_cowinfo;
22840Sstevel@tonic-gate 	struct cow_kstat_num *stats = ksp->ks_data;
22850Sstevel@tonic-gate 
22860Sstevel@tonic-gate 	if (rw == KSTAT_WRITE)
22870Sstevel@tonic-gate 		return (EACCES);
22880Sstevel@tonic-gate 
22890Sstevel@tonic-gate 	/* state */
22900Sstevel@tonic-gate 	if (sidp->sid_flags & SID_CREATING)
22910Sstevel@tonic-gate 		stats->ckn_state.value.i32 = COWSTATE_CREATING;
22920Sstevel@tonic-gate 	else if (SID_INACTIVE(sidp))
22930Sstevel@tonic-gate 		stats->ckn_state.value.i32 = COWSTATE_DISABLED;
22940Sstevel@tonic-gate 	else if (SID_BUSY(sidp))
22950Sstevel@tonic-gate 		stats->ckn_state.value.i32 = COWSTATE_ACTIVE;
22960Sstevel@tonic-gate 	else
22970Sstevel@tonic-gate 		stats->ckn_state.value.i32 = COWSTATE_IDLE;
22980Sstevel@tonic-gate 
22990Sstevel@tonic-gate 	/* bfsize */
23000Sstevel@tonic-gate 	stats->ckn_bfsize.value.ui64 = cowp->cow_map.cmap_nchunks *
23010Sstevel@tonic-gate 	    cowp->cow_map.cmap_chunksz;
23020Sstevel@tonic-gate 
23030Sstevel@tonic-gate 	return (0);
23040Sstevel@tonic-gate }
23050Sstevel@tonic-gate 
23060Sstevel@tonic-gate /*
23070Sstevel@tonic-gate  * fssnap_delete_kstats() - deallocate snapshot kstats
23080Sstevel@tonic-gate  *
23090Sstevel@tonic-gate  */
23100Sstevel@tonic-gate void
fssnap_delete_kstats(struct cow_info * cowp)23110Sstevel@tonic-gate fssnap_delete_kstats(struct cow_info *cowp)
23120Sstevel@tonic-gate {
23130Sstevel@tonic-gate 	if (cowp->cow_kstat_num != NULL) {
23140Sstevel@tonic-gate 		kstat_delete(cowp->cow_kstat_num);
23150Sstevel@tonic-gate 		cowp->cow_kstat_num = NULL;
23160Sstevel@tonic-gate 	}
23170Sstevel@tonic-gate 	if (cowp->cow_kstat_mntpt != NULL) {
23180Sstevel@tonic-gate 		kstat_delete(cowp->cow_kstat_mntpt);
23190Sstevel@tonic-gate 		cowp->cow_kstat_mntpt = NULL;
23200Sstevel@tonic-gate 	}
23210Sstevel@tonic-gate 	if (cowp->cow_kstat_bfname != NULL) {
23220Sstevel@tonic-gate 		kstat_delete(cowp->cow_kstat_bfname);
23230Sstevel@tonic-gate 		cowp->cow_kstat_bfname = NULL;
23240Sstevel@tonic-gate 	}
23250Sstevel@tonic-gate }
2326