lvm/md/md.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Md is the meta-disk driver.  It sits below the UFS file system
 * but above the 'real' disk drivers (xy, id, sd, etc.).
 *
 * To the UFS software, md looks like a normal driver, since it has
 * the normal kinds of entries in the bdevsw and cdevsw arrays. So
 * UFS accesses md in the usual ways.  In particular, the strategy
 * routine, mdstrategy(), gets called by fbiwrite(), ufs_getapage(),
 * and ufs_writelbn().
 *
 * Md maintains an array of minor devices (meta-partitions).   Each
 * meta partition stands for a matrix of real partitions, in rows
 * which are not necessarily of equal length.	Md maintains a table,
 * with one entry for each meta-partition,  which lists the rows and
 * columns of actual partitions, and the job of the strategy routine
 * is to translate from the meta-partition device and block numbers
 * known to UFS into the actual partitions' device and block numbers.
 *
 * See below, in mdstrategy(), mdreal(), and mddone() for details of
 * this translation.
 */

/*
 * Driver for Virtual Disk.
 */

#include <sys/user.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/cmn_err.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/utsname.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_names.h>
#include <sys/lvm/md_mddb.h>
#include <sys/lvm/md_sp.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/cladm.h>
#include <sys/priv_names.h>
#include <sys/modhash.h>

/*
 * Module-wide globals, tunables and constants.
 */
#ifndef	lint
/* NOTE(review): presumably the module loader dependency list - confirm */
char 		_depends_on[] = "strmod/rpcmod";
#endif	/* lint */
/*
 * module binding debug; bumped in mdattach() when the "md_init_debug"
 * property is non-zero, and used there to enable extra warnings.
 */
int		md_init_debug	= 0;	/* module binding debug */

/*
 * Tunable to turn off the failfast behavior.
 * Bumped in mdattach() when the "md_ff_disable" property is non-zero.
 */
int		md_ff_disable = 0;

/*
 * dynamically allocated list of non FF driver names - needs to
 * be freed when md is detached.  mddetach() walks it as a
 * NULL-terminated array of strings.
 */
char	**non_ff_drivers = NULL;

md_krwlock_t	md_unit_array_rw;	/* protects all unit arrays */
md_krwlock_t	nm_lock;		/* protects all the name spaces */

/* its md_resync_mutex is set up/torn down in md_global_alloc_free() */
md_resync_t	md_cpr_resync;

/*
 * Boot path string defined elsewhere.  mdattach() compares its prefix
 * against SVM_PSEUDO_STR to decide whether the root metadevice minor
 * nodes must be created at attach time.
 */
extern char	svm_bootpath[];
#define	SVM_PSEUDO_STR	"/pseudo/md@0:"

/*
 * VERSION is the value the "md_xlate_ver" property must match in
 * mdattach(); VERSION_LENGTH sizes the buffer the property is read into.
 */
#define		VERSION_LENGTH	6
#define		VERSION		"1.0"

/*
 * Keep track of possible 'orphan' entries in the name space
 */
int		*md_nm_snarfed = NULL;

/*
 * Global tunable giving the percentage of free space left in replica during
 * conversion of non-devid style replica to devid style replica.
 */
int		md_conv_perc = MDDB_DEVID_CONV_PERC;

#ifdef	DEBUG
/*
 * Debug code to verify framework exclusion guarantees.
 *
 * md_in is a bit mask of the IN_* entry points currently executing and is
 * protected by md_in_mx.  MD_SET_IN(x) drops into the debugger if any
 * entry point is already marked active; MD_CLR_IN(x) drops into the
 * debugger if anything other than x is active or if x was not set.
 *
 * The macro argument is always parenthesized so that a compound
 * expression (e.g. IN_INIT | IN_FINI) is masked and complemented as a
 * whole.
 */
int		md_in;
kmutex_t	md_in_mx;			/* protects md_in */
#define	IN_INIT		0x01
#define	IN_FINI		0x02
#define	IN_ATTACH	0x04
#define	IN_DETACH	0x08
#define	IN_OPEN		0x10
#define	MD_SET_IN(x) {						\
	mutex_enter(&md_in_mx);					\
	if (md_in)						\
		debug_enter("MD_SET_IN exclusion lost");	\
	if (md_in & (x))					\
		debug_enter("MD_SET_IN already set");		\
	md_in |= (x);						\
	mutex_exit(&md_in_mx);					\
}

#define	MD_CLR_IN(x) {						\
	mutex_enter(&md_in_mx);					\
	if (md_in & ~(x))					\
		debug_enter("MD_CLR_IN exclusion lost");	\
	if (!(md_in & (x)))					\
		debug_enter("MD_CLR_IN already clr");		\
	md_in &= ~(x);						\
	mutex_exit(&md_in_mx);					\
}
#else	/* DEBUG */
/* non-DEBUG kernels: the exclusion checks compile away */
#define	MD_SET_IN(x)
#define	MD_CLR_IN(x)
#endif	/* DEBUG */
/* NOTE(review): not referenced in the visible part of this file */
hrtime_t savetime1, savetime2;


/*
 * list things protected by md_mx even if they aren't
 * used in this file.
 */
kmutex_t	md_mx;			/* protects md global state */
kcondvar_t	md_cv;			/* md_status events */
int		md_status = 0;		/* global status for the meta-driver */
int		md_num_daemons = 0;
int		md_ioctl_cnt = 0;
int		md_mtioctl_cnt = 0;	/* multithreaded ioctl cnt */
uint_t		md_mdelay = 10;		/* variable so can be patched */

/* optional strategy test hook; never assigned in the visible code */
int		(*mdv_strategy_tstpnt)(buf_t *, int, void*);

/*
 * md_major is looked up in _init() via ddi_name_to_major("md");
 * md_major_targ is set in mdattach() and zeroed in mddetach().
 */
major_t		md_major, md_major_targ;

/*
 * md_nunits and md_nsets are reset to their maximums in mdattach() and
 * cleared in mddetach(); md_nmedh and md_med_trans_lst are loaded from
 * driver properties in mdattach() and released in mddetach().
 */
unit_t		md_nunits = MD_MAXUNITS;
set_t		md_nsets = MD_MAXSETS;
int		md_nmedh = 0;
char		*md_med_trans_lst = NULL;
md_set_t	md_set[MD_MAXSETS];
md_set_io_t	md_set_io[MD_MAXSETS];

md_krwlock_t	hsp_rwlp;		/* protects hot_spare_interface */
md_krwlock_t	ni_rwlp;		/* protects notify_interface */
/* md_ops and md_mods: MD_NOPS-entry arrays allocated in mdattach() */
md_ops_t	**md_ops = NULL;
ddi_modhandle_t	*md_mods = NULL;
md_ops_t	*md_opslist;
clock_t		md_hz;			/* drv_usectohz(NUM_USEC_IN_SEC) */
md_event_queue_t	*md_event_queue = NULL;

/*
 * Upgrade related state, reset at the top of mdattach().  md_in_upgrade
 * is set once the "md_xlate" table has been read; the other two mirror
 * the driver properties of the same name.
 */
int		md_in_upgrade;
int		md_keep_repl_state;
int		md_devid_destroy;

/* for sending messages thru a door to userland */
door_handle_t	mdmn_door_handle = NULL;
int		mdmn_door_did = -1;

/* devinfo node recorded by mdattach() and cleared by mddetach() */
dev_info_t		*md_devinfo = NULL;

md_mn_nodeid_t	md_mn_mynode_id = ~0u;	/* My node id (for multi-node sets) */

/* NOTE(review): presumably per open-type open counts - confirm */
static	uint_t		md_ocnt[OTYPCNT];

/*
 * Forward declarations of the driver entry points referenced by the
 * cb_ops and dev_ops tables below.
 */
static int		mdinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int		mdattach(dev_info_t *, ddi_attach_cmd_t);
static int		mddetach(dev_info_t *, ddi_detach_cmd_t);
static int		mdopen(dev_t *, int, int, cred_t *);
static int		mdclose(dev_t, int, int, cred_t *);
static int		mddump(dev_t, caddr_t, daddr_t, int);
static int		mdread(dev_t, struct uio *, cred_t *);
static int		mdwrite(dev_t, struct uio *, cred_t *);
static int		mdaread(dev_t, struct aio_req *, cred_t *);
static int		mdawrite(dev_t, struct aio_req *, cred_t *);
static int		mdioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int		mdprop_op(dev_t, dev_info_t *,
				ddi_prop_op_t, int, char *, caddr_t, int *);

/*
 * Character/block entry point table.  Entries md does not implement are
 * filled with nulldev/nodev cast to the expected function type.
 */
static struct cb_ops md_cb_ops = {
	mdopen,			/* open */
	mdclose,		/* close */
	mdstrategy,		/* strategy */
				/* print routine -- none yet */
	(int(*)(dev_t, char *))nulldev,
	mddump,			/* dump */
	mdread,			/* read */
	mdwrite,		/* write */
	mdioctl,		/* ioctl */
				/* devmap -- not supported */
	(int(*)(dev_t, devmap_cookie_t, offset_t, size_t, size_t *,
			uint_t))nodev,
				/* mmap -- not supported */
	(int(*)(dev_t, off_t, int))nodev,
				/* segmap -- not supported */
	(int(*)(dev_t, off_t, struct as *, caddr_t *, off_t, unsigned,
		unsigned, unsigned, cred_t *))nodev,
	nochpoll,		/* poll */
	mdprop_op,		/* prop_op */
	0,			/* streamtab */
	(D_64BIT|D_MP|D_NEW),	/* driver compatibility flag */
	CB_REV,			/* cb_ops version */
	mdaread,		/* aread */
	mdawrite,		/* awrite */
};

/*
 * Device operations table: wires mdinfo/mdattach/mddetach and the
 * cb_ops table above into the framework.
 */
static struct dev_ops md_devops = {
	DEVO_REV,		/* dev_ops version */
	0,			/* device reference count */
	mdinfo,			/* info routine */
	nulldev,		/* identify routine */
	nulldev,		/* probe - not defined */
	mdattach,		/* attach routine */
	mddetach,		/* detach routine */
	nodev,			/* reset - not defined */
	&md_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev			/* power management */
};

/*
 * loadable module wrapper
 */
#include <sys/modctl.h>

/* driver module description handed to the module framework */
static struct modldrv modldrv = {
	&mod_driverops,			/* type of module -- a pseudodriver */
	"Solaris Volume Manager base module", /* name of the module */
	&md_devops,			/* driver ops */
};

/* linkage used by _init(), _fini() and _info() below */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};


/* md_medd.c */
extern	void	med_init(void);
extern	void	med_fini(void);
extern  void	md_devid_cleanup(set_t, uint_t);

/* md_names.c */
extern void			*lookup_entry(struct nm_next_hdr *, set_t,
					side_t, mdkey_t, md_dev64_t, int);
extern struct nm_next_hdr	*get_first_record(set_t, int, int);
extern int			remove_entry(struct nm_next_hdr *,
					side_t, mdkey_t, int);

/*
 * I/O size tunables.  A value of zero means "not patched"; _init() then
 * fills in maxphys and MD_MAXBCOUNT respectively.
 */
int		md_maxphys	= 0;	/* maximum io size in bytes */
#define		MD_MAXBCOUNT	(1024 * 1024)
unsigned	md_maxbcount	= 0;	/* maximum physio size in bytes */

/*
 * Some md ioctls trigger io framework device tree operations.  An
 * example is md ioctls that call md_resolve_bydevid(): which uses the
 * io framework to resolve a devid. Such operations result in acquiring
 * io framework locks (like ndi_devi_enter() of "/") while holding
 * driver locks (like md_unit_writerlock()).
 *
 * The prop_op(9E) entry point is called from the devinfo driver with
 * an active ndi_devi_enter of "/". To avoid deadlock, md's prop_op
 * implementation must avoid taking a lock that is held per above md
 * ioctl description: i.e. mdprop_op(9E) can't call md_unit_readerlock()
 * without risking deadlock.
 *
 * To service "size" requests without risking deadlock, we maintain a
 * "mnum->nblocks" sizemap (protected by a short-term global mutex).
 */
static kmutex_t		md_nblocks_mutex;	/* protects md_nblocksmap */
static mod_hash_t	*md_nblocksmap;		/* mnum -> nblocks */
/* size passed to mod_hash_create_idhash() when the map is created */
int			md_nblocksmap_size = 512;

/*
 * md_nblocks_set:
 *	Maintain the "mnum->nblocks" sizemap consulted by mdprop_op().
 *
 * Create: any code that establishes a unit's un_total_blocks needs the
 * following type of call to establish nblocks for mdprop_op():
 *	md_nblocks_set(mnum, un->c.un_total_blocks);
 *	NOTE: locate via cscope md_create_minor_node/md_create_unit_incore
 *		...or  "MD_UNIT..*="
 *
 * Change: any code that changes a unit's un_total_blocks needs the
 * following type of call to sync nblocks for mdprop_op():
 *	md_nblocks_set(mnum, un->c.un_total_blocks);
 *	NOTE: locate via cscope for "un_total_blocks[ \t]*="
 *
 * Destroy: any code that deletes a unit needs the following type of call
 * to sync nblocks for mdprop_op():
 *	md_nblocks_set(mnum, -1ULL);
 *	NOTE: locate via cscope md_remove_minor_node/md_destroy_unit_incore
 *		...or  "MD_UNIT..*="
 *
 * The update is done under md_nblocks_mutex; the return values of the
 * hash operations are deliberately ignored.
 */
void
md_nblocks_set(minor_t mnum, uint64_t nblocks)
{
	mod_hash_key_t	hkey = (mod_hash_key_t)(intptr_t)mnum;

	mutex_enter(&md_nblocks_mutex);
	if (nblocks != -1ULL) {
		/* create or overwrite the entry for this minor */
		(void) mod_hash_replace(md_nblocksmap, hkey,
		    (mod_hash_val_t)(intptr_t)nblocks);
	} else {
		/* -1ULL is the "unit is gone" marker: drop the entry */
		(void) mod_hash_destroy(md_nblocksmap, hkey);
	}
	mutex_exit(&md_nblocks_mutex);
}

/*
 * md_nblocks_get:
 *	Look up the size of minor 'mnum' in the "mnum->nblocks" sizemap.
 *	Returns the recorded block count, or 0 if the map has no entry
 *	for that minor.  The lookup is done under md_nblocks_mutex.
 */
uint64_t
md_nblocks_get(minor_t mnum)
{
	mod_hash_val_t	val;
	uint64_t	nblocks = 0;

	mutex_enter(&md_nblocks_mutex);
	if (mod_hash_find(md_nblocksmap,
	    (mod_hash_key_t)(intptr_t)mnum, &val) == 0)
		nblocks = (uint64_t)(intptr_t)val;
	mutex_exit(&md_nblocks_mutex);

	return (nblocks);
}

/*
 * md_global_alloc_free:
 *	Set up (alloc != 0) or tear down (alloc == 0) the locks and
 *	condition variables that belong to the driver globals: the global
 *	ones first, then one group per set.  Teardown runs in the reverse
 *	order of setup.
 */
void
md_global_alloc_free(int alloc)
{
	set_t	setno;

	if (!alloc) {
		/* destroy per set driver global locks */
		for (setno = 0; setno < MD_MAXSETS; setno++) {
			cv_destroy(&md_set_io[setno].md_io_cv);
			mutex_destroy(&md_set_io[setno].md_io_mx);
			mutex_destroy(&md_set[setno].s_dbmx);
		}

		/* destroy driver global locks */
		mutex_destroy(&md_nblocks_mutex);
		mutex_destroy(&md_cpr_resync.md_resync_mutex);
		rw_destroy(&hsp_rwlp.lock);
		rw_destroy(&ni_rwlp.lock);
		rw_destroy(&nm_lock.lock);
		rw_destroy(&md_unit_array_rw.lock);
		mutex_destroy(&md_mx);
		cv_destroy(&md_cv);
		return;
	}

	/* initialize driver global locks */
	cv_init(&md_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&md_mx, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&md_unit_array_rw.lock, NULL, RW_DEFAULT, NULL);
	rw_init(&nm_lock.lock, NULL, RW_DEFAULT, NULL);
	rw_init(&ni_rwlp.lock, NULL, RW_DRIVER, NULL);
	rw_init(&hsp_rwlp.lock, NULL, RW_DRIVER, NULL);
	mutex_init(&md_cpr_resync.md_resync_mutex, NULL,
	    MUTEX_DEFAULT, NULL);
	mutex_init(&md_nblocks_mutex, NULL, MUTEX_DEFAULT, NULL);

	/* initialize per set driver global locks */
	for (setno = 0; setno < MD_MAXSETS; setno++) {
		mutex_init(&md_set[setno].s_dbmx,
		    NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&md_set_io[setno].md_io_mx,
		    NULL, MUTEX_DEFAULT, NULL);
		cv_init(&md_set_io[setno].md_io_cv,
		    NULL, CV_DEFAULT, NULL);
	}
}

int
_init(void)
{
	set_t	s;
	int	err;

	MD_SET_IN(IN_INIT);

	/* allocate dynamic space associated with driver globals */
	md_global_alloc_free(1);

	/* initialize driver globals */
	md_major = ddi_name_to_major("md");
	md_hz = drv_usectohz(NUM_USEC_IN_SEC);

	/* initialize tunable globals */
	if (md_maxphys == 0)		/* maximum io size in bytes */
		md_maxphys = maxphys;
	if (md_maxbcount == 0)		/* maximum physio size in bytes */
		md_maxbcount = MD_MAXBCOUNT;

	/* initialize per set driver globals */
	for (s = 0; s < MD_MAXSETS; s++)
		md_set_io[s].io_state = MD_SET_ACTIVE;

	/*
	 * NOTE: the framework does not currently guarantee exclusion
	 * between _init and attach after calling mod_install.
	 */
	MD_CLR_IN(IN_INIT);
	if ((err = mod_install(&modlinkage))) {
		MD_SET_IN(IN_INIT);
		md_global_alloc_free(0);	/* free dynamic space */
		MD_CLR_IN(IN_INIT);
	}
	return (err);
}

int
_fini(void)
{
	int	err;

	/*
	 * NOTE: the framework currently does not guarantee exclusion
	 * with attach until after mod_remove returns 0.
	 */
	if ((err = mod_remove(&modlinkage)))
		return (err);

	MD_SET_IN(IN_FINI);
	md_global_alloc_free(0);	/* free dynamic space */
	MD_CLR_IN(IN_FINI);
	return (err);
}

/*
 * _info:
 *	Module information entry point; hands the request to mod_info()
 *	together with this module's linkage.
 */
int
_info(struct modinfo *modinfop)
{
	int	rval;

	rval = mod_info(&modlinkage, modinfop);
	return (rval);
}

/*
 * mdattach:
 *	attach(9E) entry point.  Only DDI_ATTACH is accepted, and only
 *	while no devinfo node is recorded in md_devinfo.
 *
 *	Initializes the mddb layer, starts the daemons, clears the
 *	MD_GBL_HALTED status and reads the driver properties
 *	(md_init_debug, md_ff_disable, md_nmedh, md_med_trans_lst,
 *	md_xlate_ver/md_xlate/md_targ_nm_table, md_keep_repl_state and
 *	md_devid_destroy).  It then creates the "admin" minor node and
 *	allocates the unit arrays of set 0.  When svm_bootpath names an md
 *	device, the minor nodes of that root metadevice are created too.
 *
 * Input:
 *	dip	- devinfo node being attached
 *	cmd	- attach command
 *
 * Returns DDI_SUCCESS or DDI_FAILURE.  Once the internal data structures
 * have been set up, a failure is unwound by calling mddetach().
 */
/* ARGSUSED */
static int
mdattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	len;
	unit_t	i;
	size_t	sz;
	size_t	trans_sz;
	char	ver[VERSION_LENGTH];
	char	**maj_str_array;
	char	*str, *str2;

	MD_SET_IN(IN_ATTACH);
	md_in_upgrade = 0;
	md_keep_repl_state = 0;
	md_devid_destroy = 0;

	if (cmd != DDI_ATTACH) {
		MD_CLR_IN(IN_ATTACH);
		return (DDI_FAILURE);
	}

	/* only a single instance is supported */
	if (md_devinfo != NULL) {
		MD_CLR_IN(IN_ATTACH);
		return (DDI_FAILURE);
	}

	mddb_init();

	if (md_start_daemons(TRUE)) {
		MD_CLR_IN(IN_ATTACH);
		mddb_unload();		/* undo mddb_init() allocations */
		return (DDI_FAILURE);
	}

	/* clear the halted state */
	md_clr_status(MD_GBL_HALTED);

	/* see if the diagnostic switch is on */
	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_init_debug", 0))
		md_init_debug++;

	/* see if the failfast disable switch is on */
	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_ff_disable", 0))
		md_ff_disable++;

	/* try and get the md_nmedh property */
	md_nmedh = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "md_nmedh", MED_DEF_HOSTS);
	if ((md_nmedh <= 0) || (md_nmedh > MED_MAX_HOSTS))
		md_nmedh = MED_DEF_HOSTS;

	/* try and get the md_med_trans_lst property */
	len = 0;
	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN,
	    0, "md_med_trans_lst", NULL, &len) != DDI_PROP_SUCCESS ||
	    len == 0) {
		md_med_trans_lst = md_strdup("tcp");
	} else {
		/*
		 * Remember the allocation size: the buffer has to be
		 * freed with the size it was allocated with, whatever the
		 * lookup below leaves behind in len.
		 */
		trans_sz = (size_t)len;
		md_med_trans_lst = kmem_zalloc(trans_sz, KM_SLEEP);
		if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
		    0, "md_med_trans_lst", md_med_trans_lst, &len) !=
		    DDI_PROP_SUCCESS) {
			kmem_free(md_med_trans_lst, trans_sz);
			md_med_trans_lst = md_strdup("tcp");
		}
	}

	/*
	 * Must initialize the internal data structures before any
	 * possible 'goto attach_failure', as the detach routine used
	 * there references them.
	 */
	med_init();

	md_ops = (md_ops_t **)kmem_zalloc(
	    sizeof (md_ops_t *) * MD_NOPS, KM_SLEEP);
	md_mods = (ddi_modhandle_t *)kmem_zalloc(
	    sizeof (ddi_modhandle_t) * MD_NOPS, KM_SLEEP);

	/*
	 * Create the hash to store the meta device sizes.  mddetach()
	 * destroys this hash, so it has to exist before the first
	 * 'goto attach_failure' below.
	 */
	md_nblocksmap = mod_hash_create_idhash("md_nblocksmap",
	    md_nblocksmap_size, mod_hash_null_valdtor);

	/* try and get the md_xlate property */
	/* Should we only do this if upgrade? */
	/*
	 * At most 5 bytes are requested, so with the buffer zeroed first
	 * ver[] is always NUL terminated for the strcmp() below.
	 */
	bzero(ver, sizeof (ver));
	len = sizeof (char) * 5;
	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
	    0, "md_xlate_ver", ver, &len) == DDI_PROP_SUCCESS) {
		if (strcmp(ver, VERSION) == 0) {
			len = 0;
			if (ddi_prop_op(DDI_DEV_T_ANY, dip,
			    PROP_LEN_AND_VAL_ALLOC, 0, "md_xlate",
			    (caddr_t)&md_tuple_table, &len) !=
			    DDI_PROP_SUCCESS) {
				if (md_init_debug)
					cmn_err(CE_WARN,
					    "md_xlate ddi_prop_op failed");
				goto attach_failure;
			} else {
				/* each tuple is a pair of dev32_t values */
				md_tuple_length =
				    len/(2 * ((int)sizeof (dev32_t)));
				md_in_upgrade = 1;
			}

			/* Get target's name to major table */
			if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY,
			    dip, DDI_PROP_DONTPASS,
			    "md_targ_nm_table", &maj_str_array,
			    &md_majortab_len) != DDI_PROP_SUCCESS) {
				md_majortab_len = 0;
				if (md_init_debug)
					cmn_err(CE_WARN, "md_targ_nm_table "
					    "ddi_prop_lookup_string_array "
					    "failed");
				goto attach_failure;
			}

			md_major_tuple_table =
			    (struct md_xlate_major_table *)
			    kmem_zalloc(md_majortab_len *
			    sizeof (struct md_xlate_major_table), KM_SLEEP);

			/* each entry has the form "<driver name> <major>" */
			for (i = 0; i < md_majortab_len; i++) {
				/* Getting major name */
				str = strchr(maj_str_array[i], ' ');
				if (str == NULL)
					continue;
				/* split temporarily to copy the name */
				*str = '\0';
				md_major_tuple_table[i].drv_name =
				    md_strdup(maj_str_array[i]);

				/* Simplified atoi to get major number */
				str2 = str + 1;
				md_major_tuple_table[i].targ_maj = 0;
				while ((*str2 >= '0') && (*str2 <= '9')) {
					md_major_tuple_table[i].targ_maj *= 10;
					md_major_tuple_table[i].targ_maj +=
					    *str2++ - '0';
				}
				/* put the separator back */
				*str = ' ';
			}
			ddi_prop_free((void *)maj_str_array);
		} else {
			if (md_init_debug)
				cmn_err(CE_WARN, "md_xlate_ver is incorrect");
			goto attach_failure;
		}
	}

	/*
	 * Check for properties:
	 * 	md_keep_repl_state and md_devid_destroy
	 * and set globals if these exist.
	 */
	md_keep_repl_state = ddi_getprop(DDI_DEV_T_ANY, dip,
	    0, "md_keep_repl_state", 0);

	md_devid_destroy = ddi_getprop(DDI_DEV_T_ANY, dip,
	    0, "md_devid_destroy", 0);

	if (MD_UPGRADE)
		md_major_targ = md_targ_name_to_major("md");
	else
		md_major_targ = 0;

	/* allocate admin device node */
	if (ddi_create_priv_minor_node(dip, "admin", S_IFCHR,
	    MD_ADM_MINOR, DDI_PSEUDO, 0, NULL, PRIV_SYS_CONFIG, 0640))
		goto attach_failure;

	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_SUCCESS)
		goto attach_failure;

	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip,
	    "ddi-abrwrite-supported", 1) != DDI_SUCCESS)
		goto attach_failure;

	/* these could have been cleared by a detach */
	md_nunits = MD_MAXUNITS;
	md_nsets = MD_MAXSETS;

	sz = sizeof (void *) * MD_MAXUNITS;
	if (md_set[0].s_un == NULL)
		md_set[0].s_un = kmem_zalloc(sz, KM_SLEEP);
	if (md_set[0].s_ui == NULL)
		md_set[0].s_ui = kmem_zalloc(sz, KM_SLEEP);

	md_devinfo = dip;

	/*
	 * Only allocate device node for root mirror metadevice.
	 * Don't pre-allocate unnecessary device nodes (thus slowing down a
	 * boot when we attach).
	 * We can't read the mddbs in attach.  The mddbs will be read
	 * by metainit during the boot process when it is doing the
	 * auto-take processing and any other minor nodes will be
	 * allocated at that point.
	 *
	 * There are two scenarios to be aware of here:
	 * 1) when we are booting from a mirrored root we need the root
	 *    metadevice to exist very early (during vfs_mountroot processing)
	 * 2) we need all of the nodes to be created so that any mnttab entries
	 *    will succeed (handled by metainit reading the mddb during boot).
	 */
	if (strncmp(SVM_PSEUDO_STR, svm_bootpath, sizeof (SVM_PSEUDO_STR) - 1)
	    == 0) {
		char *p;
		int mnum = 0;

		/*
		 * The svm_bootpath string looks something like
		 * /pseudo/md@0:0,150,blk where 150 is the minor number
		 * in this example so we need to set the pointer p onto
		 * the first digit of the minor number and convert it
		 * from ascii.
		 */
		for (p = svm_bootpath + sizeof (SVM_PSEUDO_STR) + 1;
		    *p >= '0' && *p <= '9'; p++) {
			mnum *= 10;
			mnum += *p - '0';
		}

		if (md_create_minor_node(0, mnum)) {
			/*
			 * mddetach() frees every non-NULL unit array, so
			 * the pointers must be cleared here or the arrays
			 * would be freed a second time on the failure
			 * path below.
			 */
			kmem_free(md_set[0].s_un, sz);
			md_set[0].s_un = NULL;
			kmem_free(md_set[0].s_ui, sz);
			md_set[0].s_ui = NULL;
			goto attach_failure;
		}
	}

	MD_CLR_IN(IN_ATTACH);
	return (DDI_SUCCESS);

attach_failure:
	/*
	 * Use our own detach routine to toss any stuff we allocated above.
	 * NOTE: detach will call md_halt to free the mddb_init allocations.
	 */
	MD_CLR_IN(IN_ATTACH);
	if (mddetach(dip, DDI_DETACH) != DDI_SUCCESS)
		cmn_err(CE_WARN, "detach from attach failed");
	return (DDI_FAILURE);
}

/*
 * mddetach:
 *	detach(9E) entry point; also called by mdattach() to unwind a
 *	failed attach.  Only DDI_DETACH is accepted.
 *
 *	If the driver is not halted yet and check_active_locators()
 *	returns 0, a halt is initiated here.  The detach fails unless the
 *	driver ends up in the MD_GBL_HALTED state.  After that everything
 *	mdattach() set up is released: the per-set unit arrays, the non-FF
 *	driver list, the mediator transport list, the ops/module tables,
 *	the upgrade translation tables, all properties and minor nodes,
 *	the mediator state and the "mnum->nblocks" size map.
 *
 * Input:
 *	dip	- devinfo node being detached
 *	cmd	- detach command
 *
 * Returns DDI_SUCCESS or DDI_FAILURE.
 */
/* ARGSUSED */
static int
mddetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	extern int	check_active_locators();
	set_t		s;
	size_t		sz;
	int		len;

	MD_SET_IN(IN_DETACH);

	/* check command */
	if (cmd != DDI_DETACH) {
		MD_CLR_IN(IN_DETACH);
		return (DDI_FAILURE);
	}

	/*
	 * if we have not already halted yet we have no active config
	 * then automatically initiate a halt so we can detach.
	 */
	if (!(md_get_status() & MD_GBL_HALTED)) {
		if (check_active_locators() == 0) {
			/*
			 * NOTE: a successful md_halt will have done the
			 * mddb_unload to free allocations done in mddb_init
			 */
			if (md_halt(MD_NO_GBL_LOCKS_HELD)) {
				cmn_err(CE_NOTE, "md:detach: "
				    "Could not halt Solaris Volume Manager");
				MD_CLR_IN(IN_DETACH);
				return (DDI_FAILURE);
			}
		}

		/* fail detach if we have not halted */
		if (!(md_get_status() & MD_GBL_HALTED)) {
			MD_CLR_IN(IN_DETACH);
			return (DDI_FAILURE);
		}
	}

	/* must be in halted state, this will be cleared on next attach */
	ASSERT(md_get_status() & MD_GBL_HALTED);

	/* cleanup attach allocations and initializations */
	md_major_targ = 0;

	/* every unit array holds md_nunits pointers */
	sz = sizeof (void *) * md_nunits;
	for (s = 0; s < md_nsets; s++) {
		if (md_set[s].s_un != NULL) {
			kmem_free(md_set[s].s_un, sz);
			md_set[s].s_un = NULL;
		}

		if (md_set[s].s_ui != NULL) {
			kmem_free(md_set[s].s_ui, sz);
			md_set[s].s_ui = NULL;
		}
	}
	md_nunits = 0;
	md_nsets = 0;
	md_nmedh = 0;

	if (non_ff_drivers != NULL) {
		int	i;

		for (i = 0; non_ff_drivers[i] != NULL; i++)
			kmem_free(non_ff_drivers[i],
			    strlen(non_ff_drivers[i]) + 1);

		/* free i+1 entries because there is a null entry at list end */
		kmem_free(non_ff_drivers, (i + 1) * sizeof (char *));
		non_ff_drivers = NULL;
	}

	if (md_med_trans_lst != NULL) {
		kmem_free(md_med_trans_lst, strlen(md_med_trans_lst) + 1);
		md_med_trans_lst = NULL;
	}

	if (md_mods != NULL) {
		kmem_free(md_mods, sizeof (ddi_modhandle_t) * MD_NOPS);
		md_mods = NULL;
	}

	if (md_ops != NULL) {
		kmem_free(md_ops, sizeof (md_ops_t *) * MD_NOPS);
		md_ops = NULL;
	}

	if (MD_UPGRADE) {
		/* byte length of the translation table read at attach */
		len = md_tuple_length * (2 * ((int)sizeof (dev32_t)));
		md_in_upgrade = 0;
		md_xlate_free(len);
		md_majortab_free();
	}

	/*
	 * Undo what we did in mdattach, freeing resources
	 * and removing things we installed.  The system
	 * framework guarantees we are not active with this devinfo
	 * node in any other entry points at this time.
	 */
	ddi_prop_remove_all(dip);
	ddi_remove_minor_node(dip, NULL);

	med_fini();

	/*
	 * Only destroy the size map if there is one, and forget the
	 * pointer afterwards so that a later detach can neither
	 * dereference a NULL map nor destroy the same map twice.
	 */
	if (md_nblocksmap != NULL) {
		mod_hash_destroy_idhash(md_nblocksmap);
		md_nblocksmap = NULL;
	}

	md_devinfo = NULL;

	MD_CLR_IN(IN_DETACH);
	return (DDI_SUCCESS);
}


/*
 * mdinfo:
 *	getinfo(9E) entry point.
 *
 *	DDI_INFO_DEVT2DEVINFO	- hand back the devinfo pointer recorded by
 *				  mdattach(); fails if none is recorded.
 *	DDI_INFO_DEVT2INSTANCE	- the instance is always 0.
 *
 * Any other command fails.  Returns DDI_SUCCESS or DDI_FAILURE.
 */
/*ARGSUSED*/
static int
mdinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	if (infocmd == DDI_INFO_DEVT2INSTANCE) {
		*result = (void *)0;
		return (DDI_SUCCESS);
	}

	if (infocmd == DDI_INFO_DEVT2DEVINFO && md_devinfo) {
		*result = (void *)md_devinfo;
		return (DDI_SUCCESS);
	}

	return (DDI_FAILURE);
}

/*
 * mdprop_op:
 *	prop_op(9E) entry point.  The block count of the minor is taken
 *	from the "mnum->nblocks" sizemap and handed, together with the
 *	unmodified request, to ddi_prop_op_nblocks().
 */
static int
mdprop_op(
	dev_t dev,		/* device number associated with device */
	dev_info_t *dip,	/* device info struct for this device */
	ddi_prop_op_t prop_op,	/* property operator */
	int mod_flags,		/* property flags */
	char *name,		/* name of property */
	caddr_t valuep,		/* where to put property value */
	int *lengthp)		/* put length of property here */
{
	uint64_t	nblocks = md_nblocks_get(getminor(dev));

	return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
	    name, valuep, lengthp, nblocks));
}

/*
 * snarf_user_data:
 *	Walk the MDDB_USER records of set 'setno' and tag each one that is
 *	not already marked MD_PRV_GOTIT:
 *		MDDB_STALE	- left untouched
 *		MDDB_NODATA	- marked MD_PRV_PENDDEL
 *		otherwise	- expected to be MDDB_OK, marked MD_PRV_GOTIT
 */
static void
snarf_user_data(set_t setno)
{
	mddb_recid_t		recid;
	mddb_recstatus_t	status;

	for (recid = mddb_getnextrec(mddb_makerecid(setno, 0), MDDB_USER, 0);
	    recid > 0;
	    recid = mddb_getnextrec(recid, MDDB_USER, 0)) {
		/* already picked up */
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		status = mddb_getrecstatus(recid);
		if (status == MDDB_STALE)
			continue;

		if (status == MDDB_NODATA) {
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
		} else {
			ASSERT(status == MDDB_OK);
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
		}
	}
}

/*
 * md_print_block_usage:
 *	Warn about the block usage of the state database of set 's' when
 *	'blks' more blocks are wanted, then print one warning line for
 *	every non-deleted replica whose total block count is non-zero but
 *	smaller than (blocks in use + blks), naming the replica and the
 *	number of blocks it would have to grow by.
 */
static void
md_print_block_usage(mddb_set_t *s, uint_t blks)
{
	mddb_lb_t		*lbp = s->s_lbp;
	mddb_sidelocator_t	*slp;
	mddb_mb_ic_t		*mbip;
	md_splitname		sn;
	uint_t			needed;
	uint_t			have;
	size_t			prefixlen;
	size_t			suffixlen;
	char			*name;
	int			name_sz;
	int			drv_index;
	int			li;

	/* blocks currently in use plus the additional request */
	needed = s->s_totalblkcnt - s->s_freeblkcnt + blks;

	cmn_err(CE_WARN, "Blocks in Metadevice State Database: %d\n"
	    "            Additional Blocks Needed:            %d\n\n"
	    "            Increase size of following replicas for\n"
	    "            device relocatability by deleting listed\n"
	    "            replica and re-adding replica with\n"
	    "            increased size (see metadb(1M)):\n"
	    "                Replica                   Increase By",
	    s->s_totalblkcnt, (blks - s->s_freeblkcnt));

	for (li = 0; li < lbp->lb_loccnt; li++) {
		if (lbp->lb_locators[li].l_flags & MDDB_F_DELETED)
			continue;

		/* total size of this replica over all its master blocks */
		have = 0;
		mbip = s->s_mbiarray[li];
		while (mbip != NULL) {
			have += (uint_t)mbip->mbi_mddb_mb.mb_blkcnt;
			mbip = mbip->mbi_next;
		}

		/* nothing there, or already big enough */
		if (have == 0 || have >= needed)
			continue;

		slp = &lbp->lb_sidelocators[s->s_sideno][li];
		drv_index = slp->l_drvnm_index;
		mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno,
		    &sn);
		prefixlen = SPN_PREFIX(&sn).pre_len;
		suffixlen = SPN_SUFFIX(&sn).suf_len;

		/* "<prefix>/<suffix>" plus the terminating NUL */
		name_sz = (int)(prefixlen + suffixlen + 2);
		name = (char *)kmem_alloc(name_sz, KM_SLEEP);
		(void) strncpy(name, SPN_PREFIX(&sn).pre_data,
		    prefixlen);
		name[prefixlen] = '/';
		(void) strncpy(name + (prefixlen + 1),
		    SPN_SUFFIX(&sn).suf_data, suffixlen);
		name[prefixlen + suffixlen + 1] = '\0';

		cmn_err(CE_WARN,
		    "  %s (%s:%d:%d)   %d blocks",
		    name, lbp->lb_drvnm[drv_index].dn_data,
		    slp->l_mnum, lbp->lb_locators[li].l_blkno,
		    (needed - have));
		kmem_free(name, name_sz);
	}
}

/*
 * md_create_minor_node:
 *	Create the block ("<set>,<unit>,blk") and raw ("<set>,<unit>,raw")
 *	minor nodes for the given set and un_self_id on md_devinfo.
 *
 * Input:
 *	setno	- set number, must be below MD_MAXSETS
 *	mnum	- selfID of unit, its unit part must be below MD_MAXUNITS
 *
 * Output:
 *	None.
 *
 * Returns 0 for success, 1 for failure (bad argument, or either node
 * could not be created).
 */
int
md_create_minor_node(set_t setno, minor_t mnum)
{
	char		name[20];

	/* Check for valid arguments */
	if (setno >= MD_MAXSETS)
		return (1);
	if (MD_MIN2UNIT(mnum) >= MD_MAXUNITS)
		return (1);

	/* block device node */
	(void) snprintf(name, sizeof (name), "%u,%u,blk",
	    (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
	if (ddi_create_minor_node(md_devinfo, name, S_IFBLK,
	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0) != 0)
		return (1);

	/* character (raw) device node, same minor number */
	(void) snprintf(name, sizeof (name), "%u,%u,raw",
	    (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
	if (ddi_create_minor_node(md_devinfo, name, S_IFCHR,
	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0) != 0)
		return (1);

	return (0);
}

/*
 * For a given key check if it is an orphaned record.
 * The following conditions are used to determine an orphan.
 * 1. The device associated with that key is not a metadevice.
 * 2. If DEVID_STYLE then the physical device does not have a device Id
 * associated with it.
 *
 * If a key does not have an entry in the devid namespace it could be
 * a device that does not support device ids. Hence the record is not
 * deleted.
 */

static int
md_verify_orphaned_record(set_t setno, mdkey_t key)
{
	mddb_set_t		*setp = (mddb_set_t *)md_set[setno].s_db;
	struct nm_next_hdr	*devid_nh;
	md_dev64_t		dev;	/* device the key resolves to */
	side_t			side = 0;

	/* No devid namespace record: nothing to compare against */
	devid_nh = get_first_record(setno, 1, (NM_DEVID | NM_NOTSHARED));
	if (devid_nh == NULL)
		return (0);

	/* Only a devid-style replica can have orphans in this sense */
	if (!(setp->s_lbp->lb_flags & MDDB_DEVID_STYLE))
		return (0);

	/* Resolve the key with MD_NOTRUST_DEVT */
	dev = md_getdevnum(setno, side, key, MD_NOTRUST_DEVT);
	if (dev == NODEV64)
		return (0);

	/* Keys whose major number is md's own are never orphans */
	if (md_getmajor(dev) == md_major)
		return (0);

	/* Orphan (1) exactly when the devid namespace has no entry */
	if (lookup_entry(devid_nh, setno, side, key, dev, NM_DEVID) == NULL)
		return (1);

	return (0);
}

/*
 * md_snarf_db_set:
 *	Load ("snarf") the in-core state of a set from its state database.
 *
 *	The work is single-threaded per set between md_haltsnarf_enter()
 *	and md_haltsnarf_exit().  In order, the routine:
 *	  - returns immediately if the set already has MD_SET_SNARFED,
 *	  - starts the md daemons if MD_GBL_DAEMONS_LIVE is not set,
 *	  - loads the devid and primary namespaces,
 *	  - optionally converts a non-devid replica to devid format,
 *	  - walks all records, loading any submodule they reference,
 *	  - calls every registered md_snarf(MD_SNARF_DOIT) until none
 *	    reports new work,
 *	  - recomputes s_un_next / s_un_avail for the set,
 *	  - commits records flagged MD_PRV_COMMIT, then deletes records
 *	    flagged MD_PRV_DELETE,
 *	  - calls md_snarf(MD_SNARF_CLEANUP) until nothing is cleaned,
 *	  - removes orphaned namespace entries and cleans the devid
 *	    namespace (both skipped when the set is stale).
 *
 * Input:
 *	setno	- set number
 *	ep	- error structure to fill in on failure; may be NULL
 *		  (mdopen() passes NULL), in which case no error detail
 *		  is recorded
 *
 * Returns 0 on success (or if the set was already snarfed), -1 on
 * failure.  Note that if the module-load scan ends with a negative
 * record id the routine leaves early with err still 0.
 */
int
md_snarf_db_set(set_t setno, md_error_t *ep)
{
	int			err = 0;
	int			i;
	mddb_recid_t		recid;
	mddb_type_t		drvrid;
	mddb_recstatus_t	status;
	md_ops_t		*ops;
	uint_t			privat;
	mddb_set_t		*s;
	uint_t			cvt_blks;
	struct nm_next_hdr	*nh;
	mdkey_t			key = MD_KEYWILD;
	side_t			side = 0;
	int			size;
	int			devid_flag;
	int			retval;
	uint_t			un;
	int			un_next_set = 0;

	md_haltsnarf_enter(setno);

	/* Nothing to do if some earlier caller already snarfed this set */
	mutex_enter(&md_mx);
	if (md_set[setno].s_status & MD_SET_SNARFED) {
		mutex_exit(&md_mx);
		md_haltsnarf_exit(setno);
		return (0);
	}
	mutex_exit(&md_mx);

	if (! (md_get_status() & MD_GBL_DAEMONS_LIVE)) {
		if (md_start_daemons(TRUE)) {
			if (ep != NULL)
				(void) mdsyserror(ep, ENXIO);
			err = -1;
			goto out;
		}
	}


	/*
	 * Load the devid name space if it exists
	 */
	(void) md_load_namespace(setno, NULL, NM_DEVID);
	if (!md_load_namespace(setno, ep, 0L)) {
		/*
		 * Unload the devid namespace
		 */
		(void) md_unload_namespace(setno, NM_DEVID);
		err = -1;
		goto out;
	}

	/*
	 * If replica is in non-devid state, convert if:
	 * 	- not in probe during upgrade (md_keep_repl_state = 0)
	 * 	- enough space available in replica
	 *	- local set
	 *	- not a multi-node diskset
	 *	- clustering is not present (for non-local set)
	 */
	s = (mddb_set_t *)md_set[setno].s_db;
	devid_flag = 0;
	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE) && !md_keep_repl_state)
		devid_flag = 1;
	if (cluster_bootflags & CLUSTER_CONFIGURED)
		if (setno != MD_LOCAL_SET)
			devid_flag = 0;
	if (MD_MNSET_SETNO(setno))
		devid_flag = 0;
	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
		devid_flag = 0;

	/*
	 * if we weren't devid style before and md_keep_repl_state=1
	 * we need to stay non-devid
	 */
	if ((md_keep_repl_state == 1) &&
	    ((s->s_lbp->lb_flags & MDDB_DEVID_STYLE) == 0))
		devid_flag = 0;
	if (devid_flag) {
		/*
		 * Determine number of free blocks needed to convert
		 * entire replica to device id format - locator blocks
		 * and namespace.
		 */
		cvt_blks = 0;
		if (mddb_lb_did_convert(s, 0, &cvt_blks) != 0) {
			if (ep != NULL)
				(void) mdsyserror(ep, EIO);
			err = -1;
			goto out;

		}
		cvt_blks += md_nm_did_chkspace(setno);

		/* add MDDB_DEVID_CONV_PERC% */
		if ((md_conv_perc > 0) && (md_conv_perc <= 100)) {
			cvt_blks = cvt_blks * (100 + md_conv_perc) / 100;
		}

		if (cvt_blks <= s->s_freeblkcnt) {
			/* Enough room: do the conversion for real */
			if (mddb_lb_did_convert(s, 1, &cvt_blks) != 0) {
				if (ep != NULL)
					(void) mdsyserror(ep, EIO);
				err = -1;
				goto out;
			}

		} else {
			/*
			 * Print message that replica can't be converted for
			 * lack of space.   No failure - just continue to
			 * run without device ids.
			 */
			cmn_err(CE_WARN,
			    "Unable to add Solaris Volume Manager device "
			    "relocation data.\n"
			    "          To use device relocation feature:\n"
			    "          - Increase size of listed replicas\n"
			    "          - Reboot");
			md_print_block_usage(s, cvt_blks);
			cmn_err(CE_WARN,
			    "Loading set without device relocation data.\n"
			    "          Solaris Volume Manager disk movement "
			    "not tracked in local set.");
		}
	}

	/*
	 * go through and load any modules referenced in
	 * data base
	 */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		status = mddb_getrecstatus(recid);
		if (status == MDDB_STALE) {
			/* Warn only on the first stale record seen */
			if (! (md_get_setstatus(setno) & MD_SET_STALE)) {
				md_set_setstatus(setno, MD_SET_STALE);
				cmn_err(CE_WARN,
				    "md: state database is stale");
			}
		} else if (status == MDDB_NODATA) {
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		drvrid = mddb_getrectype1(recid);
		if (drvrid < MDDB_FIRST_MODID)
			continue;
		if (md_loadsubmod(setno, md_getshared_name(setno, drvrid),
		    drvrid) < 0) {
			cmn_err(CE_NOTE, "md: could not load misc/%s",
			    md_getshared_name(setno, drvrid));
		}
	}

	if (recid < 0)
		goto out;

	snarf_user_data(setno);

	/*
	 * Initialize the md_nm_snarfed array
	 * this array is indexed by the key and
	 * is set by md_getdevnum during the snarf time
	 */
	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) != NULL) {
		size = (int)((((struct nm_rec_hdr *)nh->nmn_record)->
		    r_next_key) * (sizeof (int)));
		md_nm_snarfed = (int *)kmem_zalloc(size, KM_SLEEP);
	}

	/*
	 * go through and snarf until nothing gets added
	 */
	do {
		i = 0;
		for (ops = md_opslist; ops != NULL; ops = ops->md_next) {
			if (ops->md_snarf != NULL) {
				retval = ops->md_snarf(MD_SNARF_DOIT, setno);
				if (retval == -1) {
					err = -1;
					/*
					 * Don't know the failed unit.
					 * ep is optional (mdopen() passes
					 * NULL), so guard it here just as
					 * the other error paths do.
					 */
					if (ep != NULL)
						(void) mdmderror(ep,
						    MDE_RR_ALLOC_ERROR, 0);
					(void) md_halt_set(setno, MD_HALT_ALL);
					(void) mddb_unload_set(setno);
					md_haltsnarf_exit(setno);
					return (err);
				} else {
					i += retval;
				}
			}
		}
	} while (i);

	/*
	 * Set the first available slot and availability
	 */
	md_set[setno].s_un_avail = 0;
	for (un = 0; un < MD_MAXUNITS; un++) {
		if (md_set[setno].s_un[un] != NULL) {
			continue;
		} else {
			if (!un_next_set) {
				md_set[setno].s_un_next = un;
				un_next_set = 1;
			}
			md_set[setno].s_un_avail++;
		}
	}

	md_set_setstatus(setno, MD_SET_SNARFED);

	/* Commit every record a snarf routine flagged MD_PRV_COMMIT */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		privat = mddb_getrecprivate(recid);
		if (privat & MD_PRV_COMMIT) {
			if (mddb_commitrec(recid)) {
				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
					md_set_setstatus(setno, MD_SET_STALE);
					cmn_err(CE_WARN,
					    "md: state database is stale");
				}
			}
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
		}
	}

	/* Deletes must happen after all the commits */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
		privat = mddb_getrecprivate(recid);
		if (privat & MD_PRV_DELETE) {
			if (mddb_deleterec(recid)) {
				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
					md_set_setstatus(setno, MD_SET_STALE);
					cmn_err(CE_WARN,
					    "md: state database is stale");
				}
				/*
				 * Delete failed: re-mark the record so the
				 * restarted scan does not try it again.
				 */
				mddb_setrecprivate(recid, MD_PRV_GOTIT);
			}
			/* Restart the walk from the first record */
			recid = mddb_makerecid(setno, 0);
		}
	}

	/*
	 * go through and clean up records until nothing gets cleaned up.
	 */
	do {
		i = 0;
		for (ops = md_opslist; ops != NULL; ops = ops->md_next)
			if (ops->md_snarf != NULL)
				i += ops->md_snarf(MD_SNARF_CLEANUP, setno);
	} while (i);

	if (md_nm_snarfed != NULL &&
	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
		/*
		 * go thru and cleanup the namespace and the device id
		 * name space
		 */
		for (key = 1;
		    key < ((struct nm_rec_hdr *)nh->nmn_record)->r_next_key;
		    key++) {
			/*
			 * Is the entry an 'orphan'?
			 */
			if (lookup_entry(nh, setno, side, key, NODEV64, 0L) !=
			    NULL) {
				/*
				 * If the value is not set then apparently
				 * it is not part of the current configuration,
				 * remove it this can happen when system panic
				 * between the primary name space update and
				 * the device id name space update
				 */
				if (md_nm_snarfed[key] == 0) {
					if (md_verify_orphaned_record(setno,
					    key) == 1)
						(void) remove_entry(nh,
						    side, key, 0L);
				}
			}
		}
	}

	if (md_nm_snarfed != NULL) {
		/*
		 * Done and free the memory
		 */
		kmem_free(md_nm_snarfed, size);
		md_nm_snarfed = NULL;
	}

	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE &&
	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
		/*
		 * if the destroy flag has been set and
		 * the MD_SET_DIDCLUP bit is not set in
		 * the set's status field, cleanup the
		 * entire device id namespace
		 */
		if (md_devid_destroy &&
		    !(md_get_setstatus(setno) & MD_SET_DIDCLUP)) {
			(void) md_devid_cleanup(setno, 1);
			md_set_setstatus(setno, MD_SET_DIDCLUP);
		} else
			(void) md_devid_cleanup(setno, 0);
	}

	/*
	 * clear single threading on snarf, return success or error
	 */
out:
	md_haltsnarf_exit(setno);
	return (err);
}

void
get_minfo(struct dk_minfo *info, minor_t mnum)
{
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	md_unit_t	*unitp;

	/* No in-core unit for this minor: report an all-zero geometry */
	if (ui == NULL) {
		info->dki_capacity = 0;
		info->dki_lbsize = 0;
		info->dki_media_type = 0;
		return;
	}

	/* Read the unit's size under its reader lock */
	unitp = (md_unit_t *)md_unit_readerlock(ui);
	info->dki_capacity = unitp->c.un_total_blocks;
	md_unit_readerexit(ui);

	info->dki_lbsize = DEV_BSIZE;
	info->dki_media_type = DK_UNKNOWN;
}


void
get_info(struct dk_cinfo *info, minor_t mnum)
{
	/*
	 * Controller fields are taken from the parent node of md_devinfo.
	 */
	(void) strcpy(info->dki_cname,
	    ddi_get_name(ddi_get_parent(md_devinfo)));
	info->dki_cnum = ddi_get_instance(ddi_get_parent(md_devinfo));
	info->dki_ctype = DKC_MD;

	/*
	 * Unit fields: the unit is reported as the minor number and the
	 * drive name as the driver name of md_devinfo.
	 */
	(void) strcpy(info->dki_dname, ddi_driver_name(md_devinfo));
	info->dki_unit = mnum;
	info->dki_maxtransfer = (ushort_t)(md_maxphys / DEV_BSIZE);
	info->dki_slave = 0;
	info->dki_flags = 0;
	info->dki_partition = 0;

	/*
	 * Fields we have no value for yet are reported as zero.
	 */
	info->dki_addr = 0;
	info->dki_space = 0;
	info->dki_prio = 0;
	info->dki_vec = 0;
}

/*
 * open admin device
 */
static int
mdadminopen(
	int	flag,
	int	otyp)
{
	int	rc;

	/* md_mx serializes all updates to md_status and md_ocnt[] */
	mutex_enter(&md_mx);

	if ((otyp != OTYP_CHR) && (otyp != OTYP_LYR)) {
		/* only character and layered opens are accepted */
		rc = EINVAL;
	} else if ((md_status & MD_GBL_EXCL) ||
	    ((flag & FEXCL) && (md_status & MD_GBL_OPEN))) {
		/*
		 * Already held exclusively, or an exclusive open was
		 * requested while the device is open.
		 */
		rc = EBUSY;
	} else {
		/* record the open, and exclusivity if requested */
		md_ocnt[otyp]++;
		md_status |= MD_GBL_OPEN;
		if (flag & FEXCL)
			md_status |= MD_GBL_EXCL;
		rc = 0;
	}

	mutex_exit(&md_mx);
	return (rc);
}

/*
 * open entry point
 */
/*
 * Open a metadevice (or, for MD_ADM_MINOR, the admin device).
 *
 * All checks below run with md_unit_array_rw.lock held as reader; the
 * lock is released at "out".  Returns 0 on success or an errno value:
 *	ENODEV	- driver halted, or the set could not be / is not snarfed
 *	ENXIO	- set/unit out of range, no such unit, or unit not openable
 *	EROFS	- write open of a stale set or of a unit that has a parent
 * or whatever the underlying md_open / md_unit_incopen() returns.
 */
static int
mdopen(
	dev_t		*dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p)
{
	minor_t		mnum = getminor(*dev);
	unit_t		unit = MD_MIN2UNIT(mnum);
	set_t		setno = MD_MIN2SET(mnum);
	mdi_unit_t	*ui = NULL;
	int		err = 0;
	md_parent_t	parent;

	/* dispatch admin device opens */
	if (mnum == MD_ADM_MINOR)
		return (mdadminopen(flag, otyp));

	/* lock, check status */
	rw_enter(&md_unit_array_rw.lock, RW_READER);

	/*
	 * Re-entered (with the reader lock held) after waiting for an
	 * in-progress open; every check is repeated from here.
	 */
tryagain:
	if (md_get_status() & MD_GBL_HALTED)  {
		err = ENODEV;
		goto out;
	}

	/* check minor */
	if ((setno >= md_nsets) || (unit >= md_nunits)) {
		err = ENXIO;
		goto out;
	}

	/*
	 * make sure we're snarfed: the local set is snarfed on demand
	 * (no error structure is passed), any other set must already be.
	 */
	if ((md_get_setstatus(MD_LOCAL_SET) & MD_SET_SNARFED) == 0) {
		if (md_snarf_db_set(MD_LOCAL_SET, NULL) != 0) {
			err = ENODEV;
			goto out;
		}
	}
	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) {
		err = ENODEV;
		goto out;
	}

	/* check unit */
	if ((ui = MDI_UNIT(mnum)) == NULL) {
		err = ENXIO;
		goto out;
	}

	/*
	 * The softpart open routine may do an I/O during the open, in
	 * which case the open routine will set the OPENINPROGRESS flag
	 * and drop all locks during the I/O.  If this thread sees
	 * the OPENINPROGRESS flag set, it should wait until the flag
	 * is reset before calling the driver's open routine.  It must
	 * also revalidate the world after it grabs the unit_array lock
	 * since the set may have been released or the metadevice cleared
	 * during the sleep.
	 */
	if (MD_MNSET_SETNO(setno)) {
		mutex_enter(&ui->ui_mx);
		if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
			/*
			 * Drop the array lock while sleeping; it is taken
			 * again before ui_mx is released.
			 */
			rw_exit(&md_unit_array_rw.lock);
			cv_wait(&ui->ui_cv, &ui->ui_mx);
			rw_enter(&md_unit_array_rw.lock, RW_READER);
			mutex_exit(&ui->ui_mx);
			goto tryagain;
		}
		mutex_exit(&ui->ui_mx);
	}

	/* Test if device is openable */
	if ((ui->ui_tstate & MD_NOTOPENABLE) != 0) {
		err = ENXIO;
		goto out;
	}

	/* don't allow opens w/WRITE flag if stale */
	if ((flag & FWRITE) && (md_get_setstatus(setno) & MD_SET_STALE)) {
		err = EROFS;
		goto out;
	}

	/* don't allow writes to subdevices */
	parent = md_get_parent(md_expldev(*dev));
	if ((flag & FWRITE) && MD_HAS_PARENT(parent)) {
		err = EROFS;
		goto out;
	}

	/* open underlying driver */
	if (md_ops[ui->ui_opsindex]->md_open != NULL) {
		if ((err = (*md_ops[ui->ui_opsindex]->md_open)
		    (dev, flag, otyp, cred_p, 0)) != 0)
			goto out;
	}

	/* or do it ourselves */
	else {
		/* single thread */
		(void) md_unit_openclose_enter(ui);
		err = md_unit_incopen(mnum, flag, otyp);
		md_unit_openclose_exit(ui);
		if (err != 0)
			goto out;
	}

	/* unlock, return status */
out:
	rw_exit(&md_unit_array_rw.lock);
	return (err);
}

/*
 * close admin device
 */
static int
mdadminclose(
	int	otyp)
{
	int	idx;
	int	rc = 0;

	/* md_mx serializes all updates to md_status and md_ocnt[] */
	mutex_enter(&md_mx);

	if ((otyp < 0) || (otyp >= OTYPCNT)) {
		/* not a valid open type */
		rc = EINVAL;
	} else if (md_ocnt[otyp] == 0) {
		/* nothing of this type is open */
		rc = ENXIO;
	} else {
		/*
		 * Layered opens are counted individually; any other type
		 * is cleared in one go.
		 */
		if (otyp == OTYP_LYR)
			md_ocnt[otyp]--;
		else
			md_ocnt[otyp] = 0;

		/* Recompute MD_GBL_OPEN from the remaining counts */
		md_status &= ~MD_GBL_OPEN;
		for (idx = 0; idx < OTYPCNT; idx++) {
			if (md_ocnt[idx] != 0)
				md_status |= MD_GBL_OPEN;
		}

		/* Last close also drops exclusivity */
		if ((md_status & MD_GBL_OPEN) == 0)
			md_status &= ~MD_GBL_EXCL;
	}

	mutex_exit(&md_mx);
	return (rc);
}

/*
 * close entry point
 */
static int
mdclose(
	dev_t		dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p)
{
	minor_t		mnum = getminor(dev);
	set_t		set_id = MD_MIN2SET(mnum);
	unit_t		unit_id = MD_MIN2UNIT(mnum);
	mdi_unit_t	*ui;
	int		rc;

	/* The admin device has its own close handling */
	if (mnum == MD_ADM_MINOR)
		return (mdadminclose(otyp));

	/* Minor must name a set and unit in range ... */
	if ((set_id >= md_nsets) || (unit_id >= md_nunits))
		return (ENXIO);

	/* ... and an existing in-core unit */
	ui = MDI_UNIT(mnum);
	if (ui == NULL)
		return (ENXIO);

	/* Prefer the sub-driver's own close routine when it has one */
	if (md_ops[ui->ui_opsindex]->md_close != NULL) {
		return ((*md_ops[ui->ui_opsindex]->md_close)
		    (dev, flag, otyp, cred_p, 0));
	}

	/* Otherwise drop the open count ourselves, single threaded */
	(void) md_unit_openclose_enter(ui);
	rc = md_unit_decopen(mnum, otyp);
	md_unit_openclose_exit(ui);

	return (rc);
}


/*
 * This routine performs raw read operations.  It is called from the
 * device switch at normal priority.
 *
 * The main catch is that the *uio struct which is passed to us may
 * specify a read which spans two buffers, which would be contiguous
 * on a single partition,  but not on a striped partition. This will
 * be handled by mdstrategy.
 */
/*ARGSUSED*/
static int
mdread(dev_t dev, struct uio *uio, cred_t *credp)
{
	minor_t		mnum = getminor(dev);
	mdi_unit_t	*ui;
	int		rc;

	/* The admin minor cannot be read */
	if (mnum == MD_ADM_MINOR)
		return (ENXIO);

	/* Set and unit must be in range and the unit must exist */
	if ((MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits))
		return (ENXIO);
	ui = MDI_UNIT(mnum);
	if (ui == NULL)
		return (ENXIO);

	/* A sub-driver supplied read routine takes over completely */
	if (md_ops[ui->ui_opsindex]->md_read != NULL) {
		return ((*md_ops[ui->ui_opsindex]->md_read)
		    (dev, uio, credp));
	}

	/* Default path: validate the uio, then go through physio */
	rc = md_chk_uio(uio);
	if (rc != 0)
		return (rc);

	return (physio(mdstrategy, NULL, dev, B_READ, md_minphys, uio));
}

/*
 * This routine performs async raw read operations.  It is called from the
 * device switch at normal priority.
 *
 * The main catch is that the *aio struct which is passed to us may
 * specify a read which spans two buffers, which would be contiguous
 * on a single partition,  but not on a striped partition. This will
 * be handled by mdstrategy.
 */
/*ARGSUSED*/
static int
mdaread(dev_t dev, struct aio_req *aio, cred_t *credp)
{
	minor_t		mnum = getminor(dev);
	mdi_unit_t	*ui;
	int		rc;

	/* The admin minor cannot be read */
	if (mnum == MD_ADM_MINOR)
		return (ENXIO);

	/* Set and unit must be in range and the unit must exist */
	if ((MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits))
		return (ENXIO);
	ui = MDI_UNIT(mnum);
	if (ui == NULL)
		return (ENXIO);

	/* A sub-driver supplied async read routine takes over completely */
	if (md_ops[ui->ui_opsindex]->md_aread != NULL) {
		return ((*md_ops[ui->ui_opsindex]->md_aread)
		    (dev, aio, credp));
	}

	/* Default path: validate the request's uio, then use aphysio */
	rc = md_chk_uio(aio->aio_uio);
	if (rc != 0)
		return (rc);

	return (aphysio(mdstrategy, anocancel, dev, B_READ, md_minphys, aio));
}

/*
 * This routine performs raw write operations.	It is called from the
 * device switch at normal priority.
 *
 * The main catch is that the *uio struct which is passed to us may
 * specify a write which spans two buffers, which would be contiguous
 * on a single partition,  but not on a striped partition. This is
 * handled by mdstrategy.
 *
 */
/*ARGSUSED*/
static int
mdwrite(dev_t dev, struct uio *uio, cred_t *credp)
{
	minor_t		mnum = getminor(dev);
	mdi_unit_t	*ui;
	int		rc;

	/* The admin minor cannot be written */
	if (mnum == MD_ADM_MINOR)
		return (ENXIO);

	/* Set and unit must be in range and the unit must exist */
	if ((MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits))
		return (ENXIO);
	ui = MDI_UNIT(mnum);
	if (ui == NULL)
		return (ENXIO);

	/* A sub-driver supplied write routine takes over completely */
	if (md_ops[ui->ui_opsindex]->md_write != NULL) {
		return ((*md_ops[ui->ui_opsindex]->md_write)
		    (dev, uio, credp));
	}

	/* Default path: validate the uio, then go through physio */
	rc = md_chk_uio(uio);
	if (rc != 0)
		return (rc);

	return (physio(mdstrategy, NULL, dev, B_WRITE, md_minphys, uio));
}

/*
 * This routine performs async raw write operations.  It is called from the
 * device switch at normal priority.
 *
 * The main catch is that the *aio struct which is passed to us may
 * specify a write which spans two buffers, which would be contiguous
 * on a single partition,  but not on a striped partition. This is
 * handled by mdstrategy.
 *
 */
/*ARGSUSED*/
static int
mdawrite(dev_t dev, struct aio_req *aio, cred_t *credp)
{
	minor_t		mnum = getminor(dev);
	mdi_unit_t	*ui;
	int		rc;

	/* The admin minor cannot be written */
	if (mnum == MD_ADM_MINOR)
		return (ENXIO);

	/* Set and unit must be in range and the unit must exist */
	if ((MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits))
		return (ENXIO);
	ui = MDI_UNIT(mnum);
	if (ui == NULL)
		return (ENXIO);

	/* A sub-driver supplied async write routine takes over completely */
	if (md_ops[ui->ui_opsindex]->md_awrite != NULL) {
		return ((*md_ops[ui->ui_opsindex]->md_awrite)
		    (dev, aio, credp));
	}

	/* Default path: validate the request's uio, then use aphysio */
	rc = md_chk_uio(aio->aio_uio);
	if (rc != 0)
		return (rc);

	return (aphysio(mdstrategy, anocancel, dev, B_WRITE, md_minphys, aio));
}

int
mdstrategy(struct buf *bp)
{
	minor_t		mnum;
	mdi_unit_t	*ui;

	/* The buffer must not already be marked done */
	ASSERT((bp->b_flags & B_DONE) == 0);

	/* Once the system is panicking, stop treating the daemons as live */
	if (panicstr)
		md_clr_status(MD_GBL_DAEMONS_LIVE);

	/*
	 * Fail the buffer with ENXIO when it targets the admin minor, an
	 * out-of-range set or unit, or a unit that does not exist.
	 */
	mnum = getminor(bp->b_edev);
	if ((mnum == MD_ADM_MINOR) ||
	    (MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL)) {
		bp->b_flags |= B_ERROR;
		bp->b_error = ENXIO;
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		return (0);
	}

	/* Start from a clean error/done state before handing the buf on */
	bp->b_flags &= ~(B_ERROR | B_DONE);

	/* A sub-driver without a strategy routine cannot do I/O */
	if (md_ops[ui->ui_opsindex]->md_strategy == NULL) {
		(void) errdone(ui, bp, ENXIO);
		return (0);
	}

	(*md_ops[ui->ui_opsindex]->md_strategy) (bp, 0, NULL);
	return (0);
}

/*
 * Return true if the ioctl is allowed to be multithreaded.
 * All the ioctls with MN are sent only from the message handlers through
 * rpc.mdcommd, which (via its own locking mechanism) takes care that no two
 * ioctls for the same metadevice are issued at the same time.
 * So we are safe here.
 * The other ioctls do not mess with any metadevice structures and therefore
 * are harmless too, if called multiple times at the same time.
 */
static boolean_t
is_mt_ioctl(int cmd)
{
	boolean_t	mt_ok;

	/* Every command listed here may run multithreaded */
	switch (cmd) {
	case MD_IOCGUNIQMSGID:
	case MD_IOCGVERSION:
	case MD_IOCISOPEN:
	case MD_MN_SET_MM_OWNER:
	case MD_MN_SET_STATE:
	case MD_MN_SUSPEND_WRITES:
	case MD_MN_ALLOCATE_HOTSPARE:
	case MD_MN_SET_SETFLAGS:
	case MD_MN_GET_SETFLAGS:
	case MD_MN_MDDB_OPTRECFIX:
	case MD_MN_MDDB_PARSE:
	case MD_MN_MDDB_BLOCK:
	case MD_MN_DB_USERREQ:
	case MD_IOC_SPSTATUS:
	case MD_MN_COMMD_ERR:
	case MD_MN_SET_COMMD_RUNNING:
	case MD_MN_RESYNC:
	case MD_MN_SETSYNC:
	case MD_MN_POKE_HOTSPARES:
		mt_ok = 1;
		break;
	default:
		/* anything else needs the global ioctl lock */
		mt_ok = 0;
		break;
	}

	return (mt_ok);
}

/*
 * This routine implements the ioctl calls for the Virtual Disk System.
 * It is called from the device switch at normal priority.
 */
/* ARGSUSED */
static int
mdioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cred_p,
	int *rval_p)
{
	minor_t		mnum = getminor(dev);
	int		mt_ioctl = is_mt_ioctl(cmd);
	mdi_unit_t	*ui;
	IOLOCK		lock;
	int		err;

	/*
	 * For multinode disksets a number of ioctls are allowed to be
	 * multithreaded.  The underlying assumption is that such an
	 * ioctl either does not interact with other md structures, or
	 * that an ioctl to the admin device can only occur while the
	 * metadevice is open - i.e. there is no race between metaclear
	 * and a multithreaded ioctl in progress.
	 *
	 * Every other ioctl must take the global ioctl lock first.
	 */
	if (!mt_ioctl && md_ioctl_lock_enter() == EINTR)
		return (EINTR);

	/* Set up the tracker that remembers which locks we hold */
	IOLOCK_INIT(&lock);

	if (mt_ioctl) {
		/*
		 * Count the multithreaded ioctl and note in the tracker
		 * that the global ioctl lock was not acquired.
		 */
		mutex_enter(&md_mx);
		md_mtioctl_cnt++;
		mutex_exit(&md_mx);
		lock.l_flags |= MD_MT_IOCTL;
	}

	/*
	 * Refuse ioctls once the driver is halted or its daemons are
	 * being told to die: this keeps notification from re-snarfing
	 * so metaunload will work.  It may interfere with other modules'
	 * halt process.
	 */
	if (md_get_status() & (MD_GBL_HALTED | MD_GBL_DAEMONS_DIE))
		return (IOLOCK_RETURN(ENXIO, &lock));

	if (mnum == MD_ADM_MINOR) {
		/* admin device ioctls */
		err = md_admin_ioctl(md_expldev(dev), cmd, (void *) data,
		    mode, &lock);
	} else if ((MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((ui = MDI_UNIT(mnum)) == NULL)) {
		/* no such metadevice */
		err = ENXIO;
	} else if (md_ops[ui->ui_opsindex]->md_ioctl != NULL) {
		/* metadevice ioctls go to the owning sub-driver */
		err = (*md_ops[ui->ui_opsindex]->md_ioctl)
		    (dev, cmd, (void *) data, mode, &lock);
	} else {
		/* sub-driver has no ioctl entry point */
		err = ENOTTY;
	}

	/*
	 * drop any locks we grabbed
	 */
	return (IOLOCK_RETURN_IOCTLEND(err, &lock));
}

static int
mddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	minor_t		mnum = getminor(dev);
	set_t		set_id;
	mdi_unit_t	*ui;

	/* The admin minor cannot be dumped to */
	if (mnum == MD_ADM_MINOR)
		return (ENXIO);

	/* Set and unit must be in range and the unit must exist */
	set_id = MD_MIN2SET(mnum);
	if ((set_id >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits))
		return (ENXIO);
	ui = MDI_UNIT(mnum);
	if (ui == NULL)
		return (ENXIO);

	/* The set must already have been snarfed */
	if ((md_get_setstatus(set_id) & MD_SET_SNARFED) == 0)
		return (ENXIO);

	/* Only sub-drivers that provide a dump routine can be dumped to */
	if (md_ops[ui->ui_opsindex]->md_dump == NULL)
		return (ENXIO);

	return ((*md_ops[ui->ui_opsindex]->md_dump)
	    (dev, addr, blkno, nblk));
}

/*
 * Metadevice unit number dispatcher
 * When this routine is called it will scan the
 * incore unit array and return the avail slot
 * hence the unit number to the caller
 *
 * Return MD_UNITBAD if there is nothing available
 */
unit_t
md_get_nextunit(set_t setno)
{
	unit_t	first;
	unit_t	slot;

	/* Quick exit when the set has no free unit slots at all */
	if (md_set[setno].s_un_avail == 0) {
		return (MD_UNITBAD);
	}

	mutex_enter(&md_mx);

	/*
	 * Scan the unit array circularly, beginning at the remembered
	 * starting index, until a free slot is found or we are back
	 * where we started.
	 */
	first = slot = md_set[setno].s_un_next;
	do {
		if (md_set[setno].s_un[slot] == NULL) {
			/*
			 * Free slot: the next call starts just past it,
			 * wrapping to 0 at the end of the array.
			 */
			md_set[setno].s_un_next =
			    (slot == MD_MAXUNITS - 1) ? 0 : slot + 1;
			mutex_exit(&md_mx);
			return (slot);
		}

		slot = (slot == MD_MAXUNITS - 1) ? 0 : slot + 1;
	} while (slot != first);

	/* Went all the way around without finding a free slot */
	mutex_exit(&md_mx);
	return (MD_UNITBAD);
}