10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*1366Spetede  * Common Development and Distribution License (the "License").
6*1366Spetede  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
22*1366Spetede  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
270Sstevel@tonic-gate 
280Sstevel@tonic-gate /*
290Sstevel@tonic-gate  * Soft partitioning metadevice driver (md_sp).
300Sstevel@tonic-gate  *
310Sstevel@tonic-gate  * This file contains the primary operations of the soft partitioning
320Sstevel@tonic-gate  * metadevice driver.  This includes all routines for normal operation
330Sstevel@tonic-gate  * (open/close/read/write).  Please see mdvar.h for a definition of
340Sstevel@tonic-gate  * metadevice operations vector (md_ops_t).  This driver is loosely
350Sstevel@tonic-gate  * based on the stripe driver (md_stripe).
360Sstevel@tonic-gate  *
370Sstevel@tonic-gate  * All metadevice administration is done through the use of ioctl's.
380Sstevel@tonic-gate  * As such, all administrative routines appear in sp_ioctl.c.
390Sstevel@tonic-gate  *
400Sstevel@tonic-gate  * Soft partitions are represented both in-core and in the metadb with a
410Sstevel@tonic-gate  * unit structure.  The soft partition-specific information in the unit
420Sstevel@tonic-gate  * structure includes the following information:
430Sstevel@tonic-gate  *	- Device information (md_dev64_t & md key) about the device on which
440Sstevel@tonic-gate  *	  the soft partition is built.
450Sstevel@tonic-gate  *	- Soft partition status information.
460Sstevel@tonic-gate  *	- The size of the soft partition and number of extents used to
470Sstevel@tonic-gate  *	  make up that size.
 *	- An array of extents which define virtual/physical offset
490Sstevel@tonic-gate  *	  mappings and lengths for each extent.
500Sstevel@tonic-gate  *
510Sstevel@tonic-gate  * Typical soft partition operation proceeds as follows:
520Sstevel@tonic-gate  *	- The unit structure is fetched from the metadb and placed into
530Sstevel@tonic-gate  *	  an in-core array (as with other metadevices).  This operation
540Sstevel@tonic-gate  *	  is performed via sp_build_incore( ) and takes place during
550Sstevel@tonic-gate  *	  "snarfing" (when all metadevices are brought in-core at
560Sstevel@tonic-gate  *	  once) and when a new soft partition is created.
 *	- A soft partition is opened via sp_open( ).  At open time the
 *	  soft partition unit structure is verified with the soft
590Sstevel@tonic-gate  *	  partition on-disk structures.  Additionally, the soft partition
600Sstevel@tonic-gate  *	  status is checked (only soft partitions in the OK state may be
610Sstevel@tonic-gate  *	  opened).
620Sstevel@tonic-gate  *	- Soft partition I/O is performed via sp_strategy( ) which relies on
630Sstevel@tonic-gate  *	  a support routine, sp_mapbuf( ), to do most of the work.
640Sstevel@tonic-gate  *	  sp_mapbuf( ) maps a buffer to a particular extent via a binary
650Sstevel@tonic-gate  *	  search of the extent array in the soft partition unit structure.
660Sstevel@tonic-gate  *	  Once a translation has been performed, the I/O is passed down
670Sstevel@tonic-gate  *	  to the next layer, which may be another metadevice or a physical
680Sstevel@tonic-gate  *	  disk.  Since a soft partition may contain multiple, non-contiguous
690Sstevel@tonic-gate  *	  extents, a single I/O may have to be fragmented.
700Sstevel@tonic-gate  *	- Soft partitions are closed using sp_close.
710Sstevel@tonic-gate  *
720Sstevel@tonic-gate  */
730Sstevel@tonic-gate 
740Sstevel@tonic-gate #include <sys/param.h>
750Sstevel@tonic-gate #include <sys/systm.h>
760Sstevel@tonic-gate #include <sys/conf.h>
770Sstevel@tonic-gate #include <sys/file.h>
780Sstevel@tonic-gate #include <sys/user.h>
790Sstevel@tonic-gate #include <sys/uio.h>
800Sstevel@tonic-gate #include <sys/t_lock.h>
810Sstevel@tonic-gate #include <sys/buf.h>
820Sstevel@tonic-gate #include <sys/dkio.h>
830Sstevel@tonic-gate #include <sys/vtoc.h>
840Sstevel@tonic-gate #include <sys/kmem.h>
850Sstevel@tonic-gate #include <vm/page.h>
860Sstevel@tonic-gate #include <sys/cmn_err.h>
870Sstevel@tonic-gate #include <sys/sysmacros.h>
880Sstevel@tonic-gate #include <sys/types.h>
890Sstevel@tonic-gate #include <sys/mkdev.h>
900Sstevel@tonic-gate #include <sys/stat.h>
910Sstevel@tonic-gate #include <sys/open.h>
920Sstevel@tonic-gate #include <sys/lvm/mdvar.h>
930Sstevel@tonic-gate #include <sys/lvm/md_sp.h>
940Sstevel@tonic-gate #include <sys/lvm/md_convert.h>
950Sstevel@tonic-gate #include <sys/lvm/md_notify.h>
960Sstevel@tonic-gate #include <sys/lvm/md_crc.h>
970Sstevel@tonic-gate #include <sys/modctl.h>
980Sstevel@tonic-gate #include <sys/ddi.h>
990Sstevel@tonic-gate #include <sys/sunddi.h>
1000Sstevel@tonic-gate #include <sys/debug.h>
1010Sstevel@tonic-gate 
1020Sstevel@tonic-gate #include <sys/sysevent/eventdefs.h>
1030Sstevel@tonic-gate #include <sys/sysevent/svm.h>
1040Sstevel@tonic-gate 
/* Soft partition instance of the metadevice operations vector (md_ops_t). */
md_ops_t		sp_md_ops;
#ifndef	lint
/* NOTE(review): presumably names the module this driver depends on. */
char			_depends_on[] = "drv/md";
md_ops_t		*md_interface_ops = &sp_md_ops;
#endif

/* Declared extern here; expected to be defined outside this file. */
extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];

extern int		md_status;
extern major_t		md_major;
extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_sp_daemon;
extern kmutex_t		md_mx;
extern kcondvar_t	md_cv;
extern md_krwlock_t	md_unit_array_rw;

/* kmem caches; sp_parent_cache supplies the md_spps_t parent save structs. */
static kmem_cache_t	*sp_parent_cache = NULL;
static kmem_cache_t	*sp_child_cache = NULL;
/* Forward declarations; both routines are defined later in this file. */
static void		sp_send_stat_ok(mp_unit_t *);
static void		sp_send_stat_err(mp_unit_t *);
1270Sstevel@tonic-gate 
1280Sstevel@tonic-gate /*
1290Sstevel@tonic-gate  * FUNCTION:	sp_parent_constructor()
1300Sstevel@tonic-gate  * INPUT:	none.
1310Sstevel@tonic-gate  * OUTPUT:	ps	- parent save structure initialized.
1320Sstevel@tonic-gate  * RETURNS:	void *	- ptr to initialized parent save structure.
1330Sstevel@tonic-gate  * PURPOSE:	initialize parent save structure.
1340Sstevel@tonic-gate  */
1350Sstevel@tonic-gate /*ARGSUSED1*/
1360Sstevel@tonic-gate static int
1370Sstevel@tonic-gate sp_parent_constructor(void *p, void *d1, int d2)
1380Sstevel@tonic-gate {
1390Sstevel@tonic-gate 	mutex_init(&((md_spps_t *)p)->ps_mx,
1400Sstevel@tonic-gate 	    NULL, MUTEX_DEFAULT, NULL);
1410Sstevel@tonic-gate 	return (0);
1420Sstevel@tonic-gate }
1430Sstevel@tonic-gate 
1440Sstevel@tonic-gate static void
1450Sstevel@tonic-gate sp_parent_init(md_spps_t *ps)
1460Sstevel@tonic-gate {
1470Sstevel@tonic-gate 	bzero(ps, offsetof(md_spps_t, ps_mx));
1480Sstevel@tonic-gate }
1490Sstevel@tonic-gate 
1500Sstevel@tonic-gate /*ARGSUSED1*/
1510Sstevel@tonic-gate static void
1520Sstevel@tonic-gate sp_parent_destructor(void *p, void *d)
1530Sstevel@tonic-gate {
1540Sstevel@tonic-gate 	mutex_destroy(&((md_spps_t *)p)->ps_mx);
1550Sstevel@tonic-gate }
1560Sstevel@tonic-gate 
1570Sstevel@tonic-gate /*
1580Sstevel@tonic-gate  * FUNCTION:	sp_child_constructor()
1590Sstevel@tonic-gate  * INPUT:	none.
1600Sstevel@tonic-gate  * OUTPUT:	cs	- child save structure initialized.
1610Sstevel@tonic-gate  * RETURNS:	void *	- ptr to initialized child save structure.
1620Sstevel@tonic-gate  * PURPOSE:	initialize child save structure.
1630Sstevel@tonic-gate  */
1640Sstevel@tonic-gate /*ARGSUSED1*/
1650Sstevel@tonic-gate static int
1660Sstevel@tonic-gate sp_child_constructor(void *p, void *d1, int d2)
1670Sstevel@tonic-gate {
1680Sstevel@tonic-gate 	bioinit(&((md_spcs_t *)p)->cs_buf);
1690Sstevel@tonic-gate 	return (0);
1700Sstevel@tonic-gate }
1710Sstevel@tonic-gate 
1720Sstevel@tonic-gate static void
1730Sstevel@tonic-gate sp_child_init(md_spcs_t *cs)
1740Sstevel@tonic-gate {
1750Sstevel@tonic-gate 	cs->cs_mdunit = 0;
1760Sstevel@tonic-gate 	cs->cs_ps = NULL;
1770Sstevel@tonic-gate 	md_bioreset(&cs->cs_buf);
1780Sstevel@tonic-gate }
1790Sstevel@tonic-gate 
1800Sstevel@tonic-gate /*ARGSUSED1*/
1810Sstevel@tonic-gate static void
1820Sstevel@tonic-gate sp_child_destructor(void *p, void *d)
1830Sstevel@tonic-gate {
1840Sstevel@tonic-gate 	biofini(&((md_spcs_t *)p)->cs_buf);
1850Sstevel@tonic-gate }
1860Sstevel@tonic-gate 
1870Sstevel@tonic-gate /*
1880Sstevel@tonic-gate  * FUNCTION:	sp_run_queue()
1890Sstevel@tonic-gate  * INPUT:	none.
1900Sstevel@tonic-gate  * OUTPUT:	none.
1910Sstevel@tonic-gate  * RETURNS:	void.
1920Sstevel@tonic-gate  * PURPOSE:	run the md_daemon to clean up memory pool.
1930Sstevel@tonic-gate  */
1940Sstevel@tonic-gate /*ARGSUSED*/
1950Sstevel@tonic-gate static void
1960Sstevel@tonic-gate sp_run_queue(void *d)
1970Sstevel@tonic-gate {
1980Sstevel@tonic-gate 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
1990Sstevel@tonic-gate 		md_daemon(1, &md_done_daemon);
2000Sstevel@tonic-gate }
2010Sstevel@tonic-gate 
2020Sstevel@tonic-gate 
2030Sstevel@tonic-gate /*
2040Sstevel@tonic-gate  * FUNCTION:	sp_build_incore()
2050Sstevel@tonic-gate  * INPUT:	p		- ptr to unit structure.
2060Sstevel@tonic-gate  *		snarfing	- flag to tell us we are snarfing.
2070Sstevel@tonic-gate  * OUTPUT:	non.
2080Sstevel@tonic-gate  * RETURNS:	int	- 0 (always).
2090Sstevel@tonic-gate  * PURPOSE:	place unit structure into in-core unit array (keyed from
2100Sstevel@tonic-gate  *		minor number).
2110Sstevel@tonic-gate  */
2120Sstevel@tonic-gate int
2130Sstevel@tonic-gate sp_build_incore(void *p, int snarfing)
2140Sstevel@tonic-gate {
2150Sstevel@tonic-gate 	mp_unit_t	*un = (mp_unit_t *)p;
2160Sstevel@tonic-gate 	minor_t		mnum;
2170Sstevel@tonic-gate 	set_t		setno;
2180Sstevel@tonic-gate 	md_dev64_t	tmpdev;
2190Sstevel@tonic-gate 
2200Sstevel@tonic-gate 	mnum = MD_SID(un);
2210Sstevel@tonic-gate 
2220Sstevel@tonic-gate 	if (MD_UNIT(mnum) != NULL)
2230Sstevel@tonic-gate 		return (0);
2240Sstevel@tonic-gate 
2250Sstevel@tonic-gate 	MD_STATUS(un) = 0;
2260Sstevel@tonic-gate 
2270Sstevel@tonic-gate 	if (snarfing) {
2280Sstevel@tonic-gate 		/*
2290Sstevel@tonic-gate 		 * if we are snarfing, we get the device information
2300Sstevel@tonic-gate 		 * from the metadb record (using the metadb key for
2310Sstevel@tonic-gate 		 * that device).
2320Sstevel@tonic-gate 		 */
2330Sstevel@tonic-gate 		setno = MD_MIN2SET(mnum);
2340Sstevel@tonic-gate 
2350Sstevel@tonic-gate 		tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
2360Sstevel@tonic-gate 		    un->un_key, MD_NOTRUST_DEVT);
2370Sstevel@tonic-gate 		un->un_dev = tmpdev;
2380Sstevel@tonic-gate 	}
2390Sstevel@tonic-gate 
2400Sstevel@tonic-gate 	/* place unit in in-core array */
2410Sstevel@tonic-gate 	MD_UNIT(mnum) = un;
2420Sstevel@tonic-gate 	return (0);
2430Sstevel@tonic-gate }
2440Sstevel@tonic-gate 
2450Sstevel@tonic-gate /*
2460Sstevel@tonic-gate  * FUNCTION:	reset_sp()
2470Sstevel@tonic-gate  * INPUT:	un		- unit structure to be reset/removed.
2480Sstevel@tonic-gate  *		mnum		- minor number to be reset/removed.
2490Sstevel@tonic-gate  *		removing	- flag to tell us if we are removing
2500Sstevel@tonic-gate  *				  permanently or just reseting in-core
2510Sstevel@tonic-gate  *				  structures.
2520Sstevel@tonic-gate  * OUTPUT:	none.
2530Sstevel@tonic-gate  * RETURNS:	void.
2540Sstevel@tonic-gate  * PURPOSE:	used to either simply reset in-core structures or to
2550Sstevel@tonic-gate  *		permanently remove metadevices from the metadb.
2560Sstevel@tonic-gate  */
2570Sstevel@tonic-gate void
2580Sstevel@tonic-gate reset_sp(mp_unit_t *un, minor_t mnum, int removing)
2590Sstevel@tonic-gate {
2600Sstevel@tonic-gate 	sv_dev_t	*sv;
2610Sstevel@tonic-gate 	mddb_recid_t	vtoc_id;
2620Sstevel@tonic-gate 
2630Sstevel@tonic-gate 	/* clean up in-core structures */
2640Sstevel@tonic-gate 	md_destroy_unit_incore(mnum, &sp_md_ops);
2650Sstevel@tonic-gate 
2660Sstevel@tonic-gate 	MD_UNIT(mnum) = NULL;
2670Sstevel@tonic-gate 
2680Sstevel@tonic-gate 	if (!removing)
2690Sstevel@tonic-gate 		return;
2700Sstevel@tonic-gate 
2710Sstevel@tonic-gate 	/* we are removing the soft partition from the metadb */
2720Sstevel@tonic-gate 
2730Sstevel@tonic-gate 	/*
2740Sstevel@tonic-gate 	 * Save off device information so we can get to
2750Sstevel@tonic-gate 	 * it after we do the mddb_deleterec().
2760Sstevel@tonic-gate 	 */
2770Sstevel@tonic-gate 	sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
2780Sstevel@tonic-gate 	sv->setno = MD_MIN2SET(mnum);
2790Sstevel@tonic-gate 	sv->key = un->un_key;
2800Sstevel@tonic-gate 	vtoc_id = un->c.un_vtoc_id;
2810Sstevel@tonic-gate 
2820Sstevel@tonic-gate 	/* Remove the unit structure */
2830Sstevel@tonic-gate 	mddb_deleterec_wrapper(un->c.un_record_id);
2840Sstevel@tonic-gate 
2850Sstevel@tonic-gate 	if (vtoc_id)
2860Sstevel@tonic-gate 		mddb_deleterec_wrapper(vtoc_id);
2870Sstevel@tonic-gate 
2880Sstevel@tonic-gate 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE,
2890Sstevel@tonic-gate 	    MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));
2900Sstevel@tonic-gate 
2910Sstevel@tonic-gate 	/*
2920Sstevel@tonic-gate 	 * remove the underlying device name from the metadb.  if other
2930Sstevel@tonic-gate 	 * soft partitions are built on this device, this will simply
2940Sstevel@tonic-gate 	 * decrease the reference count for this device.  otherwise the
2950Sstevel@tonic-gate 	 * name record for this device will be removed from the metadb.
2960Sstevel@tonic-gate 	 */
2970Sstevel@tonic-gate 	md_rem_names(sv, 1);
2980Sstevel@tonic-gate 	kmem_free(sv, sizeof (sv_dev_t));
2990Sstevel@tonic-gate }
3000Sstevel@tonic-gate 
3010Sstevel@tonic-gate /*
3020Sstevel@tonic-gate  * FUNCTION:	sp_send_stat_msg
3030Sstevel@tonic-gate  * INPUT:	un	- unit reference
3040Sstevel@tonic-gate  *		status	- status to be sent to master node
3050Sstevel@tonic-gate  *			MD_SP_OK - soft-partition is now OK
3060Sstevel@tonic-gate  *			MD_SP_ERR	"	"	 errored
3070Sstevel@tonic-gate  * OUTPUT:	none.
3080Sstevel@tonic-gate  * RETURNS:	void.
3090Sstevel@tonic-gate  * PURPOSE:	send a soft-partition status change to the master node. If the
3100Sstevel@tonic-gate  *		message succeeds we simply return. If it fails we panic as the
3110Sstevel@tonic-gate  *		cluster-wide view of the metadevices is now inconsistent.
3120Sstevel@tonic-gate  * CALLING CONTEXT:
3130Sstevel@tonic-gate  *	Blockable. No locks can be held.
3140Sstevel@tonic-gate  */
3150Sstevel@tonic-gate static void
3160Sstevel@tonic-gate sp_send_stat_msg(mp_unit_t *un, sp_status_t status)
3170Sstevel@tonic-gate {
3180Sstevel@tonic-gate 	md_mn_msg_sp_setstat_t	sp_msg;
3190Sstevel@tonic-gate 	md_mn_kresult_t	*kres;
3200Sstevel@tonic-gate 	set_t		setno = MD_UN2SET(un);
3210Sstevel@tonic-gate 	int		rval;
3220Sstevel@tonic-gate 	const char	*str = (status == MD_SP_ERR) ? "MD_SP_ERR" : "MD_SP_OK";
3230Sstevel@tonic-gate 
3240Sstevel@tonic-gate 	sp_msg.sp_setstat_mnum = MD_SID(un);
3250Sstevel@tonic-gate 	sp_msg.sp_setstat_status = status;
3260Sstevel@tonic-gate 
3270Sstevel@tonic-gate 	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
3280Sstevel@tonic-gate 
3290Sstevel@tonic-gate 	rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG,
3300Sstevel@tonic-gate 	    (char *)&sp_msg, sizeof (sp_msg), kres);
3310Sstevel@tonic-gate 
3320Sstevel@tonic-gate 	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
3330Sstevel@tonic-gate 		mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2");
3340Sstevel@tonic-gate 
3350Sstevel@tonic-gate 		/*
3360Sstevel@tonic-gate 		 * Panic as we are now in an inconsistent state.
3370Sstevel@tonic-gate 		 */
3380Sstevel@tonic-gate 
3390Sstevel@tonic-gate 		cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n",
3400Sstevel@tonic-gate 		    md_shortname(MD_SID(un)), str);
3410Sstevel@tonic-gate 	}
3420Sstevel@tonic-gate 
3430Sstevel@tonic-gate 	kmem_free(kres, sizeof (md_mn_kresult_t));
3440Sstevel@tonic-gate }
3450Sstevel@tonic-gate 
3460Sstevel@tonic-gate /*
3470Sstevel@tonic-gate  * FUNCTION:	sp_finish_error
3480Sstevel@tonic-gate  * INPUT:	ps	- parent save structure for error-ed I/O.
3490Sstevel@tonic-gate  *		lock_held	- set if the unit readerlock is held
3500Sstevel@tonic-gate  * OUTPUT:	none.
3510Sstevel@tonic-gate  * RETURNS:	void.
3520Sstevel@tonic-gate  * PURPOSE:	report a driver error
3530Sstevel@tonic-gate  */
3540Sstevel@tonic-gate static void
3550Sstevel@tonic-gate sp_finish_error(md_spps_t *ps, int lock_held)
3560Sstevel@tonic-gate {
3570Sstevel@tonic-gate 	struct buf	*pb = ps->ps_bp;
3580Sstevel@tonic-gate 	mdi_unit_t	*ui = ps->ps_ui;
3590Sstevel@tonic-gate 	md_dev64_t	un_dev;			/* underlying device */
3600Sstevel@tonic-gate 	md_dev64_t	md_dev = md_expldev(pb->b_edev); /* metadev in error */
3610Sstevel@tonic-gate 	char		*str;
3620Sstevel@tonic-gate 
3630Sstevel@tonic-gate 	un_dev = md_expldev(ps->ps_un->un_dev);
3640Sstevel@tonic-gate 	/* set error type */
3650Sstevel@tonic-gate 	if (pb->b_flags & B_READ) {
3660Sstevel@tonic-gate 		str = "read";
3670Sstevel@tonic-gate 	} else {
3680Sstevel@tonic-gate 		str = "write";
3690Sstevel@tonic-gate 	}
3700Sstevel@tonic-gate 
3710Sstevel@tonic-gate 
3720Sstevel@tonic-gate 	SPPS_FREE(sp_parent_cache, ps);
3730Sstevel@tonic-gate 	pb->b_flags |= B_ERROR;
3740Sstevel@tonic-gate 
3750Sstevel@tonic-gate 	md_kstat_done(ui, pb, 0);
3760Sstevel@tonic-gate 
3770Sstevel@tonic-gate 	if (lock_held) {
3780Sstevel@tonic-gate 		md_unit_readerexit(ui);
3790Sstevel@tonic-gate 	}
3800Sstevel@tonic-gate 	md_biodone(pb);
3810Sstevel@tonic-gate 
3820Sstevel@tonic-gate 	cmn_err(CE_WARN, "md: %s: %s error on %s",
3830Sstevel@tonic-gate 	    md_shortname(md_getminor(md_dev)), str,
3840Sstevel@tonic-gate 	    md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0));
3850Sstevel@tonic-gate }
3860Sstevel@tonic-gate 
3870Sstevel@tonic-gate 
3880Sstevel@tonic-gate /*
3890Sstevel@tonic-gate  * FUNCTION:	sp_xmit_ok
3900Sstevel@tonic-gate  * INPUT:	dq	- daemon queue referencing failing ps structure
3910Sstevel@tonic-gate  * OUTPUT:	none.
3920Sstevel@tonic-gate  * RETURNS:	void.
3930Sstevel@tonic-gate  * PURPOSE:	send a message to the master node in a multi-owner diskset to
3940Sstevel@tonic-gate  *		update all attached nodes view of the soft-part to be MD_SP_OK.
3950Sstevel@tonic-gate  * CALLING CONTEXT:
3960Sstevel@tonic-gate  *	Blockable. No unit lock held.
3970Sstevel@tonic-gate  */
3980Sstevel@tonic-gate static void
3990Sstevel@tonic-gate sp_xmit_ok(daemon_queue_t *dq)
4000Sstevel@tonic-gate {
4010Sstevel@tonic-gate 	md_spps_t	*ps = (md_spps_t *)dq;
4020Sstevel@tonic-gate 
4030Sstevel@tonic-gate 	/* Send a MD_MN_MSG_SP_SETSTAT to the master */
4040Sstevel@tonic-gate 	sp_send_stat_msg(ps->ps_un, MD_SP_OK);
4050Sstevel@tonic-gate 
4060Sstevel@tonic-gate 	/*
4070Sstevel@tonic-gate 	 * Successfully transmitted error state to all nodes, now release this
4080Sstevel@tonic-gate 	 * parent structure.
4090Sstevel@tonic-gate 	 */
4100Sstevel@tonic-gate 	SPPS_FREE(sp_parent_cache, ps);
4110Sstevel@tonic-gate }
4120Sstevel@tonic-gate 
4130Sstevel@tonic-gate /*
4140Sstevel@tonic-gate  * FUNCTION:	sp_xmit_error
4150Sstevel@tonic-gate  * INPUT:	dq	- daemon queue referencing failing ps structure
4160Sstevel@tonic-gate  * OUTPUT:	none.
4170Sstevel@tonic-gate  * RETURNS:	void.
4180Sstevel@tonic-gate  * PURPOSE:	send a message to the master node in a multi-owner diskset to
4190Sstevel@tonic-gate  *		update all attached nodes view of the soft-part to be MD_SP_ERR.
4200Sstevel@tonic-gate  * CALLING CONTEXT:
4210Sstevel@tonic-gate  *	Blockable. No unit lock held.
4220Sstevel@tonic-gate  */
4230Sstevel@tonic-gate static void
4240Sstevel@tonic-gate sp_xmit_error(daemon_queue_t *dq)
4250Sstevel@tonic-gate {
4260Sstevel@tonic-gate 	md_spps_t	*ps = (md_spps_t *)dq;
4270Sstevel@tonic-gate 
4280Sstevel@tonic-gate 	/* Send a MD_MN_MSG_SP_SETSTAT to the master */
4290Sstevel@tonic-gate 	sp_send_stat_msg(ps->ps_un, MD_SP_ERR);
4300Sstevel@tonic-gate 
4310Sstevel@tonic-gate 	/*
4320Sstevel@tonic-gate 	 * Successfully transmitted error state to all nodes, now release this
4330Sstevel@tonic-gate 	 * parent structure.
4340Sstevel@tonic-gate 	 */
4350Sstevel@tonic-gate 	SPPS_FREE(sp_parent_cache, ps);
4360Sstevel@tonic-gate }
4370Sstevel@tonic-gate static void
4380Sstevel@tonic-gate sp_send_stat_ok(mp_unit_t *un)
4390Sstevel@tonic-gate {
4400Sstevel@tonic-gate 	minor_t		mnum = MD_SID(un);
4410Sstevel@tonic-gate 	md_spps_t	*ps;
4420Sstevel@tonic-gate 
4430Sstevel@tonic-gate 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
4440Sstevel@tonic-gate 	sp_parent_init(ps);
4450Sstevel@tonic-gate 	ps->ps_un = un;
4460Sstevel@tonic-gate 	ps->ps_ui = MDI_UNIT(mnum);
4470Sstevel@tonic-gate 
4480Sstevel@tonic-gate 	daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps,
4490Sstevel@tonic-gate 	REQ_OLD);
4500Sstevel@tonic-gate }
4510Sstevel@tonic-gate 
4520Sstevel@tonic-gate static void
4530Sstevel@tonic-gate sp_send_stat_err(mp_unit_t *un)
4540Sstevel@tonic-gate {
4550Sstevel@tonic-gate 	minor_t		mnum = MD_SID(un);
4560Sstevel@tonic-gate 	md_spps_t	*ps;
4570Sstevel@tonic-gate 
4580Sstevel@tonic-gate 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
4590Sstevel@tonic-gate 	sp_parent_init(ps);
4600Sstevel@tonic-gate 	ps->ps_un = un;
4610Sstevel@tonic-gate 	ps->ps_ui = MDI_UNIT(mnum);
4620Sstevel@tonic-gate 
4630Sstevel@tonic-gate 	daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps,
4640Sstevel@tonic-gate 	REQ_OLD);
4650Sstevel@tonic-gate }
4660Sstevel@tonic-gate 
4670Sstevel@tonic-gate 
4680Sstevel@tonic-gate /*
4690Sstevel@tonic-gate  * FUNCTION:	sp_error()
4700Sstevel@tonic-gate  * INPUT:	ps	- parent save structure for error-ed I/O.
4710Sstevel@tonic-gate  * OUTPUT:	none.
4720Sstevel@tonic-gate  * RETURNS:	void.
4730Sstevel@tonic-gate  * PURPOSE:	report a driver error.
4740Sstevel@tonic-gate  * CALLING CONTEXT:
4750Sstevel@tonic-gate  *	Interrupt - non-blockable
4760Sstevel@tonic-gate  */
4770Sstevel@tonic-gate static void
4780Sstevel@tonic-gate sp_error(md_spps_t *ps)
4790Sstevel@tonic-gate {
4800Sstevel@tonic-gate 	set_t		setno = MD_UN2SET(ps->ps_un);
4810Sstevel@tonic-gate 
4820Sstevel@tonic-gate 	/*
4830Sstevel@tonic-gate 	 * Drop the mutex associated with this request before (potentially)
4840Sstevel@tonic-gate 	 * enqueuing the free onto a separate thread. We have to release the
4850Sstevel@tonic-gate 	 * mutex before destroying the parent structure.
4860Sstevel@tonic-gate 	 */
4870Sstevel@tonic-gate 	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
4880Sstevel@tonic-gate 		if (MUTEX_HELD(&ps->ps_mx)) {
4890Sstevel@tonic-gate 			mutex_exit(&ps->ps_mx);
4900Sstevel@tonic-gate 		}
4910Sstevel@tonic-gate 	} else {
4920Sstevel@tonic-gate 		/*
4930Sstevel@tonic-gate 		 * this should only ever happen if we are panicking,
4940Sstevel@tonic-gate 		 * since DONTFREE is only set on the parent if panicstr
4950Sstevel@tonic-gate 		 * is non-NULL.
4960Sstevel@tonic-gate 		 */
4970Sstevel@tonic-gate 		ASSERT(panicstr);
4980Sstevel@tonic-gate 	}
4990Sstevel@tonic-gate 
5000Sstevel@tonic-gate 	/*
5010Sstevel@tonic-gate 	 * For a multi-owner set we need to send a message to the master so that
5020Sstevel@tonic-gate 	 * all nodes get the errored status when we first encounter it. To avoid
5030Sstevel@tonic-gate 	 * deadlocking when multiple soft-partitions encounter an error on one
5040Sstevel@tonic-gate 	 * physical unit we drop the unit readerlock before enqueueing the
5050Sstevel@tonic-gate 	 * request. That way we can service any messages that require a
5060Sstevel@tonic-gate 	 * writerlock to be held. Additionally, to avoid deadlocking when at
5070Sstevel@tonic-gate 	 * the bottom of a metadevice stack and a higher level mirror has
5080Sstevel@tonic-gate 	 * multiple requests outstanding on this soft-part, we clone the ps
5090Sstevel@tonic-gate 	 * that failed and pass the error back up the stack to release the
5100Sstevel@tonic-gate 	 * reference that this i/o may have in the higher-level metadevice.
5110Sstevel@tonic-gate 	 * The other nodes in the cluster just have to modify the soft-part
5120Sstevel@tonic-gate 	 * status and we do not need to block the i/o completion for this.
5130Sstevel@tonic-gate 	 */
5140Sstevel@tonic-gate 	if (MD_MNSET_SETNO(setno)) {
5150Sstevel@tonic-gate 		md_spps_t	*err_ps;
5160Sstevel@tonic-gate 		err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
5170Sstevel@tonic-gate 		sp_parent_init(err_ps);
5180Sstevel@tonic-gate 
5190Sstevel@tonic-gate 		err_ps->ps_un = ps->ps_un;
5200Sstevel@tonic-gate 		err_ps->ps_ui = ps->ps_ui;
5210Sstevel@tonic-gate 
5220Sstevel@tonic-gate 		md_unit_readerexit(ps->ps_ui);
5230Sstevel@tonic-gate 
5240Sstevel@tonic-gate 		daemon_request(&md_sp_daemon, sp_xmit_error,
5250Sstevel@tonic-gate 		    (daemon_queue_t *)err_ps, REQ_OLD);
5260Sstevel@tonic-gate 
5270Sstevel@tonic-gate 		sp_finish_error(ps, 0);
5280Sstevel@tonic-gate 
5290Sstevel@tonic-gate 		return;
5300Sstevel@tonic-gate 	} else {
5310Sstevel@tonic-gate 		ps->ps_un->un_status = MD_SP_ERR;
5320Sstevel@tonic-gate 	}
5330Sstevel@tonic-gate 
5340Sstevel@tonic-gate 	/* Flag the error */
5350Sstevel@tonic-gate 	sp_finish_error(ps, 1);
5360Sstevel@tonic-gate 
5370Sstevel@tonic-gate }
5380Sstevel@tonic-gate 
5390Sstevel@tonic-gate /*
5400Sstevel@tonic-gate  * FUNCTION:	sp_mapbuf()
5410Sstevel@tonic-gate  * INPUT:	un	- unit structure for soft partition we are doing
5420Sstevel@tonic-gate  *			  I/O on.
5430Sstevel@tonic-gate  *		voff	- virtual offset in soft partition to map.
5440Sstevel@tonic-gate  *		bcount	- # of blocks in the I/O.
5450Sstevel@tonic-gate  * OUTPUT:	bp	- translated buffer to be passed down to next layer.
5460Sstevel@tonic-gate  * RETURNS:	1	- request must be fragmented, more work to do,
5470Sstevel@tonic-gate  *		0	- request satisified, no more work to do
5480Sstevel@tonic-gate  *		-1	- error
5490Sstevel@tonic-gate  * PURPOSE:	Map the the virtual offset in the soft partition (passed
5500Sstevel@tonic-gate  *		in via voff) to the "physical" offset on whatever the soft
5510Sstevel@tonic-gate  *		partition is built on top of.  We do this by doing a binary
5520Sstevel@tonic-gate  *		search of the extent array in the soft partition unit
5530Sstevel@tonic-gate  *		structure.  Once the current extent is found, we do the
5540Sstevel@tonic-gate  *		translation, determine if the I/O will cross extent
5550Sstevel@tonic-gate  *		boundaries (if so, we have to fragment the I/O), then
5560Sstevel@tonic-gate  *		fill in the buf structure to be passed down to the next layer.
5570Sstevel@tonic-gate  */
5580Sstevel@tonic-gate static int
5590Sstevel@tonic-gate sp_mapbuf(
5600Sstevel@tonic-gate 	mp_unit_t	*un,
5610Sstevel@tonic-gate 	sp_ext_offset_t	voff,
5620Sstevel@tonic-gate 	sp_ext_length_t	bcount,
5630Sstevel@tonic-gate 	buf_t		*bp
5640Sstevel@tonic-gate )
5650Sstevel@tonic-gate {
5660Sstevel@tonic-gate 	int		lo, mid, hi, found, more;
5670Sstevel@tonic-gate 	size_t		new_bcount;
5680Sstevel@tonic-gate 	sp_ext_offset_t new_blkno;
5690Sstevel@tonic-gate 	sp_ext_offset_t	new_offset;
5700Sstevel@tonic-gate 	sp_ext_offset_t	ext_endblk;
5710Sstevel@tonic-gate 	md_dev64_t	new_edev;
5720Sstevel@tonic-gate 	extern unsigned	md_maxphys;
5730Sstevel@tonic-gate 
5740Sstevel@tonic-gate 	found = 0;
5750Sstevel@tonic-gate 	lo = 0;
5760Sstevel@tonic-gate 	hi = un->un_numexts - 1;
5770Sstevel@tonic-gate 
5780Sstevel@tonic-gate 	/*
5790Sstevel@tonic-gate 	 * do a binary search to find the extent that contains the
5800Sstevel@tonic-gate 	 * starting offset.  after this loop, mid contains the index
5810Sstevel@tonic-gate 	 * of the correct extent.
5820Sstevel@tonic-gate 	 */
5830Sstevel@tonic-gate 	while (lo <= hi && !found) {
5840Sstevel@tonic-gate 		mid = (lo + hi) / 2;
5850Sstevel@tonic-gate 		/* is the starting offset contained within the mid-ext? */
5860Sstevel@tonic-gate 		if (voff >= un->un_ext[mid].un_voff &&
5870Sstevel@tonic-gate 		    voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len)
5880Sstevel@tonic-gate 			found = 1;
5890Sstevel@tonic-gate 		else if (voff < un->un_ext[mid].un_voff)
5900Sstevel@tonic-gate 			hi = mid - 1;
5910Sstevel@tonic-gate 		else /* voff > un->un_ext[mid].un_voff + un->un_ext[mid].len */
5920Sstevel@tonic-gate 			lo = mid + 1;
5930Sstevel@tonic-gate 	}
5940Sstevel@tonic-gate 
5950Sstevel@tonic-gate 	if (!found) {
5960Sstevel@tonic-gate 		cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff);
5970Sstevel@tonic-gate 		return (-1);
5980Sstevel@tonic-gate 	}
5990Sstevel@tonic-gate 
6000Sstevel@tonic-gate 	/* translate to underlying physical offset/device */
6010Sstevel@tonic-gate 	new_offset = voff - un->un_ext[mid].un_voff;
6020Sstevel@tonic-gate 	new_blkno = un->un_ext[mid].un_poff + new_offset;
6030Sstevel@tonic-gate 	new_edev = un->un_dev;
6040Sstevel@tonic-gate 
6050Sstevel@tonic-gate 	/* determine if we need to break the I/O into fragments */
6060Sstevel@tonic-gate 	ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len;
6070Sstevel@tonic-gate 	if (voff + btodb(bcount) > ext_endblk) {
6080Sstevel@tonic-gate 		new_bcount = dbtob(ext_endblk - voff);
6090Sstevel@tonic-gate 		more = 1;
6100Sstevel@tonic-gate 	} else {
6110Sstevel@tonic-gate 		new_bcount = bcount;
6120Sstevel@tonic-gate 		more = 0;
6130Sstevel@tonic-gate 	}
6140Sstevel@tonic-gate 
6150Sstevel@tonic-gate 	/* only break up the I/O if we're not built on another metadevice */
6160Sstevel@tonic-gate 	if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) {
6170Sstevel@tonic-gate 		new_bcount = md_maxphys;
6180Sstevel@tonic-gate 		more = 1;
6190Sstevel@tonic-gate 	}
6200Sstevel@tonic-gate 	if (bp != (buf_t *)NULL) {
6210Sstevel@tonic-gate 		/* do bp updates */
6220Sstevel@tonic-gate 		bp->b_bcount = new_bcount;
6230Sstevel@tonic-gate 		bp->b_lblkno = new_blkno;
6240Sstevel@tonic-gate 		bp->b_edev = md_dev64_to_dev(new_edev);
6250Sstevel@tonic-gate 	}
6260Sstevel@tonic-gate 	return (more);
6270Sstevel@tonic-gate }
6280Sstevel@tonic-gate 
6290Sstevel@tonic-gate /*
6300Sstevel@tonic-gate  * FUNCTION:	sp_validate()
6310Sstevel@tonic-gate  * INPUT:	un	- unit structure to be validated.
6320Sstevel@tonic-gate  * OUTPUT:	none.
6330Sstevel@tonic-gate  * RETURNS:	0	- soft partition ok.
6340Sstevel@tonic-gate  *		-1	- error.
6350Sstevel@tonic-gate  * PURPOSE:	called on open to sanity check the soft partition.  In
6360Sstevel@tonic-gate  *		order to open a soft partition:
6370Sstevel@tonic-gate  *		- it must have at least one extent
6380Sstevel@tonic-gate  *		- the extent info in core and on disk must match
6390Sstevel@tonic-gate  *		- it may not be in an intermediate state (which would
6400Sstevel@tonic-gate  *		  imply that a two-phase commit was interrupted)
6410Sstevel@tonic-gate  *
6420Sstevel@tonic-gate  *		If the extent checking fails (B_ERROR returned from the read
6430Sstevel@tonic-gate  *		strategy call) _and_ we're a multi-owner diskset, we send a
6440Sstevel@tonic-gate  *		message to the master so that all nodes inherit the same view
6450Sstevel@tonic-gate  *		of the soft partition.
6460Sstevel@tonic-gate  *		If we are checking a soft-part that is marked as in error, and
6470Sstevel@tonic-gate  *		we can actually read and validate the watermarks we send a
6480Sstevel@tonic-gate  *		message to clear the error to the master node.
6490Sstevel@tonic-gate  */
6500Sstevel@tonic-gate static int
6510Sstevel@tonic-gate sp_validate(mp_unit_t *un)
6520Sstevel@tonic-gate {
6530Sstevel@tonic-gate 	uint_t		ext;
6540Sstevel@tonic-gate 	struct buf	*buf;
6550Sstevel@tonic-gate 	sp_ext_length_t	len;
6560Sstevel@tonic-gate 	mp_watermark_t	*wm;
6570Sstevel@tonic-gate 	set_t		setno;
6580Sstevel@tonic-gate 	int		reset_error = 0;
6590Sstevel@tonic-gate 
6600Sstevel@tonic-gate 	setno = MD_UN2SET(un);
6610Sstevel@tonic-gate 
6620Sstevel@tonic-gate 	/* sanity check unit structure components ?? */
6630Sstevel@tonic-gate 	if (un->un_status != MD_SP_OK) {
6640Sstevel@tonic-gate 		if (un->un_status != MD_SP_ERR) {
6650Sstevel@tonic-gate 			cmn_err(CE_WARN, "md: %s: open failed, soft partition "
6660Sstevel@tonic-gate 			    "status is %u.",
6670Sstevel@tonic-gate 			    md_shortname(MD_SID(un)),
6680Sstevel@tonic-gate 			    un->un_status);
6690Sstevel@tonic-gate 			return (-1);
6700Sstevel@tonic-gate 		} else {
6710Sstevel@tonic-gate 			cmn_err(CE_WARN, "md: %s: open of soft partition "
6720Sstevel@tonic-gate 			    "in Errored state.",
6730Sstevel@tonic-gate 			    md_shortname(MD_SID(un)));
6740Sstevel@tonic-gate 			reset_error = 1;
6750Sstevel@tonic-gate 		}
6760Sstevel@tonic-gate 	}
6770Sstevel@tonic-gate 
6780Sstevel@tonic-gate 	if (un->un_numexts == 0) {
6790Sstevel@tonic-gate 		cmn_err(CE_WARN, "md: %s: open failed, soft partition does "
6800Sstevel@tonic-gate 		    "not have any extents.", md_shortname(MD_SID(un)));
6810Sstevel@tonic-gate 		return (-1);
6820Sstevel@tonic-gate 	}
6830Sstevel@tonic-gate 
6840Sstevel@tonic-gate 	len = 0LL;
6850Sstevel@tonic-gate 	for (ext = 0; ext < un->un_numexts; ext++) {
6860Sstevel@tonic-gate 
6870Sstevel@tonic-gate 		/* tally extent lengths to check total size */
6880Sstevel@tonic-gate 		len += un->un_ext[ext].un_len;
6890Sstevel@tonic-gate 
6900Sstevel@tonic-gate 		/* allocate buffer for watermark */
6910Sstevel@tonic-gate 		buf = getrbuf(KM_SLEEP);
6920Sstevel@tonic-gate 
6930Sstevel@tonic-gate 		/* read watermark */
6940Sstevel@tonic-gate 		buf->b_flags = B_READ;
6950Sstevel@tonic-gate 		buf->b_edev = md_dev64_to_dev(un->un_dev);
6960Sstevel@tonic-gate 		buf->b_iodone = NULL;
6970Sstevel@tonic-gate 		buf->b_proc = NULL;
6980Sstevel@tonic-gate 		buf->b_bcount = sizeof (mp_watermark_t);
6990Sstevel@tonic-gate 		buf->b_lblkno = un->un_ext[ext].un_poff - 1;
7000Sstevel@tonic-gate 		buf->b_bufsize = sizeof (mp_watermark_t);
7010Sstevel@tonic-gate 		buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t),
7020Sstevel@tonic-gate 		    KM_SLEEP);
7030Sstevel@tonic-gate 
7040Sstevel@tonic-gate 		/*
7050Sstevel@tonic-gate 		 * make the call non-blocking so that it is not affected
7060Sstevel@tonic-gate 		 * by a set take.
7070Sstevel@tonic-gate 		 */
7080Sstevel@tonic-gate 		md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL);
7090Sstevel@tonic-gate 		(void) biowait(buf);
7100Sstevel@tonic-gate 
7110Sstevel@tonic-gate 		if (buf->b_flags & B_ERROR) {
7120Sstevel@tonic-gate 			cmn_err(CE_WARN, "md: %s: open failed, could not "
7130Sstevel@tonic-gate 			    "read watermark at block %llu for extent %u, "
7140Sstevel@tonic-gate 			    "error %d.", md_shortname(MD_SID(un)),
7150Sstevel@tonic-gate 			    buf->b_lblkno, ext, buf->b_error);
7160Sstevel@tonic-gate 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
7170Sstevel@tonic-gate 			freerbuf(buf);
7180Sstevel@tonic-gate 
7190Sstevel@tonic-gate 			/*
7200Sstevel@tonic-gate 			 * If we're a multi-owner diskset we send a message
7210Sstevel@tonic-gate 			 * indicating that this soft-part has an invalid
7220Sstevel@tonic-gate 			 * extent to the master node. This ensures a consistent
7230Sstevel@tonic-gate 			 * view of the soft-part across the cluster.
7240Sstevel@tonic-gate 			 */
7250Sstevel@tonic-gate 			if (MD_MNSET_SETNO(setno)) {
7260Sstevel@tonic-gate 				sp_send_stat_err(un);
7270Sstevel@tonic-gate 			}
7280Sstevel@tonic-gate 			return (-1);
7290Sstevel@tonic-gate 		}
7300Sstevel@tonic-gate 
7310Sstevel@tonic-gate 		wm = (mp_watermark_t *)buf->b_un.b_addr;
7320Sstevel@tonic-gate 
7330Sstevel@tonic-gate 		/* make sure the checksum is correct first */
7340Sstevel@tonic-gate 		if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
7350Sstevel@tonic-gate 		    (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) {
7360Sstevel@tonic-gate 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
7370Sstevel@tonic-gate 			    "at block %llu for extent %u does not have a "
7380Sstevel@tonic-gate 			    "valid checksum 0x%08x.", md_shortname(MD_SID(un)),
7390Sstevel@tonic-gate 			    buf->b_lblkno, ext, wm->wm_checksum);
7400Sstevel@tonic-gate 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
7410Sstevel@tonic-gate 			freerbuf(buf);
7420Sstevel@tonic-gate 			return (-1);
7430Sstevel@tonic-gate 		}
7440Sstevel@tonic-gate 
7450Sstevel@tonic-gate 		if (wm->wm_magic != MD_SP_MAGIC) {
7460Sstevel@tonic-gate 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
7470Sstevel@tonic-gate 			    "at block %llu for extent %u does not have a "
7480Sstevel@tonic-gate 			    "valid watermark magic number, expected 0x%x, "
7490Sstevel@tonic-gate 			    "found 0x%x.", md_shortname(MD_SID(un)),
7500Sstevel@tonic-gate 			    buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic);
7510Sstevel@tonic-gate 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
7520Sstevel@tonic-gate 			freerbuf(buf);
7530Sstevel@tonic-gate 			return (-1);
7540Sstevel@tonic-gate 		}
7550Sstevel@tonic-gate 
7560Sstevel@tonic-gate 		/* make sure sequence number matches the current extent */
7570Sstevel@tonic-gate 		if (wm->wm_seq != ext) {
7580Sstevel@tonic-gate 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
7590Sstevel@tonic-gate 			    "at block %llu for extent %u has invalid "
7600Sstevel@tonic-gate 			    "sequence number %u.", md_shortname(MD_SID(un)),
7610Sstevel@tonic-gate 			    buf->b_lblkno, ext, wm->wm_seq);
7620Sstevel@tonic-gate 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
7630Sstevel@tonic-gate 			freerbuf(buf);
7640Sstevel@tonic-gate 			return (-1);
7650Sstevel@tonic-gate 		}
7660Sstevel@tonic-gate 
7670Sstevel@tonic-gate 		/* make sure watermark length matches unit structure */
7680Sstevel@tonic-gate 		if (wm->wm_length != un->un_ext[ext].un_len) {
7690Sstevel@tonic-gate 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
7700Sstevel@tonic-gate 			    "at block %llu for extent %u has inconsistent "
7710Sstevel@tonic-gate 			    "length, expected %llu, found %llu.",
7720Sstevel@tonic-gate 			    md_shortname(MD_SID(un)), buf->b_lblkno,
7730Sstevel@tonic-gate 			    ext, un->un_ext[ext].un_len,
7740Sstevel@tonic-gate 			    (u_longlong_t)wm->wm_length);
7750Sstevel@tonic-gate 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
7760Sstevel@tonic-gate 			freerbuf(buf);
7770Sstevel@tonic-gate 			return (-1);
7780Sstevel@tonic-gate 		}
7790Sstevel@tonic-gate 
7800Sstevel@tonic-gate 		/*
7810Sstevel@tonic-gate 		 * make sure the type is a valid soft partition and not
7820Sstevel@tonic-gate 		 * a free extent or the end.
7830Sstevel@tonic-gate 		 */
7840Sstevel@tonic-gate 		if (wm->wm_type != EXTTYP_ALLOC) {
7850Sstevel@tonic-gate 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
7860Sstevel@tonic-gate 			    "at block %llu for extent %u is not marked "
7870Sstevel@tonic-gate 			    "as in-use, type = %u.", md_shortname(MD_SID(un)),
7880Sstevel@tonic-gate 			    buf->b_lblkno, ext, wm->wm_type);
7890Sstevel@tonic-gate 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
7900Sstevel@tonic-gate 			freerbuf(buf);
7910Sstevel@tonic-gate 			return (-1);
7920Sstevel@tonic-gate 		}
7930Sstevel@tonic-gate 		/* free up buffer */
7940Sstevel@tonic-gate 		kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
7950Sstevel@tonic-gate 		freerbuf(buf);
7960Sstevel@tonic-gate 	}
7970Sstevel@tonic-gate 
7980Sstevel@tonic-gate 	if (len != un->un_length) {
7990Sstevel@tonic-gate 		cmn_err(CE_WARN, "md: %s: open failed, computed length "
8000Sstevel@tonic-gate 		    "%llu != expected length %llu.", md_shortname(MD_SID(un)),
8010Sstevel@tonic-gate 		    len, un->un_length);
8020Sstevel@tonic-gate 		return (-1);
8030Sstevel@tonic-gate 	}
8040Sstevel@tonic-gate 
8050Sstevel@tonic-gate 	/*
8060Sstevel@tonic-gate 	 * If we're a multi-owner set _and_ reset_error is set, we should clear
8070Sstevel@tonic-gate 	 * the error condition on all nodes in the set. Use SP_SETSTAT2 with
8080Sstevel@tonic-gate 	 * MD_SP_OK.
8090Sstevel@tonic-gate 	 */
8100Sstevel@tonic-gate 	if (MD_MNSET_SETNO(setno) && reset_error) {
8110Sstevel@tonic-gate 		sp_send_stat_ok(un);
8120Sstevel@tonic-gate 	}
8130Sstevel@tonic-gate 	return (0);
8140Sstevel@tonic-gate }
8150Sstevel@tonic-gate 
8160Sstevel@tonic-gate /*
8170Sstevel@tonic-gate  * FUNCTION:	sp_done()
8180Sstevel@tonic-gate  * INPUT:	child_buf	- buffer attached to child save structure.
8190Sstevel@tonic-gate  *				  this is the buffer on which I/O has just
8200Sstevel@tonic-gate  *				  completed.
8210Sstevel@tonic-gate  * OUTPUT:	none.
8220Sstevel@tonic-gate  * RETURNS:	0	- success.
8230Sstevel@tonic-gate  *		1	- error.
8240Sstevel@tonic-gate  * PURPOSE:	called on I/O completion.
8250Sstevel@tonic-gate  */
8260Sstevel@tonic-gate static int
8270Sstevel@tonic-gate sp_done(struct buf *child_buf)
8280Sstevel@tonic-gate {
8290Sstevel@tonic-gate 	struct buf	*parent_buf;
8300Sstevel@tonic-gate 	mdi_unit_t	*ui;
8310Sstevel@tonic-gate 	md_spps_t	*ps;
8320Sstevel@tonic-gate 	md_spcs_t	*cs;
8330Sstevel@tonic-gate 
8340Sstevel@tonic-gate 	/* find the child save structure to which this buffer belongs */
8350Sstevel@tonic-gate 	cs = (md_spcs_t *)((caddr_t)child_buf -
8360Sstevel@tonic-gate 	    (sizeof (md_spcs_t) - sizeof (buf_t)));
8370Sstevel@tonic-gate 	/* now get the parent save structure */
8380Sstevel@tonic-gate 	ps = cs->cs_ps;
8390Sstevel@tonic-gate 	parent_buf = ps->ps_bp;
8400Sstevel@tonic-gate 
8410Sstevel@tonic-gate 	mutex_enter(&ps->ps_mx);
8420Sstevel@tonic-gate 	/* pass any errors back up to the parent */
8430Sstevel@tonic-gate 	if (child_buf->b_flags & B_ERROR) {
8440Sstevel@tonic-gate 		ps->ps_flags |= MD_SPPS_ERROR;
8450Sstevel@tonic-gate 		parent_buf->b_error = child_buf->b_error;
8460Sstevel@tonic-gate 	}
8470Sstevel@tonic-gate 	/* mapout, if needed */
8480Sstevel@tonic-gate 	if (child_buf->b_flags & B_REMAPPED)
8490Sstevel@tonic-gate 		bp_mapout(child_buf);
8500Sstevel@tonic-gate 
8510Sstevel@tonic-gate 	ps->ps_frags--;
8520Sstevel@tonic-gate 	if (ps->ps_frags != 0) {
8530Sstevel@tonic-gate 		/*
8540Sstevel@tonic-gate 		 * if this parent has more children, we just free the
8550Sstevel@tonic-gate 		 * child and return.
8560Sstevel@tonic-gate 		 */
8570Sstevel@tonic-gate 		kmem_cache_free(sp_child_cache, cs);
8580Sstevel@tonic-gate 		mutex_exit(&ps->ps_mx);
8590Sstevel@tonic-gate 		return (1);
8600Sstevel@tonic-gate 	}
8610Sstevel@tonic-gate 	/* there are no more children */
8620Sstevel@tonic-gate 	kmem_cache_free(sp_child_cache, cs);
8630Sstevel@tonic-gate 	if (ps->ps_flags & MD_SPPS_ERROR) {
8640Sstevel@tonic-gate 		sp_error(ps);
8650Sstevel@tonic-gate 		return (1);
8660Sstevel@tonic-gate 	}
8670Sstevel@tonic-gate 	ui = ps->ps_ui;
8680Sstevel@tonic-gate 	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
8690Sstevel@tonic-gate 		mutex_exit(&ps->ps_mx);
8700Sstevel@tonic-gate 	} else {
8710Sstevel@tonic-gate 		/*
8720Sstevel@tonic-gate 		 * this should only ever happen if we are panicking,
8730Sstevel@tonic-gate 		 * since DONTFREE is only set on the parent if panicstr
8740Sstevel@tonic-gate 		 * is non-NULL.
8750Sstevel@tonic-gate 		 */
8760Sstevel@tonic-gate 		ASSERT(panicstr);
8770Sstevel@tonic-gate 	}
8780Sstevel@tonic-gate 	SPPS_FREE(sp_parent_cache, ps);
8790Sstevel@tonic-gate 	md_kstat_done(ui, parent_buf, 0);
8800Sstevel@tonic-gate 	md_unit_readerexit(ui);
8810Sstevel@tonic-gate 	md_biodone(parent_buf);
8820Sstevel@tonic-gate 	return (0);
8830Sstevel@tonic-gate }
8840Sstevel@tonic-gate 
8850Sstevel@tonic-gate /*
8860Sstevel@tonic-gate  * FUNCTION:	md_sp_strategy()
8870Sstevel@tonic-gate  * INPUT:	parent_buf	- parent buffer
8880Sstevel@tonic-gate  *		flag		- flags
8890Sstevel@tonic-gate  *		private		- private data
8900Sstevel@tonic-gate  * OUTPUT:	none.
8910Sstevel@tonic-gate  * RETURNS:	void.
8920Sstevel@tonic-gate  * PURPOSE:	Soft partitioning I/O strategy.  Performs the main work
8930Sstevel@tonic-gate  *		needed to do I/O to a soft partition.  The basic
8940Sstevel@tonic-gate  *		algorithm is as follows:
8950Sstevel@tonic-gate  *			- Allocate a child save structure to keep track
8960Sstevel@tonic-gate  *			  of the I/O we are going to pass down.
8970Sstevel@tonic-gate  *			- Map the I/O to the correct extent in the soft
8980Sstevel@tonic-gate  *			  partition (see sp_mapbuf()).
8990Sstevel@tonic-gate  *			- bioclone() the buffer and pass it down the
9000Sstevel@tonic-gate  *			  stack using md_call_strategy.
9010Sstevel@tonic-gate  *			- If the I/O needs to split across extents,
9020Sstevel@tonic-gate  *			  repeat the above steps until all fragments
9030Sstevel@tonic-gate  *			  are finished.
9040Sstevel@tonic-gate  */
9050Sstevel@tonic-gate static void
9060Sstevel@tonic-gate md_sp_strategy(buf_t *parent_buf, int flag, void *private)
9070Sstevel@tonic-gate {
9080Sstevel@tonic-gate 	md_spps_t	*ps;
9090Sstevel@tonic-gate 	md_spcs_t	*cs;
9100Sstevel@tonic-gate 	int		more;
9110Sstevel@tonic-gate 	mp_unit_t	*un;
9120Sstevel@tonic-gate 	mdi_unit_t	*ui;
9130Sstevel@tonic-gate 	size_t		current_count;
9140Sstevel@tonic-gate 	off_t		current_offset;
9150Sstevel@tonic-gate 	sp_ext_offset_t	current_blkno;
9160Sstevel@tonic-gate 	buf_t		*child_buf;
9170Sstevel@tonic-gate 	set_t		setno = MD_MIN2SET(getminor(parent_buf->b_edev));
9180Sstevel@tonic-gate 	int		strat_flag = flag;
9190Sstevel@tonic-gate 
9200Sstevel@tonic-gate 	/*
9210Sstevel@tonic-gate 	 * When doing IO to a multi owner meta device, check if set is halted.
9220Sstevel@tonic-gate 	 * We do this check without the needed lock held, for performance
9230Sstevel@tonic-gate 	 * reasons.
9240Sstevel@tonic-gate 	 * If an IO just slips through while the set is locked via an
9250Sstevel@tonic-gate 	 * MD_MN_SUSPEND_SET, we don't care about it.
9260Sstevel@tonic-gate 	 * Only check for suspension if we are a top-level i/o request
9270Sstevel@tonic-gate 	 * (MD_STR_NOTTOP is cleared in 'flag');
9280Sstevel@tonic-gate 	 */
9290Sstevel@tonic-gate 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
9300Sstevel@tonic-gate 	    (MD_SET_HALTED | MD_SET_MNSET)) {
9310Sstevel@tonic-gate 		if ((flag & MD_STR_NOTTOP) == 0) {
9320Sstevel@tonic-gate 			mutex_enter(&md_mx);
9330Sstevel@tonic-gate 			/* Here we loop until the set is no longer halted */
9340Sstevel@tonic-gate 			while (md_set[setno].s_status & MD_SET_HALTED) {
9350Sstevel@tonic-gate 				cv_wait(&md_cv, &md_mx);
9360Sstevel@tonic-gate 			}
9370Sstevel@tonic-gate 			mutex_exit(&md_mx);
9380Sstevel@tonic-gate 		}
9390Sstevel@tonic-gate 	}
9400Sstevel@tonic-gate 
9410Sstevel@tonic-gate 	ui = MDI_UNIT(getminor(parent_buf->b_edev));
9420Sstevel@tonic-gate 
9430Sstevel@tonic-gate 	md_kstat_waitq_enter(ui);
9440Sstevel@tonic-gate 
9450Sstevel@tonic-gate 	un = (mp_unit_t *)md_unit_readerlock(ui);
9460Sstevel@tonic-gate 
9470Sstevel@tonic-gate 	if ((flag & MD_NOBLOCK) == 0) {
9480Sstevel@tonic-gate 		if (md_inc_iocount(setno) != 0) {
9490Sstevel@tonic-gate 			parent_buf->b_flags |= B_ERROR;
9500Sstevel@tonic-gate 			parent_buf->b_error = ENXIO;
9510Sstevel@tonic-gate 			parent_buf->b_resid = parent_buf->b_bcount;
9520Sstevel@tonic-gate 			md_unit_readerexit(ui);
9530Sstevel@tonic-gate 			biodone(parent_buf);
9540Sstevel@tonic-gate 			return;
9550Sstevel@tonic-gate 		}
9560Sstevel@tonic-gate 	} else {
9570Sstevel@tonic-gate 		md_inc_iocount_noblock(setno);
9580Sstevel@tonic-gate 	}
9590Sstevel@tonic-gate 
9600Sstevel@tonic-gate 	if (!(flag & MD_STR_NOTTOP)) {
9610Sstevel@tonic-gate 		if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) {
9620Sstevel@tonic-gate 			md_kstat_waitq_exit(ui);
9630Sstevel@tonic-gate 			return;
9640Sstevel@tonic-gate 		}
9650Sstevel@tonic-gate 	}
9660Sstevel@tonic-gate 
9670Sstevel@tonic-gate 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
9680Sstevel@tonic-gate 	sp_parent_init(ps);
9690Sstevel@tonic-gate 
9700Sstevel@tonic-gate 	/*
9710Sstevel@tonic-gate 	 * Save essential information from the original buffhdr
9720Sstevel@tonic-gate 	 * in the parent.
9730Sstevel@tonic-gate 	 */
9740Sstevel@tonic-gate 	ps->ps_un = un;
9750Sstevel@tonic-gate 	ps->ps_ui = ui;
9760Sstevel@tonic-gate 	ps->ps_bp = parent_buf;
9770Sstevel@tonic-gate 	ps->ps_addr = parent_buf->b_un.b_addr;
9780Sstevel@tonic-gate 
9790Sstevel@tonic-gate 	current_count = parent_buf->b_bcount;
9800Sstevel@tonic-gate 	current_blkno = (sp_ext_offset_t)parent_buf->b_blkno;
9810Sstevel@tonic-gate 	current_offset  = 0;
9820Sstevel@tonic-gate 
9830Sstevel@tonic-gate 	/*
9840Sstevel@tonic-gate 	 * if we are at the top and we are panicking,
9850Sstevel@tonic-gate 	 * we don't free in order to save state.
9860Sstevel@tonic-gate 	 */
9870Sstevel@tonic-gate 	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL))
9880Sstevel@tonic-gate 		ps->ps_flags |= MD_SPPS_DONTFREE;
9890Sstevel@tonic-gate 
9900Sstevel@tonic-gate 	md_kstat_waitq_to_runq(ui);
9910Sstevel@tonic-gate 
9920Sstevel@tonic-gate 	ps->ps_frags++;
9930Sstevel@tonic-gate 
9940Sstevel@tonic-gate 	/*
9950Sstevel@tonic-gate 	 * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this
9960Sstevel@tonic-gate 	 * metadevice.
9970Sstevel@tonic-gate 	 */
9980Sstevel@tonic-gate 	if (ui->ui_tstate & MD_ABR_CAP)
9990Sstevel@tonic-gate 		strat_flag |= MD_STR_ABR;
10000Sstevel@tonic-gate 
10010Sstevel@tonic-gate 	/*
10020Sstevel@tonic-gate 	 * this loop does the main work of an I/O.  we allocate a
10030Sstevel@tonic-gate 	 * a child save for each buf, do the logical to physical
10040Sstevel@tonic-gate 	 * mapping, decide if we need to frag the I/O, clone the
10050Sstevel@tonic-gate 	 * new I/O to pass down the stack.  repeat until we've
10060Sstevel@tonic-gate 	 * taken care of the entire buf that was passed to us.
10070Sstevel@tonic-gate 	 */
10080Sstevel@tonic-gate 	do {
10090Sstevel@tonic-gate 		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
10100Sstevel@tonic-gate 		sp_child_init(cs);
10110Sstevel@tonic-gate 		child_buf = &cs->cs_buf;
10120Sstevel@tonic-gate 		cs->cs_ps = ps;
10130Sstevel@tonic-gate 
10140Sstevel@tonic-gate 		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
10150Sstevel@tonic-gate 		if (more == -1) {
10160Sstevel@tonic-gate 			parent_buf->b_flags |= B_ERROR;
10170Sstevel@tonic-gate 			parent_buf->b_error = EIO;
10180Sstevel@tonic-gate 			md_kstat_done(ui, parent_buf, 0);
10190Sstevel@tonic-gate 			md_unit_readerexit(ui);
10200Sstevel@tonic-gate 			md_biodone(parent_buf);
10210Sstevel@tonic-gate 			kmem_cache_free(sp_parent_cache, ps);
10220Sstevel@tonic-gate 			return;
10230Sstevel@tonic-gate 		}
10240Sstevel@tonic-gate 
10250Sstevel@tonic-gate 		child_buf = md_bioclone(parent_buf, current_offset,
10260Sstevel@tonic-gate 					child_buf->b_bcount, child_buf->b_edev,
10270Sstevel@tonic-gate 					child_buf->b_blkno, sp_done, child_buf,
10280Sstevel@tonic-gate 					KM_NOSLEEP);
10290Sstevel@tonic-gate 		/* calculate new offset, counts, etc... */
10300Sstevel@tonic-gate 		current_offset += child_buf->b_bcount;
10310Sstevel@tonic-gate 		current_count -=  child_buf->b_bcount;
10320Sstevel@tonic-gate 		current_blkno +=  (sp_ext_offset_t)(btodb(child_buf->b_bcount));
10330Sstevel@tonic-gate 
10340Sstevel@tonic-gate 		if (more) {
10350Sstevel@tonic-gate 			mutex_enter(&ps->ps_mx);
10360Sstevel@tonic-gate 			ps->ps_frags++;
10370Sstevel@tonic-gate 			mutex_exit(&ps->ps_mx);
10380Sstevel@tonic-gate 		}
10390Sstevel@tonic-gate 
10400Sstevel@tonic-gate 		md_call_strategy(child_buf, strat_flag, private);
10410Sstevel@tonic-gate 	} while (more);
10420Sstevel@tonic-gate 
10430Sstevel@tonic-gate 	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) {
10440Sstevel@tonic-gate 		while (!(ps->ps_flags & MD_SPPS_DONE)) {
10450Sstevel@tonic-gate 			md_daemon(1, &md_done_daemon);
10460Sstevel@tonic-gate 		}
10470Sstevel@tonic-gate 		kmem_cache_free(sp_parent_cache, ps);
10480Sstevel@tonic-gate 	}
10490Sstevel@tonic-gate }
10500Sstevel@tonic-gate 
10510Sstevel@tonic-gate /*
10520Sstevel@tonic-gate  * FUNCTION:	sp_directed_read()
10530Sstevel@tonic-gate  * INPUT:	mnum	- minor number
10540Sstevel@tonic-gate  *		vdr	- vol_directed_rd_t from user
10550Sstevel@tonic-gate  *		mode	- access mode for copying data out.
10560Sstevel@tonic-gate  * OUTPUT:	none.
10570Sstevel@tonic-gate  * RETURNS:	0	- success
10580Sstevel@tonic-gate  *		Exxxxx	- failure error-code
10590Sstevel@tonic-gate  * PURPOSE:	Construct the necessary sub-device i/o requests to perform the
10600Sstevel@tonic-gate  *		directed read as requested by the user. This is essentially the
10610Sstevel@tonic-gate  *		same as md_sp_strategy() with the exception being that the
10620Sstevel@tonic-gate  *		underlying 'md_call_strategy' is replaced with an ioctl call.
10630Sstevel@tonic-gate  */
10640Sstevel@tonic-gate int
10650Sstevel@tonic-gate sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode)
10660Sstevel@tonic-gate {
10670Sstevel@tonic-gate 	md_spps_t	*ps;
10680Sstevel@tonic-gate 	md_spcs_t	*cs;
10690Sstevel@tonic-gate 	int		more;
10700Sstevel@tonic-gate 	mp_unit_t	*un;
10710Sstevel@tonic-gate 	mdi_unit_t	*ui;
10720Sstevel@tonic-gate 	size_t		current_count;
10730Sstevel@tonic-gate 	off_t		current_offset;
10740Sstevel@tonic-gate 	sp_ext_offset_t	current_blkno;
10750Sstevel@tonic-gate 	buf_t		*child_buf, *parent_buf;
10760Sstevel@tonic-gate 	void		*kbuffer;
10770Sstevel@tonic-gate 	vol_directed_rd_t	cvdr;
10780Sstevel@tonic-gate 	caddr_t		userbuf;
10790Sstevel@tonic-gate 	offset_t	useroff;
10800Sstevel@tonic-gate 	int		ret = 0;
10810Sstevel@tonic-gate 
10820Sstevel@tonic-gate 	ui = MDI_UNIT(mnum);
10830Sstevel@tonic-gate 
10840Sstevel@tonic-gate 	md_kstat_waitq_enter(ui);
10850Sstevel@tonic-gate 
10860Sstevel@tonic-gate 	bzero(&cvdr, sizeof (cvdr));
10870Sstevel@tonic-gate 
10880Sstevel@tonic-gate 	un = (mp_unit_t *)md_unit_readerlock(ui);
10890Sstevel@tonic-gate 
10900Sstevel@tonic-gate 	/*
10910Sstevel@tonic-gate 	 * Construct a parent_buf header which reflects the user-supplied
10920Sstevel@tonic-gate 	 * request.
10930Sstevel@tonic-gate 	 */
10940Sstevel@tonic-gate 
10950Sstevel@tonic-gate 	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
10960Sstevel@tonic-gate 	if (kbuffer == NULL) {
10970Sstevel@tonic-gate 		vdr->vdr_flags |= DKV_DMR_ERROR;
10980Sstevel@tonic-gate 		md_unit_readerexit(ui);
10990Sstevel@tonic-gate 		return (ENOMEM);
11000Sstevel@tonic-gate 	}
11010Sstevel@tonic-gate 
11020Sstevel@tonic-gate 	parent_buf = getrbuf(KM_NOSLEEP);
11030Sstevel@tonic-gate 	if (parent_buf == NULL) {
11040Sstevel@tonic-gate 		vdr->vdr_flags |= DKV_DMR_ERROR;
11050Sstevel@tonic-gate 		md_unit_readerexit(ui);
11060Sstevel@tonic-gate 		kmem_free(kbuffer, vdr->vdr_nbytes);
11070Sstevel@tonic-gate 		return (ENOMEM);
11080Sstevel@tonic-gate 	}
11090Sstevel@tonic-gate 	parent_buf->b_un.b_addr = kbuffer;
11100Sstevel@tonic-gate 	parent_buf->b_flags = B_READ;
11110Sstevel@tonic-gate 	parent_buf->b_bcount = vdr->vdr_nbytes;
11120Sstevel@tonic-gate 	parent_buf->b_lblkno = lbtodb(vdr->vdr_offset);
11130Sstevel@tonic-gate 	parent_buf->b_edev = un->un_dev;
11140Sstevel@tonic-gate 
11150Sstevel@tonic-gate 
11160Sstevel@tonic-gate 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
11170Sstevel@tonic-gate 	sp_parent_init(ps);
11180Sstevel@tonic-gate 
11190Sstevel@tonic-gate 	/*
11200Sstevel@tonic-gate 	 * Save essential information from the original buffhdr
11210Sstevel@tonic-gate 	 * in the parent.
11220Sstevel@tonic-gate 	 */
11230Sstevel@tonic-gate 	ps->ps_un = un;
11240Sstevel@tonic-gate 	ps->ps_ui = ui;
11250Sstevel@tonic-gate 	ps->ps_bp = parent_buf;
11260Sstevel@tonic-gate 	ps->ps_addr = parent_buf->b_un.b_addr;
11270Sstevel@tonic-gate 
11280Sstevel@tonic-gate 	current_count = parent_buf->b_bcount;
11290Sstevel@tonic-gate 	current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno;
11300Sstevel@tonic-gate 	current_offset  = 0;
11310Sstevel@tonic-gate 
11320Sstevel@tonic-gate 	ps->ps_frags++;
11330Sstevel@tonic-gate 	vdr->vdr_bytesread = 0;
11340Sstevel@tonic-gate 
11350Sstevel@tonic-gate 	/*
11360Sstevel@tonic-gate 	 * this loop does the main work of an I/O.  we allocate a
11370Sstevel@tonic-gate 	 * a child save for each buf, do the logical to physical
11380Sstevel@tonic-gate 	 * mapping, decide if we need to frag the I/O, clone the
11390Sstevel@tonic-gate 	 * new I/O to pass down the stack.  repeat until we've
11400Sstevel@tonic-gate 	 * taken care of the entire buf that was passed to us.
11410Sstevel@tonic-gate 	 */
11420Sstevel@tonic-gate 	do {
11430Sstevel@tonic-gate 		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
11440Sstevel@tonic-gate 		sp_child_init(cs);
11450Sstevel@tonic-gate 		child_buf = &cs->cs_buf;
11460Sstevel@tonic-gate 		cs->cs_ps = ps;
11470Sstevel@tonic-gate 
11480Sstevel@tonic-gate 		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
11490Sstevel@tonic-gate 		if (more == -1) {
11500Sstevel@tonic-gate 			ret = EIO;
11510Sstevel@tonic-gate 			vdr->vdr_flags |= DKV_DMR_SHORT;
11520Sstevel@tonic-gate 			kmem_cache_free(sp_child_cache, cs);
11530Sstevel@tonic-gate 			goto err_out;
11540Sstevel@tonic-gate 		}
11550Sstevel@tonic-gate 
11560Sstevel@tonic-gate 		cvdr.vdr_flags = vdr->vdr_flags;
11570Sstevel@tonic-gate 		cvdr.vdr_side = vdr->vdr_side;
11580Sstevel@tonic-gate 		cvdr.vdr_nbytes = child_buf->b_bcount;
11590Sstevel@tonic-gate 		cvdr.vdr_offset = ldbtob(child_buf->b_lblkno);
11600Sstevel@tonic-gate 		/* Work out where we are in the allocated buffer */
116162Sjeanm 		useroff = (offset_t)(uintptr_t)kbuffer;
11620Sstevel@tonic-gate 		useroff = useroff + (offset_t)current_offset;
116362Sjeanm 		cvdr.vdr_data = (void *)(uintptr_t)useroff;
11640Sstevel@tonic-gate 		child_buf = md_bioclone(parent_buf, current_offset,
11650Sstevel@tonic-gate 					child_buf->b_bcount, child_buf->b_edev,
11660Sstevel@tonic-gate 					child_buf->b_blkno, NULL,
11670Sstevel@tonic-gate 					child_buf, KM_NOSLEEP);
11680Sstevel@tonic-gate 		/* calculate new offset, counts, etc... */
11690Sstevel@tonic-gate 		current_offset += child_buf->b_bcount;
11700Sstevel@tonic-gate 		current_count -=  child_buf->b_bcount;
11710Sstevel@tonic-gate 		current_blkno +=  (sp_ext_offset_t)(btodb(child_buf->b_bcount));
11720Sstevel@tonic-gate 
11730Sstevel@tonic-gate 		if (more) {
11740Sstevel@tonic-gate 			mutex_enter(&ps->ps_mx);
11750Sstevel@tonic-gate 			ps->ps_frags++;
11760Sstevel@tonic-gate 			mutex_exit(&ps->ps_mx);
11770Sstevel@tonic-gate 		}
11780Sstevel@tonic-gate 
11790Sstevel@tonic-gate 		ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr,
11800Sstevel@tonic-gate 		    (mode | FKIOCTL), NULL);
11810Sstevel@tonic-gate 
11820Sstevel@tonic-gate 		/*
11830Sstevel@tonic-gate 		 * Free the child structure as we've finished with it.
11840Sstevel@tonic-gate 		 * Normally this would be done by sp_done() but we're just
11850Sstevel@tonic-gate 		 * using md_bioclone() to segment the transfer and we never
11860Sstevel@tonic-gate 		 * issue a strategy request so the iodone will not be called.
11870Sstevel@tonic-gate 		 */
11880Sstevel@tonic-gate 		kmem_cache_free(sp_child_cache, cs);
11890Sstevel@tonic-gate 		if (ret == 0) {
11900Sstevel@tonic-gate 			/* copyout the returned data to vdr_data + offset */
11910Sstevel@tonic-gate 			userbuf = (caddr_t)kbuffer;
11920Sstevel@tonic-gate 			userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer;
11930Sstevel@tonic-gate 			if (ddi_copyout(userbuf, vdr->vdr_data,
11940Sstevel@tonic-gate 			    cvdr.vdr_bytesread, mode)) {
11950Sstevel@tonic-gate 				ret = EFAULT;
11960Sstevel@tonic-gate 				goto err_out;
11970Sstevel@tonic-gate 			}
11980Sstevel@tonic-gate 			vdr->vdr_bytesread += cvdr.vdr_bytesread;
11990Sstevel@tonic-gate 		} else {
12000Sstevel@tonic-gate 			goto err_out;
12010Sstevel@tonic-gate 		}
12020Sstevel@tonic-gate 	} while (more);
12030Sstevel@tonic-gate 
12040Sstevel@tonic-gate 	/*
12050Sstevel@tonic-gate 	 * Update the user-supplied vol_directed_rd_t structure with the
12060Sstevel@tonic-gate 	 * contents of the last issued child request.
12070Sstevel@tonic-gate 	 */
12080Sstevel@tonic-gate 	vdr->vdr_flags = cvdr.vdr_flags;
12090Sstevel@tonic-gate 	vdr->vdr_side = cvdr.vdr_side;
12100Sstevel@tonic-gate 	bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME);
12110Sstevel@tonic-gate 
12120Sstevel@tonic-gate err_out:
12130Sstevel@tonic-gate 	if (ret != 0) {
12140Sstevel@tonic-gate 		vdr->vdr_flags |= DKV_DMR_ERROR;
12150Sstevel@tonic-gate 	}
12160Sstevel@tonic-gate 	if (vdr->vdr_bytesread != vdr->vdr_nbytes) {
12170Sstevel@tonic-gate 		vdr->vdr_flags |= DKV_DMR_SHORT;
12180Sstevel@tonic-gate 	}
12190Sstevel@tonic-gate 	kmem_cache_free(sp_parent_cache, ps);
12200Sstevel@tonic-gate 	kmem_free(kbuffer, vdr->vdr_nbytes);
12210Sstevel@tonic-gate 	freerbuf(parent_buf);
12220Sstevel@tonic-gate 	md_unit_readerexit(ui);
12230Sstevel@tonic-gate 	return (ret);
12240Sstevel@tonic-gate }
12250Sstevel@tonic-gate 
12260Sstevel@tonic-gate /*
12270Sstevel@tonic-gate  * FUNCTION:	sp_snarf()
12280Sstevel@tonic-gate  * INPUT:	cmd	- snarf cmd.
12290Sstevel@tonic-gate  *		setno	- set number.
12300Sstevel@tonic-gate  * OUTPUT:	none.
12310Sstevel@tonic-gate  * RETURNS:	1	- soft partitions were snarfed.
12320Sstevel@tonic-gate  *		0	- no soft partitions were snarfed.
12330Sstevel@tonic-gate  * PURPOSE:	Snarf soft partition metadb records into their in-core
12340Sstevel@tonic-gate  *		structures.  This routine is called at "snarf time" when
12350Sstevel@tonic-gate  *		md loads and gets all metadevices records into memory.
12360Sstevel@tonic-gate  *		The basic algorithm is simply to walk the soft partition
12370Sstevel@tonic-gate  *		records in the metadb and call the soft partitioning
12380Sstevel@tonic-gate  *		build_incore routine to set up the in-core structures.
12390Sstevel@tonic-gate  */
12400Sstevel@tonic-gate static int
12410Sstevel@tonic-gate sp_snarf(md_snarfcmd_t cmd, set_t setno)
12420Sstevel@tonic-gate {
12430Sstevel@tonic-gate 	mp_unit_t	*un;
12440Sstevel@tonic-gate 	mddb_recid_t	recid;
12450Sstevel@tonic-gate 	int		gotsomething;
12460Sstevel@tonic-gate 	int		all_sp_gotten;
12470Sstevel@tonic-gate 	mddb_type_t	rec_type;
12480Sstevel@tonic-gate 	mddb_de_ic_t	*dep;
12490Sstevel@tonic-gate 	mddb_rb32_t	*rbp;
12500Sstevel@tonic-gate 	mp_unit_t	*big_un;
12510Sstevel@tonic-gate 	mp_unit32_od_t	*small_un;
12520Sstevel@tonic-gate 	size_t		newreqsize;
12530Sstevel@tonic-gate 
12540Sstevel@tonic-gate 
12550Sstevel@tonic-gate 	if (cmd == MD_SNARF_CLEANUP)
12560Sstevel@tonic-gate 		return (0);
12570Sstevel@tonic-gate 
12580Sstevel@tonic-gate 	all_sp_gotten = 1;
12590Sstevel@tonic-gate 	gotsomething = 0;
12600Sstevel@tonic-gate 
12610Sstevel@tonic-gate 	/* get the record type */
12620Sstevel@tonic-gate 	rec_type = (mddb_type_t)md_getshared_key(setno,
12630Sstevel@tonic-gate 	    sp_md_ops.md_driver.md_drivername);
12640Sstevel@tonic-gate 	recid = mddb_makerecid(setno, 0);
12650Sstevel@tonic-gate 
12660Sstevel@tonic-gate 	/*
12670Sstevel@tonic-gate 	 * walk soft partition records in the metadb and call
12680Sstevel@tonic-gate 	 * sp_build_incore to build in-core structures.
12690Sstevel@tonic-gate 	 */
12700Sstevel@tonic-gate 	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
12710Sstevel@tonic-gate 		/* if we've already gotten this record, go to the next one */
12720Sstevel@tonic-gate 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
12730Sstevel@tonic-gate 			continue;
12740Sstevel@tonic-gate 
12750Sstevel@tonic-gate 
12760Sstevel@tonic-gate 		dep = mddb_getrecdep(recid);
12770Sstevel@tonic-gate 		dep->de_flags = MDDB_F_SOFTPART;
12780Sstevel@tonic-gate 		rbp = dep->de_rb;
12790Sstevel@tonic-gate 
12800Sstevel@tonic-gate 		if ((rbp->rb_revision == MDDB_REV_RB) &&
12810Sstevel@tonic-gate 		    ((rbp->rb_private & MD_PRV_CONVD) == 0)) {
12820Sstevel@tonic-gate 			/*
12830Sstevel@tonic-gate 			 * This means, we have an old and small record.
12840Sstevel@tonic-gate 			 * And this record hasn't already been converted :-o
12850Sstevel@tonic-gate 			 * before we create an incore metadevice from this
12860Sstevel@tonic-gate 			 * we have to convert it to a big record.
12870Sstevel@tonic-gate 			 */
12880Sstevel@tonic-gate 			small_un = (mp_unit32_od_t *)mddb_getrecaddr(recid);
12890Sstevel@tonic-gate 			newreqsize = sizeof (mp_unit_t) +
12900Sstevel@tonic-gate 					((small_un->un_numexts - 1) *
12910Sstevel@tonic-gate 					sizeof (struct mp_ext));
12920Sstevel@tonic-gate 			big_un = (mp_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP);
12930Sstevel@tonic-gate 			softpart_convert((caddr_t)small_un, (caddr_t)big_un,
12940Sstevel@tonic-gate 			    SMALL_2_BIG);
12950Sstevel@tonic-gate 			kmem_free(small_un, dep->de_reqsize);
12960Sstevel@tonic-gate 			dep->de_rb_userdata = big_un;
12970Sstevel@tonic-gate 			dep->de_reqsize = newreqsize;
12980Sstevel@tonic-gate 			rbp->rb_private |= MD_PRV_CONVD;
12990Sstevel@tonic-gate 			un = big_un;
13000Sstevel@tonic-gate 		} else {
13010Sstevel@tonic-gate 			/* Large device */
13020Sstevel@tonic-gate 			un = (mp_unit_t *)mddb_getrecaddr(recid);
13030Sstevel@tonic-gate 		}
13040Sstevel@tonic-gate 
13050Sstevel@tonic-gate 		/* Set revision and flag accordingly */
13060Sstevel@tonic-gate 		if (rbp->rb_revision == MDDB_REV_RB) {
13070Sstevel@tonic-gate 			un->c.un_revision = MD_32BIT_META_DEV;
13080Sstevel@tonic-gate 		} else {
13090Sstevel@tonic-gate 			un->c.un_revision = MD_64BIT_META_DEV;
13100Sstevel@tonic-gate 			un->c.un_flag |= MD_EFILABEL;
13110Sstevel@tonic-gate 		}
13120Sstevel@tonic-gate 
13130Sstevel@tonic-gate 		/*
13140Sstevel@tonic-gate 		 * Create minor node for snarfed entry.
13150Sstevel@tonic-gate 		 */
13160Sstevel@tonic-gate 		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));
13170Sstevel@tonic-gate 
13180Sstevel@tonic-gate 		if (MD_UNIT(MD_SID(un)) != NULL) {
13190Sstevel@tonic-gate 			/* unit is already in-core */
13200Sstevel@tonic-gate 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
13210Sstevel@tonic-gate 			continue;
13220Sstevel@tonic-gate 		}
13230Sstevel@tonic-gate 		all_sp_gotten = 0;
13240Sstevel@tonic-gate 		if (sp_build_incore((void *)un, 1) == 0) {
13250Sstevel@tonic-gate 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
13260Sstevel@tonic-gate 			md_create_unit_incore(MD_SID(un), &sp_md_ops, 0);
13270Sstevel@tonic-gate 			gotsomething = 1;
13280Sstevel@tonic-gate 		}
13290Sstevel@tonic-gate 	}
13300Sstevel@tonic-gate 
13310Sstevel@tonic-gate 	if (!all_sp_gotten)
13320Sstevel@tonic-gate 		return (gotsomething);
13330Sstevel@tonic-gate 	/* double-check records */
13340Sstevel@tonic-gate 	recid = mddb_makerecid(setno, 0);
13350Sstevel@tonic-gate 	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0)
13360Sstevel@tonic-gate 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
13370Sstevel@tonic-gate 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
13380Sstevel@tonic-gate 
13390Sstevel@tonic-gate 	return (0);
13400Sstevel@tonic-gate }
13410Sstevel@tonic-gate 
13420Sstevel@tonic-gate /*
13430Sstevel@tonic-gate  * FUNCTION:	sp_halt()
13440Sstevel@tonic-gate  * INPUT:	cmd	- halt cmd.
13450Sstevel@tonic-gate  *		setno	- set number.
13460Sstevel@tonic-gate  * RETURNS:	0	- success.
13470Sstevel@tonic-gate  *		1	- err.
13480Sstevel@tonic-gate  * PURPOSE:	Perform driver halt operations.  As with stripe, we
13490Sstevel@tonic-gate  *		support MD_HALT_CHECK and MD_HALT_DOIT.  The first
13500Sstevel@tonic-gate  *		does a check to see if halting can be done safely
13510Sstevel@tonic-gate  *		(no open soft partitions), the second cleans up and
13520Sstevel@tonic-gate  *		shuts down the driver.
13530Sstevel@tonic-gate  */
13540Sstevel@tonic-gate static int
13550Sstevel@tonic-gate sp_halt(md_haltcmd_t cmd, set_t setno)
13560Sstevel@tonic-gate {
13570Sstevel@tonic-gate 	int		i;
13580Sstevel@tonic-gate 	mdi_unit_t	*ui;
13590Sstevel@tonic-gate 	minor_t		mnum;
13600Sstevel@tonic-gate 
13610Sstevel@tonic-gate 	if (cmd == MD_HALT_CLOSE)
13620Sstevel@tonic-gate 		return (0);
13630Sstevel@tonic-gate 
13640Sstevel@tonic-gate 	if (cmd == MD_HALT_OPEN)
13650Sstevel@tonic-gate 		return (0);
13660Sstevel@tonic-gate 
13670Sstevel@tonic-gate 	if (cmd == MD_HALT_UNLOAD)
13680Sstevel@tonic-gate 		return (0);
13690Sstevel@tonic-gate 
13700Sstevel@tonic-gate 	if (cmd == MD_HALT_CHECK) {
13710Sstevel@tonic-gate 		for (i = 0; i < md_nunits; i++) {
13720Sstevel@tonic-gate 			mnum = MD_MKMIN(setno, i);
13730Sstevel@tonic-gate 			if ((ui = MDI_UNIT(mnum)) == NULL)
13740Sstevel@tonic-gate 				continue;
13750Sstevel@tonic-gate 			if (ui->ui_opsindex != sp_md_ops.md_selfindex)
13760Sstevel@tonic-gate 				continue;
13770Sstevel@tonic-gate 			if (md_unit_isopen(ui))
13780Sstevel@tonic-gate 				return (1);
13790Sstevel@tonic-gate 		}
13800Sstevel@tonic-gate 		return (0);
13810Sstevel@tonic-gate 	}
13820Sstevel@tonic-gate 
13830Sstevel@tonic-gate 	if (cmd != MD_HALT_DOIT)
13840Sstevel@tonic-gate 		return (1);
13850Sstevel@tonic-gate 
13860Sstevel@tonic-gate 	for (i = 0; i < md_nunits; i++) {
13870Sstevel@tonic-gate 		mnum = MD_MKMIN(setno, i);
13880Sstevel@tonic-gate 		if ((ui = MDI_UNIT(mnum)) == NULL)
13890Sstevel@tonic-gate 			continue;
13900Sstevel@tonic-gate 		if (ui->ui_opsindex != sp_md_ops.md_selfindex)
13910Sstevel@tonic-gate 			continue;
13920Sstevel@tonic-gate 		reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0);
13930Sstevel@tonic-gate 	}
13940Sstevel@tonic-gate 
13950Sstevel@tonic-gate 	return (0);
13960Sstevel@tonic-gate }
13970Sstevel@tonic-gate 
13980Sstevel@tonic-gate /*
13990Sstevel@tonic-gate  * FUNCTION:	sp_open_dev()
14000Sstevel@tonic-gate  * INPUT:	un	- unit structure.
14010Sstevel@tonic-gate  *		oflags	- open flags.
14020Sstevel@tonic-gate  * OUTPUT:	none.
14030Sstevel@tonic-gate  * RETURNS:	0		- success.
14040Sstevel@tonic-gate  *		non-zero	- err.
14050Sstevel@tonic-gate  * PURPOSE:	open underlying device via md_layered_open.
14060Sstevel@tonic-gate  */
14070Sstevel@tonic-gate static int
14080Sstevel@tonic-gate sp_open_dev(mp_unit_t *un, int oflags)
14090Sstevel@tonic-gate {
14100Sstevel@tonic-gate 	minor_t		mnum = MD_SID(un);
14110Sstevel@tonic-gate 	int		err;
14120Sstevel@tonic-gate 	md_dev64_t	tmpdev;
14130Sstevel@tonic-gate 	set_t		setno = MD_MIN2SET(MD_SID(un));
14140Sstevel@tonic-gate 	side_t		side = mddb_getsidenum(setno);
14150Sstevel@tonic-gate 
14160Sstevel@tonic-gate 	tmpdev = un->un_dev;
14170Sstevel@tonic-gate 	/*
14180Sstevel@tonic-gate 	 * Do the open by device id if underlying is regular
14190Sstevel@tonic-gate 	 */
14200Sstevel@tonic-gate 	if ((md_getmajor(tmpdev) != md_major) &&
14210Sstevel@tonic-gate 		md_devid_found(setno, side, un->un_key) == 1) {
14220Sstevel@tonic-gate 		tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key);
14230Sstevel@tonic-gate 	}
14240Sstevel@tonic-gate 	err = md_layered_open(mnum, &tmpdev, oflags);
14250Sstevel@tonic-gate 	un->un_dev = tmpdev;
14260Sstevel@tonic-gate 
14270Sstevel@tonic-gate 	if (err)
14280Sstevel@tonic-gate 		return (ENXIO);
14290Sstevel@tonic-gate 
14300Sstevel@tonic-gate 	return (0);
14310Sstevel@tonic-gate }
14320Sstevel@tonic-gate 
14330Sstevel@tonic-gate /*
14340Sstevel@tonic-gate  * FUNCTION:	sp_open()
14350Sstevel@tonic-gate  * INPUT:	dev		- device to open.
14360Sstevel@tonic-gate  *		flag		- pass-through flag.
14370Sstevel@tonic-gate  *		otyp		- pass-through open type.
14380Sstevel@tonic-gate  *		cred_p		- credentials.
14390Sstevel@tonic-gate  *		md_oflags	- open flags.
14400Sstevel@tonic-gate  * OUTPUT:	none.
14410Sstevel@tonic-gate  * RETURNS:	0		- success.
14420Sstevel@tonic-gate  *		non-zero	- err.
14430Sstevel@tonic-gate  * PURPOSE:	open a soft partition.
14440Sstevel@tonic-gate  */
14450Sstevel@tonic-gate /* ARGSUSED */
14460Sstevel@tonic-gate static int
14470Sstevel@tonic-gate sp_open(
14480Sstevel@tonic-gate 	dev_t		*dev,
14490Sstevel@tonic-gate 	int		flag,
14500Sstevel@tonic-gate 	int		otyp,
14510Sstevel@tonic-gate 	cred_t		*cred_p,
14520Sstevel@tonic-gate 	int		md_oflags
14530Sstevel@tonic-gate )
14540Sstevel@tonic-gate {
14550Sstevel@tonic-gate 	minor_t		mnum = getminor(*dev);
14560Sstevel@tonic-gate 	mdi_unit_t	*ui = MDI_UNIT(mnum);
14570Sstevel@tonic-gate 	mp_unit_t	*un;
14580Sstevel@tonic-gate 	int		err = 0;
14590Sstevel@tonic-gate 	set_t		setno;
14600Sstevel@tonic-gate 
146146Sskamm 	/*
146246Sskamm 	 * When doing an open of a multi owner metadevice, check to see if this
146346Sskamm 	 * node is a starting node and if a reconfig cycle is underway.
146446Sskamm 	 * If so, the system isn't sufficiently set up enough to handle the
146546Sskamm 	 * open (which involves I/O during sp_validate), so fail with ENXIO.
146646Sskamm 	 */
146746Sskamm 	setno = MD_MIN2SET(mnum);
146846Sskamm 	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
146946Sskamm 	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
147046Sskamm 			return (ENXIO);
147146Sskamm 	}
147246Sskamm 
14730Sstevel@tonic-gate 	/* grab necessary locks */
14740Sstevel@tonic-gate 	un = (mp_unit_t *)md_unit_openclose_enter(ui);
14750Sstevel@tonic-gate 	setno = MD_UN2SET(un);
14760Sstevel@tonic-gate 
14770Sstevel@tonic-gate 	/* open underlying device, if necessary */
14780Sstevel@tonic-gate 	if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
14790Sstevel@tonic-gate 		if ((err = sp_open_dev(un, md_oflags)) != 0)
14800Sstevel@tonic-gate 			goto out;
14810Sstevel@tonic-gate 
14820Sstevel@tonic-gate 		if (MD_MNSET_SETNO(setno)) {
14830Sstevel@tonic-gate 			/* For probe, don't incur the overhead of validate */
14840Sstevel@tonic-gate 			if (!(md_oflags & MD_OFLG_PROBEDEV)) {
14850Sstevel@tonic-gate 				/*
14860Sstevel@tonic-gate 				 * Don't call sp_validate while
14870Sstevel@tonic-gate 				 * unit_openclose lock is held.  So, actually
14880Sstevel@tonic-gate 				 * open the device, drop openclose lock,
14890Sstevel@tonic-gate 				 * call sp_validate, reacquire openclose lock,
14900Sstevel@tonic-gate 				 * and close the device.  If sp_validate
14910Sstevel@tonic-gate 				 * succeeds, then device will be re-opened.
14920Sstevel@tonic-gate 				 */
14930Sstevel@tonic-gate 				if ((err = md_unit_incopen(mnum, flag,
14940Sstevel@tonic-gate 				    otyp)) != 0)
14950Sstevel@tonic-gate 					goto out;
14960Sstevel@tonic-gate 
14970Sstevel@tonic-gate 				mutex_enter(&ui->ui_mx);
14980Sstevel@tonic-gate 				ui->ui_lock |= MD_UL_OPENINPROGRESS;
14990Sstevel@tonic-gate 				mutex_exit(&ui->ui_mx);
15000Sstevel@tonic-gate 				md_unit_openclose_exit(ui);
15010Sstevel@tonic-gate 				if (otyp != OTYP_LYR)
15020Sstevel@tonic-gate 					rw_exit(&md_unit_array_rw.lock);
15030Sstevel@tonic-gate 
15040Sstevel@tonic-gate 				err = sp_validate(un);
15050Sstevel@tonic-gate 
15060Sstevel@tonic-gate 				if (otyp != OTYP_LYR)
15070Sstevel@tonic-gate 					rw_enter(&md_unit_array_rw.lock,
15080Sstevel@tonic-gate 					    RW_READER);
15090Sstevel@tonic-gate 				(void) md_unit_openclose_enter(ui);
15100Sstevel@tonic-gate 				(void) md_unit_decopen(mnum, otyp);
15110Sstevel@tonic-gate 				mutex_enter(&ui->ui_mx);
15120Sstevel@tonic-gate 				ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
15130Sstevel@tonic-gate 				cv_broadcast(&ui->ui_cv);
15140Sstevel@tonic-gate 				mutex_exit(&ui->ui_mx);
15150Sstevel@tonic-gate 				/*
15160Sstevel@tonic-gate 				 * Should be in the same state as before
15170Sstevel@tonic-gate 				 * the sp_validate.
15180Sstevel@tonic-gate 				 */
15190Sstevel@tonic-gate 				if (err != 0) {
15200Sstevel@tonic-gate 					/* close the device opened above */
15210Sstevel@tonic-gate 					md_layered_close(un->un_dev, md_oflags);
15220Sstevel@tonic-gate 					err = EIO;
15230Sstevel@tonic-gate 					goto out;
15240Sstevel@tonic-gate 				}
15250Sstevel@tonic-gate 			}
15260Sstevel@tonic-gate 			/*
15270Sstevel@tonic-gate 			 * As we're a multi-owner metadevice we need to ensure
15280Sstevel@tonic-gate 			 * that all nodes have the same idea of the status.
15290Sstevel@tonic-gate 			 * sp_validate() will mark the device as errored (if
15300Sstevel@tonic-gate 			 * it cannot read the watermark) or ok (if it was
15310Sstevel@tonic-gate 			 * previously errored but the watermark is now valid).
15320Sstevel@tonic-gate 			 * This code-path is only entered on the non-probe open
15330Sstevel@tonic-gate 			 * so we will maintain the errored state during a probe
15340Sstevel@tonic-gate 			 * call. This means the sys-admin must metarecover -m
15350Sstevel@tonic-gate 			 * to reset the soft-partition error.
15360Sstevel@tonic-gate 			 */
15370Sstevel@tonic-gate 		} else {
15380Sstevel@tonic-gate 			/* For probe, don't incur the overhead of validate */
15390Sstevel@tonic-gate 			if (!(md_oflags & MD_OFLG_PROBEDEV) &&
15400Sstevel@tonic-gate 			    (err = sp_validate(un)) != 0) {
15410Sstevel@tonic-gate 				/* close the device opened above */
15420Sstevel@tonic-gate 				md_layered_close(un->un_dev, md_oflags);
15430Sstevel@tonic-gate 				err = EIO;
15440Sstevel@tonic-gate 				goto out;
15450Sstevel@tonic-gate 			} else {
15460Sstevel@tonic-gate 				/*
15470Sstevel@tonic-gate 				 * we succeeded in validating the on disk
15480Sstevel@tonic-gate 				 * format versus the in core, so reset the
15490Sstevel@tonic-gate 				 * status if it's in error
15500Sstevel@tonic-gate 				 */
15510Sstevel@tonic-gate 				if (un->un_status == MD_SP_ERR) {
15520Sstevel@tonic-gate 					un->un_status = MD_SP_OK;
15530Sstevel@tonic-gate 				}
15540Sstevel@tonic-gate 			}
15550Sstevel@tonic-gate 		}
15560Sstevel@tonic-gate 	}
15570Sstevel@tonic-gate 
15580Sstevel@tonic-gate 	/* count open */
15590Sstevel@tonic-gate 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
15600Sstevel@tonic-gate 		goto out;
15610Sstevel@tonic-gate 
15620Sstevel@tonic-gate out:
15630Sstevel@tonic-gate 	md_unit_openclose_exit(ui);
15640Sstevel@tonic-gate 	return (err);
15650Sstevel@tonic-gate }
15660Sstevel@tonic-gate 
15670Sstevel@tonic-gate /*
15680Sstevel@tonic-gate  * FUNCTION:	sp_close()
15690Sstevel@tonic-gate  * INPUT:	dev		- device to close.
15700Sstevel@tonic-gate  *		flag		- pass-through flag.
15710Sstevel@tonic-gate  *		otyp		- pass-through type.
15720Sstevel@tonic-gate  *		cred_p		- credentials.
15730Sstevel@tonic-gate  *		md_cflags	- close flags.
15740Sstevel@tonic-gate  * OUTPUT:	none.
15750Sstevel@tonic-gate  * RETURNS:	0		- success.
15760Sstevel@tonic-gate  *		non-zero	- err.
15770Sstevel@tonic-gate  * PURPOSE:	close a soft paritition.
15780Sstevel@tonic-gate  */
15790Sstevel@tonic-gate /* ARGSUSED */
15800Sstevel@tonic-gate static int
15810Sstevel@tonic-gate sp_close(
15820Sstevel@tonic-gate 	dev_t		dev,
15830Sstevel@tonic-gate 	int		flag,
15840Sstevel@tonic-gate 	int		otyp,
15850Sstevel@tonic-gate 	cred_t		*cred_p,
15860Sstevel@tonic-gate 	int		md_cflags
15870Sstevel@tonic-gate )
15880Sstevel@tonic-gate {
15890Sstevel@tonic-gate 	minor_t		mnum = getminor(dev);
15900Sstevel@tonic-gate 	mdi_unit_t	*ui = MDI_UNIT(mnum);
15910Sstevel@tonic-gate 	mp_unit_t	*un;
15920Sstevel@tonic-gate 	int		err = 0;
15930Sstevel@tonic-gate 
15940Sstevel@tonic-gate 	/* grab necessary locks */
15950Sstevel@tonic-gate 	un = (mp_unit_t *)md_unit_openclose_enter(ui);
15960Sstevel@tonic-gate 
15970Sstevel@tonic-gate 	/* count closed */
15980Sstevel@tonic-gate 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
15990Sstevel@tonic-gate 		goto out;
16000Sstevel@tonic-gate 
16010Sstevel@tonic-gate 	/* close devices, if necessary */
16020Sstevel@tonic-gate 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
16030Sstevel@tonic-gate 		md_layered_close(un->un_dev, md_cflags);
16040Sstevel@tonic-gate 	}
16050Sstevel@tonic-gate 
16060Sstevel@tonic-gate 	/*
16070Sstevel@tonic-gate 	 * If a MN set and transient capabilities (eg ABR/DMR) are set,
16080Sstevel@tonic-gate 	 * clear these capabilities if this is the last close in
16090Sstevel@tonic-gate 	 * the cluster
16100Sstevel@tonic-gate 	 */
16110Sstevel@tonic-gate 	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
16120Sstevel@tonic-gate 	    (ui->ui_tstate & MD_ABR_CAP)) {
16130Sstevel@tonic-gate 		md_unit_openclose_exit(ui);
16140Sstevel@tonic-gate 		mdmn_clear_all_capabilities(mnum);
16150Sstevel@tonic-gate 		return (0);
16160Sstevel@tonic-gate 	}
16170Sstevel@tonic-gate 	/* unlock, return success */
16180Sstevel@tonic-gate out:
16190Sstevel@tonic-gate 	md_unit_openclose_exit(ui);
16200Sstevel@tonic-gate 	return (err);
16210Sstevel@tonic-gate }
16220Sstevel@tonic-gate 
16230Sstevel@tonic-gate 
16240Sstevel@tonic-gate /* used in sp_dump routine */
16250Sstevel@tonic-gate static struct buf dumpbuf;
16260Sstevel@tonic-gate 
16270Sstevel@tonic-gate /*
16280Sstevel@tonic-gate  * FUNCTION:	sp_dump()
16290Sstevel@tonic-gate  * INPUT:	dev	- device to dump to.
16300Sstevel@tonic-gate  *		addr	- address to dump.
16310Sstevel@tonic-gate  *		blkno	- blkno on device.
16320Sstevel@tonic-gate  *		nblk	- number of blocks to dump.
16330Sstevel@tonic-gate  * OUTPUT:	none.
16340Sstevel@tonic-gate  * RETURNS:	result from bdev_dump.
16350Sstevel@tonic-gate  * PURPOSE:  This routine dumps memory to the disk.  It assumes that
16360Sstevel@tonic-gate  *           the memory has already been mapped into mainbus space.
16370Sstevel@tonic-gate  *           It is called at disk interrupt priority when the system
16380Sstevel@tonic-gate  *           is in trouble.
16390Sstevel@tonic-gate  *           NOTE: this function is defined using 32-bit arguments,
16400Sstevel@tonic-gate  *           but soft partitioning is internally 64-bit.  Arguments
16410Sstevel@tonic-gate  *           are casted where appropriate.
16420Sstevel@tonic-gate  */
16430Sstevel@tonic-gate static int
16440Sstevel@tonic-gate sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
16450Sstevel@tonic-gate {
16460Sstevel@tonic-gate 	mp_unit_t	*un;
16470Sstevel@tonic-gate 	buf_t		*bp;
16480Sstevel@tonic-gate 	sp_ext_length_t	nb;
16490Sstevel@tonic-gate 	daddr_t		mapblk;
16500Sstevel@tonic-gate 	int		result;
16510Sstevel@tonic-gate 	int		more;
16520Sstevel@tonic-gate 	int		saveresult = 0;
16530Sstevel@tonic-gate 
16540Sstevel@tonic-gate 	/*
16550Sstevel@tonic-gate 	 * Don't need to grab the unit lock.
16560Sstevel@tonic-gate 	 * Cause nothing else is supposed to be happenning.
16570Sstevel@tonic-gate 	 * Also dump is not supposed to sleep.
16580Sstevel@tonic-gate 	 */
16590Sstevel@tonic-gate 	un = (mp_unit_t *)MD_UNIT(getminor(dev));
16600Sstevel@tonic-gate 
16610Sstevel@tonic-gate 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
16620Sstevel@tonic-gate 		return (EINVAL);
16630Sstevel@tonic-gate 
16640Sstevel@tonic-gate 	if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks)
16650Sstevel@tonic-gate 		return (EINVAL);
16660Sstevel@tonic-gate 
16670Sstevel@tonic-gate 	bp = &dumpbuf;
16680Sstevel@tonic-gate 	nb = (sp_ext_length_t)dbtob(nblk);
16690Sstevel@tonic-gate 	do {
16700Sstevel@tonic-gate 		bzero((caddr_t)bp, sizeof (*bp));
16710Sstevel@tonic-gate 		more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp);
16720Sstevel@tonic-gate 		nblk = (int)(btodb(bp->b_bcount));
16730Sstevel@tonic-gate 		mapblk = bp->b_blkno;
16740Sstevel@tonic-gate 		result = bdev_dump(bp->b_edev, addr, mapblk, nblk);
16750Sstevel@tonic-gate 		if (result)
16760Sstevel@tonic-gate 			saveresult = result;
16770Sstevel@tonic-gate 
16780Sstevel@tonic-gate 		nb -= bp->b_bcount;
16790Sstevel@tonic-gate 		addr += bp->b_bcount;
16800Sstevel@tonic-gate 		blkno += nblk;
16810Sstevel@tonic-gate 	} while (more);
16820Sstevel@tonic-gate 
16830Sstevel@tonic-gate 	return (saveresult);
16840Sstevel@tonic-gate }
16850Sstevel@tonic-gate 
16860Sstevel@tonic-gate static int
16870Sstevel@tonic-gate sp_imp_set(
16880Sstevel@tonic-gate 	set_t	setno
16890Sstevel@tonic-gate )
16900Sstevel@tonic-gate {
16910Sstevel@tonic-gate 	mddb_recid_t	recid;
16920Sstevel@tonic-gate 	int		gotsomething;
16930Sstevel@tonic-gate 	mddb_type_t	rec_type;
16940Sstevel@tonic-gate 	mddb_de_ic_t	*dep;
16950Sstevel@tonic-gate 	mddb_rb32_t	*rbp;
16960Sstevel@tonic-gate 	mp_unit_t	*un64;
16970Sstevel@tonic-gate 	mp_unit32_od_t	*un32;
16980Sstevel@tonic-gate 	minor_t		*self_id;	/* minor needs to be updated */
16990Sstevel@tonic-gate 	md_parent_t	*parent_id;	/* parent needs to be updated */
17000Sstevel@tonic-gate 	mddb_recid_t	*record_id;	/* record id needs to be updated */
17010Sstevel@tonic-gate 
17020Sstevel@tonic-gate 	gotsomething = 0;
17030Sstevel@tonic-gate 
17040Sstevel@tonic-gate 	rec_type = (mddb_type_t)md_getshared_key(setno,
17050Sstevel@tonic-gate 		sp_md_ops.md_driver.md_drivername);
17060Sstevel@tonic-gate 	recid = mddb_makerecid(setno, 0);
17070Sstevel@tonic-gate 
17080Sstevel@tonic-gate 	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
17090Sstevel@tonic-gate 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
17100Sstevel@tonic-gate 			continue;
17110Sstevel@tonic-gate 
17120Sstevel@tonic-gate 		dep = mddb_getrecdep(recid);
17130Sstevel@tonic-gate 		rbp = dep->de_rb;
17140Sstevel@tonic-gate 
17150Sstevel@tonic-gate 		if (rbp->rb_revision == MDDB_REV_RB) {
17160Sstevel@tonic-gate 			/*
17170Sstevel@tonic-gate 			 * Small device
17180Sstevel@tonic-gate 			 */
17190Sstevel@tonic-gate 			un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid);
17200Sstevel@tonic-gate 			self_id = &(un32->c.un_self_id);
17210Sstevel@tonic-gate 			parent_id = &(un32->c.un_parent);
17220Sstevel@tonic-gate 			record_id = &(un32->c.un_record_id);
17230Sstevel@tonic-gate 
17240Sstevel@tonic-gate 			if (!md_update_minor(setno, mddb_getsidenum
17250Sstevel@tonic-gate 				(setno), un32->un_key))
17260Sstevel@tonic-gate 				goto out;
17270Sstevel@tonic-gate 		} else {
17280Sstevel@tonic-gate 			un64 = (mp_unit_t *)mddb_getrecaddr(recid);
17290Sstevel@tonic-gate 			self_id = &(un64->c.un_self_id);
17300Sstevel@tonic-gate 			parent_id = &(un64->c.un_parent);
17310Sstevel@tonic-gate 			record_id = &(un64->c.un_record_id);
17320Sstevel@tonic-gate 
17330Sstevel@tonic-gate 			if (!md_update_minor(setno, mddb_getsidenum
17340Sstevel@tonic-gate 				(setno), un64->un_key))
17350Sstevel@tonic-gate 				goto out;
17360Sstevel@tonic-gate 		}
17370Sstevel@tonic-gate 
17380Sstevel@tonic-gate 		/*
17390Sstevel@tonic-gate 		 * Update unit with the imported setno
17400Sstevel@tonic-gate 		 *
17410Sstevel@tonic-gate 		 */
17420Sstevel@tonic-gate 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
17430Sstevel@tonic-gate 
17440Sstevel@tonic-gate 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
17450Sstevel@tonic-gate 		if (*parent_id != MD_NO_PARENT)
17460Sstevel@tonic-gate 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
17470Sstevel@tonic-gate 		*record_id = MAKERECID(setno, DBID(*record_id));
17480Sstevel@tonic-gate 
17490Sstevel@tonic-gate 		gotsomething = 1;
17500Sstevel@tonic-gate 	}
17510Sstevel@tonic-gate 
17520Sstevel@tonic-gate out:
17530Sstevel@tonic-gate 	return (gotsomething);
17540Sstevel@tonic-gate }
17550Sstevel@tonic-gate 
17560Sstevel@tonic-gate static md_named_services_t sp_named_services[] = {
17570Sstevel@tonic-gate 	{NULL,					0}
17580Sstevel@tonic-gate };
17590Sstevel@tonic-gate 
17600Sstevel@tonic-gate md_ops_t sp_md_ops = {
17610Sstevel@tonic-gate 	sp_open,		/* open */
17620Sstevel@tonic-gate 	sp_close,		/* close */
17630Sstevel@tonic-gate 	md_sp_strategy,		/* strategy */
17640Sstevel@tonic-gate 	NULL,			/* print */
17650Sstevel@tonic-gate 	sp_dump,		/* dump */
17660Sstevel@tonic-gate 	NULL,			/* read */
17670Sstevel@tonic-gate 	NULL,			/* write */
17680Sstevel@tonic-gate 	md_sp_ioctl,		/* ioctl, */
17690Sstevel@tonic-gate 	sp_snarf,		/* snarf */
17700Sstevel@tonic-gate 	sp_halt,		/* halt */
17710Sstevel@tonic-gate 	NULL,			/* aread */
17720Sstevel@tonic-gate 	NULL,			/* awrite */
17730Sstevel@tonic-gate 	sp_imp_set,		/* import set */
17740Sstevel@tonic-gate 	sp_named_services
17750Sstevel@tonic-gate };
17760Sstevel@tonic-gate 
17770Sstevel@tonic-gate static void
17780Sstevel@tonic-gate init_init()
17790Sstevel@tonic-gate {
17800Sstevel@tonic-gate 	sp_parent_cache = kmem_cache_create("md_softpart_parent",
17810Sstevel@tonic-gate 	    sizeof (md_spps_t), 0, sp_parent_constructor,
17820Sstevel@tonic-gate 	    sp_parent_destructor, sp_run_queue, NULL, NULL, 0);
17830Sstevel@tonic-gate 	sp_child_cache = kmem_cache_create("md_softpart_child",
17840Sstevel@tonic-gate 	    sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0,
17850Sstevel@tonic-gate 	    sp_child_constructor, sp_child_destructor, sp_run_queue,
17860Sstevel@tonic-gate 	    NULL, NULL, 0);
17870Sstevel@tonic-gate }
17880Sstevel@tonic-gate 
17890Sstevel@tonic-gate static void
17900Sstevel@tonic-gate fini_uninit()
17910Sstevel@tonic-gate {
17920Sstevel@tonic-gate 	kmem_cache_destroy(sp_parent_cache);
17930Sstevel@tonic-gate 	kmem_cache_destroy(sp_child_cache);
17940Sstevel@tonic-gate 	sp_parent_cache = sp_child_cache = NULL;
17950Sstevel@tonic-gate }
17960Sstevel@tonic-gate 
17970Sstevel@tonic-gate /* define the module linkage */
17980Sstevel@tonic-gate MD_PLUGIN_MISC_MODULE("soft partition module %I%", init_init(), fini_uninit())
1799