xref: /onnv-gate/usr/src/uts/common/io/lvm/softpart/sp.c (revision 0:68f95e015346)
1*0Sstevel@tonic-gate /*
2*0Sstevel@tonic-gate  * CDDL HEADER START
3*0Sstevel@tonic-gate  *
4*0Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*0Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
6*0Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
7*0Sstevel@tonic-gate  * with the License.
8*0Sstevel@tonic-gate  *
9*0Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*0Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
11*0Sstevel@tonic-gate  * See the License for the specific language governing permissions
12*0Sstevel@tonic-gate  * and limitations under the License.
13*0Sstevel@tonic-gate  *
14*0Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
15*0Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*0Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
17*0Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
18*0Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
19*0Sstevel@tonic-gate  *
20*0Sstevel@tonic-gate  * CDDL HEADER END
21*0Sstevel@tonic-gate  */
22*0Sstevel@tonic-gate /*
23*0Sstevel@tonic-gate  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24*0Sstevel@tonic-gate  * Use is subject to license terms.
25*0Sstevel@tonic-gate  */
26*0Sstevel@tonic-gate 
27*0Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
28*0Sstevel@tonic-gate 
29*0Sstevel@tonic-gate /*
30*0Sstevel@tonic-gate  * Soft partitioning metadevice driver (md_sp).
31*0Sstevel@tonic-gate  *
32*0Sstevel@tonic-gate  * This file contains the primary operations of the soft partitioning
33*0Sstevel@tonic-gate  * metadevice driver.  This includes all routines for normal operation
34*0Sstevel@tonic-gate  * (open/close/read/write).  Please see mdvar.h for a definition of
35*0Sstevel@tonic-gate  * metadevice operations vector (md_ops_t).  This driver is loosely
36*0Sstevel@tonic-gate  * based on the stripe driver (md_stripe).
37*0Sstevel@tonic-gate  *
38*0Sstevel@tonic-gate  * All metadevice administration is done through the use of ioctl's.
39*0Sstevel@tonic-gate  * As such, all administrative routines appear in sp_ioctl.c.
40*0Sstevel@tonic-gate  *
41*0Sstevel@tonic-gate  * Soft partitions are represented both in-core and in the metadb with a
42*0Sstevel@tonic-gate  * unit structure.  The soft partition-specific information in the unit
43*0Sstevel@tonic-gate  * structure includes the following information:
44*0Sstevel@tonic-gate  *	- Device information (md_dev64_t & md key) about the device on which
45*0Sstevel@tonic-gate  *	  the soft partition is built.
46*0Sstevel@tonic-gate  *	- Soft partition status information.
47*0Sstevel@tonic-gate  *	- The size of the soft partition and number of extents used to
48*0Sstevel@tonic-gate  *	  make up that size.
49*0Sstevel@tonic-gate  *	- An array of extents which define virtual/physical offset
50*0Sstevel@tonic-gate  *	  mappings and lengths for each extent.
51*0Sstevel@tonic-gate  *
52*0Sstevel@tonic-gate  * Typical soft partition operation proceeds as follows:
53*0Sstevel@tonic-gate  *	- The unit structure is fetched from the metadb and placed into
54*0Sstevel@tonic-gate  *	  an in-core array (as with other metadevices).  This operation
55*0Sstevel@tonic-gate  *	  is performed via sp_build_incore( ) and takes place during
56*0Sstevel@tonic-gate  *	  "snarfing" (when all metadevices are brought in-core at
57*0Sstevel@tonic-gate  *	  once) and when a new soft partition is created.
58*0Sstevel@tonic-gate  *	- A soft partition is opened via sp_open( ).  At open time the
59*0Sstevel@tonic-gate  *	  soft partition unit structure is verified against the soft
60*0Sstevel@tonic-gate  *	  partition on-disk structures.  Additionally, the soft partition
61*0Sstevel@tonic-gate  *	  status is checked (only soft partitions in the OK state may be
62*0Sstevel@tonic-gate  *	  opened).
63*0Sstevel@tonic-gate  *	- Soft partition I/O is performed via md_sp_strategy( ) which relies on
64*0Sstevel@tonic-gate  *	  a support routine, sp_mapbuf( ), to do most of the work.
65*0Sstevel@tonic-gate  *	  sp_mapbuf( ) maps a buffer to a particular extent via a binary
66*0Sstevel@tonic-gate  *	  search of the extent array in the soft partition unit structure.
67*0Sstevel@tonic-gate  *	  Once a translation has been performed, the I/O is passed down
68*0Sstevel@tonic-gate  *	  to the next layer, which may be another metadevice or a physical
69*0Sstevel@tonic-gate  *	  disk.  Since a soft partition may contain multiple, non-contiguous
70*0Sstevel@tonic-gate  *	  extents, a single I/O may have to be fragmented.
71*0Sstevel@tonic-gate  *	- Soft partitions are closed using sp_close.
72*0Sstevel@tonic-gate  *
73*0Sstevel@tonic-gate  */
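/*
 * Illustrative example (hypothetical values): a soft partition with two
 * extents
 *
 *	ext	un_voff		un_poff		un_len
 *	 0	      0		   1034		  2048
 *	 1	   2048		   8192		  1024
 *
 * maps virtual block 2100 to physical block 8192 + (2100 - 2048) = 8244 on
 * the underlying device.  An I/O starting at virtual block 2000 and spanning
 * 100 blocks crosses the extent 0/1 boundary and is therefore issued as two
 * fragments (48 blocks from extent 0, 52 blocks from extent 1).
 */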
74*0Sstevel@tonic-gate 
75*0Sstevel@tonic-gate #include <sys/param.h>
76*0Sstevel@tonic-gate #include <sys/systm.h>
77*0Sstevel@tonic-gate #include <sys/conf.h>
78*0Sstevel@tonic-gate #include <sys/file.h>
79*0Sstevel@tonic-gate #include <sys/user.h>
80*0Sstevel@tonic-gate #include <sys/uio.h>
81*0Sstevel@tonic-gate #include <sys/t_lock.h>
82*0Sstevel@tonic-gate #include <sys/buf.h>
83*0Sstevel@tonic-gate #include <sys/dkio.h>
84*0Sstevel@tonic-gate #include <sys/vtoc.h>
85*0Sstevel@tonic-gate #include <sys/kmem.h>
86*0Sstevel@tonic-gate #include <vm/page.h>
87*0Sstevel@tonic-gate #include <sys/cmn_err.h>
88*0Sstevel@tonic-gate #include <sys/sysmacros.h>
89*0Sstevel@tonic-gate #include <sys/types.h>
90*0Sstevel@tonic-gate #include <sys/mkdev.h>
91*0Sstevel@tonic-gate #include <sys/stat.h>
92*0Sstevel@tonic-gate #include <sys/open.h>
93*0Sstevel@tonic-gate #include <sys/lvm/mdvar.h>
94*0Sstevel@tonic-gate #include <sys/lvm/md_sp.h>
95*0Sstevel@tonic-gate #include <sys/lvm/md_convert.h>
96*0Sstevel@tonic-gate #include <sys/lvm/md_notify.h>
97*0Sstevel@tonic-gate #include <sys/lvm/md_crc.h>
98*0Sstevel@tonic-gate #include <sys/modctl.h>
99*0Sstevel@tonic-gate #include <sys/ddi.h>
100*0Sstevel@tonic-gate #include <sys/sunddi.h>
101*0Sstevel@tonic-gate #include <sys/debug.h>
102*0Sstevel@tonic-gate 
103*0Sstevel@tonic-gate #include <sys/sysevent/eventdefs.h>
104*0Sstevel@tonic-gate #include <sys/sysevent/svm.h>
105*0Sstevel@tonic-gate 
106*0Sstevel@tonic-gate md_ops_t		sp_md_ops;
107*0Sstevel@tonic-gate #ifndef	lint
108*0Sstevel@tonic-gate static char		_depends_on[] = "drv/md";
109*0Sstevel@tonic-gate md_ops_t		*md_interface_ops = &sp_md_ops;
110*0Sstevel@tonic-gate #endif
111*0Sstevel@tonic-gate 
112*0Sstevel@tonic-gate extern unit_t		md_nunits;
113*0Sstevel@tonic-gate extern set_t		md_nsets;
114*0Sstevel@tonic-gate extern md_set_t		md_set[];
115*0Sstevel@tonic-gate 
116*0Sstevel@tonic-gate extern int		md_status;
117*0Sstevel@tonic-gate extern major_t		md_major;
118*0Sstevel@tonic-gate extern mdq_anchor_t	md_done_daemon;
119*0Sstevel@tonic-gate extern mdq_anchor_t	md_sp_daemon;
120*0Sstevel@tonic-gate extern kmutex_t		md_mx;
121*0Sstevel@tonic-gate extern kcondvar_t	md_cv;
122*0Sstevel@tonic-gate extern md_krwlock_t	md_unit_array_rw;
123*0Sstevel@tonic-gate 
124*0Sstevel@tonic-gate static kmem_cache_t	*sp_parent_cache = NULL;
125*0Sstevel@tonic-gate static kmem_cache_t	*sp_child_cache = NULL;
126*0Sstevel@tonic-gate static void		sp_send_stat_ok(mp_unit_t *);
127*0Sstevel@tonic-gate static void		sp_send_stat_err(mp_unit_t *);
128*0Sstevel@tonic-gate 
129*0Sstevel@tonic-gate /*
130*0Sstevel@tonic-gate  * FUNCTION:	sp_parent_constructor()
131*0Sstevel@tonic-gate  * INPUT:	none.
132*0Sstevel@tonic-gate  * OUTPUT:	ps	- parent save structure initialized.
133*0Sstevel@tonic-gate  * RETURNS:	int	- 0 (always).
134*0Sstevel@tonic-gate  * PURPOSE:	initialize parent save structure.
135*0Sstevel@tonic-gate  */
136*0Sstevel@tonic-gate /*ARGSUSED1*/
137*0Sstevel@tonic-gate static int
138*0Sstevel@tonic-gate sp_parent_constructor(void *p, void *d1, int d2)
139*0Sstevel@tonic-gate {
140*0Sstevel@tonic-gate 	mutex_init(&((md_spps_t *)p)->ps_mx,
141*0Sstevel@tonic-gate 	    NULL, MUTEX_DEFAULT, NULL);
142*0Sstevel@tonic-gate 	return (0);
143*0Sstevel@tonic-gate }
144*0Sstevel@tonic-gate 
145*0Sstevel@tonic-gate static void
146*0Sstevel@tonic-gate sp_parent_init(md_spps_t *ps)
147*0Sstevel@tonic-gate {
148*0Sstevel@tonic-gate 	bzero(ps, offsetof(md_spps_t, ps_mx));
149*0Sstevel@tonic-gate }
150*0Sstevel@tonic-gate 
151*0Sstevel@tonic-gate /*ARGSUSED1*/
152*0Sstevel@tonic-gate static void
153*0Sstevel@tonic-gate sp_parent_destructor(void *p, void *d)
154*0Sstevel@tonic-gate {
155*0Sstevel@tonic-gate 	mutex_destroy(&((md_spps_t *)p)->ps_mx);
156*0Sstevel@tonic-gate }
157*0Sstevel@tonic-gate 
158*0Sstevel@tonic-gate /*
159*0Sstevel@tonic-gate  * FUNCTION:	sp_child_constructor()
160*0Sstevel@tonic-gate  * INPUT:	none.
161*0Sstevel@tonic-gate  * OUTPUT:	cs	- child save structure initialized.
162*0Sstevel@tonic-gate  * RETURNS:	int	- 0 (always).
163*0Sstevel@tonic-gate  * PURPOSE:	initialize child save structure.
164*0Sstevel@tonic-gate  */
165*0Sstevel@tonic-gate /*ARGSUSED1*/
166*0Sstevel@tonic-gate static int
167*0Sstevel@tonic-gate sp_child_constructor(void *p, void *d1, int d2)
168*0Sstevel@tonic-gate {
169*0Sstevel@tonic-gate 	bioinit(&((md_spcs_t *)p)->cs_buf);
170*0Sstevel@tonic-gate 	return (0);
171*0Sstevel@tonic-gate }
172*0Sstevel@tonic-gate 
173*0Sstevel@tonic-gate static void
174*0Sstevel@tonic-gate sp_child_init(md_spcs_t *cs)
175*0Sstevel@tonic-gate {
176*0Sstevel@tonic-gate 	cs->cs_mdunit = 0;
177*0Sstevel@tonic-gate 	cs->cs_ps = NULL;
178*0Sstevel@tonic-gate 	md_bioreset(&cs->cs_buf);
179*0Sstevel@tonic-gate }
180*0Sstevel@tonic-gate 
181*0Sstevel@tonic-gate /*ARGSUSED1*/
182*0Sstevel@tonic-gate static void
183*0Sstevel@tonic-gate sp_child_destructor(void *p, void *d)
184*0Sstevel@tonic-gate {
185*0Sstevel@tonic-gate 	biofini(&((md_spcs_t *)p)->cs_buf);
186*0Sstevel@tonic-gate }
187*0Sstevel@tonic-gate 
188*0Sstevel@tonic-gate /*
189*0Sstevel@tonic-gate  * FUNCTION:	sp_run_queue()
190*0Sstevel@tonic-gate  * INPUT:	none.
191*0Sstevel@tonic-gate  * OUTPUT:	none.
192*0Sstevel@tonic-gate  * RETURNS:	void.
193*0Sstevel@tonic-gate  * PURPOSE:	run the md_daemon to clean up memory pool.
194*0Sstevel@tonic-gate  */
195*0Sstevel@tonic-gate /*ARGSUSED*/
196*0Sstevel@tonic-gate static void
197*0Sstevel@tonic-gate sp_run_queue(void *d)
198*0Sstevel@tonic-gate {
199*0Sstevel@tonic-gate 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
200*0Sstevel@tonic-gate 		md_daemon(1, &md_done_daemon);
201*0Sstevel@tonic-gate }
202*0Sstevel@tonic-gate 
203*0Sstevel@tonic-gate 
204*0Sstevel@tonic-gate /*
205*0Sstevel@tonic-gate  * FUNCTION:	sp_build_incore()
206*0Sstevel@tonic-gate  * INPUT:	p		- ptr to unit structure.
207*0Sstevel@tonic-gate  *		snarfing	- flag to tell us we are snarfing.
208*0Sstevel@tonic-gate  * OUTPUT:	none.
209*0Sstevel@tonic-gate  * RETURNS:	int	- 0 (always).
210*0Sstevel@tonic-gate  * PURPOSE:	place unit structure into in-core unit array (keyed from
211*0Sstevel@tonic-gate  *		minor number).
212*0Sstevel@tonic-gate  */
213*0Sstevel@tonic-gate int
214*0Sstevel@tonic-gate sp_build_incore(void *p, int snarfing)
215*0Sstevel@tonic-gate {
216*0Sstevel@tonic-gate 	mp_unit_t	*un = (mp_unit_t *)p;
217*0Sstevel@tonic-gate 	minor_t		mnum;
218*0Sstevel@tonic-gate 	set_t		setno;
219*0Sstevel@tonic-gate 	md_dev64_t	tmpdev;
220*0Sstevel@tonic-gate 
221*0Sstevel@tonic-gate 	mnum = MD_SID(un);
222*0Sstevel@tonic-gate 
223*0Sstevel@tonic-gate 	if (MD_UNIT(mnum) != NULL)
224*0Sstevel@tonic-gate 		return (0);
225*0Sstevel@tonic-gate 
226*0Sstevel@tonic-gate 	MD_STATUS(un) = 0;
227*0Sstevel@tonic-gate 
228*0Sstevel@tonic-gate 	if (snarfing) {
229*0Sstevel@tonic-gate 		/*
230*0Sstevel@tonic-gate 		 * if we are snarfing, we get the device information
231*0Sstevel@tonic-gate 		 * from the metadb record (using the metadb key for
232*0Sstevel@tonic-gate 		 * that device).
233*0Sstevel@tonic-gate 		 */
234*0Sstevel@tonic-gate 		setno = MD_MIN2SET(mnum);
235*0Sstevel@tonic-gate 
236*0Sstevel@tonic-gate 		tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
237*0Sstevel@tonic-gate 		    un->un_key, MD_NOTRUST_DEVT);
238*0Sstevel@tonic-gate 		un->un_dev = tmpdev;
239*0Sstevel@tonic-gate 	}
240*0Sstevel@tonic-gate 
241*0Sstevel@tonic-gate 	/* place unit in in-core array */
242*0Sstevel@tonic-gate 	MD_UNIT(mnum) = un;
243*0Sstevel@tonic-gate 	return (0);
244*0Sstevel@tonic-gate }
245*0Sstevel@tonic-gate 
246*0Sstevel@tonic-gate /*
247*0Sstevel@tonic-gate  * FUNCTION:	reset_sp()
248*0Sstevel@tonic-gate  * INPUT:	un		- unit structure to be reset/removed.
249*0Sstevel@tonic-gate  *		mnum		- minor number to be reset/removed.
250*0Sstevel@tonic-gate  *		removing	- flag to tell us if we are removing
251*0Sstevel@tonic-gate  *				  permanently or just resetting in-core
252*0Sstevel@tonic-gate  *				  structures.
253*0Sstevel@tonic-gate  * OUTPUT:	none.
254*0Sstevel@tonic-gate  * RETURNS:	void.
255*0Sstevel@tonic-gate  * PURPOSE:	used to either simply reset in-core structures or to
256*0Sstevel@tonic-gate  *		permanently remove metadevices from the metadb.
257*0Sstevel@tonic-gate  */
258*0Sstevel@tonic-gate void
259*0Sstevel@tonic-gate reset_sp(mp_unit_t *un, minor_t mnum, int removing)
260*0Sstevel@tonic-gate {
261*0Sstevel@tonic-gate 	sv_dev_t	*sv;
262*0Sstevel@tonic-gate 	mddb_recid_t	vtoc_id;
263*0Sstevel@tonic-gate 
264*0Sstevel@tonic-gate 	/* clean up in-core structures */
265*0Sstevel@tonic-gate 	md_destroy_unit_incore(mnum, &sp_md_ops);
266*0Sstevel@tonic-gate 
267*0Sstevel@tonic-gate 	MD_UNIT(mnum) = NULL;
268*0Sstevel@tonic-gate 
269*0Sstevel@tonic-gate 	if (!removing)
270*0Sstevel@tonic-gate 		return;
271*0Sstevel@tonic-gate 
272*0Sstevel@tonic-gate 	/* we are removing the soft partition from the metadb */
273*0Sstevel@tonic-gate 
274*0Sstevel@tonic-gate 	/*
275*0Sstevel@tonic-gate 	 * Save off device information so we can get to
276*0Sstevel@tonic-gate 	 * it after we do the mddb_deleterec().
277*0Sstevel@tonic-gate 	 */
278*0Sstevel@tonic-gate 	sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
279*0Sstevel@tonic-gate 	sv->setno = MD_MIN2SET(mnum);
280*0Sstevel@tonic-gate 	sv->key = un->un_key;
281*0Sstevel@tonic-gate 	vtoc_id = un->c.un_vtoc_id;
282*0Sstevel@tonic-gate 
283*0Sstevel@tonic-gate 	/* Remove the unit structure */
284*0Sstevel@tonic-gate 	mddb_deleterec_wrapper(un->c.un_record_id);
285*0Sstevel@tonic-gate 
286*0Sstevel@tonic-gate 	if (vtoc_id)
287*0Sstevel@tonic-gate 		mddb_deleterec_wrapper(vtoc_id);
288*0Sstevel@tonic-gate 
289*0Sstevel@tonic-gate 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE,
290*0Sstevel@tonic-gate 	    MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));
291*0Sstevel@tonic-gate 
292*0Sstevel@tonic-gate 	/*
293*0Sstevel@tonic-gate 	 * remove the underlying device name from the metadb.  if other
294*0Sstevel@tonic-gate 	 * soft partitions are built on this device, this will simply
295*0Sstevel@tonic-gate 	 * decrease the reference count for this device.  otherwise the
296*0Sstevel@tonic-gate 	 * name record for this device will be removed from the metadb.
297*0Sstevel@tonic-gate 	 */
298*0Sstevel@tonic-gate 	md_rem_names(sv, 1);
299*0Sstevel@tonic-gate 	kmem_free(sv, sizeof (sv_dev_t));
300*0Sstevel@tonic-gate }
301*0Sstevel@tonic-gate 
302*0Sstevel@tonic-gate /*
303*0Sstevel@tonic-gate  * FUNCTION:	sp_send_stat_msg
304*0Sstevel@tonic-gate  * INPUT:	un	- unit reference
305*0Sstevel@tonic-gate  *		status	- status to be sent to master node
306*0Sstevel@tonic-gate  *			MD_SP_OK - soft-partition is now OK
307*0Sstevel@tonic-gate  *			MD_SP_ERR	"	"	 errored
308*0Sstevel@tonic-gate  * OUTPUT:	none.
309*0Sstevel@tonic-gate  * RETURNS:	void.
310*0Sstevel@tonic-gate  * PURPOSE:	send a soft-partition status change to the master node. If the
311*0Sstevel@tonic-gate  *		message succeeds we simply return. If it fails we panic as the
312*0Sstevel@tonic-gate  *		cluster-wide view of the metadevices is now inconsistent.
313*0Sstevel@tonic-gate  * CALLING CONTEXT:
314*0Sstevel@tonic-gate  *	Blockable. No locks can be held.
315*0Sstevel@tonic-gate  */
316*0Sstevel@tonic-gate static void
317*0Sstevel@tonic-gate sp_send_stat_msg(mp_unit_t *un, sp_status_t status)
318*0Sstevel@tonic-gate {
319*0Sstevel@tonic-gate 	md_mn_msg_sp_setstat_t	sp_msg;
320*0Sstevel@tonic-gate 	md_mn_kresult_t	*kres;
321*0Sstevel@tonic-gate 	set_t		setno = MD_UN2SET(un);
322*0Sstevel@tonic-gate 	int		rval;
323*0Sstevel@tonic-gate 	const char	*str = (status == MD_SP_ERR) ? "MD_SP_ERR" : "MD_SP_OK";
324*0Sstevel@tonic-gate 
325*0Sstevel@tonic-gate 	sp_msg.sp_setstat_mnum = MD_SID(un);
326*0Sstevel@tonic-gate 	sp_msg.sp_setstat_status = status;
327*0Sstevel@tonic-gate 
328*0Sstevel@tonic-gate 	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
329*0Sstevel@tonic-gate 
330*0Sstevel@tonic-gate 	rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG,
331*0Sstevel@tonic-gate 	    (char *)&sp_msg, sizeof (sp_msg), kres);
332*0Sstevel@tonic-gate 
333*0Sstevel@tonic-gate 	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
334*0Sstevel@tonic-gate 		mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2");
335*0Sstevel@tonic-gate 
336*0Sstevel@tonic-gate 		/*
337*0Sstevel@tonic-gate 		 * Panic as we are now in an inconsistent state.
338*0Sstevel@tonic-gate 		 */
339*0Sstevel@tonic-gate 
340*0Sstevel@tonic-gate 		cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n",
341*0Sstevel@tonic-gate 		    md_shortname(MD_SID(un)), str);
342*0Sstevel@tonic-gate 	}
343*0Sstevel@tonic-gate 
344*0Sstevel@tonic-gate 	kmem_free(kres, sizeof (md_mn_kresult_t));
345*0Sstevel@tonic-gate }
346*0Sstevel@tonic-gate 
347*0Sstevel@tonic-gate /*
348*0Sstevel@tonic-gate  * FUNCTION:	sp_finish_error
349*0Sstevel@tonic-gate  * INPUT:	ps	- parent save structure for errored I/O.
350*0Sstevel@tonic-gate  *		lock_held	- set if the unit readerlock is held
351*0Sstevel@tonic-gate  * OUTPUT:	none.
352*0Sstevel@tonic-gate  * RETURNS:	void.
353*0Sstevel@tonic-gate  * PURPOSE:	report a driver error
354*0Sstevel@tonic-gate  */
355*0Sstevel@tonic-gate static void
356*0Sstevel@tonic-gate sp_finish_error(md_spps_t *ps, int lock_held)
357*0Sstevel@tonic-gate {
358*0Sstevel@tonic-gate 	struct buf	*pb = ps->ps_bp;
359*0Sstevel@tonic-gate 	mdi_unit_t	*ui = ps->ps_ui;
360*0Sstevel@tonic-gate 	md_dev64_t	un_dev;			/* underlying device */
361*0Sstevel@tonic-gate 	md_dev64_t	md_dev = md_expldev(pb->b_edev); /* metadev in error */
362*0Sstevel@tonic-gate 	char		*str;
363*0Sstevel@tonic-gate 
364*0Sstevel@tonic-gate 	un_dev = md_expldev(ps->ps_un->un_dev);
365*0Sstevel@tonic-gate 	/* set error type */
366*0Sstevel@tonic-gate 	if (pb->b_flags & B_READ) {
367*0Sstevel@tonic-gate 		str = "read";
368*0Sstevel@tonic-gate 	} else {
369*0Sstevel@tonic-gate 		str = "write";
370*0Sstevel@tonic-gate 	}
371*0Sstevel@tonic-gate 
372*0Sstevel@tonic-gate 
373*0Sstevel@tonic-gate 	SPPS_FREE(sp_parent_cache, ps);
374*0Sstevel@tonic-gate 	pb->b_flags |= B_ERROR;
375*0Sstevel@tonic-gate 
376*0Sstevel@tonic-gate 	md_kstat_done(ui, pb, 0);
377*0Sstevel@tonic-gate 
378*0Sstevel@tonic-gate 	if (lock_held) {
379*0Sstevel@tonic-gate 		md_unit_readerexit(ui);
380*0Sstevel@tonic-gate 	}
381*0Sstevel@tonic-gate 	md_biodone(pb);
382*0Sstevel@tonic-gate 
383*0Sstevel@tonic-gate 	cmn_err(CE_WARN, "md: %s: %s error on %s",
384*0Sstevel@tonic-gate 	    md_shortname(md_getminor(md_dev)), str,
385*0Sstevel@tonic-gate 	    md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0));
386*0Sstevel@tonic-gate }
387*0Sstevel@tonic-gate 
388*0Sstevel@tonic-gate 
389*0Sstevel@tonic-gate /*
390*0Sstevel@tonic-gate  * FUNCTION:	sp_xmit_ok
391*0Sstevel@tonic-gate  * INPUT:	dq	- daemon queue referencing failing ps structure
392*0Sstevel@tonic-gate  * OUTPUT:	none.
393*0Sstevel@tonic-gate  * RETURNS:	void.
394*0Sstevel@tonic-gate  * PURPOSE:	send a message to the master node in a multi-owner diskset to
395*0Sstevel@tonic-gate  *		update all attached nodes' view of the soft-part to be MD_SP_OK.
396*0Sstevel@tonic-gate  * CALLING CONTEXT:
397*0Sstevel@tonic-gate  *	Blockable. No unit lock held.
398*0Sstevel@tonic-gate  */
399*0Sstevel@tonic-gate static void
400*0Sstevel@tonic-gate sp_xmit_ok(daemon_queue_t *dq)
401*0Sstevel@tonic-gate {
402*0Sstevel@tonic-gate 	md_spps_t	*ps = (md_spps_t *)dq;
403*0Sstevel@tonic-gate 
404*0Sstevel@tonic-gate 	/* Send a MD_MN_MSG_SP_SETSTAT to the master */
405*0Sstevel@tonic-gate 	sp_send_stat_msg(ps->ps_un, MD_SP_OK);
406*0Sstevel@tonic-gate 
407*0Sstevel@tonic-gate 	/*
408*0Sstevel@tonic-gate 	 * Successfully transmitted the OK state to all nodes, now release this
409*0Sstevel@tonic-gate 	 * parent structure.
410*0Sstevel@tonic-gate 	 */
411*0Sstevel@tonic-gate 	SPPS_FREE(sp_parent_cache, ps);
412*0Sstevel@tonic-gate }
413*0Sstevel@tonic-gate 
414*0Sstevel@tonic-gate /*
415*0Sstevel@tonic-gate  * FUNCTION:	sp_xmit_error
416*0Sstevel@tonic-gate  * INPUT:	dq	- daemon queue referencing failing ps structure
417*0Sstevel@tonic-gate  * OUTPUT:	none.
418*0Sstevel@tonic-gate  * RETURNS:	void.
419*0Sstevel@tonic-gate  * PURPOSE:	send a message to the master node in a multi-owner diskset to
420*0Sstevel@tonic-gate  *		update all attached nodes' view of the soft-part to be MD_SP_ERR.
421*0Sstevel@tonic-gate  * CALLING CONTEXT:
422*0Sstevel@tonic-gate  *	Blockable. No unit lock held.
423*0Sstevel@tonic-gate  */
424*0Sstevel@tonic-gate static void
425*0Sstevel@tonic-gate sp_xmit_error(daemon_queue_t *dq)
426*0Sstevel@tonic-gate {
427*0Sstevel@tonic-gate 	md_spps_t	*ps = (md_spps_t *)dq;
428*0Sstevel@tonic-gate 
429*0Sstevel@tonic-gate 	/* Send a MD_MN_MSG_SP_SETSTAT to the master */
430*0Sstevel@tonic-gate 	sp_send_stat_msg(ps->ps_un, MD_SP_ERR);
431*0Sstevel@tonic-gate 
432*0Sstevel@tonic-gate 	/*
433*0Sstevel@tonic-gate 	 * Successfully transmitted error state to all nodes, now release this
434*0Sstevel@tonic-gate 	 * parent structure.
435*0Sstevel@tonic-gate 	 */
436*0Sstevel@tonic-gate 	SPPS_FREE(sp_parent_cache, ps);
437*0Sstevel@tonic-gate }
438*0Sstevel@tonic-gate static void
439*0Sstevel@tonic-gate sp_send_stat_ok(mp_unit_t *un)
440*0Sstevel@tonic-gate {
441*0Sstevel@tonic-gate 	minor_t		mnum = MD_SID(un);
442*0Sstevel@tonic-gate 	md_spps_t	*ps;
443*0Sstevel@tonic-gate 
444*0Sstevel@tonic-gate 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
445*0Sstevel@tonic-gate 	sp_parent_init(ps);
446*0Sstevel@tonic-gate 	ps->ps_un = un;
447*0Sstevel@tonic-gate 	ps->ps_ui = MDI_UNIT(mnum);
448*0Sstevel@tonic-gate 
449*0Sstevel@tonic-gate 	daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps,
450*0Sstevel@tonic-gate 	REQ_OLD);
451*0Sstevel@tonic-gate }
452*0Sstevel@tonic-gate 
453*0Sstevel@tonic-gate static void
454*0Sstevel@tonic-gate sp_send_stat_err(mp_unit_t *un)
455*0Sstevel@tonic-gate {
456*0Sstevel@tonic-gate 	minor_t		mnum = MD_SID(un);
457*0Sstevel@tonic-gate 	md_spps_t	*ps;
458*0Sstevel@tonic-gate 
459*0Sstevel@tonic-gate 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
460*0Sstevel@tonic-gate 	sp_parent_init(ps);
461*0Sstevel@tonic-gate 	ps->ps_un = un;
462*0Sstevel@tonic-gate 	ps->ps_ui = MDI_UNIT(mnum);
463*0Sstevel@tonic-gate 
464*0Sstevel@tonic-gate 	daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps,
465*0Sstevel@tonic-gate 	REQ_OLD);
466*0Sstevel@tonic-gate }
467*0Sstevel@tonic-gate 
468*0Sstevel@tonic-gate 
469*0Sstevel@tonic-gate /*
470*0Sstevel@tonic-gate  * FUNCTION:	sp_error()
471*0Sstevel@tonic-gate  * INPUT:	ps	- parent save structure for errored I/O.
472*0Sstevel@tonic-gate  * OUTPUT:	none.
473*0Sstevel@tonic-gate  * RETURNS:	void.
474*0Sstevel@tonic-gate  * PURPOSE:	report a driver error.
475*0Sstevel@tonic-gate  * CALLING CONTEXT:
476*0Sstevel@tonic-gate  *	Interrupt - non-blockable
477*0Sstevel@tonic-gate  */
478*0Sstevel@tonic-gate static void
479*0Sstevel@tonic-gate sp_error(md_spps_t *ps)
480*0Sstevel@tonic-gate {
481*0Sstevel@tonic-gate 	set_t		setno = MD_UN2SET(ps->ps_un);
482*0Sstevel@tonic-gate 
483*0Sstevel@tonic-gate 	/*
484*0Sstevel@tonic-gate 	 * Drop the mutex associated with this request before (potentially)
485*0Sstevel@tonic-gate 	 * enqueuing the free onto a separate thread. We have to release the
486*0Sstevel@tonic-gate 	 * mutex before destroying the parent structure.
487*0Sstevel@tonic-gate 	 */
488*0Sstevel@tonic-gate 	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
489*0Sstevel@tonic-gate 		if (MUTEX_HELD(&ps->ps_mx)) {
490*0Sstevel@tonic-gate 			mutex_exit(&ps->ps_mx);
491*0Sstevel@tonic-gate 		}
492*0Sstevel@tonic-gate 	} else {
493*0Sstevel@tonic-gate 		/*
494*0Sstevel@tonic-gate 		 * this should only ever happen if we are panicking,
495*0Sstevel@tonic-gate 		 * since DONTFREE is only set on the parent if panicstr
496*0Sstevel@tonic-gate 		 * is non-NULL.
497*0Sstevel@tonic-gate 		 */
498*0Sstevel@tonic-gate 		ASSERT(panicstr);
499*0Sstevel@tonic-gate 	}
500*0Sstevel@tonic-gate 
501*0Sstevel@tonic-gate 	/*
502*0Sstevel@tonic-gate 	 * For a multi-owner set we need to send a message to the master so that
503*0Sstevel@tonic-gate 	 * all nodes get the errored status when we first encounter it. To avoid
504*0Sstevel@tonic-gate 	 * deadlocking when multiple soft-partitions encounter an error on one
505*0Sstevel@tonic-gate 	 * physical unit we drop the unit readerlock before enqueueing the
506*0Sstevel@tonic-gate 	 * request. That way we can service any messages that require a
507*0Sstevel@tonic-gate 	 * writerlock to be held. Additionally, to avoid deadlocking when at
508*0Sstevel@tonic-gate 	 * the bottom of a metadevice stack and a higher level mirror has
509*0Sstevel@tonic-gate 	 * multiple requests outstanding on this soft-part, we clone the ps
510*0Sstevel@tonic-gate 	 * that failed and pass the error back up the stack to release the
511*0Sstevel@tonic-gate 	 * reference that this i/o may have in the higher-level metadevice.
512*0Sstevel@tonic-gate 	 * The other nodes in the cluster just have to modify the soft-part
513*0Sstevel@tonic-gate 	 * status and we do not need to block the i/o completion for this.
514*0Sstevel@tonic-gate 	 */
515*0Sstevel@tonic-gate 	if (MD_MNSET_SETNO(setno)) {
516*0Sstevel@tonic-gate 		md_spps_t	*err_ps;
517*0Sstevel@tonic-gate 		err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
518*0Sstevel@tonic-gate 		sp_parent_init(err_ps);
519*0Sstevel@tonic-gate 
520*0Sstevel@tonic-gate 		err_ps->ps_un = ps->ps_un;
521*0Sstevel@tonic-gate 		err_ps->ps_ui = ps->ps_ui;
522*0Sstevel@tonic-gate 
523*0Sstevel@tonic-gate 		md_unit_readerexit(ps->ps_ui);
524*0Sstevel@tonic-gate 
525*0Sstevel@tonic-gate 		daemon_request(&md_sp_daemon, sp_xmit_error,
526*0Sstevel@tonic-gate 		    (daemon_queue_t *)err_ps, REQ_OLD);
527*0Sstevel@tonic-gate 
528*0Sstevel@tonic-gate 		sp_finish_error(ps, 0);
529*0Sstevel@tonic-gate 
530*0Sstevel@tonic-gate 		return;
531*0Sstevel@tonic-gate 	} else {
532*0Sstevel@tonic-gate 		ps->ps_un->un_status = MD_SP_ERR;
533*0Sstevel@tonic-gate 	}
534*0Sstevel@tonic-gate 
535*0Sstevel@tonic-gate 	/* Flag the error */
536*0Sstevel@tonic-gate 	sp_finish_error(ps, 1);
537*0Sstevel@tonic-gate 
538*0Sstevel@tonic-gate }
539*0Sstevel@tonic-gate 
540*0Sstevel@tonic-gate /*
541*0Sstevel@tonic-gate  * FUNCTION:	sp_mapbuf()
542*0Sstevel@tonic-gate  * INPUT:	un	- unit structure for soft partition we are doing
543*0Sstevel@tonic-gate  *			  I/O on.
544*0Sstevel@tonic-gate  *		voff	- virtual offset in soft partition to map.
545*0Sstevel@tonic-gate  *		bcount	- # of blocks in the I/O.
546*0Sstevel@tonic-gate  * OUTPUT:	bp	- translated buffer to be passed down to next layer.
547*0Sstevel@tonic-gate  * RETURNS:	1	- request must be fragmented, more work to do,
548*0Sstevel@tonic-gate  *		0	- request satisfied, no more work to do
549*0Sstevel@tonic-gate  *		-1	- error
550*0Sstevel@tonic-gate  * PURPOSE:	Map the virtual offset in the soft partition (passed
551*0Sstevel@tonic-gate  *		in via voff) to the "physical" offset on whatever the soft
552*0Sstevel@tonic-gate  *		partition is built on top of.  We do this by doing a binary
553*0Sstevel@tonic-gate  *		search of the extent array in the soft partition unit
554*0Sstevel@tonic-gate  *		structure.  Once the current extent is found, we do the
555*0Sstevel@tonic-gate  *		translation, determine if the I/O will cross extent
556*0Sstevel@tonic-gate  *		boundaries (if so, we have to fragment the I/O), then
557*0Sstevel@tonic-gate  *		fill in the buf structure to be passed down to the next layer.
558*0Sstevel@tonic-gate  */
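/*
 * Worked example (hypothetical values): for an extent with un_voff = 100,
 * un_poff = 5000 and un_len = 50, a request at voff = 120 maps to
 * new_blkno = 5000 + (120 - 100) = 5020.  A 40-block request at that offset
 * would run past ext_endblk (150), so only the first 30 blocks are mapped
 * into bp and 1 is returned so the caller comes back for the remainder.
 */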
559*0Sstevel@tonic-gate static int
560*0Sstevel@tonic-gate sp_mapbuf(
561*0Sstevel@tonic-gate 	mp_unit_t	*un,
562*0Sstevel@tonic-gate 	sp_ext_offset_t	voff,
563*0Sstevel@tonic-gate 	sp_ext_length_t	bcount,
564*0Sstevel@tonic-gate 	buf_t		*bp
565*0Sstevel@tonic-gate )
566*0Sstevel@tonic-gate {
567*0Sstevel@tonic-gate 	int		lo, mid, hi, found, more;
568*0Sstevel@tonic-gate 	size_t		new_bcount;
569*0Sstevel@tonic-gate 	sp_ext_offset_t new_blkno;
570*0Sstevel@tonic-gate 	sp_ext_offset_t	new_offset;
571*0Sstevel@tonic-gate 	sp_ext_offset_t	ext_endblk;
572*0Sstevel@tonic-gate 	md_dev64_t	new_edev;
573*0Sstevel@tonic-gate 	extern unsigned	md_maxphys;
574*0Sstevel@tonic-gate 
575*0Sstevel@tonic-gate 	found = 0;
576*0Sstevel@tonic-gate 	lo = 0;
577*0Sstevel@tonic-gate 	hi = un->un_numexts - 1;
578*0Sstevel@tonic-gate 
579*0Sstevel@tonic-gate 	/*
580*0Sstevel@tonic-gate 	 * do a binary search to find the extent that contains the
581*0Sstevel@tonic-gate 	 * starting offset.  after this loop, mid contains the index
582*0Sstevel@tonic-gate 	 * of the correct extent.
583*0Sstevel@tonic-gate 	 */
584*0Sstevel@tonic-gate 	while (lo <= hi && !found) {
585*0Sstevel@tonic-gate 		mid = (lo + hi) / 2;
586*0Sstevel@tonic-gate 		/* is the starting offset contained within the mid-ext? */
587*0Sstevel@tonic-gate 		if (voff >= un->un_ext[mid].un_voff &&
588*0Sstevel@tonic-gate 		    voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len)
589*0Sstevel@tonic-gate 			found = 1;
590*0Sstevel@tonic-gate 		else if (voff < un->un_ext[mid].un_voff)
591*0Sstevel@tonic-gate 			hi = mid - 1;
592*0Sstevel@tonic-gate 		else /* voff >= un->un_ext[mid].un_voff + un->un_ext[mid].un_len */
593*0Sstevel@tonic-gate 			lo = mid + 1;
594*0Sstevel@tonic-gate 	}
595*0Sstevel@tonic-gate 
596*0Sstevel@tonic-gate 	if (!found) {
597*0Sstevel@tonic-gate 		cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff);
598*0Sstevel@tonic-gate 		return (-1);
599*0Sstevel@tonic-gate 	}
600*0Sstevel@tonic-gate 
601*0Sstevel@tonic-gate 	/* translate to underlying physical offset/device */
602*0Sstevel@tonic-gate 	new_offset = voff - un->un_ext[mid].un_voff;
603*0Sstevel@tonic-gate 	new_blkno = un->un_ext[mid].un_poff + new_offset;
604*0Sstevel@tonic-gate 	new_edev = un->un_dev;
605*0Sstevel@tonic-gate 
606*0Sstevel@tonic-gate 	/* determine if we need to break the I/O into fragments */
607*0Sstevel@tonic-gate 	ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len;
608*0Sstevel@tonic-gate 	if (voff + btodb(bcount) > ext_endblk) {
609*0Sstevel@tonic-gate 		new_bcount = dbtob(ext_endblk - voff);
610*0Sstevel@tonic-gate 		more = 1;
611*0Sstevel@tonic-gate 	} else {
612*0Sstevel@tonic-gate 		new_bcount = bcount;
613*0Sstevel@tonic-gate 		more = 0;
614*0Sstevel@tonic-gate 	}
615*0Sstevel@tonic-gate 
616*0Sstevel@tonic-gate 	/* only break up the I/O if we're not built on another metadevice */
617*0Sstevel@tonic-gate 	if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) {
618*0Sstevel@tonic-gate 		new_bcount = md_maxphys;
619*0Sstevel@tonic-gate 		more = 1;
620*0Sstevel@tonic-gate 	}
621*0Sstevel@tonic-gate 	if (bp != (buf_t *)NULL) {
622*0Sstevel@tonic-gate 		/* do bp updates */
623*0Sstevel@tonic-gate 		bp->b_bcount = new_bcount;
624*0Sstevel@tonic-gate 		bp->b_lblkno = new_blkno;
625*0Sstevel@tonic-gate 		bp->b_edev = md_dev64_to_dev(new_edev);
626*0Sstevel@tonic-gate 	}
627*0Sstevel@tonic-gate 	return (more);
628*0Sstevel@tonic-gate }
629*0Sstevel@tonic-gate 
630*0Sstevel@tonic-gate /*
631*0Sstevel@tonic-gate  * FUNCTION:	sp_validate()
632*0Sstevel@tonic-gate  * INPUT:	un	- unit structure to be validated.
633*0Sstevel@tonic-gate  * OUTPUT:	none.
634*0Sstevel@tonic-gate  * RETURNS:	0	- soft partition ok.
635*0Sstevel@tonic-gate  *		-1	- error.
636*0Sstevel@tonic-gate  * PURPOSE:	called on open to sanity check the soft partition.  In
637*0Sstevel@tonic-gate  *		order to open a soft partition:
638*0Sstevel@tonic-gate  *		- it must have at least one extent
639*0Sstevel@tonic-gate  *		- the extent info in core and on disk must match
640*0Sstevel@tonic-gate  *		- it may not be in an intermediate state (which would
641*0Sstevel@tonic-gate  *		  imply that a two-phase commit was interrupted)
642*0Sstevel@tonic-gate  *
643*0Sstevel@tonic-gate  *		If the extent checking fails (B_ERROR returned from the read
644*0Sstevel@tonic-gate  *		strategy call) _and_ we're a multi-owner diskset, we send a
645*0Sstevel@tonic-gate  *		message to the master so that all nodes inherit the same view
646*0Sstevel@tonic-gate  *		of the soft partition.
647*0Sstevel@tonic-gate  *		If we are checking a soft-part that is marked as in error, and
648*0Sstevel@tonic-gate  *		we can actually read and validate the watermarks we send a
649*0Sstevel@tonic-gate  *		message to clear the error to the master node.
650*0Sstevel@tonic-gate  */
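/*
 * Layout note (derived from the read below): each extent's watermark is
 * expected in the block immediately preceding the extent's physical offset
 * (un_poff - 1).  The checksum, magic number, sequence number, length and
 * allocation type recorded there are verified against the in-core unit
 * structure before the open is allowed to proceed.
 */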
651*0Sstevel@tonic-gate static int
652*0Sstevel@tonic-gate sp_validate(mp_unit_t *un)
653*0Sstevel@tonic-gate {
654*0Sstevel@tonic-gate 	uint_t		ext;
655*0Sstevel@tonic-gate 	struct buf	*buf;
656*0Sstevel@tonic-gate 	sp_ext_length_t	len;
657*0Sstevel@tonic-gate 	mp_watermark_t	*wm;
658*0Sstevel@tonic-gate 	set_t		setno;
659*0Sstevel@tonic-gate 	int		reset_error = 0;
660*0Sstevel@tonic-gate 
661*0Sstevel@tonic-gate 	setno = MD_UN2SET(un);
662*0Sstevel@tonic-gate 
663*0Sstevel@tonic-gate 	/* sanity check unit structure components ?? */
664*0Sstevel@tonic-gate 	if (un->un_status != MD_SP_OK) {
665*0Sstevel@tonic-gate 		if (un->un_status != MD_SP_ERR) {
666*0Sstevel@tonic-gate 			cmn_err(CE_WARN, "md: %s: open failed, soft partition "
667*0Sstevel@tonic-gate 			    "status is %u.",
668*0Sstevel@tonic-gate 			    md_shortname(MD_SID(un)),
669*0Sstevel@tonic-gate 			    un->un_status);
670*0Sstevel@tonic-gate 			return (-1);
671*0Sstevel@tonic-gate 		} else {
672*0Sstevel@tonic-gate 			cmn_err(CE_WARN, "md: %s: open of soft partition "
673*0Sstevel@tonic-gate 			    "in Errored state.",
674*0Sstevel@tonic-gate 			    md_shortname(MD_SID(un)));
675*0Sstevel@tonic-gate 			reset_error = 1;
676*0Sstevel@tonic-gate 		}
677*0Sstevel@tonic-gate 	}
678*0Sstevel@tonic-gate 
679*0Sstevel@tonic-gate 	if (un->un_numexts == 0) {
680*0Sstevel@tonic-gate 		cmn_err(CE_WARN, "md: %s: open failed, soft partition does "
681*0Sstevel@tonic-gate 		    "not have any extents.", md_shortname(MD_SID(un)));
682*0Sstevel@tonic-gate 		return (-1);
683*0Sstevel@tonic-gate 	}
684*0Sstevel@tonic-gate 
685*0Sstevel@tonic-gate 	len = 0LL;
686*0Sstevel@tonic-gate 	for (ext = 0; ext < un->un_numexts; ext++) {
687*0Sstevel@tonic-gate 
688*0Sstevel@tonic-gate 		/* tally extent lengths to check total size */
689*0Sstevel@tonic-gate 		len += un->un_ext[ext].un_len;
690*0Sstevel@tonic-gate 
691*0Sstevel@tonic-gate 		/* allocate buffer for watermark */
692*0Sstevel@tonic-gate 		buf = getrbuf(KM_SLEEP);
693*0Sstevel@tonic-gate 
694*0Sstevel@tonic-gate 		/* read watermark */
695*0Sstevel@tonic-gate 		buf->b_flags = B_READ;
696*0Sstevel@tonic-gate 		buf->b_edev = md_dev64_to_dev(un->un_dev);
697*0Sstevel@tonic-gate 		buf->b_iodone = NULL;
698*0Sstevel@tonic-gate 		buf->b_proc = NULL;
699*0Sstevel@tonic-gate 		buf->b_bcount = sizeof (mp_watermark_t);
700*0Sstevel@tonic-gate 		buf->b_lblkno = un->un_ext[ext].un_poff - 1;
701*0Sstevel@tonic-gate 		buf->b_bufsize = sizeof (mp_watermark_t);
702*0Sstevel@tonic-gate 		buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t),
703*0Sstevel@tonic-gate 		    KM_SLEEP);
704*0Sstevel@tonic-gate 
705*0Sstevel@tonic-gate 		/*
706*0Sstevel@tonic-gate 		 * make the call non-blocking so that it is not affected
707*0Sstevel@tonic-gate 		 * by a set take.
708*0Sstevel@tonic-gate 		 */
709*0Sstevel@tonic-gate 		md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL);
710*0Sstevel@tonic-gate 		(void) biowait(buf);
711*0Sstevel@tonic-gate 
712*0Sstevel@tonic-gate 		if (buf->b_flags & B_ERROR) {
713*0Sstevel@tonic-gate 			cmn_err(CE_WARN, "md: %s: open failed, could not "
714*0Sstevel@tonic-gate 			    "read watermark at block %llu for extent %u, "
715*0Sstevel@tonic-gate 			    "error %d.", md_shortname(MD_SID(un)),
716*0Sstevel@tonic-gate 			    buf->b_lblkno, ext, buf->b_error);
717*0Sstevel@tonic-gate 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
718*0Sstevel@tonic-gate 			freerbuf(buf);
719*0Sstevel@tonic-gate 
720*0Sstevel@tonic-gate 			/*
721*0Sstevel@tonic-gate 			 * If we're a multi-owner diskset we send a message
722*0Sstevel@tonic-gate 			 * indicating that this soft-part has an invalid
723*0Sstevel@tonic-gate 			 * extent to the master node. This ensures a consistent
724*0Sstevel@tonic-gate 			 * view of the soft-part across the cluster.
725*0Sstevel@tonic-gate 			 */
726*0Sstevel@tonic-gate 			if (MD_MNSET_SETNO(setno)) {
727*0Sstevel@tonic-gate 				sp_send_stat_err(un);
728*0Sstevel@tonic-gate 			}
729*0Sstevel@tonic-gate 			return (-1);
730*0Sstevel@tonic-gate 		}
731*0Sstevel@tonic-gate 
732*0Sstevel@tonic-gate 		wm = (mp_watermark_t *)buf->b_un.b_addr;
733*0Sstevel@tonic-gate 
734*0Sstevel@tonic-gate 		/* make sure the checksum is correct first */
735*0Sstevel@tonic-gate 		if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
736*0Sstevel@tonic-gate 		    (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) {
737*0Sstevel@tonic-gate 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
738*0Sstevel@tonic-gate 			    "at block %llu for extent %u does not have a "
739*0Sstevel@tonic-gate 			    "valid checksum 0x%08x.", md_shortname(MD_SID(un)),
740*0Sstevel@tonic-gate 			    buf->b_lblkno, ext, wm->wm_checksum);
741*0Sstevel@tonic-gate 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
742*0Sstevel@tonic-gate 			freerbuf(buf);
743*0Sstevel@tonic-gate 			return (-1);
744*0Sstevel@tonic-gate 		}
745*0Sstevel@tonic-gate 
746*0Sstevel@tonic-gate 		if (wm->wm_magic != MD_SP_MAGIC) {
747*0Sstevel@tonic-gate 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
748*0Sstevel@tonic-gate 			    "at block %llu for extent %u does not have a "
749*0Sstevel@tonic-gate 			    "valid watermark magic number, expected 0x%x, "
750*0Sstevel@tonic-gate 			    "found 0x%x.", md_shortname(MD_SID(un)),
751*0Sstevel@tonic-gate 			    buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic);
752*0Sstevel@tonic-gate 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
753*0Sstevel@tonic-gate 			freerbuf(buf);
754*0Sstevel@tonic-gate 			return (-1);
755*0Sstevel@tonic-gate 		}
756*0Sstevel@tonic-gate 
757*0Sstevel@tonic-gate 		/* make sure sequence number matches the current extent */
758*0Sstevel@tonic-gate 		if (wm->wm_seq != ext) {
759*0Sstevel@tonic-gate 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
760*0Sstevel@tonic-gate 			    "at block %llu for extent %u has invalid "
761*0Sstevel@tonic-gate 			    "sequence number %u.", md_shortname(MD_SID(un)),
762*0Sstevel@tonic-gate 			    buf->b_lblkno, ext, wm->wm_seq);
763*0Sstevel@tonic-gate 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
764*0Sstevel@tonic-gate 			freerbuf(buf);
765*0Sstevel@tonic-gate 			return (-1);
766*0Sstevel@tonic-gate 		}
767*0Sstevel@tonic-gate 
768*0Sstevel@tonic-gate 		/* make sure watermark length matches unit structure */
769*0Sstevel@tonic-gate 		if (wm->wm_length != un->un_ext[ext].un_len) {
770*0Sstevel@tonic-gate 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
771*0Sstevel@tonic-gate 			    "at block %llu for extent %u has inconsistent "
772*0Sstevel@tonic-gate 			    "length, expected %llu, found %llu.",
773*0Sstevel@tonic-gate 			    md_shortname(MD_SID(un)), buf->b_lblkno,
774*0Sstevel@tonic-gate 			    ext, un->un_ext[ext].un_len,
775*0Sstevel@tonic-gate 			    (u_longlong_t)wm->wm_length);
776*0Sstevel@tonic-gate 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
777*0Sstevel@tonic-gate 			freerbuf(buf);
778*0Sstevel@tonic-gate 			return (-1);
779*0Sstevel@tonic-gate 		}
780*0Sstevel@tonic-gate 
781*0Sstevel@tonic-gate 		/*
782*0Sstevel@tonic-gate 		 * make sure the type is a valid soft partition and not
783*0Sstevel@tonic-gate 		 * a free extent or the end.
784*0Sstevel@tonic-gate 		 */
785*0Sstevel@tonic-gate 		if (wm->wm_type != EXTTYP_ALLOC) {
786*0Sstevel@tonic-gate 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
787*0Sstevel@tonic-gate 			    "at block %llu for extent %u is not marked "
788*0Sstevel@tonic-gate 			    "as in-use, type = %u.", md_shortname(MD_SID(un)),
789*0Sstevel@tonic-gate 			    buf->b_lblkno, ext, wm->wm_type);
790*0Sstevel@tonic-gate 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
791*0Sstevel@tonic-gate 			freerbuf(buf);
792*0Sstevel@tonic-gate 			return (-1);
793*0Sstevel@tonic-gate 		}
794*0Sstevel@tonic-gate 		/* free up buffer */
795*0Sstevel@tonic-gate 		kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
796*0Sstevel@tonic-gate 		freerbuf(buf);
797*0Sstevel@tonic-gate 	}
798*0Sstevel@tonic-gate 
799*0Sstevel@tonic-gate 	if (len != un->un_length) {
800*0Sstevel@tonic-gate 		cmn_err(CE_WARN, "md: %s: open failed, computed length "
801*0Sstevel@tonic-gate 		    "%llu != expected length %llu.", md_shortname(MD_SID(un)),
802*0Sstevel@tonic-gate 		    len, un->un_length);
803*0Sstevel@tonic-gate 		return (-1);
804*0Sstevel@tonic-gate 	}
805*0Sstevel@tonic-gate 
806*0Sstevel@tonic-gate 	/*
807*0Sstevel@tonic-gate 	 * If we're a multi-owner set _and_ reset_error is set, we should clear
808*0Sstevel@tonic-gate 	 * the error condition on all nodes in the set. Use SP_SETSTAT2 with
809*0Sstevel@tonic-gate 	 * MD_SP_OK.
810*0Sstevel@tonic-gate 	 */
811*0Sstevel@tonic-gate 	if (MD_MNSET_SETNO(setno) && reset_error) {
812*0Sstevel@tonic-gate 		sp_send_stat_ok(un);
813*0Sstevel@tonic-gate 	}
814*0Sstevel@tonic-gate 	return (0);
815*0Sstevel@tonic-gate }
816*0Sstevel@tonic-gate 
817*0Sstevel@tonic-gate /*
818*0Sstevel@tonic-gate  * FUNCTION:	sp_done()
819*0Sstevel@tonic-gate  * INPUT:	child_buf	- buffer attached to child save structure.
820*0Sstevel@tonic-gate  *				  this is the buffer on which I/O has just
821*0Sstevel@tonic-gate  *				  completed.
822*0Sstevel@tonic-gate  * OUTPUT:	none.
823*0Sstevel@tonic-gate  * RETURNS:	0	- success.
824*0Sstevel@tonic-gate  *		1	- error.
825*0Sstevel@tonic-gate  * PURPOSE:	called on I/O completion.
826*0Sstevel@tonic-gate  */
827*0Sstevel@tonic-gate static int
828*0Sstevel@tonic-gate sp_done(struct buf *child_buf)
829*0Sstevel@tonic-gate {
830*0Sstevel@tonic-gate 	struct buf	*parent_buf;
831*0Sstevel@tonic-gate 	mdi_unit_t	*ui;
832*0Sstevel@tonic-gate 	md_spps_t	*ps;
833*0Sstevel@tonic-gate 	md_spcs_t	*cs;
834*0Sstevel@tonic-gate 
835*0Sstevel@tonic-gate 	/* find the child save structure to which this buffer belongs */
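	/*
	 * (Descriptive note) cs_buf is assumed to be the final member of
	 * md_spcs_t, so subtracting the size of the members that precede it
	 * from the buf address recovers the enclosing child save structure,
	 * in the style of a hand-rolled "containerof".
	 */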
836*0Sstevel@tonic-gate 	cs = (md_spcs_t *)((caddr_t)child_buf -
837*0Sstevel@tonic-gate 	    (sizeof (md_spcs_t) - sizeof (buf_t)));
838*0Sstevel@tonic-gate 	/* now get the parent save structure */
839*0Sstevel@tonic-gate 	ps = cs->cs_ps;
840*0Sstevel@tonic-gate 	parent_buf = ps->ps_bp;
841*0Sstevel@tonic-gate 
842*0Sstevel@tonic-gate 	mutex_enter(&ps->ps_mx);
843*0Sstevel@tonic-gate 	/* pass any errors back up to the parent */
844*0Sstevel@tonic-gate 	if (child_buf->b_flags & B_ERROR) {
845*0Sstevel@tonic-gate 		ps->ps_flags |= MD_SPPS_ERROR;
846*0Sstevel@tonic-gate 		parent_buf->b_error = child_buf->b_error;
847*0Sstevel@tonic-gate 	}
848*0Sstevel@tonic-gate 	/* mapout, if needed */
849*0Sstevel@tonic-gate 	if (child_buf->b_flags & B_REMAPPED)
850*0Sstevel@tonic-gate 		bp_mapout(child_buf);
851*0Sstevel@tonic-gate 
852*0Sstevel@tonic-gate 	ps->ps_frags--;
853*0Sstevel@tonic-gate 	if (ps->ps_frags != 0) {
854*0Sstevel@tonic-gate 		/*
855*0Sstevel@tonic-gate 		 * if this parent has more children, we just free the
856*0Sstevel@tonic-gate 		 * child and return.
857*0Sstevel@tonic-gate 		 */
858*0Sstevel@tonic-gate 		kmem_cache_free(sp_child_cache, cs);
859*0Sstevel@tonic-gate 		mutex_exit(&ps->ps_mx);
860*0Sstevel@tonic-gate 		return (1);
861*0Sstevel@tonic-gate 	}
862*0Sstevel@tonic-gate 	/* there are no more children */
863*0Sstevel@tonic-gate 	kmem_cache_free(sp_child_cache, cs);
864*0Sstevel@tonic-gate 	if (ps->ps_flags & MD_SPPS_ERROR) {
865*0Sstevel@tonic-gate 		sp_error(ps);
866*0Sstevel@tonic-gate 		return (1);
867*0Sstevel@tonic-gate 	}
868*0Sstevel@tonic-gate 	ui = ps->ps_ui;
869*0Sstevel@tonic-gate 	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
870*0Sstevel@tonic-gate 		mutex_exit(&ps->ps_mx);
871*0Sstevel@tonic-gate 	} else {
872*0Sstevel@tonic-gate 		/*
873*0Sstevel@tonic-gate 		 * this should only ever happen if we are panicking,
874*0Sstevel@tonic-gate 		 * since DONTFREE is only set on the parent if panicstr
875*0Sstevel@tonic-gate 		 * is non-NULL.
876*0Sstevel@tonic-gate 		 */
877*0Sstevel@tonic-gate 		ASSERT(panicstr);
878*0Sstevel@tonic-gate 	}
879*0Sstevel@tonic-gate 	SPPS_FREE(sp_parent_cache, ps);
880*0Sstevel@tonic-gate 	md_kstat_done(ui, parent_buf, 0);
881*0Sstevel@tonic-gate 	md_unit_readerexit(ui);
882*0Sstevel@tonic-gate 	md_biodone(parent_buf);
883*0Sstevel@tonic-gate 	return (0);
884*0Sstevel@tonic-gate }
885*0Sstevel@tonic-gate 
886*0Sstevel@tonic-gate /*
887*0Sstevel@tonic-gate  * FUNCTION:	md_sp_strategy()
888*0Sstevel@tonic-gate  * INPUT:	parent_buf	- parent buffer
889*0Sstevel@tonic-gate  *		flag		- flags
890*0Sstevel@tonic-gate  *		private		- private data
891*0Sstevel@tonic-gate  * OUTPUT:	none.
892*0Sstevel@tonic-gate  * RETURNS:	void.
893*0Sstevel@tonic-gate  * PURPOSE:	Soft partitioning I/O strategy.  Performs the main work
894*0Sstevel@tonic-gate  *		needed to do I/O to a soft partition.  The basic
895*0Sstevel@tonic-gate  *		algorithm is as follows:
896*0Sstevel@tonic-gate  *			- Allocate a child save structure to keep track
897*0Sstevel@tonic-gate  *			  of the I/O we are going to pass down.
898*0Sstevel@tonic-gate  *			- Map the I/O to the correct extent in the soft
899*0Sstevel@tonic-gate  *			  partition (see sp_mapbuf()).
900*0Sstevel@tonic-gate  *			- bioclone() the buffer and pass it down the
901*0Sstevel@tonic-gate  *			  stack using md_call_strategy.
902*0Sstevel@tonic-gate  *			- If the I/O needs to split across extents,
903*0Sstevel@tonic-gate  *			  repeat the above steps until all fragments
904*0Sstevel@tonic-gate  *			  are finished.
905*0Sstevel@tonic-gate  */
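/*
 * Completion accounting (summary of the code below): ps_frags is bumped once
 * before the fragment loop and once more for every additional fragment that
 * sp_mapbuf() reports; sp_done() decrements it as each child buf completes
 * and only finishes the parent buf when the count reaches zero.
 */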
906*0Sstevel@tonic-gate static void
907*0Sstevel@tonic-gate md_sp_strategy(buf_t *parent_buf, int flag, void *private)
908*0Sstevel@tonic-gate {
909*0Sstevel@tonic-gate 	md_spps_t	*ps;
910*0Sstevel@tonic-gate 	md_spcs_t	*cs;
911*0Sstevel@tonic-gate 	int		more;
912*0Sstevel@tonic-gate 	mp_unit_t	*un;
913*0Sstevel@tonic-gate 	mdi_unit_t	*ui;
914*0Sstevel@tonic-gate 	size_t		current_count;
915*0Sstevel@tonic-gate 	off_t		current_offset;
916*0Sstevel@tonic-gate 	sp_ext_offset_t	current_blkno;
917*0Sstevel@tonic-gate 	buf_t		*child_buf;
918*0Sstevel@tonic-gate 	set_t		setno = MD_MIN2SET(getminor(parent_buf->b_edev));
919*0Sstevel@tonic-gate 	int		strat_flag = flag;
920*0Sstevel@tonic-gate 
921*0Sstevel@tonic-gate 	/*
922*0Sstevel@tonic-gate 	 * When doing I/O to a multi-owner metadevice, check if the set is halted.
923*0Sstevel@tonic-gate 	 * We do this check without the needed lock held, for performance
924*0Sstevel@tonic-gate 	 * reasons.
925*0Sstevel@tonic-gate 	 * If an IO just slips through while the set is locked via an
926*0Sstevel@tonic-gate 	 * MD_MN_SUSPEND_SET, we don't care about it.
927*0Sstevel@tonic-gate 	 * Only check for suspension if we are a top-level i/o request
928*0Sstevel@tonic-gate 	 * (MD_STR_NOTTOP is cleared in 'flag').
929*0Sstevel@tonic-gate 	 */
930*0Sstevel@tonic-gate 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
931*0Sstevel@tonic-gate 	    (MD_SET_HALTED | MD_SET_MNSET)) {
932*0Sstevel@tonic-gate 		if ((flag & MD_STR_NOTTOP) == 0) {
933*0Sstevel@tonic-gate 			mutex_enter(&md_mx);
934*0Sstevel@tonic-gate 			/* Here we loop until the set is no longer halted */
935*0Sstevel@tonic-gate 			while (md_set[setno].s_status & MD_SET_HALTED) {
936*0Sstevel@tonic-gate 				cv_wait(&md_cv, &md_mx);
937*0Sstevel@tonic-gate 			}
938*0Sstevel@tonic-gate 			mutex_exit(&md_mx);
939*0Sstevel@tonic-gate 		}
940*0Sstevel@tonic-gate 	}
941*0Sstevel@tonic-gate 
942*0Sstevel@tonic-gate 	ui = MDI_UNIT(getminor(parent_buf->b_edev));
943*0Sstevel@tonic-gate 
944*0Sstevel@tonic-gate 	md_kstat_waitq_enter(ui);
945*0Sstevel@tonic-gate 
946*0Sstevel@tonic-gate 	un = (mp_unit_t *)md_unit_readerlock(ui);
947*0Sstevel@tonic-gate 
948*0Sstevel@tonic-gate 	if ((flag & MD_NOBLOCK) == 0) {
949*0Sstevel@tonic-gate 		if (md_inc_iocount(setno) != 0) {
950*0Sstevel@tonic-gate 			parent_buf->b_flags |= B_ERROR;
951*0Sstevel@tonic-gate 			parent_buf->b_error = ENXIO;
952*0Sstevel@tonic-gate 			parent_buf->b_resid = parent_buf->b_bcount;
953*0Sstevel@tonic-gate 			md_unit_readerexit(ui);
954*0Sstevel@tonic-gate 			biodone(parent_buf);
955*0Sstevel@tonic-gate 			return;
956*0Sstevel@tonic-gate 		}
957*0Sstevel@tonic-gate 	} else {
958*0Sstevel@tonic-gate 		md_inc_iocount_noblock(setno);
959*0Sstevel@tonic-gate 	}
960*0Sstevel@tonic-gate 
961*0Sstevel@tonic-gate 	if (!(flag & MD_STR_NOTTOP)) {
962*0Sstevel@tonic-gate 		if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) {
963*0Sstevel@tonic-gate 			md_kstat_waitq_exit(ui);
964*0Sstevel@tonic-gate 			return;
965*0Sstevel@tonic-gate 		}
966*0Sstevel@tonic-gate 	}
967*0Sstevel@tonic-gate 
968*0Sstevel@tonic-gate 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
969*0Sstevel@tonic-gate 	sp_parent_init(ps);
970*0Sstevel@tonic-gate 
971*0Sstevel@tonic-gate 	/*
972*0Sstevel@tonic-gate 	 * Save essential information from the original buffhdr
973*0Sstevel@tonic-gate 	 * in the parent.
974*0Sstevel@tonic-gate 	 */
975*0Sstevel@tonic-gate 	ps->ps_un = un;
976*0Sstevel@tonic-gate 	ps->ps_ui = ui;
977*0Sstevel@tonic-gate 	ps->ps_bp = parent_buf;
978*0Sstevel@tonic-gate 	ps->ps_addr = parent_buf->b_un.b_addr;
979*0Sstevel@tonic-gate 
980*0Sstevel@tonic-gate 	current_count = parent_buf->b_bcount;
981*0Sstevel@tonic-gate 	current_blkno = (sp_ext_offset_t)parent_buf->b_blkno;
982*0Sstevel@tonic-gate 	current_offset  = 0;
983*0Sstevel@tonic-gate 
984*0Sstevel@tonic-gate 	/*
985*0Sstevel@tonic-gate 	 * if we are at the top and we are panicking,
986*0Sstevel@tonic-gate 	 * we don't free in order to save state.
987*0Sstevel@tonic-gate 	 */
988*0Sstevel@tonic-gate 	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL))
989*0Sstevel@tonic-gate 		ps->ps_flags |= MD_SPPS_DONTFREE;
990*0Sstevel@tonic-gate 
991*0Sstevel@tonic-gate 	md_kstat_waitq_to_runq(ui);
992*0Sstevel@tonic-gate 
993*0Sstevel@tonic-gate 	ps->ps_frags++;
994*0Sstevel@tonic-gate 
995*0Sstevel@tonic-gate 	/*
996*0Sstevel@tonic-gate 	 * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this
997*0Sstevel@tonic-gate 	 * metadevice.
998*0Sstevel@tonic-gate 	 */
999*0Sstevel@tonic-gate 	if (ui->ui_tstate & MD_ABR_CAP)
1000*0Sstevel@tonic-gate 		strat_flag |= MD_STR_ABR;
1001*0Sstevel@tonic-gate 
1002*0Sstevel@tonic-gate 	/*
1003*0Sstevel@tonic-gate 	 * this loop does the main work of an I/O.  we allocate a
1004*0Sstevel@tonic-gate 	 * child save for each buf, do the logical to physical
1005*0Sstevel@tonic-gate 	 * mapping, decide if we need to frag the I/O, clone the
1006*0Sstevel@tonic-gate 	 * new I/O to pass down the stack.  repeat until we've
1007*0Sstevel@tonic-gate 	 * taken care of the entire buf that was passed to us.
1008*0Sstevel@tonic-gate 	 */
1009*0Sstevel@tonic-gate 	do {
1010*0Sstevel@tonic-gate 		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
1011*0Sstevel@tonic-gate 		sp_child_init(cs);
1012*0Sstevel@tonic-gate 		child_buf = &cs->cs_buf;
1013*0Sstevel@tonic-gate 		cs->cs_ps = ps;
1014*0Sstevel@tonic-gate 
1015*0Sstevel@tonic-gate 		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
1016*0Sstevel@tonic-gate 		if (more == -1) {
1017*0Sstevel@tonic-gate 			parent_buf->b_flags |= B_ERROR;
1018*0Sstevel@tonic-gate 			parent_buf->b_error = EIO;
1019*0Sstevel@tonic-gate 			md_kstat_done(ui, parent_buf, 0);
1020*0Sstevel@tonic-gate 			md_unit_readerexit(ui);
1021*0Sstevel@tonic-gate 			md_biodone(parent_buf);
1022*0Sstevel@tonic-gate 			kmem_cache_free(sp_parent_cache, ps);
1023*0Sstevel@tonic-gate 			return;
1024*0Sstevel@tonic-gate 		}
1025*0Sstevel@tonic-gate 
1026*0Sstevel@tonic-gate 		child_buf = md_bioclone(parent_buf, current_offset,
1027*0Sstevel@tonic-gate 					child_buf->b_bcount, child_buf->b_edev,
1028*0Sstevel@tonic-gate 					child_buf->b_blkno, sp_done, child_buf,
1029*0Sstevel@tonic-gate 					KM_NOSLEEP);
1030*0Sstevel@tonic-gate 		/* calculate new offset, counts, etc... */
1031*0Sstevel@tonic-gate 		current_offset += child_buf->b_bcount;
1032*0Sstevel@tonic-gate 		current_count -=  child_buf->b_bcount;
1033*0Sstevel@tonic-gate 		current_blkno +=  (sp_ext_offset_t)(btodb(child_buf->b_bcount));
1034*0Sstevel@tonic-gate 
1035*0Sstevel@tonic-gate 		if (more) {
1036*0Sstevel@tonic-gate 			mutex_enter(&ps->ps_mx);
1037*0Sstevel@tonic-gate 			ps->ps_frags++;
1038*0Sstevel@tonic-gate 			mutex_exit(&ps->ps_mx);
1039*0Sstevel@tonic-gate 		}
1040*0Sstevel@tonic-gate 
1041*0Sstevel@tonic-gate 		md_call_strategy(child_buf, strat_flag, private);
1042*0Sstevel@tonic-gate 	} while (more);
1043*0Sstevel@tonic-gate 
1044*0Sstevel@tonic-gate 	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) {
1045*0Sstevel@tonic-gate 		while (!(ps->ps_flags & MD_SPPS_DONE)) {
1046*0Sstevel@tonic-gate 			md_daemon(1, &md_done_daemon);
1047*0Sstevel@tonic-gate 		}
1048*0Sstevel@tonic-gate 		kmem_cache_free(sp_parent_cache, ps);
1049*0Sstevel@tonic-gate 	}
1050*0Sstevel@tonic-gate }
1051*0Sstevel@tonic-gate 
1052*0Sstevel@tonic-gate /*
1053*0Sstevel@tonic-gate  * FUNCTION:	sp_directed_read()
1054*0Sstevel@tonic-gate  * INPUT:	mnum	- minor number
1055*0Sstevel@tonic-gate  *		vdr	- vol_directed_rd_t from user
1056*0Sstevel@tonic-gate  *		mode	- access mode for copying data out.
1057*0Sstevel@tonic-gate  * OUTPUT:	none.
1058*0Sstevel@tonic-gate  * RETURNS:	0	- success
1059*0Sstevel@tonic-gate  *		Exxxxx	- failure error-code
1060*0Sstevel@tonic-gate  * PURPOSE:	Construct the necessary sub-device i/o requests to perform the
1061*0Sstevel@tonic-gate  *		directed read as requested by the user. This is essentially the
1062*0Sstevel@tonic-gate  *		same as md_sp_strategy() with the exception being that the
1063*0Sstevel@tonic-gate  *		underlying 'md_call_strategy' is replaced with an ioctl call.
1064*0Sstevel@tonic-gate  */
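/*
 * Note (summary of the code below): the user request is staged through a
 * kernel buffer (kbuffer) described by a locally built parent buf; each
 * fragment gets its own vol_directed_rd_t (cvdr) whose vdr_offset/vdr_nbytes
 * describe the child buf and whose vdr_data points at the fragment's
 * position within the staging buffer.
 */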
1065*0Sstevel@tonic-gate int
1066*0Sstevel@tonic-gate sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode)
1067*0Sstevel@tonic-gate {
1068*0Sstevel@tonic-gate 	md_spps_t	*ps;
1069*0Sstevel@tonic-gate 	md_spcs_t	*cs;
1070*0Sstevel@tonic-gate 	int		more;
1071*0Sstevel@tonic-gate 	mp_unit_t	*un;
1072*0Sstevel@tonic-gate 	mdi_unit_t	*ui;
1073*0Sstevel@tonic-gate 	size_t		current_count;
1074*0Sstevel@tonic-gate 	off_t		current_offset;
1075*0Sstevel@tonic-gate 	sp_ext_offset_t	current_blkno;
1076*0Sstevel@tonic-gate 	buf_t		*child_buf, *parent_buf;
1077*0Sstevel@tonic-gate 	void		*kbuffer;
1078*0Sstevel@tonic-gate 	vol_directed_rd_t	cvdr;
1079*0Sstevel@tonic-gate 	caddr_t		userbuf;
1080*0Sstevel@tonic-gate 	offset_t	useroff;
1081*0Sstevel@tonic-gate 	int		ret = 0;
1082*0Sstevel@tonic-gate 
1083*0Sstevel@tonic-gate 	ui = MDI_UNIT(mnum);
1084*0Sstevel@tonic-gate 
1085*0Sstevel@tonic-gate 	md_kstat_waitq_enter(ui);
1086*0Sstevel@tonic-gate 
1087*0Sstevel@tonic-gate 	bzero(&cvdr, sizeof (cvdr));
1088*0Sstevel@tonic-gate 
1089*0Sstevel@tonic-gate 	un = (mp_unit_t *)md_unit_readerlock(ui);
1090*0Sstevel@tonic-gate 
1091*0Sstevel@tonic-gate 	/*
1092*0Sstevel@tonic-gate 	 * Construct a parent_buf header which reflects the user-supplied
1093*0Sstevel@tonic-gate 	 * request.
1094*0Sstevel@tonic-gate 	 */
1095*0Sstevel@tonic-gate 
1096*0Sstevel@tonic-gate 	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
1097*0Sstevel@tonic-gate 	if (kbuffer == NULL) {
1098*0Sstevel@tonic-gate 		vdr->vdr_flags |= DKV_DMR_ERROR;
1099*0Sstevel@tonic-gate 		md_unit_readerexit(ui);
1100*0Sstevel@tonic-gate 		return (ENOMEM);
1101*0Sstevel@tonic-gate 	}
1102*0Sstevel@tonic-gate 
1103*0Sstevel@tonic-gate 	parent_buf = getrbuf(KM_NOSLEEP);
1104*0Sstevel@tonic-gate 	if (parent_buf == NULL) {
1105*0Sstevel@tonic-gate 		vdr->vdr_flags |= DKV_DMR_ERROR;
1106*0Sstevel@tonic-gate 		md_unit_readerexit(ui);
1107*0Sstevel@tonic-gate 		kmem_free(kbuffer, vdr->vdr_nbytes);
1108*0Sstevel@tonic-gate 		return (ENOMEM);
1109*0Sstevel@tonic-gate 	}
1110*0Sstevel@tonic-gate 	parent_buf->b_un.b_addr = kbuffer;
1111*0Sstevel@tonic-gate 	parent_buf->b_flags = B_READ;
1112*0Sstevel@tonic-gate 	parent_buf->b_bcount = vdr->vdr_nbytes;
1113*0Sstevel@tonic-gate 	parent_buf->b_lblkno = lbtodb(vdr->vdr_offset);
1114*0Sstevel@tonic-gate 	parent_buf->b_edev = un->un_dev;
1115*0Sstevel@tonic-gate 
1116*0Sstevel@tonic-gate 
1117*0Sstevel@tonic-gate 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
1118*0Sstevel@tonic-gate 	sp_parent_init(ps);
1119*0Sstevel@tonic-gate 
1120*0Sstevel@tonic-gate 	/*
1121*0Sstevel@tonic-gate 	 * Save essential information from the original buffhdr
1122*0Sstevel@tonic-gate 	 * in the parent.
1123*0Sstevel@tonic-gate 	 */
1124*0Sstevel@tonic-gate 	ps->ps_un = un;
1125*0Sstevel@tonic-gate 	ps->ps_ui = ui;
1126*0Sstevel@tonic-gate 	ps->ps_bp = parent_buf;
1127*0Sstevel@tonic-gate 	ps->ps_addr = parent_buf->b_un.b_addr;
1128*0Sstevel@tonic-gate 
1129*0Sstevel@tonic-gate 	current_count = parent_buf->b_bcount;
1130*0Sstevel@tonic-gate 	current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno;
1131*0Sstevel@tonic-gate 	current_offset  = 0;
1132*0Sstevel@tonic-gate 
1133*0Sstevel@tonic-gate 	ps->ps_frags++;
1134*0Sstevel@tonic-gate 	vdr->vdr_bytesread = 0;
1135*0Sstevel@tonic-gate 
1136*0Sstevel@tonic-gate 	/*
1137*0Sstevel@tonic-gate 	 * This loop does the main work of an I/O: we allocate a
1138*0Sstevel@tonic-gate 	 * child save structure for each buf, do the logical-to-physical
1139*0Sstevel@tonic-gate 	 * mapping, decide if we need to frag the I/O, and clone the
1140*0Sstevel@tonic-gate 	 * new I/O to pass down the stack.  Repeat until we've
1141*0Sstevel@tonic-gate 	 * taken care of the entire buf that was passed to us.
1142*0Sstevel@tonic-gate 	 */
1143*0Sstevel@tonic-gate 	do {
1144*0Sstevel@tonic-gate 		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
1145*0Sstevel@tonic-gate 		sp_child_init(cs);
1146*0Sstevel@tonic-gate 		child_buf = &cs->cs_buf;
1147*0Sstevel@tonic-gate 		cs->cs_ps = ps;
1148*0Sstevel@tonic-gate 
1149*0Sstevel@tonic-gate 		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
1150*0Sstevel@tonic-gate 		if (more == -1) {
1151*0Sstevel@tonic-gate 			ret = EIO;
1152*0Sstevel@tonic-gate 			vdr->vdr_flags |= DKV_DMR_SHORT;
1153*0Sstevel@tonic-gate 			kmem_cache_free(sp_child_cache, cs);
1154*0Sstevel@tonic-gate 			goto err_out;
1155*0Sstevel@tonic-gate 		}
1156*0Sstevel@tonic-gate 
1157*0Sstevel@tonic-gate 		cvdr.vdr_flags = vdr->vdr_flags;
1158*0Sstevel@tonic-gate 		cvdr.vdr_side = vdr->vdr_side;
1159*0Sstevel@tonic-gate 		cvdr.vdr_nbytes = child_buf->b_bcount;
1160*0Sstevel@tonic-gate 		cvdr.vdr_offset = ldbtob(child_buf->b_lblkno);
1161*0Sstevel@tonic-gate 		/* Work out where we are in the allocated buffer */
1162*0Sstevel@tonic-gate 		useroff = (offset_t)kbuffer;
1163*0Sstevel@tonic-gate 		useroff = useroff + (offset_t)current_offset;
1164*0Sstevel@tonic-gate 		cvdr.vdr_data = (void *)useroff;
1165*0Sstevel@tonic-gate 		child_buf = md_bioclone(parent_buf, current_offset,
1166*0Sstevel@tonic-gate 		    child_buf->b_bcount, child_buf->b_edev,
1167*0Sstevel@tonic-gate 		    child_buf->b_blkno, NULL,
1168*0Sstevel@tonic-gate 		    child_buf, KM_NOSLEEP);
1169*0Sstevel@tonic-gate 		/* calculate new offset, counts, etc... */
1170*0Sstevel@tonic-gate 		current_offset += child_buf->b_bcount;
1171*0Sstevel@tonic-gate 		current_count -=  child_buf->b_bcount;
1172*0Sstevel@tonic-gate 		current_blkno +=  (sp_ext_offset_t)(btodb(child_buf->b_bcount));
1173*0Sstevel@tonic-gate 
1174*0Sstevel@tonic-gate 		if (more) {
1175*0Sstevel@tonic-gate 			mutex_enter(&ps->ps_mx);
1176*0Sstevel@tonic-gate 			ps->ps_frags++;
1177*0Sstevel@tonic-gate 			mutex_exit(&ps->ps_mx);
1178*0Sstevel@tonic-gate 		}
1179*0Sstevel@tonic-gate 
1180*0Sstevel@tonic-gate 		ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr,
1181*0Sstevel@tonic-gate 		    (mode | FKIOCTL), NULL);
1182*0Sstevel@tonic-gate 
1183*0Sstevel@tonic-gate 		/*
1184*0Sstevel@tonic-gate 		 * Free the child structure as we've finished with it.
1185*0Sstevel@tonic-gate 		 * Normally this would be done by sp_done() but we're just
1186*0Sstevel@tonic-gate 		 * using md_bioclone() to segment the transfer and we never
1187*0Sstevel@tonic-gate 		 * issue a strategy request so the iodone will not be called.
1188*0Sstevel@tonic-gate 		 */
1189*0Sstevel@tonic-gate 		kmem_cache_free(sp_child_cache, cs);
1190*0Sstevel@tonic-gate 		if (ret == 0) {
1191*0Sstevel@tonic-gate 			/* copyout the returned data to vdr_data + offset */
1192*0Sstevel@tonic-gate 			userbuf = (caddr_t)kbuffer;
1193*0Sstevel@tonic-gate 			userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer;
1194*0Sstevel@tonic-gate 			if (ddi_copyout(userbuf, vdr->vdr_data,
1195*0Sstevel@tonic-gate 			    cvdr.vdr_bytesread, mode)) {
1196*0Sstevel@tonic-gate 				ret = EFAULT;
1197*0Sstevel@tonic-gate 				goto err_out;
1198*0Sstevel@tonic-gate 			}
1199*0Sstevel@tonic-gate 			vdr->vdr_bytesread += cvdr.vdr_bytesread;
1200*0Sstevel@tonic-gate 		} else {
1201*0Sstevel@tonic-gate 			goto err_out;
1202*0Sstevel@tonic-gate 		}
1203*0Sstevel@tonic-gate 	} while (more);
1204*0Sstevel@tonic-gate 
1205*0Sstevel@tonic-gate 	/*
1206*0Sstevel@tonic-gate 	 * Update the user-supplied vol_directed_rd_t structure with the
1207*0Sstevel@tonic-gate 	 * contents of the last issued child request.
1208*0Sstevel@tonic-gate 	 */
1209*0Sstevel@tonic-gate 	vdr->vdr_flags = cvdr.vdr_flags;
1210*0Sstevel@tonic-gate 	vdr->vdr_side = cvdr.vdr_side;
1211*0Sstevel@tonic-gate 	bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME);
1212*0Sstevel@tonic-gate 
1213*0Sstevel@tonic-gate err_out:
1214*0Sstevel@tonic-gate 	if (ret != 0) {
1215*0Sstevel@tonic-gate 		vdr->vdr_flags |= DKV_DMR_ERROR;
1216*0Sstevel@tonic-gate 	}
1217*0Sstevel@tonic-gate 	if (vdr->vdr_bytesread != vdr->vdr_nbytes) {
1218*0Sstevel@tonic-gate 		vdr->vdr_flags |= DKV_DMR_SHORT;
1219*0Sstevel@tonic-gate 	}
1220*0Sstevel@tonic-gate 	kmem_cache_free(sp_parent_cache, ps);
1221*0Sstevel@tonic-gate 	kmem_free(kbuffer, vdr->vdr_nbytes);
1222*0Sstevel@tonic-gate 	freerbuf(parent_buf);
1223*0Sstevel@tonic-gate 	md_unit_readerexit(ui);
1224*0Sstevel@tonic-gate 	return (ret);
1225*0Sstevel@tonic-gate }
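
/*
 * Hedged illustration, not driver code: a minimal user-level sketch of the
 * kind of DKIOCDMR request that sp_directed_read() ultimately services.
 * The header set, the use of <stropts.h> for ioctl() and the device path
 * handling are assumptions; the vol_directed_rd_t fields and DKV_DMR_*
 * flags mirror their use in the routine above.
 */
#include <sys/types.h>
#include <sys/dkio.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stropts.h>
#include <unistd.h>

static int
directed_read_sketch(const char *rawdev, off_t off, size_t nbytes)
{
	vol_directed_rd_t	vdr;
	int			fd, rc;

	if ((fd = open(rawdev, O_RDONLY)) < 0)
		return (-1);
	(void) memset(&vdr, 0, sizeof (vdr));
	vdr.vdr_offset = off;		/* byte offset of the read */
	vdr.vdr_nbytes = nbytes;	/* bytes requested */
	vdr.vdr_side = 0;		/* side selection; placeholder value */
	vdr.vdr_data = malloc(nbytes);	/* filled in via ddi_copyout() above */
	if (vdr.vdr_data == NULL) {
		(void) close(fd);
		return (-1);
	}
	rc = ioctl(fd, DKIOCDMR, &vdr);
	if (rc != 0 || (vdr.vdr_flags & (DKV_DMR_ERROR | DKV_DMR_SHORT)))
		(void) fprintf(stderr, "directed read got %lu of %lu bytes\n",
		    (ulong_t)vdr.vdr_bytesread, (ulong_t)nbytes);
	free(vdr.vdr_data);
	(void) close(fd);
	return (rc);
}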
1226*0Sstevel@tonic-gate 
1227*0Sstevel@tonic-gate /*
1228*0Sstevel@tonic-gate  * FUNCTION:	sp_snarf()
1229*0Sstevel@tonic-gate  * INPUT:	cmd	- snarf cmd.
1230*0Sstevel@tonic-gate  *		setno	- set number.
1231*0Sstevel@tonic-gate  * OUTPUT:	none.
1232*0Sstevel@tonic-gate  * RETURNS:	1	- soft partitions were snarfed.
1233*0Sstevel@tonic-gate  *		0	- no soft partitions were snarfed.
1234*0Sstevel@tonic-gate  * PURPOSE:	Snarf soft partition metadb records into their in-core
1235*0Sstevel@tonic-gate  *		structures.  This routine is called at "snarf time" when
1236*0Sstevel@tonic-gate  *		md loads and gets all metadevice records into memory.
1237*0Sstevel@tonic-gate  *		The basic algorithm is simply to walk the soft partition
1238*0Sstevel@tonic-gate  *		records in the metadb and call the soft partitioning
1239*0Sstevel@tonic-gate  *		build_incore routine to set up the in-core structures.
1240*0Sstevel@tonic-gate  */
1241*0Sstevel@tonic-gate static int
1242*0Sstevel@tonic-gate sp_snarf(md_snarfcmd_t cmd, set_t setno)
1243*0Sstevel@tonic-gate {
1244*0Sstevel@tonic-gate 	mp_unit_t	*un;
1245*0Sstevel@tonic-gate 	mddb_recid_t	recid;
1246*0Sstevel@tonic-gate 	int		gotsomething;
1247*0Sstevel@tonic-gate 	int		all_sp_gotten;
1248*0Sstevel@tonic-gate 	mddb_type_t	rec_type;
1249*0Sstevel@tonic-gate 	mddb_de_ic_t	*dep;
1250*0Sstevel@tonic-gate 	mddb_rb32_t	*rbp;
1251*0Sstevel@tonic-gate 	mp_unit_t	*big_un;
1252*0Sstevel@tonic-gate 	mp_unit32_od_t	*small_un;
1253*0Sstevel@tonic-gate 	size_t		newreqsize;
1254*0Sstevel@tonic-gate 
1255*0Sstevel@tonic-gate 
1256*0Sstevel@tonic-gate 	if (cmd == MD_SNARF_CLEANUP)
1257*0Sstevel@tonic-gate 		return (0);
1258*0Sstevel@tonic-gate 
1259*0Sstevel@tonic-gate 	all_sp_gotten = 1;
1260*0Sstevel@tonic-gate 	gotsomething = 0;
1261*0Sstevel@tonic-gate 
1262*0Sstevel@tonic-gate 	/* get the record type */
1263*0Sstevel@tonic-gate 	rec_type = (mddb_type_t)md_getshared_key(setno,
1264*0Sstevel@tonic-gate 	    sp_md_ops.md_driver.md_drivername);
1265*0Sstevel@tonic-gate 	recid = mddb_makerecid(setno, 0);
1266*0Sstevel@tonic-gate 
1267*0Sstevel@tonic-gate 	/*
1268*0Sstevel@tonic-gate 	 * walk soft partition records in the metadb and call
1269*0Sstevel@tonic-gate 	 * sp_build_incore to build in-core structures.
1270*0Sstevel@tonic-gate 	 */
1271*0Sstevel@tonic-gate 	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
1272*0Sstevel@tonic-gate 		/* if we've already gotten this record, go to the next one */
1273*0Sstevel@tonic-gate 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
1274*0Sstevel@tonic-gate 			continue;
1275*0Sstevel@tonic-gate 
1276*0Sstevel@tonic-gate 
1277*0Sstevel@tonic-gate 		dep = mddb_getrecdep(recid);
1278*0Sstevel@tonic-gate 		dep->de_flags = MDDB_F_SOFTPART;
1279*0Sstevel@tonic-gate 		rbp = dep->de_rb;
1280*0Sstevel@tonic-gate 
1281*0Sstevel@tonic-gate 		if ((rbp->rb_revision == MDDB_REV_RB) &&
1282*0Sstevel@tonic-gate 		    ((rbp->rb_private & MD_PRV_CONVD) == 0)) {
1283*0Sstevel@tonic-gate 			/*
1284*0Sstevel@tonic-gate 			 * This is an old, small record that has not yet
1285*0Sstevel@tonic-gate 			 * been converted.  Before we create an in-core
1286*0Sstevel@tonic-gate 			 * metadevice from it, we have to convert it to a
1287*0Sstevel@tonic-gate 			 * big record.
1288*0Sstevel@tonic-gate 			 */
1289*0Sstevel@tonic-gate 			small_un = (mp_unit32_od_t *)mddb_getrecaddr(recid);
1290*0Sstevel@tonic-gate 			newreqsize = sizeof (mp_unit_t) +
1291*0Sstevel@tonic-gate 			    ((small_un->un_numexts - 1) *
1292*0Sstevel@tonic-gate 			    sizeof (struct mp_ext));
1293*0Sstevel@tonic-gate 			big_un = (mp_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP);
1294*0Sstevel@tonic-gate 			softpart_convert((caddr_t)small_un, (caddr_t)big_un,
1295*0Sstevel@tonic-gate 			    SMALL_2_BIG);
1296*0Sstevel@tonic-gate 			kmem_free(small_un, dep->de_reqsize);
1297*0Sstevel@tonic-gate 			dep->de_rb_userdata = big_un;
1298*0Sstevel@tonic-gate 			dep->de_reqsize = newreqsize;
1299*0Sstevel@tonic-gate 			rbp->rb_private |= MD_PRV_CONVD;
1300*0Sstevel@tonic-gate 			un = big_un;
1301*0Sstevel@tonic-gate 		} else {
1302*0Sstevel@tonic-gate 			/* Large device */
1303*0Sstevel@tonic-gate 			un = (mp_unit_t *)mddb_getrecaddr(recid);
1304*0Sstevel@tonic-gate 		}
1305*0Sstevel@tonic-gate 
1306*0Sstevel@tonic-gate 		/* Set revision and flag accordingly */
1307*0Sstevel@tonic-gate 		if (rbp->rb_revision == MDDB_REV_RB) {
1308*0Sstevel@tonic-gate 			un->c.un_revision = MD_32BIT_META_DEV;
1309*0Sstevel@tonic-gate 		} else {
1310*0Sstevel@tonic-gate 			un->c.un_revision = MD_64BIT_META_DEV;
1311*0Sstevel@tonic-gate 			un->c.un_flag |= MD_EFILABEL;
1312*0Sstevel@tonic-gate 		}
1313*0Sstevel@tonic-gate 
1314*0Sstevel@tonic-gate 		/*
1315*0Sstevel@tonic-gate 		 * Create minor node for snarfed entry.
1316*0Sstevel@tonic-gate 		 */
1317*0Sstevel@tonic-gate 		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));
1318*0Sstevel@tonic-gate 
1319*0Sstevel@tonic-gate 		if (MD_UNIT(MD_SID(un)) != NULL) {
1320*0Sstevel@tonic-gate 			/* unit is already in-core */
1321*0Sstevel@tonic-gate 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
1322*0Sstevel@tonic-gate 			continue;
1323*0Sstevel@tonic-gate 		}
1324*0Sstevel@tonic-gate 		all_sp_gotten = 0;
1325*0Sstevel@tonic-gate 		if (sp_build_incore((void *)un, 1) == 0) {
1326*0Sstevel@tonic-gate 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
1327*0Sstevel@tonic-gate 			md_create_unit_incore(MD_SID(un), &sp_md_ops, 0);
1328*0Sstevel@tonic-gate 			gotsomething = 1;
1329*0Sstevel@tonic-gate 		}
1330*0Sstevel@tonic-gate 	}
1331*0Sstevel@tonic-gate 
1332*0Sstevel@tonic-gate 	if (!all_sp_gotten)
1333*0Sstevel@tonic-gate 		return (gotsomething);
1334*0Sstevel@tonic-gate 	/* double-check records */
1335*0Sstevel@tonic-gate 	recid = mddb_makerecid(setno, 0);
1336*0Sstevel@tonic-gate 	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0)
1337*0Sstevel@tonic-gate 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
1338*0Sstevel@tonic-gate 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
1339*0Sstevel@tonic-gate 
1340*0Sstevel@tonic-gate 	return (0);
1341*0Sstevel@tonic-gate }
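
/*
 * Hedged illustration, not driver code: the sizing idiom behind the
 * small-to-big conversion above.  A unit ends in a one-element extent
 * array, so a unit describing n extents occupies sizeof (unit) plus
 * (n - 1) additional extent slots.  demo_unit/demo_ext are hypothetical
 * stand-ins for mp_unit_t and struct mp_ext.
 */
#include <stdlib.h>
#include <string.h>

struct demo_ext {			/* stand-in for struct mp_ext */
	unsigned long long voff, poff, len;
};

struct demo_unit {			/* stand-in for mp_unit_t */
	unsigned int numexts;
	struct demo_ext ext[1];		/* extents grow past the struct end */
};

static struct demo_unit *
demo_unit_alloc(unsigned int numexts)
{
	size_t		sz;
	struct demo_unit *up;

	if (numexts == 0)
		return (NULL);
	sz = sizeof (struct demo_unit) +
	    (numexts - 1) * sizeof (struct demo_ext);
	if ((up = malloc(sz)) != NULL) {
		(void) memset(up, 0, sz);
		up->numexts = numexts;
	}
	return (up);
}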
1342*0Sstevel@tonic-gate 
1343*0Sstevel@tonic-gate /*
1344*0Sstevel@tonic-gate  * FUNCTION:	sp_halt()
1345*0Sstevel@tonic-gate  * INPUT:	cmd	- halt cmd.
1346*0Sstevel@tonic-gate  *		setno	- set number.
1347*0Sstevel@tonic-gate  * RETURNS:	0	- success.
1348*0Sstevel@tonic-gate  *		1	- err.
1349*0Sstevel@tonic-gate  * PURPOSE:	Perform driver halt operations.  As with stripe, we
1350*0Sstevel@tonic-gate  *		support MD_HALT_CHECK and MD_HALT_DOIT.  The first
1351*0Sstevel@tonic-gate  *		does a check to see if halting can be done safely
1352*0Sstevel@tonic-gate  *		(no open soft partitions), the second cleans up and
1353*0Sstevel@tonic-gate  *		shuts down the driver.
1354*0Sstevel@tonic-gate  */
1355*0Sstevel@tonic-gate static int
1356*0Sstevel@tonic-gate sp_halt(md_haltcmd_t cmd, set_t setno)
1357*0Sstevel@tonic-gate {
1358*0Sstevel@tonic-gate 	int		i;
1359*0Sstevel@tonic-gate 	mdi_unit_t	*ui;
1360*0Sstevel@tonic-gate 	minor_t		mnum;
1361*0Sstevel@tonic-gate 
1362*0Sstevel@tonic-gate 	if (cmd == MD_HALT_CLOSE)
1363*0Sstevel@tonic-gate 		return (0);
1364*0Sstevel@tonic-gate 
1365*0Sstevel@tonic-gate 	if (cmd == MD_HALT_OPEN)
1366*0Sstevel@tonic-gate 		return (0);
1367*0Sstevel@tonic-gate 
1368*0Sstevel@tonic-gate 	if (cmd == MD_HALT_UNLOAD)
1369*0Sstevel@tonic-gate 		return (0);
1370*0Sstevel@tonic-gate 
1371*0Sstevel@tonic-gate 	if (cmd == MD_HALT_CHECK) {
1372*0Sstevel@tonic-gate 		for (i = 0; i < md_nunits; i++) {
1373*0Sstevel@tonic-gate 			mnum = MD_MKMIN(setno, i);
1374*0Sstevel@tonic-gate 			if ((ui = MDI_UNIT(mnum)) == NULL)
1375*0Sstevel@tonic-gate 				continue;
1376*0Sstevel@tonic-gate 			if (ui->ui_opsindex != sp_md_ops.md_selfindex)
1377*0Sstevel@tonic-gate 				continue;
1378*0Sstevel@tonic-gate 			if (md_unit_isopen(ui))
1379*0Sstevel@tonic-gate 				return (1);
1380*0Sstevel@tonic-gate 		}
1381*0Sstevel@tonic-gate 		return (0);
1382*0Sstevel@tonic-gate 	}
1383*0Sstevel@tonic-gate 
1384*0Sstevel@tonic-gate 	if (cmd != MD_HALT_DOIT)
1385*0Sstevel@tonic-gate 		return (1);
1386*0Sstevel@tonic-gate 
1387*0Sstevel@tonic-gate 	for (i = 0; i < md_nunits; i++) {
1388*0Sstevel@tonic-gate 		mnum = MD_MKMIN(setno, i);
1389*0Sstevel@tonic-gate 		if ((ui = MDI_UNIT(mnum)) == NULL)
1390*0Sstevel@tonic-gate 			continue;
1391*0Sstevel@tonic-gate 		if (ui->ui_opsindex != sp_md_ops.md_selfindex)
1392*0Sstevel@tonic-gate 			continue;
1393*0Sstevel@tonic-gate 		reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0);
1394*0Sstevel@tonic-gate 	}
1395*0Sstevel@tonic-gate 
1396*0Sstevel@tonic-gate 	return (0);
1397*0Sstevel@tonic-gate }
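
/*
 * Hedged sketch, not driver code: a user-space model of the two-phase
 * halt contract described above.  All names here (model_halt, unit_open,
 * NUNITS) are hypothetical; the point is that the DOIT phase only runs
 * after the CHECK phase reports that no unit is open.
 */
#include <stdio.h>

enum halt_cmd { HALT_CHECK, HALT_DOIT };

#define	NUNITS	8
static int unit_open[NUNITS];		/* 1 if the unit is open */

static int
model_halt(enum halt_cmd cmd)
{
	int i;

	if (cmd == HALT_CHECK) {
		for (i = 0; i < NUNITS; i++)
			if (unit_open[i])
				return (1);	/* busy: refuse to halt */
		return (0);
	}
	for (i = 0; i < NUNITS; i++)		/* HALT_DOIT: tear down */
		unit_open[i] = 0;
	return (0);
}

int
main(void)
{
	if (model_halt(HALT_CHECK) == 0)
		return (model_halt(HALT_DOIT));
	(void) printf("halt refused: a unit is still open\n");
	return (1);
}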
1398*0Sstevel@tonic-gate 
1399*0Sstevel@tonic-gate /*
1400*0Sstevel@tonic-gate  * FUNCTION:	sp_open_dev()
1401*0Sstevel@tonic-gate  * INPUT:	un	- unit structure.
1402*0Sstevel@tonic-gate  *		oflags	- open flags.
1403*0Sstevel@tonic-gate  * OUTPUT:	none.
1404*0Sstevel@tonic-gate  * RETURNS:	0		- success.
1405*0Sstevel@tonic-gate  *		non-zero	- err.
1406*0Sstevel@tonic-gate  * PURPOSE:	open underlying device via md_layered_open.
1407*0Sstevel@tonic-gate  */
1408*0Sstevel@tonic-gate static int
1409*0Sstevel@tonic-gate sp_open_dev(mp_unit_t *un, int oflags)
1410*0Sstevel@tonic-gate {
1411*0Sstevel@tonic-gate 	minor_t		mnum = MD_SID(un);
1412*0Sstevel@tonic-gate 	int		err;
1413*0Sstevel@tonic-gate 	md_dev64_t	tmpdev;
1414*0Sstevel@tonic-gate 	set_t		setno = MD_MIN2SET(MD_SID(un));
1415*0Sstevel@tonic-gate 	side_t		side = mddb_getsidenum(setno);
1416*0Sstevel@tonic-gate 
1417*0Sstevel@tonic-gate 	tmpdev = un->un_dev;
1418*0Sstevel@tonic-gate 	/*
1419*0Sstevel@tonic-gate 	 * Do the open by device id if the underlying device is regular
1420*0Sstevel@tonic-gate 	 */
1421*0Sstevel@tonic-gate 	if ((md_getmajor(tmpdev) != md_major) &&
1422*0Sstevel@tonic-gate 	    md_devid_found(setno, side, un->un_key) == 1) {
1423*0Sstevel@tonic-gate 		tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key);
1424*0Sstevel@tonic-gate 	}
1425*0Sstevel@tonic-gate 	err = md_layered_open(mnum, &tmpdev, oflags);
1426*0Sstevel@tonic-gate 	un->un_dev = tmpdev;
1427*0Sstevel@tonic-gate 
1428*0Sstevel@tonic-gate 	if (err)
1429*0Sstevel@tonic-gate 		return (ENXIO);
1430*0Sstevel@tonic-gate 
1431*0Sstevel@tonic-gate 	return (0);
1432*0Sstevel@tonic-gate }
1433*0Sstevel@tonic-gate 
1434*0Sstevel@tonic-gate /*
1435*0Sstevel@tonic-gate  * FUNCTION:	sp_open()
1436*0Sstevel@tonic-gate  * INPUT:	dev		- device to open.
1437*0Sstevel@tonic-gate  *		flag		- pass-through flag.
1438*0Sstevel@tonic-gate  *		otyp		- pass-through open type.
1439*0Sstevel@tonic-gate  *		cred_p		- credentials.
1440*0Sstevel@tonic-gate  *		md_oflags	- open flags.
1441*0Sstevel@tonic-gate  * OUTPUT:	none.
1442*0Sstevel@tonic-gate  * RETURNS:	0		- success.
1443*0Sstevel@tonic-gate  *		non-zero	- err.
1444*0Sstevel@tonic-gate  * PURPOSE:	open a soft partition.
1445*0Sstevel@tonic-gate  */
1446*0Sstevel@tonic-gate /* ARGSUSED */
1447*0Sstevel@tonic-gate static int
1448*0Sstevel@tonic-gate sp_open(
1449*0Sstevel@tonic-gate 	dev_t		*dev,
1450*0Sstevel@tonic-gate 	int		flag,
1451*0Sstevel@tonic-gate 	int		otyp,
1452*0Sstevel@tonic-gate 	cred_t		*cred_p,
1453*0Sstevel@tonic-gate 	int		md_oflags
1454*0Sstevel@tonic-gate )
1455*0Sstevel@tonic-gate {
1456*0Sstevel@tonic-gate 	minor_t		mnum = getminor(*dev);
1457*0Sstevel@tonic-gate 	mdi_unit_t	*ui = MDI_UNIT(mnum);
1458*0Sstevel@tonic-gate 	mp_unit_t	*un;
1459*0Sstevel@tonic-gate 	int		err = 0;
1460*0Sstevel@tonic-gate 	set_t		setno;
1461*0Sstevel@tonic-gate 
1462*0Sstevel@tonic-gate 	/* grab necessary locks */
1463*0Sstevel@tonic-gate 	un = (mp_unit_t *)md_unit_openclose_enter(ui);
1464*0Sstevel@tonic-gate 	setno = MD_UN2SET(un);
1465*0Sstevel@tonic-gate 
1466*0Sstevel@tonic-gate 	/* open underlying device, if necessary */
1467*0Sstevel@tonic-gate 	if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
1468*0Sstevel@tonic-gate 		if ((err = sp_open_dev(un, md_oflags)) != 0)
1469*0Sstevel@tonic-gate 			goto out;
1470*0Sstevel@tonic-gate 
1471*0Sstevel@tonic-gate 		if (MD_MNSET_SETNO(setno)) {
1472*0Sstevel@tonic-gate 			/* For probe, don't incur the overhead of validate */
1473*0Sstevel@tonic-gate 			if (!(md_oflags & MD_OFLG_PROBEDEV)) {
1474*0Sstevel@tonic-gate 				/*
1475*0Sstevel@tonic-gate 				 * Don't call sp_validate while
1476*0Sstevel@tonic-gate 				 * unit_openclose lock is held.  So, actually
1477*0Sstevel@tonic-gate 				 * open the device, drop openclose lock,
1478*0Sstevel@tonic-gate 				 * call sp_validate, reacquire openclose lock,
1479*0Sstevel@tonic-gate 				 * and close the device.  If sp_validate
1480*0Sstevel@tonic-gate 				 * succeeds, the device will be re-opened.
1481*0Sstevel@tonic-gate 				 */
1482*0Sstevel@tonic-gate 				if ((err = md_unit_incopen(mnum, flag,
1483*0Sstevel@tonic-gate 				    otyp)) != 0)
1484*0Sstevel@tonic-gate 					goto out;
1485*0Sstevel@tonic-gate 
1486*0Sstevel@tonic-gate 				mutex_enter(&ui->ui_mx);
1487*0Sstevel@tonic-gate 				ui->ui_lock |= MD_UL_OPENINPROGRESS;
1488*0Sstevel@tonic-gate 				mutex_exit(&ui->ui_mx);
1489*0Sstevel@tonic-gate 				md_unit_openclose_exit(ui);
1490*0Sstevel@tonic-gate 				if (otyp != OTYP_LYR)
1491*0Sstevel@tonic-gate 					rw_exit(&md_unit_array_rw.lock);
1492*0Sstevel@tonic-gate 
1493*0Sstevel@tonic-gate 				err = sp_validate(un);
1494*0Sstevel@tonic-gate 
1495*0Sstevel@tonic-gate 				if (otyp != OTYP_LYR)
1496*0Sstevel@tonic-gate 					rw_enter(&md_unit_array_rw.lock,
1497*0Sstevel@tonic-gate 					    RW_READER);
1498*0Sstevel@tonic-gate 				(void) md_unit_openclose_enter(ui);
1499*0Sstevel@tonic-gate 				(void) md_unit_decopen(mnum, otyp);
1500*0Sstevel@tonic-gate 				mutex_enter(&ui->ui_mx);
1501*0Sstevel@tonic-gate 				ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
1502*0Sstevel@tonic-gate 				cv_broadcast(&ui->ui_cv);
1503*0Sstevel@tonic-gate 				mutex_exit(&ui->ui_mx);
1504*0Sstevel@tonic-gate 				/*
1505*0Sstevel@tonic-gate 				 * Should be in the same state as before
1506*0Sstevel@tonic-gate 				 * the sp_validate.
1507*0Sstevel@tonic-gate 				 */
1508*0Sstevel@tonic-gate 				if (err != 0) {
1509*0Sstevel@tonic-gate 					/* close the device opened above */
1510*0Sstevel@tonic-gate 					md_layered_close(un->un_dev, md_oflags);
1511*0Sstevel@tonic-gate 					err = EIO;
1512*0Sstevel@tonic-gate 					goto out;
1513*0Sstevel@tonic-gate 				}
1514*0Sstevel@tonic-gate 			}
1515*0Sstevel@tonic-gate 			/*
1516*0Sstevel@tonic-gate 			 * As we're a multi-owner metadevice we need to ensure
1517*0Sstevel@tonic-gate 			 * that all nodes have the same idea of the status.
1518*0Sstevel@tonic-gate 			 * sp_validate() will mark the device as errored (if
1519*0Sstevel@tonic-gate 			 * it cannot read the watermark) or ok (if it was
1520*0Sstevel@tonic-gate 			 * previously errored but the watermark is now valid).
1521*0Sstevel@tonic-gate 			 * This code-path is only entered on the non-probe open
1522*0Sstevel@tonic-gate 			 * so we will maintain the errored state during a probe
1523*0Sstevel@tonic-gate 			 * call. This means the sys-admin must metarecover -m
1524*0Sstevel@tonic-gate 			 * call.  This means the sys-admin must run metarecover -m
1525*0Sstevel@tonic-gate 			 */
1526*0Sstevel@tonic-gate 		} else {
1527*0Sstevel@tonic-gate 			/* For probe, don't incur the overhead of validate */
1528*0Sstevel@tonic-gate 			if (!(md_oflags & MD_OFLG_PROBEDEV) &&
1529*0Sstevel@tonic-gate 			    (err = sp_validate(un)) != 0) {
1530*0Sstevel@tonic-gate 				/* close the device opened above */
1531*0Sstevel@tonic-gate 				md_layered_close(un->un_dev, md_oflags);
1532*0Sstevel@tonic-gate 				err = EIO;
1533*0Sstevel@tonic-gate 				goto out;
1534*0Sstevel@tonic-gate 			} else {
1535*0Sstevel@tonic-gate 				/*
1536*0Sstevel@tonic-gate 				 * We succeeded in validating the on-disk
1537*0Sstevel@tonic-gate 				 * format against the in-core one, so reset
1538*0Sstevel@tonic-gate 				 * the status if it's in error.
1539*0Sstevel@tonic-gate 				 */
1540*0Sstevel@tonic-gate 				if (un->un_status == MD_SP_ERR) {
1541*0Sstevel@tonic-gate 					un->un_status = MD_SP_OK;
1542*0Sstevel@tonic-gate 				}
1543*0Sstevel@tonic-gate 			}
1544*0Sstevel@tonic-gate 		}
1545*0Sstevel@tonic-gate 	}
1546*0Sstevel@tonic-gate 
1547*0Sstevel@tonic-gate 	/* count open */
1548*0Sstevel@tonic-gate 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
1549*0Sstevel@tonic-gate 		goto out;
1550*0Sstevel@tonic-gate 
1551*0Sstevel@tonic-gate out:
1552*0Sstevel@tonic-gate 	md_unit_openclose_exit(ui);
1553*0Sstevel@tonic-gate 	return (err);
1554*0Sstevel@tonic-gate }
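
/*
 * Hedged sketch, not driver code: a pthreads model of the lock-drop
 * pattern used in sp_open() above, where a potentially slow validation
 * must not run while the open/close lock is held.  All names here are
 * hypothetical and the state handling is simplified.
 */
#include <pthread.h>

static pthread_mutex_t openclose_mx = PTHREAD_MUTEX_INITIALIZER;
static int open_in_progress;		/* models MD_UL_OPENINPROGRESS */

static int
slow_validate(void)
{
	return (0);			/* stand-in for sp_validate() */
}

static int
model_open(void)
{
	int err;

	(void) pthread_mutex_lock(&openclose_mx);
	open_in_progress = 1;		/* tell other openers to wait */
	(void) pthread_mutex_unlock(&openclose_mx);

	err = slow_validate();		/* runs with the lock dropped */

	(void) pthread_mutex_lock(&openclose_mx);
	open_in_progress = 0;		/* back to the pre-validation state */
	(void) pthread_mutex_unlock(&openclose_mx);
	return (err);
}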
1555*0Sstevel@tonic-gate 
1556*0Sstevel@tonic-gate /*
1557*0Sstevel@tonic-gate  * FUNCTION:	sp_close()
1558*0Sstevel@tonic-gate  * INPUT:	dev		- device to close.
1559*0Sstevel@tonic-gate  *		flag		- pass-through flag.
1560*0Sstevel@tonic-gate  *		otyp		- pass-through type.
1561*0Sstevel@tonic-gate  *		cred_p		- credentials.
1562*0Sstevel@tonic-gate  *		md_cflags	- close flags.
1563*0Sstevel@tonic-gate  * OUTPUT:	none.
1564*0Sstevel@tonic-gate  * RETURNS:	0		- success.
1565*0Sstevel@tonic-gate  *		non-zero	- err.
1566*0Sstevel@tonic-gate  * PURPOSE:	close a soft partition.
1567*0Sstevel@tonic-gate  */
1568*0Sstevel@tonic-gate /* ARGSUSED */
1569*0Sstevel@tonic-gate static int
1570*0Sstevel@tonic-gate sp_close(
1571*0Sstevel@tonic-gate 	dev_t		dev,
1572*0Sstevel@tonic-gate 	int		flag,
1573*0Sstevel@tonic-gate 	int		otyp,
1574*0Sstevel@tonic-gate 	cred_t		*cred_p,
1575*0Sstevel@tonic-gate 	int		md_cflags
1576*0Sstevel@tonic-gate )
1577*0Sstevel@tonic-gate {
1578*0Sstevel@tonic-gate 	minor_t		mnum = getminor(dev);
1579*0Sstevel@tonic-gate 	mdi_unit_t	*ui = MDI_UNIT(mnum);
1580*0Sstevel@tonic-gate 	mp_unit_t	*un;
1581*0Sstevel@tonic-gate 	int		err = 0;
1582*0Sstevel@tonic-gate 
1583*0Sstevel@tonic-gate 	/* grab necessary locks */
1584*0Sstevel@tonic-gate 	un = (mp_unit_t *)md_unit_openclose_enter(ui);
1585*0Sstevel@tonic-gate 
1586*0Sstevel@tonic-gate 	/* count closed */
1587*0Sstevel@tonic-gate 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
1588*0Sstevel@tonic-gate 		goto out;
1589*0Sstevel@tonic-gate 
1590*0Sstevel@tonic-gate 	/* close devices, if necessary */
1591*0Sstevel@tonic-gate 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
1592*0Sstevel@tonic-gate 		md_layered_close(un->un_dev, md_cflags);
1593*0Sstevel@tonic-gate 	}
1594*0Sstevel@tonic-gate 
1595*0Sstevel@tonic-gate 	/*
1596*0Sstevel@tonic-gate 	 * If this is a MN set and transient capabilities (e.g. ABR/DMR)
1597*0Sstevel@tonic-gate 	 * are set, clear these capabilities if this is the last close
1598*0Sstevel@tonic-gate 	 * in the cluster.
1599*0Sstevel@tonic-gate 	 */
1600*0Sstevel@tonic-gate 	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1601*0Sstevel@tonic-gate 	    (ui->ui_tstate & MD_ABR_CAP)) {
1602*0Sstevel@tonic-gate 		md_unit_openclose_exit(ui);
1603*0Sstevel@tonic-gate 		mdmn_clear_all_capabilities(mnum);
1604*0Sstevel@tonic-gate 		return (0);
1605*0Sstevel@tonic-gate 	}
1606*0Sstevel@tonic-gate 	/* unlock and return status */
1607*0Sstevel@tonic-gate out:
1608*0Sstevel@tonic-gate 	md_unit_openclose_exit(ui);
1609*0Sstevel@tonic-gate 	return (err);
1610*0Sstevel@tonic-gate }
1611*0Sstevel@tonic-gate 
1612*0Sstevel@tonic-gate 
1613*0Sstevel@tonic-gate /* used in sp_dump routine */
1614*0Sstevel@tonic-gate static struct buf dumpbuf;
1615*0Sstevel@tonic-gate 
1616*0Sstevel@tonic-gate /*
1617*0Sstevel@tonic-gate  * FUNCTION:	sp_dump()
1618*0Sstevel@tonic-gate  * INPUT:	dev	- device to dump to.
1619*0Sstevel@tonic-gate  *		addr	- address to dump.
1620*0Sstevel@tonic-gate  *		blkno	- blkno on device.
1621*0Sstevel@tonic-gate  *		nblk	- number of blocks to dump.
1622*0Sstevel@tonic-gate  * OUTPUT:	none.
1623*0Sstevel@tonic-gate  * RETURNS:	result from bdev_dump.
1624*0Sstevel@tonic-gate  * PURPOSE:  This routine dumps memory to the disk.  It assumes that
1625*0Sstevel@tonic-gate  *           the memory has already been mapped into mainbus space.
1626*0Sstevel@tonic-gate  *           It is called at disk interrupt priority when the system
1627*0Sstevel@tonic-gate  *           is in trouble.
1628*0Sstevel@tonic-gate  *           NOTE: this function is defined using 32-bit arguments,
1629*0Sstevel@tonic-gate  *           but soft partitioning is internally 64-bit.  Arguments
1630*0Sstevel@tonic-gate  *           are cast where appropriate.
1631*0Sstevel@tonic-gate  */
1632*0Sstevel@tonic-gate static int
1633*0Sstevel@tonic-gate sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1634*0Sstevel@tonic-gate {
1635*0Sstevel@tonic-gate 	mp_unit_t	*un;
1636*0Sstevel@tonic-gate 	buf_t		*bp;
1637*0Sstevel@tonic-gate 	sp_ext_length_t	nb;
1638*0Sstevel@tonic-gate 	daddr_t		mapblk;
1639*0Sstevel@tonic-gate 	int		result;
1640*0Sstevel@tonic-gate 	int		more;
1641*0Sstevel@tonic-gate 	int		saveresult = 0;
1642*0Sstevel@tonic-gate 
1643*0Sstevel@tonic-gate 	/*
1644*0Sstevel@tonic-gate 	 * We don't need to grab the unit lock because nothing else
1645*0Sstevel@tonic-gate 	 * is supposed to be happening at this point.
1646*0Sstevel@tonic-gate 	 * Also, dump is not supposed to sleep.
1647*0Sstevel@tonic-gate 	 */
1648*0Sstevel@tonic-gate 	un = (mp_unit_t *)MD_UNIT(getminor(dev));
1649*0Sstevel@tonic-gate 
1650*0Sstevel@tonic-gate 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
1651*0Sstevel@tonic-gate 		return (EINVAL);
1652*0Sstevel@tonic-gate 
1653*0Sstevel@tonic-gate 	if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks)
1654*0Sstevel@tonic-gate 		return (EINVAL);
1655*0Sstevel@tonic-gate 
1656*0Sstevel@tonic-gate 	bp = &dumpbuf;
1657*0Sstevel@tonic-gate 	nb = (sp_ext_length_t)dbtob(nblk);
1658*0Sstevel@tonic-gate 	do {
1659*0Sstevel@tonic-gate 		bzero((caddr_t)bp, sizeof (*bp));
1660*0Sstevel@tonic-gate 		more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp);
1661*0Sstevel@tonic-gate 		nblk = (int)(btodb(bp->b_bcount));
1662*0Sstevel@tonic-gate 		mapblk = bp->b_blkno;
1663*0Sstevel@tonic-gate 		result = bdev_dump(bp->b_edev, addr, mapblk, nblk);
1664*0Sstevel@tonic-gate 		if (result)
1665*0Sstevel@tonic-gate 			saveresult = result;
1666*0Sstevel@tonic-gate 
1667*0Sstevel@tonic-gate 		nb -= bp->b_bcount;
1668*0Sstevel@tonic-gate 		addr += bp->b_bcount;
1669*0Sstevel@tonic-gate 		blkno += nblk;
1670*0Sstevel@tonic-gate 	} while (more);
1671*0Sstevel@tonic-gate 
1672*0Sstevel@tonic-gate 	return (saveresult);
1673*0Sstevel@tonic-gate }
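
/*
 * Hedged sketch, not driver code: a user-space model of the
 * map-and-advance fragmentation loop used in the I/O paths above
 * (sp_directed_read(), sp_dump() and the strategy path).  demo_map() is
 * a hypothetical stand-in for sp_mapbuf(): it returns how many blocks of
 * the request fit in the current extent and sets *more when the request
 * spills into the next extent.
 */
#include <stdio.h>

#define	EXT_BLKS	100ULL		/* hypothetical extent size in blocks */

static unsigned long long
demo_map(unsigned long long blkno, unsigned long long count, int *more)
{
	unsigned long long room = EXT_BLKS - (blkno % EXT_BLKS);
	unsigned long long frag = (count < room) ? count : room;

	*more = (frag < count);
	return (frag);
}

int
main(void)
{
	unsigned long long blkno = 250, count = 320, frag;
	int more;

	do {
		frag = demo_map(blkno, count, &more);
		(void) printf("issue %llu blocks at %llu\n", frag, blkno);
		blkno += frag;		/* advance the virtual block number */
		count -= frag;		/* shrink the remaining request */
	} while (more);
	return (0);
}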
1674*0Sstevel@tonic-gate 
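/*
 * FUNCTION:	sp_imp_set()
 * INPUT:	setno	- set number of the imported set.
 * OUTPUT:	none.
 * RETURNS:	1	- soft partition records were updated.
 *		0	- no soft partition records were updated.
 * PURPOSE:	Walk the soft partition records of an imported set, update
 *		the minor number information for each record and re-base
 *		each unit's self id, parent id and record id to the newly
 *		assigned set number.
 */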
1675*0Sstevel@tonic-gate static int
1676*0Sstevel@tonic-gate sp_imp_set(
1677*0Sstevel@tonic-gate 	set_t	setno
1678*0Sstevel@tonic-gate )
1679*0Sstevel@tonic-gate {
1680*0Sstevel@tonic-gate 	mddb_recid_t	recid;
1681*0Sstevel@tonic-gate 	int		gotsomething;
1682*0Sstevel@tonic-gate 	mddb_type_t	rec_type;
1683*0Sstevel@tonic-gate 	mddb_de_ic_t	*dep;
1684*0Sstevel@tonic-gate 	mddb_rb32_t	*rbp;
1685*0Sstevel@tonic-gate 	mp_unit_t	*un64;
1686*0Sstevel@tonic-gate 	mp_unit32_od_t	*un32;
1687*0Sstevel@tonic-gate 	minor_t		*self_id;	/* minor needs to be updated */
1688*0Sstevel@tonic-gate 	md_parent_t	*parent_id;	/* parent needs to be updated */
1689*0Sstevel@tonic-gate 	mddb_recid_t	*record_id;	/* record id needs to be updated */
1690*0Sstevel@tonic-gate 
1691*0Sstevel@tonic-gate 	gotsomething = 0;
1692*0Sstevel@tonic-gate 
1693*0Sstevel@tonic-gate 	rec_type = (mddb_type_t)md_getshared_key(setno,
1694*0Sstevel@tonic-gate 	    sp_md_ops.md_driver.md_drivername);
1695*0Sstevel@tonic-gate 	recid = mddb_makerecid(setno, 0);
1696*0Sstevel@tonic-gate 
1697*0Sstevel@tonic-gate 	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
1698*0Sstevel@tonic-gate 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
1699*0Sstevel@tonic-gate 			continue;
1700*0Sstevel@tonic-gate 
1701*0Sstevel@tonic-gate 		dep = mddb_getrecdep(recid);
1702*0Sstevel@tonic-gate 		rbp = dep->de_rb;
1703*0Sstevel@tonic-gate 
1704*0Sstevel@tonic-gate 		if (rbp->rb_revision == MDDB_REV_RB) {
1705*0Sstevel@tonic-gate 			/*
1706*0Sstevel@tonic-gate 			 * Small device
1707*0Sstevel@tonic-gate 			 */
1708*0Sstevel@tonic-gate 			un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid);
1709*0Sstevel@tonic-gate 			self_id = &(un32->c.un_self_id);
1710*0Sstevel@tonic-gate 			parent_id = &(un32->c.un_parent);
1711*0Sstevel@tonic-gate 			record_id = &(un32->c.un_record_id);
1712*0Sstevel@tonic-gate 
1713*0Sstevel@tonic-gate 			if (!md_update_minor(setno,
1714*0Sstevel@tonic-gate 			    mddb_getsidenum(setno), un32->un_key))
1715*0Sstevel@tonic-gate 				goto out;
1716*0Sstevel@tonic-gate 		} else {
1717*0Sstevel@tonic-gate 			un64 = (mp_unit_t *)mddb_getrecaddr(recid);
1718*0Sstevel@tonic-gate 			self_id = &(un64->c.un_self_id);
1719*0Sstevel@tonic-gate 			parent_id = &(un64->c.un_parent);
1720*0Sstevel@tonic-gate 			record_id = &(un64->c.un_record_id);
1721*0Sstevel@tonic-gate 
1722*0Sstevel@tonic-gate 			if (!md_update_minor(setno,
1723*0Sstevel@tonic-gate 			    mddb_getsidenum(setno), un64->un_key))
1724*0Sstevel@tonic-gate 				goto out;
1725*0Sstevel@tonic-gate 		}
1726*0Sstevel@tonic-gate 
1727*0Sstevel@tonic-gate 		/*
1728*0Sstevel@tonic-gate 		 * Update unit with the imported setno
1729*0Sstevel@tonic-gate 		 *
1730*0Sstevel@tonic-gate 		 */
1731*0Sstevel@tonic-gate 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
1732*0Sstevel@tonic-gate 
1733*0Sstevel@tonic-gate 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
1734*0Sstevel@tonic-gate 		if (*parent_id != MD_NO_PARENT)
1735*0Sstevel@tonic-gate 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
1736*0Sstevel@tonic-gate 		*record_id = MAKERECID(setno, DBID(*record_id));
1737*0Sstevel@tonic-gate 
1738*0Sstevel@tonic-gate 		gotsomething = 1;
1739*0Sstevel@tonic-gate 	}
1740*0Sstevel@tonic-gate 
1741*0Sstevel@tonic-gate out:
1742*0Sstevel@tonic-gate 	return (gotsomething);
1743*0Sstevel@tonic-gate }
1744*0Sstevel@tonic-gate 
1745*0Sstevel@tonic-gate static md_named_services_t sp_named_services[] = {
1746*0Sstevel@tonic-gate 	{NULL,					0}
1747*0Sstevel@tonic-gate };
1748*0Sstevel@tonic-gate 
1749*0Sstevel@tonic-gate md_ops_t sp_md_ops = {
1750*0Sstevel@tonic-gate 	sp_open,		/* open */
1751*0Sstevel@tonic-gate 	sp_close,		/* close */
1752*0Sstevel@tonic-gate 	md_sp_strategy,		/* strategy */
1753*0Sstevel@tonic-gate 	NULL,			/* print */
1754*0Sstevel@tonic-gate 	sp_dump,		/* dump */
1755*0Sstevel@tonic-gate 	NULL,			/* read */
1756*0Sstevel@tonic-gate 	NULL,			/* write */
1757*0Sstevel@tonic-gate 	md_sp_ioctl,		/* ioctl, */
1758*0Sstevel@tonic-gate 	sp_snarf,		/* snarf */
1759*0Sstevel@tonic-gate 	sp_halt,		/* halt */
1760*0Sstevel@tonic-gate 	NULL,			/* aread */
1761*0Sstevel@tonic-gate 	NULL,			/* awrite */
1762*0Sstevel@tonic-gate 	sp_imp_set,		/* import set */
1763*0Sstevel@tonic-gate 	sp_named_services
1764*0Sstevel@tonic-gate };
1765*0Sstevel@tonic-gate 
1766*0Sstevel@tonic-gate static void
1767*0Sstevel@tonic-gate init_init()
1768*0Sstevel@tonic-gate {
1769*0Sstevel@tonic-gate 	sp_parent_cache = kmem_cache_create("md_softpart_parent",
1770*0Sstevel@tonic-gate 	    sizeof (md_spps_t), 0, sp_parent_constructor,
1771*0Sstevel@tonic-gate 	    sp_parent_destructor, sp_run_queue, NULL, NULL, 0);
1772*0Sstevel@tonic-gate 	sp_child_cache = kmem_cache_create("md_softpart_child",
1773*0Sstevel@tonic-gate 	    sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0,
1774*0Sstevel@tonic-gate 	    sp_child_constructor, sp_child_destructor, sp_run_queue,
1775*0Sstevel@tonic-gate 	    NULL, NULL, 0);
1776*0Sstevel@tonic-gate }
1777*0Sstevel@tonic-gate 
1778*0Sstevel@tonic-gate static void
1779*0Sstevel@tonic-gate fini_uninit()
1780*0Sstevel@tonic-gate {
1781*0Sstevel@tonic-gate 	kmem_cache_destroy(sp_parent_cache);
1782*0Sstevel@tonic-gate 	kmem_cache_destroy(sp_child_cache);
1783*0Sstevel@tonic-gate 	sp_parent_cache = sp_child_cache = NULL;
1784*0Sstevel@tonic-gate }
1785*0Sstevel@tonic-gate 
1786*0Sstevel@tonic-gate /* define the module linkage */
1787*0Sstevel@tonic-gate MD_PLUGIN_MISC_MODULE("soft partition module %I%", init_init(), fini_uninit())
1788