xref: /onnv-gate/usr/src/uts/common/io/lvm/md/md.c (revision 7627:8599a7568728)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
 * Md is the meta-disk driver.  It sits below the UFS file system
 * but above the 'real' disk drivers (xy, id, sd, etc.).
30  *
31  * To the UFS software, md looks like a normal driver, since it has
32  * the normal kinds of entries in the bdevsw and cdevsw arrays. So
33  * UFS accesses md in the usual ways.  In particular, the strategy
34  * routine, mdstrategy(), gets called by fbiwrite(), ufs_getapage(),
35  * and ufs_writelbn().
36  *
37  * Md maintains an array of minor devices (meta-partitions).   Each
38  * meta partition stands for a matrix of real partitions, in rows
39  * which are not necessarily of equal length.	Md maintains a table,
40  * with one entry for each meta-partition,  which lists the rows and
41  * columns of actual partitions, and the job of the strategy routine
42  * is to translate from the meta-partition device and block numbers
43  * known to UFS into the actual partitions' device and block numbers.
44  *
45  * See below, in mdstrategy(), mdreal(), and mddone() for details of
46  * this translation.
47  */
48 
49 /*
50  * Driver for Virtual Disk.
51  */
52 
53 #include <sys/user.h>
54 #include <sys/sysmacros.h>
55 #include <sys/conf.h>
56 #include <sys/stat.h>
57 #include <sys/errno.h>
58 #include <sys/param.h>
59 #include <sys/systm.h>
60 #include <sys/file.h>
61 #include <sys/open.h>
62 #include <sys/dkio.h>
63 #include <sys/vtoc.h>
64 #include <sys/cmn_err.h>
65 #include <sys/ddi.h>
66 #include <sys/sunddi.h>
67 #include <sys/debug.h>
68 #include <sys/utsname.h>
69 #include <sys/lvm/mdvar.h>
70 #include <sys/lvm/md_names.h>
71 #include <sys/lvm/md_mddb.h>
72 #include <sys/lvm/md_sp.h>
73 #include <sys/types.h>
74 #include <sys/kmem.h>
75 #include <sys/cladm.h>
76 #include <sys/priv_names.h>
77 #include <sys/modhash.h>
78 
79 #ifndef	lint
80 char 		_depends_on[] = "strmod/rpcmod";
81 #endif	/* lint */
82 int		md_init_debug	= 0;	/* module binding debug */
83 
84 /*
85  * Tunable to turn off the failfast behavior.
86  */
87 int		md_ff_disable = 0;
88 
89 /*
90  * dynamically allocated list of non FF driver names - needs to
91  * be freed when md is detached.
92  */
93 char	**non_ff_drivers = NULL;
94 
95 md_krwlock_t	md_unit_array_rw;	/* protects all unit arrays */
96 md_krwlock_t	nm_lock;		/* protects all the name spaces */
97 
98 md_resync_t	md_cpr_resync;
99 
100 extern char	svm_bootpath[];
101 #define	SVM_PSEUDO_STR	"/pseudo/md@0:"
102 
103 #define		VERSION_LENGTH	6
104 #define		VERSION		"1.0"
105 
106 /*
107  * Keep track of possible 'orphan' entries in the name space
108  */
109 int		*md_nm_snarfed = NULL;
110 
111 /*
112  * Global tunable giving the percentage of free space left in replica during
113  * conversion of non-devid style replica to devid style replica.
114  */
115 int		md_conv_perc = MDDB_DEVID_CONV_PERC;
116 
#ifdef	DEBUG
/*
 * Debug code to verify framework exclusion guarantees.
 *
 * md_in holds one IN_* bit for each driver entry point currently running;
 * md_in_mx is taken around every update of md_in by the macros below.
 */
int		md_in;
kmutex_t	md_in_mx;			/* protects md_in */
#define	IN_INIT		0x01
#define	IN_FINI		0x02
#define	IN_ATTACH	0x04
#define	IN_DETACH	0x08
#define	IN_OPEN		0x10

/*
 * Record that entry point 'x' is now running.  Drops into the debugger if
 * any entry point is already marked as running.
 *
 * The argument is parenthesized everywhere it is used so that an expression
 * argument is treated as a single operand.  The brace form of the macro is
 * kept as-is so that existing uses expand the same way.
 */
#define	MD_SET_IN(x) {						\
	mutex_enter(&md_in_mx);					\
	if (md_in)						\
		debug_enter("MD_SET_IN exclusion lost");	\
	if (md_in & (x))					\
		debug_enter("MD_SET_IN already set");		\
	md_in |= (x);						\
	mutex_exit(&md_in_mx);					\
}

/*
 * Record that entry point 'x' has finished.  Drops into the debugger if a
 * different entry point is marked as running or if 'x' was not marked.
 */
#define	MD_CLR_IN(x) {						\
	mutex_enter(&md_in_mx);					\
	if (md_in & ~(x))					\
		debug_enter("MD_CLR_IN exclusion lost");	\
	if (!(md_in & (x)))					\
		debug_enter("MD_CLR_IN already clr");		\
	md_in &= ~(x);						\
	mutex_exit(&md_in_mx);					\
}
#else	/* DEBUG */
/* non-DEBUG kernels: the exclusion checks compile away to nothing */
#define	MD_SET_IN(x)
#define	MD_CLR_IN(x)
#endif	/* DEBUG */
149 hrtime_t savetime1, savetime2;
150 
151 
152 /*
153  * list things protected by md_mx even if they aren't
154  * used in this file.
155  */
156 kmutex_t	md_mx;			/* used to md global stuff */
157 kcondvar_t	md_cv;			/* md_status events */
158 int		md_status = 0;		/* global status for the meta-driver */
159 int		md_num_daemons = 0;
160 int		md_ioctl_cnt = 0;
161 int		md_mtioctl_cnt = 0;	/* multithreaded ioctl cnt */
162 uint_t		md_mdelay = 10;		/* variable so can be patched */
163 
164 int		(*mdv_strategy_tstpnt)(buf_t *, int, void*);
165 
166 major_t		md_major, md_major_targ;
167 
168 unit_t		md_nunits = MD_MAXUNITS;
169 set_t		md_nsets = MD_MAXSETS;
170 int		md_nmedh = 0;
171 char		*md_med_trans_lst = NULL;
172 md_set_t	md_set[MD_MAXSETS];
173 md_set_io_t	md_set_io[MD_MAXSETS];
174 
175 md_krwlock_t	hsp_rwlp;		/* protects hot_spare_interface */
176 md_krwlock_t	ni_rwlp;		/* protects notify_interface */
177 md_ops_t	**md_ops = NULL;
178 ddi_modhandle_t	*md_mods = NULL;
179 md_ops_t	*md_opslist;
180 clock_t		md_hz;
181 md_event_queue_t	*md_event_queue = NULL;
182 
183 int		md_in_upgrade;
184 int		md_keep_repl_state;
185 int		md_devid_destroy;
186 
187 /* for sending messages thru a door to userland */
188 door_handle_t	mdmn_door_handle = NULL;
189 int		mdmn_door_did = -1;
190 
191 dev_info_t		*md_devinfo = NULL;
192 
193 md_mn_nodeid_t	md_mn_mynode_id = ~0u;	/* My node id (for multi-node sets) */
194 
195 static	uint_t		md_ocnt[OTYPCNT];
196 
197 static int		mdinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
198 static int		mdattach(dev_info_t *, ddi_attach_cmd_t);
199 static int		mddetach(dev_info_t *, ddi_detach_cmd_t);
200 static int		mdopen(dev_t *, int, int, cred_t *);
201 static int		mdclose(dev_t, int, int, cred_t *);
202 static int		mddump(dev_t, caddr_t, daddr_t, int);
203 static int		mdread(dev_t, struct uio *, cred_t *);
204 static int		mdwrite(dev_t, struct uio *, cred_t *);
205 static int		mdaread(dev_t, struct aio_req *, cred_t *);
206 static int		mdawrite(dev_t, struct aio_req *, cred_t *);
207 static int		mdioctl(dev_t, int, intptr_t, int, cred_t *, int *);
208 static int		mdprop_op(dev_t, dev_info_t *,
209 				ddi_prop_op_t, int, char *, caddr_t, int *);
210 
211 static struct cb_ops md_cb_ops = {
212 	mdopen,			/* open */
213 	mdclose,		/* close */
214 	mdstrategy,		/* strategy */
215 				/* print routine -- none yet */
216 	(int(*)(dev_t, char *))nulldev,
217 	mddump,			/* dump */
218 	mdread,			/* read */
219 	mdwrite,		/* write */
220 	mdioctl,		/* ioctl */
221 				/* devmap */
222 	(int(*)(dev_t, devmap_cookie_t, offset_t, size_t, size_t *,
223 			uint_t))nodev,
224 				/* mmap */
225 	(int(*)(dev_t, off_t, int))nodev,
226 				/* segmap */
227 	(int(*)(dev_t, off_t, struct as *, caddr_t *, off_t, unsigned,
228 		unsigned, unsigned, cred_t *))nodev,
229 	nochpoll,		/* poll */
230 	mdprop_op,		/* prop_op */
231 	0,			/* streamtab */
232 	(D_64BIT|D_MP|D_NEW),	/* driver compatibility flag */
233 	CB_REV,			/* cb_ops version */
234 	mdaread,		/* aread */
235 	mdawrite,		/* awrite */
236 };
237 
238 static struct dev_ops md_devops = {
239 	DEVO_REV,		/* dev_ops version */
240 	0,			/* device reference count */
241 	mdinfo,			/* info routine */
242 	nulldev,		/* identify routine */
243 	nulldev,		/* probe - not defined */
244 	mdattach,		/* attach routine */
245 	mddetach,		/* detach routine */
246 	nodev,			/* reset - not defined */
247 	&md_cb_ops,		/* driver operations */
248 	NULL,			/* bus operations */
249 	nodev			/* power management */
250 };
251 
252 /*
253  * loadable module wrapper
254  */
255 #include <sys/modctl.h>
256 
257 static struct modldrv modldrv = {
258 	&mod_driverops,			/* type of module -- a pseudodriver */
259 	"Solaris Volume Manager base module", /* name of the module */
260 	&md_devops,			/* driver ops */
261 };
262 
263 static struct modlinkage modlinkage = {
264 	MODREV_1,
265 	(void *)&modldrv,
266 	NULL
267 };
268 
269 
270 /* md_medd.c */
271 extern	void	med_init(void);
272 extern	void	med_fini(void);
273 extern  void	md_devid_cleanup(set_t, uint_t);
274 
275 /* md_names.c */
276 extern void			*lookup_entry(struct nm_next_hdr *, set_t,
277 					side_t, mdkey_t, md_dev64_t, int);
278 extern struct nm_next_hdr	*get_first_record(set_t, int, int);
279 extern int			remove_entry(struct nm_next_hdr *,
280 					side_t, mdkey_t, int);
281 
282 int		md_maxphys	= 0;	/* maximum io size in bytes */
283 #define		MD_MAXBCOUNT	(1024 * 1024)
284 unsigned	md_maxbcount	= 0;	/* maximum physio size in bytes */
285 
286 /*
287  * Some md ioctls trigger io framework device tree operations.  An
288  * example is md ioctls that call md_resolve_bydevid(): which uses the
289  * io framework to resolve a devid. Such operations result in acquiring
290  * io framework locks (like ndi_devi_enter() of "/") while holding
291  * driver locks (like md_unit_writerlock()).
292  *
293  * The prop_op(9E) entry point is called from the devinfo driver with
294  * an active ndi_devi_enter of "/". To avoid deadlock, md's prop_op
295  * implementation must avoid taking a lock that is held per above md
296  * ioctl description: i.e. mdprop_op(9E) can't call md_unit_readerlock()
297  * without risking deadlock.
298  *
299  * To service "size" requests without risking deadlock, we maintain a
300  * "mnum->nblocks" sizemap (protected by a short-term global mutex).
301  */
302 static kmutex_t		md_nblocks_mutex;
303 static mod_hash_t	*md_nblocksmap;		/* mnum -> nblocks */
304 int			md_nblocksmap_size = 512;
305 
306 /*
307  * Maintain "mnum->nblocks" sizemap for mdprop_op use:
308  *
309  * Create: any code that establishes a unit's un_total_blocks needs the
310  * following type of call to establish nblocks for mdprop_op():
311  *	md_nblocks_set(mnum, un->c.un_total_blocks);"
312  *	NOTE: locate via cscope md_create_minor_node/md_create_unit_incore
313  *		...or  "MD_UNIT..*="
314  *
315  * Change: any code that changes a unit's un_total_blocks needs the
316  * following type of call to sync nblocks for mdprop_op():
317  *	md_nblocks_set(mnum, un->c.un_total_blocks);"
318  *	NOTE: locate via cscope for "un_total_blocks[ \t]*="
319  *
320  * Destroy: any code that deletes a unit needs the following type of call
321  * to sync nblocks for mdprop_op():
322  *	md_nblocks_set(mnum, -1ULL);
323  *	NOTE: locate via cscope md_remove_minor_node/md_destroy_unit_incore
324  *		...or  "MD_UNIT..*="
325  */
326 void
327 md_nblocks_set(minor_t mnum, uint64_t nblocks)
328 {
329 	mutex_enter(&md_nblocks_mutex);
330 	if (nblocks == -1ULL)
331 		(void) mod_hash_destroy(md_nblocksmap,
332 		    (mod_hash_key_t)(intptr_t)mnum);
333 	else
334 		(void) mod_hash_replace(md_nblocksmap,
335 		    (mod_hash_key_t)(intptr_t)mnum,
336 		    (mod_hash_val_t)(intptr_t)nblocks);
337 	mutex_exit(&md_nblocks_mutex);
338 }
339 
340 /* get the size of a mnum from "mnum->nblocks" sizemap */
341 uint64_t
342 md_nblocks_get(minor_t mnum)
343 {
344 	mod_hash_val_t	hv;
345 
346 	mutex_enter(&md_nblocks_mutex);
347 	if (mod_hash_find(md_nblocksmap,
348 	    (mod_hash_key_t)(intptr_t)mnum, &hv) == 0) {
349 		mutex_exit(&md_nblocks_mutex);
350 		return ((uint64_t)(intptr_t)hv);
351 	}
352 	mutex_exit(&md_nblocks_mutex);
353 	return (0);
354 }
355 
356 /* allocate/free dynamic space associated with driver globals */
357 void
358 md_global_alloc_free(int alloc)
359 {
360 	set_t	s;
361 
362 	if (alloc) {
363 		/* initialize driver global locks */
364 		cv_init(&md_cv, NULL, CV_DEFAULT, NULL);
365 		mutex_init(&md_mx, NULL, MUTEX_DEFAULT, NULL);
366 		rw_init(&md_unit_array_rw.lock, NULL, RW_DEFAULT, NULL);
367 		rw_init(&nm_lock.lock, NULL, RW_DEFAULT, NULL);
368 		rw_init(&ni_rwlp.lock, NULL, RW_DRIVER, NULL);
369 		rw_init(&hsp_rwlp.lock, NULL, RW_DRIVER, NULL);
370 		mutex_init(&md_cpr_resync.md_resync_mutex, NULL,
371 		    MUTEX_DEFAULT, NULL);
372 		mutex_init(&md_nblocks_mutex, NULL, MUTEX_DEFAULT, NULL);
373 
374 		/* initialize per set driver global locks */
375 		for (s = 0; s < MD_MAXSETS; s++) {
376 			/* initialize per set driver globals locks */
377 			mutex_init(&md_set[s].s_dbmx,
378 			    NULL, MUTEX_DEFAULT, NULL);
379 			mutex_init(&md_set_io[s].md_io_mx,
380 			    NULL, MUTEX_DEFAULT, NULL);
381 			cv_init(&md_set_io[s].md_io_cv,
382 			    NULL, CV_DEFAULT, NULL);
383 		}
384 	} else {
385 		/* destroy per set driver global locks */
386 		for (s = 0; s < MD_MAXSETS; s++) {
387 			cv_destroy(&md_set_io[s].md_io_cv);
388 			mutex_destroy(&md_set_io[s].md_io_mx);
389 			mutex_destroy(&md_set[s].s_dbmx);
390 		}
391 
392 		/* destroy driver global locks */
393 		mutex_destroy(&md_nblocks_mutex);
394 		mutex_destroy(&md_cpr_resync.md_resync_mutex);
395 		rw_destroy(&hsp_rwlp.lock);
396 		rw_destroy(&ni_rwlp.lock);
397 		rw_destroy(&nm_lock.lock);
398 		rw_destroy(&md_unit_array_rw.lock);
399 		mutex_destroy(&md_mx);
400 		cv_destroy(&md_cv);
401 	}
402 }
403 
404 int
405 _init(void)
406 {
407 	set_t	s;
408 	int	err;
409 
410 	MD_SET_IN(IN_INIT);
411 
412 	/* allocate dynamic space associated with driver globals */
413 	md_global_alloc_free(1);
414 
415 	/* initialize driver globals */
416 	md_major = ddi_name_to_major("md");
417 	md_hz = drv_usectohz(NUM_USEC_IN_SEC);
418 
419 	/* initialize tunable globals */
420 	if (md_maxphys == 0)		/* maximum io size in bytes */
421 		md_maxphys = maxphys;
422 	if (md_maxbcount == 0)		/* maximum physio size in bytes */
423 		md_maxbcount = MD_MAXBCOUNT;
424 
425 	/* initialize per set driver globals */
426 	for (s = 0; s < MD_MAXSETS; s++)
427 		md_set_io[s].io_state = MD_SET_ACTIVE;
428 
429 	/*
430 	 * NOTE: the framework does not currently guarantee exclusion
431 	 * between _init and attach after calling mod_install.
432 	 */
433 	MD_CLR_IN(IN_INIT);
434 	if ((err = mod_install(&modlinkage))) {
435 		MD_SET_IN(IN_INIT);
436 		md_global_alloc_free(0);	/* free dynamic space */
437 		MD_CLR_IN(IN_INIT);
438 	}
439 	return (err);
440 }
441 
442 int
443 _fini(void)
444 {
445 	int	err;
446 
447 	/*
448 	 * NOTE: the framework currently does not guarantee exclusion
449 	 * with attach until after mod_remove returns 0.
450 	 */
451 	if ((err = mod_remove(&modlinkage)))
452 		return (err);
453 
454 	MD_SET_IN(IN_FINI);
455 	md_global_alloc_free(0);	/* free dynamic space */
456 	MD_CLR_IN(IN_FINI);
457 	return (err);
458 }
459 
460 int
461 _info(struct modinfo *modinfop)
462 {
463 	return (mod_info(&modlinkage, modinfop));
464 }
465 
466 /* ARGSUSED */
467 static int
468 mdattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
469 {
470 	int	len;
471 	unit_t	i;
472 	size_t	sz;
473 	char	ver[VERSION_LENGTH];
474 	char	**maj_str_array;
475 	char	*str, *str2;
476 
477 	MD_SET_IN(IN_ATTACH);
478 	md_in_upgrade = 0;
479 	md_keep_repl_state = 0;
480 	md_devid_destroy = 0;
481 
482 	if (cmd != DDI_ATTACH) {
483 		MD_CLR_IN(IN_ATTACH);
484 		return (DDI_FAILURE);
485 	}
486 
487 	if (md_devinfo != NULL) {
488 		MD_CLR_IN(IN_ATTACH);
489 		return (DDI_FAILURE);
490 	}
491 
492 	mddb_init();
493 
494 	if (md_start_daemons(TRUE)) {
495 		MD_CLR_IN(IN_ATTACH);
496 		mddb_unload();		/* undo mddb_init() allocations */
497 		return (DDI_FAILURE);
498 	}
499 
500 	/* clear the halted state */
501 	md_clr_status(MD_GBL_HALTED);
502 
503 	/* see if the diagnostic switch is on */
504 	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
505 	    DDI_PROP_DONTPASS, "md_init_debug", 0))
506 		md_init_debug++;
507 
508 	/* see if the failfast disable switch is on */
509 	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
510 	    DDI_PROP_DONTPASS, "md_ff_disable", 0))
511 		md_ff_disable++;
512 
513 	/* try and get the md_nmedh property */
514 	md_nmedh = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
515 	    DDI_PROP_DONTPASS, "md_nmedh", MED_DEF_HOSTS);
516 	if ((md_nmedh <= 0) || (md_nmedh > MED_MAX_HOSTS))
517 		md_nmedh = MED_DEF_HOSTS;
518 
519 	/* try and get the md_med_trans_lst property */
520 	len = 0;
521 	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN,
522 	    0, "md_med_trans_lst", NULL, &len) != DDI_PROP_SUCCESS ||
523 	    len == 0) {
524 		md_med_trans_lst = md_strdup("tcp");
525 	} else {
526 		md_med_trans_lst = kmem_zalloc((size_t)len, KM_SLEEP);
527 		if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
528 		    0, "md_med_trans_lst", md_med_trans_lst, &len) !=
529 		    DDI_PROP_SUCCESS) {
530 			kmem_free(md_med_trans_lst, (size_t)len);
531 			md_med_trans_lst = md_strdup("tcp");
532 		}
533 	}
534 
535 	/*
536 	 * Must initialize the internal data structures before the
537 	 * any possible calls to 'goto attach_failure' as _fini
538 	 * routine references them.
539 	 */
540 	med_init();
541 
542 	md_ops = (md_ops_t **)kmem_zalloc(
543 	    sizeof (md_ops_t *) * MD_NOPS, KM_SLEEP);
544 	md_mods = (ddi_modhandle_t *)kmem_zalloc(
545 	    sizeof (ddi_modhandle_t) * MD_NOPS, KM_SLEEP);
546 
547 	/* try and get the md_xlate property */
548 	/* Should we only do this if upgrade? */
549 	len = sizeof (char) * 5;
550 	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
551 	    0, "md_xlate_ver", ver, &len) == DDI_PROP_SUCCESS) {
552 		if (strcmp(ver, VERSION) == 0) {
553 			len = 0;
554 			if (ddi_prop_op(DDI_DEV_T_ANY, dip,
555 			    PROP_LEN_AND_VAL_ALLOC, 0, "md_xlate",
556 			    (caddr_t)&md_tuple_table, &len) !=
557 			    DDI_PROP_SUCCESS) {
558 				if (md_init_debug)
559 					cmn_err(CE_WARN,
560 					    "md_xlate ddi_prop_op failed");
561 				goto attach_failure;
562 			} else {
563 				md_tuple_length =
564 				    len/(2 * ((int)sizeof (dev32_t)));
565 				md_in_upgrade = 1;
566 			}
567 
568 			/* Get target's name to major table */
569 			if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY,
570 			    dip, DDI_PROP_DONTPASS,
571 			    "md_targ_nm_table", &maj_str_array,
572 			    &md_majortab_len) != DDI_PROP_SUCCESS) {
573 				md_majortab_len = 0;
574 				if (md_init_debug)
575 					cmn_err(CE_WARN, "md_targ_nm_table "
576 					    "ddi_prop_lookup_string_array "
577 					    "failed");
578 				goto attach_failure;
579 			}
580 
581 			md_major_tuple_table =
582 			    (struct md_xlate_major_table *)
583 			    kmem_zalloc(md_majortab_len *
584 			    sizeof (struct md_xlate_major_table), KM_SLEEP);
585 
586 			for (i = 0; i < md_majortab_len; i++) {
587 				/* Getting major name */
588 				str = strchr(maj_str_array[i], ' ');
589 				if (str == NULL)
590 					continue;
591 				*str = '\0';
592 				md_major_tuple_table[i].drv_name =
593 				    md_strdup(maj_str_array[i]);
594 
595 				/* Simplified atoi to get major number */
596 				str2 = str + 1;
597 				md_major_tuple_table[i].targ_maj = 0;
598 				while ((*str2 >= '0') && (*str2 <= '9')) {
599 					md_major_tuple_table[i].targ_maj *= 10;
600 					md_major_tuple_table[i].targ_maj +=
601 					    *str2++ - '0';
602 				}
603 				*str = ' ';
604 			}
605 			ddi_prop_free((void *)maj_str_array);
606 		} else {
607 			if (md_init_debug)
608 				cmn_err(CE_WARN, "md_xlate_ver is incorrect");
609 			goto attach_failure;
610 		}
611 	}
612 
613 	/*
614 	 * Check for properties:
615 	 * 	md_keep_repl_state and md_devid_destroy
616 	 * and set globals if these exist.
617 	 */
618 	md_keep_repl_state = ddi_getprop(DDI_DEV_T_ANY, dip,
619 	    0, "md_keep_repl_state", 0);
620 
621 	md_devid_destroy = ddi_getprop(DDI_DEV_T_ANY, dip,
622 	    0, "md_devid_destroy", 0);
623 
624 	if (MD_UPGRADE)
625 		md_major_targ = md_targ_name_to_major("md");
626 	else
627 		md_major_targ = 0;
628 
629 	/* allocate admin device node */
630 	if (ddi_create_priv_minor_node(dip, "admin", S_IFCHR,
631 	    MD_ADM_MINOR, DDI_PSEUDO, 0, NULL, PRIV_SYS_CONFIG, 0640))
632 		goto attach_failure;
633 
634 	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
635 	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_SUCCESS)
636 		goto attach_failure;
637 
638 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip,
639 	    "ddi-abrwrite-supported", 1) != DDI_SUCCESS)
640 		goto attach_failure;
641 
642 	/* these could have been cleared by a detach */
643 	md_nunits = MD_MAXUNITS;
644 	md_nsets = MD_MAXSETS;
645 
646 	sz = sizeof (void *) * MD_MAXUNITS;
647 	if (md_set[0].s_un == NULL)
648 		md_set[0].s_un = kmem_zalloc(sz, KM_SLEEP);
649 	if (md_set[0].s_ui == NULL)
650 		md_set[0].s_ui = kmem_zalloc(sz, KM_SLEEP);
651 
652 	md_devinfo = dip;
653 
654 	/*
655 	 * Only allocate device node for root mirror metadevice.
656 	 * Don't pre-allocate unnecessary device nodes (thus slowing down a
657 	 * boot when we attach).
658 	 * We can't read the mddbs in attach.  The mddbs will be read
659 	 * by metainit during the boot process when it is doing the
660 	 * auto-take processing and any other minor nodes will be
661 	 * allocated at that point.
662 	 *
663 	 * There are two scenarios to be aware of here:
664 	 * 1) when we are booting from a mirrored root we need the root
665 	 *    metadevice to exist very early (during vfs_mountroot processing)
666 	 * 2) we need all of the nodes to be created so that any mnttab entries
667 	 *    will succeed (handled by metainit reading the mddb during boot).
668 	 */
669 	if (strncmp(SVM_PSEUDO_STR, svm_bootpath, sizeof (SVM_PSEUDO_STR) - 1)
670 	    == 0) {
671 		char *p;
672 		int mnum = 0;
673 
674 		/*
675 		 * The svm_bootpath string looks something like
676 		 * /pseudo/md@0:0,150,blk where 150 is the minor number
677 		 * in this example so we need to set the pointer p onto
678 		 * the first digit of the minor number and convert it
679 		 * from ascii.
680 		 */
681 		for (p = svm_bootpath + sizeof (SVM_PSEUDO_STR) + 1;
682 		    *p >= '0' && *p <= '9'; p++) {
683 			mnum *= 10;
684 			mnum += *p - '0';
685 		}
686 
687 		if (md_create_minor_node(0, mnum)) {
688 			kmem_free(md_set[0].s_un, sz);
689 			kmem_free(md_set[0].s_ui, sz);
690 			goto attach_failure;
691 		}
692 	}
693 
694 	/* create the hash to store the meta device sizes */
695 	md_nblocksmap = mod_hash_create_idhash("md_nblocksmap",
696 	    md_nblocksmap_size, mod_hash_null_valdtor);
697 
698 	MD_CLR_IN(IN_ATTACH);
699 	return (DDI_SUCCESS);
700 
701 attach_failure:
702 	/*
703 	 * Use our own detach routine to toss any stuff we allocated above.
704 	 * NOTE: detach will call md_halt to free the mddb_init allocations.
705 	 */
706 	MD_CLR_IN(IN_ATTACH);
707 	if (mddetach(dip, DDI_DETACH) != DDI_SUCCESS)
708 		cmn_err(CE_WARN, "detach from attach failed");
709 	return (DDI_FAILURE);
710 }
711 
712 /* ARGSUSED */
713 static int
714 mddetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
715 {
716 	extern int	check_active_locators();
717 	set_t		s;
718 	size_t		sz;
719 	int		len;
720 
721 	MD_SET_IN(IN_DETACH);
722 
723 	/* check command */
724 	if (cmd != DDI_DETACH) {
725 		MD_CLR_IN(IN_DETACH);
726 		return (DDI_FAILURE);
727 	}
728 
729 	/*
730 	 * if we have not already halted yet we have no active config
731 	 * then automatically initiate a halt so we can detach.
732 	 */
733 	if (!(md_get_status() & MD_GBL_HALTED)) {
734 		if (check_active_locators() == 0) {
735 			/*
736 			 * NOTE: a successful md_halt will have done the
737 			 * mddb_unload to free allocations done in mddb_init
738 			 */
739 			if (md_halt(MD_NO_GBL_LOCKS_HELD)) {
740 				cmn_err(CE_NOTE, "md:detach: "
741 				    "Could not halt Solaris Volume Manager");
742 				MD_CLR_IN(IN_DETACH);
743 				return (DDI_FAILURE);
744 			}
745 		}
746 
747 		/* fail detach if we have not halted */
748 		if (!(md_get_status() & MD_GBL_HALTED)) {
749 			MD_CLR_IN(IN_DETACH);
750 			return (DDI_FAILURE);
751 		}
752 	}
753 
754 	/* must be in halted state, this will be cleared on next attach */
755 	ASSERT(md_get_status() & MD_GBL_HALTED);
756 
757 	/* cleanup attach allocations and initializations */
758 	md_major_targ = 0;
759 
760 	sz = sizeof (void *) * md_nunits;
761 	for (s = 0; s < md_nsets; s++) {
762 		if (md_set[s].s_un != NULL) {
763 			kmem_free(md_set[s].s_un, sz);
764 			md_set[s].s_un = NULL;
765 		}
766 
767 		if (md_set[s].s_ui != NULL) {
768 			kmem_free(md_set[s].s_ui, sz);
769 			md_set[s].s_ui = NULL;
770 		}
771 	}
772 	md_nunits = 0;
773 	md_nsets = 0;
774 	md_nmedh = 0;
775 
776 	if (non_ff_drivers != NULL) {
777 		int	i;
778 
779 		for (i = 0; non_ff_drivers[i] != NULL; i++)
780 			kmem_free(non_ff_drivers[i],
781 			    strlen(non_ff_drivers[i]) + 1);
782 
783 		/* free i+1 entries because there is a null entry at list end */
784 		kmem_free(non_ff_drivers, (i + 1) * sizeof (char *));
785 		non_ff_drivers = NULL;
786 	}
787 
788 	if (md_med_trans_lst != NULL) {
789 		kmem_free(md_med_trans_lst, strlen(md_med_trans_lst) + 1);
790 		md_med_trans_lst = NULL;
791 	}
792 
793 	if (md_mods != NULL) {
794 		kmem_free(md_mods, sizeof (ddi_modhandle_t) * MD_NOPS);
795 		md_mods = NULL;
796 	}
797 
798 	if (md_ops != NULL) {
799 		kmem_free(md_ops, sizeof (md_ops_t *) * MD_NOPS);
800 		md_ops = NULL;
801 	}
802 
803 	if (MD_UPGRADE) {
804 		len = md_tuple_length * (2 * ((int)sizeof (dev32_t)));
805 		md_in_upgrade = 0;
806 		md_xlate_free(len);
807 		md_majortab_free();
808 	}
809 
810 	/*
811 	 * Undo what we did in mdattach, freeing resources
812 	 * and removing things we installed.  The system
813 	 * framework guarantees we are not active with this devinfo
814 	 * node in any other entry points at this time.
815 	 */
816 	ddi_prop_remove_all(dip);
817 	ddi_remove_minor_node(dip, NULL);
818 
819 	med_fini();
820 
821 	mod_hash_destroy_idhash(md_nblocksmap);
822 
823 	md_devinfo = NULL;
824 
825 	MD_CLR_IN(IN_DETACH);
826 	return (DDI_SUCCESS);
827 }
828 
829 
830 /*
831  * Given the device number return the devinfo pointer
832  * given to md via md_attach
833  */
834 /*ARGSUSED*/
835 static int
836 mdinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
837 {
838 	int		error = DDI_FAILURE;
839 
840 	switch (infocmd) {
841 	case DDI_INFO_DEVT2DEVINFO:
842 		if (md_devinfo) {
843 			*result = (void *)md_devinfo;
844 			error = DDI_SUCCESS;
845 		}
846 		break;
847 
848 	case DDI_INFO_DEVT2INSTANCE:
849 		*result = (void *)0;
850 		error = DDI_SUCCESS;
851 		break;
852 	}
853 	return (error);
854 }
855 
856 /*
857  * property operation routine.  return the number of blocks for the partition
858  * in question or forward the request to the property facilities.
859  */
860 static int
861 mdprop_op(
862 	dev_t dev,		/* device number associated with device */
863 	dev_info_t *dip,	/* device info struct for this device */
864 	ddi_prop_op_t prop_op,	/* property operator */
865 	int mod_flags,		/* property flags */
866 	char *name,		/* name of property */
867 	caddr_t valuep,		/* where to put property value */
868 	int *lengthp)		/* put length of property here */
869 {
870 	return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
871 	    name, valuep, lengthp, md_nblocks_get(getminor(dev))));
872 }
873 
874 static void
875 snarf_user_data(set_t setno)
876 {
877 	mddb_recid_t		recid;
878 	mddb_recstatus_t	status;
879 
880 	recid = mddb_makerecid(setno, 0);
881 	while ((recid = mddb_getnextrec(recid, MDDB_USER, 0)) > 0) {
882 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
883 			continue;
884 
885 		status = mddb_getrecstatus(recid);
886 		if (status == MDDB_STALE)
887 			continue;
888 
889 		if (status == MDDB_NODATA) {
890 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
891 			continue;
892 		}
893 
894 		ASSERT(status == MDDB_OK);
895 
896 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
897 	}
898 }
899 
900 static void
901 md_print_block_usage(mddb_set_t *s, uint_t blks)
902 {
903 	uint_t		ib;
904 	int		li;
905 	mddb_mb_ic_t	*mbip;
906 	uint_t		max_blk_needed;
907 	mddb_lb_t	*lbp;
908 	mddb_sidelocator_t	*slp;
909 	int		drv_index;
910 	md_splitname	sn;
911 	char		*name;
912 	char		*suffix;
913 	size_t		prefixlen;
914 	size_t		suffixlen;
915 	int		alloc_sz;
916 
917 
918 	max_blk_needed = s->s_totalblkcnt - s->s_freeblkcnt + blks;
919 
920 	cmn_err(CE_WARN, "Blocks in Metadevice State Database: %d\n"
921 	    "            Additional Blocks Needed:            %d\n\n"
922 	    "            Increase size of following replicas for\n"
923 	    "            device relocatability by deleting listed\n"
924 	    "            replica and re-adding replica with\n"
925 	    "            increased size (see metadb(1M)):\n"
926 	    "                Replica                   Increase By",
927 	    s->s_totalblkcnt, (blks - s->s_freeblkcnt));
928 
929 	lbp = s->s_lbp;
930 
931 	for (li = 0; li < lbp->lb_loccnt; li++) {
932 		if (lbp->lb_locators[li].l_flags & MDDB_F_DELETED)
933 			continue;
934 		ib = 0;
935 		for (mbip = s->s_mbiarray[li]; mbip != NULL;
936 		    mbip = mbip->mbi_next) {
937 			ib += (uint_t)mbip->mbi_mddb_mb.mb_blkcnt;
938 		}
939 		if (ib == 0)
940 			continue;
941 		if (ib < max_blk_needed) {
942 			slp = &lbp->lb_sidelocators[s->s_sideno][li];
943 			drv_index = slp->l_drvnm_index;
944 			mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno,
945 			    &sn);
946 			prefixlen = SPN_PREFIX(&sn).pre_len;
947 			suffixlen = SPN_SUFFIX(&sn).suf_len;
948 			alloc_sz = (int)(prefixlen + suffixlen + 2);
949 			name = (char *)kmem_alloc(alloc_sz, KM_SLEEP);
950 			(void) strncpy(name, SPN_PREFIX(&sn).pre_data,
951 			    prefixlen);
952 			name[prefixlen] = '/';
953 			suffix = name + (prefixlen + 1);
954 			(void) strncpy(suffix, SPN_SUFFIX(&sn).suf_data,
955 			    suffixlen);
956 			name[prefixlen + suffixlen + 1] = '\0';
957 			cmn_err(CE_WARN,
958 			    "  %s (%s:%d:%d)   %d blocks",
959 			    name, lbp->lb_drvnm[drv_index].dn_data,
960 			    slp->l_mnum, lbp->lb_locators[li].l_blkno,
961 			    (max_blk_needed - ib));
962 			kmem_free(name, alloc_sz);
963 		}
964 	}
965 }
966 
967 /*
968  * md_create_minor_node:
969  *	Create the minor device for the given set and un_self_id.
970  *
971  * Input:
972  *	setno	- set number
973  *	mnum	- selfID of unit
974  *
975  * Output:
976  *	None.
977  *
978  * Returns 0 for success, 1 for failure.
979  *
980  * Side-effects:
981  *	None.
982  */
983 int
984 md_create_minor_node(set_t setno, minor_t mnum)
985 {
986 	char		name[20];
987 
988 	/* Check for valid arguments */
989 	if (setno >= MD_MAXSETS || MD_MIN2UNIT(mnum) >= MD_MAXUNITS)
990 		return (1);
991 
992 	(void) snprintf(name, 20, "%u,%u,blk",
993 	    (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
994 
995 	if (ddi_create_minor_node(md_devinfo, name, S_IFBLK,
996 	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
997 		return (1);
998 
999 	(void) snprintf(name, 20, "%u,%u,raw",
1000 	    (unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
1001 
1002 	if (ddi_create_minor_node(md_devinfo, name, S_IFCHR,
1003 	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
1004 		return (1);
1005 
1006 	return (0);
1007 }
1008 
1009 /*
1010  * For a given key check if it is an orphaned record.
1011  * The following conditions are used to determine an orphan.
1012  * 1. The device associated with that key is not a metadevice.
1013  * 2. If DEVID_STYLE then the physical device does not have a device Id
1014  * associated with it.
1015  *
1016  * If a key does not have an entry in the devid namespace it could be
1017  * a device that does not support device ids. Hence the record is not
1018  * deleted.
1019  */
1020 
1021 static int
1022 md_verify_orphaned_record(set_t setno, mdkey_t key)
1023 {
1024 	md_dev64_t	odev; /* orphaned dev */
1025 	mddb_set_t	*s;
1026 	side_t		side = 0;
1027 	struct nm_next_hdr	*did_nh = NULL;
1028 
1029 	s = (mddb_set_t *)md_set[setno].s_db;
1030 	if ((did_nh = get_first_record(setno, 1,  (NM_DEVID | NM_NOTSHARED)))
1031 	    == NULL)
1032 		return (0);
1033 	/*
1034 	 * If devid style is set then get the dev_t using MD_NOTRUST_DEVT
1035 	 */
1036 	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE) {
1037 		odev = md_getdevnum(setno, side, key, MD_NOTRUST_DEVT);
1038 		if ((odev == NODEV64) || (md_getmajor(odev) == md_major))
1039 			return (0);
1040 		if (lookup_entry(did_nh, setno, side, key, odev, NM_DEVID) ==
1041 		    NULL)
1042 			return (1);
1043 	}
1044 	return (0);
1045 }
1046 
1047 int
1048 md_snarf_db_set(set_t setno, md_error_t *ep)
1049 {
1050 	int			err = 0;
1051 	int			i;
1052 	mddb_recid_t		recid;
1053 	mddb_type_t		drvrid;
1054 	mddb_recstatus_t	status;
1055 	md_ops_t		*ops;
1056 	uint_t			privat;
1057 	mddb_set_t		*s;
1058 	uint_t			cvt_blks;
1059 	struct nm_next_hdr	*nh;
1060 	mdkey_t			key = MD_KEYWILD;
1061 	side_t			side = 0;
1062 	int			size;
1063 	int			devid_flag;
1064 	int			retval;
1065 	uint_t			un;
1066 	int			un_next_set = 0;
1067 
1068 	md_haltsnarf_enter(setno);
1069 
1070 	mutex_enter(&md_mx);
1071 	if (md_set[setno].s_status & MD_SET_SNARFED) {
1072 		mutex_exit(&md_mx);
1073 		md_haltsnarf_exit(setno);
1074 		return (0);
1075 	}
1076 	mutex_exit(&md_mx);
1077 
1078 	if (! (md_get_status() & MD_GBL_DAEMONS_LIVE)) {
1079 		if (md_start_daemons(TRUE)) {
1080 			if (ep != NULL)
1081 				(void) mdsyserror(ep, ENXIO);
1082 			err = -1;
1083 			goto out;
1084 		}
1085 	}
1086 
1087 
1088 	/*
1089 	 * Load the devid name space if it exists
1090 	 */
1091 	(void) md_load_namespace(setno, NULL, NM_DEVID);
1092 	if (!md_load_namespace(setno, ep, 0L)) {
1093 		/*
1094 		 * Unload the devid namespace
1095 		 */
1096 		(void) md_unload_namespace(setno, NM_DEVID);
1097 		err = -1;
1098 		goto out;
1099 	}
1100 
1101 	/*
1102 	 * If replica is in non-devid state, convert if:
1103 	 * 	- not in probe during upgrade (md_keep_repl_state = 0)
1104 	 * 	- enough space available in replica
1105 	 *	- local set
1106 	 *	- not a multi-node diskset
1107 	 *	- clustering is not present (for non-local set)
1108 	 */
1109 	s = (mddb_set_t *)md_set[setno].s_db;
1110 	devid_flag = 0;
1111 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE) && !md_keep_repl_state)
1112 		devid_flag = 1;
1113 	if (cluster_bootflags & CLUSTER_CONFIGURED)
1114 		if (setno != MD_LOCAL_SET)
1115 			devid_flag = 0;
1116 	if (MD_MNSET_SETNO(setno))
1117 		devid_flag = 0;
1118 	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
1119 		devid_flag = 0;
1120 
1121 	/*
1122 	 * if we weren't devid style before and md_keep_repl_state=1
1123 	 * we need to stay non-devid
1124 	 */
1125 	if ((md_keep_repl_state == 1) &&
1126 	    ((s->s_lbp->lb_flags & MDDB_DEVID_STYLE) == 0))
1127 		devid_flag = 0;
1128 	if (devid_flag) {
1129 		/*
1130 		 * Determine number of free blocks needed to convert
1131 		 * entire replica to device id format - locator blocks
1132 		 * and namespace.
1133 		 */
1134 		cvt_blks = 0;
1135 		if (mddb_lb_did_convert(s, 0, &cvt_blks) != 0) {
1136 			if (ep != NULL)
1137 				(void) mdsyserror(ep, EIO);
1138 			err = -1;
1139 			goto out;
1140 
1141 		}
1142 		cvt_blks += md_nm_did_chkspace(setno);
1143 
1144 		/* add MDDB_DEVID_CONV_PERC% */
1145 		if ((md_conv_perc > 0) && (md_conv_perc <= 100)) {
1146 			cvt_blks = cvt_blks * (100 + md_conv_perc) / 100;
1147 		}
1148 
1149 		if (cvt_blks <= s->s_freeblkcnt) {
1150 			if (mddb_lb_did_convert(s, 1, &cvt_blks) != 0) {
1151 				if (ep != NULL)
1152 					(void) mdsyserror(ep, EIO);
1153 				err = -1;
1154 				goto out;
1155 			}
1156 
1157 		} else {
1158 			/*
1159 			 * Print message that replica can't be converted for
1160 			 * lack of space.   No failure - just continue to
1161 			 * run without device ids.
1162 			 */
1163 			cmn_err(CE_WARN,
1164 			    "Unable to add Solaris Volume Manager device "
1165 			    "relocation data.\n"
1166 			    "          To use device relocation feature:\n"
1167 			    "          - Increase size of listed replicas\n"
1168 			    "          - Reboot");
1169 			md_print_block_usage(s, cvt_blks);
1170 			cmn_err(CE_WARN,
1171 			    "Loading set without device relocation data.\n"
1172 			    "          Solaris Volume Manager disk movement "
1173 			    "not tracked in local set.");
1174 		}
1175 	}
1176 
1177 	/*
1178 	 * go through and load any modules referenced in
1179 	 * data base
1180 	 */
1181 	recid = mddb_makerecid(setno, 0);
1182 	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
1183 		status = mddb_getrecstatus(recid);
1184 		if (status == MDDB_STALE) {
1185 			if (! (md_get_setstatus(setno) & MD_SET_STALE)) {
1186 				md_set_setstatus(setno, MD_SET_STALE);
1187 				cmn_err(CE_WARN,
1188 				    "md: state database is stale");
1189 			}
1190 		} else if (status == MDDB_NODATA) {
1191 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
1192 			continue;
1193 		}
1194 		drvrid = mddb_getrectype1(recid);
1195 		if (drvrid < MDDB_FIRST_MODID)
1196 			continue;
1197 		if (md_loadsubmod(setno, md_getshared_name(setno, drvrid),
1198 		    drvrid) < 0) {
1199 			cmn_err(CE_NOTE, "md: could not load misc/%s",
1200 			    md_getshared_name(setno, drvrid));
1201 		}
1202 	}
1203 
1204 	if (recid < 0)
1205 		goto out;
1206 
1207 	snarf_user_data(setno);
1208 
1209 	/*
1210 	 * Initialize the md_nm_snarfed array
1211 	 * this array is indexed by the key and
1212 	 * is set by md_getdevnum during the snarf time
1213 	 */
1214 	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) != NULL) {
1215 		size = (int)((((struct nm_rec_hdr *)nh->nmn_record)->
1216 		    r_next_key) * (sizeof (int)));
1217 		md_nm_snarfed = (int *)kmem_zalloc(size, KM_SLEEP);
1218 	}
1219 
1220 	/*
1221 	 * go through and snarf until nothing gets added
1222 	 */
1223 	do {
1224 		i = 0;
1225 		for (ops = md_opslist; ops != NULL; ops = ops->md_next) {
1226 			if (ops->md_snarf != NULL) {
1227 				retval = ops->md_snarf(MD_SNARF_DOIT, setno);
1228 				if (retval == -1) {
1229 					err = -1;
1230 					/* Don't know the failed unit */
1231 					(void) mdmderror(ep, MDE_RR_ALLOC_ERROR,
1232 					    0);
1233 					(void) md_halt_set(setno, MD_HALT_ALL);
1234 					(void) mddb_unload_set(setno);
1235 					md_haltsnarf_exit(setno);
1236 					return (err);
1237 				} else {
1238 					i += retval;
1239 				}
1240 			}
1241 		}
1242 	} while (i);
1243 
1244 	/*
1245 	 * Set the first available slot and availability
1246 	 */
1247 	md_set[setno].s_un_avail = 0;
1248 	for (un = 0; un < MD_MAXUNITS; un++) {
1249 		if (md_set[setno].s_un[un] != NULL) {
1250 			continue;
1251 		} else {
1252 			if (!un_next_set) {
1253 				md_set[setno].s_un_next = un;
1254 				un_next_set = 1;
1255 			}
1256 			md_set[setno].s_un_avail++;
1257 		}
1258 	}
1259 
1260 	md_set_setstatus(setno, MD_SET_SNARFED);
1261 
1262 	recid = mddb_makerecid(setno, 0);
1263 	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
1264 		privat = mddb_getrecprivate(recid);
1265 		if (privat & MD_PRV_COMMIT) {
1266 			if (mddb_commitrec(recid)) {
1267 				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
1268 					md_set_setstatus(setno, MD_SET_STALE);
1269 					cmn_err(CE_WARN,
1270 					    "md: state database is stale");
1271 				}
1272 			}
1273 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
1274 		}
1275 	}
1276 
1277 	/* Deletes must happen after all the commits */
1278 	recid = mddb_makerecid(setno, 0);
1279 	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
1280 		privat = mddb_getrecprivate(recid);
1281 		if (privat & MD_PRV_DELETE) {
1282 			if (mddb_deleterec(recid)) {
1283 				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
1284 					md_set_setstatus(setno, MD_SET_STALE);
1285 					cmn_err(CE_WARN,
1286 					    "md: state database is stale");
1287 				}
1288 				mddb_setrecprivate(recid, MD_PRV_GOTIT);
1289 			}
1290 			recid = mddb_makerecid(setno, 0);
1291 		}
1292 	}
1293 
1294 	/*
1295 	 * go through and clean up records until nothing gets cleaned up.
1296 	 */
1297 	do {
1298 		i = 0;
1299 		for (ops = md_opslist; ops != NULL; ops = ops->md_next)
1300 			if (ops->md_snarf != NULL)
1301 				i += ops->md_snarf(MD_SNARF_CLEANUP, setno);
1302 	} while (i);
1303 
1304 	if (md_nm_snarfed != NULL &&
1305 	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
1306 		/*
1307 		 * go thru and cleanup the namespace and the device id
1308 		 * name space
1309 		 */
1310 		for (key = 1;
1311 		    key < ((struct nm_rec_hdr *)nh->nmn_record)->r_next_key;
1312 		    key++) {
1313 			/*
1314 			 * Is the entry an 'orphan'?
1315 			 */
1316 			if (lookup_entry(nh, setno, side, key, NODEV64, 0L) !=
1317 			    NULL) {
1318 				/*
1319 				 * If the value is not set then apparently
1320 				 * it is not part of the current configuration,
1321 				 * remove it this can happen when system panic
1322 				 * between the primary name space update and
1323 				 * the device id name space update
1324 				 */
1325 				if (md_nm_snarfed[key] == 0) {
1326 					if (md_verify_orphaned_record(setno,
1327 					    key) == 1)
1328 						(void) remove_entry(nh,
1329 						    side, key, 0L);
1330 				}
1331 			}
1332 		}
1333 	}
1334 
1335 	if (md_nm_snarfed != NULL) {
1336 		/*
1337 		 * Done and free the memory
1338 		 */
1339 		kmem_free(md_nm_snarfed, size);
1340 		md_nm_snarfed = NULL;
1341 	}
1342 
1343 	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE &&
1344 	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
1345 		/*
1346 		 * if the destroy flag has been set and
1347 		 * the MD_SET_DIDCLUP bit is not set in
1348 		 * the set's status field, cleanup the
1349 		 * entire device id namespace
1350 		 */
1351 		if (md_devid_destroy &&
1352 		    !(md_get_setstatus(setno) & MD_SET_DIDCLUP)) {
1353 			(void) md_devid_cleanup(setno, 1);
1354 			md_set_setstatus(setno, MD_SET_DIDCLUP);
1355 		} else
1356 			(void) md_devid_cleanup(setno, 0);
1357 	}
1358 
1359 	/*
1360 	 * clear single threading on snarf, return success or error
1361 	 */
1362 out:
1363 	md_haltsnarf_exit(setno);
1364 	return (err);
1365 }
1366 
1367 void
1368 get_minfo(struct dk_minfo *info, minor_t mnum)
1369 {
1370 	md_unit_t	*un;
1371 	mdi_unit_t	*ui;
1372 
1373 	info->dki_capacity = 0;
1374 	info->dki_lbsize = 0;
1375 	info->dki_media_type = 0;
1376 
1377 	if ((ui = MDI_UNIT(mnum)) == NULL) {
1378 		return;
1379 	}
1380 	un = (md_unit_t *)md_unit_readerlock(ui);
1381 	info->dki_capacity = un->c.un_total_blocks;
1382 	md_unit_readerexit(ui);
1383 	info->dki_lbsize = DEV_BSIZE;
1384 	info->dki_media_type = DK_UNKNOWN;
1385 }
1386 
1387 
1388 void
1389 get_info(struct dk_cinfo *info, minor_t mnum)
1390 {
1391 	/*
1392 	 * Controller Information
1393 	 */
1394 	info->dki_ctype = DKC_MD;
1395 	info->dki_cnum = ddi_get_instance(ddi_get_parent(md_devinfo));
1396 	(void) strcpy(info->dki_cname,
1397 	    ddi_get_name(ddi_get_parent(md_devinfo)));
1398 	/*
1399 	 * Unit Information
1400 	 */
1401 	info->dki_unit = mnum;
1402 	info->dki_slave = 0;
1403 	(void) strcpy(info->dki_dname, ddi_driver_name(md_devinfo));
1404 	info->dki_flags = 0;
1405 	info->dki_partition = 0;
1406 	info->dki_maxtransfer = (ushort_t)(md_maxphys / DEV_BSIZE);
1407 
1408 	/*
1409 	 * We can't get from here to there yet
1410 	 */
1411 	info->dki_addr = 0;
1412 	info->dki_space = 0;
1413 	info->dki_prio = 0;
1414 	info->dki_vec = 0;
1415 }
1416 
1417 /*
1418  * open admin device
1419  */
1420 static int
1421 mdadminopen(
1422 	int	flag,
1423 	int	otyp)
1424 {
1425 	int	err = 0;
1426 
1427 	/* single thread */
1428 	mutex_enter(&md_mx);
1429 
1430 	/* check type and flags */
1431 	if ((otyp != OTYP_CHR) && (otyp != OTYP_LYR)) {
1432 		err = EINVAL;
1433 		goto out;
1434 	}
1435 	if (((flag & FEXCL) && (md_status & MD_GBL_OPEN)) ||
1436 	    (md_status & MD_GBL_EXCL)) {
1437 		err = EBUSY;
1438 		goto out;
1439 	}
1440 
1441 	/* count and flag open */
1442 	md_ocnt[otyp]++;
1443 	md_status |= MD_GBL_OPEN;
1444 	if (flag & FEXCL)
1445 		md_status |= MD_GBL_EXCL;
1446 
1447 	/* unlock return success */
1448 out:
1449 	mutex_exit(&md_mx);
1450 	return (err);
1451 }
1452 
1453 /*
1454  * open entry point
1455  */
1456 static int
1457 mdopen(
1458 	dev_t		*dev,
1459 	int		flag,
1460 	int		otyp,
1461 	cred_t		*cred_p)
1462 {
1463 	minor_t		mnum = getminor(*dev);
1464 	unit_t		unit = MD_MIN2UNIT(mnum);
1465 	set_t		setno = MD_MIN2SET(mnum);
1466 	mdi_unit_t	*ui = NULL;
1467 	int		err = 0;
1468 	md_parent_t	parent;
1469 
1470 	/* dispatch admin device opens */
1471 	if (mnum == MD_ADM_MINOR)
1472 		return (mdadminopen(flag, otyp));
1473 
1474 	/* lock, check status */
1475 	rw_enter(&md_unit_array_rw.lock, RW_READER);
1476 
1477 tryagain:
1478 	if (md_get_status() & MD_GBL_HALTED)  {
1479 		err = ENODEV;
1480 		goto out;
1481 	}
1482 
1483 	/* check minor */
1484 	if ((setno >= md_nsets) || (unit >= md_nunits)) {
1485 		err = ENXIO;
1486 		goto out;
1487 	}
1488 
1489 	/* make sure we're snarfed */
1490 	if ((md_get_setstatus(MD_LOCAL_SET) & MD_SET_SNARFED) == 0) {
1491 		if (md_snarf_db_set(MD_LOCAL_SET, NULL) != 0) {
1492 			err = ENODEV;
1493 			goto out;
1494 		}
1495 	}
1496 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) {
1497 		err = ENODEV;
1498 		goto out;
1499 	}
1500 
1501 	/* check unit */
1502 	if ((ui = MDI_UNIT(mnum)) == NULL) {
1503 		err = ENXIO;
1504 		goto out;
1505 	}
1506 
1507 	/*
1508 	 * The softpart open routine may do an I/O during the open, in
1509 	 * which case the open routine will set the OPENINPROGRESS flag
1510 	 * and drop all locks during the I/O.  If this thread sees
1511 	 * the OPENINPROGRESS flag set, if should wait until the flag
1512 	 * is reset before calling the driver's open routine.  It must
1513 	 * also revalidate the world after it grabs the unit_array lock
1514 	 * since the set may have been released or the metadevice cleared
1515 	 * during the sleep.
1516 	 */
1517 	if (MD_MNSET_SETNO(setno)) {
1518 		mutex_enter(&ui->ui_mx);
1519 		if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
1520 			rw_exit(&md_unit_array_rw.lock);
1521 			cv_wait(&ui->ui_cv, &ui->ui_mx);
1522 			rw_enter(&md_unit_array_rw.lock, RW_READER);
1523 			mutex_exit(&ui->ui_mx);
1524 			goto tryagain;
1525 		}
1526 		mutex_exit(&ui->ui_mx);
1527 	}
1528 
1529 	/* Test if device is openable */
1530 	if ((ui->ui_tstate & MD_NOTOPENABLE) != 0) {
1531 		err = ENXIO;
1532 		goto out;
1533 	}
1534 
1535 	/* don't allow opens w/WRITE flag if stale */
1536 	if ((flag & FWRITE) && (md_get_setstatus(setno) & MD_SET_STALE)) {
1537 		err = EROFS;
1538 		goto out;
1539 	}
1540 
1541 	/* don't allow writes to subdevices */
1542 	parent = md_get_parent(md_expldev(*dev));
1543 	if ((flag & FWRITE) && MD_HAS_PARENT(parent)) {
1544 		err = EROFS;
1545 		goto out;
1546 	}
1547 
1548 	/* open underlying driver */
1549 	if (md_ops[ui->ui_opsindex]->md_open != NULL) {
1550 		if ((err = (*md_ops[ui->ui_opsindex]->md_open)
1551 		    (dev, flag, otyp, cred_p, 0)) != 0)
1552 			goto out;
1553 	}
1554 
1555 	/* or do it ourselves */
1556 	else {
1557 		/* single thread */
1558 		(void) md_unit_openclose_enter(ui);
1559 		err = md_unit_incopen(mnum, flag, otyp);
1560 		md_unit_openclose_exit(ui);
1561 		if (err != 0)
1562 			goto out;
1563 	}
1564 
1565 	/* unlock, return status */
1566 out:
1567 	rw_exit(&md_unit_array_rw.lock);
1568 	return (err);
1569 }
1570 
1571 /*
1572  * close admin device
1573  */
1574 static int
1575 mdadminclose(
1576 	int	otyp)
1577 {
1578 	int	i;
1579 	int	err = 0;
1580 
1581 	/* single thread */
1582 	mutex_enter(&md_mx);
1583 
1584 	/* check type and flags */
1585 	if ((otyp < 0) || (otyp >= OTYPCNT)) {
1586 		err = EINVAL;
1587 		goto out;
1588 	} else if (md_ocnt[otyp] == 0) {
1589 		err = ENXIO;
1590 		goto out;
1591 	}
1592 
1593 	/* count and flag closed */
1594 	if (otyp == OTYP_LYR)
1595 		md_ocnt[otyp]--;
1596 	else
1597 		md_ocnt[otyp] = 0;
1598 	md_status &= ~MD_GBL_OPEN;
1599 	for (i = 0; (i < OTYPCNT); ++i)
1600 		if (md_ocnt[i] != 0)
1601 			md_status |= MD_GBL_OPEN;
1602 	if (! (md_status & MD_GBL_OPEN))
1603 		md_status &= ~MD_GBL_EXCL;
1604 
1605 	/* unlock return success */
1606 out:
1607 	mutex_exit(&md_mx);
1608 	return (err);
1609 }
1610 
1611 /*
1612  * close entry point
1613  */
1614 static int
1615 mdclose(
1616 	dev_t		dev,
1617 	int		flag,
1618 	int		otyp,
1619 	cred_t		*cred_p)
1620 {
1621 	minor_t		mnum = getminor(dev);
1622 	set_t		setno = MD_MIN2SET(mnum);
1623 	unit_t		unit = MD_MIN2UNIT(mnum);
1624 	mdi_unit_t	*ui = NULL;
1625 	int		err = 0;
1626 
1627 	/* dispatch admin device closes */
1628 	if (mnum == MD_ADM_MINOR)
1629 		return (mdadminclose(otyp));
1630 
1631 	/* check minor */
1632 	if ((setno >= md_nsets) || (unit >= md_nunits) ||
1633 	    ((ui = MDI_UNIT(mnum)) == NULL)) {
1634 		err = ENXIO;
1635 		goto out;
1636 	}
1637 
1638 	/* close underlying driver */
1639 	if (md_ops[ui->ui_opsindex]->md_close != NULL) {
1640 		if ((err = (*md_ops[ui->ui_opsindex]->md_close)
1641 		    (dev, flag, otyp, cred_p, 0)) != 0)
1642 			goto out;
1643 	}
1644 
1645 	/* or do it ourselves */
1646 	else {
1647 		/* single thread */
1648 		(void) md_unit_openclose_enter(ui);
1649 		err = md_unit_decopen(mnum, otyp);
1650 		md_unit_openclose_exit(ui);
1651 		if (err != 0)
1652 			goto out;
1653 	}
1654 
1655 	/* return success */
1656 out:
1657 	return (err);
1658 }
1659 
1660 
1661 /*
1662  * This routine performs raw read operations.  It is called from the
1663  * device switch at normal priority.
1664  *
1665  * The main catch is that the *uio struct which is passed to us may
1666  * specify a read which spans two buffers, which would be contiguous
1667  * on a single partition,  but not on a striped partition. This will
1668  * be handled by mdstrategy.
1669  */
1670 /*ARGSUSED*/
1671 static int
1672 mdread(dev_t dev, struct uio *uio, cred_t *credp)
1673 {
1674 	minor_t		mnum;
1675 	mdi_unit_t	*ui;
1676 	int		error;
1677 
1678 	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1679 	    (MD_MIN2SET(mnum) >= md_nsets) ||
1680 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1681 	    ((ui = MDI_UNIT(mnum)) == NULL))
1682 		return (ENXIO);
1683 
1684 	if (md_ops[ui->ui_opsindex]->md_read  != NULL)
1685 		return ((*md_ops[ui->ui_opsindex]->md_read)
1686 		    (dev, uio, credp));
1687 
1688 	if ((error = md_chk_uio(uio)) != 0)
1689 		return (error);
1690 
1691 	return (physio(mdstrategy, NULL, dev, B_READ, md_minphys, uio));
1692 }
1693 
1694 /*
1695  * This routine performs async raw read operations.  It is called from the
1696  * device switch at normal priority.
1697  *
1698  * The main catch is that the *aio struct which is passed to us may
1699  * specify a read which spans two buffers, which would be contiguous
1700  * on a single partition,  but not on a striped partition. This will
1701  * be handled by mdstrategy.
1702  */
1703 /*ARGSUSED*/
1704 static int
1705 mdaread(dev_t dev, struct aio_req *aio, cred_t *credp)
1706 {
1707 	minor_t		mnum;
1708 	mdi_unit_t	*ui;
1709 	int		error;
1710 
1711 
1712 	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1713 	    (MD_MIN2SET(mnum) >= md_nsets) ||
1714 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1715 	    ((ui = MDI_UNIT(mnum)) == NULL))
1716 		return (ENXIO);
1717 
1718 	if (md_ops[ui->ui_opsindex]->md_aread  != NULL)
1719 		return ((*md_ops[ui->ui_opsindex]->md_aread)
1720 		    (dev, aio, credp));
1721 
1722 	if ((error = md_chk_uio(aio->aio_uio)) != 0)
1723 		return (error);
1724 
1725 	return (aphysio(mdstrategy, anocancel, dev, B_READ, md_minphys, aio));
1726 }
1727 
1728 /*
1729  * This routine performs raw write operations.	It is called from the
1730  * device switch at normal priority.
1731  *
1732  * The main catch is that the *uio struct which is passed to us may
1733  * specify a write which spans two buffers, which would be contiguous
1734  * on a single partition,  but not on a striped partition. This is
1735  * handled by mdstrategy.
1736  *
1737  */
1738 /*ARGSUSED*/
1739 static int
1740 mdwrite(dev_t dev, struct uio *uio, cred_t *credp)
1741 {
1742 	minor_t		mnum;
1743 	mdi_unit_t	*ui;
1744 	int		error;
1745 
1746 	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1747 	    (MD_MIN2SET(mnum) >= md_nsets) ||
1748 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1749 	    ((ui = MDI_UNIT(mnum)) == NULL))
1750 		return (ENXIO);
1751 
1752 	if (md_ops[ui->ui_opsindex]->md_write  != NULL)
1753 		return ((*md_ops[ui->ui_opsindex]->md_write)
1754 		    (dev, uio, credp));
1755 
1756 	if ((error = md_chk_uio(uio)) != 0)
1757 		return (error);
1758 
1759 	return (physio(mdstrategy, NULL, dev, B_WRITE, md_minphys, uio));
1760 }
1761 
1762 /*
1763  * This routine performs async raw write operations.  It is called from the
1764  * device switch at normal priority.
1765  *
1766  * The main catch is that the *aio struct which is passed to us may
1767  * specify a write which spans two buffers, which would be contiguous
1768  * on a single partition,  but not on a striped partition. This is
1769  * handled by mdstrategy.
1770  *
1771  */
1772 /*ARGSUSED*/
1773 static int
1774 mdawrite(dev_t dev, struct aio_req *aio, cred_t *credp)
1775 {
1776 	minor_t		mnum;
1777 	mdi_unit_t	*ui;
1778 	int		error;
1779 
1780 
1781 	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1782 	    (MD_MIN2SET(mnum) >= md_nsets) ||
1783 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1784 	    ((ui = MDI_UNIT(mnum)) == NULL))
1785 		return (ENXIO);
1786 
1787 	if (md_ops[ui->ui_opsindex]->md_awrite  != NULL)
1788 		return ((*md_ops[ui->ui_opsindex]->md_awrite)
1789 		    (dev, aio, credp));
1790 
1791 	if ((error = md_chk_uio(aio->aio_uio)) != 0)
1792 		return (error);
1793 
1794 	return (aphysio(mdstrategy, anocancel, dev, B_WRITE, md_minphys, aio));
1795 }
1796 
1797 int
1798 mdstrategy(struct buf *bp)
1799 {
1800 	minor_t		mnum;
1801 	mdi_unit_t	*ui;
1802 
1803 	ASSERT((bp->b_flags & B_DONE) == 0);
1804 
1805 	if (panicstr)
1806 		md_clr_status(MD_GBL_DAEMONS_LIVE);
1807 
1808 	if (((mnum = getminor(bp->b_edev)) == MD_ADM_MINOR) ||
1809 	    (MD_MIN2SET(mnum) >= md_nsets) ||
1810 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1811 	    ((ui = MDI_UNIT(mnum)) == NULL)) {
1812 		bp->b_flags |= B_ERROR;
1813 		bp->b_error = ENXIO;
1814 		bp->b_resid = bp->b_bcount;
1815 		biodone(bp);
1816 		return (0);
1817 	}
1818 
1819 	bp->b_flags &= ~(B_ERROR | B_DONE);
1820 	if (md_ops[ui->ui_opsindex]->md_strategy  != NULL) {
1821 		(*md_ops[ui->ui_opsindex]->md_strategy) (bp, 0, NULL);
1822 	} else {
1823 		(void) errdone(ui, bp, ENXIO);
1824 	}
1825 	return (0);
1826 }
1827 
1828 /*
1829  * Return true if the ioctl is allowed to be multithreaded.
1830  * All the ioctls with MN are sent only from the message handlers through
1831  * rpc.mdcommd, which (via it's own locking mechanism) takes care that not two
1832  * ioctl for the same metadevice are issued at the same time.
1833  * So we are safe here.
1834  * The other ioctls do not mess with any metadevice structures and therefor
1835  * are harmless too, if called multiple times at the same time.
1836  */
1837 static boolean_t
1838 is_mt_ioctl(int cmd) {
1839 
1840 	switch (cmd) {
1841 	case MD_IOCGUNIQMSGID:
1842 	case MD_IOCGVERSION:
1843 	case MD_IOCISOPEN:
1844 	case MD_MN_SET_MM_OWNER:
1845 	case MD_MN_SET_STATE:
1846 	case MD_MN_SUSPEND_WRITES:
1847 	case MD_MN_ALLOCATE_HOTSPARE:
1848 	case MD_MN_SET_SETFLAGS:
1849 	case MD_MN_GET_SETFLAGS:
1850 	case MD_MN_MDDB_OPTRECFIX:
1851 	case MD_MN_MDDB_PARSE:
1852 	case MD_MN_MDDB_BLOCK:
1853 	case MD_MN_DB_USERREQ:
1854 	case MD_IOC_SPSTATUS:
1855 	case MD_MN_COMMD_ERR:
1856 	case MD_MN_SET_COMMD_RUNNING:
1857 	case MD_MN_RESYNC:
1858 	case MD_MN_SETSYNC:
1859 	case MD_MN_POKE_HOTSPARES:
1860 		return (1);
1861 	default:
1862 		return (0);
1863 	}
1864 }
1865 
1866 /*
1867  * This routine implements the ioctl calls for the Virtual Disk System.
1868  * It is called from the device switch at normal priority.
1869  */
1870 /* ARGSUSED */
1871 static int
1872 mdioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cred_p,
1873 	int *rval_p)
1874 {
1875 	minor_t		mnum = getminor(dev);
1876 	mdi_unit_t	*ui;
1877 	IOLOCK		lock;
1878 	int		err;
1879 
1880 	/*
1881 	 * For multinode disksets  number of ioctls are allowed to be
1882 	 * multithreaded.
1883 	 * A fundamental assumption made in this implementation is that
1884 	 * ioctls either do not interact with other md structures  or the
1885 	 * ioctl to the admin device can only occur if the metadevice
1886 	 * device is open. i.e. avoid a race between metaclear and the
1887 	 * progress of a multithreaded ioctl.
1888 	 */
1889 
1890 	if (!is_mt_ioctl(cmd) && md_ioctl_lock_enter() == EINTR) {
1891 		return (EINTR);
1892 	}
1893 
1894 	/*
1895 	 * initialize lock tracker
1896 	 */
1897 	IOLOCK_INIT(&lock);
1898 
1899 	/* Flag to indicate that MD_GBL_IOCTL_LOCK is not acquired */
1900 
1901 	if (is_mt_ioctl(cmd)) {
1902 		/* increment the md_mtioctl_cnt */
1903 		mutex_enter(&md_mx);
1904 		md_mtioctl_cnt++;
1905 		mutex_exit(&md_mx);
1906 		lock.l_flags |= MD_MT_IOCTL;
1907 	}
1908 
1909 	/*
1910 	 * this has been added to prevent notification from re-snarfing
1911 	 * so metaunload will work.  It may interfere with other modules
1912 	 * halt process.
1913 	 */
1914 	if (md_get_status() & (MD_GBL_HALTED | MD_GBL_DAEMONS_DIE))
1915 		return (IOLOCK_RETURN(ENXIO, &lock));
1916 
1917 	/*
1918 	 * admin device ioctls
1919 	 */
1920 	if (mnum == MD_ADM_MINOR) {
1921 		err = md_admin_ioctl(md_expldev(dev), cmd, (void *) data,
1922 		    mode, &lock);
1923 	}
1924 
1925 	/*
1926 	 * metadevice ioctls
1927 	 */
1928 	else if ((MD_MIN2SET(mnum) >= md_nsets) ||
1929 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1930 	    ((ui = MDI_UNIT(mnum)) == NULL)) {
1931 		err = ENXIO;
1932 	} else if (md_ops[ui->ui_opsindex]->md_ioctl == NULL) {
1933 		err = ENOTTY;
1934 	} else {
1935 		err = (*md_ops[ui->ui_opsindex]->md_ioctl)
1936 		    (dev, cmd, (void *) data, mode, &lock);
1937 	}
1938 
1939 	/*
1940 	 * drop any locks we grabbed
1941 	 */
1942 	return (IOLOCK_RETURN_IOCTLEND(err, &lock));
1943 }
1944 
1945 static int
1946 mddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1947 {
1948 	minor_t		mnum;
1949 	set_t		setno;
1950 	mdi_unit_t	*ui;
1951 
1952 	if ((mnum = getminor(dev)) == MD_ADM_MINOR)
1953 		return (ENXIO);
1954 
1955 	setno = MD_MIN2SET(mnum);
1956 
1957 	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits) ||
1958 	    ((ui = MDI_UNIT(mnum)) == NULL))
1959 		return (ENXIO);
1960 
1961 
1962 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
1963 		return (ENXIO);
1964 
1965 	if (md_ops[ui->ui_opsindex]->md_dump  != NULL)
1966 		return ((*md_ops[ui->ui_opsindex]->md_dump)
1967 		    (dev, addr, blkno, nblk));
1968 
1969 	return (ENXIO);
1970 }
1971 
1972 /*
1973  * Metadevice unit number dispatcher
1974  * When this routine is called it will scan the
1975  * incore unit array and return the avail slot
1976  * hence the unit number to the caller
1977  *
1978  * Return -1 if there is nothing available
1979  */
1980 unit_t
1981 md_get_nextunit(set_t setno)
1982 {
1983 	unit_t	un, start;
1984 
1985 	/*
1986 	 * If nothing available
1987 	 */
1988 	if (md_set[setno].s_un_avail == 0) {
1989 		return (MD_UNITBAD);
1990 	}
1991 
1992 	mutex_enter(&md_mx);
1993 	start = un = md_set[setno].s_un_next;
1994 
1995 	/* LINTED: E_CONSTANT_CONDITION */
1996 	while (1) {
1997 		if (md_set[setno].s_un[un] == NULL) {
1998 			/*
1999 			 * Advance the starting index for the next
2000 			 * md_get_nextunit call
2001 			 */
2002 			if (un == MD_MAXUNITS - 1) {
2003 				md_set[setno].s_un_next = 0;
2004 			} else {
2005 				md_set[setno].s_un_next = un + 1;
2006 			}
2007 			break;
2008 		}
2009 
2010 		un = ((un == MD_MAXUNITS - 1) ? 0 : un + 1);
2011 
2012 		if (un == start) {
2013 			un = MD_UNITBAD;
2014 			break;
2015 		}
2016 
2017 	}
2018 
2019 	mutex_exit(&md_mx);
2020 	return (un);
2021 }
2022