xref: /onnv-gate/usr/src/uts/common/io/lvm/md/md_subr.c (revision 3073:c5251d7eaee3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Driver for Virtual Disk.
30  */
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/buf.h>
34 #include <sys/conf.h>
35 #include <sys/user.h>
36 #include <sys/uio.h>
37 #include <sys/proc.h>
38 #include <sys/t_lock.h>
39 #include <sys/dkio.h>
40 #include <sys/kmem.h>
41 #include <sys/debug.h>
42 #include <sys/cmn_err.h>
43 #include <sys/sysmacros.h>
44 #include <sys/types.h>
45 #include <sys/mkdev.h>
46 #include <sys/vtoc.h>
47 #include <sys/open.h>
48 #include <sys/file.h>
49 #include <vm/page.h>
50 #include <sys/callb.h>
51 #include <sys/disp.h>
52 #include <sys/modctl.h>
53 #include <sys/errno.h>
54 #include <sys/door.h>
55 #include <sys/lvm/mdmn_commd.h>
56 #include <sys/lvm/md_hotspares.h>
57 
58 #include <sys/lvm/mdvar.h>
59 #include <sys/lvm/md_names.h>
60 
61 #include <sys/ddi.h>
62 #include <sys/proc.h>
63 #include <sys/sunddi.h>
64 #include <sys/esunddi.h>
65 
66 #include <sys/sysevent.h>
67 #include <sys/sysevent/eventdefs.h>
68 
69 #include <sys/sysevent/svm.h>
70 #include <sys/lvm/md_basic.h>
71 
72 
73 /*
74  * Machine specific Hertz is kept here
75  */
76 extern clock_t			md_hz;
77 
78 /*
79  * Externs.
80  */
81 extern int			(*mdv_strategy_tstpnt)(buf_t *, int, void*);
82 extern major_t			md_major;
83 extern unit_t			md_nunits;
84 extern set_t			md_nsets;
85 extern md_set_t			md_set[];
86 extern md_set_io_t		md_set_io[];
87 extern md_ops_t			**md_ops;
88 extern md_ops_t			*md_opslist;
89 extern ddi_modhandle_t		*md_mods;
90 
91 extern md_krwlock_t		md_unit_array_rw;
92 extern kmutex_t			md_mx;
93 extern kcondvar_t		md_cv;
94 
95 extern md_krwlock_t		hsp_rwlp;
96 extern md_krwlock_t		ni_rwlp;
97 
98 extern int			md_num_daemons;
99 extern int			md_status;
100 extern int			md_ioctl_cnt;
101 extern int			md_mtioctl_cnt;
102 
103 extern struct metatransops	metatransops;
104 extern md_event_queue_t		*md_event_queue;
105 extern md_resync_t		md_cpr_resync;
106 extern int			md_done_daemon_threads;
107 extern int			md_ff_daemon_threads;
108 
109 
110 extern mddb_set_t	*mddb_setenter(set_t setno, int flag, int *errorcodep);
111 extern void		mddb_setexit(mddb_set_t *s);
112 extern void		*lookup_entry(struct nm_next_hdr *, set_t,
113 				side_t, mdkey_t, md_dev64_t, int);
114 extern struct nm_next_hdr	*get_first_record(set_t, int, int);
115 
116 struct mdq_anchor	md_done_daemon; /* done request queue */
117 struct mdq_anchor	md_mstr_daemon; /* mirror timeout requests */
118 struct mdq_anchor	md_mhs_daemon;	/* mirror hotspare requests queue */
119 struct mdq_anchor	md_hs_daemon;	/* raid hotspare requests queue */
120 struct mdq_anchor	md_ff_daemonq;	/* failfast request queue */
121 struct mdq_anchor	md_mirror_daemon; /* mirror owner queue */
122 struct mdq_anchor	md_mirror_io_daemon; /* mirror owner i/o queue */
123 struct mdq_anchor	md_mirror_rs_daemon; /* mirror resync done queue */
124 struct mdq_anchor	md_sp_daemon;	/* soft-part error daemon queue */
125 
126 int md_done_daemon_threads = 1;	/* threads for md_done_daemon requestq */
127 int md_mstr_daemon_threads = 1;	/* threads for md_mstr_daemon requestq */
128 int md_mhs_daemon_threads = 1;	/* threads for md_mhs_daemon requestq */
129 int md_hs_daemon_threads = 1;	/* threads for md_hs_daemon requestq */
130 int md_ff_daemon_threads = 3;	/* threads for md_ff_daemon requestq */
131 int md_mirror_daemon_threads = 1; /* threads for md_mirror_daemon requestq */
132 int md_sp_daemon_threads = 1;	/* threads for md_sp_daemon requestq */
133 
134 #ifdef DEBUG
135 /* Flag to switch on debug messages */
136 int md_release_reacquire_debug = 0;	/* debug flag */
137 #endif
138 
139 /*
140  *
141  * The md_request_queues is table of pointers to request queues and the number
142  * of threads associated with the request queues.
143  * When the number of threads is set to 1, then the order of execution is
144  * sequential.
145  * The number of threads for all the queues have been defined as global
146  * variables to enable kernel tuning.
147  *
148  */
149 
150 #define	MD_DAEMON_QUEUES 10
151 
152 md_requestq_entry_t md_daemon_queues[MD_DAEMON_QUEUES] = {
153 	{&md_done_daemon, &md_done_daemon_threads},
154 	{&md_mstr_daemon, &md_mstr_daemon_threads},
155 	{&md_hs_daemon, &md_hs_daemon_threads},
156 	{&md_ff_daemonq, &md_ff_daemon_threads},
157 	{&md_mirror_daemon, &md_mirror_daemon_threads},
158 	{&md_mirror_io_daemon, &md_mirror_daemon_threads},
159 	{&md_mirror_rs_daemon, &md_mirror_daemon_threads},
160 	{&md_sp_daemon, &md_sp_daemon_threads},
161 	{&md_mhs_daemon, &md_mhs_daemon_threads},
162 	{0, 0}
163 };
164 
165 /*
166  * Number of times a message is retried before issuing a warning to the operator
167  */
168 #define	MD_MN_WARN_INTVL	10
169 
170 /*
171  * Setting retry cnt to one (pre decremented) so that we actually do no
172  * retries when committing/deleting a mddb rec. The underlying disk driver
173  * does several retries to check if the disk is really dead or not so there
174  * is no reason for us to retry on top of the drivers retries.
175  */
176 
177 uint_t			md_retry_cnt = 1; /* global so it can be patched */
178 
179 /*
180  * Bug # 1212146
181  * Before this change the user had to pass in a short aligned buffer because of
182  * problems in some underlying device drivers.  This problem seems to have been
183  * corrected in the underlying drivers so we will default to not requiring any
184  * alignment.  If the user needs to check for a specific alignment,
185  * md_uio_alignment_mask may be set in /etc/system to accomplish this.  To get
186  * the behavior before this fix, the md_uio_alignment_mask would be set to 1,
187  * to check for word alignment, it can be set to 3, for double word alignment,
188  * it can be set to 7, etc.
189  *
190  * [Other part of fix is in function md_chk_uio()]
191  */
192 static int		md_uio_alignment_mask = 0;
193 
194 /*
195  * for md_dev64_t translation
196  */
197 struct md_xlate_table		*md_tuple_table;
198 struct md_xlate_major_table	*md_major_tuple_table;
199 int				md_tuple_length;
200 uint_t				md_majortab_len;
201 
202 /* Function declarations */
203 
204 static int md_create_probe_rqlist(md_probedev_impl_t *plist,
205 			daemon_queue_t **hdr, intptr_t (*probe_test)());
206 
207 /*
208  * manipulate global status
209  */
210 void
211 md_set_status(int bits)
212 {
213 	mutex_enter(&md_mx);
214 	md_status |= bits;
215 	mutex_exit(&md_mx);
216 }
217 
218 void
219 md_clr_status(int bits)
220 {
221 	mutex_enter(&md_mx);
222 	md_status &= ~bits;
223 	mutex_exit(&md_mx);
224 }
225 
226 int
227 md_get_status()
228 {
229 	int result;
230 	mutex_enter(&md_mx);
231 	result = md_status;
232 	mutex_exit(&md_mx);
233 	return (result);
234 }
235 
236 void
237 md_set_setstatus(set_t setno, int bits)
238 {
239 	ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
240 
241 	mutex_enter(&md_mx);
242 	md_set[setno].s_status |= bits;
243 	mutex_exit(&md_mx);
244 }
245 
246 void
247 md_clr_setstatus(set_t setno, int bits)
248 {
249 	ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
250 
251 	mutex_enter(&md_mx);
252 	md_set[setno].s_status &= ~bits;
253 	mutex_exit(&md_mx);
254 }
255 
256 uint_t
257 md_get_setstatus(set_t setno)
258 {
259 	uint_t result;
260 
261 	ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
262 
263 	mutex_enter(&md_mx);
264 	result = md_set[setno].s_status;
265 	mutex_exit(&md_mx);
266 	return (result);
267 }
268 
269 /*
270  * md_unit_readerlock_common:
271  * -------------------------
272  * Mark the given unit as having a reader reference. Spin waiting for any
273  * writer references to be released.
274  *
275  * Input:
276  *	ui		unit reference
277  *	lock_held	0 => ui_mx needs to be grabbed
278  *			1 => ui_mx already held
279  * Output:
280  *	mm_unit_t corresponding to unit structure
281  *	ui->ui_readercnt incremented
282  */
283 static void *
284 md_unit_readerlock_common(mdi_unit_t *ui, int lock_held)
285 {
286 	uint_t	flag = MD_UL_WRITER | MD_UL_WANABEWRITER;
287 
288 	if (!lock_held)
289 		mutex_enter(&ui->ui_mx);
290 	while (ui->ui_lock & flag) {
291 		if (panicstr) {
292 			if (ui->ui_lock & MD_UL_WRITER)
293 				panic("md: writer lock is held");
294 			break;
295 		}
296 		cv_wait(&ui->ui_cv, &ui->ui_mx);
297 	}
298 	ui->ui_readercnt++;
299 	if (!lock_held)
300 		mutex_exit(&ui->ui_mx);
301 	return (MD_UNIT(ui->ui_link.ln_id));
302 }
303 
304 void *
305 md_unit_readerlock(mdi_unit_t *ui)
306 {
307 	return (md_unit_readerlock_common(ui, 0));
308 }
309 
310 /*
311  * md_unit_writerlock_common:
312  * -------------------------
313  * Acquire a unique writer reference. Causes previous readers to drain.
314  * Spins if a writer reference already exists or if a previous reader/writer
315  * dropped the lock to allow a ksend_message to be despatched.
316  *
317  * Input:
318  *	ui		unit reference
319  *	lock_held	0 => grab ui_mx
320  *			1 => ui_mx already held on entry
321  * Output:
322  *	mm_unit_t reference
323  */
324 static void *
325 md_unit_writerlock_common(mdi_unit_t *ui, int lock_held)
326 {
327 	uint_t	flag = MD_UL_WRITER;
328 
329 	if (panicstr)
330 		panic("md: writer lock not allowed");
331 
332 	if (!lock_held)
333 		mutex_enter(&ui->ui_mx);
334 
335 	while ((ui->ui_lock & flag) || (ui->ui_readercnt != 0)) {
336 		ui->ui_wanabecnt++;
337 		ui->ui_lock |= MD_UL_WANABEWRITER;
338 		cv_wait(&ui->ui_cv, &ui->ui_mx);
339 		if (--ui->ui_wanabecnt == 0)
340 			ui->ui_lock &= ~MD_UL_WANABEWRITER;
341 	}
342 	ui->ui_lock |= MD_UL_WRITER;
343 	ui->ui_owner = curthread;
344 
345 	if (!lock_held)
346 		mutex_exit(&ui->ui_mx);
347 	return (MD_UNIT(ui->ui_link.ln_id));
348 }
349 
350 void *
351 md_unit_writerlock(mdi_unit_t *ui)
352 {
353 	return (md_unit_writerlock_common(ui, 0));
354 }
355 
356 /*
357  * md_unit_readerexit_common:
358  * -------------------------
359  * Release the readerlock for the specified unit. If the reader count reaches
360  * zero and there are waiting writers (MD_UL_WANABEWRITER set) wake them up.
361  *
362  * Input:
363  *	ui		unit reference
364  *	lock_held	0 => ui_mx needs to be acquired
365  *			1 => ui_mx already held
366  */
367 static void
368 md_unit_readerexit_common(mdi_unit_t *ui, int lock_held)
369 {
370 	if (!lock_held)
371 		mutex_enter(&ui->ui_mx);
372 	ASSERT((ui->ui_lock & MD_UL_WRITER) == 0);
373 	ASSERT(ui->ui_readercnt != 0);
374 	ui->ui_readercnt--;
375 	if ((ui->ui_wanabecnt != 0) && (ui->ui_readercnt == 0))
376 		cv_broadcast(&ui->ui_cv);
377 
378 	if (!lock_held)
379 		mutex_exit(&ui->ui_mx);
380 }
381 
382 void
383 md_unit_readerexit(mdi_unit_t *ui)
384 {
385 	md_unit_readerexit_common(ui, 0);
386 }
387 
388 /*
389  * md_unit_writerexit_common:
390  * -------------------------
391  * Release the writerlock currently held on the unit. Wake any threads waiting
392  * on becoming reader or writer (MD_UL_WANABEWRITER set).
393  *
394  * Input:
395  *	ui		unit reference
396  *	lock_held	0 => ui_mx to be acquired
397  *			1 => ui_mx already held
398  */
399 static void
400 md_unit_writerexit_common(mdi_unit_t *ui, int lock_held)
401 {
402 	if (!lock_held)
403 		mutex_enter(&ui->ui_mx);
404 	ASSERT((ui->ui_lock & MD_UL_WRITER) != 0);
405 	ASSERT(ui->ui_readercnt == 0);
406 	ui->ui_lock &= ~MD_UL_WRITER;
407 	ui->ui_owner = NULL;
408 
409 	cv_broadcast(&ui->ui_cv);
410 	if (!lock_held)
411 		mutex_exit(&ui->ui_mx);
412 }
413 
414 void
415 md_unit_writerexit(mdi_unit_t *ui)
416 {
417 	md_unit_writerexit_common(ui, 0);
418 }
419 
420 void *
421 md_io_readerlock(mdi_unit_t *ui)
422 {
423 	md_io_lock_t	*io = ui->ui_io_lock;
424 
425 	ASSERT(io);  /* checks case where no io lock allocated */
426 	mutex_enter(&io->io_mx);
427 	while (io->io_lock & (MD_UL_WRITER | MD_UL_WANABEWRITER)) {
428 		if (panicstr) {
429 			if (io->io_lock & MD_UL_WRITER)
430 				panic("md: writer lock is held");
431 			break;
432 		}
433 		cv_wait(&io->io_cv, &io->io_mx);
434 	}
435 	io->io_readercnt++;
436 	mutex_exit(&io->io_mx);
437 	return (MD_UNIT(ui->ui_link.ln_id));
438 }
439 
440 void *
441 md_io_writerlock(mdi_unit_t *ui)
442 {
443 	md_io_lock_t	*io = ui->ui_io_lock;
444 
445 	ASSERT(io);  /* checks case where no io lock allocated */
446 	if (panicstr)
447 		panic("md: writer lock not allowed");
448 
449 	mutex_enter(&io->io_mx);
450 	while ((io->io_lock & MD_UL_WRITER) || (io->io_readercnt != 0)) {
451 		io->io_wanabecnt++;
452 		io->io_lock |= MD_UL_WANABEWRITER;
453 		cv_wait(&io->io_cv, &io->io_mx);
454 		if (--io->io_wanabecnt == 0)
455 			io->io_lock &= ~MD_UL_WANABEWRITER;
456 	}
457 	io->io_lock |= MD_UL_WRITER;
458 	io->io_owner = curthread;
459 
460 	mutex_exit(&io->io_mx);
461 	return (MD_UNIT(ui->ui_link.ln_id));
462 }
463 
464 void
465 md_io_readerexit(mdi_unit_t *ui)
466 {
467 	md_io_lock_t	*io = ui->ui_io_lock;
468 
469 	mutex_enter(&io->io_mx);
470 	ASSERT((io->io_lock & MD_UL_WRITER) == 0);
471 	ASSERT(io->io_readercnt != 0);
472 	io->io_readercnt--;
473 	if ((io->io_wanabecnt != 0) && (io->io_readercnt == 0)) {
474 		cv_broadcast(&io->io_cv);
475 	}
476 	mutex_exit(&io->io_mx);
477 }
478 
479 void
480 md_io_writerexit(mdi_unit_t *ui)
481 {
482 	md_io_lock_t	*io = ui->ui_io_lock;
483 
484 	mutex_enter(&io->io_mx);
485 	ASSERT((io->io_lock & MD_UL_WRITER) != 0);
486 	ASSERT(io->io_readercnt == 0);
487 	io->io_lock &= ~MD_UL_WRITER;
488 	io->io_owner = NULL;
489 
490 	cv_broadcast(&io->io_cv);
491 	mutex_exit(&io->io_mx);
492 }
493 
494 /*
495  * Attempt to grab that set of locks defined as global.
496  * A mask containing the set of global locks that are owned upon
497  * entry is input.  Any additional global locks are then grabbed.
498  * This keeps the caller from having to know the set of global
499  * locks.
500  */
501 static int
502 md_global_lock_enter(int global_locks_owned_mask)
503 {
504 
505 	/*
506 	 * The current implementation has been verified by inspection
507 	 * and test to be deadlock free.  If another global lock is
508 	 * added, changing the algorithm used by this function should
509 	 * be considered.  With more than 2 locks it is difficult to
510 	 * guarantee that locks are being acquired in the correct order.
511 	 * The safe approach would be to drop all of the locks that are
512 	 * owned at function entry and then reacquire all of the locks
513 	 * in the order defined by the lock hierarchy.
514 	 */
515 	mutex_enter(&md_mx);
516 	if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
517 		while ((md_mtioctl_cnt != 0) ||
518 		    (md_status & MD_GBL_IOCTL_LOCK)) {
519 			if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
520 				mutex_exit(&md_mx);
521 				return (EINTR);
522 			}
523 		}
524 		md_status |= MD_GBL_IOCTL_LOCK;
525 		md_ioctl_cnt++;
526 	}
527 	if (!(global_locks_owned_mask & MD_GBL_HS_LOCK)) {
528 		while (md_status & MD_GBL_HS_LOCK) {
529 			if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
530 				md_status &= ~MD_GBL_IOCTL_LOCK;
531 				mutex_exit(&md_mx);
532 				return (EINTR);
533 			}
534 		}
535 		md_status |= MD_GBL_HS_LOCK;
536 	}
537 	mutex_exit(&md_mx);
538 	return (0);
539 }
540 
541 /*
542  * Release the set of global locks that were grabbed in md_global_lock_enter
543  * that were not already owned by the calling thread.  The set of previously
544  * owned global locks is passed in as a mask parameter.
545  */
546 static int
547 md_global_lock_exit(int global_locks_owned_mask, int code,
548 	int flags, mdi_unit_t *ui)
549 {
550 	mutex_enter(&md_mx);
551 
552 	/* If MT ioctl decrement mt_ioctl_cnt */
553 	if ((flags & MD_MT_IOCTL)) {
554 		md_mtioctl_cnt--;
555 	} else {
556 		if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
557 			/* clear the lock and decrement count */
558 			ASSERT(md_ioctl_cnt == 1);
559 			md_ioctl_cnt--;
560 			md_status &= ~MD_GBL_IOCTL_LOCK;
561 		}
562 		if (!(global_locks_owned_mask & MD_GBL_HS_LOCK))
563 			md_status &= ~MD_GBL_HS_LOCK;
564 	}
565 	if (flags & MD_READER_HELD)
566 		md_unit_readerexit(ui);
567 	if (flags & MD_WRITER_HELD)
568 		md_unit_writerexit(ui);
569 	if (flags & MD_IO_HELD)
570 		md_io_writerexit(ui);
571 	if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
572 		rw_exit(&md_unit_array_rw.lock);
573 	}
574 	cv_broadcast(&md_cv);
575 	mutex_exit(&md_mx);
576 
577 	return (code);
578 }
579 
580 /*
581  * The two functions, md_ioctl_lock_enter, and md_ioctl_lock_exit make
582  * use of the md_global_lock_{enter|exit} functions to avoid duplication
583  * of code.  They rely upon the fact that the locks that are specified in
584  * the input mask are not acquired or freed.  If this algorithm changes
585  * as described in the block comment at the beginning of md_global_lock_enter
586  * then it will be necessary to change these 2 functions.  Otherwise these
587  * functions will be grabbing and holding global locks unnecessarily.
588  */
589 int
590 md_ioctl_lock_enter(void)
591 {
592 	/* grab only the ioctl lock */
593 	return (md_global_lock_enter(~MD_GBL_IOCTL_LOCK));
594 }
595 
596 /*
597  * If md_ioctl_lock_exit is being called at the end of an ioctl before
598  * returning to user space, then ioctl_end is set to 1.
599  * Otherwise, the ioctl lock is being dropped in the middle of handling
600  * an ioctl and will be reacquired before the end of the ioctl.
601  * Do not attempt to process the MN diskset mddb parse flags unless
602  * ioctl_end is true - otherwise a deadlock situation could arise.
603  */
604 int
605 md_ioctl_lock_exit(int code, int flags, mdi_unit_t *ui, int ioctl_end)
606 {
607 	int				ret_val;
608 	uint_t				status;
609 	mddb_set_t			*s;
610 	int				i;
611 	int				err;
612 	md_mn_msg_mddb_parse_t		*mddb_parse_msg;
613 	md_mn_kresult_t			*kresult;
614 	mddb_lb_t			*lbp;
615 	int				rval = 1;
616 	int				flag;
617 
618 	/* release only the ioctl lock */
619 	ret_val = md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);
620 
621 	/*
622 	 * If md_ioctl_lock_exit is being called with a possible lock held
623 	 * (ioctl_end is 0), then don't check the MN disksets since the
624 	 * call to mddb_setenter may cause a lock ordering deadlock.
625 	 */
626 	if (!ioctl_end)
627 		return (ret_val);
628 
629 	/*
630 	 * Walk through disksets to see if there is a MN diskset that
631 	 * has messages that need to be sent.  Set must be snarfed and
632 	 * be a MN diskset in order to be checked.
633 	 *
634 	 * In a MN diskset, this routine may send messages to the
635 	 * rpc.mdcommd in order to have the slave nodes re-parse parts
636 	 * of the mddb.  Messages can only be sent with no locks held,
637 	 * so if mddb change occurred while the ioctl lock is held, this
638 	 * routine must send the messages.
639 	 */
640 	for (i = 1; i < md_nsets; i++) {
641 		status = md_get_setstatus(i);
642 
643 		/* Set must be snarfed and be a MN diskset */
644 		if ((status & (MD_SET_SNARFED | MD_SET_MNSET)) !=
645 		    (MD_SET_SNARFED | MD_SET_MNSET))
646 			continue;
647 
648 		/* Grab set lock so that set can't change */
649 		if ((s = mddb_setenter(i, MDDB_MUSTEXIST, &err)) == NULL)
650 			continue;
651 
652 		lbp = s->s_lbp;
653 
654 		/* Re-get set status now that lock is held */
655 		status = md_get_setstatus(i);
656 
657 		/*
658 		 * If MN parsing block flag is set - continue to next set.
659 		 *
660 		 * If s_mn_parseflags_sending is non-zero, then another thread
661 		 * is already currently sending a parse message, so just
662 		 * release the set mutex.  If this ioctl had caused an mddb
663 		 * change that results in a parse message to be generated,
664 		 * the thread that is currently sending a parse message would
665 		 * generate the additional parse message.
666 		 *
667 		 * If s_mn_parseflags_sending is zero then loop until
668 		 * s_mn_parseflags is 0 (until there are no more
669 		 * messages to send).
670 		 * While s_mn_parseflags is non-zero,
671 		 *	put snapshot of parse_flags in s_mn_parseflags_sending
672 		 *	set s_mn_parseflags to zero
673 		 *	release set mutex
674 		 *	send message
675 		 *	re-grab set mutex
676 		 *	set s_mn_parseflags_sending to zero
677 		 *
678 		 * If set is STALE, send message with NO_LOG flag so that
679 		 * rpc.mdcommd won't attempt to log message to non-writeable
680 		 * replica.
681 		 */
682 		mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t),
683 			KM_SLEEP);
684 		while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
685 		    (s->s_mn_parseflags & MDDB_PARSE_MASK) &&
686 		    (!(status & MD_SET_MNPARSE_BLK))) {
687 
688 			/* Grab snapshot of parse flags */
689 			s->s_mn_parseflags_sending = s->s_mn_parseflags;
690 			s->s_mn_parseflags = 0;
691 
692 			mutex_exit(&md_set[(s)->s_setno].s_dbmx);
693 
694 			/*
695 			 * Send the message to the slaves to re-parse
696 			 * the indicated portions of the mddb. Send the status
697 			 * of the 50 mddbs in this set so that slaves know
698 			 * which mddbs that the master node thinks are 'good'.
699 			 * Otherwise, slave may reparse, but from wrong
700 			 * replica.
701 			 */
702 			mddb_parse_msg->msg_parse_flags =
703 				s->s_mn_parseflags_sending;
704 
705 			for (i = 0; i < MDDB_NLB; i++) {
706 				mddb_parse_msg->msg_lb_flags[i] =
707 					lbp->lb_locators[i].l_flags;
708 			}
709 			kresult = kmem_zalloc(sizeof (md_mn_kresult_t),
710 				KM_SLEEP);
711 			while (rval != 0) {
712 				flag = 0;
713 				if (status & MD_SET_STALE)
714 					flag |= MD_MSGF_NO_LOG;
715 				rval = mdmn_ksend_message(s->s_setno,
716 				    MD_MN_MSG_MDDB_PARSE, flag,
717 				    (char *)mddb_parse_msg,
718 				    sizeof (mddb_parse_msg), kresult);
719 				/* if the node hasn't yet joined, it's Ok. */
720 				if ((!MDMN_KSEND_MSG_OK(rval, kresult)) &&
721 				    (kresult->kmmr_comm_state !=
722 							MDMNE_NOT_JOINED)) {
723 					mdmn_ksend_show_error(rval, kresult,
724 					    "MD_MN_MSG_MDDB_PARSE");
725 					cmn_err(CE_WARN, "md_ioctl_lock_exit: "
726 					    "Unable to send mddb update "
727 					    "message to other nodes in "
728 					    "diskset %s\n", s->s_setname);
729 					rval = 1;
730 				}
731 			}
732 			kmem_free(kresult, sizeof (md_mn_kresult_t));
733 
734 			/*
735 			 * Re-grab mutex to clear sending field and to
736 			 * see if another parse message needs to be generated.
737 			 */
738 			mutex_enter(&md_set[(s)->s_setno].s_dbmx);
739 			s->s_mn_parseflags_sending = 0;
740 		}
741 		kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
742 		mutex_exit(&md_set[(s)->s_setno].s_dbmx);
743 	}
744 	return (ret_val);
745 }
746 
747 /*
748  * Called when in an ioctl and need readerlock.
749  */
750 void *
751 md_ioctl_readerlock(IOLOCK *lock, mdi_unit_t *ui)
752 {
753 	ASSERT(lock != NULL);
754 	lock->l_ui = ui;
755 	lock->l_flags |= MD_READER_HELD;
756 	return (md_unit_readerlock_common(ui, 0));
757 }
758 
759 /*
760  * Called when in an ioctl and need writerlock.
761  */
762 void *
763 md_ioctl_writerlock(IOLOCK *lock, mdi_unit_t *ui)
764 {
765 	ASSERT(lock != NULL);
766 	lock->l_ui = ui;
767 	lock->l_flags |= MD_WRITER_HELD;
768 	return (md_unit_writerlock_common(ui, 0));
769 }
770 
771 void *
772 md_ioctl_io_lock(IOLOCK *lock, mdi_unit_t *ui)
773 {
774 	ASSERT(lock != NULL);
775 	lock->l_ui = ui;
776 	lock->l_flags |= MD_IO_HELD;
777 	return (md_io_writerlock(ui));
778 }
779 
780 void
781 md_ioctl_readerexit(IOLOCK *lock)
782 {
783 	ASSERT(lock != NULL);
784 	lock->l_flags &= ~MD_READER_HELD;
785 	md_unit_readerexit(lock->l_ui);
786 }
787 
788 void
789 md_ioctl_writerexit(IOLOCK *lock)
790 {
791 	ASSERT(lock != NULL);
792 	lock->l_flags &= ~MD_WRITER_HELD;
793 	md_unit_writerexit(lock->l_ui);
794 }
795 
796 void
797 md_ioctl_io_exit(IOLOCK *lock)
798 {
799 	ASSERT(lock != NULL);
800 	lock->l_flags &= ~MD_IO_HELD;
801 	md_io_writerexit(lock->l_ui);
802 }
803 
804 /*
805  * md_ioctl_releaselocks:
806  * --------------------
807  * Release the unit locks that are held and stop subsequent
808  * md_unit_reader/writerlock calls from progressing. This allows the caller
809  * to send messages across the cluster when running in a multinode
810  * environment.
811  * ioctl originated locks (via md_ioctl_readerlock/md_ioctl_writerlock) are
812  * allowed to progress as normal. This is required as these typically are
813  * invoked by the message handler that may be called while a unit lock is
814  * marked as released.
815  *
816  * On entry:
817  *	variety of unit locks may be held including ioctl lock
818  *
819  * On exit:
820  *      locks released and unit structure updated to prevent subsequent reader/
821  *      writer locks being acquired until md_ioctl_reacquirelocks is called
822  */
823 void
824 md_ioctl_releaselocks(int code, int flags, mdi_unit_t *ui)
825 {
826 	/* This actually releases the locks. */
827 	(void) md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);
828 }
829 
830 /*
831  * md_ioctl_reacquirelocks:
832  * ----------------------
833  * Reacquire the locks that were held when md_ioctl_releaselocks
834  * was called.
835  *
836  * On entry:
837  *      No unit locks held
838  * On exit:
839  *	locks held that were held at md_ioctl_releaselocks time including
840  *	the ioctl lock.
841  */
842 void
843 md_ioctl_reacquirelocks(int flags, mdi_unit_t *ui)
844 {
845 	if (flags & MD_MT_IOCTL) {
846 		mutex_enter(&md_mx);
847 		md_mtioctl_cnt++;
848 		mutex_exit(&md_mx);
849 	} else {
850 		while (md_ioctl_lock_enter() == EINTR);
851 	}
852 	if (flags & MD_ARRAY_WRITER) {
853 		rw_enter(&md_unit_array_rw.lock, RW_WRITER);
854 	} else if (flags & MD_ARRAY_READER) {
855 		rw_enter(&md_unit_array_rw.lock, RW_READER);
856 	}
857 	if (ui != (mdi_unit_t *)NULL) {
858 		if (flags & MD_IO_HELD) {
859 			(void) md_io_writerlock(ui);
860 		}
861 
862 		mutex_enter(&ui->ui_mx);
863 		if (flags & MD_READER_HELD) {
864 			(void) md_unit_readerlock_common(ui, 1);
865 		} else if (flags & MD_WRITER_HELD) {
866 			(void) md_unit_writerlock_common(ui, 1);
867 		}
868 		/* Wake up any blocked readerlock() calls */
869 		cv_broadcast(&ui->ui_cv);
870 		mutex_exit(&ui->ui_mx);
871 	}
872 }
873 
874 void
875 md_ioctl_droplocks(IOLOCK *lock)
876 {
877 	mdi_unit_t	*ui;
878 	int		flags;
879 
880 	ASSERT(lock != NULL);
881 	ui = lock->l_ui;
882 	flags = lock->l_flags;
883 	if (flags & MD_READER_HELD) {
884 		lock->l_flags &= ~MD_READER_HELD;
885 		md_unit_readerexit(ui);
886 	}
887 	if (flags & MD_WRITER_HELD) {
888 		lock->l_flags &= ~MD_WRITER_HELD;
889 		md_unit_writerexit(ui);
890 	}
891 	if (flags & MD_IO_HELD) {
892 		lock->l_flags &= ~MD_IO_HELD;
893 		md_io_writerexit(ui);
894 	}
895 	if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
896 		lock->l_flags &= ~(MD_ARRAY_WRITER | MD_ARRAY_READER);
897 		rw_exit(&md_unit_array_rw.lock);
898 	}
899 }
900 
901 void
902 md_array_writer(IOLOCK *lock)
903 {
904 	ASSERT(lock != NULL);
905 	lock->l_flags |= MD_ARRAY_WRITER;
906 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
907 }
908 
909 void
910 md_array_reader(IOLOCK *lock)
911 {
912 	ASSERT(lock != NULL);
913 	lock->l_flags |= MD_ARRAY_READER;
914 	rw_enter(&md_unit_array_rw.lock, RW_READER);
915 }
916 
917 /*
918  * Called when in an ioctl and need opencloselock.
919  * Sets flags in lockp for READER_HELD.
920  */
921 void *
922 md_ioctl_openclose_enter(IOLOCK *lockp, mdi_unit_t *ui)
923 {
924 	void	*un;
925 
926 	ASSERT(lockp != NULL);
927 	mutex_enter(&ui->ui_mx);
928 	while (ui->ui_lock & MD_UL_OPENORCLOSE)
929 		cv_wait(&ui->ui_cv, &ui->ui_mx);
930 	ui->ui_lock |= MD_UL_OPENORCLOSE;
931 
932 	/* Maintain mutex across the readerlock call */
933 	lockp->l_ui = ui;
934 	lockp->l_flags |= MD_READER_HELD;
935 	un = md_unit_readerlock_common(ui, 1);
936 	mutex_exit(&ui->ui_mx);
937 
938 	return (un);
939 }
940 
941 /*
942  * Clears reader lock using md_ioctl instead of md_unit
943  * and updates lockp.
944  */
945 void
946 md_ioctl_openclose_exit(IOLOCK *lockp)
947 {
948 	mdi_unit_t	*ui;
949 
950 	ASSERT(lockp != NULL);
951 	ui = lockp->l_ui;
952 	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
953 
954 	md_ioctl_readerexit(lockp);
955 
956 	mutex_enter(&ui->ui_mx);
957 	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
958 
959 	cv_broadcast(&ui->ui_cv);
960 	mutex_exit(&ui->ui_mx);
961 }
962 
963 /*
964  * Clears reader lock using md_ioctl instead of md_unit
965  * and updates lockp.
966  * Does not acquire or release the ui_mx lock since the calling
967  * routine has already acquired this lock.
968  */
969 void
970 md_ioctl_openclose_exit_lh(IOLOCK *lockp)
971 {
972 	mdi_unit_t	*ui;
973 
974 	ASSERT(lockp != NULL);
975 	ui = lockp->l_ui;
976 	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
977 
978 	lockp->l_flags &= ~MD_READER_HELD;
979 	md_unit_readerexit_common(lockp->l_ui, 1);
980 
981 	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
982 	cv_broadcast(&ui->ui_cv);
983 }
984 
985 void *
986 md_unit_openclose_enter(mdi_unit_t *ui)
987 {
988 	void	*un;
989 
990 	mutex_enter(&ui->ui_mx);
991 	while (ui->ui_lock & (MD_UL_OPENORCLOSE))
992 		cv_wait(&ui->ui_cv, &ui->ui_mx);
993 	ui->ui_lock |= MD_UL_OPENORCLOSE;
994 
995 	/* Maintain mutex across the readerlock call */
996 	un = md_unit_readerlock_common(ui, 1);
997 	mutex_exit(&ui->ui_mx);
998 
999 	return (un);
1000 }
1001 
1002 void
1003 md_unit_openclose_exit(mdi_unit_t *ui)
1004 {
1005 	md_unit_readerexit(ui);
1006 
1007 	mutex_enter(&ui->ui_mx);
1008 	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
1009 	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
1010 
1011 	cv_broadcast(&ui->ui_cv);
1012 	mutex_exit(&ui->ui_mx);
1013 }
1014 
1015 /*
1016  * Drop the openclose and readerlocks without acquiring or
1017  * releasing the ui_mx lock since the calling routine has
1018  * already acquired this lock.
1019  */
1020 void
1021 md_unit_openclose_exit_lh(mdi_unit_t *ui)
1022 {
1023 	md_unit_readerexit_common(ui, 1);
1024 	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
1025 	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
1026 	cv_broadcast(&ui->ui_cv);
1027 }
1028 
1029 int
1030 md_unit_isopen(
1031 	mdi_unit_t	*ui
1032 )
1033 {
1034 	int		isopen;
1035 
1036 	/* check status */
1037 	mutex_enter(&ui->ui_mx);
1038 	isopen = ((ui->ui_lock & MD_UL_OPEN) ? 1 : 0);
1039 	mutex_exit(&ui->ui_mx);
1040 	return (isopen);
1041 }
1042 
1043 int
1044 md_unit_incopen(
1045 	minor_t		mnum,
1046 	int		flag,
1047 	int		otyp
1048 )
1049 {
1050 	mdi_unit_t	*ui = MDI_UNIT(mnum);
1051 	int		err = 0;
1052 
1053 	/* check type and flags */
1054 	ASSERT(ui != NULL);
1055 	mutex_enter(&ui->ui_mx);
1056 	if ((otyp < 0) || (otyp >= OTYPCNT)) {
1057 		err = EINVAL;
1058 		goto out;
1059 	}
1060 	if (((flag & FEXCL) && (ui->ui_lock & MD_UL_OPEN)) ||
1061 	    (ui->ui_lock & MD_UL_EXCL)) {
1062 		err = EBUSY;
1063 		goto out;
1064 	}
1065 
1066 	/* count and flag open */
1067 	ui->ui_ocnt[otyp]++;
1068 	ui->ui_lock |= MD_UL_OPEN;
1069 	if (flag & FEXCL)
1070 		ui->ui_lock |= MD_UL_EXCL;
1071 
1072 	/* setup kstat, return success */
1073 	mutex_exit(&ui->ui_mx);
1074 	md_kstat_init(mnum);
1075 	return (0);
1076 
1077 	/* return error */
1078 out:
1079 	mutex_exit(&ui->ui_mx);
1080 	return (err);
1081 }
1082 
1083 int
1084 md_unit_decopen(
1085 	minor_t		mnum,
1086 	int		otyp
1087 )
1088 {
1089 	mdi_unit_t	*ui = MDI_UNIT(mnum);
1090 	int		err = 0;
1091 	unsigned	i;
1092 
1093 	/* check type and flags */
1094 	ASSERT(ui != NULL);
1095 	mutex_enter(&ui->ui_mx);
1096 	if ((otyp < 0) || (otyp >= OTYPCNT)) {
1097 		err = EINVAL;
1098 		goto out;
1099 	} else if (ui->ui_ocnt[otyp] == 0) {
1100 		err = ENXIO;
1101 		goto out;
1102 	}
1103 
1104 	/* count and flag closed */
1105 	if (otyp == OTYP_LYR)
1106 		ui->ui_ocnt[otyp]--;
1107 	else
1108 		ui->ui_ocnt[otyp] = 0;
1109 	ui->ui_lock &= ~MD_UL_OPEN;
1110 	for (i = 0; (i < OTYPCNT); ++i)
1111 		if (ui->ui_ocnt[i] != 0)
1112 			ui->ui_lock |= MD_UL_OPEN;
1113 	if (! (ui->ui_lock & MD_UL_OPEN))
1114 		ui->ui_lock &= ~MD_UL_EXCL;
1115 
1116 	/* teardown kstat, return success */
1117 	if (! (ui->ui_lock & MD_UL_OPEN)) {
1118 		mutex_exit(&ui->ui_mx);
1119 		md_kstat_destroy(mnum);
1120 		return (0);
1121 	}
1122 
1123 	/* return success */
1124 out:
1125 	mutex_exit(&ui->ui_mx);
1126 	return (err);
1127 }
1128 
1129 md_dev64_t
1130 md_xlate_targ_2_mini(md_dev64_t targ_devt)
1131 {
1132 	dev32_t		mini_32_devt, targ_32_devt;
1133 	int		i;
1134 
1135 	/*
1136 	 * check to see if we're in an upgrade situation
1137 	 * if we are not in upgrade just return the input device
1138 	 */
1139 
1140 	if (!MD_UPGRADE)
1141 		return (targ_devt);
1142 
1143 	targ_32_devt = md_cmpldev(targ_devt);
1144 
1145 	i = 0;
1146 	while (i != md_tuple_length) {
1147 		if (md_tuple_table[i].targ_devt == targ_32_devt) {
1148 			mini_32_devt = md_tuple_table[i].mini_devt;
1149 			return (md_expldev((md_dev64_t)mini_32_devt));
1150 		}
1151 		i++;
1152 	}
1153 	return (NODEV64);
1154 }
1155 
1156 md_dev64_t
1157 md_xlate_mini_2_targ(md_dev64_t mini_devt)
1158 {
1159 	dev32_t		mini_32_devt, targ_32_devt;
1160 	int		i;
1161 
1162 	if (!MD_UPGRADE)
1163 		return (mini_devt);
1164 
1165 	mini_32_devt = md_cmpldev(mini_devt);
1166 
1167 	i = 0;
1168 	while (i != md_tuple_length) {
1169 		if (md_tuple_table[i].mini_devt == mini_32_devt) {
1170 			targ_32_devt = md_tuple_table[i].targ_devt;
1171 			return (md_expldev((md_dev64_t)targ_32_devt));
1172 		}
1173 		i++;
1174 	}
1175 	return (NODEV64);
1176 }
1177 
1178 void
1179 md_xlate_free(int size)
1180 {
1181 	kmem_free(md_tuple_table, size);
1182 }
1183 
1184 char *
1185 md_targ_major_to_name(major_t maj)
1186 {
1187 	char *drv_name = NULL;
1188 	int	i;
1189 
1190 	if (!MD_UPGRADE)
1191 		return (ddi_major_to_name(maj));
1192 
1193 	for (i = 0; i < md_majortab_len; i++) {
1194 		if (md_major_tuple_table[i].targ_maj == maj) {
1195 			drv_name = md_major_tuple_table[i].drv_name;
1196 			break;
1197 		}
1198 	}
1199 	return (drv_name);
1200 }
1201 
1202 major_t
1203 md_targ_name_to_major(char *drv_name)
1204 {
1205 	major_t maj;
1206 	int	i;
1207 
1208 	maj = md_getmajor(NODEV64);
1209 	if (!MD_UPGRADE)
1210 		return (ddi_name_to_major(drv_name));
1211 
1212 	for (i = 0; i < md_majortab_len; i++) {
1213 		if ((strcmp(md_major_tuple_table[i].drv_name,
1214 		    drv_name)) == 0) {
1215 			maj = md_major_tuple_table[i].targ_maj;
1216 			break;
1217 		}
1218 	}
1219 
1220 	return (maj);
1221 }
1222 
1223 void
1224 md_majortab_free()
1225 {
1226 	size_t	sz;
1227 	int	i;
1228 
1229 	for (i = 0; i < md_majortab_len; i++) {
1230 		freestr(md_major_tuple_table[i].drv_name);
1231 	}
1232 
1233 	sz = md_majortab_len * sizeof (struct md_xlate_major_table);
1234 	kmem_free(md_major_tuple_table, sz);
1235 }
1236 
1237 /* functions return a pointer to a function which returns an int */
1238 
1239 intptr_t (*
1240 md_get_named_service(md_dev64_t dev, int modindex, char *name,
1241 	intptr_t (*Default)()))()
1242 {
1243 	mdi_unit_t		*ui;
1244 	md_named_services_t	*sp;
1245 	int			i;
1246 
1247 	/*
1248 	 * Return the first named service found.
1249 	 * Use this path when it is known that there is only
1250 	 * one named service possible (e.g., hotspare interface)
1251 	 */
1252 	if ((dev == NODEV64) && (modindex == ANY_SERVICE)) {
1253 		for (i = 0; i < MD_NOPS; i++) {
1254 			if (md_ops[i] == NULL) {
1255 				continue;
1256 			}
1257 			sp = md_ops[i]->md_services;
1258 			if (sp == NULL)
1259 				continue;
1260 			while (sp->md_service != NULL) {
1261 				if (strcmp(name, sp->md_name) == 0)
1262 					return (sp->md_service);
1263 				sp++;
1264 			}
1265 		}
1266 		return (Default);
1267 	}
1268 
1269 	/*
1270 	 * Return the named service for the given modindex.
1271 	 * This is used if there are multiple possible named services
1272 	 * and each one needs to be called (e.g., poke hotspares)
1273 	 */
1274 	if (dev == NODEV64) {
1275 		if (modindex >= MD_NOPS)
1276 			return (Default);
1277 
1278 		if (md_ops[modindex] == NULL)
1279 			return (Default);
1280 
1281 		sp = md_ops[modindex]->md_services;
1282 		if (sp == NULL)
1283 			return (Default);
1284 
1285 		while (sp->md_service != NULL) {
1286 			if (strcmp(name, sp->md_name) == 0)
1287 				return (sp->md_service);
1288 			sp++;
1289 		}
1290 		return (Default);
1291 	}
1292 
1293 	/*
1294 	 * Return the named service for this md_dev64_t
1295 	 */
1296 	if (md_getmajor(dev) != md_major)
1297 		return (Default);
1298 
1299 	if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) ||
1300 	    (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits))
1301 		return (NULL);
1302 
1303 
1304 	if ((ui = MDI_UNIT(md_getminor(dev))) == NULL)
1305 		return (NULL);
1306 
1307 	sp = md_ops[ui->ui_opsindex]->md_services;
1308 	if (sp == NULL)
1309 		return (Default);
1310 	while (sp->md_service != NULL) {
1311 		if (strcmp(name, sp->md_name) == 0)
1312 			return (sp->md_service);
1313 		sp++;
1314 	}
1315 	return (Default);
1316 }
1317 
1318 /*
1319  * md_daemon callback routine
1320  */
1321 boolean_t
1322 callb_md_cpr(void *arg, int code)
1323 {
1324 	callb_cpr_t *cp = (callb_cpr_t *)arg;
1325 	int ret = 0;				/* assume success */
1326 
1327 	mutex_enter(cp->cc_lockp);
1328 
1329 	switch (code) {
1330 	case CB_CODE_CPR_CHKPT:
1331 		/*
1332 		 * Check for active resync threads
1333 		 */
1334 		mutex_enter(&md_cpr_resync.md_resync_mutex);
1335 		if ((md_cpr_resync.md_mirror_resync > 0) ||
1336 				(md_cpr_resync.md_raid_resync > 0)) {
1337 			mutex_exit(&md_cpr_resync.md_resync_mutex);
1338 			cmn_err(CE_WARN, "There are Solaris Volume Manager "
1339 			    "synchronization threads running.");
1340 			cmn_err(CE_WARN, "Please try system suspension at "
1341 							"a later time.");
1342 			ret = -1;
1343 			break;
1344 		}
1345 		mutex_exit(&md_cpr_resync.md_resync_mutex);
1346 
1347 		cp->cc_events |= CALLB_CPR_START;
1348 		while (!(cp->cc_events & CALLB_CPR_SAFE))
1349 			/* cv_timedwait() returns -1 if it times out. */
1350 			if ((ret = cv_timedwait(&cp->cc_callb_cv, cp->cc_lockp,
1351 				lbolt + CPR_KTHREAD_TIMEOUT_SEC * hz)) == -1)
1352 				break;
1353 			break;
1354 
1355 	case CB_CODE_CPR_RESUME:
1356 		cp->cc_events &= ~CALLB_CPR_START;
1357 		cv_signal(&cp->cc_stop_cv);
1358 		break;
1359 	}
1360 	mutex_exit(cp->cc_lockp);
1361 	return (ret != -1);
1362 }
1363 
1364 void
1365 md_daemon(int pass_thru, mdq_anchor_t *anchor)
1366 {
1367 	daemon_queue_t  *dq;
1368 	callb_cpr_t	cprinfo;
1369 
1370 	if (pass_thru && (md_get_status() & MD_GBL_DAEMONS_LIVE))
1371 		return;
1372 	/*
1373 	 * Register cpr callback
1374 	 */
1375 	CALLB_CPR_INIT(&cprinfo, &anchor->a_mx, callb_md_cpr, "md_daemon");
1376 
1377 	/*CONSTCOND*/
1378 	while (1) {
1379 		mutex_enter(&anchor->a_mx);
1380 		while ((dq = anchor->dq.dq_next) == &(anchor->dq)) {
1381 			if (pass_thru) {
1382 				/*
1383 				 * CALLB_CPR_EXIT Will do
1384 				 * mutex_exit(&anchor->a_mx)
1385 				 */
1386 				CALLB_CPR_EXIT(&cprinfo);
1387 				return;
1388 			}
1389 			if (md_get_status() & MD_GBL_DAEMONS_DIE) {
1390 				mutex_exit(&anchor->a_mx);
1391 				mutex_enter(&md_mx);
1392 				md_num_daemons--;
1393 				mutex_exit(&md_mx);
1394 				/*
1395 				 * CALLB_CPR_EXIT will do
1396 				 * mutex_exit(&anchor->a_mx)
1397 				 */
1398 				mutex_enter(&anchor->a_mx);
1399 				CALLB_CPR_EXIT(&cprinfo);
1400 				thread_exit();
1401 			}
1402 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1403 			cv_wait(&anchor->a_cv, &anchor->a_mx);
1404 			CALLB_CPR_SAFE_END(&cprinfo, &anchor->a_mx);
1405 		}
1406 		dq->dq_prev->dq_next = dq->dq_next;
1407 		dq->dq_next->dq_prev = dq->dq_prev;
1408 		dq->dq_prev = dq->dq_next = NULL;
1409 		anchor->dq.qlen--;
1410 		mutex_exit(&anchor->a_mx);
1411 		(*(dq->dq_call))(dq);
1412 	}
1413 	/*NOTREACHED*/
1414 }
1415 
1416 /*
1417  * daemon_request:
1418  *
1419  * Adds requests to appropriate requestq which is
1420  * anchored by *anchor.
1421  * The request is the first element of a doubly linked circular list.
1422  * When the request is a single element, the forward and backward
1423  * pointers MUST point to the element itself.
1424  */
1425 
1426 void
1427 daemon_request(mdq_anchor_t *anchor, void (*func)(),
1428 				daemon_queue_t *request, callstyle_t style)
1429 {
1430 	daemon_queue_t *rqtp;
1431 	int i = 0;
1432 
1433 	rqtp = request;
1434 	if (style == REQ_OLD) {
1435 		ASSERT((rqtp->dq_next == NULL) && (rqtp->dq_prev == NULL));
1436 		/* set it to the new style */
1437 		rqtp->dq_prev = rqtp->dq_next = rqtp;
1438 	}
1439 	ASSERT((rqtp->dq_next != NULL) && (rqtp->dq_prev != NULL));
1440 
1441 	/* scan the list and add the function to each element */
1442 
1443 	do {
1444 		rqtp->dq_call = func;
1445 		i++;
1446 		rqtp = rqtp->dq_next;
1447 	} while (rqtp != request);
1448 
1449 	/* save pointer to tail of the request list */
1450 	rqtp = request->dq_prev;
1451 
1452 	mutex_enter(&anchor->a_mx);
1453 	/* stats */
1454 	anchor->dq.qlen += i;
1455 	anchor->dq.treqs += i;
1456 	anchor->dq.maxq_len = (anchor->dq.qlen > anchor->dq.maxq_len) ?
1457 					anchor->dq.qlen : anchor->dq.maxq_len;
1458 
1459 	/* now add the list to request queue */
1460 	request->dq_prev = anchor->dq.dq_prev;
1461 	rqtp->dq_next = &anchor->dq;
1462 	anchor->dq.dq_prev->dq_next = request;
1463 	anchor->dq.dq_prev = rqtp;
1464 	cv_broadcast(&anchor->a_cv);
1465 	mutex_exit(&anchor->a_mx);
1466 }
1467 
1468 void
1469 mddb_commitrec_wrapper(mddb_recid_t recid)
1470 {
1471 	int sent_log = 0;
1472 	uint_t retry = md_retry_cnt;
1473 	set_t	setno;
1474 
1475 	while (mddb_commitrec(recid)) {
1476 		if (! sent_log) {
1477 			cmn_err(CE_WARN,
1478 			    "md: state database commit failed");
1479 			sent_log = 1;
1480 		}
1481 		delay(md_hz);
1482 
1483 		/*
1484 		 * Setting retry cnt to one (pre decremented) so that we
1485 		 * actually do no retries when committing/deleting a mddb rec.
1486 		 * The underlying disk driver does several retries to check
1487 		 * if the disk is really dead or not so there
1488 		 * is no reason for us to retry on top of the drivers retries.
1489 		 */
1490 
1491 		if (--retry == 0) {
1492 			setno = mddb_getsetnum(recid);
1493 			if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
1494 				panic(
1495 				    "md: Panic due to lack of DiskSuite state\n"
1496 				    " database replicas. Fewer than 50%% of "
1497 				    "the total were available,\n so panic to "
1498 				    "ensure data integrity.");
1499 			} else {
1500 				panic("md: state database problem");
1501 			}
1502 			/*NOTREACHED*/
1503 		}
1504 	}
1505 }
1506 
1507 void
1508 mddb_commitrecs_wrapper(mddb_recid_t *recids)
1509 {
1510 	int sent_log = 0;
1511 	uint_t retry = md_retry_cnt;
1512 	set_t	setno;
1513 
1514 	while (mddb_commitrecs(recids)) {
1515 		if (! sent_log) {
1516 			cmn_err(CE_WARN,
1517 			    "md: state database commit failed");
1518 			sent_log = 1;
1519 		}
1520 		delay(md_hz);
1521 
1522 		/*
1523 		 * Setting retry cnt to one (pre decremented) so that we
1524 		 * actually do no retries when committing/deleting a mddb rec.
1525 		 * The underlying disk driver does several retries to check
1526 		 * if the disk is really dead or not so there
1527 		 * is no reason for us to retry on top of the drivers retries.
1528 		 */
1529 
1530 		if (--retry == 0) {
1531 			/*
1532 			 * since all the records are part of the same set
1533 			 * use the first one to get setno
1534 			 */
1535 			setno = mddb_getsetnum(*recids);
1536 			if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
1537 				panic(
1538 				    "md: Panic due to lack of DiskSuite state\n"
1539 				    " database replicas. Fewer than 50%% of "
1540 				    "the total were available,\n so panic to "
1541 				    "ensure data integrity.");
1542 			} else {
1543 				panic("md: state database problem");
1544 			}
1545 			/*NOTREACHED*/
1546 		}
1547 	}
1548 }
1549 
1550 void
1551 mddb_deleterec_wrapper(mddb_recid_t recid)
1552 {
1553 	int sent_log = 0;
1554 	uint_t retry = md_retry_cnt;
1555 	set_t	setno;
1556 
1557 	while (mddb_deleterec(recid)) {
1558 		if (! sent_log) {
1559 			cmn_err(CE_WARN,
1560 			    "md: state database delete failed");
1561 			sent_log = 1;
1562 		}
1563 		delay(md_hz);
1564 
1565 		/*
1566 		 * Setting retry cnt to one (pre decremented) so that we
1567 		 * actually do no retries when committing/deleting a mddb rec.
1568 		 * The underlying disk driver does several retries to check
1569 		 * if the disk is really dead or not so there
1570 		 * is no reason for us to retry on top of the drivers retries.
1571 		 */
1572 
1573 		if (--retry == 0) {
1574 			setno = mddb_getsetnum(recid);
1575 			if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
1576 				panic(
1577 				    "md: Panic due to lack of DiskSuite state\n"
1578 				    " database replicas. Fewer than 50%% of "
1579 				    "the total were available,\n so panic to "
1580 				    "ensure data integrity.");
1581 			} else {
1582 				panic("md: state database problem");
1583 			}
1584 			/*NOTREACHED*/
1585 		}
1586 	}
1587 }
1588 
1589 /*
1590  * md_holdset_enter is called in order to hold the set in its
1591  * current state (loaded, unloaded, snarfed, unsnarfed, etc)
1592  * until md_holdset_exit is called.  This is used by the mirror
1593  * code to mark the set as HOLD so that the set won't be
1594  * unloaded while hotspares are being allocated in check_4_hotspares.
1595  * The original fix to the mirror code to hold the set was to call
1596  * md_haltsnarf_enter, but this will block all ioctls and ioctls
1597  * must work for a MN diskset while hotspares are allocated.
1598  */
1599 void
1600 md_holdset_enter(set_t setno)
1601 {
1602 	mutex_enter(&md_mx);
1603 	while (md_set[setno].s_status & MD_SET_HOLD)
1604 		cv_wait(&md_cv, &md_mx);
1605 	md_set[setno].s_status |= MD_SET_HOLD;
1606 	mutex_exit(&md_mx);
1607 }
1608 
1609 void
1610 md_holdset_exit(set_t setno)
1611 {
1612 	mutex_enter(&md_mx);
1613 	md_set[setno].s_status &= ~MD_SET_HOLD;
1614 	cv_broadcast(&md_cv);
1615 	mutex_exit(&md_mx);
1616 }
1617 
1618 /*
1619  * Returns a 0 if this thread marked the set as HOLD (success),
1620  * returns a -1 if set was already marked HOLD (failure).
1621  * Used by the release_set code to see if set is marked HOLD.
1622  * HOLD is set by a daemon when hotspares are being allocated
1623  * to mirror units.
1624  */
1625 int
1626 md_holdset_testandenter(set_t setno)
1627 {
1628 	mutex_enter(&md_mx);
1629 	if (md_set[setno].s_status & MD_SET_HOLD) {
1630 		mutex_exit(&md_mx);
1631 		return (-1);
1632 	}
1633 	md_set[setno].s_status |= MD_SET_HOLD;
1634 	mutex_exit(&md_mx);
1635 	return (0);
1636 }
1637 
1638 void
1639 md_haltsnarf_enter(set_t setno)
1640 {
1641 	mutex_enter(&md_mx);
1642 	while (md_set[setno].s_status & MD_SET_SNARFING)
1643 		cv_wait(&md_cv, &md_mx);
1644 
1645 	md_set[setno].s_status |= MD_SET_SNARFING;
1646 	mutex_exit(&md_mx);
1647 }
1648 
1649 void
1650 md_haltsnarf_exit(set_t setno)
1651 {
1652 	mutex_enter(&md_mx);
1653 	md_set[setno].s_status &= ~MD_SET_SNARFING;
1654 	cv_broadcast(&md_cv);
1655 	mutex_exit(&md_mx);
1656 }
1657 
1658 void
1659 md_haltsnarf_wait(set_t setno)
1660 {
1661 	mutex_enter(&md_mx);
1662 	while (md_set[setno].s_status & MD_SET_SNARFING)
1663 		cv_wait(&md_cv, &md_mx);
1664 	mutex_exit(&md_mx);
1665 }
1666 
1667 /*
1668  * ASSUMED that the md_unit_array_rw WRITER lock is held.
1669  */
1670 int
1671 md_halt_set(set_t setno, enum md_haltcmd cmd)
1672 {
1673 	int	i, err;
1674 
1675 	if (md_set[setno].s_un == NULL || md_set[setno].s_ui == NULL) {
1676 		return (0);
1677 	}
1678 
1679 	if ((cmd == MD_HALT_CHECK) || (cmd == MD_HALT_ALL)) {
1680 		for (i = 0; i < MD_NOPS; i++) {
1681 			if (md_ops[i] == NULL)
1682 				continue;
1683 			if ((*(md_ops[i]->md_halt))(MD_HALT_CLOSE, setno)) {
1684 				for (--i; i > 0; --i) {
1685 					if (md_ops[i] == NULL)
1686 						continue;
1687 					(void) (*(md_ops[i]->md_halt))
1688 					    (MD_HALT_OPEN, setno);
1689 				}
1690 				return (EBUSY);
1691 			}
1692 		}
1693 
1694 		for (i = 0; i < MD_NOPS; i++) {
1695 			if (md_ops[i] == NULL)
1696 				continue;
1697 			if ((*(md_ops[i]->md_halt))(MD_HALT_CHECK, setno)) {
1698 				for (i = 0; i < MD_NOPS; i++) {
1699 					if (md_ops[i] == NULL)
1700 						continue;
1701 					(void) (*(md_ops[i]->md_halt))
1702 					    (MD_HALT_OPEN, setno);
1703 				}
1704 				return (EBUSY);
1705 			}
1706 		}
1707 	}
1708 
1709 	if ((cmd == MD_HALT_DOIT) || (cmd == MD_HALT_ALL)) {
1710 		for (i = 0; i < MD_NOPS; i++) {
1711 			if (md_ops[i] == NULL)
1712 				continue;
1713 			err = (*(md_ops[i]->md_halt))(MD_HALT_DOIT, setno);
1714 			if (err != 0)
1715 				cmn_err(CE_NOTE,
1716 				    "md: halt failed for %s, error %d",
1717 				    md_ops[i]->md_driver.md_drivername, err);
1718 		}
1719 
1720 		/*
1721 		 * Unload the devid namespace if it is loaded
1722 		 */
1723 		md_unload_namespace(setno, NM_DEVID);
1724 		md_unload_namespace(setno, 0L);
1725 		md_clr_setstatus(setno, MD_SET_SNARFED);
1726 	}
1727 
1728 	return (0);
1729 }
1730 
1731 int
1732 md_halt(int global_locks_owned_mask)
1733 {
1734 	set_t			i, j;
1735 	int			err;
1736 	int			init_queues;
1737 	md_requestq_entry_t	*rqp;
1738 	md_ops_t		**pops, *ops, *lops;
1739 	ddi_modhandle_t		mod;
1740 	char			*name;
1741 
1742 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
1743 
1744 	/*
1745 	 * Grab the all of the global locks that are not
1746 	 * already owned to ensure that there isn't another
1747 	 * thread trying to access a global resource
1748 	 * while the halt is in progress
1749 	 */
1750 	if (md_global_lock_enter(global_locks_owned_mask) == EINTR)
1751 		return (EINTR);
1752 
1753 	for (i = 0; i < md_nsets; i++)
1754 		md_haltsnarf_enter(i);
1755 
1756 	/*
1757 	 * Kill the daemon threads.
1758 	 */
1759 	init_queues = ((md_get_status() & MD_GBL_DAEMONS_LIVE) ? FALSE : TRUE);
1760 	md_clr_status(MD_GBL_DAEMONS_LIVE);
1761 	md_set_status(MD_GBL_DAEMONS_DIE);
1762 
1763 	rqp = &md_daemon_queues[0];
1764 	i = 0;
1765 	while (!NULL_REQUESTQ_ENTRY(rqp)) {
1766 		cv_broadcast(&rqp->dispq_headp->a_cv);
1767 		rqp = &md_daemon_queues[++i];
1768 	}
1769 
1770 	mutex_enter(&md_mx);
1771 	while (md_num_daemons != 0) {
1772 		mutex_exit(&md_mx);
1773 		delay(md_hz);
1774 		mutex_enter(&md_mx);
1775 	}
1776 	mutex_exit(&md_mx);
1777 	md_clr_status(MD_GBL_DAEMONS_DIE);
1778 
1779 	for (i = 0; i < md_nsets; i++)
1780 		/*
1781 		 * Only call into md_halt_set if s_un / s_ui are both set.
1782 		 * If they are NULL this set hasn't been accessed, so its
1783 		 * pointless performing the call.
1784 		 */
1785 		if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
1786 			if (md_halt_set(i, MD_HALT_CHECK)) {
1787 				if (md_start_daemons(init_queues))
1788 					cmn_err(CE_WARN,
1789 					    "md: restart of daemon threads "
1790 					    "failed");
1791 				for (j = 0; j < md_nsets; j++)
1792 					md_haltsnarf_exit(j);
1793 
1794 				return (md_global_lock_exit(
1795 				    global_locks_owned_mask, EBUSY,
1796 				    MD_ARRAY_WRITER, NULL));
1797 			}
1798 		}
1799 
1800 	/*
1801 	 * if we get here we are going to do it
1802 	 */
1803 	for (i = 0; i < md_nsets; i++) {
1804 		/*
1805 		 * Only call into md_halt_set if s_un / s_ui are both set.
1806 		 * If they are NULL this set hasn't been accessed, so its
1807 		 * pointless performing the call.
1808 		 */
1809 		if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
1810 			err = md_halt_set(i, MD_HALT_DOIT);
1811 			if (err != 0)
1812 				cmn_err(CE_NOTE,
1813 				    "md: halt failed set %u, error %d",
1814 				    (unsigned)i, err);
1815 		}
1816 	}
1817 
1818 	/*
1819 	 * issue a halt unload to each module to indicate that it
1820 	 * is about to be unloaded.  Each module is called once, set
1821 	 * has no meaning at this point in time.
1822 	 */
1823 	for (i = 0; i < MD_NOPS; i++) {
1824 		if (md_ops[i] == NULL)
1825 			continue;
1826 		err = (*(md_ops[i]->md_halt))(MD_HALT_UNLOAD, 0);
1827 		if (err != 0)
1828 			cmn_err(CE_NOTE,
1829 			    "md: halt failed for %s, error %d",
1830 			    md_ops[i]->md_driver.md_drivername, err);
1831 	}
1832 
1833 	/* ddi_modclose the submodules */
1834 	for (i = 0; i < MD_NOPS; i++) {
1835 		/* skip if not open */
1836 		if ((md_ops[i] == NULL) || (md_mods[i] == NULL))
1837 			continue;
1838 
1839 		/* find and unlink from md_opslist */
1840 		ops = md_ops[i];
1841 		mod = md_mods[i];
1842 		pops = &md_opslist;
1843 		for (lops = *pops; lops;
1844 		    pops = &lops->md_next, lops = *pops) {
1845 			if (lops == ops) {
1846 				*pops = ops->md_next;
1847 				ops->md_next = NULL;
1848 				break;
1849 			}
1850 		}
1851 
1852 		/* uninitialize */
1853 		name = ops->md_driver.md_drivername,
1854 		md_ops[i] = NULL;
1855 		md_mods[i] = NULL;
1856 		ops->md_selfindex = 0;
1857 		ops->md_driver.md_drivername[0] = '\0';
1858 		rw_destroy(&ops->md_link_rw.lock);
1859 
1860 		/* close */
1861 		err = ddi_modclose(mod);
1862 		if (err != 0)
1863 			cmn_err(CE_NOTE,
1864 			    "md: halt close failed for %s, error %d",
1865 			    name ? name : "UNKNOWN", err);
1866 	}
1867 
1868 	/* Unload the database */
1869 	mddb_unload();
1870 
1871 	md_set_status(MD_GBL_HALTED);	/* we are ready to be unloaded */
1872 
1873 	for (i = 0; i < md_nsets; i++)
1874 		md_haltsnarf_exit(i);
1875 
1876 	return (md_global_lock_exit(global_locks_owned_mask, 0,
1877 		MD_ARRAY_WRITER, NULL));
1878 }
1879 
1880 /*
1881  * md_layered_open() is an internal routine only for SVM modules.
1882  * So the input device will be a md_dev64_t, because all SVM modules internally
1883  * work with that device type.
1884  * ddi routines on the other hand work with dev_t. So, if we call any ddi
1885  * routines from here we first have to convert that device into a dev_t.
1886  */
1887 
1888 int
1889 md_layered_open(
1890 	minor_t		mnum,
1891 	md_dev64_t	*dev,
1892 	int		md_oflags
1893 )
1894 {
1895 	int		flag = (FREAD | FWRITE);
1896 	cred_t		*cred_p = kcred;
1897 	major_t		major;
1898 	int		err;
1899 	dev_t		ddi_dev = md_dev64_to_dev(*dev);
1900 
1901 	if (ddi_dev == NODEV)
1902 		return (ENODEV);
1903 
1904 	major = getmajor(ddi_dev);
1905 
1906 	/* metadevice */
1907 	if (major == md_major) {
1908 		mdi_unit_t	*ui;
1909 
1910 		/* open underlying driver */
1911 		mnum = getminor(ddi_dev);
1912 
1913 		ui = MDI_UNIT(mnum);
1914 		if (md_ops[ui->ui_opsindex]->md_open != NULL) {
1915 			int ret = (*md_ops[ui->ui_opsindex]->md_open)(&ddi_dev,
1916 					flag, OTYP_LYR, cred_p, md_oflags);
1917 			/*
1918 			 * As open() may change the device,
1919 			 * send this info back to the caller.
1920 			 */
1921 			*dev = md_expldev(ddi_dev);
1922 			return (ret);
1923 		}
1924 
1925 		/* or do it ourselves */
1926 		(void) md_unit_openclose_enter(ui);
1927 		err = md_unit_incopen(mnum, flag, OTYP_LYR);
1928 		md_unit_openclose_exit(ui);
1929 		/* convert our ddi_dev back to the dev we were given */
1930 		*dev = md_expldev(ddi_dev);
1931 		return (err);
1932 	}
1933 
1934 	/*
1935 	 * Open regular device, since open() may change dev_t give new dev_t
1936 	 * back to the caller.
1937 	 */
1938 	err = dev_lopen(&ddi_dev, flag, OTYP_LYR, cred_p);
1939 	*dev = md_expldev(ddi_dev);
1940 	return (err);
1941 }
1942 
1943 /*
1944  * md_layered_close() is an internal routine only for SVM modules.
1945  * So the input device will be a md_dev64_t, because all SVM modules internally
1946  * work with that device type.
1947  * ddi routines on the other hand work with dev_t. So, if we call any ddi
1948  * routines from here we first have to convert that device into a dev_t.
1949  */
1950 void
1951 md_layered_close(
1952 	md_dev64_t	dev,
1953 	int		md_cflags
1954 )
1955 {
1956 	int		flag = (FREAD | FWRITE);
1957 	cred_t		*cred_p = kcred;
1958 	dev_t		ddi_dev = md_dev64_to_dev(dev);
1959 	major_t		major = getmajor(ddi_dev);
1960 	minor_t		mnum = getminor(ddi_dev);
1961 
1962 	/* metadevice */
1963 	if (major == md_major) {
1964 		mdi_unit_t	*ui = MDI_UNIT(mnum);
1965 
1966 		/* close underlying driver */
1967 		if (md_ops[ui->ui_opsindex]->md_close != NULL) {
1968 			(*md_ops[ui->ui_opsindex]->md_close)
1969 			    (ddi_dev, flag, OTYP_LYR, cred_p, md_cflags);
1970 			return;
1971 		}
1972 
1973 		/* or do it ourselves */
1974 		(void) md_unit_openclose_enter(ui);
1975 		(void) md_unit_decopen(mnum, OTYP_LYR);
1976 		md_unit_openclose_exit(ui);
1977 		return;
1978 	}
1979 
1980 	/* close regular device */
1981 	(void) dev_lclose(ddi_dev, flag, OTYP_LYR, cred_p);
1982 }
1983 
1984 /*
1985  * saves a little code in mdstrategy
1986  */
1987 int
1988 errdone(mdi_unit_t *ui, struct buf *bp, int err)
1989 {
1990 	if ((bp->b_error = err) != 0)
1991 		bp->b_flags |= B_ERROR;
1992 	else
1993 		bp->b_resid = bp->b_bcount;
1994 	md_unit_readerexit(ui);
1995 	md_biodone(bp);
1996 	return (1);
1997 }
1998 
1999 static int	md_write_label = 0;
2000 
2001 int
2002 md_checkbuf(mdi_unit_t *ui, md_unit_t *un, buf_t *bp)
2003 {
2004 	diskaddr_t endblk;
2005 	set_t	setno = MD_UN2SET(un);
2006 
2007 	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
2008 	    (! (bp->b_flags & B_READ)))
2009 		return (errdone(ui, bp, EROFS));
2010 	/*
2011 	 * Check early for unreasonable block number.
2012 	 *
2013 	 * b_blkno is defined as adaddr_t which is typedef'd to a long.
2014 	 * A problem occurs if b_blkno has bit 31 set and un_total_blocks
2015 	 * doesn't, b_blkno is then compared as a negative number which is
2016 	 * always less than a positive.
2017 	 */
2018 	if ((u_longlong_t)bp->b_lblkno > (u_longlong_t)un->c.un_total_blocks)
2019 		return (errdone(ui, bp, EINVAL));
2020 
2021 	if (bp->b_lblkno == un->c.un_total_blocks)
2022 		return (errdone(ui, bp, 0));
2023 
2024 	/*
2025 	 * make sure we don't clobber any labels
2026 	 */
2027 	if ((bp->b_lblkno == 0) && (! (bp->b_flags & B_READ)) &&
2028 	    (un->c.un_flag & MD_LABELED) && (! md_write_label)) {
2029 		cmn_err(CE_NOTE, "md: %s: write to label",
2030 		    md_shortname(getminor(bp->b_edev)));
2031 		return (errdone(ui, bp, EINVAL));
2032 	}
2033 
2034 	bp->b_resid = 0;
2035 	endblk = (diskaddr_t)(bp->b_lblkno +
2036 			howmany(bp->b_bcount, DEV_BSIZE) - 1);
2037 
2038 	if (endblk > (un->c.un_total_blocks - 1)) {
2039 		bp->b_resid = dbtob(endblk - (un->c.un_total_blocks - 1));
2040 		endblk = un->c.un_total_blocks - 1;
2041 		bp->b_bcount -= bp->b_resid;
2042 	}
2043 	return (0);
2044 }
2045 
2046 /*
2047  * init_request_queue: initializes the request queues and creates the threads.
2048  *	return value =  0  :invalid num_threads
2049  *		     =  n   : n is the number of threads created.
2050  */
2051 
2052 int
2053 init_requestq(
2054 	md_requestq_entry_t *rq, /* request queue info */
2055 	void (*threadfn)(),	 /* function to start the thread */
2056 	caddr_t threadfn_args,	 /* args to the function */
2057 	int pri,		 /* thread priority */
2058 	int init_queue)		 /* flag to init queues */
2059 {
2060 	struct mdq_anchor *rqhead;
2061 	int	i;
2062 	int	num_threads;
2063 
2064 
2065 	num_threads = *(rq->num_threadsp);
2066 	rqhead = rq->dispq_headp;
2067 
2068 	if (NULL_REQUESTQ_ENTRY(rq) || num_threads == 0)
2069 		return (0);
2070 
2071 	if (init_queue) {
2072 		rqhead->dq.maxq_len = 0;
2073 		rqhead->dq.treqs = 0;
2074 		rqhead->dq.dq_next = &rqhead->dq;
2075 		rqhead->dq.dq_prev = &rqhead->dq;
2076 		cv_init(&rqhead->a_cv, NULL, CV_DEFAULT, NULL);
2077 		mutex_init(&rqhead->a_mx, NULL, MUTEX_DEFAULT, NULL);
2078 	}
2079 	for (i = 0; i < num_threads; i++) {
2080 		(void) thread_create(NULL, 0, threadfn, threadfn_args, 0, &p0,
2081 		    TS_RUN, pri);
2082 	}
2083 	return (i);
2084 }
2085 
2086 static void
2087 start_daemon(struct mdq_anchor *q)
2088 {
2089 	md_daemon(0, q);
2090 	ASSERT(0);
2091 }
2092 
2093 /*
2094  * Creates all the md daemons.
2095  * Global:
2096  *	md_num_daemons is set to number of daemons.
2097  *	MD_GBL_DAEMONS_LIVE flag set to indicate the daemons are active.
2098  *
2099  * Return value: 0  success
2100  *		 1  failure
2101  */
2102 int
2103 md_start_daemons(int init_queue)
2104 {
2105 	md_requestq_entry_t	*rqp;
2106 	int	cnt;
2107 	int	i;
2108 	int	retval = 0;
2109 
2110 
2111 	if (md_get_status() & MD_GBL_DAEMONS_LIVE) {
2112 		return (retval);
2113 	}
2114 	md_clr_status(MD_GBL_DAEMONS_DIE);
2115 
2116 	rqp = &md_daemon_queues[0];
2117 	i = 0;
2118 	while (!NULL_REQUESTQ_ENTRY(rqp)) {
2119 		cnt = init_requestq(rqp, start_daemon,
2120 			(caddr_t)rqp->dispq_headp, minclsyspri, init_queue);
2121 
2122 		if (cnt && cnt != *rqp->num_threadsp) {
2123 			retval = 1;
2124 			break;
2125 		}
2126 		/*
2127 		 * initialize variables
2128 		 */
2129 		md_num_daemons += cnt;
2130 		rqp = &md_daemon_queues[++i];
2131 	}
2132 
2133 	md_set_status(MD_GBL_DAEMONS_LIVE);
2134 	return (retval);
2135 }
2136 
2137 int
2138 md_loadsubmod(set_t setno, char *name, int drvrid)
2139 {
2140 	ddi_modhandle_t	mod;
2141 	md_ops_t	**pops, *ops;
2142 	int		i, err;
2143 
2144 	/*
2145 	 * See if the submodule is mdopened. If not, i is the index of the
2146 	 * next empty slot.
2147 	 */
2148 	for (i = 0; md_ops[i] != NULL; i++) {
2149 		if (strncmp(name, md_ops[i]->md_driver.md_drivername,
2150 		    MD_DRIVERNAMELEN) == 0)
2151 			return (i);
2152 
2153 		if (i == (MD_NOPS - 1))
2154 			return (-1);
2155 	}
2156 
2157 	if (drvrid < 0) {
2158 		/* Do not try to add any records to the DB when stale. */
2159 		if (md_get_setstatus(setno) & MD_SET_STALE)
2160 			return (-1);
2161 		drvrid = md_setshared_name(setno, name, 0L);
2162 	}
2163 
2164 	if (drvrid < 0)
2165 		return (-1);
2166 
2167 	/* open and import the md_ops of the submodules */
2168 	mod = ddi_modopen(name, KRTLD_MODE_FIRST, &err);
2169 	if (mod == NULL) {
2170 		cmn_err(CE_WARN, "md_loadsubmod: "
2171 		    "unable to ddi_modopen %s, error %d\n", name, err);
2172 		return (-1);
2173 	}
2174 	pops = ddi_modsym(mod, "md_interface_ops", &err);
2175 	if (pops == NULL) {
2176 		cmn_err(CE_WARN, "md_loadsubmod: "
2177 		    "unable to import md_interface_ops from %s, error %d\n",
2178 		    name, err);
2179 		(void) ddi_modclose(mod);
2180 		return (-1);
2181 	}
2182 
2183 	/* ddi_modsym returns pointer to md_interface_ops in submod */
2184 	ops = *pops;
2185 
2186 	/* initialize */
2187 	ops->md_selfindex = i;
2188 	rw_init(&ops->md_link_rw.lock, NULL, RW_DEFAULT, NULL);
2189 	(void) strncpy(ops->md_driver.md_drivername, name,
2190 	    MD_DRIVERNAMELEN);
2191 
2192 	/* plumb */
2193 	md_ops[i] = ops;
2194 	md_mods[i] = mod;
2195 	ops->md_next = md_opslist;
2196 	md_opslist = ops;
2197 
2198 	/* return index */
2199 	return (i);
2200 }
2201 
2202 int
2203 md_getmodindex(md_driver_t *driver, int dont_load, int db_notrequired)
2204 {
2205 	int	i;
2206 	int	modindex;
2207 	char	*name = driver->md_drivername;
2208 	set_t	setno = driver->md_setno;
2209 	int	drvid;
2210 	int	local_dont_load;
2211 
2212 	if (setno >= md_nsets)
2213 		return (-1);
2214 
2215 	for (i = 0; name[i] != 0; i++)
2216 		if (i == (MD_DRIVERNAMELEN -1))
2217 			return (-1);
2218 
2219 	/*
2220 	 * If set is STALE, set local_dont_load to 1 since no records
2221 	 * should be added to DB when stale.
2222 	 */
2223 	if (md_get_setstatus(setno) & MD_SET_STALE) {
2224 		local_dont_load = 1;
2225 	} else {
2226 		local_dont_load = dont_load;
2227 	}
2228 
2229 	/*
2230 	 * Single thread ioctl module binding with respect to
2231 	 * similar code executed in md_loadsubmod that is called
2232 	 * from md_snarf_db_set (which is where that path does
2233 	 * its md_haltsnarf_enter call).
2234 	 */
2235 	md_haltsnarf_enter(setno);
2236 
2237 	/* See if the submodule is already ddi_modopened. */
2238 	for (i = 0; md_ops[i] != NULL; i++) {
2239 		if (strncmp(name, md_ops[i]->md_driver.md_drivername,
2240 		    MD_DRIVERNAMELEN) == 0) {
2241 			if (! local_dont_load &&
2242 			    (md_getshared_key(setno, name) == MD_KEYBAD)) {
2243 				if (md_setshared_name(setno, name, 0L)
2244 				    == MD_KEYBAD) {
2245 					if (!db_notrequired)
2246 						goto err;
2247 				}
2248 			}
2249 			md_haltsnarf_exit(setno);
2250 			return (i);
2251 		}
2252 
2253 		if (i == (MD_NOPS -1))
2254 			break;
2255 	}
2256 
2257 	if (local_dont_load)
2258 		goto err;
2259 
2260 	drvid = ((db_notrequired) ? 0 : (int)md_getshared_key(setno, name));
2261 
2262 	/* ddi_modopen the submodule */
2263 	modindex = md_loadsubmod(setno, name, drvid);
2264 	if (modindex < 0)
2265 		goto err;
2266 
2267 	if (md_ops[modindex]->md_snarf != NULL)
2268 		(*(md_ops[modindex]->md_snarf))(MD_SNARF_DOIT, setno);
2269 
2270 	md_haltsnarf_exit(setno);
2271 	return (modindex);
2272 
2273 err:	md_haltsnarf_exit(setno);
2274 	return (-1);
2275 }
2276 
2277 void
2278 md_call_strategy(buf_t *bp, int flags, void *private)
2279 {
2280 	mdi_unit_t	*ui;
2281 
2282 	if (mdv_strategy_tstpnt)
2283 		if ((*mdv_strategy_tstpnt)(bp, flags, private) != 0)
2284 			return;
2285 	if (getmajor(bp->b_edev) != md_major) {
2286 		(void) bdev_strategy(bp);
2287 		return;
2288 	}
2289 
2290 	flags = (flags & MD_STR_PASSEDON) | MD_STR_NOTTOP;
2291 	ui = MDI_UNIT(getminor(bp->b_edev));
2292 	ASSERT(ui != NULL);
2293 	(*md_ops[ui->ui_opsindex]->md_strategy)(bp, flags, private);
2294 }
2295 
2296 /*
2297  * md_call_ioctl:
2298  * -------------
2299  * Issue the specified ioctl to the device associated with the given md_dev64_t
2300  *
2301  * Arguments:
2302  *	dev	- underlying device [md_dev64_t]
2303  *	cmd	- ioctl to perform
2304  *	data	- arguments / result location
2305  *	mode	- read/write/layered ioctl
2306  *	lockp	- lock reference
2307  *
2308  * Returns:
2309  *	0	success
2310  *	!=0	Failure (error code)
2311  */
2312 int
2313 md_call_ioctl(md_dev64_t dev, int cmd, void *data, int mode, IOLOCK *lockp)
2314 {
2315 	dev_t		device = md_dev64_to_dev(dev);
2316 	int		rval;
2317 	mdi_unit_t	*ui;
2318 
2319 	/*
2320 	 * See if device is a metadevice. If not call cdev_ioctl(), otherwise
2321 	 * call the ioctl entry-point in the metadevice.
2322 	 */
2323 	if (md_getmajor(dev) != md_major) {
2324 		int	rv;
2325 		rval = cdev_ioctl(device, cmd, (intptr_t)data, mode,
2326 		    ddi_get_cred(), &rv);
2327 	} else {
2328 		ui = MDI_UNIT(md_getminor(dev));
2329 		ASSERT(ui != NULL);
2330 		rval = (*md_ops[ui->ui_opsindex]->md_ioctl)(device, cmd, data,
2331 		    mode, lockp);
2332 	}
2333 	return (rval);
2334 }
2335 
2336 void
2337 md_rem_link(set_t setno, int id, krwlock_t *rw, md_link_t **head)
2338 {
2339 	md_link_t	*next;
2340 	md_link_t	**pprev;
2341 
2342 	rw_enter(rw, RW_WRITER);
2343 
2344 	next = *head;
2345 	pprev = head;
2346 	while (next) {
2347 		if ((next->ln_setno == setno) && (next->ln_id == id)) {
2348 			*pprev = next->ln_next;
2349 			rw_exit(rw);
2350 			return;
2351 		}
2352 		pprev = &next->ln_next;
2353 		next = next->ln_next;
2354 	}
2355 
2356 	rw_exit(rw);
2357 }
2358 
2359 int
2360 md_dev_exists(md_dev64_t dev)
2361 {
2362 
2363 	if (dev == NODEV64)
2364 		return (0);
2365 
2366 	if (strcmp(ddi_major_to_name(md_getmajor(dev)), "md") != 0)
2367 		return (1);
2368 
2369 	if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) ||
2370 	    (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits))
2371 		return (0);
2372 
2373 	if (MDI_UNIT(md_getminor(dev)) != NULL)
2374 		return (1);
2375 
2376 	return (0);
2377 }
2378 
2379 md_parent_t
2380 md_get_parent(md_dev64_t dev)
2381 {
2382 	md_unit_t	*un;
2383 	mdi_unit_t	*ui;
2384 	md_parent_t	parent;
2385 
2386 	if (md_getmajor(dev) != md_major)
2387 		return (MD_NO_PARENT);
2388 
2389 	ui = MDI_UNIT(md_getminor(dev));
2390 
2391 	un = (md_unit_t *)md_unit_readerlock(ui);
2392 	parent = un->c.un_parent;
2393 	md_unit_readerexit(ui);
2394 
2395 	return (parent);
2396 }
2397 
2398 void
2399 md_set_parent(md_dev64_t dev, md_parent_t parent)
2400 {
2401 	md_unit_t	*un;
2402 	mdi_unit_t	*ui;
2403 
2404 	if (md_getmajor(dev) != md_major)
2405 		return;
2406 
2407 	ui = MDI_UNIT(md_getminor(dev));
2408 
2409 	un = (md_unit_t *)md_unit_readerlock(ui);
2410 	un->c.un_parent = parent;
2411 	md_unit_readerexit(ui);
2412 }
2413 
2414 void
2415 md_reset_parent(md_dev64_t dev)
2416 {
2417 	md_unit_t	*un;
2418 	mdi_unit_t	*ui;
2419 
2420 	if (md_getmajor(dev) != md_major)
2421 		return;
2422 
2423 	ui = MDI_UNIT(md_getminor(dev));
2424 
2425 	un = (md_unit_t *)md_unit_readerlock(ui);
2426 	un->c.un_parent = MD_NO_PARENT;
2427 	md_unit_readerexit(ui);
2428 }
2429 
2430 
2431 static intptr_t (*hot_spare_interface)() = (intptr_t (*)())NULL;
2432 
2433 int
2434 md_hot_spare_ifc(
2435 	hs_cmds_t	cmd,
2436 	mddb_recid_t	id,
2437 	u_longlong_t	size,
2438 	int		labeled,
2439 	mddb_recid_t	*hs_id,
2440 	mdkey_t		*key,
2441 	md_dev64_t	*dev,
2442 	diskaddr_t	*sblock)
2443 {
2444 	int		err;
2445 
2446 	/*
2447 	 * RW lock on hot_spare_interface. We don't want it to change from
2448 	 * underneath us. If hot_spare_interface is NULL we're going to
2449 	 * need to set it. So we need to upgrade to a WRITER lock. If that
2450 	 * doesn't work, we drop the lock and reenter as WRITER. This leaves
2451 	 * a small hole during which hot_spare_interface could be modified
2452 	 * so we check it for NULL again. What a pain. Then if still null
2453 	 * load from md_get_named_service.
2454 	 */
2455 
2456 	rw_enter(&hsp_rwlp.lock, RW_READER);
2457 	if (hot_spare_interface == NULL) {
2458 		if (rw_tryupgrade(&hsp_rwlp.lock) == 0) {
2459 			rw_exit(&hsp_rwlp.lock);
2460 			rw_enter(&hsp_rwlp.lock, RW_WRITER);
2461 			if (hot_spare_interface != NULL) {
2462 				err = ((*hot_spare_interface)
2463 				    (cmd, id, size, labeled, hs_id, key, dev,
2464 				    sblock));
2465 				rw_exit(&hsp_rwlp.lock);
2466 				return (err);
2467 			}
2468 		}
2469 		hot_spare_interface = md_get_named_service(NODEV64, ANY_SERVICE,
2470 		    "hot spare interface", 0);
2471 		rw_downgrade(&hsp_rwlp.lock);
2472 	}
2473 
2474 	if (hot_spare_interface == NULL) {
2475 		cmn_err(CE_WARN, "md: no hotspare interface");
2476 		rw_exit(&hsp_rwlp.lock);
2477 		return (0);
2478 	}
2479 
2480 	err = ((*hot_spare_interface)
2481 	    (cmd, id, size, labeled, hs_id, key, dev, sblock));
2482 	rw_exit(&hsp_rwlp.lock);
2483 	return (err);
2484 }
2485 
2486 void
2487 md_clear_hot_spare_interface()
2488 {
2489 	rw_enter(&hsp_rwlp.lock, RW_WRITER);
2490 	hot_spare_interface = NULL;
2491 	rw_exit(&hsp_rwlp.lock);
2492 }
2493 
2494 
2495 static intptr_t (*notify_interface)() = (intptr_t (*)())NULL;
2496 
2497 int
2498 md_notify_interface(
2499 	md_event_cmds_t cmd,
2500 	md_tags_t	tag,
2501 	set_t		set,
2502 	md_dev64_t	dev,
2503 	md_event_type_t event
2504 )
2505 {
2506 	int		err;
2507 
2508 	if (md_event_queue == NULL)
2509 		return (0);
2510 	rw_enter(&ni_rwlp.lock, RW_READER);
2511 	if (notify_interface == NULL) {
2512 		if (rw_tryupgrade(&ni_rwlp.lock) == 0) {
2513 			rw_exit(&ni_rwlp.lock);
2514 			rw_enter(&ni_rwlp.lock, RW_WRITER);
2515 			if (notify_interface != NULL) {
2516 				err = ((*notify_interface)
2517 				    (cmd, tag, set, dev, event));
2518 				rw_exit(&ni_rwlp.lock);
2519 				return (err);
2520 			}
2521 		}
2522 		notify_interface = md_get_named_service(NODEV64, ANY_SERVICE,
2523 		    "notify interface", 0);
2524 		rw_downgrade(&ni_rwlp.lock);
2525 	}
2526 	if (notify_interface == NULL) {
2527 		cmn_err(CE_WARN, "md: no notify interface");
2528 		rw_exit(&ni_rwlp.lock);
2529 		return (0);
2530 	}
2531 	err = ((*notify_interface)(cmd, tag, set, dev, event));
2532 	rw_exit(&ni_rwlp.lock);
2533 	return (err);
2534 }
2535 
2536 char *
2537 obj2devname(uint32_t tag, uint_t setno, md_dev64_t dev)
2538 {
2539 	char		*setname;
2540 	char		name[MD_MAX_CTDLEN];
2541 	minor_t		mnum = md_getminor(dev);
2542 	major_t		maj = md_getmajor(dev);
2543 	int		rtn = 0;
2544 
2545 	/*
2546 	 * Verify that the passed dev_t refers to a valid metadevice.
2547 	 * If it doesn't we can make no assumptions as to what the device
2548 	 * name is. Return NULL in these cases.
2549 	 */
2550 	if (((maj != md_major) || (MD_MIN2UNIT(mnum) >= md_nunits)) ||
2551 	    (MD_MIN2SET(mnum) >= md_nsets)) {
2552 		return (NULL);
2553 	}
2554 
2555 	setname = NULL;
2556 	name[0] = '\0';
2557 	switch (tag) {
2558 	case SVM_TAG_HSP:
2559 		if (setno == 0) {
2560 			rtn = snprintf(name, sizeof (name), "hsp%u",
2561 			    (unsigned)MD_MIN2UNIT(mnum));
2562 		} else {
2563 			setname = mddb_getsetname(setno);
2564 			if (setname != NULL) {
2565 				rtn = snprintf(name, sizeof (name), "%s/hsp%u",
2566 				    setname, (unsigned)MD_MIN2UNIT(mnum));
2567 			}
2568 		}
2569 		break;
2570 	case SVM_TAG_DRIVE:
2571 		(void) sprintf(name, "drive");
2572 		break;
2573 	case SVM_TAG_HOST:
2574 		(void) sprintf(name, "host");
2575 		break;
2576 	case SVM_TAG_SET:
2577 		rtn = snprintf(name, sizeof (name), "%s",
2578 		    mddb_getsetname(setno));
2579 		if ((name[0] == '\0') || (rtn >= sizeof (name))) {
2580 			(void) sprintf(name, "diskset");
2581 			rtn = 0;
2582 		}
2583 		break;
2584 	default:
2585 		rtn = snprintf(name, sizeof (name), "%s", md_shortname(mnum));
2586 		break;
2587 	}
2588 
2589 	/* Check if we got any rubbish for any of the snprintf's */
2590 	if ((name[0] == '\0') || (rtn >= sizeof (name))) {
2591 		return (NULL);
2592 	}
2593 
2594 	return (md_strdup(name));
2595 }
2596 
2597 /* Sysevent subclass and mdnotify event type pairs */
2598 struct node {
2599 	char		*se_ev;
2600 	md_event_type_t	md_ev;
2601 };
2602 
2603 /*
2604  * Table must be sorted in case sensitive ascending order of
2605  * the sysevents values
2606  */
2607 static struct node ev_table[] = {
2608 	{ ESC_SVM_ADD,			EQ_ADD },
2609 	{ ESC_SVM_ATTACH,		EQ_ATTACH },
2610 	{ ESC_SVM_ATTACHING,		EQ_ATTACHING },
2611 	{ ESC_SVM_CHANGE,		EQ_CHANGE },
2612 	{ ESC_SVM_CREATE,		EQ_CREATE },
2613 	{ ESC_SVM_DELETE,		EQ_DELETE },
2614 	{ ESC_SVM_DETACH,		EQ_DETACH },
2615 	{ ESC_SVM_DETACHING,		EQ_DETACHING },
2616 	{ ESC_SVM_DRIVE_ADD,		EQ_DRIVE_ADD },
2617 	{ ESC_SVM_DRIVE_DELETE,		EQ_DRIVE_DELETE },
2618 	{ ESC_SVM_ENABLE,		EQ_ENABLE },
2619 	{ ESC_SVM_ERRED,		EQ_ERRED },
2620 	{ ESC_SVM_EXCHANGE,		EQ_EXCHANGE },
2621 	{ ESC_SVM_GROW,			EQ_GROW },
2622 	{ ESC_SVM_HS_CHANGED,		EQ_HS_CHANGED },
2623 	{ ESC_SVM_HS_FREED,		EQ_HS_FREED },
2624 	{ ESC_SVM_HOST_ADD,		EQ_HOST_ADD },
2625 	{ ESC_SVM_HOST_DELETE,		EQ_HOST_DELETE },
2626 	{ ESC_SVM_HOTSPARED,		EQ_HOTSPARED },
2627 	{ ESC_SVM_INIT_FAILED,		EQ_INIT_FAILED },
2628 	{ ESC_SVM_INIT_FATAL,		EQ_INIT_FATAL },
2629 	{ ESC_SVM_INIT_START,		EQ_INIT_START },
2630 	{ ESC_SVM_INIT_SUCCESS,		EQ_INIT_SUCCESS },
2631 	{ ESC_SVM_IOERR,		EQ_IOERR },
2632 	{ ESC_SVM_LASTERRED,		EQ_LASTERRED },
2633 	{ ESC_SVM_MEDIATOR_ADD,		EQ_MEDIATOR_ADD },
2634 	{ ESC_SVM_MEDIATOR_DELETE,	EQ_MEDIATOR_DELETE },
2635 	{ ESC_SVM_OFFLINE,		EQ_OFFLINE },
2636 	{ ESC_SVM_OK,			EQ_OK },
2637 	{ ESC_SVM_ONLINE,		EQ_ONLINE },
2638 	{ ESC_SVM_OPEN_FAIL,		EQ_OPEN_FAIL },
2639 	{ ESC_SVM_REGEN_DONE,		EQ_REGEN_DONE },
2640 	{ ESC_SVM_REGEN_FAILED,		EQ_REGEN_FAILED },
2641 	{ ESC_SVM_REGEN_START,		EQ_REGEN_START },
2642 	{ ESC_SVM_RELEASE,		EQ_RELEASE },
2643 	{ ESC_SVM_REMOVE,		EQ_REMOVE },
2644 	{ ESC_SVM_RENAME_DST,		EQ_RENAME_DST },
2645 	{ ESC_SVM_RENAME_SRC,		EQ_RENAME_SRC },
2646 	{ ESC_SVM_REPLACE,		EQ_REPLACE },
2647 	{ ESC_SVM_RESYNC_DONE,		EQ_RESYNC_DONE },
2648 	{ ESC_SVM_RESYNC_FAILED,	EQ_RESYNC_FAILED },
2649 	{ ESC_SVM_RESYNC_START,		EQ_RESYNC_START },
2650 	{ ESC_SVM_RESYNC_SUCCESS,	EQ_RESYNC_SUCCESS },
2651 	{ ESC_SVM_TAKEOVER,		EQ_TAKEOVER }
2652 };
2653 
2654 static md_tags_t md_tags[] = {
2655 	TAG_UNK,
2656 	TAG_METADEVICE,
2657 	TAG_UNK,
2658 	TAG_UNK,
2659 	TAG_UNK,
2660 	TAG_UNK,
2661 	TAG_REPLICA,
2662 	TAG_HSP,
2663 	TAG_HS,
2664 	TAG_SET,
2665 	TAG_DRIVE,
2666 	TAG_HOST,
2667 	TAG_MEDIATOR
2668 };
2669 
2670 md_event_type_t
2671 ev_get(char *subclass)
2672 {
2673 	int	high, mid, low, p;
2674 
2675 	low = 0;
2676 	high = (sizeof (ev_table) / sizeof (ev_table[0])) - 1;
2677 	while (low <= high) {
2678 		mid = (high + low) / 2;
2679 		p = strcmp(subclass, ev_table[mid].se_ev);
2680 		if (p == 0) {
2681 			return (ev_table[mid].md_ev);
2682 		} else if (p < 0) {
2683 			high = mid - 1;
2684 		} else {
2685 			low = mid + 1;
2686 		}
2687 	}
2688 
2689 	return (EQ_EMPTY);
2690 }
2691 
2692 /*
2693  * Log mdnotify event
2694  */
2695 void
2696 do_mdnotify(char *se_subclass, uint32_t tag, set_t setno, md_dev64_t devid)
2697 {
2698 	md_event_type_t	ev_type;
2699 	md_tags_t	md_tag;
2700 
2701 	/* Translate sysevent into mdnotify event */
2702 	ev_type = ev_get(se_subclass);
2703 
2704 	if (tag >= (sizeof (md_tags) / sizeof (md_tags[0]))) {
2705 		md_tag = TAG_UNK;
2706 	} else {
2707 		md_tag = md_tags[tag];
2708 	}
2709 
2710 	NOTIFY_MD(md_tag, setno, devid, ev_type);
2711 }
2712 
2713 /*
2714  * Log SVM sys events
2715  */
2716 void
2717 svm_gen_sysevent(
2718 	char		*se_class,
2719 	char		*se_subclass,
2720 	uint32_t	tag,
2721 	set_t		setno,
2722 	md_dev64_t	devid
2723 )
2724 {
2725 	nvlist_t		*attr_list;
2726 	sysevent_id_t		eid;
2727 	int			err = DDI_SUCCESS;
2728 	char			*devname;
2729 	extern dev_info_t	*md_devinfo;
2730 
2731 	/* Raise the mdnotify event before anything else */
2732 	do_mdnotify(se_subclass, tag, setno, devid);
2733 
2734 	if (md_devinfo == NULL) {
2735 		return;
2736 	}
2737 
2738 	err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_NOSLEEP);
2739 
2740 	if (err == DDI_SUCCESS) {
2741 		/* Add the version numver */
2742 		err = nvlist_add_uint32(attr_list, SVM_VERSION_NO,
2743 		    (uint32_t)SVM_VERSION);
2744 		if (err != DDI_SUCCESS) {
2745 			goto fail;
2746 		}
2747 
2748 		/* Add the tag attribute */
2749 		err = nvlist_add_uint32(attr_list, SVM_TAG, (uint32_t)tag);
2750 		if (err != DDI_SUCCESS) {
2751 			goto fail;
2752 		}
2753 
2754 		/* Add the set number attribute */
2755 		err = nvlist_add_uint32(attr_list, SVM_SET_NO, (uint32_t)setno);
2756 		if (err != DDI_SUCCESS) {
2757 			goto fail;
2758 		}
2759 
2760 		/* Add the device id attribute */
2761 		err = nvlist_add_uint64(attr_list, SVM_DEV_ID, (uint64_t)devid);
2762 		if (err != DDI_SUCCESS) {
2763 			goto fail;
2764 		}
2765 
2766 		/* Add the device name attribute */
2767 		devname = obj2devname(tag, setno, devid);
2768 		if (devname != NULL) {
2769 			err = nvlist_add_string(attr_list, SVM_DEV_NAME,
2770 			    devname);
2771 			freestr(devname);
2772 		} else {
2773 			err = nvlist_add_string(attr_list, SVM_DEV_NAME,
2774 			    "unspecified");
2775 		}
2776 		if (err != DDI_SUCCESS) {
2777 			goto fail;
2778 		}
2779 
2780 		/* Attempt to post event */
2781 		err = ddi_log_sysevent(md_devinfo, DDI_VENDOR_SUNW, se_class,
2782 		    se_subclass, attr_list, &eid, DDI_SLEEP);
2783 
2784 		nvlist_free(attr_list);
2785 		if (err != DDI_SUCCESS) {
2786 			cmn_err(CE_WARN, "Failed to log event for %s, %s,"
2787 			    " err=%x", se_class, se_subclass, err);
2788 		}
2789 	}
2790 
2791 	return;
2792 
2793 fail:
2794 	nvlist_free(attr_list);
2795 	cmn_err(CE_WARN, "Failed to setup attributes for event %s, %s, err=%x",
2796 	    se_class, se_subclass, err);
2797 }
2798 
2799 void
2800 md_clear_named_service()
2801 {
2802 	rw_enter(&ni_rwlp.lock, RW_WRITER);
2803 	notify_interface = NULL;
2804 	rw_exit(&ni_rwlp.lock);
2805 }
2806 
2807 void
2808 md_create_unit_incore(minor_t mnum, md_ops_t *ops, int alloc_lock)
2809 {
2810 	mdi_unit_t	*ui;
2811 	set_t		setno = MD_MIN2SET(mnum);
2812 
2813 	ui = (mdi_unit_t *)kmem_zalloc(sizeof (mdi_unit_t), KM_SLEEP);
2814 	ui->ui_opsindex = ops->md_selfindex;
2815 
2816 	/* initialize all the incore conditional variables */
2817 	mutex_init(&ui->ui_mx, NULL, MUTEX_DEFAULT, NULL);
2818 	cv_init(&ui->ui_cv, NULL, CV_DEFAULT, NULL);
2819 
2820 	if (! (md_get_setstatus(setno) & MD_SET_SNARFING)) {
2821 		rw_enter(&md_unit_array_rw.lock, RW_WRITER);
2822 		MDI_VOIDUNIT(mnum) = (void *) ui;
2823 		rw_exit(&md_unit_array_rw.lock);
2824 	} else
2825 		MDI_VOIDUNIT(mnum) = (void *) ui;
2826 
2827 	rw_enter(&ops->md_link_rw.lock, RW_WRITER);
2828 	ui->ui_link.ln_next = ops->md_head;
2829 	ui->ui_link.ln_setno = setno;
2830 	ui->ui_link.ln_id = mnum;
2831 	ops->md_head = &ui->ui_link;
2832 	if (alloc_lock) {
2833 		ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP);
2834 		mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL);
2835 		cv_init(&ui->ui_io_lock->io_cv, NULL, CV_DEFAULT, NULL);
2836 		mutex_init(&ui->ui_io_lock->io_list_mutex, NULL,
2837 		    MUTEX_DEFAULT, NULL);
2838 		ui->ui_io_lock->io_list_front = NULL;
2839 		ui->ui_io_lock->io_list_back = NULL;
2840 	}
2841 	/* setup the unavailable field */
2842 #if defined(_ILP32)
2843 	if (((md_unit_t *)MD_UNIT(mnum))->c.un_revision & MD_64BIT_META_DEV) {
2844 		ui->ui_tstate |= MD_64MD_ON_32KERNEL;
2845 		cmn_err(CE_NOTE, "d%d is unavailable because 64 bit "
2846 		    "metadevices are not accessible on a 32 bit kernel",
2847 		    mnum);
2848 	}
2849 #endif
2850 
2851 	rw_exit(&ops->md_link_rw.lock);
2852 }
2853 
2854 void
2855 md_destroy_unit_incore(minor_t mnum, md_ops_t *ops)
2856 {
2857 	mdi_unit_t	*ui;
2858 
2859 	/*
2860 	 * ASSUMPTION: md_unit_array_rw WRITER lock is held.
2861 	 */
2862 	ui = MDI_UNIT(mnum);
2863 	if (ui == NULL)
2864 		return;
2865 
2866 	md_rem_link(MD_MIN2SET(mnum), mnum, &ops->md_link_rw.lock,
2867 	    &ops->md_head);
2868 
2869 	/* destroy the io lock if one is being used */
2870 	if (ui->ui_io_lock) {
2871 		mutex_destroy(&ui->ui_io_lock->io_mx);
2872 		cv_destroy(&ui->ui_io_lock->io_cv);
2873 		kmem_free(ui->ui_io_lock, sizeof (md_io_lock_t));
2874 	}
2875 
2876 	/* teardown kstat */
2877 	md_kstat_destroy(mnum);
2878 
2879 	/* destroy all the incore conditional variables */
2880 	mutex_destroy(&ui->ui_mx);
2881 	cv_destroy(&ui->ui_cv);
2882 
2883 	kmem_free(ui, sizeof (mdi_unit_t));
2884 	MDI_VOIDUNIT(mnum) = (void *) NULL;
2885 }
2886 
2887 void
2888 md_rem_names(sv_dev_t *sv, int nsv)
2889 {
2890 	int	i, s;
2891 	int	max_sides;
2892 
2893 	if (nsv == 0)
2894 		return;
2895 
2896 	/* All entries removed are in the same diskset */
2897 	if (md_get_setstatus(sv[0].setno) & MD_SET_MNSET)
2898 		max_sides = MD_MNMAXSIDES;
2899 	else
2900 		max_sides = MD_MAXSIDES;
2901 
2902 	for (i = 0; i < nsv; i++)
2903 		for (s = 0; s < max_sides; s++)
2904 			(void) md_remdevname(sv[i].setno, s, sv[i].key);
2905 }
2906 
2907 /*
2908  * Checking user args before we get into physio - returns 0 for ok, else errno
2909  * We do a lot of checking against illegal arguments here because some of the
2910  * real disk drivers don't like certain kinds of arguments. (e.g xy doesn't
2911  * like odd address user buffer.) Those drivers capture bad arguments in
2912  * xxread and xxwrite. But since meta-driver calls their strategy routines
2913  * directly, two bad scenario might happen:
2914  *	1. the real strategy doesn't like it and panic.
2915  *	2. the real strategy doesn't like it and set B_ERROR.
2916  *
2917  * The second case is no better than the first one, since the meta-driver
2918  * will treat it as a media-error and off line the mirror metapartition.
2919  * (Too bad there is no way to tell what error it is.)
2920  *
2921  */
2922 int
2923 md_chk_uio(struct uio *uio)
2924 {
2925 	int	i;
2926 	struct iovec *iov;
2927 
2928 	/*
2929 	 * Check for negative or not block-aligned offset
2930 	 */
2931 	if ((uio->uio_loffset < 0) ||
2932 	    ((uio->uio_loffset & (DEV_BSIZE - 1)) != 0)) {
2933 		return (EINVAL);
2934 	}
2935 	iov = uio->uio_iov;
2936 	i = uio->uio_iovcnt;
2937 
2938 	while (i--) {
2939 		if ((iov->iov_len & (DEV_BSIZE - 1)) != 0)
2940 			return (EINVAL);
2941 		/*
2942 		 * Bug # 1212146
2943 		 * The default is to not check alignment, but we can now check
2944 		 * for a larger number of alignments if desired.
2945 		 */
2946 		if ((uintptr_t)(iov->iov_base) & md_uio_alignment_mask)
2947 			return (EINVAL);
2948 		iov++;
2949 	}
2950 	return (0);
2951 }
2952 
2953 char *
2954 md_shortname(
2955 	minor_t		mnum
2956 )
2957 {
2958 	static char	buf[MAXPATHLEN];
2959 	char		*devname;
2960 	char		*invalid = " (Invalid minor number %u) ";
2961 	char		*metaname;
2962 	mdc_unit_t	*un;
2963 	side_t		side;
2964 	set_t		setno = MD_MIN2SET(mnum);
2965 	unit_t		unit = MD_MIN2UNIT(mnum);
2966 
2967 	if ((un = MD_UNIT(mnum)) == NULL) {
2968 		(void) snprintf(buf, sizeof (buf), invalid, mnum);
2969 		return (buf);
2970 	}
2971 
2972 	/*
2973 	 * If unit is not a friendly name unit, derive the name from the
2974 	 * minor number.
2975 	 */
2976 	if ((un->un_revision & MD_FN_META_DEV) == 0) {
2977 		/* This is a traditional metadevice */
2978 		if (setno == MD_LOCAL_SET) {
2979 			(void) snprintf(buf, sizeof (buf), "d%u",
2980 				(unsigned)unit);
2981 		} else {
2982 			(void) snprintf(buf, sizeof (buf), "%s/d%u",
2983 			    mddb_getsetname(setno), (unsigned)unit);
2984 		}
2985 		return (buf);
2986 	}
2987 
2988 	/*
2989 	 * It is a friendly name metadevice, so we need to get its name.
2990 	 */
2991 	side = mddb_getsidenum(setno);
2992 	devname = (char *)kmem_alloc(MAXPATHLEN, KM_SLEEP);
2993 	if (md_getdevname(setno, side, MD_KEYWILD,
2994 		md_makedevice(md_major, mnum), devname, MAXPATHLEN) == 0) {
2995 		/*
2996 		 * md_getdevname has given us either /dev/md/dsk/<metaname>
2997 		 * or /dev/md/<setname>/dsk/<metname> depending on whether
2998 		 * or not we are in the local set.  Thus, we'll pull the
2999 		 * metaname from this string.
3000 		 */
3001 		if ((metaname = strrchr(devname, '/')) == NULL) {
3002 			(void) snprintf(buf, sizeof (buf), invalid, mnum);
3003 			goto out;
3004 		}
3005 		metaname++;	/* move past slash */
3006 		if (setno == MD_LOCAL_SET) {
3007 			/* No set name. */
3008 			(void) snprintf(buf, sizeof (buf), "%s", metaname);
3009 		} else {
3010 			/* Include setname */
3011 			(void) snprintf(buf, sizeof (buf), "%s/%s",
3012 				mddb_getsetname(setno), metaname);
3013 		}
3014 	} else {
3015 		/* We couldn't find the name. */
3016 		(void) snprintf(buf, sizeof (buf), invalid, mnum);
3017 	}
3018 
3019 out:
3020 	kmem_free(devname, MAXPATHLEN);
3021 	return (buf);
3022 }
3023 
3024 char *
3025 md_devname(
3026 	set_t		setno,
3027 	md_dev64_t	dev,
3028 	char		*buf,
3029 	size_t		size
3030 )
3031 {
3032 	static char	mybuf[MD_MAX_CTDLEN];
3033 	int		err;
3034 
3035 	if (buf == NULL) {
3036 		buf = mybuf;
3037 		size = sizeof (mybuf);
3038 	} else {
3039 		ASSERT(size >= MD_MAX_CTDLEN);
3040 	}
3041 
3042 	err = md_getdevname(setno, mddb_getsidenum(setno),
3043 		0, dev, buf, size);
3044 	if (err) {
3045 		if (err == ENOENT) {
3046 			(void) sprintf(buf, "(Unavailable)");
3047 		} else {
3048 			(void) sprintf(buf, "(%u.%u)",
3049 			    md_getmajor(dev), md_getminor(dev));
3050 		}
3051 	}
3052 
3053 	return (buf);
3054 }
3055 void
3056 md_minphys(buf_t *pb)
3057 {
3058 	extern unsigned md_maxbcount;
3059 
3060 	if (pb->b_bcount > md_maxbcount)
3061 		pb->b_bcount = md_maxbcount;
3062 }
3063 
3064 void
3065 md_bioinit(struct buf *bp)
3066 {
3067 	ASSERT(bp);
3068 
3069 	bioinit(bp);
3070 	bp->b_back = bp;
3071 	bp->b_forw = bp;
3072 	bp->b_flags = B_BUSY;	/* initialize flags */
3073 }
3074 
3075 void
3076 md_bioreset(struct buf *bp)
3077 {
3078 	ASSERT(bp);
3079 
3080 	bioreset(bp);
3081 	bp->b_back = bp;
3082 	bp->b_forw = bp;
3083 	bp->b_flags = B_BUSY;	/* initialize flags */
3084 }
3085 
3086 /*
3087  * md_bioclone is needed as long as the real bioclone only takes a daddr_t
3088  * as block number.
3089  * We simply call bioclone with all input parameters but blkno, and set the
3090  * correct blkno afterwards.
3091  * Caveat Emptor: bp_mem must not be NULL!
3092  */
3093 buf_t *
3094 md_bioclone(buf_t *bp, off_t off, size_t len, dev_t dev, diskaddr_t blkno,
3095 		int (*iodone)(buf_t *), buf_t *bp_mem, int sleep)
3096 {
3097 	(void) bioclone(bp, off, len, dev, 0, iodone, bp_mem, sleep);
3098 	bp_mem->b_lblkno = blkno;
3099 	return (bp_mem);
3100 }
3101 
3102 
3103 /*
3104  * kstat stuff
3105  */
3106 void
3107 md_kstat_init_ui(
3108 	minor_t		 mnum,
3109 	mdi_unit_t	*ui
3110 )
3111 {
3112 	if ((ui != NULL) && (ui->ui_kstat == NULL)) {
3113 		set_t	setno = MD_MIN2SET(mnum);
3114 		unit_t  unit = MD_MIN2UNIT(mnum);
3115 		char	module[KSTAT_STRLEN];
3116 		char	*p = module;
3117 
3118 		if (setno != MD_LOCAL_SET) {
3119 			char	buf[64];
3120 			char	*s = buf;
3121 			char	*e = module + sizeof (module) - 4;
3122 
3123 			(void) sprintf(buf, "%u", setno);
3124 			while ((p < e) && (*s != '\0'))
3125 				*p++ = *s++;
3126 			*p++ = '/';
3127 		}
3128 		*p++ = 'm';
3129 		*p++ = 'd';
3130 		*p = '\0';
3131 		if ((ui->ui_kstat = kstat_create(module, unit, NULL, "disk",
3132 		    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) {
3133 			ui->ui_kstat->ks_lock = &ui->ui_mx;
3134 			kstat_install(ui->ui_kstat);
3135 		}
3136 	}
3137 }
3138 
3139 void
3140 md_kstat_init(
3141 	minor_t		mnum
3142 )
3143 {
3144 	md_kstat_init_ui(mnum, MDI_UNIT(mnum));
3145 }
3146 
3147 void
3148 md_kstat_destroy_ui(
3149 	mdi_unit_t	*ui
3150 )
3151 {
3152 	/*
3153 	 * kstat_delete() interface has it's own locking mechanism and
3154 	 * does not allow holding of kstat lock (ks_lock).
3155 	 * Note: ks_lock == ui_mx from the md_kstat_init_ui().
3156 	 */
3157 	if ((ui != NULL) && (ui->ui_kstat != NULL)) {
3158 		kstat_delete(ui->ui_kstat);
3159 		ui->ui_kstat = NULL;
3160 	}
3161 }
3162 
3163 void
3164 md_kstat_destroy(
3165 	minor_t		mnum
3166 )
3167 {
3168 	md_kstat_destroy_ui(MDI_UNIT(mnum));
3169 }
3170 
3171 /*
3172  * In the following subsequent routines, locks are held before checking the
3173  * validity of ui_kstat. This is done to make sure that we don't trip over
3174  * a NULL ui_kstat anymore.
3175  */
3176 
3177 void
3178 md_kstat_waitq_enter(
3179 	mdi_unit_t	*ui
3180 )
3181 {
3182 	mutex_enter(&ui->ui_mx);
3183 	if (ui->ui_kstat != NULL)
3184 		kstat_waitq_enter(KSTAT_IO_PTR(ui->ui_kstat));
3185 	mutex_exit(&ui->ui_mx);
3186 }
3187 
3188 void
3189 md_kstat_waitq_to_runq(
3190 	mdi_unit_t	*ui
3191 )
3192 {
3193 	mutex_enter(&ui->ui_mx);
3194 	if (ui->ui_kstat != NULL)
3195 		kstat_waitq_to_runq(KSTAT_IO_PTR(ui->ui_kstat));
3196 	mutex_exit(&ui->ui_mx);
3197 }
3198 
3199 void
3200 md_kstat_waitq_exit(
3201 	mdi_unit_t	*ui
3202 )
3203 {
3204 	mutex_enter(&ui->ui_mx);
3205 	if (ui->ui_kstat != NULL)
3206 		kstat_waitq_exit(KSTAT_IO_PTR(ui->ui_kstat));
3207 	mutex_exit(&ui->ui_mx);
3208 }
3209 
3210 void
3211 md_kstat_runq_enter(
3212 	mdi_unit_t	*ui
3213 )
3214 {
3215 	mutex_enter(&ui->ui_mx);
3216 	if (ui->ui_kstat != NULL)
3217 		kstat_runq_enter(KSTAT_IO_PTR(ui->ui_kstat));
3218 	mutex_exit(&ui->ui_mx);
3219 }
3220 
3221 void
3222 md_kstat_runq_exit(
3223 	mdi_unit_t	*ui
3224 )
3225 {
3226 	mutex_enter(&ui->ui_mx);
3227 	if (ui->ui_kstat != NULL)
3228 		kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
3229 	mutex_exit(&ui->ui_mx);
3230 }
3231 
3232 void
3233 md_kstat_done(
3234 	mdi_unit_t	*ui,
3235 	buf_t		*bp,
3236 	int		war
3237 )
3238 {
3239 	size_t  n_done;
3240 
3241 	/* check for end of device */
3242 	if ((bp->b_resid != 0) && (! (bp->b_flags & B_ERROR))) {
3243 		n_done = bp->b_bcount;
3244 	} else if (bp->b_bcount < bp->b_resid) {
3245 		n_done = 0;
3246 	} else {
3247 		n_done = bp->b_bcount - bp->b_resid;
3248 	}
3249 
3250 	/* do accounting */
3251 	mutex_enter(&ui->ui_mx);
3252 	if (ui->ui_kstat != NULL) {
3253 		if ((! war) && (bp->b_flags & B_READ)) {
3254 			KSTAT_IO_PTR(ui->ui_kstat)->reads++;
3255 			KSTAT_IO_PTR(ui->ui_kstat)->nread += n_done;
3256 		} else {
3257 			KSTAT_IO_PTR(ui->ui_kstat)->writes++;
3258 			KSTAT_IO_PTR(ui->ui_kstat)->nwritten += n_done;
3259 		}
3260 		kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
3261 	}
3262 	mutex_exit(&ui->ui_mx);
3263 }
3264 
3265 pid_t
3266 md_getpid()
3267 {
3268 	pid_t valuep;
3269 	if (drv_getparm(PPID, (pid_t *)&valuep) != 0) {
3270 		ASSERT(0);
3271 		return ((pid_t)0);
3272 	} else {
3273 		ASSERT(valuep);
3274 		return (valuep);
3275 	}
3276 }
3277 
3278 
3279 proc_t *
3280 md_getproc()
3281 {
3282 	proc_t  *valuep;
3283 	if (drv_getparm(UPROCP, (proc_t **)&valuep) != 0) {
3284 		ASSERT(0);
3285 		return ((proc_t *)NULL);
3286 	} else {
3287 		ASSERT(valuep);
3288 		return (valuep);
3289 	}
3290 }
3291 
3292 extern kmutex_t pidlock;
3293 
3294 /*
3295  * this check to see if a process pid pair are still running.  For the
3296  * disk set lock when both pid/proc are zero then the locks is not
3297  * currently held.
3298  */
3299 int
3300 md_checkpid(pid_t pid, proc_t *proc)
3301 {
3302 	int	retval = 1;
3303 
3304 	if (pid == 0 && proc == NULL)
3305 		return (0);
3306 
3307 	mutex_enter(&pidlock);
3308 	if (prfind(pid)  != proc)
3309 		retval = 0;
3310 	mutex_exit(&pidlock);
3311 	return (retval);
3312 }
3313 
3314 /*
3315  * NAME: md_init_probereq
3316  *
3317  * DESCRIPTION: initializes a probe request. Parcels out the mnums such that
3318  *		they can be dispatched to multiple daemon threads.
3319  *
3320  * PARAMETERS: struct md_probedev *p	pointer ioctl input
3321  *
3322  * RETURN VALUE: Returns errno
3323  *
3324  */
3325 
3326 int
3327 md_init_probereq(struct md_probedev_impl *p, daemon_queue_t **hdrpp)
3328 {
3329 	int		err = 0;
3330 	int		modindx;
3331 	intptr_t	(*probe_test)();
3332 
3333 	/*
3334 	 * Initialize the semaphores and mutex
3335 	 * for the request
3336 	 */
3337 
3338 	p->probe_sema = kmem_alloc(sizeof (ksema_t), KM_SLEEP);
3339 
3340 	p->probe_mx = kmem_alloc(sizeof (kmutex_t), KM_SLEEP);
3341 	sema_init(PROBE_SEMA(p), 0, NULL, SEMA_DRIVER, NULL);
3342 	mutex_init(PROBE_MX(p), NULL, MUTEX_DEFAULT, NULL);
3343 
3344 	modindx = md_getmodindex(&(p->probe.md_driver), 1, 1);
3345 	probe_test = md_get_named_service(NODEV64, modindx,
3346 		p->probe.test_name, 0);
3347 	if (probe_test == NULL) {
3348 		err = EINVAL;
3349 		goto err_out;
3350 	}
3351 
3352 	err = md_create_probe_rqlist(p, hdrpp, probe_test);
3353 err_out:
3354 	return (err);
3355 }
3356 
3357 /*
3358  * NAME: md_probe_one
3359  *
3360  * DESCRIPTION: Generic routine for probing disks. This is called from the
3361  *		daemon.
3362  *
3363  * PARAMETERS: probe_req_t	*reqp	pointer to the probe request structure.
3364  *
3365  */
3366 
3367 void
3368 md_probe_one(probe_req_t *reqp)
3369 {
3370 	mdi_unit_t		*ui;
3371 	md_probedev_impl_t	*p;
3372 	int			err = 0;
3373 
3374 	p = (md_probedev_impl_t *)reqp->private_handle;
3375 	/*
3376 	 * Validate the unit while holding the global ioctl lock, then
3377 	 * obtain the unit_writerlock. Once the writerlock has been obtained
3378 	 * we can release the global lock. As long as we hold one of these
3379 	 * locks this will prevent a metaclear operation being performed
3380 	 * on the metadevice because metaclear takes the readerlock (via
3381 	 * openclose lock).
3382 	 */
3383 	while (md_ioctl_lock_enter() == EINTR);
3384 	ui = MDI_UNIT(reqp->mnum);
3385 	if (ui != NULL) {
3386 		(void) md_unit_writerlock_common(ui, 0);
3387 		(void) md_ioctl_lock_exit(0, 0, 0, FALSE);
3388 		err = (*reqp->probe_fcn)(ui, reqp->mnum);
3389 		md_unit_writerexit(ui);
3390 	} else {
3391 		(void) md_ioctl_lock_exit(0, 0, 0, FALSE);
3392 	}
3393 
3394 	/* update the info info in the probe structure */
3395 
3396 	mutex_enter(PROBE_MX(p));
3397 	if (err != 0) {
3398 		cmn_err(CE_NOTE, "md_probe_one: err %d mnum %d\n", err,
3399 			reqp->mnum);
3400 		(void) mdsyserror(&(p->probe.mde), err);
3401 	}
3402 
3403 	mutex_exit(PROBE_MX(p));
3404 	sema_v(PROBE_SEMA(p));
3405 
3406 	kmem_free(reqp, sizeof (probe_req_t));
3407 }
3408 char *
3409 md_strdup(char *cp)
3410 {
3411 	char *new_cp = NULL;
3412 
3413 	new_cp = kmem_alloc(strlen(cp) + 1, KM_SLEEP);
3414 
3415 	return (strcpy(new_cp, cp));
3416 }
3417 
3418 void
3419 freestr(char *cp)
3420 {
3421 	kmem_free(cp, strlen(cp) + 1);
3422 }
3423 
3424 /*
3425  * Validate the list and skip invalid devices. Then create
3426  * a doubly linked circular list of devices to probe.
3427  * The hdr points to the head and tail of this list.
3428  */
3429 
3430 static int
3431 md_create_probe_rqlist(md_probedev_impl_t *plist, daemon_queue_t **hdr,
3432 			intptr_t (*probe_test)())
3433 {
3434 	int i, err, nodevcnt;
3435 	probe_req_t *tp;
3436 	daemon_queue_t *hp;
3437 	minor_t mnum;
3438 
3439 	nodevcnt = 0;
3440 
3441 	hp = NULL;
3442 
3443 	for (i = 0; i <  plist->probe.nmdevs; i++) {
3444 		mnum = ((minor_t *)(uintptr_t)(plist->probe.mnum_list))[i];
3445 		if (MDI_UNIT(mnum) == NULL) {
3446 			cmn_err(CE_WARN, "md: Cannot probe %s since it does "
3447 			    "not exist", md_shortname(mnum));
3448 			nodevcnt++;
3449 			continue;
3450 		}
3451 		tp = kmem_alloc(sizeof (probe_req_t), KM_SLEEP);
3452 		tp->mnum = mnum;
3453 		tp->private_handle = (void *)plist;
3454 		tp->probe_fcn = probe_test;
3455 		if (hp == NULL) {
3456 			hp = (daemon_queue_t *)tp;
3457 			hp->dq_prev = hp->dq_next = (daemon_queue_t *)tp;
3458 		} else {
3459 			tp->dq.dq_next = hp;
3460 			tp->dq.dq_prev = hp->dq_prev;
3461 			hp->dq_prev->dq_next = (daemon_queue_t *)tp;
3462 			hp->dq_prev = (daemon_queue_t *)tp;
3463 		}
3464 	}
3465 
3466 	*hdr = hp;
3467 	if (nodevcnt > 0)
3468 		plist->probe.nmdevs -= nodevcnt;
3469 
3470 	/*
3471 	 * If there are no devices to be probed because they were
3472 	 * incorrect, then return an error.
3473 	 */
3474 	err = (plist->probe.nmdevs == 0) ? ENODEV : 0;
3475 
3476 	return (err);
3477 }
3478 
3479 /*
3480  * This routine increments the I/O count for set I/O operations.  This
3481  * value is used to determine if an I/O can done.  If a release is in
3482  * process this will return an error and cause the I/O to be errored.
3483  */
3484 int
3485 md_inc_iocount(set_t setno)
3486 {
3487 	int	rc = 0;
3488 
3489 	if (setno == 0)
3490 		return (0);
3491 
3492 	mutex_enter(&md_set_io[setno].md_io_mx);
3493 	if (!(md_set_io[setno].io_state & MD_SET_ACTIVE)) {
3494 		rc = EIO;
3495 		goto out;
3496 	}
3497 
3498 	ASSERT(md_set_io[setno].io_cnt >= 0);
3499 	md_set_io[setno].io_cnt++;
3500 
3501 out:	mutex_exit(&md_set_io[setno].md_io_mx);
3502 	return (rc);
3503 }
3504 
3505 void
3506 md_inc_iocount_noblock(set_t setno)
3507 {
3508 
3509 	if (setno == 0)
3510 		return;
3511 
3512 	mutex_enter(&md_set_io[setno].md_io_mx);
3513 	md_set_io[setno].io_cnt++;
3514 	mutex_exit(&md_set_io[setno].md_io_mx);
3515 }
3516 void
3517 md_dec_iocount(set_t setno)
3518 {
3519 
3520 	if (setno == 0)
3521 		return;
3522 
3523 	mutex_enter(&md_set_io[setno].md_io_mx);
3524 	md_set_io[setno].io_cnt--;
3525 	ASSERT(md_set_io[setno].io_cnt >= 0);
3526 	if ((md_set_io[setno].io_state & MD_SET_RELEASE) &&
3527 	    (md_set_io[setno].io_cnt == 0))
3528 		cv_broadcast(&md_set_io[setno].md_io_cv);
3529 	mutex_exit(&md_set_io[setno].md_io_mx);
3530 }
3531 
3532 int
3533 md_isblock_setio(set_t setno)
3534 {
3535 	int	rc = 0;
3536 
3537 	if (setno == 0)
3538 		return (0);
3539 
3540 	mutex_enter(&md_set_io[setno].md_io_mx);
3541 	if (md_set_io[setno].io_state & MD_SET_RELEASE)
3542 		rc = 1;
3543 
3544 	mutex_exit(&md_set_io[setno].md_io_mx);
3545 	return (rc);
3546 }
3547 
3548 int
3549 md_block_setio(set_t setno)
3550 {
3551 	int	rc = 0;
3552 
3553 	if (setno == 0)
3554 		return (1);
3555 
3556 	mutex_enter(&md_set_io[setno].md_io_mx);
3557 	md_set_io[setno].io_state = MD_SET_RELEASE;
3558 
3559 	while (md_set_io[setno].io_cnt > 0) {
3560 		cv_wait(&md_set_io[setno].md_io_cv,
3561 		    &md_set_io[setno].md_io_mx);
3562 	}
3563 	rc = 1;
3564 
3565 
3566 	ASSERT(md_set_io[setno].io_cnt == 0);
3567 	mutex_exit(&md_set_io[setno].md_io_mx);
3568 
3569 	return (rc);
3570 }
3571 
3572 void
3573 md_clearblock_setio(set_t setno)
3574 {
3575 	if (setno == 0)
3576 		return;
3577 
3578 	mutex_enter(&md_set_io[setno].md_io_mx);
3579 	md_set_io[setno].io_state = MD_SET_ACTIVE;
3580 	mutex_exit(&md_set_io[setno].md_io_mx);
3581 }
3582 
3583 void
3584 md_unblock_setio(set_t setno)
3585 {
3586 	if (setno == 0)
3587 		return;
3588 
3589 	mutex_enter(&md_set_io[setno].md_io_mx);
3590 #ifdef DEBUG
3591 	if (md_set_io[setno].io_cnt != 0) {
3592 		cmn_err(CE_NOTE, "set %d count was %ld at take",
3593 		    setno, md_set_io[setno].io_cnt);
3594 	}
3595 #endif /* DEBUG */
3596 
3597 	md_set_io[setno].io_state = MD_SET_ACTIVE;
3598 	md_set_io[setno].io_cnt = 0;
3599 	mutex_exit(&md_set_io[setno].md_io_mx);
3600 }
3601 
3602 /*
3603  * Test and set version of the md_block_setio.
3604  * Set the io_state to keep new I/O from being issued.
3605  * If there is I/O currently in progress, then set io_state to active
3606  * and return failure.  Otherwise, return a 1 for success.
3607  *
3608  * Used in a MN diskset since the commd must be suspended before
3609  * this node can attempt to withdraw from a diskset.  But, with commd
3610  * suspended, I/O may have been issued that can never finish until
3611  * commd is resumed (allocation of hotspare, etc). So, if I/O is
3612  * outstanding after diskset io_state is marked RELEASE, then set diskset
3613  * io_state back to ACTIVE and return failure.
3614  */
3615 int
3616 md_tas_block_setio(set_t setno)
3617 {
3618 	int	rc;
3619 
3620 	if (setno == 0)
3621 		return (1);
3622 
3623 	mutex_enter(&md_set_io[setno].md_io_mx);
3624 	md_set_io[setno].io_state = MD_SET_RELEASE;
3625 
3626 	if (md_set_io[setno].io_cnt > 0) {
3627 		md_set_io[setno].io_state = MD_SET_ACTIVE;
3628 		rc = 0;
3629 	} else {
3630 		rc = 1;
3631 	}
3632 
3633 	mutex_exit(&md_set_io[setno].md_io_mx);
3634 
3635 	return (rc);
3636 }
3637 
3638 void
3639 md_biodone(struct buf *pb)
3640 {
3641 	minor_t	mnum;
3642 	set_t	setno;
3643 	mdi_unit_t	*ui;
3644 
3645 	mnum = getminor(pb->b_edev);
3646 	setno = MD_MIN2SET(mnum);
3647 
3648 	if (setno == 0) {
3649 		biodone(pb);
3650 		return;
3651 	}
3652 
3653 #ifdef DEBUG
3654 	ui = MDI_UNIT(mnum);
3655 	if (!md_unit_isopen(ui))
3656 		cmn_err(CE_NOTE, "io after close on %s\n", md_shortname(mnum));
3657 #endif /* DEBUG */
3658 
3659 	/*
3660 	 * Handle the local diskset
3661 	 */
3662 	if (md_set_io[setno].io_cnt > 0)
3663 		md_dec_iocount(setno);
3664 
3665 #ifdef DEBUG
3666 	/*
3667 	 * this is being done after the lock is dropped so there
3668 	 * are cases it may be invalid.  It is advisory.
3669 	 */
3670 	if (md_set_io[setno].io_state & MD_SET_RELEASE) {
3671 		/* Only display this error once for this metadevice */
3672 		if ((ui->ui_tstate & MD_RELEASE_IOERR_DONE) == 0) {
3673 			cmn_err(CE_NOTE,
3674 			    "I/O to %s attempted during set RELEASE\n",
3675 			    md_shortname(mnum));
3676 			ui->ui_tstate |= MD_RELEASE_IOERR_DONE;
3677 		}
3678 	}
3679 #endif /* DEBUG */
3680 
3681 	biodone(pb);
3682 }
3683 
3684 
3685 /*
3686  * Driver special private devt handling routine
3687  * INPUT:  md_dev64_t
3688  * OUTPUT: dev_t, 32 bit on a 32 bit kernel, 64 bit on a 64 bit kernel.
3689  */
3690 dev_t
3691 md_dev64_to_dev(md_dev64_t dev)
3692 {
3693 	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
3694 	minor_t minor = (minor_t)(dev & MAXMIN64);
3695 
3696 	return (makedevice(major, minor));
3697 
3698 }
3699 
3700 /*
3701  * Driver private makedevice routine
3702  * INPUT:  major_t major, minor_t minor
3703  * OUTPUT: md_dev64_t, no matter if on 32 bit or 64 bit kernel.
3704  */
3705 md_dev64_t
3706 md_makedevice(major_t major, minor_t minor)
3707 {
3708 	return (((md_dev64_t)major << NBITSMINOR64) | minor);
3709 
3710 }
3711 
3712 
3713 /*
3714  * Driver private devt md_getmajor routine
3715  * INPUT:  dev	a 64 bit container holding either a 32 bit or a 64 bit device
3716  * OUTPUT: the appropriate major number
3717  */
3718 major_t
3719 md_getmajor(md_dev64_t dev)
3720 {
3721 	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
3722 
3723 	if (major == 0) {
3724 		/* Here we were given a 32bit dev */
3725 		major = (major_t)(dev >> NBITSMINOR32) & MAXMAJ32;
3726 	}
3727 	return (major);
3728 }
3729 
3730 /*
3731  * Driver private devt md_getminor routine
3732  * INPUT:  dev	a 64 bit container holding either a 32 bit or a 64 bit device
3733  * OUTPUT: the appropriate minor number
3734  */
3735 minor_t
3736 md_getminor(md_dev64_t dev)
3737 {
3738 	minor_t minor;
3739 	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
3740 
3741 	if (major == 0) {
3742 		/* Here we were given a 32bit dev */
3743 		minor = (minor_t)(dev & MAXMIN32);
3744 	} else {
3745 		minor = (minor_t)(dev & MAXMIN64);
3746 	}
3747 	return (minor);
3748 }
3749 
3750 int
3751 md_check_ioctl_against_efi(int cmd, ushort_t flags)
3752 {
3753 	/*
3754 	 * If the metadevice is an old style device, it has a vtoc,
3755 	 *	in that case all reading EFI ioctls are not applicable.
3756 	 * If the metadevice has an EFI label, reading vtoc and geom ioctls
3757 	 *	are not supposed to work.
3758 	 */
3759 	switch (cmd) {
3760 		case DKIOCGGEOM:
3761 		case DKIOCGVTOC:
3762 		case DKIOCGAPART:
3763 			if ((flags & MD_EFILABEL) != 0) {
3764 				return (ENOTSUP);
3765 			}
3766 			break;
3767 		case DKIOCGETEFI:
3768 		case DKIOCPARTITION:
3769 			if ((flags & MD_EFILABEL) == 0) {
3770 				return (ENOTSUP);
3771 			}
3772 			break;
3773 
3774 		case DKIOCSETEFI:
3775 		/* setting an EFI label should always be ok */
3776 			return (0);
3777 
3778 		case DKIOCSVTOC:
3779 		/*
3780 		 * This one is ok for small devices, even if they have an EFI
3781 		 * label. The appropriate check is in md_set_vtoc
3782 		 */
3783 			return (0);
3784 	}
3785 	return (0);
3786 }
3787 
3788 /*
3789  * md_vtoc_to_efi_record()
3790  * Input:  record id of the vtoc record
3791  * Output: record id of the efi record
3792  * Function:
3793  *	- reads the  volume name from the vtoc record
3794  *	- converts the volume name to a format, libefi understands
3795  *	- creates a new record of size MD_EFI_PARTNAME_BYTES
3796  *	- stores the volname in that record,
3797  *	- commits that record
3798  *	- returns the recid of the efi record.
3799  * Caveat Emptor:
3800  *	The calling routine must do something like
3801  *	- un->c.un_vtoc_id = md_vtoc_to_efi_record(vtoc_recid)
3802  *	- commit(un)
3803  *	- delete(vtoc_recid)
3804  *	in order to keep the mddb consistent in case of a panic in the middle.
3805  * Errors:
3806  *	- returns 0 on any error
3807  */
3808 mddb_recid_t
3809 md_vtoc_to_efi_record(mddb_recid_t vtoc_recid, set_t setno)
3810 {
3811 	struct vtoc	*vtoc;
3812 	ushort_t	*v;
3813 	mddb_recid_t	efi_recid;
3814 	int		i;
3815 
3816 	if (mddb_getrecstatus(vtoc_recid) != MDDB_OK) {
3817 		return (0);
3818 	}
3819 	vtoc = (struct vtoc *)mddb_getrecaddr(vtoc_recid);
3820 	efi_recid = mddb_createrec(MD_EFI_PARTNAME_BYTES, MDDB_EFILABEL, 0,
3821 					MD_CRO_32BIT, setno);
3822 	if (efi_recid < 0) {
3823 		return (0);
3824 	}
3825 	v = (ushort_t *)mddb_getrecaddr(efi_recid);
3826 
3827 	/* This for loop read, converts and writes */
3828 	for (i = 0; i < LEN_DKL_VVOL; i++) {
3829 		v[i] = LE_16((uint16_t)vtoc->v_volume[i]);
3830 	}
3831 	/* commit the new record */
3832 	mddb_commitrec_wrapper(efi_recid);
3833 
3834 	return (efi_recid);
3835 }
3836 
3837 /*
3838  * Send a kernel message.
3839  * user has to provide for an allocated result structure
3840  * If the door handler disappears we retry forever emitting warnings every so
3841  * often.
3842  * TODO: make this a flaggable attribute so that the caller can decide if the
3843  *	 message is to be a 'one-shot' message or not.
3844  */
3845 int
3846 mdmn_ksend_message(
3847 	set_t		setno,
3848 	md_mn_msgtype_t	type,
3849 	uint_t		flags,
3850 	char		*data,
3851 	int		size,
3852 	md_mn_kresult_t	*result)
3853 {
3854 	door_arg_t	da;
3855 	md_mn_kmsg_t	*kmsg;
3856 	uint_t		retry_cnt = 0;
3857 	int		rval;
3858 
3859 	if (size > MDMN_MAX_KMSG_DATA)
3860 		return (ENOMEM);
3861 	kmsg = kmem_zalloc(sizeof (md_mn_kmsg_t), KM_SLEEP);
3862 	kmsg->kmsg_flags = flags;
3863 	kmsg->kmsg_setno = setno;
3864 	kmsg->kmsg_type	= type;
3865 	kmsg->kmsg_size	= size;
3866 	bcopy(data, &(kmsg->kmsg_data), size);
3867 
3868 #ifdef DEBUG_COMM
3869 	printf("send msg: set=%d, flags=%d, type=%d, txid = 0x%llx,"
3870 		" size=%d, data=%d, data2=%d\n",
3871 			kmsg->kmsg_setno,
3872 			kmsg->kmsg_flags,
3873 			kmsg->kmsg_type,
3874 			kmsg->kmsg_size,
3875 			*(int *)data,
3876 			*(int *)(char *)(&kmsg->kmsg_data));
3877 
3878 
3879 #endif /* DEBUG_COMM */
3880 
3881 	da.data_ptr	= (char *)(kmsg);
3882 	da.data_size	= sizeof (md_mn_kmsg_t);
3883 	da.desc_ptr	= NULL;
3884 	da.desc_num	= 0;
3885 	da.rbuf		= (char *)result;
3886 	da.rsize	= sizeof (*result);
3887 
3888 	/*
3889 	 * Wait for the door handle to be established.
3890 	 */
3891 
3892 	while (mdmn_door_did == -1) {
3893 		if ((++retry_cnt % MD_MN_WARN_INTVL) == 0) {
3894 			cmn_err(CE_WARN, "door handle not yet ready. "
3895 			    "Check if /usr/lib/lvm/mddoors is running");
3896 		}
3897 		delay(md_hz);
3898 	}
3899 	retry_cnt = 0;
3900 
3901 	while ((rval = door_ki_upcall(mdmn_door_handle, &da)) != 0) {
3902 		if (rval == EAGAIN)  {
3903 			if ((++retry_cnt % MD_MN_WARN_INTVL) == 0) {
3904 				cmn_err(CE_WARN, "door call failed. "
3905 				"Check if /usr/lib/lvm/mddoors is running");
3906 			}
3907 		} else {
3908 			cmn_err(CE_WARN,
3909 				"md door call failed. Returned %d", rval);
3910 		}
3911 		delay(md_hz);
3912 	}
3913 	kmem_free(kmsg, sizeof (md_mn_kmsg_t));
3914 
3915 	/*
3916 	 * Attempt to determine if the message failed (with an RPC_FAILURE)
3917 	 * because we are in the middle of shutting the system down.
3918 	 *
3919 	 * If message failed with an RPC_FAILURE when rpc.mdcommd had
3920 	 * been gracefully shutdown (md_mn_is_commd_present returns FALSE)
3921 	 * then don't retry the message anymore.  If message
3922 	 * failed due to any other reason, then retry up to MD_MN_WARN_INTVL
3923 	 * times which should allow a shutting down system time to
3924 	 * notify the kernel of a graceful shutdown of rpc.mdcommd.
3925 	 *
3926 	 * Caller of this routine will need to check the md_mn_commd_present
3927 	 * flag and the failure error in order to determine whether to panic
3928 	 * or not.  If md_mn_commd_present is set to 0 and failure error
3929 	 * is RPC_FAILURE, the calling routine should not panic since the
3930 	 * system is in the process of being shutdown.
3931 	 *
3932 	 */
3933 
3934 	retry_cnt = 0;
3935 
3936 	if (result->kmmr_comm_state == MDMNE_RPC_FAIL) {
3937 		while (md_mn_is_commd_present() == 1) {
3938 			if ((++retry_cnt % MD_MN_WARN_INTVL) == 0)
3939 				break;
3940 			delay(md_hz);
3941 		}
3942 	}
3943 
3944 	return (0);
3945 }
3946 
3947 /*
3948  * Called to propagate the capability of a metadevice to all nodes in the set.
3949  *
3950  * On entry, lockp is set if the function has been called from within an ioctl.
3951  *
3952  * IOLOCK_RETURN_RELEASE, which drops the md_ioctl_lock is called in this
3953  * routine to enable other mdioctls to enter the kernel while this
3954  * thread of execution waits on the completion of mdmn_ksend_message. When
3955  * the message is completed the thread continues and md_ioctl_lock must be
3956  * reacquired.  Even though md_ioctl_lock is interruptable, we choose to
3957  * ignore EINTR as we must not return without acquiring md_ioctl_lock.
3958  */
3959 
3960 int
3961 mdmn_send_capability_message(minor_t mnum, volcap_t vc, IOLOCK *lockp)
3962 {
3963 	md_mn_msg_setcap_t	msg;
3964 	md_mn_kresult_t		*kres;
3965 	mdi_unit_t		*ui = MDI_UNIT(mnum);
3966 	int			ret;
3967 	k_sigset_t		oldmask, newmask;
3968 
3969 	(void) strncpy((char *)&msg.msg_setcap_driver,
3970 	    md_ops[ui->ui_opsindex]->md_driver.md_drivername, MD_DRIVERNAMELEN);
3971 	msg.msg_setcap_mnum = mnum;
3972 	msg.msg_setcap_set = vc.vc_set;
3973 
3974 	if (lockp)
3975 		IOLOCK_RETURN_RELEASE(0, lockp);
3976 	kres = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
3977 
3978 	/*
3979 	 * Mask signals for the mdmd_ksend_message call.  This keeps the door
3980 	 * interface from failing if the user process receives a signal while
3981 	 * in mdmn_ksend_message.
3982 	 */
3983 	sigfillset(&newmask);
3984 	sigreplace(&newmask, &oldmask);
3985 	ret = (mdmn_ksend_message(MD_MIN2SET(mnum), MD_MN_MSG_SET_CAP,
3986 	    MD_MSGF_NO_LOG, (char *)&msg, sizeof (md_mn_msg_setcap_t),
3987 	    kres));
3988 	sigreplace(&oldmask, (k_sigset_t *)NULL);
3989 
3990 	if (!MDMN_KSEND_MSG_OK(ret, kres)) {
3991 		mdmn_ksend_show_error(ret, kres, "MD_MN_MSG_SET_CAP");
3992 		ret = EIO;
3993 	}
3994 	kmem_free(kres, sizeof (md_mn_kresult_t));
3995 
3996 	if (lockp) {
3997 		IOLOCK_RETURN_REACQUIRE(lockp);
3998 	}
3999 	return (ret);
4000 }
4001 
4002 /*
4003  * Called to clear all of the transient capabilities for a metadevice when it is
4004  * not open on any node in the cluster
4005  * Called from close for mirror and sp.
4006  */
4007 
4008 void
4009 mdmn_clear_all_capabilities(minor_t mnum)
4010 {
4011 	md_isopen_t	clumsg;
4012 	int		ret;
4013 	md_mn_kresult_t	*kresult;
4014 	volcap_t	vc;
4015 	k_sigset_t	oldmask, newmask;
4016 
4017 	clumsg.dev = md_makedevice(md_major, mnum);
4018 	clumsg.mde = mdnullerror;
4019 	/*
4020 	 * The check open message doesn't have to be logged, nor should the
4021 	 * result be stored in the MCT. We want an up-to-date state.
4022 	 */
4023 	kresult = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
4024 
4025 	/*
4026 	 * Mask signals for the mdmd_ksend_message call.  This keeps the door
4027 	 * interface from failing if the user process receives a signal while
4028 	 * in mdmn_ksend_message.
4029 	 */
4030 	sigfillset(&newmask);
4031 	sigreplace(&newmask, &oldmask);
4032 	ret = mdmn_ksend_message(MD_MIN2SET(mnum),
4033 	    MD_MN_MSG_CLU_CHECK,
4034 	    MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT,
4035 	    (char *)&clumsg, sizeof (clumsg), kresult);
4036 	sigreplace(&oldmask, (k_sigset_t *)NULL);
4037 
4038 	if ((ret == 0) && (kresult->kmmr_exitval == 0)) {
4039 		/*
4040 		 * Not open on any node, clear all capabilities, eg ABR and
4041 		 * DMR
4042 		 */
4043 		vc.vc_set = 0;
4044 		(void) mdmn_send_capability_message(mnum, vc, NULL);
4045 	}
4046 	kmem_free(kresult, sizeof (md_mn_kresult_t));
4047 }
4048 
4049 /*
4050  * mdmn_ksend_show_error:
4051  * ---------------------
4052  * Called to display the error contents of a failing mdmn_ksend_message() result
4053  *
4054  * Input:
4055  *	rv	- return value from mdmn_ksend_message()
4056  *	kres	- pointer to result structure filled in by mdmn_ksend_message
4057  *	s	- Informative message to identify failing condition (e.g.
4058  *		  "Ownership change") This string will be displayed with
4059  *		  cmn_err(CE_WARN, "%s *FAILED*",...) to alert the system
4060  *		  administrator
4061  */
4062 void
4063 mdmn_ksend_show_error(int rv, md_mn_kresult_t *kres, const char *s)
4064 {
4065 	if (rv == 0) {
4066 		cmn_err(CE_WARN, "%s *FAILED*", s);
4067 		cmn_err(CE_CONT, "exit_val = %d, comm_state = %d, failing_node"
4068 		    " = %d", kres->kmmr_exitval, kres->kmmr_comm_state,
4069 		    kres->kmmr_failing_node);
4070 	} else {
4071 		cmn_err(CE_WARN, "%s *FAILED*, return value = %d", s, rv);
4072 	}
4073 }
4074 
4075 /*
4076  * Callback routine for resync thread. If requested to suspend we mark the
4077  * commd as not being present.
4078  */
4079 boolean_t
4080 callb_md_mrs_cpr(void *arg, int code)
4081 {
4082 	callb_cpr_t *cp = (callb_cpr_t *)arg;
4083 	int ret = 0;				/* assume success */
4084 
4085 	mutex_enter(cp->cc_lockp);
4086 
4087 	switch (code) {
4088 	case CB_CODE_CPR_CHKPT:
4089 		/*
4090 		 * Mark the rpc.mdcommd as no longer present. We are trying to
4091 		 * suspend the system and so we should expect RPC failures to
4092 		 * occur.
4093 		 */
4094 		md_mn_clear_commd_present();
4095 		cp->cc_events |= CALLB_CPR_START;
4096 		while (!(cp->cc_events & CALLB_CPR_SAFE))
4097 			/* cv_timedwait() returns -1 if it times out. */
4098 			if ((ret = cv_timedwait(&cp->cc_callb_cv, cp->cc_lockp,
4099 				lbolt + CPR_KTHREAD_TIMEOUT_SEC * hz)) == -1)
4100 				break;
4101 			break;
4102 
4103 	case CB_CODE_CPR_RESUME:
4104 		cp->cc_events &= ~CALLB_CPR_START;
4105 		cv_signal(&cp->cc_stop_cv);
4106 		break;
4107 	}
4108 	mutex_exit(cp->cc_lockp);
4109 	return (ret != -1);
4110 }
4111 
4112 
4113 void
4114 md_rem_hspname(set_t setno, mdkey_t n_key)
4115 {
4116 	int	s;
4117 	int	max_sides;
4118 
4119 
4120 	/* All entries removed are in the same diskset */
4121 	if (md_get_setstatus(setno) & MD_SET_MNSET)
4122 		max_sides = MD_MNMAXSIDES;
4123 	else
4124 		max_sides = MD_MAXSIDES;
4125 
4126 	for (s = 0; s < max_sides; s++)
4127 		(void) md_remdevname(setno, s, n_key);
4128 }
4129 
4130 
4131 int
4132 md_rem_selfname(minor_t selfid)
4133 {
4134 	int	s;
4135 	set_t	setno = MD_MIN2SET(selfid);
4136 	int	max_sides;
4137 	md_dev64_t	dev;
4138 	struct nm_next_hdr	*nh;
4139 	struct nm_name	*n;
4140 	mdkey_t key;
4141 
4142 	/*
4143 	 * Get the key since remove routine expects it
4144 	 */
4145 	dev = md_makedevice(md_major, selfid);
4146 	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
4147 		return (ENOENT);
4148 	}
4149 
4150 	if ((n = (struct nm_name *)lookup_entry(nh, setno, MD_SIDEWILD,
4151 		MD_KEYWILD, dev, 0L)) == NULL) {
4152 		return (ENOENT);
4153 	}
4154 
4155 	/* All entries removed are in the same diskset */
4156 	key = n->n_key;
4157 	if (md_get_setstatus(setno) & MD_SET_MNSET)
4158 		max_sides = MD_MNMAXSIDES;
4159 	else
4160 		max_sides = MD_MAXSIDES;
4161 
4162 	for (s = 0; s < max_sides; s++)
4163 		(void) md_remdevname(setno, s, key);
4164 
4165 	return (0);
4166 }
4167 
4168 void
4169 md_upd_set_unnext(set_t setno, unit_t un)
4170 {
4171 	if (un < md_set[setno].s_un_next) {
4172 		md_set[setno].s_un_next = un;
4173 	}
4174 }
4175 
4176 struct hot_spare_pool *
4177 find_hot_spare_pool(set_t setno, int hsp_id)
4178 {
4179 	hot_spare_pool_t *hsp;
4180 
4181 	hsp = (hot_spare_pool_t *)md_set[setno].s_hsp;
4182 	while (hsp != NULL) {
4183 		if (hsp->hsp_self_id == hsp_id)
4184 			return (hsp);
4185 		hsp = hsp->hsp_next;
4186 	}
4187 
4188 	return ((hot_spare_pool_t *)0);
4189 }
4190