xref: /onnv-gate/usr/src/uts/common/io/lvm/md/md_subr.c (revision 10549:bb77cedc0815)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Driver for Virtual Disk.
29  */
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/buf.h>
33 #include <sys/conf.h>
34 #include <sys/user.h>
35 #include <sys/uio.h>
36 #include <sys/proc.h>
37 #include <sys/t_lock.h>
38 #include <sys/dkio.h>
39 #include <sys/kmem.h>
40 #include <sys/debug.h>
41 #include <sys/cmn_err.h>
42 #include <sys/sysmacros.h>
43 #include <sys/types.h>
44 #include <sys/mkdev.h>
45 #include <sys/vtoc.h>
46 #include <sys/open.h>
47 #include <sys/file.h>
48 #include <vm/page.h>
49 #include <sys/callb.h>
50 #include <sys/disp.h>
51 #include <sys/modctl.h>
52 #include <sys/errno.h>
53 #include <sys/door.h>
54 #include <sys/lvm/mdmn_commd.h>
55 #include <sys/lvm/md_hotspares.h>
56 
57 #include <sys/lvm/mdvar.h>
58 #include <sys/lvm/md_names.h>
59 
60 #include <sys/ddi.h>
61 #include <sys/proc.h>
62 #include <sys/sunddi.h>
63 #include <sys/esunddi.h>
64 
65 #include <sys/sysevent.h>
66 #include <sys/sysevent/eventdefs.h>
67 
68 #include <sys/sysevent/svm.h>
69 #include <sys/lvm/md_basic.h>
70 
71 
72 /*
73  * Machine specific Hertz is kept here
74  */
75 extern clock_t			md_hz;
76 
77 /*
78  * Externs.
79  */
80 extern int			(*mdv_strategy_tstpnt)(buf_t *, int, void*);
81 extern major_t			md_major;
82 extern unit_t			md_nunits;
83 extern set_t			md_nsets;
84 extern md_set_t			md_set[];
85 extern md_set_io_t		md_set_io[];
86 extern md_ops_t			**md_ops;
87 extern md_ops_t			*md_opslist;
88 extern ddi_modhandle_t		*md_mods;
89 extern dev_info_t		*md_devinfo;
90 
91 extern md_krwlock_t		md_unit_array_rw;
92 extern kmutex_t			md_mx;
93 extern kcondvar_t		md_cv;
94 
95 extern md_krwlock_t		hsp_rwlp;
96 extern md_krwlock_t		ni_rwlp;
97 
98 extern int			md_num_daemons;
99 extern int			md_status;
100 extern int			md_ioctl_cnt;
101 extern int			md_mtioctl_cnt;
102 
103 extern struct metatransops	metatransops;
104 extern md_event_queue_t		*md_event_queue;
105 extern md_resync_t		md_cpr_resync;
106 extern int			md_done_daemon_threads;
107 extern int			md_ff_daemon_threads;
108 
109 
110 extern mddb_set_t	*mddb_setenter(set_t setno, int flag, int *errorcodep);
111 extern void		mddb_setexit(mddb_set_t *s);
112 extern void		*lookup_entry(struct nm_next_hdr *, set_t,
113 				side_t, mdkey_t, md_dev64_t, int);
114 extern struct nm_next_hdr	*get_first_record(set_t, int, int);
115 extern dev_t		getrootdev(void);
116 
117 struct mdq_anchor	md_done_daemon; /* done request queue */
118 struct mdq_anchor	md_mstr_daemon; /* mirror error, WOW requests */
119 struct mdq_anchor	md_mhs_daemon;	/* mirror hotspare requests queue */
120 struct mdq_anchor	md_hs_daemon;	/* raid hotspare requests queue */
121 struct mdq_anchor	md_ff_daemonq;	/* failfast request queue */
122 struct mdq_anchor	md_mirror_daemon; /* mirror owner queue */
123 struct mdq_anchor	md_mirror_io_daemon; /* mirror owner i/o queue */
124 struct mdq_anchor	md_mirror_rs_daemon; /* mirror resync done queue */
125 struct mdq_anchor	md_sp_daemon;	/* soft-part error daemon queue */
126 struct mdq_anchor	md_mto_daemon;	/* mirror timeout daemon queue */
127 
128 int md_done_daemon_threads = 1;	/* threads for md_done_daemon requestq */
129 int md_mstr_daemon_threads = 1;	/* threads for md_mstr_daemon requestq */
130 int md_mhs_daemon_threads = 1;	/* threads for md_mhs_daemon requestq */
131 int md_hs_daemon_threads = 1;	/* threads for md_hs_daemon requestq */
132 int md_ff_daemon_threads = 3;	/* threads for md_ff_daemon requestq */
133 int md_mirror_daemon_threads = 1; /* threads for md_mirror_daemon requestq */
134 int md_sp_daemon_threads = 1;	/* threads for md_sp_daemon requestq */
135 int md_mto_daemon_threads = 1;	/* threads for md_mto_daemon requestq */
136 
137 #ifdef DEBUG
138 /* Flag to switch on debug messages */
139 int md_release_reacquire_debug = 0;	/* debug flag */
140 #endif
141 
142 /*
143  *
144  * The md_daemon_queues table below holds a pointer to each request queue
145  * together with the number of service threads associated with that queue.
146  * When the number of threads is set to 1, requests on that queue are
147  * executed sequentially.
148  * The number of threads for each queue has been defined as a global
149  * variable to enable kernel tuning (see the example following the table).
150  *
151  */
152 
153 #define	MD_DAEMON_QUEUES 11
154 
155 md_requestq_entry_t md_daemon_queues[MD_DAEMON_QUEUES] = {
156 	{&md_done_daemon, &md_done_daemon_threads},
157 	{&md_mstr_daemon, &md_mstr_daemon_threads},
158 	{&md_hs_daemon, &md_hs_daemon_threads},
159 	{&md_ff_daemonq, &md_ff_daemon_threads},
160 	{&md_mirror_daemon, &md_mirror_daemon_threads},
161 	{&md_mirror_io_daemon, &md_mirror_daemon_threads},
162 	{&md_mirror_rs_daemon, &md_mirror_daemon_threads},
163 	{&md_sp_daemon, &md_sp_daemon_threads},
164 	{&md_mhs_daemon, &md_mhs_daemon_threads},
165 	{&md_mto_daemon, &md_mto_daemon_threads},
166 	{0, 0}
167 };
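/*
 * Editor's note (illustrative, not part of the original source): because the
 * per-queue thread counts above are plain global variables, they can be tuned
 * from /etc/system using the usual module:variable syntax, for example
 *
 *	set md:md_ff_daemon_threads = 5
 *
 * A reboot is needed for a new value to take effect; the value shown is only
 * an example, not a recommendation taken from this file.
 */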
168 
169 /*
170  * Number of times a message is retried before issuing a warning to the operator
171  */
172 #define	MD_MN_WARN_INTVL	10
173 
174 /*
175  * The retry cnt is set to one (pre-decremented) so that we actually do no
176  * retries when committing/deleting an mddb rec.  The underlying disk driver
177  * already does several retries to check whether the disk is really dead, so
178  * there is no reason for us to retry on top of the driver's retries.
179  */
180 
181 uint_t			md_retry_cnt = 1; /* global so it can be patched */
182 
183 /*
184  * How many times to try to do the door_ki_upcall() in mdmn_ksend_message.
185  * Again, made patchable here should it prove useful.
186  */
187 uint_t			md_send_retry_limit = 30;
188 
189 /*
190  * Bug # 1212146
191  * Before this change the user had to pass in a short aligned buffer because of
192  * problems in some underlying device drivers.  This problem seems to have been
193  * corrected in the underlying drivers so we will default to not requiring any
194  * alignment.  If the user needs to check for a specific alignment,
195  * md_uio_alignment_mask may be set in /etc/system to accomplish this.  To get
196  * the behavior before this fix, set md_uio_alignment_mask to 1 (short
197  * alignment); to check for word alignment set it to 3, for double-word
198  * alignment set it to 7, and so on.
199  *
200  * [Other part of fix is in function md_chk_uio()]
201  */
202 static int		md_uio_alignment_mask = 0;
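/*
 * Editor's sketch (not part of the original source): the mask is applied as a
 * simple bitwise test on the user buffer, roughly of the form
 *
 *	if (((uintptr_t)iov->iov_base & md_uio_alignment_mask) ||
 *	    (iov->iov_len & md_uio_alignment_mask))
 *		return (EINVAL);	// reject an unaligned request
 *
 * so a mask of 1 enforces short (2-byte) alignment, 3 word (4-byte) alignment,
 * 7 double-word alignment, and so on.  The real check lives in md_chk_uio();
 * the snippet above only illustrates the mask arithmetic.
 */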
203 
204 /*
205  * for md_dev64_t translation
206  */
207 struct md_xlate_table		*md_tuple_table;
208 struct md_xlate_major_table	*md_major_tuple_table;
209 int				md_tuple_length;
210 uint_t				md_majortab_len;
211 
212 /* Function declarations */
213 
214 static int md_create_probe_rqlist(md_probedev_impl_t *plist,
215 			daemon_queue_t **hdr, intptr_t (*probe_test)());
216 
217 /*
218  * manipulate global status
219  */
220 void
221 md_set_status(int bits)
222 {
223 	mutex_enter(&md_mx);
224 	md_status |= bits;
225 	mutex_exit(&md_mx);
226 }
227 
228 void
229 md_clr_status(int bits)
230 {
231 	mutex_enter(&md_mx);
232 	md_status &= ~bits;
233 	mutex_exit(&md_mx);
234 }
235 
236 int
237 md_get_status()
238 {
239 	int result;
240 	mutex_enter(&md_mx);
241 	result = md_status;
242 	mutex_exit(&md_mx);
243 	return (result);
244 }
245 
246 void
247 md_set_setstatus(set_t setno, int bits)
248 {
249 	ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
250 
251 	mutex_enter(&md_mx);
252 	md_set[setno].s_status |= bits;
253 	mutex_exit(&md_mx);
254 }
255 
256 void
257 md_clr_setstatus(set_t setno, int bits)
258 {
259 	ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
260 
261 	mutex_enter(&md_mx);
262 	md_set[setno].s_status &= ~bits;
263 	mutex_exit(&md_mx);
264 }
265 
266 uint_t
267 md_get_setstatus(set_t setno)
268 {
269 	uint_t result;
270 
271 	ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
272 
273 	mutex_enter(&md_mx);
274 	result = md_set[setno].s_status;
275 	mutex_exit(&md_mx);
276 	return (result);
277 }
278 
279 /*
280  * md_unit_readerlock_common:
281  * -------------------------
282  * Mark the given unit as having a reader reference. Block (cv_wait) until
283  * any writer or wannabe-writer references have been released.
284  *
285  * Input:
286  *	ui		unit reference
287  *	lock_held	0 => ui_mx needs to be grabbed
288  *			1 => ui_mx already held
289  * Output:
290  *	the unit's in-core structure (as returned by MD_UNIT())
291  *	ui->ui_readercnt incremented
292  */
293 static void *
294 md_unit_readerlock_common(mdi_unit_t *ui, int lock_held)
295 {
296 	uint_t	flag = MD_UL_WRITER | MD_UL_WANABEWRITER;
297 
298 	if (!lock_held)
299 		mutex_enter(&ui->ui_mx);
300 	while (ui->ui_lock & flag) {
301 		if (panicstr) {
302 			if (ui->ui_lock & MD_UL_WRITER)
303 				panic("md: writer lock is held");
304 			break;
305 		}
306 		cv_wait(&ui->ui_cv, &ui->ui_mx);
307 	}
308 	ui->ui_readercnt++;
309 	if (!lock_held)
310 		mutex_exit(&ui->ui_mx);
311 	return (MD_UNIT(ui->ui_link.ln_id));
312 }
313 
314 void *
315 md_unit_readerlock(mdi_unit_t *ui)
316 {
317 	return (md_unit_readerlock_common(ui, 0));
318 }
319 
320 /*
321  * md_unit_writerlock_common:
322  * -------------------------
323  * Acquire a unique writer reference. Causes previous readers to drain.
324  * Blocks if a writer reference already exists or if a previous reader/writer
325  * dropped the lock to allow a ksend_message to be dispatched.
326  *
327  * Input:
328  *	ui		unit reference
329  *	lock_held	0 => grab ui_mx
330  *			1 => ui_mx already held on entry
331  * Output:
332  *	the unit's in-core structure (as returned by MD_UNIT())
333  */
334 static void *
335 md_unit_writerlock_common(mdi_unit_t *ui, int lock_held)
336 {
337 	uint_t	flag = MD_UL_WRITER;
338 
339 	if (panicstr)
340 		panic("md: writer lock not allowed");
341 
342 	if (!lock_held)
343 		mutex_enter(&ui->ui_mx);
344 
345 	while ((ui->ui_lock & flag) || (ui->ui_readercnt != 0)) {
346 		ui->ui_wanabecnt++;
347 		ui->ui_lock |= MD_UL_WANABEWRITER;
348 		cv_wait(&ui->ui_cv, &ui->ui_mx);
349 		if (--ui->ui_wanabecnt == 0)
350 			ui->ui_lock &= ~MD_UL_WANABEWRITER;
351 	}
352 	ui->ui_lock |= MD_UL_WRITER;
353 	ui->ui_owner = curthread;
354 
355 	if (!lock_held)
356 		mutex_exit(&ui->ui_mx);
357 	return (MD_UNIT(ui->ui_link.ln_id));
358 }
359 
360 void *
361 md_unit_writerlock(mdi_unit_t *ui)
362 {
363 	return (md_unit_writerlock_common(ui, 0));
364 }
365 
366 /*
367  * md_unit_readerexit_common:
368  * -------------------------
369  * Release the readerlock for the specified unit. If the reader count reaches
370  * zero and there are waiting writers (MD_UL_WANABEWRITER set) wake them up.
371  *
372  * Input:
373  *	ui		unit reference
374  *	lock_held	0 => ui_mx needs to be acquired
375  *			1 => ui_mx already held
376  */
377 static void
378 md_unit_readerexit_common(mdi_unit_t *ui, int lock_held)
379 {
380 	if (!lock_held)
381 		mutex_enter(&ui->ui_mx);
382 	ASSERT((ui->ui_lock & MD_UL_WRITER) == 0);
383 	ASSERT(ui->ui_readercnt != 0);
384 	ui->ui_readercnt--;
385 	if ((ui->ui_wanabecnt != 0) && (ui->ui_readercnt == 0))
386 		cv_broadcast(&ui->ui_cv);
387 
388 	if (!lock_held)
389 		mutex_exit(&ui->ui_mx);
390 }
391 
392 void
393 md_unit_readerexit(mdi_unit_t *ui)
394 {
395 	md_unit_readerexit_common(ui, 0);
396 }
397 
398 /*
399  * md_unit_writerexit_common:
400  * -------------------------
401  * Release the writerlock currently held on the unit. Wake any threads waiting
402  * on becoming reader or writer (MD_UL_WANABEWRITER set).
403  *
404  * Input:
405  *	ui		unit reference
406  *	lock_held	0 => ui_mx to be acquired
407  *			1 => ui_mx already held
408  */
409 static void
410 md_unit_writerexit_common(mdi_unit_t *ui, int lock_held)
411 {
412 	if (!lock_held)
413 		mutex_enter(&ui->ui_mx);
414 	ASSERT((ui->ui_lock & MD_UL_WRITER) != 0);
415 	ASSERT(ui->ui_readercnt == 0);
416 	ui->ui_lock &= ~MD_UL_WRITER;
417 	ui->ui_owner = NULL;
418 
419 	cv_broadcast(&ui->ui_cv);
420 	if (!lock_held)
421 		mutex_exit(&ui->ui_mx);
422 }
423 
424 void
425 md_unit_writerexit(mdi_unit_t *ui)
426 {
427 	md_unit_writerexit_common(ui, 0);
428 }
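/*
 * Editor's sketch (not part of the original source): a typical caller pairs
 * the unit lock routines above around access to the unit structure, e.g.
 *
 *	md_unit_t	*un;
 *
 *	un = (md_unit_t *)md_unit_readerlock(ui);	// shared, to examine
 *	... read fields of un ...
 *	md_unit_readerexit(ui);
 *
 *	un = (md_unit_t *)md_unit_writerlock(ui);	// exclusive, to modify
 *	... update un ...
 *	md_unit_writerexit(ui);
 *
 * The reader/writer state lives in ui->ui_lock and ui->ui_readercnt and is
 * always manipulated under ui->ui_mx, as the _common routines above show.
 */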
429 
430 void *
431 md_io_readerlock(mdi_unit_t *ui)
432 {
433 	md_io_lock_t	*io = ui->ui_io_lock;
434 
435 	ASSERT(io);  /* checks case where no io lock allocated */
436 	mutex_enter(&io->io_mx);
437 	while (io->io_lock & (MD_UL_WRITER | MD_UL_WANABEWRITER)) {
438 		if (panicstr) {
439 			if (io->io_lock & MD_UL_WRITER)
440 				panic("md: writer lock is held");
441 			break;
442 		}
443 		cv_wait(&io->io_cv, &io->io_mx);
444 	}
445 	io->io_readercnt++;
446 	mutex_exit(&io->io_mx);
447 	return (MD_UNIT(ui->ui_link.ln_id));
448 }
449 
450 void *
451 md_io_writerlock(mdi_unit_t *ui)
452 {
453 	md_io_lock_t	*io = ui->ui_io_lock;
454 
455 	ASSERT(io);  /* checks case where no io lock allocated */
456 	if (panicstr)
457 		panic("md: writer lock not allowed");
458 
459 	mutex_enter(&io->io_mx);
460 	while ((io->io_lock & MD_UL_WRITER) || (io->io_readercnt != 0)) {
461 		io->io_wanabecnt++;
462 		io->io_lock |= MD_UL_WANABEWRITER;
463 		cv_wait(&io->io_cv, &io->io_mx);
464 		if (--io->io_wanabecnt == 0)
465 			io->io_lock &= ~MD_UL_WANABEWRITER;
466 	}
467 	io->io_lock |= MD_UL_WRITER;
468 	io->io_owner = curthread;
469 
470 	mutex_exit(&io->io_mx);
471 	return (MD_UNIT(ui->ui_link.ln_id));
472 }
473 
474 void
475 md_io_readerexit(mdi_unit_t *ui)
476 {
477 	md_io_lock_t	*io = ui->ui_io_lock;
478 
479 	mutex_enter(&io->io_mx);
480 	ASSERT((io->io_lock & MD_UL_WRITER) == 0);
481 	ASSERT(io->io_readercnt != 0);
482 	io->io_readercnt--;
483 	if ((io->io_wanabecnt != 0) && (io->io_readercnt == 0)) {
484 		cv_broadcast(&io->io_cv);
485 	}
486 	mutex_exit(&io->io_mx);
487 }
488 
489 void
490 md_io_writerexit(mdi_unit_t *ui)
491 {
492 	md_io_lock_t	*io = ui->ui_io_lock;
493 
494 	mutex_enter(&io->io_mx);
495 	ASSERT((io->io_lock & MD_UL_WRITER) != 0);
496 	ASSERT(io->io_readercnt == 0);
497 	io->io_lock &= ~MD_UL_WRITER;
498 	io->io_owner = NULL;
499 
500 	cv_broadcast(&io->io_cv);
501 	mutex_exit(&io->io_mx);
502 }
503 
504 /*
505  * Attempt to grab the set of locks defined as global.
506  * A mask describing the global locks that are already owned on
507  * entry is passed in.  Any remaining global locks are then grabbed.
508  * This keeps the caller from having to know the full set of global
509  * locks.
510  */
511 static int
512 md_global_lock_enter(int global_locks_owned_mask)
513 {
514 
515 	/*
516 	 * The current implementation has been verified by inspection
517 	 * and test to be deadlock free.  If another global lock is
518 	 * added, changing the algorithm used by this function should
519 	 * be considered.  With more than 2 locks it is difficult to
520 	 * guarantee that locks are being acquired in the correct order.
521 	 * The safe approach would be to drop all of the locks that are
522 	 * owned at function entry and then reacquire all of the locks
523 	 * in the order defined by the lock hierarchy.
524 	 */
525 	mutex_enter(&md_mx);
526 	if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
527 		while ((md_mtioctl_cnt != 0) ||
528 		    (md_status & MD_GBL_IOCTL_LOCK)) {
529 			if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
530 				mutex_exit(&md_mx);
531 				return (EINTR);
532 			}
533 		}
534 		md_status |= MD_GBL_IOCTL_LOCK;
535 		md_ioctl_cnt++;
536 	}
537 	if (!(global_locks_owned_mask & MD_GBL_HS_LOCK)) {
538 		while (md_status & MD_GBL_HS_LOCK) {
539 			if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
540 				md_status &= ~MD_GBL_IOCTL_LOCK;
541 				mutex_exit(&md_mx);
542 				return (EINTR);
543 			}
544 		}
545 		md_status |= MD_GBL_HS_LOCK;
546 	}
547 	mutex_exit(&md_mx);
548 	return (0);
549 }
550 
551 /*
552  * Release the set of global locks that were grabbed in md_global_lock_enter
553  * that were not already owned by the calling thread.  The set of previously
554  * owned global locks is passed in as a mask parameter.
555  */
556 static int
557 md_global_lock_exit(int global_locks_owned_mask, int code,
558 	int flags, mdi_unit_t *ui)
559 {
560 	mutex_enter(&md_mx);
561 
562 	/* If MT ioctl decrement mt_ioctl_cnt */
563 	if ((flags & MD_MT_IOCTL)) {
564 		md_mtioctl_cnt--;
565 	} else {
566 		if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
567 			/* clear the lock and decrement count */
568 			ASSERT(md_ioctl_cnt == 1);
569 			md_ioctl_cnt--;
570 			md_status &= ~MD_GBL_IOCTL_LOCK;
571 		}
572 		if (!(global_locks_owned_mask & MD_GBL_HS_LOCK))
573 			md_status &= ~MD_GBL_HS_LOCK;
574 	}
575 	if (flags & MD_READER_HELD)
576 		md_unit_readerexit(ui);
577 	if (flags & MD_WRITER_HELD)
578 		md_unit_writerexit(ui);
579 	if (flags & MD_IO_HELD)
580 		md_io_writerexit(ui);
581 	if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
582 		rw_exit(&md_unit_array_rw.lock);
583 	}
584 	cv_broadcast(&md_cv);
585 	mutex_exit(&md_mx);
586 
587 	return (code);
588 }
589 
590 /*
591  * The two functions, md_ioctl_lock_enter and md_ioctl_lock_exit, make
592  * use of the md_global_lock_{enter|exit} functions to avoid duplication
593  * of code.  They rely upon the fact that the locks that are specified in
594  * the input mask are not acquired or freed.  If this algorithm changes
595  * as described in the block comment at the beginning of md_global_lock_enter
596  * then it will be necessary to change these 2 functions.  Otherwise these
597  * functions will be grabbing and holding global locks unnecessarily.
598  */
599 int
600 md_ioctl_lock_enter(void)
601 {
602 	/* grab only the ioctl lock */
603 	return (md_global_lock_enter(~MD_GBL_IOCTL_LOCK));
604 }
605 
606 /*
607  * If md_ioctl_lock_exit is being called at the end of an ioctl before
608  * returning to user space, then ioctl_end is set to 1.
609  * Otherwise, the ioctl lock is being dropped in the middle of handling
610  * an ioctl and will be reacquired before the end of the ioctl.
611  * Do not attempt to process the MN diskset mddb parse flags unless
612  * ioctl_end is true - otherwise a deadlock situation could arise.
613  */
614 int
615 md_ioctl_lock_exit(int code, int flags, mdi_unit_t *ui, int ioctl_end)
616 {
617 	int				ret_val;
618 	uint_t				status;
619 	mddb_set_t			*s;
620 	int				i, j;
621 	int				err;
622 	md_mn_msg_mddb_parse_t		*mddb_parse_msg;
623 	md_mn_kresult_t			*kresult;
624 	mddb_lb_t			*lbp;
625 	int				rval = 1;
626 	int				flag;
627 
628 	/* release only the ioctl lock */
629 	ret_val = md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);
630 
631 	/*
632 	 * If md_ioctl_lock_exit is being called with a possible lock held
633 	 * (ioctl_end is 0), then don't check the MN disksets since the
634 	 * call to mddb_setenter may cause a lock ordering deadlock.
635 	 */
636 	if (!ioctl_end)
637 		return (ret_val);
638 
639 	/*
640 	 * Walk through disksets to see if there is a MN diskset that
641 	 * has messages that need to be sent.  Set must be snarfed and
642 	 * be a MN diskset in order to be checked.
643 	 *
644 	 * In a MN diskset, this routine may send messages to the
645 	 * rpc.mdcommd in order to have the slave nodes re-parse parts
646 	 * of the mddb.  Messages can only be sent with no locks held,
647 	 * so if mddb change occurred while the ioctl lock is held, this
648 	 * routine must send the messages.
649 	 */
650 	for (i = 1; i < md_nsets; i++) {
651 		status = md_get_setstatus(i);
652 
653 		/* Set must be snarfed and be a MN diskset */
654 		if ((status & (MD_SET_SNARFED | MD_SET_MNSET)) !=
655 		    (MD_SET_SNARFED | MD_SET_MNSET))
656 			continue;
657 
658 		/* Grab set lock so that set can't change */
659 		if ((s = mddb_setenter(i, MDDB_MUSTEXIST, &err)) == NULL)
660 			continue;
661 
662 		lbp = s->s_lbp;
663 
664 		/* Re-get set status now that lock is held */
665 		status = md_get_setstatus(i);
666 
667 		/*
668 		 * If MN parsing block flag is set - continue to next set.
669 		 *
670 		 * If s_mn_parseflags_sending is non-zero, then another thread
671 		 * is already currently sending a parse message, so just
672 		 * release the set mutex.  If this ioctl had caused an mddb
673 		 * change that results in a parse message to be generated,
674 		 * the thread that is currently sending a parse message would
675 		 * generate the additional parse message.
676 		 *
677 		 * If s_mn_parseflags_sending is zero then loop until
678 		 * s_mn_parseflags is 0 (until there are no more
679 		 * messages to send).
680 		 * While s_mn_parseflags is non-zero,
681 		 *	put snapshot of parse_flags in s_mn_parseflags_sending
682 		 *	set s_mn_parseflags to zero
683 		 *	release set mutex
684 		 *	send message
685 		 *	re-grab set mutex
686 		 *	set s_mn_parseflags_sending to zero
687 		 *
688 		 * If set is STALE, send message with NO_LOG flag so that
689 		 * rpc.mdcommd won't attempt to log message to non-writeable
690 		 * replica.
691 		 */
692 		mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t),
693 		    KM_SLEEP);
694 		while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
695 		    (s->s_mn_parseflags & MDDB_PARSE_MASK) &&
696 		    (!(status & MD_SET_MNPARSE_BLK))) {
697 
698 			/* Grab snapshot of parse flags */
699 			s->s_mn_parseflags_sending = s->s_mn_parseflags;
700 			s->s_mn_parseflags = 0;
701 
702 			mutex_exit(&md_set[(s)->s_setno].s_dbmx);
703 
704 			/*
705 			 * Send the message to the slaves to re-parse
706 			 * the indicated portions of the mddb. Send the status
707 			 * of the 50 mddbs in this set so that slaves know
708 			 * which mddbs the master node thinks are 'good'.
709 			 * Otherwise, a slave may reparse, but from the wrong
710 			 * replica.
711 			 */
712 			mddb_parse_msg->msg_parse_flags =
713 			    s->s_mn_parseflags_sending;
714 
715 			for (j = 0; j < MDDB_NLB; j++) {
716 				mddb_parse_msg->msg_lb_flags[j] =
717 				    lbp->lb_locators[j].l_flags;
718 			}
719 			kresult = kmem_zalloc(sizeof (md_mn_kresult_t),
720 			    KM_SLEEP);
721 			while (rval != 0) {
722 				flag = 0;
723 				if (status & MD_SET_STALE)
724 					flag |= MD_MSGF_NO_LOG;
725 				rval = mdmn_ksend_message(s->s_setno,
726 				    MD_MN_MSG_MDDB_PARSE, flag, 0,
727 				    (char *)mddb_parse_msg,
728 				    sizeof (md_mn_msg_mddb_parse_t), kresult);
729 				/* if the node hasn't yet joined, it's Ok. */
730 				if ((!MDMN_KSEND_MSG_OK(rval, kresult)) &&
731 				    (kresult->kmmr_comm_state !=
732 				    MDMNE_NOT_JOINED)) {
733 					mdmn_ksend_show_error(rval, kresult,
734 					    "MD_MN_MSG_MDDB_PARSE");
735 					cmn_err(CE_WARN, "md_ioctl_lock_exit: "
736 					    "Unable to send mddb update "
737 					    "message to other nodes in "
738 					    "diskset %s\n", s->s_setname);
739 					rval = 1;
740 				}
741 			}
742 			kmem_free(kresult, sizeof (md_mn_kresult_t));
743 
744 			/*
745 			 * Re-grab mutex to clear sending field and to
746 			 * see if another parse message needs to be generated.
747 			 */
748 			mutex_enter(&md_set[(s)->s_setno].s_dbmx);
749 			s->s_mn_parseflags_sending = 0;
750 		}
751 		kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
752 		mutex_exit(&md_set[(s)->s_setno].s_dbmx);
753 	}
754 	return (ret_val);
755 }
756 
757 /*
758  * Called when in an ioctl and need readerlock.
759  */
760 void *
761 md_ioctl_readerlock(IOLOCK *lock, mdi_unit_t *ui)
762 {
763 	ASSERT(lock != NULL);
764 	lock->l_ui = ui;
765 	lock->l_flags |= MD_READER_HELD;
766 	return (md_unit_readerlock_common(ui, 0));
767 }
768 
769 /*
770  * Called when in an ioctl and need writerlock.
771  */
772 void *
773 md_ioctl_writerlock(IOLOCK *lock, mdi_unit_t *ui)
774 {
775 	ASSERT(lock != NULL);
776 	lock->l_ui = ui;
777 	lock->l_flags |= MD_WRITER_HELD;
778 	return (md_unit_writerlock_common(ui, 0));
779 }
780 
781 void *
782 md_ioctl_io_lock(IOLOCK *lock, mdi_unit_t *ui)
783 {
784 	ASSERT(lock != NULL);
785 	lock->l_ui = ui;
786 	lock->l_flags |= MD_IO_HELD;
787 	return (md_io_writerlock(ui));
788 }
789 
790 void
791 md_ioctl_readerexit(IOLOCK *lock)
792 {
793 	ASSERT(lock != NULL);
794 	lock->l_flags &= ~MD_READER_HELD;
795 	md_unit_readerexit(lock->l_ui);
796 }
797 
798 void
799 md_ioctl_writerexit(IOLOCK *lock)
800 {
801 	ASSERT(lock != NULL);
802 	lock->l_flags &= ~MD_WRITER_HELD;
803 	md_unit_writerexit(lock->l_ui);
804 }
805 
806 void
807 md_ioctl_io_exit(IOLOCK *lock)
808 {
809 	ASSERT(lock != NULL);
810 	lock->l_flags &= ~MD_IO_HELD;
811 	md_io_writerexit(lock->l_ui);
812 }
813 
814 /*
815  * md_ioctl_releaselocks:
816  * --------------------
817  * Release the unit locks that are held and stop subsequent
818  * md_unit_reader/writerlock calls from progressing. This allows the caller
819  * to send messages across the cluster when running in a multinode
820  * environment.
821  * ioctl originated locks (via md_ioctl_readerlock/md_ioctl_writerlock) are
822  * allowed to progress as normal. This is required as these typically are
823  * invoked by the message handler that may be called while a unit lock is
824  * marked as released.
825  *
826  * On entry:
827  *	variety of unit locks may be held including ioctl lock
828  *
829  * On exit:
830  *      locks released and unit structure updated to prevent subsequent reader/
831  *      writer locks being acquired until md_ioctl_reacquirelocks is called
832  */
833 void
834 md_ioctl_releaselocks(int code, int flags, mdi_unit_t *ui)
835 {
836 	/* This actually releases the locks. */
837 	(void) md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);
838 }
839 
840 /*
841  * md_ioctl_reacquirelocks:
842  * ----------------------
843  * Reacquire the locks that were held when md_ioctl_releaselocks
844  * was called.
845  *
846  * On entry:
847  *      No unit locks held
848  * On exit:
849  *	locks held that were held at md_ioctl_releaselocks time including
850  *	the ioctl lock.
851  */
852 void
853 md_ioctl_reacquirelocks(int flags, mdi_unit_t *ui)
854 {
855 	if (flags & MD_MT_IOCTL) {
856 		mutex_enter(&md_mx);
857 		md_mtioctl_cnt++;
858 		mutex_exit(&md_mx);
859 	} else {
860 		while (md_ioctl_lock_enter() == EINTR)
861 			;
862 	}
863 	if (flags & MD_ARRAY_WRITER) {
864 		rw_enter(&md_unit_array_rw.lock, RW_WRITER);
865 	} else if (flags & MD_ARRAY_READER) {
866 		rw_enter(&md_unit_array_rw.lock, RW_READER);
867 	}
868 	if (ui != (mdi_unit_t *)NULL) {
869 		if (flags & MD_IO_HELD) {
870 			(void) md_io_writerlock(ui);
871 		}
872 
873 		mutex_enter(&ui->ui_mx);
874 		if (flags & MD_READER_HELD) {
875 			(void) md_unit_readerlock_common(ui, 1);
876 		} else if (flags & MD_WRITER_HELD) {
877 			(void) md_unit_writerlock_common(ui, 1);
878 		}
879 		/* Wake up any blocked readerlock() calls */
880 		cv_broadcast(&ui->ui_cv);
881 		mutex_exit(&ui->ui_mx);
882 	}
883 }
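/*
 * Editor's sketch (not part of the original source): the intended use of the
 * two routines above is to bracket a cluster message sent from ioctl context,
 * since messages may only be sent with no unit locks held:
 *
 *	md_ioctl_releaselocks(0, lockp->l_flags, lockp->l_ui);
 *	rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
 *	    (char *)data, size, kresult);
 *	md_ioctl_reacquirelocks(lockp->l_flags, lockp->l_ui);
 *
 * The message type, flags and payload are placeholders; the flags/unit
 * arguments passed to reacquirelocks must match those in effect when the
 * locks were originally taken so that exactly those locks are re-established.
 */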
884 
885 void
886 md_ioctl_droplocks(IOLOCK *lock)
887 {
888 	mdi_unit_t	*ui;
889 	int		flags;
890 
891 	ASSERT(lock != NULL);
892 	ui = lock->l_ui;
893 	flags = lock->l_flags;
894 	if (flags & MD_READER_HELD) {
895 		lock->l_flags &= ~MD_READER_HELD;
896 		md_unit_readerexit(ui);
897 	}
898 	if (flags & MD_WRITER_HELD) {
899 		lock->l_flags &= ~MD_WRITER_HELD;
900 		md_unit_writerexit(ui);
901 	}
902 	if (flags & MD_IO_HELD) {
903 		lock->l_flags &= ~MD_IO_HELD;
904 		md_io_writerexit(ui);
905 	}
906 	if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
907 		lock->l_flags &= ~(MD_ARRAY_WRITER | MD_ARRAY_READER);
908 		rw_exit(&md_unit_array_rw.lock);
909 	}
910 }
911 
912 void
913 md_array_writer(IOLOCK *lock)
914 {
915 	ASSERT(lock != NULL);
916 	lock->l_flags |= MD_ARRAY_WRITER;
917 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
918 }
919 
920 void
921 md_array_reader(IOLOCK *lock)
922 {
923 	ASSERT(lock != NULL);
924 	lock->l_flags |= MD_ARRAY_READER;
925 	rw_enter(&md_unit_array_rw.lock, RW_READER);
926 }
927 
928 /*
929  * Called when in an ioctl and need opencloselock.
930  * Sets flags in lockp for READER_HELD.
931  */
932 void *
933 md_ioctl_openclose_enter(IOLOCK *lockp, mdi_unit_t *ui)
934 {
935 	void	*un;
936 
937 	ASSERT(lockp != NULL);
938 	mutex_enter(&ui->ui_mx);
939 	while (ui->ui_lock & MD_UL_OPENORCLOSE)
940 		cv_wait(&ui->ui_cv, &ui->ui_mx);
941 	ui->ui_lock |= MD_UL_OPENORCLOSE;
942 
943 	/* Maintain mutex across the readerlock call */
944 	lockp->l_ui = ui;
945 	lockp->l_flags |= MD_READER_HELD;
946 	un = md_unit_readerlock_common(ui, 1);
947 	mutex_exit(&ui->ui_mx);
948 
949 	return (un);
950 }
951 
952 /*
953  * Clears reader lock using md_ioctl instead of md_unit
954  * and updates lockp.
955  */
956 void
957 md_ioctl_openclose_exit(IOLOCK *lockp)
958 {
959 	mdi_unit_t	*ui;
960 
961 	ASSERT(lockp != NULL);
962 	ui = lockp->l_ui;
963 	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
964 
965 	md_ioctl_readerexit(lockp);
966 
967 	mutex_enter(&ui->ui_mx);
968 	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
969 
970 	cv_broadcast(&ui->ui_cv);
971 	mutex_exit(&ui->ui_mx);
972 }
973 
974 /*
975  * Clears reader lock using md_ioctl instead of md_unit
976  * and updates lockp.
977  * Does not acquire or release the ui_mx lock since the calling
978  * routine has already acquired this lock.
979  */
980 void
981 md_ioctl_openclose_exit_lh(IOLOCK *lockp)
982 {
983 	mdi_unit_t	*ui;
984 
985 	ASSERT(lockp != NULL);
986 	ui = lockp->l_ui;
987 	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
988 
989 	lockp->l_flags &= ~MD_READER_HELD;
990 	md_unit_readerexit_common(lockp->l_ui, 1);
991 
992 	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
993 	cv_broadcast(&ui->ui_cv);
994 }
995 
996 void *
997 md_unit_openclose_enter(mdi_unit_t *ui)
998 {
999 	void	*un;
1000 
1001 	mutex_enter(&ui->ui_mx);
1002 	while (ui->ui_lock & (MD_UL_OPENORCLOSE))
1003 		cv_wait(&ui->ui_cv, &ui->ui_mx);
1004 	ui->ui_lock |= MD_UL_OPENORCLOSE;
1005 
1006 	/* Maintain mutex across the readerlock call */
1007 	un = md_unit_readerlock_common(ui, 1);
1008 	mutex_exit(&ui->ui_mx);
1009 
1010 	return (un);
1011 }
1012 
1013 void
1014 md_unit_openclose_exit(mdi_unit_t *ui)
1015 {
1016 	md_unit_readerexit(ui);
1017 
1018 	mutex_enter(&ui->ui_mx);
1019 	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
1020 	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
1021 
1022 	cv_broadcast(&ui->ui_cv);
1023 	mutex_exit(&ui->ui_mx);
1024 }
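/*
 * Editor's sketch (not part of the original source): open and close paths
 * typically serialize on the unit using the two routines above, e.g.
 *
 *	un = md_unit_openclose_enter(ui);	// also takes the readerlock
 *	err = md_unit_incopen(mnum, flag, otyp);
 *	md_unit_openclose_exit(ui);		// drops the readerlock too
 *
 * This is the same pattern md_layered_open()/md_layered_close() use further
 * down in this file when a module has no open/close routine of its own.
 */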
1025 
1026 /*
1027  * Drop the openclose and readerlocks without acquiring or
1028  * releasing the ui_mx lock since the calling routine has
1029  * already acquired this lock.
1030  */
1031 void
1032 md_unit_openclose_exit_lh(mdi_unit_t *ui)
1033 {
1034 	md_unit_readerexit_common(ui, 1);
1035 	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
1036 	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
1037 	cv_broadcast(&ui->ui_cv);
1038 }
1039 
1040 int
1041 md_unit_isopen(
1042 	mdi_unit_t	*ui
1043 )
1044 {
1045 	int		isopen;
1046 
1047 	/* check status */
1048 	mutex_enter(&ui->ui_mx);
1049 	isopen = ((ui->ui_lock & MD_UL_OPEN) ? 1 : 0);
1050 	mutex_exit(&ui->ui_mx);
1051 	return (isopen);
1052 }
1053 
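/*
 * md_unit_incopen:
 * Record an open of the given type against the unit.  Returns EINVAL for an
 * invalid open type and EBUSY when an exclusive open conflicts with existing
 * opens (or an exclusive open is already in effect).  On success the per-type
 * open count is incremented, MD_UL_OPEN (plus MD_UL_EXCL for FEXCL opens) is
 * set and the unit's kstats are initialized.
 */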
1054 int
1055 md_unit_incopen(
1056 	minor_t		mnum,
1057 	int		flag,
1058 	int		otyp
1059 )
1060 {
1061 	mdi_unit_t	*ui = MDI_UNIT(mnum);
1062 	int		err = 0;
1063 
1064 	/* check type and flags */
1065 	ASSERT(ui != NULL);
1066 	mutex_enter(&ui->ui_mx);
1067 	if ((otyp < 0) || (otyp >= OTYPCNT)) {
1068 		err = EINVAL;
1069 		goto out;
1070 	}
1071 	if (((flag & FEXCL) && (ui->ui_lock & MD_UL_OPEN)) ||
1072 	    (ui->ui_lock & MD_UL_EXCL)) {
1073 		err = EBUSY;
1074 		goto out;
1075 	}
1076 
1077 	/* count and flag open */
1078 	ui->ui_ocnt[otyp]++;
1079 	ui->ui_lock |= MD_UL_OPEN;
1080 	if (flag & FEXCL)
1081 		ui->ui_lock |= MD_UL_EXCL;
1082 
1083 	/* setup kstat, return success */
1084 	mutex_exit(&ui->ui_mx);
1085 	md_kstat_init(mnum);
1086 	return (0);
1087 
1088 	/* return error */
1089 out:
1090 	mutex_exit(&ui->ui_mx);
1091 	return (err);
1092 }
1093 
1094 int
1095 md_unit_decopen(
1096 	minor_t		mnum,
1097 	int		otyp
1098 )
1099 {
1100 	mdi_unit_t	*ui = MDI_UNIT(mnum);
1101 	int		err = 0;
1102 	unsigned	i;
1103 
1104 	/* check type and flags */
1105 	ASSERT(ui != NULL);
1106 	mutex_enter(&ui->ui_mx);
1107 	if ((otyp < 0) || (otyp >= OTYPCNT)) {
1108 		err = EINVAL;
1109 		goto out;
1110 	} else if (ui->ui_ocnt[otyp] == 0) {
1111 		err = ENXIO;
1112 		goto out;
1113 	}
1114 
1115 	/* count and flag closed */
1116 	if (otyp == OTYP_LYR)
1117 		ui->ui_ocnt[otyp]--;
1118 	else
1119 		ui->ui_ocnt[otyp] = 0;
1120 	ui->ui_lock &= ~MD_UL_OPEN;
1121 	for (i = 0; (i < OTYPCNT); ++i)
1122 		if (ui->ui_ocnt[i] != 0)
1123 			ui->ui_lock |= MD_UL_OPEN;
1124 	if (! (ui->ui_lock & MD_UL_OPEN))
1125 		ui->ui_lock &= ~MD_UL_EXCL;
1126 
1127 	/* teardown kstat, return success */
1128 	if (! (ui->ui_lock & MD_UL_OPEN)) {
1129 		mutex_exit(&ui->ui_mx);
1130 		md_kstat_destroy(mnum);
1131 		return (0);
1132 	}
1133 
1134 	/* fall through: return success, or the error set above via goto */
1135 out:
1136 	mutex_exit(&ui->ui_mx);
1137 	return (err);
1138 }
1139 
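/*
 * md_xlate_targ_2_mini / md_xlate_mini_2_targ:
 * During an operating system upgrade the target (installed) system and the
 * install miniroot may know the same physical device under different dev_t
 * values.  These routines translate between the two using the tuple table
 * loaded for the upgrade; when no upgrade is in progress they simply return
 * the device passed in.  NODEV64 is returned if no translation is found.
 */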
1140 md_dev64_t
1141 md_xlate_targ_2_mini(md_dev64_t targ_devt)
1142 {
1143 	dev32_t		mini_32_devt, targ_32_devt;
1144 	int		i;
1145 
1146 	/*
1147 	 * Check to see if we're in an upgrade situation.
1148 	 * If we are not in an upgrade, just return the input device.
1149 	 */
1150 
1151 	if (!MD_UPGRADE)
1152 		return (targ_devt);
1153 
1154 	targ_32_devt = md_cmpldev(targ_devt);
1155 
1156 	i = 0;
1157 	while (i != md_tuple_length) {
1158 		if (md_tuple_table[i].targ_devt == targ_32_devt) {
1159 			mini_32_devt = md_tuple_table[i].mini_devt;
1160 			return (md_expldev((md_dev64_t)mini_32_devt));
1161 		}
1162 		i++;
1163 	}
1164 	return (NODEV64);
1165 }
1166 
1167 md_dev64_t
1168 md_xlate_mini_2_targ(md_dev64_t mini_devt)
1169 {
1170 	dev32_t		mini_32_devt, targ_32_devt;
1171 	int		i;
1172 
1173 	if (!MD_UPGRADE)
1174 		return (mini_devt);
1175 
1176 	mini_32_devt = md_cmpldev(mini_devt);
1177 
1178 	i = 0;
1179 	while (i != md_tuple_length) {
1180 		if (md_tuple_table[i].mini_devt == mini_32_devt) {
1181 			targ_32_devt = md_tuple_table[i].targ_devt;
1182 			return (md_expldev((md_dev64_t)targ_32_devt));
1183 		}
1184 		i++;
1185 	}
1186 	return (NODEV64);
1187 }
1188 
1189 void
1190 md_xlate_free(int size)
1191 {
1192 	kmem_free(md_tuple_table, size);
1193 }
1194 
1195 char *
1196 md_targ_major_to_name(major_t maj)
1197 {
1198 	char *drv_name = NULL;
1199 	int	i;
1200 
1201 	if (!MD_UPGRADE)
1202 		return (ddi_major_to_name(maj));
1203 
1204 	for (i = 0; i < md_majortab_len; i++) {
1205 		if (md_major_tuple_table[i].targ_maj == maj) {
1206 			drv_name = md_major_tuple_table[i].drv_name;
1207 			break;
1208 		}
1209 	}
1210 	return (drv_name);
1211 }
1212 
1213 major_t
1214 md_targ_name_to_major(char *drv_name)
1215 {
1216 	major_t maj;
1217 	int	i;
1218 
1219 	maj = md_getmajor(NODEV64);
1220 	if (!MD_UPGRADE)
1221 		return (ddi_name_to_major(drv_name));
1222 
1223 	for (i = 0; i < md_majortab_len; i++) {
1224 		if ((strcmp(md_major_tuple_table[i].drv_name,
1225 		    drv_name)) == 0) {
1226 			maj = md_major_tuple_table[i].targ_maj;
1227 			break;
1228 		}
1229 	}
1230 
1231 	return (maj);
1232 }
1233 
1234 void
1235 md_majortab_free()
1236 {
1237 	size_t	sz;
1238 	int	i;
1239 
1240 	for (i = 0; i < md_majortab_len; i++) {
1241 		freestr(md_major_tuple_table[i].drv_name);
1242 	}
1243 
1244 	sz = md_majortab_len * sizeof (struct md_xlate_major_table);
1245 	kmem_free(md_major_tuple_table, sz);
1246 }
1247 
1248 /* functions return a pointer to a function which returns an int */
1249 
1250 intptr_t (*
1251 md_get_named_service(md_dev64_t dev, int modindex, char *name,
1252 	intptr_t (*Default)()))()
1253 {
1254 	mdi_unit_t		*ui;
1255 	md_named_services_t	*sp;
1256 	int			i;
1257 
1258 	/*
1259 	 * Return the first named service found.
1260 	 * Use this path when it is known that there is only
1261 	 * one named service possible (e.g., hotspare interface)
1262 	 */
1263 	if ((dev == NODEV64) && (modindex == ANY_SERVICE)) {
1264 		for (i = 0; i < MD_NOPS; i++) {
1265 			if (md_ops[i] == NULL) {
1266 				continue;
1267 			}
1268 			sp = md_ops[i]->md_services;
1269 			if (sp == NULL)
1270 				continue;
1271 			while (sp->md_service != NULL) {
1272 				if (strcmp(name, sp->md_name) == 0)
1273 					return (sp->md_service);
1274 				sp++;
1275 			}
1276 		}
1277 		return (Default);
1278 	}
1279 
1280 	/*
1281 	 * Return the named service for the given modindex.
1282 	 * This is used if there are multiple possible named services
1283 	 * and each one needs to be called (e.g., poke hotspares)
1284 	 */
1285 	if (dev == NODEV64) {
1286 		if (modindex >= MD_NOPS)
1287 			return (Default);
1288 
1289 		if (md_ops[modindex] == NULL)
1290 			return (Default);
1291 
1292 		sp = md_ops[modindex]->md_services;
1293 		if (sp == NULL)
1294 			return (Default);
1295 
1296 		while (sp->md_service != NULL) {
1297 			if (strcmp(name, sp->md_name) == 0)
1298 				return (sp->md_service);
1299 			sp++;
1300 		}
1301 		return (Default);
1302 	}
1303 
1304 	/*
1305 	 * Return the named service for this md_dev64_t
1306 	 */
1307 	if (md_getmajor(dev) != md_major)
1308 		return (Default);
1309 
1310 	if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) ||
1311 	    (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits))
1312 		return (NULL);
1313 
1314 
1315 	if ((ui = MDI_UNIT(md_getminor(dev))) == NULL)
1316 		return (NULL);
1317 
1318 	sp = md_ops[ui->ui_opsindex]->md_services;
1319 	if (sp == NULL)
1320 		return (Default);
1321 	while (sp->md_service != NULL) {
1322 		if (strcmp(name, sp->md_name) == 0)
1323 			return (sp->md_service);
1324 		sp++;
1325 	}
1326 	return (Default);
1327 }
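/*
 * Editor's sketch (not part of the original source): a caller that knows only
 * one module can provide a service looks it up and invokes it like this:
 *
 *	intptr_t	(*poke)();
 *
 *	poke = md_get_named_service(NODEV64, ANY_SERVICE,
 *	    "poke hotspares", (intptr_t (*)())NULL);
 *	if (poke != NULL)
 *		(void) (*poke)();
 *
 * The service name string and the empty argument list are illustrative; each
 * named service defines its own name and calling convention.
 */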
1328 
1329 /*
1330  * md_daemon callback routine
1331  */
1332 boolean_t
1333 callb_md_cpr(void *arg, int code)
1334 {
1335 	callb_cpr_t *cp = (callb_cpr_t *)arg;
1336 	int ret = 0;				/* assume success */
1337 
1338 	mutex_enter(cp->cc_lockp);
1339 
1340 	switch (code) {
1341 	case CB_CODE_CPR_CHKPT:
1342 		/*
1343 		 * Check for active resync threads
1344 		 */
1345 		mutex_enter(&md_cpr_resync.md_resync_mutex);
1346 		if ((md_cpr_resync.md_mirror_resync > 0) ||
1347 		    (md_cpr_resync.md_raid_resync > 0)) {
1348 			mutex_exit(&md_cpr_resync.md_resync_mutex);
1349 			cmn_err(CE_WARN, "There are Solaris Volume Manager "
1350 			    "synchronization threads running.");
1351 			cmn_err(CE_WARN, "Please try system suspension at "
1352 			    "a later time.");
1353 			ret = -1;
1354 			break;
1355 		}
1356 		mutex_exit(&md_cpr_resync.md_resync_mutex);
1357 
1358 		cp->cc_events |= CALLB_CPR_START;
1359 		while (!(cp->cc_events & CALLB_CPR_SAFE)) {
1360 			/* cv_timedwait() returns -1 if it times out. */
1361 			if ((ret = cv_timedwait(&cp->cc_callb_cv, cp->cc_lockp,
1362 			    lbolt + CPR_KTHREAD_TIMEOUT_SEC * hz)) == -1)
1363 				break;
 		}
1364 		break;
1365 
1366 	case CB_CODE_CPR_RESUME:
1367 		cp->cc_events &= ~CALLB_CPR_START;
1368 		cv_signal(&cp->cc_stop_cv);
1369 		break;
1370 	}
1371 	mutex_exit(cp->cc_lockp);
1372 	return (ret != -1);
1373 }
1374 
1375 void
1376 md_daemon(int pass_thru, mdq_anchor_t *anchor)
1377 {
1378 	daemon_queue_t  *dq;
1379 	callb_cpr_t	cprinfo;
1380 
1381 	if (pass_thru && (md_get_status() & MD_GBL_DAEMONS_LIVE))
1382 		return;
1383 	/*
1384 	 * Register cpr callback
1385 	 */
1386 	CALLB_CPR_INIT(&cprinfo, &anchor->a_mx, callb_md_cpr, "md_daemon");
1387 
1388 	/*CONSTCOND*/
1389 	while (1) {
1390 		mutex_enter(&anchor->a_mx);
1391 		while ((dq = anchor->dq.dq_next) == &(anchor->dq)) {
1392 			if (pass_thru) {
1393 				/*
1394 				 * CALLB_CPR_EXIT Will do
1395 				 * mutex_exit(&anchor->a_mx)
1396 				 */
1397 				CALLB_CPR_EXIT(&cprinfo);
1398 				return;
1399 			}
1400 			if (md_get_status() & MD_GBL_DAEMONS_DIE) {
1401 				mutex_exit(&anchor->a_mx);
1402 				mutex_enter(&md_mx);
1403 				md_num_daemons--;
1404 				mutex_exit(&md_mx);
1405 				/*
1406 				 * CALLB_CPR_EXIT will do
1407 				 * mutex_exit(&anchor->a_mx)
1408 				 */
1409 				mutex_enter(&anchor->a_mx);
1410 				CALLB_CPR_EXIT(&cprinfo);
1411 				thread_exit();
1412 			}
1413 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1414 			cv_wait(&anchor->a_cv, &anchor->a_mx);
1415 			CALLB_CPR_SAFE_END(&cprinfo, &anchor->a_mx);
1416 		}
1417 		dq->dq_prev->dq_next = dq->dq_next;
1418 		dq->dq_next->dq_prev = dq->dq_prev;
1419 		dq->dq_prev = dq->dq_next = NULL;
1420 		anchor->dq.qlen--;
1421 		mutex_exit(&anchor->a_mx);
1422 		(*(dq->dq_call))(dq);
1423 	}
1424 	/*NOTREACHED*/
1425 }
1426 
1427 /*
1428  * daemon_request:
1429  *
1430  * Adds requests to the appropriate requestq, which is
1431  * anchored by *anchor.
1432  * The request is the first element of a doubly linked circular list.
1433  * When the request is a single element, the forward and backward
1434  * pointers MUST point to the element itself.
1435  */
1436 
1437 void
1438 daemon_request(mdq_anchor_t *anchor, void (*func)(),
1439 				daemon_queue_t *request, callstyle_t style)
1440 {
1441 	daemon_queue_t *rqtp;
1442 	int i = 0;
1443 
1444 	rqtp = request;
1445 	if (style == REQ_OLD) {
1446 		ASSERT((rqtp->dq_next == NULL) && (rqtp->dq_prev == NULL));
1447 		/* set it to the new style */
1448 		rqtp->dq_prev = rqtp->dq_next = rqtp;
1449 	}
1450 	ASSERT((rqtp->dq_next != NULL) && (rqtp->dq_prev != NULL));
1451 
1452 	/* scan the list and add the function to each element */
1453 
1454 	do {
1455 		rqtp->dq_call = func;
1456 		i++;
1457 		rqtp = rqtp->dq_next;
1458 	} while (rqtp != request);
1459 
1460 	/* save pointer to tail of the request list */
1461 	rqtp = request->dq_prev;
1462 
1463 	mutex_enter(&anchor->a_mx);
1464 	/* stats */
1465 	anchor->dq.qlen += i;
1466 	anchor->dq.treqs += i;
1467 	anchor->dq.maxq_len = (anchor->dq.qlen > anchor->dq.maxq_len) ?
1468 	    anchor->dq.qlen : anchor->dq.maxq_len;
1469 
1470 	/* now add the list to request queue */
1471 	request->dq_prev = anchor->dq.dq_prev;
1472 	rqtp->dq_next = &anchor->dq;
1473 	anchor->dq.dq_prev->dq_next = request;
1474 	anchor->dq.dq_prev = rqtp;
1475 	cv_broadcast(&anchor->a_cv);
1476 	mutex_exit(&anchor->a_mx);
1477 }
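/*
 * Editor's sketch (not part of the original source): a single-element request
 * is typically queued in the REQ_OLD style, letting daemon_request() convert
 * it into the circular-list form expected by the daemon threads:
 *
 *	static void
 *	my_handler(daemon_queue_t *dq)
 *	{
 *		... process, then free or re-arm the request ...
 *	}
 *
 *	req->dq.dq_next = req->dq.dq_prev = NULL;	// REQ_OLD form
 *	daemon_request(&md_done_daemon, my_handler,
 *	    (daemon_queue_t *)req, REQ_OLD);
 *
 * Here 'req' is a caller-defined structure that embeds a daemon_queue_t
 * (conventionally as its first member, named dq above) so that the daemon can
 * hand the whole request back to the handler; all of these names are
 * hypothetical.
 */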
1478 
1479 void
1480 mddb_commitrec_wrapper(mddb_recid_t recid)
1481 {
1482 	int sent_log = 0;
1483 	uint_t retry = md_retry_cnt;
1484 	set_t	setno;
1485 
1486 	while (mddb_commitrec(recid)) {
1487 		if (! sent_log) {
1488 			cmn_err(CE_WARN,
1489 			    "md: state database commit failed");
1490 			sent_log = 1;
1491 		}
1492 		delay(md_hz);
1493 
1494 		/*
1495 		 * The retry cnt is one (pre-decremented) so that we
1496 		 * actually do no retries when committing/deleting an mddb
1497 		 * rec.  The underlying disk driver already does several
1498 		 * retries to check whether the disk is really dead, so there
1499 		 * is no reason for us to retry on top of the driver's retries.
1500 		 */
1501 
1502 		if (--retry == 0) {
1503 			setno = mddb_getsetnum(recid);
1504 			if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
1505 				panic(
1506 				    "md: Panic due to lack of DiskSuite state\n"
1507 				    " database replicas. Fewer than 50%% of "
1508 				    "the total were available,\n so panic to "
1509 				    "ensure data integrity.");
1510 			} else {
1511 				panic("md: state database problem");
1512 			}
1513 			/*NOTREACHED*/
1514 		}
1515 	}
1516 }
1517 
1518 void
1519 mddb_commitrecs_wrapper(mddb_recid_t *recids)
1520 {
1521 	int sent_log = 0;
1522 	uint_t retry = md_retry_cnt;
1523 	set_t	setno;
1524 
1525 	while (mddb_commitrecs(recids)) {
1526 		if (! sent_log) {
1527 			cmn_err(CE_WARN,
1528 			    "md: state database commit failed");
1529 			sent_log = 1;
1530 		}
1531 		delay(md_hz);
1532 
1533 		/*
1534 		 * The retry cnt is one (pre-decremented) so that we
1535 		 * actually do no retries when committing/deleting an mddb
1536 		 * rec.  The underlying disk driver already does several
1537 		 * retries to check whether the disk is really dead, so there
1538 		 * is no reason for us to retry on top of the driver's retries.
1539 		 */
1540 
1541 		if (--retry == 0) {
1542 			/*
1543 			 * since all the records are part of the same set
1544 			 * use the first one to get setno
1545 			 */
1546 			setno = mddb_getsetnum(*recids);
1547 			if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
1548 				panic(
1549 				    "md: Panic due to lack of DiskSuite state\n"
1550 				    " database replicas. Fewer than 50%% of "
1551 				    "the total were available,\n so panic to "
1552 				    "ensure data integrity.");
1553 			} else {
1554 				panic("md: state database problem");
1555 			}
1556 			/*NOTREACHED*/
1557 		}
1558 	}
1559 }
1560 
1561 void
1562 mddb_deleterec_wrapper(mddb_recid_t recid)
1563 {
1564 	int sent_log = 0;
1565 	uint_t retry = md_retry_cnt;
1566 	set_t	setno;
1567 
1568 	while (mddb_deleterec(recid)) {
1569 		if (! sent_log) {
1570 			cmn_err(CE_WARN,
1571 			    "md: state database delete failed");
1572 			sent_log = 1;
1573 		}
1574 		delay(md_hz);
1575 
1576 		/*
1577 		 * The retry cnt is one (pre-decremented) so that we
1578 		 * actually do no retries when committing/deleting an mddb
1579 		 * rec.  The underlying disk driver already does several
1580 		 * retries to check whether the disk is really dead, so there
1581 		 * is no reason for us to retry on top of the driver's retries.
1582 		 */
1583 
1584 		if (--retry == 0) {
1585 			setno = mddb_getsetnum(recid);
1586 			if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
1587 				panic(
1588 				    "md: Panic due to lack of DiskSuite state\n"
1589 				    " database replicas. Fewer than 50%% of "
1590 				    "the total were available,\n so panic to "
1591 				    "ensure data integrity.");
1592 			} else {
1593 				panic("md: state database problem");
1594 			}
1595 			/*NOTREACHED*/
1596 		}
1597 	}
1598 }
1599 
1600 /*
1601  * md_holdset_enter is called in order to hold the set in its
1602  * current state (loaded, unloaded, snarfed, unsnarfed, etc)
1603  * until md_holdset_exit is called.  This is used by the mirror
1604  * code to mark the set as HOLD so that the set won't be
1605  * unloaded while hotspares are being allocated in check_4_hotspares.
1606  * The original fix to the mirror code to hold the set was to call
1607  * md_haltsnarf_enter, but this will block all ioctls and ioctls
1608  * must work for a MN diskset while hotspares are allocated.
1609  */
1610 void
1611 md_holdset_enter(set_t setno)
1612 {
1613 	mutex_enter(&md_mx);
1614 	while (md_set[setno].s_status & MD_SET_HOLD)
1615 		cv_wait(&md_cv, &md_mx);
1616 	md_set[setno].s_status |= MD_SET_HOLD;
1617 	mutex_exit(&md_mx);
1618 }
1619 
1620 void
1621 md_holdset_exit(set_t setno)
1622 {
1623 	mutex_enter(&md_mx);
1624 	md_set[setno].s_status &= ~MD_SET_HOLD;
1625 	cv_broadcast(&md_cv);
1626 	mutex_exit(&md_mx);
1627 }
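/*
 * Editor's sketch (not part of the original source): the hold is simply
 * bracketed around work that must not race with a set unload, e.g.
 *
 *	md_holdset_enter(setno);
 *	... allocate hotspares for units in the set ...
 *	md_holdset_exit(setno);
 *
 * md_holdset_testandenter() below is the non-blocking variant, used when the
 * caller would rather back off than wait for an existing hold to clear.
 */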
1628 
1629 /*
1630  * Returns a 0 if this thread marked the set as HOLD (success),
1631  * returns a -1 if set was already marked HOLD (failure).
1632  * Used by the release_set code to see if set is marked HOLD.
1633  * HOLD is set by a daemon when hotspares are being allocated
1634  * to mirror units.
1635  */
1636 int
1637 md_holdset_testandenter(set_t setno)
1638 {
1639 	mutex_enter(&md_mx);
1640 	if (md_set[setno].s_status & MD_SET_HOLD) {
1641 		mutex_exit(&md_mx);
1642 		return (-1);
1643 	}
1644 	md_set[setno].s_status |= MD_SET_HOLD;
1645 	mutex_exit(&md_mx);
1646 	return (0);
1647 }
1648 
1649 void
1650 md_haltsnarf_enter(set_t setno)
1651 {
1652 	mutex_enter(&md_mx);
1653 	while (md_set[setno].s_status & MD_SET_SNARFING)
1654 		cv_wait(&md_cv, &md_mx);
1655 
1656 	md_set[setno].s_status |= MD_SET_SNARFING;
1657 	mutex_exit(&md_mx);
1658 }
1659 
1660 void
1661 md_haltsnarf_exit(set_t setno)
1662 {
1663 	mutex_enter(&md_mx);
1664 	md_set[setno].s_status &= ~MD_SET_SNARFING;
1665 	cv_broadcast(&md_cv);
1666 	mutex_exit(&md_mx);
1667 }
1668 
1669 void
1670 md_haltsnarf_wait(set_t setno)
1671 {
1672 	mutex_enter(&md_mx);
1673 	while (md_set[setno].s_status & MD_SET_SNARFING)
1674 		cv_wait(&md_cv, &md_mx);
1675 	mutex_exit(&md_mx);
1676 }
1677 
1678 /*
1679  * ASSUMED that the md_unit_array_rw WRITER lock is held.
1680  */
1681 int
1682 md_halt_set(set_t setno, enum md_haltcmd cmd)
1683 {
1684 	int	i, err;
1685 
1686 	if (md_set[setno].s_un == NULL || md_set[setno].s_ui == NULL) {
1687 		return (0);
1688 	}
1689 
1690 	if ((cmd == MD_HALT_CHECK) || (cmd == MD_HALT_ALL)) {
1691 		for (i = 0; i < MD_NOPS; i++) {
1692 			if (md_ops[i] == NULL)
1693 				continue;
1694 			if ((*(md_ops[i]->md_halt))(MD_HALT_CLOSE, setno)) {
1695 				for (--i; i > 0; --i) {
1696 					if (md_ops[i] == NULL)
1697 						continue;
1698 					(void) (*(md_ops[i]->md_halt))
1699 					    (MD_HALT_OPEN, setno);
1700 				}
1701 				return (EBUSY);
1702 			}
1703 		}
1704 
1705 		for (i = 0; i < MD_NOPS; i++) {
1706 			if (md_ops[i] == NULL)
1707 				continue;
1708 			if ((*(md_ops[i]->md_halt))(MD_HALT_CHECK, setno)) {
1709 				for (i = 0; i < MD_NOPS; i++) {
1710 					if (md_ops[i] == NULL)
1711 						continue;
1712 					(void) (*(md_ops[i]->md_halt))
1713 					    (MD_HALT_OPEN, setno);
1714 				}
1715 				return (EBUSY);
1716 			}
1717 		}
1718 	}
1719 
1720 	if ((cmd == MD_HALT_DOIT) || (cmd == MD_HALT_ALL)) {
1721 		for (i = 0; i < MD_NOPS; i++) {
1722 			if (md_ops[i] == NULL)
1723 				continue;
1724 			err = (*(md_ops[i]->md_halt))(MD_HALT_DOIT, setno);
1725 			if (err != 0)
1726 				cmn_err(CE_NOTE,
1727 				    "md: halt failed for %s, error %d",
1728 				    md_ops[i]->md_driver.md_drivername, err);
1729 		}
1730 
1731 		/*
1732 		 * Unload the devid namespace if it is loaded
1733 		 */
1734 		md_unload_namespace(setno, NM_DEVID);
1735 		md_unload_namespace(setno, 0L);
1736 		md_clr_setstatus(setno, MD_SET_SNARFED);
1737 	}
1738 
1739 	return (0);
1740 }
1741 
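/*
 * md_halt:
 * Quiesce and shut down the md driver: stop the daemon threads, give every
 * accessed set a chance to veto the halt (MD_HALT_CHECK), halt each such set
 * (MD_HALT_DOIT), tell each submodule it is about to be unloaded, ddi_modclose
 * the submodules and finally unload the state database.  Returns EINTR if the
 * global locks cannot be obtained, EBUSY if a set refuses the halt and 0 on
 * success.  The md_unit_array_rw WRITER lock is taken here and released via
 * md_global_lock_exit() with MD_ARRAY_WRITER.
 */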
1742 int
1743 md_halt(int global_locks_owned_mask)
1744 {
1745 	set_t			i, j;
1746 	int			err;
1747 	int			init_queues;
1748 	md_requestq_entry_t	*rqp;
1749 	md_ops_t		**pops, *ops, *lops;
1750 	ddi_modhandle_t		mod;
1751 	char			*name;
1752 
1753 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
1754 
1755 	/*
1756 	 * Grab all of the global locks that are not
1757 	 * already owned to ensure that there isn't another
1758 	 * thread trying to access a global resource
1759 	 * while the halt is in progress.
1760 	 */
1761 	if (md_global_lock_enter(global_locks_owned_mask) == EINTR) {
 		/* don't leak the unit array lock taken above */
 		rw_exit(&md_unit_array_rw.lock);
1762 		return (EINTR);
 	}
1763 
1764 	for (i = 0; i < md_nsets; i++)
1765 		md_haltsnarf_enter(i);
1766 
1767 	/*
1768 	 * Kill the daemon threads.
1769 	 */
1770 	init_queues = ((md_get_status() & MD_GBL_DAEMONS_LIVE) ? FALSE : TRUE);
1771 	md_clr_status(MD_GBL_DAEMONS_LIVE);
1772 	md_set_status(MD_GBL_DAEMONS_DIE);
1773 
1774 	rqp = &md_daemon_queues[0];
1775 	i = 0;
1776 	while (!NULL_REQUESTQ_ENTRY(rqp)) {
1777 		cv_broadcast(&rqp->dispq_headp->a_cv);
1778 		rqp = &md_daemon_queues[++i];
1779 	}
1780 
1781 	mutex_enter(&md_mx);
1782 	while (md_num_daemons != 0) {
1783 		mutex_exit(&md_mx);
1784 		delay(md_hz);
1785 		mutex_enter(&md_mx);
1786 	}
1787 	mutex_exit(&md_mx);
1788 	md_clr_status(MD_GBL_DAEMONS_DIE);
1789 
1790 	for (i = 0; i < md_nsets; i++)
1791 		/*
1792 		 * Only call into md_halt_set if s_un / s_ui are both set.
1793 		 * If they are NULL this set hasn't been accessed, so it is
1794 		 * pointless performing the call.
1795 		 */
1796 		if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
1797 			if (md_halt_set(i, MD_HALT_CHECK)) {
1798 				if (md_start_daemons(init_queues))
1799 					cmn_err(CE_WARN,
1800 					    "md: restart of daemon threads "
1801 					    "failed");
1802 				for (j = 0; j < md_nsets; j++)
1803 					md_haltsnarf_exit(j);
1804 
1805 				return (md_global_lock_exit(
1806 				    global_locks_owned_mask, EBUSY,
1807 				    MD_ARRAY_WRITER, NULL));
1808 			}
1809 		}
1810 
1811 	/*
1812 	 * if we get here we are going to do it
1813 	 */
1814 	for (i = 0; i < md_nsets; i++) {
1815 		/*
1816 		 * Only call into md_halt_set if s_un / s_ui are both set.
1817 		 * If they are NULL this set hasn't been accessed, so it is
1818 		 * pointless performing the call.
1819 		 */
1820 		if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
1821 			err = md_halt_set(i, MD_HALT_DOIT);
1822 			if (err != 0)
1823 				cmn_err(CE_NOTE,
1824 				    "md: halt failed set %u, error %d",
1825 				    (unsigned)i, err);
1826 		}
1827 	}
1828 
1829 	/*
1830 	 * issue a halt unload to each module to indicate that it
1831 	 * is about to be unloaded.  Each module is called once; the set
1832 	 * number has no meaning at this point in time.
1833 	 */
1834 	for (i = 0; i < MD_NOPS; i++) {
1835 		if (md_ops[i] == NULL)
1836 			continue;
1837 		err = (*(md_ops[i]->md_halt))(MD_HALT_UNLOAD, 0);
1838 		if (err != 0)
1839 			cmn_err(CE_NOTE,
1840 			    "md: halt failed for %s, error %d",
1841 			    md_ops[i]->md_driver.md_drivername, err);
1842 	}
1843 
1844 	/* ddi_modclose the submodules */
1845 	for (i = 0; i < MD_NOPS; i++) {
1846 		/* skip if not open */
1847 		if ((md_ops[i] == NULL) || (md_mods[i] == NULL))
1848 			continue;
1849 
1850 		/* find and unlink from md_opslist */
1851 		ops = md_ops[i];
1852 		mod = md_mods[i];
1853 		pops = &md_opslist;
1854 		for (lops = *pops; lops;
1855 		    pops = &lops->md_next, lops = *pops) {
1856 			if (lops == ops) {
1857 				*pops = ops->md_next;
1858 				ops->md_next = NULL;
1859 				break;
1860 			}
1861 		}
1862 
1863 		/* uninitialize */
1864 		name = ops->md_driver.md_drivername;
1865 		md_ops[i] = NULL;
1866 		md_mods[i] = NULL;
1867 		ops->md_selfindex = 0;
1868 		ops->md_driver.md_drivername[0] = '\0';
1869 		rw_destroy(&ops->md_link_rw.lock);
1870 
1871 		/* close */
1872 		err = ddi_modclose(mod);
1873 		if (err != 0)
1874 			cmn_err(CE_NOTE,
1875 			    "md: halt close failed for %s, error %d",
1876 			    name ? name : "UNKNOWN", err);
1877 	}
1878 
1879 	/* Unload the database */
1880 	mddb_unload();
1881 
1882 	md_set_status(MD_GBL_HALTED);	/* we are ready to be unloaded */
1883 
1884 	for (i = 0; i < md_nsets; i++)
1885 		md_haltsnarf_exit(i);
1886 
1887 	return (md_global_lock_exit(global_locks_owned_mask, 0,
1888 	    MD_ARRAY_WRITER, NULL));
1889 }
1890 
1891 /*
1892  * md_layered_open() is an internal routine only for SVM modules.
1893  * So the input device will be an md_dev64_t, because all SVM modules internally
1894  * work with that device type.
1895  * ddi routines on the other hand work with dev_t. So, if we call any ddi
1896  * routines from here we first have to convert that device into a dev_t.
1897  */
1898 
1899 int
1900 md_layered_open(
1901 	minor_t		mnum,
1902 	md_dev64_t	*dev,
1903 	int		md_oflags
1904 )
1905 {
1906 	int		flag = (FREAD | FWRITE);
1907 	cred_t		*cred_p = kcred;
1908 	major_t		major;
1909 	int		err;
1910 	dev_t		ddi_dev = md_dev64_to_dev(*dev);
1911 
1912 	if (ddi_dev == NODEV)
1913 		return (ENODEV);
1914 
1915 	major = getmajor(ddi_dev);
1916 
1917 	/* metadevice */
1918 	if (major == md_major) {
1919 		mdi_unit_t	*ui;
1920 
1921 		/* open underlying driver */
1922 		mnum = getminor(ddi_dev);
1923 
1924 		ui = MDI_UNIT(mnum);
1925 		if (md_ops[ui->ui_opsindex]->md_open != NULL) {
1926 			int ret = (*md_ops[ui->ui_opsindex]->md_open)(&ddi_dev,
1927 			    flag, OTYP_LYR, cred_p, md_oflags);
1928 			/*
1929 			 * As open() may change the device,
1930 			 * send this info back to the caller.
1931 			 */
1932 			*dev = md_expldev(ddi_dev);
1933 			return (ret);
1934 		}
1935 
1936 		/* or do it ourselves */
1937 		(void) md_unit_openclose_enter(ui);
1938 		err = md_unit_incopen(mnum, flag, OTYP_LYR);
1939 		md_unit_openclose_exit(ui);
1940 		/* convert our ddi_dev back to the dev we were given */
1941 		*dev = md_expldev(ddi_dev);
1942 		return (err);
1943 	}
1944 
1945 	/*
1946 	 * Open regular device, since open() may change dev_t give new dev_t
1947 	 * back to the caller.
1948 	 */
1949 	err = dev_lopen(&ddi_dev, flag, OTYP_LYR, cred_p);
1950 	*dev = md_expldev(ddi_dev);
1951 	return (err);
1952 }
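/*
 * Editor's sketch (not part of the original source): a submodule opens an
 * underlying component (metadevice or physical device) roughly as follows,
 * remembering that md_layered_open() may hand back a different device:
 *
 *	md_dev64_t	dev = un->un_dev;	// hypothetical component dev
 *	int		err;
 *
 *	err = md_layered_open(MD_SID(un), &dev, MD_OFLG_NULL);
 *	if (err == 0) {
 *		un->un_dev = dev;		// open may have changed it
 *		...
 *		md_layered_close(dev, MD_OFLG_NULL);
 *	}
 *
 * MD_SID() and MD_OFLG_NULL are used on the assumption that they carry their
 * usual SVM meanings (the unit's self id and "no open flags"); the field
 * names around them are hypothetical.
 */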
1953 
1954 /*
1955  * md_layered_close() is an internal routine only for SVM modules.
1956  * So the input device will be an md_dev64_t, because all SVM modules internally
1957  * work with that device type.
1958  * ddi routines on the other hand work with dev_t. So, if we call any ddi
1959  * routines from here we first have to convert that device into a dev_t.
1960  */
1961 void
1962 md_layered_close(
1963 	md_dev64_t	dev,
1964 	int		md_cflags
1965 )
1966 {
1967 	int		flag = (FREAD | FWRITE);
1968 	cred_t		*cred_p = kcred;
1969 	dev_t		ddi_dev = md_dev64_to_dev(dev);
1970 	major_t		major = getmajor(ddi_dev);
1971 	minor_t		mnum = getminor(ddi_dev);
1972 
1973 	/* metadevice */
1974 	if (major == md_major) {
1975 		mdi_unit_t	*ui = MDI_UNIT(mnum);
1976 
1977 		/* close underlying driver */
1978 		if (md_ops[ui->ui_opsindex]->md_close != NULL) {
1979 			(*md_ops[ui->ui_opsindex]->md_close)
1980 			    (ddi_dev, flag, OTYP_LYR, cred_p, md_cflags);
1981 			return;
1982 		}
1983 
1984 		/* or do it ourselves */
1985 		(void) md_unit_openclose_enter(ui);
1986 		(void) md_unit_decopen(mnum, OTYP_LYR);
1987 		md_unit_openclose_exit(ui);
1988 		return;
1989 	}
1990 
1991 	/* close regular device */
1992 	(void) dev_lclose(ddi_dev, flag, OTYP_LYR, cred_p);
1993 }
1994 
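
/*
 * Usage sketch (the component variables below are hypothetical; only
 * md_layered_open()/md_layered_close() come from this file): a submodule
 * layering on another device would typically pair the calls like this,
 * where 0 means no special open/close flags:
 *
 *	md_dev64_t tmpdev = compdev;
 *	err = md_layered_open(mnum, &tmpdev, 0);
 *	compdev = tmpdev;		open may have switched the device
 *	if (err == 0) {
 *		... perform I/O through the component ...
 *		md_layered_close(compdev, 0);
 *	}
 */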
1995 /*
1996  * saves a little code in mdstrategy
1997  */
1998 int
1999 errdone(mdi_unit_t *ui, struct buf *bp, int err)
2000 {
2001 	if ((bp->b_error = err) != 0)
2002 		bp->b_flags |= B_ERROR;
2003 	else
2004 		bp->b_resid = bp->b_bcount;
2005 	md_unit_readerexit(ui);
2006 	md_biodone(bp);
2007 	return (1);
2008 }
2009 
2010 static int	md_write_label = 0;
2011 
2012 int
2013 md_checkbuf(mdi_unit_t *ui, md_unit_t *un, buf_t *bp)
2014 {
2015 	diskaddr_t endblk;
2016 	set_t	setno = MD_UN2SET(un);
2017 
2018 	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
2019 	    (! (bp->b_flags & B_READ)))
2020 		return (errdone(ui, bp, EROFS));
2021 	/*
2022 	 * Check early for unreasonable block number.
2023 	 *
2024 	 * b_blkno is defined as a daddr_t, which is typedef'd to a long.
2025 	 * A problem occurs if b_blkno has bit 31 set and un_total_blocks
2026 	 * doesn't: b_blkno is then compared as a negative number, which is
2027 	 * always less than a positive one.
2028 	 */
2029 	if ((u_longlong_t)bp->b_lblkno > (u_longlong_t)un->c.un_total_blocks)
2030 		return (errdone(ui, bp, EINVAL));
2031 
2032 	if (bp->b_lblkno == un->c.un_total_blocks)
2033 		return (errdone(ui, bp, 0));
2034 
2035 	/*
2036 	 * make sure we don't clobber any labels
2037 	 */
2038 	if ((bp->b_lblkno == 0) && (! (bp->b_flags & B_READ)) &&
2039 	    (un->c.un_flag & MD_LABELED) && (! md_write_label)) {
2040 		cmn_err(CE_NOTE, "md: %s: write to label",
2041 		    md_shortname(getminor(bp->b_edev)));
2042 		return (errdone(ui, bp, EINVAL));
2043 	}
2044 
2045 	bp->b_resid = 0;
2046 	endblk = (diskaddr_t)(bp->b_lblkno +
2047 	    howmany(bp->b_bcount, DEV_BSIZE) - 1);
2048 
2049 	if (endblk > (un->c.un_total_blocks - 1)) {
2050 		bp->b_resid = dbtob(endblk - (un->c.un_total_blocks - 1));
2051 		endblk = un->c.un_total_blocks - 1;
2052 		bp->b_bcount -= bp->b_resid;
2053 	}
2054 	return (0);
2055 }
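
/*
 * Worked example of the end-of-device trimming above (values made up): with
 * un_total_blocks == 100, a write of 3 blocks (b_bcount == 3 * DEV_BSIZE)
 * starting at b_lblkno == 98 gives endblk == 100.  Since endblk exceeds 99,
 * b_resid is set to dbtob(1), endblk is clipped to 99 and b_bcount is reduced
 * so that only blocks 98 and 99 are transferred.
 */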
2056 
2057 /*
2058  * init_requestq: initializes the request queue and creates the threads.
2059  *	return value =  0 : invalid num_threads or request queue entry
2060  *		     =  n : n is the number of threads created.
2061  */
2062 
2063 int
2064 init_requestq(
2065 	md_requestq_entry_t *rq, /* request queue info */
2066 	void (*threadfn)(),	 /* function to start the thread */
2067 	caddr_t threadfn_args,	 /* args to the function */
2068 	int pri,		 /* thread priority */
2069 	int init_queue)		 /* flag to init queues */
2070 {
2071 	struct mdq_anchor *rqhead;
2072 	int	i;
2073 	int	num_threads;
2074 
2075 
2076 	num_threads = *(rq->num_threadsp);
2077 	rqhead = rq->dispq_headp;
2078 
2079 	if (NULL_REQUESTQ_ENTRY(rq) || num_threads == 0)
2080 		return (0);
2081 
2082 	if (init_queue) {
2083 		rqhead->dq.maxq_len = 0;
2084 		rqhead->dq.treqs = 0;
2085 		rqhead->dq.dq_next = &rqhead->dq;
2086 		rqhead->dq.dq_prev = &rqhead->dq;
2087 		cv_init(&rqhead->a_cv, NULL, CV_DEFAULT, NULL);
2088 		mutex_init(&rqhead->a_mx, NULL, MUTEX_DEFAULT, NULL);
2089 	}
2090 	for (i = 0; i < num_threads; i++) {
2091 		(void) thread_create(NULL, 0, threadfn, threadfn_args, 0, &p0,
2092 		    TS_RUN, pri);
2093 	}
2094 	return (i);
2095 }
2096 
2097 static void
2098 start_daemon(struct mdq_anchor *q)
2099 {
2100 	md_daemon(0, q);
2101 	ASSERT(0);
2102 }
2103 
2104 /*
2105  * Creates all the md daemons.
2106  * Global:
2107  *	md_num_daemons is set to number of daemons.
2108  *	MD_GBL_DAEMONS_LIVE flag set to indicate the daemons are active.
2109  *
2110  * Return value: 0  success
2111  *		 1  failure
2112  */
2113 int
2114 md_start_daemons(int init_queue)
2115 {
2116 	md_requestq_entry_t	*rqp;
2117 	int	cnt;
2118 	int	i;
2119 	int	retval = 0;
2120 
2121 
2122 	if (md_get_status() & MD_GBL_DAEMONS_LIVE) {
2123 		return (retval);
2124 	}
2125 	md_clr_status(MD_GBL_DAEMONS_DIE);
2126 
2127 	rqp = &md_daemon_queues[0];
2128 	i = 0;
2129 	while (!NULL_REQUESTQ_ENTRY(rqp)) {
2130 		cnt = init_requestq(rqp, start_daemon,
2131 		    (caddr_t)rqp->dispq_headp, minclsyspri, init_queue);
2132 
2133 		if (cnt && cnt != *rqp->num_threadsp) {
2134 			retval = 1;
2135 			break;
2136 		}
2137 		/*
2138 		 * initialize variables
2139 		 */
2140 		md_num_daemons += cnt;
2141 		rqp = &md_daemon_queues[++i];
2142 	}
2143 
2144 	md_set_status(MD_GBL_DAEMONS_LIVE);
2145 	return (retval);
2146 }
2147 
2148 int
2149 md_loadsubmod(set_t setno, char *name, int drvrid)
2150 {
2151 	ddi_modhandle_t	mod;
2152 	md_ops_t	**pops, *ops;
2153 	int		i, err;
2154 
2155 	/*
2156 	 * See if the submodule is already ddi_modopened. If not, i is the
2157 	 * index of the next empty slot.
2158 	 */
2159 	for (i = 0; md_ops[i] != NULL; i++) {
2160 		if (strncmp(name, md_ops[i]->md_driver.md_drivername,
2161 		    MD_DRIVERNAMELEN) == 0)
2162 			return (i);
2163 
2164 		if (i == (MD_NOPS - 1))
2165 			return (-1);
2166 	}
2167 
2168 	if (drvrid < 0) {
2169 		/* Do not try to add any records to the DB when stale. */
2170 		if (md_get_setstatus(setno) & MD_SET_STALE)
2171 			return (-1);
2172 		drvrid = md_setshared_name(setno, name, 0L);
2173 	}
2174 
2175 	if (drvrid < 0)
2176 		return (-1);
2177 
2178 	/* open and import the md_ops of the submodules */
2179 	mod = ddi_modopen(name, KRTLD_MODE_FIRST, &err);
2180 	if (mod == NULL) {
2181 		cmn_err(CE_WARN, "md_loadsubmod: "
2182 		    "unable to ddi_modopen %s, error %d\n", name, err);
2183 		return (-1);
2184 	}
2185 	pops = ddi_modsym(mod, "md_interface_ops", &err);
2186 	if (pops == NULL) {
2187 		cmn_err(CE_WARN, "md_loadsubmod: "
2188 		    "unable to import md_interface_ops from %s, error %d\n",
2189 		    name, err);
2190 		(void) ddi_modclose(mod);
2191 		return (-1);
2192 	}
2193 
2194 	/* ddi_modsym returns pointer to md_interface_ops in submod */
2195 	ops = *pops;
2196 
2197 	/* initialize */
2198 	ops->md_selfindex = i;
2199 	rw_init(&ops->md_link_rw.lock, NULL, RW_DEFAULT, NULL);
2200 	(void) strncpy(ops->md_driver.md_drivername, name,
2201 	    MD_DRIVERNAMELEN);
2202 
2203 	/* plumb */
2204 	md_ops[i] = ops;
2205 	md_mods[i] = mod;
2206 	ops->md_next = md_opslist;
2207 	md_opslist = ops;
2208 
2209 	/* return index */
2210 	return (i);
2211 }
2212 
2213 int
2214 md_getmodindex(md_driver_t *driver, int dont_load, int db_notrequired)
2215 {
2216 	int	i;
2217 	int	modindex;
2218 	char	*name = driver->md_drivername;
2219 	set_t	setno = driver->md_setno;
2220 	int	drvid;
2221 	int	local_dont_load;
2222 
2223 	if (setno >= md_nsets)
2224 		return (-1);
2225 
2226 	for (i = 0; name[i] != 0; i++)
2227 		if (i == (MD_DRIVERNAMELEN - 1))
2228 			return (-1);
2229 
2230 	/*
2231 	 * If set is STALE, set local_dont_load to 1 since no records
2232 	 * should be added to DB when stale.
2233 	 */
2234 	if (md_get_setstatus(setno) & MD_SET_STALE) {
2235 		local_dont_load = 1;
2236 	} else {
2237 		local_dont_load = dont_load;
2238 	}
2239 
2240 	/*
2241 	 * Single thread ioctl module binding with respect to
2242 	 * similar code executed in md_loadsubmod that is called
2243 	 * from md_snarf_db_set (which is where that path does
2244 	 * its md_haltsnarf_enter call).
2245 	 */
2246 	md_haltsnarf_enter(setno);
2247 
2248 	/* See if the submodule is already ddi_modopened. */
2249 	for (i = 0; md_ops[i] != NULL; i++) {
2250 		if (strncmp(name, md_ops[i]->md_driver.md_drivername,
2251 		    MD_DRIVERNAMELEN) == 0) {
2252 			if (! local_dont_load &&
2253 			    (md_getshared_key(setno, name) == MD_KEYBAD)) {
2254 				if (md_setshared_name(setno, name, 0L)
2255 				    == MD_KEYBAD) {
2256 					if (!db_notrequired)
2257 						goto err;
2258 				}
2259 			}
2260 			md_haltsnarf_exit(setno);
2261 			return (i);
2262 		}
2263 
2264 		if (i == (MD_NOPS - 1))
2265 			break;
2266 	}
2267 
2268 	if (local_dont_load)
2269 		goto err;
2270 
2271 	drvid = ((db_notrequired) ? 0 : (int)md_getshared_key(setno, name));
2272 
2273 	/* ddi_modopen the submodule */
2274 	modindex = md_loadsubmod(setno, name, drvid);
2275 	if (modindex < 0)
2276 		goto err;
2277 
2278 	if (md_ops[modindex]->md_snarf != NULL)
2279 		(*(md_ops[modindex]->md_snarf))(MD_SNARF_DOIT, setno);
2280 
2281 	md_haltsnarf_exit(setno);
2282 	return (modindex);
2283 
2284 err:	md_haltsnarf_exit(setno);
2285 	return (-1);
2286 }
2287 
2288 void
2289 md_call_strategy(buf_t *bp, int flags, void *private)
2290 {
2291 	mdi_unit_t	*ui;
2292 
2293 	if (mdv_strategy_tstpnt)
2294 		if ((*mdv_strategy_tstpnt)(bp, flags, private) != 0)
2295 			return;
2296 	if (getmajor(bp->b_edev) != md_major) {
2297 		(void) bdev_strategy(bp);
2298 		return;
2299 	}
2300 
2301 	flags = (flags & MD_STR_PASSEDON) | MD_STR_NOTTOP;
2302 	ui = MDI_UNIT(getminor(bp->b_edev));
2303 	ASSERT(ui != NULL);
2304 	(*md_ops[ui->ui_opsindex]->md_strategy)(bp, flags, private);
2305 }
2306 
2307 /*
2308  * md_call_ioctl:
2309  * -------------
2310  * Issue the specified ioctl to the device associated with the given md_dev64_t
2311  *
2312  * Arguments:
2313  *	dev	- underlying device [md_dev64_t]
2314  *	cmd	- ioctl to perform
2315  *	data	- arguments / result location
2316  *	mode	- read/write/layered ioctl
2317  *	lockp	- lock reference
2318  *
2319  * Returns:
2320  *	0	success
2321  *	!=0	Failure (error code)
2322  */
2323 int
2324 md_call_ioctl(md_dev64_t dev, int cmd, void *data, int mode, IOLOCK *lockp)
2325 {
2326 	dev_t		device = md_dev64_to_dev(dev);
2327 	int		rval;
2328 	mdi_unit_t	*ui;
2329 
2330 	/*
2331 	 * See if the device is a metadevice. If not, call cdev_ioctl(); otherwise
2332 	 * call the ioctl entry-point in the metadevice.
2333 	 */
2334 	if (md_getmajor(dev) != md_major) {
2335 		int	rv;
2336 		rval = cdev_ioctl(device, cmd, (intptr_t)data, mode,
2337 		    ddi_get_cred(), &rv);
2338 	} else {
2339 		ui = MDI_UNIT(md_getminor(dev));
2340 		ASSERT(ui != NULL);
2341 		rval = (*md_ops[ui->ui_opsindex]->md_ioctl)(device, cmd, data,
2342 		    mode, lockp);
2343 	}
2344 	return (rval);
2345 }
2346 
2347 void
2348 md_rem_link(set_t setno, int id, krwlock_t *rw, md_link_t **head)
2349 {
2350 	md_link_t	*next;
2351 	md_link_t	**pprev;
2352 
2353 	rw_enter(rw, RW_WRITER);
2354 
2355 	next = *head;
2356 	pprev = head;
2357 	while (next) {
2358 		if ((next->ln_setno == setno) && (next->ln_id == id)) {
2359 			*pprev = next->ln_next;
2360 			rw_exit(rw);
2361 			return;
2362 		}
2363 		pprev = &next->ln_next;
2364 		next = next->ln_next;
2365 	}
2366 
2367 	rw_exit(rw);
2368 }
2369 
2370 int
2371 md_dev_exists(md_dev64_t dev)
2372 {
2373 
2374 	if (dev == NODEV64)
2375 		return (0);
2376 
2377 	if (strcmp(ddi_major_to_name(md_getmajor(dev)), "md") != 0)
2378 		return (1);
2379 
2380 	if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) ||
2381 	    (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits))
2382 		return (0);
2383 
2384 	if (MDI_UNIT(md_getminor(dev)) != NULL)
2385 		return (1);
2386 
2387 	return (0);
2388 }
2389 
2390 md_parent_t
2391 md_get_parent(md_dev64_t dev)
2392 {
2393 	md_unit_t	*un;
2394 	mdi_unit_t	*ui;
2395 	md_parent_t	parent;
2396 
2397 	if (md_getmajor(dev) != md_major)
2398 		return (MD_NO_PARENT);
2399 
2400 	ui = MDI_UNIT(md_getminor(dev));
2401 
2402 	un = (md_unit_t *)md_unit_readerlock(ui);
2403 	parent = un->c.un_parent;
2404 	md_unit_readerexit(ui);
2405 
2406 	return (parent);
2407 }
2408 
2409 void
2410 md_set_parent(md_dev64_t dev, md_parent_t parent)
2411 {
2412 	md_unit_t	*un;
2413 	mdi_unit_t	*ui;
2414 
2415 	if (md_getmajor(dev) != md_major)
2416 		return;
2417 
2418 	ui = MDI_UNIT(md_getminor(dev));
2419 
2420 	un = (md_unit_t *)md_unit_readerlock(ui);
2421 	un->c.un_parent = parent;
2422 	md_unit_readerexit(ui);
2423 }
2424 
2425 void
2426 md_reset_parent(md_dev64_t dev)
2427 {
2428 	md_unit_t	*un;
2429 	mdi_unit_t	*ui;
2430 
2431 	if (md_getmajor(dev) != md_major)
2432 		return;
2433 
2434 	ui = MDI_UNIT(md_getminor(dev));
2435 
2436 	un = (md_unit_t *)md_unit_readerlock(ui);
2437 	un->c.un_parent = MD_NO_PARENT;
2438 	md_unit_readerexit(ui);
2439 }
2440 
2441 
2442 static intptr_t (*hot_spare_interface)() = (intptr_t (*)())NULL;
2443 
2444 int
2445 md_hot_spare_ifc(
2446 	hs_cmds_t	cmd,
2447 	mddb_recid_t	id,
2448 	u_longlong_t	size,
2449 	int		labeled,
2450 	mddb_recid_t	*hs_id,
2451 	mdkey_t		*key,
2452 	md_dev64_t	*dev,
2453 	diskaddr_t	*sblock)
2454 {
2455 	int		err;
2456 
2457 	/*
2458 	 * RW lock on hot_spare_interface. We don't want it to change from
2459 	 * underneath us. If hot_spare_interface is NULL we're going to
2460 	 * need to set it. So we need to upgrade to a WRITER lock. If that
2461 	 * doesn't work, we drop the lock and reenter as WRITER. This leaves
2462 	 * a small hole during which hot_spare_interface could be modified
2463 	 * so we check it for NULL again. What a pain. Then, if it is still
2464 	 * NULL, load it from md_get_named_service.
2465 	 */
2466 
2467 	rw_enter(&hsp_rwlp.lock, RW_READER);
2468 	if (hot_spare_interface == NULL) {
2469 		if (rw_tryupgrade(&hsp_rwlp.lock) == 0) {
2470 			rw_exit(&hsp_rwlp.lock);
2471 			rw_enter(&hsp_rwlp.lock, RW_WRITER);
2472 			if (hot_spare_interface != NULL) {
2473 				err = ((*hot_spare_interface)
2474 				    (cmd, id, size, labeled, hs_id, key, dev,
2475 				    sblock));
2476 				rw_exit(&hsp_rwlp.lock);
2477 				return (err);
2478 			}
2479 		}
2480 		hot_spare_interface = md_get_named_service(NODEV64, ANY_SERVICE,
2481 		    "hot spare interface", 0);
2482 		rw_downgrade(&hsp_rwlp.lock);
2483 	}
2484 
2485 	if (hot_spare_interface == NULL) {
2486 		cmn_err(CE_WARN, "md: no hotspare interface");
2487 		rw_exit(&hsp_rwlp.lock);
2488 		return (0);
2489 	}
2490 
2491 	err = ((*hot_spare_interface)
2492 	    (cmd, id, size, labeled, hs_id, key, dev, sblock));
2493 	rw_exit(&hsp_rwlp.lock);
2494 	return (err);
2495 }
2496 
2497 void
2498 md_clear_hot_spare_interface()
2499 {
2500 	rw_enter(&hsp_rwlp.lock, RW_WRITER);
2501 	hot_spare_interface = NULL;
2502 	rw_exit(&hsp_rwlp.lock);
2503 }
2504 
2505 
2506 static intptr_t (*notify_interface)() = (intptr_t (*)())NULL;
2507 
2508 int
2509 md_notify_interface(
2510 	md_event_cmds_t cmd,
2511 	md_tags_t	tag,
2512 	set_t		set,
2513 	md_dev64_t	dev,
2514 	md_event_type_t event
2515 )
2516 {
2517 	int		err;
2518 
2519 	if (md_event_queue == NULL)
2520 		return (0);
2521 	rw_enter(&ni_rwlp.lock, RW_READER);
2522 	if (notify_interface == NULL) {
2523 		if (rw_tryupgrade(&ni_rwlp.lock) == 0) {
2524 			rw_exit(&ni_rwlp.lock);
2525 			rw_enter(&ni_rwlp.lock, RW_WRITER);
2526 			if (notify_interface != NULL) {
2527 				err = ((*notify_interface)
2528 				    (cmd, tag, set, dev, event));
2529 				rw_exit(&ni_rwlp.lock);
2530 				return (err);
2531 			}
2532 		}
2533 		notify_interface = md_get_named_service(NODEV64, ANY_SERVICE,
2534 		    "notify interface", 0);
2535 		rw_downgrade(&ni_rwlp.lock);
2536 	}
2537 	if (notify_interface == NULL) {
2538 		cmn_err(CE_WARN, "md: no notify interface");
2539 		rw_exit(&ni_rwlp.lock);
2540 		return (0);
2541 	}
2542 	err = ((*notify_interface)(cmd, tag, set, dev, event));
2543 	rw_exit(&ni_rwlp.lock);
2544 	return (err);
2545 }
2546 
2547 char *
2548 obj2devname(uint32_t tag, uint_t setno, md_dev64_t dev)
2549 {
2550 	char		*setname;
2551 	char		name[MD_MAX_CTDLEN];
2552 	minor_t		mnum = md_getminor(dev);
2553 	major_t		maj = md_getmajor(dev);
2554 	int		rtn = 0;
2555 
2556 	/*
2557 	 * Verify that the passed md_dev64_t refers to a valid metadevice.
2558 	 * If it doesn't we can make no assumptions as to what the device
2559 	 * name is. Return NULL in these cases.
2560 	 */
2561 	if (((maj != md_major) || (MD_MIN2UNIT(mnum) >= md_nunits)) ||
2562 	    (MD_MIN2SET(mnum) >= md_nsets)) {
2563 		return (NULL);
2564 	}
2565 
2566 	setname = NULL;
2567 	name[0] = '\0';
2568 	switch (tag) {
2569 	case SVM_TAG_HSP:
2570 		if (setno == 0) {
2571 			rtn = snprintf(name, sizeof (name), "hsp%u",
2572 			    (unsigned)MD_MIN2UNIT(mnum));
2573 		} else {
2574 			setname = mddb_getsetname(setno);
2575 			if (setname != NULL) {
2576 				rtn = snprintf(name, sizeof (name), "%s/hsp%u",
2577 				    setname, (unsigned)MD_MIN2UNIT(mnum));
2578 			}
2579 		}
2580 		break;
2581 	case SVM_TAG_DRIVE:
2582 		(void) sprintf(name, "drive");
2583 		break;
2584 	case SVM_TAG_HOST:
2585 		(void) sprintf(name, "host");
2586 		break;
2587 	case SVM_TAG_SET:
2588 		rtn = snprintf(name, sizeof (name), "%s",
2589 		    mddb_getsetname(setno));
2590 		if ((name[0] == '\0') || (rtn >= sizeof (name))) {
2591 			(void) sprintf(name, "diskset");
2592 			rtn = 0;
2593 		}
2594 		break;
2595 	default:
2596 		rtn = snprintf(name, sizeof (name), "%s", md_shortname(mnum));
2597 		break;
2598 	}
2599 
2600 	/* Check if we got any rubbish for any of the snprintf's */
2601 	if ((name[0] == '\0') || (rtn >= sizeof (name))) {
2602 		return (NULL);
2603 	}
2604 
2605 	return (md_strdup(name));
2606 }
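
/*
 * Examples of the naming above (unit numbers are made up): SVM_TAG_HSP with
 * setno 0 and unit 2 yields "hsp2", SVM_TAG_HSP in a named set yields
 * "<setname>/hsp2", and the default case yields whatever md_shortname()
 * returns, e.g. "d10" or "<setname>/d10".  Callers free the returned string
 * with freestr().
 */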
2607 
2608 /* Sysevent subclass and mdnotify event type pairs */
2609 struct node {
2610 	char		*se_ev;
2611 	md_event_type_t	md_ev;
2612 };
2613 
2614 /*
2615  * Table must be sorted in case-sensitive ascending order of
2616  * the sysevent values.
2617  */
2618 static struct node ev_table[] = {
2619 	{ ESC_SVM_ADD,			EQ_ADD },
2620 	{ ESC_SVM_ATTACH,		EQ_ATTACH },
2621 	{ ESC_SVM_ATTACHING,		EQ_ATTACHING },
2622 	{ ESC_SVM_CHANGE,		EQ_CHANGE },
2623 	{ ESC_SVM_CREATE,		EQ_CREATE },
2624 	{ ESC_SVM_DELETE,		EQ_DELETE },
2625 	{ ESC_SVM_DETACH,		EQ_DETACH },
2626 	{ ESC_SVM_DETACHING,		EQ_DETACHING },
2627 	{ ESC_SVM_DRIVE_ADD,		EQ_DRIVE_ADD },
2628 	{ ESC_SVM_DRIVE_DELETE,		EQ_DRIVE_DELETE },
2629 	{ ESC_SVM_ENABLE,		EQ_ENABLE },
2630 	{ ESC_SVM_ERRED,		EQ_ERRED },
2631 	{ ESC_SVM_EXCHANGE,		EQ_EXCHANGE },
2632 	{ ESC_SVM_GROW,			EQ_GROW },
2633 	{ ESC_SVM_HS_CHANGED,		EQ_HS_CHANGED },
2634 	{ ESC_SVM_HS_FREED,		EQ_HS_FREED },
2635 	{ ESC_SVM_HOST_ADD,		EQ_HOST_ADD },
2636 	{ ESC_SVM_HOST_DELETE,		EQ_HOST_DELETE },
2637 	{ ESC_SVM_HOTSPARED,		EQ_HOTSPARED },
2638 	{ ESC_SVM_INIT_FAILED,		EQ_INIT_FAILED },
2639 	{ ESC_SVM_INIT_FATAL,		EQ_INIT_FATAL },
2640 	{ ESC_SVM_INIT_START,		EQ_INIT_START },
2641 	{ ESC_SVM_INIT_SUCCESS,		EQ_INIT_SUCCESS },
2642 	{ ESC_SVM_IOERR,		EQ_IOERR },
2643 	{ ESC_SVM_LASTERRED,		EQ_LASTERRED },
2644 	{ ESC_SVM_MEDIATOR_ADD,		EQ_MEDIATOR_ADD },
2645 	{ ESC_SVM_MEDIATOR_DELETE,	EQ_MEDIATOR_DELETE },
2646 	{ ESC_SVM_OFFLINE,		EQ_OFFLINE },
2647 	{ ESC_SVM_OK,			EQ_OK },
2648 	{ ESC_SVM_ONLINE,		EQ_ONLINE },
2649 	{ ESC_SVM_OPEN_FAIL,		EQ_OPEN_FAIL },
2650 	{ ESC_SVM_REGEN_DONE,		EQ_REGEN_DONE },
2651 	{ ESC_SVM_REGEN_FAILED,		EQ_REGEN_FAILED },
2652 	{ ESC_SVM_REGEN_START,		EQ_REGEN_START },
2653 	{ ESC_SVM_RELEASE,		EQ_RELEASE },
2654 	{ ESC_SVM_REMOVE,		EQ_REMOVE },
2655 	{ ESC_SVM_RENAME_DST,		EQ_RENAME_DST },
2656 	{ ESC_SVM_RENAME_SRC,		EQ_RENAME_SRC },
2657 	{ ESC_SVM_REPLACE,		EQ_REPLACE },
2658 	{ ESC_SVM_RESYNC_DONE,		EQ_RESYNC_DONE },
2659 	{ ESC_SVM_RESYNC_FAILED,	EQ_RESYNC_FAILED },
2660 	{ ESC_SVM_RESYNC_START,		EQ_RESYNC_START },
2661 	{ ESC_SVM_RESYNC_SUCCESS,	EQ_RESYNC_SUCCESS },
2662 	{ ESC_SVM_TAKEOVER,		EQ_TAKEOVER }
2663 };
2664 
2665 static md_tags_t md_tags[] = {
2666 	TAG_UNK,
2667 	TAG_METADEVICE,
2668 	TAG_UNK,
2669 	TAG_UNK,
2670 	TAG_UNK,
2671 	TAG_UNK,
2672 	TAG_REPLICA,
2673 	TAG_HSP,
2674 	TAG_HS,
2675 	TAG_SET,
2676 	TAG_DRIVE,
2677 	TAG_HOST,
2678 	TAG_MEDIATOR
2679 };
2680 
2681 md_event_type_t
2682 ev_get(char *subclass)
2683 {
2684 	int	high, mid, low, p;
2685 
2686 	low = 0;
2687 	high = (sizeof (ev_table) / sizeof (ev_table[0])) - 1;
2688 	while (low <= high) {
2689 		mid = (high + low) / 2;
2690 		p = strcmp(subclass, ev_table[mid].se_ev);
2691 		if (p == 0) {
2692 			return (ev_table[mid].md_ev);
2693 		} else if (p < 0) {
2694 			high = mid - 1;
2695 		} else {
2696 			low = mid + 1;
2697 		}
2698 	}
2699 
2700 	return (EQ_EMPTY);
2701 }
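
/*
 * For example, ev_get(ESC_SVM_IOERR) binary-searches ev_table and returns
 * EQ_IOERR, while a subclass string that is not in the table returns
 * EQ_EMPTY.
 */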
2702 
2703 /*
2704  * Log mdnotify event
2705  */
2706 void
2707 do_mdnotify(char *se_subclass, uint32_t tag, set_t setno, md_dev64_t devid)
2708 {
2709 	md_event_type_t	ev_type;
2710 	md_tags_t	md_tag;
2711 
2712 	/* Translate sysevent into mdnotify event */
2713 	ev_type = ev_get(se_subclass);
2714 
2715 	if (tag >= (sizeof (md_tags) / sizeof (md_tags[0]))) {
2716 		md_tag = TAG_UNK;
2717 	} else {
2718 		md_tag = md_tags[tag];
2719 	}
2720 
2721 	NOTIFY_MD(md_tag, setno, devid, ev_type);
2722 }
2723 
2724 /*
2725  * Log SVM sys events
2726  */
2727 void
2728 svm_gen_sysevent(
2729 	char		*se_class,
2730 	char		*se_subclass,
2731 	uint32_t	tag,
2732 	set_t		setno,
2733 	md_dev64_t	devid
2734 )
2735 {
2736 	nvlist_t		*attr_list;
2737 	sysevent_id_t		eid;
2738 	int			err = DDI_SUCCESS;
2739 	char			*devname;
2740 	extern dev_info_t	*md_devinfo;
2741 
2742 	/* Raise the mdnotify event before anything else */
2743 	do_mdnotify(se_subclass, tag, setno, devid);
2744 
2745 	if (md_devinfo == NULL) {
2746 		return;
2747 	}
2748 
2749 	err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_NOSLEEP);
2750 
2751 	if (err == DDI_SUCCESS) {
2752 		/* Add the version number */
2753 		err = nvlist_add_uint32(attr_list, SVM_VERSION_NO,
2754 		    (uint32_t)SVM_VERSION);
2755 		if (err != DDI_SUCCESS) {
2756 			goto fail;
2757 		}
2758 
2759 		/* Add the tag attribute */
2760 		err = nvlist_add_uint32(attr_list, SVM_TAG, (uint32_t)tag);
2761 		if (err != DDI_SUCCESS) {
2762 			goto fail;
2763 		}
2764 
2765 		/* Add the set number attribute */
2766 		err = nvlist_add_uint32(attr_list, SVM_SET_NO, (uint32_t)setno);
2767 		if (err != DDI_SUCCESS) {
2768 			goto fail;
2769 		}
2770 
2771 		/* Add the device id attribute */
2772 		err = nvlist_add_uint64(attr_list, SVM_DEV_ID, (uint64_t)devid);
2773 		if (err != DDI_SUCCESS) {
2774 			goto fail;
2775 		}
2776 
2777 		/* Add the device name attribute */
2778 		devname = obj2devname(tag, setno, devid);
2779 		if (devname != NULL) {
2780 			err = nvlist_add_string(attr_list, SVM_DEV_NAME,
2781 			    devname);
2782 			freestr(devname);
2783 		} else {
2784 			err = nvlist_add_string(attr_list, SVM_DEV_NAME,
2785 			    "unspecified");
2786 		}
2787 		if (err != DDI_SUCCESS) {
2788 			goto fail;
2789 		}
2790 
2791 		/* Attempt to post event */
2792 		err = ddi_log_sysevent(md_devinfo, DDI_VENDOR_SUNW, se_class,
2793 		    se_subclass, attr_list, &eid, DDI_SLEEP);
2794 
2795 		nvlist_free(attr_list);
2796 		if (err != DDI_SUCCESS) {
2797 			cmn_err(CE_WARN, "Failed to log event for %s, %s,"
2798 			    " err=%x", se_class, se_subclass, err);
2799 		}
2800 	}
2801 
2802 	return;
2803 
2804 fail:
2805 	nvlist_free(attr_list);
2806 	cmn_err(CE_WARN, "Failed to setup attributes for event %s, %s, err=%x",
2807 	    se_class, se_subclass, err);
2808 }
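
/*
 * Illustrative call (the class/subclass/tag pairing and the device are only
 * examples): a submodule reporting an I/O error on a metadevice in the local
 * set might issue
 *
 *	svm_gen_sysevent(EC_SVM_STATE, ESC_SVM_IOERR, SVM_TAG_METADEVICE,
 *	    MD_LOCAL_SET, md_expldev(bp->b_edev));
 *
 * which raises the matching mdnotify event and then posts the sysevent.
 */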
2809 
2810 void
2811 md_clear_named_service()
2812 {
2813 	rw_enter(&ni_rwlp.lock, RW_WRITER);
2814 	notify_interface = NULL;
2815 	rw_exit(&ni_rwlp.lock);
2816 }
2817 
2818 void
2819 md_create_unit_incore(minor_t mnum, md_ops_t *ops, int alloc_lock)
2820 {
2821 	mdi_unit_t	*ui;
2822 	set_t		setno = MD_MIN2SET(mnum);
2823 
2824 	ui = (mdi_unit_t *)kmem_zalloc(sizeof (mdi_unit_t), KM_SLEEP);
2825 	ui->ui_opsindex = ops->md_selfindex;
2826 
2827 	/* initialize all the incore conditional variables */
2828 	mutex_init(&ui->ui_mx, NULL, MUTEX_DEFAULT, NULL);
2829 	cv_init(&ui->ui_cv, NULL, CV_DEFAULT, NULL);
2830 
2831 	if (alloc_lock) {
2832 		ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP);
2833 		mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL);
2834 		cv_init(&ui->ui_io_lock->io_cv, NULL, CV_DEFAULT, NULL);
2835 		mutex_init(&ui->ui_io_lock->io_list_mutex, NULL,
2836 		    MUTEX_DEFAULT, NULL);
2837 		ui->ui_io_lock->io_list_front = NULL;
2838 		ui->ui_io_lock->io_list_back = NULL;
2839 	}
2840 	if (! (md_get_setstatus(setno) & MD_SET_SNARFING)) {
2841 		rw_enter(&md_unit_array_rw.lock, RW_WRITER);
2842 		MDI_VOIDUNIT(mnum) = (void *) ui;
2843 		rw_exit(&md_unit_array_rw.lock);
2844 	} else
2845 		MDI_VOIDUNIT(mnum) = (void *) ui;
2846 
2847 	rw_enter(&ops->md_link_rw.lock, RW_WRITER);
2848 	ui->ui_link.ln_next = ops->md_head;
2849 	ui->ui_link.ln_setno = setno;
2850 	ui->ui_link.ln_id = mnum;
2851 	ops->md_head = &ui->ui_link;
2852 	/* setup the unavailable field */
2853 #if defined(_ILP32)
2854 	if (((md_unit_t *)MD_UNIT(mnum))->c.un_revision & MD_64BIT_META_DEV) {
2855 		ui->ui_tstate |= MD_64MD_ON_32KERNEL;
2856 		cmn_err(CE_NOTE, "d%d is unavailable because 64 bit "
2857 		    "metadevices are not accessible on a 32 bit kernel",
2858 		    mnum);
2859 	}
2860 #endif
2861 
2862 	rw_exit(&ops->md_link_rw.lock);
2863 }
2864 
2865 void
2866 md_destroy_unit_incore(minor_t mnum, md_ops_t *ops)
2867 {
2868 	mdi_unit_t	*ui;
2869 
2870 	/*
2871 	 * ASSUMPTION: md_unit_array_rw WRITER lock is held.
2872 	 */
2873 	ui = MDI_UNIT(mnum);
2874 	if (ui == NULL)
2875 		return;
2876 
2877 	md_rem_link(MD_MIN2SET(mnum), mnum, &ops->md_link_rw.lock,
2878 	    &ops->md_head);
2879 
2880 	/* destroy the io lock if one is being used */
2881 	if (ui->ui_io_lock) {
2882 		mutex_destroy(&ui->ui_io_lock->io_mx);
2883 		cv_destroy(&ui->ui_io_lock->io_cv);
2884 		kmem_free(ui->ui_io_lock, sizeof (md_io_lock_t));
2885 	}
2886 
2887 	/* teardown kstat */
2888 	md_kstat_destroy(mnum);
2889 
2890 	/* destroy all the incore conditional variables */
2891 	mutex_destroy(&ui->ui_mx);
2892 	cv_destroy(&ui->ui_cv);
2893 
2894 	kmem_free(ui, sizeof (mdi_unit_t));
2895 	MDI_VOIDUNIT(mnum) = (void *) NULL;
2896 }
2897 
2898 void
2899 md_rem_names(sv_dev_t *sv, int nsv)
2900 {
2901 	int	i, s;
2902 	int	max_sides;
2903 
2904 	if (nsv == 0)
2905 		return;
2906 
2907 	/* All entries removed are in the same diskset */
2908 	if (md_get_setstatus(sv[0].setno) & MD_SET_MNSET)
2909 		max_sides = MD_MNMAXSIDES;
2910 	else
2911 		max_sides = MD_MAXSIDES;
2912 
2913 	for (i = 0; i < nsv; i++)
2914 		for (s = 0; s < max_sides; s++)
2915 			(void) md_remdevname(sv[i].setno, s, sv[i].key);
2916 }
2917 
2918 /*
2919  * Checks user args before we get into physio - returns 0 for ok, else errno.
2920  * We do a lot of checking against illegal arguments here because some of the
2921  * real disk drivers don't like certain kinds of arguments (e.g. xy doesn't
2922  * like an odd-address user buffer). Those drivers capture bad arguments in
2923  * xxread and xxwrite. But since the meta-driver calls their strategy routines
2924  * directly, two bad scenarios might happen:
2925  *	1. the real strategy doesn't like it and panics.
2926  *	2. the real strategy doesn't like it and sets B_ERROR.
2927  *
2928  * The second case is no better than the first one, since the meta-driver
2929  * will treat it as a media error and offline the mirror metapartition.
2930  * (Too bad there is no way to tell what error it is.)
2931  *
2932  */
2933 int
2934 md_chk_uio(struct uio *uio)
2935 {
2936 	int	i;
2937 	struct iovec *iov;
2938 
2939 	/*
2940 	 * Check for negative or not block-aligned offset
2941 	 */
2942 	if ((uio->uio_loffset < 0) ||
2943 	    ((uio->uio_loffset & (DEV_BSIZE - 1)) != 0)) {
2944 		return (EINVAL);
2945 	}
2946 	iov = uio->uio_iov;
2947 	i = uio->uio_iovcnt;
2948 
2949 	while (i--) {
2950 		if ((iov->iov_len & (DEV_BSIZE - 1)) != 0)
2951 			return (EINVAL);
2952 		/*
2953 		 * Bug # 1212146
2954 		 * The default is to not check alignment, but we can now check
2955 		 * for a larger number of alignments if desired.
2956 		 */
2957 		if ((uintptr_t)(iov->iov_base) & md_uio_alignment_mask)
2958 			return (EINVAL);
2959 		iov++;
2960 	}
2961 	return (0);
2962 }
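
/*
 * For example, a uio whose uio_loffset is 100 (not a multiple of DEV_BSIZE)
 * or whose iovec lengths are not DEV_BSIZE multiples is rejected with EINVAL
 * before it ever reaches the underlying strategy routine.
 */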
2963 
2964 char *
2965 md_shortname(
2966 	minor_t		mnum
2967 )
2968 {
2969 	static char	buf[MAXPATHLEN];
2970 	char		*devname;
2971 	char		*invalid = " (Invalid minor number %u) ";
2972 	char		*metaname;
2973 	mdc_unit_t	*un;
2974 	side_t		side;
2975 	set_t		setno = MD_MIN2SET(mnum);
2976 	unit_t		unit = MD_MIN2UNIT(mnum);
2977 
2978 	if ((un = MD_UNIT(mnum)) == NULL) {
2979 		(void) snprintf(buf, sizeof (buf), invalid, mnum);
2980 		return (buf);
2981 	}
2982 
2983 	/*
2984 	 * If unit is not a friendly name unit, derive the name from the
2985 	 * minor number.
2986 	 */
2987 	if ((un->un_revision & MD_FN_META_DEV) == 0) {
2988 		/* This is a traditional metadevice */
2989 		if (setno == MD_LOCAL_SET) {
2990 			(void) snprintf(buf, sizeof (buf), "d%u",
2991 			    (unsigned)unit);
2992 		} else {
2993 			(void) snprintf(buf, sizeof (buf), "%s/d%u",
2994 			    mddb_getsetname(setno), (unsigned)unit);
2995 		}
2996 		return (buf);
2997 	}
2998 
2999 	/*
3000 	 * It is a friendly name metadevice, so we need to get its name.
3001 	 */
3002 	side = mddb_getsidenum(setno);
3003 	devname = (char *)kmem_alloc(MAXPATHLEN, KM_SLEEP);
3004 	if (md_getdevname(setno, side, MD_KEYWILD,
3005 	    md_makedevice(md_major, mnum), devname, MAXPATHLEN) == 0) {
3006 		/*
3007 		 * md_getdevname has given us either /dev/md/dsk/<metaname>
3008 		 * or /dev/md/<setname>/dsk/<metaname> depending on whether
3009 		 * or not we are in the local set.  Thus, we'll pull the
3010 		 * metaname from this string.
3011 		 */
3012 		if ((metaname = strrchr(devname, '/')) == NULL) {
3013 			(void) snprintf(buf, sizeof (buf), invalid, mnum);
3014 			goto out;
3015 		}
3016 		metaname++;	/* move past slash */
3017 		if (setno == MD_LOCAL_SET) {
3018 			/* No set name. */
3019 			(void) snprintf(buf, sizeof (buf), "%s", metaname);
3020 		} else {
3021 			/* Include setname */
3022 			(void) snprintf(buf, sizeof (buf), "%s/%s",
3023 			    mddb_getsetname(setno), metaname);
3024 		}
3025 	} else {
3026 		/* We couldn't find the name. */
3027 		(void) snprintf(buf, sizeof (buf), invalid, mnum);
3028 	}
3029 
3030 out:
3031 	kmem_free(devname, MAXPATHLEN);
3032 	return (buf);
3033 }
3034 
3035 char *
3036 md_devname(
3037 	set_t		setno,
3038 	md_dev64_t	dev,
3039 	char		*buf,
3040 	size_t		size
3041 )
3042 {
3043 	static char	mybuf[MD_MAX_CTDLEN];
3044 	int		err;
3045 
3046 	if (buf == NULL) {
3047 		buf = mybuf;
3048 		size = sizeof (mybuf);
3049 	} else {
3050 		ASSERT(size >= MD_MAX_CTDLEN);
3051 	}
3052 
3053 	err = md_getdevname_common(setno, mddb_getsidenum(setno),
3054 	    0, dev, buf, size, MD_NOWAIT_LOCK);
3055 	if (err) {
3056 		if (err == ENOENT) {
3057 			(void) sprintf(buf, "(Unavailable)");
3058 		} else {
3059 			(void) sprintf(buf, "(%u.%u)",
3060 			    md_getmajor(dev), md_getminor(dev));
3061 		}
3062 	}
3063 
3064 	return (buf);
3065 }
3066 void
3067 md_minphys(buf_t *pb)
3068 {
3069 	extern unsigned md_maxbcount;
3070 
3071 	if (pb->b_bcount > md_maxbcount)
3072 		pb->b_bcount = md_maxbcount;
3073 }
3074 
3075 void
3076 md_bioinit(struct buf *bp)
3077 {
3078 	ASSERT(bp);
3079 
3080 	bioinit(bp);
3081 	bp->b_back = bp;
3082 	bp->b_forw = bp;
3083 	bp->b_flags = B_BUSY;	/* initialize flags */
3084 }
3085 
3086 void
3087 md_bioreset(struct buf *bp)
3088 {
3089 	ASSERT(bp);
3090 
3091 	bioreset(bp);
3092 	bp->b_back = bp;
3093 	bp->b_forw = bp;
3094 	bp->b_flags = B_BUSY;	/* initialize flags */
3095 }
3096 
3097 /*
3098  * md_bioclone is needed as long as the real bioclone only takes a daddr_t
3099  * as block number.
3100  * We simply call bioclone with all input parameters but blkno, and set the
3101  * correct blkno afterwards.
3102  * Caveat Emptor: bp_mem must not be NULL!
3103  */
3104 buf_t *
3105 md_bioclone(buf_t *bp, off_t off, size_t len, dev_t dev, diskaddr_t blkno,
3106 		int (*iodone)(buf_t *), buf_t *bp_mem, int sleep)
3107 {
3108 	(void) bioclone(bp, off, len, dev, 0, iodone, bp_mem, sleep);
3109 	bp_mem->b_lblkno = blkno;
3110 	return (bp_mem);
3111 }
3112 
3113 
3114 /*
3115  * kstat stuff
3116  */
3117 void
3118 md_kstat_init_ui(
3119 	minor_t		 mnum,
3120 	mdi_unit_t	*ui
3121 )
3122 {
3123 	if ((ui != NULL) && (ui->ui_kstat == NULL)) {
3124 		set_t	setno = MD_MIN2SET(mnum);
3125 		unit_t  unit = MD_MIN2UNIT(mnum);
3126 		char	module[KSTAT_STRLEN];
3127 		char	*p = module;
3128 
3129 		if (setno != MD_LOCAL_SET) {
3130 			char	buf[64];
3131 			char	*s = buf;
3132 			char	*e = module + sizeof (module) - 4;
3133 
3134 			(void) sprintf(buf, "%u", setno);
3135 			while ((p < e) && (*s != '\0'))
3136 				*p++ = *s++;
3137 			*p++ = '/';
3138 		}
3139 		*p++ = 'm';
3140 		*p++ = 'd';
3141 		*p = '\0';
3142 		if ((ui->ui_kstat = kstat_create(module, unit, NULL, "disk",
3143 		    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) {
3144 			ui->ui_kstat->ks_lock = &ui->ui_mx;
3145 			kstat_install(ui->ui_kstat);
3146 		}
3147 	}
3148 }
3149 
3150 void
3151 md_kstat_init(
3152 	minor_t		mnum
3153 )
3154 {
3155 	md_kstat_init_ui(mnum, MDI_UNIT(mnum));
3156 }
3157 
3158 void
3159 md_kstat_destroy_ui(
3160 	mdi_unit_t	*ui
3161 )
3162 {
3163 	/*
3164 	 * The kstat_delete() interface has its own locking mechanism and
3165 	 * does not allow the kstat lock (ks_lock) to be held.
3166 	 * Note: ks_lock == ui_mx from the md_kstat_init_ui().
3167 	 */
3168 	if ((ui != NULL) && (ui->ui_kstat != NULL)) {
3169 		kstat_delete(ui->ui_kstat);
3170 		ui->ui_kstat = NULL;
3171 	}
3172 }
3173 
3174 void
3175 md_kstat_destroy(
3176 	minor_t		mnum
3177 )
3178 {
3179 	md_kstat_destroy_ui(MDI_UNIT(mnum));
3180 }
3181 
3182 /*
3183  * In the following routines, locks are held before checking the
3184  * validity of ui_kstat. This is done to make sure that we don't trip over
3185  * a NULL ui_kstat anymore.
3186  */
3187 
3188 void
3189 md_kstat_waitq_enter(
3190 	mdi_unit_t	*ui
3191 )
3192 {
3193 	mutex_enter(&ui->ui_mx);
3194 	if (ui->ui_kstat != NULL)
3195 		kstat_waitq_enter(KSTAT_IO_PTR(ui->ui_kstat));
3196 	mutex_exit(&ui->ui_mx);
3197 }
3198 
3199 void
3200 md_kstat_waitq_to_runq(
3201 	mdi_unit_t	*ui
3202 )
3203 {
3204 	mutex_enter(&ui->ui_mx);
3205 	if (ui->ui_kstat != NULL)
3206 		kstat_waitq_to_runq(KSTAT_IO_PTR(ui->ui_kstat));
3207 	mutex_exit(&ui->ui_mx);
3208 }
3209 
3210 void
3211 md_kstat_waitq_exit(
3212 	mdi_unit_t	*ui
3213 )
3214 {
3215 	mutex_enter(&ui->ui_mx);
3216 	if (ui->ui_kstat != NULL)
3217 		kstat_waitq_exit(KSTAT_IO_PTR(ui->ui_kstat));
3218 	mutex_exit(&ui->ui_mx);
3219 }
3220 
3221 void
3222 md_kstat_runq_enter(
3223 	mdi_unit_t	*ui
3224 )
3225 {
3226 	mutex_enter(&ui->ui_mx);
3227 	if (ui->ui_kstat != NULL)
3228 		kstat_runq_enter(KSTAT_IO_PTR(ui->ui_kstat));
3229 	mutex_exit(&ui->ui_mx);
3230 }
3231 
3232 void
3233 md_kstat_runq_exit(
3234 	mdi_unit_t	*ui
3235 )
3236 {
3237 	mutex_enter(&ui->ui_mx);
3238 	if (ui->ui_kstat != NULL)
3239 		kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
3240 	mutex_exit(&ui->ui_mx);
3241 }
3242 
3243 void
3244 md_kstat_done(
3245 	mdi_unit_t	*ui,
3246 	buf_t		*bp,
3247 	int		war
3248 )
3249 {
3250 	size_t  n_done;
3251 
3252 	/* check for end of device */
3253 	if ((bp->b_resid != 0) && (! (bp->b_flags & B_ERROR))) {
3254 		n_done = bp->b_bcount;
3255 	} else if (bp->b_bcount < bp->b_resid) {
3256 		n_done = 0;
3257 	} else {
3258 		n_done = bp->b_bcount - bp->b_resid;
3259 	}
3260 
3261 	/* do accounting */
3262 	mutex_enter(&ui->ui_mx);
3263 	if (ui->ui_kstat != NULL) {
3264 		if ((! war) && (bp->b_flags & B_READ)) {
3265 			KSTAT_IO_PTR(ui->ui_kstat)->reads++;
3266 			KSTAT_IO_PTR(ui->ui_kstat)->nread += n_done;
3267 		} else {
3268 			KSTAT_IO_PTR(ui->ui_kstat)->writes++;
3269 			KSTAT_IO_PTR(ui->ui_kstat)->nwritten += n_done;
3270 		}
3271 		kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
3272 	}
3273 	mutex_exit(&ui->ui_mx);
3274 }
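
/*
 * Typical accounting sequence in a submodule I/O path (a sketch only; whether
 * the waitq/runq split is used depends on the submodule):
 *
 *	md_kstat_waitq_enter(ui);	request queued
 *	md_kstat_waitq_to_runq(ui);	request issued to the layer below
 *	...
 *	md_kstat_done(ui, bp, 0);	completion: count bytes, exit the runq
 */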
3275 
3276 pid_t
3277 md_getpid()
3278 {
3279 	pid_t valuep;
3280 	if (drv_getparm(PPID, (pid_t *)&valuep) != 0) {
3281 		ASSERT(0);
3282 		return ((pid_t)0);
3283 	} else {
3284 		ASSERT(valuep);
3285 		return (valuep);
3286 	}
3287 }
3288 
3289 
3290 proc_t *
3291 md_getproc()
3292 {
3293 	proc_t  *valuep;
3294 	if (drv_getparm(UPROCP, (proc_t **)&valuep) != 0) {
3295 		ASSERT(0);
3296 		return ((proc_t *)NULL);
3297 	} else {
3298 		ASSERT(valuep);
3299 		return (valuep);
3300 	}
3301 }
3302 
3303 extern kmutex_t pidlock;
3304 
3305 /*
3306  * This checks whether a pid/proc pair is still running.  For the
3307  * diskset lock, when both pid and proc are zero the lock is not
3308  * currently held.
3309  */
3310 int
3311 md_checkpid(pid_t pid, proc_t *proc)
3312 {
3313 	int	retval = 1;
3314 
3315 	if (pid == 0 && proc == NULL)
3316 		return (0);
3317 
3318 	mutex_enter(&pidlock);
3319 	if (prfind(pid)  != proc)
3320 		retval = 0;
3321 	mutex_exit(&pidlock);
3322 	return (retval);
3323 }
3324 
3325 /*
3326  * NAME: md_init_probereq
3327  *
3328  * DESCRIPTION: initializes a probe request. Parcels out the mnums such that
3329  *		they can be dispatched to multiple daemon threads.
3330  *
3331  * PARAMETERS: struct md_probedev_impl *p	pointer to the ioctl input
3332  *
3333  * RETURN VALUE: Returns errno
3334  *
3335  */
3336 
3337 int
3338 md_init_probereq(struct md_probedev_impl *p, daemon_queue_t **hdrpp)
3339 {
3340 	int		err = 0;
3341 	int		modindx;
3342 	intptr_t	(*probe_test)();
3343 
3344 	/*
3345 	 * Initialize the semaphores and mutex
3346 	 * for the request
3347 	 */
3348 
3349 	p->probe_sema = kmem_alloc(sizeof (ksema_t), KM_SLEEP);
3350 
3351 	p->probe_mx = kmem_alloc(sizeof (kmutex_t), KM_SLEEP);
3352 	sema_init(PROBE_SEMA(p), 0, NULL, SEMA_DRIVER, NULL);
3353 	mutex_init(PROBE_MX(p), NULL, MUTEX_DEFAULT, NULL);
3354 
3355 	modindx = md_getmodindex(&(p->probe.md_driver), 1, 1);
3356 	probe_test = md_get_named_service(NODEV64, modindx,
3357 	    p->probe.test_name, 0);
3358 	if (probe_test == NULL) {
3359 		err = EINVAL;
3360 		goto err_out;
3361 	}
3362 
3363 	err = md_create_probe_rqlist(p, hdrpp, probe_test);
3364 err_out:
3365 	return (err);
3366 }
3367 
3368 /*
3369  * NAME: md_probe_one
3370  *
3371  * DESCRIPTION: Generic routine for probing disks. This is called from the
3372  *		daemon.
3373  *
3374  * PARAMETERS: probe_req_t	*reqp	pointer to the probe request structure.
3375  *
3376  */
3377 
3378 void
3379 md_probe_one(probe_req_t *reqp)
3380 {
3381 	mdi_unit_t		*ui;
3382 	md_probedev_impl_t	*p;
3383 	int			err = 0;
3384 	set_t			setno;
3385 
3386 	p = (md_probedev_impl_t *)reqp->private_handle;
3387 	/*
3388 	 * Validate the unit while holding the global ioctl lock, then
3389 	 * obtain the unit_writerlock. Once the writerlock has been obtained
3390 	 * we can release the global lock. As long as we hold one of these
3391 	 * locks this will prevent a metaclear operation being performed
3392 	 * on the metadevice because metaclear takes the readerlock (via
3393 	 * openclose lock).
3394 	 * To avoid a potential deadlock with the probe_fcn() causing i/o to
3395 	 * be issued to the writerlock'd metadevice we only grab the writerlock
3396 	 * if the unit is not an SVM root device.
3397 	 */
3398 	while (md_ioctl_lock_enter() == EINTR)
3399 		;
3400 	setno = MD_MIN2SET(reqp->mnum);
3401 	ui = MDI_UNIT(reqp->mnum);
3402 	if (ui != NULL) {
3403 		int	writer_grabbed;
3404 		dev_t	svm_root;
3405 
3406 		if ((setno == MD_LOCAL_SET) && root_is_svm) {
3407 			svm_root = getrootdev();
3408 
3409 			if (getminor(svm_root) == reqp->mnum) {
3410 				writer_grabbed = 0;
3411 			} else {
3412 				writer_grabbed = 1;
3413 				(void) md_unit_writerlock_common(ui, 0);
3414 			}
3415 		} else {
3416 			writer_grabbed = 1;
3417 			(void) md_unit_writerlock_common(ui, 0);
3418 		}
3419 		(void) md_ioctl_lock_exit(0, 0, 0, FALSE);
3420 		err = (*reqp->probe_fcn)(ui, reqp->mnum);
3421 		if (writer_grabbed) {
3422 			md_unit_writerexit(ui);
3423 		}
3424 	} else {
3425 		(void) md_ioctl_lock_exit(0, 0, 0, FALSE);
3426 	}
3427 
3428 	/* update the info in the probe structure */
3429 
3430 	mutex_enter(PROBE_MX(p));
3431 	if (err != 0) {
3432 		cmn_err(CE_NOTE, "md_probe_one: err %d mnum %d\n", err,
3433 		    reqp->mnum);
3434 		(void) mdsyserror(&(p->probe.mde), err);
3435 	}
3436 
3437 	mutex_exit(PROBE_MX(p));
3438 	sema_v(PROBE_SEMA(p));
3439 
3440 	kmem_free(reqp, sizeof (probe_req_t));
3441 }
3442 char *
3443 md_strdup(char *cp)
3444 {
3445 	char *new_cp = NULL;
3446 
3447 	new_cp = kmem_alloc(strlen(cp) + 1, KM_SLEEP);
3448 
3449 	return (strcpy(new_cp, cp));
3450 }
3451 
3452 void
3453 freestr(char *cp)
3454 {
3455 	kmem_free(cp, strlen(cp) + 1);
3456 }
3457 
3458 /*
3459  * Validate the list and skip invalid devices. Then create
3460  * a doubly linked circular list of devices to probe.
3461  * The hdr points to the head and tail of this list.
3462  */
3463 
3464 static int
3465 md_create_probe_rqlist(md_probedev_impl_t *plist, daemon_queue_t **hdr,
3466 			intptr_t (*probe_test)())
3467 {
3468 	int i, err, nodevcnt;
3469 	probe_req_t *tp;
3470 	daemon_queue_t *hp;
3471 	minor_t mnum;
3472 
3473 	nodevcnt = 0;
3474 
3475 	hp = NULL;
3476 
3477 	for (i = 0; i <  plist->probe.nmdevs; i++) {
3478 		mnum = ((minor_t *)(uintptr_t)(plist->probe.mnum_list))[i];
3479 		if (MDI_UNIT(mnum) == NULL) {
3480 			cmn_err(CE_WARN, "md: Cannot probe %s since it does "
3481 			    "not exist", md_shortname(mnum));
3482 			nodevcnt++;
3483 			continue;
3484 		}
3485 		tp = kmem_alloc(sizeof (probe_req_t), KM_SLEEP);
3486 		tp->mnum = mnum;
3487 		tp->private_handle = (void *)plist;
3488 		tp->probe_fcn = probe_test;
3489 		if (hp == NULL) {
3490 			hp = (daemon_queue_t *)tp;
3491 			hp->dq_prev = hp->dq_next = (daemon_queue_t *)tp;
3492 		} else {
3493 			tp->dq.dq_next = hp;
3494 			tp->dq.dq_prev = hp->dq_prev;
3495 			hp->dq_prev->dq_next = (daemon_queue_t *)tp;
3496 			hp->dq_prev = (daemon_queue_t *)tp;
3497 		}
3498 	}
3499 
3500 	*hdr = hp;
3501 	if (nodevcnt > 0)
3502 		plist->probe.nmdevs -= nodevcnt;
3503 
3504 	/*
3505 	 * If there are no devices to be probed because they were
3506 	 * incorrect, then return an error.
3507 	 */
3508 	err = (plist->probe.nmdevs == 0) ? ENODEV : 0;
3509 
3510 	return (err);
3511 }
3512 
3513 /*
3514  * This routine increments the I/O count for set I/O operations.  This
3515  * value is used to determine if an I/O can be done.  If a release is in
3516  * progress this will return an error and cause the I/O to be errored.
3517  */
3518 int
3519 md_inc_iocount(set_t setno)
3520 {
3521 	int	rc = 0;
3522 
3523 	if (setno == 0)
3524 		return (0);
3525 
3526 	mutex_enter(&md_set_io[setno].md_io_mx);
3527 	if (!(md_set_io[setno].io_state & MD_SET_ACTIVE)) {
3528 		rc = EIO;
3529 		goto out;
3530 	}
3531 
3532 	ASSERT(md_set_io[setno].io_cnt >= 0);
3533 	md_set_io[setno].io_cnt++;
3534 
3535 out:	mutex_exit(&md_set_io[setno].md_io_mx);
3536 	return (rc);
3537 }
3538 
3539 void
3540 md_inc_iocount_noblock(set_t setno)
3541 {
3542 
3543 	if (setno == 0)
3544 		return;
3545 
3546 	mutex_enter(&md_set_io[setno].md_io_mx);
3547 	md_set_io[setno].io_cnt++;
3548 	mutex_exit(&md_set_io[setno].md_io_mx);
3549 }
3550 void
3551 md_dec_iocount(set_t setno)
3552 {
3553 
3554 	if (setno == 0)
3555 		return;
3556 
3557 	mutex_enter(&md_set_io[setno].md_io_mx);
3558 	md_set_io[setno].io_cnt--;
3559 	ASSERT(md_set_io[setno].io_cnt >= 0);
3560 	if ((md_set_io[setno].io_state & MD_SET_RELEASE) &&
3561 	    (md_set_io[setno].io_cnt == 0))
3562 		cv_broadcast(&md_set_io[setno].md_io_cv);
3563 	mutex_exit(&md_set_io[setno].md_io_mx);
3564 }
3565 
3566 int
3567 md_isblock_setio(set_t setno)
3568 {
3569 	int	rc = 0;
3570 
3571 	if (setno == 0)
3572 		return (0);
3573 
3574 	mutex_enter(&md_set_io[setno].md_io_mx);
3575 	if (md_set_io[setno].io_state & MD_SET_RELEASE)
3576 		rc = 1;
3577 
3578 	mutex_exit(&md_set_io[setno].md_io_mx);
3579 	return (rc);
3580 }
3581 
3582 int
3583 md_block_setio(set_t setno)
3584 {
3585 	int	rc = 0;
3586 
3587 	if (setno == 0)
3588 		return (1);
3589 
3590 	mutex_enter(&md_set_io[setno].md_io_mx);
3591 	md_set_io[setno].io_state = MD_SET_RELEASE;
3592 
3593 	while (md_set_io[setno].io_cnt > 0) {
3594 		cv_wait(&md_set_io[setno].md_io_cv,
3595 		    &md_set_io[setno].md_io_mx);
3596 	}
3597 	rc = 1;
3598 
3599 
3600 	ASSERT(md_set_io[setno].io_cnt == 0);
3601 	mutex_exit(&md_set_io[setno].md_io_mx);
3602 
3603 	return (rc);
3604 }
3605 
3606 void
3607 md_clearblock_setio(set_t setno)
3608 {
3609 	if (setno == 0)
3610 		return;
3611 
3612 	mutex_enter(&md_set_io[setno].md_io_mx);
3613 	md_set_io[setno].io_state = MD_SET_ACTIVE;
3614 	mutex_exit(&md_set_io[setno].md_io_mx);
3615 }
3616 
3617 void
3618 md_unblock_setio(set_t setno)
3619 {
3620 	if (setno == 0)
3621 		return;
3622 
3623 	mutex_enter(&md_set_io[setno].md_io_mx);
3624 #ifdef DEBUG
3625 	if (md_set_io[setno].io_cnt != 0) {
3626 		cmn_err(CE_NOTE, "set %d count was %ld at take",
3627 		    setno, md_set_io[setno].io_cnt);
3628 	}
3629 #endif /* DEBUG */
3630 
3631 	md_set_io[setno].io_state = MD_SET_ACTIVE;
3632 	md_set_io[setno].io_cnt = 0;
3633 	mutex_exit(&md_set_io[setno].md_io_mx);
3634 }
3635 
3636 /*
3637  * Test and set version of the md_block_setio.
3638  * Set the io_state to keep new I/O from being issued.
3639  * If there is I/O currently in progress, then set io_state to active
3640  * and return failure.  Otherwise, return a 1 for success.
3641  *
3642  * Used in a MN diskset since the commd must be suspended before
3643  * this node can attempt to withdraw from a diskset.  But, with commd
3644  * suspended, I/O may have been issued that can never finish until
3645  * commd is resumed (allocation of hotspare, etc). So, if I/O is
3646  * outstanding after diskset io_state is marked RELEASE, then set diskset
3647  * io_state back to ACTIVE and return failure.
3648  */
3649 int
3650 md_tas_block_setio(set_t setno)
3651 {
3652 	int	rc;
3653 
3654 	if (setno == 0)
3655 		return (1);
3656 
3657 	mutex_enter(&md_set_io[setno].md_io_mx);
3658 	md_set_io[setno].io_state = MD_SET_RELEASE;
3659 
3660 	if (md_set_io[setno].io_cnt > 0) {
3661 		md_set_io[setno].io_state = MD_SET_ACTIVE;
3662 		rc = 0;
3663 	} else {
3664 		rc = 1;
3665 	}
3666 
3667 	mutex_exit(&md_set_io[setno].md_io_mx);
3668 
3669 	return (rc);
3670 }
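
/*
 * Sketch of a withdraw attempt in a MN diskset using the routine above (the
 * EBUSY choice and surrounding steps are illustrative only):
 *
 *	if (md_tas_block_setio(setno) == 0)
 *		return (EBUSY);		outstanding I/O, fail the release
 *	... withdraw this node from the set ...
 *	md_clearblock_setio(setno);	re-enable I/O if the withdraw is undone
 */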
3671 
3672 void
3673 md_biodone(struct buf *pb)
3674 {
3675 	minor_t	mnum;
3676 	set_t	setno;
3677 	mdi_unit_t	*ui;
3678 
3679 	mnum = getminor(pb->b_edev);
3680 	setno = MD_MIN2SET(mnum);
3681 
3682 	if (setno == 0) {
3683 		biodone(pb);
3684 		return;
3685 	}
3686 
3687 #ifdef DEBUG
3688 	ui = MDI_UNIT(mnum);
3689 	if (!md_unit_isopen(ui))
3690 		cmn_err(CE_NOTE, "io after close on %s\n", md_shortname(mnum));
3691 #endif /* DEBUG */
3692 
3693 	/*
3694 	 * Handle the local diskset
3695 	 */
3696 	if (md_set_io[setno].io_cnt > 0)
3697 		md_dec_iocount(setno);
3698 
3699 #ifdef DEBUG
3700 	/*
3701 	 * This is being done after the lock is dropped, so there
3702 	 * are cases where it may be invalid.  It is advisory.
3703 	 */
3704 	if (md_set_io[setno].io_state & MD_SET_RELEASE) {
3705 		/* Only display this error once for this metadevice */
3706 		if ((ui->ui_tstate & MD_RELEASE_IOERR_DONE) == 0) {
3707 			cmn_err(CE_NOTE,
3708 			    "I/O to %s attempted during set RELEASE\n",
3709 			    md_shortname(mnum));
3710 			ui->ui_tstate |= MD_RELEASE_IOERR_DONE;
3711 		}
3712 	}
3713 #endif /* DEBUG */
3714 
3715 	biodone(pb);
3716 }
3717 
3718 
3719 /*
3720  * Driver special private devt handling routine
3721  * INPUT:  md_dev64_t
3722  * OUTPUT: dev_t, 32 bit on a 32 bit kernel, 64 bit on a 64 bit kernel.
3723  */
3724 dev_t
3725 md_dev64_to_dev(md_dev64_t dev)
3726 {
3727 	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
3728 	minor_t minor = (minor_t)(dev & MAXMIN64);
3729 
3730 	return (makedevice(major, minor));
3731 
3732 }
3733 
3734 /*
3735  * Driver private makedevice routine
3736  * INPUT:  major_t major, minor_t minor
3737  * OUTPUT: md_dev64_t, no matter if on 32 bit or 64 bit kernel.
3738  */
3739 md_dev64_t
3740 md_makedevice(major_t major, minor_t minor)
3741 {
3742 	return (((md_dev64_t)major << NBITSMINOR64) | minor);
3743 
3744 }
3745 
3746 
3747 /*
3748  * Driver private devt md_getmajor routine
3749  * INPUT:  dev	a 64 bit container holding either a 32 bit or a 64 bit device
3750  * OUTPUT: the appropriate major number
3751  */
3752 major_t
3753 md_getmajor(md_dev64_t dev)
3754 {
3755 	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
3756 
3757 	if (major == 0) {
3758 		/* Here we were given a 32bit dev */
3759 		major = (major_t)(dev >> NBITSMINOR32) & MAXMAJ32;
3760 	}
3761 	return (major);
3762 }
3763 
3764 /*
3765  * Driver private devt md_getminor routine
3766  * INPUT:  dev	a 64 bit container holding either a 32 bit or a 64 bit device
3767  * OUTPUT: the appropriate minor number
3768  */
3769 minor_t
3770 md_getminor(md_dev64_t dev)
3771 {
3772 	minor_t minor;
3773 	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
3774 
3775 	if (major == 0) {
3776 		/* Here we were given a 32bit dev */
3777 		minor = (minor_t)(dev & MAXMIN32);
3778 	} else {
3779 		minor = (minor_t)(dev & MAXMIN64);
3780 	}
3781 	return (minor);
3782 }
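
/*
 * Quick reference for the devt helpers above (the numbers are examples):
 * md_makedevice(85, 10) builds ((md_dev64_t)85 << NBITSMINOR64) | 10;
 * md_getmajor() and md_getminor() on that value return 85 and 10 again, and
 * md_dev64_to_dev() folds it back into the kernel's native dev_t.
 */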
3783 
3784 int
3785 md_check_ioctl_against_unit(int cmd, mdc_unit_t c)
3786 {
3787 	/*
3788 	 * If the metadevice is an old style device, it has a vtoc;
3789 	 *	in that case none of the EFI-reading ioctls are applicable.
3790 	 * If the metadevice has an EFI label, the vtoc and geom reading ioctls
3791 	 *	are not supposed to work.
3792 	 */
3793 	switch (cmd) {
3794 		case DKIOCGGEOM:
3795 		case DKIOCGAPART:
3796 			/* if > 2 TB then fail */
3797 			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3798 				return (ENOTSUP);
3799 			}
3800 			break;
3801 		case DKIOCGVTOC:
3802 			/* if > 2 TB then fail */
3803 			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3804 				return (ENOTSUP);
3805 			}
3806 
3807 			/* if > 1 TB but < 2TB return overflow */
3808 			if (c.un_revision & MD_64BIT_META_DEV) {
3809 				return (EOVERFLOW);
3810 			}
3811 			break;
3812 		case DKIOCGEXTVTOC:
3813 			/* if > 2 TB then fail */
3814 			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3815 				return (ENOTSUP);
3816 			}
3817 			break;
3818 		case DKIOCGETEFI:
3819 		case DKIOCPARTITION:
3820 			if ((c.un_flag & MD_EFILABEL) == 0) {
3821 				return (ENOTSUP);
3822 			}
3823 			break;
3824 
3825 		case DKIOCSETEFI:
3826 		/* setting an EFI label should always be ok */
3827 			return (0);
3828 
3829 		case DKIOCSVTOC:
3830 			/* if > 2 TB then fail */
3831 			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3832 				return (ENOTSUP);
3833 			}
3834 
3835 			/* if > 1 TB but < 2TB return overflow */
3836 			if (c.un_revision & MD_64BIT_META_DEV) {
3837 				return (EOVERFLOW);
3838 			}
3839 			break;
3840 		case DKIOCSEXTVTOC:
3841 			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3842 				return (ENOTSUP);
3843 			}
3844 			break;
3845 	}
3846 	return (0);
3847 }
3848 
3849 /*
3850  * md_vtoc_to_efi_record()
3851  * Input:  record id of the vtoc record
3852  * Output: record id of the efi record
3853  * Function:
3854  *	- reads the volume name from the vtoc record
3855  *	- converts the volume name to a format libefi understands
3856  *	- creates a new record of size MD_EFI_PARTNAME_BYTES
3857  *	- stores the volname in that record,
3858  *	- commits that record
3859  *	- returns the recid of the efi record.
3860  * Caveat Emptor:
3861  *	The calling routine must do something like
3862  *	- un->c.un_vtoc_id = md_vtoc_to_efi_record(vtoc_recid)
3863  *	- commit(un)
3864  *	- delete(vtoc_recid)
3865  *	in order to keep the mddb consistent in case of a panic in the middle.
3866  * Errors:
3867  *	- returns 0 on any error
3868  */
3869 mddb_recid_t
3870 md_vtoc_to_efi_record(mddb_recid_t vtoc_recid, set_t setno)
3871 {
3872 	struct vtoc	*vtoc;
3873 	ushort_t	*v;
3874 	mddb_recid_t	efi_recid;
3875 	int		i;
3876 
3877 	if (mddb_getrecstatus(vtoc_recid) != MDDB_OK) {
3878 		return (0);
3879 	}
3880 	vtoc = (struct vtoc *)mddb_getrecaddr(vtoc_recid);
3881 	efi_recid = mddb_createrec(MD_EFI_PARTNAME_BYTES, MDDB_EFILABEL, 0,
3882 	    MD_CRO_32BIT, setno);
3883 	if (efi_recid < 0) {
3884 		return (0);
3885 	}
3886 	v = (ushort_t *)mddb_getrecaddr(efi_recid);
3887 
3888 	/* This for loop read, converts and writes */
3889 	/* This for loop reads, converts, and writes */
3890 		v[i] = LE_16((uint16_t)vtoc->v_volume[i]);
3891 	}
3892 	/* commit the new record */
3893 	mddb_commitrec_wrapper(efi_recid);
3894 
3895 	return (efi_recid);
3896 }
3897 
3898 /*
3899  * Send a kernel message.
3900  * The user has to provide an allocated result structure.
3901  * If the door handler disappears we retry, emitting warnings every so often.
3902  *
3903  * The recipient argument is almost always unused, and is therefore typically
3904  * set to zero, as zero is an invalid cluster nodeid.  The exceptions are the
3905  * marking and clearing of the DRL from a node that is not currently the
3906  * owner.  In these cases, the recipient argument will be the nodeid of the
3907  * mirror owner, and MD_MSGF_DIRECTED will be set in the flags.  Non-owner
3908  * nodes will not receive these messages.
3909  *
3910  * For the case where md_mn_is_commd_present() is false, we rely on the
3911  * "result" having been kmem_zalloc()ed which, in effect, sets MDMNE_NULL for
3912  * kmmr_comm_state making MDMN_KSEND_MSG_OK() result in 0.
3913  */
3914 int
3915 mdmn_ksend_message(
3916 	set_t		setno,
3917 	md_mn_msgtype_t	type,
3918 	uint_t		flags,
3919 	md_mn_nodeid_t	recipient,
3920 	char		*data,
3921 	int		size,
3922 	md_mn_kresult_t	*result)
3923 {
3924 	door_arg_t	da;
3925 	md_mn_kmsg_t	*kmsg;
3926 	uint_t		send_try_cnt = 0;
3927 	uint_t		retry_noise_cnt = 0;
3928 	int		rval;
3929 	k_sigset_t	oldmask, newmask;
3930 
3931 	if (size > MDMN_MAX_KMSG_DATA)
3932 		return (ENOMEM);
3933 	kmsg = kmem_zalloc(sizeof (md_mn_kmsg_t), KM_SLEEP);
3934 	kmsg->kmsg_flags = flags;
3935 	kmsg->kmsg_setno = setno;
3936 	kmsg->kmsg_recipient = recipient;
3937 	kmsg->kmsg_type	= type;
3938 	kmsg->kmsg_size	= size;
3939 	bcopy(data, &(kmsg->kmsg_data), size);
3940 
3941 	/*
3942 	 * Wait for the door handle to be established.
3943 	 */
3944 	while (mdmn_door_did == -1) {
3945 		if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
3946 			cmn_err(CE_WARN, "door handle not yet ready. "
3947 			    "Check if /usr/lib/lvm/mddoors is running");
3948 		}
3949 		delay(md_hz);
3950 	}
3951 
3952 	/*
3953 	 * If MD_MSGF_BLK_SIGNAL is set, mask out all signals so that we
3954 	 * do not fail if the user process receives a signal while we're
3955 	 * active in the door interface.
3956 	 */
3957 	if (flags & MD_MSGF_BLK_SIGNAL) {
3958 		sigfillset(&newmask);
3959 		sigreplace(&newmask, &oldmask);
3960 	}
3961 
3962 	/*
3963 	 * If message failed with an RPC_FAILURE when rpc.mdcommd had
3964 	 * been gracefully shutdown (md_mn_is_commd_present returns FALSE)
3965 	 * then don't retry the message anymore.  If message
3966 	 * failed due to any other reason, then retry up to MD_MN_WARN_INTVL
3967 	 * times which should allow a shutting down system time to
3968 	 * notify the kernel of a graceful shutdown of rpc.mdcommd.
3969 	 *
3970 	 * Caller of this routine will need to check the md_mn_commd_present
3971 	 * flag and the failure error in order to determine whether to panic
3972 	 * or not.  If md_mn_commd_present is set to 0 and failure error
3973 	 * is RPC_FAILURE, the calling routine should not panic since the
3974 	 * system is in the process of being shutdown.
3975 	 *
3976 	 */
3977 
3978 	retry_noise_cnt = send_try_cnt = 0;
3979 	while (md_mn_is_commd_present_lite()) {
3980 		/*
3981 		 * data_ptr and data_size are initialized here because on
3982 		 * return from the upcall, they contain data duplicated from
3983 		 * rbuf and rsize.  This causes subsequent upcalls to fail.
3984 		 */
3985 		da.data_ptr = (char *)(kmsg);
3986 		da.data_size = sizeof (md_mn_kmsg_t);
3987 		da.desc_ptr = NULL;
3988 		da.desc_num = 0;
3989 		da.rbuf = (char *)result;
3990 		da.rsize = sizeof (*result);
3991 
3992 		while ((rval = door_ki_upcall_limited(mdmn_door_handle, &da,
3993 		    NULL, SIZE_MAX, 0)) != 0) {
3994 			if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
3995 				if (rval == EAGAIN)  {
3996 					cmn_err(CE_WARN,
3997 					    "md: door_upcall failed. "
3998 					    "Check if mddoors is running.");
3999 				} else if (rval == EINTR) {
4000 					cmn_err(CE_WARN,
4001 					    "md: door_upcall failed. "
4002 					    "Check if rpc.mdcommd is running.");
4003 				} else {
4004 					cmn_err(CE_WARN,
4005 					    "md: door_upcall failed. "
4006 					    "Returned %d",
4007 					    rval);
4008 				}
4009 			}
4010 			if (++send_try_cnt >= md_send_retry_limit)
4011 				break;
4012 
4013 			delay(md_hz);
4014 
4015 			/*
4016 			 * data_ptr and data_size are re-initialized here
4017 			 * because on return from the upcall, they contain
4018 			 * data duplicated from rbuf and rsize.  This causes
4019 			 * subsequent upcalls to fail.
4020 			 */
4021 			da.data_ptr = (char *)(kmsg);
4022 			da.data_size = sizeof (md_mn_kmsg_t);
4023 			da.desc_ptr = NULL;
4024 			da.desc_num = 0;
4025 			da.rbuf = (char *)result;
4026 			da.rsize = sizeof (*result);
4027 		}
4028 
4029 
4030 		/*
4031 		 * Punch out of the outer loop, prior to the delay(), if:
4032 		 * - the send succeeded (MDMNE_ACK)
4033 		 * - we were told not to retry (MD_MSGF_KSEND_NORETRY)
4034 		 * - we have exceeded the send retry limit
4035 		 *   (md_send_retry_limit)
4036 		 * - rpc.mdcommd is no longer present
4037 		 *   (no need to check for MDMNE_RPC_FAIL here, since the
4038 		 *   outer loop already depends on commd being present)
4039 		 */
4040 		if (result->kmmr_comm_state == MDMNE_ACK ||
4041 		    (flags & MD_MSGF_KSEND_NORETRY) ||
4042 		    (++send_try_cnt % md_send_retry_limit) == 0 ||
4043 		    !md_mn_is_commd_present())
4044 			break;
4045 		delay(md_hz);
4046 	}
4047 
4048 	if (flags & MD_MSGF_BLK_SIGNAL) {
4049 		sigreplace(&oldmask, (k_sigset_t *)NULL);
4050 	}
4051 	kmem_free(kmsg, sizeof (md_mn_kmsg_t));
4052 
4053 	return (0);
4054 }
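/*
 * Illustrative usage sketch only (not part of the original driver): a
 * typical caller of mdmn_ksend_message() allocates the result structure,
 * sends the message, tests the outcome with MDMN_KSEND_MSG_OK() and
 * reports failures through mdmn_ksend_show_error().  The message type,
 * flags and payload below are placeholders; see
 * mdmn_send_capability_message() below for a real caller.
 *
 *	md_mn_kresult_t	*kres;
 *	int		ret;
 *
 *	kres = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
 *	ret = mdmn_ksend_message(setno, MD_MN_MSG_SET_CAP, MD_MSGF_NO_LOG,
 *	    0, (char *)&msg, sizeof (msg), kres);
 *	if (!MDMN_KSEND_MSG_OK(ret, kres)) {
 *		mdmn_ksend_show_error(ret, kres, "MD_MN_MSG_SET_CAP");
 *		ret = EIO;
 *	}
 *	kmem_free(kres, sizeof (md_mn_kresult_t));
 */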
4055 
4056 /*
4057  * Called to propagate the capability of a metadevice to all nodes in the set.
4058  *
4059  * On entry, lockp is set if the function has been called from within an ioctl.
4060  *
4061  * IOLOCK_RETURN_RELEASE, which drops md_ioctl_lock, is called in this
4062  * routine to enable other md ioctls to enter the kernel while this
4063  * thread of execution waits on the completion of mdmn_ksend_message. When
4064  * the message completes, the thread continues and md_ioctl_lock must be
4065  * reacquired.  Even though md_ioctl_lock is interruptible, we choose to
4066  * ignore EINTR, as we must not return without reacquiring md_ioctl_lock.
4067  */
4068 
4069 int
4070 mdmn_send_capability_message(minor_t mnum, volcap_t vc, IOLOCK *lockp)
4071 {
4072 	md_mn_msg_setcap_t	msg;
4073 	md_mn_kresult_t		*kres;
4074 	mdi_unit_t		*ui = MDI_UNIT(mnum);
4075 	int			ret;
4076 	k_sigset_t		oldmask, newmask;
4077 
4078 	(void) strncpy((char *)&msg.msg_setcap_driver,
4079 	    md_ops[ui->ui_opsindex]->md_driver.md_drivername, MD_DRIVERNAMELEN);
4080 	msg.msg_setcap_mnum = mnum;
4081 	msg.msg_setcap_set = vc.vc_set;
4082 
4083 	if (lockp)
4084 		IOLOCK_RETURN_RELEASE(0, lockp);
4085 	kres = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
4086 
4087 	/*
4088 	 * Mask signals for the mdmn_ksend_message call.  This keeps the door
4089 	 * interface from failing if the user process receives a signal while
4090 	 * in mdmn_ksend_message.
4091 	 */
4092 	sigfillset(&newmask);
4093 	sigreplace(&newmask, &oldmask);
4094 	ret = (mdmn_ksend_message(MD_MIN2SET(mnum), MD_MN_MSG_SET_CAP,
4095 	    MD_MSGF_NO_LOG, 0, (char *)&msg, sizeof (md_mn_msg_setcap_t),
4096 	    kres));
4097 	sigreplace(&oldmask, (k_sigset_t *)NULL);
4098 
4099 	if (!MDMN_KSEND_MSG_OK(ret, kres)) {
4100 		mdmn_ksend_show_error(ret, kres, "MD_MN_MSG_SET_CAP");
4101 		ret = EIO;
4102 	}
4103 	kmem_free(kres, sizeof (md_mn_kresult_t));
4104 
4105 	if (lockp) {
4106 		IOLOCK_RETURN_REACQUIRE(lockp);
4107 	}
4108 	return (ret);
4109 }
4110 
4111 /*
4112  * Called to clear all of the transient capabilities for a metadevice when
4113  * it is not open on any node in the cluster.
4114  * Called from close for mirror and sp.
4115  */
4116 
4117 void
4118 mdmn_clear_all_capabilities(minor_t mnum)
4119 {
4120 	md_isopen_t	clumsg;
4121 	int		ret;
4122 	md_mn_kresult_t	*kresult;
4123 	volcap_t	vc;
4124 	k_sigset_t	oldmask, newmask;
4125 
4126 	clumsg.dev = md_makedevice(md_major, mnum);
4127 	clumsg.mde = mdnullerror;
4128 	/*
4129 	 * The check open message doesn't have to be logged, nor should the
4130 	 * result be stored in the MCT. We want an up-to-date state.
4131 	 */
4132 	kresult = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
4133 
4134 	/*
4135 	 * Mask signals for the mdmn_ksend_message call.  This keeps the door
4136 	 * interface from failing if the user process receives a signal while
4137 	 * in mdmn_ksend_message.
4138 	 */
4139 	sigfillset(&newmask);
4140 	sigreplace(&newmask, &oldmask);
4141 	ret = mdmn_ksend_message(MD_MIN2SET(mnum),
4142 	    MD_MN_MSG_CLU_CHECK,
4143 	    MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT, 0,
4144 	    (char *)&clumsg, sizeof (clumsg), kresult);
4145 	sigreplace(&oldmask, (k_sigset_t *)NULL);
4146 
4147 	if ((ret == 0) && (kresult->kmmr_exitval == 0)) {
4148 		/*
4149 		 * Not open on any node, so clear all capabilities, e.g.
4150 		 * ABR and DMR.
4151 		 */
4152 		vc.vc_set = 0;
4153 		(void) mdmn_send_capability_message(mnum, vc, NULL);
4154 	}
4155 	kmem_free(kresult, sizeof (md_mn_kresult_t));
4156 }
4157 
4158 /*
4159  * mdmn_ksend_show_error:
4160  * ---------------------
4161  * Called to display the error contents of a failing mdmn_ksend_message() result
4162  *
4163  * Input:
4164  *	rv	- return value from mdmn_ksend_message()
4165  *	kres	- pointer to result structure filled in by mdmn_ksend_message
4166  *	s	- Informative message to identify the failing condition
4167  *		  (e.g. "Ownership change").  This string is displayed with
4168  *		  cmn_err(CE_WARN, "%s *FAILED*",...) to alert the system
4169  *		  administrator.
4170  */
4171 void
4172 mdmn_ksend_show_error(int rv, md_mn_kresult_t *kres, const char *s)
4173 {
4174 	if (rv == 0) {
4175 		cmn_err(CE_WARN, "%s *FAILED*", s);
4176 		cmn_err(CE_CONT, "exit_val = %d, comm_state = %d, failing_node"
4177 		    " = %d", kres->kmmr_exitval, kres->kmmr_comm_state,
4178 		    kres->kmmr_failing_node);
4179 	} else {
4180 		cmn_err(CE_WARN, "%s *FAILED*, return value = %d", s, rv);
4181 	}
4182 }
4183 
4184 /*
4185  * Callback routine for the resync thread. If requested to suspend, we mark
4186  * the commd as not being present.
4187  */
4188 boolean_t
4189 callb_md_mrs_cpr(void *arg, int code)
4190 {
4191 	callb_cpr_t *cp = (callb_cpr_t *)arg;
4192 	int ret = 0;				/* assume success */
4193 
4194 	mutex_enter(cp->cc_lockp);
4195 
4196 	switch (code) {
4197 	case CB_CODE_CPR_CHKPT:
4198 		/*
4199 		 * Mark the rpc.mdcommd as no longer present. We are trying to
4200 		 * suspend the system and so we should expect RPC failures to
4201 		 * occur.
4202 		 */
4203 		md_mn_clear_commd_present();
4204 		cp->cc_events |= CALLB_CPR_START;
4205 		while (!(cp->cc_events & CALLB_CPR_SAFE))
4206 			/* cv_timedwait() returns -1 if it times out. */
4207 			if ((ret = cv_timedwait(&cp->cc_callb_cv, cp->cc_lockp,
4208 			    lbolt + CPR_KTHREAD_TIMEOUT_SEC * hz)) == -1)
4209 				break;
4210 		break;
4211 
4212 	case CB_CODE_CPR_RESUME:
4213 		cp->cc_events &= ~CALLB_CPR_START;
4214 		cv_signal(&cp->cc_stop_cv);
4215 		break;
4216 	}
4217 	mutex_exit(cp->cc_lockp);
4218 	return (ret != -1);
4219 }
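/*
 * Illustrative usage sketch only (not part of the original driver):
 * callb_md_mrs_cpr() is installed as a CPR callback for a resync thread,
 * typically through CALLB_CPR_INIT().  The lock and thread names below
 * are placeholders, not the actual mirror resync code.
 *
 *	callb_cpr_t	cprinfo;
 *	kmutex_t	cpr_mx;
 *
 *	mutex_init(&cpr_mx, NULL, MUTEX_DEFAULT, NULL);
 *	CALLB_CPR_INIT(&cprinfo, &cpr_mx, callb_md_mrs_cpr, "md_resync");
 *	...
 *	mutex_enter(&cpr_mx);
 *	CALLB_CPR_EXIT(&cprinfo);	(this also drops cpr_mx)
 */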
4220 
4221 
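/*
 * md_rem_hspname:
 * Remove the namespace record identified by n_key (a hot spare pool name)
 * from every side of the given set.  Multi-node sets have MD_MNMAXSIDES
 * sides; traditional sets have MD_MAXSIDES.
 */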
4222 void
4223 md_rem_hspname(set_t setno, mdkey_t n_key)
4224 {
4225 	int	s;
4226 	int	max_sides;
4227 
4228 
4229 	/* All entries removed are in the same diskset */
4230 	if (md_get_setstatus(setno) & MD_SET_MNSET)
4231 		max_sides = MD_MNMAXSIDES;
4232 	else
4233 		max_sides = MD_MAXSIDES;
4234 
4235 	for (s = 0; s < max_sides; s++)
4236 		(void) md_remdevname(setno, s, n_key);
4237 }
4238 
4239 
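/*
 * md_rem_selfname:
 * Remove the namespace entry for the metadevice itself, identified by its
 * minor number.  The not-shared namespace is searched for the key matching
 * the metadevice's dev_t, and that key is then removed from every side of
 * the set.  Returns 0 on success or ENOENT if no entry is found.
 */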
4240 int
4241 md_rem_selfname(minor_t selfid)
4242 {
4243 	int	s;
4244 	set_t	setno = MD_MIN2SET(selfid);
4245 	int	max_sides;
4246 	md_dev64_t	dev;
4247 	struct nm_next_hdr	*nh;
4248 	struct nm_name	*n;
4249 	mdkey_t key;
4250 
4251 	/*
4252 	 * Get the key, since the remove routine expects it
4253 	 */
4254 	dev = md_makedevice(md_major, selfid);
4255 	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
4256 		return (ENOENT);
4257 	}
4258 
4259 	if ((n = (struct nm_name *)lookup_entry(nh, setno, MD_SIDEWILD,
4260 	    MD_KEYWILD, dev, 0L)) == NULL) {
4261 		return (ENOENT);
4262 	}
4263 
4264 	/* All entries removed are in the same diskset */
4265 	key = n->n_key;
4266 	if (md_get_setstatus(setno) & MD_SET_MNSET)
4267 		max_sides = MD_MNMAXSIDES;
4268 	else
4269 		max_sides = MD_MAXSIDES;
4270 
4271 	for (s = 0; s < max_sides; s++)
4272 		(void) md_remdevname(setno, s, key);
4273 
4274 	return (0);
4275 }
4276 
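/*
 * md_upd_set_unnext:
 * Lower the set's next-available-unit hint (s_un_next) if the supplied
 * unit number is below the current value.
 */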
4277 void
4278 md_upd_set_unnext(set_t setno, unit_t un)
4279 {
4280 	if (un < md_set[setno].s_un_next) {
4281 		md_set[setno].s_un_next = un;
4282 	}
4283 }
4284 
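/*
 * find_hot_spare_pool:
 * Walk the set's hot spare pool list and return the pool whose hsp_self_id
 * matches hsp_id, or NULL if no such pool exists.
 */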
4285 struct hot_spare_pool *
4286 find_hot_spare_pool(set_t setno, int hsp_id)
4287 {
4288 	hot_spare_pool_t *hsp;
4289 
4290 	hsp = (hot_spare_pool_t *)md_set[setno].s_hsp;
4291 	while (hsp != NULL) {
4292 		if (hsp->hsp_self_id == hsp_id)
4293 			return (hsp);
4294 		hsp = hsp->hsp_next;
4295 	}
4296 
4297 	return ((hot_spare_pool_t *)0);
4298 }
4299 
4300 /*
4301  * md_create_taskq:
4302  *
4303  * Create a kernel taskq for the given set/unit combination. This is typically
4304  * used to complete an RR_CLEAN request when the callee is unable to obtain
4305  * the mutex/condvar access required to update the DRL safely.
4306  */
4307 void *
4308 md_create_taskq(set_t setno, minor_t mnum)
4309 {
4310 	char			name[20];
4311 	ddi_taskq_t		*tqp;
4312 
4313 	(void) snprintf(name, 20, "%d/d%d", setno, MD_MIN2UNIT(mnum));
4314 
4315 	tqp = ddi_taskq_create(md_devinfo, name, 1, TASKQ_DEFAULTPRI, 0);
4316 
4317 	return ((void *)tqp);
4318 }
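/*
 * Illustrative usage sketch only (not part of the original driver): a
 * caller would typically dispatch work to the taskq returned by
 * md_create_taskq() and destroy it once the work is complete.  The
 * handler and argument below are placeholders.
 *
 *	ddi_taskq_t	*tqp;
 *
 *	tqp = (ddi_taskq_t *)md_create_taskq(setno, mnum);
 *	if (tqp != NULL) {
 *		(void) ddi_taskq_dispatch(tqp, rr_clean_handler,
 *		    (void *)arg, DDI_SLEEP);
 *		...
 *		ddi_taskq_destroy(tqp);
 *	}
 */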
4319