xref: /onnv-gate/usr/src/uts/common/io/lvm/mirror/mirror.c (revision 46:042bf15ebd92)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/conf.h>
32 #include <sys/file.h>
33 #include <sys/user.h>
34 #include <sys/uio.h>
35 #include <sys/t_lock.h>
36 #include <sys/buf.h>
37 #include <sys/dkio.h>
38 #include <sys/vtoc.h>
39 #include <sys/kmem.h>
40 #include <vm/page.h>
41 #include <sys/cmn_err.h>
42 #include <sys/sysmacros.h>
43 #include <sys/types.h>
44 #include <sys/mkdev.h>
45 #include <sys/stat.h>
46 #include <sys/open.h>
47 #include <sys/modctl.h>
48 #include <sys/ddi.h>
49 #include <sys/sunddi.h>
50 #include <sys/debug.h>
51 #include <sys/dklabel.h>
52 #include <vm/hat.h>
53 #include <sys/lvm/md_mirror.h>
54 #include <sys/lvm/md_convert.h>
55 #include <sys/lvm/md_mddb.h>
56 #include <sys/esunddi.h>
57 
58 #include <sys/sysevent/eventdefs.h>
59 #include <sys/sysevent/svm.h>
60 #include <sys/lvm/mdmn_commd.h>
61 
62 md_ops_t		mirror_md_ops;
63 #ifndef	lint
64 static char		_depends_on[] = "drv/md";
65 md_ops_t		*md_interface_ops = &mirror_md_ops;
66 #endif
67 
68 extern mdq_anchor_t	md_done_daemon;
69 extern mdq_anchor_t	md_mstr_daemon;
70 extern mdq_anchor_t	md_mirror_daemon;
71 extern mdq_anchor_t	md_mirror_io_daemon;
72 extern mdq_anchor_t	md_mirror_rs_daemon;
73 extern mdq_anchor_t	md_mhs_daemon;
74 
75 extern unit_t		md_nunits;
76 extern set_t		md_nsets;
77 extern md_set_t		md_set[];
78 
79 extern int		md_status;
80 extern clock_t		md_hz;
81 
82 extern md_krwlock_t	md_unit_array_rw;
83 extern kmutex_t		md_mx;
84 extern kcondvar_t	md_cv;
85 extern int		md_mtioctl_cnt;
86 
87 daemon_request_t	mirror_timeout;
88 static daemon_request_t	hotspare_request;
89 static daemon_request_t	mn_hs_request[MD_MAXSETS];	/* Multinode hs req */
90 
91 int	md_mirror_mcs_buf_off;
92 
93 /* Flags for mdmn_ksend_message to allow debugging */
94 int	md_mirror_msg_flags;
95 
96 #ifdef DEBUG
97 /* Flag to switch on debug messages */
98 int	mirror_debug_flag = 0;
99 #endif
100 
101 /*
102  * Struct used to hold a count of DMR reads and the timestamp of the last
103  * DMR read. It is used to verify, using a debugger, that the DMR read ioctl
104  * has been executed.
105  */
106 dmr_stats_t	mirror_dmr_stats = {0, 0};
107 
108 /*
109  * Mutex protecting list of non-failfast drivers.
110  */
111 static kmutex_t	non_ff_drv_mutex;
112 static char	**non_ff_drivers = NULL;
113 
114 extern major_t	md_major;
115 
116 /*
117  * Write-On-Write memory pool.
118  */
119 static void		copy_write_cont(wowhdr_t *wowhdr);
120 static kmem_cache_t	*mirror_wowblk_cache = NULL;
121 static int		md_wowbuf_size = 16384;
122 static size_t		md_wowblk_size;
123 
124 /*
125  * md_mirror_wow_flg is a flag that allows:
126  *	- disabling the write-on-write mechanism
127  *	- logging occurrences of write-on-write
128  *	- switching wow handling procedure processing
129  * md_mirror_wow_cnt is a counter for occurrences of WOW.
130  */
131 static uint_t	md_mirror_wow_flg = 0;
132 static int	md_mirror_wow_cnt = 0;
133 
134 /*
135  * Tunable to enable/disable dirty region
136  * processing when closing down a mirror.
137  */
138 static int	new_resync = 1;
139 kmem_cache_t	*mirror_parent_cache = NULL;
140 kmem_cache_t	*mirror_child_cache = NULL;
141 
142 extern int	md_ff_disable;		/* disable failfast */
143 
144 static int	mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
145 static void	mirror_read_strategy(buf_t *, int, void *);
146 static void	mirror_write_strategy(buf_t *, int, void *);
147 static void	become_owner(daemon_queue_t *);
148 static int	mirror_done(struct buf *cb);
149 static int	mirror_done_common(struct buf *cb);
150 static void	clear_retry_error(struct buf *cb);
151 
152 /*
153  * patchables
154  */
155 int	md_min_rr_size	= 200;	/* 2000 blocks, or 100k */
156 int	md_def_num_rr	= 1000;	/* Default number of dirty regions */
157 
158 /*
159  * patchable to change delay before rescheduling mirror ownership request.
160  * Value is clock ticks, default 0.5 seconds
161  */
162 clock_t	md_mirror_owner_to = 500000;
163 
164 /*ARGSUSED1*/
165 static int
166 mirror_parent_constructor(void *p, void *d1, int d2)
167 {
168 	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
169 	return (0);
170 }
171 
172 static void
173 mirror_parent_init(md_mps_t *ps)
174 {
175 	bzero(ps, offsetof(md_mps_t, ps_mx));
176 }
177 
178 /*ARGSUSED1*/
179 static void
180 mirror_parent_destructor(void *p, void *d)
181 {
182 	mutex_destroy(&((md_mps_t *)p)->ps_mx);
183 }
184 
185 /*ARGSUSED1*/
186 static int
187 mirror_child_constructor(void *p, void *d1, int d2)
188 {
189 	bioinit(&((md_mcs_t *)p)->cs_buf);
190 	return (0);
191 }
192 
193 void
194 mirror_child_init(md_mcs_t *cs)
195 {
196 	cs->cs_ps = NULL;
197 	cs->cs_mdunit = 0;
198 	md_bioreset(&cs->cs_buf);
199 }
200 
201 /*ARGSUSED1*/
202 static void
203 mirror_child_destructor(void *p, void *d)
204 {
205 	biofini(&((md_mcs_t *)p)->cs_buf);
206 }
207 
208 static void
209 mirror_wowblk_init(wowhdr_t *p)
210 {
211 	bzero(p, md_wowblk_size);
212 }
213 
214 static void
215 send_poke_hotspares_msg(daemon_request_t *drq)
216 {
217 	int			rval;
218 	md_mn_msg_pokehsp_t	pokehsp;
219 	md_mn_kresult_t		*kresult;
220 	set_t			setno = (set_t)drq->dq.qlen;
221 
222 	pokehsp.pokehsp_setno = setno;
223 
224 	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
225 	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
226 	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, (char *)&pokehsp,
227 	    sizeof (pokehsp), kresult);
228 
229 	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
230 		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
231 		cmn_err(CE_PANIC,
232 		    "ksend_message failure: POKE_HOTSPARES");
233 	}
234 	kmem_free(kresult, sizeof (md_mn_kresult_t));
235 
236 	/* Allow further requests to use this set's queue structure */
237 	mutex_enter(&drq->dr_mx);
238 	drq->dr_pending = 0;
239 	mutex_exit(&drq->dr_mx);
240 }
241 
242 /*
243  * Send a poke_hotspares message to the master node. To avoid swamping the
244  * commd handler with requests we only send a message if there is not one
245  * already outstanding. We punt the request to a separate thread context as we
246  * cannot afford to block waiting on the request to be serviced. This is
247  * essential when a reconfig cycle is in progress as any open() of a multinode
248  * metadevice may result in a livelock.
249  */
250 static void
251 send_poke_hotspares(set_t setno)
252 {
253 	daemon_request_t	*drq = &mn_hs_request[setno];
254 
255 	mutex_enter(&drq->dr_mx);
256 	if (drq->dr_pending == 0) {
257 		drq->dr_pending = 1;
258 		drq->dq.qlen = (int)setno;
259 		daemon_request(&md_mhs_daemon,
260 		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
261 	}
262 	mutex_exit(&drq->dr_mx);
263 }
264 
265 void
266 mirror_set_sm_state(
267 	mm_submirror_t		*sm,
268 	mm_submirror_ic_t	*smic,
269 	sm_state_t		newstate,
270 	int			force)
271 {
272 	int			compcnt;
273 	int			i;
274 	int			errcnt;
275 	sm_state_t		origstate;
276 	md_m_shared_t		*shared;
277 
278 	if (force) {
279 		sm->sm_state = newstate;
280 		uniqtime32(&sm->sm_timestamp);
281 		return;
282 	}
283 
284 	origstate = newstate;
285 
286 	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
287 	for (i = 0, errcnt = 0; i < compcnt; i++) {
288 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
289 		    (sm->sm_dev, sm, i);
290 		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
291 			newstate |= SMS_COMP_ERRED;
292 		if (shared->ms_state & (CS_RESYNC))
293 			newstate |= SMS_COMP_RESYNC;
294 		if (shared->ms_state & CS_ERRED)
295 			errcnt++;
296 	}
297 
298 	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
299 		newstate &= ~origstate;
300 
301 	if (errcnt == compcnt)
302 		newstate |= SMS_ALL_ERRED;
303 	else
304 		newstate &= ~SMS_ALL_ERRED;
305 
306 	sm->sm_state = newstate;
307 	uniqtime32(&sm->sm_timestamp);
308 }
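
/*
 * Illustrative example (hypothetical component states): a caller requests
 * SMS_RUNNING (with force == 0) on a two-component submirror where one
 * component is CS_ERRED.  The component scan sets SMS_COMP_ERRED, the
 * "newstate &= ~origstate" step then strips the requested SMS_RUNNING bit,
 * and since only one of the two components is errored SMS_ALL_ERRED stays
 * clear, leaving sm_state == SMS_COMP_ERRED.
 */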
309 
310 static int
311 mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
312 							int frm_probe)
313 {
314 	mm_submirror_t		*sm;
315 	mm_submirror_ic_t	*smic;
316 	md_m_shared_t		*shared;
317 	int			ci;
318 	int			i;
319 	int			compcnt;
320 	int			open_comp; /* flag for open component */
321 
322 	for (i = *smi; i < NMIRROR; i++) {
323 		sm = &un->un_sm[i];
324 		smic = &un->un_smic[i];
325 
326 		if (!SMS_IS(sm, SMS_INUSE))
327 			continue;
328 
329 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
330 		for (ci = *cip; ci < compcnt; ci++) {
331 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
332 			    (sm->sm_dev, sm, ci);
333 			/*
334 			 * If called from any routine but probe, we check for the
335 			 * MDM_S_ISOPEN flag. Since probe does a pseudo open, it
336 			 * sets the MDM_S_PROBEOPEN flag and we test for that flag
337 			 * instead. The two tests are mutually exclusive.
338 			 */
339 			open_comp = (frm_probe) ?
340 					(shared->ms_flags & MDM_S_PROBEOPEN):
341 					(shared->ms_flags & MDM_S_ISOPEN);
342 			if ((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
343 				((shared->ms_state == CS_OKAY) ||
344 				(shared->ms_state == CS_RESYNC))) {
345 				if (clr_error) {
346 					shared->ms_flags &= ~MDM_S_IOERR;
347 				}
348 				*cip = ci;
349 				*smi = i;
350 				return (1);
351 			}
352 
353 			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
354 				shared->ms_flags &= ~MDM_S_IOERR;
355 			}
356 		}
357 
358 		*cip = 0;
359 	}
360 	return (0);
361 }
362 
363 /*ARGSUSED*/
364 static void
365 mirror_run_queue(void *d)
366 {
367 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
368 		md_daemon(1, &md_done_daemon);
369 }
370 /*
371  * check_comp_4_hotspares
372  *
373  * This function attempts to allocate a hotspare for this component if the
374  * component is in error. In a MN set, the function can be called in 2 modes.
375  * It can be called either when a component error has been detected or when a
376  * new hotspare has been allocated. In either case, MD_HOTSPARE_XMIT is set
377  * in flags and the request is sent to all nodes.
378  * The handler on each of the nodes then calls this function with
379  * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
380  *
381  * For non-MN sets the function simply attempts to allocate a hotspare.
382  *
383  * On entry, the following locks are held
384  *	mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
385  *	md_unit_writerlock
386  *
387  * Returns	0 if ok
388  *		1 if the unit containing the component has been cleared while
389  *		  the mdmn_ksend_message() was being executed
390  */
391 extern int
392 check_comp_4_hotspares(
393 	mm_unit_t	*un,
394 	int		smi,
395 	int		ci,
396 	uint_t		flags,
397 	mddb_recid_t	hs_id,	/* Only used by MN disksets */
398 	IOLOCK		*lockp	/* can be NULL */
399 )
400 {
401 	mm_submirror_t		*sm;
402 	mm_submirror_ic_t	*smic;
403 	md_m_shared_t		*shared;
404 	mddb_recid_t		recids[6];
405 	minor_t			mnum;
406 	intptr_t		(*hs_dev)();
407 	void			(*hs_done)();
408 	void			*hs_data;
409 	md_error_t		mde = mdnullerror;
410 	set_t			setno;
411 	md_mn_msg_allochsp_t	allochspmsg;
412 	md_mn_kresult_t		*kresult;
413 	mm_unit_t		*new_un;
414 	int			rval;
415 
416 	mnum = MD_SID(un);
417 	setno = MD_UN2SET(un);
418 	sm = &un->un_sm[smi];
419 	smic = &un->un_smic[smi];
420 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
421 		(sm->sm_dev, sm, ci);
422 
423 	if (shared->ms_state != CS_ERRED)
424 		return (0);
425 
426 	/* Don't start a new component resync if a resync is already running. */
427 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
428 		return (0);
429 
430 	if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
431 		uint_t		msgflags;
432 		md_mn_msgtype_t	msgtype;
433 
434 		/* Send allocate hotspare message to all nodes */
435 
436 		allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
437 		allochspmsg.msg_allochsp_sm = smi;
438 		allochspmsg.msg_allochsp_comp = ci;
439 		allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;
440 
441 		/*
442 		 * Before calling mdmn_ksend_message(), release locks
443 		 * Can never be in the context of an ioctl.
444 		 */
445 		md_unit_writerexit(MDI_UNIT(mnum));
446 		if (flags & MD_HOTSPARE_LINKHELD)
447 			rw_exit(&mirror_md_ops.md_link_rw.lock);
448 #ifdef DEBUG
449 		if (mirror_debug_flag)
450 		    printf("send alloc hotspare, flags=0x%x %x, %x, %x, %x\n",
451 			flags,
452 			allochspmsg.msg_allochsp_mnum,
453 			allochspmsg.msg_allochsp_sm,
454 			allochspmsg.msg_allochsp_comp,
455 			allochspmsg.msg_allochsp_hs_id);
456 #endif
457 		if (flags & MD_HOTSPARE_WMUPDATE) {
458 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE2;
459 			/*
460 			 * When coming from an update of watermarks, there
461 			 * must already be a message logged that triggered
462 			 * this action. So, no need to log this message, too.
463 			 */
464 			msgflags = MD_MSGF_NO_LOG;
465 		} else {
466 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE;
467 			msgflags = MD_MSGF_DEFAULT_FLAGS;
468 		}
469 
470 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
471 		rval = mdmn_ksend_message(setno, msgtype, msgflags,
472 		    (char *)&allochspmsg, sizeof (allochspmsg),
473 		    kresult);
474 
475 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
476 #ifdef DEBUG
477 			if (mirror_debug_flag)
478 				mdmn_ksend_show_error(rval, kresult,
479 				    "ALLOCATE HOTSPARE");
480 #endif
481 			/*
482 			 * If the message is sent ok but exitval indicates an
483 			 * error, it must be because the mirror has been cleared.
484 			 * In this case re-obtain the lock and return an error.
485 			 */
486 			if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
487 				if (flags & MD_HOTSPARE_LINKHELD) {
488 					rw_enter(&mirror_md_ops.md_link_rw.lock,
489 					    RW_READER);
490 				}
491 				kmem_free(kresult, sizeof (md_mn_kresult_t));
492 				return (1);
493 			}
494 			cmn_err(CE_PANIC,
495 			    "ksend_message failure: ALLOCATE_HOTSPARE");
496 		}
497 		kmem_free(kresult, sizeof (md_mn_kresult_t));
498 
499 		/*
500 		 * re-obtain the locks
501 		 */
502 		if (flags & MD_HOTSPARE_LINKHELD)
503 			rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
504 		new_un = md_unit_writerlock(MDI_UNIT(mnum));
505 
506 		/*
507 		 * As we had to release the locks in order to send the
508 		 * message to all nodes, we need to check to see if the
509 		 * unit has changed. If it has we release the writerlock
510 		 * and return fail.
511 		 */
512 		if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
513 			md_unit_writerexit(MDI_UNIT(mnum));
514 			return (1);
515 		}
516 	} else {
517 		if (MD_MNSET_SETNO(setno)) {
518 			/*
519 			 * If 2 or more nodes simultaneously see a
520 			 * component failure, these nodes will each
521 			 * send an ALLOCATE_HOTSPARE[2] message.
522 			 * The first message will allocate the hotspare
523 			 * and the subsequent messages should do nothing.
524 			 *
525 			 * If a slave node doesn't have a hotspare allocated
526 			 * at the time the message is initiated, then the
527 			 * passed in hs_id will be 0.  If the node
528 			 * executing this routine has a component shared
529 			 * ms_hs_id of non-zero, but the message shows a
530 			 * hs_id of 0, then just return since a hotspare
531 			 * has already been allocated for this failing
532 			 * component.  When the slave node returns from
533 			 * the ksend_message the hotspare will have
534 			 * already been allocated.
535 			 *
536 			 * If the slave node does send an hs_id of non-zero,
537 			 * and the slave node's hs_id matches this node's
538 			 * ms_hs_id, then the hotspare has errored and
539 			 * should be replaced.
540 			 *
541 			 * If the slave node sends an hs_id of non-zero and
542 			 * this node has a different shared ms_hs_id, then
543 			 * just return since this hotspare has already
544 			 * been hotspared.
545 			 */
546 			if (shared->ms_hs_id != 0) {
547 				if (hs_id == 0) {
548 #ifdef DEBUG
549 					if (mirror_debug_flag) {
550 						printf("check_comp_4_hotspares"
551 						    "(NOXMIT), short circuit "
552 						    "hs_id=0x%x, "
553 						    "ms_hs_id=0x%x\n",
554 						    hs_id, shared->ms_hs_id);
555 					}
556 #endif
557 					return (0);
558 				}
559 				if (hs_id != shared->ms_hs_id) {
560 #ifdef DEBUG
561 					if (mirror_debug_flag) {
562 						printf("check_comp_4_hotspares"
563 						    "(NOXMIT), short circuit2 "
564 						    "hs_id=0x%x, "
565 						    "ms_hs_id=0x%x\n",
566 						    hs_id, shared->ms_hs_id);
567 					}
568 #endif
569 					return (0);
570 				}
571 			}
572 		}
573 
574 		sm = &un->un_sm[smi];
575 		hs_dev = md_get_named_service(sm->sm_dev, 0,
576 		    "hotspare device", 0);
577 		if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
578 		    &hs_data) != 0)
579 			return (0);
580 
581 		/*
582 		 * set_sm_comp_state() commits the modified records.
583 		 * As we don't transmit the changes, no need to drop the lock.
584 		 */
585 		set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
586 		    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
587 
588 		(*hs_done)(sm->sm_dev, hs_data);
589 
590 		mirror_check_failfast(mnum);
591 
592 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
593 		    setno, MD_SID(un));
594 
595 		/*
596 		 * For a multi-node set we need to reset the un_rs_type,
597 		 * un_rs_resync_done and un_rs_resync_2_do fields as the
598 		 * hot-spare resync must copy all applicable data.
599 		 */
600 		if (MD_MNSET_SETNO(setno)) {
601 			un->un_rs_type = MD_RS_NONE;
602 			un->un_rs_resync_done = 0;
603 			un->un_rs_resync_2_do = 0;
604 		}
605 
606 		/*
607 		 * Must drop writer lock since mirror_resync_unit will
608 		 * open devices and must be able to grab readerlock.
609 		 * Don't need to drop the IOLOCK since any descendant routines
610 		 * calling ksend_messages will drop the IOLOCK as needed.
611 		 *
612 		 */
613 		if (lockp) {
614 			md_ioctl_writerexit(lockp);
615 		} else {
616 			md_unit_writerexit(MDI_UNIT(mnum));
617 		}
618 
619 		/* start resync */
620 		(void) mirror_resync_unit(mnum, NULL, &mde, lockp);
621 
622 		if (lockp) {
623 			new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
624 		} else {
625 			new_un = md_unit_writerlock(MDI_UNIT(mnum));
626 		}
627 	}
628 	return (0);
629 }
630 
631 /*
632  * check_unit_4_hotspares
633  *
634  * For a given mirror, allocate hotspares, if available, for any components
635  * that are in error.
636  *
637  * Returns	0 if ok
638  *		1 if check_comp_4_hotspares returns non-zero. This will only
639  *		  happen for a MN unit where the unit has been cleared while
640  *		  the allocate hotspare message is sent to all nodes.
641  */
642 static int
643 check_unit_4_hotspares(mm_unit_t *un, int flags)
644 {
645 	mm_submirror_t		*sm;
646 	mm_submirror_ic_t	*smic;
647 	int			ci;
648 	int			i;
649 	int			compcnt;
650 
651 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
652 		return (0);
653 
654 	for (i = 0; i < NMIRROR; i++) {
655 		sm = &un->un_sm[i];
656 		smic = &un->un_smic[i];
657 		if (!SMS_IS(sm, SMS_INUSE))
658 			continue;
659 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
660 		for (ci = 0; ci < compcnt; ci++) {
661 			md_m_shared_t		*shared;
662 
663 			shared = (md_m_shared_t *)
664 				(*(smic->sm_shared_by_indx))(sm->sm_dev,
665 				sm, ci);
666 			/*
667 			 * Never called from ioctl context, so pass in
668 			 * (IOLOCK *)NULL.  Pass through flags from calling
669 			 * routine, also setting XMIT flag.
670 			 */
671 			if (check_comp_4_hotspares(un, i, ci,
672 				(MD_HOTSPARE_XMIT | flags),
673 				shared->ms_hs_id, (IOLOCK *)NULL) != 0)
674 				return (1);
675 		}
676 	}
677 	return (0);
678 }
679 
680 static void
681 check_4_hotspares(daemon_request_t *drq)
682 {
683 	mdi_unit_t	*ui;
684 	mm_unit_t	*un;
685 	md_link_t	*next;
686 	int		x;
687 
688 	mutex_enter(&drq->dr_mx);	/* clear up front so can poke */
689 	drq->dr_pending = 0;		/* again in low level routine if */
690 	mutex_exit(&drq->dr_mx);	/* something found to do	*/
691 
692 	/*
693 	 * Used to have a problem here. The disksets weren't marked as being
694 	 * MNHOLD. This opened a window where we could be searching for
695 	 * hotspares and have the disk set unloaded (released) from under
696 	 * us causing a panic in stripe_component_count().
697 	 * The way to prevent that is to mark the set MNHOLD which prevents
698 	 * any diskset from being released while we are scanning the mirrors,
699 	 * submirrors and components.
700 	 */
701 
702 	for (x = 0; x < md_nsets; x++)
703 		md_holdset_enter(x);
704 
705 	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
706 	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
707 		ui = MDI_UNIT(next->ln_id);
708 
709 		un = (mm_unit_t *)md_unit_readerlock(ui);
710 
711 		/*
712 		 * Only check the unit if we are the master for this set
713 		 * For an MN set, poke_hotspares() is only effective on the
714 		 * master
715 		 */
716 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
717 		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
718 			md_unit_readerexit(ui);
719 			continue;
720 		}
721 		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
722 			md_unit_readerexit(ui);
723 			continue;
724 		}
725 		md_unit_readerexit(ui);
726 
727 		un = (mm_unit_t *)md_unit_writerlock(ui);
728 		/*
729 		 * check_unit_4_hotspares will exit 1 if the unit has been
730 		 * removed during the process of allocating the hotspare.
731 		 * This can only happen for a MN metadevice. If unit no longer
732 		 * exists, no need to release writerlock
733 		 */
734 		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
735 			md_unit_writerexit(ui);
736 		else {
737 			/*
738 			 * If check_unit_4_hotspares failed, queue another
739 			 * request and break out of this one
740 			 */
741 			(void) poke_hotspares();
742 			break;
743 		}
744 	}
745 	rw_exit(&mirror_md_ops.md_link_rw.lock);
746 
747 	for (x = 0; x < md_nsets; x++)
748 		md_holdset_exit(x);
749 }
750 
751 /*
752  * poke_hotspares
753  *
754  * If there is not a poke_hotspares request already pending, queue a request
755  * to call check_4_hotspares(). This will scan all mirrors and attempt to
756  * allocate hotspares for all components in error.
757  */
758 int
759 poke_hotspares()
760 {
761 	mutex_enter(&hotspare_request.dr_mx);
762 	if (hotspare_request.dr_pending == 0) {
763 		hotspare_request.dr_pending = 1;
764 		daemon_request(&md_mhs_daemon,
765 		    check_4_hotspares,
766 		    (daemon_queue_t *)&hotspare_request, REQ_OLD);
767 	}
768 	mutex_exit(&hotspare_request.dr_mx);
769 	return (0);
770 }
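
/*
 * Illustrative usage sketch: after marking a component CS_ERRED, callers in
 * this file (see mirror_open_all_devs() below) request hotspare allocation
 * roughly as follows, using send_poke_hotspares() for multinode sets so the
 * ksend_message is issued from a separate thread context, and
 * poke_hotspares() otherwise:
 *
 *	if (MD_MNSET_SETNO(setno))
 *		send_poke_hotspares(setno);
 *	else
 *		(void) poke_hotspares();
 */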
771 
772 static void
773 free_all_ecomps(err_comp_t *ecomp)
774 {
775 	err_comp_t	*d;
776 
777 	while (ecomp != NULL) {
778 		d = ecomp;
779 		ecomp = ecomp->ec_next;
780 		kmem_free(d, sizeof (err_comp_t));
781 	}
782 }
783 
784 /*
785  * NAME: mirror_openfail_console_info
786  *
787  * DESCRIPTION: Prints an informative message to the console when the mirror
788  *		cannot be opened.
789  *
790  * PARAMETERS: mm_unit_t	un - pointer to mirror unit structure
791  *	       int		smi - submirror index
792  *	       int		ci - component index
793  */
794 
795 void
796 mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
797 {
798 	void (*get_dev)();
799 	ms_cd_info_t cd;
800 	md_dev64_t tmpdev;
801 
802 	tmpdev = un->un_sm[smi].sm_dev;
803 	get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
804 	if (get_dev != NULL) {
805 		(void) (*get_dev)(tmpdev, smi, ci, &cd);
806 		cmn_err(CE_WARN, "md %s: open error on %s",
807 			md_shortname(MD_SID(un)),
808 			md_devname(MD_UN2SET(un), cd.cd_dev,
809 			NULL, 0));
810 	} else {
811 		cmn_err(CE_WARN, "md %s: open error",
812 			md_shortname(MD_SID(un)));
813 	}
814 }
815 
816 static int
817 mirror_close_all_devs(mm_unit_t *un, int md_cflags)
818 {
819 	int i;
820 	md_dev64_t dev;
821 
822 	for (i = 0; i < NMIRROR; i++) {
823 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
824 			continue;
825 		dev = un->un_sm[i].sm_dev;
826 		md_layered_close(dev, md_cflags);
827 	}
828 	return (0);
829 }
830 
831 /*
832  * Keep track of drivers that don't support failfast.  We use this so that
833  * we only log one diagnostic message for each of these drivers, no matter
834  * how many times we run the mirror_check_failfast function.
835  * Return 1 if this is a new driver that does not support failfast,
836  * return 0 if we have already seen this non-failfast driver.
837  */
838 static int
839 new_non_ff_driver(const char *s)
840 {
841 	mutex_enter(&non_ff_drv_mutex);
842 	if (non_ff_drivers == NULL) {
843 	    non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
844 		KM_NOSLEEP);
845 	    if (non_ff_drivers == NULL) {
846 		mutex_exit(&non_ff_drv_mutex);
847 		return (1);
848 	    }
849 
850 	    non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
851 	    if (non_ff_drivers[0] == NULL) {
852 		kmem_free(non_ff_drivers, 2 * sizeof (char *));
853 		non_ff_drivers = NULL;
854 		mutex_exit(&non_ff_drv_mutex);
855 		return (1);
856 	    }
857 
858 	    (void) strcpy(non_ff_drivers[0], s);
859 	    non_ff_drivers[1] = NULL;
860 
861 	} else {
862 	    int i;
863 	    char **tnames;
864 	    char **tmp;
865 
866 	    for (i = 0; non_ff_drivers[i] != NULL; i++) {
867 		if (strcmp(s, non_ff_drivers[i]) == 0) {
868 		    mutex_exit(&non_ff_drv_mutex);
869 		    return (0);
870 		}
871 	    }
872 
873 	    /* allow for new element and null */
874 	    i += 2;
875 	    tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
876 	    if (tnames == NULL) {
877 		mutex_exit(&non_ff_drv_mutex);
878 		return (1);
879 	    }
880 
881 	    for (i = 0; non_ff_drivers[i] != NULL; i++)
882 		tnames[i] = non_ff_drivers[i];
883 
884 	    tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
885 	    if (tnames[i] == NULL) {
886 		/* adjust i so that it is the right count to free */
887 		kmem_free(tnames, (i + 2) * sizeof (char *));
888 		mutex_exit(&non_ff_drv_mutex);
889 		return (1);
890 	    }
891 
892 	    (void) strcpy(tnames[i++], s);
893 	    tnames[i] = NULL;
894 
895 	    tmp = non_ff_drivers;
896 	    non_ff_drivers = tnames;
897 	    /* i now represents the count we previously alloced */
898 	    kmem_free(tmp, i * sizeof (char *));
899 	}
900 	mutex_exit(&non_ff_drv_mutex);
901 
902 	return (1);
903 }
904 
905 /*
906  * Check for the "ddi-failfast-supported" devtree property on each submirror
907  * component to indicate if we should do I/O to that submirror with the
908  * B_FAILFAST flag set or not.  This check is made at various state transitions
909  * in the mirror code (e.g. open, enable, hotspare, etc.).  Sometimes we
910  * only need to check one drive (e.g. hotspare) but since the check is
911  * fast and infrequent and sometimes needs to be done on all components we
912  * just check all components on each call.
913  */
914 void
915 mirror_check_failfast(minor_t mnum)
916 {
917 	int		i;
918 	mm_unit_t	*un;
919 
920 	if (md_ff_disable)
921 	    return;
922 
923 	un = MD_UNIT(mnum);
924 
925 	for (i = 0; i < NMIRROR; i++) {
926 	    int			ci;
927 	    int			cnt;
928 	    int			ff = 1;
929 	    mm_submirror_t	*sm;
930 	    mm_submirror_ic_t	*smic;
931 	    void		(*get_dev)();
932 
933 	    if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
934 		continue;
935 
936 	    sm = &un->un_sm[i];
937 	    smic = &un->un_smic[i];
938 
939 	    get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
940 		"get device", 0);
941 
942 	    cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
943 	    for (ci = 0; ci < cnt; ci++) {
944 		int		found = 0;
945 		dev_t		ci_dev;
946 		major_t		major;
947 		dev_info_t	*devi;
948 		ms_cd_info_t	cd;
949 
950 		/* this already returns the hs dev if the device is spared */
951 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
952 
953 		ci_dev = md_dev64_to_dev(cd.cd_dev);
954 		major = getmajor(ci_dev);
955 
956 		if (major == md_major) {
957 		    /* this component must be a soft partition; get real dev */
958 		    minor_t	dev_mnum;
959 		    mdi_unit_t	*ui;
960 		    mp_unit_t	*un;
961 		    set_t	setno;
962 		    side_t	side;
963 		    md_dev64_t	tmpdev;
964 
965 		    ui = MDI_UNIT(getminor(ci_dev));
966 
967 		    /* grab necessary lock */
968 		    un = (mp_unit_t *)md_unit_readerlock(ui);
969 
970 		    dev_mnum = MD_SID(un);
971 		    setno = MD_MIN2SET(dev_mnum);
972 		    side = mddb_getsidenum(setno);
973 
974 		    tmpdev = un->un_dev;
975 
976 		    /* Get dev by device id */
977 		    if (md_devid_found(setno, side, un->un_key) == 1) {
978 			tmpdev = md_resolve_bydevid(dev_mnum, tmpdev,
979 				un->un_key);
980 		    }
981 
982 		    md_unit_readerexit(ui);
983 
984 		    ci_dev = md_dev64_to_dev(tmpdev);
985 		    major = getmajor(ci_dev);
986 		}
987 
988 		if (ci_dev != NODEV32 &&
989 		    (devi = e_ddi_hold_devi_by_dev(ci_dev, 0)) != NULL) {
990 		    ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
991 		    int			propvalue = 0;
992 		    int			proplength = sizeof (int);
993 		    int			error;
994 		    struct cb_ops	*cb;
995 
996 		    if ((cb = devopsp[major]->devo_cb_ops) != NULL) {
997 			error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi, prop_op,
998 			    DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
999 			    "ddi-failfast-supported",
1000 			    (caddr_t)&propvalue, &proplength);
1001 
1002 			if (error == DDI_PROP_SUCCESS)
1003 			    found = 1;
1004 		    }
1005 
1006 		    if (!found && new_non_ff_driver(ddi_driver_name(devi)))
1007 			cmn_err(CE_NOTE, "!md: B_FAILFAST I/O disabled on %s",
1008 			    ddi_driver_name(devi));
1009 
1010 		    ddi_release_devi(devi);
1011 		}
1012 
1013 		/* All components must support failfast in the submirror. */
1014 		if (!found) {
1015 		    ff = 0;
1016 		    break;
1017 		}
1018 	    }
1019 
1020 	    if (ff) {
1021 		sm->sm_flags |= MD_SM_FAILFAST;
1022 	    } else {
1023 		sm->sm_flags &= ~MD_SM_FAILFAST;
1024 	    }
1025 	}
1026 }
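
/*
 * Example of the effect (for illustration only): if every component of a
 * submirror exports the "ddi-failfast-supported" property, the submirror
 * gains MD_SM_FAILFAST and reads built by select_read_unit() or
 * fast_select_read_unit() below are issued with B_FAILFAST, i.e.
 *
 *	if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST)
 *		bp->b_flags |= B_FAILFAST;
 */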
1027 
1028 /*
1029  * Return true if the submirror is unavailable.
1030  * If any of the submirror components are opened then the submirror cannot
1031  * be unavailable (MD_INACCESSIBLE).
1032  * If any of the components are already in the errored state, then the submirror
1033  * cannot be unavailable (MD_INACCESSIBLE).
1034  */
1035 static bool_t
1036 submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
1037 {
1038 	mm_submirror_t		*sm;
1039 	mm_submirror_ic_t	*smic;
1040 	md_m_shared_t		*shared;
1041 	int			ci;
1042 	int			compcnt;
1043 
1044 	sm = &un->un_sm[smi];
1045 	smic = &un->un_smic[smi];
1046 
1047 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
1048 	for (ci = 0; ci < compcnt; ci++) {
1049 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
1050 		    (sm->sm_dev, sm, ci);
1051 		if (from_probe) {
1052 			if (shared->ms_flags & MDM_S_PROBEOPEN)
1053 				return (B_FALSE);
1054 		} else {
1055 			if (shared->ms_flags & MDM_S_ISOPEN)
1056 				return (B_FALSE);
1057 		}
1058 		if (shared->ms_state == CS_ERRED ||
1059 		    shared->ms_state == CS_LAST_ERRED)
1060 			return (B_FALSE);
1061 	}
1062 
1063 	return (B_TRUE);
1064 }
1065 
1066 static int
1067 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
1068 {
1069 	int		i;
1070 	mm_unit_t	*un;
1071 	mdi_unit_t	*ui;
1072 	int		err;
1073 	int		smi;
1074 	int		ci;
1075 	err_comp_t	*c;
1076 	err_comp_t	*ecomps = NULL;
1077 	int		smmask = 0;
1078 	set_t		setno;
1079 	int		sm_cnt;
1080 	int		sm_unavail_cnt;
1081 
1082 	mirror_check_failfast(mnum);
1083 
1084 	un = MD_UNIT(mnum);
1085 	ui = MDI_UNIT(mnum);
1086 	setno = MD_UN2SET(un);
1087 
1088 	for (i = 0; i < NMIRROR; i++) {
1089 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1090 
1091 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1092 			continue;
1093 		if (md_layered_open(mnum, &tmpdev, md_oflags))
1094 			smmask |= SMI2BIT(i);
1095 		un->un_sm[i].sm_dev = tmpdev;
1096 	}
1097 
1098 	/*
1099 	 * If smmask is clear, all submirrors are accessible. Clear the
1100 	 * MD_INACCESSIBLE bit in this case.  This bit is also cleared for the
1101 	 * mirror device.   If smmask is set, we have to determine which of the
1102 	 * submirrors are in error. If no submirror is accessible we mark the
1103 	 * whole mirror as MD_INACCESSIBLE.
1104 	 */
1105 	if (smmask == 0) {
1106 		if (lockp) {
1107 			md_ioctl_readerexit(lockp);
1108 			(void) md_ioctl_writerlock(lockp, ui);
1109 		} else {
1110 			md_unit_readerexit(ui);
1111 			(void) md_unit_writerlock(ui);
1112 		}
1113 		ui->ui_tstate &= ~MD_INACCESSIBLE;
1114 		if (lockp) {
1115 			md_ioctl_writerexit(lockp);
1116 			(void) md_ioctl_readerlock(lockp, ui);
1117 		} else {
1118 			md_unit_writerexit(ui);
1119 			(void) md_unit_readerlock(ui);
1120 		}
1121 
1122 		for (i = 0; i < NMIRROR; i++) {
1123 			md_dev64_t	tmpdev;
1124 			mdi_unit_t	*sm_ui;
1125 
1126 			if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1127 				continue;
1128 
1129 			tmpdev = un->un_sm[i].sm_dev;
1130 			sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1131 			(void) md_unit_writerlock(sm_ui);
1132 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1133 			md_unit_writerexit(sm_ui);
1134 		}
1135 
1136 		return (0);
1137 	}
1138 
1139 	for (i = 0; i < NMIRROR; i++) {
1140 		md_dev64_t tmpdev;
1141 
1142 		if (!(smmask & SMI2BIT(i)))
1143 			continue;
1144 
1145 		tmpdev = un->un_sm[i].sm_dev;
1146 		err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
1147 		un->un_sm[i].sm_dev = tmpdev;
1148 		ASSERT(err == 0);
1149 	}
1150 
1151 	if (lockp) {
1152 		md_ioctl_readerexit(lockp);
1153 		un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
1154 	} else {
1155 		md_unit_readerexit(ui);
1156 		un = (mm_unit_t *)md_unit_writerlock(ui);
1157 	}
1158 
1159 	/*
1160 	 * We want to make sure the unavailable flag is not masking a real
1161 	 * error on the submirror.
1162 	 * For each submirror,
1163 	 *    if all of the submirror components couldn't be opened and there
1164 	 *    are no errors on the submirror, then set the unavailable flag
1165 	 *    otherwise, clear unavailable.
1166 	 */
1167 	sm_cnt = 0;
1168 	sm_unavail_cnt = 0;
1169 	for (i = 0; i < NMIRROR; i++) {
1170 		md_dev64_t	tmpdev;
1171 		mdi_unit_t	*sm_ui;
1172 
1173 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1174 			continue;
1175 
1176 		sm_cnt++;
1177 		tmpdev = un->un_sm[i].sm_dev;
1178 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1179 
1180 		(void) md_unit_writerlock(sm_ui);
1181 		if (submirror_unavailable(un, i, 0)) {
1182 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
1183 			sm_unavail_cnt++;
1184 		} else {
1185 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1186 		}
1187 		md_unit_writerexit(sm_ui);
1188 	}
1189 
1190 	/*
1191 	 * If all of the submirrors are unavailable, the mirror is also
1192 	 * unavailable.
1193 	 */
1194 	if (sm_cnt == sm_unavail_cnt) {
1195 		ui->ui_tstate |= MD_INACCESSIBLE;
1196 	} else {
1197 		ui->ui_tstate &= ~MD_INACCESSIBLE;
1198 	}
1199 
1200 	smi = 0;
1201 	ci = 0;
1202 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
1203 		if (mirror_other_sources(un, smi, ci, 1) == 1) {
1204 
1205 			free_all_ecomps(ecomps);
1206 			(void) mirror_close_all_devs(un, md_oflags);
1207 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
1208 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1209 			mirror_openfail_console_info(un, smi, ci);
1210 			if (lockp) {
1211 				md_ioctl_writerexit(lockp);
1212 				(void) md_ioctl_readerlock(lockp, ui);
1213 			} else {
1214 				md_unit_writerexit(ui);
1215 				(void) md_unit_readerlock(ui);
1216 			}
1217 			return (ENXIO);
1218 		}
1219 
1220 		/* track all component states that need changing */
1221 		c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
1222 		c->ec_next = ecomps;
1223 		c->ec_smi = smi;
1224 		c->ec_ci = ci;
1225 		ecomps = c;
1226 		ci++;
1227 	}
1228 
1229 	/* Make all state changes and commit them */
1230 	for (c = ecomps; c != NULL; c = c->ec_next) {
1231 		/*
1232 		 * If lockp is set, then entering kernel through ioctl.
1233 		 * For a MN set, the only ioctl path is via a commd message
1234 		 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already
1235 		 * being sent to each node.
1236 		 * In this case, set NO_XMIT so that set_sm_comp_state
1237 		 * won't attempt to send a message while processing a message.
1238 		 *
1239 		 * In !MN sets, the xmit flag is ignored, so it doesn't matter
1240 		 * which flag is passed.
1241 		 */
1242 		if (lockp) {
1243 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1244 			    MD_STATE_NO_XMIT, lockp);
1245 		} else {
1246 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1247 			    (MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
1248 		}
1249 		/*
1250 		 * For a MN set, the NOTIFY is done when the state change is
1251 		 * processed on each node
1252 		 */
1253 		if (!MD_MNSET_SETNO(setno)) {
1254 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
1255 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1256 		}
1257 	}
1258 
1259 	if (lockp) {
1260 		md_ioctl_writerexit(lockp);
1261 		(void) md_ioctl_readerlock(lockp, ui);
1262 	} else {
1263 		md_unit_writerexit(ui);
1264 		(void) md_unit_readerlock(ui);
1265 	}
1266 
1267 	free_all_ecomps(ecomps);
1268 
1269 	/* allocate hotspares for all errored components */
1270 	if (MD_MNSET_SETNO(setno)) {
1271 		/*
1272 		 * If we're called from an ioctl (lockp set) then we cannot
1273 		 * directly call send_poke_hotspares as this will block until
1274 		 * the message gets despatched to all nodes. If the cluster is
1275 		 * going through a reconfig cycle then the message will block
1276 		 * until the cycle is complete, and as we originate from a
1277 		 * service call from commd we will livelock.
1278 		 */
1279 		if (lockp == NULL) {
1280 			md_unit_readerexit(ui);
1281 			send_poke_hotspares(setno);
1282 			(void) md_unit_readerlock(ui);
1283 		}
1284 	} else {
1285 		(void) poke_hotspares();
1286 	}
1287 	return (0);
1288 }
1289 
1290 void
1291 mirror_overlap_chain_remove(md_mps_t *ps)
1292 {
1293 	mm_unit_t	*un;
1294 
1295 	if (panicstr)
1296 		return;
1297 
1298 	ASSERT(ps->ps_flags & MD_MPS_ON_OVERLAP);
1299 
1300 	un = ps->ps_un;
1301 
1302 	mutex_enter(&un->un_ovrlap_chn_mx);
1303 	if (ps->ps_ovrlap_prev != &un->un_ovrlap_chn)
1304 		ps->ps_ovrlap_prev->ps_ovrlap_next = ps->ps_ovrlap_next;
1305 	else
1306 		un->un_ovrlap_chn.ps_ovrlap_next = ps->ps_ovrlap_next;
1307 	if (ps->ps_ovrlap_next != &un->un_ovrlap_chn)
1308 		ps->ps_ovrlap_next->ps_ovrlap_prev = ps->ps_ovrlap_prev;
1309 	else
1310 		un->un_ovrlap_chn.ps_ovrlap_prev = ps->ps_ovrlap_prev;
1311 	/* Handle empty overlap chain */
1312 	if (un->un_ovrlap_chn.ps_ovrlap_prev == &un->un_ovrlap_chn) {
1313 		un->un_ovrlap_chn.ps_ovrlap_prev =
1314 		    un->un_ovrlap_chn.ps_ovrlap_next = NULL;
1315 	}
1316 	if (un->un_ovrlap_chn_flg) {
1317 		un->un_ovrlap_chn_flg = 0;
1318 		cv_broadcast(&un->un_ovrlap_chn_cv);
1319 	}
1320 	ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
1321 	mutex_exit(&un->un_ovrlap_chn_mx);
1322 }
1323 
1324 
1325 /*
1326  * wait_for_overlaps:
1327  * -----------------
1328  * Check that the given i/o request does not cause an overlap with already
1329  * pending i/o. If it does, block until the overlapping i/o completes.
1330  *
1331  * Note: the overlap chain is held as a monotonically increasing doubly-linked
1332  * list with the sentinel contained in un->un_ovrlap_chn. We avoid a linear
1333  * search of the list by the following logic:
1334  *	ps->ps_lastblk < un_ovrlap_chn.ps_ovrlap_next->ps_firstblk => No overlap
1335  *	ps->ps_firstblk > un_ovrlap_chn.ps_ovrlap_prev->ps_lastblk => No overlap
1336  * otherwise
1337  *	scan un_ovrlap_chn.ps_ovrlap_next for location where ps->ps_firstblk
1338  *	> chain->ps_lastblk. This is the insertion point. As the list is
1339  *	guaranteed to be ordered there is no need to continue scanning.
1340  *
1341  * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
1342  *	structure to be already on the overlap chain and MD_OVERLAP_NO_REPEAT
1343  *	if it must not already be on the chain
1344  */
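/*
 * Worked example (hypothetical block ranges, for illustration only): with
 * regions [0-99] and [300-399] already on the chain, a request for
 * [100-199] passes the ordering checks and is inserted between them without
 * blocking.  A request for [250-349], however, overlaps [300-399], so it
 * sets un_ovrlap_chn_flg and cv_wait()s on un_ovrlap_chn_cv until
 * mirror_overlap_chain_remove() broadcasts that the overlapping i/o has
 * completed.
 */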
1345 static void
1346 wait_for_overlaps(md_mps_t *ps, int flags)
1347 {
1348 	mm_unit_t	*un;
1349 	md_mps_t	*ps1, **head, **tail;
1350 
1351 	if (panicstr)
1352 		return;
1353 
1354 
1355 	un = ps->ps_un;
1356 
1357 	mutex_enter(&un->un_ovrlap_chn_mx);
1358 	if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
1359 	    (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
1360 		mutex_exit(&un->un_ovrlap_chn_mx);
1361 		return;
1362 	}
1363 
1364 	ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1365 	head = &(un->un_ovrlap_chn.ps_ovrlap_next);
1366 	tail = &(un->un_ovrlap_chn.ps_ovrlap_prev);
1367 	ps1 = *head;
1368 	/*
1369 	 * Check for simple limit cases:
1370 	 *	*head == NULL
1371 	 *		insert ps at head of list
1372 	 *	lastblk < head->firstblk
1373 	 *		insert at head of list
1374 	 *	firstblk > tail->lastblk
1375 	 *		insert at tail of list
1376 	 */
1377 	if (ps1 == NULL) {
1378 		/* Insert at head */
1379 		ps->ps_ovrlap_next = &un->un_ovrlap_chn;
1380 		ps->ps_ovrlap_prev = &un->un_ovrlap_chn;
1381 		*head = ps;
1382 		*tail = ps;
1383 		ps->ps_flags |= MD_MPS_ON_OVERLAP;
1384 		mutex_exit(&un->un_ovrlap_chn_mx);
1385 		return;
1386 	} else if (ps->ps_lastblk < (*head)->ps_firstblk) {
1387 		/* Insert at head */
1388 		ps->ps_ovrlap_next = (*head);
1389 		ps->ps_ovrlap_prev = &un->un_ovrlap_chn;
1390 		(*head)->ps_ovrlap_prev = ps;
1391 		*head = ps;
1392 		ps->ps_flags |= MD_MPS_ON_OVERLAP;
1393 		mutex_exit(&un->un_ovrlap_chn_mx);
1394 		return;
1395 	} else if (ps->ps_firstblk > (*tail)->ps_lastblk) {
1396 		/* Insert at tail */
1397 		ps->ps_ovrlap_prev = (*tail);
1398 		ps->ps_ovrlap_next = &un->un_ovrlap_chn;
1399 		(*tail)->ps_ovrlap_next = ps;
1400 		*tail = ps;
1401 		ps->ps_flags |= MD_MPS_ON_OVERLAP;
1402 		mutex_exit(&un->un_ovrlap_chn_mx);
1403 		return;
1404 	}
1405 	/* Now we have to scan the list for possible overlaps */
1406 	while (ps1 != NULL) {
1407 		/*
1408 		 * If this region has been put on the chain by another thread
1409 		 * just exit
1410 		 */
1411 		if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
1412 		    (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
1413 			mutex_exit(&un->un_ovrlap_chn_mx);
1414 			return;
1415 
1416 		}
1417 		for (ps1 = *head; ps1 && (ps1 != &un->un_ovrlap_chn);
1418 		    ps1 = ps1->ps_ovrlap_next) {
1419 			if (ps->ps_firstblk > (*tail)->ps_lastblk) {
1420 				/* Insert at tail */
1421 				ps->ps_ovrlap_prev = (*tail);
1422 				ps->ps_ovrlap_next = &un->un_ovrlap_chn;
1423 				(*tail)->ps_ovrlap_next = ps;
1424 				*tail = ps;
1425 				ps->ps_flags |= MD_MPS_ON_OVERLAP;
1426 				mutex_exit(&un->un_ovrlap_chn_mx);
1427 				return;
1428 			}
1429 			if (ps->ps_firstblk > ps1->ps_lastblk)
1430 				continue;
1431 			if (ps->ps_lastblk < ps1->ps_firstblk) {
1432 				/* Insert into list at current 'ps1' position */
1433 				ps->ps_ovrlap_next = ps1;
1434 				ps->ps_ovrlap_prev = ps1->ps_ovrlap_prev;
1435 				ps1->ps_ovrlap_prev->ps_ovrlap_next = ps;
1436 				ps1->ps_ovrlap_prev = ps;
1437 				ps->ps_flags |= MD_MPS_ON_OVERLAP;
1438 				mutex_exit(&un->un_ovrlap_chn_mx);
1439 				return;
1440 			}
1441 			break;
1442 		}
1443 		if (ps1 != NULL) {
1444 			un->un_ovrlap_chn_flg = 1;
1445 			cv_wait(&un->un_ovrlap_chn_cv, &un->un_ovrlap_chn_mx);
1446 			/*
1447 			 * Now ps1 refers to the old insertion point and we
1448 			 * have to check the whole chain to see if we're still
1449 			 * overlapping any other i/o.
1450 			 */
1451 		}
1452 	}
1453 
1454 	/*
1455 	 * Only get here if we had one overlapping i/o on the list and that
1456 	 * has now completed. In this case the list is empty so we insert <ps>
1457 	 * at the head of the chain.
1458 	 */
1459 	ASSERT(*head == NULL);
1460 	*tail = *head = ps;
1461 	ps->ps_ovrlap_next = ps->ps_ovrlap_prev = &un->un_ovrlap_chn;
1462 	ps->ps_flags |= MD_MPS_ON_OVERLAP;
1463 	mutex_exit(&un->un_ovrlap_chn_mx);
1464 }
1465 
1466 /*
1467  * This function is called from mirror_done to check whether any pages have
1468  * been modified while a mirrored write was in progress.  Returns 0 if
1469  * all pages associated with bp are clean, 1 otherwise.
1470  */
1471 static int
1472 any_pages_dirty(struct buf *bp)
1473 {
1474 	int	rval;
1475 
1476 	rval = biomodified(bp);
1477 	if (rval == -1)
1478 		rval = 0;
1479 
1480 	return (rval);
1481 }
1482 
1483 #define	MAX_EXTRAS 10
1484 
1485 void
1486 mirror_commit(
1487 	mm_unit_t	*un,
1488 	int		smmask,
1489 	mddb_recid_t	*extras
1490 )
1491 {
1492 	mm_submirror_t		*sm;
1493 	md_unit_t		*su;
1494 	int			i;
1495 
1496 	/* 2=mirror,null id */
1497 	mddb_recid_t		recids[NMIRROR+2+MAX_EXTRAS];
1498 
1499 	int			ri = 0;
1500 
1501 	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
1502 		return;
1503 
1504 	/* Add two, this includes the mirror unit and the null recid */
1505 	if (extras != NULL) {
1506 		int	nrecids = 0;
1507 		while (extras[nrecids] != 0) {
1508 			nrecids++;
1509 		}
1510 		ASSERT(nrecids <= MAX_EXTRAS);
1511 	}
1512 
1513 	if (un != NULL)
1514 		recids[ri++] = un->c.un_record_id;
1515 	for (i = 0;  i < NMIRROR; i++) {
1516 		if (!(smmask & SMI2BIT(i)))
1517 			continue;
1518 		sm = &un->un_sm[i];
1519 		if (!SMS_IS(sm, SMS_INUSE))
1520 			continue;
1521 		if (md_getmajor(sm->sm_dev) != md_major)
1522 			continue;
1523 		su =  MD_UNIT(md_getminor(sm->sm_dev));
1524 		recids[ri++] = su->c.un_record_id;
1525 	}
1526 
1527 	if (extras != NULL)
1528 		while (*extras != 0) {
1529 			recids[ri++] = *extras;
1530 			extras++;
1531 		}
1532 
1533 	if (ri == 0)
1534 		return;
1535 	recids[ri] = 0;
1536 
1537 	/*
1538 	 * Ok to hold ioctl lock across record commit to mddb as
1539 	 * long as the record(s) being committed aren't resync records.
1540 	 */
1541 	mddb_commitrecs_wrapper(recids);
1542 }
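
/*
 * Illustrative call (see reset_mirror() below for a real caller): smmask is
 * a bitmap of submirrors built with SMI2BIT(), and extras, when supplied,
 * must be a 0-terminated array of at most MAX_EXTRAS recids, e.g.
 *
 *	mddb_recid_t	extras[2] = { some_recid, 0 };
 *
 *	mirror_commit(un, SMI2BIT(0) | SMI2BIT(1), extras);
 *
 * where some_recid stands in for a real database record id.
 */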
1543 
1544 
1545 /*
1546  * This routine builds a bitmap (writable_bm) with a bit set for each
1547  * writable submirror of the metamirror, and counts the writable
1548  * submirrors in nunits.
1549  *
1550  * The bitmap and count are stored in the parent save structure
1551  * (ps_writable_sm and ps_active_cnt), and ps_current_sm is reset to zero.
1552  */
1553 
1554 static void
1555 select_write_units(struct mm_unit *un, md_mps_t *ps)
1556 {
1557 
1558 	int		i;
1559 	unsigned	writable_bm = 0;
1560 	unsigned	nunits = 0;
1561 
1562 	for (i = 0; i < NMIRROR; i++) {
1563 		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
1564 			/* set bit of all writable units */
1565 			writable_bm |= SMI2BIT(i);
1566 			nunits++;
1567 		}
1568 	}
1569 	ps->ps_writable_sm = writable_bm;
1570 	ps->ps_active_cnt = nunits;
1571 	ps->ps_current_sm = 0;
1572 }
1573 
1574 static
1575 unsigned
1576 select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
1577 {
1578 
1579 	int		i;
1580 	unsigned	writable_bm = 0;
1581 	unsigned	nunits = 0;
1582 
1583 	for (i = 0; i < NMIRROR; i++) {
1584 		if (SUBMIRROR_IS_WRITEABLE(un, i) &&
1585 		    un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
1586 			writable_bm |= SMI2BIT(i);
1587 			nunits++;
1588 		}
1589 	}
1590 	if ((writable_bm & ps->ps_allfrom_sm) != 0) {
1591 		writable_bm &= ~ps->ps_allfrom_sm;
1592 		nunits--;
1593 	}
1594 	ps->ps_writable_sm = writable_bm;
1595 	ps->ps_active_cnt = nunits;
1596 	ps->ps_current_sm = 0;
1597 	return (nunits);
1598 }
1599 
1600 static md_dev64_t
1601 select_read_unit(
1602 	mm_unit_t	*un,
1603 	diskaddr_t	blkno,
1604 	u_longlong_t	reqcount,
1605 	u_longlong_t	*cando,
1606 	int		must_be_opened,
1607 	md_m_shared_t	**shared,
1608 	md_mcs_t	*cs)
1609 {
1610 	int			i;
1611 	md_m_shared_t		*s;
1612 	uint_t			lasterrcnt = 0;
1613 	md_dev64_t		dev = 0;
1614 	u_longlong_t		cnt;
1615 	u_longlong_t		mincnt;
1616 	mm_submirror_t		*sm;
1617 	mm_submirror_ic_t	*smic;
1618 	mdi_unit_t		*ui;
1619 
1620 	mincnt = reqcount;
1621 	for (i = 0; i < NMIRROR; i++) {
1622 		if (!SUBMIRROR_IS_READABLE(un, i))
1623 			continue;
1624 		sm = &un->un_sm[i];
1625 		smic = &un->un_smic[i];
1626 		cnt = reqcount;
1627 
1628 		/*
1629 		 * If the current submirror is marked as inaccessible, do not
1630 		 * try to access it.
1631 		 */
1632 		ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
1633 		(void) md_unit_readerlock(ui);
1634 		if (ui->ui_tstate & MD_INACCESSIBLE) {
1635 			md_unit_readerexit(ui);
1636 			continue;
1637 		}
1638 		md_unit_readerexit(ui);
1639 
1640 		s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
1641 		    (sm->sm_dev, sm, blkno, &cnt);
1642 
1643 		if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
1644 			continue;
1645 		if (s->ms_state == CS_OKAY) {
1646 			*cando = cnt;
1647 			if (shared != NULL)
1648 				*shared = s;
1649 
1650 			if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
1651 			    cs != NULL) {
1652 				cs->cs_buf.b_flags |= B_FAILFAST;
1653 			}
1654 
1655 			return (un->un_sm[i].sm_dev);
1656 		}
1657 		if (s->ms_state != CS_LAST_ERRED)
1658 			continue;
1659 
1660 		/* don't use B_FAILFAST since we're Last Erred */
1661 
1662 		if (mincnt > cnt)
1663 			mincnt = cnt;
1664 		if (s->ms_lasterrcnt > lasterrcnt) {
1665 			lasterrcnt = s->ms_lasterrcnt;
1666 			if (shared != NULL)
1667 				*shared = s;
1668 			dev = un->un_sm[i].sm_dev;
1669 		}
1670 	}
1671 	*cando = mincnt;
1672 	return (dev);
1673 }
1674 
1675 /*
1676  * Given a 32-bit bitmap, this routine will return the bit number
1677  * of the nth bit set.	The nth bit set is passed via the index integer.
1678  *
1679  * This routine is used to run through the writable submirror bitmap
1680  * and start all of the writes.  The value returned is the
1681  * index of the appropriate submirror structure in the un_sm
1682  * array of the metamirror.
1683  */
1684 static int
1685 md_find_nth_unit(uint_t mask, int index)
1686 {
1687 	int	bit, nfound;
1688 
1689 	for (bit = -1, nfound = -1; nfound != index; bit++) {
1690 		ASSERT(mask != 0);
1691 		nfound += (mask & 1);
1692 		mask >>= 1;
1693 	}
1694 	return (bit);
1695 }
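
/*
 * Worked example (illustrative): with mask == 0xD (bits 0, 2 and 3 set),
 * md_find_nth_unit(mask, 0) returns 0, md_find_nth_unit(mask, 1) returns 2
 * and md_find_nth_unit(mask, 2) returns 3.  Callers must pass an index
 * smaller than the number of bits set, as the ASSERT above implies.
 */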
1696 
1697 static int
1698 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
1699 {
1700 	mm_unit_t	*un;
1701 	buf_t		*bp;
1702 	int		i;
1703 	unsigned	nunits = 0;
1704 	int		iunit;
1705 	uint_t		running_bm = 0;
1706 	uint_t		sm_index;
1707 
1708 	bp = &cs->cs_buf;
1709 	un = ps->ps_un;
1710 
1711 	for (i = 0; i < NMIRROR; i++) {
1712 		if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
1713 			continue;
1714 		running_bm |= SMI2BIT(i);
1715 		nunits++;
1716 	}
1717 	if (nunits == 0)
1718 		return (1);
1719 
1720 	/*
1721 	 * For directed mirror read (DMR) we only use the specified side and
1722 	 * do not compute the source of the read.
1723 	 */
1724 	if (ps->ps_flags & MD_MPS_DMR) {
1725 		sm_index = un->un_dmr_last_read;
1726 	} else {
1727 		/* Normal (non-DMR) operation */
1728 		switch (un->un_read_option) {
1729 		case RD_GEOMETRY:
1730 			iunit = (int)(bp->b_lblkno /
1731 			    howmany(un->c.un_total_blocks, nunits));
1732 			sm_index = md_find_nth_unit(running_bm, iunit);
1733 			break;
1734 		case RD_FIRST:
1735 			sm_index = md_find_nth_unit(running_bm, 0);
1736 			break;
1737 		case RD_LOAD_BAL:
1738 			/* intentionally fall through to the default case */
1739 		default:
1740 			un->un_last_read = (un->un_last_read + 1) % nunits;
1741 			sm_index = md_find_nth_unit(running_bm,
1742 			    un->un_last_read);
1743 			break;
1744 		}
1745 	}
1746 	bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
1747 	ps->ps_allfrom_sm = SMI2BIT(sm_index);
1748 
1749 	if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
1750 	    bp->b_flags |= B_FAILFAST;
1751 	}
1752 
1753 	return (0);
1754 }
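
/*
 * Illustrative example of the RD_GEOMETRY policy above (hypothetical
 * numbers): with un->c.un_total_blocks == 1000 and two running submirrors,
 * howmany(1000, 2) == 500, so reads of blocks 0-499 are directed to the
 * first running submirror and reads of blocks 500-999 to the second.
 */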
1755 
1756 static
1757 int
1758 mirror_are_submirrors_available(mm_unit_t *un)
1759 {
1760 	int i;
1761 	for (i = 0; i < NMIRROR; i++) {
1762 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1763 
1764 		if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
1765 		    md_getmajor(tmpdev) != md_major)
1766 			continue;
1767 
1768 		if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
1769 		    (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
1770 			return (0);
1771 
1772 		if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
1773 			return (0);
1774 	}
1775 	return (1);
1776 }
1777 
1778 void
1779 build_submirror(mm_unit_t *un, int i, int snarfing)
1780 {
1781 	struct mm_submirror	*sm;
1782 	struct mm_submirror_ic	*smic;
1783 	md_unit_t		*su;
1784 	set_t			setno;
1785 
1786 	sm = &un->un_sm[i];
1787 	smic = &un->un_smic[i];
1788 
1789 	sm->sm_flags = 0; /* sometime we may need to do more here */
1790 
1791 	setno = MD_UN2SET(un);
1792 
1793 	if (!SMS_IS(sm, SMS_INUSE))
1794 		return;
1795 	if (snarfing) {
1796 		sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
1797 						sm->sm_key, MD_NOTRUST_DEVT);
1798 	} else {
1799 		if (md_getmajor(sm->sm_dev) == md_major) {
1800 			su = MD_UNIT(md_getminor(sm->sm_dev));
1801 			un->c.un_flag |= (su->c.un_flag & MD_LABELED);
1802 			/* submirror can no longer be soft partitioned */
1803 			MD_CAPAB(su) &= (~MD_CAN_SP);
1804 		}
1805 	}
1806 	smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
1807 	    0, "shared by blk", 0);
1808 	smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
1809 	    0, "shared by indx", 0);
1810 	smic->sm_get_component_count =
1811 	    (int (*)())md_get_named_service(sm->sm_dev, 0,
1812 		    "get component count", 0);
1813 	smic->sm_get_bcss =
1814 	    (int (*)())md_get_named_service(sm->sm_dev, 0,
1815 		    "get block count skip size", 0);
1816 	sm->sm_state &= ~SMS_IGNORE;
1817 	if (SMS_IS(sm, SMS_OFFLINE))
1818 		MD_STATUS(un) |= MD_UN_OFFLINE_SM;
1819 	md_set_parent(sm->sm_dev, MD_SID(un));
1820 }
1821 
1822 static void
1823 mirror_cleanup(mm_unit_t *un)
1824 {
1825 	mddb_recid_t	recid;
1826 	int		smi;
1827 	sv_dev_t	sv[NMIRROR];
1828 	int		nsv = 0;
1829 
1830 	/*
1831 	 * If a MN diskset and this node is not the master, do
1832 	 * not delete any records on snarf of the mirror records.
1833 	 */
1834 	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1835 	    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
1836 		return;
1837 	}
1838 
1839 	for (smi = 0; smi < NMIRROR; smi++) {
1840 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1841 			continue;
1842 		sv[nsv].setno = MD_UN2SET(un);
1843 		sv[nsv++].key = un->un_sm[smi].sm_key;
1844 	}
1845 
1846 	recid = un->un_rr_dirty_recid;
1847 	mddb_deleterec_wrapper(un->c.un_record_id);
1848 	if (recid > 0)
1849 		mddb_deleterec_wrapper(recid);
1850 
1851 	md_rem_names(sv, nsv);
1852 }
1853 
1854 /* Return a -1 if optimized record unavailable and set should be released */
1855 int
1856 mirror_build_incore(mm_unit_t *un, int snarfing)
1857 {
1858 	int		i;
1859 
1860 	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
1861 		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
1862 		return (1);
1863 	}
1864 
1865 	if (mirror_are_submirrors_available(un) == 0)
1866 		return (1);
1867 
1868 	if (MD_UNIT(MD_SID(un)) != NULL)
1869 		return (0);
1870 
1871 	MD_STATUS(un) = 0;
1872 
1873 	/* pre-4.1 didn't define CAN_META_CHILD capability */
1874 	MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;
1875 
1876 	un->un_ovrlap_chn_flg = 0;
1877 	bzero(&un->un_ovrlap_chn, sizeof (un->un_ovrlap_chn));
1878 
1879 	for (i = 0; i < NMIRROR; i++)
1880 		build_submirror(un, i, snarfing);
1881 
1882 	if (unit_setup_resync(un, snarfing) != 0) {
1883 		if (snarfing) {
1884 			mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
1885 			/*
1886 			 * If a MN set and set is not stale, then return -1
1887 			 * which will force the caller to unload the set.
1888 			 * The MN diskset nodes will return failure if
1889 			 * unit_setup_resync fails so that nodes won't
1890 			 * get out of sync.
1891 			 *
1892 			 * If set is STALE, the master node can't allocate
1893 			 * a resync record (if needed), but node needs to
1894 			 * join the set so that user can delete broken mddbs.
1895 			 * So, if set is STALE, just continue on.
1896 			 */
1897 			if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1898 			    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
1899 				return (-1);
1900 			}
1901 		} else
1902 			return (1);
1903 	}
1904 
1905 	mutex_init(&un->un_ovrlap_chn_mx, NULL, MUTEX_DEFAULT, NULL);
1906 	cv_init(&un->un_ovrlap_chn_cv, NULL, CV_DEFAULT, NULL);
1907 
1908 	un->un_suspend_wr_flag = 0;
1909 	mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
1910 	cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);
1911 
1912 	/*
1913 	 * Allocate mutexes for mirror-owner and resync-owner changes.
1914 	 * All references to the owner message state field must be guarded
1915 	 * by this mutex.
1916 	 */
1917 	mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);
1918 
1919 	/*
1920 	 * Allocate mutex and condvar for resync thread manipulation. These
1921 	 * will be used by mirror_resync_unit/mirror_ioctl_resync
1922 	 */
1923 	mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
1924 	cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);
1925 
1926 	/*
1927 	 * Allocate mutex and condvar for resync progress thread manipulation.
1928 	 * This allows resyncs to be continued across an intervening reboot.
1929 	 */
1930 	mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
1931 	cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);
1932 
1933 	/*
1934 	 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This
1935 	 * provides synchronization between a user-ioctl and the resulting
1936 	 * strategy() call that performs the read().
1937 	 */
1938 	mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
1939 	cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);
1940 
1941 	MD_UNIT(MD_SID(un)) = un;
1942 	return (0);
1943 }
1944 
1945 
1946 void
1947 reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
1948 {
1949 	mddb_recid_t	recid, vtoc_id;
1950 	size_t		bitcnt;
1951 	size_t		shortcnt;
1952 	int		smi;
1953 	sv_dev_t	sv[NMIRROR];
1954 	int		nsv = 0;
1955 	uint_t		bits = 0;
1956 	minor_t		selfid;
1957 	md_unit_t	*su;
1958 
1959 	md_destroy_unit_incore(mnum, &mirror_md_ops);
1960 
1961 	shortcnt = un->un_rrd_num * sizeof (short);
1962 	bitcnt = howmany(un->un_rrd_num, NBBY);
1963 
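	/*
	 * Free the per-unit resync tracking arrays: the outstanding-writes
	 * counters and the going-clean, going-dirty and resync bitmaps, all
	 * sized from the un_rrd_num resync regions.
	 */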
1964 	if (un->un_outstanding_writes)
1965 		kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
1966 	if (un->un_goingclean_bm)
1967 		kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
1968 	if (un->un_goingdirty_bm)
1969 		kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
1970 	if (un->un_resync_bm)
1971 		kmem_free((caddr_t)un->un_resync_bm, bitcnt);
1972 
1973 	MD_UNIT(mnum) = NULL;
1974 
1975 	if (!removing)
1976 		return;
1977 
1978 	for (smi = 0; smi < NMIRROR; smi++) {
1979 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1980 			continue;
1981 		/* reallow soft partitioning of submirror and reset parent */
1982 		su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
1983 		MD_CAPAB(su) |= MD_CAN_SP;
1984 		md_reset_parent(un->un_sm[smi].sm_dev);
1985 		reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);
1986 
1987 		sv[nsv].setno = MD_MIN2SET(mnum);
1988 		sv[nsv++].key = un->un_sm[smi].sm_key;
1989 		bits |= SMI2BIT(smi);
1990 	}
1991 
1992 	MD_STATUS(un) |= MD_UN_BEING_RESET;
1993 	recid = un->un_rr_dirty_recid;
1994 	vtoc_id = un->c.un_vtoc_id;
1995 	selfid = MD_SID(un);
1996 
1997 	mirror_commit(un, bits, 0);
1998 
1999 	/* Destroy all mutexes and condvars before returning. */
2000 	mutex_destroy(&un->un_suspend_wr_mx);
2001 	cv_destroy(&un->un_suspend_wr_cv);
2002 	mutex_destroy(&un->un_ovrlap_chn_mx);
2003 	cv_destroy(&un->un_ovrlap_chn_cv);
2004 	mutex_destroy(&un->un_owner_mx);
2005 	mutex_destroy(&un->un_rs_thread_mx);
2006 	cv_destroy(&un->un_rs_thread_cv);
2007 	mutex_destroy(&un->un_rs_progress_mx);
2008 	cv_destroy(&un->un_rs_progress_cv);
2009 	mutex_destroy(&un->un_dmr_mx);
2010 	cv_destroy(&un->un_dmr_cv);
2011 	mddb_deleterec_wrapper(un->c.un_record_id);
2012 	if (recid != 0)
2013 		mddb_deleterec_wrapper(recid);
2014 
2015 	/* Remove the vtoc, if present */
2016 	if (vtoc_id)
2017 		mddb_deleterec_wrapper(vtoc_id);
2018 
2019 	md_rem_names(sv, nsv);
2020 
2021 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
2022 	    MD_MIN2SET(selfid), selfid);
2023 }
2024 
2025 int
2026 mirror_internal_open(
2027 	minor_t		mnum,
2028 	int		flag,
2029 	int		otyp,
2030 	int		md_oflags,
2031 	IOLOCK		*lockp		/* can be NULL */
2032 )
2033 {
2034 	mdi_unit_t	*ui = MDI_UNIT(mnum);
2035 	int		err = 0;
2036 
2037 tryagain:
2038 	/* single thread */
2039 	if (lockp) {
2040 		/*
2041 		 * If ioctl lock is held, use openclose_enter
2042 		 * routine that will set the ioctl flag when
2043 		 * grabbing the readerlock.
2044 		 */
2045 		(void) md_ioctl_openclose_enter(lockp, ui);
2046 	} else {
2047 		(void) md_unit_openclose_enter(ui);
2048 	}
2049 
2050 	/*
2051 	 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE
2052 	 * message in a MN diskset and this requires that the openclose
2053 	 * lock is dropped in order to send this message.  So, another
2054 	 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from
2055 	 * attempting an open while this thread has an open in progress.
2056 	 * Call the *_lh version of the lock exit routines since the ui_mx
2057 	 * mutex must be held from checking for OPENINPROGRESS until
2058 	 * after the cv_wait call.
2059 	 */
2060 	mutex_enter(&ui->ui_mx);
2061 	if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
2062 		if (lockp) {
2063 			(void) md_ioctl_openclose_exit_lh(lockp);
2064 		} else {
2065 			md_unit_openclose_exit_lh(ui);
2066 		}
2067 		cv_wait(&ui->ui_cv, &ui->ui_mx);
2068 		mutex_exit(&ui->ui_mx);
2069 		goto tryagain;
2070 	}
2071 
2072 	ui->ui_lock |= MD_UL_OPENINPROGRESS;
2073 	mutex_exit(&ui->ui_mx);
2074 
2075 	/* open devices, if necessary */
2076 	if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) {
2077 		if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0)
2078 			goto out;
2079 	}
2080 
2081 	/* count open */
2082 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
2083 		goto out;
2084 
2085 	/* unlock, return success */
2086 out:
2087 	mutex_enter(&ui->ui_mx);
2088 	ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
2089 	mutex_exit(&ui->ui_mx);
2090 
2091 	if (lockp) {
2092 		/*
2093 		 * If ioctl lock is held, use openclose_exit
2094 		 * routine that will clear the lockp reader flag.
2095 		 */
2096 		(void) md_ioctl_openclose_exit(lockp);
2097 	} else {
2098 		md_unit_openclose_exit(ui);
2099 	}
2100 	return (err);
2101 }
2102 
2103 int
2104 mirror_internal_close(
2105 	minor_t		mnum,
2106 	int		otyp,
2107 	int		md_cflags,
2108 	IOLOCK		*lockp		/* can be NULL */
2109 )
2110 {
2111 	mdi_unit_t	*ui = MDI_UNIT(mnum);
2112 	mm_unit_t	*un;
2113 	int		err = 0;
2114 
2115 	/* single thread */
2116 	if (lockp) {
2117 		/*
2118 		 * If ioctl lock is held, use openclose_enter
2119 		 * routine that will set the ioctl flag when
2120 		 * grabbing the readerlock.
2121 		 */
2122 		un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui);
2123 	} else {
2124 		un = (mm_unit_t *)md_unit_openclose_enter(ui);
2125 	}
2126 
2127 	/* count closed */
2128 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
2129 		goto out;
2130 
2131 	/* close devices, if necessary */
2132 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
2133 		/*
2134 		 * Clean up dirty bitmap for this unit. Do this
2135 		 * before closing the underlying devices to avoid
2136 		 * race conditions with reset_mirror() as a
2137 		 * result of a 'metaset -r' command running in
2138 		 * parallel. This might cause deallocation of
2139 		 * dirty region bitmaps; with underlying metadevices
2140 		 * in place this can't happen.
2141 		 * Don't do this if this is a MN set with ABR set
2142 		 */
2143 		if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) {
2144 			if (!MD_MNSET_SETNO(MD_UN2SET(un)) ||
2145 			    !(ui->ui_tstate & MD_ABR_CAP))
2146 				mirror_process_unit_resync(un);
2147 		}
2148 		(void) mirror_close_all_devs(un, md_cflags);
2149 
2150 		/*
2151 		 * For a MN set with transient capabilities (eg ABR/DMR) set,
2152 		 * clear these capabilities on the last close in the cluster.
2153 		 * To do this we send a message to all nodes to see if the
2154 		 * device is open.
2155 		 */
2156 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
2157 		    (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) {
2158 			if (lockp) {
2159 				(void) md_ioctl_openclose_exit(lockp);
2160 			} else {
2161 				md_unit_openclose_exit(ui);
2162 			}
2163 
2164 			/*
2165 			 * if we are in the context of an ioctl, drop the
2166 			 * ioctl lock.
2167 			 * Otherwise, no other locks should be held.
2168 			 */
2169 			if (lockp) {
2170 				IOLOCK_RETURN_RELEASE(0, lockp);
2171 			}
2172 
2173 			mdmn_clear_all_capabilities(mnum);
2174 
2175 			/* if dropped the lock previously, regain it */
2176 			if (lockp) {
2177 				IOLOCK_RETURN_REACQUIRE(lockp);
2178 			}
2179 			return (0);
2180 		}
2181 		/* unlock and return success */
2182 	}
2183 out:
2184 	/* Call whether lockp is NULL or not. */
2185 	if (lockp) {
2186 		md_ioctl_openclose_exit(lockp);
2187 	} else {
2188 		md_unit_openclose_exit(ui);
2189 	}
2190 	return (err);
2191 }
2192 
2193 /*
2194  * When a component has completed resyncing and is now ok, check if the
2195  * corresponding component in the other submirrors is in the Last Erred
2196  * state.  If it is, we want to change that to the Erred state so we stop
2197  * using that component and start using this good component instead.
2198  *
2199  * This is called from set_sm_comp_state and recursively calls
2200  * set_sm_comp_state if it needs to change the Last Erred state.
2201  */
2202 static void
2203 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
2204 	IOLOCK *lockp)
2205 {
2206 	mm_submirror_t		*sm;
2207 	mm_submirror_ic_t	*smic;
2208 	int			ci;
2209 	int			i;
2210 	int			compcnt;
2211 	int			changed = 0;
2212 
2213 	for (i = 0; i < NMIRROR; i++) {
2214 		sm = &un->un_sm[i];
2215 		smic = &un->un_smic[i];
2216 
2217 		if (!SMS_IS(sm, SMS_INUSE))
2218 			continue;
2219 
2220 		/* ignore the submirror that we just made ok */
2221 		if (i == smi)
2222 			continue;
2223 
2224 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2225 		for (ci = 0; ci < compcnt; ci++) {
2226 			md_m_shared_t	*shared;
2227 
2228 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2229 			    (sm->sm_dev, sm, ci);
2230 
2231 			if ((shared->ms_state & CS_LAST_ERRED) &&
2232 			    !mirror_other_sources(un, i, ci, 1)) {
2233 
2234 				set_sm_comp_state(un, i, ci, CS_ERRED, extras,
2235 				    flags, lockp);
2236 				changed = 1;
2237 			}
2238 		}
2239 	}
2240 
2241 	/* maybe there is a hotspare for this newly erred component */
2242 	if (changed) {
2243 		set_t	setno;
2244 
2245 		setno = MD_UN2SET(un);
2246 		if (MD_MNSET_SETNO(setno)) {
2247 			send_poke_hotspares(setno);
2248 		} else {
2249 			(void) poke_hotspares();
2250 		}
2251 	}
2252 }
2253 
2254 /*
2255  * set_sm_comp_state
2256  *
2257  * Set the state of a submirror component to the specified new state.
2258  * If the mirror is in a multi-node set, send messages to all nodes to
2259  * block all writes to the mirror and then update the state and release the
2260  * writes. These messages are only sent if MD_STATE_XMIT is set in flags.
2261  * MD_STATE_XMIT will be unset in 2 cases:
2262  * 1. When the state is changed to CS_RESYNC as this state change
2263  * will already have been updated on each node by the processing of the
2264  * distributed metasync command, hence no need to xmit.
2265  * 2. When the state is change to CS_OKAY after a resync has completed. Again
2266  * 2. When the state is changed to CS_OKAY after a resync has completed. Again
2267  * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
2268  * resync, hence no need to xmit.
2269  *
2270  * In case we are called from the updates of a watermark,
2271  * If we are called as part of a watermark update (MD_STATE_WMUPDATE will
2272  * be set in ps->flags), this is due to a metainit or similar. In this case
2273  * the message that we send to propagate the state change must not be a
2274  * class1 message, as that would deadlock with the metainit command that is
2275  * still being processed. We achieve this by creating a class2 message,
2276  * MD_MN_MSG_STATE_UPDATE2, instead. This also makes the submessage
2277  * generator create a class2 submessage rather than a class1 submessage
2278  * (which would also block).
2279  * On entry, unit_writerlock is held
2280  * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is
2281  * also held.
2282  */
2283 void
2284 set_sm_comp_state(
2285 	mm_unit_t	*un,
2286 	int		smi,
2287 	int		ci,
2288 	int		newstate,
2289 	mddb_recid_t	*extras,
2290 	uint_t		flags,
2291 	IOLOCK		*lockp
2292 )
2293 {
2294 	mm_submirror_t		*sm;
2295 	mm_submirror_ic_t	*smic;
2296 	md_m_shared_t		*shared;
2297 	int			origstate;
2298 	void			(*get_dev)();
2299 	ms_cd_info_t		cd;
2300 	char			devname[MD_MAX_CTDLEN];
2301 	int			err;
2302 	set_t			setno = MD_UN2SET(un);
2303 	md_mn_msg_stch_t	stchmsg;
2304 	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
2305 	md_mn_kresult_t		*kresult;
2306 	int			rval;
2307 	uint_t			msgflags;
2308 	md_mn_msgtype_t		msgtype;
2309 	int			save_lock = 0;
2310 	mdi_unit_t		*ui_sm;
2311 
2312 	sm = &un->un_sm[smi];
2313 	smic = &un->un_smic[smi];
2314 
2315 	/* If we have a real error status then turn off MD_INACCESSIBLE. */
2316 	ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
2317 	if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) &&
2318 	    ui_sm->ui_tstate & MD_INACCESSIBLE) {
2319 		ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
2320 	}
2321 
2322 	shared = (md_m_shared_t *)
2323 		(*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
2324 	origstate = shared->ms_state;
2325 
2326 	/*
2327 	 * If the new state is an error and the old one wasn't, generate
2328 	 * a console message. We do this before we send the state to other
2329 	 * nodes in a MN set because the state change may change the component
2330 	 * name if a hotspare is allocated.
2331 	 */
2332 	if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) &&
2333 	    (newstate & (CS_ERRED|CS_LAST_ERRED))) {
2334 
2335 		get_dev =
2336 		    (void (*)())md_get_named_service(sm->sm_dev, 0,
2337 				"get device", 0);
2338 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2339 
2340 		err = md_getdevname(setno, mddb_getsidenum(setno), 0,
2341 		    cd.cd_dev, devname, sizeof (devname));
2342 
2343 		if (err == ENOENT) {
2344 			(void) md_devname(setno, cd.cd_dev, devname,
2345 				sizeof (devname));
2346 		}
2347 
2348 		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
2349 		    md_shortname(md_getminor(sm->sm_dev)), devname);
2350 
2351 		if (newstate & CS_LAST_ERRED) {
2352 			cmn_err(CE_WARN, "md: %s: %s last erred",
2353 			    md_shortname(md_getminor(sm->sm_dev)),
2354 			    devname);
2355 
2356 		} else if (shared->ms_flags & MDM_S_ISOPEN) {
2357 			/*
2358 			 * Close the broken device and clear the open flag on
2359 			 * it.  Closing the device means the RCM framework will
2360 			 * be able to unconfigure the device if required.
2361 			 *
2362 			 * We have to check that the device is open, otherwise
2363 			 * the first open on it has resulted in the error that
2364 			 * is being processed and the actual cd.cd_dev will be
2365 			 * NODEV64.
2366 			 *
2367 			 * If this is a multi-node mirror, then the multinode
2368 			 * state checks following this code will cause the
2369 			 * slave nodes to close the mirror in the function
2370 			 * mirror_set_state().
2371 			 */
2372 			md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2373 			shared->ms_flags &= ~MDM_S_ISOPEN;
2374 		}
2375 
2376 	} else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
2377 	    (shared->ms_flags & MDM_S_ISOPEN)) {
2378 		/*
2379 		 * Similar to logic above except no log messages since we
2380 		 * are just transitioning from Last Erred to Erred.
2381 		 */
2382 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2383 		    "get device", 0);
2384 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2385 
2386 		md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2387 		shared->ms_flags &= ~MDM_S_ISOPEN;
2388 	}
2389 
2390 	if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
2391 	    (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
2392 		/*
2393 		 * For a multi-node mirror, send the state change to the
2394 		 * master, which broadcasts to all nodes, including this
2395 		 * one. Once the message is received, the state is set
2396 		 * in-core and the master commits the change to disk.
2397 		 * There is one case, comp_replace, where this function
2398 		 * can be called from within an ioctl. In that case, as the
2399 		 * ioctl will already be called on each node,
2400 		 * there is no need to xmit the state change to the master for
2401 		 * distribution to the other nodes. MD_STATE_XMIT flag is used
2402 		 * to indicate whether a xmit is required. The mirror's
2403 		 * transient state is set to MD_ERR_PENDING to avoid sending
2404 		 * multiple messages.
2405 		 */
2406 		if (newstate & (CS_ERRED|CS_LAST_ERRED))
2407 			ui->ui_tstate |= MD_ERR_PENDING;
2408 
2409 		/*
2410 		 * Send a state update message to all nodes. This message
2411 		 * will generate 2 submessages, the first one to suspend
2412 		 * all writes to the mirror and the second to update the
2413 		 * state and resume writes.
2414 		 */
2415 		stchmsg.msg_stch_mnum = un->c.un_self_id;
2416 		stchmsg.msg_stch_sm = smi;
2417 		stchmsg.msg_stch_comp = ci;
2418 		stchmsg.msg_stch_new_state = newstate;
2419 		stchmsg.msg_stch_hs_id = shared->ms_hs_id;
2420 #ifdef DEBUG
2421 		if (mirror_debug_flag)
2422 			printf("send set state, %x, %x, %x, %x, %x\n",
2423 			    stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
2424 			    stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
2425 			    stchmsg.msg_stch_hs_id);
2426 #endif
2427 		if (flags & MD_STATE_WMUPDATE) {
2428 			msgtype  = MD_MN_MSG_STATE_UPDATE2;
2429 			/*
2430 			 * When coming from an update of watermarks, there
2431 			 * must already be a message logged that triggered
2432 			 * this action. So, no need to log this message, too.
2433 			 */
2434 			msgflags = MD_MSGF_NO_LOG;
2435 		} else {
2436 			msgtype  = MD_MN_MSG_STATE_UPDATE;
2437 			msgflags = MD_MSGF_DEFAULT_FLAGS;
2438 		}
2439 
2440 		/*
2441 		 * If we are in the context of an ioctl, drop the ioctl lock.
2442 		 * lockp holds the list of locks held.
2443 		 *
2444 		 * Otherwise, increment the appropriate reacquire counters.
2445 		 * If the openclose lock is held, then we must reacquire the
2446 		 * reader lock before releasing the openclose lock.
2447 		 * Do not drop the ARRAY_WRITER lock as we may not be able
2448 		 * to reacquire it.
2449 		 */
2450 		if (lockp) {
2451 			if (lockp->l_flags & MD_ARRAY_WRITER) {
2452 				save_lock = MD_ARRAY_WRITER;
2453 				lockp->l_flags &= ~MD_ARRAY_WRITER;
2454 			} else if (lockp->l_flags & MD_ARRAY_READER) {
2455 				save_lock = MD_ARRAY_READER;
2456 				lockp->l_flags &= ~MD_ARRAY_READER;
2457 			}
2458 			IOLOCK_RETURN_RELEASE(0, lockp);
2459 		} else {
2460 			if (flags & MD_STATE_OCHELD) {
2461 				md_unit_writerexit(ui);
2462 				(void) md_unit_readerlock(ui);
2463 				md_unit_openclose_exit(ui);
2464 			} else {
2465 				md_unit_writerexit(ui);
2466 			}
2467 		}
2468 
2469 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
2470 		rval = mdmn_ksend_message(setno,
2471 					msgtype,
2472 					msgflags,
2473 					(char *)&stchmsg,
2474 					sizeof (stchmsg),
2475 					kresult);
2476 
2477 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
2478 			mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
2479 			cmn_err(CE_PANIC,
2480 			    "ksend_message failure: STATE_UPDATE");
2481 		}
2482 		kmem_free(kresult, sizeof (md_mn_kresult_t));
2483 
2484 		/* if dropped the lock previously, regain it */
2485 		if (lockp) {
2486 			IOLOCK_RETURN_REACQUIRE(lockp);
2487 			lockp->l_flags |= save_lock;
2488 		} else {
2489 			/*
2490 			 * Reacquire dropped locks and update acquirecnts
2491 			 * appropriately.
2492 			 */
2493 			if (flags & MD_STATE_OCHELD) {
2494 				/*
2495 				 * openclose also grabs readerlock.
2496 				 */
2497 				(void) md_unit_openclose_enter(ui);
2498 				md_unit_readerexit(ui);
2499 				(void) md_unit_writerlock(ui);
2500 			} else {
2501 				(void) md_unit_writerlock(ui);
2502 			}
2503 		}
2504 
2505 		ui->ui_tstate &= ~MD_ERR_PENDING;
2506 	} else {
2507 		shared->ms_state = newstate;
2508 		uniqtime32(&shared->ms_timestamp);
2509 
2510 		if (newstate == CS_ERRED)
2511 			shared->ms_flags |= MDM_S_NOWRITE;
2512 		else
2513 			shared->ms_flags &= ~MDM_S_NOWRITE;
2514 
2515 		shared->ms_flags &= ~MDM_S_IOERR;
2516 		un->un_changecnt++;
2517 		shared->ms_lasterrcnt = un->un_changecnt;
2518 
2519 		mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
2520 		mirror_commit(un, SMI2BIT(smi), extras);
2521 	}
2522 
2523 	if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
2524 		/*
2525 		 * Resetting the Last Erred state will recursively call back
2526 		 * into this function (set_sm_comp_state) to update the state.
2527 		 */
2528 		reset_lasterred(un, smi, extras, flags, lockp);
2529 	}
2530 }
2531 
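/*
 * find_another_logical:
 * --------------------
 * Check that the block range [blk, blk + cnt) of the errored submirror can
 * be read from some other submirror. The errored submirror is temporarily
 * flagged SMS_IGNORE while select_read_unit() is used to look for an
 * alternative source. Returns non-zero if any part of the range has no
 * alternative source.
 */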
2532 static int
2533 find_another_logical(
2534 	mm_unit_t		*un,
2535 	mm_submirror_t		*esm,
2536 	diskaddr_t		blk,
2537 	u_longlong_t		cnt,
2538 	int			must_be_open,
2539 	int			state,
2540 	int			err_cnt)
2541 {
2542 	u_longlong_t	cando;
2543 	md_dev64_t	dev;
2544 	md_m_shared_t	*s;
2545 
2546 	esm->sm_state |= SMS_IGNORE;
2547 	while (cnt != 0) {
2548 		u_longlong_t	 mcnt;
2549 
2550 		mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024));	/* 1 Gig Blks */
2551 
2552 		dev = select_read_unit(un, blk, mcnt, &cando, must_be_open, &s,
2553 			NULL);
2554 		if (dev == (md_dev64_t)0)
2555 			break;
2556 
2557 		if ((state == CS_LAST_ERRED) &&
2558 		    (s->ms_state == CS_LAST_ERRED) &&
2559 		    (err_cnt > s->ms_lasterrcnt))
2560 			break;
2561 
2562 		cnt -= cando;
2563 		blk += cando;
2564 	}
2565 	esm->sm_state &= ~SMS_IGNORE;
2566 	return (cnt != 0);
2567 }
2568 
2569 int
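/*
 * mirror_other_sources:
 * --------------------
 * Determine whether the data on component 'ci' of submirror 'smi' (or on
 * every component of that submirror if 'ci' is negative) is also available
 * from another submirror. Returns 1 if some part of the data has no other
 * source, 0 otherwise.
 */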
2570 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open)
2571 {
2572 	mm_submirror_t		*sm;
2573 	mm_submirror_ic_t	*smic;
2574 	size_t			count;
2575 	diskaddr_t		block;
2576 	u_longlong_t		skip;
2577 	u_longlong_t		size;
2578 	md_dev64_t		dev;
2579 	int			cnt;
2580 	md_m_shared_t		*s;
2581 	int			not_found;
2582 
2583 	sm = &un->un_sm[smi];
2584 	smic = &un->un_smic[smi];
2585 	dev = sm->sm_dev;
2586 
2587 	/*
2588 	 * Make sure every component of the submirror
2589 	 * has other sources.
2590 	 */
2591 	if (ci < 0) {
2592 		/* Check that every component has another source */
2593 		cnt = (*(smic->sm_get_component_count))(dev, sm);
2594 		for (ci = 0; ci < cnt; ci++) {
2595 			not_found = mirror_other_sources(un, smi, ci,
2596 			    must_be_open);
2597 			if (not_found)
2598 				return (1);
2599 		}
2600 		return (0);
2601 	}
2602 
2603 	/*
2604 	 * Make sure this component has other sources
2605 	 */
2606 	(void) (*(smic->sm_get_bcss))
2607 		(dev, sm, ci, &block, &count, &skip, &size);
2608 
2609 	if (count == 0)
2610 		return (1);
2611 
2612 	s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci);
2613 
2614 	while (count--) {
2615 		if (block >= un->c.un_total_blocks)
2616 			return (0);
2617 
2618 		if ((block + size) > un->c.un_total_blocks)
2619 			size = un->c.un_total_blocks - block;
2620 
2621 		not_found = find_another_logical(un, sm, block, size,
2622 		    must_be_open, s->ms_state, s->ms_lasterrcnt);
2623 		if (not_found)
2624 			return (1);
2625 
2626 		block += size + skip;
2627 	}
2628 	return (0);
2629 }
2630 
2631 static void
2632 finish_error(md_mps_t *ps)
2633 {
2634 	struct buf	*pb;
2635 	mm_unit_t	*un;
2636 	mdi_unit_t	*ui;
2637 	uint_t		new_str_flags;
2638 
2639 	pb = ps->ps_bp;
2640 	un = ps->ps_un;
2641 	ui = ps->ps_ui;
2642 
2643 	/*
2644 	 * Must flag any error to the resync originator if we're performing
2645 	 * a Write-after-Read. This corresponds to an i/o error on a resync
2646 	 * target device and in this case we ought to abort the resync as there
2647 	 * is nothing that can be done to recover from this without operator
2648 	 * intervention. If we don't set the B_ERROR flag we will continue
2649 	 * reading from the mirror but won't write to the target (as it will
2650 	 * have been placed into an errored state).
2651 	 * To handle the case of multiple components within a submirror we only
2652 	 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR.
2653 	 * The originator of the resync read will cause this bit to be set if
2654 	 * the underlying component count is one for a submirror resync. All
2655 	 * other resync types will have the flag set as there is no underlying
2656 	 * resync which can be performed on a contained metadevice for these
2657 	 * resync types (optimized or component).
2658 	 */
2659 
2660 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) {
2661 		if (ps->ps_flags & MD_MPS_FLAG_ERROR)
2662 			pb->b_flags |= B_ERROR;
2663 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2664 		MPS_FREE(mirror_parent_cache, ps);
2665 		md_unit_readerexit(ui);
2666 		md_biodone(pb);
2667 		return;
2668 	}
2669 	/*
2670 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2671 	 * operation; therefore this I/O request has already been counted and
2672 	 * the I/O count variable will be decremented by mirror_done()'s
2673 	 * call to md_biodone().
2674 	 */
2675 	if (ps->ps_changecnt != un->un_changecnt) {
2676 		new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
2677 		if (ps->ps_flags & MD_MPS_WOW)
2678 			new_str_flags |= MD_STR_WOW;
2679 		if (ps->ps_flags & MD_MPS_MAPPED)
2680 			new_str_flags |= MD_STR_MAPPED;
2681 		/*
2682 		 * If this I/O request was a read that was part of a resync,
2683 		 * set MD_STR_WAR for the retried read to ensure that the
2684 		 * resync write (i.e. write-after-read) will be performed
2685 		 */
2686 		if (ps->ps_flags & MD_MPS_RESYNC_READ)
2687 			new_str_flags |= MD_STR_WAR;
2688 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2689 		MPS_FREE(mirror_parent_cache, ps);
2690 		md_unit_readerexit(ui);
2691 		(void) md_mirror_strategy(pb, new_str_flags, NULL);
2692 		return;
2693 	}
2694 
2695 	pb->b_flags |= B_ERROR;
2696 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2697 	MPS_FREE(mirror_parent_cache, ps);
2698 	md_unit_readerexit(ui);
2699 	md_biodone(pb);
2700 }
2701 
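/*
 * error_update_unit:
 * -----------------
 * Daemon routine that processes all errored components of the mirror:
 * a component is moved to Last Erred if it holds the only remaining copy
 * of its data, otherwise it is moved to Erred. Hotspare allocation is then
 * poked and the deferred parent request is completed via finish_error().
 */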
2702 static void
2703 error_update_unit(md_mps_t *ps)
2704 {
2705 	mm_unit_t		*un;
2706 	mdi_unit_t		*ui;
2707 	int			smi;	/* sub mirror index */
2708 	int			ci;	/* errored component */
2709 	set_t			setno;
2710 	uint_t			flags;	/* for set_sm_comp_state() */
2711 	uint_t			hspflags; /* for check_comp_4_hotspares() */
2712 
2713 	ui = ps->ps_ui;
2714 	un = (mm_unit_t *)md_unit_writerlock(ui);
2715 	setno = MD_UN2SET(un);
2716 
2717 	/* All of these updates have to be propagated in case of a MN set */
2718 	flags = MD_STATE_XMIT;
2719 	hspflags = MD_HOTSPARE_XMIT;
2720 
2721 	/* special treatment if we are called during updating watermarks */
2722 	if (ps->ps_flags & MD_MPS_WMUPDATE) {
2723 		flags |= MD_STATE_WMUPDATE;
2724 		hspflags |= MD_HOTSPARE_WMUPDATE;
2725 	}
2726 	smi = 0;
2727 	ci = 0;
2728 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
2729 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
2730 
2731 			/* Never called from ioctl context, so (IOLOCK *)NULL */
2732 			set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
2733 				(IOLOCK *)NULL);
2734 			/*
2735 			 * For a MN set, the NOTIFY is done when the state
2736 			 * change is processed on each node
2737 			 */
2738 			if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2739 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
2740 				    SVM_TAG_METADEVICE, setno, MD_SID(un));
2741 			}
2742 			continue;
2743 		}
2744 		/* Never called from ioctl context, so (IOLOCK *)NULL */
2745 		set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
2746 			(IOLOCK *)NULL);
2747 		/*
2748 		 * For a MN set, the NOTIFY is done when the state
2749 		 * change is processed on each node
2750 		 */
2751 		if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2752 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
2753 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
2754 		}
2755 		smi = 0;
2756 		ci = 0;
2757 	}
2758 
2759 	md_unit_writerexit(ui);
2760 	if (MD_MNSET_SETNO(setno)) {
2761 		send_poke_hotspares(setno);
2762 	} else {
2763 		(void) poke_hotspares();
2764 	}
2765 	(void) md_unit_readerlock(ui);
2766 
2767 	finish_error(ps);
2768 }
2769 
2770 /*
2771  * When we have a B_FAILFAST IO error on a Last Erred component we need to
2772  * retry the IO without B_FAILFAST set so that we try to ensure that the
2773  * component "sees" each IO.
2774  */
2775 static void
2776 last_err_retry(md_mcs_t *cs)
2777 {
2778 	struct buf	*cb;
2779 	md_mps_t	*ps;
2780 	uint_t		flags;
2781 
2782 	cb = &cs->cs_buf;
2783 	cb->b_flags &= ~B_FAILFAST;
2784 
2785 	/* if we're panicking just let this I/O error out */
2786 	if (panicstr) {
2787 		(void) mirror_done(cb);
2788 		return;
2789 	}
2790 
2791 	/* reissue the I/O */
2792 
2793 	ps = cs->cs_ps;
2794 
2795 	bioerror(cb, 0);
2796 
2797 	mutex_enter(&ps->ps_mx);
2798 
2799 	flags = MD_STR_NOTTOP;
2800 	if (ps->ps_flags & MD_MPS_MAPPED)
2801 		flags |= MD_STR_MAPPED;
2802 	if (ps->ps_flags & MD_MPS_NOBLOCK)
2803 		flags |= MD_NOBLOCK;
2804 
2805 	mutex_exit(&ps->ps_mx);
2806 
2807 	clear_retry_error(cb);
2808 
2809 	cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
2810 		md_shortname(getminor(cb->b_edev)));
2811 
2812 	md_call_strategy(cb, flags, NULL);
2813 }
2814 
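/*
 * mirror_error:
 * ------------
 * Handle an errored parent request. If a component state change is needed
 * the request is handed to error_update_unit() on the md_mstr_daemon queue
 * (as that processing may block); otherwise it is completed directly via
 * finish_error().
 */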
2815 static void
2816 mirror_error(md_mps_t *ps)
2817 {
2818 	int		smi;	/* sub mirror index */
2819 	int		ci;	/* errored component */
2820 
2821 	if (panicstr) {
2822 		finish_error(ps);
2823 		return;
2824 	}
2825 
2826 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
2827 		mirror_overlap_chain_remove(ps);
2828 
2829 	smi = 0;
2830 	ci = 0;
2831 	if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
2832 		md_unit_readerexit(ps->ps_ui);
2833 		daemon_request(&md_mstr_daemon, error_update_unit,
2834 		    (daemon_queue_t *)ps, REQ_OLD);
2835 		return;
2836 	}
2837 
2838 	finish_error(ps);
2839 }
2840 
2841 static int
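/*
 * copy_write_done:
 * ---------------
 * iodone routine for the private buffer used to resolve a write-on-write
 * condition. Any error is propagated to the parent buf; if more of the
 * original request remains, copy_write_cont() is queued to write the next
 * section, otherwise the parent request is completed.
 */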
2842 copy_write_done(struct buf *cb)
2843 {
2844 	md_mps_t	*ps;
2845 	buf_t		*pb;
2846 	char		*wowbuf;
2847 	wowhdr_t	*wowhdr;
2848 	ssize_t		wow_resid;
2849 
2850 	/* get wowbuf and save structure */
2851 	wowbuf = cb->b_un.b_addr;
2852 	wowhdr = WOWBUF_HDR(wowbuf);
2853 	ps = wowhdr->wow_ps;
2854 	pb = ps->ps_bp;
2855 
2856 	/* Save error information, then free cb */
2857 	if (cb->b_flags & B_ERROR)
2858 		pb->b_flags |= B_ERROR;
2859 
2860 	if (cb->b_flags & B_REMAPPED)
2861 		bp_mapout(cb);
2862 
2863 	freerbuf(cb);
2864 
2865 	/* update residual and continue if needed */
2866 	if ((pb->b_flags & B_ERROR) == 0) {
2867 		wow_resid = pb->b_bcount - wowhdr->wow_offset;
2868 		pb->b_resid = wow_resid;
2869 		if (wow_resid > 0)  {
2870 			daemon_request(&md_mstr_daemon, copy_write_cont,
2871 			    (daemon_queue_t *)wowhdr, REQ_OLD);
2872 			return (1);
2873 		}
2874 	}
2875 
2876 	/* Write is complete, release resources. */
2877 	kmem_cache_free(mirror_wowblk_cache, wowhdr);
2878 	ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
2879 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2880 	MPS_FREE(mirror_parent_cache, ps);
2881 	md_biodone(pb);
2882 	return (0);
2883 }
2884 
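/*
 * copy_write_cont:
 * ---------------
 * Copy the next (at most md_wowbuf_size bytes) section of the original
 * write into the private WOW buffer and issue it as a separate write.
 * copy_write_done() re-queues this routine until the whole request has
 * been written.
 */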
2885 static void
2886 copy_write_cont(wowhdr_t *wowhdr)
2887 {
2888 	buf_t		*pb;
2889 	buf_t		*cb;
2890 	char		*wowbuf;
2891 	int		wow_offset;
2892 	size_t		wow_resid;
2893 	diskaddr_t	wow_blkno;
2894 
2895 	wowbuf = WOWHDR_BUF(wowhdr);
2896 	pb = wowhdr->wow_ps->ps_bp;
2897 
2898 	/* get data on current location */
2899 	wow_offset = wowhdr->wow_offset;
2900 	wow_resid = pb->b_bcount - wow_offset;
2901 	wow_blkno = pb->b_lblkno + lbtodb(wow_offset);
2902 
2903 	/* setup child buffer */
2904 	cb = getrbuf(KM_SLEEP);
2905 	cb->b_flags = B_WRITE;
2906 	cb->b_edev = pb->b_edev;
2907 	cb->b_un.b_addr = wowbuf;	/* change to point at WOWBUF */
2908 	cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
2909 	cb->b_iodone = copy_write_done;
2910 	cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
2911 	cb->b_lblkno = wow_blkno;
2912 
2913 	/* move offset to next section */
2914 	wowhdr->wow_offset += cb->b_bcount;
2915 
2916 	/* copy and setup write for current section */
2917 	bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);
2918 
2919 	/* do it */
2920 	/*
2921 	 * Do not set the MD_IO_COUNTED flag as this is a new I/O request
2922 	 * that handles the WOW condition. The resultant increment on the
2923 	 * I/O count variable is cleared by copy_write_done()'s call to
2924 	 * md_biodone().
2925 	 */
2926 	(void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
2927 				    | MD_STR_MAPPED, NULL);
2928 }
2929 
2930 static void
2931 md_mirror_copy_write(md_mps_t *ps)
2932 {
2933 	wowhdr_t	*wowhdr;
2934 
2935 	wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS);
2936 	mirror_wowblk_init(wowhdr);
2937 	wowhdr->wow_ps = ps;
2938 	wowhdr->wow_offset = 0;
2939 	copy_write_cont(wowhdr);
2940 }
2941 
2942 static void
2943 handle_wow(md_mps_t *ps)
2944 {
2945 	buf_t		*pb;
2946 
2947 	pb = ps->ps_bp;
2948 
2949 	bp_mapin(pb);
2950 
2951 	md_mirror_wow_cnt++;
2952 	if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
2953 		cmn_err(CE_NOTE,
2954 		    "md: %s, blk %lld, cnt %ld: Write on write %d occurred",
2955 		    md_shortname(getminor(pb->b_edev)),
2956 		    (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
2957 	}
2958 
2959 	/*
2960 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2961 	 * operation; therefore this I/O request has already been counted and
2962 	 * the I/O count variable will be decremented by mirror_done()'s
2963 	 * call to md_biodone().
2964 	 */
2965 	if (md_mirror_wow_flg & WOW_NOCOPY)
2966 		(void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
2967 					    MD_STR_MAPPED | MD_IO_COUNTED, ps);
2968 	else
2969 		md_mirror_copy_write(ps);
2970 }
2971 
2972 /*
2973  * Return true if the specified submirror is either in the Last Erred
2974  * state or is transitioning into the Last Erred state.
2975  */
2976 static bool_t
2977 submirror_is_lasterred(mm_unit_t *un, int smi)
2978 {
2979 	mm_submirror_t		*sm;
2980 	mm_submirror_ic_t	*smic;
2981 	md_m_shared_t		*shared;
2982 	int			ci;
2983 	int			compcnt;
2984 
2985 	sm = &un->un_sm[smi];
2986 	smic = &un->un_smic[smi];
2987 
2988 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2989 	for (ci = 0; ci < compcnt; ci++) {
2990 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2991 		    (sm->sm_dev, sm, ci);
2992 
2993 		if (shared->ms_state == CS_LAST_ERRED)
2994 			return (B_TRUE);
2995 
2996 		/*
2997 		 * It is not currently Last Erred, check if entering Last Erred.
2998 		 */
2999 		if ((shared->ms_flags & MDM_S_IOERR) &&
3000 		    ((shared->ms_state == CS_OKAY) ||
3001 		    (shared->ms_state == CS_RESYNC))) {
3002 			if (mirror_other_sources(un, smi, ci, 0) == 1)
3003 				return (B_TRUE);
3004 		}
3005 	}
3006 
3007 	return (B_FALSE);
3008 }
3009 
3010 
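/*
 * mirror_done:
 * -----------
 * iodone routine for a submirror (child) buf. On a B_FAILFAST error from a
 * submirror that is, or is becoming, Last Erred the request is re-queued to
 * be retried without B_FAILFAST; otherwise the error is recorded and the
 * common completion path is taken.
 */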
3011 static int
3012 mirror_done(struct buf *cb)
3013 {
3014 	md_mps_t	*ps;
3015 	md_mcs_t	*cs;
3016 
3017 	/*LINTED*/
3018 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3019 	ps = cs->cs_ps;
3020 
3021 	mutex_enter(&ps->ps_mx);
3022 
3023 	/* check if we need to retry an errored failfast I/O */
3024 	if (cb->b_flags & B_ERROR) {
3025 		struct buf *pb = ps->ps_bp;
3026 
3027 		if (cb->b_flags & B_FAILFAST) {
3028 			int		i;
3029 			mm_unit_t	*un = ps->ps_un;
3030 
3031 			for (i = 0; i < NMIRROR; i++) {
3032 				if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
3033 					continue;
3034 
3035 				if (cb->b_edev ==
3036 				    md_dev64_to_dev(un->un_sm[i].sm_dev)) {
3037 
3038 					/*
3039 					 * This is the submirror that had the
3040 					 * error.  Check if it is Last Erred.
3041 					 */
3042 					if (submirror_is_lasterred(un, i)) {
3043 						daemon_queue_t *dqp;
3044 
3045 						mutex_exit(&ps->ps_mx);
3046 						dqp = (daemon_queue_t *)cs;
3047 						dqp->dq_prev = NULL;
3048 						dqp->dq_next = NULL;
3049 						daemon_request(&md_done_daemon,
3050 						    last_err_retry, dqp,
3051 						    REQ_OLD);
3052 						return (1);
3053 					}
3054 					break;
3055 				}
3056 			}
3057 		}
3058 
3059 		/* continue to process the buf without doing a retry */
3060 		ps->ps_flags |= MD_MPS_ERROR;
3061 		pb->b_error = cb->b_error;
3062 	}
3063 
3064 	return (mirror_done_common(cb));
3065 }
3066 
3067 /*
3068  * Split from the original mirror_done function so we can handle bufs after a
3069  * retry.
3070  * ps->ps_mx is already held in the caller of this function and the cb error
3071  * has already been checked and handled in the caller.
3072  */
3073 static int
3074 mirror_done_common(struct buf *cb)
3075 {
3076 	struct buf	*pb;
3077 	mm_unit_t	*un;
3078 	mdi_unit_t	*ui;
3079 	md_mps_t	*ps;
3080 	md_mcs_t	*cs;
3081 	size_t		end_rr, start_rr, current_rr;
3082 
3083 	/*LINTED*/
3084 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3085 	ps = cs->cs_ps;
3086 	pb = ps->ps_bp;
3087 
3088 	if (cb->b_flags & B_REMAPPED)
3089 		bp_mapout(cb);
3090 
3091 	ps->ps_frags--;
3092 	if (ps->ps_frags != 0) {
3093 		mutex_exit(&ps->ps_mx);
3094 		kmem_cache_free(mirror_child_cache, cs);
3095 		return (1);
3096 	}
3097 	un = ps->ps_un;
3098 	ui = ps->ps_ui;
3099 
3100 	/*
3101 	 * Do not update outstanding_writes if we're running with ABR
3102 	 * set for this mirror or the write() was issued with MD_STR_ABR set.
3103 	 * A resync-initiated write() does not update outstanding_writes
3104 	 * either.
3105 	 */
3106 	if (((cb->b_flags & B_READ) == 0) &&
3107 	    (un->un_nsm >= 2) &&
3108 	    (ps->ps_call == NULL) &&
3109 	    !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
3110 	    !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
3111 		BLK_TO_RR(end_rr, ps->ps_lastblk, un);
3112 		BLK_TO_RR(start_rr, ps->ps_firstblk, un);
3113 		mutex_enter(&un->un_resync_mx);
3114 		for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3115 			un->un_outstanding_writes[current_rr]--;
3116 		mutex_exit(&un->un_resync_mx);
3117 	}
3118 	kmem_cache_free(mirror_child_cache, cs);
3119 	mutex_exit(&ps->ps_mx);
3120 
3121 	if (ps->ps_call != NULL) {
3122 		daemon_request(&md_done_daemon, ps->ps_call,
3123 		    (daemon_queue_t *)ps, REQ_OLD);
3124 		return (1);
3125 	}
3126 
3127 	if ((ps->ps_flags & MD_MPS_ERROR)) {
3128 		daemon_request(&md_done_daemon, mirror_error,
3129 		    (daemon_queue_t *)ps, REQ_OLD);
3130 		return (1);
3131 	}
3132 
3133 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3134 		mirror_overlap_chain_remove(ps);
3135 
3136 	/*
3137 	 * Handle Write-on-Write problem.
3138 	 * Skip in case of raw and direct I/O as they are
3139 	 * handled earlier.
3140 	 *
3141 	 */
3142 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
3143 	    !(pb->b_flags & B_READ) &&
3144 	    !(ps->ps_flags & MD_MPS_WOW) &&
3145 	    !(pb->b_flags & B_PHYS) &&
3146 	    any_pages_dirty(pb)) {
3147 		md_unit_readerexit(ps->ps_ui);
3148 		daemon_request(&md_mstr_daemon, handle_wow,
3149 		    (daemon_queue_t *)ps, REQ_OLD);
3150 		return (1);
3151 	}
3152 
3153 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3154 	MPS_FREE(mirror_parent_cache, ps);
3155 	md_unit_readerexit(ui);
3156 	md_biodone(pb);
3157 	return (0);
3158 }
3159 
3160 /*
3161  * Clear error state in submirror component if the retry worked after
3162  * a failfast error.
3163  */
3164 static void
3165 clear_retry_error(struct buf *cb)
3166 {
3167 	int			smi;
3168 	md_mcs_t		*cs;
3169 	mm_unit_t		*un;
3170 	mdi_unit_t		*ui_sm;
3171 	mm_submirror_t		*sm;
3172 	mm_submirror_ic_t	*smic;
3173 	u_longlong_t		cnt;
3174 	md_m_shared_t		*shared;
3175 
3176 	/*LINTED*/
3177 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3178 	un = cs->cs_ps->ps_un;
3179 
3180 	for (smi = 0; smi < NMIRROR; smi++) {
3181 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
3182 			continue;
3183 
3184 		if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev)) {
3185 			break;
3186 		}
3187 	}
3188 
3189 	if (smi >= NMIRROR)
3190 		return;
3191 
3192 	sm = &un->un_sm[smi];
3193 	smic = &un->un_smic[smi];
3194 	cnt = cb->b_bcount;
3195 
3196 	ui_sm = MDI_UNIT(getminor(cb->b_edev));
3197 	(void) md_unit_writerlock(ui_sm);
3198 
3199 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm,
3200 	    cb->b_blkno, &cnt);
3201 
3202 	if (shared->ms_flags & MDM_S_IOERR) {
3203 		shared->ms_flags &= ~MDM_S_IOERR;
3204 
3205 	} else {
3206 		/* the I/O buf spans components and the first one is not erred */
3207 		int	cnt;
3208 		int	i;
3209 
3210 		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
3211 		for (i = 0; i < cnt; i++) {
3212 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3213 			    (sm->sm_dev, sm, i);
3214 
3215 			if (shared->ms_flags & MDM_S_IOERR &&
3216 			    shared->ms_state == CS_OKAY) {
3217 
3218 				shared->ms_flags &= ~MDM_S_IOERR;
3219 				break;
3220 			}
3221 		}
3222 	}
3222 	}
3223 
3224 	md_unit_writerexit(ui_sm);
3225 }
3226 
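/*
 * mirror_map_read:
 * ---------------
 * Select a submirror to satisfy a read of 'count' blocks starting at
 * 'blkno' and set up the child buf accordingly. Returns 0 if the whole
 * range was mapped, otherwise the number of blocks that could be mapped,
 * leaving the remainder for a further child buf.
 */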
3227 static size_t
3228 mirror_map_read(
3229 	md_mps_t *ps,
3230 	md_mcs_t *cs,
3231 	diskaddr_t blkno,
3232 	u_longlong_t	count
3233 )
3234 {
3235 	mm_unit_t	*un;
3236 	buf_t		*bp;
3237 	u_longlong_t	cando;
3238 
3239 	bp = &cs->cs_buf;
3240 	un = ps->ps_un;
3241 
3242 	bp->b_lblkno = blkno;
3243 	if (fast_select_read_unit(ps, cs) == 0) {
3244 		bp->b_bcount = ldbtob(count);
3245 		return (0);
3246 	}
3247 	bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno, count, &cando,
3248 							0, NULL, cs));
3249 	bp->b_bcount = ldbtob(cando);
3250 	if (count != cando)
3251 		return (cando);
3252 	return (0);
3253 }
3254 
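/*
 * write_after_read:
 * ----------------
 * Completion step for the read half of a resync write-after-read: reissue
 * the parent buf as a write with MD_STR_WAR set so that the data just read
 * is written out, or fail the request via mirror_error() if the read erred.
 */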
3255 static void
3256 write_after_read(md_mps_t *ps)
3257 {
3258 	struct buf	*pb;
3259 	int		flags;
3260 
3261 	if (ps->ps_flags & MD_MPS_ERROR) {
3262 		mirror_error(ps);
3263 		return;
3264 	}
3265 
3266 	pb = ps->ps_bp;
3267 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3268 	ps->ps_call = NULL;
3269 	ps->ps_flags |= MD_MPS_WRITE_AFTER_READ;
3270 	flags = MD_STR_NOTTOP | MD_STR_WAR;
3271 	if (ps->ps_flags & MD_MPS_MAPPED)
3272 		flags |= MD_STR_MAPPED;
3273 	if (ps->ps_flags & MD_MPS_NOBLOCK)
3274 		flags |= MD_NOBLOCK;
3275 	if (ps->ps_flags & MD_MPS_DIRTY_RD)
3276 		flags |= MD_STR_DIRTY_RD;
3277 	(void) mirror_write_strategy(pb, flags, ps);
3278 }
3279 
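/*
 * continue_serial:
 * ---------------
 * Used when the mirror's write option is WR_SERIAL: issue the write to the
 * next submirror only after the previous submirror's write has completed.
 */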
3280 static void
3281 continue_serial(md_mps_t *ps)
3282 {
3283 	md_mcs_t	*cs;
3284 	buf_t		*cb;
3285 	mm_unit_t	*un;
3286 	int		flags;
3287 
3288 	un = ps->ps_un;
3289 	cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
3290 	mirror_child_init(cs);
3291 	cb = &cs->cs_buf;
3292 	ps->ps_call = NULL;
3293 	ps->ps_frags = 1;
3294 	(void) mirror_map_write(un, cs, ps, 0);
3295 	flags = MD_STR_NOTTOP;
3296 	if (ps->ps_flags & MD_MPS_MAPPED)
3297 		flags |= MD_STR_MAPPED;
3298 	md_call_strategy(cb, flags, NULL);
3299 }
3300 
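/*
 * mirror_map_write:
 * ----------------
 * Clone the parent buf onto the next writable submirror. For a
 * write-after-read (war) to a labeled metadevice the label block(s) are
 * skipped. Returns 1 if further submirrors remain to be issued in parallel,
 * 0 if this was the last submirror or writes are serialized through
 * continue_serial(), and -1 if the request does not extend past the label.
 */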
3301 static int
3302 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
3303 {
3304 	int i;
3305 	dev_t		dev;	/* needed for bioclone, so not md_dev64_t */
3306 	buf_t		*cb;
3307 	buf_t		*pb;
3308 	diskaddr_t	blkno;
3309 	size_t		bcount;
3310 	off_t		offset;
3311 
3312 	pb = ps->ps_bp;
3313 	cb = &cs->cs_buf;
3314 	cs->cs_ps = ps;
3315 
3316 	i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);
3317 
3318 	dev = md_dev64_to_dev(un->un_sm[i].sm_dev);
3319 
3320 	blkno = pb->b_lblkno;
3321 	bcount = pb->b_bcount;
3322 	offset = 0;
3323 	if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
3324 		blkno = DK_LABEL_LOC + 1;
3325 		/*
3326 		 * This handles the case where we're requesting
3327 		 * a write to block 0 on a label partition
3328 		 * and the request size was smaller than the
3329 		 * size of the label.  If this is the case
3330 		 * then we'll return -1.  Failure to do so will
3331 		 * either cause the calling thread to hang due to
3332 		 * an ssd bug, or worse if the bcount were allowed
3333 		 * to go negative (ie large).
3334 		 */
3335 		if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
3336 			return (-1);
3337 		bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
3338 		offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
3339 	}
3340 
3341 	cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
3342 	    cb, KM_NOSLEEP);
3343 	if (war)
3344 		cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;
3345 
3346 	/*
3347 	 * If the submirror is in the erred state, check if any component is
3348 	 * in the Last Erred state.  If so, we don't want to use the B_FAILFAST
3349 	 * flag on the IO.
3350 	 *
3351 	 * Provide a fast path for the non-erred case (which should be the
3352 	 * normal case).
3353 	 */
3354 	if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
3355 		if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
3356 			mm_submirror_t		*sm;
3357 			mm_submirror_ic_t	*smic;
3358 			int			ci;
3359 			int			compcnt;
3360 
3361 			sm = &un->un_sm[i];
3362 			smic = &un->un_smic[i];
3363 
3364 			compcnt = (*(smic->sm_get_component_count))
3365 			    (sm->sm_dev, un);
3366 			for (ci = 0; ci < compcnt; ci++) {
3367 				md_m_shared_t	*shared;
3368 
3369 				shared = (md_m_shared_t *)
3370 				    (*(smic->sm_shared_by_indx))(sm->sm_dev,
3371 				    sm, ci);
3372 
3373 				if (shared->ms_state == CS_LAST_ERRED)
3374 					break;
3375 			}
3376 			if (ci >= compcnt)
3377 				cb->b_flags |= B_FAILFAST;
3378 
3379 		} else {
3380 			cb->b_flags |= B_FAILFAST;
3381 		}
3382 	}
3383 
3384 	ps->ps_current_sm++;
3385 	if (ps->ps_current_sm != ps->ps_active_cnt) {
3386 		if (un->un_write_option == WR_SERIAL) {
3387 			ps->ps_call = continue_serial;
3388 			return (0);
3389 		}
3390 		return (1);
3391 	}
3392 	return (0);
3393 }
3394 
3395 /*
3396  * directed_read_done:
3397  * ------------------
3398  * Completion routine called when a DMR request has been returned from the
3399  * underlying driver. Wake-up the original ioctl() and return the data to
3400  * the user.
3401  */
3402 static void
3403 directed_read_done(md_mps_t *ps)
3404 {
3405 	mm_unit_t	*un;
3406 	mdi_unit_t	*ui;
3407 
3408 	un = ps->ps_un;
3409 	ui = ps->ps_ui;
3410 
3411 	md_unit_readerexit(ui);
3412 	md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3413 	ps->ps_call = NULL;
3414 
3415 	mutex_enter(&un->un_dmr_mx);
3416 	cv_signal(&un->un_dmr_cv);
3417 	mutex_exit(&un->un_dmr_mx);
3418 
3419 	/* release the parent structure */
3420 	kmem_cache_free(mirror_parent_cache, ps);
3421 }
3422 
3423 /*
3424  * daemon_io:
3425  * ------------
3426  * Called to issue a mirror_write_strategy() or mirror_read_strategy
3427  * Called to issue a mirror_write_strategy() or mirror_read_strategy()
3428  * routine
3429  */
3430 static void
3431 daemon_io(daemon_queue_t *dq)
3432 {
3433 	md_mps_t	*ps = (md_mps_t *)dq;
3434 	int		flag = MD_STR_NOTTOP;
3435 	buf_t		*pb = ps->ps_bp;
3436 
3437 	if (ps->ps_flags & MD_MPS_MAPPED)
3438 		flag |= MD_STR_MAPPED;
3439 	if (ps->ps_flags & MD_MPS_WOW)
3440 		flag |= MD_STR_WOW;
3441 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)
3442 		flag |= MD_STR_WAR;
3443 	if (ps->ps_flags & MD_MPS_ABR)
3444 		flag |= MD_STR_ABR;
3445 
3446 	/*
3447 	 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set
3448 	 * MD_STR_WAR before calling mirror_read_strategy
3449 	 */
3450 	if (pb->b_flags & B_READ) {
3451 		if (!(ps->ps_flags & MD_MPS_DIRTY_RD))
3452 			flag |= MD_STR_WAR;
3453 		mirror_read_strategy(pb, flag, ps);
3454 	} else
3455 		mirror_write_strategy(pb, flag, ps);
3456 }
3457 
3458 /*
3459  * update_resync:
3460  * -------------
3461  * Called to update the in-core version of the resync record with the latest
3462  * version that was committed to disk when the previous mirror owner
3463  * relinquished ownership. This call is likely to block as we must hold off
3464  * any current resync processing that may be occurring.
3465  * On completion of the resync record update we issue the mirror_write_strategy
3466  * call to complete the i/o that first started this sequence. To remove a race
3467  * condition between a newly submitted write() request and the resync
3468  * record update we acquire the writerlock. This will hold off all i/o to the
3469  * mirror until the resync update has completed.
3470  * NOTE: no mutex can be held on entry to this routine
3471  */
3472 static void
3473 update_resync(daemon_queue_t *dq)
3474 {
3475 	md_mps_t	*ps = (md_mps_t *)dq;
3476 	buf_t		*pb = ps->ps_bp;
3477 	mdi_unit_t	*ui = ps->ps_ui;
3478 	mm_unit_t	*un;
3479 	set_t		setno;
3480 	int		restart_resync;
3481 
3482 	un = md_unit_writerlock(ui);
3483 	ps->ps_un = un;
3484 	setno = MD_MIN2SET(getminor(pb->b_edev));
3485 	if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
3486 		/*
3487 		 * Synchronize our in-core view of what regions need to be
3488 		 * resync'd with the on-disk version.
3489 		 */
3490 		mutex_enter(&un->un_rrp_inflight_mx);
3491 		mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
3492 		    un->un_dirty_bm);
3493 		mutex_exit(&un->un_rrp_inflight_mx);
3494 
3495 		/* Region dirty map is now up to date */
3496 	}
3497 	restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
3498 	md_unit_writerexit(ui);
3499 
3500 	/* Restart the resync thread if it was previously blocked */
3501 	if (restart_resync) {
3502 		mutex_enter(&un->un_rs_thread_mx);
3503 		un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
3504 		cv_signal(&un->un_rs_thread_cv);
3505 		mutex_exit(&un->un_rs_thread_mx);
3506 	}
3507 	/* Continue with original deferred i/o */
3508 	daemon_io(dq);
3509 }
3510 
3511 /*
3512  * owner_timeout:
3513  * -------------
3514  * Called if the original mdmn_ksend_message() failed and the request is to be
3515  * retried. Reattempt the original ownership change.
3516  *
3517  * NOTE: called at interrupt context (see timeout(9f)).
3518  */
3519 static void
3520 owner_timeout(void *arg)
3521 {
3522 	daemon_queue_t	*dq = (daemon_queue_t *)arg;
3523 
3524 	daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD);
3525 }
3526 
3527 /*
3528  * become_owner:
3529  * ------------
3530  * Called to issue RPC request to become the owner of the mirror
3531  * associated with this i/o request. We assume that the ownership request
3532  * is synchronous, so if it succeeds we will issue the request via
3533  * mirror_write_strategy().
3534  * If multiple i/o's are outstanding we will be called from the mirror_daemon
3535  * service thread.
3536  * NOTE: no mutex should be held on entry to this routine.
3537  */
3538 static void
3539 become_owner(daemon_queue_t *dq)
3540 {
3541 	md_mps_t	*ps = (md_mps_t *)dq;
3542 	mm_unit_t	*un = ps->ps_un;
3543 	buf_t		*pb = ps->ps_bp;
3544 	set_t		setno;
3545 	md_mn_kresult_t	*kres;
3546 	int		msg_flags = md_mirror_msg_flags;
3547 	md_mps_t	*ps1;
3548 
3549 	ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);
3550 
3551 	/*
3552 	 * If we're already the mirror owner we do not need to send a message
3553 	 * but can simply process the i/o request immediately.
3554 	 * If we've already sent the request to become owner we requeue the
3555 	 * request as we're waiting for the synchronous ownership message to
3556 	 * be processed.
3557 	 */
3558 	if (MD_MN_MIRROR_OWNER(un)) {
3559 		/*
3560 		 * As the strategy() call will potentially block we need to
3561 		 * punt this to a separate thread and complete this request
3562 		 * as quickly as possible. Note: if we're a read request
3563 		 * this must be a resync; we cannot afford to be queued
3564 		 * behind any intervening i/o requests. In this case we put the
3565 		 * request on the md_mirror_rs_daemon queue.
3566 		 */
3567 		if (pb->b_flags & B_READ) {
3568 			daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
3569 			    REQ_OLD);
3570 		} else {
3571 			daemon_request(&md_mirror_io_daemon, daemon_io, dq,
3572 			    REQ_OLD);
3573 		}
3574 	} else {
3575 		mutex_enter(&un->un_owner_mx);
3576 		if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
3577 			md_mn_req_owner_t	*msg;
3578 			int			rval = 0;
3579 
3580 			/*
3581 			 * Check to see that we haven't exceeded the maximum
3582 			 * retry count. If we have we fail the i/o as the
3583 			 * comms mechanism has become wedged beyond recovery.
3584 			 */
3585 			if (dq->qlen++ >= MD_OWNER_RETRIES) {
3586 				mutex_exit(&un->un_owner_mx);
3587 				cmn_err(CE_WARN,
3588 				    "md_mirror: Request exhausted ownership "
3589 				    "retry limit of %d attempts", dq->qlen);
3590 				pb->b_error = EIO;
3591 				pb->b_flags |= B_ERROR;
3592 				pb->b_resid = pb->b_bcount;
3593 				kmem_cache_free(mirror_parent_cache, ps);
3594 				md_biodone(pb);
3595 				return;
3596 			}
3597 
3598 			/*
3599 			 * Issue request to change ownership. The call is
3600 			 * synchronous so when it returns we can complete the
3601 			 * i/o (if successful), or enqueue it again so that
3602 			 * the operation will be retried.
3603 			 */
3604 			un->un_owner_state |= MM_MN_OWNER_SENT;
3605 			mutex_exit(&un->un_owner_mx);
3606 
3607 			msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
3608 			setno = MD_MIN2SET(getminor(pb->b_edev));
3609 			msg->mnum = MD_SID(un);
3610 			msg->owner = md_mn_mynode_id;
3611 			msg_flags |= MD_MSGF_NO_LOG;
3612 			/*
3613 			 * If this IO is triggered by updating a watermark,
3614 			 * it might be issued by the creation of a softpartition
3615 			 * while the commd subsystem is suspended.
3616 			 * We don't want this message to block.
3617 			 */
3618 			if (ps->ps_flags & MD_MPS_WMUPDATE) {
3619 				msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
3620 			}
3621 
3622 			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
3623 			rval = mdmn_ksend_message(setno,
3624 						MD_MN_MSG_REQUIRE_OWNER,
3625 						msg_flags, /* flags */
3626 						(char *)msg,
3627 						sizeof (md_mn_req_owner_t),
3628 						kres);
3629 
3630 			kmem_free(msg, sizeof (md_mn_req_owner_t));
3631 
3632 			if (MDMN_KSEND_MSG_OK(rval, kres)) {
3633 				dq->qlen = 0;
3634 				/*
3635 				 * Successfully changed owner, reread the
3636 				 * resync record so that we have a valid idea of
3637 				 * any previously committed incomplete write()s.
3638 				 * NOTE: As we need to acquire the resync mutex
3639 				 * this may block, so we defer it to a separate
3640 				 * thread handler. This makes us (effectively)
3641 				 * non-blocking once the ownership message
3642 				 * handling has completed.
3643 				 */
3644 				mutex_enter(&un->un_owner_mx);
3645 				if (un->un_owner_state & MM_MN_BECOME_OWNER) {
3646 					un->un_mirror_owner = md_mn_mynode_id;
3647 					/* Sets owner of un_rr_dirty record */
3648 					if (un->un_rr_dirty_recid)
3649 						(void) mddb_setowner(
3650 						    un->un_rr_dirty_recid,
3651 						    md_mn_mynode_id);
3652 					un->un_owner_state &=
3653 					    ~MM_MN_BECOME_OWNER;
3654 					/*
3655 					 * Release the block on the current
3656 					 * resync region if it is blocked
3657 					 */
3658 					ps1 = un->un_rs_prev_ovrlap;
3659 					if ((ps1 != NULL) &&
3660 					    (ps1->ps_flags & MD_MPS_ON_OVERLAP))
3661 						mirror_overlap_chain_remove(
3662 						    ps1);
3663 					mutex_exit(&un->un_owner_mx);
3664 
3665 					/*
3666 					 * If we're a read, this must be a
3667 					 * resync request, issue
3668 					 * the i/o request on the
3669 					 * md_mirror_rs_daemon queue. This is
3670 					 * to avoid a deadlock between the
3671 					 * resync_unit thread and
3672 					 * subsequent i/o requests that may
3673 					 * block on the resync region.
3674 					 */
3675 					if (pb->b_flags & B_READ) {
3676 						daemon_request(
3677 						    &md_mirror_rs_daemon,
3678 						    update_resync, dq, REQ_OLD);
3679 					} else {
3680 						daemon_request(
3681 						    &md_mirror_io_daemon,
3682 						    update_resync, dq, REQ_OLD);
3683 					}
3684 					kmem_free(kres,
3685 					    sizeof (md_mn_kresult_t));
3686 					return;
3687 				} else {
3688 					/*
3689 					 * Some other node has beaten us to
3690 					 * obtain ownership. We need to
3691 					 * reschedule our ownership request
3692 					 */
3693 					mutex_exit(&un->un_owner_mx);
3694 				}
3695 			} else {
3696 				mdmn_ksend_show_error(rval, kres,
3697 				    "MD_MN_MSG_REQUIRE_OWNER");
3698 				/*
3699 				 * Message transport failure is handled by the
3700 				 * comms layer. If the ownership change request
3701 				 * does not succeed we need to flag the error to
3702 				 * the initiator of the i/o. This is handled by
3703 				 * the retry logic above. As the request failed
3704 				 * we do not know _who_ the owner of the mirror
3705 				 * currently is. We reset our idea of the owner
3706 				 * to None so that any further write()s will
3707 				 * attempt to become the owner again. This stops
3708 				 * multiple nodes writing to the same mirror
3709 				 * simultaneously.
3710 				 */
3711 				mutex_enter(&un->un_owner_mx);
3712 				un->un_owner_state &=
3713 				    ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
3714 				un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
3715 				mutex_exit(&un->un_owner_mx);
3716 			}
3717 			kmem_free(kres, sizeof (md_mn_kresult_t));
3718 		} else
3719 			mutex_exit(&un->un_owner_mx);
3720 
3721 		/*
3722 		 * Re-enqueue this request on the deferred i/o list. Delay the
3723 		 * request for md_mirror_owner_to usecs to stop thrashing.
3724 		 */
3725 		(void) timeout(owner_timeout, dq,
3726 		    drv_usectohz(md_mirror_owner_to));
3727 	}
3728 }
3729 
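/*
 * mirror_write_strategy:
 * ---------------------
 * Top-level write path for the mirror. In outline (the code below is the
 * authoritative detail): wait out any multi-node state-change suspend,
 * validate the buf, build the parent save structure (md_mps_t), resolve
 * overlaps and, for non-ABR multi-node sets, mirror ownership, mark the
 * resync region where required, then clone the parent buf into child bufs
 * with mirror_map_write() and issue them via md_call_strategy(). Write-on-
 * write and panic-time requests are special-cased.
 */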
3730 static void
3731 mirror_write_strategy(buf_t *pb, int flag, void *private)
3732 {
3733 	md_mps_t	*ps;
3734 	md_mcs_t	*cs;
3735 	int		more;
3736 	mm_unit_t	*un;
3737 	mdi_unit_t	*ui;
3738 	buf_t		*cb;		/* child buf pointer */
3739 	set_t		setno;
3740 	int		rs_on_overlap = 0;
3741 
3742 	ui = MDI_UNIT(getminor(pb->b_edev));
3743 	un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));
3744 
3745 
3746 	md_kstat_waitq_enter(ui);
3747 
3748 	/*
3749 	 * If a state change is in progress for this mirror in a MN set,
3750 	 * suspend all non-resync writes until the state change is complete.
3751 	 * The objective of this suspend is to ensure that one node cannot
3752 	 * read data from a submirror that another node has not yet written
3753 	 * to as part of the state change. Therefore we suspend all
3754 	 * non-resync writes until the state change has been made. As it is
3755 	 * not possible to read from the target of a resync, there is no need
3756 	 * to suspend resync writes.
3757 	 */
3758 
3759 	if (!(flag & MD_STR_WAR)) {
3760 		mutex_enter(&un->un_suspend_wr_mx);
3761 		while (un->un_suspend_wr_flag) {
3762 			cv_wait(&un->un_suspend_wr_cv, &un->un_suspend_wr_mx);
3763 		}
3764 		mutex_exit(&un->un_suspend_wr_mx);
3765 		(void) md_unit_readerlock(ui);
3766 	}
3767 
3768 	if (!(flag & MD_STR_NOTTOP)) {
3769 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
3770 			md_kstat_waitq_exit(ui);
3771 			return;
3772 		}
3773 	}
3774 
3775 	setno = MD_MIN2SET(getminor(pb->b_edev));
3776 
3777 	/* If an ABR write has been requested, set MD_STR_ABR flag */
3778 	if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
3779 		flag |= MD_STR_ABR;
3780 
3781 	if (private == NULL) {
3782 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
3783 		mirror_parent_init(ps);
3784 	} else {
3785 		ps = private;
3786 		private = NULL;
3787 	}
3788 	if (flag & MD_STR_MAPPED)
3789 		ps->ps_flags |= MD_MPS_MAPPED;
3790 
3791 	if (flag & MD_STR_WOW)
3792 		ps->ps_flags |= MD_MPS_WOW;
3793 
3794 	if (flag & MD_STR_ABR)
3795 		ps->ps_flags |= MD_MPS_ABR;
3796 
3797 	if (flag & MD_STR_WMUPDATE)
3798 		ps->ps_flags |= MD_MPS_WMUPDATE;
3799 
3800 	/*
3801 	 * Save essential information from the original buffhdr
3802 	 * in the md_save structure.
3803 	 */
3804 	ps->ps_un = un;
3805 	ps->ps_ui = ui;
3806 	ps->ps_bp = pb;
3807 	ps->ps_addr = pb->b_un.b_addr;
3808 	ps->ps_firstblk = pb->b_lblkno;
3809 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
3810 	ps->ps_changecnt = un->un_changecnt;
3811 
3812 	/*
3813 	 * If not MN owner and this is an ABR write, make sure the current
3814 	 * resync region is on the overlaps chain
3815 	 */
3816 	mutex_enter(&un->un_owner_mx);
3817 	if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
3818 	    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3819 		md_mps_t	*ps1;
3820 		/* Block the current resync region, if not already blocked */
3821 		ps1 = un->un_rs_prev_ovrlap;
3822 
3823 		if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
3824 		    (ps1->ps_lastblk != 0))) {
3825 			/* Drop locks to avoid deadlock */
3826 			mutex_exit(&un->un_owner_mx);
3827 			md_unit_readerexit(ui);
3828 			wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
3829 			rs_on_overlap = 1;
3830 			(void) md_unit_readerlock(ui);
3831 			mutex_enter(&un->un_owner_mx);
3832 			/*
3833 			 * Check to see if we have obtained ownership
3834 			 * while waiting for overlaps. If we have, remove
3835 			 * the resync_region entry from the overlap chain
3836 			 */
3837 			if (MD_MN_MIRROR_OWNER(un) &&
3838 			    (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
3839 				mirror_overlap_chain_remove(ps1);
3840 				rs_on_overlap = 0;
3841 			}
3842 		}
3843 	}
3844 	mutex_exit(&un->un_owner_mx);
3845 
3846 
3847 	/*
3848 	 * The following keeps a write-after-read from writing to the
3849 	 * source in the case where it all came from one place.
3850 	 */
3851 	if (flag & MD_STR_WAR) {
3852 		int	abort_write = 0;
3853 		/*
3854 		 * We are performing a write-after-read. This is either the
3855 		 * result of a resync read or of a read in a dirty resync
3856 		 * region when the optimized resync is not complete. If this
3857 		 * is a MN set and the i/o was generated by a resync,
3858 		 * and the current block is not in the current
3859 		 * resync region, terminate the write as another node must
3860 		 * have completed this resync region.
3861 		 */
3862 		if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
3863 		    (!(flag & MD_STR_DIRTY_RD))) {
3864 			if (!IN_RESYNC_REGION(un, ps))
3865 				abort_write = 1;
3866 		}
3867 		if ((select_write_after_read_units(un, ps) == 0) ||
3868 		    (abort_write)) {
3869 #ifdef DEBUG
3870 			if (mirror_debug_flag)
3871 				printf("Abort resync write on %x, block %lld\n",
3872 				    MD_SID(un), ps->ps_firstblk);
3873 #endif
3874 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3875 				mirror_overlap_chain_remove(ps);
3876 			kmem_cache_free(mirror_parent_cache, ps);
3877 			md_kstat_waitq_exit(ui);
3878 			md_unit_readerexit(ui);
3879 			md_biodone(pb);
3880 			return;
3881 		}
3882 	} else {
3883 		select_write_units(un, ps);
3884 
3885 		/* Drop readerlock to avoid deadlock */
3886 		md_unit_readerexit(ui);
3887 		wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
3888 		un = md_unit_readerlock(ui);
3889 		/*
3890 		 * For a MN set with an ABR write, if we are now the
3891 		 * owner and we have a resync region on the overlap
3892 		 * chain, remove the entry from overlaps and retry the write.
3893 		 */
3894 
3895 		if (MD_MNSET_SETNO(setno) &&
3896 		    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3897 			mutex_enter(&un->un_owner_mx);
3898 			if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
3899 				mirror_overlap_chain_remove(ps);
3900 				md_kstat_waitq_exit(ui);
3901 				mutex_exit(&un->un_owner_mx);
3902 				md_unit_readerexit(ui);
3903 				daemon_request(&md_mirror_daemon, daemon_io,
3904 				    (daemon_queue_t *)ps, REQ_OLD);
3905 				return;
3906 			}
3907 			mutex_exit(&un->un_owner_mx);
3908 		}
3909 	}
3910 
3911 	/*
3912 	 * For Multinode mirrors with a Resync Region (not ABR) we need to
3913 	 * become the mirror owner before continuing with the write(). For ABR
3914 	 * mirrors we check that we 'own' the resync if we're in
3915 	 * write-after-read mode. We do this _after_ ensuring that there are no
3916 	 * overlaps to ensure that, once we know that we are the owner, the
3917 	 * readerlock will not be released until the write is complete. As a
3918 	 * change of ownership in a MN set requires the writerlock, this
3919 	 * ensures that ownership cannot be changed until the write is
3920 	 * complete.
3921 	 */
3922 	if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
3923 	    (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
3924 		if (!MD_MN_MIRROR_OWNER(un))  {
3925 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3926 				mirror_overlap_chain_remove(ps);
3927 			md_kstat_waitq_exit(ui);
3928 			ASSERT(!(flag & MD_STR_WAR));
3929 			md_unit_readerexit(ui);
3930 			daemon_request(&md_mirror_daemon, become_owner,
3931 			    (daemon_queue_t *)ps, REQ_OLD);
3932 			return;
3933 		}
3934 	}
3935 
3936 	/*
3937 	 * Mark resync region if mirror has a Resync Region _and_ we are not
3938 	 * a resync initiated write(). Don't mark region if we're flagged as
3939 	 * an ABR write.
3940 	 */
3941 	if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
3942 	    !(flag & MD_STR_WAR)) {
3943 		if (mirror_mark_resync_region(un, ps->ps_firstblk,
3944 		    ps->ps_lastblk)) {
3945 			pb->b_flags |= B_ERROR;
3946 			pb->b_resid = pb->b_bcount;
3947 			ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
3948 			kmem_cache_free(mirror_parent_cache, ps);
3949 			md_kstat_waitq_exit(ui);
3950 			md_unit_readerexit(ui);
3951 			md_biodone(pb);
3952 			return;
3953 		}
3954 	}
3955 
3956 	ps->ps_childbflags = pb->b_flags | B_WRITE;
3957 	ps->ps_childbflags &= ~B_READ;
3958 	if (flag & MD_STR_MAPPED)
3959 		ps->ps_childbflags &= ~B_PAGEIO;
3960 
3961 	if (!(flag & MD_STR_NOTTOP) && panicstr)
3962 		/* Disable WOW and don't free ps */
3963 		ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);
3964 
3965 	md_kstat_waitq_to_runq(ui);
3966 
3967 	/*
3968 	 * Treat Raw and Direct I/O as Write-on-Write always
3969 	 */
3970 
3971 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
3972 	    (md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
3973 	    (pb->b_flags & B_PHYS) &&
3974 	    !(ps->ps_flags & MD_MPS_WOW)) {
3975 		if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3976 			mirror_overlap_chain_remove(ps);
3977 		md_unit_readerexit(ui);
3978 		daemon_request(&md_mstr_daemon, handle_wow,
3979 			(daemon_queue_t *)ps, REQ_OLD);
3980 		return;
3981 	}
3982 
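	/*
	 * Fan the write out as one or more child bufs; ps_frags records the
	 * number of fragments issued for this parent request.
	 */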
3983 	ps->ps_frags = 1;
3984 	do {
3985 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
3986 		mirror_child_init(cs);
3987 		cb = &cs->cs_buf;
3988 		more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));
3989 
3990 		/*
3991 		 * This handles the case where we're requesting
3992 		 * a write to block 0 on a label partition.  (more < 0)
3993 		 * means that the request size was smaller than the
3994 		 * size of the label.  If so this request is done.
3995 		 */
3996 		if (more < 0) {
3997 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3998 				mirror_overlap_chain_remove(ps);
3999 			md_kstat_runq_exit(ui);
4000 			kmem_cache_free(mirror_child_cache, cs);
4001 			kmem_cache_free(mirror_parent_cache, ps);
4002 			md_unit_readerexit(ui);
4003 			md_biodone(pb);
4004 			return;
4005 		}
4006 		if (more) {
4007 			mutex_enter(&ps->ps_mx);
4008 			ps->ps_frags++;
4009 			mutex_exit(&ps->ps_mx);
4010 		}
4011 		md_call_strategy(cb, flag, private);
4012 	} while (more);
4013 
4014 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
4015 		while (!(ps->ps_flags & MD_MPS_DONE)) {
4016 			md_daemon(1, &md_done_daemon);
4017 			drv_usecwait(10);
4018 		}
4019 		kmem_cache_free(mirror_parent_cache, ps);
4020 	}
4021 }
4022 
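/*
 * mirror_read_strategy:
 * --------------------
 * Top-level read path for the mirror. In outline: validate the buf, build
 * the parent save structure, then decide whether a write-after-read is
 * required (any resync-generated read, or a read of a dirty region while
 * an optimized resync is pending or in progress). Directed (DMR) reads are
 * completed through directed_read_done and never generate a
 * write-after-read. The request is then split into child bufs with
 * mirror_map_read() and issued via md_call_strategy().
 */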
4023 static void
4024 mirror_read_strategy(buf_t *pb, int flag, void *private)
4025 {
4026 	md_mps_t	*ps;
4027 	md_mcs_t	*cs;
4028 	size_t		more;
4029 	mm_unit_t	*un;
4030 	mdi_unit_t	*ui;
4031 	size_t		current_count;
4032 	diskaddr_t	current_blkno;
4033 	off_t		current_offset;
4034 	buf_t		*cb;		/* child buf pointer */
4035 	set_t		setno;
4036 
4037 	ui = MDI_UNIT(getminor(pb->b_edev));
4038 
4039 	md_kstat_waitq_enter(ui);
4040 
4041 	un = (mm_unit_t *)md_unit_readerlock(ui);
4042 
4043 	if (!(flag & MD_STR_NOTTOP)) {
4044 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
4045 			md_kstat_waitq_exit(ui);
4046 			return;
4047 		}
4048 	}
4049 
4050 	if (private == NULL) {
4051 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
4052 		mirror_parent_init(ps);
4053 	} else {
4054 		ps = private;
4055 		private = NULL;
4056 	}
4057 
4058 	if (flag & MD_STR_MAPPED)
4059 		ps->ps_flags |= MD_MPS_MAPPED;
4060 	if (flag & MD_NOBLOCK)
4061 		ps->ps_flags |= MD_MPS_NOBLOCK;
4062 	if (flag & MD_STR_WMUPDATE)
4063 		ps->ps_flags |= MD_MPS_WMUPDATE;
4064 
4065 	/*
4066 	 * Check to see if this is a DMR driven read. If so we need to use the
4067 	 * specified side (in un->un_dmr_last_read) for the source of the data.
4068 	 */
4069 	if (flag & MD_STR_DMR)
4070 		ps->ps_flags |= MD_MPS_DMR;
4071 
4072 	/*
4073 	 * Save essential information from the original buffhdr
4074 	 * in the md_save structure.
4075 	 */
4076 	ps->ps_un = un;
4077 	ps->ps_ui = ui;
4078 	ps->ps_bp = pb;
4079 	ps->ps_addr = pb->b_un.b_addr;
4080 	ps->ps_firstblk = pb->b_lblkno;
4081 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
4082 	ps->ps_changecnt = un->un_changecnt;
4083 
4084 	current_count = btodb(pb->b_bcount);
4085 	current_blkno = pb->b_lblkno;
4086 	current_offset = 0;
4087 
4088 	/*
4089 	 * If flag has MD_STR_WAR set this means that the read is issued by a
4090 	 * resync thread which may or may not be an optimized resync.
4091 	 *
4092 	 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
4093 	 * code has not completed; either a resync has not started since snarf,
4094 	 * or there is an optimized resync in progress.
4095 	 *
4096 	 * We need to generate a write after this read in the following two
4097 	 * cases,
4098 	 *
4099 	 * 1. Any Resync-Generated read
4100 	 *
4101 	 * 2. Any read to a DIRTY REGION if there is an optimized resync
4102 	 *    pending or in progress.
4103 	 *
4104 	 * The write after read is done in these cases to ensure that all sides
4105 	 * of the mirror are in sync with the read data and that it is not
4106 	 * possible for an application to read the same block multiple times
4107 	 * and get different data.
4108 	 *
4109 	 * This would be possible if the block was in a dirty region.
4110 	 *
4111 	 * If we're performing a directed read we don't write the data out as
4112 	 * the application is responsible for restoring the mirror to a known
4113 	 * state.
4114 	 */
4115 	if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
4116 	    !(flag & MD_STR_DMR)) {
4117 		size_t	start_rr, i, end_rr;
4118 		int	region_dirty = 1;
4119 
4120 		/*
4121 		 * We enter here under three circumstances,
4122 		 *
4123 		 * MD_UN_OPT_NOT_DONE	MD_STR_WAR
4124 		 * 0			1
4125 		 * 1			0
4126 		 * 1			1
4127 		 *
4128 		 * To be optimal we only care to explicitly check for dirty
4129 		 * regions in the second case since if MD_STR_WAR is set we
4130 		 * always do the write after read.
4131 		 */
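		/*
		 * BLK_TO_RR converts a block number to its resync-region
		 * index; the loop below therefore examines each region
		 * touched by [ps_firstblk, ps_lastblk] for the keep-dirty
		 * bit.
		 */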
4132 		if (!(flag & MD_STR_WAR)) {
4133 			BLK_TO_RR(end_rr, ps->ps_lastblk, un);
4134 			BLK_TO_RR(start_rr, ps->ps_firstblk, un);
4135 
4136 			for (i = start_rr; i <= end_rr; i++)
4137 				if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
4138 					break;
4139 		}
4140 
4141 		if ((region_dirty) &&
4142 		    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
4143 			ps->ps_call = write_after_read;
4144 			/*
4145 			 * Mark this as a RESYNC_READ in ps_flags.
4146 			 * This is used if the read fails during a
4147 			 * resync of a 3-way mirror to ensure that
4148 			 * the retried read to the remaining
4149 			 * good submirror has MD_STR_WAR set. This
4150 			 * is needed to ensure that the resync write
4151 			 * (write-after-read) takes place.
4152 			 */
4153 			ps->ps_flags |= MD_MPS_RESYNC_READ;
4154 
4155 			/*
4156 			 * If MD_STR_FLAG_ERR is set in the flags we
4157 			 * set MD_MPS_FLAG_ERROR so that an error on the resync
4158 			 * write (issued by write_after_read) will be flagged
4159 			 * to the biowait'ing resync thread. This allows us to
4160 			 * avoid issuing further resync requests to a device
4161 			 * that has had a write failure.
4162 			 */
4163 			if (flag & MD_STR_FLAG_ERR)
4164 				ps->ps_flags |= MD_MPS_FLAG_ERROR;
4165 
4166 			setno = MD_UN2SET(un);
4167 			/*
4168 			 * Drop the readerlock to avoid
4169 			 * deadlock
4170 			 */
4171 			md_unit_readerexit(ui);
4172 			wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4173 			un = md_unit_readerlock(ui);
4174 			/*
4175 			 * Ensure that we are owner
4176 			 */
4177 			if (MD_MNSET_SETNO(setno)) {
4178 				/*
4179 				 * For a non-resync read that requires a
4180 				 * write-after-read to be done, set a flag
4181 				 * in the parent structure, so that the
4182 				 * write_strategy routine can omit the
4183 				 * test that the write is still within the
4184 				 * resync region
4185 				 */
4186 				if (!(flag & MD_STR_WAR))
4187 					ps->ps_flags |= MD_MPS_DIRTY_RD;
4188 
4189 				/*
4190 				 * Before reading the buffer, see if
4191 				 * we are the owner
4192 				 */
4193 				if (!MD_MN_MIRROR_OWNER(un))  {
4194 					ps->ps_call = NULL;
4195 					mirror_overlap_chain_remove(ps);
4196 					md_kstat_waitq_exit(ui);
4197 					md_unit_readerexit(ui);
4198 					daemon_request(
4199 					    &md_mirror_daemon,
4200 					    become_owner,
4201 					    (daemon_queue_t *)ps,
4202 					    REQ_OLD);
4203 					return;
4204 				}
4205 				/*
4206 				 * For a resync read, check to see if I/O is
4207 				 * outside of the current resync region, or
4208 				 * the resync has finished. If so
4209 				 * just terminate the I/O
4210 				 */
4211 				if ((flag & MD_STR_WAR) &&
4212 				    (!(un->c.un_status & MD_UN_WAR) ||
4213 				    (!IN_RESYNC_REGION(un, ps)))) {
4214 #ifdef DEBUG
4215 					if (mirror_debug_flag)
4216 						printf("Abort resync read "
4217 						    "%x: %lld\n",
4218 						    MD_SID(un),
4219 						    ps->ps_firstblk);
4220 #endif
4221 					mirror_overlap_chain_remove(ps);
4222 					kmem_cache_free(mirror_parent_cache,
4223 					    ps);
4224 					md_kstat_waitq_exit(ui);
4225 					md_unit_readerexit(ui);
4226 					md_biodone(pb);
4227 					return;
4228 				}
4229 			}
4230 		}
4231 	}
4232 
4233 	if (flag & MD_STR_DMR) {
4234 		ps->ps_call = directed_read_done;
4235 	}
4236 
4237 	if (!(flag & MD_STR_NOTTOP) && panicstr)
4238 		ps->ps_flags |= MD_MPS_DONTFREE;
4239 
4240 	md_kstat_waitq_to_runq(ui);
4241 
4242 	ps->ps_frags++;
4243 	do {
4244 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4245 		mirror_child_init(cs);
4246 		cb = &cs->cs_buf;
4247 		cs->cs_ps = ps;
4248 
4249 		cb = md_bioclone(pb, current_offset, current_count, NODEV,
4250 		    current_blkno, mirror_done, cb, KM_NOSLEEP);
4251 
4252 		more = mirror_map_read(ps, cs, current_blkno,
4253 				(u_longlong_t)current_count);
4254 		if (more) {
4255 			mutex_enter(&ps->ps_mx);
4256 			ps->ps_frags++;
4257 			mutex_exit(&ps->ps_mx);
4258 		}
4259 
4260 		/*
4261 		 * Do these calculations now, so that we pick up a valid
4262 		 * b_bcount from the child bp.
4263 		 */
4264 		current_count -= more;
4265 		current_offset += cb->b_bcount;
4266 		current_blkno +=  more;
4267 		md_call_strategy(cb, flag, private);
4268 	} while (more);
4269 
4270 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
4271 		while (!(ps->ps_flags & MD_MPS_DONE)) {
4272 			md_daemon(1, &md_done_daemon);
4273 			drv_usecwait(10);
4274 		}
4275 		kmem_cache_free(mirror_parent_cache, ps);
4276 	}
4277 }
4278 
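/*
 * md_mirror_strategy:
 * ------------------
 * Strategy entry point for the mirror driver. For a multi-owner set a
 * top-level request first waits while the set is halted; the i/o is then
 * counted against the set (unless MD_IO_COUNTED is already set) and the
 * buf is handed to the read or write strategy routine as appropriate.
 */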
4279 void
4280 md_mirror_strategy(buf_t *bp, int flag, void *private)
4281 {
4282 	set_t	setno = MD_MIN2SET(getminor(bp->b_edev));
4283 
4284 	/*
4285 	 * When doing IO to a multi owner meta device, check if set is halted.
4286 	 * We do this check without the needed lock held, for performance
4287 	 * reasons.
4288 	 * If an IO just slips through while the set is locked via an
4289 	 * MD_MN_SUSPEND_SET, we don't care about it.
4290 	 * Only check for suspension if we are a top-level i/o request
4291 	 * (MD_STR_NOTTOP is cleared in 'flag').
4292 	 */
4293 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
4294 	    (MD_SET_HALTED | MD_SET_MNSET)) {
4295 		if ((flag & MD_STR_NOTTOP) == 0) {
4296 			mutex_enter(&md_mx);
4297 			/* Here we loop until the set is no longer halted */
4298 			while (md_set[setno].s_status & MD_SET_HALTED) {
4299 				cv_wait(&md_cv, &md_mx);
4300 			}
4301 			mutex_exit(&md_mx);
4302 		}
4303 	}
4304 
4305 	if ((flag & MD_IO_COUNTED) == 0) {
4306 		if ((flag & MD_NOBLOCK) == 0) {
4307 			if (md_inc_iocount(setno) != 0) {
4308 				bp->b_flags |= B_ERROR;
4309 				bp->b_error = ENXIO;
4310 				bp->b_resid = bp->b_bcount;
4311 				biodone(bp);
4312 				return;
4313 			}
4314 		} else {
4315 			md_inc_iocount_noblock(setno);
4316 		}
4317 	}
4318 
4319 	if (bp->b_flags & B_READ)
4320 		mirror_read_strategy(bp, flag, private);
4321 	else
4322 		mirror_write_strategy(bp, flag, private);
4323 }
4324 
4325 /*
4326  * mirror_directed_read:
4327  * --------------------
4328  * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror
4329  * so that the application can determine what (if any) resync needs to be
4330  * performed. The data is copied out to the user-supplied buffer.
4331  *
4332  * Parameters:
4333  *	mdev	- dev_t for the mirror device
4334  *	vdr	- directed read parameters specifying location and submirror
4335  *		  to perform the read from
4336  *	mode	- used to ddi_copyout() any resulting data from the read
4337  *
4338  * Returns:
4339  *	0	success
4340  *	!0	error code
4341  *		EINVAL - invalid request format
4342  */
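/*
 * Illustrative user-level usage sketch (an assumption drawn from the checks
 * in this routine, not a definitive interface description; a real caller
 * adds its own error handling and buffer management):
 *
 *	vol_directed_rd_t	vdr;
 *
 *	bzero(&vdr, sizeof (vdr));
 *	vdr.vdr_flags = DKV_DMR_NEXT_SIDE;
 *	vdr.vdr_side = DKV_SIDE_INIT;
 *	vdr.vdr_offset = offset;	(must be DEV_BSIZE aligned)
 *	vdr.vdr_nbytes = nbytes;
 *	vdr.vdr_data = buf;
 *	do {
 *		if (ioctl(fd, DKIOCDMR, &vdr) != 0 ||
 *		    (vdr.vdr_flags & DKV_DMR_ERROR))
 *			break;
 *		consume vdr.vdr_bytesread bytes read from side vdr.vdr_side
 *		(named in vdr.vdr_side_name)
 *	} while (!(vdr.vdr_flags & DKV_DMR_DONE));
 */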
4343 int
4344 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode)
4345 {
4346 	buf_t		*bp;
4347 	minor_t		mnum = getminor(mdev);
4348 	mdi_unit_t	*ui = MDI_UNIT(mnum);
4349 	mm_unit_t	*un;
4350 	mm_submirror_t	*sm;
4351 	char		*sm_nm;
4352 	size_t		namelen;
4353 	uint_t		next_side;
4354 	void		*kbuffer;
4355 
4356 	if (ui == NULL)
4357 		return (ENXIO);
4358 
4359 	if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) {
4360 		return (EINVAL);
4361 	}
4362 
4363 	/* Check for aligned block access. We disallow non-aligned requests. */
4364 	if (vdr->vdr_offset % DEV_BSIZE) {
4365 		return (EINVAL);
4366 	}
4367 
4368 	/*
4369 	 * Allocate kernel buffer for target of read(). If we had a reliable
4370 	 * (sorry functional) DDI this wouldn't be needed.
4371 	 */
4372 	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
4373 	if (kbuffer == NULL) {
4374 		cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx"
4375 		    " bytes\n", vdr->vdr_nbytes);
4376 		return (ENOMEM);
4377 	}
4378 
4379 	bp = getrbuf(KM_SLEEP);
4380 
4381 	bp->b_un.b_addr = kbuffer;
4382 	bp->b_flags = B_READ;
4383 	bp->b_bcount = vdr->vdr_nbytes;
4384 	bp->b_lblkno = lbtodb(vdr->vdr_offset);
4385 	bp->b_edev = mdev;
4386 
4387 	un = md_unit_readerlock(ui);
4388 
4389 	/*
4390 	 * If DKV_SIDE_INIT is set we need to determine the first available
4391 	 * side to start reading from. If it isn't set we increment to the
4392 	 * next readable submirror.
4393 	 * If there are no readable submirrors we error out with DKV_DMR_ERROR.
4394 	 * Note: we check for a readable submirror on completion of the i/o so
4395 	 * we should _always_ have one available. If this becomes unavailable
4396 	 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if
4397 	 * a metadetach is made between the completion of one DKIOCDMR ioctl
4398 	 * and the start of the next (i.e. a sys-admin 'accident' occurred).
4399 	 * The chance of this is small, but not non-existent.
4400 	 */
4401 	if (vdr->vdr_side == DKV_SIDE_INIT) {
4402 		next_side = 0;
4403 	} else {
4404 		next_side = vdr->vdr_side + 1;
4405 	}
4406 	while ((next_side < NMIRROR) &&
4407 	    !SUBMIRROR_IS_READABLE(un, next_side))
4408 		next_side++;
4409 	if (next_side >= NMIRROR) {
4410 		vdr->vdr_flags |= DKV_DMR_ERROR;
4411 		freerbuf(bp);
		kmem_free(kbuffer, vdr->vdr_nbytes);
4412 		vdr->vdr_bytesread = 0;
4413 		md_unit_readerexit(ui);
4414 		return (0);
4415 	}
4416 
4417 	/* Set the side to read from */
4418 	un->un_dmr_last_read = next_side;
4419 
4420 	md_unit_readerexit(ui);
4421 
4422 	/*
4423 	 * Save timestamp for verification purposes. Can be read by debugger
4424 	 * to verify that this ioctl has been executed and to find the number
4425 	 * of DMR reads and the time of the last DMR read.
4426 	 */
4427 	uniqtime(&mirror_dmr_stats.dmr_timestamp);
4428 	mirror_dmr_stats.dmr_count++;
4429 
4430 	/* Issue READ request and wait for completion */
4431 	mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL);
4432 
4433 	mutex_enter(&un->un_dmr_mx);
4434 	cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
4435 	mutex_exit(&un->un_dmr_mx);
4436 
4437 	/*
4438 	 * Check to see if we encountered an error during the read. If so we
4439 	 * can make no guarantee about any possibly returned data.
4440 	 */
4441 	if ((bp->b_flags & B_ERROR) == 0) {
4442 		vdr->vdr_flags &= ~DKV_DMR_ERROR;
4443 		if (bp->b_resid) {
4444 			vdr->vdr_flags |= DKV_DMR_SHORT;
4445 			vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid;
4446 		} else {
4447 			vdr->vdr_flags |= DKV_DMR_SUCCESS;
4448 			vdr->vdr_bytesread = vdr->vdr_nbytes;
4449 		}
4450 		/* Copy the data read back out to the user supplied buffer */
4451 		if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread,
4452 		    mode)) {
4453 			kmem_free(kbuffer, vdr->vdr_nbytes);
			freerbuf(bp);
4454 			return (EFAULT);
4455 		}
4456 
4457 	} else {
4458 		/* Error out with DKV_DMR_ERROR */
4459 		vdr->vdr_flags |= DKV_DMR_ERROR;
4460 		vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE);
4461 	}
4462 	/*
4463 	 * Update the DMR parameters with the side and name of submirror that
4464 	 * we have just read from (un->un_dmr_last_read)
4465 	 */
4466 	un = md_unit_readerlock(ui);
4467 
4468 	vdr->vdr_side = un->un_dmr_last_read;
4469 	sm = &un->un_sm[un->un_dmr_last_read];
4470 	sm_nm = md_shortname(md_getminor(sm->sm_dev));
4471 
4472 	namelen = MIN(MD_MAX_SIDENAME_LEN, VOL_SIDENAME);
4473 	(void) strncpy(vdr->vdr_side_name, sm_nm, namelen);
4474 
4475 	/*
4476 	 * Determine if we've completed the read cycle. This is true iff the
4477 	 * next computed submirror (side) equals or exceeds NMIRROR. We cannot
4478 	 * use un_nsm as we need to handle a sparse array of submirrors (which
4479 	 * can occur if a submirror is metadetached).
4480 	 */
4481 	next_side = un->un_dmr_last_read + 1;
4482 	while ((next_side < NMIRROR) &&
4483 	    !SUBMIRROR_IS_READABLE(un, next_side))
4484 		next_side++;
4485 	if (next_side >= NMIRROR) {
4486 		/* We've finished */
4487 		vdr->vdr_flags |= DKV_DMR_DONE;
4488 	}
4489 
4490 	md_unit_readerexit(ui);
4491 	freerbuf(bp);
4492 	kmem_free(kbuffer, vdr->vdr_nbytes);
4493 
4494 	return (0);
4495 }
4496 
4497 /*
4498  * mirror_resync_message:
4499  * ---------------------
4500  * Handle the multi-node resync messages that keep all nodes within a given
4501  * disk-set in sync with their view of a mirror's resync status.
4502  *
4503  * The message types dealt with are:
4504  * MD_MN_MSG_RESYNC_STARTING	- start a resync thread for a unit
4505  * MD_MN_MSG_RESYNC_NEXT	- specify the next region to be resynced
4506  * MD_MN_MSG_RESYNC_FINISH	- stop the resync thread for a unit
4507  * MD_MN_MSG_RESYNC_PHASE_DONE	- end of a resync phase, opt, submirror or comp
4508  *
4509  * Returns:
4510  *	0	Success
4511  *	>0	Failure error number
4512  */
4513 int
4514 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
4515 {
4516 	mdi_unit_t		*ui;
4517 	mm_unit_t		*un;
4518 	set_t			setno;
4519 	int			is_ABR;
4520 	int			smi;
4521 	int			ci;
4522 	sm_state_t		state;
4523 	int			broke_out;
4524 	mm_submirror_t		*sm;
4525 	mm_submirror_ic_t	*smic;
4526 	md_m_shared_t		*shared;
4527 	md_error_t		mde = mdnullerror;
4528 	md_mps_t		*ps;
4529 	int			rs_active;
4530 
4531 	/* Check that the given device is part of a multi-node set */
4532 	setno = MD_MIN2SET(p->mnum);
4533 	if (setno >= md_nsets) {
4534 		return (ENXIO);
4535 	}
4536 	if (!MD_MNSET_SETNO(setno)) {
4537 		return (EINVAL);
4538 	}
4539 
4540 	if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
4541 		return (EINVAL);
4542 	if ((ui = MDI_UNIT(p->mnum)) == NULL)
4543 		return (EINVAL);
4544 	is_ABR = (ui->ui_tstate & MD_ABR_CAP);
4545 
4546 	/* Obtain the current resync status */
4547 	(void) md_ioctl_readerlock(lockp, ui);
4548 	rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0;
4549 	md_ioctl_readerexit(lockp);
4550 
4551 	switch ((md_mn_msgtype_t)p->msg_type) {
4552 	case MD_MN_MSG_RESYNC_STARTING:
4553 		/* Start the resync thread for the mirror */
4554 		(void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp);
4555 		break;
4556 
4557 	case MD_MN_MSG_RESYNC_NEXT:
4558 		/*
4559 		 * We have to release any previously marked overlap regions
4560 		 * so that i/o can resume. Then we need to block the region
4561 		 * from [rs_start..rs_start+rs_size) so that no i/o is issued.
4562 		 * Update un_rs_resync_done and un_rs_resync_2_do.
4563 		 */
4564 		(void) md_ioctl_readerlock(lockp, ui);
4565 		/*
4566 		 * Ignore the message if there is no active resync thread or
4567 		 * if it is for a resync type that we have already completed.
4568 		 * un_resync_completed is set to the last resync completed
4569 		 * when processing a PHASE_DONE message.
4570 		 */
4571 		if (!rs_active || (p->rs_type == un->un_resync_completed))
4572 			break;
4573 		/*
4574 		 * If this message is for the same resync and is for an earlier
4575 		 * resync region, just ignore it. This can only occur if this
4576 		 * node has progressed on to the next resync region before
4577 		 * we receive this message. This can occur if the class for
4578 		 * this message is busy and the originator has to retry thus
4579 		 * allowing this node to move onto the next resync_region.
4580 		 */
4581 		if ((p->rs_type == un->un_rs_type) &&
4582 		    (p->rs_start < un->un_resync_startbl))
4583 			break;
4584 		ps = un->un_rs_prev_ovrlap;
4585 
4586 		/* Allocate previous overlap reference if needed */
4587 		if (ps == NULL) {
4588 			ps = kmem_cache_alloc(mirror_parent_cache,
4589 				MD_ALLOCFLAGS);
4590 			ps->ps_un = un;
4591 			ps->ps_ui = ui;
4592 			ps->ps_firstblk = 0;
4593 			ps->ps_lastblk = 0;
4594 			ps->ps_flags = 0;
4595 			md_ioctl_readerexit(lockp);
4596 			(void) md_ioctl_writerlock(lockp, ui);
4597 			un->un_rs_prev_ovrlap = ps;
4598 			md_ioctl_writerexit(lockp);
4599 		} else
4600 			md_ioctl_readerexit(lockp);
4601 
4602 		if (p->rs_originator != md_mn_mynode_id) {
4603 			/*
4604 			 * On all but the originating node, first update
4605 			 * the resync state, then unblock the previous
4606 			 * region and block the next one. No need
4607 			 * to do this if the region is already blocked.
4608 			 * Update the submirror state and flags from the
4609 			 * originator. This keeps the cluster in sync with
4610 			 * regards to the resync status.
4611 			 */
4612 
4613 			(void) md_ioctl_writerlock(lockp, ui);
4614 			un->un_rs_resync_done = p->rs_done;
4615 			un->un_rs_resync_2_do = p->rs_2_do;
4616 			un->un_rs_type = p->rs_type;
4617 			un->un_resync_startbl = p->rs_start;
4618 			md_ioctl_writerexit(lockp);
4619 			/*
4620 			 * Use un_owner_mx to ensure that an ownership change
4621 			 * cannot happen at the same time as this message
4622 			 */
4623 			mutex_enter(&un->un_owner_mx);
4624 			if (MD_MN_MIRROR_OWNER(un)) {
4625 				ps->ps_firstblk = p->rs_start;
4626 				ps->ps_lastblk = ps->ps_firstblk +
4627 				    p->rs_size - 1;
4628 			} else {
4629 				if ((ps->ps_firstblk != p->rs_start) ||
4630 				    (ps->ps_lastblk != p->rs_start +
4631 				    p->rs_size - 1)) {
4632 					/* Remove previous overlap range */
4633 					if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4634 						mirror_overlap_chain_remove(ps);
4635 
4636 					ps->ps_firstblk = p->rs_start;
4637 					ps->ps_lastblk = ps->ps_firstblk +
4638 					    p->rs_size - 1;
4639 
4640 					mutex_exit(&un->un_owner_mx);
4641 					/* Block this range from all i/o. */
4642 					if (ps->ps_firstblk != 0 ||
4643 					    ps->ps_lastblk != 0)
4644 						wait_for_overlaps(ps,
4645 						    MD_OVERLAP_ALLOW_REPEAT);
4646 					mutex_enter(&un->un_owner_mx);
4647 					/*
4648 					 * Check to see if we have obtained
4649 					 * ownership while waiting for
4650 					 * overlaps. If we have, remove
4651 					 * the resync_region entry from the
4652 					 * overlap chain
4653 					 */
4654 					if (MD_MN_MIRROR_OWNER(un) &&
4655 					    (ps->ps_flags & MD_MPS_ON_OVERLAP))
4656 						mirror_overlap_chain_remove(ps);
4657 				}
4658 			}
4659 			mutex_exit(&un->un_owner_mx);
4660 
4661 			/*
4662 			 * If this is the first RESYNC_NEXT message (i.e.
4663 			 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
4664 			 * issue RESYNC_START NOTIFY event
4665 			 */
4666 			if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
4667 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
4668 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4669 				    MD_SID(un));
4670 			}
4671 
4672 			/* Ensure that our local resync thread is running */
4673 			if (un->un_rs_thread == NULL) {
4674 				(void) mirror_resync_unit(p->mnum, NULL,
4675 				    &p->mde, lockp);
4676 			}
4677 		}
4678 		break;
4679 	case MD_MN_MSG_RESYNC_FINISH:
4680 		/*
4681 		 * Complete the resync by stopping the resync thread.
4682 		 * Also release the previous overlap region field.
4683 		 * Update the resync_progress_thread by cv_signal'ing it so
4684 		 * that we mark the end of the resync as soon as possible. This
4685 		 * avoids an unnecessary delay should we panic after resync
4686 		 * completion.
4687 		 */
4688 #ifdef DEBUG
4689 		if (!rs_active) {
4690 			if (mirror_debug_flag)
4691 				printf("RESYNC_FINISH (mnum = %x), "
4692 				    "Resync *NOT* active\n",
4693 				    p->mnum);
4694 		}
4695 #endif
4696 
4697 		if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
4698 		    (p->rs_originator != md_mn_mynode_id)) {
4699 			mutex_enter(&un->un_rs_thread_mx);
4700 			un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
4701 			un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
4702 			un->un_rs_thread_flags &=
4703 			    ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
4704 			cv_signal(&un->un_rs_thread_cv);
4705 			mutex_exit(&un->un_rs_thread_mx);
4706 		}
4707 		if (is_ABR) {
4708 			/* Resync finished, if ABR set owner to NULL */
4709 			mutex_enter(&un->un_owner_mx);
4710 			un->un_mirror_owner = 0;
4711 			mutex_exit(&un->un_owner_mx);
4712 		}
4713 		(void) md_ioctl_writerlock(lockp, ui);
4714 		ps = un->un_rs_prev_ovrlap;
4715 		if (ps != NULL) {
4716 			/* Remove previous overlap range */
4717 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4718 				mirror_overlap_chain_remove(ps);
4719 			/*
4720 			 * Release the overlap range reference
4721 			 */
4722 			un->un_rs_prev_ovrlap = NULL;
4723 			kmem_cache_free(mirror_parent_cache,
4724 			    ps);
4725 		}
4726 		md_ioctl_writerexit(lockp);
4727 
4728 		/* Mark the resync as complete in the metadb */
4729 		un->un_rs_resync_done = p->rs_done;
4730 		un->un_rs_resync_2_do = p->rs_2_do;
4731 		un->un_rs_type = p->rs_type;
4732 		mutex_enter(&un->un_rs_progress_mx);
4733 		cv_signal(&un->un_rs_progress_cv);
4734 		mutex_exit(&un->un_rs_progress_mx);
4735 
4736 		un = md_ioctl_writerlock(lockp, ui);
4737 		un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
4738 		/* Deal with any pending grow_unit */
4739 		if (un->c.un_status & MD_UN_GROW_PENDING) {
4740 			if ((mirror_grow_unit(un, &mde) != 0) ||
4741 			    (! mdismderror(&mde, MDE_GROW_DELAYED))) {
4742 				un->c.un_status &= ~MD_UN_GROW_PENDING;
4743 			}
4744 		}
4745 		md_ioctl_writerexit(lockp);
4746 		break;
4747 
4748 	case MD_MN_MSG_RESYNC_PHASE_DONE:
4749 		/*
4750 		 * A phase of the resync (optimized, component or
4751 		 * submirror) is complete. Update mirror status.
4752 		 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the
4753 		 * mirror owner is performing a resync. If we have just snarfed
4754 		 * this set, then we must clear any of the flags set at snarf
4755 		 * time by unit_setup_resync().
4756 		 * Note that unit_setup_resync() sets up these flags to
4757 		 * indicate that an optimized resync is required. These flags
4758 		 * need to be reset because if we get here,  the mirror owner
4759 		 * will have handled the optimized resync.
4760 		 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and
4761 		 * MD_UN_WAR. In addition, for each submirror,
4762 		 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
4763 		 * set to SMS_OFFLINE.
4764 		 */
4765 #ifdef DEBUG
4766 		if (mirror_debug_flag)
4767 			printf("phase done mess received from %d, mnum=%x,"
4768 			    "type=%x, flags=%x\n", p->rs_originator, p->mnum,
4769 			    p->rs_type, p->rs_flags);
4770 #endif
4771 		/*
4772 		 * Ignore the message if there is no active resync thread.
4773 		 */
4774 		if (!rs_active)
4775 			break;
4776 
4777 		broke_out = p->rs_flags & MD_MN_RS_ERR;
4778 		switch (RS_TYPE(p->rs_type)) {
4779 		case MD_RS_OPTIMIZED:
4780 			un = md_ioctl_writerlock(lockp, ui);
4781 			if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4782 				/* If we are originator, just clear rs_type */
4783 				if (p->rs_originator == md_mn_mynode_id) {
4784 					SET_RS_TYPE_NONE(un->un_rs_type);
4785 					md_ioctl_writerexit(lockp);
4786 					break;
4787 				}
4788 				/*
4789 				 * If CLEAR_OPT_NOT_DONE is set, only clear the
4790 				 * flags if OPT_NOT_DONE is set *and* rs_type
4791 				 * is MD_RS_NONE.
4792 				 */
4793 				if ((un->c.un_status & MD_UN_OPT_NOT_DONE) &&
4794 				    (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) {
4795 					/* No resync in progress */
4796 					un->c.un_status &= ~MD_UN_OPT_NOT_DONE;
4797 					un->c.un_status &= ~MD_UN_WAR;
4798 				} else {
4799 					/*
4800 					 * We are in the middle of an
4801 					 * optimized resync and this message
4802 					 * should be ignored.
4803 					 */
4804 					md_ioctl_writerexit(lockp);
4805 					break;
4806 				}
4807 			} else {
4808 				/*
4809 				 * This is the end of an optimized resync,
4810 				 * clear the OPT_NOT_DONE and OFFLINE_SM flags
4811 				 */
4812 
4813 				un->c.un_status &= ~MD_UN_KEEP_DIRTY;
4814 				if (!broke_out)
4815 					un->c.un_status &= ~MD_UN_WAR;
4816 			}
4817 
4818 			/*
4819 			 * Set resync_completed to last resync type and then
4820 			 * clear resync_type to indicate no resync in progress
4821 			 */
4822 			un->un_resync_completed = un->un_rs_type;
4823 			SET_RS_TYPE_NONE(un->un_rs_type);
4824 
4825 			/*
4826 			 * If resync is as a result of a submirror ONLINE,
4827 			 * reset the submirror state to SMS_RUNNING if the
4828 			 * resync was ok else set back to SMS_OFFLINE.
4829 			 */
4830 			for (smi = 0; smi < NMIRROR; smi++) {
4831 				un->un_sm[smi].sm_flags &=
4832 				    ~MD_SM_RESYNC_TARGET;
4833 				if (SMS_BY_INDEX_IS(un, smi,
4834 				    SMS_OFFLINE_RESYNC)) {
4835 					if (p->rs_flags &
4836 					    MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4837 						state = SMS_OFFLINE;
4838 					} else {
4839 						state = (broke_out ?
4840 						    SMS_OFFLINE : SMS_RUNNING);
4841 					}
4842 					mirror_set_sm_state(
4843 					    &un->un_sm[smi],
4844 					    &un->un_smic[smi], state,
4845 					    broke_out);
4846 					mirror_commit(un, NO_SUBMIRRORS,
4847 					    0);
4848 				}
4849 				/*
4850 				 * If we still have an offline submirror, reset
4851 				 * the OFFLINE_SM flag in the mirror status
4852 				 */
4853 				if (SMS_BY_INDEX_IS(un, smi,
4854 				    SMS_OFFLINE))
4855 					un->c.un_status |=
4856 					    MD_UN_OFFLINE_SM;
4857 			}
4858 			md_ioctl_writerexit(lockp);
4859 			break;
4860 		case MD_RS_SUBMIRROR:
4861 			un = md_ioctl_writerlock(lockp, ui);
4862 			smi = RS_SMI(p->rs_type);
4863 			sm = &un->un_sm[smi];
4864 			smic = &un->un_smic[smi];
4865 			/* Clear RESYNC target */
4866 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
4867 			/*
4868 			 * Set resync_completed to last resync type and then
4869 			 * clear resync_type to indicate no resync in progress
4870 			 */
4871 			un->un_resync_completed = un->un_rs_type;
4872 			SET_RS_TYPE_NONE(un->un_rs_type);
4873 			/*
4874 			 * If the resync completed ok reset the submirror
4875 			 * state to SMS_RUNNING else reset it to SMS_ATTACHED
4876 			 */
4877 			state = (broke_out ?
4878 			    SMS_ATTACHED : SMS_RUNNING);
4879 			mirror_set_sm_state(sm, smic, state, broke_out);
4880 			un->c.un_status &= ~MD_UN_WAR;
4881 			mirror_commit(un, SMI2BIT(smi), 0);
4882 			md_ioctl_writerexit(lockp);
4883 			break;
4884 		case MD_RS_COMPONENT:
4885 			un = md_ioctl_writerlock(lockp, ui);
4886 			smi = RS_SMI(p->rs_type);
4887 			ci = RS_CI(p->rs_type);
4888 			sm = &un->un_sm[smi];
4889 			smic = &un->un_smic[smi];
4890 			shared = (md_m_shared_t *)
4891 			    (*(smic->sm_shared_by_indx))
4892 			    (sm->sm_dev, sm, ci);
4893 			un->c.un_status &= ~MD_UN_WAR;
4894 			/* Clear RESYNC target */
4895 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
4896 			/*
4897 			 * Set resync_completed to last resync type and then
4898 			 * clear resync_type to indicate no resync in progress
4899 			 */
4900 			un->un_resync_completed = un->un_rs_type;
4901 			SET_RS_TYPE_NONE(un->un_rs_type);
4902 
4903 			/*
4904 			 * If the resync completed ok, set the component state
4905 			 * to CS_OKAY.
4906 			 */
4907 			if (broke_out)
4908 				shared->ms_flags |= MDM_S_RS_TRIED;
4909 			else {
4910 				/*
4911 				 * As we don't transmit the changes,
4912 				 * no need to drop the lock.
4913 				 */
4914 				set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
4915 				    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
4916 			}
4917 			md_ioctl_writerexit(lockp);
			break;
4918 		default:
4919 			break;
4920 		}
4921 		/*
4922 		 * If the purpose of this PHASE_DONE message is just to
4923 		 * indicate to all other nodes that the optimized resync
4924 		 * required (OPT_NOT_DONE) flag is to be cleared, there is
4925 		 * no need to generate a notify event as there has not
4926 		 * actually been a resync.
4927 		 */
4928 		if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) {
4929 			if (broke_out) {
4930 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
4931 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4932 				    MD_SID(un));
4933 			} else {
4934 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
4935 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4936 				    MD_SID(un));
4937 			}
4938 		}
4939 		break;
4940 
4941 	default:
4942 #ifdef DEBUG
4943 		cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type"
4944 		    " %x\n", p->msg_type);
4945 #endif
4946 		return (EINVAL);
4947 	}
4948 	return (0);
4949 }
4950 
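/*
 * mirror_snarf:
 * ------------
 * Build the in-core mirror units for a set from the mirror records found
 * in the metadb. Old small (32-bit) records are converted to the big
 * format before an in-core unit is created. MD_SNARF_CLEANUP simply
 * cleans up any records marked MD_PRV_CLEANUP.
 */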
4951 /* Return a -1 if snarf of optimized record failed and set should be released */
4952 static int
4953 mirror_snarf(md_snarfcmd_t cmd, set_t setno)
4954 {
4955 	mddb_recid_t	recid;
4956 	int		gotsomething;
4957 	int		all_mirrors_gotten;
4958 	mm_unit_t	*un;
4959 	mddb_type_t	typ1;
4960 	mddb_de_ic_t    *dep;
4961 	mddb_rb32_t	*rbp;
4962 	size_t		newreqsize;
4963 	mm_unit_t	*big_un;
4964 	mm_unit32_od_t	*small_un;
4965 	int		retval;
4966 	mdi_unit_t	*ui;
4967 
4968 	if (cmd == MD_SNARF_CLEANUP) {
4969 		if (md_get_setstatus(setno) & MD_SET_STALE)
4970 			return (0);
4971 
4972 		recid = mddb_makerecid(setno, 0);
4973 		typ1 = (mddb_type_t)md_getshared_key(setno,
4974 		    mirror_md_ops.md_driver.md_drivername);
4975 		while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
4976 			if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
4977 				un = (mm_unit_t *)mddb_getrecaddr(recid);
4978 				mirror_cleanup(un);
4979 				recid = mddb_makerecid(setno, 0);
4980 			}
4981 		}
4982 		return (0);
4983 	}
4984 
4985 	all_mirrors_gotten = 1;
4986 	gotsomething = 0;
4987 
4988 	recid = mddb_makerecid(setno, 0);
4989 	typ1 = (mddb_type_t)md_getshared_key(setno,
4990 	    mirror_md_ops.md_driver.md_drivername);
4991 
4992 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
4993 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
4994 			continue;
4995 
4996 		dep = mddb_getrecdep(recid);
4997 		dep->de_flags = MDDB_F_MIRROR;
4998 		rbp = dep->de_rb;
4999 
5000 		if ((rbp->rb_revision == MDDB_REV_RB) &&
5001 		    ((rbp->rb_private & MD_PRV_CONVD) == 0)) {
5002 			/*
5003 			 * This means we have an old, small record that
5004 			 * hasn't already been converted.
5005 			 * Before we create an incore metadevice from this
5006 			 * we have to convert it to a big record.
5007 			 */
5008 			small_un = (mm_unit32_od_t *)mddb_getrecaddr(recid);
5009 			newreqsize = sizeof (mm_unit_t);
5010 			big_un = (mm_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP);
5011 			mirror_convert((caddr_t)small_un, (caddr_t)big_un,
5012 			    SMALL_2_BIG);
5013 			kmem_free(small_un, dep->de_reqsize);
5014 
5015 			/*
5016 			 * Update userdata and incore userdata;
5017 			 * incores are at the end of un.
5018 			 */
5019 			dep->de_rb_userdata_ic = big_un;
5020 			dep->de_rb_userdata = big_un;
5021 			dep->de_icreqsize = newreqsize;
5022 			un = big_un;
5023 			rbp->rb_private |= MD_PRV_CONVD;
5024 		} else {
5025 			/* Big device */
5026 			un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5027 				sizeof (*un), 0);
5028 		}
5029 
5030 		/* Set revision and flag accordingly */
5031 		if (rbp->rb_revision == MDDB_REV_RB) {
5032 			un->c.un_revision = MD_32BIT_META_DEV;
5033 		} else {
5034 			un->c.un_revision = MD_64BIT_META_DEV;
5035 			un->c.un_flag |= MD_EFILABEL;
5036 		}
5037 
5038 		/*
5039 		 * Create minor device node for snarfed entry.
5040 		 */
5041 		(void) md_create_minor_node(setno, MD_SID(un));
5042 
5043 		if (MD_UNIT(MD_SID(un)) != NULL) {
5044 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5045 			continue;
5046 		}
5047 		all_mirrors_gotten = 0;
5048 		retval = mirror_build_incore(un, 1);
5049 		if (retval == 0) {
5050 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
5051 			md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
5052 			resync_start_timeout(setno);
5053 			gotsomething = 1;
5054 		} else if (retval == -1) {
5055 			return (-1);
5056 		}
5057 		/*
5058 		 * Set flag to indicate that the mirror has not yet
5059 		 * been through a reconfig. This flag is used for MN sets
5060 		 * when determining whether to update the mirror state from
5061 		 * the Master node.
5062 		 */
5063 		if (MD_MNSET_SETNO(setno)) {
5064 			ui = MDI_UNIT(MD_SID(un));
5065 			ui->ui_tstate |= MD_RESYNC_NOT_DONE;
5066 		}
5067 	}
5068 
5069 	if (!all_mirrors_gotten)
5070 		return (gotsomething);
5071 
5072 	recid = mddb_makerecid(setno, 0);
5073 	while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
5074 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
5075 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5076 
5077 	return (0);
5078 }
5079 
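/*
 * mirror_halt:
 * -----------
 * Handle halt requests for the mirrors in a set. MD_HALT_CHECK reports
 * whether any mirror in the set is still open; MD_HALT_DOIT resets every
 * mirror unit in the set and, for the local set or a set that contained
 * mirrors, waits for any outstanding mirror_timeout request to complete so
 * that a subsequent driver unload is safe. The open, close and unload
 * commands are no-ops here.
 */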
5080 static int
5081 mirror_halt(md_haltcmd_t cmd, set_t setno)
5082 {
5083 	unit_t		i;
5084 	mdi_unit_t	*ui;
5085 	minor_t		mnum;
5086 	int		reset_mirror_flag = 0;
5087 
5088 	if (cmd == MD_HALT_CLOSE)
5089 		return (0);
5090 
5091 	if (cmd == MD_HALT_OPEN)
5092 		return (0);
5093 
5094 	if (cmd == MD_HALT_UNLOAD)
5095 		return (0);
5096 
5097 	if (cmd == MD_HALT_CHECK) {
5098 		for (i = 0; i < md_nunits; i++) {
5099 			mnum = MD_MKMIN(setno, i);
5100 			if ((ui = MDI_UNIT(mnum)) == NULL)
5101 				continue;
5102 			if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5103 				continue;
5104 			if (md_unit_isopen(ui))
5105 				return (1);
5106 		}
5107 		return (0);
5108 	}
5109 
5110 	if (cmd != MD_HALT_DOIT)
5111 		return (1);
5112 
5113 	for (i = 0; i < md_nunits; i++) {
5114 		mnum = MD_MKMIN(setno, i);
5115 		if ((ui = MDI_UNIT(mnum)) == NULL)
5116 			continue;
5117 		if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5118 			continue;
5119 		reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0);
5120 
5121 		/* Set a flag if there is at least one mirror metadevice. */
5122 		reset_mirror_flag = 1;
5123 	}
5124 
5125 	/*
5126 	 * Only wait for the global dr_timeout to finish
5127 	 *  - if there are mirror metadevices in this diskset or
5128 	 *  - if this is the local set since an unload of the md_mirror
5129 	 *    driver could follow a successful mirror halt in the local set.
5130 	 */
5131 	if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) {
5132 		while ((mirror_md_ops.md_head == NULL) &&
5133 		    (mirror_timeout.dr_timeout_id != 0))
5134 			delay(md_hz);
5135 	}
5136 
5137 	return (0);
5138 }
5139 
5140 /*ARGSUSED3*/
5141 static int
5142 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
5143 {
5144 	IOLOCK	lock;
5145 	minor_t		mnum = getminor(*dev);
5146 	set_t		setno;
5147 
5148 	/*
5149 	 * When doing an open of a multi owner metadevice, check to see if this
5150 	 * node is a starting node and if a reconfig cycle is underway.
5151 	 * If so, the system isn't sufficiently set up to handle the
5152 	 * open (which involves I/O during sp_validate), so fail with ENXIO.
5153 	 */
5154 	setno = MD_MIN2SET(mnum);
5155 	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
5156 	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
5157 		return (ENXIO);
5158 	}
5159 
5160 	if (md_oflags & MD_OFLG_FROMIOCTL) {
5161 		/*
5162 		 * This indicates that the caller is an ioctl service routine.
5163 		 * In this case we initialise our stack-based IOLOCK and pass
5164 		 * this into the internal open routine. This allows multi-owner
5165 		 * metadevices to avoid deadlocking if an error is encountered
5166 		 * during the open() attempt. The failure case is:
5167 		 * s-p -> mirror -> s-p (with error). Attempting to metaclear
5168 		 * this configuration would deadlock as the mirror code has to
5169 		 * send a state-update to the other nodes when it detects the
5170 		 * failure of the underlying submirror with an errored soft-part
5171 		 * on it. As there is a class1 message in progress (metaclear)
5172 		 * set_sm_comp_state() cannot send another class1 message;
5173 		 * instead we do not send a state_update message as the
5174 		 * metaclear is distributed and the failed submirror will be
5175 		 * cleared from the configuration by the metaclear.
5176 		 */
5177 		IOLOCK_INIT(&lock);
5178 		return (mirror_internal_open(getminor(*dev), flag, otyp,
5179 		    md_oflags, &lock));
5180 	} else {
5181 		return (mirror_internal_open(getminor(*dev), flag, otyp,
5182 		    md_oflags, (IOLOCK *)NULL));
5183 	}
5184 }
5185 
5186 
5187 /*ARGSUSED1*/
5188 static int
5189 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
5190 {
5191 	return (mirror_internal_close(getminor(dev), otyp, md_cflags,
5192 		(IOLOCK *)NULL));
5193 }
5194 
5195 
5196 /*
5197  * This routine dumps memory to the disk.  It assumes that the memory has
5198  * already been mapped into mainbus space.  It is called at disk interrupt
5199  * priority when the system is in trouble.
5200  *
5201  */
5202 static int
5203 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
5204 {
5205 	mm_unit_t	*un;
5206 	dev_t		mapdev;
5207 	int		result;
5208 	int		smi;
5209 	int		any_succeed = 0;
5210 	int		save_result = 0;
5211 
5212 	/*
5213 	 * No need to grab the unit lock, because nothing else is supposed
5214 	 * to be happening. Also, dump is not supposed to sleep.
5216 	 */
5217 	un = (mm_unit_t *)MD_UNIT(getminor(dev));
5218 
5219 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
5220 		return (EINVAL);
5221 
5222 	if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
5223 		return (EINVAL);
5224 
5225 	for (smi = 0; smi < NMIRROR; smi++) {
5226 		if (!SUBMIRROR_IS_WRITEABLE(un, smi))
5227 			continue;
5228 		mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
5229 		result = bdev_dump(mapdev, addr, blkno, nblk);
5230 		if (result)
5231 			save_result = result;
5232 
5233 		if (result == 0)
5234 			any_succeed++;
5235 	}
5236 
5237 	if (any_succeed)
5238 		return (0);
5239 
5240 	return (save_result);
5241 }
5242 
5243 /*
5244  * NAME: mirror_probe_dev
5245  *
5246  * DESCRIPTION: force opens every component of a mirror.
5247  *
5248  * On entry the unit writerlock is held
5249  */
5250 static int
5251 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
5252 {
5253 	int		i;
5254 	int		smi;
5255 	int		ci;
5256 	mm_unit_t	*un;
5257 	int		md_devopen = 0;
5258 	set_t		setno;
5259 	int		sm_cnt;
5260 	int		sm_unavail_cnt;
5261 
5262 	if (md_unit_isopen(ui))
5263 		md_devopen++;
5264 
5265 	un = MD_UNIT(mnum);
5266 	setno = MD_UN2SET(un);
5267 
5268 	sm_cnt = 0;
5269 	sm_unavail_cnt = 0;
5270 	for (i = 0; i < NMIRROR; i++) {
5271 		md_dev64_t tmpdev;
5272 		mdi_unit_t	*sm_ui;
5273 
5274 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
5275 			continue;
5276 		}
5277 
5278 		sm_cnt++;
5279 		tmpdev = un->un_sm[i].sm_dev;
5280 		(void) md_layered_open(mnum, &tmpdev,
5281 				MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
5282 		un->un_sm[i].sm_dev = tmpdev;
5283 
5284 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
5285 
5286 		/*
5287 		 * Logic similar to that in mirror_open_all_devs.  We set or
5288 		 * clear the submirror Unavailable bit.
5289 		 */
5290 		(void) md_unit_writerlock(sm_ui);
5291 		if (submirror_unavailable(un, i, 1)) {
5292 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
5293 			sm_unavail_cnt++;
5294 		} else {
5295 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
5296 		}
5297 		md_unit_writerexit(sm_ui);
5298 	}
5299 
5300 	/*
5301 	 * If all of the submirrors are unavailable, the mirror is also
5302 	 * unavailable.
5303 	 */
5304 	if (sm_cnt == sm_unavail_cnt) {
5305 		ui->ui_tstate |= MD_INACCESSIBLE;
5306 	} else {
5307 		ui->ui_tstate &= ~MD_INACCESSIBLE;
5308 	}
5309 
5310 	/*
5311 	 * Start checking for probe failures. If failures occur we
5312 	 * set the appropriate erred state only if the metadevice is in
5313 	 * use. This is specifically to prevent unnecessary resyncs.
5314 	 * For instance if the disks were accidentally disconnected when
5315 	 * the system booted up then until the metadevice is accessed
5316 	 * (like file system mount) the user can shutdown, recable and
5317 	 * reboot w/o incurring a potentially huge resync.
5318 	 */
5319 
5320 	smi = 0;
5321 	ci = 0;
5322 	while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {
5323 
5324 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
5325 			/*
5326 			 * Note that for a MN set, there is no need to call
5327 			 * SE_NOTIFY as that is done when processing the
5328 			 * state change
5329 			 */
5330 			if (md_devopen) {
5331 				/*
5332 				 * Never called from ioctl context,
5333 				 * so (IOLOCK *)NULL
5334 				 */
5335 				set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
5336 				    0, MD_STATE_XMIT, (IOLOCK *)NULL);
5337 				if (!MD_MNSET_SETNO(setno)) {
5338 					SE_NOTIFY(EC_SVM_STATE,
5339 					    ESC_SVM_LASTERRED,
5340 					    SVM_TAG_METADEVICE, setno,
5341 					    MD_SID(un));
5342 				}
5343 				continue;
5344 			} else {
5345 				(void) mirror_close_all_devs(un,
5346 				    MD_OFLG_PROBEDEV);
5347 				if (!MD_MNSET_SETNO(setno)) {
5348 					SE_NOTIFY(EC_SVM_STATE,
5349 					    ESC_SVM_OPEN_FAIL,
5350 					    SVM_TAG_METADEVICE, setno,
5351 					    MD_SID(un));
5352 				}
5353 				mirror_openfail_console_info(un, smi, ci);
5354 				return (ENXIO);
5355 			}
5356 		}
5357 
5358 		/*
5359 		 * Note that for a MN set, there is no need to call
5360 		 * SE_NOTIFY as that is done when processing the
5361 		 * state change
5362 		 */
5363 		if (md_devopen) {
5364 			/* Never called from ioctl context, so (IOLOCK *)NULL */
5365 			set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
5366 			    MD_STATE_XMIT, (IOLOCK *)NULL);
5367 			if (!MD_MNSET_SETNO(setno)) {
5368 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
5369 				    SVM_TAG_METADEVICE, setno,
5370 				    MD_SID(un));
5371 			}
5372 		}
5373 		mirror_openfail_console_info(un, smi, ci);
5374 		ci++;
5375 	}
5376 
5377 	if (MD_MNSET_SETNO(setno)) {
5378 		send_poke_hotspares(setno);
5379 	} else {
5380 		(void) poke_hotspares();
5381 	}
5382 	(void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);
5383 
5384 	return (0);
5385 }
5386 
5387 
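/*
 * mirror_imp_set:
 * --------------
 * Update the mirror records of an imported set: remap each unit's self id,
 * parent id, record id, optimized-resync record id and the submirror
 * device minor numbers into the new set number. Both the old 32-bit and
 * the 64-bit record formats are handled. Returns 1 if at least one record
 * was updated, otherwise 0.
 */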
5388 static int
5389 mirror_imp_set(
5390 	set_t	setno
5391 )
5392 {
5393 
5394 	mddb_recid_t	recid;
5395 	int		gotsomething, i;
5396 	mddb_type_t	typ1;
5397 	mddb_de_ic_t	*dep;
5398 	mddb_rb32_t	*rbp;
5399 	mm_unit32_od_t	*un32;
5400 	mm_unit_t	*un64;
5401 	minor_t		*self_id;	/* minor needs to be updated */
5402 	md_parent_t	*parent_id;	/* parent needs to be updated */
5403 	mddb_recid_t	*record_id;	/* record id needs to be updated */
5404 	mddb_recid_t	*optrec_id;
5405 	md_dev64_t	tmpdev;
5406 
5407 
5408 	gotsomething = 0;
5409 
5410 	typ1 = (mddb_type_t)md_getshared_key(setno,
5411 	    mirror_md_ops.md_driver.md_drivername);
5412 	recid = mddb_makerecid(setno, 0);
5413 
5414 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5415 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5416 			continue;
5417 
5418 		dep = mddb_getrecdep(recid);
5419 		rbp = dep->de_rb;
5420 
5421 		if (rbp->rb_revision == MDDB_REV_RB) {
5422 			/*
5423 			 * Small device
5424 			 */
5425 			un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
5426 			self_id = &(un32->c.un_self_id);
5427 			parent_id = &(un32->c.un_parent);
5428 			record_id = &(un32->c.un_record_id);
5429 			optrec_id = &(un32->un_rr_dirty_recid);
5430 
5431 			for (i = 0; i < un32->un_nsm; i++) {
5432 			    tmpdev = md_expldev(un32->un_sm[i].sm_dev);
5433 			    un32->un_sm[i].sm_dev = md_cmpldev
5434 				(md_makedevice(md_major, MD_MKMIN(setno,
5435 				MD_MIN2UNIT(md_getminor(tmpdev)))));
5436 
5437 			    if (!md_update_minor(setno, mddb_getsidenum
5438 				(setno), un32->un_sm[i].sm_key))
5439 				goto out;
5440 			}
5441 		} else {
5442 			un64 = (mm_unit_t *)mddb_getrecaddr(recid);
5443 			self_id = &(un64->c.un_self_id);
5444 			parent_id = &(un64->c.un_parent);
5445 			record_id = &(un64->c.un_record_id);
5446 			optrec_id = &(un64->un_rr_dirty_recid);
5447 
5448 			for (i = 0; i < un64->un_nsm; i++) {
5449 				tmpdev = un64->un_sm[i].sm_dev;
5450 				un64->un_sm[i].sm_dev = md_makedevice
5451 				    (md_major, MD_MKMIN(setno, MD_MIN2UNIT
5452 				    (md_getminor(tmpdev))));
5453 
5454 				if (!md_update_minor(setno, mddb_getsidenum
5455 				    (setno), un64->un_sm[i].sm_key))
5456 					goto out;
5457 			}
5458 		}
5459 
5460 		/*
5461 		 * Mark the record as processed and update the unit, its
5462 		 * parent and its records with the imported setno.
5463 		 */
5464 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
5465 
5466 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
5467 		if (*parent_id != MD_NO_PARENT)
5468 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
5469 		*record_id = MAKERECID(setno, DBID(*record_id));
5470 		*optrec_id = MAKERECID(setno, DBID(*optrec_id));
5471 
5472 		gotsomething = 1;
5473 	}
5474 
5475 out:
5476 	return (gotsomething);
5477 }
5478 
5479 /*
5480  * NAME: mirror_check_offline
5481  *
5482  * DESCRIPTION: set *offline_status to 1 if any submirror is offline
5483  *
5484  * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is
5485  * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE
5486  * ioctl.
5487  */
5488 int
5489 mirror_check_offline(md_dev64_t dev, int *offline_status)
5490 {
5491 	mm_unit_t		*un;
5492 	md_error_t		mde = mdnullerror;
5493 
5494 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5495 		return (EINVAL);
5496 	*offline_status = 0;
5497 	if (un->c.un_status & MD_UN_OFFLINE_SM)
5498 		*offline_status = 1;
5499 	return (0);
5500 }
5501 
5502 /*
5503  * NAME: mirror_inc_abr_count
5504  *
5505  * DESCRIPTION: increment the count of layered soft parts with ABR set
5506  *
5507  * Called from ioctl, so access to un_abr_count is protected by the global
5508  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5509  */
5510 int
5511 mirror_inc_abr_count(md_dev64_t dev)
5512 {
5513 	mm_unit_t		*un;
5514 	md_error_t		mde = mdnullerror;
5515 
5516 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5517 		return (EINVAL);
5518 	un->un_abr_count++;
5519 	return (0);
5520 }
5521 
5522 /*
5523  * NAME: mirror_dec_abr_count
5524  *
5525  * DESCRIPTION: decrement the count of layered soft parts with ABR set
5526  *
5527  * Called from ioctl, so access to un_abr_count is protected by the global
5528  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5529  */
5530 int
5531 mirror_dec_abr_count(md_dev64_t dev)
5532 {
5533 	mm_unit_t		*un;
5534 	md_error_t		mde = mdnullerror;
5535 
5536 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5537 		return (EINVAL);
5538 	un->un_abr_count--;
5539 	return (0);
5540 }
5541 
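/*
 * Named services exported by the mirror driver.  Other md modules resolve
 * these entry points at run time by name rather than linking against them
 * directly; the table is terminated by a NULL entry.
 *
 * Illustrative lookup only (not part of this driver) -- a layered module
 * would obtain a service roughly as follows, assuming the
 * md_get_named_service() interface of the md common driver and a
 * hypothetical mirror_dev/offline pair:
 *
 *	intptr_t	(*check_offline)();
 *
 *	check_offline = md_get_named_service(mirror_dev, 0,
 *	    MD_CHECK_OFFLINE, 0);
 *	if (check_offline != NULL)
 *		(void) (*check_offline)(mirror_dev, &offline);
 */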
5542 static md_named_services_t mirror_named_services[] = {
5543 	{(intptr_t (*)()) poke_hotspares,		"poke hotspares"    },
5544 	{(intptr_t (*)()) mirror_rename_listkids,	MDRNM_LIST_URKIDS   },
5545 	{mirror_rename_check,				MDRNM_CHECK	    },
5546 	{(intptr_t (*)()) mirror_renexch_update_kids,	MDRNM_UPDATE_KIDS   },
5547 	{(intptr_t (*)()) mirror_exchange_parent_update_to,
5548 			MDRNM_PARENT_UPDATE_TO},
5549 	{(intptr_t (*)()) mirror_exchange_self_update_from_down,
5550 			MDRNM_SELF_UPDATE_FROM_DOWN },
5551 	{(intptr_t (*)())mirror_probe_dev,		"probe open test" },
5552 	{(intptr_t (*)())mirror_check_offline,		MD_CHECK_OFFLINE },
5553 	{(intptr_t (*)())mirror_inc_abr_count,		MD_INC_ABR_COUNT },
5554 	{(intptr_t (*)())mirror_dec_abr_count,		MD_DEC_ABR_COUNT },
5555 	{ NULL,						0		    }
5556 };
5557 
5558 md_ops_t mirror_md_ops = {
5559 	mirror_open,		/* open */
5560 	mirror_close,		/* close */
5561 	md_mirror_strategy,	/* strategy */
5562 	NULL,			/* print */
5563 	mirror_dump,		/* dump */
5564 	NULL,			/* read */
5565 	NULL,			/* write */
5566 	md_mirror_ioctl,	/* mirror_ioctl, */
5567 	mirror_snarf,		/* mirror_snarf */
5568 	mirror_halt,		/* mirror_halt */
5569 	NULL,			/* aread */
5570 	NULL,			/* awrite */
5571 	mirror_imp_set,		/* import set */
5572 	mirror_named_services
5573 };
5574 
5575 /* module specific initialization */
5576 static void
5577 init_init()
5578 {
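	/*
	 * Offset of the buf_t that ends the md_mcs_t child structure; the
	 * child cache created below sizes each object so that this
	 * trailing buf_t area can hold a full biosize() buffer.
	 */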
5579 	md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);
5580 
5581 	/* Initialize the parent and child save memory pools */
5582 	mirror_parent_cache = kmem_cache_create("md_mirror_parent",
5583 	    sizeof (md_mps_t), 0, mirror_parent_constructor,
5584 	    mirror_parent_destructor, mirror_run_queue, NULL, NULL,
5585 	    0);
5586 
5587 	mirror_child_cache = kmem_cache_create("md_mirror_child",
5588 	    sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
5589 	    mirror_child_constructor, mirror_child_destructor,
5590 	    mirror_run_queue, NULL, NULL, 0);
5591 
5592 	/*
5593 	 * Ensure wowbuf_size is a multiple of DEV_BSIZE and clamp it to
5594 	 * [2 * DEV_BSIZE, 32 * DEV_BSIZE], then initialize the wowbuf pool.
5595 	 */
5596 	md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
5597 	if (md_wowbuf_size <= 0)
5598 		md_wowbuf_size = 2 * DEV_BSIZE;
5599 	if (md_wowbuf_size > (32 * DEV_BSIZE))
5600 		md_wowbuf_size = (32 * DEV_BSIZE);
5601 
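	/* A WOW block is a wowhdr_t followed by a wowbuf data area. */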
5602 	md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
5603 	mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
5604 	    md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
5605 
5606 	mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5607 	mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5608 
5609 	mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
5610 }
5611 
5612 /* module specific uninitialization (undo init_init()) */
5613 static void
5614 fini_uninit()
5615 {
5616 	kmem_cache_destroy(mirror_parent_cache);
5617 	kmem_cache_destroy(mirror_child_cache);
5618 	kmem_cache_destroy(mirror_wowblk_cache);
5619 	mirror_parent_cache = mirror_child_cache =
5620 	    mirror_wowblk_cache = NULL;
5621 
5622 	mutex_destroy(&mirror_timeout.dr_mx);
5623 	mutex_destroy(&hotspare_request.dr_mx);
5624 	mutex_destroy(&non_ff_drv_mutex);
5625 }
5626 
5627 /* define the module linkage */
5628 MD_PLUGIN_MISC_MODULE("mirrors module %I%", init_init(), fini_uninit())
5629